From 77348450aad438fdaade1c83382fc68c4773849a Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:21:38 -0400 Subject: [PATCH 1/2] test: Add failing test for 1.10.0 (fixed in 1.10.1) --- examples/empty.csv | 1 + tests/test_table/test_from_csv.py | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 examples/empty.csv diff --git a/examples/empty.csv b/examples/empty.csv new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/examples/empty.csv @@ -0,0 +1 @@ + diff --git a/tests/test_table/test_from_csv.py b/tests/test_table/test_from_csv.py index d4cbc693..c2f0fbf7 100644 --- a/tests/test_table/test_from_csv.py +++ b/tests/test_table/test_from_csv.py @@ -187,3 +187,11 @@ def test_from_csv_row_limit_too_high(self): self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta]) self.assertRows(table2, table1.rows) + + def test_from_csv_empty(self): + table = Table.from_csv('examples/empty.csv') + + self.assertColumnNames(table, []) + self.assertColumnTypes(table, []) + + self.assertRows(table, []) From fa236021f48df71cba217b4eb17e442289ff5844 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sun, 28 Apr 2024 14:11:19 -0400 Subject: [PATCH 2/2] fix: Don't error on piped data 1db7277 #779 https://til.simonwillison.net/python/io-bufferedreader --- .github/workflows/ci.yml | 5 +++++ CHANGELOG.rst | 5 +++++ agate/table/from_csv.py | 24 +++++++++++++++++------- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe5f1ee3..215615b9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,11 @@ jobs: PYTHONIOENCODING: utf-8 PYTHONUTF8: 1 run: pytest --cov agate + - name: Read from stdin + if: matrix.os != 'windows-latest' + run: python -c 'import sys; import agate; agate.Table.from_csv(sys.stdin, sniff_limit=1)' < examples/test.csv + - name: Read from pipe + run: printf 'a,b,c\n1,2,3' | python -c 'import sys; import agate; agate.Table.from_csv(sys.stdin, sniff_limit=1)' - run: python charts.py - env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 14e28e81..42aab857 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,8 @@ +Unreleased +---------- + +- fix: Version 1.10.0 errors on piped data. + 1.10.1 - April 28, 2024 ----------------------- diff --git a/agate/table/from_csv.py b/agate/table/from_csv.py index 860f4732..b20f2aa2 100644 --- a/agate/table/from_csv.py +++ b/agate/table/from_csv.py @@ -1,5 +1,6 @@ +import io import itertools -from io import StringIO +import sys @classmethod @@ -63,14 +64,23 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk if sniff_limit is None: # Reads to the end of the tile, but avoid reading the file twice. - handle = StringIO(f.read()) - kwargs['dialect'] = csv.Sniffer().sniff(handle.getvalue()) + handle = io.StringIO(f.read()) + sample = handle.getvalue() elif sniff_limit > 0: - offset = f.tell() + if f == sys.stdin: + # "At most one single read on the raw stream is done to satisfy the call. The number of bytes returned + # may be less or more than requested." In other words, it reads the buffer_size, which might be less or + # more than the sniff_limit. On my machine, the buffer_size of sys.stdin.buffer is the length of the + # input, up to 65536. This assumes that users don't sniff more than 64 KiB. + # https://docs.python.org/3/library/io.html#io.BufferedReader.peek + sample = sys.stdin.buffer.peek(sniff_limit).decode(encoding, 'ignore')[:sniff_limit] # reads *bytes* + else: + offset = f.tell() + sample = f.read(sniff_limit) # reads *characters* + f.seek(offset) # can't do f.seek(-sniff_limit, os.SEEK_CUR) on file opened in text mode - # Reads only the start of the file. - kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit)) - f.seek(offset) + if sniff_limit is None or sniff_limit > 0: + kwargs['dialect'] = csv.Sniffer().sniff(sample) reader = csv.reader(handle, header=header, **kwargs)