From e5ef16eba72c7769cba601bcb6df5577eba48415 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 17 Oct 2023 23:17:59 -0400 Subject: [PATCH] fix: Reconfigure the encoding of standard input according to the --encoding option, closes #1038 --- CHANGELOG.rst | 1 + csvkit/cli.py | 5 ++++- csvkit/utilities/csvstack.py | 2 +- tests/test_convert/test_fixed.py | 8 ++++---- tests/test_utilities/test_csvclean.py | 4 ++-- tests/test_utilities/test_csvformat.py | 8 ++++---- tests/test_utilities/test_csvjson.py | 4 ++-- tests/test_utilities/test_csvlook.py | 4 ++-- tests/test_utilities/test_csvsort.py | 4 ++-- tests/test_utilities/test_csvsql.py | 14 +++++++------- tests/test_utilities/test_csvstack.py | 10 +++++----- tests/test_utilities/test_in2csv.py | 18 +++++++++--------- tests/test_utilities/test_sql2csv.py | 10 +++++----- tests/utils.py | 14 +++++++------- 14 files changed, 55 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 37310b31f..c5e191514 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,6 +10,7 @@ Unreleased * :doc:`/scripts/csvstat` adds a :code:`--non-nulls` option to only output counts of non-null values. * :doc:`/scripts/csvstat` adds a :code:`--max-precision` option to only output the most decimal places. * feat: Add a :code:`--null-value` option to commands with the :code:`--blanks` option, to convert additional values to NULL. +* fix: Reconfigure the encoding of standard input according to the :code:`--encoding` option, which defaults to ``utf-8-sig``. Affected users no longer need to set the ``PYTHONIOENCODING`` environment variable. * fix: Prompt the user if additional input is expected (i.e. if no input file or piped data is provided) in :doc:`/scripts/csvjoin`, :doc:`/scripts/csvsql` and :doc:`/scripts/csvstack`. * fix: No longer errors if a NUL byte occurs in an input file. * Add Python 3.12 support. diff --git a/csvkit/cli.py b/csvkit/cli.py index e62a066a9..d32be6381 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -233,11 +233,14 @@ def _init_common_parser(self): '-V', '--version', action='version', version='%(prog)s 1.2.0', help='Display version information and exit.') - def _open_input_file(self, path): + def _open_input_file(self, path, opened=False): """ Open the input file specified on the command line. """ if not path or path == '-': + # "UnsupportedOperation: It is not possible to set the encoding or newline of stream after the first read" + if not opened: + sys.stdin.reconfigure(encoding=self.args.encoding) f = sys.stdin else: extension = splitext(path)[1] diff --git a/csvkit/utilities/csvstack.py b/csvkit/utilities/csvstack.py index 8179cdc20..1a07ee96b 100644 --- a/csvkit/utilities/csvstack.py +++ b/csvkit/utilities/csvstack.py @@ -108,7 +108,7 @@ def main(self): output.writerow(headers) for i, path in enumerate(self.args.input_paths): - f = self._open_input_file(path) + f = self._open_input_file(path, opened=True) file_is_stdin = path == '-' if has_groups: diff --git a/tests/test_convert/test_fixed.py b/tests/test_convert/test_fixed.py index 3ce91b2ea..d8f57be78 100644 --- a/tests/test_convert/test_fixed.py +++ b/tests/test_convert/test_fixed.py @@ -1,4 +1,4 @@ -from io import StringIO +import io from csvkit.convert import fixed from csvkit.utilities.in2csv import In2CSV @@ -23,7 +23,7 @@ def test_fixed_skip_lines(self): self.assertEqual(f.read(), output) def test_fixed_no_inference(self): - input_file = StringIO(' 1 2 3') + input_file = io.BytesIO(b' 1 2 3') with stdin_as_string(input_file): self.assertLines(['--no-inference', '-f', 'fixed', '--schema', @@ -36,7 +36,7 @@ def test_fixed_no_inference(self): def test_fixed_streaming(self): with open('examples/testfixed') as f, open('examples/testfixed_schema.csv') as schema: - output_file = StringIO() + output_file = io.StringIO() fixed.fixed2csv(f, schema, output=output_file) output = output_file.getvalue() output_file.close() @@ -91,7 +91,7 @@ def test_schematic_line_parser(self): bar,6,2 baz,8,5""" - f = StringIO(schema) + f = io.StringIO(schema) parser = fixed.FixedWidthRowParser(f) f.close() diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py index 0d76698fb..1d284c942 100644 --- a/tests/test_utilities/test_csvclean.py +++ b/tests/test_utilities/test_csvclean.py @@ -1,6 +1,6 @@ +import io import os import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.csvclean import CSVClean, launch_new_instance @@ -17,7 +17,7 @@ def tearDown(self): def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]): args = [f'examples/{basename}.csv'] + additional_args - output_file = StringIO() + output_file = io.StringIO() utility = CSVClean(args, output_file) utility.run() diff --git a/tests/test_utilities/test_csvformat.py b/tests/test_utilities/test_csvformat.py index 70567152b..5e2ebf5e2 100644 --- a/tests/test_utilities/test_csvformat.py +++ b/tests/test_utilities/test_csvformat.py @@ -1,5 +1,5 @@ +import io import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.csvformat import CSVFormat, launch_new_instance @@ -54,7 +54,7 @@ def test_tab_delimiter(self): ]) def test_quotechar(self): - input_file = StringIO('a,b,c\n1*2,3,4\n') + input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n') with stdin_as_string(input_file): self.assertLines(['-Q', '*'], [ @@ -65,7 +65,7 @@ def test_quotechar(self): input_file.close() def test_doublequote(self): - input_file = StringIO('a\n"a ""quoted"" string"') + input_file = io.BytesIO(b'a\n"a ""quoted"" string"') with stdin_as_string(input_file): self.assertLines(['-P', '#', '-B'], [ @@ -76,7 +76,7 @@ def test_doublequote(self): input_file.close() def test_escapechar(self): - input_file = StringIO('a,b,c\n1"2,3,4\n') + input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n') with stdin_as_string(input_file): self.assertLines(['-P', '#', '-U', '3'], [ diff --git a/tests/test_utilities/test_csvjson.py b/tests/test_utilities/test_csvjson.py index 4bfb4ec4f..302456c5d 100644 --- a/tests/test_utilities/test_csvjson.py +++ b/tests/test_utilities/test_csvjson.py @@ -1,6 +1,6 @@ +import io import json import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.csvjson import CSVJSON, launch_new_instance @@ -58,7 +58,7 @@ def test_keying(self): self.assertDictEqual(js, {'True': {'a': True, 'c': 3.0, 'b': 2.0}}) def test_duplicate_keys(self): - output_file = StringIO() + output_file = io.StringIO() utility = CSVJSON(['-k', 'a', 'examples/dummy3.csv'], output_file) self.assertRaisesRegex(ValueError, 'Value True is not unique in the key column.', diff --git a/tests/test_utilities/test_csvlook.py b/tests/test_utilities/test_csvlook.py index f11c4b9f6..d3817b107 100644 --- a/tests/test_utilities/test_csvlook.py +++ b/tests/test_utilities/test_csvlook.py @@ -1,5 +1,5 @@ +import io import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.csvlook import CSVLook, launch_new_instance @@ -127,7 +127,7 @@ def test_max_column_width(self): ]) def test_stdin(self): - input_file = StringIO('a,b,c\n1,2,3\n4,5,6\n') + input_file = io.BytesIO(b'a,b,c\n1,2,3\n4,5,6\n') with stdin_as_string(input_file): self.assertLines([], [ diff --git a/tests/test_utilities/test_csvsort.py b/tests/test_utilities/test_csvsort.py index d053d93f3..06cea1ddc 100644 --- a/tests/test_utilities/test_csvsort.py +++ b/tests/test_utilities/test_csvsort.py @@ -1,5 +1,5 @@ +import io import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.csvsort import CSVSort, launch_new_instance @@ -78,7 +78,7 @@ def test_sort_t_and_nulls(self): self.assertEqual(test_order, new_order) def test_stdin(self): - input_file = StringIO('a,b,c\n4,5,6\n1,2,3\n') + input_file = io.BytesIO(b'a,b,c\n4,5,6\n1,2,3\n') with stdin_as_string(input_file): self.assertLines([], [ diff --git a/tests/test_utilities/test_csvsql.py b/tests/test_utilities/test_csvsql.py index 9d49d54e8..4bca6e475 100644 --- a/tests/test_utilities/test_csvsql.py +++ b/tests/test_utilities/test_csvsql.py @@ -1,6 +1,6 @@ +import io import os import sys -from io import StringIO from textwrap import dedent from unittest.mock import patch @@ -108,7 +108,7 @@ def test_linenumbers(self): ''')) # noqa: W291 def test_stdin(self): - input_file = StringIO('a,b,c\n4,2,3\n') + input_file = io.BytesIO(b'a,b,c\n4,2,3\n') with stdin_as_string(input_file): sql = self.get_output(['--tables', 'foo']) @@ -124,7 +124,7 @@ def test_stdin(self): input_file.close() def test_stdin_and_filename(self): - input_file = StringIO("a,b,c\n1,2,3\n") + input_file = io.BytesIO(b'a,b,c\n1,2,3\n') with stdin_as_string(input_file): sql = self.get_output(['-', 'examples/dummy.csv']) @@ -135,7 +135,7 @@ def test_stdin_and_filename(self): input_file.close() def test_query(self): - input_file = StringIO("a,b,c\n1,2,3\n") + input_file = io.BytesIO(b'a,b,c\n1,2,3\n') with stdin_as_string(input_file): sql = self.get_output(['--query', 'SELECT m.usda_id, avg(i.sepal_length) AS mean_sepal_length FROM iris ' @@ -150,7 +150,7 @@ def test_query(self): input_file.close() def test_query_empty(self): - input_file = StringIO() + input_file = io.BytesIO() with stdin_as_string(input_file): output = self.get_output(['--query', 'SELECT 1']) @@ -185,14 +185,14 @@ def test_before_after_insert(self): 'SELECT 1; CREATE TABLE foobar (date DATE)', '--after-insert', 'INSERT INTO dummy VALUES (0, 5, 6)']) - output_file = StringIO() + output_file = io.StringIO() utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM foobar'], output_file) utility.run() output = output_file.getvalue() output_file.close() self.assertEqual(output, 'date\n') - output_file = StringIO() + output_file = io.StringIO() utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM dummy'], output_file) utility.run() output = output_file.getvalue() diff --git a/tests/test_utilities/test_csvstack.py b/tests/test_utilities/test_csvstack.py index a63f2c485..7187971a6 100644 --- a/tests/test_utilities/test_csvstack.py +++ b/tests/test_utilities/test_csvstack.py @@ -21,7 +21,7 @@ def test_skip_lines(self): ]) def test_skip_lines_stdin(self): - with open('examples/test_skip_lines.csv') as f, stdin_as_string(f): + with open('examples/test_skip_lines.csv', 'rb') as f, stdin_as_string(f): self.assertRows(['--skip-lines', '3', '-', 'examples/test_skip_lines.csv'], [ ['a', 'b', 'c'], ['1', '2', '3'], @@ -62,14 +62,14 @@ def test_multiple_file_stack_col_ragged(self): ]) def test_multiple_file_stack_col_ragged_stdin(self): - with open('examples/dummy.csv') as f, stdin_as_string(f): + with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f): self.assertRows(['-', 'examples/dummy_col_shuffled_ragged.csv'], [ ['a', 'b', 'c', 'd'], ['1', '2', '3', ''], ['1', '2', '3', '4'], ]) - with open('examples/dummy.csv') as f, stdin_as_string(f): + with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f): self.assertRows(['examples/dummy_col_shuffled_ragged.csv', '-'], [ ['b', 'c', 'a', 'd'], ['2', '3', '1', '4'], @@ -101,14 +101,14 @@ def test_no_header_row_basic(self): ]) def test_no_header_row_basic_stdin(self): - with open('examples/no_header_row.csv') as f, stdin_as_string(f): + with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f): self.assertRows(['--no-header-row', '-', 'examples/no_header_row2.csv'], [ ['a', 'b', 'c'], ['1', '2', '3'], ['4', '5', '6'], ]) - with open('examples/no_header_row.csv') as f, stdin_as_string(f): + with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f): self.assertRows(['--no-header-row', 'examples/no_header_row2.csv', '-'], [ ['a', 'b', 'c'], ['4', '5', '6'], diff --git a/tests/test_utilities/test_in2csv.py b/tests/test_utilities/test_in2csv.py index 49543c484..c78e0a7ed 100644 --- a/tests/test_utilities/test_in2csv.py +++ b/tests/test_utilities/test_in2csv.py @@ -1,6 +1,6 @@ +import io import os import sys -from io import StringIO from unittest.mock import patch from csvkit.utilities.in2csv import In2CSV, launch_new_instance @@ -38,7 +38,7 @@ def test_blanks(self): self.assertConverted('csv', 'examples/blanks.csv', 'examples/blanks.csv', ['--blanks']) def test_null_value(self): - input_file = StringIO('a,b\nn/a,\\N') + input_file = io.BytesIO(b'a,b\nn/a,\\N') with stdin_as_string(input_file): self.assertLines(['-f', 'csv', '--null-value', '\\N'], [ @@ -49,7 +49,7 @@ def test_null_value(self): input_file.close() def test_null_value_blanks(self): - input_file = StringIO('a,b\nn/a,\\N') + input_file = io.BytesIO(b'a,b\nn/a,\\N') with stdin_as_string(input_file): self.assertLines(['-f', 'csv', '--null-value', '\\N', '--blanks'], [ @@ -153,7 +153,7 @@ def test_csv_no_headers_streaming(self): ['--no-header-row', '--no-inference', '--snifflimit', '0']) def test_csv_datetime_inference(self): - input_file = StringIO('a\n2015-01-01T00:00:00Z') + input_file = io.BytesIO(b'a\n2015-01-01T00:00:00Z') with stdin_as_string(input_file): self.assertLines(['-f', 'csv'], [ @@ -182,9 +182,9 @@ def test_xlsx_no_inference(self): ]) def test_geojson_no_inference(self): - input_file = StringIO( - '{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": ' - '{"a": 1, "b": 2, "c": 3}}]}') + input_file = io.BytesIO( + b'{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": ' + b'{"a": 1, "b": 2, "c": 3}}]}') with stdin_as_string(input_file): self.assertLines(['--no-inference', '-f', 'geojson'], [ @@ -195,7 +195,7 @@ def test_geojson_no_inference(self): input_file.close() def test_json_no_inference(self): - input_file = StringIO('[{"a": 1, "b": 2, "c": 3}]') + input_file = io.BytesIO(b'[{"a": 1, "b": 2, "c": 3}]') with stdin_as_string(input_file): self.assertLines(['--no-inference', '-f', 'json'], [ @@ -206,7 +206,7 @@ def test_json_no_inference(self): input_file.close() def test_ndjson_no_inference(self): - input_file = StringIO('{"a": 1, "b": 2, "c": 3}') + input_file = io.BytesIO(b'{"a": 1, "b": 2, "c": 3}') with stdin_as_string(input_file): self.assertLines(['--no-inference', '-f', 'ndjson'], [ diff --git a/tests/test_utilities/test_sql2csv.py b/tests/test_utilities/test_sql2csv.py index 5fb51d192..e5349a2bf 100644 --- a/tests/test_utilities/test_sql2csv.py +++ b/tests/test_utilities/test_sql2csv.py @@ -1,6 +1,6 @@ +import io import os import sys -from io import StringIO from unittest.mock import patch try: @@ -71,7 +71,7 @@ def test_file_with_query(self): self.assertTrue('54' in csv) def test_stdin(self): - input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer') + input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer') with stdin_as_string(input_file): csv = self.get_output([]) @@ -82,7 +82,7 @@ def test_stdin(self): input_file.close() def test_stdin_with_query(self): - input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer') + input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer') with stdin_as_string(input_file): csv = self.get_output(['--query', 'select 6*9 as question']) @@ -93,7 +93,7 @@ def test_stdin_with_query(self): input_file.close() def test_stdin_with_file(self): - input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer') + input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer') with stdin_as_string(input_file): csv = self.get_output(['examples/test.sql']) @@ -104,7 +104,7 @@ def test_stdin_with_file(self): input_file.close() def test_stdin_with_file_and_query(self): - input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer') + input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer') with stdin_as_string(input_file): csv = self.get_output(['examples/test.sql', '--query', 'select 6*9 as question']) diff --git a/tests/utils.py b/tests/utils.py index 6b92489b2..876a7e7f7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,11 +17,11 @@ """ +import io import sys import unittest import warnings from contextlib import contextmanager -from io import StringIO import agate @@ -39,7 +39,7 @@ def stderr_as_stdout(): @contextmanager def stdin_as_string(content): temp = sys.stdin - sys.stdin = content + sys.stdin = io.TextIOWrapper(content) yield sys.stdin = temp @@ -48,7 +48,7 @@ class CSVKitTestCase(unittest.TestCase): warnings.filterwarnings(action='ignore', module='agate') def get_output(self, args): - output_file = StringIO() + output_file = io.StringIO() utility = self.Utility(args, output_file) utility.run() @@ -59,7 +59,7 @@ def get_output(self, args): return output def get_output_as_io(self, args): - return StringIO(self.get_output(args)) + return io.StringIO(self.get_output(args)) def get_output_as_list(self, args): return self.get_output(args).split('\n') @@ -89,7 +89,7 @@ def assertLines(self, args, rows, newline_at_eof=True): class EmptyFileTests: def test_empty(self): - with open('examples/empty.csv') as f, stdin_as_string(f): + with open('examples/empty.csv', 'rb') as f, stdin_as_string(f): utility = self.Utility(getattr(self, 'default_args', [])) utility.run() @@ -105,7 +105,7 @@ def test_names(self): def test_invalid_options(self): args = ['-n', '--no-header-row', 'examples/dummy.csv'] - output_file = StringIO() + output_file = io.StringIO() utility = self.Utility(args, output_file) with self.assertRaises(RequiredHeaderError): @@ -118,7 +118,7 @@ class ColumnsTests: def test_invalid_column(self): args = getattr(self, 'columns_args', []) + ['-c', '0', 'examples/dummy.csv'] - output_file = StringIO() + output_file = io.StringIO() utility = self.Utility(args, output_file) with self.assertRaises(ColumnIdentifierError):