diff --git a/pandas/io/common.py b/pandas/io/common.py index 4ba969f0abac4..e312181f08512 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -183,7 +183,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, Returns ------- - a filepath_ or buffer or S3File instance, the encoding, the compression + tuple of ({a filepath_ or buffer or S3File instance}, + encoding, str, + compression, str, + should_close, bool) """ filepath_or_buffer = _stringify_path(filepath_or_buffer) @@ -194,7 +197,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - return reader, encoding, compression + req.close() + return reader, encoding, compression, True if is_s3_url(filepath_or_buffer): from pandas.io import s3 @@ -206,13 +210,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression + return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = "Invalid file path or buffer object type: {_type}" raise ValueError(msg.format(_type=type(filepath_or_buffer))) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, False def file_path_to_url(path): @@ -309,6 +313,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.) + Returns ------- f : file-like diff --git a/pandas/io/excel.py b/pandas/io/excel.py index b03987e933bff..0d3d4286f5a3c 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -381,7 +381,7 @@ def __init__(self, io, **kwds): if _is_url(self._io): io = _urlopen(self._io) elif not isinstance(self.io, (ExcelFile, xlrd.Book)): - io, _, _ = get_filepath_or_buffer(self._io) + io, _, _, _ = get_filepath_or_buffer(self._io) if engine == 'xlrd' and isinstance(io, xlrd.Book): self.book = io diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index e3a1321336fb3..24364fe07405e 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -404,7 +404,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, """ compression = _infer_compression(path_or_buf, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, ) @@ -419,7 +419,13 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if chunksize: return json_reader - return json_reader.read() + result = json_reader.read() + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return result class JsonReader(BaseIterator): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 9289853a1bbfd..d3e6f0cf4a1bc 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -180,7 +180,7 @@ def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs): obj : type of object stored in file """ - path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) @@ -188,6 +188,12 @@ def read(fh): l = list(unpack(fh, encoding=encoding, **kwargs)) if len(l) == 1: return l[0] + + if should_close: + try: + path_or_buf.close() + except: # noqa: flake8 + pass return l # see if we have an actual file diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6e1b6e14861c3..1c22a305c089d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -107,7 +107,7 @@ def write(self, df, path, compression='snappy', self.validate_dataframe(df) if self._pyarrow_lt_070: self._validate_write_lt_070(df) - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode='wb') if self._pyarrow_lt_060: table = self.api.Table.from_pandas(df, timestamps_to_ms=True) @@ -121,13 +121,21 @@ def write(self, df, path, compression='snappy', coerce_timestamps=coerce_timestamps, **kwargs) def read(self, path, columns=None, **kwargs): - path, _, _ = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer(path) if self._pyarrow_lt_070: - return self.api.parquet.read_pandas(path, columns=columns, - **kwargs).to_pandas() - kwargs['use_pandas_metadata'] = True - return self.api.parquet.read_table(path, columns=columns, - **kwargs).to_pandas() + result = self.api.parquet.read_pandas(path, columns=columns, + **kwargs).to_pandas() + else: + kwargs['use_pandas_metadata'] = True + result = self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() + if should_close: + try: + path.close() + except: # noqa: flake8 + pass + + return result def _validate_write_lt_070(self, df): # Compatibility shim for pyarrow < 0.7.0 @@ -199,11 +207,11 @@ def write(self, df, path, compression='snappy', **kwargs): # path is s3:// so we need to open the s3file in 'wb' mode. # TODO: Support 'ab' - path, _, _ = get_filepath_or_buffer(path, mode='wb') + path, _, _, _ = get_filepath_or_buffer(path, mode='wb') # And pass the opened s3file to the fastparquet internal impl. kwargs['open_with'] = lambda path, _: path else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): self.api.write(path, df, @@ -214,13 +222,13 @@ def read(self, path, columns=None, **kwargs): # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. - s3, _, _ = get_filepath_or_buffer(path) + s3, _, _, should_close = get_filepath_or_buffer(path) try: parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) finally: s3.close() else: - path, _, _ = get_filepath_or_buffer(path) + path, _, _, _ = get_filepath_or_buffer(path) parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index af1441f4a0fc9..7ea6d321e0fdd 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -413,7 +413,7 @@ def _read(filepath_or_buffer, kwds): compression = kwds.get('compression') compression = _infer_compression(filepath_or_buffer, compression) - filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( filepath_or_buffer, encoding, compression) kwds['compression'] = compression @@ -439,6 +439,13 @@ def _read(filepath_or_buffer, kwds): data = parser.read(nrows) finally: parser.close() + + if should_close: + try: + filepath_or_buffer.close() + except: # noqa: flake8 + pass + return data diff --git a/pandas/io/s3.py b/pandas/io/s3.py index e2650e29c0db3..bd2286c5c8569 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -27,7 +27,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, fs = s3fs.S3FileSystem(anon=False) try: filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - except (OSError, NoCredentialsError): + except (compat.FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... # An OSError is raised if you have credentials, but they @@ -36,4 +36,4 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # for that bucket. fs = s3fs.S3FileSystem(anon=True) filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - return filepath_or_buffer, None, compression + return filepath_or_buffer, None, compression, True diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 26e39f0df8b29..806cbddaa2ee2 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -90,7 +90,7 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, compat.string_types): self._path_or_buf = open(self._path_or_buf, 'rb') self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index c14524f7d7cd6..7994517b9f303 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -236,7 +236,8 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - filepath_or_buffer, encoding, compression = get_filepath_or_buffer( + (filepath_or_buffer, encoding, + compression, should_close) = get_filepath_or_buffer( filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, compat.text_type, bytes)): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ee6975ea1d938..9646831cb612c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -988,7 +988,7 @@ def __init__(self, path_or_buf, convert_dates=True, self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _ = get_filepath_or_buffer( + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( path_or_buf, encoding=self._default_encoding ) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 57e72da2fd3f4..8deb51e190bab 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -2,30 +2,34 @@ import pytest from pandas.io.parsers import read_table +from pandas.util import testing as tm -HERE = os.path.dirname(__file__) +@pytest.fixture +def parser_data(request): + return os.path.join(tm.get_data_path(), '..', 'parser', 'data') -@pytest.fixture(scope='module') -def tips_file(): + +@pytest.fixture +def tips_file(parser_data): """Path to the tips dataset""" - return os.path.join(HERE, 'parser', 'data', 'tips.csv') + return os.path.join(parser_data, 'tips.csv') -@pytest.fixture(scope='module') -def jsonl_file(): +@pytest.fixture +def jsonl_file(parser_data): """Path a JSONL dataset""" - return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + return os.path.join(parser_data, 'items.jsonl') -@pytest.fixture(scope='module') -def salaries_table(): +@pytest.fixture +def salaries_table(parser_data): """DataFrame with the salaries dataset""" - path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + path = os.path.join(parser_data, 'salaries.csv') return read_table(path) -@pytest.fixture(scope='module') +@pytest.fixture def s3_resource(tips_file, jsonl_file): """Fixture for mocking S3 interaction. @@ -41,8 +45,8 @@ def s3_resource(tips_file, jsonl_file): is yielded by the fixture. """ pytest.importorskip('s3fs') + boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') - moto.mock_s3().start() test_s3_files = [ ('tips.csv', tips_file), @@ -58,17 +62,22 @@ def add_tips_files(bucket_name): Key=s3_key, Body=f) - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' + try: - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) + s3 = moto.mock_s3() + s3.start() - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') + # see gh-16135 + bucket = 'pandas-test' + conn = boto3.resource("s3", region_name="us-east-1") - yield conn + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) - moto.mock_s3().stop() + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + yield conn + except: # noqa: flake8 + pytest.skip("failure to use s3 resource") + finally: + s3.stop() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10139eb07a925..a72744e08fa7c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1039,7 +1039,6 @@ def test_read_inline_jsonl(self): assert_frame_equal(result, expected) def test_read_s3_jsonl(self, s3_resource): - pytest.importorskip('s3fs') # GH17200 result = read_json('s3n://pandas-test/items.jsonl', lines=True) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 10f6cef04b593..f16338fda6245 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,6 +46,7 @@ def check_compressed_urls(salaries_table, compression, extension, mode, class TestS3(object): + @tm.network def test_parse_public_s3_bucket(self): pytest.importorskip('s3fs') @@ -65,7 +66,8 @@ def test_parse_public_s3_bucket(self): assert not df.empty tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) - def test_parse_public_s3n_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3n_bucket(self): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) @@ -74,7 +76,8 @@ def test_parse_public_s3n_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3a_bucket(self, s3_resource): + @tm.network + def test_parse_public_s3a_bucket(self): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) @@ -82,7 +85,8 @@ def test_parse_public_s3a_bucket(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) @@ -91,7 +95,8 @@ def test_parse_public_s3_bucket_nrows(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked(self): # Read with a chunksize chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -109,7 +114,8 @@ def test_parse_public_s3_bucket_chunked(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_chunked_python(self): # Read with a chunksize using the Python parser chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) @@ -127,7 +133,8 @@ def test_parse_public_s3_bucket_chunked_python(self, s3_resource): chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) @@ -136,7 +143,8 @@ def test_parse_public_s3_bucket_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_infer_s3_compression(self, s3_resource): + @tm.network + def test_infer_s3_compression(self): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') @@ -145,7 +153,8 @@ def test_infer_s3_compression(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')), df) - def test_parse_public_s3_bucket_nrows_python(self, s3_resource): + @tm.network + def test_parse_public_s3_bucket_nrows_python(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) @@ -154,7 +163,8 @@ def test_parse_public_s3_bucket_nrows_python(self, s3_resource): tm.assert_frame_equal(read_csv( tm.get_data_path('tips.csv')).iloc[:10], df) - def test_s3_fails(self, s3_resource): + @tm.network + def test_s3_fails(self): with pytest.raises(IOError): read_csv('s3://nyqpug/asdf.csv') diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a0070dce6a7f1..a89156db38ae3 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -102,15 +102,19 @@ def test_infer_compression_from_path(self, extension, expected, path_type): def test_get_filepath_or_buffer_with_path(self): filename = '~/sometest' - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + filename) assert filepath_or_buffer != filename assert isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer + assert not should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _ = common.get_filepath_or_buffer(input_buffer) + filepath_or_buffer, _, _, should_close = common.get_filepath_or_buffer( + input_buffer) assert filepath_or_buffer == input_buffer + assert not should_close def test_iterator(self): reader = read_csv(StringIO(self.data1), chunksize=1)