diff --git a/ebmdatalab/__init__.py b/ebmdatalab/__init__.py
index 718c4d1..a144f77 100644
--- a/ebmdatalab/__init__.py
+++ b/ebmdatalab/__init__.py
@@ -1,3 +1,3 @@
 """Package for ebmdatalab jupyter notebook stuff
 """
-__version__ = '0.0.7'
+__version__ = '0.0.8'
diff --git a/ebmdatalab/bq.py b/ebmdatalab/bq.py
index 0eff785..3e8a9ca 100644
--- a/ebmdatalab/bq.py
+++ b/ebmdatalab/bq.py
@@ -1,3 +1,4 @@
+import glob
 import os
 import re
 from hashlib import md5
@@ -43,6 +44,11 @@ def cached_read(sql,
     if use_cache and already_cached:
         df = pd.read_csv(csv_path)
     else:
+        old_fingerprint_files = glob.glob(
+            os.path.join(
+                csv_dir, "." + csv_filename + ".*.tmp"))
+        for f in old_fingerprint_files:
+            os.remove(f)
         with open(fingerprint_path, "w") as f:
             f.write("File created by {}".format(__file__))
         df = pd.read_gbq(sql, **defaults)
diff --git a/ebmdatalab/test_bq.py b/ebmdatalab/test_bq.py
index 23d5177..0101130 100644
--- a/ebmdatalab/test_bq.py
+++ b/ebmdatalab/test_bq.py
@@ -46,3 +46,28 @@
         # and now with `use_cache` param
         df = bq.cached_read(sql, csv_path=csv_file.name, use_cache=False)
         assert mock_read_gbq.call_count == 2
+
+
+def _check_cached_read(csv_file, mock_read, sql, expected):
+    mock_read.return_value = expected
+    df = bq.cached_read(sql, csv_path=csv_file.name)
+    assert str(df) == str(expected)
+
+
+@patch('ebmdatalab.bq.pd.read_gbq')
+def test_old_cache_markers_removed(mock_read_gbq):
+    with tempfile.NamedTemporaryFile() as csv_file:
+        # First, cause some sql to be cached
+        inputs_and_outputs = [
+            (
+                "select * from foobar",
+                DataFrame([{'a': 1}])
+            ),
+            (
+                "select * from foobar order by id",
+                DataFrame([{'a': 2}])
+            )
+        ]
+        _check_cached_read(csv_file, mock_read_gbq, *inputs_and_outputs[0])
+        _check_cached_read(csv_file, mock_read_gbq, *inputs_and_outputs[1])
+        _check_cached_read(csv_file, mock_read_gbq, *inputs_and_outputs[0])