diff --git a/.travis.yml b/.travis.yml index 17fc69b36..2886f6fc7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,10 @@ install: - pip install pytest-timeout --upgrade - pip install pytest-xdist --upgrade - pip install setuptools-git --upgrade + - pip install boto3 --upgrade + - pip install moto --upgrade + - pip install s3fs --upgrade + - pip install fastparquet --upgrade script: - pip freeze - python setup.py test --pytest-args=-v diff --git a/arctic/pluggable/__init__.py b/arctic/pluggable/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/arctic/pluggable/_kv_ndarray_store.py b/arctic/pluggable/_kv_ndarray_store.py new file mode 100644 index 000000000..ed7ef1346 --- /dev/null +++ b/arctic/pluggable/_kv_ndarray_store.py @@ -0,0 +1,331 @@ +import logging +import hashlib + +from bson.binary import Binary +import numpy as np + +from arctic.exceptions import UnhandledDtypeException, DataIntegrityException, ArcticException +from arctic.store._version_store_utils import checksum + +from arctic._compression import compress_array, decompress +from six.moves import xrange + + +logger = logging.getLogger(__name__) + +_CHUNK_SIZE = 2 * 1024 * 1024 - 2048 # ~2 MB (a bit less for usePowerOf2Sizes) +_APPEND_SIZE = 1 * 1024 * 1024 # 1MB +_APPEND_COUNT = 60 # 1 hour of 1 min data + + +def _promote_struct_dtypes(dtype1, dtype2): + if not set(dtype1.names).issuperset(set(dtype2.names)): + raise Exception("Removing columns from dtype not handled") + + def _promote(type1, type2): + if type2 is None: + return type1 + if type1.shape is not None: + if not type1.shape == type2.shape: + raise Exception("We do not handle changes to dtypes that have shape") + return np.promote_types(type1.base, type2.base), type1.shape + return np.promote_types(type1, type2) + return np.dtype([(n, _promote(dtype1.fields[n][0], dtype2.fields.get(n, (None,))[0])) for n in dtype1.names]) + + +def _resize_with_dtype(arr, dtype): + """ + This function will transform arr into an array with the same type as dtype. It will do this by + filling new columns with zeros (or NaNs, if it is a float column). Also, columns that are not + in the new dtype will be dropped. + """ + structured_arrays = dtype.names is not None and arr.dtype.names is not None + old_columns = set(arr.dtype.names or []) + new_columns = set(dtype.names or []) + + # In numpy 1.9 the ndarray.astype method used to handle changes in number of fields. The code below + # should replicate the same behaviour the old astype used to have. + # + # One may be tempted to use np.lib.recfunctions.stack_arrays to implement both this step and the + # concatenate that follows but it 2x slower and it requires providing your own default values (instead + # of np.zeros). + # + # Numpy 1.14 supports doing new_arr[old_columns] = arr[old_columns], which is faster than the code below + # (in benchmarks it seems to be even slightly faster than using the old astype). However, that is not + # supported by numpy 1.9.2. 
+ if structured_arrays and (old_columns != new_columns): + new_arr = np.zeros(arr.shape, dtype) + for c in old_columns & new_columns: + new_arr[c] = arr[c] + + # missing float columns should default to nan rather than zero + _is_float_type = lambda _dtype: _dtype.type in (np.float32, np.float64) + _is_void_float_type = lambda _dtype: _dtype.type == np.void and _is_float_type(_dtype.subdtype[0]) + _is_float_or_void_float_type = lambda _dtype: _is_float_type(_dtype) or _is_void_float_type(_dtype) + _is_float = lambda column: _is_float_or_void_float_type(dtype.fields[column][0]) + for new_column in filter(_is_float, new_columns - old_columns): + new_arr[new_column] = np.nan + else: + new_arr = arr.astype(dtype) + + return new_arr + + +class KeyValueNdarrayStore(object): + """Chunked store for arbitrary ndarrays, supporting append. Using an arbitrary kv store backend. + + for the simple example: + dat = np.empty(10) + library.write('test', dat) #version 1 + library.append('test', dat) #version 2 + + version documents: + + [ + {u'_id': ObjectId('55fa9a7781f12654382e58b8'), + u'symbol': u'test', + u'version': 1 + u'type': u'ndarray', + u'up_to': 10, # no. of rows included in the data for this version + u'append_count': 0, + u'append_size': 0, + u'dtype': u'float64', + u'dtype_metadata': {}, + u'segment_keys': [] # sha + u'sha': Binary('.........', 0), + u'shape': [-1], + }, + + {u'_id': ObjectId('55fa9aa981f12654382e58ba'), + u'symbol': u'test', + u'version': 2 + u'type': u'ndarray', + u'up_to': 20, # no. of rows included in the data for this version + u'append_count': 1, # 1 append operation so far + u'append_size': 80, # 80 bytes appended + u'base_version_id': ObjectId('55fa9a7781f12654382e58b8'), # _id of version 1 + u'dtype': u'float64', + u'dtype_metadata': {}, + u'segment_count': 2, #2 segments included in this version + } + ] + + + segment documents: + + [ + #first chunk written: + {u'_id': ObjectId('55fa9a778b376a68efdd10e3'), + u'compressed': True, #data is lz4 compressed on write() + u'data': Binary('...........', 0), + u'parent': [ObjectId('55fa9a7781f12654382e58b8')], + u'segment': 9, #10 rows in the data up to this segment, so last row is 9 + u'sha': Binary('.............', 0), # checksum of (symbol, {'data':.., 'compressed':.., 'segment':...}) + u'symbol': u'test'}, + + #second chunk appended: + {u'_id': ObjectId('55fa9aa98b376a68efdd10e6'), + u'compressed': False, # no initial compression for append() + u'data': Binary('...........', 0), + u'parent': [ObjectId('55fa9a7781f12654382e58b8')], + u'segment': 19, #20 rows in the data up to this segment, so last row is 19 + u'sha': Binary('............', 0), # checksum of (symbol, {'data':.., 'compressed':.., 'segment':...}) + u'symbol': u'test'}, + ] + + """ + TYPE = 'ndarray' + + @classmethod + def initialize_library(cls, *args, **kwargs): + pass + + @staticmethod + def _ensure_index(collection): + pass + + def can_delete(self, version, symbol): + return self.can_read(version, symbol) + + def can_read(self, version, symbol): + return version['type'] == self.TYPE + + def can_write(self, version, symbol, data): + return isinstance(data, np.ndarray) and not data.dtype.hasobject + + def _dtype(self, string, metadata=None): + if metadata is None: + metadata = {} + if string.startswith('['): + return np.dtype(eval(string), metadata=metadata) + return np.dtype(string, metadata=metadata) + + def _index_range(self, version, symbol, from_version=None, **kwargs): + """ + Tuple describing range to read from the ndarray - closed:open + """ + 
from_index = None + if from_version: + from_index = from_version['up_to'] + return from_index, None + + def get_info(self, version): + ret = {} + dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {})) + length = int(version['up_to']) + ret['size'] = dtype.itemsize * length + ret['segment_count'] = version['segment_count'] + ret['dtype'] = version['dtype'] + ret['type'] = version['type'] + ret['handler'] = self.__class__.__name__ + ret['rows'] = int(version['up_to']) + return ret + + def read(self, backing_store, library_name, version, symbol, **kwargs): + index_range = self._index_range(version, symbol, **kwargs) + return self._do_read(backing_store, library_name, version, symbol, index_range=index_range) + + def _do_read(self, backing_store, library_name, version, symbol, index_range=None): + ''' + index_range is a 2-tuple of integers - a [from, to) range of segments to be read. + Either from or to can be None, indicating no bound. + ''' + from_index = index_range[0] if index_range else None + to_index = version['up_to'] + if index_range and index_range[1] and index_range[1] < version['up_to']: + to_index = index_range[1] + + segment_keys = version['segment_keys'] + filtered_segment_keys = [] + for i, segment_index in enumerate(version['raw_segment_index']): + if (from_index is None or segment_index >= from_index) and \ + (to_index is None or segment_index <= to_index): + filtered_segment_keys.append(segment_keys[i]) + + data = bytearray() + for segment in backing_store.read_segments(library_name, filtered_segment_keys): + data.extend(decompress(segment)) + + dtype = self._dtype(version['dtype'], version.get('dtype_metadata', {})) + rtn = np.frombuffer(data, dtype=dtype).reshape(version.get('shape', (-1))) + return rtn + + def _promote_types(self, dtype, dtype_str): + if dtype_str == str(dtype): + return dtype + prev_dtype = self._dtype(dtype_str) + if dtype.names is None: + rtn = np.promote_types(dtype, prev_dtype) + else: + rtn = _promote_struct_dtypes(dtype, prev_dtype) + rtn = np.dtype(rtn, metadata=dict(dtype.metadata or {})) + return rtn + + def check_written(self, collection, symbol, version): + # Check all the chunks are in place + seen_chunks = collection.find({'symbol': symbol, 'parent': version['_id']}, + ).count() + + if seen_chunks != version['segment_count']: + segments = [x['segment'] for x in collection.find({'symbol': symbol, 'parent': version['_id']}, + projection={'segment': 1}, + )] + raise ArcticException("Failed to write all the Chunks. 
Saw %s expecting %s" + "Parent: %s \n segments: %s" % + (seen_chunks, version['segment_count'], version['_id'], segments)) + + def checksum(self, item): + sha = hashlib.sha1() + sha.update(item.tostring()) + return Binary(sha.digest()) + + def write(self, backing_store, library_name, version, symbol, item, previous_version, dtype=None): + if item.dtype.hasobject: + raise UnhandledDtypeException() + + if not dtype: + dtype = item.dtype + version['dtype'] = str(dtype) + version['shape'] = (-1,) + item.shape[1:] + version['dtype_metadata'] = dict(dtype.metadata or {}) + version['type'] = self.TYPE + version['up_to'] = len(item) + version['sha'] = self.checksum(item) + + if previous_version: + if 'sha' in previous_version \ + and previous_version['dtype'] == version['dtype'] \ + and self.checksum(item[:previous_version['up_to']]) == previous_version['sha']: + # TODO handle appends!, currently segments will be reused to but all hashes will be recomputed + pass + # The first n rows are identical to the previous version, so just append. + # Do a 'dirty' append (i.e. concat & start from a new base version) for safety + # self._do_append(backing_store, collection, version, symbol, item[previous_version['up_to']:], + # previous_version, dirty_append=True) + + version['base_sha'] = version['sha'] + self._do_write(backing_store, library_name, version, symbol, item, previous_version) + + def _do_write(self, backing_store, library_name, version, symbol, item, previous_version, segment_offset=0): + + previous_segment_keys = [] + if previous_version: + previous_segment_keys = previous_version['segment_keys'] + + if segment_offset > 0 and 'segment_index' in previous_version: + existing_index = previous_version['segment_index'] + else: + existing_index = None + + sze = int(item.dtype.itemsize * np.prod(item.shape[1:])) + length = len(item) + + # chunk and store the data by (uncompressed) size + chunk_size = int(backing_store.chunk_size / sze) + + # Compress + idxs = xrange(int(np.ceil(float(length) / chunk_size))) + chunks = [(item[i * chunk_size: (i + 1) * chunk_size]).tostring() for i in idxs] + compressed_segments = compress_array(chunks) + + segment_keys = [] + raw_segment_index = [] + for i, segment_data in zip(idxs, compressed_segments): + segment_idx = min((i + 1) * chunk_size - 1, length - 1) + segment_offset + segment_key = backing_store.write_segment(library_name, symbol, + segment_data, previous_segment_keys) + raw_segment_index.append(segment_idx) + segment_keys.append(segment_key) + + segment_index = self._segment_index(item, existing_index=existing_index, start=segment_offset, + new_segments=raw_segment_index) + if segment_index: + version['segment_index'] = segment_index + version['raw_segment_index'] = raw_segment_index + version['segment_count'] = len(segment_keys) # on appends this value is incorrect but is updated later on + version['append_size'] = 0 + version['append_count'] = 0 + version['segment_keys'] = segment_keys + + #TODO add write check + #self.check_written(collection, symbol, version) + + def _segment_index(self, new_data, existing_index, start, new_segments): + """ + Generate a segment index which can be used in subselect data in _index_range. + This function must handle both generation of the index and appending to an existing index + + Parameters: + ----------- + new_data: new data being written (or appended) + existing_index: index field from the versions document of the previous version + start: first (0-based) offset of the new data + segments: list of offsets. 
Each offset is the row index of the + the last row of a particular chunk relative to the start of the _original_ item. + array(new_data) - segments = array(offsets in item) + + Returns: + -------- + Library specific index metadata to be stored in the version document. + """ + pass # numpy arrays have no index diff --git a/arctic/pluggable/_pandas_ndarray_store.py b/arctic/pluggable/_pandas_ndarray_store.py new file mode 100644 index 000000000..9d949adae --- /dev/null +++ b/arctic/pluggable/_pandas_ndarray_store.py @@ -0,0 +1,230 @@ +import ast +import logging + +from bson.binary import Binary +from pandas import DataFrame, Series, Panel +import numpy as np + +from arctic.serialization.numpy_records import SeriesSerializer, DataFrameSerializer +from arctic._compression import compress, decompress +from arctic.date._util import to_pandas_closed_closed +from arctic.exceptions import ArcticException +from ._kv_ndarray_store import KeyValueNdarrayStore + + +log = logging.getLogger(__name__) + +DTN64_DTYPE = 'datetime64[ns]' + +INDEX_DTYPE = [('datetime', DTN64_DTYPE), ('index', 'i8')] + + +class PandasStore(KeyValueNdarrayStore): + + def _segment_index(self, recarr, existing_index, start, new_segments): + """ + Generate index of datetime64 -> item offset. + + Parameters: + ----------- + new_data: new data being written (or appended) + existing_index: index field from the versions document of the previous version + start: first (0-based) offset of the new data + segments: list of offsets. Each offset is the row index of the + the last row of a particular chunk relative to the start of the _original_ item. + array(new_data) - segments = array(offsets in item) + + Returns: + -------- + Binary(compress(array([(index, datetime)])) + Where index is the 0-based index of the datetime in the DataFrame + """ + # find the index of the first datetime64 column + idx_col = self._datetime64_index(recarr) + # if one exists let's create the index on it + if idx_col is not None: + new_segments = np.array(new_segments, dtype='i8') + last_rows = recarr[new_segments - start] + # create numpy index + index = np.core.records.fromarrays([last_rows[idx_col]] + + [new_segments, ], + dtype=INDEX_DTYPE) + # append to existing index if exists + if existing_index: + # existing_index_arr is read-only but it's never written to + existing_index_arr = np.frombuffer(decompress(existing_index), dtype=INDEX_DTYPE) + if start > 0: + existing_index_arr = existing_index_arr[existing_index_arr['index'] < start] + index = np.concatenate((existing_index_arr, index)) + return Binary(compress(index.tostring())) + elif existing_index: + raise ArcticException("Could not find datetime64 index in item but existing data contains one") + return None + + def _datetime64_index(self, recarr): + """ Given a np.recarray find the first datetime64 column """ + # TODO: Handle multi-indexes + names = recarr.dtype.names + for name in names: + if recarr[name].dtype == DTN64_DTYPE: + return name + return None + + def _index_range(self, version, symbol, date_range=None, **kwargs): + """ Given a version, read the segment_index and return the chunks associated + with the date_range. As the segment index is (id -> last datetime) + we need to take care in choosing the correct chunks. 
""" + if date_range and 'segment_index' in version: + # index is read-only but it's never written to + index = np.frombuffer(decompress(version['segment_index']), dtype=INDEX_DTYPE) + dtcol = self._datetime64_index(index) + if dtcol and len(index): + dts = index[dtcol] + start, end = _start_end(date_range, dts) + if start > dts[-1]: + return -1, -1 + idxstart = min(np.searchsorted(dts, start), len(dts) - 1) + idxend = min(np.searchsorted(dts, end, side='right'), len(dts) - 1) + return int(index['index'][idxstart]), int(index['index'][idxend] + 1) + return super(PandasStore, self)._index_range(version, symbol, **kwargs) + + def _daterange(self, recarr, date_range): + """ Given a recarr, slice out the given artic.date.DateRange if a + datetime64 index exists """ + idx = self._datetime64_index(recarr) + if idx and len(recarr): + dts = recarr[idx] + mask = Series(np.zeros(len(dts)), index=dts) + start, end = _start_end(date_range, dts) + mask[start:end] = 1.0 + return recarr[mask.values.astype(bool)] + return recarr + + def read(self, backing_store, library_name, version, symbol, date_range=None, **kwargs): + item = super(PandasStore, self).read(backing_store, library_name, version, symbol, + date_range=date_range, **kwargs) + if date_range: + item = self._daterange(item, date_range) + return item + + def get_info(self, version): + """ + parses out the relevant information in version + and returns it to the user in a dictionary + """ + ret = super(PandasStore, self).get_info(version) + ret['col_names'] = version['dtype_metadata'] + ret['handler'] = self.__class__.__name__ + ret['dtype'] = ast.literal_eval(version['dtype']) + return ret + + +def _start_end(date_range, dts): + """ + Return tuple: [start, end] of np.datetime64 dates that are inclusive of the passed + in datetimes. 
+ """ + # FIXME: timezones + assert len(dts) + _assert_no_timezone(date_range) + date_range = to_pandas_closed_closed(date_range, add_tz=False) + start = np.datetime64(date_range.start) if date_range.start else dts[0] + end = np.datetime64(date_range.end) if date_range.end else dts[-1] + return start, end + + +def _assert_no_timezone(date_range): + for _dt in (date_range.start, date_range.end): + if _dt and _dt.tzinfo is not None: + raise ValueError("DateRange with timezone not supported") + + +class PandasSeriesStore(PandasStore): + TYPE = 'pandasseries' + SERIALIZER = SeriesSerializer() + + def can_write(self, version, symbol, data): + if isinstance(data, Series): + if data.dtype == np.object_ or data.index.dtype == np.object_: + return self.SERIALIZER.can_convert_to_records_without_objects(data, symbol) + return True + return False + + def write(self, backing_store, library_name, version, symbol, item, previous_version): + item, md = self.SERIALIZER.serialize(item) + super(PandasSeriesStore, self).write(backing_store, library_name, version, + symbol, item, previous_version, dtype=md) + + def append(self, backing_store, library_name, version, symbol, item, previous_version, **kwargs): + item, md = self.SERIALIZER.serialize(item) + super(PandasSeriesStore, self).append(backing_store, library_name, version, + symbol, item, previous_version, dtype=md, **kwargs) + + def read(self, backing_store, library_name, version, symbol, **kwargs): + item = super(PandasSeriesStore, self).read(backing_store, library_name, version, + symbol, **kwargs) + return self.SERIALIZER.deserialize(item) + + +class PandasDataFrameStore(PandasStore): + TYPE = 'pandasdf' + SERIALIZER = DataFrameSerializer() + + def can_write(self, version, symbol, data): + if isinstance(data, DataFrame): + if np.any(data.dtypes.values == 'object'): + return self.SERIALIZER.can_convert_to_records_without_objects(data, symbol) + return True + return False + + def write(self, backing_store, library_name, version, symbol, item, previous_version): + item, md = self.SERIALIZER.serialize(item) + super(PandasDataFrameStore, self).write(backing_store, library_name, version, + symbol, item, previous_version, dtype=md) + + def append(self, backing_store, library_name, version, symbol, item, previous_version, **kwargs): + item, md = self.SERIALIZER.serialize(item) + super(PandasDataFrameStore, self).append(backing_store, library_name, version, + symbol, item, previous_version, dtype=md, **kwargs) + + def read(self, backing_store, library_name, version, symbol, **kwargs): + item = super(PandasDataFrameStore, self).read(backing_store, library_name, version, + symbol, **kwargs) + return self.SERIALIZER.deserialize(item) + + +class PandasPanelStore(PandasDataFrameStore): + TYPE = 'pandaspan' + + def can_write(self, version, symbol, data): + if isinstance(data, Panel): + frame = data.to_frame(filter_observations=False) + if np.any(frame.dtypes.values == 'object'): + return self.SERIALIZER.can_convert_to_records_without_objects(frame, symbol) + return True + return False + + def write(self, backing_store, library_name, version, symbol, item, previous_version): + if np.product(item.shape) == 0: + # Currently not supporting zero size panels as they drop indices when converting to dataframes + # Plan is to find a better solution in due course. 
+ raise ValueError('Cannot insert a zero size panel into mongo.') + if not np.all(len(i.names) == 1 for i in item.axes): + raise ValueError('Cannot insert panels with multiindexes') + item = item.to_frame(filter_observations=False) + if len(set(item.dtypes)) == 1: + # If all columns have the same dtype, we support non-string column names. + # We know from above check that columns is not a multiindex. + item = DataFrame(item.stack()) + elif item.columns.dtype != np.dtype('object'): + raise ValueError('Cannot support non-object dtypes for columns') + super(PandasPanelStore, self).write(backing_store, library_name, version, symbol, item, previous_version) + + def read(self, backing_store, library_name, version, symbol, **kwargs): + item = super(PandasPanelStore, self).read(backing_store, library_name, version, symbol, **kwargs) + if len(item.index.names) == 3: + return item.iloc[:, 0].unstack().to_panel() + return item.to_panel() + + def append(self, backing_store, library_name, version, symbol, item, previous_version, **kwargs): + raise ValueError('Appending not supported for pandas.Panel') diff --git a/arctic/pluggable/_parquet_store.py b/arctic/pluggable/_parquet_store.py new file mode 100644 index 000000000..244cbd5d8 --- /dev/null +++ b/arctic/pluggable/_parquet_store.py @@ -0,0 +1,68 @@ +import logging +import pandas as pd +import numpy as np +import io +from contextlib import contextmanager + + +logger = logging.getLogger(__name__) + + +@contextmanager +def _dummy_open(file_like, _): + yield file_like + + +class ParquetStore(object): + + TYPE = 'parquet' + + @classmethod + def initialize_library(cls, *args, **kwargs): + pass + + def can_delete(self, version, symbol): + return self.can_read(version, symbol) + + def can_read(self, version, symbol): + return version['type'] == self.TYPE + + def can_write(self, version, symbol, data): + if isinstance(data, pd.DataFrame): + if np.any(data.dtypes.values == 'object'): + # TODO to a proper check to see if we can convert to parquet + pass + return True + return False + + def get_info(self, version): + ret = {'type': self.TYPE, 'handler': self.__class__.__name__} + return ret + + def read(self, backing_store, library_name, version, symbol, **kwargs): + segment_keys = version['segment_keys'] + assert len(segment_keys) == 1, "should only be one segment for parquet" + parquet_path = backing_store._make_segment_path(library_name, symbol, version['_id']) + return pd.read_parquet(parquet_path, engine='fastparquet') + + def write(self, backing_store, library_name, version, symbol, item, previous_version): + output = io.BytesIO() + item.to_parquet(output, engine='fastparquet', open_with=_dummy_open, + compression='LZ4', file_scheme='simple') + data = [output.getvalue()] + + if previous_version: + previous_segment_keys = previous_version['segment_keys'] + else: + previous_segment_keys = set() + + segment_keys = [] + for segment_data in data: + segment_key = backing_store.write_segment(library_name, symbol, + segment_data, previous_segment_keys, + version['_id']) + segment_keys.append(segment_key) + version['segment_keys'] = segment_keys + version['type'] = self.TYPE + + #TODO Check written? 
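ParquetStore above serialises the whole DataFrame into a single in-memory parquet blob and hands it to the backing store as one segment; the _dummy_open context manager is the trick that lets fastparquet write into a BytesIO buffer instead of a filesystem path. A minimal sketch of that write path on its own (the DataFrame here is hypothetical, and the call mirrors the store's own to_parquet invocation rather than adding any new API):

```python
import io
from contextlib import contextmanager

import pandas as pd


@contextmanager
def _dummy_open(file_like, _mode):
    # fastparquet calls open_with(path, 'wb'); we already hold the buffer,
    # so hand it straight back instead of opening a real file.
    yield file_like


df = pd.DataFrame({'price': [1.0, 2.0, 3.0]})
output = io.BytesIO()
# Same call as ParquetStore.write: one 'simple' (single-file) parquet blob, LZ4-compressed.
df.to_parquet(output, engine='fastparquet', open_with=_dummy_open,
              compression='LZ4', file_scheme='simple')
segment_data = output.getvalue()  # the bytes handed to backing_store.write_segment()
```

On read, ParquetStore resolves the segment path via backing_store._make_segment_path and passes it straight to pd.read_parquet rather than streaming bytes through read_segments.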
diff --git a/arctic/pluggable/_pickle_store.py b/arctic/pluggable/_pickle_store.py new file mode 100644 index 000000000..f6234eb79 --- /dev/null +++ b/arctic/pluggable/_pickle_store.py @@ -0,0 +1,60 @@ +import bson +import logging +from bson.binary import Binary +from bson.errors import InvalidDocument +from six.moves import cPickle, xrange +import io +from .._compression import decompress, compress_array + +from arctic.store._version_store_utils import checksum, pickle_compat_load +from ..exceptions import UnsupportedPickleStoreVersion + + +# new versions of chunked pickled objects MUST begin with __chunked__ +_MAGIC_CHUNKED = '__chunked__' +_MAGIC_CHUNKEDV2 = '__chunked__V2' +_CHUNK_SIZE = 15 * 1024 * 1024 # 15MB +_MAX_BSON_ENCODE = 256 * 1024 # 256K - don't fill up the version document with encoded bson + +logger = logging.getLogger(__name__) + + +class PickleStore(object): + + @classmethod + def initialize_library(cls, *args, **kwargs): + pass + + def get_info(self, version): + ret = {} + ret['type'] = 'blob' + ret['handler'] = self.__class__.__name__ + return ret + + def read(self, backing_store, library_name, version, symbol, **kwargs): + segment_keys = version['segment_keys'] + data = b''.join(decompress(s) for s in backing_store.read_segments(library_name, segment_keys)) + return pickle_compat_load(io.BytesIO(data)) + + def write(self, backing_store, library_name, version, symbol, item, previous_version): + + # Try to pickle it. This is best effort + version['blob'] = _MAGIC_CHUNKEDV2 + pickled = cPickle.dumps(item, protocol=cPickle.HIGHEST_PROTOCOL) + chunk_size = backing_store.chunk_size + + data = compress_array([pickled[i * chunk_size: (i + 1) * chunk_size] for i in xrange(int(len(pickled) / chunk_size + 1))]) + + if previous_version: + previous_segment_keys = previous_version['segment_keys'] + else: + previous_segment_keys = set() + + segment_keys = [] + for segment_data in data: + segment_key = backing_store.write_segment(library_name, symbol, + segment_data, previous_segment_keys) + segment_keys.append(segment_key) + version['segment_keys'] = segment_keys + + #TODO Check written? 
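PickleStore is the fallback handler (it becomes _default_bson_handler in generic_version_store.py below): the item is pickled, sliced into backing_store.chunk_size pieces, the slices are LZ4-compressed as a batch with compress_array, and each compressed slice is written as its own segment. A small sketch of just the slicing arithmetic, using a hypothetical 45MB pickle and the 20MB default chunk size:

```python
pickled = b'x' * (45 * 1024 * 1024)   # hypothetical 45MB pickled payload
chunk_size = 20 * 1024 * 1024         # backing_store.chunk_size (_CHUNK_SIZE default)

# Same expression as PickleStore.write: 45MB / 20MB + 1 -> 3 slices (20MB, 20MB, 5MB).
n_chunks = int(len(pickled) / chunk_size + 1)
chunks = [pickled[i * chunk_size:(i + 1) * chunk_size] for i in range(n_chunks)]

assert [len(c) // (1024 * 1024) for c in chunks] == [20, 20, 5]
assert b''.join(chunks) == pickled    # read() re-joins the decompressed segments in order
```

Note that the +1 means an exact multiple of chunk_size produces a trailing empty slice; this is harmless because the join on read is unaffected.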
diff --git a/arctic/pluggable/generic_version_store.py b/arctic/pluggable/generic_version_store.py new file mode 100644 index 000000000..55caf08d4 --- /dev/null +++ b/arctic/pluggable/generic_version_store.py @@ -0,0 +1,447 @@ +import logging + +import bson +import six + +from .._util import indent +from ..exceptions import NoDataFoundException, ArcticException +from arctic.pluggable._pickle_store import PickleStore +from arctic.store.versioned_item import VersionedItem + +logger = logging.getLogger(__name__) + +VERSION_STORE_TYPE = 'VersionStore' +_TYPE_HANDLERS = [] + + +def register_versioned_storage(storageClass, storage_args=tuple(), storage_kwargs=None): + add_handler(_TYPE_HANDLERS, storageClass, storage_args, storage_kwargs) + + +def add_handler(type_handlers, storageClass, storage_args=tuple(), storage_kwargs=None): + storage_kwargs = storage_kwargs or {} + existing_instances = [i for i, v in enumerate(type_handlers) if str(v.__class__) == str(storageClass)] + store = storageClass(*storage_args, **storage_kwargs) + if existing_instances: + for i in existing_instances: + type_handlers[i] = store + else: + type_handlers.insert(0, store) + return storageClass + +_default_bson_handler = PickleStore() + +class GenericVersionStore(object): + + def __init__(self, library_name, backing_store, bson_handler=_default_bson_handler, type_handlers=None): + self.library_name = library_name + self._backing_store = backing_store + self._bson_handler = bson_handler + if type_handlers: + self.type_handlers = [] + for th in type_handlers: + add_handler(self.type_handlers, th) + else: + self.type_handlers = _TYPE_HANDLERS + + + def __str__(self): + return """<%s at %s> +%s""" % (self.__class__.__name__, hex(id(self)), indent(str(self.library_name), 4)) + + def __repr__(self): + return str(self) + + def list_symbols(self, all_symbols=False, snapshot=None, regex=None, **kwargs): + """ + Return the symbols in this library. + + Parameters + ---------- + all_symbols : `bool` + If True returns all symbols under all snapshots, even if the symbol has been deleted + in the current version (i.e. it exists under a snapshot... Default: False + snapshot : `str` + Return the symbols available under the snapshot. + regex : `str` + filter symbols by the passed in regular expression + kwargs : + kwarg keys are used as fields to query for symbols with metadata matching + the kwargs query + + Returns + ------- + String list of symbols in the library + """ + return self._backing_store.list_symbols(self.library_name) + + def has_symbol(self, symbol, as_of=None): + """ + Return True if the 'symbol' exists in this library AND the symbol + isn't deleted in the specified as_of. + + It's possible for a deleted symbol to exist in older snapshots. + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. 
+ `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + """ + try: + version = self._backing_store.read_version(self.library_name, symbol, as_of) + except NoDataFoundException: + version = None + return version is not None + + def read_audit_log(self, symbol=None, message=None): + """ + Return the audit log associated with a given symbol + + Parameters + ---------- + symbol : `str` + symbol name for the item + """ + raise NotImplementedError() + + def list_versions(self, symbol=None, snapshot=None, latest_only=False): + """ + Return a list of versions filtered by the passed in parameters. + + Parameters + ---------- + symbol : `str` + Symbol to return versions for. If None returns versions across all + symbols in the library. + snapshot : `str` + Return the versions contained in the named snapshot + latest_only : `bool` + Only include the latest version for a specific symbol + + Returns + ------- + List of dictionaries describing the discovered versions in the library + """ + raise NotImplementedError() + + def _read_handler(self, version, symbol): + handler = None + for h in self.type_handlers: + if h.can_read(version, symbol): + handler = h + break + if handler is None: + handler = self._bson_handler + return handler + + def _write_handler(self, version, symbol, data, **kwargs): + handler = None + for h in self.type_handlers: + if h.can_write(version, symbol, data, **kwargs): + handler = h + break + if handler is None: + version['type'] = 'default' + handler = self._bson_handler + return handler + + def read(self, symbol, as_of=None, version_id=None, snapshot_id=None, date_range=None, **kwargs): + """ + Read data for the named symbol. Returns a VersionedItem object with + a data and metadata element (as passed into write). + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `datetime.datetime` + Return the data as it was as_of at that point in time. + version_id : `str` + Return the specific version + snapshot_id : `str` + Return the specific version contained in the referenced snapshot + date_range: `arctic.date.DateRange` + DateRange to read data for. Applies to Pandas data, with a DateTime index + returns only the part of the data that falls in the DateRange. + + Returns + ------- + VersionedItem namedtuple which contains a .data and .metadata element + """ + _version = self._backing_store.read_version(self.library_name, symbol, as_of, version_id, snapshot_id) + return self._do_read(symbol, _version, date_range=date_range, **kwargs) + + def get_info(self, symbol, as_of=None): + """ + Reads and returns information about the data stored for symbol + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. 
+ `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + + Returns + ------- + dictionary of the information (specific to the type of data) + """ + version = self._backing_store.read_version(self.library_name, symbol, as_of) + handler = self._read_handler(version, symbol) + if handler and hasattr(handler, 'get_info'): + return handler.get_info(version) + return {} + + def _do_read(self, symbol, version, from_version=None, **kwargs): + if version.get('deleted'): + raise NoDataFoundException("No data found for %s in library %s" % (symbol, self._arctic_lib.get_name())) + handler = self._read_handler(version, symbol) + data = handler.read(self._backing_store, self.library_name, version, symbol, from_version=from_version, **kwargs) + return VersionedItem(symbol=symbol, library=self.library_name, version=version['version'], + metadata=version.pop('metadata', None), data=data) + + def read_metadata(self, symbol, as_of=None, allow_secondary=None): + """ + Return the metadata saved for a symbol. This method is fast as it doesn't + actually load the data. + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or int or `datetime.datetime` + Return the data as it was as_of the point in time. + `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + allow_secondary : `bool` or `None` + Override the default behavior for allowing reads from secondary members of a cluster: + `None` : use the settings from the top-level `Arctic` object used to query this version store. + `True` : allow reads from secondary members + `False` : only allow reads from primary members + """ + _version = self._backing_store.read_version(self.library_name, symbol, as_of) + return VersionedItem(symbol=symbol, library=self.library_name, version=_version['version'], + metadata=_version.pop('metadata', None), data=None) + + def _insert_version(self, version): + try: + self._backing_store.write_version(self.library_name, version['symbol'], version) + except DuplicateKeyError as err: + logger.exception(err) + raise OperationFailure("A version with the same _id exists, force a clean retry") + + + def append(self, symbol, data, metadata=None, prune_previous_version=True, upsert=True, **kwargs): + """ + Append 'data' under the specified 'symbol' name to this library. + The exact meaning of 'append' is left up to the underlying store implementation. + + Parameters + ---------- + symbol : `str` + symbol name for the item + data : + to be persisted + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. + Default: True + upsert : `bool` + Write 'data' if no previous version exists. + """ + raise NotImplementedError() + + def write(self, symbol, data, metadata=None, prune_previous_version=True, **kwargs): + """ + Write 'data' under the specified 'symbol' name to this library. + + Parameters + ---------- + symbol : `str` + symbol name for the item + data : + to be persisted + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + Default: None + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. 
+ Default: True + kwargs : + passed through to the write handler + + Returns + ------- + VersionedItem named tuple containing the metadata and version number + of the written symbol in the store. + """ + _id = bson.ObjectId() + version = {'_id': _id, 'symbol': symbol, 'metadata': metadata} + + previous_version = self._backing_store.read_version(self.library_name, symbol) + + handler = self._write_handler(version, symbol, data, **kwargs) + handler.write(self._backing_store, self.library_name, version, symbol, data, previous_version, **kwargs) + + #if prune_previous_version and previous_version: + # self._prune_previous_versions(symbol) + + # self._publish_change(symbol, version) + + # Insert the new version into the version DB + self._insert_version(version) + + logger.debug('Finished writing versions for %s', symbol) + + return VersionedItem(symbol=symbol, library=self.library_name, version=version['version'], + metadata=version.pop('metadata', None), data=None) + + def write_metadata(self, symbol, metadata, prune_previous_version=True, **kwargs): + """ + Write 'metadata' under the specified 'symbol' name to this library. + The data will remain unchanged. A new version will be created. + If the symbol is missing, it causes a write with empty data (None, pickled, can't append) + and the supplied metadata. + Returns a VersionedItem object only with a metadata element. + Fast operation: Zero data/segment read/write operations. + + Parameters + ---------- + symbol : `str` + symbol name for the item + metadata : `dict` or `None` + dictionary of metadata to persist along with the symbol + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. + Default: True + kwargs : + passed through to the write handler (only used if symbol does not already exist or is deleted) + + Returns + ------- + `VersionedItem` + VersionedItem named tuple containing the metadata of the written symbol's version document in the store. + """ + raise NotImplementedError() + + def restore_version(self, symbol, as_of, prune_previous_version=True): + """ + Restore the specified 'symbol' data and metadata to the state of a given version/snapshot/date. + Returns a VersionedItem object only with a metadata element. + Fast operation: Zero data/segment read/write operations. + + Parameters + ---------- + symbol : `str` + symbol name for the item + as_of : `str` or `int` or `datetime.datetime` + Return the data as it was as_of the point in time. + `int` : specific version number + `str` : snapshot name which contains the version + `datetime.datetime` : the version of the data that existed as_of the requested point in time + prune_previous_version : `bool` + Removes previous (non-snapshotted) versions from the database. + Default: True + + Returns + ------- + `VersionedItem` + VersionedItem named tuple containing the metadata of the written symbol's version document in the store. + """ + # if version/snapshot/data supplied in "as_of" does not exist, will fail fast with NoDataFoundException + raise NotImplementedError() + + def delete(self, symbol): + """ + Delete all versions of the item from the current library which aren't + currently part of some snapshot. + + Parameters + ---------- + symbol : `str` + symbol name to delete + """ + raise self._backing_store.delete_symbol(self.library_name, symbol) + + def snapshot(self, snap_name, metadata=None, skip_symbols=None, versions=None): + """ + Snapshot versions of symbols in the library. 
Can be used like: + + Parameters + ---------- + snap_name : `str` + name of the snapshot + metadata : `dict` + an optional dictionary of metadata to persist along with the symbol. + skip_symbols : `collections.Iterable` + optional symbols to be excluded from the snapshot + versions: `dict` + an optional dictionary of versions of the symbols to be snapshot + """ + self._backing_store.snapshot(self.library_name, snap_name, metadata=None, skip_symbols=None, versions=None) + + + def delete_snapshot(self, snap_name): + """ + Delete a named snapshot + + Parameters + ---------- + symbol : `str` + The snapshot name to delete + """ + self._backing_store.delete_snapshot(self.library_name, snap_name) + + + def list_snapshots(self): + """ + List the snapshots in the library + + Returns + ------- + string list of snapshot names + """ + raise self._backing_store.list_snapshots(self.library_name) + + + def stats(self): + """ + Return storage statistics about the library + + Returns + ------- + dictionary of storage stats + """ + raise NotImplementedError() + + def _fsck(self, dry_run): + """ + Run a consistency check on this VersionStore library. + """ + raise NotImplementedError() + + def _cleanup_orphaned_chunks(self, dry_run): + """ + Fixes any chunks who have parent pointers to missing versions. + Removes the broken parent pointer and, if there are no other parent pointers for the chunk, + removes the chunk. + """ + raise NotImplementedError() + + def _cleanup_orphaned_versions(self, dry_run): + """ + Fixes any versions who have parent pointers to missing snapshots. + Note, doesn't delete the versions, just removes the parent pointer if it no longer + exists in snapshots. + """ + raise NotImplementedError() diff --git a/arctic/pluggable/key_value_datastore.py b/arctic/pluggable/key_value_datastore.py new file mode 100644 index 000000000..82ff01b4a --- /dev/null +++ b/arctic/pluggable/key_value_datastore.py @@ -0,0 +1,401 @@ +import hashlib +import six +from six.moves import cPickle +import io +from six.moves import xrange +import os +import glob +import errno + +import numpy as np +import boto3 +from bson import BSON, objectid +import pandas as pd + + +from arctic._compression import compress_array, decompress + + +_CHUNK_SIZE = 20 * 1024 * 1024 # 20Mb + + +def _check_bucket(client, bucket_name): + response = client.get_bucket_versioning(Bucket=bucket_name) + if 'Status' in response and response['Status'] == 'Enabled': + return + else: + raise ValueError("Bucket {} is not setup correctly." + " Does it exist and is versioning enabled?".format(bucket_name)) + + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def safe_open_wb(path): + ''' Open "path" for writing, creating any parent directories as needed.''' + mkdir_p(os.path.dirname(path)) + return open(path, 'wb') + + +class FileBasedKeyValueStore(object): + """File backed store for use with GenericVersionStore. 
+ + Uses directory structure: + + {storage_directory}/{library_name}/symbols/{symbol}/{version}.bson - version documents + {storage_directory}/{library_name}/data/{symbol}/{version}.parquet - segment documents + {storage_directory}/{library_name}/snapshots/{snapname}.bson - snapshot documents + """ + def __init__(self, storage_directory, chunk_size=_CHUNK_SIZE): + self.storage_directory = storage_directory + self.chunk_size = chunk_size + + def write_version(self, library_name, symbol, version_doc): + version_id = str(objectid.ObjectId()) + version_path = self._make_version_path(library_name, symbol, version_id) + encoded_version_doc = BSON.encode(version_doc) + with safe_open_wb(version_path) as f: + f.write(encoded_version_doc) + version_doc['version'] = version_id + + def list_versions(self, library_name, symbol): + symbol_path = self._make_symbol_path(library_name, symbol) + version_paths = glob.glob(symbol_path + '*.bson') + results = [] + # TODO lots of room to improve this... + for p in version_paths: + filename = os.path.basename(p) + version = os.path.splitext(filename)[0] + modified = os.path.getmtime(p) + results.append((version, filename, p, modified)) + columns = ['version', 'filename', 'path', 'modified'] + # TODO maybe handle case where we need the latest version seperately to avoid a large sort. + return pd.DataFrame(results, columns=columns).sort_values('modified').reset_index() + + def list_symbols(self, library_name): + base_symbols_path = self._make_base_symbols_path(library_name) + _, dirs, _ = next(os.walk(base_symbols_path), ([], [], [])) + + def mtime(d): + return os.path.getmtime(os.path.join(base_symbols_path, d)) + + return sorted(dirs, key=mtime) + + def delete_symbol(self, library_name, symbol): + """Soft deletes a symbol - no data is removed, snapshots still work. + + Other cleanup jobs needed to reclaim the storage. + """ + raise NotImplementedError() + + def delete_snapshot(self, library_name, snap_name): + """Soft deletes a snapshot. 
Versions in the snapshot are unaffected""" + raise NotImplementedError() + + def read_version(self, library_name, symbol, as_of=None, version_id=None, snapshot_id=None): + if isinstance(as_of, str): + # TODO remove temp bodge to handle overloading of as_of + version_id = as_of + as_of = None + version_id = self._find_version(library_name, symbol, as_of, version_id, snapshot_id) + if version_id is None: + return None + version_path = self._make_version_path(library_name, symbol, version_id) + try: + with open(version_path, 'rb') as f: + version_doc = BSON(f.read()).decode() + except FileNotFoundError: + return None + + version_doc['version'] = version_id + return version_doc + + def _find_version(self, library_name, symbol, as_of, version_id, snapshot_id): + if sum(v is not None for v in [as_of, version_id, snapshot_id]) > 1: + raise ValueError('Only one of as_of, version_id, snapshot_id should be specified') + + if version_id: + return version_id + elif as_of: + # getting all versions will get slow with many versions - look into filtering in S3 using as_of date + versions = self.list_versions(library_name, symbol) + valid_versions = versions.loc[versions['modified'] <= as_of, 'version'] + if len(valid_versions) == 0: + raise KeyError('No versions found for as_of {} for symbol: {}, library {}'.format(as_of, + symbol, + library_name)) + else: + return valid_versions.iloc[-1] + elif snapshot_id: + return self._read_snapshot(library_name, snapshot_id)['versions'][symbol] + else: + # default case return latest version + versions = self.list_versions(library_name, symbol)['version'] + if len(versions) > 0: + return versions.iat[-1] + else: + return None + + def write_segment(self, library_name, symbol, segment_data, previous_segment_keys=set(), version_id=None): + if version_id is None: + version_id = checksum(symbol, segment_data) + segment_path = self._make_segment_path(library_name, symbol, version_id) + + # optimisation so we don't rewrite identical segments + # checking if segment already exists might be expensive. 
+ if segment_path not in previous_segment_keys: + with safe_open_wb(segment_path) as f: + f.write(segment_data) + return segment_path + + def read_segments(self, library_name, segment_keys): + for k in segment_keys: + with open(k, 'rb') as f: + yield f.read() + + def snapshot(self, library_name, snap_name, metadata=None, skip_symbols=None, versions=None): + snapshot_path = self._make_snaphot_path(library_name, snap_name) + key_version_mapping = self._list_all_versions(library_name) + if versions: + key_version_mapping.update(versions) + if skip_symbols: + for ss in skip_symbols: + if ss in key_version_mapping: + del key_version_mapping[ss] + snapshot_dict = {'versions': key_version_mapping, 'metadata': metadata or {}} + encoded_snap = BSON.encode(snapshot_dict) + with safe_open_wb(snapshot_path) as f: + f.write(encoded_snap) + + def _list_all_versions(self, library_name): + base_symbols_path = self._make_base_symbols_path(library_name) + _, symbols, _ = next(os.walk(base_symbols_path), ([], [], [])) + symbol_versions = {} + for s in symbols: + latest = self.list_versions(library_name, s)['version'].iat[-1] + symbol_versions[s] = latest + return symbol_versions + + def _read_snapshot(self, library_name, snap_name): + snapshot_path = self._make_snaphot_path(library_name, snap_name) + with open(snapshot_path, 'rb') as f: + snapshot = BSON(f.read()).decode() + return snapshot + + def list_snapshots(self, library_name): + base_snaphot_path = self._make_base_snaphot_path(library_name) + snapshot_paths = glob.glob(base_snaphot_path + '*.bson') + return snapshot_paths + + def _make_segment_path(self, library_name, symbol, version_id): + return '{}/{}/segments/{}/{}.parquet'.format(self.storage_directory, library_name, symbol, version_id) + + def _make_base_symbols_path(self, library_name): + return '{}/{}/symbols/'.format(self.storage_directory, library_name) + + def _make_version_path(self, library_name, symbol, version_id): + return '{}/{}/symbols/{}/{}.bson'.format(self.storage_directory, library_name, symbol, version_id) + + def _make_symbol_path(self, library_name, symbol): + return '{}/{}/symbols/{}/'.format(self.storage_directory, library_name, symbol) + + def _make_base_symbols_path(self, library_name): + return '{}/{}/symbols/'.format(self.storage_directory, library_name) + + def _extract_symbol_from_path(self, base_symbols_path, symbol_path): + return symbol_path.replace(base_symbols_path, '').replace('/', '').replace('.bson', '') + + def _make_snaphot_path(self, library_name, snapshot_name): + return '{}/{}/snapshots/{}.bson'.format(self.storage_directory, library_name, snapshot_name) + + def _make_base_snaphot_path(self, library_name): + return '{}/{}/snapshots/'.format(self.storage_directory, library_name) + + +class S3KeyValueStore(object): + """S3 Store for use with GenericVersionStore. + + Uses object key format: + + /{library_name}/symbols/{symbol}.bson - version documents + /{library_name}/segments/{symbol}/{segment_hash} - segment documents + /{library_name}/snapshots/{snapname}.bson - snapshot documents + + """ + # TODO should KV Stores be responsible for ID creation? 
+ + def __init__(self, bucket, chunk_size=_CHUNK_SIZE): + self.client = boto3.client('s3') + self.bucket = bucket + _check_bucket(self.client, self.bucket) + self.chunk_size = chunk_size + + def write_version(self, library_name, symbol, version_doc): + version_path = self._make_version_path(library_name, symbol) + encoded_version_doc = BSON.encode(version_doc) + put_result = self.client.put_object(Body=encoded_version_doc, Bucket=self.bucket, Key=version_path) + version_doc['version'] = put_result['VersionId'] + + def list_versions(self, library_name, symbol): + version_path = self._make_version_path(library_name, symbol) + paginator = self.client.get_paginator("list_object_versions") + results = [] + for page in paginator.paginate(Bucket=self.bucket, Prefix=version_path): + results.append(pd.DataFrame(page['Versions'])) + # TODO decide appropriate generic response format + result_df = pd.concat(results) + return result_df.rename(columns={'VersionId': 'version', 'LastModified': 'modified'}) + + def list_symbols(self, library_name): + base_symbols_path = self._make_base_symbols_path(library_name) + # TODO handle snapshots etc. + results = [] + paginator = self.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=self.bucket, Delimiter='/', Prefix=base_symbols_path): + results.extend((self._extract_symbol_from_path(base_symbols_path, p['Key']) for p in page['Contents'])) + return results + + def delete_symbol(self, library_name, symbol): + """Soft deletes a symbol - no data is removed, snapshots still work. + + Other cleanup jobs needed to reclaim the storage. + """ + version_path = self._make_version_path(library_name, symbol) + self.client.delete_object(Bucket=self.bucket, Key=version_path) + + def delete_snapshot(self, library_name, snap_name): + """Soft deletes a snapshot. Versions in the snapshot are unaffected""" + snapshot_path = self._make_snaphot_path(library_name, snap_name) + self.client.delete_object(Bucket=self.bucket, Key=snapshot_path) + + def read_version(self, library_name, symbol, as_of=None, version_id=None, snapshot_id=None): + version_path = self._make_version_path(library_name, symbol) + get_object_args = dict(Bucket=self.bucket, Key=version_path) + if any([as_of, version_id, snapshot_id]): + get_object_args['VersionId'] = self._find_version(library_name, symbol, as_of, version_id, snapshot_id) + try: + encoded_version_doc = self.client.get_object(**get_object_args) + except self.client.exceptions.NoSuchKey: + return None + version_doc = BSON(encoded_version_doc['Body'].read()).decode() + version_doc['version'] = encoded_version_doc['VersionId'] + return version_doc + + def write_segment(self, library_name, symbol, segment_data, previous_segment_keys=set(), version_id=None): + segment_hash = checksum(symbol, segment_data) + segment_path = self._make_segment_path(library_name, symbol, segment_hash) + + # optimisation so we don't rewrite identical segments + # checking if segment already exists might be expensive. 
+ if segment_path not in previous_segment_keys: + self.client.put_object(Body=segment_data, Bucket=self.bucket, Key=segment_path) + return segment_path + + def read_segments(self, library_name, segment_keys): + for k in segment_keys: + yield self.client.get_object(Bucket=self.bucket, Key=k)['Body'].read() + + def snapshot(self, library_name, snap_name, metadata=None, skip_symbols=None, versions=None): + snapshot_path = self._make_snaphot_path(library_name, snap_name) + symbols_path = self._make_base_symbols_path(library_name) + latest_versions_df = self._list_all_versions(library_name) + + symbols = latest_versions_df.loc[:, 'Key'].apply(lambda x: self._extract_symbol_from_path(symbols_path, x)) + latest_versions = latest_versions_df.set_index(symbols).loc[:, 'VersionId'] + if skip_symbols: + latest_versions = latest_versions.drop(labels=skip_symbols, errors='ignore') + key_version_mapping = latest_versions.to_dict() + if versions: + key_version_mapping.update(versions) + + snapshot_dict = {'versions': key_version_mapping, 'metadata': metadata or {}} + encoded_snap = BSON.encode(snapshot_dict) + self.client.put_object(Body=encoded_snap, Bucket=self.bucket, Key=snapshot_path) + + def _read_snapshot(self, library_name, snap_name): + snapshot_path = self._make_snaphot_path(library_name, snap_name) + get_object_args = dict(Bucket=self.bucket, Key=snapshot_path) + encoded_snapshot = self.client.get_object(**get_object_args) + snapshot = BSON(encoded_snapshot['Body'].read()).decode() + return snapshot + + def list_snapshots(self, library_name): + base_snaphot_path = self._make_base_snaphot_path(library_name) + results = [] + paginator = self.client.get_paginator("list_objects_v2") + for page in paginator.paginate(Bucket=self.bucket, Delimiter='/', Prefix=base_snaphot_path): + results.extend((self._extract_symbol_from_path(base_snaphot_path, p['Key']) for p in page['Contents'])) + return results + + def _list_all_versions(self, library_name): + symbols_path = self._make_base_symbols_path(library_name) + results = [] + + paginator = self.client.get_paginator("list_object_versions") + version_iterator = paginator.paginate(Bucket=self.bucket, Prefix=symbols_path) + filtered_iterator = version_iterator.search("Versions[?IsLatest][]") + results.extend(filtered_iterator) + # TODO decide appropriate generic response format + return pd.DataFrame(results) + + def _find_version(self, library_name, symbol, as_of, version_id, snapshot_id): + if sum(v is not None for v in [as_of, version_id, snapshot_id]) > 1: + raise ValueError('Only one of as_of, version_id, snapshot_id should be specified') + + if version_id: + return version_id + elif as_of: + # getting all versions will get slow with many versions - look into filtering in S3 using as_of date + versions = self.list_versions(library_name, symbol) + valid_versions = versions.loc[versions['modified'] <= as_of, 'version'] + if len(valid_versions) == 0: + raise KeyError('No versions found for as_of {} for symbol: {}, library {}'.format(as_of, + symbol, + library_name)) + else: + return valid_versions.iloc[-1] + elif snapshot_id: + return self._read_snapshot(library_name, snapshot_id)['versions'][symbol] + else: + raise ValueError('One of as_of, version_id, snapshot_id should be specified') + + def _make_base_symbols_path(self, library_name): + return '{}/symbols/'.format(library_name) + + def _make_version_path(self, library_name, symbol): + return '{}/symbols/{}.bson'.format(library_name, symbol) + + def _extract_symbol_from_path(self, base_symbols_path, 
symbol_path): + return symbol_path.replace(base_symbols_path, '').replace('/', '').replace('.bson', '') + + def _make_segment_path(self, library_name, symbol, segment_hash): + return '{}/segments/{}/{}'.format(library_name, symbol, segment_hash) + + def _make_snaphot_path(self, library_name, snapshot_name): + return '{}/snapshots/{}.bson'.format(library_name, snapshot_name) + + def _make_base_snaphot_path(self, library_name): + return '{}/snapshots/'.format(library_name) + + +def checksum(symbol, data): + sha = hashlib.sha1() + sha.update(symbol.encode('ascii')) + + if isinstance(data, six.binary_type): + sha.update(data) + else: + sha.update(str(data).encode('ascii')) + return sha.hexdigest() + + +def _segment_key(library_name, symbol, segment_hash): + # TODO handle slashes in symbols + return "{}/{}/{}".format(library_name, symbol, segment_hash) diff --git a/setup.py b/setup.py index a65ce32a9..8b3fb955c 100644 --- a/setup.py +++ b/setup.py @@ -90,6 +90,10 @@ def run_tests(self): "tzlocal", "lz4" ], + extras_require={ + 's3': ['boto3'], + 'parquet': ['fastparquet', 's3fs'] + }, # Note: pytest >= 4.1.0 is not compatible with pytest-cov < 2.6.1. tests_require=["mock", "mockextras", @@ -98,7 +102,7 @@ def run_tests(self): "pytest-server-fixtures", "pytest-timeout", "pytest-xdist", - "lz4" + "moto" ], entry_points={'console_scripts': [ 'arctic_init_library = arctic.scripts.arctic_init_library:main', diff --git a/tests/conftest.py b/tests/conftest.py index a7c9aacc6..d11239bcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,15 @@ import warnings + +import warnings + +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=DeprecationWarning) + # TODO Remove once issues below are fixed + # Workaround for https://gitlab.com/sashahart/cookies/issues/4 + # and https://github.com/getsentry/responses/issues/186 + from cookies import Cookies # noqa + # Turn deprecation warnings into errors warnings.simplefilter('error', DeprecationWarning) diff --git a/tests/integration/pluggable/conftest.py b/tests/integration/pluggable/conftest.py new file mode 100644 index 000000000..dbb8e54cf --- /dev/null +++ b/tests/integration/pluggable/conftest.py @@ -0,0 +1,65 @@ +import pytest +import boto3 + +from moto import mock_s3 + +from arctic.pluggable.key_value_datastore import S3KeyValueStore, FileBasedKeyValueStore +from arctic.pluggable.generic_version_store import register_versioned_storage, GenericVersionStore +from arctic.pluggable._kv_ndarray_store import KeyValueNdarrayStore +from arctic.pluggable._parquet_store import ParquetStore +from arctic.pluggable._pandas_ndarray_store import PandasDataFrameStore, PandasSeriesStore, PandasPanelStore + + +@pytest.fixture +def s3_mock(): + with mock_s3(): + yield + + +@pytest.fixture() +def s3_bucket(): + return 'arctic2' + + +@pytest.fixture() +def s3_client(s3_mock): + return boto3.client('s3') + + +@pytest.fixture() +def s3_store(s3_bucket, s3_client): + s3_client.create_bucket(Bucket=s3_bucket) + s3_client.put_bucket_versioning(Bucket=s3_bucket, + VersioningConfiguration={'MFADelete': 'Disabled', + 'Status': 'Enabled'}) + return S3KeyValueStore(bucket=s3_bucket) + + +@pytest.fixture() +def file_store(tmpdir): + return FileBasedKeyValueStore(tmpdir) + +@pytest.fixture() +def generic_version_store(library_name, s3_store): + type_handlers = [KeyValueNdarrayStore, PandasPanelStore, PandasSeriesStore, PandasDataFrameStore] + return GenericVersionStore(library_name, backing_store=s3_store, type_handlers=type_handlers) + + 
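For orientation, this is roughly how the pieces wired up by the fixtures above fit together outside of pytest: a key-value backing store plus a GenericVersionStore with pandas type handlers. This is a sketch of the intended flow; the storage directory and symbol name are made up, and nothing here is an officially supported entry point yet:

```python
import pandas as pd

from arctic.pluggable.key_value_datastore import FileBasedKeyValueStore
from arctic.pluggable.generic_version_store import GenericVersionStore
from arctic.pluggable._pandas_ndarray_store import PandasDataFrameStore, PandasSeriesStore

backing_store = FileBasedKeyValueStore('/tmp/arctic_pluggable_demo')  # any writable directory
library = GenericVersionStore('my_library', backing_store=backing_store,
                              type_handlers=[PandasDataFrameStore, PandasSeriesStore])

df = pd.DataFrame({'price': [1.0, 2.0, 3.0]},
                  index=pd.date_range('2019-01-01', periods=3))
library.write('my_symbol', df)    # segments plus a version .bson document on disk
item = library.read('my_symbol')  # VersionedItem with .data, .metadata and .version
assert len(item.data) == 3
assert 'my_symbol' in library.list_symbols()
```

The S3-backed variant is identical except that the backing store is an S3KeyValueStore pointed at a versioning-enabled bucket, as the s3_store fixture above sets up with moto.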
+@pytest.fixture() +def parquet_version_store(library_name, s3_store): + return GenericVersionStore(library_name, backing_store=s3_store, bson_handler=ParquetStore()) + + +@pytest.fixture() +def parquet_filebacked_version_store(library_name, file_store): + type_handlers = [ParquetStore] + return GenericVersionStore(library_name, backing_store=file_store, type_handlers=type_handlers) + + +@pytest.fixture(params=['parquet_filebacked_version_store', 'generic_version_store']) +def version_store(request): + return request.getfixturevalue(request.param) + +@pytest.fixture(params=['file_store', 's3_store']) +def kv_store(request): + return request.getfixturevalue(request.param) \ No newline at end of file diff --git a/tests/integration/pluggable/test_key_value_datastore.py b/tests/integration/pluggable/test_key_value_datastore.py new file mode 100644 index 000000000..04b2c203c --- /dev/null +++ b/tests/integration/pluggable/test_key_value_datastore.py @@ -0,0 +1,126 @@ +from pytest import fail + + +def test_save_read_version_doc(kv_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + loaded_version_doc = kv_store.read_version(library_name='my_library', symbol='my_symbol') + assert version_doc == loaded_version_doc + + +def test_read_a_specific_version_doc(kv_store): + version_docs = [{'symbol': '000', 'foo': 'bar'}, {'symbol': '111', 'foo': 'bar'}] + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_docs[0]) + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_docs[1]) + versions = kv_store.list_versions(library_name='my_library', symbol='my_symbol') + + for idx, row in versions.iterrows(): + # test read by version_id + loaded_version_doc = kv_store.read_version(library_name='my_library', + symbol='my_symbol', version_id=row.version) + assert version_docs[idx] == loaded_version_doc + # test read by as_of + loaded_version_doc = kv_store.read_version(library_name='my_library', + symbol='my_symbol', as_of=row.modified) + assert version_docs[idx] == loaded_version_doc + + +def test_save_read_segments(kv_store): + segment_data = b'3424234235' + segment_key = kv_store.write_segment(library_name='my_library', symbol='symbol', + segment_data=segment_data, version_id='foo') + loaded_segment_data = list(kv_store.read_segments(library_name='my_library', segment_keys=[segment_key]))[0] + assert segment_data == loaded_segment_data + + +def test_list_symbols(kv_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + kv_store.write_version(library_name='my_library', symbol='my_symbol2', version_doc=version_doc) + symbols = kv_store.list_symbols(library_name='my_library') + assert ['my_symbol', 'my_symbol2'] == symbols + + +def test_list_versions(kv_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + versions = kv_store.list_versions(library_name='my_library', symbol='my_symbol') + assert len(versions) == 2 + + +def test_list_all_versions(kv_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + kv_store.write_version(library_name='my_library', 
symbol='my_symbol', version_doc=version_doc) + kv_store.write_version(library_name='my_library', symbol='my_symbol2', version_doc=version_doc) + versions = kv_store._list_all_versions(library_name='my_library') + assert len(versions) == 2 + + +def test_create_and_read_snapshot(kv_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + latest_versions = {} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + latest_versions['my_symbol'] = version_doc['version'] + kv_store.write_version(library_name='my_library', symbol='my_symbol2', version_doc=version_doc) + latest_versions['my_symbol2'] = version_doc['version'] + kv_store.write_version(library_name='my_library', symbol='my_symbol3', version_doc=version_doc) + metadata = {'month': 'June'} + kv_store.snapshot(library_name='my_library', snap_name='snap1', metadata=metadata, skip_symbols=['my_symbol3']) + snap = kv_store._read_snapshot(library_name='my_library', snap_name='snap1') + assert snap['versions'] == latest_versions + assert snap['metadata'] == metadata + + +def _setup_snaps(kv_store): + my_symbol_version_doc_0 = {'foo': 24, 'data': 1} + my_symbol_version_doc_1 = {'foo': 24, 'data': 2} + my_symbol2_version_doc_0 = {'foo': 24, 'data': 999} + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=my_symbol_version_doc_0) + kv_store.snapshot(library_name='my_library', snap_name='snap0') + kv_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=my_symbol_version_doc_1) + kv_store.snapshot(library_name='my_library', snap_name='snap1') + kv_store.write_version(library_name='my_library', symbol='my_symbol2', version_doc=my_symbol2_version_doc_0) + kv_store.snapshot(library_name='my_library', snap_name='snap2') + return my_symbol2_version_doc_0, my_symbol_version_doc_0, my_symbol_version_doc_1 + + +def test_reading_from_snapshot(kv_store): + my_symbol2_version_doc_0, my_symbol_version_doc_0, my_symbol_version_doc_1 = _setup_snaps(kv_store) + + assert my_symbol_version_doc_0 == kv_store.read_version(library_name='my_library', + symbol='my_symbol', snapshot_id='snap0') + assert my_symbol_version_doc_1 == kv_store.read_version(library_name='my_library', + symbol='my_symbol', snapshot_id='snap1') + assert my_symbol2_version_doc_0 == kv_store.read_version(library_name='my_library', + symbol='my_symbol2', snapshot_id='snap2') + + try: + kv_store.read_version(library_name='my_library', symbol='my_symbol2', snapshot_id='snap0') + fail("Should not find symbol") + except KeyError: + pass + + +def test_deleting_from_snapshot(s3_store): + my_symbol2_version_doc_0, my_symbol_version_doc_0, my_symbol_version_doc_1 = _setup_snaps(s3_store) + # can delete a snapshot + assert set(s3_store.list_snapshots(library_name='my_library')) == {'snap0', 'snap1', 'snap2'} + s3_store.delete_snapshot('my_library', 'snap0') + assert set(s3_store.list_snapshots(library_name='my_library')) == {'snap1', 'snap2'} + + # should be able to read a deleted symbol from a snap after deleting the symbol + s3_store.delete_symbol(library_name='my_library', symbol='my_symbol2') + assert my_symbol2_version_doc_0 == s3_store.read_version(library_name='my_library', + symbol='my_symbol2', snapshot_id='snap2') + +def test_delete_symbol(s3_store): + version_doc = {'symbol': 24, 'foo': 'bar'} + s3_store.write_version(library_name='my_library', symbol='my_symbol', version_doc=version_doc) + 
s3_store.write_version(library_name='my_library', symbol='my_symbol2', version_doc=version_doc) + assert ['my_symbol', 'my_symbol2'] == s3_store.list_symbols(library_name='my_library') + s3_store.delete_symbol(library_name='my_library', symbol='my_symbol') + assert ['my_symbol2'] == s3_store.list_symbols(library_name='my_library') + diff --git a/tests/integration/pluggable/test_pandas_store.py b/tests/integration/pluggable/test_pandas_store.py new file mode 100644 index 000000000..81d6bb156 --- /dev/null +++ b/tests/integration/pluggable/test_pandas_store.py @@ -0,0 +1,1011 @@ +from six import StringIO +from datetime import datetime as dt, timedelta as dtd +import itertools +import string + +from dateutil.rrule import rrule, DAILY +from mock import Mock, patch +from pandas import DataFrame, Series, DatetimeIndex, MultiIndex, read_csv, Panel, date_range, concat +from pandas.tseries.offsets import DateOffset +from pandas.util.testing import assert_frame_equal, assert_series_equal +import pytest +import pandas as pd + +from arctic._compression import decompress +from arctic.date import DateRange, mktz +from arctic.pluggable._pandas_ndarray_store import PandasSeriesStore +import numpy as np + + +def test_write_multi_column(generic_version_store): + symbol = 'test_symbol' + df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], index=['x', 'y', 'z'], columns=[[u'a', 'w'], ['a', 'v']]) + generic_version_store.write(symbol, df) + assert np.all(generic_version_store.read(symbol).data == df) + + +def test_save_read_pandas_series(generic_version_store): + s = Series(data=[1, 2, 3], index=[4, 5, 6]) + generic_version_store.write('pandas', s) + saved = generic_version_store.read('pandas').data + assert np.all(s == saved) + assert saved.name == "values" + + +def test_save_read_pandas_series_maintains_name(generic_version_store): + s = Series(data=[1, 2, 3], index=[4, 5, 6], name="ADJ") + generic_version_store.write('pandas', s) + saved = generic_version_store.read('pandas').data + assert np.all(s == saved) + assert saved.name == "ADJ" + + +def test_save_read_pandas_series_with_multiindex(generic_version_store): + df = Series(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_series_with_multiindex_and_name(generic_version_store): + df = Series(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)]), + name='Foo') + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + assert df.name == 'Foo' + + +def test_save_read_pandas_series_with_unicode_index_name(generic_version_store): + df = Series(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_multiindex(generic_version_store): + df = DataFrame(data=['A', 'BC', 'DEF'], index=MultiIndex.from_tuples([(1, 2), (1, 3), (2, 2)])) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def 
test_save_read_pandas_dataframe_with_none_values(generic_version_store): + df = DataFrame(data=[(1, None), (1, 3), (2, 2)]) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all((df.values == saved_df.values) | (np.isnan(df.values) & np.isnan(saved_df.values))) + + +def test_save_read_pandas_dataframe_with_unicode_index_name(generic_version_store): + df = DataFrame(data=['A', 'BC', 'DEF'], + index=MultiIndex.from_tuples([(np.datetime64(dt(2013, 1, 1)),), + (np.datetime64(dt(2013, 1, 2)),), + (np.datetime64(dt(2013, 1, 3)),)], names=[u'DATETIME'])) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_cant_write_pandas_series_with_tuple_values(generic_version_store): + df = Series(data=[('A', 'BC')], index=np.array([dt(2013, 1, 1), ]).astype('datetime64[ns]')) + assert PandasSeriesStore().can_write(Mock(), 'FOO', df) == False + + +def test_save_read_pandas_series_with_datetimeindex_with_timezone(generic_version_store): + df = Series(data=['A', 'BC', 'DEF'], index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), + tz="America/Chicago")) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert df.index.tz == saved_df.index.tz + assert all(df.index == saved_df.index) + + +def test_save_read_pandas_series_with_datetimeindex(generic_version_store): + df = Series(data=['A', 'BC', 'DEF'], index=np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]')) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.index == saved_df.index) + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_datetimeindex_with_timezone(generic_version_store): + df = DataFrame(data=['A', 'BC', 'DEF'], index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), + tz="America/Chicago")) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert df.index.tz == saved_df.index.tz + assert all(df.index == saved_df.index) + + +def test_save_read_pandas_empty_series_with_datetime_multiindex_with_timezone(generic_version_store): + empty_index = pd.MultiIndex(levels=(pd.DatetimeIndex([], tz="America/Chicago"), pd.Index([])), labels=([], [])) + df = Series(data=[], index=empty_index) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert empty_index.equal_levels(saved_df.index), "Index timezone information should be maintained, even when empty" + + +def test_save_read_pandas_dataframe_with_datetimeindex(generic_version_store): + df = DataFrame(data=['A', 'BC', 'DEF'], index=np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]')) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.index == saved_df.index) + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_with_strings(generic_version_store): + df = DataFrame(data=['A', 'BC', 'DEF'], index=[4, 5, 6]) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe(generic_version_store): + df 
= DataFrame(data=[1, 2, 3], index=[4, 5, 6]) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_empty_dataframe(generic_version_store): + df = DataFrame({'a': [], 'b': []}) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe2(generic_version_store): + df = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_strings(generic_version_store): + df = DataFrame(data=['a', 'b', 'c'], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + + +def test_save_read_pandas_dataframe_empty_multiindex(generic_version_store): + expected = read_csv(StringIO(u'''\ +STRATEGY MAC INSTRUMENT CONTRACT $Price $Delta $Gamma $Vega $Theta $Notional uDelta uGamma uVega uTheta Delta Gamma Vega Theta'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + generic_version_store.write('pandas', expected) + saved_df = generic_version_store.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def test_save_read_pandas_dataframe_empty_multiindex_and_no_columns(generic_version_store): + expected = read_csv(StringIO(u'''STRATEGY MAC INSTRUMENT CONTRACT'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + generic_version_store.write('pandas', expected) + saved_df = generic_version_store.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def test_save_read_pandas_dataframe_multiindex_and_no_columns(generic_version_store): + expected = read_csv(StringIO(u'''\ +STRATEGY MAC INSTRUMENT CONTRACT +STRAT F22 ASD 201312'''), + delimiter=' ').set_index(['STRATEGY', 'MAC', 'INSTRUMENT', 'CONTRACT']) + generic_version_store.write('pandas', expected) + saved_df = generic_version_store.read('pandas').data + assert np.all(expected.values == saved_df.values) + assert np.all(expected.index.names == saved_df.index.names) + + +def xtest_append_pandas_multi_columns_dataframe(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"]) + df = pd.DataFrame(np.random.randn(2, 8), index=[0, 1], columns=columns) + df2 = pd.DataFrame(np.random.randn(2, 8), index=[2, 3], columns=columns) + generic_version_store.write('test', df) + generic_version_store.append('test', df2) + + saved = generic_version_store.read('test') + + df = df.append(df2) + assert df.columns.equal_levels(saved.data.columns) + assert np.all(saved.data.columns == df.columns) + assert np.all(saved.data.columns.names == df.columns.names) + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.values == df.values) + + +def xtest_append_pandas_multi_columns_dataframe_new_column(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"]) + df = pd.DataFrame(np.random.randn(2, 8), 
index=[0, 1], columns=columns) + df2 = pd.DataFrame(np.random.randn(2, 8), index=[2, 3], columns=columns) + generic_version_store.write('test', df) + df2['bar', 'three'] = np.random.randn(2, 1) + generic_version_store.append('test', df2) + + saved = generic_version_store.read('test') + + df = df.append(df2) + columns = list(itertools.product(["bar", "baz", "foo", "qux"], ["one", "two"])) + assert np.all(saved.data[columns] == df[columns]) + assert np.all(saved.data['bar', 'three'][2:] == df['bar', 'three'][2:]) + + +def test_save_read_pandas_multi_columns_empty_dataframe(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"]) + df = pd.DataFrame([], columns=columns) + generic_version_store.write('test', df) + + saved = generic_version_store.read('test') + + assert df.columns.equal_levels(saved.data.columns) + assert np.all(saved.data.columns == df.columns) + assert np.all(saved.data.columns.names == df.columns.names) + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.values == df.values) + + +def test_save_read_pandas_multi_columns_dataframe(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]], names=["first", "second"]) + df = pd.DataFrame(np.random.randn(2, 8), columns=columns) + generic_version_store.write('test', df) + + saved = generic_version_store.read('test') + + assert df.columns.equal_levels(saved.data.columns) + assert np.all(saved.data.columns == df.columns) + assert np.all(saved.data.columns.names == df.columns.names) + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.values == df.values) + + +def test_save_read_pandas_multi_columns_no_names_dataframe(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]]) + df = pd.DataFrame(np.random.randn(2, 8), columns=columns) + generic_version_store.write('test', df) + + saved = generic_version_store.read('test') + + assert df.columns.equal_levels(saved.data.columns) + assert np.all(saved.data.columns == df.columns) + assert list(saved.data.columns.names) == ["level_0", "level_1"] + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.values == df.values) + + +def test_save_read_pandas_multi_columns_dataframe_with_int_levels(generic_version_store): + columns = pd.MultiIndex.from_product([[1, 2, 'a'], ['c', 5]]) + df = pd.DataFrame([[9, 2, 8, 1, 2, 3], [3, 4, 2, 9, 10, 11]], index=['x', 'y'], columns=columns) + generic_version_store.write('test', df) + + saved = generic_version_store.read('test') + + # Check that column names were converted to string + assert [list(sublevel) for sublevel in saved.data.columns.levels] == [list(map(str, sublevel)) for sublevel in df.columns.levels] + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.values == df.values) + + +def test_save_read_multi_index_and_multi_columns_dataframe(generic_version_store): + columns = pd.MultiIndex.from_product([["bar", "baz", "foo", "qux"], ["one", "two"]]) + index = pd.MultiIndex.from_product([["x", "y", "z"], ["a", "b"]]) + df = pd.DataFrame(np.random.randn(6, 8), index=index, columns=columns) + generic_version_store.write('test', df) + + saved = generic_version_store.read('test') + + assert isinstance(saved.data.index, df.index.__class__) + assert np.all(saved.data.index == df.index) + assert np.all(saved.data.columns == df.columns) + assert np.all(saved.data == df) + + +def 
xtest_append_pandas_dataframe(generic_version_store): + df = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + df2 = DataFrame(data=[4, 5, 6], index=DatetimeIndex(start='2/1/2011', periods=3, freq='H')) + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def test_empty_dataframe_multindex(generic_version_store): + df = DataFrame({'a': [], 'b': [], 'c': []}) + df = df.groupby(['a', 'b']).sum() + generic_version_store.write('pandas', df) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.values == saved_df.values) + assert np.all(df.index.names == df.index.names) + + +def xtest_dataframe_append_empty(generic_version_store): + df = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + df2 = DataFrame(data=[], index=[]) + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def xtest_empty_dataframe_append(generic_version_store): + df = DataFrame(data=[], index=[]) + df2 = DataFrame(data=[1, 2, 3], index=DatetimeIndex(start='1/1/2011', periods=3, freq='H')) + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + + +def xtest_dataframe_append_empty_multiindex(generic_version_store): + df = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + assert np.all(df.index.names == saved_df.index.names) + + +def xtest_empty_dataframe_append_multiindex(generic_version_store): + df = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a', 'b']).sum() + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df.append(df2).values == saved_df.values) + assert np.all(df.index.names == saved_df.index.names) + + +def xtest_empty_dataframe_should_ignore_dtype(generic_version_store): + df = DataFrame({'a': [], 'b': [], 'c': []}).groupby(['a', 'b']).sum() + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a']).sum() + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df2.index.names == saved_df.index.names) + + +def xtest_empty_dataframe_should_ignore_dtype2(generic_version_store): + df = DataFrame({'a': []}) + df2 = DataFrame({'a': [1, 1, 1], 'b': [1, 1, 2], 'c': [1, 2, 3]}).groupby(['a']).sum() + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + saved_df = generic_version_store.read('pandas').data + assert np.all(df2.values == saved_df.values) + assert np.all(df2.index.names == saved_df.index.names) + + +def xtest_dataframe_append_should_promote_string_column(generic_version_store): + data = 
np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a30')]) + data2[:] = [(3, 4., 'Hello World - Good Morning')] + df2 = DataFrame(data2, index=DatetimeIndex(np.array([dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a30')]) + expected_data[:] = [(1, 2., 'Hello'), (2, 3., "World"), (3, 4., 'Hello World - Good Morning')] + expected = DataFrame(expected_data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + actual = generic_version_store.read('pandas').data + + assert_frame_equal(expected, actual) + + +def xtest_dataframe_append_should_add_new_column(generic_version_store): + data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10'), ('D', 'f4')]) + data2[:] = [(4, 5., 'Hi', 6.)] + df2 = DataFrame(data2, index=DatetimeIndex(np.array([dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10'), ('D', 'f4')]) + expected_data[:] = [(1, 2., 'Hello', np.nan), (2, 3., "World", np.nan), (4, 5., 'Hi', 6.)] + expected = DataFrame(expected_data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + actual = generic_version_store.read('pandas').data + + assert_frame_equal(expected, actual) + + +def xtest_dataframe_append_should_add_new_columns_and_reorder(generic_version_store): + data = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) + data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + df = DataFrame(data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + data2 = np.zeros((1,), dtype=[('C', 'a10'), ('A', 'i4'), ('E', 'a1'), ('B', 'f4'), ('D', 'f4'), ('F', 'i4')]) + data2[:] = [('Hi', 4, 'Y', 5., 6., 7)] + df2 = DataFrame(data2, index=DatetimeIndex(np.array([dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + expected_data = np.zeros((3,), dtype=[('C', 'a10'), ('A', 'i4'), ('E', 'a1'), + ('B', 'f4'), ('D', 'f4'), ('F', 'i4')]) + expected_data[:] = [('Hello', 1, '', 2., np.nan, 0), ("World", 2, '', 3., np.nan, 0), ('Hi', 4, 'Y', 5., 6., 7)] + expected = DataFrame(expected_data, index=DatetimeIndex(np.array([dt(2013, 1, 1), + dt(2013, 1, 2), + dt(2013, 1, 3)]).astype('datetime64[ns]'), name=[u'DATETIME'])) + + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df2) + actual = generic_version_store.read('pandas').data + + assert_frame_equal(expected, actual) + + +# -- auto generated tests --- # +def dataframe(columns, length, index): + df = DataFrame(np.ones((length, columns)), columns=list(string.ascii_lowercase[:columns])) + index = min(index, columns) + if 
index: + df = df.set_index(list(string.ascii_lowercase[:index])) + return df + + + +@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([0, 1, 2, 4], r=3))) +def test_dataframe_save_read(generic_version_store, df_size): + df = dataframe(*df_size) + generic_version_store.write('pandas', df) + result = generic_version_store.read('pandas').data + assert np.all(df.values == result.values), str(df.values) + "!=" + str(result.values) + if None not in df.index.names: # saved as 'index' or 'level' + assert np.all(df.index.names == result.index.names), str(df.index.names) + "!=" + str(result.index.names) + assert np.all(df.index.values == result.index.values), str(df.index.values) + "!=" + str(result.index.values) + assert np.all(df.columns.values == result.columns.values), str(df.columns.values) + "!=" + str(result.columns.values) + + +@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([0, 1, 2, 4], r=3))) +def xtest_dataframe_save_append_read(generic_version_store, df_size): + df = dataframe(*df_size) + generic_version_store.write('pandas', df) + generic_version_store.append('pandas', df) + result = generic_version_store.read('pandas').data + assert len(result) == len(df) * 2 + if None not in df.index.names: # saved as 'index' or 'level' + assert np.all(df.index.names == result.index.names), str(df.index.names) + "!=" + str(result.index.names) + assert np.all(df.columns.values == result.columns.values), str(df.columns.values) + "!=" + str(result.columns.values) + + + +def xtest_large_dataframe_append_rewrite_same_item(generic_version_store): + csv = \ +"""index, f1, f2, f3, f4, f5, f6, f7, f8, iVol, tau, uPrice, uDelta, uGamma, uVega, uTheta, Delta, Gamma, Vega, Theta, $Price, $Delta, $Gamma, $Vega, $Theta, $Time_Value, $Notional, FX, f9 +0, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, CALL, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.5768068954653813, 0.6427860135978315, 0.391592427081917, 4.915801583071703, -20.166163353481476, 9.641790203967473, 5.873886406228755, 73.73702374607555, -302.49245030222215, 11909.274289984183, 18625.940769791625, 15925.131550993763, 1014.9606370552315, -1601.4183005499872, 4786.093789984206, 2897689.1805000002, 1.37646, SYM +1, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, -5.358002241713311, 5.873886406228755, 73.73702374607555, -302.50057493980034, 4786.192353109285, -10350.550083271604, 15925.131550993763, 1014.9606370552315, -1601.4613130062987, 4786.192353109285, 2897689.1805000002, 1.37646, SYM +2, 201401, 2013 - 12 - 20 16:15:00, -48.0, F22, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -5.738886206065227, -9.829995990815009, -126.18319185932137, 529.3397696979075, -3772.3383984361194, -11086.338978290602, -26650.835319775462, -1736.8611626668148, 2802.3654592245452, -3772.3383984361194, -9272605.3776, 1.37646, SYM +3, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -83.2851813261039, -47.49312636245157, -4452.541332815905, 967.541433029926, -147525.24472279268, -160889.7125497546, -128762.15724702866, -61287.4504296778, 
5122.238772724507, -147525.24472279268, -55249273.7082, 1.37646, SYM +4, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -55.816886577678304, -37.53526875242445, -3467.56379683394, 742.5377607142022, -88047.84694353123, -107826.65888355605, -101764.66675460352, -47729.62863790045, 3931.052023510272, -88047.84694353123, -50999329.576799996, 1.37646, SYM +5, 201401, 2013 - 12 - 20 16:15:00, -350.0, F22, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -18.747569933353994, -37.422354713501335, -502.3611320768588, 2203.8743830073104, -11079.355260832921, -36216.420371031316, -101458.53708176922, -6914.80003858513, 11667.480512439395, -11079.355260832921, -67612747.545, 1.37646, SYM +6, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -6.250282470010408, -4.881689031462497, -445.634554775733, 94.30069881989306, -8837.042047978748, -12074.25059227865, -13235.111243323556, -6133.9813926660545, 499.23515345242305, -8837.042047978748, -8306708.9841, 1.37646, SYM +7, 201401, 2013 - 12 - 20 16:15:00, -557.0, F22, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -13.001732844932882, -28.637958149630503, -404.89795750367495, 1870.8354898520474, -7172.696641740786, -25116.653728342328, -77642.50435641785, -5573.258425855083, 9904.346993699035, -7172.696641740786, -107600858.2359, 1.37646, SYM +8, 201401, 2013 - 12 - 20 16:15:00, -607.0, F22, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -6.21340878871455, -14.573662229424174, -217.5395148200721, 1061.1931941992289, -3283.2053243209966, -12003.018280721177, -39511.74267470002, -2994.344405692364, 5618.038400336425, -3283.2053243209966, -117259822.1709, 1.37646, SYM +9, 201401, 2013 - 12 - 20 16:15:00, -799.0, F22, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -3.67256511020681, -8.962679290211902, -141.2168777833172, 727.1473791081288, -1891.026786204374, -7094.634789685377, -24299.388322293227, -1943.793835936248, 3849.574159412212, -1891.026786204374, -154350243.6813, 1.37646, SYM +10, 201401, 2013 - 12 - 20 16:15:00, -377.0, F22, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.8028324007636266, -2.007522435578226, -33.3372888974667, 180.92034663303465, -407.4960157678369, -1550.905840965067, -5442.743810001693, -458.87444675807006, 957.8062320250265, -407.4960157678369, -72828588.06989999, 1.37646, SYM +11, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 5.132620238108891, -3.4859412142879673, 
-389.0162662945974, 100.63494106610229, -8599.471252145018, 9915.158754388978, -9450.995231657676, -5354.653299038616, 532.7691191532583, -8599.471252145018, -8306708.9841, 1.37646, SYM +12, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 42.252164750761594, -26.77313184223898, -2916.44917566044, 736.4308784923738, -74018.27549798116, 81622.4271006569, -72586.63466280713, -40143.75632329569, 3898.7217192677417, -74018.27549798116, -50999329.576799996, 1.37646, SYM +13, 201401, 2013 - 12 - 20 16:15:00, -376.0, F22, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 8.129432111155017, -16.588182788574088, -256.2233515569436, 1293.376935891353, -4877.913910511415, 15704.378314735444, -44973.45961948536, -3526.811944840706, 6847.236989142353, -4877.913910511415, -72635408.7912, 1.37646, SYM +14, 201401, 2013 - 12 - 20 16:15:00, -301.0, F22, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 13.832831800210249, -26.270894483076166, -382.319054437795, 1818.2895157389635, -8696.984965596635, 26722.164695430383, -71224.98149804553, -5262.468856714474, 9626.164564746361, -8696.984965596635, -58146962.8887, 1.37646, SYM +15, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 60.479027414159546, -35.32646904379539, -3756.917881714376, 926.127984537317, -111492.45225751627, 116832.94892344868, -95776.22511698281, -51712.4718746457, 4902.992790754752, -111492.45225751627, -55249273.7082, 1.37646, SYM +16, 201401, 2013 - 12 - 20 16:15:00, -739.0, F22, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 71.50658521495254, -121.45295017532867, -1668.0672036283765, 7486.937257302868, -48400.084510256995, 138135.90554124617, -329280.15202122304, -22960.277831063155, 39636.42175841195, -48400.084510256995, -142759486.9593, 1.37646, SYM +17, 201401, 2013 - 12 - 20 16:15:00, -669.0, F22, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 130.4367125693822, -186.30959150662477, -2430.001478055598, 10357.656693727877, -98837.84833028633, 251976.70050152476, -505117.8297913038, -33447.99834484408, 54834.231279417974, -98837.84833028633, -129236937.4503, 1.37646, SYM +18, 201401, 2013 - 12 - 20 16:15:00, -471.0, F22, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 168.24127038979793, -184.4400331555829, -2315.3425456267723, 9498.518053109732, -150286.43988763154, 325007.2726147283, -500049.1307012041, -31869.76400353427, 50285.885228397776, -150286.43988763154, -90987440.2677, 1.37646, SYM +19, 201401, 2013 - 12 - 20 16:15:00, -364.0, F22, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 
4.147995696473539, -16.876460506586433, 274.5707270439409, -121.26988885703983, -1509.8704335163682, 6143.031624397461, -396952.9396004471, 530413.7500248309, -328783.8408272312, -20782.762569179402, 32521.681960454345, -68777.34640044652, -70317257.4468, 1.37646, SYM +20, 201401, 2013 - 12 - 20 16:15:00, -394.0, F22, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 346.8878572984298, -80.68788375793986, -1035.7536998452629, 4344.282826256274, -657341.595950662, 670115.460626992, -218758.93991649026, -14256.735376890107, 22998.967457802737, -30955.94375066146, -76112635.8078, 1.37646, SYM +21, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -11.648277108545999, -6.642395295447772, -622.733053540686, 135.32048014404558, -20632.901359831147, -22502.057699266377, -18008.69332126275, -8571.671388766126, 716.3970311502808, -20632.901359831147, -7727171.148, 1.37646, SYM +22, 201401, 2013 - 12 - 20 16:15:00, -12.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -1.4347215515163068, -2.4574989977037522, -31.545797964830342, 132.33494242447688, -943.0845996090299, -2771.5847445726504, -6662.708829943866, -434.2152906667037, 700.5913648061363, -943.0845996090299, -2318151.3444, 1.37646, SYM +23, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -9.514242030286075, -6.398057173708713, -591.0620108239671, 126.56893648537539, -15008.155729011005, -18379.544127878875, -17346.250014989233, -8135.732154187577, 670.0656858256148, -15008.155729011005, -8693067.5415, 1.37646, SYM +24, 201401, 2013 - 12 - 20 16:15:00, -57.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -3.0531756748605074, -6.09449776762736, -81.813098652517, 358.9166852326191, -1804.3521424785042, -5898.102746139386, -16523.247467602414, -1126.1245777124357, 1900.1325405972727, -1804.3521424785042, -11011218.8859, 1.37646, SYM +25, 201401, 2013 - 12 - 20 16:15:00, -68.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -1.5872851588068868, -3.496195967997979, -49.430989425942364, 228.39643323148874, -875.6613494405268, -3066.306020695293, -9478.797659311334, -680.3977970523262, 1209.1482864839038, -875.6613494405268, -13136190.9516, 1.37646, SYM +26, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -2.7617527193069247, -2.1570253859950568, -196.90829164509134, 41.66775064134809, -3904.7395095720058, -5335.133982634753, -5848.072409840642, -2710.3638711780245, 220.5922771068846, -3904.7395095720058, -3670406.2953000003, 1.37646, SYM +27, 201401, 2013 - 12 - 20 16:15:00, -91.0, GEE1, CALL, STRAT, 143.0, 140.345, 
0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -0.9314995053921319, -2.1848488680026357, -32.613007987852654, 159.091566181433, -492.21035339902915, -1799.464025610588, -5923.5067271790795, -448.9050097495967, 842.2429891772894, -492.21035339902915, -17579314.3617, 1.37646, SYM +28, 201401, 2013 - 12 - 20 16:15:00, -117.0, GEE1, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -0.5377848784658282, -1.3124323866768368, -20.678816896931302, 106.47840219731049, -276.9088034867481, -1038.8889491779587, -3558.233333802638, -284.63564305950064, 563.7048518788846, -276.9088034867481, -22601975.6079, 1.37646, SYM +29, 201401, 2013 - 12 - 20 16:15:00, -126.0, GEE1, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.26832064322603966, -0.6709491429253489, -11.141905573158631, 60.46674715056331, -136.19230235211526, -518.3398831872638, -1819.0602654117067, -153.3638734522993, 320.1156107033245, -136.19230235211526, -24340589.1162, 1.37646, SYM +30, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 2.2679019656760215, -1.5402996063132879, -171.8909083627291, 44.46660186641729, -3799.766367226869, 4381.11665891606, -4176.021148871998, -2366.009597249621, 235.40961078864902, -3799.766367226869, -3670406.2953000003, 1.37646, SYM +31, 201401, 2013 - 12 - 20 16:15:00, -64.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 1.3837331253029816, -2.8235204746509086, -43.612485371394655, 220.14926568363455, -830.2832188104537, 2673.0856705932674, -7655.056956508147, -600.3084161430988, 1165.48714708806, -830.2832188104537, -12363473.8368, 1.37646, SYM +32, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 7.2020735370616356, -4.563602018563462, -497.122018578484, 125.52799065210918, -12616.751505337697, 13912.913710339246, -12372.721817523941, -6842.685736925402, 664.5548385115469, -12616.751505337697, -8693067.5415, 1.37646, SYM +33, 201401, 2013 - 12 - 20 16:15:00, -51.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 2.3437688432249923, -4.451214679856759, -64.77831154926095, 308.08227675311343, -1473.5755257323203, 4527.675745737374, -12068.020120931302, -891.6475471509574, 1631.011271767656, -1473.5755257323203, -9852143.2137, 1.37646, SYM +34, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 8.458605232749587, -4.940764901230125, -525.443060379633, 129.5283894457786, -15593.349966086193, 16340.27257670611, -13395.276240137457, -7232.5135489014965, 685.733257448217, 
-15593.349966086193, -7727171.148, 1.37646, SYM +35, 201401, 2013 - 12 - 20 16:15:00, -98.0, GEE1, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 9.4826053465025, -16.10607458346713, -221.20512307927052, 992.855008410935, -6418.414454675487, 18318.42861034117, -43666.38010565609, -3044.8000371369267, 5256.250787989675, -6418.414454675487, -18931569.312599998, 1.37646, SYM +36, 201401, 2013 - 12 - 20 16:15:00, -111.0, GEE1, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 21.64196576263292, -30.912353747735946, -403.18410174016645, 1718.5349671207687, -16399.10487991298, 41807.79335675523, -83808.78790259302, -5549.667886812695, 9098.05631093482, -16399.10487991298, -21442899.9357, 1.37646, SYM +37, 201401, 2013 - 12 - 20 16:15:00, -108.0, GEE1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 38.577616140335834, -42.29198212484704, -530.9065709717439, 2178.0041395665626, -34460.58494238685, 74523.96059955555, -114660.94716715509, -7307.7165867976655, 11530.52145364535, -34460.58494238685, -20863362.0996, 1.37646, SYM +38, 201401, 2013 - 12 - 20 16:15:00, -83.0, GEE1, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 62.608160287492026, -27.652199931687655, -344.28364280730375, 1400.746222046674, -90513.99446933273, 120945.99245071695, -74969.94172708844, -4738.926629785414, 7415.658249224481, -15682.746569332587, -16033880.132100001, 1.37646, SYM +39, 201401, 2013 - 12 - 20 16:15:00, -56.0, GEE1, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 49.30385789013216, -11.468328655950843, -147.21372383587493, 617.4615184526685, -93429.26236862202, 95244.83704343032, -31092.641206404707, -2026.3380231112837, 3268.888775728308, -4399.8295686219335, -10818039.607199999, 1.37646, SYM""" + csv = StringIO(csv) + df = read_csv(csv).set_index(['index']) + for _ in range(10): + generic_version_store.write('pandas', df[:-2]) + result = generic_version_store.read('pandas').data + assert len(result) == len(df[:-2]) + assert np.all(df[:-2].values == result.values) + assert np.all(df[:-2].columns.values == result.columns.values) + for _ in range(10): + generic_version_store.write('pandas', df[:-1]) + result = generic_version_store.read('pandas').data + assert len(result) == len(df[:-1]) + assert np.all(df[:-1].values == result.values) + assert np.all(df[:-1].columns.values == result.columns.values) + for _ in range(10): + generic_version_store.write('pandas', df) + result = generic_version_store.read('pandas').data + assert len(result) == len(df) + assert np.all(df.values == result.values) + assert np.all(df.columns.values == result.columns.values) + + +def test_large_dataframe_rewrite_same_item(generic_version_store): + csv = \ +"""index, f1, f2, f3, f4, f5, f6, f7, f8, iVol, tau, uPrice, uDelta, uGamma, uVega, uTheta, Delta, Gamma, Vega, Theta, $Price, $Delta, $Gamma, $Vega, $Theta, $Time_Value, $Notional, FX, f9 +0, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, CALL, STRAT, 140.0, 140.345, 
0.07231398622706062, 0.008813407863715872, 0.5768068954653813, 0.6427860135978315, 0.391592427081917, 4.915801583071703, -20.166163353481476, 9.641790203967473, 5.873886406228755, 73.73702374607555, -302.49245030222215, 11909.274289984183, 18625.940769791625, 15925.131550993763, 1014.9606370552315, -1601.4183005499872, 4786.093789984206, 2897689.1805000002, 1.37646, SYM +1, 201401, 2013 - 12 - 20 16:15:00, 15.0, F1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, -5.358002241713311, 5.873886406228755, 73.73702374607555, -302.50057493980034, 4786.192353109285, -10350.550083271604, 15925.131550993763, 1014.9606370552315, -1601.4613130062987, 4786.192353109285, 2897689.1805000002, 1.37646, SYM +2, 201401, 2013 - 12 - 20 16:15:00, -48.0, F22, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -5.738886206065227, -9.829995990815009, -126.18319185932137, 529.3397696979075, -3772.3383984361194, -11086.338978290602, -26650.835319775462, -1736.8611626668148, 2802.3654592245452, -3772.3383984361194, -9272605.3776, 1.37646, SYM +3, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -83.2851813261039, -47.49312636245157, -4452.541332815905, 967.541433029926, -147525.24472279268, -160889.7125497546, -128762.15724702866, -61287.4504296778, 5122.238772724507, -147525.24472279268, -55249273.7082, 1.37646, SYM +4, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -55.816886577678304, -37.53526875242445, -3467.56379683394, 742.5377607142022, -88047.84694353123, -107826.65888355605, -101764.66675460352, -47729.62863790045, 3931.052023510272, -88047.84694353123, -50999329.576799996, 1.37646, SYM +5, 201401, 2013 - 12 - 20 16:15:00, -350.0, F22, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -18.747569933353994, -37.422354713501335, -502.3611320768588, 2203.8743830073104, -11079.355260832921, -36216.420371031316, -101458.53708176922, -6914.80003858513, 11667.480512439395, -11079.355260832921, -67612747.545, 1.37646, SYM +6, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -6.250282470010408, -4.881689031462497, -445.634554775733, 94.30069881989306, -8837.042047978748, -12074.25059227865, -13235.111243323556, -6133.9813926660545, 499.23515345242305, -8837.042047978748, -8306708.9841, 1.37646, SYM +7, 201401, 2013 - 12 - 20 16:15:00, -557.0, F22, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -13.001732844932882, -28.637958149630503, -404.89795750367495, 1870.8354898520474, -7172.696641740786, -25116.653728342328, -77642.50435641785, -5573.258425855083, 9904.346993699035, -7172.696641740786, -107600858.2359, 1.37646, SYM 
+8, 201401, 2013 - 12 - 20 16:15:00, -607.0, F22, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -6.21340878871455, -14.573662229424174, -217.5395148200721, 1061.1931941992289, -3283.2053243209966, -12003.018280721177, -39511.74267470002, -2994.344405692364, 5618.038400336425, -3283.2053243209966, -117259822.1709, 1.37646, SYM +9, 201401, 2013 - 12 - 20 16:15:00, -799.0, F22, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -3.67256511020681, -8.962679290211902, -141.2168777833172, 727.1473791081288, -1891.026786204374, -7094.634789685377, -24299.388322293227, -1943.793835936248, 3849.574159412212, -1891.026786204374, -154350243.6813, 1.37646, SYM +10, 201401, 2013 - 12 - 20 16:15:00, -377.0, F22, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.8028324007636266, -2.007522435578226, -33.3372888974667, 180.92034663303465, -407.4960157678369, -1550.905840965067, -5442.743810001693, -458.87444675807006, 957.8062320250265, -407.4960157678369, -72828588.06989999, 1.37646, SYM +11, 201402, 2014 - 01 - 24 16:15:00, -43.0, F22, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 5.132620238108891, -3.4859412142879673, -389.0162662945974, 100.63494106610229, -8599.471252145018, 9915.158754388978, -9450.995231657676, -5354.653299038616, 532.7691191532583, -8599.471252145018, -8306708.9841, 1.37646, SYM +12, 201402, 2014 - 01 - 24 16:15:00, -264.0, F22, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 42.252164750761594, -26.77313184223898, -2916.44917566044, 736.4308784923738, -74018.27549798116, 81622.4271006569, -72586.63466280713, -40143.75632329569, 3898.7217192677417, -74018.27549798116, -50999329.576799996, 1.37646, SYM +13, 201401, 2013 - 12 - 20 16:15:00, -376.0, F22, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 8.129432111155017, -16.588182788574088, -256.2233515569436, 1293.376935891353, -4877.913910511415, 15704.378314735444, -44973.45961948536, -3526.811944840706, 6847.236989142353, -4877.913910511415, -72635408.7912, 1.37646, SYM +14, 201401, 2013 - 12 - 20 16:15:00, -301.0, F22, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 13.832831800210249, -26.270894483076166, -382.319054437795, 1818.2895157389635, -8696.984965596635, 26722.164695430383, -71224.98149804553, -5262.468856714474, 9626.164564746361, -8696.984965596635, -58146962.8887, 1.37646, SYM +15, 201402, 2014 - 01 - 24 16:15:00, -286.0, F22, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 60.479027414159546, -35.32646904379539, -3756.917881714376, 926.127984537317, -111492.45225751627, 116832.94892344868, 
-95776.22511698281, -51712.4718746457, 4902.992790754752, -111492.45225751627, -55249273.7082, 1.37646, SYM +16, 201401, 2013 - 12 - 20 16:15:00, -739.0, F22, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 71.50658521495254, -121.45295017532867, -1668.0672036283765, 7486.937257302868, -48400.084510256995, 138135.90554124617, -329280.15202122304, -22960.277831063155, 39636.42175841195, -48400.084510256995, -142759486.9593, 1.37646, SYM +17, 201401, 2013 - 12 - 20 16:15:00, -669.0, F22, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 130.4367125693822, -186.30959150662477, -2430.001478055598, 10357.656693727877, -98837.84833028633, 251976.70050152476, -505117.8297913038, -33447.99834484408, 54834.231279417974, -98837.84833028633, -129236937.4503, 1.37646, SYM +18, 201401, 2013 - 12 - 20 16:15:00, -471.0, F22, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 168.24127038979793, -184.4400331555829, -2315.3425456267723, 9498.518053109732, -150286.43988763154, 325007.2726147283, -500049.1307012041, -31869.76400353427, 50285.885228397776, -150286.43988763154, -90987440.2677, 1.37646, SYM +19, 201401, 2013 - 12 - 20 16:15:00, -364.0, F22, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 274.5707270439409, -121.26988885703983, -1509.8704335163682, 6143.031624397461, -396952.9396004471, 530413.7500248309, -328783.8408272312, -20782.762569179402, 32521.681960454345, -68777.34640044652, -70317257.4468, 1.37646, SYM +20, 201401, 2013 - 12 - 20 16:15:00, -394.0, F22, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 346.8878572984298, -80.68788375793986, -1035.7536998452629, 4344.282826256274, -657341.595950662, 670115.460626992, -218758.93991649026, -14256.735376890107, 22998.967457802737, -30955.94375066146, -76112635.8078, 1.37646, SYM +21, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.045487609195962696, 0.10463818541333941, 0.3747457492377393, 0.29120692771365, 0.1660598823861943, 15.56832633851715, -3.3830120036011397, -11.648277108545999, -6.642395295447772, -622.733053540686, 135.32048014404558, -20632.901359831147, -22502.057699266377, -18008.69332126275, -8571.671388766126, 716.3970311502808, -20632.901359831147, -7727171.148, 1.37646, SYM +22, 201401, 2013 - 12 - 20 16:15:00, -12.0, GEE1, CALL, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 0.05709601681178711, 0.11956012929302556, 0.20479158314197934, 2.628816497069195, -11.027911868706408, -1.4347215515163068, -2.4574989977037522, -31.545797964830342, 132.33494242447688, -943.0845996090299, -2771.5847445726504, -6662.708829943866, -434.2152906667037, 700.5913648061363, -943.0845996090299, -2318151.3444, 1.37646, SYM +23, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.044822991783170785, 0.10463818541333941, 0.24229877494137142, 0.21142760067302388, 0.14217904830463807, 13.134711351643713, -2.812643033008342, -9.514242030286075, -6.398057173708713, 
-591.0620108239671, 126.56893648537539, -15008.155729011005, -18379.544127878875, -17346.250014989233, -8135.732154187577, 670.0656858256148, -15008.155729011005, -8693067.5415, 1.37646, SYM +24, 201401, 2013 - 12 - 20 16:15:00, -57.0, GEE1, CALL, STRAT, 142.0, 140.345, 0.07732984880519912, 0.008813407863715872, 0.022997617617102506, 0.053564485523868555, 0.10692101346714668, 1.4353175202195965, -6.296783951449458, -3.0531756748605074, -6.09449776762736, -81.813098652517, 358.9166852326191, -1804.3521424785042, -5898.102746139386, -16523.247467602414, -1126.1245777124357, 1900.1325405972727, -1804.3521424785042, -11011218.8859, 1.37646, SYM +25, 201401, 2013 - 12 - 20 16:15:00, -68.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.0814452531405243, 0.008813407863715872, 0.009355428262274312, 0.02334242880598363, 0.05141464658820557, 0.7269263150873877, -3.358771076933658, -1.5872851588068868, -3.496195967997979, -49.430989425942364, 228.39643323148874, -875.6613494405268, -3066.306020695293, -9478.797659311334, -680.3977970523262, 1209.1482864839038, -875.6613494405268, -13136190.9516, 1.37646, SYM +26, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, CALL, STRAT, 142.5, 140.345, 0.04429193547308161, 0.10463818541333941, 0.14930517833206025, 0.14535540627931182, 0.11352765189447668, 10.36359429711007, -2.1930395074393734, -2.7617527193069247, -2.1570253859950568, -196.90829164509134, 41.66775064134809, -3904.7395095720058, -5335.133982634753, -5848.072409840642, -2710.3638711780245, 220.5922771068846, -3904.7395095720058, -3670406.2953000003, 1.37646, SYM +27, 201401, 2013 - 12 - 20 16:15:00, -91.0, GEE1, CALL, STRAT, 143.0, 140.345, 0.08598678226600448, 0.008813407863715872, 0.003929576582252237, 0.010236258301012439, 0.024009328219809185, 0.35838470316321597, -1.748258969026736, -0.9314995053921319, -2.1848488680026357, -32.613007987852654, 159.091566181433, -492.21035339902915, -1799.464025610588, -5923.5067271790795, -448.9050097495967, 842.2429891772894, -492.21035339902915, -17579314.3617, 1.37646, SYM +28, 201401, 2013 - 12 - 20 16:15:00, -117.0, GEE1, CALL, STRAT, 143.5, 140.345, 0.09076344895187359, 0.008813407863715872, 0.0017194411099074047, 0.004596451952699387, 0.01121737082629775, 0.1767420247600966, -0.9100718136522263, -0.5377848784658282, -1.3124323866768368, -20.678816896931302, 106.47840219731049, -276.9088034867481, -1038.8889491779587, -3558.233333802638, -284.63564305950064, 563.7048518788846, -276.9088034867481, -22601975.6079, 1.37646, SYM +29, 201401, 2013 - 12 - 20 16:15:00, -126.0, GEE1, CALL, STRAT, 144.0, 140.345, 0.09566038240450792, 0.008813407863715872, 0.0007852689424384662, 0.0021295289144923784, 0.005324993197820229, 0.08842782200919548, -0.47989481865526434, -0.26832064322603966, -0.6709491429253489, -11.141905573158631, 60.46674715056331, -136.19230235211526, -518.3398831872638, -1819.0602654117067, -153.3638734522993, 320.1156107033245, -136.19230235211526, -24340589.1162, 1.37646, SYM +30, 201402, 2014 - 01 - 24 16:15:00, -19.0, GEE1, PUT, STRAT, 137.5, 140.345, 0.05414565513055749, 0.10463818541333941, 0.14529132959784974, -0.11936326135136956, 0.08106840033227831, 9.046889913827847, -2.3403474666535415, 2.2679019656760215, -1.5402996063132879, -171.8909083627291, 44.46660186641729, -3799.766367226869, 4381.11665891606, -4176.021148871998, -2366.009597249621, 235.40961078864902, -3799.766367226869, -3670406.2953000003, 1.37646, SYM +31, 201401, 2013 - 12 - 20 16:15:00, -64.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.08897789701177691, 0.008813407863715872, 
0.009425028910330369, -0.021620830082859088, 0.04411750741642045, 0.6814450839280415, -3.43983227630679, 1.3837331253029816, -2.8235204746509086, -43.612485371394655, 220.14926568363455, -830.2832188104537, 2673.0856705932674, -7655.056956508147, -600.3084161430988, 1165.48714708806, -830.2832188104537, -12363473.8368, 1.37646, SYM +32, 201402, 2014 - 01 - 24 16:15:00, -45.0, GEE1, PUT, STRAT, 138.0, 140.345, 0.052853182910226726, 0.10463818541333941, 0.20369081242765574, -0.16004607860136968, 0.10141337819029916, 11.047155968410756, -2.789510903380204, 7.2020735370616356, -4.563602018563462, -497.122018578484, 125.52799065210918, -12616.751505337697, 13912.913710339246, -12372.721817523941, -6842.685736925402, 664.5548385115469, -12616.751505337697, -8693067.5415, 1.37646, SYM +33, 201401, 2013 - 12 - 20 16:15:00, -51.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.08383267513417192, 0.008813407863715872, 0.020991265826436845, -0.045956251827941025, 0.08727871921287762, 1.2701629715541363, -6.0408289559434, 2.3437688432249923, -4.451214679856759, -64.77831154926095, 308.08227675311343, -1473.5755257323203, 4527.675745737374, -12068.020120931302, -891.6475471509574, 1631.011271767656, -1473.5755257323203, -9852143.2137, 1.37646, SYM +34, 201402, 2014 - 01 - 24 16:15:00, -40.0, GEE1, PUT, STRAT, 138.5, 140.345, 0.051599724402617266, 0.10463818541333941, 0.28321473137770425, -0.21146513081873966, 0.12351912253075312, 13.136076509490826, -3.2382097361444653, 8.458605232749587, -4.940764901230125, -525.443060379633, 129.5283894457786, -15593.349966086193, 16340.27257670611, -13395.276240137457, -7232.5135489014965, 685.733257448217, -15593.349966086193, -7727171.148, 1.37646, SYM +35, 201401, 2013 - 12 - 20 16:15:00, -98.0, GEE1, PUT, STRAT, 139.0, 140.345, 0.0791166184474159, 0.008813407863715872, 0.047581495319667155, -0.0967612790459439, 0.16434769983129724, 2.257195133461944, -10.131173555213623, 9.4826053465025, -16.10607458346713, -221.20512307927052, 992.855008410935, -6418.414454675487, 18318.42861034117, -43666.38010565609, -3044.8000371369267, 5256.250787989675, -6418.414454675487, -18931569.312599998, 1.37646, SYM +36, 201401, 2013 - 12 - 20 16:15:00, -111.0, GEE1, PUT, STRAT, 139.5, 140.345, 0.07513349054261133, 0.008813407863715872, 0.10733307441031315, -0.1949726645282245, 0.27848967340302655, 3.6322892048663644, -15.482297001088007, 21.64196576263292, -30.912353747735946, -403.18410174016645, 1718.5349671207687, -16399.10487991298, 41807.79335675523, -83808.78790259302, -5549.667886812695, 9098.05631093482, -16399.10487991298, -21442899.9357, 1.37646, SYM +37, 201401, 2013 - 12 - 20 16:15:00, -108.0, GEE1, PUT, STRAT, 140.0, 140.345, 0.07231398622706062, 0.008813407863715872, 0.2318116692147143, -0.357200149447554, 0.391592427081917, 4.915801583071703, -20.16670499598669, 38.577616140335834, -42.29198212484704, -530.9065709717439, 2178.0041395665626, -34460.58494238685, 74523.96059955555, -114660.94716715509, -7307.7165867976655, 11530.52145364535, -34460.58494238685, -20863362.0996, 1.37646, SYM +38, 201401, 2013 - 12 - 20 16:15:00, -83.0, GEE1, PUT, STRAT, 141.0, 140.345, 0.07172143045750252, 0.008813407863715872, 0.7922715181315709, -0.7543151841866509, 0.333159035321538, 4.147995696473539, -16.876460506586433, 62.608160287492026, -27.652199931687655, -344.28364280730375, 1400.746222046674, -90513.99446933273, 120945.99245071695, -74969.94172708844, -4738.926629785414, 7415.658249224481, -15682.746569332587, -16033880.132100001, 1.37646, SYM +39, 201401, 2013 - 12 - 20 16:15:00, 
-56.0, GEE1, PUT, STRAT, 141.5, 140.345, 0.0739452718231504, 0.008813407863715872, 1.212080035129219, -0.88042603375236, 0.20479158314197934, 2.628816497069195, -11.026098543797652, 49.30385789013216, -11.468328655950843, -147.21372383587493, 617.4615184526685, -93429.26236862202, 95244.83704343032, -31092.641206404707, -2026.3380231112837, 3268.888775728308, -4399.8295686219335, -10818039.607199999, 1.37646, SYM"""
+    csv = StringIO(csv)
+    df = read_csv(csv).set_index(['index'])
+    for _ in range(100):
+        generic_version_store.write('pandas', df)
+    result = generic_version_store.read('pandas').data
+    assert len(result) == len(df)
+    assert np.all(df.values == result.values)
+    assert np.all(df.columns.values == result.columns.values)
+
+
+def xtest_append_after_truncate_after_append(generic_version_store):
+    columns = ['MAIN_UPPER', 'MAIN_LOWER', 'AUX_UPPER', 'AUX_LOWER', 'TARGET_HEDGE_POSITION']
+    empty_df = DataFrame(columns=columns, dtype=np.float64)
+    generic_version_store.write('sym', empty_df)
+    full_df = DataFrame(data=[np.zeros(5)], columns=columns)
+    generic_version_store.write('sym', full_df)
+    generic_version_store.write('sym', empty_df)
+    full_df = DataFrame(data=[np.zeros(5)], columns=columns)
+    generic_version_store.write('sym', full_df)
+    assert len(generic_version_store.read('sym', 1).data) == 0
+    assert len(generic_version_store.read('sym', 2).data) == 1
+    assert len(generic_version_store.read('sym', 3).data) == 0
+    assert len(generic_version_store.read('sym', 4).data) == 1
+
+
+def test_can_write_pandas_df_with_object_columns(generic_version_store):
+    expected = DataFrame(data=dict(A=['a', 'b', None, 'c'], B=[1., 2., 3., 4.]), index=range(4))
+    generic_version_store.write('objects', expected)
+    saved_df = generic_version_store.read('objects').data
+
+    assert_frame_equal(saved_df, expected)
+
+
+def panel(i1, i2, i3):
+    return Panel(np.random.randn(i1, i2, i3), range(i1), ['A%d' % i for i in range(i2)],
+                 list(rrule(DAILY, count=i3, dtstart=dt(1970, 1, 1), interval=1)))
+
+
+@pytest.mark.xfail(pd.__version__ >= '0.18.0', reason="see issue #115")
+@pytest.mark.parametrize("df_size", list(itertools.combinations_with_replacement([1, 2, 4], r=3)))
+def test_panel_save_read(generic_version_store, df_size):
+    '''Note - empties are not tested here as they don't work!'''
+    pn = panel(*df_size)
+    generic_version_store.write('pandas', pn)
+    result = generic_version_store.read('pandas').data
+    assert np.all(pn.values == result.values), str(pn.values) + "!=" + str(result.values)
+    for i in range(3):
+        assert np.all(pn.axes[i] == result.axes[i])
+        if None not in pn.axes[i].names:
+            assert np.all(pn.axes[i].names == result.axes[i].names), \
+                str(pn.axes[i].names) + "!=" + str(result.axes[i].names)
+
+
+@pytest.mark.xfail(pd.__version__ >= '0.20.0', reason='Panel is deprecated')
+def test_panel_save_read_with_nans(generic_version_store):
+    '''Ensure that nan rows are not dropped when calling to_frame.'''
+    df1 = DataFrame(data=np.arange(4).reshape((2, 2)), index=['r1', 'r2'], columns=['c1', 'c2'])
+    df2 = DataFrame(data=np.arange(6).reshape((3, 2)), index=['r1', 'r2', 'r3'], columns=['c1', 'c2'])
+    p_in = Panel(data=dict(i1=df1, i2=df2))
+
+    generic_version_store.write('pandas', p_in)
+    p_out = generic_version_store.read('pandas').data
+
+    assert p_in.shape == p_out.shape
+    # check_names is False because pandas helpfully names the axes for us.
+    assert_frame_equal(p_in.iloc[0], p_out.iloc[0], check_names=False)
+    assert_frame_equal(p_in.iloc[1], p_out.iloc[1], check_names=False)
+
+
+def test_save_read_ints(generic_version_store):
+    ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)],
+                    data={'col1':np.arange(5), 'col2':np.arange(5)})
+    ts1.index.name = 'index'
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+def test_save_read_datetimes(generic_version_store):
+    # FEF symbols have datetimes in the CLOSE_REVISION field. Handle specially.
+    ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(3)],
+                    data={'field1': [1, 2, 3],
+                          'revision': [dt(2013, 1, 1), dt(2013, 1, 2), dt(2013, 1, 3)],
+                          'field2': [4, 5, 6]},
+                    )
+    ts1.index.name = 'index'
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+def test_labels(generic_version_store):
+    ts1 = DataFrame(index=[dt(2012, 1, 1), dt(2012, 1, 2)],
+                    data={'data': [1., 2.]})
+    ts1.index.name = 'some_index'
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+def test_duplicate_labels(generic_version_store):
+    ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)],
+                    data=[[np.arange(5), np.arange(5, 10)]],
+                    columns=['a', 'a']
+                    )
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+def test_no_labels(generic_version_store):
+    ts1 = DataFrame(index=[dt(2012, 1, 1) + dtd(hours=x) for x in range(5)],
+                    data=[[np.arange(5), np.arange(5, 10)]])
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+@pytest.mark.xfail(reason='needs investigating')
+def test_no_index_labels(generic_version_store):
+    ts1 = DataFrame(index=[dt(2012, 1, 1), dt(2012, 1, 2)],
+                    data={'data': [1., 2.]})
+    generic_version_store.write('TEST_1', ts1)
+    ts2 = generic_version_store.read('TEST_1').data
+    assert_frame_equal(ts1, ts2)
+
+
+def test_not_unique(generic_version_store):
+    d = dt.now()
+    ts = DataFrame(index=[d, d], data={'near': [1., 2.]})
+    ts.index.name = 'index'
+    generic_version_store.write('ts', ts)
+    ts2 = generic_version_store.read('ts').data
+    assert_frame_equal(ts, ts2)
+
+
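+# The date-range tests below patch `decompress` in arctic.pluggable._kv_ndarray_store with a
+# counting Mock: reading a narrow DateRange is expected to decompress fewer segments than a
+# full read, so the call-count comparison is (presumably) a proxy for "only the relevant
+# chunks were fetched from the key-value backend".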
+def test_daterange_end(generic_version_store):
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=30 * 1024),
+                   data=np.tile(np.arange(30 * 1024), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    generic_version_store.write('MYARR', df)
+    mdecompressALL = Mock(side_effect=decompress)
+    with patch('arctic.pluggable._kv_ndarray_store.decompress', mdecompressALL):
+        generic_version_store.read('MYARR').data
+    mdecompressLR = Mock(side_effect=decompress)
+    with patch('arctic.pluggable._kv_ndarray_store.decompress', mdecompressLR):
+        result = generic_version_store.read('MYARR', date_range=DateRange(df.index[-1], df.index[-1])).data
+    assert len(result) == 1
+    assert mdecompressLR.call_count < mdecompressALL.call_count
+
+
+def test_daterange_start(generic_version_store):
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=30 * 1024),
+                   data=np.tile(np.arange(30 * 1024), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    generic_version_store.write('MYARR', df)
+    mdecompressALL = Mock(side_effect=decompress)
+    with patch('arctic.pluggable._kv_ndarray_store.decompress', mdecompressALL):
+        generic_version_store.read('MYARR').data
+    mdecompressLR = Mock(side_effect=decompress)
+    with patch('arctic.pluggable._kv_ndarray_store.decompress', mdecompressLR):
+        result = generic_version_store.read('MYARR', date_range=DateRange(end=df.index[0])).data
+    assert len(result) == 1
+    assert mdecompressLR.call_count < mdecompressALL.call_count
+    end = df.index[0] + dtd(milliseconds=1)
+    result = generic_version_store.read('MYARR', date_range=DateRange(end=end)).data
+    assert len(result) == 1
+
+
+def xtest_daterange_with_zero_index(generic_version_store):
+    # This test results in an index whose first element is indexed as 0 and
+    # the segment count is different to the number of rows that will be returned
+    row_count = 1
+    # a single-element date range gives a first element index of 0
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=row_count),
+                   data=np.tile(np.arange(row_count), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    generic_version_store.write('MYARR', df)
+    # this append increases the segment count
+    generic_version_store.append('MYARR', df)
+    # request a date range that won't return any values
+    result = generic_version_store.read('MYARR', date_range=DateRange(end=dt(2000, 1, 1))).data
+    assert len(result) == 0
+
+
+def test_daterange_large_DataFrame(generic_version_store):
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=30 * 1024),
+                   data=np.tile(np.arange(30 * 1024), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    generic_version_store.write('MYARR', df)
+    # assert saved
+    saved_arr = generic_version_store.read('MYARR').data
+    assert_frame_equal(df, saved_arr, check_names=False)
+    # first 100
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[0], df.index[100])).data
+    assert_frame_equal(df[df.index[0]:df.index[100]], result, check_names=False)
+    # second 100
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[100], df.index[200])).data
+    assert_frame_equal(df[df.index[100]:df.index[200]], result, check_names=False)
+    # first row
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[0], df.index[0])).data
+    assert_frame_equal(df[df.index[0]:df.index[0]], result, check_names=False)
+    # last 100
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[-100])).data
+    assert_frame_equal(df[df.index[-100]:], result, check_names=False)
+    # last 200-100
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[-200], df.index[-100])).data
+    assert_frame_equal(df[df.index[-200]:df.index[-100]], result, check_names=False)
+    # last row
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[-1], df.index[-1])).data
+    assert_frame_equal(df[df.index[-1]:df.index[-1]], result, check_names=False)
+    # beyond last row
+    result = generic_version_store.read('MYARR', date_range=DateRange(df.index[-1], df.index[-1] + dtd(days=1))).data
+    assert_frame_equal(df[df.index[-1]:df.index[-1]], result, check_names=False)
+    # somewhere in time
+    result = generic_version_store.read('MYARR', date_range=DateRange(dt(2020, 1, 1), dt(2031, 9, 1))).data
+    assert_frame_equal(df[dt(2020, 1, 1):dt(2031, 9, 1)], result, check_names=False)
+
+
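+# The slices above deliberately hit the first row, the last row, and ranges beyond the stored
+# index; the _middle variant below sweeps interior start/end pairs instead, which should
+# exercise range handling away from the ends of the index.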
+def test_daterange_large_DataFrame_middle(generic_version_store):
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=30 * 1024),
+                   data=np.tile(np.arange(30 * 1024), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    generic_version_store.write('MYARR', df)
+    # middle
+    start = 100
+    for end in np.arange(200, 30000, 1000):
+        result = generic_version_store.read('MYARR', date_range=DateRange(df.index[start], df.index[end])).data
+        assert_frame_equal(df[df.index[start]:df.index[end]], result, check_names=False)
+    # middle following
+    for start in np.arange(200, 30000, 1000):
+        for offset in (100, 300, 500):
+            end = start + offset
+            result = generic_version_store.read('MYARR', date_range=DateRange(df.index[start], df.index[end])).data
+            assert_frame_equal(df[df.index[start]:df.index[end]], result, check_names=False)
+
+
+@pytest.mark.parametrize("df,assert_equal", [
+    (DataFrame(index=date_range(dt(2001, 1, 1), freq='D', periods=30000),
+               data=list(range(30000)), columns=['A']), assert_frame_equal),
+    (Series(index=date_range(dt(2001, 1, 1), freq='D', periods=30000),
+            data=range(30000)), assert_series_equal),
+])
+def test_daterange(generic_version_store, df, assert_equal):
+    df.index.name = 'idx'
+    df.name = 'FOO'
+    generic_version_store.write('MYARR', df)
+    # whole array
+    saved_arr = generic_version_store.read('MYARR').data
+    assert_equal(df, saved_arr)
+    assert_equal(df, generic_version_store.read('MYARR', date_range=DateRange(df.index[0])).data)
+    assert_equal(df, generic_version_store.read('MYARR', date_range=DateRange(df.index[0], df.index[-1])).data)
+    assert_equal(df, generic_version_store.read('MYARR', date_range=DateRange()).data)
+    assert_equal(df[df.index[10]:], generic_version_store.read('MYARR', date_range=DateRange(df.index[10])).data)
+    assert_equal(df[:df.index[10]], generic_version_store.read('MYARR', date_range=DateRange(end=df.index[10])).data)
+    assert_equal(df[df.index[-1]:], generic_version_store.read('MYARR', date_range=DateRange(df.index[-1])).data)
+    assert_equal(df[df.index[-1]:], generic_version_store.read('MYARR', date_range=DateRange(df.index[-1], df.index[-1])).data)
+    assert_equal(df[df.index[0]:df.index[0]], generic_version_store.read('MYARR', date_range=DateRange(df.index[0], df.index[0])).data)
+    assert_equal(df[:df.index[0]], generic_version_store.read('MYARR', date_range=DateRange(end=df.index[0])).data)
+    assert_equal(df[df.index[0] - DateOffset(days=1):],
+                 generic_version_store.read('MYARR', date_range=DateRange(df.index[0] - DateOffset(days=1))).data)
+    assert_equal(df[df.index[-1] + DateOffset(days=1):],
+                 generic_version_store.read('MYARR', date_range=DateRange(df.index[-1] + DateOffset(days=1))).data)
+    assert len(generic_version_store.read('MYARR', date_range=DateRange(dt(1950, 1, 1), dt(1951, 1, 1))).data) == 0
+    assert len(generic_version_store.read('MYARR', date_range=DateRange(dt(2091, 1, 1), dt(2091, 1, 1))).data) == 0
+
+
+def xtest_daterange_append(generic_version_store):
+    df = DataFrame(index=date_range(dt(2001, 1, 1), freq='S', periods=30 * 1024),
+                   data=np.tile(np.arange(30 * 1024), 100).reshape((-1, 100)))
+    df.columns = [str(c) for c in df.columns]
+    df.index.name = 'idx'
+    generic_version_store.write('MYARR', df)
+    # assert saved
+    saved_arr = generic_version_store.read('MYARR').data
+    assert_frame_equal(df, saved_arr, check_names=False)
+    # append two more rows
+    rows = df.iloc[-2:].copy()
+    rows.index = rows.index + dtd(days=1)
+    generic_version_store.append('MYARR', rows)
+    # assert we can read the appended rows back out
+    assert_frame_equal(rows, generic_version_store.read('MYARR', date_range=DateRange(rows.index[0])).data)
+    # assert we can read back the first array
+    assert_frame_equal(df, generic_version_store.read('MYARR', date_range=DateRange(df.index[0], df.index[-1])).data)
+    # append two more rows
+    rows1 = df.iloc[-2:].copy()
+    rows1.index = rows1.index + dtd(days=2)
+    generic_version_store.append('MYARR', rows1)
+    # assert we can read a mix of data
+    assert_frame_equal(rows1, generic_version_store.read('MYARR', date_range=DateRange(rows1.index[0])).data)
+    assert_frame_equal(concat((df, rows, rows1)), generic_version_store.read('MYARR').data)
+    assert_frame_equal(concat((rows, rows1)), generic_version_store.read('MYARR', date_range=DateRange(start=rows.index[0])).data)
+    assert_frame_equal(concat((df, rows, rows1))[df.index[50]:rows1.index[-2]],
+                       generic_version_store.read('MYARR', date_range=DateRange(start=df.index[50], end=rows1.index[-2])).data)
+
+
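+# assert_range_slice (below) picks assert_series_equal or assert_frame_equal based on the type
+# of the expected value, so the chunk-index tests that follow can share a single helper for
+# both Series and DataFrame symbols.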
+def assert_range_slice(generic_version_store, expected, date_range, **kwargs):
+    assert_equals = assert_series_equal if isinstance(expected, Series) else assert_frame_equal
+    assert_equals(expected, generic_version_store.read('MYARR', date_range=date_range).data, **kwargs)
+
+
+def test_daterange_single_chunk(generic_version_store):
+    df = read_csv(StringIO("""2015-08-10 00:00:00,200005,1.0
+                              2015-08-10 00:00:00,200012,2.0
+                              2015-08-10 00:00:00,200016,3.0
+                              2015-08-11 00:00:00,200005,1.0
+                              2015-08-11 00:00:00,200012,2.0
+                              2015-08-11 00:00:00,200016,3.0"""), parse_dates=[0],
+                  names=['date', 'security_id', 'value']).set_index(['date', 'security_id'])
+    generic_version_store.write('MYARR', df)
+    assert_range_slice(generic_version_store, df[dt(2015, 8, 11):], DateRange(dt(2015, 8, 11), dt(2015, 8, 11)))
+
+
+def test_daterange_when_end_beyond_chunk_index(generic_version_store):
+    df = read_csv(StringIO("""2015-08-10 00:00:00,200005,1.0
+                              2015-08-10 00:00:00,200012,2.0
+                              2015-08-10 00:00:00,200016,3.0
+                              2015-08-11 00:00:00,200005,1.0
+                              2015-08-11 00:00:00,200012,2.0
+                              2015-08-11 00:00:00,200016,3.0"""), parse_dates=[0],
+                  names=['date', 'security_id', 'value']).set_index(['date', 'security_id'])
+    generic_version_store.write('MYARR', df)
+    assert_range_slice(generic_version_store, df[dt(2015, 8, 11):], DateRange(dt(2015, 8, 11), dt(2015, 8, 12)))
+
+
+def test_daterange_when_end_beyond_chunk_index_no_start(generic_version_store):
+    df = read_csv(StringIO("""2015-08-10 00:00:00,200005,1.0
+                              2015-08-10 00:00:00,200012,2.0
+                              2015-08-10 00:00:00,200016,3.0
+                              2015-08-11 00:00:00,200005,1.0
+                              2015-08-11 00:00:00,200012,2.0
+                              2015-08-11 00:00:00,200016,3.0"""), parse_dates=[0],
+                  names=['date', 'security_id', 'value']).set_index(['date', 'security_id'])
+    generic_version_store.write('MYARR', df)
+    assert_range_slice(generic_version_store, df, DateRange(end=dt(2015, 8, 12)))
+
+
+def test_daterange_fails_with_timezone_start(generic_version_store):
+    df = read_csv(StringIO("""2015-08-10 00:00:00,200005,1.0
+                              2015-08-11 00:00:00,200016,3.0"""), parse_dates=[0],
+                  names=['date', 'security_id', 'value']).set_index(['date', 'security_id'])
+    generic_version_store.write('MYARR', df)
+    with pytest.raises(ValueError):
+        generic_version_store.read('MYARR', date_range=DateRange(start=dt(2015, 1, 1, tzinfo=mktz())))
+
+
+def test_data_info_series(generic_version_store):
+    s = Series(data=[1, 2, 3], index=[4, 5, 6])
+    generic_version_store.write('pandas', s)
+    md = generic_version_store.get_info('pandas')
+    assert md == {'dtype': [('index', ' 16 * 1024 * 1024
+    generic_version_store.write('MYARR', ndarr)
+    saved_arr = generic_version_store.read('MYARR').data
+    assert np.all(ndarr == saved_arr)
+
+
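+# test_mutable_ndarray checks that a round-tripped array is writeable; presumably this guards
+# against the store handing back read-only views (e.g. np.frombuffer on a decompressed blob)
+# instead of a copy the caller can mutate.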
+def test_mutable_ndarray(generic_version_store):
+    dtype = np.dtype([('abc', 'int64')])
+    ndarr = np.arange(32).view(dtype=dtype)
+    ndarr.setflags(write=True)
+    generic_version_store.write('MYARR', ndarr)
+    saved_arr = generic_version_store.read('MYARR').data
+    assert saved_arr.flags['WRITEABLE']
+
+
+@pytest.mark.xfail(reason="delete_version not safe with append...")
+def xtest_delete_version_shouldnt_break_read(library):
+    data = np.arange(30)
+    yesterday = dt.utcnow() - dtd(days=1, seconds=1)
+    _id = bson.ObjectId.from_datetime(yesterday)
+    with patch("bson.ObjectId", return_value=_id):
+        library.write('symbol', data, prune_previous_version=False)
+
+    # Re-Write the data again
+    library.write('symbol', data, prune_previous_version=False)
+    library._delete_version('symbol', 1)
+    assert repr(library.read('symbol').data) == repr(data)
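+
+
+# Illustrative sketch (not part of the suite): the write/append/read round-trip these tests
+# exercise against the pluggable store. `generic_version_store` is a fixture defined elsewhere
+# (e.g. in conftest); only calls that already appear above are used here.
+#
+#     df = DataFrame({'a': [1., 2., 3.]}, index=date_range(dt(2001, 1, 1), periods=3))
+#     generic_version_store.write('example', df)                  # creates a new version
+#     generic_version_store.append('example', df)                 # extends the same symbol
+#     latest = generic_version_store.read('example').data         # most recent version
+#     head = generic_version_store.read('example', date_range=DateRange(end=df.index[0])).data
+#     info = generic_version_store.get_info('example')            # dtype/metadata summary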