From 0db47cfdb021b67c4ef4d63f186289dd26528fc9 Mon Sep 17 00:00:00 2001 From: Georgy Moiseev Date: Tue, 6 Sep 2022 13:36:49 +0300 Subject: [PATCH] msgpack: support datetime extended type Tarantool supports datetime type since version 2.10.0 [1]. This patch introduces support for the Tarantool datetime type in msgpack decoders and encoders. Tarantool datetime objects are decoded to `tarantool.Datetime` type. `tarantool.Datetime` may be encoded to Tarantool datetime objects. `tarantool.Datetime` stores data in a `pandas.Timestamp` [2] object. You can create `tarantool.Datetime` objects either from msgpack data or by using the same API as in Tarantool: ``` dt1 = tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7, sec=54, nsec=308543321) dt2 = tarantool.Datetime(timestamp=1661969274) dt3 = tarantool.Datetime(timestamp=1661969274, nsec=308543321) ``` `tarantool.Datetime` exposes `year`, `month`, `day`, `hour`, `minute`, `sec`, `nsec` and `timestamp` properties if you need to convert `tarantool.Datetime` to any other kind of datetime object: ``` pdt = pandas.Timestamp(year=dt.year, month=dt.month, day=dt.day, hour=dt.hour, minute=dt.minute, second=dt.sec, microsecond=(dt.nsec // 1000), nanosecond=(dt.nsec % 1000)) ``` `pandas.Timestamp` was chosen to store data because it can hold both nanoseconds and timezone information. The built-in Python `datetime.datetime` supports microseconds at most, and `numpy.datetime64` does not support timezones. Tarantool datetime interval type is planned to be stored in a custom type `tarantool.Interval` and we'll need a way to support arithmetic between datetime and interval. This is the main reason we use a custom class instead of plain `pandas.Timestamp`. It is also hard to implement Tarantool-compatible timezones with full conversion support without custom classes. This patch does not yet introduce support for timezones in datetime. 1. https://github.com/tarantool/tarantool/issues/5941 2. 
https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html Part of #204 --- CHANGELOG.md | 29 ++++ requirements.txt | 1 + tarantool/__init__.py | 6 +- tarantool/msgpack_ext/datetime.py | 9 + tarantool/msgpack_ext/packer.py | 8 +- tarantool/msgpack_ext/types/datetime.py | 192 +++++++++++++++++++++ tarantool/msgpack_ext/unpacker.py | 6 +- test/suites/__init__.py | 3 +- test/suites/lib/skip.py | 11 ++ test/suites/test_datetime.py | 218 ++++++++++++++++++++++++ 10 files changed, 477 insertions(+), 6 deletions(-) create mode 100644 tarantool/msgpack_ext/datetime.py create mode 100644 tarantool/msgpack_ext/types/datetime.py create mode 100644 test/suites/test_datetime.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 226909d8..787466e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,35 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Decimal type support (#203). - UUID type support (#202). +- Datetime type support and tarantool.Datetime type (#204). + + Tarantool datetime objects are decoded to `tarantool.Datetime` + type. `tarantool.Datetime` may be encoded to Tarantool datetime + objects. + + You can create `tarantool.Datetime` objects either from msgpack + data or by using the same API as in Tarantool: + + ```python + dt1 = tarantool.Datetime(year=2022, month=8, day=31, + hour=18, minute=7, sec=54, + nsec=308543321) + + dt2 = tarantool.Datetime(timestamp=1661969274) + + dt3 = tarantool.Datetime(timestamp=1661969274, nsec=308543321) + ``` + + `tarantool.Datetime` exposes `year`, `month`, `day`, `hour`, + `minute`, `sec`, `nsec` and `timestamp` properties if you need + to convert `tarantool.Datetime` to any other kind of datetime object: + + ```python + pdt = pandas.Timestamp(year=dt.year, month=dt.month, day=dt.day, + hour=dt.hour, minute=dt.minute, second=dt.sec, + microsecond=(dt.nsec // 1000), + nanosecond=(dt.nsec % 1000)) + ``` ### Changed - Bump msgpack requirement to 1.0.4 (PR #223). 
diff --git a/requirements.txt b/requirements.txt index 46dff380..cdf505c7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ msgpack>=1.0.4 +pandas diff --git a/tarantool/__init__.py b/tarantool/__init__.py index 3d4a19a8..6625b4eb 100644 --- a/tarantool/__init__.py +++ b/tarantool/__init__.py @@ -32,6 +32,10 @@ ENCODING_DEFAULT, ) +from tarantool.msgpack_ext.types.datetime import ( + Datetime, +) + __version__ = "0.9.0" @@ -91,7 +95,7 @@ def connectmesh(addrs=({'host': 'localhost', 'port': 3301},), user=None, __all__ = ['connect', 'Connection', 'connectmesh', 'MeshConnection', 'Schema', 'Error', 'DatabaseError', 'NetworkError', 'NetworkWarning', - 'SchemaError', 'dbapi'] + 'SchemaError', 'dbapi', 'Datetime'] # ConnectionPool is supported only for Python 3.7 or newer. if sys.version_info.major >= 3 and sys.version_info.minor >= 7: diff --git a/tarantool/msgpack_ext/datetime.py b/tarantool/msgpack_ext/datetime.py new file mode 100644 index 00000000..70f56dc9 --- /dev/null +++ b/tarantool/msgpack_ext/datetime.py @@ -0,0 +1,9 @@ +from tarantool.msgpack_ext.types.datetime import Datetime + +EXT_ID = 4 + +def encode(obj): + return obj.msgpack_encode() + +def decode(data): + return Datetime(data) diff --git a/tarantool/msgpack_ext/packer.py b/tarantool/msgpack_ext/packer.py index e8dd74db..bff2b821 100644 --- a/tarantool/msgpack_ext/packer.py +++ b/tarantool/msgpack_ext/packer.py @@ -2,12 +2,16 @@ from uuid import UUID from msgpack import ExtType +from tarantool.msgpack_ext.types.datetime import Datetime + import tarantool.msgpack_ext.decimal as ext_decimal import tarantool.msgpack_ext.uuid as ext_uuid +import tarantool.msgpack_ext.datetime as ext_datetime encoders = [ - {'type': Decimal, 'ext': ext_decimal}, - {'type': UUID, 'ext': ext_uuid }, + {'type': Decimal, 'ext': ext_decimal }, + {'type': UUID, 'ext': ext_uuid }, + {'type': Datetime, 'ext': ext_datetime}, ] def default(obj): diff --git a/tarantool/msgpack_ext/types/datetime.py 
b/tarantool/msgpack_ext/types/datetime.py new file mode 100644 index 00000000..4a4f68d7 --- /dev/null +++ b/tarantool/msgpack_ext/types/datetime.py @@ -0,0 +1,192 @@ +from copy import deepcopy + +import pandas + +# https://www.tarantool.io/en/doc/latest/dev_guide/internals/msgpack_extensions/#the-datetime-type +# +# The datetime MessagePack representation looks like this: +# +---------+----------------+==========+-----------------+ +# | MP_EXT | MP_DATETIME | seconds | nsec; tzoffset; | +# | = d7/d8 | = 4 | | tzindex; | +# +---------+----------------+==========+-----------------+ +# MessagePack data contains: +# +# * Seconds (8 bytes) as an unencoded 64-bit signed integer stored in the +# little-endian order. +# * The optional fields (8 bytes), if any of them have a non-zero value. +# The fields include nsec (4 bytes), tzoffset (2 bytes), and +# tzindex (2 bytes) packed in the little-endian order. +# +# seconds is seconds since Epoch, where the epoch is the point where the time +# starts, and is platform dependent. For Unix, the epoch is January 1, +# 1970, 00:00:00 (UTC). Internally, Tarantool uses a double type, see the structure +# definition in src/lib/core/datetime.h and reasons in +# https://github.com/tarantool/tarantool/wiki/Datetime-internals#intervals-in-c +# +# nsec is nanoseconds, fractional part of seconds. Tarantool uses int32_t, see +# a definition in src/lib/core/datetime.h. +# +# tzoffset is timezone offset in minutes from UTC. Tarantool uses an int16_t type, +# see a structure definition in src/lib/core/datetime.h. +# +# tzindex is Olson timezone id. Tarantool uses an int16_t type, see a structure +# definition in src/lib/core/datetime.h. If both tzoffset and tzindex are +# specified, tzindex takes precedence and the tzoffset value is ignored. 
+ +SECONDS_SIZE_BYTES = 8 +NSEC_SIZE_BYTES = 4 +TZOFFSET_SIZE_BYTES = 2 +TZINDEX_SIZE_BYTES = 2 + +BYTEORDER = 'little' + +NSEC_IN_SEC = 1000000000 +NSEC_IN_MKSEC = 1000 + +def get_bytes_as_int(data, cursor, size): + part = data[cursor:cursor + size] + return int.from_bytes(part, BYTEORDER, signed=True), cursor + size + +def get_int_as_bytes(data, size): + return data.to_bytes(size, byteorder=BYTEORDER, signed=True) + +def msgpack_decode(data): + cursor = 0 + seconds, cursor = get_bytes_as_int(data, cursor, SECONDS_SIZE_BYTES) + + data_len = len(data) + if data_len == (SECONDS_SIZE_BYTES + NSEC_SIZE_BYTES + \ + TZOFFSET_SIZE_BYTES + TZINDEX_SIZE_BYTES): + nsec, cursor = get_bytes_as_int(data, cursor, NSEC_SIZE_BYTES) + tzoffset, cursor = get_bytes_as_int(data, cursor, TZOFFSET_SIZE_BYTES) + tzindex, cursor = get_bytes_as_int(data, cursor, TZINDEX_SIZE_BYTES) + elif data_len == SECONDS_SIZE_BYTES: + nsec = 0 + tzoffset = 0 + tzindex = 0 + else: + raise MsgpackError(f'Unexpected datetime payload length {data_len}') + + if (tzoffset != 0) or (tzindex != 0): + raise NotImplementedError + + total_nsec = seconds * NSEC_IN_SEC + nsec + + return pandas.to_datetime(total_nsec, unit='ns') + +class Datetime(): + def __init__(self, data=None, *, timestamp=None, year=None, month=None, + day=None, hour=None, minute=None, sec=None, nsec=None): + if data is not None: + if not isinstance(data, bytes): + raise ValueError('data argument (first positional argument) ' + + 'expected to be a "bytes" instance') + + self._datetime = msgpack_decode(data) + return + + # The logic is same as in Tarantool, refer to datetime API. 
+ # https://www.tarantool.io/en/doc/latest/reference/reference_lua/datetime/new/ + if timestamp is not None: + if ((year is not None) or (month is not None) or \ + (day is not None) or (hour is not None) or \ + (minute is not None) or (sec is not None)): + raise ValueError('Cannot provide both timestamp and year, month, ' + + 'day, hour, minute, sec') + + if nsec is not None: + if not isinstance(timestamp, int): + raise ValueError('timestamp must be int if nsec provided') + + total_nsec = timestamp * NSEC_IN_SEC + nsec + self._datetime = pandas.to_datetime(total_nsec, unit='ns') + else: + self._datetime = pandas.to_datetime(timestamp, unit='s') + else: + if nsec is not None: + microsecond = nsec // NSEC_IN_MKSEC + nanosecond = nsec % NSEC_IN_MKSEC + else: + microsecond = 0 + nanosecond = 0 + + self._datetime = pandas.Timestamp(year=year, month=month, day=day, + hour=hour, minute=minute, second=sec, + microsecond=microsecond, + nanosecond=nanosecond) + + def __eq__(self, other): + if isinstance(other, Datetime): + return self._datetime == other._datetime + elif isinstance(other, pandas.Timestamp): + return self._datetime == other + else: + return False + + def __str__(self): + return self._datetime.__str__() + + def __repr__(self): + return f'datetime: {self._datetime.__repr__()}' + + def __copy__(self): + cls = self.__class__ + result = cls.__new__(cls) + result.__dict__.update(self.__dict__) + return result + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + setattr(result, k, deepcopy(v, memo)) + return result + + @property + def year(self): + return self._datetime.year + + @property + def month(self): + return self._datetime.month + + @property + def day(self): + return self._datetime.day + + @property + def hour(self): + return self._datetime.hour + + @property + def minute(self): + return self._datetime.minute + + @property + def sec(self): + return 
self._datetime.second + + @property + def nsec(self): + # microseconds + nanoseconds + return self._datetime.value % NSEC_IN_SEC + + @property + def timestamp(self): + return self._datetime.timestamp() + + def msgpack_encode(self): + seconds = self._datetime.value // NSEC_IN_SEC + nsec = self.nsec + tzoffset = 0 + tzindex = 0 + + buf = get_int_as_bytes(seconds, SECONDS_SIZE_BYTES) + + if (nsec != 0) or (tzoffset != 0) or (tzindex != 0): + buf = buf + get_int_as_bytes(nsec, NSEC_SIZE_BYTES) + buf = buf + get_int_as_bytes(tzoffset, TZOFFSET_SIZE_BYTES) + buf = buf + get_int_as_bytes(tzindex, TZINDEX_SIZE_BYTES) + + return buf diff --git a/tarantool/msgpack_ext/unpacker.py b/tarantool/msgpack_ext/unpacker.py index 44bfdb63..b303e18d 100644 --- a/tarantool/msgpack_ext/unpacker.py +++ b/tarantool/msgpack_ext/unpacker.py @@ -1,9 +1,11 @@ import tarantool.msgpack_ext.decimal as ext_decimal import tarantool.msgpack_ext.uuid as ext_uuid +import tarantool.msgpack_ext.datetime as ext_datetime decoders = { - ext_decimal.EXT_ID: ext_decimal.decode, - ext_uuid.EXT_ID : ext_uuid.decode , + ext_decimal.EXT_ID : ext_decimal.decode , + ext_uuid.EXT_ID : ext_uuid.decode , + ext_datetime.EXT_ID: ext_datetime.decode, } def ext_hook(code, data): diff --git a/test/suites/__init__.py b/test/suites/__init__.py index 94357c8e..c5792bdd 100644 --- a/test/suites/__init__.py +++ b/test/suites/__init__.py @@ -17,13 +17,14 @@ from .test_ssl import TestSuite_Ssl from .test_decimal import TestSuite_Decimal from .test_uuid import TestSuite_UUID +from .test_datetime import TestSuite_Datetime test_cases = (TestSuite_Schema_UnicodeConnection, TestSuite_Schema_BinaryConnection, TestSuite_Request, TestSuite_Protocol, TestSuite_Reconnect, TestSuite_Mesh, TestSuite_Execute, TestSuite_DBAPI, TestSuite_Encoding, TestSuite_Pool, TestSuite_Ssl, - TestSuite_Decimal, TestSuite_UUID) + TestSuite_Decimal, TestSuite_UUID, TestSuite_Datetime) def load_tests(loader, tests, pattern): suite = unittest.TestSuite() diff 
--git a/test/suites/lib/skip.py b/test/suites/lib/skip.py index 9ce76991..71bfce13 100644 --- a/test/suites/lib/skip.py +++ b/test/suites/lib/skip.py @@ -154,3 +154,14 @@ def skip_or_run_UUID_test(func): return skip_or_run_test_tarantool(func, '2.4.1', 'does not support UUID type') + +def skip_or_run_datetime_test(func): + """Decorator to skip or run datetime-related tests depending on + the tarantool version. + + Tarantool supports datetime type only since 2.10.0 version. + See https://github.com/tarantool/tarantool/issues/5941 + """ + + return skip_or_run_test_pcall_require(func, 'datetime', + 'does not support datetime type') diff --git a/test/suites/test_datetime.py b/test/suites/test_datetime.py new file mode 100644 index 00000000..10ffdfc2 --- /dev/null +++ b/test/suites/test_datetime.py @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + +import sys +import re +import unittest +import msgpack +import warnings +import tarantool +import pandas + +from tarantool.msgpack_ext.packer import default as packer_default +from tarantool.msgpack_ext.unpacker import ext_hook as unpacker_ext_hook + +from .lib.tarantool_server import TarantoolServer +from .lib.skip import skip_or_run_datetime_test +from tarantool.error import MsgpackError, MsgpackWarning + +class TestSuite_Datetime(unittest.TestCase): + @classmethod + def setUpClass(self): + print(' DATETIME EXT TYPE '.center(70, '='), file=sys.stderr) + print('-' * 70, file=sys.stderr) + self.srv = TarantoolServer() + self.srv.script = 'test/suites/box.lua' + self.srv.start() + + self.adm = self.srv.admin + self.adm(r""" + _, datetime = pcall(require, 'datetime') + + box.schema.space.create('test') + box.space['test']:create_index('primary', { + type = 'tree', + parts = {1, 'string'}, + unique = true}) + + box.schema.user.create('test', {password = 'test', if_not_exists = true}) + box.schema.user.grant('test', 'read,write,execute', 'universe') + """) + + self.con = 
tarantool.Connection(self.srv.host, self.srv.args['primary'], + user='test', password='test') + + def setUp(self): + # prevent a remote tarantool from clean our session + if self.srv.is_started(): + self.srv.touch_lock() + + self.adm("box.space['test']:truncate()") + + + def test_Datetime_class_API(self): + dt = tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7, sec=54, + nsec=308543321) + + self.assertEqual(dt.year, 2022) + self.assertEqual(dt.month, 8) + self.assertEqual(dt.day, 31) + self.assertEqual(dt.hour, 18) + self.assertEqual(dt.minute, 7) + self.assertEqual(dt.sec, 54) + self.assertEqual(dt.nsec, 308543321) + self.assertEqual(dt.timestamp, 1661969274.308543) + # Both Tarantool and pandas prone to precision loss for timestamp() floats + + + datetime_class_invalid_init_cases = { + 'positional_year': { + 'args': [2022], + 'kwargs': {}, + 'type': ValueError, + 'msg': 'data argument (first positional argument) expected to be a "bytes" instance' + }, + 'positional_date': { + 'args': [2022, 8, 31], + 'kwargs': {}, + 'type': TypeError, + 'msg': '__init__() takes from 1 to 2 positional arguments but 4 were given' + }, + 'mixing_date_and_timestamp': { + 'args': [], + 'kwargs': {'year': 2022, 'timestamp': 1661969274}, + 'type': ValueError, + 'msg': 'Cannot provide both timestamp and year, month, day, hour, minute, sec' + }, + 'mixing_float_timestamp_and_nsec': { + 'args': [], + 'kwargs': {'timestamp': 1661969274.308543, 'nsec': 308543321}, + 'type': ValueError, + 'msg': 'timestamp must be int if nsec provided' + }, + } + + def test_Datetime_class_invalid_init(self): + for name in self.datetime_class_invalid_init_cases.keys(): + with self.subTest(msg=name): + case = self.datetime_class_invalid_init_cases[name] + self.assertRaisesRegex( + case['type'], re.escape(case['msg']), + lambda: tarantool.Datetime(*case['args'], **case['kwargs'])) + + + integration_cases = { + 'date': { + 'python': tarantool.Datetime(year=2022, month=8, day=31), + 'msgpack': 
(b'\x80\xa4\x0e\x63\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=2022, month=8, day=31})", + }, + 'date_unix_start': { + 'python': tarantool.Datetime(year=1970, month=1, day=1), + 'msgpack': (b'\x00\x00\x00\x00\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=1970, month=1, day=1})", + }, + 'date_before_1970': { + 'python': tarantool.Datetime(year=1900, month=1, day=1), + 'msgpack': (b'\x80\x81\x55\x7c\xff\xff\xff\xff'), + 'tarantool': r"datetime.new({year=1900, month=1, day=1})", + }, + 'datetime_with_minutes': { + 'python': tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7), + 'msgpack': (b'\x44\xa3\x0f\x63\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=2022, month=8, day=31, hour=18, min=7})", + }, + 'datetime_with_seconds': { + 'python': tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7, sec=54), + 'msgpack': (b'\x7a\xa3\x0f\x63\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=2022, month=8, day=31, hour=18, min=7, sec=54})", + }, + 'datetime_with_microseconds': { + 'python': tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7, sec=54, + nsec=308543000), + 'msgpack': (b'\x7a\xa3\x0f\x63\x00\x00\x00\x00\x18\xfe\x63\x12\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=2022, month=8, day=31, hour=18, min=7, sec=54, " + + r"nsec=308543000})", + }, + 'datetime_with_nanoseconds': { + 'python': tarantool.Datetime(year=2022, month=8, day=31, hour=18, minute=7, sec=54, + nsec=308543321), + 'msgpack': (b'\x7a\xa3\x0f\x63\x00\x00\x00\x00\x59\xff\x63\x12\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=2022, month=8, day=31, hour=18, min=7, sec=54, " + + r"nsec=308543321})", + }, + 'date_before_1970_with_nanoseconds': { + 'python': tarantool.Datetime(year=1900, month=1, day=1, nsec=308543321), + 'msgpack': (b'\x80\x81\x55\x7c\xff\xff\xff\xff\x59\xff\x63\x12\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({year=1900, month=1, day=1, nsec=308543321})", + }, + 'timestamp': { + 'python': 
tarantool.Datetime(timestamp=1661969274), + 'msgpack': (b'\x7a\xa3\x0f\x63\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({timestamp=1661969274})", + }, + 'timestamp_with_nanoseconds': { + 'python': tarantool.Datetime(timestamp=1661969274, nsec=308543321), + 'msgpack': (b'\x7a\xa3\x0f\x63\x00\x00\x00\x00\x59\xff\x63\x12\x00\x00\x00\x00'), + 'tarantool': r"datetime.new({timestamp=1661969274, nsec=308543321})", + }, + } + + def test_msgpack_decode(self): + for name in self.integration_cases.keys(): + with self.subTest(msg=name): + case = self.integration_cases[name] + + self.assertEqual(unpacker_ext_hook(4, case['msgpack']), + case['python']) + + @skip_or_run_datetime_test + def test_tarantool_decode(self): + for name in self.integration_cases.keys(): + with self.subTest(msg=name): + case = self.integration_cases[name] + + self.adm(f"box.space['test']:replace{{'{name}', {case['tarantool']}, 'field'}}") + + self.assertSequenceEqual(self.con.select('test', name), + [[name, case['python'], 'field']]) + + def test_msgpack_encode(self): + for name in self.integration_cases.keys(): + with self.subTest(msg=name): + case = self.integration_cases[name] + + self.assertEqual(packer_default(case['python']), + msgpack.ExtType(code=4, data=case['msgpack'])) + + @skip_or_run_datetime_test + def test_tarantool_encode(self): + for name in self.integration_cases.keys(): + with self.subTest(msg=name): + case = self.integration_cases[name] + + self.con.insert('test', [name, case['python'], 'field']) + + lua_eval = f""" + local dt = {case['tarantool']} + + local tuple = box.space['test']:get('{name}') + assert(tuple ~= nil) + + if tuple[2] == dt then + return true + else + return nil, ('%s is not equal to expected %s'):format( + tostring(tuple[2]), tostring(dt)) + end + """ + + self.assertSequenceEqual(self.adm(lua_eval), [True]) + + + @classmethod + def tearDownClass(self): + self.con.close() + self.srv.stop() + self.srv.clean()