From 51c2300210533a27fbd8bb58f93c2f382bbbdc40 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 13 Sep 2023 19:36:04 -0400 Subject: [PATCH 1/4] Use pandasSQL transactions in sql test suite to avoid engine deadlocks (#55129) pandasSQL use transactions in test suite --- pandas/tests/io/test_sql.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bbdb22955297e..1abe0ad55a864 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1141,18 +1141,21 @@ def load_types_data(self, types_data): def _read_sql_iris_parameter(self, sql_strings): query = sql_strings["read_parameters"][self.flavor] params = ("Iris-setosa", 5.1) - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_named_parameter(self, sql_strings): query = sql_strings["read_named_parameters"][self.flavor] params = {"name": "Iris-setosa", "length": 5.1} - iris_frame = self.pandasSQL.read_query(query, params=params) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=params) check_iris_frame(iris_frame) def _read_sql_iris_no_parameter_with_percent(self, sql_strings): query = sql_strings["read_no_parameters_with_percent"][self.flavor] - iris_frame = self.pandasSQL.read_query(query, params=None) + with self.pandasSQL.run_transaction(): + iris_frame = self.pandasSQL.read_query(query, params=None) check_iris_frame(iris_frame) def _to_sql_empty(self, test_frame1): @@ -1182,7 +1185,8 @@ def _to_sql_with_sql_engine(self, test_frame1, engine="auto", **engine_kwargs): def _roundtrip(self, test_frame1): self.drop_table("test_frame_roundtrip", self.conn) assert self.pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4 - result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") + with self.pandasSQL.run_transaction(): + result = self.pandasSQL.read_query("SELECT * FROM test_frame_roundtrip") result.set_index("level_0", inplace=True) # result.index.astype(int) @@ -1232,13 +1236,14 @@ class DummyException(Exception): except DummyException: # ignore raised exception pass - res = self.pandasSQL.read_query("SELECT * FROM test_trans") + with self.pandasSQL.run_transaction(): + res = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res) == 0 # Make sure when transaction is committed, rows do get inserted with self.pandasSQL.run_transaction() as trans: trans.execute(ins_sql) - res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") + res2 = self.pandasSQL.read_query("SELECT * FROM test_trans") assert len(res2) == 1 From 81fb7e76073ffe6adb875f15cdcfbac52c15b339 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 13 Sep 2023 16:37:46 -0700 Subject: [PATCH 2/4] DEPS: remove duplicated dependency in requirement-dev.txt (#55101) * Test installing dev dependencies with pip * fix typo * remove 3.12, list deps * remove pip ci test --- environment.yml | 2 +- requirements-dev.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 1eb0b4cc2c7a6..8deae839f5408 100644 --- a/environment.yml +++ b/environment.yml @@ -106,7 +106,7 @@ dependencies: - ipykernel # web - - jinja2 # in optional dependencies, but documented here as needed + # - jinja2 # already listed in optional dependencies, but documented here for reference - markdown - feedparser - pyyaml diff --git a/requirements-dev.txt b/requirements-dev.txt index ef3587b10d416..01e0701bc39a7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -77,7 +77,6 @@ ipywidgets nbformat notebook>=6.0.3 ipykernel -jinja2 markdown feedparser pyyaml From f00efd0344bd4e22cc867e76c776cb88669e6cde Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 13 Sep 2023 19:39:07 -0400 Subject: [PATCH 3/4] Assorted UBSAN cleanups (#55112) * first round of fixes * fix up includes * updates * dedup logic * move comment --- .../src/vendored/ujson/lib/ultrajsonenc.c | 8 ++- pandas/_libs/tslibs/np_datetime.pyx | 49 +++++++++++++------ 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c index e3e710ce1b876..942bd0b518144 100644 --- a/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c @@ -44,6 +44,7 @@ Numeric decoder derived from TCL library #include #include #include +#include #include #include #include @@ -763,7 +764,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) { void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) { char *wstr; - JSUINT64 uvalue = (value < 0) ? -value : value; + JSUINT64 uvalue; + if (value == INT64_MIN) { + uvalue = INT64_MAX + UINT64_C(1); + } else { + uvalue = (value < 0) ? -value : value; + } wstr = enc->offset; // Conversion. Number is reversed. diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 7b2ee68c73ad2..c3ee68e14a8d4 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,4 +1,3 @@ -cimport cython from cpython.datetime cimport ( PyDateTime_CheckExact, PyDateTime_DATE_GET_HOUR, @@ -18,6 +17,7 @@ from cpython.object cimport ( Py_LT, Py_NE, ) +from libc.stdint cimport INT64_MAX import_datetime() PandasDateTime_IMPORT @@ -545,7 +545,6 @@ cdef ndarray astype_round_check( return iresult -@cython.overflowcheck(True) cdef int64_t get_conversion_factor( NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit @@ -553,6 +552,7 @@ cdef int64_t get_conversion_factor( """ Find the factor by which we need to multiply to convert from from_unit to to_unit. """ + cdef int64_t value, overflow_limit, factor if ( from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC @@ -565,28 +565,44 @@ cdef int64_t get_conversion_factor( return 1 if from_unit == NPY_DATETIMEUNIT.NPY_FR_W: - return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit) + factor = 7 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D: - return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit) + factor = 24 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m: - return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit) + factor = 60 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit) + factor = 1000 elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs: - return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit) + factor = 1000 else: raise ValueError("Converting from M or Y units is not supported.") + overflow_limit = INT64_MAX // factor + if value > overflow_limit or value < -overflow_limit: + raise OverflowError("result would overflow") + + return factor * value + cdef int64_t convert_reso( int64_t value, @@ -595,7 +611,7 @@ cdef int64_t convert_reso( bint round_ok, ) except? -1: cdef: - int64_t res_value, mult, div, mod + int64_t res_value, mult, div, mod, overflow_limit if from_reso == to_reso: return value @@ -624,9 +640,12 @@ cdef int64_t convert_reso( else: # e.g. ns -> us, risk of overflow, but no risk of lossy rounding mult = get_conversion_factor(from_reso, to_reso) - with cython.overflowcheck(True): + overflow_limit = INT64_MAX // mult + if value > overflow_limit or value < -overflow_limit: # Note: caller is responsible for re-raising as OutOfBoundsTimedelta - res_value = value * mult + raise OverflowError("result would overflow") + + res_value = value * mult return res_value From 7134f2c14e614980bcf366f979a0f85aafacbde6 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 15 Sep 2023 11:10:52 +0200 Subject: [PATCH 4/4] CoW: Clear dead references every time we add a new one (#55008) --- pandas/_libs/internals.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 7a9a3b84fd69f..3b1a6bc7436c3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -897,6 +897,11 @@ cdef class BlockValuesRefs: else: self.referenced_blocks = [] + def _clear_dead_references(self) -> None: + self.referenced_blocks = [ + ref for ref in self.referenced_blocks if ref() is not None + ] + def add_reference(self, blk: Block) -> None: """Adds a new reference to our reference collection. @@ -905,6 +910,7 @@ cdef class BlockValuesRefs: blk : Block The block that the new references should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(blk)) def add_index_reference(self, index: object) -> None: @@ -915,6 +921,7 @@ cdef class BlockValuesRefs: index : Index The index that the new reference should point to. """ + self._clear_dead_references() self.referenced_blocks.append(weakref.ref(index)) def has_reference(self) -> bool: @@ -927,8 +934,6 @@ cdef class BlockValuesRefs: ------- bool """ - self.referenced_blocks = [ - ref for ref in self.referenced_blocks if ref() is not None - ] + self._clear_dead_references() # Checking for more references than block pointing to itself return len(self.referenced_blocks) > 1