Skip to content

Commit

Permalink
Merge branch 'main' into checks_extensions.ExtensionArray
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Aug 19, 2024
2 parents a298b1f + ca2b8c3 commit e1837a4
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 22 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,7 @@ Datetimelike
- Bug in :attr:`is_year_start` where a :class:`DatetimeIndex` constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
- Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
- Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
- Bug in :meth:`DataFrame.agg` with df with missing values resulting in IndexError (:issue:`58810`)
Expand Down
48 changes: 29 additions & 19 deletions pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
#endif // NPY_NO_DEPRECATED_API

#include <Python.h>

#include "pandas/vendored/numpy/datetime/np_datetime.h"

#define NO_IMPORT_ARRAY
#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
#include <numpy/ndarrayobject.h>
#include <numpy/npy_common.h>
#include <stdbool.h>

#if defined(_WIN32)
#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
Expand Down Expand Up @@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler");
#endif
#endif

#define XSTR(a) STR(a)
#define STR(a) #a

#define PD_CHECK_OVERFLOW(FUNC) \
do { \
if ((FUNC) != 0) { \
PyGILState_STATE gstate = PyGILState_Ensure(); \
PyErr_SetString(PyExc_OverflowError, \
"Overflow occurred in npy_datetimestruct_to_datetime"); \
"Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \
PyGILState_Release(gstate); \
return -1; \
} \
Expand Down Expand Up @@ -139,53 +140,53 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
npy_int64 year, days = 0;
const int *month_lengths;

year = dts->year - 1970;
days = year * 365;
PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year));
PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days));

/* Adjust for leap years */
if (days >= 0) {
/*
* 1968 is the closest leap year before 1970.
* Exclude the current year, so add 1.
*/
year += 1;
PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year));
/* Add one day for each 4 years */
days += year / 4;
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
/* 1900 is the closest previous year divisible by 100 */
year += 68;
PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year));
/* Subtract one day for each 100 years */
days -= year / 100;
PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
/* 1600 is the closest previous year divisible by 400 */
year += 300;
PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year));
/* Add one day for each 400 years */
days += year / 400;
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
} else {
/*
* 1972 is the closest later year after 1970.
* Include the current year, so subtract 2.
*/
year -= 2;
PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year));
/* Subtract one day for each 4 years */
days += year / 4;
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
/* 2000 is the closest later year divisible by 100 */
year -= 28;
PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year));
/* Add one day for each 100 years */
days -= year / 100;
PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
/* 2000 is also the closest later year divisible by 400 */
/* Subtract one day for each 400 years */
days += year / 400;
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
}

month_lengths = days_per_month_table[is_leapyear(dts->year)];
month = dts->month - 1;

/* Add the months */
for (i = 0; i < month; ++i) {
days += month_lengths[i];
PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days));
}

/* Add the days */
days += dts->day - 1;
PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days));

return days;
}
Expand Down Expand Up @@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
}

const int64_t days = get_datetimestruct_days(dts);
if (days == -1) {
PyGILState_STATE gstate = PyGILState_Ensure();
bool did_error = PyErr_Occurred() == NULL ? false : true;
PyGILState_Release(gstate);
if (did_error) {
return -1;
}
}

if (base == NPY_FR_D) {
return days;
}
Expand Down
19 changes: 19 additions & 0 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1137,6 +1137,21 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
# assert result["strings"].dtype == "string"
# FIXME: don't leave commented-out

def test_non_nanosecond_timestamps(self, temp_file):
    # GH#49236
    # A microsecond-resolution parquet timestamp column should round-trip
    # through read_parquet as datetime64[us], not be coerced to nanoseconds.
    pa = pytest.importorskip("pyarrow", "11.0.0")
    pq = pytest.importorskip("pyarrow.parquet")

    ts = datetime.datetime(1600, 1, 1)
    column = pa.array([ts], type=pa.timestamp("us"))
    pq.write_table(pa.table([column], names=["timestamp"]), temp_file)

    expected = pd.DataFrame(
        data={"timestamp": [ts]},
        dtype="datetime64[us]",
    )
    result = read_parquet(temp_file)
    tm.assert_frame_equal(result, expected)


class TestParquetFastParquet(Base):
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
Expand Down Expand Up @@ -1178,6 +1193,10 @@ def test_duplicate_columns(self, fp):
msg = "Cannot create parquet dataset with duplicate column names"
self.check_error_on_write(df, fp, ValueError, msg)

@pytest.mark.xfail(
Version(np.__version__) >= Version("2.0.0"),
reason="fastparquet uses np.float_ in numpy2",
)
def test_bool_with_none(self, fp):
df = pd.DataFrame({"a": [True, None, False]})
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/plotting/frame/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
_check_visible,
get_y_axis,
)
from pandas.util.version import Version

from pandas.io.formats.printing import pprint_thing

Expand Down Expand Up @@ -2465,8 +2466,14 @@ def test_group_subplot_invalid_column_name(self):
d = {"a": np.arange(10), "b": np.arange(10)}
df = DataFrame(d)

with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
df.plot(subplots=[("a", "bad_name")])
if Version(np.__version__) < Version("2.0.0"):
with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
df.plot(subplots=[("a", "bad_name")])
else:
with pytest.raises(
ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]"
):
df.plot(subplots=[("a", "bad_name")])

def test_group_subplot_duplicated_column(self):
d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}
Expand Down

0 comments on commit e1837a4

Please sign in to comment.