Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into bisect
Browse files Browse the repository at this point in the history
  • Loading branch information
simonjayhawkins committed Dec 18, 2020
2 parents 5ecf905 + 5468223 commit bf40fc2
Show file tree
Hide file tree
Showing 39 changed files with 409 additions and 170 deletions.
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ The source code is currently hosted on GitHub at:
https://github.com/pandas-dev/pandas

Binary installers for the latest released version are available at the [Python
package index](https://pypi.org/project/pandas) and on conda.
Package Index (PyPI)](https://pypi.org/project/pandas) and on [Conda](https://docs.conda.io/en/latest/).

```sh
# conda
Expand All @@ -100,15 +100,15 @@ pip install pandas
```

## Dependencies
- [NumPy](https://www.numpy.org)
- [python-dateutil](https://labix.org/python-dateutil)
- [pytz](https://pythonhosted.org/pytz)
- [NumPy - Adds support for large, multi-dimensional arrays, matrices and high-level mathematical functions to operate on these arrays](https://www.numpy.org)
- [python-dateutil - Provides powerful extensions to the standard datetime module](https://labix.org/python-dateutil)
- [pytz - Brings the Olson tz database into Python which allows accurate and cross platform timezone calculations](https://pythonhosted.org/pytz)

See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for minimum supported versions of required, recommended and optional dependencies.

## Installation from sources
To install pandas from source you need Cython in addition to the normal
dependencies above. Cython can be installed from pypi:
To install pandas from source you need [Cython](https://cython.org/) in addition to the normal
dependencies above. Cython can be installed from PyPI:

```sh
pip install cython
Expand Down Expand Up @@ -145,7 +145,7 @@ See the full instructions for [installing from source](https://pandas.pydata.org
The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable

## Background
Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and
Work on ``pandas`` started at [AQR](https://www.aqr.com/) (a quantitative hedge fund) in 2008 and
has been under active development since then.

## Getting Help
Expand Down
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
lower-level methods directly on Index and subclasses, see index_object.py,
indexing_engine.py, and index_cached.py
"""
import itertools
import string
import warnings

Expand Down Expand Up @@ -256,7 +257,9 @@ def setup(self, index):
"non_monotonic": CategoricalIndex(list("abc" * N)),
}
self.data = indices[index]
self.data_unique = CategoricalIndex(list(string.printable))
self.data_unique = CategoricalIndex(
["".join(perm) for perm in itertools.permutations(string.printable, 3)]
)

self.int_scalar = 10000
self.int_list = list(range(10000))
Expand Down
95 changes: 63 additions & 32 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from io import StringIO
from io import BytesIO, StringIO
import random
import string

Expand Down Expand Up @@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
class ReadCSVSkipRows(BaseIO):

fname = "__test__.csv"
params = [None, 10000]
param_names = ["skiprows"]
params = ([None, 10000], ["c", "python"])
param_names = ["skiprows", "engine"]

def setup(self, skiprows):
def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
df = DataFrame(
Expand All @@ -164,8 +164,8 @@ def setup(self, skiprows):
)
df.to_csv(self.fname)

def time_skipprows(self, skiprows):
read_csv(self.fname, skiprows=skiprows)
def time_skipprows(self, skiprows, engine):
read_csv(self.fname, skiprows=skiprows, engine=engine)


class ReadUint64Integers(StringIORewind):
Expand All @@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
class ReadCSVThousands(BaseIO):

fname = "__test__.csv"
params = ([",", "|"], [None, ","])
param_names = ["sep", "thousands"]
params = ([",", "|"], [None, ","], ["c", "python"])
param_names = ["sep", "thousands", "engine"]

def setup(self, sep, thousands):
def setup(self, sep, thousands, engine):
N = 10000
K = 8
data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
Expand All @@ -206,16 +206,19 @@ def setup(self, sep, thousands):
df = df.applymap(lambda x: fmt.format(x))
df.to_csv(self.fname, sep=sep)

def time_thousands(self, sep, thousands):
read_csv(self.fname, sep=sep, thousands=thousands)
def time_thousands(self, sep, thousands, engine):
read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)


class ReadCSVComment(StringIORewind):
    # ASV benchmark: cost of comment-stripping in read_csv, per parser engine.
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        # 100k data rows, each carrying an inline "# comment" suffix.
        data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
        self.StringIO_input = StringIO("\n".join(data))

    def time_comment(self, engine):
        # BUG FIX: the original body never forwarded ``engine``, so both the
        # "c" and "python" parameterizations silently benchmarked the default
        # (C) engine. Pass it through so the parameter actually takes effect.
        read_csv(
            self.data(self.StringIO_input),
            comment="#",
            header=None,
            names=list("abc"),
            engine=engine,
        )
Expand Down Expand Up @@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
)


class ReadCSVEngine(StringIORewind):
    # ASV benchmark: raw read_csv throughput for each parser engine, fed
    # from both text (StringIO) and binary (BytesIO) buffers.
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        rows = ["A,B,C,D,E"]
        rows.extend(["1,2,3,4,5"] * 100000)
        text = "\n".join(rows)
        self.StringIO_input = StringIO(text)
        # simulate reading from file
        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))

    def time_read_stringcsv(self, engine):
        # Parse the text buffer (rewound via self.data) with the given engine.
        read_csv(self.data(self.StringIO_input), engine=engine)

    def time_read_bytescsv(self, engine):
        # Same parse, but from the binary buffer.
        read_csv(self.data(self.BytesIO_input), engine=engine)


class ReadCSVCategorical(BaseIO):
    # ASV benchmark: reading categorical-like string data, comparing a
    # post-hoc Categorical conversion against dtype="category" at parse time,
    # for each parser engine.

    fname = "__test__.csv"
    params = ["c", "python"]
    param_names = ["engine"]

    def setup(self, engine):
        n_rows = 100000
        labels = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
        frame = DataFrame(
            np.random.choice(labels, (n_rows, 3)), columns=["a", "b", "c"]
        )
        frame.to_csv(self.fname, index=False)

    def time_convert_post(self, engine):
        # Read as plain object columns, then convert each to Categorical.
        read_csv(self.fname, engine=engine).apply(Categorical)

    def time_convert_direct(self, engine):
        # Let the parser produce categorical columns directly.
        read_csv(self.fname, engine=engine, dtype="category")


class ReadCSVParseDates(StringIORewind):
def setup(self):
params = ["c", "python"]
param_names = ["engine"]

def setup(self, engine):
data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
{},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
{},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
Expand All @@ -284,18 +309,20 @@ def setup(self):
data = data.format(*two_cols)
self.StringIO_input = StringIO(data)

def time_multiple_date(self):
def time_multiple_date(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=list(string.digits[:9]),
parse_dates=[[1, 2], [1, 3]],
)

def time_baseline(self):
def time_baseline(self, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
parse_dates=[1],
Expand All @@ -304,17 +331,18 @@ def time_baseline(self):


class ReadCSVCachedParseDates(StringIORewind):
params = ([True, False],)
param_names = ["do_cache"]
params = ([True, False], ["c", "python"])
param_names = ["do_cache", "engine"]

def setup(self, do_cache):
def setup(self, do_cache, engine):
data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
self.StringIO_input = StringIO(data)

def time_read_csv_cached(self, do_cache):
def time_read_csv_cached(self, do_cache, engine):
try:
read_csv(
self.data(self.StringIO_input),
engine=engine,
header=None,
parse_dates=[0],
cache_dates=do_cache,
Expand All @@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
chunksize = 20
num_rows = 1000
fname = "__test__.csv"
params = ["c", "python"]
param_names = ["engine"]

def setup(self):
def setup(self, engine):
with open(self.fname, "w") as f:
for i in range(self.num_rows):
f.write(f"{i}\n")

def mem_parser_chunks(self):
def mem_parser_chunks(self, engine):
# see gh-24805.
result = read_csv(self.fname, chunksize=self.chunksize)
result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)

for _ in result:
pass


class ReadCSVParseSpecialDate(StringIORewind):
params = (["mY", "mdY", "hm"],)
param_names = ["value"]
params = (["mY", "mdY", "hm"], ["c", "python"])
param_names = ["value", "engine"]
objects = {
"mY": "01-2019\n10-2019\n02/2000\n",
"mdY": "12/02/2010\n",
"hm": "21:34\n",
}

def setup(self, value):
def setup(self, value, engine):
count_elem = 10000
data = self.objects[value] * count_elem
self.StringIO_input = StringIO(data)

def time_read_special_date(self, value):
def time_read_special_date(self, value, engine):
read_csv(
self.data(self.StringIO_input),
engine=engine,
sep=",",
header=None,
names=["Date"],
Expand Down
4 changes: 4 additions & 0 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
pytest -q --doctest-modules pandas/core/strings/
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Doctests sql.py' ; echo $MSG
pytest -q --doctest-modules pandas/io/sql.py
RET=$(($RET + $?)) ; echo $MSG "DONE"

# Directories

MSG='Doctests arrays'; echo $MSG
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-37-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ dependencies:
- moto>=1.3.14
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-38-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies:
- pytz
- scipy
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-macos-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies:
- python-dateutil==2.7.3
- pytz
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pip
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-37.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ dependencies:
- s3fs>=0.4.2
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pyreadstat
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ dependencies:
- pytz
- s3fs>=0.4.0
- scipy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
2 changes: 1 addition & 1 deletion ci/deps/travis-37-cov.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ dependencies:
- sqlalchemy
- statsmodels
- xarray
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- pip
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/travis-37-locale.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ dependencies:
- pytables>=3.5.1
- scipy
- xarray=0.12.3
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/travis-38-slow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies:
- moto>=1.3.14
- scipy
- sqlalchemy
- xlrd
- xlrd<2.0
- xlsxwriter
- xlwt
- moto
Expand Down
5 changes: 2 additions & 3 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ Bug fixes
Categorical
^^^^^^^^^^^

-
- Bug in ``CategoricalIndex.reindex`` failed when ``Index`` passed with elements all in category (:issue:`28690`)
-

Datetimelike
Expand All @@ -195,7 +195,6 @@ Numeric
^^^^^^^
- Bug in :meth:`DataFrame.quantile`, :meth:`DataFrame.sort_values` causing incorrect subsequent indexing behavior (:issue:`38351`)
- Bug in :meth:`DataFrame.select_dtypes` with ``include=np.number`` now retains numeric ``ExtensionDtype`` columns (:issue:`35340`)
-

Conversion
^^^^^^^^^^
Expand Down Expand Up @@ -232,7 +231,7 @@ MultiIndex
^^^^^^^^^^

- Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
-
- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`)

I/O
^^^
Expand Down
Loading

0 comments on commit bf40fc2

Please sign in to comment.