Skip to content

Commit

Permalink
PERF: Improve performance of StataReader
Browse files Browse the repository at this point in the history
Improve performance of StataReader when converting columns
with missing values

xref pandas-dev#25772
  • Loading branch information
bashtage committed Mar 19, 2019
1 parent db6993c commit 8e27a4b
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 7 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ I/O
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string ``"nan"`` instead of ``numpy.nan`` (:issue:`25468`)
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
-
- Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)


Plotting
Expand Down
19 changes: 13 additions & 6 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from pandas.core.dtypes.common import (
ensure_object, is_categorical_dtype, is_datetime64_dtype)

from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
from pandas import (DatetimeIndex, compat, isna, to_datetime, to_timedelta,
concat)
from pandas.core.arrays import Categorical
from pandas.core.base import StringMixin
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -1572,7 +1573,7 @@ def read(self, nrows=None, convert_dates=None,
data = DataFrame.from_dict(OrderedDict(data_formatted))
del data_formatted

self._do_convert_missing(data, convert_missing)
data = self._do_convert_missing(data, convert_missing)

if convert_dates:
cols = np.where(lmap(lambda x: any(x.startswith(fmt)
Expand Down Expand Up @@ -1616,7 +1617,7 @@ def read(self, nrows=None, convert_dates=None,

def _do_convert_missing(self, data, convert_missing):
# Check for missing values, and replace if found

replacements = {}
for i, colname in enumerate(data):
fmt = self.typlist[i]
if fmt not in self.VALID_RANGE:
Expand Down Expand Up @@ -1646,8 +1647,14 @@ def _do_convert_missing(self, data, convert_missing):
dtype = np.float64
replacement = Series(series, dtype=dtype)
replacement[missing] = np.nan

data[colname] = replacement
replacements[colname] = replacement
if replacements:
columns = data.columns
replacements = DataFrame(replacements)
data.drop(replacements.columns, 1, inplace=True)
data = concat([data, replacements], 1)
data = data[columns]
return data

def _insert_strls(self, data):
if not hasattr(self, 'GSO') or len(self.GSO) == 0:
Expand Down Expand Up @@ -1712,7 +1719,7 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
except ValueError:
vc = Series(categories).value_counts()
repeats = list(vc.index[vc > 1])
repeats = '\n' + '-' * 80 + '\n'.join(repeats)
repeats = '\n' + '-' * 80 + '\n' + '\n'.join(repeats)
raise ValueError('Value labels for column {col} are not '
'unique. The repeated labels are:\n'
'{repeats}'
Expand Down

0 comments on commit 8e27a4b

Please sign in to comment.