From 6a0452b50ae76fa2562668604c35a3a1bca39788 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Nov 2011 20:31:02 -0400 Subject: [PATCH] BUG: print DataFrame columns in the right order, also convert NAs in string columns, GH #325 --- pandas/core/frame.py | 17 +++++++++-------- pandas/io/parsers.py | 2 +- pandas/io/tests/test_parsers.py | 13 +++++++++++++ pandas/src/parsing.pyx | 4 ++-- pandas/tests/test_frame.py | 23 ++++++++++++++++++++++- 5 files changed, 47 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a38ffbdcfad45..95436c6767919 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2890,7 +2890,11 @@ def __init__(self, frame, buf=None, columns=None, col_space=None, self.formatters = formatters self.na_rep = na_rep self.col_space = col_space - self.column_filter = frame.columns if columns is None else set(columns) + + if columns is not None: + self.columns = _ensure_index(columns) + else: + self.columns = frame.columns self._write_to_buffer() @@ -2909,8 +2913,7 @@ def _write_to_buffer(self): str_columns = self._get_formatted_column_labels() stringified = [str_columns[i] + format_col(c) - for i, c in enumerate(frame.columns) - if c in self.column_filter] + for i, c in enumerate(self.columns)] to_write.append(adjoin(1, str_index, *stringified)) @@ -2946,10 +2949,8 @@ def _format_col(col): def _get_formatted_column_labels(self): from pandas.core.index import _sparsify - columns = self.frame.columns - - if isinstance(columns, MultiIndex): - fmt_columns = columns.format(sparsify=False, adjoin=False) + if isinstance(self.columns, MultiIndex): + fmt_columns = self.columns.format(sparsify=False, adjoin=False) str_columns = zip(*[[' %s' % y for y in x] for x in zip(*fmt_columns)]) if self.sparsify: @@ -2957,7 +2958,7 @@ def _get_formatted_column_labels(self): str_columns = [list(x) for x in zip(*str_columns)] else: - str_columns = [[' %s' % x] for x in columns.format()] + str_columns = [[' %s' % x] for x in self.columns.format()] if self.show_index_names and self.has_index_names: for x in str_columns: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d685724398200..8ab939ede6a15 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -424,7 +424,7 @@ def _convert_types(values, na_values): try: values = lib.maybe_convert_numeric(values, na_values) except Exception: - lib.sanitize_objects(values) + lib.sanitize_objects(values, na_values) if values.dtype == np.object_: return lib.maybe_convert_bool(values) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 1ae0876512ab9..c00ba5c6a0418 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -51,6 +51,19 @@ def test_custom_na_values(self): skiprows=[1]) assert_almost_equal(df2.values, expected) + def test_detect_string_na(self): + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = [['foo', 'bar'], + [nan, 'baz'], + [nan, nan]] + + df = read_csv(StringIO(data)) + assert_almost_equal(df.values, expected) + def test_unnamed_columns(self): data = """A,B,C,, 1,2,3,4,5 diff --git a/pandas/src/parsing.pyx b/pandas/src/parsing.pyx index cfc81b1a30b23..f72e316464cb8 100644 --- a/pandas/src/parsing.pyx +++ b/pandas/src/parsing.pyx @@ -190,7 +190,7 @@ def try_parse_dates(ndarray[object] values, parser=None): return result -def sanitize_objects(ndarray[object] values): +def sanitize_objects(ndarray[object] values, set na_values): cdef: Py_ssize_t i, n object val, onan @@ -200,7 +200,7 @@ def sanitize_objects(ndarray[object] values): for i from 0 <= i < n: val = values[i] - if val == '': + if val == '' or val in na_values: values[i] = onan def maybe_convert_bool(ndarray[object] arr): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 02dd08f0c4b42..5a5b35c4c7aa4 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1324,6 +1324,9 @@ def test_repr_corner(self): foo = repr(df) def test_to_string(self): + from pandas import read_table + import re + # big mixed biggie = DataFrame({'A' : randn(1000), 'B' : tm.makeStringIndex(1000)}, @@ -1340,7 +1343,25 @@ def test_to_string(self): self.assert_(isinstance(s, basestring)) - biggie.to_string(columns=['B', 'A'], colSpace=17) + # print in right order + result = biggie.to_string(columns=['B', 'A'], colSpace=17, + float_format='%.6f'.__mod__) + lines = result.split('\n') + header = lines[0].strip().split() + joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]]) + recons = read_table(StringIO(joined), names=header, sep=' ') + assert_series_equal(recons['B'], biggie['B']) + assert_series_equal(np.round(recons['A'], 2), + np.round(biggie['A'], 2)) + + # expected = ['B', 'A'] + # self.assertEqual(header, expected) + + result = biggie.to_string(columns=['A'], colSpace=17) + header = result.split('\n')[0].strip().split() + expected = ['A'] + self.assertEqual(header, expected) + biggie.to_string(columns=['B', 'A'], formatters={'A' : lambda x: '%.1f' % x})