Micro-optimize Table.from_csv. wireservice#601 wireservice#581

lcorbasson · Apr 6, 2016 · e4527cb
1 parent e0a321b
commit e4527cb
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 30 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,8 @@
 1.4.0
 -----
 
+* Reduced memory usage and improved performance of :meth:`.Table.from_csv`.
+* :meth:`.Table.from_csv` no longer accepts a sequence of line indexes for :code:`skip_lines`; pass the number of leading lines to skip instead.
 * :meth:`.Number.cast` is now three times as fast.
 * :class:`.Number` now accepts :code:`group_symbol`, :code:`decimal_symbol` and :code:`currency_symbols` arguments. (#224)
 * Tutorial: clean up state data under computing columns (#570)

diff --git a/agate/table/__init__.py b/agate/table/__init__.py
@@ -33,7 +33,6 @@
 from agate.type_tester import TypeTester
 from agate import utils
 from agate.warns import warn_duplicate_column
-from agate.utils import allow_tableset_proxy
 
 
 @six.python_2_unicode_compatible

diff --git a/agate/table/from_csv.py b/agate/table/from_csv.py
@@ -4,7 +4,6 @@
 
 import six
 
-from agate import utils
 
 @classmethod
 def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', **kwargs):
@@ -17,25 +16,23 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
     :code:`kwargs` will be passed through to the CSV reader.
 
     :param path:
-        Filepath or file-like object from which to read CSV data.
+        Filepath or file-like object from which to read CSV data. If a file-like
+        object is specified, it must be seekable.
     :param column_names:
         See :meth:`.Table.__init__`.
     :param column_types:
         See :meth:`.Table.__init__`.
     :param row_names:
         See :meth:`.Table.__init__`.
     :param skip_lines:
-        Either a single number indicating the number of lines to skip from
-        the top of the file or a sequence of line indexes to skip where the
-        first line is index 0.
+        The number of lines to skip from the top of the file.
     :param header:
-        If `True`, the first row of the CSV is assumed to contains headers
-        and will be skipped. If `header` and `column_names` are both
-        specified then a row will be skipped, but `column_names` will be
-        used.
+        If :code:`True`, the first row of the CSV is assumed to contain column
+        names. If :code:`header` and :code:`column_names` are both specified
+        then a row will be skipped, but :code:`column_names` will be used.
     :param sniff_limit:
         Limit CSV dialect sniffing to the specified number of bytes. Set to
-        None to sniff the entire file. Defaults to 0 or no sniffing.
+        None to sniff the entire file. Defaults to 0 (no sniffing).
     :param encoding:
         Character encoding of the CSV file. Note: if passing in a file
         handle it is assumed you have already opened it with the correct
@@ -44,34 +41,44 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
     from agate import csv
     from agate.table import Table
 
+    close = False
+
     if hasattr(path, 'read'):
-        lines = path.readlines()
+        f = path
     else:
-        with io.open(path, encoding=encoding) as f:
-            lines = f.readlines()
-
-    if utils.issequence(skip_lines):
-        lines = [line for i, line in enumerate(lines) if i not in skip_lines]
-        contents = ''.join(lines)
-    elif isinstance(skip_lines, int):
-        contents = ''.join(lines[skip_lines:])
+        f = io.open(path, encoding=encoding)
+        close = True
+
+    if isinstance(skip_lines, int):
+        while skip_lines > 0:
+            f.readline()
+            skip_lines -= 1
     else:
-        raise ValueError('skip_lines argument must be an int or sequence')
+        raise ValueError('skip_lines argument must be an int')
+
+    start = f.tell()
 
     if sniff_limit is None:
-        kwargs['dialect'] = csv.Sniffer().sniff(contents)
+        kwargs['dialect'] = csv.Sniffer().sniff(f.read())
     elif sniff_limit > 0:
-        kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])
+        kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit))
+
+    f.seek(start)
 
     if six.PY2:
-        contents = contents.encode('utf-8')
+        f = six.StringIO(f.read().encode('utf-8'))
 
-    rows = list(csv.reader(six.StringIO(contents), header=header, **kwargs))
+    reader = csv.reader(f, header=header, **kwargs)
 
     if header:
         if column_names is None:
-            column_names = rows.pop(0)
+            column_names = next(reader)
         else:
-            rows.pop(0)
+            next(reader)
+
+    rows = tuple(reader)
+
+    if close:
+        f.close()
 
     return Table(rows, column_names, column_types, row_names=row_names)
diff --git a/examples/test_cr.csv b/examples/test_cr.csv
@@ -0,0 +1 @@
+number,text,boolean,date,datetime,timedelta1,a,True,2015-11-04,2015-11-04T12:22:00,0:04:152,👍,False,2015-11-05,2015-11-04T12:45:00,0:06:18,b,,,,

diff --git a/examples/test_crlf.csv b/examples/test_crlf.csv
@@ -0,0 +1,4 @@
+number,text,boolean,date,datetime,timedelta
+1,a,True,2015-11-04,2015-11-04T12:22:00,0:04:15
+2,👍,False,2015-11-05,2015-11-04T12:45:00,0:06:18
+,b,,,,
diff --git a/tests/test_table/__init__.py b/tests/test_table/__init__.py
@@ -698,6 +698,24 @@ def test_from_csv(self):
 
         self.assertRows(table2, table1.rows)
 
+    def test_from_csv_crlf(self):
+        table1 = Table(self.rows, self.column_names, self.column_types)
+        table2 = Table.from_csv('examples/test_crlf.csv')
+
+        self.assertColumnNames(table2, table1.column_names)
+        self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])
+
+        self.assertRows(table2, table1.rows)
+
+    def test_from_csv_cr(self):
+        table1 = Table(self.rows, self.column_names, self.column_types)
+        table2 = Table.from_csv('examples/test_cr.csv')
+
+        self.assertColumnNames(table2, table1.column_names)
+        self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])
+
+        self.assertRows(table2, table1.rows)
+
     def test_from_csv_file_like_object(self):
         table1 = Table(self.rows, self.column_names, self.column_types)
 
@@ -755,9 +773,18 @@ def test_from_csv_skip_lines(self):
 
         self.assertRows(table2, table1.rows)
 
-    def test_from_csv_skip_lines_sequence(self):
-        table1 = Table([self.rows[1]], column_names=self.column_names, column_types=self.column_types)
-        table2 = Table.from_csv('examples/test.csv', skip_lines=(1, 3))
+    def test_from_csv_skip_lines_crlf(self):
+        table1 = Table(self.rows[1:], column_types=self.column_types)
+        table2 = Table.from_csv('examples/test_crlf.csv', header=False, skip_lines=2)
+
+        self.assertColumnNames(table2, table1.column_names)
+        self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])
+
+        self.assertRows(table2, table1.rows)
+
+    def test_from_csv_skip_lines_cr(self):
+        table1 = Table(self.rows[1:], column_types=self.column_types)
+        table2 = Table.from_csv('examples/test_cr.csv', header=False, skip_lines=2)
 
         self.assertColumnNames(table2, table1.column_names)
         self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])