Skip to content

Commit

Permalink
Micro-optimize Table.from_csv. wireservice#601 wireservice#581
Browse files
  • Loading branch information
onyxfish committed Apr 6, 2016
1 parent e0a321b commit e4527cb
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 30 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
1.4.0
-----

* Reduced memory usage and improved performance of :meth:`.Table.from_csv`.
* :meth:`.Table.from_csv` no longer accepts a sequence of row ids for :code:`skip_lines`.
* :meth:`.Number.cast` is now three times as fast.
* :class:`.Number` now accepts :code:`group_symbol`, :code:`decimal_symbol` and :code:`currency_symbols` arguments. (#224)
* Tutorial: clean up state data under computing columns (#570)
Expand Down
1 change: 0 additions & 1 deletion agate/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
from agate.type_tester import TypeTester
from agate import utils
from agate.warns import warn_duplicate_column
from agate.utils import allow_tableset_proxy


@six.python_2_unicode_compatible
Expand Down
59 changes: 33 additions & 26 deletions agate/table/from_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import six

from agate import utils

@classmethod
def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', **kwargs):
Expand All @@ -17,25 +16,23 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
:code:`kwargs` will be passed through to the CSV reader.
:param path:
Filepath or file-like object from which to read CSV data.
Filepath or file-like object from which to read CSV data. If a file-like
object is specified, it must be seekable.
:param column_names:
See :meth:`.Table.__init__`.
:param column_types:
See :meth:`.Table.__init__`.
:param row_names:
See :meth:`.Table.__init__`.
:param skip_lines:
Either a single number indicating the number of lines to skip from
the top of the file or a sequence of line indexes to skip where the
first line is index 0.
The number of lines to skip from the top of the file.
:param header:
If `True`, the first row of the CSV is assumed to contains headers
and will be skipped. If `header` and `column_names` are both
specified then a row will be skipped, but `column_names` will be
used.
If :code:`True`, the first row of the CSV is assumed to contain column
names. If :code:`header` and :code:`column_names` are both specified
then a row will be skipped, but :code:`column_names` will be used.
:param sniff_limit:
Limit CSV dialect sniffing to the specified number of bytes. Set to
None to sniff the entire file. Defaults to 0 or no sniffing.
None to sniff the entire file. Defaults to 0 (no sniffing).
:param encoding:
Character encoding of the CSV file. Note: if passing in a file
handle it is assumed you have already opened it with the correct
Expand All @@ -44,34 +41,44 @@ def from_csv(cls, path, column_names=None, column_types=None, row_names=None, sk
from agate import csv
from agate.table import Table

close = False

if hasattr(path, 'read'):
lines = path.readlines()
f = path
else:
with io.open(path, encoding=encoding) as f:
lines = f.readlines()

if utils.issequence(skip_lines):
lines = [line for i, line in enumerate(lines) if i not in skip_lines]
contents = ''.join(lines)
elif isinstance(skip_lines, int):
contents = ''.join(lines[skip_lines:])
f = io.open(path, encoding=encoding)
close = True

if isinstance(skip_lines, int):
while skip_lines > 0:
f.readline()
skip_lines -= 1
else:
raise ValueError('skip_lines argument must be an int or sequence')
raise ValueError('skip_lines argument must be an int')

start = f.tell()

if sniff_limit is None:
kwargs['dialect'] = csv.Sniffer().sniff(contents)
kwargs['dialect'] = csv.Sniffer().sniff(f.read())
elif sniff_limit > 0:
kwargs['dialect'] = csv.Sniffer().sniff(contents[:sniff_limit])
kwargs['dialect'] = csv.Sniffer().sniff(f.read(sniff_limit))

f.seek(start)

if six.PY2:
contents = contents.encode('utf-8')
f = six.StringIO(f.read().encode('utf-8'))

rows = list(csv.reader(six.StringIO(contents), header=header, **kwargs))
reader = csv.reader(f, header=header, **kwargs)

if header:
if column_names is None:
column_names = rows.pop(0)
column_names = next(reader)
else:
rows.pop(0)
next(reader)

rows = tuple(reader)

if close:
f.close()

return Table(rows, column_names, column_types, row_names=row_names)
1 change: 1 addition & 0 deletions examples/test_cr.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
number,text,boolean,date,datetime,timedelta1,a,True,2015-11-04,2015-11-04T12:22:00,0:04:152,👍,False,2015-11-05,2015-11-04T12:45:00,0:06:18,b,,,,
Expand Down
4 changes: 4 additions & 0 deletions examples/test_crlf.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
number,text,boolean,date,datetime,timedelta
1,a,True,2015-11-04,2015-11-04T12:22:00,0:04:15
2,👍,False,2015-11-05,2015-11-04T12:45:00,0:06:18
,b,,,,
33 changes: 30 additions & 3 deletions tests/test_table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,24 @@ def test_from_csv(self):

self.assertRows(table2, table1.rows)

def test_from_csv_crlf(self):
    """
    Load ``examples/test_crlf.csv`` and check that it matches a table
    built directly from the fixture rows, names and types.

    NOTE(review): the fixture name suggests CRLF line endings; confirm
    against the file itself.
    """
    expected = Table(self.rows, self.column_names, self.column_types)
    loaded = Table.from_csv('examples/test_crlf.csv')

    # Column names must match the reference table built from the fixtures.
    self.assertColumnNames(loaded, expected.column_names)

    expected_types = [Number, Text, Boolean, Date, DateTime, TimeDelta]
    self.assertColumnTypes(loaded, expected_types)

    # Row contents must be identical to the reference table.
    self.assertRows(loaded, expected.rows)

def test_from_csv_cr(self):
    """
    Load ``examples/test_cr.csv`` and check that it matches a table
    built directly from the fixture rows, names and types.

    NOTE(review): the fixture name suggests bare CR line endings; confirm
    against the file itself.
    """
    expected = Table(self.rows, self.column_names, self.column_types)
    loaded = Table.from_csv('examples/test_cr.csv')

    # Column names must match the reference table built from the fixtures.
    self.assertColumnNames(loaded, expected.column_names)

    expected_types = [Number, Text, Boolean, Date, DateTime, TimeDelta]
    self.assertColumnTypes(loaded, expected_types)

    # Row contents must be identical to the reference table.
    self.assertRows(loaded, expected.rows)

def test_from_csv_file_like_object(self):
table1 = Table(self.rows, self.column_names, self.column_types)

Expand Down Expand Up @@ -755,9 +773,18 @@ def test_from_csv_skip_lines(self):

self.assertRows(table2, table1.rows)

def test_from_csv_skip_lines_sequence(self):
table1 = Table([self.rows[1]], column_names=self.column_names, column_types=self.column_types)
table2 = Table.from_csv('examples/test.csv', skip_lines=(1, 3))
def test_from_csv_skip_lines_crlf(self):
table1 = Table(self.rows[1:], column_types=self.column_types)
table2 = Table.from_csv('examples/test_crlf.csv', header=False, skip_lines=2)

self.assertColumnNames(table2, table1.column_names)
self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])

self.assertRows(table2, table1.rows)

def test_from_csv_skip_lines_cr(self):
table1 = Table(self.rows[1:], column_types=self.column_types)
table2 = Table.from_csv('examples/test_cr.csv', header=False, skip_lines=2)

self.assertColumnNames(table2, table1.column_names)
self.assertColumnTypes(table2, [Number, Text, Boolean, Date, DateTime, TimeDelta])
Expand Down

0 comments on commit e4527cb

Please sign in to comment.