Skip to content
This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

Commit

Permalink
[PRED-2644] Fix decoding error in case of dialect detection
Browse files Browse the repository at this point in the history
  • Loading branch information
falkerson committed Jul 5, 2019
1 parent 13da21e commit fc417b9
Showing 1 changed file with 22 additions and 8 deletions.
30 changes: 22 additions & 8 deletions datarobot_batch_scoring/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.p.terminate()


def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
def sniff_dialect(sample, sep, skip_dialect, ui):
t1 = time()
try:
if skip_dialect:
Expand All @@ -396,11 +396,11 @@ def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
dialect = csv.get_dialect('dataset_dialect')
else:
sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample.decode(encoding), delimiters=sep)
dialect = sniffer.sniff(sample, delimiters=sep)
ui.debug('investigate_encoding_and_dialect - seconds to detect '
'csv dialect: {}'.format(time() - t1))
except csv.Error:
decoded_one = sample.decode(encoding)
decoded_one = sample
t2 = time()
detector = Detector()
delimiter, resampled = detector.detect(decoded_one)
Expand Down Expand Up @@ -432,6 +432,19 @@ def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
return dialect


def get_opener_and_mode(is_gz, text=False):
mode = 'r' if text else 'rb'
if is_gz:
return (gzip.open, mode)
elif six.PY2:
if text:
from io import open as io_open
return (io_open, 'r')
return (open, 'rU')
else:
return (open, mode)


def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
encoding=None, skip_dialect=False,
output_delimiter=None):
Expand All @@ -445,10 +458,7 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
sample_size = DETECT_SAMPLE_SIZE_SLOW

is_gz = dataset.endswith('.gz')
opener, mode = (
(gzip.open, 'rb') if is_gz
else (open, ('rU' if six.PY2 else 'rb'))
)
opener, mode = get_opener_and_mode(is_gz, text=True)
with opener(dataset, mode) as dfile:
sample = dfile.read(sample_size)

Expand All @@ -462,8 +472,12 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
encoding = encoding.lower()
sample[:1000].decode(encoding) # Fail here if the encoding is invalid

opener, mode = get_opener_and_mode(is_gz, text=True)
with opener(dataset, mode, encoding=encoding) as dfile:
sample = dfile.read(sample_size)

try:
dialect = sniff_dialect(sample, encoding, sep, skip_dialect, ui)
dialect = sniff_dialect(sample, sep, skip_dialect, ui)
except csv.Error as ex:
ui.fatal(ex)
if len(sample) < 10:
Expand Down

0 comments on commit fc417b9

Please sign in to comment.