This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

[PRED-2644] Fix decoding error in case of dialect detection
We open the file in binary mode and read N bytes to detect its encoding.
Later we use these bytes to detect the CSV dialect, but first we decode()
them into a string (unicode). Because N is a fixed constant, the read can
cut off in the middle of a multi-byte character, and decode() then fails
on that truncated character.
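
A minimal sketch of the failure mode (illustration only, not code from this commit; the sample bytes are made up):

    # Reading a fixed number of bytes can split a multi-byte UTF-8 character,
    # so a later decode() fails on the torn character.
    data = 'abc\u00e9'.encode('utf-8')   # b'abc\xc3\xa9' -- 5 bytes on disk
    sample = data[:4]                    # fixed-size read stops inside '\xc3\xa9'
    try:
        sample.decode('utf-8')
    except UnicodeDecodeError as exc:
        print(exc)                       # "... unexpected end of data"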
falkerson committed Jul 5, 2019
1 parent 13da21e commit 4a676da
Showing 1 changed file with 22 additions and 8 deletions.
30 changes: 22 additions & 8 deletions datarobot_batch_scoring/reader.py
@@ -383,7 +383,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
self.p.terminate()


-def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
+def sniff_dialect(sample, sep, skip_dialect, ui):
t1 = time()
try:
if skip_dialect:
@@ -396,11 +396,11 @@ def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
dialect = csv.get_dialect('dataset_dialect')
else:
sniffer = csv.Sniffer()
-            dialect = sniffer.sniff(sample.decode(encoding), delimiters=sep)
+            dialect = sniffer.sniff(sample, delimiters=sep)
ui.debug('investigate_encoding_and_dialect - seconds to detect '
'csv dialect: {}'.format(time() - t1))
except csv.Error:
-        decoded_one = sample.decode(encoding)
+        decoded_one = sample
t2 = time()
detector = Detector()
delimiter, resampled = detector.detect(decoded_one)
@@ -432,6 +432,19 @@ def sniff_dialect(sample, encoding, sep, skip_dialect, ui):
return dialect


+def get_opener_and_mode(is_gz, text=False):
+    mode = 'r' if text else 'rb'
+    if is_gz:
+        return (gzip.open, mode)
+    elif six.PY2:
+        if text:
+            from io import open as io_open
+            return (io_open, 'r')
+        return (open, 'rU')
+    else:
+        return (open, mode)


def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
encoding=None, skip_dialect=False,
output_delimiter=None):
@@ -445,10 +458,7 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
sample_size = DETECT_SAMPLE_SIZE_SLOW

is_gz = dataset.endswith('.gz')
-    opener, mode = (
-        (gzip.open, 'rb') if is_gz
-        else (open, ('rU' if six.PY2 else 'rb'))
-    )
+    opener, mode = get_opener_and_mode(is_gz)
with opener(dataset, mode) as dfile:
sample = dfile.read(sample_size)

@@ -462,8 +472,12 @@ def investigate_encoding_and_dialect(dataset, sep, ui, fast=False,
encoding = encoding.lower()
sample[:1000].decode(encoding) # Fail here if the encoding is invalid

+    opener, mode = get_opener_and_mode(is_gz, text=True)
+    with opener(dataset, mode, encoding=encoding) as dfile:
+        sample = dfile.read(sample_size)

try:
-        dialect = sniff_dialect(sample, encoding, sep, skip_dialect, ui)
+        dialect = sniff_dialect(sample, sep, skip_dialect, ui)
except csv.Error as ex:
ui.fatal(ex)
if len(sample) < 10:
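The commit's approach is to re-read the sample in text mode once the encoding is known, so the codec only ever returns complete characters and no torn bytes reach the dialect sniffer. A rough sketch of that idea outside the project's code (hypothetical file name and encoding, standard library only):

    import io

    # Hypothetical illustration of the approach: read N *characters* in text
    # mode with the detected encoding; the decoder buffers any partial bytes
    # internally, so the returned sample never ends mid-character.
    with io.open('dataset.csv', 'r', encoding='utf-8') as dfile:
        sample = dfile.read(1000)   # 1000 fully decoded characters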
