Skip to content

Commit

Permalink
(core) Switch excel import parsing from messytables+xlrd to openpyxl,…
Browse files Browse the repository at this point in the history
… and ignore empty rows

Summary:
Use openpyxl instead of messytables (which used xlrd internally) in import_xls.py.

Skip empty rows since excel files can easily contain huge numbers of them.

Drop support for xls files (which openpyxl doesn't support) in favour of the newer xlsx format.

Fix some details relating to python virtualenvs and dependencies, as Jenkins was failing to find new Python dependencies.

Test Plan: Mostly relying on existing tests. Updated various tests which referred to xls files instead of xlsx. Added a Python test for skipping empty rows.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3406
  • Loading branch information
alexmojaki committed May 12, 2022
1 parent 0a61d74 commit 6c90de4
Show file tree
Hide file tree
Showing 15 changed files with 130 additions and 138 deletions.
2 changes: 1 addition & 1 deletion app/client/lib/uploads.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export interface SelectFileOptions extends UploadOptions {
// e.g. [".jpg", ".png"]
}

export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xls", ".xlsx", ".xlsm"];
export const IMPORTABLE_EXTENSIONS = [".grist", ".csv", ".tsv", ".txt", ".xlsx", ".xlsm"];

/**
* Shows the file-picker dialog with the given options, and uploads the selected files. If under
Expand Down
4 changes: 2 additions & 2 deletions app/server/lib/ActiveDocImport.ts
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ export class ActiveDocImport {
/**
* Imports the data stored at tmpPath.
*
* Currently it starts a python parser (that relies on the messytables library) as a child process
* outside the sandbox, and supports xls(x), csv, txt, and perhaps some other formats. It may
* Currently it starts a python parser as a child process
* outside the sandbox, and supports xlsx, csv, and perhaps some other formats. It may
* result in the import of multiple tables, in case of e.g. Excel formats.
* @param {OptDocSession} docSession: Session instance to use for importing.
* @param {String} tmpPath: The path from of the original file.
Expand Down
1 change: 0 additions & 1 deletion app/server/lib/DocPluginManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,6 @@ export class DocPluginManager {
if (messages.length) {
const extToType: Record<string, string> = {
'.xlsx' : 'Excel',
'.xls' : 'Excel',
'.json' : 'JSON',
'.csv' : 'CSV',
};
Expand Down
10 changes: 5 additions & 5 deletions app/server/lib/NSandbox.ts
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,8 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
// if checkpoints are in use.
const venv = path.join(process.cwd(),
pythonVersion === '2' ? 'venv' : 'sandbox_venv3');
if (fs.existsSync(venv) && !process.env.GRIST_CHECKPOINT) {
const useCheckpoint = process.env.GRIST_CHECKPOINT && !paths.importDir;
if (fs.existsSync(venv) && !useCheckpoint) {
wrapperArgs.addMount(venv);
wrapperArgs.push('-s', path.join(venv, 'bin', 'python'));
}
Expand All @@ -570,17 +571,16 @@ function gvisor(options: ISandboxOptions): SandboxProcess {
// between the checkpoint and how it gets used later).
// If a sandbox is being used for import, it will have a special mount we can't
// deal with easily right now. Should be possible to do in future if desired.
if (options.useGristEntrypoint && pythonVersion === '3' && !paths.importDir &&
process.env.GRIST_CHECKPOINT) {
if (options.useGristEntrypoint && pythonVersion === '3' && useCheckpoint) {
if (process.env.GRIST_CHECKPOINT_MAKE) {
const child =
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT,
spawn(command, [...wrapperArgs.get(), '--checkpoint', process.env.GRIST_CHECKPOINT!,
`python${pythonVersion}`, '--', ...pythonArgs]);
// We don't want process control for this.
return {child, control: new NoProcessControl(child)};
}
wrapperArgs.push('--restore');
wrapperArgs.push(process.env.GRIST_CHECKPOINT);
wrapperArgs.push(process.env.GRIST_CHECKPOINT!);
}
const child = spawn(command, [...wrapperArgs.get(), `python${pythonVersion}`, '--', ...pythonArgs]);
// For gvisor under ptrace, main work is done by a traced process identifiable as
Expand Down
2 changes: 1 addition & 1 deletion app/server/lib/guessExt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export async function guessExt(filePath: string, fileName: string, mimeType: str
return mimeExt;
}

if (origExt === ".csv" || origExt === ".xls") {
if (origExt === ".csv") {
// File type detection doesn't work for these, and mime type can't be trusted. E.g. Windows
// may report "application/vnd.ms-excel" for .csv files. See
// https://github.com/ManifoldScholar/manifold/issues/2409#issuecomment-545152220
Expand Down
4 changes: 2 additions & 2 deletions plugins/core/manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ components:
safePython: sandbox/main.py
contributions:
fileParsers:
- fileExtensions: ["csv"]
- fileExtensions: ["csv", "tsv", "txt"]
parseFile:
component: safePython
name: csv_parser
- fileExtensions: ["xls", "xlsx", "tsv", "txt", "xlsm"]
- fileExtensions: ["xlsx", "xlsm"]
parseFile:
component: safePython
name: xls_parser
Expand Down
16 changes: 16 additions & 0 deletions sandbox/grist/imports/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import warnings

import six

original_formatwarning = warnings.formatwarning


def formatwarning(*args, **kwargs):
"""
Fixes an error on Jenkins where byte strings (instead of unicode)
were being written to stderr due to a warning from an internal library.
"""
return six.ensure_text(original_formatwarning(*args, **kwargs))


warnings.formatwarning = formatwarning
Binary file not shown.
7 changes: 5 additions & 2 deletions sandbox/grist/imports/import_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,11 @@ def _parse_open_file(file_obj, parse_options=None):
headers = [''] * len(headers)

row_set.register_processor(messytables.offset_processor(data_offset))

table_data_with_types = parse_data.get_table_data(row_set, len(headers), num_rows)
rows = [
[cell.value for cell in row]
for row in row_set
]
table_data_with_types = parse_data.get_table_data(rows, len(headers), num_rows)

# Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = []
Expand Down
128 changes: 37 additions & 91 deletions sandbox/grist/imports/import_xls.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
This module reads a file path that is passed in using ActiveDoc.importFile()
and returns a object formatted so that it can be used by grist for a bulk add records action
"""
import csv
import logging
import os

import chardet
import messytables
import messytables.excel
import openpyxl
import six
from six.moves import zip

Expand All @@ -18,76 +15,58 @@
log = logging.getLogger(__name__)


def import_file(file_source, parse_options):
def import_file(file_source):
path = import_utils.get_path(file_source["path"])
orig_name = file_source["origName"]
parse_options, tables = parse_file(path, orig_name, parse_options)
parse_options, tables = parse_file(path)
return {"parseOptions": parse_options, "tables": tables}

# messytable is painfully un-extensible, so we have to jump through dumb hoops to override any
# behavior.
orig_dialect = messytables.CSVRowSet._dialect
def override_dialect(self):
if self.delimiter == '\t':
return csv.excel_tab
return orig_dialect.fget(self)
messytables.CSVRowSet._dialect = property(override_dialect)

def parse_file(file_path, orig_name, parse_options=None, table_name_hint=None, num_rows=None):
# pylint: disable=unused-argument

def parse_file(file_path):
with open(file_path, "rb") as f:
try:
return parse_open_file(f, orig_name, table_name_hint=table_name_hint)
except Exception as e:
# Log the full error, but simplify the thrown error to omit the unhelpful extra args.
log.warn("import_xls parse_file failed: %s", e)
if six.PY2 and e.args and isinstance(e.args[0], six.string_types):
raise Exception(e.args[0])
raise


def parse_open_file(file_obj, orig_name, table_name_hint=None):
file_root, file_ext = os.path.splitext(orig_name)
table_set = messytables.any.any_tableset(file_obj, extension=file_ext, auto_detect=False)

# Messytable's encoding detection uses too small a sample, so we override it here.
if isinstance(table_set, messytables.CSVTableSet):
sample = file_obj.read(100000)
table_set.encoding = chardet.detect(sample)['encoding']
# In addition, always prefer UTF8 over ASCII.
if table_set.encoding == 'ascii':
table_set.encoding = 'utf8'
return parse_open_file(f)


def parse_open_file(file_obj):
workbook = openpyxl.load_workbook(
file_obj,
read_only=True,
keep_vba=False,
data_only=True,
keep_links=False,
)

skipped_tables = 0
export_list = []
# A table set is a collection of tables:
for row_set in table_set.tables:
table_name = row_set.name

if isinstance(row_set, messytables.CSVRowSet):
# For csv files, we can do better for table_name by using the filename.
table_name = import_utils.capitalize(table_name_hint or
os.path.basename(file_root.decode('utf8')))

# Messytables doesn't guess whether headers are present, so we need to step in.
data_offset, headers = import_utils.headers_guess(list(row_set.sample))
else:
# Let messytables guess header names and the offset of the header.
offset, headers = messytables.headers_guess(row_set.sample)
data_offset = offset + 1 # Add the header line
for sheet in workbook:
table_name = sheet.title
rows = [
row
for row in sheet.iter_rows(values_only=True)
# Exclude empty rows, i.e. rows with only empty values.
# `if not any(row)` would be slightly faster, but would count `0` as empty.
if not set(row) <= {None, ""}
]
sample = [
# Create messytables.Cells for the sake of messytables.headers_guess
[messytables.Cell(cell) for cell in row]
for row in rows[:1000]
]
offset, headers = messytables.headers_guess(sample)
data_offset = offset + 1 # Add the header line
rows = rows[data_offset:]

# Make sure all header values are strings.
for i, header in enumerate(headers):
if not isinstance(header, six.string_types):
if header is None:
headers[i] = u''
elif not isinstance(header, six.string_types):
headers[i] = six.text_type(header)

log.debug("Guessed data_offset as %s", data_offset)
log.debug("Guessed headers as: %s", headers)

row_set.register_processor(messytables.offset_processor(data_offset))


table_data_with_types = parse_data.get_table_data(row_set, len(headers))
table_data_with_types = parse_data.get_table_data(rows, len(headers))

# Identify and remove empty columns, and populate separate metadata and data lists.
column_metadata = []
Expand Down Expand Up @@ -121,36 +100,3 @@ def parse_open_file(file_obj, orig_name, table_name_hint=None):

parse_options = {}
return parse_options, export_list


# This change was initially introduced in https://phab.getgrist.com/D2145
# Monkey-patching done in https://phab.getgrist.com/D2965
# to move towards normal dependency management
@staticmethod
def from_xlrdcell(xlrd_cell, sheet, col, row):
from messytables.excel import (
XLS_TYPES, StringType, DateType, InvalidDateError, xlrd, time, datetime, XLSCell
)
value = xlrd_cell.value
cell_type = XLS_TYPES.get(xlrd_cell.ctype, StringType())
if cell_type == DateType(None):
# Try-catch added by Dmitry, to avoid failing even if we see a date we can't handle.
try:
if value == 0:
raise InvalidDateError
year, month, day, hour, minute, second = \
xlrd.xldate_as_tuple(value, sheet.book.datemode)
if (year, month, day) == (0, 0, 0):
value = time(hour, minute, second)
else:
value = datetime(year, month, day, hour, minute, second)
except Exception:
# Keep going, and we'll just interpret the date as a number.
pass
messy_cell = XLSCell(value, type=cell_type)
messy_cell.sheet = sheet
messy_cell.xlrd_cell = xlrd_cell
messy_cell.xlrd_pos = (row, col) # necessary for properties, note not (x,y)
return messy_cell

messytables.excel.XLSCell.from_xlrdcell = from_xlrdcell
3 changes: 2 additions & 1 deletion sandbox/grist/imports/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@ def parse_csv(file_source, options):
sandbox.register("csv_parser.parseFile", parse_csv)

def parse_excel(file_source, parse_options):
# pylint: disable=unused-argument
from imports.import_xls import import_file
return import_file(file_source, parse_options)
return import_file(file_source)

sandbox.register("xls_parser.parseFile", parse_excel)

Expand Down
Loading

0 comments on commit 6c90de4

Please sign in to comment.