Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix skip interval for spreadsheets with empty rows before data #55

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,4 @@ rsa-key
tags
singer-check-tap-data
state.json
.venv/
15 changes: 11 additions & 4 deletions tap_spreadsheets_anywhere/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@

LOGGER = logging.getLogger(__name__)

def generator_wrapper(reader):
def generator_wrapper(reader, table_spec: dict={}) -> dict:
skip_initial = table_spec.get("skip_initial", 0)
_skip_count = 0
header_row = None
for row in reader:
if _skip_count < skip_initial:
LOGGER.debug("Skipped (%d/%d) row: %r", _skip_count, skip_initial, row)
_skip_count += 1
continue

to_return = {}
if header_row is None:
header_row = row
Expand Down Expand Up @@ -58,11 +65,11 @@ def get_legacy_row_iterator(table_spec, file_handle):
except Exception as e:
LOGGER.info(e)
sheet = workbook.sheet_by_name(sheet_name_list[0])
return generator_wrapper(sheet.get_rows())
return generator_wrapper(sheet.get_rows(), table_spec)


def get_row_iterator(table_spec, file_handle):
workbook = openpyxl.load_workbook(file_handle, read_only=True)
workbook = openpyxl.load_workbook(file_handle.name, read_only=True)

if "worksheet_name" in table_spec:
try:
Expand All @@ -88,4 +95,4 @@ def get_row_iterator(table_spec, file_handle):
except Exception as e:
LOGGER.info(e)
active_sheet = worksheets[0]
return generator_wrapper(active_sheet)
return generator_wrapper(active_sheet, table_spec)
6 changes: 4 additions & 2 deletions tap_spreadsheets_anywhere/format_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,9 @@ def get_row_iterator(table_spec, uri):
except (ValueError,TypeError) as err:
raise InvalidFormatError(uri,message=err)

for _ in range(skip_initial):
next(iterator)
if format != 'excel':
# Reduce the scope of changes to fix Issue #52.
for _ in range(skip_initial):
next(iterator)

return iterator
Binary file not shown.
39 changes: 39 additions & 0 deletions tap_spreadsheets_anywhere/test/test_excel_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import logging
import pytest
from openpyxl import Workbook
from tap_spreadsheets_anywhere.excel_handler import generator_wrapper

LOGGER = logging.getLogger(__name__)


def get_worksheet():
"""Create a basic workbook that can be manipulated for tests.
See: https://openpyxl.readthedocs.io/en/stable/usage.html.
"""
wb = Workbook()
ws = wb.active
tree_data = [
["Type", "Leaf Color", "Height"],
["Maple", "Red", 549],
["Oak", "Green", 783],
["Pine", "Green", 1204]
]
exp_tree_data = [
{'type': 'Maple', 'leaf_color': 'Red', 'height': 549},
{'type': 'Oak', 'leaf_color': 'Green', 'height': 783},
{'type': 'Pine', 'leaf_color': 'Green', 'height': 1204},
]
[ws.append(row) for row in tree_data]
return ws, wb, tree_data, exp_tree_data


class TestExcelHandlerGeneratorWrapper:
"""Validate the expected state of the `excel_handler.generator_wrapper`."""
def test_parse_data(self):
worksheet, _, _, exp = get_worksheet()
_generator = generator_wrapper(worksheet)
assert next(_generator) == exp[0]
assert next(_generator) == exp[1]
assert next(_generator) == exp[2]
with pytest.raises(StopIteration):
next(_generator)
64 changes: 64 additions & 0 deletions tap_spreadsheets_anywhere/test/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,18 @@
import json
import logging
import unittest
from datetime import datetime
from pathlib import Path
from unittest.mock import patch

import dateutil
import pytest
import smart_open
from six import StringIO

from tap_spreadsheets_anywhere import configuration, file_utils, csv_handler, json_handler, generate_schema
from tap_spreadsheets_anywhere.format_handler import monkey_patch_streamreader, get_row_iterator
from tap_spreadsheets_anywhere.test.test_excel_handler import get_worksheet


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -204,3 +208,63 @@ def test_renamed_https_object(self):

row = next(iterator)
self.assertTrue(len(row)>1,"Not able to read a row.")


class TestFormatHandlerExcelXlsxSkipInitial:
"""pytests to validate Skip Initial for Excel `.xlsx` files works as expected."""
bad_file = Path(__file__).cwd() / "sample_with_bad_blank_line_above_headings.xlsx"
uri = f"file://{bad_file.absolute()}"

def test_validate_iterator(self, tmpdir):
xlsx = tmpdir / "fake_test.xlsx"
uri = f"file://{xlsx}"
_, workbook, _, exp = get_worksheet()
workbook.save(xlsx)

iterator = get_row_iterator({"format": "excel"}, uri)
assert next(iterator) == exp[0]
assert next(iterator) == exp[1]
assert next(iterator) == exp[2]
with pytest.raises(StopIteration):
next(iterator)

def test_bad_blank_line_above_headings_raises(self):
"""Test to verify a sample file that raises #52.
Iteratting through this bad sample file will currently fail
when parsing the blank line.
"""
table_spec = {"format": "excel"}
iterator = get_row_iterator(table_spec, self.uri)
with pytest.raises(IndexError):
for _ in iterator:
continue

def test_bad_blank_line_above_headings_skip_initial_over_bad_row(self):
"""Test to verify a sample file that raises #52, does not fail when
using: `skip_interval`, to avoid the bad row.
"""
# NOTE: that `get_row_iterator` will compress the header row and each
# subsequent data row together, so count one less row than in the file
# + expect a dict.
exp = {
'account': 'Sales - Commission Fees',
'account_code': 9999.0,
'account_type': 'Revenue',
'contact': 'Company A Limited',
'credit_gbp': 123.45,
'date': datetime(2023, 1, 31, 0, 0),
'debit_gbp': 0.0,
'description': 'Description for Company A',
'gross_gbp': 123.45,
'invoice_number': 'INV-1234',
'net_gbp': 1234.45,
'reference': 'REF-1234',
'revenue_type': 'Commission Fees',
'vat_gbp': 0.0,
}
table_spec = {"format": "excel", "skip_initial": 4}
# NOTE: `get_row_iterator` should no longer fail with Issue #52, now
# that: `excel_handler.generator_wrapper` is not parsing skipped rows.
iterator = get_row_iterator(table_spec, self.uri)
# Assert that the expected row, after skipping, is next.
assert next(iterator) == exp