Skip to content
This repository has been archived by the owner on Oct 17, 2024. It is now read-only.

Commit

Permalink
Merge pull request #3 from storebrand/colnamefix
Browse files Browse the repository at this point in the history
Better column name handling
  • Loading branch information
hholgersen authored Nov 7, 2023
2 parents 5a1333a + 589b26b commit d790cd3
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 27 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ The file configuration accepts an array of objects, with keys:
- `folder`: Subfolder where the files are located
- `file_type`: Type (format) of file to load, either `csv` or `excel`.
- `delimiter`: Field delimiter for CSV files. Default: `,`.
- `clean_colnames`: Whether to convert column names to snake_case (runs of non-alphanumeric characters are replaced with a single underscore). Default: `false`.

Example config:

Expand All @@ -53,6 +54,7 @@ Example config:
file_pattern: employees_.*\.csv
folder: hr_data/raw
file_type: csv
clean_colnames: true
...
```

Expand Down
12 changes: 3 additions & 9 deletions tap_sharepointsites/file_handlers/csv_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import logging
import re

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand All @@ -14,13 +16,7 @@ def __init__(self, textcontent, delimiter=","):
"""Initialize ExcelHandler."""
self.textcontent = textcontent
self.delimiter = delimiter

@staticmethod
def format_key(key):
"""Format key."""
formatted_key = re.sub(r"[^\w\s]", "", key)
formatted_key = re.sub(r"\s+", "_", formatted_key)
return formatted_key.lower()
self.clean_colnames = clean_colnames

def get_dictreader(self):
"""Read CSV file and return csv DictReader object for the file."""
Expand All @@ -31,6 +27,4 @@ def get_dictreader(self):
delimiter=self.delimiter,
)

dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()]

return dr
14 changes: 5 additions & 9 deletions tap_sharepointsites/file_handlers/excel_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import openpyxl

from tap_sharepointsites.utils import snakecase

LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -36,8 +38,7 @@ def fieldnames(self):
"""Return fieldnames."""
return [c.value for c in self.xlsheet[1]]

@staticmethod
def generator_wrapper(reader):
def generator_wrapper(self, reader):
"""Wrap a reader in a generator."""
header_row = None
for row in reader:
Expand All @@ -50,16 +51,11 @@ def generator_wrapper(reader):
header_cell = header_row[index]

formatted_key = header_cell.value

if not formatted_key:
formatted_key = "" # default to empty string for key

# remove non-word, non-whitespace characters
formatted_key = re.sub(r"[^\w\s]", "", formatted_key)

# replace whitespace with underscores
formatted_key = re.sub(r"\s+", "_", formatted_key)

to_return[formatted_key.lower()] = (
to_return[formatted_key] = (
str(cell.value) if cell.value is not None else ""
)

Expand Down
19 changes: 10 additions & 9 deletions tap_sharepointsites/file_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from tap_sharepointsites.client import sharepointsitesStream
from tap_sharepointsites.file_handlers.csv_handler import CSVHandler
from tap_sharepointsites.file_handlers.excel_handler import ExcelHandler
from tap_sharepointsites.utils import snakecase


class FilesStream(sharepointsitesStream):
Expand Down Expand Up @@ -126,6 +127,10 @@ def parse_response(self, response: requests.Response, context) -> t.Iterable[dic
raise Exception(f"File type { filetype_name } not supported (yet)")

for i, row in enumerate(dr):

if self.file_config.get("clean_colnames", False):
row = {snakecase(k): v for k, v in row.items()}

row.update(
{
"_sdc_source_file": record["name"],
Expand Down Expand Up @@ -156,15 +161,11 @@ def schema(self):
dr = ExcelHandler(file)

properties = {}
formatted_key = [
re.sub(r"[^\w\s]", "", formatted_key)
for formatted_key in dr.fieldnames
]
formatted_key = [
re.sub(r"\s+", "_", formatted_key)
for formatted_key in formatted_key
]
fieldnames = [formatted_key.lower() for formatted_key in formatted_key]

fieldnames = [name for name in dr.fieldnames]

if self.file_config.get("clean_colnames", False):
fieldnames = [snakecase(name) for name in fieldnames]

extra_cols = [
"_sdc_source_file",
Expand Down
7 changes: 7 additions & 0 deletions tap_sharepointsites/tap.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ class Tapsharepointsites(Tap):
required=False,
description="For CSV files: the delimiter to use",
),
th.Property(
"clean_colnames",
th.BooleanType,
required=False,
default=False,
description="Replace special characters and convert to snakecase",
),
),
),
required=False,
Expand Down
15 changes: 15 additions & 0 deletions tap_sharepointsites/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import re


def snakecase(name):
    """Convert a column name to lower-case snake_case.

    camelCase / PascalCase word boundaries are split with underscores, every
    run of characters outside ``[a-zA-Z0-9_]`` becomes a single underscore,
    repeated underscores are collapsed, and the result is lower-cased.
    Leading and trailing underscores are left in place.

    Args:
        name: The raw column name. ``None`` is treated as an empty name and
            any other non-string value (e.g. a numeric header) is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned column name as a ``str``.

    >>> snakecase("First Name")
    'first_name'
    >>> snakecase("HTTPResponseCode")
    'http_response_code'
    >>> snakecase(None)
    ''
    """
    # Callers pass raw header values, which are not guaranteed to be strings
    # (an empty header cell can surface as None); re.sub would raise
    # TypeError on those, so normalise to text first.
    if name is None:
        return ""
    if not isinstance(name, str):
        name = str(name)

    # Convert camelCase to snake_case. The first pass splits before a
    # capitalised word ("ResponseCode" -> "Response_Code"); the second pass
    # catches a lower-case letter or digit directly followed by a capital.
    name = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", name)

    # Replace any run of non-alphanumeric characters with an underscore.
    name = re.sub(r"[^a-zA-Z0-9_]+", "_", name)

    # Collapse any sequence of multiple underscores into a single underscore.
    name = re.sub(r"_{2,}", "_", name)

    return name.lower()

0 comments on commit d790cd3

Please sign in to comment.