diff --git a/README.md b/README.md index 31574dc..e61224a 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ The file configuration accepts an array of objects, with keys: - `folder`: Subfolder where the files are located - `file_type`: Type (format) of file to load, either `csv` or `excel`. - `delimiter`: Field delimiter for CSV files. default `,` +- `clean_colnames`: Whether to convert column names to snake_case. default `false` Example config: @@ -53,6 +54,7 @@ Example config: file_pattern: employees_.*\.csv folder: hr_data/raw file_type: csv + clean_colnames: true ... ``` diff --git a/tap_sharepointsites/file_handlers/csv_handler.py b/tap_sharepointsites/file_handlers/csv_handler.py index 894c8a3..44d3ef8 100644 --- a/tap_sharepointsites/file_handlers/csv_handler.py +++ b/tap_sharepointsites/file_handlers/csv_handler.py @@ -4,6 +4,8 @@ import logging import re +from tap_sharepointsites.utils import snakecase + LOGGER = logging.getLogger(__name__) @@ -14,13 +16,7 @@ def __init__(self, textcontent, delimiter=","): """Initialize ExcelHandler.""" self.textcontent = textcontent self.delimiter = delimiter - - @staticmethod - def format_key(key): - """Format key.""" - formatted_key = re.sub(r"[^\w\s]", "", key) - formatted_key = re.sub(r"\s+", "_", formatted_key) - return formatted_key.lower() + self.clean_colnames = clean_colnames def get_dictreader(self): """Read CSV file and return csv DictReader object for the file.""" @@ -31,6 +27,4 @@ def get_dictreader(self): delimiter=self.delimiter, ) - dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()] - return dr diff --git a/tap_sharepointsites/file_handlers/excel_handler.py b/tap_sharepointsites/file_handlers/excel_handler.py index c3c5113..79b4e30 100644 --- a/tap_sharepointsites/file_handlers/excel_handler.py +++ b/tap_sharepointsites/file_handlers/excel_handler.py @@ -6,6 +6,8 @@ import openpyxl +from tap_sharepointsites.utils import snakecase + LOGGER = logging.getLogger(__name__) @@ -36,8 
+38,7 @@ def fieldnames(self): """Return fieldnames.""" return [c.value for c in self.xlsheet[1]] - @staticmethod - def generator_wrapper(reader): + def generator_wrapper(self, reader): """Wrap a reader in a generator.""" header_row = None for row in reader: @@ -50,16 +51,11 @@ def generator_wrapper(reader): header_cell = header_row[index] formatted_key = header_cell.value + if not formatted_key: formatted_key = "" # default to empty string for key - # remove non-word, non-whitespace characters - formatted_key = re.sub(r"[^\w\s]", "", formatted_key) - - # replace whitespace with underscores - formatted_key = re.sub(r"\s+", "_", formatted_key) - - to_return[formatted_key.lower()] = ( + to_return[formatted_key] = ( str(cell.value) if cell.value is not None else "" ) diff --git a/tap_sharepointsites/file_stream.py b/tap_sharepointsites/file_stream.py index 7a834d9..e8f1e78 100644 --- a/tap_sharepointsites/file_stream.py +++ b/tap_sharepointsites/file_stream.py @@ -12,6 +12,7 @@ from tap_sharepointsites.client import sharepointsitesStream from tap_sharepointsites.file_handlers.csv_handler import CSVHandler from tap_sharepointsites.file_handlers.excel_handler import ExcelHandler +from tap_sharepointsites.utils import snakecase class FilesStream(sharepointsitesStream): @@ -126,6 +127,10 @@ def parse_response(self, response: requests.Response, context) -> t.Iterable[dic raise Exception(f"File type { filetype_name } not supported (yet)") for i, row in enumerate(dr): + + if self.file_config.get("clean_colnames", False): + row = {snakecase(k): v for k, v in row.items()} + row.update( { "_sdc_source_file": record["name"], @@ -156,15 +161,11 @@ def schema(self): dr = ExcelHandler(file) properties = {} - formatted_key = [ - re.sub(r"[^\w\s]", "", formatted_key) - for formatted_key in dr.fieldnames - ] - formatted_key = [ - re.sub(r"\s+", "_", formatted_key) - for formatted_key in formatted_key - ] - fieldnames = [formatted_key.lower() for formatted_key in formatted_key] + + 
import re

# Compiled once at import time: snakecase() is applied to every key of every
# row, so the patterns should not be looked up again on each call.
_CAPITALIZED_WORD = re.compile(r"(.)([A-Z][a-z]+)")
_LOWER_THEN_UPPER = re.compile(r"([a-z0-9])([A-Z])")
_NON_IDENTIFIER_RUN = re.compile(r"[^a-zA-Z0-9_]+")
_UNDERSCORE_RUN = re.compile(r"_{2,}")


def snakecase(name):
    """Convert a column name to lower-case snake_case.

    The conversion is applied in this order:

    1. An underscore is inserted at camelCase / PascalCase word boundaries.
    2. Every run of characters outside ``[a-zA-Z0-9_]`` becomes an underscore.
    3. Runs of underscores are collapsed to a single underscore.
    4. The result is lower-cased.

    Leading and trailing underscores are kept, including those produced by
    leading or trailing punctuation or whitespace.

    Args:
        name: The raw column name. ``None`` is treated as an empty name and
            any other non-string value (for example a numeric spreadsheet
            header) is converted with ``str()`` first.

    Returns:
        The snake_case form of ``name`` as a ``str``.

    Examples:
        >>> snakecase("camelCase")
        'camel_case'
        >>> snakecase("HTTPResponse")
        'http_response'
        >>> snakecase("First Name")
        'first_name'
        >>> snakecase("Amount ($)")
        'amount_'
        >>> snakecase(None)
        ''
    """
    # Header cells are not guaranteed to be strings: an empty cell yields
    # None and a numeric header yields an int/float. The regex functions
    # raise TypeError on those, so normalise to text first.
    if name is None:
        return ""
    text = name if isinstance(name, str) else str(name)

    # Split camelCase / PascalCase words: "fooBar" -> "foo_Bar",
    # "HTTPResponse" -> "HTTP_Response".
    text = _CAPITALIZED_WORD.sub(r"\1_\2", text)
    text = _LOWER_THEN_UPPER.sub(r"\1_\2", text)

    # Anything that is not an ASCII letter, digit or underscore is a separator.
    text = _NON_IDENTIFIER_RUN.sub("_", text)

    # The steps above can place underscores next to each other.
    text = _UNDERSCORE_RUN.sub("_", text)

    return text.lower()