Merge pull request #1 from storebrand/files

Sync Sharepoint Files
storebrand · Oct 17, 2023 · c0fa9be · c0fa9be
2 parents f651a47 + 8596a6f
commit c0fa9be
Show file tree

Hide file tree

Showing 13 changed files with 1,513 additions and 886 deletions.
diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml
@@ -3,7 +3,11 @@
 
 name: Test tap-sharepointsites
 
-on: [push]
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
 
 jobs:
   linting:
@@ -39,7 +43,7 @@ jobs:
       GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.9]
 
     steps:
     - uses: actions/checkout@v2

diff --git a/README.md b/README.md
@@ -6,6 +6,56 @@ Built with the [Meltano Tap SDK](https://sdk.meltano.com) for Singer Taps.
 
 [![Test tap-sharepointsites](https://github.com/radbrt/tap-sharepointsites/actions/workflows/ci_workflow.yml/badge.svg)](https://github.com/radbrt/tap-sharepointsites/actions/workflows/ci_workflow.yml)
 
+## Capabilities
+
+* `catalog`
+* `state`
+* `discover`
+* `about`
+* `stream-maps`
+* `schema-flattening`
+* `batch`
+
+## Settings
+
+| Setting             | Required | Default | Description |
+|:--------------------|:--------:|:-------:|:------------|
+| api_url             | True     | None    | The URL for the API service |
+| lists               | True     | None    | The names of the lists to sync |
+| files               | False    | None    | Files to sync |
+| client_id           | False    | None    | Managed Identity Client ID |
+| stream_maps         | False    | None    | Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html). |
+| stream_map_config   | False    | None    | User-defined config values to be used within map expressions. |
+| flattening_enabled  | False    | None    | 'True' to enable schema flattening and automatically expand nested properties. |
+| flattening_max_depth| False    | None    | The max depth to flatten schemas. |
+| batch_config        | False    | None    |             |
+
+A full list of supported settings and capabilities is available by running: `tap-sharepointsites --about`
+
+
+## File config
+
+The file configuration accepts an array of objects, each with the following keys:
+- `name`: Name given to the stream/table
+- `file_pattern`: Regex-like pattern for filenames to load
+- `folder`: Subfolder where the files are located
+- `file_type`: Type (format) of file to load, either `csv` or `excel`
+- `delimiter`: Field delimiter for CSV files. Defaults to `,`
+
+Example config:
+
+```
+...
+  config:
+    ...
+    files:
+    - name: employees
+      file_pattern: employees_.*\.csv
+      folder: hr_data/raw
+      file_type: csv
+  ...
+```
+
 <!--
 
 Developer TODO: Update the below as needed to correctly describe the install procedure. For instance, if you do not have a PyPi repo, or if you want users to directly install from your git repo, you can modify this step as appropriate.

diff --git a/meltano.yml b/meltano.yml
@@ -30,6 +30,10 @@ plugins:
       label: Sharepoint Lists
       documentation: An array of lists to extract, each an object with `name` as a key.
       placeholder: '["my-list-1", "my-list-2"]'
+    - name: files
+      label: Sharepoint Files
+      documentation: An array of files to extract.
+      placeholder: '[{"name": "my-file-1", "file_pattern": ".*\\.csv", "file_type": "csv", "folder": "my-folder"}]'
     - name: client_id
       label: Managed Identity Client ID
       value: null

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,10 +10,12 @@ keywords = [
 license = "Apache 2.0"
 
 [tool.poetry.dependencies]
-python = "<3.11,>=3.7.1"
+python = "<3.14,>=3.7.1"
 requests = "^2.25.1"
-singer-sdk = "^0.11.1"
+singer-sdk = "^0.29.0"
 azure-identity = "^1.11"
+openpyxl = "^3.0.7"
+xlrd = "^2.0.1"
 
 [tool.poetry.dev-dependencies]
 pytest = "^6.2.5"

diff --git a/tap_sharepointsites/client.py b/tap_sharepointsites/client.py
@@ -29,12 +29,6 @@ class sharepointsitesStream(RESTStream):
     """sharepointsites stream class."""
 
     # OR use a dynamic url_base:
-    @property
-    def url_base(self) -> str:
-        """Return the API URL root, configurable via tap settings."""
-        return self.config["api_url"]
-
-    records_jsonpath = "$.value[*]"  # Or override `parse_response`.
 
     @property
     def authenticator(self) -> BearerTokenAuthenticator:

diff --git a/tap_sharepointsites/file_handlers/csv_handler.py b/tap_sharepointsites/file_handlers/csv_handler.py
@@ -0,0 +1,36 @@
+"""Handle CSV files."""
+
+import csv
+import logging
+import re
+
+LOGGER = logging.getLogger(__name__)
+
+
+class CSVHandler:
+    """Handle CSV files.
+
+    Holds CSV text in memory and exposes it as a ``csv.DictReader`` whose
+    column names have been normalised with ``format_key``.
+    """
+
+    def __init__(self, textcontent, delimiter=","):
+        """Initialize CSVHandler.
+
+        ``textcontent`` is the CSV document; ``get_dictreader`` calls
+        ``splitlines()`` on it. ``delimiter`` is the field separator passed
+        on to ``csv.DictReader`` and defaults to a comma.
+        """
+        self.textcontent = textcontent
+        self.delimiter = delimiter
+
+    @staticmethod
+    def format_key(key):
+        """Return ``key`` normalised for use as a field name.
+
+        Characters that are neither word characters nor whitespace are
+        removed, each run of whitespace is replaced by one underscore, and
+        the result is lower-cased.
+        """
+        formatted_key = re.sub(r"[^\w\s]", "", key)
+        formatted_key = re.sub(r"\s+", "_", formatted_key)
+        return formatted_key.lower()
+
+    def get_dictreader(self):
+        """Read CSV file and return csv DictReader object for the file.
+
+        The header is taken from the first line of the content and every
+        column name is passed through ``format_key`` before the reader is
+        returned.
+        """
+        # fieldnames=None tells DictReader to take the header from the first
+        # row; values beyond the header width are collected under the
+        # "_sdc_extra" key (restkey).
+        # NOTE(review): splitlines() drops line terminators, so a quoted
+        # field spanning several lines presumably loses its embedded
+        # newline - confirm whether such files need to be supported.
+        dr = csv.DictReader(
+            self.textcontent.splitlines(),
+            fieldnames=None,
+            restkey="_sdc_extra",
+            delimiter=self.delimiter,
+        )
+
+        # Reading dr.fieldnames consumes the header row; assigning to it
+        # replaces the raw names with their normalised form.
+        # NOTE(review): for content without any row, dr.fieldnames is None
+        # and .copy() raises AttributeError - confirm empty files cannot
+        # reach this point.
+        dr.fieldnames = [self.format_key(key) for key in dr.fieldnames.copy()]
+
+        return dr
diff --git a/tap_sharepointsites/file_handlers/excel_handler.py b/tap_sharepointsites/file_handlers/excel_handler.py
@@ -0,0 +1,66 @@
+"""Handle Excel files."""
+
+import logging
+import re
+import tempfile
+
+import openpyxl
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ExcelHandler:
+    """Handle Excel files.
+
+    Loads the first worksheet of a workbook supplied as in-memory content
+    and exposes its rows as dictionaries keyed by normalised header names.
+    """
+
+    def __init__(self, textcontent):
+        """Initialize ExcelHandler.
+
+        ``textcontent`` is the workbook content; it is written to a file
+        opened in binary mode, so it is expected to be bytes despite the
+        parameter name.
+        """
+        self.xlsheet = self._load_workbook(textcontent)
+
+    def _load_workbook(self, textcontent):
+        """Load workbook from textcontent and return its first worksheet.
+
+        The content is written to a temporary ``.xlsx`` file, which is then
+        opened with openpyxl in read-only mode.
+        """
+        # NOTE(review): the temporary file is closed (and, with the default
+        # settings, removed) when this ``with`` block exits, while the
+        # returned read-only worksheet is iterated later. Presumably this
+        # relies on openpyxl keeping its own handle to the file - confirm.
+        # The tempfile documentation also states that reopening a
+        # NamedTemporaryFile by name while it is still open is
+        # platform-dependent (not possible on Windows).
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=".xlsx") as temp:
+            temp.write(textcontent)
+            temp.flush()  # make sure the bytes are on disk before reopening
+            workbook = openpyxl.load_workbook(temp.name, read_only=True)
+            worksheets = workbook.worksheets
+            # Despite the name, this is the first sheet in the workbook, not
+            # necessarily the one marked active.
+            active_sheet = worksheets[0]
+            return active_sheet
+
+    def get_row_iterator(self):
+        """Return a generator of rows.
+
+        Each item is a dict produced by ``generator_wrapper`` from the
+        loaded worksheet.
+        """
+        yield from self.generator_wrapper(self.xlsheet)
+
+    @property
+    def fieldnames(self):
+        """Return fieldnames.
+
+        These are the raw values of the cells in row 1; they are not
+        normalised the way the keys emitted by ``generator_wrapper`` are.
+        """
+        return [c.value for c in self.xlsheet[1]]
+
+    @staticmethod
+    def generator_wrapper(reader):
+        """Wrap a reader in a generator.
+
+        The first row yielded by ``reader`` is used as the header and is not
+        emitted. Every later row becomes a dict that maps the normalised
+        header text of each column to ``str(cell.value)``, or to an empty
+        string when the cell value is None.
+        """
+        header_row = None
+        for row in reader:
+            to_return = {}
+            if header_row is None:
+                # First row: remember it as the header and emit nothing.
+                header_row = row
+                continue
+
+            for index, cell in enumerate(row):
+                # NOTE(review): raises IndexError if a data row has more
+                # cells than the header row - confirm rows are always at
+                # most as wide as the header.
+                header_cell = header_row[index]
+
+                formatted_key = header_cell.value
+                if not formatted_key:
+                    formatted_key = ""  # default to empty string for key
+
+                # NOTE(review): re.sub needs a string; a truthy non-string
+                # header value (e.g. a number) would raise TypeError here.
+
+                # remove non-word, non-whitespace characters
+                formatted_key = re.sub(r"[^\w\s]", "", formatted_key)
+
+                # replace whitespace with underscores
+                formatted_key = re.sub(r"\s+", "_", formatted_key)
+
+                # Columns whose headers normalise to the same key (including
+                # several empty headers) overwrite one another; the last
+                # column wins.
+                to_return[formatted_key.lower()] = (
+                    str(cell.value) if cell.value is not None else ""
+                )
+
+            yield to_return