Merge pull request #3 from ZeroGachis/feature/custom-schema

✨ Enable custom schema
ZeroGachis · Oct 21, 2022 · 6061170 · 6061170
2 parents a1407be + d76a2f9
commit 6061170
Show file tree

Hide file tree

Showing 14 changed files with 156 additions and 46 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,3 +17,5 @@ jobs:
         run: python3 -m black --check --diff magicparse/ tests/ setup.py
       - name: flake8
         run: python3 -m flake8
+      - name: Pytest
+        run: python3 -m pytest
diff --git a/README.md b/README.md
@@ -37,6 +37,7 @@ from uuid import UUID
 import magicparse
 
 class GuidConverter(magicparse.Converter):
+    @staticmethod
     def key() -> str:
         return "guid"
 
@@ -46,18 +47,46 @@ class GuidConverter(magicparse.Converter):
 
 magicparse.register(GuidConverter)
 
-schema = Schema.build(
-    {
-        "file_type": "csv",
-        "fields": [{"key": "shop-guid", "type": "guid", "column-number": 1}],
-    }
-)
+schema = {
+    "file_type": "csv",
+    "fields": [
+        {"key": "shop-guid", "type": "guid", "column-number": 1}
+    ],
+}
 
-rows, errors = schema.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2")
+rows, errors = magicparse.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2", schema)
 assert rows == [{"shop-guid": "13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2"}]
 assert not errors
 ```
 
+### Register a custom schema and parse content
+
+```python
+import magicparse
+
+class PipedSchema(magicparse.Schema):
+    @staticmethod
+    def key() -> str:
+        return "piped"
+
+    def get_reader(self, stream):
+        for item in stream.read().split("|"):
+            yield [item]
+
+magicparse.register(PipedSchema)
+
+schema = {
+    "file_type": "piped",
+    "fields": [
+        {"key": "name", "type": "str", "column-number": 1}
+    ]
+}
+
+rows, errors = magicparse.parse("Joe|William|Jack|Averell", schema)
+assert not errors
+assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name": "Averell"}]
+```
+
 ## API
 
 ### File types

diff --git a/magicparse/__init__.py b/magicparse/__init__.py
@@ -1,4 +1,4 @@
-from .schema import Schema
+from .schema import Schema, builtins as builtins_schemas
 from .post_processors import PostProcessor, builtins as builtins_post_processors
 from .pre_processors import PreProcessor, builtins as builtins_pre_processors
 from .transform import Transform
@@ -22,25 +22,31 @@ def parse(data: str, schema_options: Dict[str, Any]) -> Tuple[List[dict], List[d
     return schema.parse(data)
 
 
-def register(transforms: Union[Transform, List[Transform]]) -> None:
-    if isinstance(transforms, Transform):
-        transforms = [transforms]
+# Kinds of class accepted by register(). Spelled with typing.Union, like the
+# other annotations in this module, because the runtime ``X | Y`` form on
+# classes is only available from Python 3.10.
+Registrable = Union[Schema, Transform]
 
-    for transform in transforms:
-        if issubclass(transform, TypeConverter):
-            TypeConverter.register(transform)
-        elif issubclass(transform, PostProcessor):
-            PostProcessor.register(transform)
-        elif issubclass(transform, PreProcessor):
-            PreProcessor.register(transform)
-        elif issubclass(transform, Validator):
-            Validator.register(transform)
+
+def register(items: Union[Registrable, List[Registrable]]) -> None:
+    """Register one class, or a list of classes, with its matching registry.
+
+    Each item is handed to the ``register`` classmethod of the first base it
+    subclasses among Schema, TypeConverter, PostProcessor, PreProcessor and
+    Validator (checked in that order).
+
+    Raises:
+        ValueError: if an item subclasses none of those bases.
+    """
+    if not isinstance(items, list):
+        items = [items]
+
+    for item in items:
+        if issubclass(item, Schema):
+            Schema.register(item)
+        elif issubclass(item, TypeConverter):
+            TypeConverter.register(item)
+        elif issubclass(item, PostProcessor):
+            PostProcessor.register(item)
+        elif issubclass(item, PreProcessor):
+            PreProcessor.register(item)
+        elif issubclass(item, Validator):
+            Validator.register(item)
         else:
             raise ValueError(
-                "transforms must be a subclass of Transform (or a list of it)"
+                "items must be a subclass of Schema or Transform (or a list of them)"
             )
 
 
+register(builtins_schemas)
 register(builtins_pre_processors)
 register(builtins_type_converters)
 register(builtins_validators)

diff --git a/magicparse/fields.py b/magicparse/fields.py
@@ -43,6 +43,19 @@ def read_value(self, row):
     def error(self, exception: Exception):
         pass
 
+    @classmethod
+    def build(cls, options: dict) -> "Field":
+        """Pick the concrete field class from the position keys in *options*.
+
+        A truthy "column-number" yields a CsvField. Otherwise, when both
+        "column-start" and "column-length" are present (not None), a
+        ColumnarField is returned.
+
+        Raises:
+            ValueError: if neither kind of position is given.
+        """
+        if options.get("column-number"):
+            return CsvField(options)
+
+        has_start = options.get("column-start") is not None
+        has_length = options.get("column-length") is not None
+        if not (has_start and has_length):
+            raise ValueError("missing field position")
+
+        return ColumnarField(options)
+
 
 class CsvField(Field):
     def __init__(self, options: dict) -> None:

diff --git a/magicparse/post_processors.py b/magicparse/post_processors.py
@@ -37,6 +37,7 @@ def __init__(self, denominator: int) -> None:
     def apply(self, value: Number) -> Number:
         return value / self.denominator
 
+    @staticmethod
     def key() -> str:
         return "divide"
 

diff --git a/magicparse/pre_processors.py b/magicparse/pre_processors.py
@@ -28,6 +28,7 @@ def __init__(self, width: int) -> None:
     def apply(self, value: str) -> str:
         return value.zfill(self.width)
 
+    @staticmethod
     def key() -> str:
         return "left-pad-zeroes"
 
@@ -45,6 +46,7 @@ def apply(self, value: str) -> str:
                 f"value '{value}' does not map to any values in [{self._keys}]"
             )
 
+    @staticmethod
     def key() -> str:
         return "map"
 
@@ -57,6 +59,7 @@ def __init__(self, pattern: str, replacement: str) -> None:
     def apply(self, value: str) -> str:
         return value.replace(self.pattern, self.replacement)
 
+    @staticmethod
     def key() -> str:
         return "replace"
 
@@ -65,6 +68,7 @@ class StripWhitespaces(PreProcessor):
     def apply(self, value: str) -> str:
         return value.strip()
 
+    @staticmethod
     def key() -> str:
         return "strip-whitespaces"
 
@@ -88,6 +92,7 @@ def apply(self, value: str) -> str:
 
         return match.group("value")
 
+    @staticmethod
     def key() -> str:
         return "regex-extract"
 

diff --git a/magicparse/schema.py b/magicparse/schema.py
@@ -1,29 +1,48 @@
-from abc import ABC, abstractmethod
+from abc import ABC, abstractmethod, abstractstaticmethod
 import csv
-from .fields import Field, CsvField, ColumnarField
+from .fields import Field
 from io import StringIO
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Tuple, Union
 
 
 class Schema(ABC):
     fields: List[Field]
-    has_header: bool
+    has_header: bool = False
+
+    def __init__(self, options: Dict[str, Any]) -> None:
+        """Create one Field, via Field.build, per entry of options["fields"]."""
+        self.fields = list(map(Field.build, options["fields"]))
 
     @abstractmethod
-    def get_reader(self, data: str):
+    def get_reader(self, stream: StringIO):
+        pass
+
+    # NOTE(review): abc.abstractstaticmethod has been deprecated since
+    # Python 3.3; stacking @staticmethod on top of @abstractmethod is the
+    # documented replacement.
+    @abstractstaticmethod
+    def key() -> str:
+        """Return the identifier under which this schema class is registered."""
         pass
 
     @classmethod
-    def build(self, options: Dict[str, Any]) -> "Schema":
-        if options["file_type"] == "csv":
-            return CsvSchema(options)
-        elif options["file_type"] == "columnar":
-            return ColumnarSchema(options)
+    def build(cls, options: Dict[str, Any]) -> "Schema":
+        file_type = options["file_type"]
+        schema = cls.registry.get(file_type)
+        if schema:
+            return schema(options)
+
+        raise ValueError("unknown file type")
+
+    @classmethod
+    def register(cls, schema: "Schema") -> None:
+        """Store *schema* in the class registry under the key it reports.
+
+        The registry dict is created on first use.
+        """
+        try:
+            known = cls.registry
+        except AttributeError:
+            known = cls.registry = {}
+
+        known[schema.key()] = schema
+
+    def parse(self, data: Union[str, StringIO]) -> Tuple[List[dict], List[dict]]:
+        if isinstance(data, str):
+            stream = StringIO(data)
         else:
-            raise ValueError("unknown file type")
+            stream = data
 
-    def parse(self, data: str) -> Tuple[List[dict], List[dict]]:
-        reader = self.get_reader(data)
+        reader = self.get_reader(stream)
 
         row_number = 0
         if self.has_header:
@@ -53,21 +72,26 @@ def parse(self, data: str) -> Tuple[List[dict], List[dict]]:
 
 
 class CsvSchema(Schema):
+    # Schema for delimiter-separated content; registered under the key "csv".
-    def __init__(self, schema: Dict[str, Any]) -> None:
-        self.has_header = schema.get("has_header", False)
-        self.delimiter = schema.get("delimiter", ",")
-        self.fields = [CsvField(field) for field in schema["fields"]]
+    def __init__(self, options: Dict[str, Any]) -> None:
+        # The base initializer builds self.fields from options["fields"].
+        super().__init__(options)
+        # Optional keys: "has_header" defaults to False, "delimiter" to ",".
+        self.has_header = options.get("has_header", False)
+        self.delimiter = options.get("delimiter", ",")
+
+    def get_reader(self, stream: StringIO):
+        # QUOTE_NONE: the csv reader does no special processing of quote
+        # characters.
+        return csv.reader(stream, delimiter=self.delimiter, quoting=csv.QUOTE_NONE)
 
-    def get_reader(self, data: str):
-        return csv.reader(
-            StringIO(data), delimiter=self.delimiter, quoting=csv.QUOTE_NONE
-        )
+    @staticmethod
+    def key() -> str:
+        return "csv"
 
 
 class ColumnarSchema(Schema):
+    # Schema registered under the key "columnar". It defines no __init__ of
+    # its own, so has_header keeps the class default (False).
-    def __init__(self, schema: Dict[str, Any]) -> None:
-        self.has_header = False
-        self.fields = [ColumnarField(field) for field in schema["fields"]]
+    def get_reader(self, stream: StringIO):
+        # The stream is returned unchanged and serves as the reader itself.
+        return stream
+
+    @staticmethod
+    def key() -> str:
+        return "columnar"
+
 
-    def get_reader(self, data: str):
-        return StringIO(data)
+builtins = [ColumnarSchema, CsvSchema]
diff --git a/magicparse/type_converters.py b/magicparse/type_converters.py
@@ -20,6 +20,7 @@ class StrConverter(TypeConverter):
     def apply(self, value: str) -> str:
         return value
 
+    @staticmethod
     def key() -> str:
         return "str"
 
@@ -31,6 +32,7 @@ def apply(self, value: str) -> int:
         except:
             raise ValueError("value is not a valid integer")
 
+    @staticmethod
     def key() -> str:
         return "int"
 
@@ -42,6 +44,7 @@ def apply(self, value: str) -> Decimal:
         except:
             raise ValueError("value is not a valid decimal")
 
+    @staticmethod
     def key() -> str:
         return "decimal"
 

diff --git a/magicparse/validators.py b/magicparse/validators.py
@@ -31,6 +31,7 @@ def apply(self, value: str) -> str:
 
         raise ValueError(f"string does not match regex '{self.pattern.pattern}'")
 
+    @staticmethod
     def key() -> str:
         return "regex-matches"
 

diff --git a/tests/test_post_processors.py b/tests/test_post_processors.py
@@ -51,6 +51,7 @@ def test_divide_decimal(self):
 
 class TestRegister(TestCase):
     class NoThanksPostProcessor(PostProcessor):
+        @staticmethod
         def key() -> str:
             return "no-thanks"
 

diff --git a/tests/test_pre_processors.py b/tests/test_pre_processors.py
@@ -151,6 +151,7 @@ def test_pattern_found(self):
 
 class TestRegister(TestCase):
     class YesPreProcessor(PreProcessor):
+        @staticmethod
         def key() -> str:
             return "yes"
 

diff --git a/tests/test_schema.py b/tests/test_schema.py
@@ -213,3 +213,25 @@ def test_errors_do_not_halt_parsing(self):
                 "error": "value is not a valid integer",
             }
         ]
+
+
+class TestRegister(TestCase):
+    """A schema class registered by the user can be built from its file type."""
+
+    class PipedSchema(Schema):
+        """Schema whose rows are the "|"-separated items of the input."""
+
+        def get_reader(self, stream):
+            yield from ([item] for item in stream.read().split("|"))
+
+        @staticmethod
+        def key() -> str:
+            return "piped"
+
+    def test_register(self):
+        Schema.register(self.PipedSchema)
+        options = {
+            "file_type": "piped",
+            "fields": [{"key": "name", "type": "str", "column-number": 1}],
+        }
+
+        built = Schema.build(options)
+
+        assert isinstance(built, self.PipedSchema)
diff --git a/tests/test_type_converters.py b/tests/test_type_converters.py
@@ -64,6 +64,7 @@ def test_apply_failed(self):
 
 class TestRegister(TestCase):
     class GuidConverter(TypeConverter):
+        @staticmethod
         def key() -> str:
             return "guid"
 

diff --git a/tests/test_validators.py b/tests/test_validators.py
@@ -51,6 +51,7 @@ def test_does_not_match(self):
 
 class TestRegister(TestCase):
     class IsTheAnswerValidator(Validator):
+        @staticmethod
         def key() -> str:
             return "is-the-answer"