Skip to content

Commit

Permalink
Merge pull request #3 from ZeroGachis/feature/custom-schema
Browse files Browse the repository at this point in the history
✨ Enable custom schema
  • Loading branch information
ducdetronquito authored Oct 21, 2022
2 parents a1407be + d76a2f9 commit 6061170
Show file tree
Hide file tree
Showing 14 changed files with 156 additions and 46 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ jobs:
run: python3 -m black --check --diff magicparse/ tests/ setup.py
- name: flake8
run: python3 -m flake8
- name: Pytest
run: python3 -m pytest
43 changes: 36 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ from uuid import UUID
import magicparse

class GuidConverter(magicparse.Converter):
@staticmethod
def key() -> str:
return "guid"

Expand All @@ -46,18 +47,46 @@ class GuidConverter(magicparse.Converter):

magicparse.register(GuidConverter)

schema = Schema.build(
{
"file_type": "csv",
"fields": [{"key": "shop-guid", "type": "guid", "column-number": 1}],
}
)
schema = {
"file_type": "csv",
"fields": [
{"key": "shop-guid", "type": "guid", "column-number": 1}
],
}

rows, errors = schema.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2")
rows, errors = magicparse.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2", schema)
assert rows == [{"shop-guid": "13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2"}]
assert not errors
```

### Register a custom schema and parse content

```python
import magicparse

class PipedSchema(magicparse.Schema):
    """Example schema: every ``|``-separated token becomes a one-column row."""

    @staticmethod
    def key() -> str:
        # Value to put under "file_type" in the schema options.
        return "piped"

    def get_reader(self, stream):
        # Still a generator function, so the stream is only read once
        # iteration starts.
        yield from ([token] for token in stream.read().split("|"))

magicparse.register(PipedSchema)

schema = {
"file_type": "piped",
"fields": [
{"key": "name", "type": "str", "column-number": 1}
]
}

rows, errors = magicparse.parse("Joe|William|Jack|Averell", schema)
assert not errors
assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name": "Averell"}]
```

## API

### File types
Expand Down
32 changes: 19 additions & 13 deletions magicparse/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .schema import Schema
from .schema import Schema, builtins as builtins_schemas
from .post_processors import PostProcessor, builtins as builtins_post_processors
from .pre_processors import PreProcessor, builtins as builtins_pre_processors
from .transform import Transform
Expand All @@ -22,25 +22,31 @@ def parse(data: str, schema_options: Dict[str, Any]) -> Tuple[List[dict], List[d
return schema.parse(data)


def register(transforms: Union[Transform, List[Transform]]) -> None:
if isinstance(transforms, Transform):
transforms = [transforms]
Registrable = Schema | Transform

for transform in transforms:
if issubclass(transform, TypeConverter):
TypeConverter.register(transform)
elif issubclass(transform, PostProcessor):
PostProcessor.register(transform)
elif issubclass(transform, PreProcessor):
PreProcessor.register(transform)
elif issubclass(transform, Validator):
Validator.register(transform)

def register(items: Union[Registrable, List[Registrable]]) -> None:
    """Register one or several schema / transform classes.

    Each item is handed to the ``register`` classmethod of the first base
    class it derives from, tested in this order: ``Schema``,
    ``TypeConverter``, ``PostProcessor``, ``PreProcessor``, ``Validator``.

    Args:
        items: a single class, or a list (or tuple) of classes.

    Raises:
        ValueError: if an item is not a class, or derives from none of the
            base classes listed above.
    """
    if not isinstance(items, (list, tuple)):
        items = [items]

    # Order matters only if a class derives from several bases; keep the
    # same precedence as the original if/elif chain.
    bases = (Schema, TypeConverter, PostProcessor, PreProcessor, Validator)

    for item in items:
        owner = None
        # issubclass() raises TypeError when given a non-class (e.g. an
        # instance); filter those out so the caller always gets ValueError.
        if isinstance(item, type):
            owner = next((base for base in bases if issubclass(item, base)), None)

        if owner is None:
            raise ValueError(
                "items must be a subclass of Schema or Transform (or a list of them)"
            )

        owner.register(item)


register(builtins_schemas)
register(builtins_pre_processors)
register(builtins_type_converters)
register(builtins_validators)
Expand Down
13 changes: 13 additions & 0 deletions magicparse/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ def read_value(self, row):
def error(self, exception: Exception):
pass

@classmethod
def build(cls, options: dict) -> "Field":
    """Create the field implementation that matches the position keys.

    A truthy ``"column-number"`` selects ``CsvField``. Otherwise, when both
    ``"column-start"`` and ``"column-length"`` are present and not ``None``,
    ``ColumnarField`` is used.

    Raises:
        ValueError: when neither positioning scheme is found in ``options``.
    """
    # Truthiness test: a missing, None or 0 column number falls through.
    if options.get("column-number"):
        return CsvField(options)

    columnar_keys = ("column-start", "column-length")
    if all(options.get(name) is not None for name in columnar_keys):
        return ColumnarField(options)

    raise ValueError("missing field position")


class CsvField(Field):
def __init__(self, options: dict) -> None:
Expand Down
1 change: 1 addition & 0 deletions magicparse/post_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(self, denominator: int) -> None:
def apply(self, value: Number) -> Number:
return value / self.denominator

@staticmethod
def key() -> str:
return "divide"

Expand Down
5 changes: 5 additions & 0 deletions magicparse/pre_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self, width: int) -> None:
def apply(self, value: str) -> str:
return value.zfill(self.width)

@staticmethod
def key() -> str:
return "left-pad-zeroes"

Expand All @@ -45,6 +46,7 @@ def apply(self, value: str) -> str:
f"value '{value}' does not map to any values in [{self._keys}]"
)

@staticmethod
def key() -> str:
return "map"

Expand All @@ -57,6 +59,7 @@ def __init__(self, pattern: str, replacement: str) -> None:
def apply(self, value: str) -> str:
return value.replace(self.pattern, self.replacement)

@staticmethod
def key() -> str:
return "replace"

Expand All @@ -65,6 +68,7 @@ class StripWhitespaces(PreProcessor):
def apply(self, value: str) -> str:
return value.strip()

@staticmethod
def key() -> str:
return "strip-whitespaces"

Expand All @@ -88,6 +92,7 @@ def apply(self, value: str) -> str:

return match.group("value")

@staticmethod
def key() -> str:
return "regex-extract"

Expand Down
76 changes: 50 additions & 26 deletions magicparse/schema.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,48 @@
from abc import ABC, abstractmethod
from abc import ABC, abstractmethod, abstractstaticmethod
import csv
from .fields import Field, CsvField, ColumnarField
from .fields import Field
from io import StringIO
from typing import Any, Dict, List, Tuple
from typing import Any, Dict, List, Tuple, Union


class Schema(ABC):
fields: List[Field]
has_header: bool
has_header: bool = False

def __init__(self, options: Dict[str, Any]) -> None:
self.fields = [Field.build(item) for item in options["fields"]]

@abstractmethod
def get_reader(self, data: str):
def get_reader(self, stream: StringIO):
pass

@abstractstaticmethod
def key() -> str:
pass

@classmethod
def build(self, options: Dict[str, Any]) -> "Schema":
if options["file_type"] == "csv":
return CsvSchema(options)
elif options["file_type"] == "columnar":
return ColumnarSchema(options)
def build(cls, options: Dict[str, Any]) -> "Schema":
file_type = options["file_type"]
schema = cls.registry.get(file_type)
if schema:
return schema(options)

raise ValueError("unknown file type")

@classmethod
def register(cls, schema: "Schema") -> None:
    """Store ``schema`` in the registry under the name given by ``schema.key()``.

    The registry dict is created the first time it is needed; registering
    another schema with an already-used key replaces the previous entry.
    """
    try:
        known = cls.registry
    except AttributeError:
        # First registration: no registry attribute is reachable yet.
        known = cls.registry = {}

    known[schema.key()] = schema

def parse(self, data: Union[str, StringIO]) -> Tuple[List[dict], List[dict]]:
if isinstance(data, str):
stream = StringIO(data)
else:
raise ValueError("unknown file type")
stream = data

def parse(self, data: str) -> Tuple[List[dict], List[dict]]:
reader = self.get_reader(data)
reader = self.get_reader(stream)

row_number = 0
if self.has_header:
Expand Down Expand Up @@ -53,21 +72,26 @@ def parse(self, data: str) -> Tuple[List[dict], List[dict]]:


class CsvSchema(Schema):
def __init__(self, schema: Dict[str, Any]) -> None:
self.has_header = schema.get("has_header", False)
self.delimiter = schema.get("delimiter", ",")
self.fields = [CsvField(field) for field in schema["fields"]]
def __init__(self, options: Dict[str, Any]) -> None:
super().__init__(options)
self.has_header = options.get("has_header", False)
self.delimiter = options.get("delimiter", ",")

def get_reader(self, stream: StringIO):
return csv.reader(stream, delimiter=self.delimiter, quoting=csv.QUOTE_NONE)

def get_reader(self, data: str):
return csv.reader(
StringIO(data), delimiter=self.delimiter, quoting=csv.QUOTE_NONE
)
@staticmethod
def key() -> str:
return "csv"


class ColumnarSchema(Schema):
def __init__(self, schema: Dict[str, Any]) -> None:
self.has_header = False
self.fields = [ColumnarField(field) for field in schema["fields"]]
def get_reader(self, stream: StringIO):
return stream

@staticmethod
def key() -> str:
return "columnar"


def get_reader(self, data: str):
return StringIO(data)
builtins = [ColumnarSchema, CsvSchema]
3 changes: 3 additions & 0 deletions magicparse/type_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class StrConverter(TypeConverter):
def apply(self, value: str) -> str:
return value

@staticmethod
def key() -> str:
return "str"

Expand All @@ -31,6 +32,7 @@ def apply(self, value: str) -> int:
except:
raise ValueError("value is not a valid integer")

@staticmethod
def key() -> str:
return "int"

Expand All @@ -42,6 +44,7 @@ def apply(self, value: str) -> Decimal:
except:
raise ValueError("value is not a valid decimal")

@staticmethod
def key() -> str:
return "decimal"

Expand Down
1 change: 1 addition & 0 deletions magicparse/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def apply(self, value: str) -> str:

raise ValueError(f"string does not match regex '{self.pattern.pattern}'")

@staticmethod
def key() -> str:
return "regex-matches"

Expand Down
1 change: 1 addition & 0 deletions tests/test_post_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_divide_decimal(self):

class TestRegister(TestCase):
class NoThanksPostProcessor(PostProcessor):
@staticmethod
def key() -> str:
return "no-thanks"

Expand Down
1 change: 1 addition & 0 deletions tests/test_pre_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def test_pattern_found(self):

class TestRegister(TestCase):
class YesPreProcessor(PreProcessor):
@staticmethod
def key() -> str:
return "yes"

Expand Down
22 changes: 22 additions & 0 deletions tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,3 +213,25 @@ def test_errors_do_not_halt_parsing(self):
"error": "value is not a valid integer",
}
]


class TestRegister(TestCase):
    """A user-defined schema can be registered and then built by file type."""

    class PipedSchema(Schema):
        """Fixture schema: one single-item row per ``|``-separated token."""

        @staticmethod
        def key() -> str:
            return "piped"

        def get_reader(self, stream):
            yield from ([token] for token in stream.read().split("|"))

    def test_register(self):
        options = {
            "file_type": "piped",
            "fields": [{"key": "name", "type": "str", "column-number": 1}],
        }

        Schema.register(self.PipedSchema)
        built = Schema.build(options)

        # The "piped" file type must resolve to the class registered above.
        assert isinstance(built, self.PipedSchema)
1 change: 1 addition & 0 deletions tests/test_type_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_apply_failed(self):

class TestRegister(TestCase):
class GuidConverter(TypeConverter):
@staticmethod
def key() -> str:
return "guid"

Expand Down
1 change: 1 addition & 0 deletions tests/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_does_not_match(self):

class TestRegister(TestCase):
class IsTheAnswerValidator(Validator):
@staticmethod
def key() -> str:
return "is-the-answer"

Expand Down

0 comments on commit 6061170

Please sign in to comment.