Skip to content

Commit

Permalink
Merge pull request #2 from ZeroGachis/feature/regex-extract
Browse files Browse the repository at this point in the history
✨ Add 'regex-extract' pre-processor
  • Loading branch information
ducdetronquito authored Oct 20, 2022
2 parents ff82451 + bafd07d commit 0cbb6e5
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ assert not errors

- left-pad-zeroes
- map
- regex-extract
- replace
- strip-whitespaces

Expand Down
26 changes: 25 additions & 1 deletion magicparse/pre_processors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from .transform import Transform


Expand Down Expand Up @@ -68,4 +69,27 @@ def key() -> str:
return "strip-whitespaces"


builtins = [LeftPadZeroes, Map, Replace, StripWhitespaces]
class RegexExtract(PreProcessor):
    """Pre-processor that replaces a value by the text of a named regex group.

    The pattern must define a group named ``value``; ``apply`` returns the
    text captured by that group.
    """

    def __init__(self, pattern: str) -> None:
        """Compile ``pattern`` and check that it defines a ``value`` group.

        Args:
            pattern: regular expression source containing ``(?P<value>...)``.

        Raises:
            ValueError: if the pattern has no group named ``value``.
            re.error: if ``pattern`` is not a valid regular expression.
        """
        # Keep the compiled object under its own name so the ``str``
        # parameter is not rebound to a different type.
        compiled = re.compile(pattern)
        if "value" not in compiled.groupindex:
            raise ValueError(
                "regex-extract's pattern must contain a group named 'value'"
            )

        self.pattern = compiled

    def apply(self, value: str) -> str:
        """Return the text captured by the ``value`` group.

        Matching is anchored at the start of ``value`` (``Pattern.match``
        semantics), not searched anywhere in the string.

        Raises:
            ValueError: if the pattern does not match, or if it matches
                without the ``value`` group taking part in the match.
        """
        match = self.pattern.match(value)
        # ``group`` yields None for a group that did not participate (e.g. an
        # optional group); treat that like a failed match so the method never
        # returns None. An empty capture ("") is still a valid result.
        extracted = match.group("value") if match is not None else None
        if extracted is None:
            raise ValueError(
                f"cannot extract value from pattern '{self.pattern.pattern}'"
            )

        return extracted

    # Declared without ``self`` like the other pre-processors' ``key`` in this
    # module; presumably looked up on the class by the registry -- confirm.
    def key() -> str:
        """Return the name under which this pre-processor is referenced."""
        return "regex-extract"


# Pre-processor classes defined in this module, in alphabetical order.
# NOTE(review): presumably the set that PreProcessor.build resolves "name"
# against -- confirm where this list is consumed.
builtins = [LeftPadZeroes, Map, RegexExtract, Replace, StripWhitespaces]
48 changes: 48 additions & 0 deletions tests/test_pre_processors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import re
from magicparse.pre_processors import (
LeftPadZeroes,
Map,
PreProcessor,
RegexExtract,
Replace,
StripWhitespaces,
)
Expand Down Expand Up @@ -36,6 +38,17 @@ def test_strip_whitespaces(self):
pre_processor = PreProcessor.build({"name": "strip-whitespaces"})
assert isinstance(pre_processor, StripWhitespaces)

def test_regex_extract(self):
    """'regex-extract' builds a RegexExtract holding the compiled pattern."""
    expected_pattern = "^xxx(?P<value>\\d{13})xxx$"
    config = {
        "name": "regex-extract",
        "parameters": {"pattern": expected_pattern},
    }

    built = PreProcessor.build(config)

    assert isinstance(built, RegexExtract)
    # The pattern is stored compiled, and keeps the original source text.
    assert isinstance(built.pattern, re.Pattern)
    assert built.pattern.pattern == expected_pattern

def test_unknown(self):
    """Building with the name 'anything' raises ValueError."""
    unknown_config = {"name": "anything"}
    with pytest.raises(ValueError, match="invalid pre-processor 'anything'"):
        PreProcessor.build(unknown_config)
Expand Down Expand Up @@ -101,6 +114,41 @@ def test_success(self):
assert pre_processor.apply(" an input ") == "an input"


class TestRegexExtract(TestCase):
    """Tests for the 'regex-extract' pre-processor built via PreProcessor.build."""

    def test_build_without_value_group(self):
        """A pattern without a group named 'value' is rejected at build time."""
        with pytest.raises(
            ValueError,
            match=r"regex-extract's pattern must contain a group named 'value'",
        ):
            PreProcessor.build(
                {"name": "regex-extract", "parameters": {"pattern": "xxx"}}
            )

    def test_pattern_not_found(self):
        """Applying to an input the pattern does not match raises ValueError."""
        pre_processor = PreProcessor.build(
            {
                "name": "regex-extract",
                "parameters": {"pattern": "^xxx(?P<value>\\d{13})xxx$"},
            }
        )
        with pytest.raises(ValueError) as error:
            pre_processor.apply("an input")

        # Compare the message literally: the pattern text contains regex
        # metacharacters, so it is not passed through ``match=``.
        assert (
            error.value.args[0]
            == "cannot extract value from pattern '^xxx(?P<value>\\d{13})xxx$'"
        )

    def test_pattern_found(self):
        """Applying to a matching input returns the text of the 'value' group."""
        pre_processor = PreProcessor.build(
            {
                "name": "regex-extract",
                "parameters": {"pattern": "^xxx(?P<value>\\d{13})xxx$"},
            }
        )
        # The comparison has to be asserted; a bare ``==`` expression is
        # evaluated and discarded, so it would never fail the test.
        assert pre_processor.apply("xxx9780201379624xxx") == "9780201379624"


class TestRegister(TestCase):
class YesPreProcessor(PreProcessor):
def key() -> str:
Expand Down

0 comments on commit 0cbb6e5

Please sign in to comment.