diff --git a/docker/sweepers_driver.py b/docker/sweepers_driver.py index 01c249b..b7cb602 100755 --- a/docker/sweepers_driver.py +++ b/docker/sweepers_driver.py @@ -61,7 +61,7 @@ from datetime import datetime from typing import Callable -from pds.registrysweepers import provenance, ancestry +from pds.registrysweepers import provenance, ancestry, repairkit from pds.registrysweepers.utils import configure_logging, get_human_readable_elapsed_since, parse_log_level configure_logging(filepath=None, log_level=logging.INFO) @@ -108,10 +108,12 @@ def run_factory(sweeper_f: Callable) -> Callable: run_provenance = run_factory(provenance.run) run_ancestry = run_factory(ancestry.run) +run_repairkit = run_factory(repairkit.run) log.info('Running sweepers') execution_begin = datetime.now() +run_repairkit() run_provenance() run_ancestry() diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py new file mode 100644 index 0000000..38243bf --- /dev/null +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -0,0 +1,72 @@ +"""repairkit is an executable package + +The reason repairkit is an executable package is for extension as new repairs +are needed in the future. They can be added by updating the REPAIR_TOOLS mapping +with the new field name and functional requirements. All the additions can then +be modules with this executable package. +""" +import logging +import re +from typing import Union + +from pds.registrysweepers.utils import configure_logging +from pds.registrysweepers.utils import Host +from pds.registrysweepers.utils import query_registry_db +from pds.registrysweepers.utils import write_updated_docs + +from . import allarrays + +""" +dictionary repair tools is {field_name:[funcs]} where field_name can be: + 1: re.compile().fullmatch for the equivalent of "fred" == "fred" + 2: re.compile().match for more complex matching of subparts of the string + +and funcs are: +def function_name (document:{}, fieldname:str)->{} + +and the return an empty {} if no changes and {fieldname:new_value} for repairs + +Examples + +re.compile("^ops:Info/.+").match("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/.+").fullmatch("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/").match("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/").fullmatch("ops:Info/ops:filesize")->None + +To get str_a == str_b, re.compile(str_a).fullmatch + +""" + +REPAIR_TOOLS = { + re.compile("^ops:Data_File_Info/").match: [allarrays.repair], + re.compile("^ops:Label_File_Info/").match: [allarrays.repair], +} + +log = logging.getLogger(__name__) + + +def run( + base_url: str, + username: str, + password: str, + verify_host_certs: bool = True, + log_filepath: Union[str, None] = None, + log_level: int = logging.INFO, +): + configure_logging(filepath=log_filepath, log_level=log_level) + log.info("starting CLI processing") + host = Host(password, base_url, username, verify_host_certs) + for document in query_registry_db(host, {"match_all": {}}, {}): + id = document["_id"] + src = document["_source"] + repairs = {} + log.debug(f"working on document: {id}") + for fieldname, data in src.items(): + for regex, funcs in REPAIR_TOOLS.items(): + if regex(fieldname): + for func in funcs: + repairs.update(func(src, fieldname)) + if repairs: + log.info(f"Writing repairs to document: {id}") + write_updated_docs(host, {id: repairs}) + return diff --git a/src/pds/registrysweepers/repairkit/__main__.py b/src/pds/registrysweepers/repairkit/__main__.py new file mode 100644 index 0000000..3105049 --- /dev/null +++ b/src/pds/registrysweepers/repairkit/__main__.py @@ -0,0 +1,13 @@ +from pds.registrysweepers.repairkit import run +from pds.registrysweepers.utils import parse_args + +args = parse_args(description="sweep through the registry documents and fix common errors") + +run( + base_url=args.base_URL, + username=args.username, + password=args.password, + verify_host_certs=not args.insecure, + log_level=args.log_level, + log_filepath=args.log_file, +) diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py new file mode 100644 index 0000000..f15bb92 --- /dev/null +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -0,0 +1,13 @@ +"""change single strings to array of strings""" +import logging +from typing import Dict + +log = logging.getLogger(__name__) + + +def repair(document: Dict, fieldname: str) -> Dict: + log.debug(f"checking {fieldname}") + if isinstance(document[fieldname], str): + log.info(f"found string for {fieldname} where it should be an array") + return {fieldname: [document[fieldname]]} + return {} diff --git a/tests/pds/registrysweepers/repairkit/__init__.py b/tests/pds/registrysweepers/repairkit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/pds/registrysweepers/repairkit/test_allarrays.py b/tests/pds/registrysweepers/repairkit/test_allarrays.py new file mode 100644 index 0000000..850f9be --- /dev/null +++ b/tests/pds/registrysweepers/repairkit/test_allarrays.py @@ -0,0 +1,19 @@ +import unittest + +from pds.registrysweepers.repairkit import allarrays + + +class AllArrays(unittest.TestCase): + def test_valid_field(self): + src = {"apple": ["orange"]} + repair = allarrays.repair(src, "apple") + self.assertEqual({}, repair) + + def test_invalid_field(self): + src = {"apple": "orange"} + repair = allarrays.repair(src, "apple") + self.assertEqual({"apple": ["orange"]}, repair) + + +if __name__ == "__main__": + unittest.main()