From 34caab7dd5f64de58bf06ee6b05c4eb63928b8ae Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 13:55:51 -0700 Subject: [PATCH 01/15] initial commit of repairkit --- .../registrysweepers/repairkit/__init__.py | 64 +++++++++++++++++++ .../registrysweepers/repairkit/__main__.py | 12 ++++ .../registrysweepers/repairkit/allarrays.py | 7 ++ 3 files changed, 83 insertions(+) create mode 100644 src/pds/registrysweepers/repairkit/__init__.py create mode 100644 src/pds/registrysweepers/repairkit/__main__.py create mode 100644 src/pds/registrysweepers/repairkit/allarrays.py diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py new file mode 100644 index 0000000..8a391ad --- /dev/null +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -0,0 +1,64 @@ +'''repairkit is an executable package + +The reason repairkit is an executable package is for extension as new repairs +are needed in the future. They can be added by updating the REPAIR_TOOLS mapping +with the new field name and functional requirements. All the additions can then +be modules with this executable package. +''' + +from . import allarrays + +from pds.registrysweepers.utils import Host +from pds.registrysweepers.utils import query_registry_db + +import logging +import re +from typing import Union + +''' +dictionary repair tools is {field_name:[funcs]} where field_name can be: + 1: re.compile().fullmatch for the equivalent of "fred" == "fred" + 2: re.compile().match for more complex matching of subparts of the string + +and funcs are: +def function_name (document:{}, fieldname:str)->{} + +and the return an empty {} if no changes and {fieldname:new_value} for repairs + +Examples + +re.compile("^ops:Info/.+").match("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/.+").fullmatch("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/").match("ops:Info/ops:filesize")->match object +re.compile("^ops:Info/").fullmatch("ops:Info/ops:filesize")->None + +To get str_a == str_b, re.compile(str_a).fullmatch + +''' + +REPAIR_TOOLS = { +re.compile('^ops:Data_File_Info/').match:[allarrays.repair], +re.compile('^ops:Label_File_Info/').match:[allarrays.repair], +} + +log = logging.getLogger(__name__) + +def run(base_url:str, + username:str, + password:str, + verify_host_certs:bool=True, + log_filepath:Union[str,None]=None, + log_level:int=logging.INFO): + configure_logging(filepath=log_filepath, log_level=log_level) + log.info("starting CLI processing") + host = Host(password, base_url, username, verify_host_certs) + for document in query_registry_db(host, query, {}): + for fieldname,data in document.items(): + repairs + for regex,funcs in REPAIR_TOOLS: + if regex(filename): + repairs.update(func(document, fieldname) for func in funcs) + if repairs: + log.info (f'Writing repairs to document: {id}') + write_update_docs (host, {id,repairs}) + return diff --git a/src/pds/registrysweepers/repairkit/__main__.py b/src/pds/registrysweepers/repairkit/__main__.py new file mode 100644 index 0000000..2b8a494 --- /dev/null +++ b/src/pds/registrysweepers/repairkit/__main__.py @@ -0,0 +1,12 @@ + +from pds.registrysweepers.repairkit import run +from pds.registrysweepers.utils import parse_args + +args = parse_args(description='sweep through the registry documents and fix common errors') + +run(base_url=args.base_URL, + username=args.username, + password=args.password, + verify_host_certs=not args.insecure, + log_level=args.log_level, + log_filepath=args.log_file) diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py new file mode 100644 index 0000000..700f6f4 --- /dev/null +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -0,0 +1,7 @@ +'''change single strings to array of strings''' + +def repair (document:{}, fieldname:str)->bool: + if isinstance (document[fieldname], str): + log.info (f'found string for {fieldname} where it should be an array') + return {fieldname:[document[fieldname]} + return {} From aecd5d768691153e6d2abc22b2349d32fafe8fa7 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 14:10:50 -0700 Subject: [PATCH 02/15] add query --- src/pds/registrysweepers/repairkit/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 8a391ad..b345032 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -52,9 +52,9 @@ def run(base_url:str, configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") host = Host(password, base_url, username, verify_host_certs) + query = {"match_all":{}} for document in query_registry_db(host, query, {}): for fieldname,data in document.items(): - repairs for regex,funcs in REPAIR_TOOLS: if regex(filename): repairs.update(func(document, fieldname) for func in funcs) From 083a9303053404f6b75d63733aa74ae5449182b3 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 14:11:14 -0700 Subject: [PATCH 03/15] empty repairs --- src/pds/registrysweepers/repairkit/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index b345032..fc6ec6f 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -54,6 +54,7 @@ def run(base_url:str, host = Host(password, base_url, username, verify_host_certs) query = {"match_all":{}} for document in query_registry_db(host, query, {}): + repairs = {} for fieldname,data in document.items(): for regex,funcs in REPAIR_TOOLS: if regex(filename): From 6d6652b04861b4cce4d0f004cb18fa578866993d Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 14:37:45 -0700 Subject: [PATCH 04/15] clean up typos --- src/pds/registrysweepers/repairkit/__init__.py | 16 +++++++++------- src/pds/registrysweepers/repairkit/allarrays.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index fc6ec6f..9ce094d 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -9,6 +9,7 @@ from . import allarrays from pds.registrysweepers.utils import Host +from pds.registrysweepers.utils import configure_logging from pds.registrysweepers.utils import query_registry_db import logging @@ -37,17 +38,18 @@ def function_name (document:{}, fieldname:str)->{} ''' REPAIR_TOOLS = { -re.compile('^ops:Data_File_Info/').match:[allarrays.repair], -re.compile('^ops:Label_File_Info/').match:[allarrays.repair], +re.compile('^ops:Data_File_Info/').match: [allarrays.repair], +re.compile('^ops:Label_File_Info/').match: [allarrays.repair], } log = logging.getLogger(__name__) + def run(base_url:str, username:str, password:str, - verify_host_certs:bool=True, - log_filepath:Union[str,None]=None, + verify_host_certs:bool = True, + log_filepath:Union[str,None] = None, log_level:int=logging.INFO): configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") @@ -56,10 +58,10 @@ def run(base_url:str, for document in query_registry_db(host, query, {}): repairs = {} for fieldname,data in document.items(): - for regex,funcs in REPAIR_TOOLS: + for regex,funcs in REPAIR_TOOLS.items(): if regex(filename): repairs.update(func(document, fieldname) for func in funcs) if repairs: - log.info (f'Writing repairs to document: {id}') - write_update_docs (host, {id,repairs}) + log.info(f'Writing repairs to document: {id}') + write_update_docs(host, {id:repairs}) return diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index 700f6f4..cddc26a 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -3,5 +3,5 @@ def repair (document:{}, fieldname:str)->bool: if isinstance (document[fieldname], str): log.info (f'found string for {fieldname} where it should be an array') - return {fieldname:[document[fieldname]} + return {fieldname:[document[fieldname]]} return {} From 64119bd84e6306695f80bca52de8cf9b7dbe8da2 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 14:55:31 -0700 Subject: [PATCH 05/15] minor updates for running --- src/pds/registrysweepers/repairkit/__init__.py | 9 ++++++--- src/pds/registrysweepers/repairkit/allarrays.py | 6 ++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 9ce094d..1cd49ec 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -56,11 +56,14 @@ def run(base_url:str, host = Host(password, base_url, username, verify_host_certs) query = {"match_all":{}} for document in query_registry_db(host, query, {}): + id = document['_id'] + src = document['_source'] repairs = {} - for fieldname,data in document.items(): + log.debug (f'working on document: {id}') + for fieldname,data in src.items(): for regex,funcs in REPAIR_TOOLS.items(): - if regex(filename): - repairs.update(func(document, fieldname) for func in funcs) + if regex(fieldname): + for func in funcs: repairs.update(func(src, fieldname)) if repairs: log.info(f'Writing repairs to document: {id}') write_update_docs(host, {id:repairs}) diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index cddc26a..cee304c 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -1,6 +1,12 @@ '''change single strings to array of strings''' +import logging + +log = logging.getLogger(__name__) + + def repair (document:{}, fieldname:str)->bool: + log.debug(f'checking {fieldname}') if isinstance (document[fieldname], str): log.info (f'found string for {fieldname} where it should be an array') return {fieldname:[document[fieldname]]} From 15520ed1fd752e50e8478d015790188b36f8caf1 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:03:28 -0700 Subject: [PATCH 06/15] more whitespace --- src/pds/registrysweepers/repairkit/__init__.py | 15 ++++++++------- src/pds/registrysweepers/repairkit/allarrays.py | 8 +++++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 1cd49ec..fb27827 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -11,6 +11,7 @@ from pds.registrysweepers.utils import Host from pds.registrysweepers.utils import configure_logging from pds.registrysweepers.utils import query_registry_db +from pds.registrysweepers.utils import write_update_docs import logging import re @@ -45,16 +46,16 @@ def function_name (document:{}, fieldname:str)->{} log = logging.getLogger(__name__) -def run(base_url:str, - username:str, - password:str, - verify_host_certs:bool = True, - log_filepath:Union[str,None] = None, - log_level:int=logging.INFO): +def run(base_url: str, + username: str, + password: str, + verify_host_certs: bool = True, + log_filepath: Union[str,None] = None, + log_level: int=logging.INFO): configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") host = Host(password, base_url, username, verify_host_certs) - query = {"match_all":{}} + query = {"match_all": {}} for document in query_registry_db(host, query, {}): id = document['_id'] src = document['_source'] diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index cee304c..a7f0c31 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -1,13 +1,15 @@ '''change single strings to array of strings''' +from typing import Dict + import logging log = logging.getLogger(__name__) -def repair (document:{}, fieldname:str)->bool: +def repair(document: Dict, fieldname: str)->Dict: log.debug(f'checking {fieldname}') - if isinstance (document[fieldname], str): - log.info (f'found string for {fieldname} where it should be an array') + if isinstance(document[fieldname], str): + log.info(f'found string for {fieldname} where it should be an array') return {fieldname:[document[fieldname]]} return {} From c5c3a2646c620030321299a0d3626c4cf68f7335 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:04:08 -0700 Subject: [PATCH 07/15] more whitespace --- src/pds/registrysweepers/repairkit/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index fb27827..114647c 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -39,8 +39,8 @@ def function_name (document:{}, fieldname:str)->{} ''' REPAIR_TOOLS = { -re.compile('^ops:Data_File_Info/').match: [allarrays.repair], -re.compile('^ops:Label_File_Info/').match: [allarrays.repair], + re.compile('^ops:Data_File_Info/').match: [allarrays.repair], + re.compile('^ops:Label_File_Info/').match: [allarrays.repair], } log = logging.getLogger(__name__) From eaa2f457fa9b0f5458bc938c6d14ed3cca4b06d6 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:12:52 -0700 Subject: [PATCH 08/15] more whitespace --- src/pds/registrysweepers/repairkit/__init__.py | 17 +++++++++-------- src/pds/registrysweepers/repairkit/allarrays.py | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 114647c..a4574ef 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -11,7 +11,7 @@ from pds.registrysweepers.utils import Host from pds.registrysweepers.utils import configure_logging from pds.registrysweepers.utils import query_registry_db -from pds.registrysweepers.utils import write_update_docs +from pds.registrysweepers.utils import write_updated_docs import logging import re @@ -50,8 +50,8 @@ def run(base_url: str, username: str, password: str, verify_host_certs: bool = True, - log_filepath: Union[str,None] = None, - log_level: int=logging.INFO): + log_filepath: Union[str, None] = None, + log_level: int = logging.INFO): configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") host = Host(password, base_url, username, verify_host_certs) @@ -60,12 +60,13 @@ def run(base_url: str, id = document['_id'] src = document['_source'] repairs = {} - log.debug (f'working on document: {id}') - for fieldname,data in src.items(): - for regex,funcs in REPAIR_TOOLS.items(): + log.debug(f'working on document: {id}') + for fieldname, data in src.items(): + for regex, funcs in REPAIR_TOOLS.items(): if regex(fieldname): - for func in funcs: repairs.update(func(src, fieldname)) + for func in funcs: + repairs.update(func(src, fieldname)) if repairs: log.info(f'Writing repairs to document: {id}') - write_update_docs(host, {id:repairs}) + write_updated_docs(host, {id:repairs}) return diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index a7f0c31..2dda226 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -7,9 +7,9 @@ log = logging.getLogger(__name__) -def repair(document: Dict, fieldname: str)->Dict: +def repair(document: Dict, fieldname: str) -> Dict: log.debug(f'checking {fieldname}') if isinstance(document[fieldname], str): log.info(f'found string for {fieldname} where it should be an array') - return {fieldname:[document[fieldname]]} + return {fieldname: [document[fieldname]]} return {} From fbfc269ab14230a517dfb07818a0f5cea3120ba9 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:23:21 -0700 Subject: [PATCH 09/15] maybe a functional test --- .../pds/registrysweepers/repairkit/__init__.py | 0 .../repairkit/test_allarrays.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 tests/pds/registrysweepers/repairkit/__init__.py create mode 100644 tests/pds/registrysweepers/repairkit/test_allarrays.py diff --git a/tests/pds/registrysweepers/repairkit/__init__.py b/tests/pds/registrysweepers/repairkit/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/pds/registrysweepers/repairkit/test_allarrays.py b/tests/pds/registrysweepers/repairkit/test_allarrays.py new file mode 100644 index 0000000..87670f8 --- /dev/null +++ b/tests/pds/registrysweepers/repairkit/test_allarrays.py @@ -0,0 +1,17 @@ +import unittest + +from pds.registrysweepers.repairkit import allarrays + +class AllArrays(unittest.TestCase): + def test_valid_field(self): + src = {'apple': ['orange']} + repair = allarrays.repair(src, 'apple') + self.assertEqual({}, repair) + def test_invalid_field(self): + src = {'apple': 'orange'} + repair = allarrays.repair(src, 'apple') + self.assertEqual({'apple': ['orange']}, repair) + + +if __name__ == '__main__': + unittest.test() From 434d92d0d44511b02a4e9e1da74dbc18fe96a5d2 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:24:09 -0700 Subject: [PATCH 10/15] more whitespace --- src/pds/registrysweepers/repairkit/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index a4574ef..2009c5a 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -68,5 +68,5 @@ def run(base_url: str, repairs.update(func(src, fieldname)) if repairs: log.info(f'Writing repairs to document: {id}') - write_updated_docs(host, {id:repairs}) + write_updated_docs(host, {id: repairs}) return From 0472ab45b35f97db6e8268736a6bcae83338dec8 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:32:59 -0700 Subject: [PATCH 11/15] typo --- tests/pds/registrysweepers/repairkit/test_allarrays.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/pds/registrysweepers/repairkit/test_allarrays.py b/tests/pds/registrysweepers/repairkit/test_allarrays.py index 87670f8..9f158ac 100644 --- a/tests/pds/registrysweepers/repairkit/test_allarrays.py +++ b/tests/pds/registrysweepers/repairkit/test_allarrays.py @@ -2,6 +2,7 @@ from pds.registrysweepers.repairkit import allarrays + class AllArrays(unittest.TestCase): def test_valid_field(self): src = {'apple': ['orange']} @@ -14,4 +15,4 @@ def test_invalid_field(self): if __name__ == '__main__': - unittest.test() + unittest.main() From 260f952b26e8ecd954f20c76f584138a32018681 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:42:46 -0700 Subject: [PATCH 12/15] more whitespace --- src/pds/registrysweepers/repairkit/__init__.py | 11 +++++------ src/pds/registrysweepers/repairkit/__main__.py | 1 - src/pds/registrysweepers/repairkit/allarrays.py | 4 +--- .../pds/registrysweepers/repairkit/test_allarrays.py | 4 ++-- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 2009c5a..5ab0ce2 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -5,17 +5,16 @@ with the new field name and functional requirements. All the additions can then be modules with this executable package. ''' +import logging +import re +from typing import Union -from . import allarrays - -from pds.registrysweepers.utils import Host from pds.registrysweepers.utils import configure_logging +from pds.registrysweepers.utils import Host from pds.registrysweepers.utils import query_registry_db from pds.registrysweepers.utils import write_updated_docs -import logging -import re -from typing import Union +from . import allarrays ''' dictionary repair tools is {field_name:[funcs]} where field_name can be: diff --git a/src/pds/registrysweepers/repairkit/__main__.py b/src/pds/registrysweepers/repairkit/__main__.py index 2b8a494..815b5c0 100644 --- a/src/pds/registrysweepers/repairkit/__main__.py +++ b/src/pds/registrysweepers/repairkit/__main__.py @@ -1,4 +1,3 @@ - from pds.registrysweepers.repairkit import run from pds.registrysweepers.utils import parse_args diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index 2dda226..4894476 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -1,8 +1,6 @@ '''change single strings to array of strings''' - -from typing import Dict - import logging +from typing import Dict log = logging.getLogger(__name__) diff --git a/tests/pds/registrysweepers/repairkit/test_allarrays.py b/tests/pds/registrysweepers/repairkit/test_allarrays.py index 9f158ac..d08fb4f 100644 --- a/tests/pds/registrysweepers/repairkit/test_allarrays.py +++ b/tests/pds/registrysweepers/repairkit/test_allarrays.py @@ -5,11 +5,11 @@ class AllArrays(unittest.TestCase): def test_valid_field(self): - src = {'apple': ['orange']} + src = {'apple': ['orange']} repair = allarrays.repair(src, 'apple') self.assertEqual({}, repair) def test_invalid_field(self): - src = {'apple': 'orange'} + src = {'apple': 'orange'} repair = allarrays.repair(src, 'apple') self.assertEqual({'apple': ['orange']}, repair) From 10cf2783e2316bb77277c188964294806d246532 Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 15:57:08 -0700 Subject: [PATCH 13/15] finally condensation --- src/pds/registrysweepers/repairkit/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index 5ab0ce2..dd62361 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -54,8 +54,7 @@ def run(base_url: str, configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") host = Host(password, base_url, username, verify_host_certs) - query = {"match_all": {}} - for document in query_registry_db(host, query, {}): + for document in query_registry_db(host, {"match_all": {}}, {}): id = document['_id'] src = document['_source'] repairs = {} From 1a92b530c9e7a29d0b79e7afbfbc559dae4f3d0c Mon Sep 17 00:00:00 2001 From: Al Niessner Date: Tue, 1 Aug 2023 16:00:51 -0700 Subject: [PATCH 14/15] more whitespace --- .../registrysweepers/repairkit/__init__.py | 34 ++++++++++--------- .../registrysweepers/repairkit/__main__.py | 8 +++-- .../registrysweepers/repairkit/allarrays.py | 6 ++-- .../repairkit/test_allarrays.py | 13 +++---- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/pds/registrysweepers/repairkit/__init__.py b/src/pds/registrysweepers/repairkit/__init__.py index dd62361..38243bf 100644 --- a/src/pds/registrysweepers/repairkit/__init__.py +++ b/src/pds/registrysweepers/repairkit/__init__.py @@ -1,10 +1,10 @@ -'''repairkit is an executable package +"""repairkit is an executable package The reason repairkit is an executable package is for extension as new repairs are needed in the future. They can be added by updating the REPAIR_TOOLS mapping with the new field name and functional requirements. All the additions can then be modules with this executable package. -''' +""" import logging import re from typing import Union @@ -16,7 +16,7 @@ from . import allarrays -''' +""" dictionary repair tools is {field_name:[funcs]} where field_name can be: 1: re.compile().fullmatch for the equivalent of "fred" == "fred" 2: re.compile().match for more complex matching of subparts of the string @@ -35,36 +35,38 @@ def function_name (document:{}, fieldname:str)->{} To get str_a == str_b, re.compile(str_a).fullmatch -''' +""" REPAIR_TOOLS = { - re.compile('^ops:Data_File_Info/').match: [allarrays.repair], - re.compile('^ops:Label_File_Info/').match: [allarrays.repair], + re.compile("^ops:Data_File_Info/").match: [allarrays.repair], + re.compile("^ops:Label_File_Info/").match: [allarrays.repair], } log = logging.getLogger(__name__) -def run(base_url: str, - username: str, - password: str, - verify_host_certs: bool = True, - log_filepath: Union[str, None] = None, - log_level: int = logging.INFO): +def run( + base_url: str, + username: str, + password: str, + verify_host_certs: bool = True, + log_filepath: Union[str, None] = None, + log_level: int = logging.INFO, +): configure_logging(filepath=log_filepath, log_level=log_level) log.info("starting CLI processing") host = Host(password, base_url, username, verify_host_certs) for document in query_registry_db(host, {"match_all": {}}, {}): - id = document['_id'] - src = document['_source'] + id = document["_id"] + src = document["_source"] repairs = {} - log.debug(f'working on document: {id}') + log.debug(f"working on document: {id}") for fieldname, data in src.items(): for regex, funcs in REPAIR_TOOLS.items(): if regex(fieldname): for func in funcs: repairs.update(func(src, fieldname)) if repairs: - log.info(f'Writing repairs to document: {id}') + log.info(f"Writing repairs to document: {id}") write_updated_docs(host, {id: repairs}) return diff --git a/src/pds/registrysweepers/repairkit/__main__.py b/src/pds/registrysweepers/repairkit/__main__.py index 815b5c0..3105049 100644 --- a/src/pds/registrysweepers/repairkit/__main__.py +++ b/src/pds/registrysweepers/repairkit/__main__.py @@ -1,11 +1,13 @@ from pds.registrysweepers.repairkit import run from pds.registrysweepers.utils import parse_args -args = parse_args(description='sweep through the registry documents and fix common errors') +args = parse_args(description="sweep through the registry documents and fix common errors") -run(base_url=args.base_URL, +run( + base_url=args.base_URL, username=args.username, password=args.password, verify_host_certs=not args.insecure, log_level=args.log_level, - log_filepath=args.log_file) + log_filepath=args.log_file, +) diff --git a/src/pds/registrysweepers/repairkit/allarrays.py b/src/pds/registrysweepers/repairkit/allarrays.py index 4894476..f15bb92 100644 --- a/src/pds/registrysweepers/repairkit/allarrays.py +++ b/src/pds/registrysweepers/repairkit/allarrays.py @@ -1,4 +1,4 @@ -'''change single strings to array of strings''' +"""change single strings to array of strings""" import logging from typing import Dict @@ -6,8 +6,8 @@ def repair(document: Dict, fieldname: str) -> Dict: - log.debug(f'checking {fieldname}') + log.debug(f"checking {fieldname}") if isinstance(document[fieldname], str): - log.info(f'found string for {fieldname} where it should be an array') + log.info(f"found string for {fieldname} where it should be an array") return {fieldname: [document[fieldname]]} return {} diff --git a/tests/pds/registrysweepers/repairkit/test_allarrays.py b/tests/pds/registrysweepers/repairkit/test_allarrays.py index d08fb4f..850f9be 100644 --- a/tests/pds/registrysweepers/repairkit/test_allarrays.py +++ b/tests/pds/registrysweepers/repairkit/test_allarrays.py @@ -5,14 +5,15 @@ class AllArrays(unittest.TestCase): def test_valid_field(self): - src = {'apple': ['orange']} - repair = allarrays.repair(src, 'apple') + src = {"apple": ["orange"]} + repair = allarrays.repair(src, "apple") self.assertEqual({}, repair) + def test_invalid_field(self): - src = {'apple': 'orange'} - repair = allarrays.repair(src, 'apple') - self.assertEqual({'apple': ['orange']}, repair) + src = {"apple": "orange"} + repair = allarrays.repair(src, "apple") + self.assertEqual({"apple": ["orange"]}, repair) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 5c19915b4320af65a324859bf2d6049bb7c41ab2 Mon Sep 17 00:00:00 2001 From: al-niessner <1130658+al-niessner@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:39:12 -0700 Subject: [PATCH 15/15] Update sweepers_driver.py as requested --- docker/sweepers_driver.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/sweepers_driver.py b/docker/sweepers_driver.py index 01c249b..b7cb602 100755 --- a/docker/sweepers_driver.py +++ b/docker/sweepers_driver.py @@ -61,7 +61,7 @@ from datetime import datetime from typing import Callable -from pds.registrysweepers import provenance, ancestry +from pds.registrysweepers import provenance, ancestry, repairkit from pds.registrysweepers.utils import configure_logging, get_human_readable_elapsed_since, parse_log_level configure_logging(filepath=None, log_level=logging.INFO) @@ -108,10 +108,12 @@ def run_factory(sweeper_f: Callable) -> Callable: run_provenance = run_factory(provenance.run) run_ancestry = run_factory(ancestry.run) +run_repairkit = run_factory(repairkit.run) log.info('Running sweepers') execution_begin = datetime.now() +run_repairkit() run_provenance() run_ancestry()