Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New repairkit sweeper for updating metadata loaded with older versions of Harvest to use lists instead of single-value strings #54

Merged
merged 15 commits into from
Aug 16, 2023
65 changes: 65 additions & 0 deletions src/pds/registrysweepers/repairkit/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
'''repairkit is an executable package

The reason repairkit is an executable package is for extension as new repairs
are needed in the future. They can be added by updating the REPAIR_TOOLS mapping
with the new field name and functional requirements. All the additions can then
be modules within this executable package.
'''

import logging
import re
from typing import Union

from pds.registrysweepers.utils import Host
from pds.registrysweepers.utils import configure_logging
from pds.registrysweepers.utils import query_registry_db
# NOTE(review): run() needs a document writer that was never imported; the
# name below is assumed to be the one exported by utils -- confirm.
from pds.registrysweepers.utils import write_updated_docs

from . import allarrays

'''
The REPAIR_TOOLS dictionary is {field_name:[funcs]} where field_name is a bound matcher, one of:
1: re.compile().fullmatch for the equivalent of "fred" == "fred"
2: re.compile().match for more complex matching of subparts of the string

and funcs are:
def function_name (document:{}, fieldname:str)->{}

and they return an empty {} if no changes are needed and {fieldname:new_value} for repairs

Examples

re.compile("^ops:Info/.+").match("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/.+").fullmatch("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/").match("ops:Info/ops:filesize")->match object
re.compile("^ops:Info/").fullmatch("ops:Info/ops:filesize")->None

To get str_a == str_b, re.compile(str_a).fullmatch

'''

# Maps a field-name matcher (the bound ``match``/``fullmatch`` method of a
# compiled pattern) to the list of repair functions applied to every field
# whose name the matcher accepts.
REPAIR_TOOLS = {
    re.compile('^ops:Data_File_Info/').match: [allarrays.repair],
    re.compile('^ops:Label_File_Info/').match: [allarrays.repair],
}

log = logging.getLogger(__name__)

def run(base_url:str,
        username:str,
        password:str,
        verify_host_certs:bool=True,
        log_filepath:Union[str,None]=None,
        log_level:int=logging.INFO):
    '''Sweep the registry documents and write back any repairs they need.

    Every document returned by a ``match_all`` query is inspected field by
    field. Each field name is tested against the matchers in REPAIR_TOOLS and,
    on a hit, all repair functions registered for that matcher are called.
    The non-empty results are merged and written back for that document.

    base_url: registry endpoint handed to Host
    username: account name handed to Host
    password: account password handed to Host
    verify_host_certs: handed to Host unchanged
    log_filepath: forwarded to configure_logging
    log_level: forwarded to configure_logging
    '''
    configure_logging(filepath=log_filepath, log_level=log_level)
    log.info("starting CLI processing")
    host = Host(password, base_url, username, verify_host_certs)
    query = {"match_all":{}}
    for document in query_registry_db(host, query, {}):
        # NOTE(review): assumes query_registry_db yields search hits shaped
        # like {"_id": ..., "_source": {...}} -- confirm against utils. The
        # original never extracted an id and used the builtin ``id`` instead.
        doc_id = document['_id']
        source = document['_source']
        repairs = {}
        for fieldname in source:
            # REPAIR_TOOLS is a dict, so .items() is required to get both the
            # matcher and its repair functions.
            for matches, funcs in REPAIR_TOOLS.items():
                if matches(fieldname):
                    # Each repair returns {} or {fieldname: new_value}; merge
                    # them one at a time rather than passing a generator of
                    # dicts to update().
                    for func in funcs:
                        repairs.update(func(source, fieldname))
        if repairs:
            log.info (f'Writing repairs to document: {doc_id}')
            # A mapping of id -> changes; the original built a set literal
            # containing a dict, which raises TypeError.
            write_updated_docs (host, {doc_id: repairs})
    return
12 changes: 12 additions & 0 deletions src/pds/registrysweepers/repairkit/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

from pds.registrysweepers.repairkit import run
from pds.registrysweepers.utils import parse_args

# Parse the command line once, then hand the resulting options to the sweeper.
args = parse_args(
    description='sweep through the registry documents and fix common errors',
)

run(
    base_url=args.base_URL,
    username=args.username,
    password=args.password,
    # The CLI exposes an "insecure" switch; the sweeper wants the inverse.
    verify_host_certs=not args.insecure,
    log_filepath=args.log_file,
    log_level=args.log_level,
)
7 changes: 7 additions & 0 deletions src/pds/registrysweepers/repairkit/allarrays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
'''change single strings to array of strings'''

def repair (document:dict, fieldname:str)->dict:
    '''Wrap a single string value in a list.

    document: mapping that holds the field to inspect; it is not modified
    fieldname: key of the field to inspect in document

    Returns {fieldname: [value]} when the current value is a str, otherwise
    an empty dict meaning no repair is needed.
    '''
    # Imported here because this module has no logger of its own; the
    # original referenced an undefined ``log``.
    import logging

    value = document[fieldname]
    if isinstance (value, str):
        logging.getLogger(__name__).info (
            f'found string for {fieldname} where it should be an array')
        return {fieldname:[value]}
    return {}