Skip to content

Commit

Permalink
rm: nuke POST /metadata/json:validate_urls_file and helper fns
Browse files Browse the repository at this point in the history
This endpoint played a role early on in NMDC for a workflow for metadata import. That workflow is no longer used.

No use since 2023-10-05:
> db.getCollection("_runtime.analytics").countDocuments({"path": {"$regex": "^/metadata/json:validate_urls_file"}})
0
dwinston committed Dec 5, 2024
1 parent ba6d235 commit 4234e84
Showing 1 changed file with 0 additions and 90 deletions.
90 changes: 0 additions & 90 deletions nmdc_runtime/api/endpoints/metadata.py
Original file line number Diff line number Diff line change
@@ -166,96 +166,6 @@ def iter_grid_out():
)


# Splits an http(s) URL into its host ("domain") and everything after the
# first "/" that follows the host ("path", which includes any query string).
url_pattern = re.compile(r"https?://(?P<domain>[^/]+)/(?P<path>.+)")


def url_to_name(url):
    r"""
    Derive a flat file name from an http(s) URL.

    The dot-separated labels of the host are reversed and re-joined with dots,
    followed by a double underscore, followed by the URL path with every "/"
    replaced by ".".

    >>> url_to_name("https://example.org/a/b.json")
    'org.example__a.b.json'

    Raises ``ValueError`` if ``url`` does not have the form
    ``http(s)://<host>/<path>`` with a non-empty path.
    """
    m = url_pattern.match(url)
    if m is None:
        # Without this guard the failure surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" AttributeError.
        raise ValueError(f"cannot derive a file name from URL {url!r}")
    reversed_domain = ".".join(reversed(m.group("domain").split(".")))
    flattened_path = m.group("path").replace("/", ".")
    return f"{reversed_domain}__{flattened_path}"


def result_for_url_to_json_file(data, url, save_dir):
    r"""
    Write the JSON payload of ``data`` to a file inside ``save_dir``.

    The file name is whatever ``url_to_name(url)`` returns. ``data`` must
    expose a ``json()`` method whose result is JSON-serializable.
    """
    target_path = os.path.join(save_dir, url_to_name(url))
    with open(target_path, "w") as out_file:
        json.dump(data.json(), out_file)


def fetch_downloaded_json(url, save_dir):
    r"""
    Load and return the parsed JSON stored in ``save_dir`` for ``url``.

    The file is looked up under the name that ``url_to_name(url)`` returns.
    """
    source_path = os.path.join(save_dir, url_to_name(url))
    with open(source_path) as in_file:
        parsed = json.load(in_file)
    return parsed


# NOTE(review): this handler fetches caller-supplied URLs from the server side
# (an SSRF surface) and performs blocking network and file I/O inside an
# ``async def``, which occupies the event loop while it runs. Neither is changed
# here; flagged for follow-up.
@router.post("/metadata/json:validate_urls_file")
async def validate_json_urls_file(urls_file: UploadFile = File(...)):
    """
    Given a text file with one URL per line, will try to validate each URL target
    as a NMDC JSON Schema "nmdc:Database" object.
    """
    content_type = urls_file.content_type
    filename = urls_file.filename
    if content_type != "text/plain":
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=(
                f"file {filename} has content type '{content_type}'. "
                f"Only 'text/plain' (*.txt) files are permitted."
            ),
        )
    contents: bytes = await urls_file.read()
    try:
        text = contents.decode()
    except UnicodeDecodeError as exc:
        # A malformed upload is a client error, not an internal server error.
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"file {filename} could not be decoded as UTF-8: {exc}",
        ) from exc
    stream = StringIO(text)  # can e.g. import csv; csv.reader(stream)

    # One URL per non-blank line, surrounding whitespace removed.
    urls = [line.strip() for line in stream if line.strip()]

    def load_url(url, timeout):
        response = requests.get(url, timeout=timeout)
        # Reject 4xx/5xx responses instead of validating an error body as data.
        response.raise_for_status()
        return response

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download phase: fetch every URL (up to 5 at a time, 5 s timeout each)
        # and store each JSON payload in the temporary directory.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {executor.submit(load_url, url, 5): url for url in urls}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    data = future.result()
                    result_for_url_to_json_file(data, url, temp_dir)
                except Exception as exc:
                    # Any failure to fetch, parse, or store a URL aborts the
                    # whole request and names the offending URL.
                    raise HTTPException(
                        status_code=status.HTTP_400_BAD_REQUEST,
                        detail=f"{url} generated an exception: {exc}",
                    ) from exc

        # Validation phase: check each downloaded document, collection by
        # collection, and gather error messages keyed by collection name.
        validator = Draft7Validator(get_nmdc_jsonschema_dict())
        validation_errors = defaultdict(list)

        for url in urls:
            docs = fetch_downloaded_json(url, temp_dir)
            docs, validation_errors_for_activity_set = specialize_activity_set_docs(
                docs
            )

            validation_errors["activity_set"].extend(
                validation_errors_for_activity_set["activity_set"]
            )

            for coll_name, coll_docs in docs.items():
                errors = validator.iter_errors({coll_name: coll_docs})
                validation_errors[coll_name].extend(e.message for e in errors)

        # Every value is a list of messages; all-empty means nothing failed.
        if not any(validation_errors.values()):
            return {"result": "All Okay!"}
        return {"result": "errors", "detail": validation_errors}


@router.post("/metadata/json:validate", name="Validate JSON")
async def validate_json_nmdcdb(docs: dict, mdb: MongoDatabase = Depends(get_mongo_db)):
"""

0 comments on commit 4234e84

Please sign in to comment.