Merge pull request #1246 from microbiomedata/napa_compliance
Napa compliance
Showing 4 changed files with 903 additions and 0 deletions.
@@ -0,0 +1,39 @@
# Napa Compliance

This code has been executed, the results were accepted, and it should not need to be run again. It is being included as
documentation. The Python files have been moved from `nmdc_schema/` to `src/scripts/task_specific_code/` and they
shouldn't be expected to work from that location. These scripts demonstrate many good design principles, but may not
meet all of nmdc-schema's current code quality standards.

* insert_many_pymongo.py
* metap_records_delete.py
* misc_reid_code.py

The installation notes below are intended for these scripts only; don't assume they apply to any other nmdc-schema development.

## Installing Python packages

```shell
# So Python scripts can read `.env` files.
pip install python-dotenv

# So Python scripts can access Mongo databases.
pip install pymongo

# So Python code is formatted in a standard way.
pip install black
```
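
For reference, here is a minimal sketch of how `python-dotenv` can be used so a script picks up `TEMP_MONGO_URI` (the environment variable `insert_many_pymongo.py` reads) from a local `.env` file. The file path and variable value are illustrative, not part of this commit:

```python
# Sketch only: load variables from a local `.env` file before running a script
# that reads them from the environment. `TEMP_MONGO_URI` is the variable that
# `insert_many_pymongo.py` looks for; the `.env` path here is illustrative.
import os

from dotenv import load_dotenv  # provided by the `python-dotenv` package

load_dotenv(".env")  # e.g. a file containing: TEMP_MONGO_URI=mongodb://...

mongo_uri = os.environ.get("TEMP_MONGO_URI")
print("TEMP_MONGO_URI is set:", mongo_uri is not None)
```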

## Formatting source code

You can use [`black`](https://black.readthedocs.io/en/stable/) to format the Python code you write, by running:

```shell
python -m black /path/to/python/code.py
```

For example:

```shell
python -m black src/scripts/task_specific_code/metap_records_delete.py
```
@@ -0,0 +1,120 @@
import json
import logging

import click_log
import click  # not currently a top-level dependency of `nmdc-schema`
import pymongo  # not currently a top-level dependency of `nmdc-schema`
import requests  # not currently a top-level dependency of `nmdc-schema`


logger = logging.getLogger(__name__)
click_log.basic_config(logger)


@click.command()
@click_log.simple_verbosity_option(logger)
@click.option(
    "--input-file",
    type=click.File(),
    required=True,
    help=r"Path to a JSON file containing the data you want to insert. "
    r"The JSON file must conform to the NMDC Schema.",
    prompt=r"Path to JSON file",
)
@click.option(
    "--mongo-uri",
    type=str,
    required=True,
    envvar="TEMP_MONGO_URI",
    help=r"MongoDB connection string. Note: Some connection strings include a password. "
    r"To avoid putting your password on the command line, you can specify the connection string "
    r"via an environment variable named `TEMP_MONGO_URI`.",
    prompt=r"MongoDB connection string",
)
@click.option(
    "--is-direct-connection",
    type=bool,
    required=False,
    default=True,
    show_default=True,
    help=f"Whether you want the script to set the `directConnection` flag when connecting to the MongoDB server. "
    f"That is required by some MongoDB servers that belong to a replica set.",
)
@click.option(
    "--database-name",
    type=str,
    required=False,
    default="nmdc",
    show_default=True,
    help=f"MongoDB database name",
)
@click.option(
    "--validator-uri",
    type=str,
    required=False,
    default="https://api.microbiomedata.org/metadata/json:validate",
    show_default=True,
    help=f"URI of NMDC Schema-based validator",
)
def insert_many_pymongo(
    input_file,
    mongo_uri: str,
    is_direct_connection: bool,
    database_name: str,
    validator_uri: str,
) -> None:
    r"""
    Reads data from an NMDC Schema-conformant JSON file and inserts that data into a MongoDB database.
    """
    r"""
    References:
    - Topic: Specifying a file path via a CLI option (and `click` providing a file handle to the function).
      https://click.palletsprojects.com/en/8.1.x/api/#click.File
    - Topic: `click` populating function parameters from environment variables.
      https://click.palletsprojects.com/en/8.1.x/options/#values-from-environment-variables
    - Topic: `click_log`.
      https://click-log.readthedocs.io/en/stable/
    """

    # Validate the JSON data with respect to the NMDC schema.
    #
    # Note: The validation endpoint currently returns `{"result": "All Okay!"}`
    # when data is valid.
    #
    logger.debug(f"Validating the JSON data.")
    json_data = json.load(input_file)
    response = requests.post(validator_uri, json=json_data)
    assert response.status_code == 200, f"Failed to access validator at {validator_uri}"
    validation_result = response.json()
    if validation_result.get("result") == "All Okay!":
        logger.debug(f"The JSON data is valid.")
    else:
        logger.error(f"Validation result: {validation_result}")
        raise ValueError(f"The JSON data is not valid.")

    # Validate the MongoDB connection string and database name.
    mongo_client = pymongo.MongoClient(host=mongo_uri, directConnection=is_direct_connection)
    with pymongo.timeout(5):  # stop trying after 5 seconds
        assert (database_name in mongo_client.list_database_names()), f'The database named "{database_name}" does not exist.'

    # Insert the JSON data into the MongoDB database.
    db = mongo_client[database_name]
    logger.info(f'Processing the {len(json_data.keys())} collection(s) provided.')
    for collection_name, documents in json_data.items():
        if len(documents) > 0:
            logger.info(f'Inserting {len(documents)} documents into the "{collection_name}" collection.')
            result = db[collection_name].insert_many(documents)
            num_documents_inserted = len(result.inserted_ids)
            num_documents_provided = len(documents)
            logger.info(f"Inserted {num_documents_inserted} of {num_documents_provided} documents.")
            if num_documents_inserted < num_documents_provided:
                logger.warning(f"Not all of the provided documents were inserted.")
        else:
            logger.warning(f'Skipping collection "{collection_name}" because no documents were provided for it.')

    return None


if __name__ == "__main__":
    insert_many_pymongo()  # `click` will prompt the user for options
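
As a standalone illustration of the validation step in the script above, here is a minimal sketch of posting a payload to the validator endpoint. The endpoint URL and the `{"result": "All Okay!"}` response shape are taken from the script; the payload itself is a hypothetical example, not real data:

```python
# Sketch: validate a small NMDC-style payload against the validator endpoint the
# script uses. The payload below is a hypothetical example, not real data.
import requests

validator_uri = "https://api.microbiomedata.org/metadata/json:validate"
payload = {"study_set": []}  # hypothetical: an empty collection to validate

response = requests.post(validator_uri, json=payload)
response.raise_for_status()

# The script treats `{"result": "All Okay!"}` as "valid".
if response.json().get("result") == "All Okay!":
    print("Payload is schema-valid.")
else:
    print("Validation issues:", response.json())
```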
@@ -0,0 +1,162 @@
import argparse
import logging
import time
from pathlib import Path
from typing import List, Any

import pymongo


logger = logging.getLogger(Path(__file__).name)
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.FileHandler(f'{Path(__file__).stem}_{time.strftime("%Y%m%d-%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)


def args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--dry-run", help="Print, but do not delete, all metaP proteomics records", action="store_true"
    )
    parser.add_argument("--mongo-uri", type=str, help="MongoDB URI", required=False, default="mongodb://localhost:27017",)
    return parser.parse_args()


class NMDCAccessor:
    def __init__(self, db):
        self.db = db

    def get_documents_from_collection(self, collection_name: str) -> Any:
        collection = self.db[collection_name]
        documents = list(collection.find({}))

        logger.info(f"Found {len(documents)} documents in {collection_name}")
        return documents

    def get_matching_msgf_data_objects_records(self) -> Any:
        collection = self.db["data_object_set"]
        query = {"description": {"$regex": "MSGF"}}
        documents = collection.find(query)

        return list(documents)

    def get_matching_msgf_data_object_ids(self) -> List[str]:
        records = self.get_matching_msgf_data_objects_records()

        return [record["id"] for record in records]

    def delete_matching_records_from_ids(
        self, collection_name: str, ids: List[str]
    ) -> None:
        collection = self.db[collection_name]
        filter = {"id": {"$in": ids}}

        result = collection.delete_many(filter)
        logger.info(f"Deleted {result.deleted_count} documents")

    def delete_matching_record_from_id(self, collection_name: str, id: str, delete=False, should_log_id=True) -> None:
        """
        Delete a record from a collection by ID.
        :param collection_name: The name of the collection to delete the record from.
        :param id: The ID of the record to delete.
        :param delete: If True, delete the record. If False, just log the record that would be deleted.
        :param should_log_id: If True, log the ID of the record that would be deleted. If False, do not log the ID since not all records have IDs.
        """
        collection = self.db[collection_name]
        filter = {"id": id}

        if should_log_id:
            self.__log_record_deletion_information(collection_name, id)

        if delete:
            result = collection.delete_one(filter)
            logger.info(f"Deleted {result.deleted_count} record(s) from {collection_name}")

    def delete_all_records_from_collection(self, collection_name: str, delete=False, should_log_id=True) -> Any:
        """
        A terrifying function for deleting ALL records in a collection.
        :param collection_name: The name of the collection to delete all records from.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        :param should_log_id: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs.
        """
        logger.info(f"Deleting all records from {collection_name}")

        to_delete = self.get_documents_from_collection(collection_name)
        collection = self.db[collection_name]

        if should_log_id:
            ids = [doc["id"] for doc in to_delete]
            self.__log_record_deletion_information_many(collection_name, ids)

        if delete:
            result = collection.delete_many({})
            logger.info(f"Deleted {result.deleted_count} record(s) from {collection_name}")

    def delete_all_metaproteomics_records(self, delete=False) -> None:
        """
        Delete all metaproteomics records.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        """
        metap_collection_name = "metap_gene_function_aggregation"
        metaproteomics_analy_collection_name = "metaproteomics_analysis_activity_set"
        data_objects_set_name = "data_object_set"

        # Drop all from metap gene function collection.
        self.delete_all_records_from_collection(metap_collection_name, delete=delete, should_log_id=False)

        # Drop all from metaproteomics analysis activity set collection.
        self.delete_all_records_from_collection(metaproteomics_analy_collection_name, delete=delete, should_log_id=True)

        # Get all IDs associated with proteomics job outputs.
        # This search is broader than tracing down the outputs of the metaproteomics analysis activity set records' data objects,
        # since there appear to be dangling data objects that are not associated with any metaproteomics analysis activity records,
        # but "MSGF" is in their description and they are absolutely associated with the proteomics pipeline.
        ids = self.get_matching_msgf_data_object_ids()
        logger.info(f'Found {len(ids)} matching records in {data_objects_set_name}')
        for id in ids:
            self.delete_matching_record_from_id(data_objects_set_name, id, delete=delete, should_log_id=True)

    def __log_record_deletion_information_many(self, collection_name: str, ids: List[str]) -> None:
        for id in ids:
            self.__log_record_deletion_information(collection_name, id)

    def __log_record_deletion_information(self, collection_name: str, id: str) -> None:
        logger.info(f"Deleting record with ID: {id} from {collection_name}")

    @staticmethod
    def get_nmdc_db(mongo_uri: str) -> "NMDCAccessor":
        db = "nmdc"

        client = pymongo.MongoClient(
            mongo_uri,
            directConnection=True,
        )

        return NMDCAccessor(client[db])


def main():
    args_map = args()

    accessor = NMDCAccessor.get_nmdc_db(mongo_uri=args_map.mongo_uri)

    if args_map.dry_run:
        logger.info("Dry run: no records will be deleted")
    else:
        logger.info("Deleting all records")

    # Being very explicit about the deletion of records
    delete = not args_map.dry_run

    accessor.delete_all_metaproteomics_records(delete=delete)


if __name__ == "__main__":
    main()
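
For orientation, here is a minimal dry-run sketch using the accessor above. It assumes the file is importable as a module named `metap_records_delete` and that a MongoDB server is reachable at the script's default URI; nothing is deleted because `delete=False`:

```python
# Sketch only: a dry run that logs what *would* be deleted without deleting anything.
# Assumes the script above is importable as `metap_records_delete` and that a
# MongoDB server is reachable at the script's default URI.
from metap_records_delete import NMDCAccessor

accessor = NMDCAccessor.get_nmdc_db(mongo_uri="mongodb://localhost:27017")
accessor.delete_all_metaproteomics_records(delete=False)  # log only; no deletions
```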