Skip to content

Commit

Permalink
Merge pull request #1246 from microbiomedata/napa_compliance
Browse files Browse the repository at this point in the history
Napa compliance
  • Loading branch information
aclum authored Jul 16, 2024
2 parents 874c06e + c42a0d6 commit 249034d
Show file tree
Hide file tree
Showing 4 changed files with 903 additions and 0 deletions.
39 changes: 39 additions & 0 deletions src/scripts/task_specific_code/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Napa Compliance

This code has been executed, the results were accepted, and it should not need to be run again. It is being included as
documentation. The Python files have been moved from `nmdc_schema/` to `src/scripts/task_specific_code/` and they
shouldn't be expected to work from that location. These scripts demonstrate many good design principles, but may not
meet all of nmdc-schema's current code quality standards.

* insert_many_pymongo.py
* metap_records_delete.py
* misc_reid_code.py

The installation notes below apply only to these scripts; do not assume they are intended for any other nmdc-schema development.

## Installing Python packages

```shell
# So Python scripts can read `.env` files.
pip install python-dotenv

# So Python scripts can access Mongo databases.
pip install pymongo

# So Python code is formatted in a standard way.
pip install black
```

## Formatting source code

You can use [`black`](https://black.readthedocs.io/en/stable/) to format the Python code you write, by running:

```shell
python -m black /path/to/python/code.py
```

For example:

```shell
python -m black src/scripts/task_specific_code/metap_records_delete.py
```
120 changes: 120 additions & 0 deletions src/scripts/task_specific_code/insert_many_pymongo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import json
import logging

import click_log
import click # not currently a top-level dependency of `nmdc-schema`
import pymongo # not currently a top-level dependency of `nmdc-schema`
import requests # not currently a top-level dependency of `nmdc-schema`


# Module-level logger, wired into `click_log` so the CLI's verbosity option controls it.
logger = logging.getLogger(__name__)
click_log.basic_config(logger)



@click.command()
@click_log.simple_verbosity_option(logger)
@click.option(
    "--input-file",
    type=click.File(),
    required=True,
    help=r"Path to a JSON file containing the data you want to insert. "
    r"The JSON file must conform to the NMDC Schema.",
    prompt=r"Path to JSON file",
)
@click.option(
    "--mongo-uri",
    type=str,
    required=True,
    envvar="TEMP_MONGO_URI",
    help=r"MongoDB connection string. Note: Some connection strings include a password. "
    r"To avoid putting your password on the command line, you can specify the connection string "
    r"via an environment variable named `TEMP_MONGO_URI`.",
    prompt=r"MongoDB connection string",
)
@click.option(
    "--is-direct-connection",
    type=bool,
    required=False,
    default=True,
    show_default=True,
    help=f"Whether you want the script to set the `directConnection` flag when connecting to the MongoDB server. "
    f"That is required by some MongoDB servers that belong to a replica set. ",
)
@click.option(
    "--database-name",
    type=str,
    required=False,
    default="nmdc",
    show_default=True,
    help=f"MongoDB database name",
)
@click.option(
    "--validator-uri",
    type=str,
    required=False,
    default="https://api.microbiomedata.org/metadata/json:validate",
    show_default=True,
    help=f"URI of NMDC Schema-based validator",
)
def insert_many_pymongo(
    input_file,
    mongo_uri: str,
    is_direct_connection: bool,
    database_name: str,
    validator_uri: str,
) -> None:
    r"""
    Reads data from an NMDC Schema-conformant JSON file and inserts that data into a MongoDB database.

    :param input_file: Open file handle (provided by `click.File`) for the JSON document,
        expected to map collection names to lists of documents.
    :param mongo_uri: MongoDB connection string.
    :param is_direct_connection: Whether to set the `directConnection` flag on the client.
    :param database_name: Name of the target MongoDB database.
    :param validator_uri: URI of the NMDC Schema-based validation endpoint.
    :raises RuntimeError: If the validator endpoint does not respond with HTTP 200.
    :raises ValueError: If the JSON data fails validation, or the database does not exist.

    References:
    - Topic: Specifying a file path via a CLI option (and `click` providing a file handle to the function).
      https://click.palletsprojects.com/en/8.1.x/api/#click.File
    - Topic: `click` populating function parameters from environment variables.
      https://click.palletsprojects.com/en/8.1.x/options/#values-from-environment-variables
    - Topic: `click_log`.
      https://click-log.readthedocs.io/en/stable/
    """

    # Validate the JSON data with respect to the NMDC schema.
    #
    # Note: The validation endpoint currently returns `{"result": "All Okay!"}`
    # when data is valid.
    #
    logger.debug("Validating the JSON data.")
    json_data = json.load(input_file)
    response = requests.post(validator_uri, json=json_data)
    # Raise explicitly rather than `assert`: assertions are stripped under `python -O`.
    if response.status_code != 200:
        raise RuntimeError(f"Failed to access validator at {validator_uri}")
    validation_result = response.json()
    if validation_result.get("result") == "All Okay!":
        logger.debug("The JSON data is valid.")
    else:
        # Lazy `%s` formatting: the message is only rendered if this level is enabled.
        logger.error("Validation result: %s", validation_result)
        raise ValueError("The JSON data is not valid.")

    # Validate the MongoDB connection string and database name.
    mongo_client = pymongo.MongoClient(host=mongo_uri, directConnection=is_direct_connection)
    try:
        with pymongo.timeout(5):  # stop trying after 5 seconds
            if database_name not in mongo_client.list_database_names():
                raise ValueError(f'The database named "{database_name}" does not exist.')

        # Insert the JSON data into the MongoDB database, one collection at a time.
        db = mongo_client[database_name]
        logger.info("Processing the %d collection(s) provided.", len(json_data))
        for collection_name, documents in json_data.items():
            if not documents:
                logger.warning('Skipping collection "%s" because no documents were provided for it.', collection_name)
                continue
            logger.info('Inserting %d documents into the "%s" collection.', len(documents), collection_name)
            result = db[collection_name].insert_many(documents)
            num_documents_inserted = len(result.inserted_ids)
            num_documents_provided = len(documents)
            logger.info("Inserted %d of %d documents.", num_documents_inserted, num_documents_provided)
            if num_documents_inserted < num_documents_provided:
                logger.warning("Not all of the provided documents were inserted.")
    finally:
        # Always release the client's connection resources, even when validation or insertion fails.
        mongo_client.close()

    return None


# Script entry point: delegate to the `click` command defined above.
if __name__ == "__main__":
    insert_many_pymongo()  # `click` will prompt the user for options
162 changes: 162 additions & 0 deletions src/scripts/task_specific_code/metap_records_delete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import argparse
import logging
import time
from pathlib import Path
from typing import List, Any, Tuple

import pymongo


# Log to both a timestamped file (named after this script) and the console.
logger = logging.getLogger(Path(__file__).name)
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        # e.g. `metap_records_delete_20240716-120000.log`
        logging.FileHandler(f'{Path(__file__).stem}_{time.strftime("%Y%m%d-%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)

def args() -> argparse.Namespace:
    """
    Parse command-line arguments for the metaP record-deletion script.

    Note: `argparse.ArgumentParser.parse_args` returns an `argparse.Namespace`,
    not a tuple, so the return annotation reflects that.

    :return: Namespace with `dry_run` (bool) and `mongo_uri` (str) attributes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--dry-run",
        help="Print, but not delete, all metaP proteomics records",
        action="store_true",
    )
    parser.add_argument(
        "--mongo-uri",
        type=str,
        help="MongoDB URI",
        required=False,
        default="mongodb://localhost:27017",
    )
    return parser.parse_args()


class NMDCAccessor:
    """Thin wrapper around a `pymongo` database for deleting metaP proteomics records."""

    def __init__(self, db):
        """
        :param db: A `pymongo.database.Database` (or any mapping of collection names to collections).
        """
        self.db = db

    def get_documents_from_collection(self, collection_name: str) -> List[dict]:
        """
        Fetch every document in a collection.

        :param collection_name: The name of the collection to read.
        :return: All documents in the collection, as a list.
        """
        collection = self.db[collection_name]
        documents = list(collection.find({}))

        # Lazy `%s` args: the message is only rendered if this log level is enabled.
        logger.info("Found %d documents in %s", len(documents), collection_name)
        return documents

    def get_matching_msgf_data_objects_records(self) -> List[dict]:
        """Return all `data_object_set` documents whose description mentions "MSGF"."""
        collection = self.db["data_object_set"]
        query = {"description": {"$regex": "MSGF"}}
        documents = collection.find(query)

        return list(documents)

    def get_matching_msgf_data_object_ids(self) -> List[str]:
        """Return the `id` of each MSGF-related `data_object_set` record."""
        records = self.get_matching_msgf_data_objects_records()

        return [record["id"] for record in records]

    def delete_matching_records_from_ids(
        self, collection_name: str, ids: List[str]
    ) -> None:
        """
        Delete all records whose `id` is in `ids`.

        :param collection_name: The name of the collection to delete records from.
        :param ids: The IDs of the records to delete.
        """
        collection = self.db[collection_name]
        # Named `query_filter` (not `filter`) to avoid shadowing the builtin.
        query_filter = {"id": {"$in": ids}}

        result = collection.delete_many(query_filter)
        logger.info("Deleted %d documents", result.deleted_count)

    def delete_matching_record_from_id(self, collection_name: str, id: str, delete=False, should_log_id=True) -> None:
        """
        Delete a record from a collection by ID.
        :param collection_name: The name of the collection to delete the record from.
        :param id: The ID of the record to delete.
        :param delete: If True, delete the record. If False, just log the record that would be deleted.
        :param should_log_id: If True, log the ID of the record that would be deleted. If False, do not log the ID since not all records have IDs.
        """
        collection = self.db[collection_name]
        query_filter = {"id": id}

        if should_log_id:
            self.__log_record_deletion_information(collection_name, id)

        if delete:
            result = collection.delete_one(query_filter)
            logger.info("Deleted %d record(s) from %s", result.deleted_count, collection_name)

    def delete_all_records_from_collection(self, collection_name: str, delete=False, should_log_id=True) -> None:
        """
        A terrifying function for deleting ALL records in a collection.
        :param collection_name: The name of the collection to delete all records from.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        :param should_log_id: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs.
        """
        logger.info("Deleting all records from %s", collection_name)

        to_delete = self.get_documents_from_collection(collection_name)
        collection = self.db[collection_name]

        if should_log_id:
            ids = [doc["id"] for doc in to_delete]
            self.__log_record_deletion_information_many(collection_name, ids)

        if delete:
            result = collection.delete_many({})
            logger.info("Deleted %d record(s) from %s", result.deleted_count, collection_name)

    def delete_all_metaproteomics_records(self, delete=False) -> None:
        """
        Delete all metaproteomics records.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        """
        metap_collection_name = "metap_gene_function_aggregation"
        metaproteomics_analy_collection_name = "metaproteomics_analysis_activity_set"
        data_objects_set_name = "data_object_set"

        # Drop all from metap gene function collection.
        self.delete_all_records_from_collection(metap_collection_name, delete=delete, should_log_id=False)

        # Drop all from metaproteomics analysis activity set collection.
        self.delete_all_records_from_collection(metaproteomics_analy_collection_name, delete=delete, should_log_id=True)

        # Get all IDs associated with protemics job outputs.
        # This search is broader than tracing down the outputs of the metaproteomics analysis activity set records' data objects
        # since there appear to be dangling data objects that are not associated with any metaproteomics analysis activity records,
        # but "MSGF" is in their description and absolutely associated with the proteomics pipeline
        ids = self.get_matching_msgf_data_object_ids()
        logger.info("Found %d matching records in %s", len(ids), data_objects_set_name)
        # `record_id` (not `id`) to avoid shadowing the builtin.
        for record_id in ids:
            self.delete_matching_record_from_id(data_objects_set_name, record_id, delete=delete, should_log_id=True)

    def __log_record_deletion_information_many(self, collection_name: str, ids: List[str]) -> None:
        # Log each would-be deletion individually so the log file records every affected ID.
        for record_id in ids:
            self.__log_record_deletion_information(collection_name, record_id)

    def __log_record_deletion_information(self, collection_name: str, id: str) -> None:
        logger.info("Deleting record with ID: %s from %s", id, collection_name)

    @staticmethod
    def get_nmdc_db(mongo_uri: str) -> "NMDCAccessor":
        """
        Build an accessor for the `nmdc` database at the given URI.

        :param mongo_uri: MongoDB connection string.
        :return: An `NMDCAccessor` wrapping the `nmdc` database.
        """
        db = "nmdc"

        client = pymongo.MongoClient(
            mongo_uri,
            directConnection=True,
        )

        return NMDCAccessor(client[db])


def main():
    """Entry point: parse CLI options, then delete (or dry-run) all metaP records."""
    options = args()

    accessor = NMDCAccessor.get_nmdc_db(mongo_uri=options.mongo_uri)

    # Being very explicit about the deletion of records: deletion happens
    # only when this is NOT a dry run.
    should_delete = not options.dry_run

    if should_delete:
        logger.info("Deleting all records")
    else:
        logger.info("Dry run: no records will be deleted")

    accessor.delete_all_metaproteomics_records(delete=should_delete)


# Script entry point.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 249034d

Please sign in to comment.