Skip to content

Commit

Permalink
Merge pull request #1246 from microbiomedata/napa_compliance
Browse files Browse the repository at this point in the history
Napa compliance
  • Loading branch information
aclum authored Jul 16, 2024
2 parents 874c06e + c42a0d6 commit 249034d
Show file tree
Hide file tree
Showing 4 changed files with 903 additions and 0 deletions.
39 changes: 39 additions & 0 deletions src/scripts/task_specific_code/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Napa Compliance

This code has been executed, the results were accepted, and it should not need to be run again. It is being included as
documentation. The Python files have been moved from `nmdc_schema/` to `src/scripts/task_specific_code/` and they
shouldn't be expected to work from that location. These scripts demonstrate many good design principles, but may not
meet all of nmdc-schema's current code quality standards.

* insert_many_pymongo.py
* metap_records_delete.py
* misc_reid_code.py

The installation notes below apply only to these scripts; do not assume they are intended for any other nmdc-schema development.

## Installing Python packages

```shell
# So Python scripts can read `.env` files.
pip install python-dotenv

# So Python scripts can access Mongo databases.
pip install pymongo

# So Python code is formatted in a standard way.
pip install black
```

## Formatting source code

You can use [`black`](https://black.readthedocs.io/en/stable/) to format the Python code you write, by running:

```shell
python -m black /path/to/python/code.py
```

For example:

```shell
python -m black src/scripts/task_specific_code/metap_records_delete.py
```
120 changes: 120 additions & 0 deletions src/scripts/task_specific_code/insert_many_pymongo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import json
import logging

import click_log
import click # not currently a top-level dependency of `nmdc-schema`
import pymongo # not currently a top-level dependency of `nmdc-schema`
import requests # not currently a top-level dependency of `nmdc-schema`


# Module-level logger, wired into `click_log` so the CLI's verbosity option controls it.
logger = logging.getLogger(__name__)
click_log.basic_config(logger)



@click.command()
@click_log.simple_verbosity_option(logger)
@click.option(
    "--input-file",
    type=click.File(),
    required=True,
    help=r"Path to a JSON file containing the data you want to insert. "
    r"The JSON file must conform to the NMDC Schema.",
    prompt=r"Path to JSON file",
)
@click.option(
    "--mongo-uri",
    type=str,
    required=True,
    envvar="TEMP_MONGO_URI",
    help=r"MongoDB connection string. Note: Some connection strings include a password. "
    r"To avoid putting your password on the command line, you can specify the connection string "
    r"via an environment variable named `TEMP_MONGO_URI`.",
    prompt=r"MongoDB connection string",
)
@click.option(
    "--is-direct-connection",
    type=bool,
    required=False,
    default=True,
    show_default=True,
    help=f"Whether you want the script to set the `directConnection` flag when connecting to the MongoDB server. "
    f"That is required by some MongoDB servers that belong to a replica set. ",
)
@click.option(
    "--database-name",
    type=str,
    required=False,
    default="nmdc",
    show_default=True,
    help=f"MongoDB database name",
)
@click.option(
    "--validator-uri",
    type=str,
    required=False,
    default="https://api.microbiomedata.org/metadata/json:validate",
    show_default=True,
    help=f"URI of NMDC Schema-based validator",
)
def insert_many_pymongo(
    input_file,
    mongo_uri: str,
    is_direct_connection: bool,
    database_name: str,
    validator_uri: str,
) -> None:
    r"""
    Reads data from an NMDC Schema-conformant JSON file and inserts that data into a MongoDB database.

    :param input_file: Open file handle (provided by `click.File`) for the JSON document,
        expected to map collection names to lists of documents.
    :param mongo_uri: MongoDB connection string.
    :param is_direct_connection: Whether to set the `directConnection` flag on the client.
    :param database_name: Name of the target MongoDB database.
    :param validator_uri: URI of the NMDC Schema-based validation endpoint.
    :raises RuntimeError: If the validator endpoint does not respond with HTTP 200.
    :raises ValueError: If the JSON data fails validation, or the database does not exist.

    References:
    - Topic: Specifying a file path via a CLI option (and `click` providing a file handle to the function).
      https://click.palletsprojects.com/en/8.1.x/api/#click.File
    - Topic: `click` populating function parameters from environment variables.
      https://click.palletsprojects.com/en/8.1.x/options/#values-from-environment-variables
    - Topic: `click_log`.
      https://click-log.readthedocs.io/en/stable/
    """

    # Validate the JSON data with respect to the NMDC schema.
    #
    # Note: The validation endpoint currently returns `{"result": "All Okay!"}`
    # when data is valid.
    #
    logger.debug("Validating the JSON data.")
    json_data = json.load(input_file)
    response = requests.post(validator_uri, json=json_data)
    # Raise explicitly rather than `assert`: assertions are stripped under `python -O`.
    if response.status_code != 200:
        raise RuntimeError(f"Failed to access validator at {validator_uri}")
    validation_result = response.json()
    if validation_result.get("result") == "All Okay!":
        logger.debug("The JSON data is valid.")
    else:
        # Lazy `%s` formatting: the message is only rendered if this level is enabled.
        logger.error("Validation result: %s", validation_result)
        raise ValueError("The JSON data is not valid.")

    # Validate the MongoDB connection string and database name.
    mongo_client = pymongo.MongoClient(host=mongo_uri, directConnection=is_direct_connection)
    try:
        with pymongo.timeout(5):  # stop trying after 5 seconds
            if database_name not in mongo_client.list_database_names():
                raise ValueError(f'The database named "{database_name}" does not exist.')

        # Insert the JSON data into the MongoDB database, one collection at a time.
        db = mongo_client[database_name]
        logger.info("Processing the %d collection(s) provided.", len(json_data))
        for collection_name, documents in json_data.items():
            if not documents:
                logger.warning('Skipping collection "%s" because no documents were provided for it.', collection_name)
                continue
            logger.info('Inserting %d documents into the "%s" collection.', len(documents), collection_name)
            result = db[collection_name].insert_many(documents)
            num_documents_inserted = len(result.inserted_ids)
            num_documents_provided = len(documents)
            logger.info("Inserted %d of %d documents.", num_documents_inserted, num_documents_provided)
            if num_documents_inserted < num_documents_provided:
                logger.warning("Not all of the provided documents were inserted.")
    finally:
        # Always release the client's connection resources, even when validation or insertion fails.
        mongo_client.close()

    return None


# Script entry point: delegate to the `click` command defined above.
if __name__ == "__main__":
    insert_many_pymongo()  # `click` will prompt the user for options
162 changes: 162 additions & 0 deletions src/scripts/task_specific_code/metap_records_delete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import argparse
import logging
import time
from pathlib import Path
from typing import List, Any, Tuple

import pymongo


# Log to both a timestamped file (named after this script) and the console.
logger = logging.getLogger(Path(__file__).name)
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        # e.g. `metap_records_delete_20240716-120000.log`
        logging.FileHandler(f'{Path(__file__).stem}_{time.strftime("%Y%m%d-%H%M%S")}.log'),
        logging.StreamHandler()
    ]
)

def args() -> argparse.Namespace:
    """
    Parse command-line arguments for the metaP record-deletion script.

    Note: `argparse.ArgumentParser.parse_args` returns an `argparse.Namespace`,
    not a tuple, so the return annotation reflects that.

    :return: Namespace with `dry_run` (bool) and `mongo_uri` (str) attributes.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--dry-run",
        help="Print, but not delete, all metaP proteomics records",
        action="store_true",
    )
    parser.add_argument(
        "--mongo-uri",
        type=str,
        help="MongoDB URI",
        required=False,
        default="mongodb://localhost:27017",
    )
    return parser.parse_args()


class NMDCAccessor:
    """Thin wrapper around a `pymongo` database for deleting metaP proteomics records."""

    def __init__(self, db):
        """
        :param db: A `pymongo.database.Database` (or any mapping of collection names to collections).
        """
        self.db = db

    def get_documents_from_collection(self, collection_name: str) -> List[dict]:
        """
        Fetch every document in a collection.

        :param collection_name: The name of the collection to read.
        :return: All documents in the collection, as a list.
        """
        collection = self.db[collection_name]
        documents = list(collection.find({}))

        # Lazy `%s` args: the message is only rendered if this log level is enabled.
        logger.info("Found %d documents in %s", len(documents), collection_name)
        return documents

    def get_matching_msgf_data_objects_records(self) -> List[dict]:
        """Return all `data_object_set` documents whose description mentions "MSGF"."""
        collection = self.db["data_object_set"]
        query = {"description": {"$regex": "MSGF"}}
        documents = collection.find(query)

        return list(documents)

    def get_matching_msgf_data_object_ids(self) -> List[str]:
        """Return the `id` of each MSGF-related `data_object_set` record."""
        records = self.get_matching_msgf_data_objects_records()

        return [record["id"] for record in records]

    def delete_matching_records_from_ids(
        self, collection_name: str, ids: List[str]
    ) -> None:
        """
        Delete all records whose `id` is in `ids`.

        :param collection_name: The name of the collection to delete records from.
        :param ids: The IDs of the records to delete.
        """
        collection = self.db[collection_name]
        # Named `query_filter` (not `filter`) to avoid shadowing the builtin.
        query_filter = {"id": {"$in": ids}}

        result = collection.delete_many(query_filter)
        logger.info("Deleted %d documents", result.deleted_count)

    def delete_matching_record_from_id(self, collection_name: str, id: str, delete=False, should_log_id=True) -> None:
        """
        Delete a record from a collection by ID.
        :param collection_name: The name of the collection to delete the record from.
        :param id: The ID of the record to delete.
        :param delete: If True, delete the record. If False, just log the record that would be deleted.
        :param should_log_id: If True, log the ID of the record that would be deleted. If False, do not log the ID since not all records have IDs.
        """
        collection = self.db[collection_name]
        query_filter = {"id": id}

        if should_log_id:
            self.__log_record_deletion_information(collection_name, id)

        if delete:
            result = collection.delete_one(query_filter)
            logger.info("Deleted %d record(s) from %s", result.deleted_count, collection_name)

    def delete_all_records_from_collection(self, collection_name: str, delete=False, should_log_id=True) -> None:
        """
        A terrifying function for deleting ALL records in a collection.
        :param collection_name: The name of the collection to delete all records from.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        :param should_log_id: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs.
        """
        logger.info("Deleting all records from %s", collection_name)

        to_delete = self.get_documents_from_collection(collection_name)
        collection = self.db[collection_name]

        if should_log_id:
            ids = [doc["id"] for doc in to_delete]
            self.__log_record_deletion_information_many(collection_name, ids)

        if delete:
            result = collection.delete_many({})
            logger.info("Deleted %d record(s) from %s", result.deleted_count, collection_name)

    def delete_all_metaproteomics_records(self, delete=False) -> None:
        """
        Delete all metaproteomics records.
        :param delete: If True, delete the records. If False, just log the records that would be deleted.
        """
        metap_collection_name = "metap_gene_function_aggregation"
        metaproteomics_analy_collection_name = "metaproteomics_analysis_activity_set"
        data_objects_set_name = "data_object_set"

        # Drop all from metap gene function collection.
        self.delete_all_records_from_collection(metap_collection_name, delete=delete, should_log_id=False)

        # Drop all from metaproteomics analysis activity set collection.
        self.delete_all_records_from_collection(metaproteomics_analy_collection_name, delete=delete, should_log_id=True)

        # Get all IDs associated with protemics job outputs.
        # This search is broader than tracing down the outputs of the metaproteomics analysis activity set records' data objects
        # since there appear to be dangling data objects that are not associated with any metaproteomics analysis activity records,
        # but "MSGF" is in their description and absolutely associated with the proteomics pipeline
        ids = self.get_matching_msgf_data_object_ids()
        logger.info("Found %d matching records in %s", len(ids), data_objects_set_name)
        # `record_id` (not `id`) to avoid shadowing the builtin.
        for record_id in ids:
            self.delete_matching_record_from_id(data_objects_set_name, record_id, delete=delete, should_log_id=True)

    def __log_record_deletion_information_many(self, collection_name: str, ids: List[str]) -> None:
        # Log each would-be deletion individually so the log file records every affected ID.
        for record_id in ids:
            self.__log_record_deletion_information(collection_name, record_id)

    def __log_record_deletion_information(self, collection_name: str, id: str) -> None:
        logger.info("Deleting record with ID: %s from %s", id, collection_name)

    @staticmethod
    def get_nmdc_db(mongo_uri: str) -> "NMDCAccessor":
        """
        Build an accessor for the `nmdc` database at the given URI.

        :param mongo_uri: MongoDB connection string.
        :return: An `NMDCAccessor` wrapping the `nmdc` database.
        """
        db = "nmdc"

        client = pymongo.MongoClient(
            mongo_uri,
            directConnection=True,
        )

        return NMDCAccessor(client[db])


def main():
    """Entry point: parse CLI options, then delete (or dry-run) all metaP records."""
    options = args()

    accessor = NMDCAccessor.get_nmdc_db(mongo_uri=options.mongo_uri)

    # Being very explicit about the deletion of records: deletion happens
    # only when this is NOT a dry run.
    should_delete = not options.dry_run

    if should_delete:
        logger.info("Deleting all records")
    else:
        logger.info("Dry run: no records will be deleted")

    accessor.delete_all_metaproteomics_records(delete=should_delete)


# Script entry point.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 249034d

Please sign in to comment.