-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #274 from microbiomedata/239-update-import-logic-t…
…o-update-omicsprocessing-record 239 update import logic to update omicsprocessing record
- Loading branch information
Showing
33 changed files
with
1,204 additions
and
790 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
""" Data classes for NMDC automation. """ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
""" Factory methods for NMDC models. """ | ||
import importlib.resources | ||
from typing import Any, Dict, Union | ||
import linkml_runtime | ||
import linkml.validator | ||
import importlib.resources | ||
from functools import lru_cache | ||
from linkml_runtime.dumpers import yaml_dumper | ||
import yaml | ||
|
||
|
||
from nmdc_schema.nmdc import DataGeneration, FileTypeEnum, MagsAnalysis, MetagenomeAnnotation, MetagenomeAssembly, \ | ||
MetatranscriptomeAnnotation, MetatranscriptomeAssembly, MetatranscriptomeExpressionAnalysis, NucleotideSequencing, \ | ||
ReadBasedTaxonomyAnalysis, ReadQcAnalysis, WorkflowExecution | ||
import nmdc_schema.nmdc as nmdc | ||
|
||
|
||
@lru_cache(maxsize=None)
def get_nmdc_materialized():
    """
    Load the materialized-patterns NMDC schema shipped inside ``nmdc_schema``.

    The YAML resource is read and parsed on the first call only; the
    ``lru_cache`` decorator returns the same parsed object on later calls.
    """
    resource = importlib.resources.open_text("nmdc_schema", "nmdc_materialized_patterns.yaml")
    with resource as schema_file:
        materialized = yaml.safe_load(schema_file)
    return materialized
|
||
def workflow_process_factory(
    record: Dict[str, Any], validate: bool = False
) -> Union[DataGeneration, WorkflowExecution]:
    """
    Factory function to create a PlannedProcess subclass object from a record.

    The record is normalized first, then the class to build is chosen from
    the normalized "type" field. The result is either a WorkflowExecution or
    a DataGeneration object.

    Args:
        record: Raw record; must contain a "type" field.
        validate: When True, the normalized record is validated against the
            materialized NMDC schema before the object is constructed.

    Returns:
        An instance of the class registered for the record's type.

    Raises:
        ValueError: If the type is not one of the supported process types, or
            if validation was requested and reported at least one problem.
        KeyError: If the record has no "type" field.
    """
    process_types = {
        "nmdc:MagsAnalysis": MagsAnalysis,
        "nmdc:MetagenomeAnnotation": MetagenomeAnnotation,
        "nmdc:MetagenomeAssembly": MetagenomeAssembly,
        "nmdc:MetatranscriptomeAnnotation": MetatranscriptomeAnnotation,
        "nmdc:MetatranscriptomeAssembly": MetatranscriptomeAssembly,
        "nmdc:MetatranscriptomeExpressionAnalysis": MetatranscriptomeExpressionAnalysis,
        "nmdc:NucleotideSequencing": NucleotideSequencing,
        "nmdc:ReadBasedTaxonomyAnalysis": ReadBasedTaxonomyAnalysis,
        "nmdc:ReadQcAnalysis": ReadQcAnalysis,
    }
    record = _normalize_record(record)
    record_type = record["type"]

    # Resolve the class before validating: an unsupported or malformed type
    # then always fails with the same ValueError, never with an IndexError
    # from the split below or an unrelated validation message.
    try:
        cls = process_types[record_type]
    except KeyError as err:
        raise ValueError(f"Invalid workflow execution type: {record_type}") from err

    if validate:
        # Every key of process_types has the form "nmdc:<ClassName>", so the
        # split cannot fail once the lookup above has succeeded.
        target_class = record_type.split(":")[1]
        # The schema is only needed for validation, so it is loaded here
        # rather than on every call.
        nmdc_materialized = get_nmdc_materialized()
        validation_report = linkml.validator.validate(record, nmdc_materialized, target_class)
        if validation_report.results:
            raise ValueError(f"Validation error: {validation_report.results[0].message}")

    return cls(**record)
|
||
|
||
def _normalize_record(record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a normalized copy of the record without modifying the input.

    Normalization removes the ``_id`` field, removes the legacy "Activity"
    text from the ``type`` value, strips empty values, and applies the
    MagsAnalysis-specific fixes when the type is ``nmdc:MagsAnalysis``.

    Raises:
        KeyError: If the record has no "type" field.
    """
    # Work on a shallow copy so that dropping _id and rewriting type do not
    # leak into the caller's dict. Nested dicts and lists are rebuilt by
    # _strip_empty_values, so later in-place fixes do not touch the input.
    record = dict(record)
    record.pop("_id", None)
    # for backwards compatibility strip Activity from the type
    record["type"] = record["type"].replace("Activity", "")
    normalized_record = _strip_empty_values(record)

    # type-specific normalization
    if normalized_record["type"] == "nmdc:MagsAnalysis":
        normalized_record = _normalize_mags_record(normalized_record)

    return normalized_record
|
||
|
||
def _normalize_mags_record(record: Dict[str, Any]) -> Dict[str, Any]: | ||
""" Normalize the record for a MagsAnalysis object """ | ||
for i, mag in enumerate(record.get("mags_list", [])): | ||
if not mag.get("type"): | ||
# Update the original dictionary in the list | ||
record["mags_list"][i]["type"] = "nmdc:MagBin" | ||
# for backwards compatibility normalize num_tRNA to num_t_rna | ||
if "num_tRNA" in mag: | ||
record["mags_list"][i]["num_t_rna"] = mag.pop("num_tRNA") | ||
# add type to eukaryotic_evaluation if it exists | ||
if "eukaryotic_evaluation" in mag: | ||
record["mags_list"][i]["eukaryotic_evaluation"]["type"] = "nmdc:EukEval" | ||
# gene count should be a positive integer - remove if 'null' | ||
if "gene_count" in mag and mag["gene_count"] == "null": | ||
mag.pop("gene_count") | ||
return record | ||
|
||
|
||
def _strip_empty_values(d: Dict[str, Any]) -> Dict[str, Any]: | ||
""" Strip empty values from a record """ | ||
empty_values = [None, "", []] | ||
def clean_dict(d): | ||
if isinstance(d, dict): | ||
return {k: clean_dict(v) for k, v in d.items() if v not in empty_values} | ||
elif isinstance(d, list): | ||
return [clean_dict(v) for v in d if v not in empty_values] | ||
return d | ||
return clean_dict(d) | ||
|
||
|
||
class DataObject(nmdc.DataObject):
    """
    NMDC DataObject dataclass extended with a helper for serialization.
    """

    def __init__(self, **record):
        """ Build the object from the fields of a record given as keyword arguments """
        # _id is a MongoDB field that makes the parent class fail to initialize
        record.pop("_id", None)
        # records without a type are treated as plain DataObjects
        record.setdefault("type", "nmdc:DataObject")
        super().__init__(**record)

    def as_dict(self) -> Dict[str, Any]:
        """ Return the object as a dictionary by dumping it to YAML and loading it back """
        serialized = yaml_dumper.dumps(self)
        return yaml.safe_load(serialized)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.