-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Code for NMDC to NCBI export #518
Changes from 18 commits
4c77eea
43f6baf
2653223
5d81676
7b714a5
d992f1f
71be54f
7fb364f
ae062d1
24e8ad9
48e7be3
b7a8000
dae0d13
dfe5f41
4ceb10a
c7d3da9
41a5468
e57b41c
e9ac70d
0ab92b3
151de04
c983237
e5421a3
80c4339
adcd8e6
5a9e749
433e317
bc3fea8
e82780f
b9c6d38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,295 @@ | ||
import datetime | ||
import xml.etree.ElementTree as ET | ||
import xml.dom.minidom | ||
|
||
from nmdc_runtime.site.export.ncbi_xml_utils import ( | ||
handle_controlled_identified_term_value, | ||
handle_controlled_term_value, | ||
handle_geolocation_value, | ||
handle_quantity_value, | ||
handle_text_value, | ||
handle_timestamp_value, | ||
handle_float_value, | ||
handle_string_value, | ||
load_mappings, | ||
validate_xml, | ||
) | ||
from nmdc_runtime.site.export.nmdc_api_client import NMDCApiClient | ||
|
||
|
||
class NCBISubmissionXML: | ||
def __init__(self, ncbi_submission_fields: dict): | ||
self.root = ET.Element("Submission") | ||
self.nmdc_study_id = ncbi_submission_fields.get("nmdc_study_id") | ||
self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_fields.get( | ||
"nmdc_ncbi_attribute_mapping_file_url" | ||
) | ||
self.ncbi_submission_metadata = ncbi_submission_fields.get( | ||
"ncbi_submission_metadata", {} | ||
) | ||
self.ncbi_bioproject_metadata = ncbi_submission_fields.get( | ||
"ncbi_bioproject_metadata", {} | ||
) | ||
self.ncbi_biosample_metadata = ncbi_submission_fields.get( | ||
"ncbi_biosample_metadata", {} | ||
) | ||
self.nmdc_api_client = NMDCApiClient() | ||
|
||
# dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute | ||
# type handlers | ||
self.type_handlers = { | ||
"QuantityValue": handle_quantity_value, | ||
"TextValue": handle_text_value, | ||
"TimestampValue": handle_timestamp_value, | ||
"ControlledTermValue": handle_controlled_term_value, | ||
"ControlledIdentifiedTermValue": handle_controlled_identified_term_value, | ||
"GeolocationValue": handle_geolocation_value, | ||
"float": handle_float_value, | ||
"string": handle_string_value, | ||
} | ||
|
||
def set_element(self, tag, text="", attrib=None, children=None): | ||
attrib = attrib or {} | ||
children = children or [] | ||
element = ET.Element(tag, attrib=attrib) | ||
element.text = text | ||
for child in children: | ||
element.append(child) | ||
return element | ||
|
||
def set_description(self, email, user, first, last, org, date=None): | ||
date = date or datetime.datetime.now().strftime("%Y-%m-%d") | ||
description = self.set_element( | ||
"Description", | ||
children=[ | ||
self.set_element( | ||
"Comment", f"NMDC Submission for {self.nmdc_study_id}" | ||
), | ||
self.set_element("Submitter", attrib={"user_name": user}), | ||
self.set_element( | ||
"Organization", | ||
attrib={"role": "owner", "type": "center"}, | ||
children=[ | ||
self.set_element("Name", org), | ||
self.set_element( | ||
"Contact", | ||
attrib={"email": email}, | ||
children=[ | ||
self.set_element( | ||
"Name", | ||
children=[ | ||
self.set_element("First", first), | ||
self.set_element("Last", last), | ||
], | ||
) | ||
], | ||
), | ||
], | ||
), | ||
self.set_element("Hold", attrib={"release_date": date}), | ||
], | ||
) | ||
self.root.append(description) | ||
|
||
def set_descriptor(self, title, description): | ||
descriptor_elements = [] | ||
descriptor_elements.append(self.set_element("Title", title)) | ||
descriptor_elements.append( | ||
self.set_element( | ||
"Description", children=[self.set_element("p", description)] | ||
) | ||
) | ||
|
||
return descriptor_elements | ||
|
||
def set_bioproject(self, title, project_id, description, data_type, org): | ||
action = self.set_element("Action") | ||
add_data = self.set_element("AddData", attrib={"target_db": "BioProject"}) | ||
|
||
data_element = self.set_element("Data", attrib={"content_type": "XML"}) | ||
xml_content = self.set_element("XmlContent") | ||
project = self.set_element("Project", attrib={"schema_version": "2.0"}) | ||
|
||
project_id_element = self.set_element("ProjectID") | ||
spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org}) | ||
project_id_element.append(spuid) | ||
|
||
descriptor = self.set_descriptor(title, description) | ||
project_type = self.set_element("ProjectType") | ||
project_type_submission = self.set_element( | ||
"ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"} | ||
pkalita-lbl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
intended_data_type_set = self.set_element("IntendedDataTypeSet") | ||
data_type_element = self.set_element("DataType", data_type) | ||
|
||
intended_data_type_set.append(data_type_element) | ||
project_type_submission.append(intended_data_type_set) | ||
project_type.append(project_type_submission) | ||
|
||
project.extend([project_id_element] + descriptor + [project_type]) | ||
|
||
xml_content.append(project) | ||
data_element.append(xml_content) | ||
add_data.append(data_element) | ||
|
||
identifier = self.set_element("Identifier") | ||
spuid_identifier = self.set_element( | ||
"SPUID", project_id, {"spuid_namespace": org} | ||
) | ||
identifier.append(spuid_identifier) | ||
add_data.append(identifier) | ||
|
||
action.append(add_data) | ||
self.root.append(action) | ||
|
||
def set_biosample( | ||
self, | ||
organism_name, | ||
package, | ||
org, | ||
nmdc_biosamples, | ||
): | ||
attribute_mappings, slot_range_mappings = load_mappings( | ||
self.nmdc_ncbi_attribute_mapping_file_url | ||
) | ||
Comment on lines
+164
to
+166
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would try to not tie the process of obtaining the mapping directly into this class. Let this class be just about transformation. Let the client of the class worry about how to get the mapping and pass it (the mapping itself, not a URL) to this class. |
||
|
||
for biosample in nmdc_biosamples: | ||
attributes = {} | ||
sample_id_value = None | ||
|
||
for json_key, value in biosample.items(): | ||
if isinstance(value, list): | ||
continue # Skip processing for list values | ||
|
||
# Special handling for NMDC Biosample "id" | ||
if json_key == "id": | ||
sample_id_value = value | ||
continue | ||
|
||
if json_key not in attribute_mappings: | ||
continue | ||
|
||
xml_key = attribute_mappings[json_key] | ||
value_type = slot_range_mappings.get(json_key, "string") | ||
handler = self.type_handlers.get(value_type, handle_string_value) | ||
|
||
formatted_value = handler(value) | ||
attributes[xml_key] = formatted_value | ||
|
||
# Create the BioSample XML block with these attributes for each biosample | ||
biosample_elements = [ | ||
self.set_element( | ||
"SampleId", | ||
children=[ | ||
self.set_element( | ||
"SPUID", sample_id_value, {"spuid_namespace": org} | ||
) | ||
], | ||
), | ||
self.set_element( | ||
"Descriptor", | ||
children=[ | ||
self.set_element( | ||
"Title", | ||
f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study", | ||
), | ||
], | ||
), | ||
self.set_element( | ||
"Organism", | ||
children=[self.set_element("OrganismName", organism_name)], | ||
), | ||
self.set_element("Package", package), | ||
self.set_element( | ||
"Attributes", | ||
children=[ | ||
self.set_element( | ||
"Attribute", attributes[key], {"attribute_name": key} | ||
) | ||
for key in sorted(attributes) | ||
], | ||
), | ||
] | ||
|
||
action = self.set_element( | ||
"Action", | ||
children=[ | ||
self.set_element( | ||
"AddData", | ||
attrib={"target_db": "BioSample"}, | ||
children=[ | ||
self.set_element( | ||
"Data", | ||
attrib={"content_type": "XML"}, | ||
children=[ | ||
self.set_element( | ||
"XmlContent", | ||
children=[ | ||
self.set_element( | ||
"BioSample", | ||
attrib={"schema_version": "2.0"}, | ||
children=biosample_elements, | ||
), | ||
], | ||
), | ||
], | ||
), | ||
self.set_element( | ||
"Identifier", | ||
children=[ | ||
self.set_element( | ||
"SPUID", | ||
sample_id_value, | ||
{"spuid_namespace": org}, | ||
), | ||
], | ||
), | ||
], | ||
), | ||
], | ||
) | ||
self.root.append(action) | ||
|
||
def get_submission_xml(self): | ||
self.set_description( | ||
email=self.ncbi_submission_metadata.get("email", ""), | ||
user=self.ncbi_submission_metadata.get("user", ""), | ||
first=self.ncbi_submission_metadata.get("first", ""), | ||
last=self.ncbi_submission_metadata.get("last", ""), | ||
org=self.ncbi_submission_metadata.get("organization", ""), | ||
) | ||
|
||
self.set_bioproject( | ||
title=self.ncbi_bioproject_metadata.get("title", ""), | ||
project_id=self.ncbi_bioproject_metadata.get("project_id", ""), | ||
description=self.ncbi_bioproject_metadata.get("description", ""), | ||
data_type=self.ncbi_bioproject_metadata.get("data_type", ""), | ||
org=self.ncbi_submission_metadata.get("organization", ""), | ||
) | ||
|
||
biosamples_list = self.nmdc_api_client.get_biosamples_part_of_study( | ||
self.nmdc_study_id | ||
) | ||
sujaypatil96 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
self.set_biosample( | ||
organism_name=self.ncbi_biosample_metadata.get("organism_name", ""), | ||
package=self.ncbi_biosample_metadata.get("package", ""), | ||
org=self.ncbi_submission_metadata.get("organization", ""), | ||
nmdc_biosamples=biosamples_list, | ||
) | ||
|
||
rough_string = ET.tostring(self.root, "unicode") | ||
reparsed = xml.dom.minidom.parseString(rough_string) | ||
submission_xml = reparsed.toprettyxml(indent=" ", newl="\n") | ||
Comment on lines
+419
to
+421
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason for this serialize-deserialize step? Is it just to get nicely formatted XML? If so, would the |
||
|
||
# ============= Uncomment the following code to validate the XML against NCBI XSDs ============ # | ||
# submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co" | ||
# submission_xsd_validation = validate_xml(submission_xml, submission_xsd_url) | ||
|
||
# bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/bioproject.xsd?view=co" | ||
pkalita-lbl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# bioproject_xsd_validation = validate_xml(submission_xml, bioproject_xsd_url) | ||
|
||
# biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/biosample.xsd?view=co" | ||
pkalita-lbl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# biosample_xsd_validation = validate_xml(submission_xml, biosample_xsd_url) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't leave commented-out code like this in. But I would turn these into unit tests. |
||
|
||
return submission_xml |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some quibbles about naming.
NCBISubmissionXML
feels more representative of what this class produces. So what is the class itself? Something likeNCBISubmissionGenerator
orNCBISubmissionTranslator
?Also this class has a lot of methods named
set_*
. When I see a method with such a name I expect it to set some piece of internal class state and return nothing. Instead, some of these methods (set_element
,set_descriptor
) produce XML elements and return them. Other (set_description
,set_bioproject
,set_biosample
) produce XML elements and append them to the XML root. I think this code might be a bit more readable if all those methods were named something likebuild_*
orgenerate_*
and always return what they produced. Then the caller (get_submission_xml
) would be responsible for taking the return values and use them as needed (appending to the root element).