From 104763eb32382bfcc5f05b830dbc00db5f91afab Mon Sep 17 00:00:00 2001 From: Stephen Ramsey Date: Thu, 14 Nov 2024 05:46:38 +0000 Subject: [PATCH] #420 --- build/Snakefile-conversion | 12 ++++++++++++ build/Snakefile-post-etl | 1 + build/snakemake-config-var.yaml | 10 ++++++++++ kg2_util.py | 16 ++++++++++++---- maps/curies-to-urls-map.yaml | 2 ++ setup/setup-kg2-build.sh | 5 +++-- 6 files changed, 40 insertions(+), 6 deletions(-) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index 227d5ade..ad95fe95 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -97,6 +97,18 @@ rule ChEMBL_Conversion: shell: config['PYTHON_COMMAND'] + " {input.code} " + config['MYSQL_CONF'] + " " + config['CHEMBL_MYSQL_DBNAME'] + " {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1" +rule UNII_Conversion: + input: + code = config['UNII_CONVERSION_SCRIPT'], + real = config['UNII_TSV_FILE'], + validation = config['VALIDATION_PLACEHOLDER'] + output: + nodes = config['UNII_OUTPUT_NODES_FILE'] + log: + config['UNII_CONVERSION_LOG'] + shell: + config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} " + config['TEST_ARG'] + " > {log} 2>&1" + rule NCBIGene_Conversion: input: code = config['NCBIGENE_CONVERSION_SCRIPT'], diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index dab60237..bcb655cb 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -41,6 +41,7 @@ rule Merge: disgenet_edges = config['DISGENET_OUTPUT_EDGES_FILE'], kegg_nodes = config['KEGG_OUTPUT_NODES_FILE'], kegg_edges = config['KEGG_OUTPUT_EDGES_FILE'], + unii_nodes = config['UNII_OUTPUT_NODES_FILE'], clinicaltrialskg_nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'], clinicaltrialskg_edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE'] output: diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 2943089f..65ba6bcd 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -101,6 +101,16 @@ ncbigene_conversion_log: ${BUILD_DIR}/${ncbigene_conversion_base}${version_suffi ncbigene_output_nodes_file: ${BUILD_DIR}/${ncbigene_output_base}${nodes_suffix}${test_suffix}.jsonl ncbigene_output_edges_file: ${BUILD_DIR}/${ncbigene_output_base}${edges_suffix}${test_suffix}.jsonl +unii_extraction_base: extract-unii +unii_conversion_base: unii_tsv_to_kg_jsonl +unii_output_base: kg2-unii +unii_extraction_script: ${EXTRACT_CODE_DIR}/${unii_extraction_base}.sh +unii_extraction_log: ${BUILD_DIR}/${unii_extraction_base}${version_suffix}${test_suffix}.log +unii_tsv_file: ${BUILD_DIR}/unii/unii.tsv +unii_conversion_script: ${CONVERT_CODE_DIR}/${unii_conversion_base}.py +unii_conversion_log: ${BUILD_DIR}/${unii_conversion_base}${version_suffix}${test_suffix}.log +unii_output_nodes_file: ${BUILD_DIR}/${unii_output_base}${nodes_suffix}${test_suffix}.jsonl + dgidb_extraction_base: extract-dgidb dgidb_conversion_base: dgidb_tsv_to_kg_jsonl dgidb_output_base: kg2-dgidb diff --git a/kg2_util.py b/kg2_util.py index bfb7e2a8..7740f10f 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -125,6 +125,7 @@ CURIE_PREFIX_UMLS_STY = 'STY' CURIE_PREFIX_UMLS_SOURCE = 'umls_source' CURIE_PREFIX_UNICHEM_SOURCE = 'UNICHEM_source' +CURIE_PREFIX_UNII = 'UNII' CURIE_PREFIX_UNIPROT = 'UniProtKB' CURIE_PREFIX_VANDF = 'VANDF' @@ -178,6 +179,7 @@ BASE_URL_UMLS = BASE_BASE_URL_IDENTIFIERS_ORG + 'umls:' BASE_URL_UMLS_STY = 'http://purl.bioontology.org/ontology/STY/' BASE_URL_UNICHEM = 'https://www.ebi.ac.uk/unichem/' +BASE_URL_UNII = 'https://precision.fda.gov/uniisearch/srs/unii/' BASE_URL_UNIPROTKB = 'http://purl.uniprot.org/uniprot/' BIOLINK_CATEGORY_ANATOMICAL_ENTITY = 'anatomical entity' @@ -352,14 +354,20 @@ def close_single_jsonlines(info: tuple, output_file_name: str): temp_output_file.close() -def create_kg2_jsonlines(test_mode: bool = False): - return create_single_jsonlines(test_mode), create_single_jsonlines(test_mode) +def create_kg2_jsonlines(test_mode: bool = False, include_edges = True): + jl_nodes = create_single_jsonlines(test_mode) + if include_edges: + jl_edges = create_single_jsonlines(test_mode) + else: + jl_edges = None + return jl_nodes, jl_edges def close_kg2_jsonlines(nodes_info: tuple, edges_info: tuple, output_nodes_file_name: str, output_edges_file_name: str): close_single_jsonlines(nodes_info, output_nodes_file_name) - close_single_jsonlines(edges_info, output_edges_file_name) + if edges_info is not None: + close_single_jsonlines(edges_info, output_edges_file_name) def start_read_jsonlines(file_name: str, type=dict): @@ -732,4 +740,4 @@ def is_a_valid_http_url(id: str) -> bool: valid = id.startswith('http://') or id.startswith('https://') except validators.ValidationFailure: valid = False - return valid \ No newline at end of file + return valid diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 19faa9c9..de73a131 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -497,6 +497,8 @@ use_for_bidirectional_mapping: UMLSSC: http://purl.bioontology.org/ontology/STY/ - UNICHEM_source: "https://www.ebi.ac.uk/unichem/" + - + UNII: "https://precision.fda.gov/uniisearch/srs/unii/" - UniProtKB: "http://purl.uniprot.org/uniprot/" - diff --git a/setup/setup-kg2-build.sh b/setup/setup-kg2-build.sh index f119fc52..50a9ac00 100755 --- a/setup/setup-kg2-build.sh +++ b/setup/setup-kg2-build.sh @@ -77,7 +77,8 @@ sudo apt-get install -y \ automake \ git \ libssl-dev \ - make + make \ + unzip sudo debconf-set-selections <<< "mysql-server mysql-server/root_password password ${mysql_password}" sudo debconf-set-selections <<< "mysql-server mysql-server/root_password_again password ${mysql_password}" @@ -157,4 +158,4 @@ setup_kg2_build_part2 >> ${setup_log_file} 2>&1 if [[ "${build_flag}" != "ci" ]] then ${s3_cp_cmd} ${setup_log_file} s3://${s3_bucket_versioned}/ -fi \ No newline at end of file +fi