From 04248158db3a88be182e0a1dfd68ba0d35ee1ebc Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 22 Mar 2023 22:03:18 -0700 Subject: [PATCH 01/60] Create README.md Note: need to add Notes and Caveats as well as commands for running and testing scripts. Also need to add tests and testing files --- scripts/biomedical/ICTV_Taxonomy/README.md | 116 +++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/README.md diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md new file mode 100644 index 0000000000..7c9acca85e --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -0,0 +1,116 @@ + +# Importing ontology dataset of molecular interaction from the European Bioinformatics Institute (EMBL-EBI) + +## Table of Contents + +1. [About the Dataset](#about-the-dataset) + 1. [Download URL](#download-urls) + 2. [Overview](#overview) + 3. [Notes and Caveats](#notes-and-caveats) + 4. [License](#license) + 5. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) +2. [About the Import](#about-the-import) + 1. [Artifacts](#artifacts) + 2. [Import Procedure](#import-procedure) + 3. [Tests](#tests) + + +## About the Dataset +“The [International Committee on Taxonomy of Viruses (ICTV)](https://ictv.global/) authorizes and organizes the taxonomic classification of and the nomenclatures for viruses. The ICTV has developed a universal taxonomic scheme for viruses, and thus has the means to appropriately describe, name, and classify every virus that affects living organisms. The members of the International Committee on Taxonomy of Viruses are considered expert virologists. The ICTV was formed from and is governed by the Virology Division of the International Union of Microbiological Societies. 
Detailed work, such as delimiting the boundaries of species within a family, typically is performed by study groups of experts in the families.” Description from [Wikipedia](https://en.wikipedia.org/wiki/International_Committee_on_Taxonomy_of_Viruses). + +The ICTV Master Species List is curated by virology experts, which have established over 100 international study groups, which organize discussions on emerging taxonomic issues in their field, oversee the submission of proposals for new taxonomy, and prepare or revise the relevant chapter(s) in ICTV reports. ICTV is open to submissions of proposals for taxonomic changes from an individual, however in practice proposals are usually submitted by members of the relevant study groups. + +### Download URLs + +The release history and the most recent release of the Master Species List can be found [here](https://ictv.global/msl). + +The release history and the most recent release of the Virus Metadata Resource can be found [here](https://ictv.global/vmr). + + +### Overview + +This directory stores all scripts used to import data on viruses and virus isolates from the ICTV. This includes the master species list, which includes the full viral taxonomy (realm -> species) and information on the genomic composition and taxonomic history for all species. The import also includes the Virus Metadata Resource, which includes information regarding the exemplar isolates for each species selected by the ICTV and additional virus isolates within the ICTV dataset. + + +### Notes and Caveats + + + +### License + +The data is published under the Creative Commons Attribution ShareAlike 4.0 International [(CC BY-SA 4.0)] (https://creativecommons.org/licenses/by-sa/4.0/). + +### Dataset Documentation and Relevant Links + +- Documentation can be found in one of the excel sheets in a downloaded dataset from ICTV.
+- Taxonomy Browser User Interface: https://ictv.global/taxonomy + +## About the import + +### Artifacts + +#### New Classes + +Virus, VirusIsolate, VirusGenomeSegment + +#### New Properties + +- Virus: proposalForLastChange, taxonHistoryURL, versionOfLastChange, virusGenomeComposition, virusHost, virusLastTaxonomicChange, virusSource, virusRealm, virusSubrealm, virusKingdom, virusSubkingdom, virusPhylum, virusSubphylum, virusClass, virusSubclass, virusOrder, virusSuborder, virusFamily, virusSubfamily, virusGenus, virusSubgenus, virusSpecies +- VirusIsolate: genomeCoverage, isExemplarVirusIsolate, ofVirusSpecies, virusIsolateDesignation +- VirusGenomeSegment: genomeSegmentOf + +#### New Enumerations + +GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum + +#### Schema MCFs + +[ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) +[ICTV_schema_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_enum.mcf) + +#### tMCFs + +- [VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf) +- [VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf) +- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf) + +#### Scripts + +- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/download.sh) +- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_master_species_list.py) +- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_metadata_resource.py) + +### Import Procedure + +To download the most recent versions of the Master 
Species List and Virus Metadata Resource from ICTV run: + +```bash +download.sh +``` + +To test the script, run: + +```bash +``` + +### Tests + +#### Dataset Specific Tests + +To test the import to evaluate whether the data is formatted as expected or if changes were made in the formatting in the most recent release run the following commands to evaluate each cleaned csv individually. + +VirusSpecies: +'''bash +''' + +VirusIsolates: +'''bash +''' + +VirusGenomeSegment: +'''bash +''' + +#### Data Commons Import Tests + +Please run all cleaned CSV + tMCF pairs through our Data Commons import tool to run general Data Commons formatting tests. From c4e65ed11e4e2a9aa006df6101e3bd19f25141f4 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 22 Mar 2023 22:05:28 -0700 Subject: [PATCH 02/60] Add VirusMasterSpeciesList.tmcf --- .../tMCF/VirusMasterSpeciesList.tmcf | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf new file mode 100644 index 0000000000..623dc759ed --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf @@ -0,0 +1,24 @@ +Node: E:VirusSpecies->E1 +typeOf: dcs:Virus +dcid: C:VirusSpecies->dcid +name: C:VirusSpecies->species +proposalForLastChange: C:VirusSpecies->proposalForLastChange +taxonHistoryURL: C:VirusSpecies->taxonHistoryURL +versionOfLastChange: C:VirusSpecies->lastChangeVersion +virusClass: C:VirusSpecies->class +virusFamily: C:VirusSpecies->family +virusGenomeComposition: C:VirusSpecies->genomeComposition +virusGenus: C:VirusSpecies->genus +virusKingdom: C:VirusSpecies->kingdom +virusLastTaxonomicChange: C:VirusSpecies->lastChange +virusOrder: C:VirusSpecies->order +virusPhylum: C:VirusSpecies->phylum +virusRealm: C:VirusSpecies->realm +virusSpecies: C:VirusSpecies->species 
+virusSubclass: C:VirusSpecies->subclass +virusSubfamily: C:VirusSpecies->subfamily +virusSubgenus: C:VirusSpecies->subgenus +virusSubkingdom: C:VirusSpecies->subkingdom +virusSuborder: C:VirusSpecies->suborder +virusSubphylum: C:VirusSpecies->subphylum +virusSubrealm: C:VirusSpecies->subrealm From 08d04b0965d88ddfea2bbbe5585a68b269e5722c Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 22 Mar 2023 22:06:49 -0700 Subject: [PATCH 03/60] Add tmcf files --- .../tMCF/VirusGenomeSegment.tmcf | 7 ++++ .../ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf | 35 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf create mode 100644 scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf new file mode 100644 index 0000000000..d9288e2ce7 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf @@ -0,0 +1,7 @@ +Node: E:VirusGenomeSegments->E1 +typeOf: dcs:VirusGenomeSegment +dcid: C:VirusGenomeSegments->dcid +name: C:VirusGenomeSegments->name +genBankAccession: C:VirusGenomeSegments->genBankAccession +genomeSegmentOf: C:VirusGenomeSegments->genomeSegmentOf +refSeqAccession: C:VirusGenomeSegments->refSeqAccession diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf new file mode 100644 index 0000000000..26d13211cd --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf @@ -0,0 +1,35 @@ +Node: E:VirusTaxonomy->E1 +typeOf: dcs:Virus +dcid: C:VirusTaxonomy->dcid +name: C:VirusTaxonomy->species +abbreviation: C:VirusTaxonomy->abbreviation +alternateName: C:VirusTaxonomy->name +virusClass: C:VirusTaxonomy->class +virusFamily: C:VirusTaxonomy->family +virusGenomeComposition: C:VirusTaxonomy->genomeComposition +virusGenus: C:VirusTaxonomy->genus 
+virusHost: C:VirusTaxonomy->host +virusKingdom: C:VirusTaxonomy->kingdom +virusOrder: C:VirusTaxonomy->order +virusPhylum: C:VirusTaxonomy->phylum +virusRealm: C:VirusTaxonomy->realm +virusSource: C:VirusTaxonomy->source +virusSpecies: C:VirusTaxonomy->species +virusSubclass: C:VirusTaxonomy->subclass +virusSubfamily: C:VirusTaxonomy->subfamily +virusSubgenus: C:VirusTaxonomy->subgenus +virusSubkingdom: C:VirusTaxonomy->subkingdom +virusSuborder: C:VirusTaxonomy->suborder +virusSubphylum: C:VirusTaxonomy->subphylum +virusSubrealm: C:VirusTaxonomy->subrealm + +Node: E:VirusTaxonomy->E2 +typeOf: dcs:VirusIsolate +dcid: C:VirusTaxonomy->isolate_dcid +name: C:VirusTaxonomy->isolate_name +genBankAccession: C:VirusTaxonomy->genBankAccession +genomeCoverage: C:VirusTaxonomy->genomeCoverage +isExemplarVirusIsolate: C:VirusTaxonomy->isExemplar +ofVirusSpecies: E:VirusTaxonomy->E1 +refSeqAccession: C:VirusTaxonomy->refSeqAccession +virusIsolateDesignation: C:VirusTaxonomy->isolateDesignation From caa57590e90dac533bc98e5e676c4d506ff7cb5e Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Thu, 23 Mar 2023 15:03:57 -0700 Subject: [PATCH 04/60] Update title --- scripts/biomedical/ICTV_Taxonomy/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 7c9acca85e..597055efd1 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -1,5 +1,5 @@ -# Importing ontology dataset of molecular interaction from the European Bioinformatics Institute (EMBL-EBI) +# Importing Master Species List and Virus Metadata Resource from the International Committee on Taxonomy of Viruses (ICTV) ## Table of Contents @@ -38,7 +38,7 @@ This directory stores all scripts used to import data on viurses and virus isola ### License -The data is published under the Creative Commons Attribution ShareAlike 4.0 International [(CC BY-SA 4.0)]
(https://creativecommons.org/licenses/by-sa/4.0/). +The data is published under the Creative Commons Attribution ShareAlike 4.0 International [(CC BY-SA 4.0)](https://creativecommons.org/licenses/by-sa/4.0/). ### Dataset Documentation and Relevant Links From 4ecc413505ade1003aa40da293ed8e0be03a21fc Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Thu, 23 Mar 2023 15:07:34 -0700 Subject: [PATCH 05/60] Add VMR dataset description --- scripts/biomedical/ICTV_Taxonomy/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 597055efd1..d287fcb8f0 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -15,11 +15,13 @@ 3. [Tests](#tests) -## About the Dataset +## About the Datasets “The [International Committee on Taxonomy of Viruses (ICTV)](https://ictv.global/) authorizes and organizes the taxonomic classification of and the nomenclatures for viruses. The ICTV has developed a universal taxonomic scheme for viruses, and thus has the means to appropriately describe, name, and classify every virus that affects living organisms. The members of the International Committee on Taxonomy of Viruses are considered expert virologists. The ICTV was formed from and is governed by the Virology Division of the International Union of Microbiological Societies. Detailed work, such as delimiting the boundaries of species within a family, typically is performed by study groups of experts in the families.” Description from [Wikipedia](https://en.wikipedia.org/wiki/International_Committee_on_Taxonomy_of_Viruses). The ICTV Master Species List is curated by virology experts, which have established over 100 international study groups, which organize discussions on emerging taxonomic issues in their field, oversee the submission of proposals for new taxonomy, and prepare or revise the relevant chapter(s) in ICTV reports. 
ICTV is open to submissions of proposals for taxonomic changes from an individual, however in practice proposals are usually submitted by members of the relevant study groups. +The ICTV chooses an exemplar virus for each species and the Virus Metadata Resource provides a list of these exemplars. An exemplar virus serves as an example of a well-characterized virus isolate of that species and includes the GenBank accession number for the genomic sequence of the isolate as well as the virus name, isolate designation, suggested abbreviation, genome composition, and host source. + ### Download URLs The release history and the most recent release of the Master Species List can be found [here](https://ictv.global/msl). From 55fbb5168f1cbd19f8281db51aa31ab6c4210c80 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 27 Mar 2023 23:39:16 -0700 Subject: [PATCH 06/60] Mention script formatting taxonomic ranking enums --- scripts/biomedical/ICTV_Taxonomy/README.md | 47 +++++++++++++++------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index d287fcb8f0..771760ae4c 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -69,28 +69,45 @@ GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum [ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) [ICTV_schema_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_enum.mcf) +[ICTV_schema_taxonomic_ranking_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_taxonomic_ranking_enum.mcf #### tMCFs -- [VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf) -- 
[VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf) -- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf) +- [VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusMasterSpeciesList.tmcf) +- [VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusTaxonomy.tmcf) +- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegment.tmcf) #### Scripts -- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/download.sh) -- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_master_species_list.py) -- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_metadata_resource.py) +- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh) +- [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) +- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py) +- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py) + +#### Log Files + +- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_metadata_resource.log) ### Import Procedure -To download the most 
recent versions of the Master Species List and Virus Metadata Resource from ICTV run: +Download the most recent versions of the Master Species List and Virus Metadata Resource from ICTV by running: ```bash download.sh ``` -To test the script, run: +Generate the enumeration schema MCF, which represents virus taxonomic ranks by running: + +```bash +python3 scripts/create_virus_taxonomic_ranking_enums.py import_files/ICTV_Master_Species_List_2021_v3.xlsx ICTV_schema_taxonomic_ranking_enum.mcf +``` + +Clean and format Master Species List as a CSV that matches the corresponding tMCF by running: + +```bash +``` + +Clean and format Virus Metadata Resource as a CSV that matches the corresponding tMCF by running: ```bash ``` @@ -102,17 +119,17 @@ To test the script, run: To test the import to evaluate whether the data is formatted as expected or if changes were made in the formatting in the most recent release run the following commands to evaluate each cleaned csv individually. VirusSpecies: -'''bash -''' +```bash +``` VirusIsolates: -'''bash -''' +```bash +``` VirusGenomeSegment: -'''bash -''' +```bash +``` #### Data Commons Import Tests -Please run all cleaned CSV + tMCF pairs through our Data Commons import tool to run general Data Commons formatting tests. +Please run all cleaned CSV + tMCF pairs through our lint test using our Data Commons import tool, which conducts general formatting tests.
From 68d0bc5a5d6a9be959890e33363df24d91c006a6 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 27 Mar 2023 23:40:21 -0700 Subject: [PATCH 07/60] format schema list --- scripts/biomedical/ICTV_Taxonomy/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 771760ae4c..5cbac9d808 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -67,9 +67,9 @@ GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum #### Schema MCFs -[ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) -[ICTV_schema_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_enum.mcf) -[ICTV_schema_taxonomic_ranking_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_taxonomic_ranking_enum.mcf +- [ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) +- [ICTV_schema_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_enum.mcf) +- [ICTV_schema_taxonomic_ranking_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_taxonomic_ranking_enum.mcf) #### tMCFs From 6e6d5debeddcbbd06856aa9aa88897bf988949ba Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 27 Mar 2023 23:43:19 -0700 Subject: [PATCH 08/60] update new enumerations lists --- scripts/biomedical/ICTV_Taxonomy/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 5cbac9d808..9a1a7bc650 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -65,6 +65,10 @@ Virus, VirusIsolate, VirusGenomeSegment GenomeCoverageEnum, 
VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum +#### New Enumerations Generated Via Script + +VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum + #### Schema MCFs - [ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) @@ -86,7 +90,7 @@ GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum #### Log Files -- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/format_virus_metadata_resource.log) +- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) ### Import Procedure From 0f92262a0d0e2354e62322c83430eda3e8a2653a Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 27 Mar 2023 23:46:10 -0700 Subject: [PATCH 09/60] update new schema summary formatting --- scripts/biomedical/ICTV_Taxonomy/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 9a1a7bc650..fc0494030d 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -51,21 +51,25 @@ The data is published under the Creative Commons Attribution ShareAlike 4.0 Inte ### Artifacts -#### New Classes +#### New Schema + +Classes, properties, and enumerations that were added in this import to represent the data + +##### Classes Virus, VirusIsolate, VirusGenomeSegment -#### New Properties +##### Properties - Virus: proposalForLastChange, taxonHistoryURL, versionOfLastChange, virusGenomeComposition, virusHost, virusLastTaxonomicChange, virusSource, virusRealm, virusSubrealm, 
virusKingdom, virusSubkingdom, virusPhylum, virusSubphylum, virusClass, virusSubclass, virusOrder, virusSuborder, virusFamily, virusSubfamily, virusGenus, virusSubgenus, virusSpecies - VirusIsolate: genomeCoverage, isExemplarVirusIsolate, ofVirusSpecies, virusIsolateDesignation - VirusGenomeSegment: genomeSegmentOf -#### New Enumerations +##### Enumerations GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum -#### New Enumerations Generated Via Script +##### Enumerations Generated Via Script VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum From ddb209ae3df681a5dea991eaef0f7b346721eb91 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 27 Mar 2023 23:50:09 -0700 Subject: [PATCH 10/60] update new schema overview formatting --- scripts/biomedical/ICTV_Taxonomy/README.md | 31 +++++++++------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index fc0494030d..063b1138ee 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -53,25 +53,18 @@ The data is published under the Creative Commons Attribution ShareAlike 4.0 Inte #### New Schema -Classes, properties, and enumerations that were added in this import to represent the data - -##### Classes - -Virus, VirusIsolate, VirusGenomeSegment - -##### Properties - -- Virus: proposalForLastChange, taxonHistoryURL, versionOfLastChange, virusGenomeComposition, virusHost, virusLastTaxonomicChange, virusSource, virusRealm, virusSubrealm, virusKingdom, virusSubkingdom, virusPhylum, virusSubphylum, virusClass, virusSubclass, virusOrder, virusSuborder, virusFamily, virusSubfamily, virusGenus, virusSubgenus, virusSpecies -- VirusIsolate: 
genomeCoverage, isExemplarVirusIsolate, ofVirusSpecies, virusIsolateDesignation -- VirusGenomeSegment: genomeSegmentOf - -##### Enumerations - -GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum - -##### Enumerations Generated Via Script - -VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum +Classes, properties, and enumerations that were added in this import to represent the data. + +* Classes + * Virus, VirusIsolate, VirusGenomeSegment +* Properties + * Virus: proposalForLastChange, taxonHistoryURL, versionOfLastChange, virusGenomeComposition, virusHost, virusLastTaxonomicChange, virusSource, virusRealm, virusSubrealm, virusKingdom, virusSubkingdom, virusPhylum, virusSubphylum, virusClass, virusSubclass, virusOrder, virusSuborder, virusFamily, virusSubfamily, virusGenus, virusSubgenus, virusSpecies + * VirusIsolate: genomeCoverage, isExemplarVirusIsolate, ofVirusSpecies, virusIsolateDesignation + * VirusGenomeSegment: genomeSegmentOf +* Enumerations + * GenomeCoverageEnum, VirusGenomeCompositionEnum, VirusHostEnum, VirusSourceEnum +* Enumerations Generated Via Script + * VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum #### Schema MCFs From 20ef620c0735933a1d1589608845da596a5f11f0 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 20:56:03 -0700 Subject: [PATCH 11/60] Add create_virus_taxonomic_ranking_enums.py --- .../create_virus_taxonomic_ranking_enums.py | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py diff 
--git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py
new file mode 100644
index 0000000000..5a5523a28e
--- /dev/null
+++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py
@@ -0,0 +1,128 @@
+# load environment
+import pandas as pd
+import sys
+
+
+# declare universal variables
+HEADER = [
+    'Sort',
+    'Realm',
+    'Subrealm',
+    'Kingdom',
+    'Subkingdom',
+    'Phylum',
+    'Subphylum',
+    'Class',
+    'Subclass',
+    'Order',
+    'Suborder',
+    'Family',
+    'Subfamily',
+    'Genus',
+    'Subgenus',
+    'Species',
+    'GenomeComposition',
+    'LastChange',
+    'LastChangeVersion',
+    'ProposalForLastChange',
+    'TaxonHistoryURL',
+]
+
+
+LIST_DROP = [
+    'Sort',
+    'Species',
+    'GenomeComposition',
+    'LastChange',
+    'LastChangeVersion',
+    'ProposalForLastChange',
+    'TaxonHistoryURL',
+]
+
+
+# declare functions
+def pascalcase(s):
+    """Join the whitespace-separated words of s, capitalizing each one."""
+    list_words = s.split()
+    return "".join(word[0].upper() + word[1:].lower() for word in list_words)
+
+
+def check_for_illegal_charc(s):
+    """Print an error if the candidate dcid s contains an illegal character."""
+    # every character is its own element; a missing comma between two
+    # adjacent literals would silently concatenate them into one entry
+    list_illegal = [
+        "'", "–", "*", ">", "<", "@", "]", "[", "|", ":", ";", " "
+    ]
+    if any(x in s for x in list_illegal):
+        print('Error! dcid contains illegal characters!', s)
+
+
+def initiate_enum_dict():
+    """Return {enum class name: {}} for every taxonomic rank column."""
+    list_levels = [i for i in HEADER if i not in LIST_DROP]
+    return {'Virus' + item + 'Enum': {} for item in list_levels}
+
+
+def add_enums_to_dicts(key, value, d):
+    """Register the enum dcid for value under rank key; skip empty cells."""
+    if pd.notna(value):
+        enum = 'Virus' + key + 'Enum'
+        dcid = 'Virus' + key + pascalcase(value)
+        check_for_illegal_charc(dcid)
+        d[enum][value] = dcid
+    return d
+
+
+def add_item_to_enums(df):
+    """Collect the enum members of every taxonomic rank found in df."""
+    list_levels = [i for i in HEADER if i not in LIST_DROP]
+    dict_of_dicts = initiate_enum_dict()
+    for _, row in df.iterrows():
+        for item in list_levels:
+            dict_of_dicts = add_enums_to_dicts(item, row[item], dict_of_dicts)
+    return dict_of_dicts
+
+
+def write_individual_entries_to_file(w, enum, d):
+    """Write one MCF node per enum member in d to the open file w."""
+    for key, value in d.items():
+        w.write('Node: dcid:' + value + '\n')
+        w.write('name: "' + key + '"\n')
+        w.write('typeOf: dcs:' + enum + '\n\n')
+    return w
+
+
+def write_dict_to_file(w, enum, d):
+    """Write the enum class node followed by all of its member nodes."""
+    w.write('# ' + enum + '\n')
+    w.write('Node: dcid:' + enum + '\n')
+    w.write('name: "' + enum + '"\n')
+    w.write('typeOf: schema:Class\n')
+    w.write('subClassOf: schema:Enumeration\n\n')
+    w = write_individual_entries_to_file(w, enum, d)
+    w.write('\n')
+    return w
+
+
+def generate_enums_mcf(f, w):
+    """Read the Master Species List workbook f and write enum MCF to path w."""
+    df = pd.read_excel(f, names=HEADER, header=None, sheet_name=2)
+    df = df.drop(LIST_DROP, axis=1).drop(0, axis=0)
+    dict_of_dicts = add_item_to_enums(df)
+    # the context manager guarantees the output is flushed and closed
+    with open(w, mode='w') as out:
+        out.write('# Schema generated by create_virus_taxonomic_ranking_enums.py\n\n')
+        for key, value in dict_of_dicts.items():
+            write_dict_to_file(out, key, value)
+
+
+def main():
+    file_input = sys.argv[1]
+    file_output = sys.argv[2]
+
+    generate_enums_mcf(file_input, file_output)
+
+
+if __name__ == '__main__':
+    main()
From 7e8d9dd2c4925061e65df9bf046789c78d389193 Mon Sep 17 00:00:00 2001
From: Samantha Piekos
Date: Mon, 24 Apr 2023 20:56:38 -0700
Subject: [PATCH 12/60] Add formatting scripts
---
.../format_virus_master_species_list.py | 148 ++++++++ .../scripts/format_virus_metadata_resource.py | 341 ++++++++++++++++++ 2 files changed, 489 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py new file mode 100644 index 0000000000..7cbc694528 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -0,0 +1,148 @@ +# load environment +import pandas as pd +import sys + + +# declare universal variables +DICT_CHANGE_ENUM = { +'abolished': 'VirusLastTaxonomicChangeAbolished',\ +'demoted' : 'VirusLastTaxonomicChangeDemoted',\ +'merged': 'VirusLastTaxonomicChangeMerged',\ +'moved': 'VirusLastTaxonomicChangeMoved',\ +'new': 'VirusLastTaxonomicChangeNew',\ +'promoted': 'VirusLastTaxonomicChangePromoted',\ +'removed as type species': 'VirusLastTaxonomicChangeRemoved',\ +'renamed': 'VirusLastTaxonomicChangeRenamed',\ +'split': 'VirusLastTaxonomicChangeSplit' +} + + +DICT_GC = { +'dsDNA': 'VirusGenomeCompositionDoubleStrandedDNA',\ +'ssDNA': 'VirusGenomeCompositionSingleStrandedDNA',\ +'ssDNA(-)': 'VirusGenomeCompositionSingleStrandedDNANegative',\ +'ssDNA(+)': 'VirusGenomeCompositionSingleStrandedDNAPositive',\ +'ssDNA(+/-)': 'VirusGenomeCompositionSingleStrandedDNA',\ +'dsDNA-RT': 'VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ +'ssRNA-RT': 'VirusGenomeCompositionSingleStrandedDNAReverseTranscription',\ +'dsRNA': 'VirusGenomeCompositionDoubleStrandedRNA',\ +'ssRNA': 'VirusGenomeCompositionSingleStrandedRNA',\ +'ssRNA(-)': 'VirusGenomeCompositionSingleStrandedRNANegative',\ +'ssRNA(+)': 'VirusGenomeCompositionSingleStrandedRNAPositive',\ +'ssRNA(+/-)': 'VirusGenomeCompositionSingleStrandedRNA' +} 
+ + +HEADER = [ +'sort',\ +'realm',\ +'subrealm',\ +'kingdom',\ +'subkingdom',\ +'phylum',\ +'subphylum',\ +'class',\ +'subclass',\ +'order',\ +'suborder',\ +'family',\ +'subfamily',\ +'genus',\ +'subgenus',\ +'species',\ +'genomeComposition',\ +'lastChange',\ +'lastChangeVersion',\ +'proposalForLastChange',\ +'taxonHistoryURL',\ +'dcid' +] + + +LIST_TAXONOMIC_LEVELS = [ +'realm',\ +'subrealm',\ +'kingdom',\ +'subkingdom',\ +'phylum',\ +'subphylum',\ +'class',\ +'subclass',\ +'order',\ +'suborder',\ +'family',\ +'subfamily',\ +'genus',\ +'subgenus' +] + + +# declare functions +def pascalcase(s): + list_words = s.split() + converted = "".join(word[0].upper() + word[1:].lower() for word in list_words) + return converted + + +def check_for_illegal_charc(s): + list_illegal = ["'", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";" " "] + if any([x in s for x in list_illegal]): + print('Error! dcid contains illegal characters!', s) + + +def format_taxonomic_rank_properties(df, index, row): + for rank in LIST_TAXONOMIC_LEVELS: + if row[rank] == row[rank]: + enum = 'Virus' + rank.upper() + pascalcase(row[rank]) + df.loc[index, rank] = enum + return df + + +def convert_gc_to_enum(gc): + list_enum = [] + list_gc = gc.split(';') + for item in list_gc: + item = item.strip() + enum = DICT_GC[item] + list_enum.append(enum) + return (',').join(list_enum) + + +def convert_change_to_enum(change): + list_enum = [] + change = change.lower() + list_changes = change.split(',')[:-1] + for item in list_changes: + enum = DICT_CHANGE_ENUM[item] + list_enum.append(enum) + return (',').join(list_enum) + + +def clean_df(df): + for index, row in df.iterrows(): + dcid = 'bio/' + pascalcase(row['species']) + check_for_illegal_charc(dcid) + df = format_taxonomic_rank_properties(df, index, row) + df.loc[index, 'dcid'] = dcid + df.loc[index,'genomeComposition'] = convert_gc_to_enum(row['genomeComposition']) + df.loc[index, 'lastChange'] = convert_change_to_enum(row['lastChange']) + df.loc[index, 
'taxonHistoryURL'] = row['taxonHistoryURL'].strip('ICTVonline=') + return df + + +def clean_file(f, w): + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=2) + df = df.drop('sort', axis=1).drop(0, axis=0) + df = clean_df(df) + df.to_csv(w, index=False) + + +def main(): + file_input = sys.argv[1] + file_output = sys.argv[2] + + clean_file(file_input, file_output) + + +if __name__ == '__main__': + main() diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py new file mode 100644 index 0000000000..b1cc40cc8d --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -0,0 +1,341 @@ +# set up environment +import pandas as pd +import sys +import unidecode + + +# declare universal variables +DICT_COVERAGE = { +'complete genome': 'GenomeCoverageCompleteGenome',\ +'complete coding genome': 'GenomeCoverageCodingGenome',\ +'no entry in genbank': 'GenomeCoverageNoEntryInGenBank',\ +'partial genome': 'GenomeCoveragePartialGenome' +} + + +DICT_GC = { +'dsDNA': 'VirusGenomeCompositionDoubleStrandedDNA',\ +'ssDNA': 'VirusGenomeCompositionSingleStrandedDNA',\ +'ssDNA(-)': 'VirusGenomeCompositionSingleStrandedDNANegative',\ +'ssDNA(+)': 'VirusGenomeCompositionSingleStrandedDNAPositive',\ +'ssDNA(+/-)': 'VirusGenomeCompositionSingleStrandedDNA',\ +'dsDNA-RT': 'VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ +'ssRNA-RT': 'VirusGenomeCompositionSingleStrandedDNAReverseTranscription',\ +'dsRNA': 'VirusGenomeCompositionDoubleStrandedRNA',\ +'ssRNA': 'VirusGenomeCompositionSingleStrandedRNA',\ +'ssRNA(-)': 'VirusGenomeCompositionSingleStrandedRNANegative',\ +'ssRNA(+)': 'VirusGenomeCompositionSingleStrandedRNAPositive',\ +'ssRNA(+/-)': 'VirusGenomeCompositionSingleStrandedRNA' +} + + +DICT_HOST = { + 'algae': 'VirusHostAlgae',\ + 'archaea': 'VirusHostArchaea',\ + 'bacteria': 'VirusHostBacteria',\ + 'fungi': 
'VirusHostFungi',\ + 'invertebrates': 'VirusHostInvertebrates',\ + 'plants': 'VirusHostPlants',\ + 'protists': 'VirusHostProtists',\ + 'vertebrates': 'VirusHostVertebrates' +} + + +DICT_SOURCE = { + 'invertebrates': 'VirusSourceInvertebrates',\ + 'marine': 'VirusSourceMarine',\ + 'phytobiome': 'VirusSourcePhytobiome',\ + 'plants': 'VirusSourcePlants',\ + 'protists': 'VirusSourceProtists',\ + 'sewage': 'VirusSourceSewage',\ + 'soil': 'VirusSourceSoil' +} + + +HEADER = [ +'sort',\ +'isolateSort',\ +'realm',\ +'subrealm',\ +'kingdom',\ +'subkingdom',\ +'phylum',\ +'subphylum',\ +'class',\ +'subclass',\ +'order',\ +'suborder',\ +'family',\ +'subfamily',\ +'genus',\ +'subgenus',\ +'species',\ +'isExemplar',\ +'name',\ +'abbreviation',\ +'isolateDesignation',\ +'genBankAccession',\ +'refSeqAccession',\ +'genomeCoverage',\ +'genomeComposition',\ +'hostSource',\ +'host',\ +'source',\ +'dcid',\ +'isolate_dcid',\ +'isolate_name' +] + + +HEADER_2 = [ +'dcid',\ +'name',\ +'genBankAccession',\ +'genomeSegmentOf',\ +'refSeqAccession' +] + + +LIST_TAXONOMIC_LEVELS = [ +'realm',\ +'subrealm',\ +'kingdom',\ +'subkingdom',\ +'phylum',\ +'subphylum',\ +'class',\ +'subclass',\ +'order',\ +'suborder',\ +'family',\ +'subfamily',\ +'genus',\ +'subgenus' +] + + +# declare functions +# declare functions +def pascalcase(s): + list_words = s.split() + converted = "".join(word[0].upper() + word[1:] for word in list_words) + return converted + + +def check_for_illegal_charc(s): + list_illegal = ["'", "#", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";", " "] + if any([x in s for x in list_illegal]): + print('Error! 
dcid contains illegal characters!', s) + + +def format_list(s): + if s != s: + return s + list_items = [] + s = str(s) + list_s = s.split(';') + for item in list_s: + list_items.append(item.strip()) + return (',').join(list_items) + + +def format_taxonomic_rank_properties(df, index, row): + for rank in LIST_TAXONOMIC_LEVELS: + if row[rank] == row[rank]: + enum = 'Virus' + rank.upper() + pascalcase(row[rank]) + df.loc[index, rank] = enum + return df + + +def convert_gc_to_enum(gc): + list_enum = [] + list_gc = gc.split(';') + for item in list_gc: + item = item.strip() + enum = DICT_GC[item] + list_enum.append(enum) + return (',').join(list_enum) + + +def convert_coverage_to_enum(cov): + return DICT_COVERAGE[cov.lower()] + + +def convert_type_to_boolean(t): + if t == 'E': + return True + if t == 'A': + return False + print('Error! Not an expected isolate type! Expected E or A, but got', t ,'.') + + +def convert_source_to_enum(source): + source = source[:-4] + return DICT_SOURCE[source] + + +def convert_host_to_enum(host): + list_enum = [] + list_host = host.split(',') + for item in list_host: + item = item.strip() + enum = DICT_HOST[item] + list_enum.append(enum) + return (',').join(list_enum) + + +def handle_genBank_missing_exception(n, virus_dcid, virus_name): + if n != n: + dcid = virus_dcid + 'Isolate' + name = virus_name + ' Isolate' + return dcid, name + n = str(n) + if ';' in n: + n = n.split(';')[0] + dcid = virus_dcid + pascalcase(n) + dcid=dcid.replace("'","").replace('–', '-') + name = virus_name + n + return dcid, name + + +def handle_genBank_components_exception(genBank, virus_dcid, virus_name): + dcid = virus_dcid + name = virus_name + list_genBank = genBank.split(';') + for item in list_genBank: + if ':' in item: + n, gb = item.split(':') + dcid = virus_dcid + '_' + gb.strip() + name = virus_name + gb + else: + dcid = virus_dcid + '_' + item.strip() + name = virus_name + item + return dcid, name + + +def format_isolate_designation_for_dcid(des): + des 
= str(des) + des = des.replace(':', '_') + des = des.replace(';', '_') + des = des.replace(' ', '_') + des = des.replace('[', '(') + des = des.replace(']', ')') + des = des.replace('-', '_') + des = des.replace('–', '_') + des = des.replace("'", '') + des = des.replace('#', '') + return des + + +def verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, virus_abrv): + if dcid in list_isolate_dcids: + if ';' in genBank: + dcid = dcid + '_' + virus_abrv + else: + dcid = dcid + '_' + genBank + print('Non-unique VirusIsolate dcid generated! Added additional info to differentiate:', dcid) + list_isolate_dcids.append(dcid) + return dcid, list_isolate_dcids + + +def declare_isolate_dcid(n, genBank, virus_dcid, virus_name, virus_abrv, isolate_designation, list_isolate_dcids): + if isolate_designation == isolate_designation: + des = format_isolate_designation_for_dcid(isolate_designation) + dcid = virus_dcid + '_' + pascalcase(des) + name = virus_name + ' strain ' + str(isolate_designation) + elif genBank != genBank: + dcid, name = handle_genBank_missing_exception(n, virus_dcid, virus_name) + elif ':' in genBank or ';' in genBank: + dcid, name = handle_genBank_components_exception(genBank, virus_dcid, virus_name) + else: + dcid = virus_dcid + '_' + genBank + name = virus_name + ' ' + genBank + dcid = unidecode.unidecode(dcid) + dcid, list_isolate_dcids = verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, virus_abrv) + return dcid, name, list_isolate_dcids + + +def make_refSeq_dict(refSeq): + d = {} + list_refSeq = refSeq.split(';') + for item in list_refSeq: + if ':' in item: + name, rs = item.split(':') + d[name.strip()] = rs.strip() + return d + + +def handle_genome_segments(df_segment, virus_dcid, virus_name, isolate_dcid, genBank, refSeq): + dict_refSeq = {} + list_genBank = genBank.split(';') + if refSeq == refSeq: + dict_refSeq = make_refSeq_dict(refSeq) + for item in list_genBank: + d = {'dcid': '', 'name': '', 'genBankAccession': '', 
'genomeSegmentOf': '', 'refSeqAccession': ''} + if ':' not in item: + continue + name, gb = item.split(':') + name = name.strip() + gb = gb.strip() + d['dcid'] = virus_dcid + gb + d['name'] = virus_name + ' Segment ' + name + d['genBankAccession'] = gb + d['genomeSegmentOf'] = isolate_dcid + if name in dict_refSeq: + d['refSeqAccession'] = dict_refSeq[name] + df_segment = df_segment.append(d, ignore_index=True) + return df_segment + + +def clean_df(df, df_segment): + list_isolate_dcids = [] + for index, row in df.iterrows(): + dcid = 'bio/' + pascalcase(row['species']) + check_for_illegal_charc(dcid) + df.loc[index, 'dcid'] = dcid + df = format_taxonomic_rank_properties(df, index, row) + isolate_dcid, isolate_name, list_isolate_dcids = declare_isolate_dcid(row['name'], row['genBankAccession'], dcid, row['species'], row['abbreviation'], row['isolateDesignation'], list_isolate_dcids) + check_for_illegal_charc(isolate_dcid) + df.loc[index, 'isolate_dcid'] = isolate_dcid + df.loc[index, 'isolate_name'] = isolate_name + df.loc[index,'genomeComposition'] = convert_gc_to_enum(row['genomeComposition']) + df.loc[index,'genomeCoverage'] = convert_coverage_to_enum(row['genomeCoverage']) + df.loc[index, 'isExemplar'] = convert_type_to_boolean(row['isExemplar']) + df.loc[index, 'name'] = format_list(row['name']) + df.loc[index, 'abbreviation'] = format_list(row['abbreviation']) + df.loc[index, 'isolateDesignation'] = format_list(row['isolateDesignation']) + genBank = row['genBankAccession'] + if genBank == genBank and ':' in genBank: + df_segment = handle_genome_segments(df_segment, dcid, row['name'], isolate_dcid, genBank, row['refSeqAccession']) + df.loc[index, 'genBankAccession'] = '' + df.loc[index, 'refSeqAccession'] = '' + elif genBank == genBank and ';' in genBank: + df.loc[index, 'genBankAccession'] = format_list(genBank) + df.loc[index, 'refSeqAccession'] = format_list(row['refSeqAccession']) + if '(S)' in row['hostSource']: + df.loc[index, 'source'] = 
convert_source_to_enum(row['hostSource']) + else: + df.loc[index, 'host'] = convert_host_to_enum(row['hostSource']) + return df, df_segment + + +def clean_file(f, w, w_2): + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) + df = df.drop(0, axis=0) + df_segment = pd.DataFrame([], columns=HEADER_2) + df, df_segment = clean_df(df, df_segment) + df = df.drop(['sort', 'isolateSort', 'hostSource'], axis=1) + df.to_csv(w, index=False) + df_segment.to_csv(w_2, index=False) + + +def main(): + file_input = sys.argv[1] + file_output_1 = sys.argv[2] + file_output_2 = sys.argv[3] + + clean_file(file_input, file_output_1, file_output_2) + + +if __name__ == '__main__': + main() From 01ea5bacea3574adc7405cd7b079886c1d699216 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 21:36:26 -0700 Subject: [PATCH 13/60] Update format_virus_metadata_resource.py Removes error generated in two dcids by removing whitespace --- .../ICTV_Taxonomy/scripts/format_virus_metadata_resource.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index b1cc40cc8d..10f0336c28 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -193,7 +193,8 @@ def handle_genBank_missing_exception(n, virus_dcid, virus_name): if ';' in n: n = n.split(';')[0] dcid = virus_dcid + pascalcase(n) - dcid=dcid.replace("'","").replace('–', '-') + dcid = dcid.replace("'", "") + dcid = dcid.replace('–', '-') name = virus_name + n return dcid, name @@ -217,7 +218,6 @@ def format_isolate_designation_for_dcid(des): des = str(des) des = des.replace(':', '_') des = des.replace(';', '_') - des = des.replace(' ', '_') des = des.replace('[', '(') des = des.replace(']', ')') des = des.replace('-', '_') @@ -250,6 +250,7 @@ 
def declare_isolate_dcid(n, genBank, virus_dcid, virus_name, virus_abrv, isolate else: dcid = virus_dcid + '_' + genBank name = virus_name + ' ' + genBank + dcid = dcid.replace(' ', '') dcid = unidecode.unidecode(dcid) dcid, list_isolate_dcids = verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, virus_abrv) return dcid, name, list_isolate_dcids @@ -278,6 +279,7 @@ def handle_genome_segments(df_segment, virus_dcid, virus_name, isolate_dcid, gen name = name.strip() gb = gb.strip() d['dcid'] = virus_dcid + gb + check_for_illegal_charc(virus_dcid + gb) d['name'] = virus_name + ' Segment ' + name d['genBankAccession'] = gb d['genomeSegmentOf'] = isolate_dcid From 915447616f521fa783ae132f2856433b1c526701 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 21:38:04 -0700 Subject: [PATCH 14/60] Add log file --- .../ICTV_Taxonomy/logs/format_virus_metadata_resource.log | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log diff --git a/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log b/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log new file mode 100644 index 0000000000..d6ea08e466 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log @@ -0,0 +1,4 @@ +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/BetachrysovirusMagnaporthis_VietNam_MoCV1-B +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/AroaVirus_BeAn4073_AF013366 +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/UgandanCassavaBrownStreakVirus_UG_FJ185044 +Non-unique VirusIsolate dcid generated! 
Added additional info to differentiate: bio/PotatoVirusY_N_X97895 From 348a2977b07b6fac5af79bcccce8a9116477c166 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 21:54:26 -0700 Subject: [PATCH 15/60] Update README.md Add notes and caveats and dcid generation segments. Also add the commands to run data cleaning scripts. --- scripts/biomedical/ICTV_Taxonomy/README.md | 43 ++++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 063b1138ee..c14a76cc92 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -7,12 +7,13 @@ 1. [Download URL](#download-urls) 2. [Overview](#overview) 3. [Notes and Caveats](#notes-and-caveats) - 4. [License](#license) - 5. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) + 4. [dcid Generation](#dcid-generation) + 5. [License](#license) + 6. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) 2. [About the Import](#about-the-import) 1. [Artifacts](#artifacts) 2. [Import Procedure](#import-procedure) - 3. [Tests](#tests) + 4. [Tests](#tests) ## About the Datasets @@ -35,7 +36,41 @@ This directory stores all scripts used to import data on viurses and virus isola ### Notes and Caveats +Viruses are not considered alive and are therefore not classified under “The Tree of Life”. They instead have their own taxonomic classification system described here. However, the viral classification system mirrors “The Tree of Life” by copying their Kingdom -> Phylum -> Class -> Order -> Family -> Genus -> Species hierarchical classes, while adding a level above called Realm and sublevels under each one. This similarity in naming can lead to confusion between the two classification systems.
In particular, in datasets species of viruses may be included without distinction alongside species of bacteria, archaea, or animals. To mitigate this potential confusion, viruses have their own distinct schema, which they do not share with non-viral biological entities. +Not all levels of the viral classification are currently in use. As of release 37, Subrealm, Subkingdom, and Subclass are not in use. These classifications are defined here in the schema in case they are used in future releases. In addition, for each species there is a classification defined for each of the main classes (Realm, Kingdom, Phylum, Class, Order, Family, Genus, and Species), however there are missing classifications for some or all of the subclasses (Subkingdom, Subphylum, Subclass, Suborder, Subfamily, and Subgenus). To account for this, references will be made to the parent of the next main class in addition to the parent subclass. + +“The ICTV chooses an exemplar virus for each species and the VMR provides a list of these exemplars. An exemplar virus serves as an example of a well-characterized virus isolate of that species and includes the GenBank accession number for the genomic sequence of the isolate as well as the virus name, isolate designation, suggested abbreviation, genome composition, and host source.” Additional isolates for each species within the ICTV database are also noted. + + +### dcid Generation +A ‘bio/’ prefix was attached to all dcids in this import. Each line in each input file is considered its own unique Virus or VirusIsolate. In cases where there are multiple lines that generate the same dcid for a Virus, VirusIsolate, or VirusGenomeSegment then an error message is printed out stating the non-unique dcid generated for a given entity. + +####Virus +Dcids were generated by converting the Virus’s species name to pascal case (i.e. bio/).
+ +####VirusIsolate +Unique information regarding the VirusIsolate was added to the end of the Virus dcid to generate a unique VirusIsolate dcid. In the cases for which the isolate had a designation, then this was converted to pascal case and used as the dcid (i.e. bio/). In cases where there was no isolate designation indicated then the GenBank Accession Number was used to generate the dcid if there was one unique one for that isolate (i.e. bio/). In cases in which there were multiple GenBank Accession numbers associated with a virus isolate, these were daisy chained with ‘_’s to create the dcid for the VirusIsolate (i.e. bio/_). In the event both the isolate designation and the GenBank Accession for a VirusIsolate is missing then the word ‘Isolate’ was added to the pascal case name of the species to create the VirusIsolate dcid (i.e. bio/Isolate). + +Note: This resulted in collisions for four VirusIsolates. These errors were recorded in the [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) file. + +####VirusGenomeSegment +The GenBank Accession number for a VirusGenomeSegment was tacked onto the corresponding VirusIsolate dcid to generate a unique VirusGenomeSegment dcid (i.e. ). + +####Illegal Characters +Only ASCII characters are allowed to be used in dcids. Additionally, the following characters are illegal to be included in the dcid: :, ;, , [, ], -, –, ‘, #. 
They were replaced in place with the following characters specified below: + +| Illegal Character | Replacement Character | +| ----------------- | --------------------- | +| : | _ | +| ; | _ | +| | | +| [ | ( | +| ] | ) | +| - | _ | +| – | _ | +| ‘ | | +| # | | ### License @@ -106,11 +141,13 @@ python3 scripts/create_virus_taxonomic_ranking_enums.py import_files/ICTV_Master Clean and format Master Species List as a CSV that matches the corresponding tMCF by running: ```bash +python3 scripts/format_virus_master_species_list.py input/ICTV_Master_Species_List.xlsx VirusSpecies.csv ``` Clean and format Virus Metadata Resource as a CSV that matches the corresponding tMCF by running: ```bash +python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log ``` ### Tests From 0ab5772df8dd9a1199ca296c3f33e2bad7296d62 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 22:00:17 -0700 Subject: [PATCH 16/60] Create download.sh --- scripts/biomedical/ICTV_Taxonomy/scripts/download.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/download.sh diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh new file mode 100644 index 0000000000..adc0f51091 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh @@ -0,0 +1,12 @@ +''' +This file downloads the most recent version of the ICTV Master Species List and +Virus Metadata Resource and prepares it for processing +''' +#!/bin/bash + +# make input directory +mkdir -p input; cd input + +# download ICTV data +curl -o ICTV_Virus_Species_List.xlsx https://ictv.global/msl/current +curl -o ICTV_Virus_Metadata_Resource.xlsx https://ictv.global/vmr/current From 321677bcfea11435467bb97091d9518728f08795 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023
22:01:57 -0700 Subject: [PATCH 17/60] Update command to run download.sh --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index c14a76cc92..7fb5f686e4 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -129,7 +129,7 @@ Classes, properties, and enumerations that were added in this import to represen Download the most recent versions of the Master Species List and Virus Metadata Resource from ICTV by running: ```bash -download.sh +sh download.sh ``` Generate the enummeration schema MCF, which represents virus taxonomic ranks by running: From b21c9afc60032281e6430829a23e5f8f95f503e5 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 24 Apr 2023 22:03:51 -0700 Subject: [PATCH 18/60] update illegal characters subsection --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 7fb5f686e4..b706e22a83 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -58,7 +58,7 @@ Note: This resulted in collisions for four VirusIsolates. These errors were reco The GenBank Accession number for a VirusGenomeSegment was tacked onto the corresponding VirusIsolate dcid to generate a unique VirusGenomeSegment dcid (i.e. ). ####Illegal Characters -Only ASCII characters are allowed to be used in dcids. Additionally, the following characters are illegal to be included in the dcid: :, ;, , [, ], -, –, ‘, #. They were replaced in place with the following characters specified below: +Only ASCII characters are allowed to be used in dcids. 
Additionally, a number of characters that are illegal to include in the dcid were replaced in place with the following characters specified below: | Illegal Character | Replacement Character | | ----------------- | --------------------- | From 5af5a62ad1d864042281db80c22d47f7c8151393 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 25 Apr 2023 09:33:39 -0700 Subject: [PATCH 19/60] fix formatting error --- scripts/biomedical/ICTV_Taxonomy/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index b706e22a83..66436de30c 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -46,18 +46,18 @@ Not all levels of the viral classification are currently in use. As of release 3 ### dcid Generation A ‘bio/’ prefix was attached to all dcids in this import. Each line in each input file is considered its own unique Virus or VirusIsolate. In cases where there are multiple lines that generate the same dcid for a Virus, VirusIsolate, or VirusGenomeSegment then an error message is printed out stating the non-unique dcid generated for a given entity. -####Virus +#### Virus Dcids were generated by converting the Virus’s species name to pascal case (i.e. bio/). -####VirusIsolate +#### VirusIsolate Unique information regarding the VirusIsolate was added to the end of the Virus dcid to generate a unique VirusIsolate dcid. In the cases for which the isolate had a designation, then this was converted to pascal case and used as the dcid (i.e. bio/). In cases where there was no isolate designation indicated then the GenBank Accession Number was used to generate the dcid if there was one unique one for that isolate (i.e. bio/). In cases in which there were multiple GenBank Accession numbers associated with a virus isolate, these were daisy chained with ‘_’s to create the dcid for the VirusIsolate (i.e. bio/_). 
In the event both the isolate designation and the GenBank Accession for a VirusIsolate is missing then the word ‘Isolate’ was added to the pascal case name of the species to create the VirusIsolate dcid (i.e. bio/Isolate). Note: This resulted in collisions for four VirusIsolates. These errors were recorded in the [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) file. -####VirusGenomeSegment +#### VirusGenomeSegment The GenBank Accession number for a VirusGenomeSegment was tacked onto the corresponding VirusIsolate dcid to generate a unique VirusGenomeSegment dcid (i.e. ). -####Illegal Characters +#### Illegal Characters Only ASCII characters are allowed to be used in dcids. Additionally, a number of characters that are illegal to include in the dcid were replaced in place with the following characters specified below: | Illegal Character | Replacement Character | From 4bdec7542c4623c98bc10f64e5d61ea5b11785e7 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:12:18 -0700 Subject: [PATCH 20/60] Add header --- .../create_virus_taxonomic_ranking_enums.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index 5a5523a28e..37f30233c4 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -1,3 +1,29 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 03/22/2023 +Name: create_virus_taxonomic_ranking_enums +Description: Takes in the ICTV Master Species List and uses it to represent +the viral taxonomy as class enumerations. The output is in mcf format. + +@file_input input Master Species List .csv from ICTV +@file_output formatted mcf file of the schema representing the virus + taxonomy as enumerations +""" + + # load environment import pandas as pd import sys From 8061182dd1ad275a45dc45beb7d19abc2759a520 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:13:15 -0700 Subject: [PATCH 21/60] Add header --- .../ICTV_Taxonomy/scripts/download.sh | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh index adc0f51091..1d1927ced3 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh @@ -1,7 +1,23 @@ -''' -This file downloads the most recent version of the ICTV Master Species List and -Virus Metadata Resource and prepares it for processing -''' +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 03/22/2023 +Name: create_virus_taxonomic_ranking_enums +Description: This file downloads the most recent version of the ICTV Master +Species List and Virus Metadata Resource and prepares it for processing +""" #!/bin/bash # make input directory From 011b3d147afd6520409f1772ccf8dc0fc6a65b7e Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:15:54 -0700 Subject: [PATCH 22/60] add header --- .../format_virus_master_species_list.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index 7cbc694528..ce0f4bda1c 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -1,3 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Author: Samantha Piekos +Date: 03/22/2023 +Name: create_virus_taxonomic_ranking_enums +Description: This cleans and formats the ICTV Master Species List for +import into the knowledge graph as a csv+tmcf pair. + +@file_input Master Species List csv file +@file_output cleaned and formatted csv file +""" + # load environment import pandas as pd import sys From 042812c1ae99f9d24170b80074ac05aa09304eb7 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:17:31 -0700 Subject: [PATCH 23/60] Add header --- .../scripts/format_virus_metadata_resource.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index 10f0336c28..7faa9eaab8 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -1,3 +1,27 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 03/22/2023 +Name: format_virus_metadata_resource +Description: This cleans and formats the ICTV Virus Metadata Resource for +import into the knowledge graph as a csv+tmcf pair. 
+ +@file_input Virus Metadata Resource excel file +@file_output cleaned and formatted csv file +""" + # set up environment import pandas as pd import sys From c9e50222e6de6f312ddbd869b02340ebe6c79fe1 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:18:04 -0700 Subject: [PATCH 24/60] update header --- .../ICTV_Taxonomy/scripts/format_virus_master_species_list.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index ce0f4bda1c..8e37b87052 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -14,11 +14,11 @@ """ Author: Samantha Piekos Date: 03/22/2023 -Name: create_virus_taxonomic_ranking_enums +Name: format_virus_master_species_list Description: This cleans and formats the ICTV Master Species List for import into the knowledge graph as a csv+tmcf pair. 
-@file_input Master Species List csv file +@file_input Master Species List excel file @file_output cleaned and formatted csv file """ From 667ecc2cb9769611292b374e8bb974650f00a68d Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 7 Jun 2023 15:18:23 -0700 Subject: [PATCH 25/60] Update header --- scripts/biomedical/ICTV_Taxonomy/scripts/download.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh index 1d1927ced3..e49993cd8f 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh @@ -14,7 +14,7 @@ """ Author: Samantha Piekos Date: 03/22/2023 -Name: create_virus_taxonomic_ranking_enums +Name: download Description: This file downloads the most recent version of the ICTV Master Species List and Virus Metadata Resource and prepares it for processing """ From d2368b5f71e7f0d45b7030d4aea9651d6bcd45cb Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 31 Jul 2023 15:41:13 -0700 Subject: [PATCH 26/60] Update scripts --- .../create_virus_taxonomic_ranking_enums.py | 140 +++++++++--------- .../format_virus_metadata_resource.log | 4 + .../scripts/format_virus_metadata_resource.py | 88 ++++------- 3 files changed, 109 insertions(+), 123 deletions(-) create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index 37f30233c4..589637b1bc 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -1,29 +1,3 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except 
in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Author: Samantha Piekos -Date: 03/22/2023 -Name: create_virus_taxonomic_ranking_enums -Description: Takes in the ICTV Master Species List and uses it to represent -the viral taxonomy as class enummerations. The output is in mcf format. - -@file_input input Master Species List .csv from ICTV -@file_output formatted mcf file of the schema reperenting the virus - taxonomy as enummerations -""" - - # load environment import pandas as pd import sys @@ -31,38 +5,57 @@ # declare universal variables HEADER = [ -'Sort',\ -'Realm',\ -'Subrealm',\ -'Kingdom',\ -'Subkingdom',\ -'Phylum',\ -'Subphylum',\ -'Class',\ -'Subclass',\ -'Order',\ -'Suborder',\ -'Family',\ -'Subfamily',\ -'Genus',\ -'Subgenus',\ -'Species',\ -'GenomeComposition',\ -'LastChange',\ -'LastChangeVersion',\ -'ProposalForLastChange',\ -'TaxonHistoryURL' +'sort',\ +'isolateSort',\ +'realm',\ +'subrealm',\ +'kingdom',\ +'subkingdom',\ +'phylum',\ +'subphylum',\ +'class',\ +'subclass',\ +'order',\ +'suborder',\ +'family',\ +'subfamily',\ +'genus',\ +'subgenus',\ +'species',\ +'isExemplar',\ +'name',\ +'abbreviation',\ +'isolateDesignation',\ +'genBankAccession',\ +'refSeqAccession',\ +'genomeCoverage',\ +'genomeComposition',\ +'hostSource',\ +'host',\ +'source',\ +'dcid',\ +'isolate_dcid',\ +'isolate_name' ] - LIST_DROP = [ -'Sort',\ -'Species',\ -'GenomeComposition',\ -'LastChange',\ -'LastChangeVersion',\ -'ProposalForLastChange',\ -'TaxonHistoryURL' +'sort',\ +'isolateSort',\ +'species',\ +'isExemplar',\ +'name',\ +'abbreviation',\ +'isolateDesignation',\ 
+'genBankAccession',\ +'refSeqAccession',\ +'genomeCoverage',\ +'genomeComposition',\ +'hostSource',\ +'host',\ +'source',\ +'dcid',\ +'isolate_dcid',\ +'isolate_name' ] @@ -83,7 +76,7 @@ def initiate_enum_dict(): d = {} list_levels = [i for i in HEADER if i not in LIST_DROP] for item in list_levels: - enum_name = 'Virus' + item + 'Enum' + enum_name = 'Virus' + item.capitalize() + 'Enum' d[enum_name] = {} return d @@ -100,39 +93,52 @@ def add_enums_to_dicts(key, value, d): def add_item_to_enums(df): list_levels = [i for i in HEADER if i not in LIST_DROP] dict_of_dicts = initiate_enum_dict() + dict_specialization = {} # keep track of previous top level for index, row in df.iterrows(): + last_level_dcid = False # initiate empty value for tracking specialization for item in list_levels: - dict_of_dicts = add_enums_to_dicts(item, row[item], dict_of_dicts) - return dict_of_dicts - - -def write_individual_entries_to_file(w, enum, d): + level = item.capitalize() + if row[item] != row[item]: + continue + dict_of_dicts = add_enums_to_dicts(level, row[item], dict_of_dicts) + if last_level_dcid: # track specialization if relevant + dcid = 'Virus' + level + pascalcase(row[item]) + dict_specialization[dcid] = last_level_dcid + last_level_dcid = 'Virus' + level + pascalcase(row[item]) # update top level + return dict_of_dicts, dict_specialization + + +def write_individual_entries_to_file(w, enum, d, dict_specialization): for key, value in d.items(): w.write('Node: dcid:' + value + '\n') w.write('name: "' + key + '"\n') - w.write('typeOf: dcs:' + enum + '\n\n') + w.write('typeOf: dcs:' + enum + '\n') + if value in dict_specialization: + w.write('specializationOf: dcs:' + dict_specialization[value] + '\n\n') + else: + w.write('\n') return w -def write_dict_to_file(w, enum, d): +def write_dict_to_file(w, enum, d, dict_specialization): w.write('# ' + enum + '\n') w.write('Node: dcid:' + enum + '\n') w.write('name: "' + enum + '"\n') w.write('typeOf: schema:Class\n') 
w.write('subClassOf: schema:Enumeration\n\n') - w = write_individual_entries_to_file(w, enum, d) + w = write_individual_entries_to_file(w, enum, d, dict_specialization) w.write('\n') return w def generate_enums_mcf(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=2) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) df = df.drop(LIST_DROP, axis=1).drop(0, axis=0) - dict_of_dicts = add_item_to_enums(df) + dict_of_dicts, dict_specialization = add_item_to_enums(df) w = open(w, mode='w') w.write('# Schema generated by create_virus_taxonomic_ranking_enums.py\n\n') for key, value in dict_of_dicts.items(): - w = write_dict_to_file(w, key, value) + w = write_dict_to_file(w, key, value, dict_specialization) def main(): diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log new file mode 100644 index 0000000000..d6ea08e466 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log @@ -0,0 +1,4 @@ +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/BetachrysovirusMagnaporthis_VietNam_MoCV1-B +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/AroaVirus_BeAn4073_AF013366 +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/UgandanCassavaBrownStreakVirus_UG_FJ185044 +Non-unique VirusIsolate dcid generated! 
Added additional info to differentiate: bio/PotatoVirusY_N_X97895 diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index 7faa9eaab8..ee45f69c60 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -1,27 +1,3 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Author: Samantha Piekos -Date: 03/22/2023 -Name: format_virus_metadata_resource -Description: This cleans and formats the ICTV Virus Metadata Resource for -import into the knowledge graph as a csv+tmcf pair. 
- -@file_input Virus Metadata Resource excel file -@file_output cleaned and formatted csv file -""" - # set up environment import pandas as pd import sys @@ -30,49 +6,49 @@ # declare universal variables DICT_COVERAGE = { -'complete genome': 'GenomeCoverageCompleteGenome',\ -'complete coding genome': 'GenomeCoverageCodingGenome',\ -'no entry in genbank': 'GenomeCoverageNoEntryInGenBank',\ -'partial genome': 'GenomeCoveragePartialGenome' +'complete genome': 'dcs:GenomeCoverageCompleteGenome',\ +'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome',\ +'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank',\ +'partial genome': 'dcs:GenomeCoveragePartialGenome' } DICT_GC = { -'dsDNA': 'VirusGenomeCompositionDoubleStrandedDNA',\ -'ssDNA': 'VirusGenomeCompositionSingleStrandedDNA',\ -'ssDNA(-)': 'VirusGenomeCompositionSingleStrandedDNANegative',\ -'ssDNA(+)': 'VirusGenomeCompositionSingleStrandedDNAPositive',\ -'ssDNA(+/-)': 'VirusGenomeCompositionSingleStrandedDNA',\ -'dsDNA-RT': 'VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ -'ssRNA-RT': 'VirusGenomeCompositionSingleStrandedDNAReverseTranscription',\ -'dsRNA': 'VirusGenomeCompositionDoubleStrandedRNA',\ -'ssRNA': 'VirusGenomeCompositionSingleStrandedRNA',\ -'ssRNA(-)': 'VirusGenomeCompositionSingleStrandedRNANegative',\ -'ssRNA(+)': 'VirusGenomeCompositionSingleStrandedRNAPositive',\ -'ssRNA(+/-)': 'VirusGenomeCompositionSingleStrandedRNA' +'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA',\ +'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ +'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative',\ +'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive',\ +'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ +'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ +'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription',\ +'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA',\ +'ssRNA': 
'dcs:VirusGenomeCompositionSingleStrandedRNA',\ +'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative',\ +'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive',\ +'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' } DICT_HOST = { - 'algae': 'VirusHostAlgae',\ - 'archaea': 'VirusHostArchaea',\ - 'bacteria': 'VirusHostBacteria',\ - 'fungi': 'VirusHostFungi',\ - 'invertebrates': 'VirusHostInvertebrates',\ - 'plants': 'VirusHostPlants',\ - 'protists': 'VirusHostProtists',\ - 'vertebrates': 'VirusHostVertebrates' + 'algae': 'dcs:VirusHostAlgae',\ + 'archaea': 'dcs:VirusHostArchaea',\ + 'bacteria': 'dcs:VirusHostBacteria',\ + 'fungi': 'dcs:VirusHostFungi',\ + 'invertebrates': 'dcs:VirusHostInvertebrates',\ + 'plants': 'dcs:VirusHostPlants',\ + 'protists': 'dcs:VirusHostProtists',\ + 'vertebrates': 'dcs:VirusHostVertebrates' } DICT_SOURCE = { - 'invertebrates': 'VirusSourceInvertebrates',\ - 'marine': 'VirusSourceMarine',\ - 'phytobiome': 'VirusSourcePhytobiome',\ - 'plants': 'VirusSourcePlants',\ - 'protists': 'VirusSourceProtists',\ - 'sewage': 'VirusSourceSewage',\ - 'soil': 'VirusSourceSoil' + 'invertebrates': 'dcs:VirusSourceInvertebrates',\ + 'marine': 'dcs:VirusSourceMarine',\ + 'phytobiome': 'dcs:VirusSourcePhytobiome',\ + 'plants': 'dcs:VirusSourcePlants',\ + 'protists': 'dcs:VirusSourceProtists',\ + 'sewage': 'dcs:VirusSourceSewage',\ + 'soil': 'dcs:VirusSourceSoil' } @@ -166,7 +142,7 @@ def format_list(s): def format_taxonomic_rank_properties(df, index, row): for rank in LIST_TAXONOMIC_LEVELS: if row[rank] == row[rank]: - enum = 'Virus' + rank.upper() + pascalcase(row[rank]) + enum = 'dcs:Virus' + rank.upper()[0] + rank.lower()[1:] + pascalcase(row[rank]) df.loc[index, rank] = enum return df From 5948dd449dd229eb3142f46db9f6f2d920c6e54f Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 31 Jul 2023 15:42:58 -0700 Subject: [PATCH 27/60] Delete log file --- .../ICTV_Taxonomy/scripts/format_virus_metadata_resource.log | 4 
---- 1 file changed, 4 deletions(-) delete mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log deleted file mode 100644 index d6ea08e466..0000000000 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.log +++ /dev/null @@ -1,4 +0,0 @@ -Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/BetachrysovirusMagnaporthis_VietNam_MoCV1-B -Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/AroaVirus_BeAn4073_AF013366 -Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/UgandanCassavaBrownStreakVirus_UG_FJ185044 -Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/PotatoVirusY_N_X97895 From 6404ff6567dcd1df94f6d1a6d31cf18c1ca65acb Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 31 Jul 2023 15:43:18 -0700 Subject: [PATCH 28/60] Update script --- .../format_virus_master_species_list.py | 68 ++++++------------- 1 file changed, 22 insertions(+), 46 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index 8e37b87052..a05975d31f 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -1,27 +1,3 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Author: Samantha Piekos -Date: 03/22/2023 -Name: format_virus_master_species_list -Description: This cleans and formats the ICTV Master Species List for -import into the knowledge graph as a csv+tmcf pair. - -@file_input Master Species List excel file -@file_output cleaned and formatted csv file -""" - # load environment import pandas as pd import sys @@ -29,31 +5,31 @@ # declare universal variables DICT_CHANGE_ENUM = { -'abolished': 'VirusLastTaxonomicChangeAbolished',\ -'demoted' : 'VirusLastTaxonomicChangeDemoted',\ -'merged': 'VirusLastTaxonomicChangeMerged',\ -'moved': 'VirusLastTaxonomicChangeMoved',\ -'new': 'VirusLastTaxonomicChangeNew',\ -'promoted': 'VirusLastTaxonomicChangePromoted',\ -'removed as type species': 'VirusLastTaxonomicChangeRemoved',\ -'renamed': 'VirusLastTaxonomicChangeRenamed',\ -'split': 'VirusLastTaxonomicChangeSplit' +'abolished': 'dcs:VirusLastTaxonomicChangeAbolished',\ +'demoted' : 'dcs:VirusLastTaxonomicChangeDemoted',\ +'merged': 'dcs:VirusLastTaxonomicChangeMerged',\ +'moved': 'dcs:VirusLastTaxonomicChangeMoved',\ +'new': 'dcs:VirusLastTaxonomicChangeNew',\ +'promoted': 'dcs:VirusLastTaxonomicChangePromoted',\ +'removed as type species': 'dcs:VirusLastTaxonomicChangeRemoved',\ +'renamed': 'dcs:VirusLastTaxonomicChangeRenamed',\ +'split': 'dcs:VirusLastTaxonomicChangeSplit' } DICT_GC = { -'dsDNA': 'VirusGenomeCompositionDoubleStrandedDNA',\ -'ssDNA': 'VirusGenomeCompositionSingleStrandedDNA',\ -'ssDNA(-)': 'VirusGenomeCompositionSingleStrandedDNANegative',\ -'ssDNA(+)': 
'VirusGenomeCompositionSingleStrandedDNAPositive',\ -'ssDNA(+/-)': 'VirusGenomeCompositionSingleStrandedDNA',\ -'dsDNA-RT': 'VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ -'ssRNA-RT': 'VirusGenomeCompositionSingleStrandedDNAReverseTranscription',\ -'dsRNA': 'VirusGenomeCompositionDoubleStrandedRNA',\ -'ssRNA': 'VirusGenomeCompositionSingleStrandedRNA',\ -'ssRNA(-)': 'VirusGenomeCompositionSingleStrandedRNANegative',\ -'ssRNA(+)': 'VirusGenomeCompositionSingleStrandedRNAPositive',\ -'ssRNA(+/-)': 'VirusGenomeCompositionSingleStrandedRNA' +'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA',\ +'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ +'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative',\ +'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive',\ +'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ +'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ +'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription',\ +'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA',\ +'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA',\ +'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative',\ +'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive',\ +'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' } @@ -117,7 +93,7 @@ def check_for_illegal_charc(s): def format_taxonomic_rank_properties(df, index, row): for rank in LIST_TAXONOMIC_LEVELS: if row[rank] == row[rank]: - enum = 'Virus' + rank.upper() + pascalcase(row[rank]) + enum = 'dcs:Virus' + rank.upper()[0] + rank.lower()[1:] + pascalcase(row[rank]) df.loc[index, rank] = enum return df From 3b84cb890d8670095ee523506ad116f1644e9ebf Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 14:57:52 -0800 Subject: [PATCH 29/60] Update create_virus_taxonomic_ranking_enums.py update script to handle new v38 master species list input file --- 
.../create_virus_taxonomic_ranking_enums.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index 589637b1bc..cbd70fab68 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -1,3 +1,26 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 02/21/2024 +Name: create_virus_taxonomic_ranking_enums.py +Description: Creates hierarchical viral taxonomy enum schema from the +ICTV Master Species List. 
+@file_input: ICTV Master Species List .xlsx file +@file_output: formatted .mcf files for viral taxonomy enum schema +""" + # load environment import pandas as pd import sys @@ -132,7 +155,7 @@ def write_dict_to_file(w, enum, d, dict_specialization): def generate_enums_mcf(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=1) df = df.drop(LIST_DROP, axis=1).drop(0, axis=0) dict_of_dicts, dict_specialization = add_item_to_enums(df) w = open(w, mode='w') From a474fc230fd551dbb7ee88478bd59d08d9351f2d Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:03:55 -0800 Subject: [PATCH 30/60] Update format_virus_master_species_list.py update script to accommodate new v38 release --- .../format_virus_master_species_list.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index a05975d31f..be730670b5 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -1,3 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+""" +Author: Samantha Piekos +Date: 02/21/2024 +Name: format_virus_master_species_list.py +Description: Formats ICTV Master Species List into a csv format for import +into Data Commons. This includes converting genome composition and last +change made to corresponding enums. Dcids were formatted by converting the +viral species name to pascalcase and adding the prefix 'bio/'. The viral +taxonomy is encoded in enum format. +@file_input: ICTV Master Speices List .xslx file +@file_output: formatted .mcf files for viral taxonomy enum schema +""" + + # load environment import pandas as pd import sys @@ -131,7 +158,7 @@ def clean_df(df): def clean_file(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=2) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=1) df = df.drop('sort', axis=1).drop(0, axis=0) df = clean_df(df) df.to_csv(w, index=False) From c298ab49c0f41e4fc2eaa9236633521f11369c3f Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:05:48 -0800 Subject: [PATCH 31/60] Update format_virus_master_species_list.py correct file_output description in the header --- .../ICTV_Taxonomy/scripts/format_virus_master_species_list.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index be730670b5..5bb41595b0 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -21,7 +21,7 @@ viral species name to pascalcase and adding the prefix 'bio/'. The viral taxonomy is encoded in enum format. 
@file_input: ICTV Master Speices List .xslx file -@file_output: formatted .mcf files for viral taxonomy enum schema +@file_output: formatted csv format of Virus nodes """ From 394cda850c0eef15ca195eb7e2ebeb95d20a3efa Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:16:34 -0800 Subject: [PATCH 32/60] Update format_virus_metadata_resource.py update script to accommodate v38 --- .../scripts/format_virus_metadata_resource.py | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index ee45f69c60..a575c85677 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -1,3 +1,35 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 02/21/2024 +Name: format_virus_metadata_resource.py +Description: Formats ICTV Virus Metadata Resource into two csv files - +one specific to VirusIsolates and the other VirusGenomeSegment for import +into Data Commons. This includes converting genome composition, genome +coverage, viral host, and viral source to corresponding enums. Virus, +VirusIsolate and VirusGenomeSegment dcids were formatted by converting +the names into pascal case and adding the prefix 'bio/'.
The viral taxonomy +is encoded in enum format and found within Virus nodes. Whether an isolate +is an exemplar isolate or not was encoded into a boolean as a value for the +property 'isExemplar'. +@file_input: ICTV Virus Metadata Resource .xslx file +@file_output: formatted csv format of VirusIsolate and VirusGenomeSegment + nodes +""" + + # set up environment import pandas as pd import sys @@ -6,6 +38,7 @@ # declare universal variables DICT_COVERAGE = { +'coding-complete genome': 'dcs:GenomeCoverageCompleteGenome',\ 'complete genome': 'dcs:GenomeCoverageCompleteGenome',\ 'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome',\ 'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank',\ @@ -43,6 +76,7 @@ DICT_SOURCE = { 'invertebrates': 'dcs:VirusSourceInvertebrates',\ + 'freshwater': 'dcs:VirusSourceWater',\ 'marine': 'dcs:VirusSourceMarine',\ 'phytobiome': 'dcs:VirusSourcePhytobiome',\ 'plants': 'dcs:VirusSourcePlants',\ @@ -272,20 +306,23 @@ def handle_genome_segments(df_segment, virus_dcid, virus_name, isolate_dcid, gen if refSeq == refSeq: dict_refSeq = make_refSeq_dict(refSeq) for item in list_genBank: - d = {'dcid': '', 'name': '', 'genBankAccession': '', 'genomeSegmentOf': '', 'refSeqAccession': ''} + d = {'dcid': [], 'name': [], 'genBankAccession': [], 'genomeSegmentOf': [], 'refSeqAccession': []} if ':' not in item: continue name, gb = item.split(':') name = name.strip() gb = gb.strip() - d['dcid'] = virus_dcid + gb + d['dcid'].append(virus_dcid + gb) check_for_illegal_charc(virus_dcid + gb) - d['name'] = virus_name + ' Segment ' + name - d['genBankAccession'] = gb - d['genomeSegmentOf'] = isolate_dcid + d['name'].append(virus_name + ' Segment ' + name) + d['genBankAccession'].append(gb) + d['genomeSegmentOf'].append('dcid:' + isolate_dcid) if name in dict_refSeq: - d['refSeqAccession'] = dict_refSeq[name] - df_segment = df_segment.append(d, ignore_index=True) + d['refSeqAccession'].append(dict_refSeq[name]) + else: + 
d['refSeqAccession'].append('') + df_new_row = pd.DataFrame.from_dict(d, orient='columns') + df_segment = pd.concat([df_segment, df_new_row], ignore_index=True) return df_segment From 1c99e86b39f15cd33facc0d6b5accc899e3123a5 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:19:30 -0800 Subject: [PATCH 33/60] Add run.sh --- .../biomedical/ICTV_Taxonomy/scripts/run.sh | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 scripts/biomedical/ICTV_Taxonomy/scripts/run.sh diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh new file mode 100644 index 0000000000..0aacc2d463 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh @@ -0,0 +1,32 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 02/21/2024 +Name: run +Description: This file runs the python scripts to generate the viral taxonomy +enum mcf file and the csv files for Viruses, Virus Isolates, and Virus Genome +Segments from the ICTV Master Species List and the Virus Metadata Files.
+""" + +# !/bin/bash + + +# Command to Generate Taxonomic Rank Enum Schema +python3 scripts/create_virus_taxonomic_ranking_enums.py input/ICTV_Virus_Species_List.xlsx ICTV_schema_taxonomic_ranking_enum.mcf + +# Commands to Run Scripts to Generate Cleaned CSV Files +python3 scripts/format_virus_master_species_list.py input/ICTV_Virus_Species_List.xlsx VirusSpecies.csv + +python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log \ No newline at end of file From e94562ec3a2ebbbeb87c73a4a1b4e41d5c1fb7ac Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:23:08 -0800 Subject: [PATCH 34/60] Update download.sh --- scripts/biomedical/ICTV_Taxonomy/scripts/download.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh index e49993cd8f..ce695fe4d6 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,15 @@ # limitations under the License. """ Author: Samantha Piekos -Date: 03/22/2023 +Date: 02/21/2024 Name: download Description: This file downloads the most recent version of the ICTV Master -Species List and Virus Metadata Resource and prepares it for processing +Species List and Virus Metadata Resource and prepares it for processing. 
""" + #!/bin/bash + # make input directory mkdir -p input; cd input From 4dc2aaa22156b388feff5420ebfcb14bedc3b1a7 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:23:56 -0800 Subject: [PATCH 35/60] Update format_virus_metadata_resource.log update log from running script for v38 --- .../ICTV_Taxonomy/logs/format_virus_metadata_resource.log | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log b/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log index d6ea08e466..80d063f948 100644 --- a/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log +++ b/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log @@ -1,4 +1,5 @@ Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/BetachrysovirusMagnaporthis_VietNam_MoCV1-B -Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/AroaVirus_BeAn4073_AF013366 +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/OrthoflavivirusAroaense_BeAn4073_AF013366 +Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/OrthobornavirusCaenophidiae_CHC_224_BK014571 Non-unique VirusIsolate dcid generated! Added additional info to differentiate: bio/UgandanCassavaBrownStreakVirus_UG_FJ185044 Non-unique VirusIsolate dcid generated! 
Added additional info to differentiate: bio/PotatoVirusY_N_X97895 From 31a429f40cda23ab23552a1a928da6678fce81df Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:26:26 -0800 Subject: [PATCH 36/60] Update README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 33 ++-------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 66436de30c..69ae08c1b8 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -116,6 +116,7 @@ Classes, properties, and enumerations that were added in this import to represen #### Scripts - [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh) +- [run.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/runsh) - [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) - [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py) - [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py) @@ -135,39 +136,9 @@ sh download.sh Generate the enummeration schema MCF, which represents virus taxonomic ranks by running: ```bash -python3 scripts/create_virus_taxonomic_ranking_enums.py import_files/ICTV_Master_Species_List_2021_v3.xlsx ICTV_schema_taxonomic_ranking_enum.mcf -``` - -Clean and format Master Species List as a CSV that matches the corresponding tMCF by running: - -```bash -python3 scripts/format_virus_master_species_list.py input/ICTV_Master_Species_List.xlsx VirusSpecies.csv -``` - -Clean and format Virus Metadata Resource as 
a CSV that matches the corresponding tMCF by running: - -```bash -python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log +sh run.sh ``` ### Tests -#### Dataset Specific Tests - -To test the import to evaluate whether the data is formatted as expected or if changes were made in the formatting in the most recent release run the following commands to evaluate each cleaned csv individually. - -VirusSpecies: -```bash -``` - -VirusIsolates: -```bash -``` - -VirusGenomeSegment: -```bash -``` - -#### Data Commons Import Tests - Please run all cleaned CSV + tMCF pairs through our lint test using our Data Commons import tool, which conducts general formatting tests. From efd8befe5f0c3ef7bdb6abbd08bfebc48c2c81f6 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:36:25 -0800 Subject: [PATCH 37/60] Update run.sh change taxonomic rank enum schema to be generated from the virus metadata file --- scripts/biomedical/ICTV_Taxonomy/scripts/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh index 0aacc2d463..ac8c5d5758 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh @@ -24,9 +24,9 @@ Segments from the ICTV Master Species List and the Virus Metadata Files. 
# Command to Generate Taxonomic Rank Enum Schema -python3 scripts/create_virus_taxonomic_ranking_enums.py input/ICTV_Virus_Species_List.xlsx ICTV_schema_taxonomic_ranking_enum.mcf +python3 scripts/create_virus_taxonomic_ranking_enums.py input/ICTV_Virus_Metadata_Resource.xlsx ICTV_schema_taxonomic_ranking_enum.mcf # Commands to Run Scripts to Generate Cleaned CSV Files python3 scripts/format_virus_master_species_list.py input/ICTV_Virus_Species_List.xlsx VirusSpecies.csv -python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log \ No newline at end of file +python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log From c1c30deebc0674d53e3f7ef7559dec2de685cf5f Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:37:46 -0800 Subject: [PATCH 38/60] Update create_virus_taxonomic_ranking_enums.py update so that the virus taxonomic schema is generated from the virus metadata resource file --- .../scripts/create_virus_taxonomic_ranking_enums.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index cbd70fab68..c5515d4929 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -16,8 +16,8 @@ Date: 02/21/2024 Name: create_virus_taxonomic_ranking_enums.py Description: Creates hierarchical viral taxonomy enum schema from the -ICTV Master Species List. -@file_input: ICTV Master Speices List .xslx file +ICTV Virus Metadata Resource. 
+@file_input: ICTV Virus Metadata Resource .xslx file @file_output: formatted .mcf files for viral taxonomy enum schema """ @@ -155,7 +155,7 @@ def write_dict_to_file(w, enum, d, dict_specialization): def generate_enums_mcf(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=1) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) df = df.drop(LIST_DROP, axis=1).drop(0, axis=0) dict_of_dicts, dict_specialization = add_item_to_enums(df) w = open(w, mode='w') From fb396e9d07e7617539e279a376f33d2f922cc160 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Wed, 21 Feb 2024 15:47:46 -0800 Subject: [PATCH 39/60] Update format_virus_metadata_resource.py fix new enum added in v38 --- .../ICTV_Taxonomy/scripts/format_virus_metadata_resource.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index a575c85677..d117eb87c3 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -75,8 +75,8 @@ DICT_SOURCE = { + 'freshwater': 'dcs:VirusSourceFreshwater',\ 'invertebrates': 'dcs:VirusSourceInvertebrates',\ - 'freshwater': 'dcs:VirusSourceWater',\ 'marine': 'dcs:VirusSourceMarine',\ 'phytobiome': 'dcs:VirusSourcePhytobiome',\ 'plants': 'dcs:VirusSourcePlants',\ From eee41abc03aded5939f0475be72b3da2e1bfdc3b Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 26 Feb 2024 14:50:57 -0800 Subject: [PATCH 40/60] Update execution bash files update download.sh, run.sh, and tests.sh scripts that download, format+clean, and test the import files --- .../ICTV_Taxonomy/scripts/download.sh | 2 +- .../biomedical/ICTV_Taxonomy/scripts/run.sh | 6 ++-- .../biomedical/ICTV_Taxonomy/scripts/tests.sh | 31 +++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 
100644 scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh index ce695fe4d6..672164fcd4 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh @@ -13,7 +13,7 @@ # limitations under the License. """ Author: Samantha Piekos -Date: 02/21/2024 +Date: 02/26/2024 Name: download Description: This file downloads the most recent version of the ICTV Master Species List and Virus Metadata Resource and prepares it for processing. diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh index ac8c5d5758..336a5d6923 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh @@ -13,7 +13,7 @@ # limitations under the License. """ Author: Samantha Piekos -Date: 02/21/2024 +Date: 02/26/2024 Name: download Description: This file runs the python scripts to generate the viral taxonomy enum mcf file and the csv files for Viruses, Virus Isolates, and Virus Genome @@ -27,6 +27,6 @@ Segments from the ICTV Master Species List and the Virus Metadata Files. 
python3 scripts/create_virus_taxonomic_ranking_enums.py input/ICTV_Virus_Metadata_Resource.xlsx ICTV_schema_taxonomic_ranking_enum.mcf # Commands to Run Scripts to Generate Cleaned CSV Files -python3 scripts/format_virus_master_species_list.py input/ICTV_Virus_Species_List.xlsx VirusSpecies.csv +python3 scripts/format_virus_master_species_list.py input/ICTV_Virus_Species_List.xlsx CSVs/VirusSpecies.csv -python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx VirusIsolates.csv VirusGenomeSegments.csv > format_virus_metadata_resource.log +python3 scripts/format_virus_metadata_resource.py input/ICTV_Virus_Metadata_Resource.xlsx CSVs/VirusIsolates.csv CSVs/VirusGenomeSegments.csv > format_virus_metadata_resource.log diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh new file mode 100644 index 0000000000..75fdea6d81 --- /dev/null +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh @@ -0,0 +1,31 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Samantha Piekos +Date: 02/26/2024 +Name: tests +Description: This file runs the Data Commons Java tool to run standard +tests on tmcf + CSV pairs for the ICTV data import. 
+""" + +#!/bin/bash + +java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusMasterSpeciesList.tmcf CSVs/VirusSpecies.csv ICTV*.mcf +mv dc_generated species + +java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusTaxonomy.tmcf CSVs/VirusIsolates.csv ICTV*.mcf +mv dc_generated virus_isolates + +java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusGenomeSegment.tmcf CSVs/VirusGenomeSegments.csv ICTV*.mcf +mv dc_generated genome_segments From 8f8a9dd0f5db53462e292b176ff2c80c044cf9fb Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 26 Feb 2024 15:06:55 -0800 Subject: [PATCH 41/60] Update README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 40 +++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 69ae08c1b8..6be4e29c39 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -101,29 +101,28 @@ Classes, properties, and enumerations that were added in this import to represen * Enumerations Generated Via Script * VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum -#### Schema MCFs - -- [ICTV_schema.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema.mcf) -- [ICTV_schema_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_enum.mcf) -- [ICTV_schema_taxonomic_ranking_enum.mcf](https://github.com/datacommonsorg/schema/blob/main/biomedical_schema/ICTV_schema_taxonomic_ranking_enum.mcf) - #### tMCFs -- 
[VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusMasterSpeciesList.tmcf) -- [VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusTaxonomy.tmcf) -- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegment.tmcf) +- [VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusMasterSpeciesList.tmcf) contains the tmcf mapping to the csv of viruses. +- [VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusTaxonomy.tmcf) contains the tmcf mapping to the csv of virus isolates. +- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegment.tmcf) contains the tmcf mapping to the csv of viral genome segments. 
#### Scripts -- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh) -- [run.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/runsh) -- [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) -- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py) -- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py) +##### Bash Scripts + +- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh) downloads the most recent release of the ICTV Master Species List and Virus Metadata Resource. +- [run.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh) creates new viral taxonomy enum and converts data into formatted CSV for import of data on viruses, virus isolates, and viral genome fragments into the knowledge graph. +- [tests.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh) runs standard tests on CSV + tMCF pairs to check for proper formatting. + +##### Python Scripts +- [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) creates the viral taxonomy enum mcf file from the Virus Metadata Resource file. +- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py) parses the raw Master Species List xslx file into virus csv file. 
+- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py) parses the raw Virus Metadata Resource file into virus isolates and viral genome segements csv files. #### Log Files -- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) +- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) log file from script converting the Virus Metadata Resource into formatted CSV file. ### Import Procedure @@ -141,4 +140,13 @@ sh run.sh ### Tests -Please run all cleaned CSV + tMCF pairs through our lint test using our Data Commons import tool, which conducts general formatting tests. +Run Data Commons's java -jar import tool to ensure that all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other warnings. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then referenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test.
+ +To run tests: + +```bash +sh tests.sh +``` + +This will generate an output file for the results of the tests on each csv + tmcf pair + From 4142015b55c55c45afd6500d45fc7b949aa14e25 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 14:35:27 -0800 Subject: [PATCH 42/60] Rename VirusMasterSpeciesList.tmcf to VirusSpecies.tmcf --- .../tMCF/{VirusMasterSpeciesList.tmcf => VirusSpecies.tmcf} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/biomedical/ICTV_Taxonomy/tMCF/{VirusMasterSpeciesList.tmcf => VirusSpecies.tmcf} (100%) diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusSpecies.tmcf similarity index 100% rename from scripts/biomedical/ICTV_Taxonomy/tMCF/VirusMasterSpeciesList.tmcf rename to scripts/biomedical/ICTV_Taxonomy/tMCF/VirusSpecies.tmcf From 0530ee62d7e888574099f0f3487c9f89b326a775 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 14:35:43 -0800 Subject: [PATCH 43/60] Rename VirusGenomeSegment.tmcf to VirusGenomeSegments.tmcf --- .../tMCF/{VirusGenomeSegment.tmcf => VirusGenomeSegments.tmcf} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/biomedical/ICTV_Taxonomy/tMCF/{VirusGenomeSegment.tmcf => VirusGenomeSegments.tmcf} (100%) diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegments.tmcf similarity index 100% rename from scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegment.tmcf rename to scripts/biomedical/ICTV_Taxonomy/tMCF/VirusGenomeSegments.tmcf From b88449fbd2a63151adf5358aee7319fb22fe06dc Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 14:36:01 -0800 Subject: [PATCH 44/60] Rename VirusTaxonomy.tmcf to VirusIsolates.tmcf --- .../ICTV_Taxonomy/tMCF/{VirusTaxonomy.tmcf => VirusIsolates.tmcf} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 
scripts/biomedical/ICTV_Taxonomy/tMCF/{VirusTaxonomy.tmcf => VirusIsolates.tmcf} (100%) diff --git a/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf b/scripts/biomedical/ICTV_Taxonomy/tMCF/VirusIsolates.tmcf similarity index 100% rename from scripts/biomedical/ICTV_Taxonomy/tMCF/VirusTaxonomy.tmcf rename to scripts/biomedical/ICTV_Taxonomy/tMCF/VirusIsolates.tmcf From 413656d88eb95c718b247b0305702c4030f905d2 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 14:40:36 -0800 Subject: [PATCH 45/60] Update tmcf links in README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 6be4e29c39..1d4fb1ea34 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -103,9 +103,9 @@ Classes, properties, and enumerations that were added in this import to represen #### tMCFs -- [VirusMasterSpeciesList.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusMasterSpeciesList.tmcf) contains the tmcf mapping to the csv of viruses. -- [VirusTaxonomy.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusTaxonomy.tmcf) contains the tmcf mapping to the csv of virus isolates. -- [VirusGenomeSegmeng.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegment.tmcf) contains the tmcf mapping to the csv of viral genome segments. +- [VirusSpecies.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusSpecies.tmcf) contains the tmcf mapping to the csv of viruses. +- [VirusIsolates.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusIsolates.tmcf) contains the tmcf mapping to the csv of virus isolates. 
+- [VirusGenomeSegments.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegments.tmcf) contains the tmcf mapping to the csv of viral genome segments. #### Scripts From 90fdf4bbeee6b4b780318db672caafc153cda3df Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 17:27:29 -0800 Subject: [PATCH 46/60] Update bash scripts filepaths in README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 1d4fb1ea34..33e8ca84ce 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -111,9 +111,9 @@ Classes, properties, and enumerations that were added in this import to represen ##### Bash Scripts -- [download.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/download.sh) downloads the most recent release of the ICTV Master Species List and Virus Metadata Resource. -- [run.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh) creates new viral taxonomy enum and converts data into formatted CSV for import of data on viruses, virus isolates, and viral genome fragments into the knowledge graph. -- [tests.sh](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh) runs standard tests on CSV + tMCF pairs to check for proper formatting. +- [download.sh](scripts/download.sh) downloads the most recent release of the ICTV Master Species List and Virus Metadata Resource. +- [run.sh](scripts/run.sh) creates new viral taxonomy enum and converts data into formatted CSV for import of data on viruses, virus isolates, and viral genome fragments into the knowledge graph. +- [tests.sh](scripts/tests.sh) runs standard tests on CSV + tMCF pairs to check for proper formatting. 
##### Python Scripts - [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) creates the viral taxonomy enum mcf file from the Virus Metadata Resource file. From 4131f21f6f73b35c37d4b0a53beeb64b1334b657 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 17:28:37 -0800 Subject: [PATCH 47/60] Update filepaths in README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 33e8ca84ce..e2a9757785 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -103,9 +103,9 @@ Classes, properties, and enumerations that were added in this import to represen #### tMCFs -- [VirusSpecies.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusSpecies.tmcf) contains the tmcf mapping to the csv of viruses. -- [VirusIsolates.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusIsolates.tmcf) contains the tmcf mapping to the csv of virus isolates. -- [VirusGenomeSegments.tmcf](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/tMCFs/VirusGenomeSegments.tmcf) contains the tmcf mapping to the csv of viral genome segments. +- [VirusSpecies.tmcf](tMCFs/VirusSpecies.tmcf) contains the tmcf mapping to the csv of viruses. +- [VirusIsolates.tmcf](tMCFs/VirusIsolates.tmcf) contains the tmcf mapping to the csv of virus isolates. +- [VirusGenomeSegments.tmcf](tMCFs/VirusGenomeSegments.tmcf) contains the tmcf mapping to the csv of viral genome segments. 
#### Scripts @@ -116,13 +116,13 @@ Classes, properties, and enumerations that were added in this import to represen - [tests.sh](scripts/tests.sh) runs standard tests on CSV + tMCF pairs to check for proper formatting. ##### Python Scripts -- [create_virus_taxonomic_ranking_enums.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py) creates the viral taxonomy enum mcf file from the Virus Metadata Resource file. -- [format_virus_master_species_list.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py) parses the raw Master Species List xslx file into virus csv file. -- [format_virus_metadata_resource.py](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py) parses the raw Virus Metadata Resource file into virus isolates and viral genome segements csv files. +- [create_virus_taxonomic_ranking_enums.py](scripts/create_virus_taxonomic_ranking_enums.py) creates the viral taxonomy enum mcf file from the Virus Metadata Resource file. +- [format_virus_master_species_list.py](scripts/format_virus_master_species_list.py) parses the raw Master Species List xslx file into virus csv file. +- [format_virus_metadata_resource.py](scripts/format_virus_metadata_resource.py) parses the raw Virus Metadata Resource file into virus isolates and viral genome segements csv files. #### Log Files -- [format_virus_metadata_resource.log](https://github.com/datacommonsorg/data/new/master/scripts/biomedical/ICTV_Taxonomy/logs/format_virus_metadata_resource.log) log file from script converting the Virus Metadata Resource into formatted CSV file. +- [format_virus_metadata_resource.log](logs/format_virus_metadata_resource.log) log file from script converting the Virus Metadata Resource into formatted CSV file. 
### Import Procedure From 4fcca307a073d5b3d8c2bd3de2a5b187590bc48c Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 21:58:12 -0800 Subject: [PATCH 48/60] Update README.md table of contents --- scripts/biomedical/ICTV_Taxonomy/README.md | 41 +++++++++++++--------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index e2a9757785..62d94834a7 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -4,16 +4,24 @@ ## Table of Contents 1. [About the Dataset](#about-the-dataset) - 1. [Download URL](#download-urls) - 2. [Overview](#overview) - 3. [Notes and Caveats](#notes-and-caveats) - 4. [dcid Generation](#dcid-generation) - 5. [License](#license) - 6. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) -2. [About the Import](#about-the-import) - 1. [Artifacts](#artifacts) - 2. [Import Procedure](#import-procedure) - 4. [Tests](#tests) + A. [Download URL](#download-urls) + B. [Overview](#overview) + C. [Notes and Caveats](#notes-and-caveats) + D. [dcid Generation](#dcid-generation) + i. [Virus](#virus) + ii. [VirusIsolate](#virusisolate) + iii. [VirusGenomeSegment](#virusgenomesegment) + iv. [Illegal Characters](#illegal-characters) + E. [License](#license) + F. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) +3. [About the Import](#about-the-import) + A. [Artifacts](#artifacts) + i. [New Schema](#new-schema) + ii. [Scripts](#scripts) + iii.[tMCFs)(#tmcfs) + iv. [Log Files](#log-files) + B. [Import Procedure](#import-procedure) + C. 
[Tests](#tests) ## About the Datasets @@ -101,12 +109,6 @@ Classes, properties, and enumerations that were added in this import to represen * Enumerations Generated Via Script * VirusRealmEnum, VirusSubrealmEnum, VirusKingdomEnum, VirusSubkingdomEnum, VirusPhylumEnum, VirusSubphylumEnum, VirusClassEnum, VirusSubclassEnum, VirusOrderEnum, VirusSuborderEnum, VirusFamilyEnum, VirusSubfamilyEnum, VirusGenusEnum, VirusSubgenusEnum -#### tMCFs - -- [VirusSpecies.tmcf](tMCFs/VirusSpecies.tmcf) contains the tmcf mapping to the csv of viruses. -- [VirusIsolates.tmcf](tMCFs/VirusIsolates.tmcf) contains the tmcf mapping to the csv of virus isolates. -- [VirusGenomeSegments.tmcf](tMCFs/VirusGenomeSegments.tmcf) contains the tmcf mapping to the csv of viral genome segments. - #### Scripts ##### Bash Scripts @@ -116,10 +118,17 @@ Classes, properties, and enumerations that were added in this import to represen - [tests.sh](scripts/tests.sh) runs standard tests on CSV + tMCF pairs to check for proper formatting. ##### Python Scripts + - [create_virus_taxonomic_ranking_enums.py](scripts/create_virus_taxonomic_ranking_enums.py) creates the viral taxonomy enum mcf file from the Virus Metadata Resource file. - [format_virus_master_species_list.py](scripts/format_virus_master_species_list.py) parses the raw Master Species List xslx file into virus csv file. - [format_virus_metadata_resource.py](scripts/format_virus_metadata_resource.py) parses the raw Virus Metadata Resource file into virus isolates and viral genome segements csv files. +#### tMCFs + +- [VirusSpecies.tmcf](tMCFs/VirusSpecies.tmcf) contains the tmcf mapping to the csv of viruses. +- [VirusIsolates.tmcf](tMCFs/VirusIsolates.tmcf) contains the tmcf mapping to the csv of virus isolates. +- [VirusGenomeSegments.tmcf](tMCFs/VirusGenomeSegments.tmcf) contains the tmcf mapping to the csv of viral genome segments. 
+ #### Log Files - [format_virus_metadata_resource.log](logs/format_virus_metadata_resource.log) log file from script converting the Virus Metadata Resource into formatted CSV file. From 0267a424c04ff7d1b4164a4af20698591bc1dc5e Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 22:01:09 -0800 Subject: [PATCH 49/60] Update README.md table of contents --- scripts/biomedical/ICTV_Taxonomy/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 62d94834a7..d0304ebee4 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -14,11 +14,11 @@ iv. [Illegal Characters](#illegal-characters) E. [License](#license) F. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) -3. [About the Import](#about-the-import) +2. [About the Import](#about-the-import) A. [Artifacts](#artifacts) - i. [New Schema](#new-schema) + i. [New Schema](#new-schema) ii. [Scripts](#scripts) - iii.[tMCFs)(#tmcfs) + iii.[tMCFs](#tmcfs) iv. [Log Files](#log-files) B. [Import Procedure](#import-procedure) C. [Tests](#tests) From 157f1e5ca8c3dedd2def38cadcc3da7554788646 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 22:02:54 -0800 Subject: [PATCH 50/60] Update README.md Table of Contents --- scripts/biomedical/ICTV_Taxonomy/README.md | 34 +++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index d0304ebee4..b1a37c2b92 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -4,24 +4,24 @@ ## Table of Contents 1. [About the Dataset](#about-the-dataset) - A. [Download URL](#download-urls) - B. [Overview](#overview) - C. [Notes and Caveats](#notes-and-caveats) - D. 
[dcid Generation](#dcid-generation) - i. [Virus](#virus) - ii. [VirusIsolate](#virusisolate) - iii. [VirusGenomeSegment](#virusgenomesegment) - iv. [Illegal Characters](#illegal-characters) - E. [License](#license) - F. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) + 1. [Download URL](#download-urls) + 2. [Overview](#overview) + 3. [Notes and Caveats](#notes-and-caveats) + 4. [dcid Generation](#dcid-generation) + 1. [Virus](#virus) + 2. [VirusIsolate](#virusisolate) + 3. [VirusGenomeSegment](#virusgenomesegment) + 4. [Illegal Characters](#illegal-characters) + 5. [License](#license) + 6. [Dataset Documentation and Relevant Links](#dataset-documentation-and-relevant-links) 2. [About the Import](#about-the-import) - A. [Artifacts](#artifacts) - i. [New Schema](#new-schema) - ii. [Scripts](#scripts) - iii.[tMCFs](#tmcfs) - iv. [Log Files](#log-files) - B. [Import Procedure](#import-procedure) - C. [Tests](#tests) + 1. [Artifacts](#artifacts) + 1. [New Schema](#new-schema) + 2. [Scripts](#scripts) + 3.[tMCFs](#tmcfs) + 4. [Log Files](#log-files) + 2. [Import Procedure](#import-procedure) + 3. [Tests](#tests) ## About the Datasets From b7332803894ee4e637004b43bf579819ad2b0afa Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 4 Mar 2024 22:03:30 -0800 Subject: [PATCH 51/60] Update README.md --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index b1a37c2b92..fd4102810a 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -18,7 +18,7 @@ 1. [Artifacts](#artifacts) 1. [New Schema](#new-schema) 2. [Scripts](#scripts) - 3.[tMCFs](#tmcfs) + 3. [tMCFs](#tmcfs) 4. [Log Files](#log-files) 2. [Import Procedure](#import-procedure) 3. 
[Tests](#tests) From 799cc24ac3c45f6531242dd8c62704fcb7f47a0c Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 13:48:14 -0800 Subject: [PATCH 52/60] Add line creating CSVs directory --- scripts/biomedical/ICTV_Taxonomy/scripts/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh index 336a5d6923..796a9cb975 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/run.sh @@ -22,6 +22,8 @@ Segments from the ICTV Master Species List and the Virus Metadata Files. # !/bin/bash +# make CSV directory to which to output cleaned csv +mkdir -p CSVs # Command to Generate Taxonomic Rank Enum Schema python3 scripts/create_virus_taxonomic_ranking_enums.py input/ICTV_Virus_Metadata_Resource.xlsx ICTV_schema_taxonomic_ranking_enum.mcf From 25d27408dd81175f93639e401fa50b15e94534ac Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 13:51:08 -0800 Subject: [PATCH 53/60] Update create_virus_taxonomic_ranking_enums.py remove trailing \ in comma separated lists --- .../create_virus_taxonomic_ranking_enums.py | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index c5515d4929..0b80fcb243 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -28,56 +28,56 @@ # declare universal variables HEADER = [ -'sort',\ -'isolateSort',\ -'realm',\ -'subrealm',\ -'kingdom',\ -'subkingdom',\ -'phylum',\ -'subphylum',\ -'class',\ -'subclass',\ -'order',\ -'suborder',\ -'family',\ -'subfamily',\ -'genus',\ -'subgenus',\ -'species',\ -'isExemplar',\ -'name',\ -'abbreviation',\ 
-'isolateDesignation',\ -'genBankAccession',\ -'refSeqAccession',\ -'genomeCoverage',\ -'genomeComposition',\ -'hostSource',\ -'host',\ -'source',\ -'dcid',\ -'isolate_dcid',\ +'sort', +'isolateSort', +'realm', +'subrealm', +'kingdom', +'subkingdom', +'phylum', +'subphylum', +'class', +'subclass', +'order', +'suborder', +'family', +'subfamily', +'genus', +'subgenus', +'species', +'isExemplar', +'name', +'abbreviation', +'isolateDesignation', +'genBankAccession', +'refSeqAccession', +'genomeCoverage', +'genomeComposition', +'hostSource', +'host', +'source', +'dcid', +'isolate_dcid', 'isolate_name' ] LIST_DROP = [ -'sort',\ -'isolateSort',\ -'species',\ -'isExemplar',\ -'name',\ -'abbreviation',\ -'isolateDesignation',\ -'genBankAccession',\ -'refSeqAccession',\ -'genomeCoverage',\ -'genomeComposition',\ -'hostSource',\ -'host',\ -'source',\ -'dcid',\ -'isolate_dcid',\ +'sort', +'isolateSort', +'species', +'isExemplar', +'name', +'abbreviation', +'isolateDesignation', +'genBankAccession', +'refSeqAccession', +'genomeCoverage', +'genomeComposition', +'hostSource', +'host', +'source', +'dcid', +'isolate_dcid', 'isolate_name' ] From e3ffedc104a324be8bf60523c4244ba4bc833bf0 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 13:51:56 -0800 Subject: [PATCH 54/60] Update format_virus_master_species_list.py remove trailing '\' from comma separated lists and directories --- .../format_virus_master_species_list.py | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index 5bb41595b0..7fb97312a0 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -32,74 +32,74 @@ # declare universal variables DICT_CHANGE_ENUM = { -'abolished': 
'dcs:VirusLastTaxonomicChangeAbolished',\ -'demoted' : 'dcs:VirusLastTaxonomicChangeDemoted',\ -'merged': 'dcs:VirusLastTaxonomicChangeMerged',\ -'moved': 'dcs:VirusLastTaxonomicChangeMoved',\ -'new': 'dcs:VirusLastTaxonomicChangeNew',\ -'promoted': 'dcs:VirusLastTaxonomicChangePromoted',\ -'removed as type species': 'dcs:VirusLastTaxonomicChangeRemoved',\ -'renamed': 'dcs:VirusLastTaxonomicChangeRenamed',\ +'abolished': 'dcs:VirusLastTaxonomicChangeAbolished', +'demoted' : 'dcs:VirusLastTaxonomicChangeDemoted', +'merged': 'dcs:VirusLastTaxonomicChangeMerged', +'moved': 'dcs:VirusLastTaxonomicChangeMoved', +'new': 'dcs:VirusLastTaxonomicChangeNew', +'promoted': 'dcs:VirusLastTaxonomicChangePromoted', +'removed as type species': 'dcs:VirusLastTaxonomicChangeRemoved', +'renamed': 'dcs:VirusLastTaxonomicChangeRenamed', 'split': 'dcs:VirusLastTaxonomicChangeSplit' } DICT_GC = { -'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA',\ -'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ -'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative',\ -'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive',\ -'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ -'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ -'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription',\ -'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA',\ -'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA',\ -'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative',\ -'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive',\ +'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA', +'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA', +'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', +'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', +'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA', +'dsDNA-RT': 
'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', +'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', +'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA', +'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA', +'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', +'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', 'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' } HEADER = [ -'sort',\ -'realm',\ -'subrealm',\ -'kingdom',\ -'subkingdom',\ -'phylum',\ -'subphylum',\ -'class',\ -'subclass',\ -'order',\ -'suborder',\ -'family',\ -'subfamily',\ -'genus',\ -'subgenus',\ -'species',\ -'genomeComposition',\ -'lastChange',\ -'lastChangeVersion',\ -'proposalForLastChange',\ -'taxonHistoryURL',\ +'sort', +'realm', +'subrealm', +'kingdom', +'subkingdom', +'phylum', +'subphylum', +'class', +'subclass', +'order', +'suborder', +'family', +'subfamily', +'genus', +'subgenus', +'species', +'genomeComposition', +'lastChange', +'lastChangeVersion', +'proposalForLastChange', +'taxonHistoryURL', 'dcid' ] LIST_TAXONOMIC_LEVELS = [ -'realm',\ -'subrealm',\ -'kingdom',\ -'subkingdom',\ -'phylum',\ -'subphylum',\ -'class',\ -'subclass',\ -'order',\ -'suborder',\ -'family',\ -'subfamily',\ -'genus',\ +'realm', +'subrealm', +'kingdom', +'subkingdom', +'phylum', +'subphylum', +'class', +'subclass', +'order', +'suborder', +'family', +'subfamily', +'genus', 'subgenus' ] From 67909985f72a330593c56a5c3a737f4ce76b8c62 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 13:52:33 -0800 Subject: [PATCH 55/60] Update format_virus_metadata_resource.py remove trailing '\' from comma separated lists and directories --- .../scripts/format_virus_metadata_resource.py | 152 +++++++++--------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py 
b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index d117eb87c3..753769d3c1 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -38,112 +38,112 @@ # declare universal variables DICT_COVERAGE = { -'coding-complete genome': 'dcs:GenomeCoverageCompleteGenome',\ -'complete genome': 'dcs:GenomeCoverageCompleteGenome',\ -'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome',\ -'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank',\ +'coding-complete genome': 'dcs:GenomeCoverageCompleteGenome', +'complete genome': 'dcs:GenomeCoverageCompleteGenome', +'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome', +'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank', 'partial genome': 'dcs:GenomeCoveragePartialGenome' } DICT_GC = { -'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA',\ -'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ -'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative',\ -'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive',\ -'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA',\ -'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription',\ -'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription',\ -'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA',\ -'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA',\ -'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative',\ -'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive',\ +'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA', +'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA', +'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', +'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', +'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA', +'dsDNA-RT': 
'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', +'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', +'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA', +'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA', +'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', +'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', 'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' } DICT_HOST = { - 'algae': 'dcs:VirusHostAlgae',\ - 'archaea': 'dcs:VirusHostArchaea',\ - 'bacteria': 'dcs:VirusHostBacteria',\ - 'fungi': 'dcs:VirusHostFungi',\ - 'invertebrates': 'dcs:VirusHostInvertebrates',\ - 'plants': 'dcs:VirusHostPlants',\ - 'protists': 'dcs:VirusHostProtists',\ + 'algae': 'dcs:VirusHostAlgae', + 'archaea': 'dcs:VirusHostArchaea', + 'bacteria': 'dcs:VirusHostBacteria', + 'fungi': 'dcs:VirusHostFungi', + 'invertebrates': 'dcs:VirusHostInvertebrates', + 'plants': 'dcs:VirusHostPlants', + 'protists': 'dcs:VirusHostProtists', 'vertebrates': 'dcs:VirusHostVertebrates' } DICT_SOURCE = { - 'freshwater': 'dcs:VirusSourceFreshwater',\ - 'invertebrates': 'dcs:VirusSourceInvertebrates',\ - 'marine': 'dcs:VirusSourceMarine',\ - 'phytobiome': 'dcs:VirusSourcePhytobiome',\ - 'plants': 'dcs:VirusSourcePlants',\ - 'protists': 'dcs:VirusSourceProtists',\ - 'sewage': 'dcs:VirusSourceSewage',\ + 'freshwater': 'dcs:VirusSourceFreshwater', + 'invertebrates': 'dcs:VirusSourceInvertebrates', + 'marine': 'dcs:VirusSourceMarine', + 'phytobiome': 'dcs:VirusSourcePhytobiome', + 'plants': 'dcs:VirusSourcePlants', + 'protists': 'dcs:VirusSourceProtists', + 'sewage': 'dcs:VirusSourceSewage', 'soil': 'dcs:VirusSourceSoil' } HEADER = [ -'sort',\ -'isolateSort',\ -'realm',\ -'subrealm',\ -'kingdom',\ -'subkingdom',\ -'phylum',\ -'subphylum',\ -'class',\ -'subclass',\ -'order',\ -'suborder',\ -'family',\ -'subfamily',\ -'genus',\ -'subgenus',\ -'species',\ -'isExemplar',\ -'name',\ -'abbreviation',\ -'isolateDesignation',\ 
-'genBankAccession',\ -'refSeqAccession',\ -'genomeCoverage',\ -'genomeComposition',\ -'hostSource',\ -'host',\ -'source',\ -'dcid',\ -'isolate_dcid',\ +'sort', +'isolateSort', +'realm', +'subrealm', +'kingdom', +'subkingdom', +'phylum', +'subphylum', +'class', +'subclass', +'order', +'suborder', +'family', +'subfamily', +'genus', +'subgenus', +'species', +'isExemplar', +'name', +'abbreviation', +'isolateDesignation', +'genBankAccession', +'refSeqAccession', +'genomeCoverage', +'genomeComposition', +'hostSource', +'host', +'source', +'dcid', +'isolate_dcid', 'isolate_name' ] HEADER_2 = [ -'dcid',\ -'name',\ -'genBankAccession',\ -'genomeSegmentOf',\ +'dcid', +'name', +'genBankAccession', +'genomeSegmentOf', 'refSeqAccession' ] LIST_TAXONOMIC_LEVELS = [ -'realm',\ -'subrealm',\ -'kingdom',\ -'subkingdom',\ -'phylum',\ -'subphylum',\ -'class',\ -'subclass',\ -'order',\ -'suborder',\ -'family',\ -'subfamily',\ -'genus',\ +'realm', +'subrealm', +'kingdom', +'subkingdom', +'phylum', +'subphylum', +'class', +'subclass', +'order', +'suborder', +'family', +'subfamily', +'genus', 'subgenus' ] From b02e7c0db7b27be746ad98814fba647badd4fad3 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 14:49:49 -0800 Subject: [PATCH 56/60] Update tests.sh add extra step to download the data commons java test tool --- scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh b/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh index 75fdea6d81..5117c9eb81 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/tests.sh @@ -13,7 +13,7 @@ # limitations under the License. """ Author: Samantha Piekos -Date: 02/26/2024 +Date: 03/05/2024 Name: tests Description: This file runs the Data Commons Java tool to run standard tests on tmcf + CSV pairs for the ICTV data import. 
@@ -21,11 +21,17 @@ tests on tmcf + CSV pairs for the ICTV data import. #!/bin/bash -java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusMasterSpeciesList.tmcf CSVs/VirusSpecies.csv ICTV*.mcf +# download data commons java test tool version 0.1-alpha.1k +mkdir -p tmp; cd tmp +wget https://github.com/datacommonsorg/import/releases/download/0.1-alpha.1k/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar +cd .. + +# run tests +java -jar tmp/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusMasterSpeciesList.tmcf CSVs/VirusSpecies.csv ICTV*.mcf mv dc_generated species -java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusTaxonomy.tmcf CSVs/VirusIsolates.csv ICTV*.mcf +java -jar tmp/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusTaxonomy.tmcf CSVs/VirusIsolates.csv ICTV*.mcf mv dc_generated virus_isolates -java -jar /Applications/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusGenomeSegment.tmcf CSVs/VirusGenomeSegments.csv ICTV*.mcf +java -jar tmp/datacommons-import-tool-0.1-alpha.1-jar-with-dependencies.jar lint tMCFs/virusGenomeSegment.tmcf CSVs/VirusGenomeSegments.csv ICTV*.mcf mv dc_generated genome_segments From 05ca8d975b6a2bbc57e19c75075be9f7f0508f51 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 15:38:24 -0800 Subject: [PATCH 57/60] Update README.md update tests subsection description --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index fd4102810a..44c4b1b078 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -149,7 +149,7 @@ sh run.sh ### Tests -Run Data Commons's java -jar import tool to ensure that all schema used in the import is 
present in the graph, all referenced nodes are present in the graph, along with other warnings. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then refrenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. +Downloads Data Commons's java -jar import tool, storing it in a `tmp` directory. This tool is described in Data Commons documentation of the [import pipeline](https://github.com/datacommonsorg/import/). The relases of the tool can be viewed [here](https://github.com/datacommonsorg/import/releases/). Here we download version `0.1-alpha.1k` and apply it to check our csv + tmcf import. It evaluates if all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other checks that issue fatal errors, errors, or warnings upon failing checks. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then refrenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. To run tests: From 17a0ba9e84412a6a572cc4d25f53344489c9d7af Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 15:42:20 -0800 Subject: [PATCH 58/60] Update README.md update tests subsection description to add assumptions --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index 44c4b1b078..a381729581 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -149,7 +149,7 @@ sh run.sh ### Tests -Downloads Data Commons's java -jar import tool, storing it in a `tmp` directory. 
This tool is described in Data Commons documentation of the [import pipeline](https://github.com/datacommonsorg/import/). The relases of the tool can be viewed [here](https://github.com/datacommonsorg/import/releases/). Here we download version `0.1-alpha.1k` and apply it to check our csv + tmcf import. It evaluates if all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other checks that issue fatal errors, errors, or warnings upon failing checks. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then refrenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. +The first step of `tests.sh` is to downloads Data Commons's java -jar import tool, storing it in a `tmp` directory. This assumes that the user has ava Runtime Environment (JRE) installed. This tool is described in Data Commons documentation of the [import pipeline](https://github.com/datacommonsorg/import/). The relases of the tool can be viewed [here](https://github.com/datacommonsorg/import/releases/). Here we download version `0.1-alpha.1k` and apply it to check our csv + tmcf import. It evaluates if all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other checks that issue fatal errors, errors, or warnings upon failing checks. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then refrenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. 
To run tests: From 247948500b28e8f2e44e5dbd803f1e6bcfaf91c7 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 5 Mar 2024 15:43:37 -0800 Subject: [PATCH 59/60] Update README.md fix typo --- scripts/biomedical/ICTV_Taxonomy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/README.md b/scripts/biomedical/ICTV_Taxonomy/README.md index a381729581..14374ca4d9 100644 --- a/scripts/biomedical/ICTV_Taxonomy/README.md +++ b/scripts/biomedical/ICTV_Taxonomy/README.md @@ -149,7 +149,7 @@ sh run.sh ### Tests -The first step of `tests.sh` is to downloads Data Commons's java -jar import tool, storing it in a `tmp` directory. This assumes that the user has ava Runtime Environment (JRE) installed. This tool is described in Data Commons documentation of the [import pipeline](https://github.com/datacommonsorg/import/). The relases of the tool can be viewed [here](https://github.com/datacommonsorg/import/releases/). Here we download version `0.1-alpha.1k` and apply it to check our csv + tmcf import. It evaluates if all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other checks that issue fatal errors, errors, or warnings upon failing checks. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then refrenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. +The first step of `tests.sh` is to download Data Commons's java -jar import tool, storing it in a `tmp` directory. This assumes that the user has Java Runtime Environment (JRE) installed. This tool is described in Data Commons documentation of the [import pipeline](https://github.com/datacommonsorg/import/). The releases of the tool can be viewed [here](https://github.com/datacommonsorg/import/releases/).
Here we download version `0.1-alpha.1k` and apply it to check our csv + tmcf import. It evaluates if all schema used in the import is present in the graph, all referenced nodes are present in the graph, along with other checks that issue fatal errors, errors, or warnings upon failing checks. Please note that empty tokens for some columns are expected as this reflects the original data. The imports create the Virus nodes that are then referenced within this import. This resolves any concern about missing reference warnings concerning these node types by the test. To run tests: From b282d449cafe0ea7c1328a85512c03091ff03bdb Mon Sep 17 00:00:00 2001 From: Prashanth R Date: Thu, 21 Mar 2024 16:05:06 -0700 Subject: [PATCH 60/60] Fix lint (#1005) --- .../create_virus_taxonomic_ranking_enums.py | 184 +++--- .../format_virus_master_species_list.py | 190 +++---- .../scripts/format_virus_metadata_resource.py | 526 +++++++++--------- 3 files changed, 420 insertions(+), 480 deletions(-) diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py index 0b80fcb243..1924621cf6 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/create_virus_taxonomic_ranking_enums.py @@ -25,150 +25,116 @@ import pandas as pd import sys - # declare universal variables HEADER = [ -'sort', -'isolateSort', -'realm', -'subrealm', -'kingdom', -'subkingdom', -'phylum', -'subphylum', -'class', -'subclass', -'order', -'suborder', -'family', -'subfamily', -'genus', -'subgenus', -'species', -'isExemplar', -'name', -'abbreviation', -'isolateDesignation', -'genBankAccession', -'refSeqAccession', -'genomeCoverage', -'genomeComposition', -'hostSource', -'host', -'source', -'dcid', -'isolate_dcid', -'isolate_name' + 'sort', 'isolateSort', 'realm', 'subrealm', 'kingdom', 'subkingdom', + 'phylum', 'subphylum',
'class', 'subclass', 'order', 'suborder', 'family', + 'subfamily', 'genus', 'subgenus', 'species', 'isExemplar', 'name', + 'abbreviation', 'isolateDesignation', 'genBankAccession', 'refSeqAccession', + 'genomeCoverage', 'genomeComposition', 'hostSource', 'host', 'source', + 'dcid', 'isolate_dcid', 'isolate_name' ] LIST_DROP = [ -'sort', -'isolateSort', -'species', -'isExemplar', -'name', -'abbreviation', -'isolateDesignation', -'genBankAccession', -'refSeqAccession', -'genomeCoverage', -'genomeComposition', -'hostSource', -'host', -'source', -'dcid', -'isolate_dcid', -'isolate_name' + 'sort', 'isolateSort', 'species', 'isExemplar', 'name', 'abbreviation', + 'isolateDesignation', 'genBankAccession', 'refSeqAccession', + 'genomeCoverage', 'genomeComposition', 'hostSource', 'host', 'source', + 'dcid', 'isolate_dcid', 'isolate_name' ] # declare functions def pascalcase(s): - list_words = s.split() - converted = "".join(word[0].upper() + word[1:].lower() for word in list_words) - return converted + list_words = s.split() + converted = "".join( + word[0].upper() + word[1:].lower() for word in list_words) + return converted def check_for_illegal_charc(s): - list_illegal = ["'", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";" " "] - if any([x in s for x in list_illegal]): - print('Error! dcid contains illegal characters!', s) + list_illegal = ["'", "–", "*" + ">", "<", "@", "]", "[", "|", ":", ";" + " "] + if any([x in s for x in list_illegal]): + print('Error! 
dcid contains illegal characters!', s) def initiate_enum_dict(): - d = {} - list_levels = [i for i in HEADER if i not in LIST_DROP] - for item in list_levels: - enum_name = 'Virus' + item.capitalize() + 'Enum' - d[enum_name] = {} - return d + d = {} + list_levels = [i for i in HEADER if i not in LIST_DROP] + for item in list_levels: + enum_name = 'Virus' + item.capitalize() + 'Enum' + d[enum_name] = {} + return d def add_enums_to_dicts(key, value, d): - if value == value: - enum = 'Virus' + key + 'Enum' - dcid = 'Virus' + key + pascalcase(value) - check_for_illegal_charc(dcid) - d[enum][value] = dcid - return d + if value == value: + enum = 'Virus' + key + 'Enum' + dcid = 'Virus' + key + pascalcase(value) + check_for_illegal_charc(dcid) + d[enum][value] = dcid + return d def add_item_to_enums(df): - list_levels = [i for i in HEADER if i not in LIST_DROP] - dict_of_dicts = initiate_enum_dict() - dict_specialization = {} # keep track of previous top level - for index, row in df.iterrows(): - last_level_dcid = False # initiate empty value for tracking specialization - for item in list_levels: - level = item.capitalize() - if row[item] != row[item]: - continue - dict_of_dicts = add_enums_to_dicts(level, row[item], dict_of_dicts) - if last_level_dcid: # track specialization if relevant - dcid = 'Virus' + level + pascalcase(row[item]) - dict_specialization[dcid] = last_level_dcid - last_level_dcid = 'Virus' + level + pascalcase(row[item]) # update top level - return dict_of_dicts, dict_specialization + list_levels = [i for i in HEADER if i not in LIST_DROP] + dict_of_dicts = initiate_enum_dict() + dict_specialization = {} # keep track of previous top level + for index, row in df.iterrows(): + last_level_dcid = False # initiate empty value for tracking specialization + for item in list_levels: + level = item.capitalize() + if row[item] != row[item]: + continue + dict_of_dicts = add_enums_to_dicts(level, row[item], dict_of_dicts) + if last_level_dcid: # track 
specialization if relevant + dcid = 'Virus' + level + pascalcase(row[item]) + dict_specialization[dcid] = last_level_dcid + last_level_dcid = 'Virus' + level + pascalcase( + row[item]) # update top level + return dict_of_dicts, dict_specialization def write_individual_entries_to_file(w, enum, d, dict_specialization): - for key, value in d.items(): - w.write('Node: dcid:' + value + '\n') - w.write('name: "' + key + '"\n') - w.write('typeOf: dcs:' + enum + '\n') - if value in dict_specialization: - w.write('specializationOf: dcs:' + dict_specialization[value] + '\n\n') - else: - w.write('\n') - return w + for key, value in d.items(): + w.write('Node: dcid:' + value + '\n') + w.write('name: "' + key + '"\n') + w.write('typeOf: dcs:' + enum + '\n') + if value in dict_specialization: + w.write('specializationOf: dcs:' + dict_specialization[value] + + '\n\n') + else: + w.write('\n') + return w def write_dict_to_file(w, enum, d, dict_specialization): - w.write('# ' + enum + '\n') - w.write('Node: dcid:' + enum + '\n') - w.write('name: "' + enum + '"\n') - w.write('typeOf: schema:Class\n') - w.write('subClassOf: schema:Enumeration\n\n') - w = write_individual_entries_to_file(w, enum, d, dict_specialization) - w.write('\n') - return w + w.write('# ' + enum + '\n') + w.write('Node: dcid:' + enum + '\n') + w.write('name: "' + enum + '"\n') + w.write('typeOf: schema:Class\n') + w.write('subClassOf: schema:Enumeration\n\n') + w = write_individual_entries_to_file(w, enum, d, dict_specialization) + w.write('\n') + return w def generate_enums_mcf(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) - df = df.drop(LIST_DROP, axis=1).drop(0, axis=0) - dict_of_dicts, dict_specialization = add_item_to_enums(df) - w = open(w, mode='w') - w.write('# Schema generated by create_virus_taxonomic_ranking_enums.py\n\n') - for key, value in dict_of_dicts.items(): - w = write_dict_to_file(w, key, value, dict_specialization) + df = pd.read_excel(f, names=HEADER, header=None, 
sheet_name=0) + df = df.drop(LIST_DROP, axis=1).drop(0, axis=0) + dict_of_dicts, dict_specialization = add_item_to_enums(df) + w = open(w, mode='w') + w.write('# Schema generated by create_virus_taxonomic_ranking_enums.py\n\n') + for key, value in dict_of_dicts.items(): + w = write_dict_to_file(w, key, value, dict_specialization) def main(): - file_input = sys.argv[1] - file_output = sys.argv[2] + file_input = sys.argv[1] + file_output = sys.argv[2] - generate_enums_mcf(file_input, file_output) + generate_enums_mcf(file_input, file_output) if __name__ == '__main__': diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py index 7fb97312a0..0445d557ec 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_master_species_list.py @@ -24,151 +24,135 @@ @file_output: formatted csv format of Virus nodes """ - # load environment import pandas as pd import sys - # declare universal variables DICT_CHANGE_ENUM = { -'abolished': 'dcs:VirusLastTaxonomicChangeAbolished', -'demoted' : 'dcs:VirusLastTaxonomicChangeDemoted', -'merged': 'dcs:VirusLastTaxonomicChangeMerged', -'moved': 'dcs:VirusLastTaxonomicChangeMoved', -'new': 'dcs:VirusLastTaxonomicChangeNew', -'promoted': 'dcs:VirusLastTaxonomicChangePromoted', -'removed as type species': 'dcs:VirusLastTaxonomicChangeRemoved', -'renamed': 'dcs:VirusLastTaxonomicChangeRenamed', -'split': 'dcs:VirusLastTaxonomicChangeSplit' + 'abolished': 'dcs:VirusLastTaxonomicChangeAbolished', + 'demoted': 'dcs:VirusLastTaxonomicChangeDemoted', + 'merged': 'dcs:VirusLastTaxonomicChangeMerged', + 'moved': 'dcs:VirusLastTaxonomicChangeMoved', + 'new': 'dcs:VirusLastTaxonomicChangeNew', + 'promoted': 'dcs:VirusLastTaxonomicChangePromoted', + 'removed as type species': 'dcs:VirusLastTaxonomicChangeRemoved', + 'renamed': 
'dcs:VirusLastTaxonomicChangeRenamed', + 'split': 'dcs:VirusLastTaxonomicChangeSplit' } - DICT_GC = { -'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA', -'ssDNA': 'dcs:VirusGenomeCompositionSingleStrandedDNA', -'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', -'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', -'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA', -'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', -'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', -'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA', -'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA', -'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', -'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', -'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' + 'dsDNA': + 'dcs:VirusGenomeCompositionDoubleStrandedDNA', + 'ssDNA': + 'dcs:VirusGenomeCompositionSingleStrandedDNA', + 'ssDNA(-)': + 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', + 'ssDNA(+)': + 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', + 'ssDNA(+/-)': + 'dcs:VirusGenomeCompositionSingleStrandedDNA', + 'dsDNA-RT': + 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', + 'ssRNA-RT': + 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', + 'dsRNA': + 'dcs:VirusGenomeCompositionDoubleStrandedRNA', + 'ssRNA': + 'dcs:VirusGenomeCompositionSingleStrandedRNA', + 'ssRNA(-)': + 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', + 'ssRNA(+)': + 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', + 'ssRNA(+/-)': + 'dcs:VirusGenomeCompositionSingleStrandedRNA' } - HEADER = [ -'sort', -'realm', -'subrealm', -'kingdom', -'subkingdom', -'phylum', -'subphylum', -'class', -'subclass', -'order', -'suborder', -'family', -'subfamily', -'genus', -'subgenus', -'species', -'genomeComposition', -'lastChange', -'lastChangeVersion', -'proposalForLastChange', 
-'taxonHistoryURL', -'dcid' + 'sort', 'realm', 'subrealm', 'kingdom', 'subkingdom', 'phylum', 'subphylum', + 'class', 'subclass', 'order', 'suborder', 'family', 'subfamily', 'genus', + 'subgenus', 'species', 'genomeComposition', 'lastChange', + 'lastChangeVersion', 'proposalForLastChange', 'taxonHistoryURL', 'dcid' ] - LIST_TAXONOMIC_LEVELS = [ -'realm', -'subrealm', -'kingdom', -'subkingdom', -'phylum', -'subphylum', -'class', -'subclass', -'order', -'suborder', -'family', -'subfamily', -'genus', -'subgenus' + 'realm', 'subrealm', 'kingdom', 'subkingdom', 'phylum', 'subphylum', + 'class', 'subclass', 'order', 'suborder', 'family', 'subfamily', 'genus', + 'subgenus' ] # declare functions def pascalcase(s): - list_words = s.split() - converted = "".join(word[0].upper() + word[1:].lower() for word in list_words) - return converted + list_words = s.split() + converted = "".join( + word[0].upper() + word[1:].lower() for word in list_words) + return converted def check_for_illegal_charc(s): - list_illegal = ["'", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";" " "] - if any([x in s for x in list_illegal]): - print('Error! dcid contains illegal characters!', s) + list_illegal = ["'", "–", "*" + ">", "<", "@", "]", "[", "|", ":", ";" + " "] + if any([x in s for x in list_illegal]): + print('Error! 
dcid contains illegal characters!', s) def format_taxonomic_rank_properties(df, index, row): - for rank in LIST_TAXONOMIC_LEVELS: - if row[rank] == row[rank]: - enum = 'dcs:Virus' + rank.upper()[0] + rank.lower()[1:] + pascalcase(row[rank]) - df.loc[index, rank] = enum - return df + for rank in LIST_TAXONOMIC_LEVELS: + if row[rank] == row[rank]: + enum = 'dcs:Virus' + rank.upper()[0] + rank.lower( + )[1:] + pascalcase(row[rank]) + df.loc[index, rank] = enum + return df def convert_gc_to_enum(gc): - list_enum = [] - list_gc = gc.split(';') - for item in list_gc: - item = item.strip() - enum = DICT_GC[item] - list_enum.append(enum) - return (',').join(list_enum) + list_enum = [] + list_gc = gc.split(';') + for item in list_gc: + item = item.strip() + enum = DICT_GC[item] + list_enum.append(enum) + return (',').join(list_enum) def convert_change_to_enum(change): - list_enum = [] - change = change.lower() - list_changes = change.split(',')[:-1] - for item in list_changes: - enum = DICT_CHANGE_ENUM[item] - list_enum.append(enum) - return (',').join(list_enum) + list_enum = [] + change = change.lower() + list_changes = change.split(',')[:-1] + for item in list_changes: + enum = DICT_CHANGE_ENUM[item] + list_enum.append(enum) + return (',').join(list_enum) def clean_df(df): - for index, row in df.iterrows(): - dcid = 'bio/' + pascalcase(row['species']) - check_for_illegal_charc(dcid) - df = format_taxonomic_rank_properties(df, index, row) - df.loc[index, 'dcid'] = dcid - df.loc[index,'genomeComposition'] = convert_gc_to_enum(row['genomeComposition']) - df.loc[index, 'lastChange'] = convert_change_to_enum(row['lastChange']) - df.loc[index, 'taxonHistoryURL'] = row['taxonHistoryURL'].strip('ICTVonline=') - return df + for index, row in df.iterrows(): + dcid = 'bio/' + pascalcase(row['species']) + check_for_illegal_charc(dcid) + df = format_taxonomic_rank_properties(df, index, row) + df.loc[index, 'dcid'] = dcid + df.loc[index, 'genomeComposition'] = convert_gc_to_enum( + 
row['genomeComposition']) + df.loc[index, 'lastChange'] = convert_change_to_enum(row['lastChange']) + df.loc[index, + 'taxonHistoryURL'] = row['taxonHistoryURL'].strip('ICTVonline=') + return df def clean_file(f, w): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=1) - df = df.drop('sort', axis=1).drop(0, axis=0) - df = clean_df(df) - df.to_csv(w, index=False) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=1) + df = df.drop('sort', axis=1).drop(0, axis=0) + df = clean_df(df) + df.to_csv(w, index=False) def main(): - file_input = sys.argv[1] - file_output = sys.argv[2] + file_input = sys.argv[1] + file_output = sys.argv[2] - clean_file(file_input, file_output) + clean_file(file_input, file_output) if __name__ == '__main__': diff --git a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py index 753769d3c1..b0294423bc 100644 --- a/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py +++ b/scripts/biomedical/ICTV_Taxonomy/scripts/format_virus_metadata_resource.py @@ -29,351 +29,341 @@ nodes """ - # set up environment import pandas as pd import sys import unidecode - # declare universal variables DICT_COVERAGE = { -'coding-complete genome': 'dcs:GenomeCoverageCompleteGenome', -'complete genome': 'dcs:GenomeCoverageCompleteGenome', -'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome', -'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank', -'partial genome': 'dcs:GenomeCoveragePartialGenome' + 'coding-complete genome': 'dcs:GenomeCoverageCompleteGenome', + 'complete genome': 'dcs:GenomeCoverageCompleteGenome', + 'complete coding genome': 'dcs:GenomeCoverageCompleteCodingGenome', + 'no entry in genbank': 'dcs:GenomeCoverageNoEntryInGenBank', + 'partial genome': 'dcs:GenomeCoveragePartialGenome' } - DICT_GC = { -'dsDNA': 'dcs:VirusGenomeCompositionDoubleStrandedDNA', -'ssDNA': 
'dcs:VirusGenomeCompositionSingleStrandedDNA', -'ssDNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', -'ssDNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', -'ssDNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedDNA', -'dsDNA-RT': 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', -'ssRNA-RT': 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', -'dsRNA': 'dcs:VirusGenomeCompositionDoubleStrandedRNA', -'ssRNA': 'dcs:VirusGenomeCompositionSingleStrandedRNA', -'ssRNA(-)': 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', -'ssRNA(+)': 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', -'ssRNA(+/-)': 'dcs:VirusGenomeCompositionSingleStrandedRNA' + 'dsDNA': + 'dcs:VirusGenomeCompositionDoubleStrandedDNA', + 'ssDNA': + 'dcs:VirusGenomeCompositionSingleStrandedDNA', + 'ssDNA(-)': + 'dcs:VirusGenomeCompositionSingleStrandedDNANegative', + 'ssDNA(+)': + 'dcs:VirusGenomeCompositionSingleStrandedDNAPositive', + 'ssDNA(+/-)': + 'dcs:VirusGenomeCompositionSingleStrandedDNA', + 'dsDNA-RT': + 'dcs:VirusGenomeCompositionDoubleStrandedDNAReverseTranscription', + 'ssRNA-RT': + 'dcs:VirusGenomeCompositionSingleStrandedRNAReverseTranscription', + 'dsRNA': + 'dcs:VirusGenomeCompositionDoubleStrandedRNA', + 'ssRNA': + 'dcs:VirusGenomeCompositionSingleStrandedRNA', + 'ssRNA(-)': + 'dcs:VirusGenomeCompositionSingleStrandedRNANegative', + 'ssRNA(+)': + 'dcs:VirusGenomeCompositionSingleStrandedRNAPositive', + 'ssRNA(+/-)': + 'dcs:VirusGenomeCompositionSingleStrandedRNA' } - DICT_HOST = { - 'algae': 'dcs:VirusHostAlgae', - 'archaea': 'dcs:VirusHostArchaea', - 'bacteria': 'dcs:VirusHostBacteria', - 'fungi': 'dcs:VirusHostFungi', - 'invertebrates': 'dcs:VirusHostInvertebrates', - 'plants': 'dcs:VirusHostPlants', - 'protists': 'dcs:VirusHostProtists', - 'vertebrates': 'dcs:VirusHostVertebrates' + 'algae': 'dcs:VirusHostAlgae', + 'archaea': 'dcs:VirusHostArchaea', + 'bacteria': 'dcs:VirusHostBacteria', + 'fungi': 
'dcs:VirusHostFungi', + 'invertebrates': 'dcs:VirusHostInvertebrates', + 'plants': 'dcs:VirusHostPlants', + 'protists': 'dcs:VirusHostProtists', + 'vertebrates': 'dcs:VirusHostVertebrates' } - DICT_SOURCE = { - 'freshwater': 'dcs:VirusSourceFreshwater', - 'invertebrates': 'dcs:VirusSourceInvertebrates', - 'marine': 'dcs:VirusSourceMarine', - 'phytobiome': 'dcs:VirusSourcePhytobiome', - 'plants': 'dcs:VirusSourcePlants', - 'protists': 'dcs:VirusSourceProtists', - 'sewage': 'dcs:VirusSourceSewage', - 'soil': 'dcs:VirusSourceSoil' + 'freshwater': 'dcs:VirusSourceFreshwater', + 'invertebrates': 'dcs:VirusSourceInvertebrates', + 'marine': 'dcs:VirusSourceMarine', + 'phytobiome': 'dcs:VirusSourcePhytobiome', + 'plants': 'dcs:VirusSourcePlants', + 'protists': 'dcs:VirusSourceProtists', + 'sewage': 'dcs:VirusSourceSewage', + 'soil': 'dcs:VirusSourceSoil' } - HEADER = [ -'sort', -'isolateSort', -'realm', -'subrealm', -'kingdom', -'subkingdom', -'phylum', -'subphylum', -'class', -'subclass', -'order', -'suborder', -'family', -'subfamily', -'genus', -'subgenus', -'species', -'isExemplar', -'name', -'abbreviation', -'isolateDesignation', -'genBankAccession', -'refSeqAccession', -'genomeCoverage', -'genomeComposition', -'hostSource', -'host', -'source', -'dcid', -'isolate_dcid', -'isolate_name' + 'sort', 'isolateSort', 'realm', 'subrealm', 'kingdom', 'subkingdom', + 'phylum', 'subphylum', 'class', 'subclass', 'order', 'suborder', 'family', + 'subfamily', 'genus', 'subgenus', 'species', 'isExemplar', 'name', + 'abbreviation', 'isolateDesignation', 'genBankAccession', 'refSeqAccession', + 'genomeCoverage', 'genomeComposition', 'hostSource', 'host', 'source', + 'dcid', 'isolate_dcid', 'isolate_name' ] - HEADER_2 = [ -'dcid', -'name', -'genBankAccession', -'genomeSegmentOf', -'refSeqAccession' + 'dcid', 'name', 'genBankAccession', 'genomeSegmentOf', 'refSeqAccession' ] - LIST_TAXONOMIC_LEVELS = [ -'realm', -'subrealm', -'kingdom', -'subkingdom', -'phylum', -'subphylum', -'class', 
-'subclass', -'order', -'suborder', -'family', -'subfamily', -'genus', -'subgenus' + 'realm', 'subrealm', 'kingdom', 'subkingdom', 'phylum', 'subphylum', + 'class', 'subclass', 'order', 'suborder', 'family', 'subfamily', 'genus', + 'subgenus' ] # declare functions # declare functions def pascalcase(s): - list_words = s.split() - converted = "".join(word[0].upper() + word[1:] for word in list_words) - return converted + list_words = s.split() + converted = "".join(word[0].upper() + word[1:] for word in list_words) + return converted def check_for_illegal_charc(s): - list_illegal = ["'", "#", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";", " "] - if any([x in s for x in list_illegal]): - print('Error! dcid contains illegal characters!', s) + list_illegal = [ + "'", "#", "–", "*" + ">", "<", "@", "]", "[", "|", ":", ";", " " + ] + if any([x in s for x in list_illegal]): + print('Error! dcid contains illegal characters!', s) def format_list(s): - if s != s: - return s - list_items = [] - s = str(s) - list_s = s.split(';') - for item in list_s: - list_items.append(item.strip()) - return (',').join(list_items) + if s != s: + return s + list_items = [] + s = str(s) + list_s = s.split(';') + for item in list_s: + list_items.append(item.strip()) + return (',').join(list_items) def format_taxonomic_rank_properties(df, index, row): - for rank in LIST_TAXONOMIC_LEVELS: - if row[rank] == row[rank]: - enum = 'dcs:Virus' + rank.upper()[0] + rank.lower()[1:] + pascalcase(row[rank]) - df.loc[index, rank] = enum - return df + for rank in LIST_TAXONOMIC_LEVELS: + if row[rank] == row[rank]: + enum = 'dcs:Virus' + rank.upper()[0] + rank.lower( + )[1:] + pascalcase(row[rank]) + df.loc[index, rank] = enum + return df def convert_gc_to_enum(gc): - list_enum = [] - list_gc = gc.split(';') - for item in list_gc: - item = item.strip() - enum = DICT_GC[item] - list_enum.append(enum) - return (',').join(list_enum) + list_enum = [] + list_gc = gc.split(';') + for item in list_gc: + item = 
item.strip() + enum = DICT_GC[item] + list_enum.append(enum) + return (',').join(list_enum) def convert_coverage_to_enum(cov): - return DICT_COVERAGE[cov.lower()] + return DICT_COVERAGE[cov.lower()] def convert_type_to_boolean(t): - if t == 'E': - return True - if t == 'A': - return False - print('Error! Not an expected isolate type! Expected E or A, but got', t ,'.') + if t == 'E': + return True + if t == 'A': + return False + print('Error! Not an expected isolate type! Expected E or A, but got', t, + '.') def convert_source_to_enum(source): - source = source[:-4] - return DICT_SOURCE[source] + source = source[:-4] + return DICT_SOURCE[source] def convert_host_to_enum(host): - list_enum = [] - list_host = host.split(',') - for item in list_host: - item = item.strip() - enum = DICT_HOST[item] - list_enum.append(enum) - return (',').join(list_enum) + list_enum = [] + list_host = host.split(',') + for item in list_host: + item = item.strip() + enum = DICT_HOST[item] + list_enum.append(enum) + return (',').join(list_enum) def handle_genBank_missing_exception(n, virus_dcid, virus_name): - if n != n: - dcid = virus_dcid + 'Isolate' - name = virus_name + ' Isolate' - return dcid, name - n = str(n) - if ';' in n: - n = n.split(';')[0] - dcid = virus_dcid + pascalcase(n) - dcid = dcid.replace("'", "") - dcid = dcid.replace('–', '-') - name = virus_name + n - return dcid, name + if n != n: + dcid = virus_dcid + 'Isolate' + name = virus_name + ' Isolate' + return dcid, name + n = str(n) + if ';' in n: + n = n.split(';')[0] + dcid = virus_dcid + pascalcase(n) + dcid = dcid.replace("'", "") + dcid = dcid.replace('–', '-') + name = virus_name + n + return dcid, name def handle_genBank_components_exception(genBank, virus_dcid, virus_name): - dcid = virus_dcid - name = virus_name - list_genBank = genBank.split(';') - for item in list_genBank: - if ':' in item: - n, gb = item.split(':') - dcid = virus_dcid + '_' + gb.strip() - name = virus_name + gb - else: - dcid = virus_dcid + 
'_' + item.strip() - name = virus_name + item - return dcid, name + dcid = virus_dcid + name = virus_name + list_genBank = genBank.split(';') + for item in list_genBank: + if ':' in item: + n, gb = item.split(':') + dcid = virus_dcid + '_' + gb.strip() + name = virus_name + gb + else: + dcid = virus_dcid + '_' + item.strip() + name = virus_name + item + return dcid, name def format_isolate_designation_for_dcid(des): - des = str(des) - des = des.replace(':', '_') - des = des.replace(';', '_') - des = des.replace('[', '(') - des = des.replace(']', ')') - des = des.replace('-', '_') - des = des.replace('–', '_') - des = des.replace("'", '') - des = des.replace('#', '') - return des - - -def verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, virus_abrv): - if dcid in list_isolate_dcids: - if ';' in genBank: - dcid = dcid + '_' + virus_abrv - else: - dcid = dcid + '_' + genBank - print('Non-unique VirusIsolate dcid generated! Added additional info to differentiate:', dcid) - list_isolate_dcids.append(dcid) - return dcid, list_isolate_dcids - - -def declare_isolate_dcid(n, genBank, virus_dcid, virus_name, virus_abrv, isolate_designation, list_isolate_dcids): - if isolate_designation == isolate_designation: - des = format_isolate_designation_for_dcid(isolate_designation) - dcid = virus_dcid + '_' + pascalcase(des) - name = virus_name + ' strain ' + str(isolate_designation) - elif genBank != genBank: - dcid, name = handle_genBank_missing_exception(n, virus_dcid, virus_name) - elif ':' in genBank or ';' in genBank: - dcid, name = handle_genBank_components_exception(genBank, virus_dcid, virus_name) - else: - dcid = virus_dcid + '_' + genBank - name = virus_name + ' ' + genBank - dcid = dcid.replace(' ', '') - dcid = unidecode.unidecode(dcid) - dcid, list_isolate_dcids = verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, virus_abrv) - return dcid, name, list_isolate_dcids + des = str(des) + des = des.replace(':', '_') + des = des.replace(';', 
'_') + des = des.replace('[', '(') + des = des.replace(']', ')') + des = des.replace('-', '_') + des = des.replace('–', '_') + des = des.replace("'", '') + des = des.replace('#', '') + return des + + +def verify_isolate_dcid_uniqueness(dcid, list_isolate_dcids, genBank, + virus_abrv): + if dcid in list_isolate_dcids: + if ';' in genBank: + dcid = dcid + '_' + virus_abrv + else: + dcid = dcid + '_' + genBank + print( + 'Non-unique VirusIsolate dcid generated! Added additional info to differentiate:', + dcid) + list_isolate_dcids.append(dcid) + return dcid, list_isolate_dcids + + +def declare_isolate_dcid(n, genBank, virus_dcid, virus_name, virus_abrv, + isolate_designation, list_isolate_dcids): + if isolate_designation == isolate_designation: + des = format_isolate_designation_for_dcid(isolate_designation) + dcid = virus_dcid + '_' + pascalcase(des) + name = virus_name + ' strain ' + str(isolate_designation) + elif genBank != genBank: + dcid, name = handle_genBank_missing_exception(n, virus_dcid, virus_name) + elif ':' in genBank or ';' in genBank: + dcid, name = handle_genBank_components_exception( + genBank, virus_dcid, virus_name) + else: + dcid = virus_dcid + '_' + genBank + name = virus_name + ' ' + genBank + dcid = dcid.replace(' ', '') + dcid = unidecode.unidecode(dcid) + dcid, list_isolate_dcids = verify_isolate_dcid_uniqueness( + dcid, list_isolate_dcids, genBank, virus_abrv) + return dcid, name, list_isolate_dcids def make_refSeq_dict(refSeq): - d = {} - list_refSeq = refSeq.split(';') - for item in list_refSeq: - if ':' in item: - name, rs = item.split(':') - d[name.strip()] = rs.strip() - return d - - -def handle_genome_segments(df_segment, virus_dcid, virus_name, isolate_dcid, genBank, refSeq): - dict_refSeq = {} - list_genBank = genBank.split(';') - if refSeq == refSeq: - dict_refSeq = make_refSeq_dict(refSeq) - for item in list_genBank: - d = {'dcid': [], 'name': [], 'genBankAccession': [], 'genomeSegmentOf': [], 'refSeqAccession': []} - if ':' not in 
item: - continue - name, gb = item.split(':') - name = name.strip() - gb = gb.strip() - d['dcid'].append(virus_dcid + gb) - check_for_illegal_charc(virus_dcid + gb) - d['name'].append(virus_name + ' Segment ' + name) - d['genBankAccession'].append(gb) - d['genomeSegmentOf'].append('dcid:' + isolate_dcid) - if name in dict_refSeq: - d['refSeqAccession'].append(dict_refSeq[name]) - else: - d['refSeqAccession'].append('') - df_new_row = pd.DataFrame.from_dict(d, orient='columns') - df_segment = pd.concat([df_segment, df_new_row], ignore_index=True) - return df_segment + d = {} + list_refSeq = refSeq.split(';') + for item in list_refSeq: + if ':' in item: + name, rs = item.split(':') + d[name.strip()] = rs.strip() + return d + + +def handle_genome_segments(df_segment, virus_dcid, virus_name, isolate_dcid, + genBank, refSeq): + dict_refSeq = {} + list_genBank = genBank.split(';') + if refSeq == refSeq: + dict_refSeq = make_refSeq_dict(refSeq) + for item in list_genBank: + d = { + 'dcid': [], + 'name': [], + 'genBankAccession': [], + 'genomeSegmentOf': [], + 'refSeqAccession': [] + } + if ':' not in item: + continue + name, gb = item.split(':') + name = name.strip() + gb = gb.strip() + d['dcid'].append(virus_dcid + gb) + check_for_illegal_charc(virus_dcid + gb) + d['name'].append(virus_name + ' Segment ' + name) + d['genBankAccession'].append(gb) + d['genomeSegmentOf'].append('dcid:' + isolate_dcid) + if name in dict_refSeq: + d['refSeqAccession'].append(dict_refSeq[name]) + else: + d['refSeqAccession'].append('') + df_new_row = pd.DataFrame.from_dict(d, orient='columns') + df_segment = pd.concat([df_segment, df_new_row], ignore_index=True) + return df_segment def clean_df(df, df_segment): - list_isolate_dcids = [] - for index, row in df.iterrows(): - dcid = 'bio/' + pascalcase(row['species']) - check_for_illegal_charc(dcid) - df.loc[index, 'dcid'] = dcid - df = format_taxonomic_rank_properties(df, index, row) - isolate_dcid, isolate_name, list_isolate_dcids = 
declare_isolate_dcid(row['name'], row['genBankAccession'], dcid, row['species'], row['abbreviation'], row['isolateDesignation'], list_isolate_dcids) - check_for_illegal_charc(isolate_dcid) - df.loc[index, 'isolate_dcid'] = isolate_dcid - df.loc[index, 'isolate_name'] = isolate_name - df.loc[index,'genomeComposition'] = convert_gc_to_enum(row['genomeComposition']) - df.loc[index,'genomeCoverage'] = convert_coverage_to_enum(row['genomeCoverage']) - df.loc[index, 'isExemplar'] = convert_type_to_boolean(row['isExemplar']) - df.loc[index, 'name'] = format_list(row['name']) - df.loc[index, 'abbreviation'] = format_list(row['abbreviation']) - df.loc[index, 'isolateDesignation'] = format_list(row['isolateDesignation']) - genBank = row['genBankAccession'] - if genBank == genBank and ':' in genBank: - df_segment = handle_genome_segments(df_segment, dcid, row['name'], isolate_dcid, genBank, row['refSeqAccession']) - df.loc[index, 'genBankAccession'] = '' - df.loc[index, 'refSeqAccession'] = '' - elif genBank == genBank and ';' in genBank: - df.loc[index, 'genBankAccession'] = format_list(genBank) - df.loc[index, 'refSeqAccession'] = format_list(row['refSeqAccession']) - if '(S)' in row['hostSource']: - df.loc[index, 'source'] = convert_source_to_enum(row['hostSource']) - else: - df.loc[index, 'host'] = convert_host_to_enum(row['hostSource']) - return df, df_segment + list_isolate_dcids = [] + for index, row in df.iterrows(): + dcid = 'bio/' + pascalcase(row['species']) + check_for_illegal_charc(dcid) + df.loc[index, 'dcid'] = dcid + df = format_taxonomic_rank_properties(df, index, row) + isolate_dcid, isolate_name, list_isolate_dcids = declare_isolate_dcid( + row['name'], row['genBankAccession'], dcid, row['species'], + row['abbreviation'], row['isolateDesignation'], list_isolate_dcids) + check_for_illegal_charc(isolate_dcid) + df.loc[index, 'isolate_dcid'] = isolate_dcid + df.loc[index, 'isolate_name'] = isolate_name + df.loc[index, 'genomeComposition'] = convert_gc_to_enum( 
+ row['genomeComposition']) + df.loc[index, 'genomeCoverage'] = convert_coverage_to_enum( + row['genomeCoverage']) + df.loc[index, 'isExemplar'] = convert_type_to_boolean(row['isExemplar']) + df.loc[index, 'name'] = format_list(row['name']) + df.loc[index, 'abbreviation'] = format_list(row['abbreviation']) + df.loc[index, + 'isolateDesignation'] = format_list(row['isolateDesignation']) + genBank = row['genBankAccession'] + if genBank == genBank and ':' in genBank: + df_segment = handle_genome_segments(df_segment, dcid, row['name'], + isolate_dcid, genBank, + row['refSeqAccession']) + df.loc[index, 'genBankAccession'] = '' + df.loc[index, 'refSeqAccession'] = '' + elif genBank == genBank and ';' in genBank: + df.loc[index, 'genBankAccession'] = format_list(genBank) + df.loc[index, + 'refSeqAccession'] = format_list(row['refSeqAccession']) + if '(S)' in row['hostSource']: + df.loc[index, 'source'] = convert_source_to_enum(row['hostSource']) + else: + df.loc[index, 'host'] = convert_host_to_enum(row['hostSource']) + return df, df_segment def clean_file(f, w, w_2): - df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) - df = df.drop(0, axis=0) - df_segment = pd.DataFrame([], columns=HEADER_2) - df, df_segment = clean_df(df, df_segment) - df = df.drop(['sort', 'isolateSort', 'hostSource'], axis=1) - df.to_csv(w, index=False) - df_segment.to_csv(w_2, index=False) + df = pd.read_excel(f, names=HEADER, header=None, sheet_name=0) + df = df.drop(0, axis=0) + df_segment = pd.DataFrame([], columns=HEADER_2) + df, df_segment = clean_df(df, df_segment) + df = df.drop(['sort', 'isolateSort', 'hostSource'], axis=1) + df.to_csv(w, index=False) + df_segment.to_csv(w_2, index=False) def main(): - file_input = sys.argv[1] - file_output_1 = sys.argv[2] - file_output_2 = sys.argv[3] + file_input = sys.argv[1] + file_output_1 = sys.argv[2] + file_output_2 = sys.argv[3] - clean_file(file_input, file_output_1, file_output_2) + clean_file(file_input, file_output_1, file_output_2) 
if __name__ == '__main__':