From 3b0f5ab42b9a956fac4e9252896654a07090eccb Mon Sep 17 00:00:00 2001 From: Christie Ellks Date: Mon, 11 Nov 2024 09:45:45 -0800 Subject: [PATCH] [bio] Update biomed property embeddings. (#4721) Adds: - mechanismOfAction - hgncID - inChIKey Updates: - unifiedMedicalLanguageSystemConceptUniqueIdentifier -> umlsConceptUniqueID - ncbiTaxonID->ncbiTaxId - referenceAlleleNCBI -> referenceAllele - genomicCoordinates -> hasGenomicCoordinates Deletes: - diseaseName - observedAllele - hg19GenomicPosition - hg19GenomicLocation - hg38GenomicPosition - hg38GenomicLocation - hasRNATranscript - ncbiDNASequenceName - imageUrl - availableStrength Example Screenshots ![image](https://github.com/user-attachments/assets/6783a776-48e0-4ceb-aa57-b0276a67d2db) ![image](https://github.com/user-attachments/assets/568b48e8-c6b6-4dce-9e99-af9bf7f0f912) ![image](https://github.com/user-attachments/assets/8f31b18b-3169-4ef0-a67d-a0b6cea3bf71) --- deploy/nl/catalog.yaml | 2 +- server/integration_tests/explore_test.py | 7 +- .../whatisthephylumofvolvox/debug_info.json | 40 +- .../debug_info.json | 42 +- .../howaboutforp53/chart_config.json | 708 ------------------ .../howaboutforpqlc3/chart_config.json | 68 ++ .../chart_config.json | 156 +--- tools/nl/embeddings/input/bio/_preindex.csv | 49 +- tools/nl/embeddings/input/bio/sheets_svs.csv | 21 +- 9 files changed, 136 insertions(+), 957 deletions(-) delete mode 100644 server/integration_tests/test_data/e2e_triple/howaboutforp53/chart_config.json create mode 100644 server/integration_tests/test_data/e2e_triple/howaboutforpqlc3/chart_config.json rename server/integration_tests/test_data/e2e_triple/{whattranscriptsdoesithave => whatgenomiccoordinatesdoesithave}/chart_config.json (89%) diff --git a/deploy/nl/catalog.yaml b/deploy/nl/catalog.yaml index 9cb901f293..f2839a69b7 100644 --- a/deploy/nl/catalog.yaml +++ b/deploy/nl/catalog.yaml @@ -78,7 +78,7 @@ indexes: bio_ft: store_type: MEMORY source_path: ../../tools/nl/embeddings/input/bio - embeddings_path: gs://datcom-nl-models/bio_ft_2024_11_05_09_59_39/embeddings.csv + embeddings_path: gs://datcom-nl-models/bio_ft_2024_11_08_19_00_38/embeddings.csv model: ft-final-v20230717230459-all-MiniLM-L6-v2 healthcheck_query: "Gene" base_uae_lance: diff --git a/server/integration_tests/explore_test.py b/server/integration_tests/explore_test.py index 12749f1b71..a4820e16ee 100644 --- a/server/integration_tests/explore_test.py +++ b/server/integration_tests/explore_test.py @@ -761,14 +761,17 @@ def test_e2e_triple(self): self.run_detect_and_fulfill( 'e2e_triple', [ + # ----- Context Based Queries ----- # Should have 'out' properties as answer 'What strand orientation does FGFR1 have?', # Should use context for the entity - 'what transcripts does it have', + 'what genomic coordinates does it have', # Should use context for the property - 'how about for P53', + 'how about for PQLC3', # Should not use context because no entity or property found 'what animal is that found in', + + # ----- Singleton Queries ----- # Should have 'in' properties as answer 'What is Betacoronavirus 1 the species of', # Should have a chained property in the answer diff --git a/server/integration_tests/test_data/detection_api_bio/whatisthephylumofvolvox/debug_info.json b/server/integration_tests/test_data/detection_api_bio/whatisthephylumofvolvox/debug_info.json index 3b57174311..0e4bb79560 100644 --- a/server/integration_tests/test_data/detection_api_bio/whatisthephylumofvolvox/debug_info.json +++ b/server/integration_tests/test_data/detection_api_bio/whatisthephylumofvolvox/debug_info.json @@ -24,7 +24,7 @@ 0.314266, 0.312765, 0.312629, - 0.312504, + 0.312505, 0.311774, 0.310339, 0.310075, @@ -75,24 +75,23 @@ 0.31467, 0.308395, 0.307941, - 0.30463, - 0.282709, 0.281875, - 0.280828, - 0.280453, - 0.274298, 0.271505, 0.270718, - 0.269547, 0.259901, 0.259901, 0.248279, + 0.247354, 0.24662, - 0.246321, - 0.246164, - 0.242123, + 0.246163, + 0.238985, 0.231355, - 0.229624 + 0.230288, + 0.229624, + 0.227736, + 0.221732, + 0.21856, + 0.216932 ], "PROP": [ "phylum", @@ -106,24 +105,23 @@ "ensemblID", "simplifiedMolecularInputLineEntrySystem", "geneticVariantFunctionalCategory", - "imageUrl", - "genomicCoordinates", "chromosomeSize", - "observedAllele", - "hg38GenomicPosition", - "hg19GenomicPosition", "virusHost", "strandOrientation", - "ncbiDNASequenceName", "<-geneID{typeOf:GeneGeneticVariantAssociation}->variantID", "<-variantID{typeOf:GeneGeneticVariantAssociation}->geneID", "ncbiProteinAccessionNumber", + "hgncID", "alleleType", - "hg38GenomicLocation", "ofVirusSpecies", - "hg19GenomicLocation", - "ncbiTaxonID", - "antigenType" + "hasGenomicCoordinates", + "ncbiTaxId", + "umlsConceptUniqueID", + "antigenType", + "alleleOrigin", + "<-diseaseID{typeOf:DiseaseGeneAssociation}->geneID", + "antibodyType", + "inChIKey" ] }, "query_detection_debug_logs": { diff --git a/server/integration_tests/test_data/detection_api_bio/whattypesofgenesarefgfr1,apoe,andache/debug_info.json b/server/integration_tests/test_data/detection_api_bio/whattypesofgenesarefgfr1,apoe,andache/debug_info.json index 524cf98484..0d540cc7d3 100644 --- a/server/integration_tests/test_data/detection_api_bio/whattypesofgenesarefgfr1,apoe,andache/debug_info.json +++ b/server/integration_tests/test_data/detection_api_bio/whattypesofgenesarefgfr1,apoe,andache/debug_info.json @@ -25,7 +25,7 @@ 0.400638, 0.398746, 0.398361, - 0.390523, + 0.390524, 0.389033, 0.388461, 0.388382, @@ -35,7 +35,7 @@ 0.385679, 0.385434, 0.385339, - 0.381842, + 0.381843, 0.379705 ], "MultiSV": {}, @@ -77,24 +77,23 @@ 0.65902, 0.63579, 0.63404, - 0.617156, + 0.617157, 0.598812, - 0.595879, 0.585015, 0.583271, 0.581079, 0.576904, - 0.543242, - 0.539973, - 0.532047, - 0.530959, - 0.51427, + 0.540855, + 0.538974, 0.512528, 0.46106, 0.444108, - 0.398229, - 0.394443, - 0.39229 + 0.39229, + 0.384802, + 0.341795, + 0.324578, + 0.32201, + 0.320837 ], "PROP": [ "typeOfGene", @@ -107,22 +106,21 @@ "<-diseaseID{typeOf:DiseaseGeneAssociation}->geneID", "<-compoundID{typeOf:ChemicalCompoundGeneticVariantAssociation}->variantID", "<-geneID{typeOf:DiseaseGeneAssociation}->diseaseID", - "genomicCoordinates", "antigenType", "<-variantID{typeOf:ChemicalCompoundGeneticVariantAssociation}->compoundID", "alleleType", "<-geneticVariantID{typeOf:DiseaseGeneticVariantAssociation}->diseaseID", - "hg19GenomicPosition", - "hasRNATranscript", - "hg19GenomicLocation", - "hg38GenomicPosition", - "hg38GenomicLocation", + "hgncID", + "hasGenomicCoordinates", "antibodyType", "chromosomeSize", - "referenceAlleleNCBI", - "observedAllele", - "ncbiDNASequenceName", - "alleleOrigin" + "referenceAllele", + "alleleOrigin", + "specializationOf", + "subClassificationOf", + "virusGenus", + "virusHost", + "ncbiProteinAccessionNumber" ] }, "query_detection_debug_logs": { diff --git a/server/integration_tests/test_data/e2e_triple/howaboutforp53/chart_config.json b/server/integration_tests/test_data/e2e_triple/howaboutforp53/chart_config.json deleted file mode 100644 index ba4687bb47..0000000000 --- a/server/integration_tests/test_data/e2e_triple/howaboutforp53/chart_config.json +++ /dev/null @@ -1,708 +0,0 @@ -{ - "client": "test_detect-and-fulfill", - "config": { - "categories": [ - { - "blocks": [ - { - "columns": [ - { - "tiles": [ - { - "answerTableTileSpec": { - "columns": [ - { - "header": "Alliance Of Genome Resources Id", - "propertyExpr": "->allianceOfGenomeResourcesId" - }, - { - "header": "Chrom", - "propertyExpr": "->chrom" - }, - { - "header": "Cytogenetic Map Location", - "propertyExpr": "->cytogeneticMapLocation" - }, - { - "header": "Date Modified", - "propertyExpr": "->dateModified" - }, - { - "header": "Description", - "propertyExpr": "->description" - }, - { - "header": "Ensembl Gene Id", - "propertyExpr": "->ensemblGeneId" - }, - { - "header": "Ensembl Rapid Release Id", - "propertyExpr": "->ensemblRapidReleaseId" - }, - { - "header": "Fly Base Id", - "propertyExpr": "->flyBaseId" - }, - { - "header": "Full Name", - "propertyExpr": "->fullName" - }, - { - "header": "Gene Ortholog", - "propertyExpr": "->geneOrtholog" - }, - { - "header": "Gene Symbol", - "propertyExpr": "->geneSymbol" - }, - { - "header": "Has Genomic Coordinates", - "propertyExpr": "->hasGenomicCoordinates" - }, - { - "header": "Name", - "propertyExpr": "->name" - }, - { - "header": "Nasonia Base Id", - "propertyExpr": "->nasoniaBaseId" - }, - { - "header": "Ncbi Gene ID", - "propertyExpr": "->ncbiGeneID" - }, - { - "header": "Ncbi Sequence Gi Number", - "propertyExpr": "->ncbiSequenceGiNumber" - }, - { - "header": "Of Species", - "propertyExpr": "->ofSpecies" - }, - { - "header": "Provenance", - "propertyExpr": "->provenance" - }, - { - "header": "Pub Med ID", - "propertyExpr": "->pubMedID" - }, - { - "header": "Ref Seq Genomic Accession", - "propertyExpr": "->refSeqGenomicAccession" - }, - { - "header": "Synonym", - "propertyExpr": "->synonym" - }, - { - "header": "Type Of", - "propertyExpr": "->typeOf" - }, - { - "header": "Type Of Gene", - "propertyExpr": "->typeOfGene" - } - ] - }, - "entities": [ - "bio/ncbi_100116126", - "bio/ncbi_100384887", - "bio/ncbi_100648488", - "bio/ncbi_100748472", - "bio/ncbi_100862994", - "bio/ncbi_100880552", - "bio/ncbi_102673235", - "bio/ncbi_103571032", - "bio/ncbi_105153546", - "bio/ncbi_105191985", - "bio/ncbi_105208183", - "bio/ncbi_105254636", - "bio/ncbi_105262739", - "bio/ncbi_105279045", - "bio/ncbi_105362659", - "bio/ncbi_105432841", - "bio/ncbi_105459353", - "bio/ncbi_105569139", - "bio/ncbi_105618583", - "bio/ncbi_105675615", - "bio/ncbi_105689091", - "bio/ncbi_105698580", - "bio/ncbi_105838456", - "bio/ncbi_106639380", - "bio/ncbi_106658358", - "bio/ncbi_106745060", - "bio/ncbi_106788176", - "bio/ncbi_107042628", - "bio/ncbi_107066338", - "bio/ncbi_107189253", - "bio/ncbi_107225289", - "bio/ncbi_107265501", - "bio/ncbi_107995588", - "bio/ncbi_108545268", - "bio/ncbi_108570248", - "bio/ncbi_108631837", - "bio/ncbi_108691270", - "bio/ncbi_108727983", - "bio/ncbi_108748092", - "bio/ncbi_108767092", - "bio/ncbi_108781309", - "bio/ncbi_109856991", - "bio/ncbi_112460253", - "bio/ncbi_114879319", - "bio/ncbi_114932480", - "bio/ncbi_115239461", - "bio/ncbi_116433506", - "bio/ncbi_116840453", - "bio/ncbi_117157365", - "bio/ncbi_117168497", - "bio/ncbi_117205777", - "bio/ncbi_117229339", - "bio/ncbi_117233976", - "bio/ncbi_117603631", - "bio/ncbi_118072630", - "bio/ncbi_118446045", - "bio/ncbi_122405354", - "bio/ncbi_122407929", - "bio/ncbi_122510677", - "bio/ncbi_122526586", - "bio/ncbi_122529794", - "bio/ncbi_122575581", - "bio/ncbi_122627339", - "bio/ncbi_122713845", - "bio/ncbi_122854147", - "bio/ncbi_123263553", - "bio/ncbi_124184389", - "bio/ncbi_124220898", - "bio/ncbi_124306655", - "bio/ncbi_124413059", - "bio/ncbi_124421799", - "bio/ncbi_124954635", - "bio/ncbi_126856388", - "bio/ncbi_126870169", - "bio/ncbi_126924324", - "bio/ncbi_127069822", - "bio/ncbi_127282513", - "bio/ncbi_128874625", - "bio/ncbi_128889118", - "bio/ncbi_130669610", - "bio/ncbi_131673312", - "bio/ncbi_132911847", - "bio/ncbi_13405080", - "bio/ncbi_135164074", - "bio/ncbi_136772661", - "bio/ncbi_137238056", - "bio/ncbi_137584463", - "bio/ncbi_137923949", - "bio/ncbi_138184182", - "bio/ncbi_24590706", - "bio/ncbi_27205858", - "bio/ncbi_2768677", - "bio/ncbi_55632361", - "bio/ncbi_7020953" - ], - "title": "Here are all the properties for p53", - "type": "ANSWER_TABLE" - } - ] - } - ] - } - ] - } - ] - }, - "context": {}, - "debug": {}, - "entities": [ - { - "dcid": "bio/ncbi_100116126", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_100384887", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_100648488", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_100748472", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_100862994", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_100880552", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_102673235", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_103571032", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105153546", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105191985", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105208183", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105254636", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105262739", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105279045", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105362659", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105432841", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105459353", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105569139", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105618583", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105675615", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105689091", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105698580", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_105838456", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_106639380", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_106658358", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_106745060", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_106788176", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107042628", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107066338", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107189253", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107225289", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107265501", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_107995588", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108545268", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108570248", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108631837", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108691270", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108727983", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108748092", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108767092", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_108781309", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_109856991", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_112460253", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_114879319", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_114932480", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_115239461", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_116433506", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_116840453", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117157365", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117168497", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117205777", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117229339", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117233976", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_117603631", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_118072630", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_118446045", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122405354", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122407929", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122510677", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122526586", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122529794", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122575581", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122627339", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122713845", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_122854147", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_123263553", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124184389", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124220898", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124306655", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124413059", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124421799", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_124954635", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_126856388", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_126870169", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_126924324", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_127069822", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_127282513", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_128874625", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_128889118", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_130669610", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_131673312", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_132911847", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_13405080", - "name": "P53", - "type": "" - }, - { - "dcid": "bio/ncbi_135164074", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_136772661", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_137238056", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_137584463", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_137923949", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_138184182", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_24590706", - "name": "P53", - "type": "" - }, - { - "dcid": "bio/ncbi_27205858", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_2768677", - "name": "p53", - "type": "" - }, - { - "dcid": "bio/ncbi_55632361", - "name": "P53", - "type": "" - }, - { - "dcid": "bio/ncbi_7020953", - "name": "P53", - "type": "" - } - ], - "pastSourceContext": "", - "place": {}, - "placeFallback": {}, - "placeSource": "UNKNOWN", - "places": [], - "relatedThings": { - "childPlaces": {}, - "childTopics": [], - "exploreMore": {}, - "mainTopics": [], - "parentPlaces": [], - "parentTopics": [], - "peerPlaces": [], - "peerTopics": [] - }, - "svSource": "UNKNOWN", - "userMessages": [ - "See relevant information based on the previous query." - ] -} \ No newline at end of file diff --git a/server/integration_tests/test_data/e2e_triple/howaboutforpqlc3/chart_config.json b/server/integration_tests/test_data/e2e_triple/howaboutforpqlc3/chart_config.json new file mode 100644 index 0000000000..d390e4f8f9 --- /dev/null +++ b/server/integration_tests/test_data/e2e_triple/howaboutforpqlc3/chart_config.json @@ -0,0 +1,68 @@ +{ + "client": "test_detect-and-fulfill", + "config": { + "categories": [ + { + "blocks": [ + { + "columns": [ + { + "tiles": [ + { + "answerTableTileSpec": { + "columns": [ + { + "header": "Has Genomic Coordinates", + "propertyExpr": "->hasGenomicCoordinates" + } + ] + }, + "entities": [ + "bio/ncbi_109457387", + "bio/ncbi_421941" + ], + "title": "The Has Genomic Coordinates for PQLC3 are as follows:", + "type": "ANSWER_TABLE" + } + ] + } + ] + } + ] + } + ] + }, + "context": {}, + "debug": {}, + "entities": [ + { + "dcid": "bio/ncbi_109457387", + "name": "PQLC3", + "type": "" + }, + { + "dcid": "bio/ncbi_421941", + "name": "PQLC3", + "type": "" + } + ], + "pastSourceContext": "", + "place": {}, + "placeFallback": {}, + "placeSource": "UNKNOWN", + "places": [], + "relatedThings": { + "childPlaces": {}, + "childTopics": [], + "exploreMore": {}, + "mainTopics": [], + "parentPlaces": [], + "parentTopics": [], + "peerPlaces": [], + "peerTopics": [] + }, + "svSource": "UNKNOWN", + "userMessages": [ + "See relevant information based on the previous query." + ] +} \ No newline at end of file diff --git a/server/integration_tests/test_data/e2e_triple/whattranscriptsdoesithave/chart_config.json b/server/integration_tests/test_data/e2e_triple/whatgenomiccoordinatesdoesithave/chart_config.json similarity index 89% rename from server/integration_tests/test_data/e2e_triple/whattranscriptsdoesithave/chart_config.json rename to server/integration_tests/test_data/e2e_triple/whatgenomiccoordinatesdoesithave/chart_config.json index 71027cf577..b7b47c48da 100644 --- a/server/integration_tests/test_data/e2e_triple/whattranscriptsdoesithave/chart_config.json +++ b/server/integration_tests/test_data/e2e_triple/whatgenomiccoordinatesdoesithave/chart_config.json @@ -11,161 +11,9 @@ { "answerTableTileSpec": { "columns": [ - { - "header": "Alliance Of Genome Resources Id", - "propertyExpr": "->allianceOfGenomeResourcesId" - }, - { - "header": "Alternate Gene Symbol", - "propertyExpr": "->alternateGeneSymbol" - }, - { - "header": "Bovine Genome Database Id", - "propertyExpr": "->bovineGenomeDatabaseId" - }, - { - "header": "Chicken Gene Nomenclature Consortium Id", - "propertyExpr": "->chickenGeneNomenclatureConsortiumId" - }, - { - "header": "Chrom", - "propertyExpr": "->chrom" - }, - { - "header": "Comparative Toxicogenomics DBID", - "propertyExpr": "->comparativeToxicogenomicsDBID" - }, - { - "header": "Cytogenetic Map Location", - "propertyExpr": "->cytogeneticMapLocation" - }, - { - "header": "Date Modified", - "propertyExpr": "->dateModified" - }, - { - "header": "Description", - "propertyExpr": "->description" - }, - { - "header": "Ensembl Gene Id", - "propertyExpr": "->ensemblGeneId" - }, - { - "header": "Ensembl ID", - "propertyExpr": "->ensemblID" - }, - { - "header": "Ensembl Rapid Release Id", - "propertyExpr": "->ensemblRapidReleaseId" - }, - { - "header": "Full Name", - "propertyExpr": "->fullName" - }, - { - "header": "Gen Atlas ID", - "propertyExpr": "->genAtlasID" - }, - { - "header": "Gene Card ID", - "propertyExpr": "->geneCardID" - }, - { - "header": "Gene Ortholog", - "propertyExpr": "->geneOrtholog" - }, - { - "header": "Gene Symbol", - "propertyExpr": "->geneSymbol" - }, - { - "header": "Has Cpic Dosing Guideline", - "propertyExpr": "->hasCpicDosingGuideline" - }, - { - "header": "Has Genetic Variant Annotation", - "propertyExpr": "->hasGeneticVariantAnnotation" - }, { "header": "Has Genomic Coordinates", "propertyExpr": "->hasGenomicCoordinates" - }, - { - "header": "Hgnc ID", - "propertyExpr": "->hgncID" - }, - { - "header": "Is Very Important Pharmacogene", - "propertyExpr": "->isVeryImportantPharmacogene" - }, - { - "header": "Mouse Genome Informatics Id", - "propertyExpr": "->mouseGenomeInformaticsId" - }, - { - "header": "Name", - "propertyExpr": "->name" - }, - { - "header": "Ncbi Gene ID", - "propertyExpr": "->ncbiGeneID" - }, - { - "header": "Ncbi Sequence Gi Number", - "propertyExpr": "->ncbiSequenceGiNumber" - }, - { - "header": "Of Species", - "propertyExpr": "->ofSpecies" - }, - { - "header": "Online Mendelian Inheritance In Man ID", - "propertyExpr": "->onlineMendelianInheritanceInManID" - }, - { - "header": "Pharm GKBID", - "propertyExpr": "->pharmGKBID" - }, - { - "header": "Provenance", - "propertyExpr": "->provenance" - }, - { - "header": "Pub Med ID", - "propertyExpr": "->pubMedID" - }, - { - "header": "Rat Genome Database Id", - "propertyExpr": "->ratGenomeDatabaseId" - }, - { - "header": "Ref Seq Genomic Accession", - "propertyExpr": "->refSeqGenomicAccession" - }, - { - "header": "Related Pseudogene", - "propertyExpr": "->relatedPseudogene" - }, - { - "header": "Synonym", - "propertyExpr": "->synonym" - }, - { - "header": "Type Of", - "propertyExpr": "->typeOf" - }, - { - "header": "Type Of Gene", - "propertyExpr": "->typeOfGene" - }, - { - "header": "Vertebrate Gene Nomenclature Committee Id", - "propertyExpr": "->vertebrateGeneNomenclatureCommitteeId" - }, - { - "header": "Xenbase Id", - "propertyExpr": "->xenbaseId" } ] }, @@ -199,7 +47,6 @@ "bio/ncbi_101342270", "bio/ncbi_101364849", "bio/ncbi_101427669", - "bio/ncbi_101534918", "bio/ncbi_101544250", "bio/ncbi_101588120", "bio/ncbi_101617102", @@ -421,7 +268,6 @@ "bio/ncbi_121663069", "bio/ncbi_121935105", "bio/ncbi_122107616", - "bio/ncbi_122158487", "bio/ncbi_122188167", "bio/ncbi_122217613", "bio/ncbi_122432228", @@ -579,7 +425,7 @@ "bio/ncbi_780507", "bio/ncbi_79114" ], - "title": "Here are all the properties for FGFR1", + "title": "The Has Genomic Coordinates for FGFR1 are as follows:", "type": "ANSWER_TABLE" } ] diff --git a/tools/nl/embeddings/input/bio/_preindex.csv b/tools/nl/embeddings/input/bio/_preindex.csv index 0dde4b2383..d70ab1273a 100644 --- a/tools/nl/embeddings/input/bio/_preindex.csv +++ b/tools/nl/embeddings/input/bio/_preindex.csv @@ -1,5 +1,4 @@ sentence,dcid -"""UMLS CUI""",unifiedMedicalLanguageSystemConceptUniqueIdentifier A specific organism or taxonomic group of organisms that are susceptible to be infected by a virus,virusHost A unique ID for a Medical Subject Heading Descriptor record,medicalSubjectHeadingDescriptorID A unique ID for a Medical Subject Heading supplementary record,medicalSubjectHeadingSupplementaryRecordID @@ -17,35 +16,30 @@ Gene associated with a disease,<-diseaseID{typeOf:DiseaseGeneAssociation}->geneI Gene associated with a genetic variant,<-variantID{typeOf:GeneGeneticVariantAssociation}->geneID Genetic variant associated with a disease,<-diseaseID{typeOf:DiseaseGeneticVariantAssociation}->geneticVariantID GeneticVariantGeneAssociation,<-geneID{typeOf:GeneGeneticVariantAssociation}->variantID;<-variantID{typeOf:GeneGeneticVariantAssociation}->geneID +HUGO Gene Nomenclature Committee identifier,hgncID +InChIKey,inChIKey +International Chemical Identifier (InChI) Key,inChIKey +MOA,mechanismOfAction MeSH descriptor record ID,medicalSubjectHeadingDescriptorID MeSH supplementary record ID,medicalSubjectHeadingSupplementaryRecordID -NCBI Taxonomy database identifier,ncbiTaxonID -NCBI defined segment of DNA sequence name,ncbiDNASequenceName +NCBI Taxonomy database identifier,ncbiTaxId NCBI protein accession number,ncbiProteinAccessionNumber -Name used by NIH NCBI to refer to a segment of DNA sequence,ncbiDNASequenceName OMIM database identifier,omimID Origin of variant allele,alleleOrigin -RNA transcript that a gene has,hasRNATranscript -Recorded transcript,hasRNATranscript -Reference genomic sequence from dbSNP,referenceAlleleNCBI +Reference genomic sequence from dbSNP,referenceAllele Simplified Molecular Input Line Entry System (SMILE),simplifiedMolecularInputLineEntrySystem Size of chromosome,chromosomeSize Systematiized Nomenclature of Medicine (SNOMED) clinical terms (CT) code,snomedCT The allele of a genetic variant observed within a population,alleleType "The disease diagnosis code for version 10 of the International Classification of Diseases (ICD), Clinical Modification",icd10CMCode -The genomic location of a genetic variant using the hg19 assembly,hg19GenomicLocation -The genomic location of a genetic variant using the hg38 assembly,hg38GenomicLocation -The genomic position of a genetic variant using the hg19 assembly,hg19GenomicPosition -The genomic position of a genetic variant using the hg38 assembly,hg38GenomicPosition The method by which a drug is administered,administrationRoute -The name of the disease,diseaseName The orientation of the strand on which an annotation is located,strandOrientation -The sequences of the observed alleles from rs-fasta files.,observedAllele The species of a virus isolate,ofVirusSpecies The strand on which a given annotation is located,strandOrientation The type of gene,typeOfGene Type of allele,alleleType -Unified Medical Language System (UMLS) Concept Unique Identifier (CUI),unifiedMedicalLanguageSystemConceptUniqueIdentifier +UMLS CUI,umlsConceptUniqueID +Unified Medical Language System (UMLS) Concept Unique Identifier (CUI),umlsConceptUniqueID Variant allele origin,alleleOrigin activeIngredient,activeIngredient administrationRoute,administrationRoute @@ -53,15 +47,12 @@ alleleOrigin,alleleOrigin alleleType,alleleType antibodyType,antibodyType antigenType,antigenType -availableStrength,availableStrength chemblID,chemblID chemical compound associated with a genetic variant,<-variantID{typeOf:ChemicalCompoundGeneticVariantAssociation}->compoundID chromosomeSize,chromosomeSize class,class "component that provides pharmacological activity or other direct effect in the diagnosis, cure, mitigation, treatment, or prevention of disease, or to affect the structure or any function of the body of man or animals",activeIngredient -diseaseName,diseaseName dosageForm,dosageForm -dose approved for a drug,availableStrength ensemblID,ensemblID full name of the gene,fullName fullName,fullName @@ -70,31 +61,22 @@ geneID,geneID genetic variant associated with a chemical compound,<-compoundID{typeOf:ChemicalCompoundGeneticVariantAssociation}->variantID genetic variant associated with a gene,<-geneID{typeOf:GeneGeneticVariantAssociation}->variantID geneticVariantFunctionalCategory,geneticVariantFunctionalCategory -genomic coordinates,genomicCoordinates -genomicCoordinates,genomicCoordinates +genomic coordinates,hasGenomicCoordinates genus of a virus species,virusGenus -hasRNATranscript,hasRNATranscript -hg19GenomicLocation,hg19GenomicLocation -hg19GenomicPosition,hg19GenomicPosition -hg38GenomicLocation,hg38GenomicLocation -hg38GenomicPosition,hg38GenomicPosition +hgncID,hgncID host of a virus,virusHost icd10CMCode,icd10CMCode -imageUrl,imageUrl +mechanismOfAction,mechanismOfAction medicalSubjectHeadingDescriptorID,medicalSubjectHeadingDescriptorID medicalSubjectHeadingSupplementaryRecordID,medicalSubjectHeadingSupplementaryRecordID -ncbiDNASequenceName,ncbiDNASequenceName ncbiProteinAccessionNumber,ncbiProteinAccessionNumber -ncbiTaxonID,ncbiTaxonID +ncbiTaxID,ncbiTaxId number of nucleotides in a chromosome,chromosomeSize -observedAllele,observedAllele ofVirusSpecies,ofVirusSpecies omimID,omimID phylum,phylum physical form in which a drug is produced and dispensed,dosageForm -preferred disease name for the concept specified by disease identifiers,diseaseName -reference allele,referenceAlleleNCBI -referenceAlleleNCBI,referenceAlleleNCBI +reference allele,referenceAllele simplifiedMolecularInputLineEntrySystem,simplifiedMolecularInputLineEntrySystem snomedCT,snomedCT specialization of,specializationOf @@ -102,11 +84,10 @@ specializationOf,specializationOf strandOrientation,strandOrientation subClassificationOf,subClassificationOf subclassification of,subClassificationOf +the biochemical interaction through which a drug produces a pharmacological effect,mechanismOfAction type of antibody,antibodyType type of antigen,antigenType typeOfGene,typeOfGene -unifiedMedicalLanguageSystemConceptUniqueIdentifier,unifiedMedicalLanguageSystemConceptUniqueIdentifier -url to an image of what the biological specimen looks like,imageUrl +umlsConceptUniqueID,umlsConceptUniqueID virusGenus,virusGenus virusHost,virusHost -what the entity looks like,imageUrl diff --git a/tools/nl/embeddings/input/bio/sheets_svs.csv b/tools/nl/embeddings/input/bio/sheets_svs.csv index 7042de0db2..8ef47be98b 100644 --- a/tools/nl/embeddings/input/bio/sheets_svs.csv +++ b/tools/nl/embeddings/input/bio/sheets_svs.csv @@ -1,30 +1,26 @@ dcid,sentence ofVirusSpecies,ofVirusSpecies;The species of a virus isolate virusHost,virusHost;A specific organism or taxonomic group of organisms that are susceptible to be infected by a virus;host of a virus -ncbiTaxonID,ncbiTaxonID;NCBI Taxonomy database identifier -diseaseName,diseaseName;preferred disease name for the concept specified by disease identifiers;The name of the disease -observedAllele,observedAllele;The sequences of the observed alleles from rs-fasta files. -referenceAlleleNCBI,referenceAlleleNCBI;Reference genomic sequence from dbSNP;reference allele +ncbiTaxId,ncbiTaxID;NCBI Taxonomy database identifier +referenceAllele,Reference genomic sequence from dbSNP;reference allele class,class phylum,phylum geneticVariantFunctionalCategory,geneticVariantFunctionalCategory;Functional category of the genetic variant -hg19GenomicPosition,hg19GenomicPosition;The genomic position of a genetic variant using the hg19 assembly -hg19GenomicLocation,hg19GenomicLocation;The genomic location of a genetic variant using the hg19 assembly -hg38GenomicPosition,hg38GenomicPosition;The genomic position of a genetic variant using the hg38 assembly -hg38GenomicLocation,hg38GenomicLocation;The genomic location of a genetic variant using the hg38 assembly -hasRNATranscript,hasRNATranscript;Recorded transcript;RNA transcript that a gene has +hgncID,hgncID;HUGO Gene Nomenclature Committee identifier +inChIKey,InChIKey;International Chemical Identifier (InChI) Key strandOrientation,strandOrientation;The strand on which a given annotation is located;The orientation of the strand on which an annotation is located typeOfGene,typeOfGene;The type of gene omimID,omimID;OMIM database identifier icd10CMCode,"icd10CMCode;The disease diagnosis code for version 10 of the International Classification of Diseases (ICD), Clinical Modification" subClassificationOf,subClassificationOf;subclassification of snomedCT,snomedCT;Systematiized Nomenclature of Medicine (SNOMED) clinical terms (CT) code -unifiedMedicalLanguageSystemConceptUniqueIdentifier,"unifiedMedicalLanguageSystemConceptUniqueIdentifier;Unified Medical Language System (UMLS) Concept Unique Identifier (CUI);""UMLS CUI""" +umlsConceptUniqueID,umlsConceptUniqueID;Unified Medical Language System (UMLS) Concept Unique Identifier (CUI);UMLS CUI specializationOf,specializationOf;specialization of chemblID,chemblID;ChEMBL identifier simplifiedMolecularInputLineEntrySystem,simplifiedMolecularInputLineEntrySystem;Simplified Molecular Input Line Entry System (SMILE) medicalSubjectHeadingSupplementaryRecordID,medicalSubjectHeadingSupplementaryRecordID;A unique ID for a Medical Subject Heading supplementary record;An ID for a Medical Subject Heading supplementary record;MeSH supplementary record ID medicalSubjectHeadingDescriptorID,medicalSubjectHeadingDescriptorID;A unique ID for a Medical Subject Heading Descriptor record;An ID for a Medical Subject Heading descriptor record;MeSH descriptor record ID +mechanismOfAction,mechanismOfAction;MOA;the biochemical interaction through which a drug produces a pharmacological effect activeIngredient,"activeIngredient;component that provides pharmacological activity or other direct effect in the diagnosis, cure, mitigation, treatment, or prevention of disease, or to affect the structure or any function of the body of man or animals" administrationRoute,administrationRoute;The method by which a drug is administered dosageForm,dosageForm;physical form in which a drug is produced and dispensed @@ -37,10 +33,7 @@ geneID,geneID;gene id ncbiProteinAccessionNumber,ncbiProteinAccessionNumber;NCBI protein accession number alleleOrigin,alleleOrigin;Variant allele origin;Origin of variant allele alleleType,alleleType;The allele of a genetic variant observed within a population;Type of allele -ncbiDNASequenceName,ncbiDNASequenceName;NCBI defined segment of DNA sequence name;Name used by NIH NCBI to refer to a segment of DNA sequence -imageUrl,imageUrl;url to an image of what the biological specimen looks like;what the entity looks like -genomicCoordinates,genomicCoordinates;genomic coordinates -availableStrength,availableStrength;dose approved for a drug +hasGenomicCoordinates,genomic coordinates <-variantID{typeOf:GeneGeneticVariantAssociation}->geneID,GeneticVariantGeneAssociation;Gene associated with a genetic variant <-geneID{typeOf:GeneGeneticVariantAssociation}->variantID,GeneticVariantGeneAssociation;genetic variant associated with a gene <-diseaseID{typeOf:DiseaseGeneAssociation}->geneID,DiseaseGeneAssociation;Gene associated with a disease