Skip to content

Commit

Permalink
[bio] Update biomed property embeddings. (#4721)
Browse files Browse the repository at this point in the history
Adds:
- mechanismOfAction
- hgncID
- inChIKey

Updates:
- unifiedMedicalLanguageSystemConceptUniqueIdentifier -> umlsConceptUniqueID
- ncbiTaxonID -> ncbiTaxId
- referenceAlleleNCBI -> referenceAllele
- genomicCoordinates -> hasGenomicCoordinates

Deletes:
- diseaseName
- observedAllele
- hg19GenomicPosition
- hg19GenomicLocation
- hg38GenomicPosition
- hg38GenomicLocation
- hasRNATranscript
- ncbiDNASequenceName
- imageUrl
- availableStrength

Example Screenshots

![image](https://github.com/user-attachments/assets/6783a776-48e0-4ceb-aa57-b0276a67d2db)

![image](https://github.com/user-attachments/assets/568b48e8-c6b6-4dce-9e99-af9bf7f0f912)

![image](https://github.com/user-attachments/assets/8f31b18b-3169-4ef0-a67d-a0b6cea3bf71)
  • Loading branch information
clincoln8 authored Nov 11, 2024
1 parent c2c1d9d commit 3b0f5ab
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 957 deletions.
2 changes: 1 addition & 1 deletion deploy/nl/catalog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ indexes:
bio_ft:
store_type: MEMORY
source_path: ../../tools/nl/embeddings/input/bio
embeddings_path: gs://datcom-nl-models/bio_ft_2024_11_05_09_59_39/embeddings.csv
embeddings_path: gs://datcom-nl-models/bio_ft_2024_11_08_19_00_38/embeddings.csv
model: ft-final-v20230717230459-all-MiniLM-L6-v2
healthcheck_query: "Gene"
base_uae_lance:
Expand Down
7 changes: 5 additions & 2 deletions server/integration_tests/explore_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,14 +761,17 @@ def test_e2e_triple(self):
self.run_detect_and_fulfill(
'e2e_triple',
[
# ----- Context Based Queries -----
# Should have 'out' properties as answer
'What strand orientation does FGFR1 have?',
# Should use context for the entity
'what transcripts does it have',
'what genomic coordinates does it have',
# Should use context for the property
'how about for P53',
'how about for PQLC3',
# Should not use context because no entity or property found
'what animal is that found in',

# ----- Singleton Queries -----
# Should have 'in' properties as answer
'What is Betacoronavirus 1 the species of',
# Should have a chained property in the answer
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
0.314266,
0.312765,
0.312629,
0.312504,
0.312505,
0.311774,
0.310339,
0.310075,
Expand Down Expand Up @@ -75,24 +75,23 @@
0.31467,
0.308395,
0.307941,
0.30463,
0.282709,
0.281875,
0.280828,
0.280453,
0.274298,
0.271505,
0.270718,
0.269547,
0.259901,
0.259901,
0.248279,
0.247354,
0.24662,
0.246321,
0.246164,
0.242123,
0.246163,
0.238985,
0.231355,
0.229624
0.230288,
0.229624,
0.227736,
0.221732,
0.21856,
0.216932
],
"PROP": [
"phylum",
Expand All @@ -106,24 +105,23 @@
"ensemblID",
"simplifiedMolecularInputLineEntrySystem",
"geneticVariantFunctionalCategory",
"imageUrl",
"genomicCoordinates",
"chromosomeSize",
"observedAllele",
"hg38GenomicPosition",
"hg19GenomicPosition",
"virusHost",
"strandOrientation",
"ncbiDNASequenceName",
"<-geneID{typeOf:GeneGeneticVariantAssociation}->variantID",
"<-variantID{typeOf:GeneGeneticVariantAssociation}->geneID",
"ncbiProteinAccessionNumber",
"hgncID",
"alleleType",
"hg38GenomicLocation",
"ofVirusSpecies",
"hg19GenomicLocation",
"ncbiTaxonID",
"antigenType"
"hasGenomicCoordinates",
"ncbiTaxId",
"umlsConceptUniqueID",
"antigenType",
"alleleOrigin",
"<-diseaseID{typeOf:DiseaseGeneAssociation}->geneID",
"antibodyType",
"inChIKey"
]
},
"query_detection_debug_logs": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
0.400638,
0.398746,
0.398361,
0.390523,
0.390524,
0.389033,
0.388461,
0.388382,
Expand All @@ -35,7 +35,7 @@
0.385679,
0.385434,
0.385339,
0.381842,
0.381843,
0.379705
],
"MultiSV": {},
Expand Down Expand Up @@ -77,24 +77,23 @@
0.65902,
0.63579,
0.63404,
0.617156,
0.617157,
0.598812,
0.595879,
0.585015,
0.583271,
0.581079,
0.576904,
0.543242,
0.539973,
0.532047,
0.530959,
0.51427,
0.540855,
0.538974,
0.512528,
0.46106,
0.444108,
0.398229,
0.394443,
0.39229
0.39229,
0.384802,
0.341795,
0.324578,
0.32201,
0.320837
],
"PROP": [
"typeOfGene",
Expand All @@ -107,22 +106,21 @@
"<-diseaseID{typeOf:DiseaseGeneAssociation}->geneID",
"<-compoundID{typeOf:ChemicalCompoundGeneticVariantAssociation}->variantID",
"<-geneID{typeOf:DiseaseGeneAssociation}->diseaseID",
"genomicCoordinates",
"antigenType",
"<-variantID{typeOf:ChemicalCompoundGeneticVariantAssociation}->compoundID",
"alleleType",
"<-geneticVariantID{typeOf:DiseaseGeneticVariantAssociation}->diseaseID",
"hg19GenomicPosition",
"hasRNATranscript",
"hg19GenomicLocation",
"hg38GenomicPosition",
"hg38GenomicLocation",
"hgncID",
"hasGenomicCoordinates",
"antibodyType",
"chromosomeSize",
"referenceAlleleNCBI",
"observedAllele",
"ncbiDNASequenceName",
"alleleOrigin"
"referenceAllele",
"alleleOrigin",
"specializationOf",
"subClassificationOf",
"virusGenus",
"virusHost",
"ncbiProteinAccessionNumber"
]
},
"query_detection_debug_logs": {
Expand Down
Loading

0 comments on commit 3b0f5ab

Please sign in to comment.