Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add upper- and lowercase prefix synonyms #969

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
4 changes: 3 additions & 1 deletion src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -36468,7 +36468,6 @@
"name": "Drosophila Phenotype Ontology",
"prefix": "FBcv"
},
"part_of": "flybase",
"pattern": "^\\d{7}$",
"prefixcommons": {
"bioportal": "1017",
Expand Down Expand Up @@ -41924,6 +41923,9 @@
"Phenomics",
"Comparative Genomics",
"Omics"
],
"synonyms": [
"GEO"
]
},
"go": {
Expand Down
7 changes: 6 additions & 1 deletion src/bioregistry/data/contexts.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{
"obo": {
"blacklist": [
"icd9"
"icd9",
"orphanet",
"gro"
],
"custom_prefix_map": {
"PMID": "https://www.ncbi.nlm.nih.gov/pubmed/"
Expand Down Expand Up @@ -29,7 +31,10 @@
"default"
],
"prefix_remapping": {
"GEO": "ncbi.geo",
"cpga": "GRO",
"ensembl": "ENSEMBL",
"geogeo": "GEO",
"icd10": "ICD10WHO",
"orphanet.ordo": "Orphanet",
"pubmed": "PMID",
Expand Down
22 changes: 22 additions & 0 deletions src/bioregistry/record_accumulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def get_converter(
converter = curies.remap_curie_prefixes(converter, remapping)
if rewiring:
converter = curies.rewire(converter, rewiring)
converter = _enrich_converter_synonyms(converter)
return converter


Expand Down Expand Up @@ -340,3 +341,24 @@ def _add_prefix_prefixes(
)

return [record for _, record in sorted(records.items())]


def _enrich_converter_synonyms(converter: Converter) -> Converter:
    """Build a converter whose records carry additional prefix synonyms.

    :param converter: The converter whose records should be enriched.
    :returns: A new converter constructed from the enriched records.

    .. note::

        Every record is passed through :func:`_enrich_record_synonyms`, which
        updates the record it is given, so the records held by the input
        converter are modified as well.
    """
    enriched_records = []
    for record in converter.records:
        enriched_records.append(_enrich_record_synonyms(record))
    return Converter(enriched_records)


def _enrich_record_synonyms(record: curies.Record) -> curies.Record:
    """Extend a record's prefix synonyms with generated spelling variants.

    Variants are generated (via :func:`_generate_variants`) for the record's
    prefix and for each of its existing prefix synonyms. The prefix itself is
    never listed as its own synonym.

    :param record: The record to enrich. It is modified in place.
    :returns: The same record object, with ``prefix_synonyms`` replaced by the
        sorted list of all collected variants.
    """
    variants = {
        variant
        for name in (record.prefix, *record.prefix_synonyms)
        for variant in _generate_variants(name)
    }
    # The canonical prefix must not appear among its own synonyms
    variants.discard(record.prefix)
    record.prefix_synonyms = sorted(variants)
    return record


def _generate_variants(s: str):
yield s
yield s.lower()
yield s.upper()
yield s.replace("_", "")
yield s.replace("_", "").upper()
yield s.replace("_", "").lower()
44 changes: 14 additions & 30 deletions tests/test_contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import json
import unittest

import pytest

import bioregistry
from bioregistry import Resource, manager
from bioregistry.constants import CONTEXTS_PATH
Expand Down Expand Up @@ -44,6 +46,8 @@ def test_obo_context(self):
self.assertEqual(f"{p}/KISAO_", prefix_map["KISAO"])
self.assertIn("FBcv", prefix_map)
self.assertEqual(f"{p}/FBcv_", prefix_map["FBcv"])
self.assertNotIn("geo", prefix_map)
self.assertIn("ncbi.geo", prefix_map)
self.assertIn("GEO", prefix_map)
self.assertEqual(f"{p}/GEO_", prefix_map["GEO"])
self.assertEqual("https://www.ncbi.nlm.nih.gov/pubmed/", prefix_map["PMID"])
Expand All @@ -58,15 +62,22 @@ def test_obo_context(self):
msg="When overriding, this means that bioregistry prefix isn't properly added to the synonyms list",
)

@pytest.mark.slow
def test_obo_converter(self):
"""Test getting a converter from a context."""
converter = manager.get_converter_from_context("obo")
self.assertEqual("ICD10WHO", converter.standardize_prefix("icd10"))
self.assertEqual("Orphanet", converter.standardize_prefix("ordo"))
self.assertEqual("GO", converter.standardize_prefix("GO"))
self.assertEqual("GO", converter.standardize_prefix("gomf"))
self.assertEqual("GO", converter.standardize_prefix("GO", strict=True))
self.assertEqual("GO", converter.standardize_prefix("gomf", strict=True))
self.assertEqual("https://www.ncbi.nlm.nih.gov/pubmed/", converter.bimap["PMID"])
self.assertEqual("GO", converter.standardize_prefix("go", strict=True))
self.assertEqual("GO", converter.standardize_prefix("go"))
self.assertEqual("PMID", converter.standardize_prefix("pmid", strict=True))
self.assertEqual("PMID", converter.standardize_prefix("pubmed", strict=True))
self.assertEqual("PMID", converter.standardize_prefix("PubMed", strict=True))
self.assertEqual("PMID", converter.standardize_prefix("PUBMED"))
self.assertEqual("PMID", converter.standardize_prefix("PMID"))
self.assertEqual("oboInOwl", converter.standardize_prefix("oboinowl"))

def test_data(self):
Expand Down Expand Up @@ -95,36 +106,9 @@ def test_data(self):
self.valid_metaprefixes.union({"obofoundry.preferred", "preferred", "default"}),
)
remapping = context.prefix_remapping or {}
_valid_remapping_prefixes = set(
bioregistry.get_prefix_map(
uri_prefix_priority=context.uri_prefix_priority,
)
)
_valid_remapping_prefixes = set(manager.converter.prefix_map)
for prefix in remapping:
# Currently this allows overwriting of existing prefixes
self.assertIn(prefix, _valid_remapping_prefixes)

_valid_custom_prefixes = set(
bioregistry.get_prefix_map(
remapping=remapping,
uri_prefix_priority=context.uri_prefix_priority,
)
)
invalid_custom_prefixes = {
prefix
for prefix in (context.custom_prefix_map or {})
if prefix not in _valid_custom_prefixes
}
self.assertEqual(
0,
len(invalid_custom_prefixes),
msg=f"""

All prefixes in the custom prefix mapping should either be canonical prefixes or generated by the prefix remapping

Invalid prefixes: {", ".join(sorted(invalid_custom_prefixes))}
""",
)

for blacklist_prefix in context.blacklist or []:
self.assertIn(blacklist_prefix, self.valid_prefixes)
15 changes: 15 additions & 0 deletions tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import unittest

import pytest

import bioregistry
from bioregistry import Manager, parse_curie
from bioregistry.export.rdf_export import get_full_rdf
Expand Down Expand Up @@ -261,3 +263,16 @@ def test_external_registry_mappings(self):
self.assertIn("loggerhead", res.source_only)
# This is a non-ontology so it won't get in OBO Foundry
self.assertIn("DCTERMS", res.target_only)

@pytest.mark.slow
def test_converter(self):
    """Test standardizing prefixes with the converter."""
    converter = self.manager.get_converter()
    # Pairs of (expected standard prefix, prefix given to the converter)
    expectations = [
        ("meddra", "MEDDRA"),
        ("meddra", "MedDRA"),
        ("pubmed", "PMID"),
        ("pubmed", "PUBMED"),
        ("pubmed", "pmid"),
        ("pubmed", "pubmed"),
        ("pubmed", "PubMed"),
    ]
    for expected, query in expectations:
        self.assertEqual(expected, converter.standardize_prefix(query))
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ envlist =

[testenv]
commands =
coverage run -p -m pytest --durations=20 {posargs:tests}
coverage run -p -m pytest --durations=20 --disable-warnings {posargs:tests}
coverage combine
coverage xml
deps =
Expand Down
Loading