Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

get a report of envo terms with ids in the range of nmdc or nmdc contributors #26

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
src/ontology/nmdco-redundant.ttl
local/
**/__pycache__/

.DS_Store
.idea/
Expand Down
27 changes: 27 additions & 0 deletions envo_id_ranges_report.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
range owner min max
1 Norman Morrison 1 999999
10 ENVO-P, E-NumberOf-E 9400000 9499999
11 Kai Blumberg pattern generated terms 3100001 3200000
13 chemical_concentration.yaml yaml design pattern 3200001 3300000
14 ebi_biomes.csv robot design pattern 3300001 3400000
15 Ruth Duerr 3400001 3500000
16 Anne Thessen 3500001 3501000
17 Anton Van de Putte 3501001 3501100
2 Pier Luigi Buttigieg 1000000 1999999
20 Raissa Meyer 3520000 3529999
21 Brandon Whitehead 6000000 6100000
22 Jimena Linares 6100001 6105000
23 Mei-Chen Liu 6105001 6105100
24 Isabella Maria Wilkie 6105101 6105200
25 National Microbiome Data Collaborative 3600000 3600099
26 Agronomy Ontology 6105201 6105499
27 ANZ Soil and Land survey 5000000 5100000
28 Alicia Clum 3600100 3604999
29 Mark Andrew Miller 3605000 3609999
3 Chris Mungall 2000000 2999999
4 TermGenie 8000000 8999999
5 ENVO-P, E-A-L 9000000 9099999
6 ENVO-P, E-Q-L 9100000 9199999
7 ENVO-P, E-A 9200000 9299999
8 Kai Blumberg 3000000 3100000
9 ENVO-P, E-Content-E 9300000 9399999
Empty file added local/.gitkeep
Empty file.
21 changes: 21 additions & 0 deletions maintenance.Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Maintenance targets that rebuild the ENVO ID-range report.
# Chain: download envo-idranges.owl -> convert to Turtle with ROBOT -> TSV report.

# Prefix for commands that are run through Poetry.
RUN=poetry run

# Upstream location of the ENVO ID-range definitions.
ENVO_IDRANGES_URL=https://raw.githubusercontent.com/EnvironmentOntology/envo/master/src/envo/envo-idranges.owl

# Fetch the ID-range file, preferring wget and falling back to curl.
# The download goes to a temporary name and is moved into place only on
# success, so a failed transfer never leaves an empty or partial target that
# make would later treat as up to date. curl needs --fail, otherwise an HTTP
# error page would be saved as if it were the ontology file.
local/envo-idranges.owl.omn:
	@echo "Downloading..."
ifeq ($(shell command -v wget 2> /dev/null),)
	@echo "wget is not installed, trying with curl..."
	@curl --fail --location -o $@.tmp $(ENVO_IDRANGES_URL) && mv $@.tmp $@
else
	@echo "Downloading with wget..."
	@wget -O $@.tmp $(ENVO_IDRANGES_URL) && mv $@.tmp $@
endif

# Convert the downloaded file to Turtle; ROBOT picks the output format from
# the .ttl extension.
local/envo-idranges.owl.ttl: local/envo-idranges.owl.omn
	@echo "Converting..."
	@robot convert --input $< --output $@

# Build the TSV report from the Turtle file via the report-id-ranges script.
envo_id_ranges_report.tsv: local/envo-idranges.owl.ttl
	@echo "Generating report..."
	$(RUN) report-id-ranges \
		--id-ranges-ttl $< \
		--output $@
59 changes: 59 additions & 0 deletions nmdc_ontology/report_id_ranges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import click
from rdflib import Graph, Namespace
import pandas as pd


@click.command()
@click.option("--id-ranges-ttl", "-i", default="local/envo-idranges.owl.ttl", help="Input file path")
@click.option("--output", "-o", default="envo_id_ranges_report.tsv", help="Output file path")
def generate_id_ranges(id_ranges_ttl, output):
    """Write a TSV report of the ID ranges declared in a Turtle file.

    The input is parsed as Turtle and queried for every ID range together
    with its owner and its inclusive minimum and maximum. The result is
    saved as a tab-separated table with the columns range, owner, min and
    max.
    """
    # Stripped from each range IRI so the report shows only the short id.
    range_iri_prefix = "http://purl.obolibrary.org/obo/ro/idrange/"

    graph = Graph()
    graph.parse(id_ranges_ttl, format="turtle")

    # Each range is an rdfs:Datatype whose equivalent class carries a
    # two-element RDF list of facet restrictions. The query expects
    # xsd:minInclusive on the first element and xsd:maxInclusive on the
    # second; ranges that list the facets in another order do not match.
    query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX obo: <http://purl.obolibrary.org/obo/>
        SELECT ?range ?owner ?min ?max
        WHERE {
            ?range a rdfs:Datatype ;
                   owl:equivalentClass ?ec ;
                   obo:IAO_0000597 ?owner .
            ?ec owl:withRestrictions ?restrictions .
            ?restrictions rdf:first ?first ;
                          rdf:rest ?rest .
            ?first xsd:minInclusive ?min .
            ?rest rdf:first ?rest_first .
            ?rest_first xsd:maxInclusive ?max .
        }
    """

    # Build the report rows in a single pass over the query results.
    rows = [
        (
            str(row.range).replace(range_iri_prefix, ""),
            str(row.owner),
            str(row.min),
            str(row.max),
        )
        for row in graph.query(query)
    ]

    df = pd.DataFrame(rows, columns=["range", "owner", "min", "max"])

    # Save DataFrame as TSV
    df.to_csv(output, sep="\t", index=False)

    click.echo(f"ID ranges saved to {output}")


# Invoke the click command when this module is executed as a script.
if __name__ == "__main__":
    generate_id_ranges()
51 changes: 50 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ pandas = "^2.2.0"
requests = "^2.31.0"
pyarrow = "^15.0.0"
click = "^8.1.7"
rdflib = "^7.0.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
report-instantiated-traids = "nmdc_ontology.report_instantiated_traids:main"
report-instantiated-traids = "nmdc_ontology.report_instantiated_traids:main"
report-id-ranges = "nmdc_ontology.report_id_ranges:generate_id_ranges"
Loading