-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqc.Makefile
240 lines (202 loc) · 8.91 KB
/
qc.Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# Prefix for every project CLI entry point: execute it through Poetry.
RUN := poetry run
#ENVO_SEMSQL=downloads/envo.db

# SPARQL endpoint handed to the annotation report commands via --endpoint.
SPARQL_ENDPOINT := http://3.236.215.220/repositories/nmdc-knowledgegraph

# qc-all and qc-clean name commands, not files.
.PHONY: qc-all qc-clean
# Run `robot mirror` (verbose) on src/ontology/nmdco-edit.owl, writing into ./mirror.
# No prerequisites are declared, so the recipe only runs while the target is absent.
# todo slow
mirror/purl.obolibrary.org/obo/nmdco/imports/envo_import.owl:
	robot mirror -vvv \
		--input src/ontology/nmdco-edit.owl \
		--directory mirror
# Aggregate QC target: clean previous outputs, then build every report below.
# The prerequisite order of the original single list is preserved across the
# grouped lines (make concatenates prerequisites of repeated rule lines).
# NOTE: qc-clean is an ordinary prerequisite, so it is only guaranteed to finish
# before the other prerequisites start in a serial build; avoid `make -j qc-all`.
qc-all: qc-clean

# mirrored import and the queries that read it
qc-all: mirror/purl.obolibrary.org/obo/nmdco/imports/envo_import.owl \
    qc-reports/report-asserted-equivalencies.tsv \
    qc-reports/report-cycles.tsv

# EnvO downloads
qc-all: downloads/envo-idranges.owl \
    downloads/envo-idranges.owl.ttl \
    downloads/envo.db \
    downloads/envo.owl

# asset reports
qc-all: assets/extension_report.yaml \
    assets/report_envo_biome_annotations.yaml \
    assets/report_envo_environmental_material_annotations.yaml \
    assets/robot_diff.txt

# QC reports
qc-all: qc-reports/biosample-triad-counts.tsv \
    qc-reports/envo-all-classes.txt \
    qc-reports/envo-biomes.txt \
    qc-reports/envo-environmental-materials.txt \
    qc-reports/envo-id-ranges-report.tsv \
    qc-reports/nmdco-envo-classes-with-id-owner.tsv \
    qc-reports/report-class-ids.tsv \
    qc-reports/report-id-ranges.tsv \
    qc-reports/report-unlabelled-classes.tsv \
    src/ontology/imports/report-unlabelled-classes.txt \
    qc-reports/problematic_triads.tsv

# todo: qc-all does not yet include these targets:
#   assets/environments_with_no_biome_mappings.raw.txt
#   assets/environments_with_no_biome_mappings.raw.yaml.txt
#   assets/material_environment_built_environment_mappings_raw.yaml.txt
# Delete generated QC outputs. qc-reports/ is emptied and re-created with its
# .gitkeep placeholder; downloads/ is left alone (its cleanup is disabled below).
qc-clean:
	rm -rf qc-reports/*
	mkdir -p qc-reports && touch qc-reports/.gitkeep
#	rm -rf downloads/*owl* downloads/*.db
#	mkdir -p downloads
#	touch downloads/.gitkeep
	rm -rf assets/extension_report.yaml assets/parse_robot_diff.tsv
	rm -rf assets/report_envo_biome_annotations.yaml
	rm -rf assets/report_envo_environmental_material_annotations.yaml
	rm -rf assets/robot_diff.txt
	rm -rf src/ontology/imports/report-unlabelled-classes.txt
# Fetch envo.owl from the EnvO GitHub repository (master branch).
# wget is used when it is on PATH (tested once, while the makefile is parsed);
# otherwise curl is used.
# The transfer goes to $@.tmp and is renamed only after the tool succeeds, so a
# failed download never leaves a truncated file (or, with curl, an HTTP error
# page) behind that make would treat as an up-to-date target.
downloads/envo.owl:
	@echo "Downloading..."
	@mkdir -p $(@D)
ifeq ($(shell command -v wget 2> /dev/null),)
	@echo "wget is not installed, trying with curl..."
	@curl --fail --location --output $@.tmp https://raw.githubusercontent.com/EnvironmentOntology/envo/master/envo.owl
else
	@echo "Downloading with wget..."
	@wget -O $@.tmp https://raw.githubusercontent.com/EnvironmentOntology/envo/master/envo.owl
endif
	@mv -f $@.tmp $@
# Run the SPARQL query qc-sparql/report-class-ids.rq against the EnvO download.
# The query file is a declared prerequisite so that editing it re-runs the report;
# $< is the ontology, $(word 2,$^) the query.
qc-reports/report-class-ids.tsv: downloads/envo.owl qc-sparql/report-class-ids.rq
	robot query \
		--input $< \
		--query $(word 2,$^) $@
# Download the EnvO database with `semsql download`, going through $(RUN)
# like every other project command in this file.
downloads/envo.db:
	$(RUN) semsql download envo -o $@
# Fetch the EnvO ID-range declarations from GitHub (master branch).
# The file is in OWL Manchester syntax (omn).
# wget is used when it is on PATH (tested once, while the makefile is parsed);
# otherwise curl is used.
# The transfer goes to $@.tmp and is renamed only after the tool succeeds, so a
# failed download never leaves a truncated file (or, with curl, an HTTP error
# page) behind that make would treat as an up-to-date target.
downloads/envo-idranges.owl:
	@echo "Downloading..."
	@mkdir -p $(@D)
ifeq ($(shell command -v wget 2> /dev/null),)
	@echo "wget is not installed, trying with curl..."
	@curl --fail --location --output $@.tmp https://raw.githubusercontent.com/EnvironmentOntology/envo/master/src/envo/envo-idranges.owl
else
	@echo "Downloading with wget..."
	@wget -O $@.tmp https://raw.githubusercontent.com/EnvironmentOntology/envo/master/src/envo/envo-idranges.owl
endif
	@mv -f $@.tmp $@
# Run the SPARQL query qc-sparql/report-id-ranges.rq against the ID-range file.
# The query file is a declared prerequisite so that editing it re-runs the report;
# $< is the ontology, $(word 2,$^) the query.
qc-reports/report-id-ranges.tsv: downloads/envo-idranges.owl qc-sparql/report-id-ranges.rq
	@echo "Querying..."
	robot query \
		--input $< \
		--query $(word 2,$^) $@
# Re-serialise the ID-range file with `robot convert`; the output format
# follows from the target's .ttl extension.
downloads/envo-idranges.owl.ttl: downloads/envo-idranges.owl
	@echo "Converting..."
	@robot convert \
		--input $< \
		--output $@
# Feed the Turtle rendering of the ID ranges to the project's report-id-ranges command.
qc-reports/envo-id-ranges-report.tsv: downloads/envo-idranges.owl.ttl
	@echo "Generating report..."
	$(RUN) report-id-ranges --id-ranges-ttl $< --output $@
# Produce the biosample triad counts. The same command also writes
# qc-reports/biosample-triad-report.tsv (the --output argument).
qc-reports/biosample-triad-counts.tsv:
	@echo "Generating report..."
	$(RUN) report-instantiated-traids \
		--output $(subst counts.tsv,report.tsv,$@) \
		--counts-output $@

# The triad report is a side output of the recipe above. Declaring it here gives
# make a rule for the file (qc-reports/problematic_triads.tsv requires it) and
# orders it after the counts target. The recipe only verifies the file and
# refreshes its timestamp so it is not older than the counts file.
qc-reports/biosample-triad-report.tsv: qc-reports/biosample-triad-counts.tsv
	@test -f $@ || { echo "$@ is missing; remove $< and rebuild it" >&2; exit 1; }
	@touch $@
# Combine the ID-range report (first prerequisite) with nmdco-classes.json
# (last prerequisite) via the project's report-nmdco-envo-classes-by-id-owners command.
qc-reports/nmdco-envo-classes-with-id-owner.tsv: \
    qc-reports/envo-id-ranges-report.tsv \
    nmdco-classes.json
	$(RUN) report-nmdco-envo-classes-by-id-owners \
		--id-range-tsv-input $< \
		--nmdco-classes-json-input $(lastword $^) \
		--output $@
# List the descendants of `biome` from the EnvO database with runoak, sorted on
# the second '!'-delimited field.
# runoak writes to a scratch file first: in the former `runoak | sort > $@`
# pipeline a runoak failure was hidden behind sort's exit status and left an
# empty target that looked up to date.
qc-reports/envo-biomes.txt: downloads/envo.db
	$(RUN) runoak --input $< descendants -p i biome > $@.unsorted
	sort -t '!' -k2,2 $@.unsorted > $@
	rm -f $@.unsorted
# List the descendants of `environmental material` from the EnvO database with
# runoak, sorted on the second '!'-delimited field.
# runoak writes to a scratch file first: in the former `runoak | sort > $@`
# pipeline a runoak failure was hidden behind sort's exit status and left an
# empty target that looked up to date.
qc-reports/envo-environmental-materials.txt: downloads/envo.db
	$(RUN) runoak --input $< descendants -p i 'environmental material' > $@.unsorted
	sort -t '!' -k2,2 $@.unsorted > $@
	rm -f $@.unsorted
# List the descendants of `entity` from the EnvO database with runoak, sorted on
# the second '!'-delimited field.
# runoak writes to a scratch file first: in the former `runoak | sort > $@`
# pipeline a runoak failure was hidden behind sort's exit status and left an
# empty target that looked up to date.
qc-reports/envo-all-classes.txt: downloads/envo.db
	$(RUN) runoak --input $< descendants -p i entity > $@.unsorted
	sort -t '!' -k2,2 $@.unsorted > $@
	rm -f $@.unsorted
# Cross-check biosample triads against the EnvO class listings.
# Prerequisite positions map onto the command's inputs:
#   1 = all EnvO classes, 2 = biomes, 3 = biosample triad report, 4 = materials.
# Besides the target, the command writes qc-reports/problematic_triad_summary.yaml.
qc-reports/problematic_triads.tsv: \
    qc-reports/envo-all-classes.txt \
    qc-reports/envo-biomes.txt \
    qc-reports/biosample-triad-report.tsv \
    qc-reports/envo-environmental-materials.txt
	$(RUN) find-biosamples-with-problematic-triads \
		--all-envo-classes-file $< \
		--biomes-file $(word 2,$^) \
		--biosamples-file $(word 3,$^) \
		--materials-file $(lastword $^) \
		--output $@ \
		--output-summary qc-reports/problematic_triad_summary.yaml
# Diff nmdco.owl (left) against src/ontology/nmdco-classes.owl (right) with ROBOT.
assets/robot_diff.txt: nmdco.owl src/ontology/nmdco-classes.owl
	robot diff \
		--left $(firstword $^) \
		--right $(lastword $^) \
		--output $@
## this was a short term hack
#assets/parse_robot_diff.tsv: assets/robot_diff.txt
# $(RUN) python nmdc_ontology/parse_robot_diff.py \
# --input-file $< \
# --output-file $@
# Run the SPARQL query qc-sparql/report-cycles.rq against the mirrored EnvO import.
# The query file is a declared prerequisite so that editing it re-runs the report;
# $< is the ontology, $(word 2,$^) the query.
qc-reports/report-cycles.tsv: mirror/purl.obolibrary.org/obo/nmdco/imports/envo_import.owl qc-sparql/report-cycles.rq
	robot query \
		--input $< \
		--query $(word 2,$^) $@
# Run the SPARQL query qc-sparql/report-asserted-equivalencies.rq against the
# mirrored EnvO import.
# The query file is a declared prerequisite so that editing it re-runs the report;
# $< is the ontology, $(word 2,$^) the query.
qc-reports/report-asserted-equivalencies.tsv: mirror/purl.obolibrary.org/obo/nmdco/imports/envo_import.owl qc-sparql/report-asserted-equivalencies.rq
	robot query \
		--input $< \
		--query $(word 2,$^) $@
# Run the SPARQL query qc-sparql/report-unlabelled-classes.rq against nmdco.owl.
# The query file is a declared prerequisite so that editing it re-runs the report;
# $< is the ontology, $(word 2,$^) the query.
qc-reports/report-unlabelled-classes.tsv: nmdco.owl qc-sparql/report-unlabelled-classes.rq
	robot query \
		--input $< \
		--query $(word 2,$^) $@
# Build the list from the unlabelled-classes report: drop the TSV header row,
# strip the angle brackets, then merge with assets/additional-extracts.txt and
# de-duplicate.
# The scratch file is named after the target ($@.tmp) instead of a fixed literal
# name in the working directory, and is removed once the target is written.
# NOTE(review): assets/additional-extracts.txt is read but not declared as a
# prerequisite - confirm whether edits to it should trigger a rebuild.
src/ontology/imports/report-unlabelled-classes.txt: qc-reports/report-unlabelled-classes.tsv
	awk 'NR > 1 ' $< | tr -d '<>' > $@.tmp
	cat assets/additional-extracts.txt $@.tmp | sort -u > $@
	rm -f $@.tmp
###
#.PHONY: envo_mixs_all
#envo_mixs_all: envo_mixs_clean assets/mixs_environments_env_materials_subsets.yaml.txt
#
#.PHONY: envo_mixs_clean
#envo_mixs_clean: # todo update
# rm -rf assets/extension_report.yaml \
# assets/report_envo_environmental_material_annotations.tsv \
# assets/mixs_environments_env_materials_subsets.yaml.txt
# Run report-mixs-extensions on the MIxS class summary TSV fetched from GitHub.
# todo fix these non-ascii characters upstream in MIxS
assets/extension_report.yaml:
	$(RUN) report-mixs-extensions \
		--url https://raw.githubusercontent.com/GenomicsStandardsConsortium/mixs/main/assets/class_summary_results.tsv \
		--output-file $@
# Run report-envo-biome-annotations against $(SPARQL_ENDPOINT).
# The YAML report is the target; the TSV goes to a sibling file with the same
# basename. Previously both --tsv-output and --yaml-output pointed at the .yaml
# target, so the two outputs overwrote each other.
assets/report_envo_biome_annotations.yaml:
	$(RUN) report-envo-biome-annotations \
		--endpoint $(SPARQL_ENDPOINT) \
		--tsv-output $(basename $@).tsv \
		--yaml-output $@
# Run report-envo-environmental-material-annotations against $(SPARQL_ENDPOINT).
# The YAML report is the target; the TSV is written next to it with the same
# basename ($(basename $@) strips the .yaml suffix).
assets/report_envo_environmental_material_annotations.yaml:
	$(RUN) report-envo-environmental-material-annotations \
		--endpoint $(SPARQL_ENDPOINT) \
		--tsv-output $(basename $@).tsv \
		--yaml-output $@
# Ask Claude which EnvO biomes have no MIxS environment mapped to them.
# Inputs by prerequisite position: 1 = MIxS extension report, 2 = EnvO biome
# annotations, 3 = accepted biome mappings. The command's stdout becomes the
# target; `date` and `time` only log when the call ran and how long it took.
# NOTE(review): the target name speaks of environments without biome mappings,
# while the prompt asks for biomes without environment mappings - confirm the name.
assets/environments_with_no_biome_mappings.raw.yaml.txt: \
    assets/extension_report.yaml \
    assets/report_envo_biome_annotations.yaml \
    assets/biome_subsets_accepted.yaml
	date && time $(RUN) mixs-environments-to-envo-classes-by-claude \
		--mixs-file $< \
		--envo-file $(word 2,$^) \
		--mappings-file $(lastword $^) \
		--envo-description biomes \
		--temperature 0.1 --max-tokens 4096 \
		--model claude-3-opus-20240229 \
		--suffix "Please list any EnvO biomes that have not been mapped to any MIxS environment. Provide both the id and the label. Do not provide any introduction, commentary, summary or anything like that." > $@
# Ask Claude which MIxS environments have no EnvO biome mapped to them.
# Inputs by prerequisite position: 1 = MIxS extension report, 2 = EnvO biome
# annotations, 3 = accepted biome mappings. The command's stdout becomes the
# target; `date` and `time` only log when the call ran and how long it took.
assets/environments_with_no_biome_mappings.raw.txt: \
    assets/extension_report.yaml \
    assets/report_envo_biome_annotations.yaml \
    assets/biome_subsets_accepted.yaml
	date && time $(RUN) mixs-environments-to-envo-classes-by-claude \
		--mixs-file $< \
		--envo-file $(word 2,$^) \
		--mappings-file $(lastword $^) \
		--envo-description biomes \
		--temperature 0.1 --max-tokens 4096 \
		--model claude-3-opus-20240229 \
		--suffix "Please list any MIxS environments to which no EnvO biomes have been mapped. Do not provide any introduction, commentary, summary or anything like that." > $@
# Ask Claude to map environmental materials onto the BuiltEnvironment MIxS environment.
# Inputs by prerequisite position: 1 = MIxS extension report, 2 = EnvO biome
# annotations (passed as --envo-file), 3 = accepted material mappings. The
# command's stdout becomes the target; `date` and `time` only log the run.
# NOTE(review): --envo-file receives the biome annotation report although the
# description is "environmental materials" - confirm this is intended.
assets/material_environment_built_environment_mappings_raw.yaml.txt: \
    assets/extension_report.yaml \
    assets/report_envo_biome_annotations.yaml \
    assets/materials_subsets_accepted.yaml
	date && time $(RUN) mixs-environments-to-envo-classes-by-claude \
		--mixs-file $< \
		--envo-file $(word 2,$^) \
		--mappings-file $(lastword $^) \
		--envo-description "environmental materials" \
		--temperature 0.01 --max-tokens 4096 \
		--model claude-3-opus-20240229 \
		--suffix "Please generate an exhaustive mapping of environmental materials to the BuiltEnvironment MIxS environment, using the same YAML format. Do not provide any introduction, commentary, summary or anything like that." > $@
# --suffix "Please generate an exhaustive mapping of environmental materials to the MIxS environments that they could reasonably be found in, using the same YAML format. Do not provide any introduction, commentary, summary or anything like that. Do not perform mappings for any environmental materials or environments that are related to food, humans, hosts, or health." > $@
# Please do not repeat any of completed mappings.
# Please do not map any {envo_classes_description} to ENVO_00000428 'biome'.