-
Notifications
You must be signed in to change notification settings - Fork 4
/
dvc.yaml
135 lines (124 loc) · 5.6 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# DVC pipeline definition. Each stage declares the command to run (cmd), the
# files whose changes invalidate the stage (deps / params), and the files the
# stage produces (outs / metrics). Every file a command reads should appear in
# deps so that DVC can order the stages and detect stale results.
stages:
  # Runs create_inclusion_list.py with data/raw/desc2021.xml and the manual
  # exclusion CSV as arguments; the stage output is descriptors_to_use.csv.
  create_inclusion_list:
    cmd: >-
      python grants_tagger/create_inclusion_list.py
      data/raw/desc2021.xml
      data/processed/descriptors_not_to_use_manual.csv
      data/processed/descriptors_to_use.csv
    deps:
      - data/raw/desc2021.xml
      - grants_tagger/create_inclusion_list.py
      # NOTE(review): descriptors_not_to_use_manual.csv is passed to the script
      # and is presumably a hand-maintained input - add it here once confirmed.
    outs:
      - data/processed/descriptors_to_use.csv

  # Preprocesses the raw BioASQ MeSH dump into train/test JSONL files plus a
  # label binarizer, restricted to the tags listed in descriptors_to_use.csv.
  preprocess_bioasq_mesh:
    cmd: >-
      grants_tagger preprocess bioasq-mesh
      data/raw/allMeSH_2021.json
      data/processed/train_mesh2021.jsonl
      models/xlinear/label_binarizer.pkl
      --test-split 0.01
      --test-output-path data/processed/test_mesh2021.jsonl
      --mesh-tags-path data/processed/descriptors_to_use.csv
    deps:
      - data/raw/allMeSH_2021.json
      - grants_tagger/preprocess_mesh.py
      # Passed via --mesh-tags-path; listing it links this stage to
      # create_inclusion_list in the DVC graph.
      - data/processed/descriptors_to_use.csv
    outs:
      - data/processed/train_mesh2021.jsonl
      - data/processed/test_mesh2021.jsonl
      - models/xlinear/label_binarizer.pkl

  # Creates toy data (--n-max 1000) to help iterating/testing code.
  preprocess_bioasq_mesh_toy:
    cmd: >-
      grants_tagger preprocess bioasq-mesh
      data/raw/allMeSH_2021.json
      data/processed/train_mesh2021_toy.jsonl
      models/xlinear-toy/label_binarizer_toy.pkl
      --test-split 0.01
      --test-output-path data/processed/test_mesh2021_toy.jsonl
      --n-max 1000
    deps:
      # data/processed/wt_tags_used.csv used to be listed here, but the command
      # above never references it, so it is no longer a dependency.
      - data/raw/allMeSH_2021.json
      - grants_tagger/preprocess_mesh.py
    outs:
      - data/processed/train_mesh2021_toy.jsonl
      - data/processed/test_mesh2021_toy.jsonl
      - models/xlinear-toy/label_binarizer_toy.pkl

  # Trains the mesh-xlinear model on the full preprocessed training set.
  # NOTE(review): the file given to --train-info is not declared as an
  # out/metric - confirm whether it should be tracked.
  train_mesh_xlinear:
    cmd: >-
      grants_tagger train
      data/processed/train_mesh2021.jsonl
      models/xlinear/label_binarizer.pkl
      models/xlinear/model
      --approach mesh-xlinear
      --sparse-labels
      --train-info results/mesh_xlinear_train_info.json
      --slim
    deps:
      - data/processed/train_mesh2021.jsonl
      # Passed on the command line and produced by preprocess_bioasq_mesh.
      - models/xlinear/label_binarizer.pkl
      - grants_tagger/train.py
      - grants_tagger/slim/mesh_xlinear.py
    params:
      - train.mesh-xlinear.config
    outs:
      - models/xlinear/model

  # Trains the mesh-xlinear model on the toy training set.
  train_mesh_xlinear_toy:
    cmd: >-
      grants_tagger train
      data/processed/train_mesh2021_toy.jsonl
      models/xlinear-toy/label_binarizer_toy.pkl
      models/xlinear-toy/model
      --approach mesh-xlinear
      --sparse-labels
      --train-info results/mesh_xlinear_train_info_toy.json
      --slim
    deps:
      - data/processed/train_mesh2021_toy.jsonl
      # Passed on the command line and produced by preprocess_bioasq_mesh_toy.
      - models/xlinear-toy/label_binarizer_toy.pkl
      - grants_tagger/train.py
      - grants_tagger/slim/mesh_xlinear.py
    params:
      - train.mesh-xlinear.config
    outs:
      - models/xlinear-toy/model

  # Evaluates the trained model on the held-out MeSH test split.
  # NOTE(review): the file given to --full-report-path is not declared as an
  # out - confirm whether it should be tracked.
  evaluate_mesh_xlinear:
    cmd: >-
      grants_tagger evaluate model mesh-xlinear
      models/xlinear/model
      data/processed/test_mesh2021.jsonl
      models/xlinear/label_binarizer.pkl
      --results-path results/mesh_xlinear.json
      --full-report-path results/mesh_xlinear_full_report.json
      --no-split-data
    deps:
      - grants_tagger/evaluate_model.py
      # Test split passed on the command line; produced by
      # preprocess_bioasq_mesh.
      - data/processed/test_mesh2021.jsonl
      - models/xlinear/model
      - models/xlinear/label_binarizer.pkl
    metrics:
      - results/mesh_xlinear.json:
          cache: false

  # Runs filter_mesh_tags.py over data/raw/desc2021.xml; the stage output is
  # mesh_disease_tags.csv.
  filter_mesh_tags:
    cmd: >-
      python grants_tagger/filter_mesh_tags.py
      data/raw/desc2021.xml
      data/processed/mesh_disease_tags.csv
    deps:
      - data/raw/desc2021.xml
      - grants_tagger/filter_mesh_tags.py
    outs:
      - data/processed/mesh_disease_tags.csv

  # Evaluates the trained model on the validation grants spreadsheet, using
  # the tag list produced by filter_mesh_tags.
  evaluate_mesh_xlinear_on_grants:
    cmd: >-
      grants_tagger evaluate grants mesh-xlinear
      models/xlinear/model
      data/raw/disease_tags_validation_grants.xlsx
      models/xlinear/label_binarizer.pkl
      --results-path results/mesh_xlinear_on_grants.json
      --mesh-tags-path data/processed/mesh_disease_tags.csv
    deps:
      - data/raw/disease_tags_validation_grants.xlsx
      - data/processed/mesh_disease_tags.csv
      - grants_tagger/evaluate_mesh_on_grants.py
      - models/xlinear/model
      - models/xlinear/label_binarizer.pkl
    metrics:
      - results/mesh_xlinear_on_grants.json:
          cache: false
#####
# Uncomment the section below to train a model that only predicts the MeSH tags
# already used by Wellcome (approximately 85% of all MeSH tags). This improves accuracy,
# but the model will not recognise MeSH tags that have not yet appeared in Wellcome's portfolio.
#####
# preprocess_bioasq_mesh_wt_only: # only train on tags that are used in Wellcome's active portfolio
# cmd: grants_tagger preprocess bioasq-mesh data/raw/allMeSH_2021.json data/processed/train_mesh2021_wt.jsonl
# models/xlinear-wt/label_binarizer_wt.pkl --test-split 0.01 --test-output-path data/processed/test_mesh2021_wt.jsonl
# --mesh-tags-path data/processed/wt_tags_used.csv
# deps:
# - data/raw/allMeSH_2021.json
# - grants_tagger/preprocess_mesh.py
# - data/processed/wt_tags_used.csv
# outs:
# - data/processed/train_mesh2021_wt.jsonl
# - data/processed/test_mesh2021_wt.jsonl
# - models/xlinear-wt/label_binarizer_wt.pkl
# train_mesh_xlinear_wt_only:
# cmd: grants_tagger train data/processed/train_mesh2021_wt.jsonl models/xlinear-wt/label_binarizer_wt.pkl models/xlinear-wt/model
# --approach mesh-xlinear --sparse-labels --train-info results/mesh_xlinear_train_info_wt.json --slim
# deps:
# - data/processed/train_mesh2021_wt.jsonl
# - grants_tagger/train.py
# - grants_tagger/slim/mesh_xlinear.py
# params:
# - train.mesh-xlinear.config
# outs:
# - models/xlinear-wt/model
# evaluate_mesh_xlinear_wt_only:
# cmd: grants_tagger evaluate model mesh-xlinear models/xlinear-wt/model
# data/processed/test_mesh2021_wt.jsonl models/xlinear-wt/label_binarizer_wt.pkl
# --results-path results/mesh_xlinear_wt_only.json
# --full-report-path results/mesh_xlinear_full_report_wt_only.json --no-split-data
# deps:
# - grants_tagger/evaluate_model.py
# - models/xlinear-wt/model
# - models/xlinear-wt/label_binarizer_wt.pkl
# metrics:
# - results/mesh_xlinear_wt_only.json:
# cache: false