-
Notifications
You must be signed in to change notification settings - Fork 1
/
dvc.lock
338 lines (338 loc) · 10.4 KB
/
dvc.lock
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
schema: '2.0'
stages:
scrap-oecd:
cmd: python scripts/scrap_oecd_ai_database.py
deps:
- path: scripts/scrap_oecd_ai_database.py
md5: bbde9a137353aab423e627b54f192476
size: 1878
outs:
- path: data/oecd_docs/meta.csv
md5: 6b29959be888533946a5a9836caa9c05
size: 182747
- path: data/oecd_docs/raw/
md5: e95798bc3b2ee45a42e325309b826bd5.dir
size: 203091915
nfiles: 84
scrap-nesta:
cmd: python scripts/scrap_nesta_ai_database.py
deps:
- path: scripts/scrap_nesta_ai_database.py
md5: 3c7cf1ff0a7a4d0dea0fe5fa3d92767c
size: 2260
outs:
- path: data/nesta_ai_governance_docs/meta.csv
md5: 3dc67b24eab575a380194ea8ffb2e80f
size: 65235
- path: data/nesta_ai_governance_docs/raw/
md5: 881135d944d83c90d70503e80854c7a8.dir
size: 229589096
nfiles: 102
parse-text-oecd:
cmd: python scripts/parse_docs_to_text.py oecd
deps:
- path: data/oecd_docs/raw/
md5: e95798bc3b2ee45a42e325309b826bd5.dir
size: 203091915
nfiles: 84
- path: scripts/parse_docs_to_text.py
md5: fdf241e4640e53c8d36081a7964eda80
size: 1046
outs:
- path: data/oecd_docs/texts/
md5: c5431d9156654c02670da8abbbe8db84.dir
size: 13457505
nfiles: 84
parse-text-nesta:
cmd: python scripts/parse_docs_to_text.py nesta
deps:
- path: data/nesta_ai_governance_docs/raw/
md5: 881135d944d83c90d70503e80854c7a8.dir
size: 229589096
nfiles: 102
- path: scripts/parse_docs_to_text.py
md5: fdf241e4640e53c8d36081a7964eda80
size: 1046
outs:
- path: data/nesta_ai_governance_docs/texts/
md5: bdac8556ea86146fc38c20a41bc35935.dir
size: 9142758
nfiles: 102
embedd-legal-docs:
cmd: python scripts/calculate_docs_embeddings.py
deps:
- path: data/nesta_ai_governance_docs/texts/
md5: bdac8556ea86146fc38c20a41bc35935.dir
size: 9142758
nfiles: 102
- path: data/oecd_docs/texts/
md5: c5431d9156654c02670da8abbbe8db84.dir
size: 13457505
nfiles: 84
- path: scripts/calculate_docs_embeddings.py
md5: f70fd78bed435affdbeddb04198e5b73
size: 682
outs:
- path: data/processed/docs-bert-embeddings.joblib
md5: f76a3278b1fb75110bb8079daeb741f8
size: 637243
download-arxiv-dump:
cmd: python scripts/download_arxiv_dump.py --download-sources --download-pdfs
deps:
- path: data/arxiv_dump/arxiv_categories.txt
md5: 6636f37f222ea6f6547b0a7200b487ec
size: 1057
- path: mair/arxiv_dump/keywords.py
md5: 25e2f9c96594facaef4c3af0d402c3da
size: 1385
- path: scripts/download_arxiv_dump.py
md5: 8b985181a44373ea77ea20c01dd01abd
size: 6478
outs:
- path: data/arxiv_dump/papers
md5: d19dfe4c05101c1931cfcbff9110364a.dir
size: 1119198961
nfiles: 742
- path: data/arxiv_dump/search_results.json
md5: 998d3a8a9340cc3885883b2a028ce049
size: 1676155
- path: data/arxiv_dump/sources
md5: f15be23a5b55f4a749a562fae8aa4042.dir
size: 1959808118
nfiles: 742
download-s2orc-meta:
cmd: python scripts/download_s2orc_meta.py
deps:
- path: mair/s2orc_links.py
md5: 4a957ba3960cf3fd5c24e1bd17d8c330
size: 42832
- path: scripts/download_s2orc_meta.py
md5: 765a7c1b765b3daa7d3423c6c3766f51
size: 2371
outs:
- path: data/s2orc/metadata/comp_sci/
md5: aebab75f488d845e9d610fabf91d2f17.dir
size: 19637376594
nfiles: 100
filter-ai-s2orc:
cmd: python scripts/filter_ai_s2orc.py
deps:
- path: data/s2orc/metadata/comp_sci/
md5: aebab75f488d845e9d610fabf91d2f17.dir
size: 19637376594
nfiles: 100
- path: scripts/filter_ai_s2orc.py
md5: ea05646cd5cfe3327b0bc3470a372ed2
size: 1958
outs:
- path: data/s2orc/ai_papers_meta.csv
md5: 5db86603f07f695aaec261639a1b57ef
size: 978702792
download-full-ai-texts-s2orc:
cmd: python scripts/download_full_text_s2orc.py
deps:
- path: data/s2orc/ai_papers_meta.csv
md5: 5db86603f07f695aaec261639a1b57ef
size: 978702792
- path: scripts/download_full_text_s2orc.py
md5: a539413dd1f10c820e2ef84faa9d174c
size: 1959
outs:
- path: data/s2orc/pdf_parses/ai/
md5: 05598a81abbe366313bbcb3308a71fb8.dir
size: 0
nfiles: 100
extract-text-nesta:
cmd: python scripts/parse_docs_to_text.py nesta
deps:
- path: data/nesta_ai_governance_docs/raw/
md5: 881135d944d83c90d70503e80854c7a8.dir
size: 229589096
nfiles: 102
- path: scripts/parse_docs_to_text.py
md5: fdf241e4640e53c8d36081a7964eda80
size: 1046
outs:
- path: data/nesta_ai_governance_docs/texts/
md5: 2916be574866433731ad51f66b39b47b.dir
size: 8821922
nfiles: 102
extract-text-oecd:
cmd: python scripts/parse_docs_to_text.py oecd
deps:
- path: data/oecd_docs/raw/
md5: e95798bc3b2ee45a42e325309b826bd5.dir
size: 203091915
nfiles: 84
- path: scripts/parse_docs_to_text.py
md5: fdf241e4640e53c8d36081a7964eda80
size: 1046
outs:
- path: data/oecd_docs/texts/
md5: 5a5f576150194af1d1e5230ff632843e.dir
size: 12574153
nfiles: 84
parse-trees-legal-docs:
cmd: python scripts/process_legal_docs_spacy.py
deps:
- path: data/nesta_ai_governance_docs/texts/
md5: 2916be574866433731ad51f66b39b47b.dir
size: 8821922
nfiles: 102
- path: data/oecd_docs/texts/
md5: 5a5f576150194af1d1e5230ff632843e.dir
size: 12574153
nfiles: 84
- path: scripts/process_legal_docs_spacy.py
md5: 6ddececb4507f12208229c9306ef4373
size: 534
outs:
- path: data/processed/intermediate/parsed_legal_texts.joblib
md5: 5d82d6956618a73c986af74f7d06b1f9
size: 1895689799
parse-legal-docs:
cmd: python scripts/process_legal_docs_spacy.py
deps:
- path: data/nesta_ai_governance_docs/texts/
md5: 2916be574866433731ad51f66b39b47b.dir
size: 8821922
nfiles: 102
- path: data/oecd_docs/texts/
md5: 5a5f576150194af1d1e5230ff632843e.dir
size: 12574153
nfiles: 84
- path: scripts/process_legal_docs_spacy.py
md5: 953b3d25a47cf70424a2bfc898ebd5df
size: 547
outs:
- path: data/processed/intermediate/parsed_legal_texts.joblib
md5: b38a6be2cd7a7c7e3a5d2040e1bb2ef5
size: 2204971189
extract-texts-arxiv:
cmd: python scripts/parse_arxiv_pdfs.py
deps:
- path: data/arxiv_dump/papers
md5: d19dfe4c05101c1931cfcbff9110364a.dir
size: 1119198961
nfiles: 742
- path: scripts/parse_arxiv_pdfs.py
md5: 8f6ba6b860b83a70968fb1f18d999e0d
size: 757
outs:
- path: data/arxiv_dump/raw_extracted_texts
md5: cfcbbb7916819c31c09ad98ce2325cc0.dir
size: 32439833
nfiles: 523
analyse-deontic-sentences:
cmd: python scripts/analyse_deontics_sentences.py
deps:
- path: data/policy_docs_all/metadata.csv
md5: 2ac7c7c472376ace6c463a29c7d009de
size: 290579
- path: data/processed/intermediate/parsed_legal_texts.joblib
md5: b38a6be2cd7a7c7e3a5d2040e1bb2ef5
size: 2204971189
- path: scripts/analyse_deontics_sentences.py
md5: 697464aa5a732fd4034eae90d4a79ef1
size: 1261
outs:
- path: data/processed/deontics.csv
md5: 138839c0659f7d917db9d18a916883e3
size: 15591758
clean-arxiv-texts:
cmd: python scripts/clean_raw_papers_text.py
deps:
- path: data/arxiv_dump/raw_extracted_texts
md5: cfcbbb7916819c31c09ad98ce2325cc0.dir
size: 32439833
nfiles: 523
- path: scripts/clean_raw_papers_text.py
md5: 9befa87c7d64aca8a431d077fb0eb635
size: 1368
outs:
- path: data/arxiv_dump/cleaned_texts
md5: d0f56f3df0999c638f1da03b39fdc1d8.dir
size: 25132499
nfiles: 517
parse-arxiv-papers:
cmd: python scripts/process_arxiv_spacy.py
deps:
- path: data/arxiv_dump/cleaned_texts
md5: d0f56f3df0999c638f1da03b39fdc1d8.dir
size: 25132499
nfiles: 517
- path: scripts/process_arxiv_spacy.py
md5: 57f373f3132fb95c36614e6dc9fb7b01
size: 563
outs:
- path: data/processed/intermediate/parsed_arxiv_papers.joblib
md5: 6a56b89bd8d29219932d4a3dff215944
size: 2733274491
unpack-sources-arxiv:
cmd: scripts/unpack_sources.sh
deps:
- path: data/arxiv_dump/sources
md5: f15be23a5b55f4a749a562fae8aa4042.dir
size: 1959808118
nfiles: 742
- path: scripts/unpack_sources.sh
md5: cc3af9d45182eececbc437b0cf713e9d
size: 249
outs:
- path: data/arxiv_dump/unpacked_sources
md5: fd5ec3c0c29649231aa369a83168db10.dir
size: 2439008640
nfiles: 18263
extract-affiliations:
cmd: python scripts/extract_affiliations.py
deps:
- path: data/arxiv_dump/unpacked_sources
md5: fd5ec3c0c29649231aa369a83168db10.dir
size: 2439008640
nfiles: 18263
- path: mair/affiliations_extraction.py
md5: eadd2c21f0be6fe8b5f3f3117068e018
size: 702
- path: scripts/extract_affiliations.py
md5: a59d954cf943e7a0e97cebb40766f5c4
size: 5380
outs:
- path: data/arxiv_dump/affiliations.json
md5: 7550466eac264af911bc70e75729b552
size: 83442
merge_metadata:
cmd: python scripts/merge_metadata.py
deps:
- path: data/golden_standard/policy_docs_categories.csv
md5: 79976e11144a64c686f0ae1b545cb4c4
size: 19984
- path: data/nesta_ai_governance_docs/meta.csv
md5: 3dc67b24eab575a380194ea8ffb2e80f
size: 65235
- path: data/oecd_docs/meta.csv
md5: 6b29959be888533946a5a9836caa9c05
size: 182747
- path: scripts/merge_metadata.py
md5: def6540ed8998509321ff8799891fc8d
size: 1341
outs:
- path: data/policy_docs_all/metadata.csv
md5: 2ac7c7c472376ace6c463a29c7d009de
size: 290579
build-citation-graph:
cmd: python scripts/build_citations_graph.py
deps:
- path: data/arxiv_dump/unpacked_sources
md5: fd5ec3c0c29649231aa369a83168db10.dir
size: 2439008640
nfiles: 18263
- path: scripts/build_citations_graph.py
md5: 94b4304d319515025e2824d985fed2a0
size: 2263
outs:
- path: data/arxiv_dump/citations_graph.json
md5: e77340eacb21996e7a10bc30d71369bc
size: 239592
- path: data/arxiv_dump/semantic_scholar.json
md5: 79504bfa04bfc1d6770f9aa3dbb2a0f9
size: 53709992