-
Notifications
You must be signed in to change notification settings - Fork 270
/
generic-spark.extraction.minidump.properties
128 lines (74 loc) · 4.3 KB
/
generic-spark.extraction.minidump.properties
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# download and extraction target dir
# The default base-dir is defined in $extraction-framework/core/src/main/resources/universal.properties; it is overridden below for this test.
# Working directories for this run; all are placed under ./target/minidumptest.
base-dir=./target/minidumptest/base
log-dir=./target/minidumptest/log
spark-local-dir=./target/minidumptest/spark-local
# Spark master URL. NOTE(review): local[*] presumably runs Spark in-process on all available cores - confirm against the Spark docs.
spark-master=local[*]
# Source file. If source file name ends with .gz or .bz2, it is unzipped on the fly.
# Must exist in the directory xxwiki/yyyymmdd and have the prefix xxwiki-yyyymmdd-
# where xx is the wiki code and yyyymmdd is the dump date.
# The default for 'source' has moved to $extraction-framework/core/src/main/resources/universal.properties
# use only directories that contain a 'download-complete' file? Default is false.
require-download-complete=false
# List of languages or article count ranges, e.g. 'en,de,fr' or '10000-20000' or '10000-', or '@mappings'
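# NOTE(review): an earlier note here said 'en' was excluded as too big for local[32], but 'en' is in the active list below - confirm which is intended.
# TODO: languages currently working: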
languages=en,fr,nl,ro,it,de,cs
# TODO include bpy, cdo, eml, mhr, xmf, wa, wuu in wikidump; they are in marvin-config
#languages=en,af,als,am,an,ar,arz,ast,azb,az,ba,bar,bat-smg,be,bg,bn,br,bs,bug,ca,ceb,ce,ckb,cs,cv,cy,da,de,el,eo,es,et,eu,fa,fi,fo,fr,fy,ga,gd,gl,gu,he,hi,hr,hsb,ht,hu,hy,ia,id,ilo,io,is,it,ja,jv,ka,kk,kn,ko,ku,ky,la,lb,li,lmo,lt,lv,mai,mg,min,mk,ml,mn,mrj,mr,ms,my,mzn,nap,nds,ne,new,nl,nn,no,oc,or,os,pa,pl,pms,pnb,pt,qu,ro,ru,sah,sa,scn,sco,sd,sh,si,simple,sk,sl,sq,sr,su,sv,sw,ta,te,tg,th,tl,tr,tt,uk,ur,uz,vec,vi,vo,war,yi,yo,zh,zh-min-nan,zh-yue
# extractor class names starting with "." are prefixed by "org.dbpedia.extraction.mappings"
#extractors=.ProvenanceExtractor
# Extractor list without a language suffix.
# NOTE(review): presumably applied to every language, with the extractors.<lang> entries below adding to it - confirm against the config loader.
extractors=.ArticleCategoriesExtractor,.ArticlePageExtractor,.ArticleTemplatesExtractor,.CategoryLabelExtractor,\
.ExternalLinksExtractor,.GeoExtractor,.InfoboxExtractor,.InterLanguageLinksExtractor,.LabelExtractor,.PageIdExtractor,\
.PageLinksExtractor,.RedirectExtractor,.RevisionIdExtractor,.ProvenanceExtractor,.SkosCategoriesExtractor,\
.WikiPageLengthExtractor,.WikiPageOutDegreeExtractor
extractors.ar=.TopicalConceptsExtractor
extractors.be=
extractors.bg=
extractors.bn=
extractors.ca=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
# NOTE(review): 'ced' does not appear in the language lists above ('ceb' and 'ce' do) - possibly a typo; confirm.
extractors.ced=
extractors.commons=.ContributorExtractor,.TemplateParameterExtractor,.FileTypeExtractor,.GalleryExtractor,.ImageAnnotationExtractor,.CommonsKMLExtractor,.DBpediaResourceExtractor
extractors.cs=.DisambiguationExtractor
extractors.cy=
extractors.da=
extractors.de=.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.CommonsResourceExtractor
extractors.el=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
extractors.en=.CitationExtractor,.DisambiguationExtractor,.HomepageExtractor,.PersondataExtractor,.PndExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor
extractors.eo=
# Per-language extractor lists are comma-separated, with no leading or trailing separator.
extractors.es=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor
extractors.et=
extractors.eu=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
extractors.fa=
extractors.fi=
extractors.fr=.DisambiguationExtractor,.HomepageExtractor,.PndExtractor,.TopicalConceptsExtractor,.fr.PopulationExtractor,.CommonsResourceExtractor
extractors.ga=.HomepageExtractor
extractors.gl=
extractors.hi=
extractors.hr=
extractors.hu=
extractors.id=
extractors.it=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
extractors.ja=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor
extractors.ko=.DisambiguationExtractor
extractors.lt=
extractors.lv=
extractors.nl=.DisambiguationExtractor,.CommonsResourceExtractor
extractors.mk=
extractors.mt=
extractors.pl=.DisambiguationExtractor,.HomepageExtractor
extractors.pt=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor,.CommonsResourceExtractor
extractors.ru=.DisambiguationExtractor,.HomepageExtractor,.TopicalConceptsExtractor
extractors.sk=
extractors.sl=
extractors.sr=
extractors.tr=
extractors.ur=
extractors.vi=
extractors.war=
# Only the raw extractor here: all other wikidata extractors are executed in a separate extraction for wikidata (see: extraction.wikidata.properties)
#extractors.wikidata=.WikidataSameAsExtractor,.WikidataRawExtractor
extractors.zh=
# Set this to true to exclude non-free images from this extraction.
copyrightCheck=false