# config_win.yaml
---
# application main params
general:
  # Root directory under which the paths and files below are located.
  basedirectory: '.'
  locale: ''
  # The following directories and files are under "basedirectory".
  index: 'index'
  dbenv: 'db'
  # Log options; the log file is "pytextminer-log.txt".
  logsize: 1024000
  loglevel: 'debug'
  user: 'sessions'
  shared: 'shared'
  whitelist_directory: 'whitelists'
  source_file_directory: 'source_files'
  userstopwords: 'user_stopwords.csv'
  # Stopwords are under the "shared" directory.
  # Single quotes keep the Windows backslash literal (no escape processing).
  stopwords: 'stopwords\en.txt'
# extraction settings
datasets:
  # The values below can be document object fields:
  #   - either defined by one of the fields' values,
  #   - or one of the constant required fields: 'content' / 'label' / 'id'.
  doc_extraction:
    - 'title'
    - 'content'
    # - 'keywords'

  # Tina CSV column declarations.
  # Undeclared fields are ignored; a warning is issued for optional fields
  # that are not found.
  tinacsv:
    # doc_label names one of the keys of "fields"; if it is not found in the
    # file, the field specified by "label" is used instead.
    doc_label: 'title'
    fields:
      # required fields
      label: 'doc_id'
      content: 'abstract'
      corpus_id: 'corp_id'
      id: 'doc_id'
      # optional fields
      title: 'title'
      acronym: 'acronym'
      keywords: 'keywords'
    # CSV reader parameters
    encoding: 'utf_8'
    dialect: 'excel'

  # pubmed.gov "medline" file export
  medline:
    # doc_label names one of the keys of "fields"; if it is not found in the
    # file, the field specified by "label" is used instead.
    doc_label: 'title'
    fields:
      label: 'PMID'
      content: 'AB'
      corpus_id: 'DP'
      id: 'PMID'
      # optional fields
      title: 'TI'
    # period_size = number of leading characters taken from the publication
    # date field ("corpusField").
    period_size: 4
    encoding: 'ascii'

  # Archive of pubmed.gov "medline" files, organized like this:
  #   Medline/
  #     period1/period1.txt
  #     period2/period2.txt
  # medlinearchive:
  #   TODO

  # NOTE(review): the keys below are placed directly under "datasets"; the
  # nesting level was reconstructed -- confirm against the code that reads
  # this file.
  # extraction size
  ngramMin: 1
  ngramMax: 4
  # Tagger learning on 2 x training_tagger_size sentences of the tagged
  # NLTK corpus.
  training_tagger_size: null
  # Located under the "basedirectory" directory; delete this file to
  # regenerate a new one on the next start.
  tagger: 'shared\tagger.pickle'
  # Change this regular expression to change NGram extraction filtering.
  # NOTE(review): the "PREP.?" alternative has no trailing comma, unlike its
  # sibling alternatives -- confirm this is intended.
  postag_valid: '^((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?(CD.,)??)+?((PREP.?|DET.?,|IN.?,|CC.?,|\?,)((VBD,|VBG,|VBN,|CD.?,|JJ.?,|\?,){0,2}?(N.?.?,|\?,)+?)+?)*?$'
datamining:
  template: 'shared/gexf/gexf.default.template'
  # Default values used if no params are passed to the gexf exporter.
  # NOTE(review): 'inf' is a quoted string here, not a YAML float (.inf);
  # presumably the consumer converts it -- verify before changing.
  DocumentGraph:
    edgethreshold:
      - 0.0001
      - 'inf'
    nodethreshold:
      - 1.00
      - 'inf'
    # proximity: 'sharedNGrams'
    proximity: 'logJaccard'
    maxdegree: 100
  NGramGraph:
    edgethreshold:
      - 0.0001
      - 'inf'
    nodethreshold:
      - 1.00
      - 'inf'
    alpha: 0.10
    # hapax: 1
    proximity: 'Cooccurrences'
    # proximity: 'EquivalenceIndex'
    # proximity: 'PseudoInclusion'
indexer:
  # presumably a minimum co-occurrence threshold -- TODO confirm with the
  # indexer code.
  minCooc: 10