-
Notifications
You must be signed in to change notification settings - Fork 1
/
config.py
85 lines (62 loc) · 3.15 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Folder configuration.
# Every path string below must end with a trailing "/".
# NOTE(review): presumably file names are appended by plain string
# concatenation elsewhere - confirm against the callers.
folder_dblp_xml = "./data/"
folder_content_xml = "./data/content_xml/"
folder_pdf = "./data/pdf/"
folder_log = "./data/logs/"
folder_datasets = "./data/datasets/"
folder_classifiers = "./data/classifiers/"
folder_pickle = "./data/pickle/"
folder_clusters = "./data/clusters/"
# MongoDB connection settings.
mongoDB_IP = "127.0.0.1"
# Default local port. Change this if you use SSH tunneling on your machine
# (likely 4321 or 27017).
mongoDB_Port = 27017
# Database name. Disabled alternative:
# mongoDB_db = "pub"
mongoDB_db = "TU_Delft_Library"

# PDF extraction: URL of the GROBID endpoint.
grobid_url = "http://127.0.0.1:8080"
# Conferences we like.
# Disabled alternative selection, kept for reference:
# book_titles = ['JCDL','SIGIR','ECDL','TPDL','TREC', 'ICWSM', 'ESWC', 'ICSR','WWW', 'ICSE', 'HRI', 'VLDB', 'ICRA', 'ICARCV']
evaluation_conferences = [
    "JCDL",
    "TPDL",
    "TREC",
    "ECDL",
    "ESWC",
    "ICWSM",
    "VLDB",
    "ACL",
    "WWW",
    "ICSE",
    "SIGIR",
]

# Coner feedback data exported from Firebase.
# NOTE(review): presumably data_date identifies which export to load - confirm.
data_date = "2018_05_28"
booktitles = ["test_no_conf"]
# Root of the project. Enable the variant that matches your machine.
# STANFORD_NER_PATH is derived from this value, so it is the only line that
# has to change when moving between machines.
ROOTPATH = '/data2/SmartPub-TSENER'
# ROOTPATH = 'C:/Users/mvall/PycharmProjects/SmartPub-TSENER'
# ROOTPATH = '/Users/daniel/Documents/TUDelftMasterThesis/coner_v2/SmartPub-TSENER'

# Stanford NER jar, located at stanford_files/stanford-ner.jar under ROOTPATH.
# Built from ROOTPATH (instead of repeating the literal) so the two settings
# cannot drift apart; rstrip guards against a trailing "/" on ROOTPATH
# producing a doubled separator.
STANFORD_NER_PATH = ROOTPATH.rstrip('/') + '/stanford_files/stanford-ner.jar'
# Journals we like (IEEE and Springer).
# Disabled alternative selection, kept for reference:
# journals = ['IEEE Trans. Robotics' , 'IEEE Trans. Robotics and Automation', 'IEEE J. Robotics and Automation']
journals = [
    "I. J. Robotics and Automation",
    "IEEE J. Biomedical and Health Informatics",
    "Journal of Intelligent and Robotic Systems",
]

# Relative source folders for PDF and XML files.
source = "data/pdf/"
source_xml = "data/xml/"
# Update process
# NOTE(review): the code that reads these switches is not part of this file;
# the descriptions below are inferred from the names - confirm against the
# consumer before relying on them.
# presumably: whether an existing DBLP XML file may be overwritten
overwriteDBLP_XML = False
# presumably: run the update immediately rather than on a schedule
updateNow = True
# presumably: schedule selectors for a daily / weekly check
checkDaily = False
checkWeekly = False
# Stage selectors. Each flag appears to restrict a run to a single stage;
# all are False here.
# Only PDF download
only_pdf_download = False
# Only text extraction
only_text_extraction = False
# Only classification and named entity extraction
only_classify_nee = False
####################### XML processing configurations #######################
# Set to True to persist to a local MongoDB (default connection).
storeToMongo = True

# Set to True to skip downloading EE entries (PDF URLs) that have been
# accessed before, whether successfully or unsuccessfully. This only works
# when storeToMongo is True, because MongoDB must be accessed for that check.
# (With storeToMongo set to False, MongoDB is assumed to be inactive or not
# there at all.)
skipPreviouslyAccessedURLs = True

# The categories you are interested in.
CATEGORIES = {
    "article",
    "inproceedings",
    "proceedings",
    "book",
    "incollection",
    "phdthesis",
    "mastersthesis",
    "www",
}

# The categories you are NOT interested in.
# NOTE(review): every entry here is also listed in CATEGORIES - confirm how
# the consumer combines the two sets.
SKIP_CATEGORIES = {
    "phdthesis",
    "mastersthesis",
    "www",
    "proceedings",
}

# The fields that each data item / Mongo entry should contain.
DATA_ITEMS = [
    "title",
    "booktitle",
    "year",
    "journal",
    "crossref",
    "ee",
    "license",
]

# NOTE(review): presumably progress-report intervals (every X downloads /
# every X XML loop iterations) - confirm against the consumer.
statusEveryXdownloads = 100
statusEveryXxmlLoops = 1000
###############################################################################