-
Notifications
You must be signed in to change notification settings - Fork 33
/
readCatHier.py
84 lines (62 loc) · 1.7 KB
/
readCatHier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/python
'''
Copyright (C) 2010 Cagatay Calli <[email protected]>
Reads cat_hier file and produces extended stop category list.
Input format: <cat id> <list of immediate descendants cat ids>
USAGE: readCatHier.py <cat_hier output file path>
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
'''
import sys
import re
# Wikiprep dump flavour (see the module docstring).
# NOTE(review): nothing in the visible script reads FORMAT - confirm whether
# it is still needed.
FORMAT = 'Gabrilovich'

# Seed stop categories, read from 'wiki_stop_categories.txt' in the current
# working directory. Every non-blank line must consist of exactly two
# tab-separated fields; the first is the integer category id (the second is
# presumably the category name and is not used).
STOP_CATS = []
try:
    # 'with' closes the file even when a malformed line aborts the load.
    with open('wiki_stop_categories.txt', 'r') as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) carries no category.
                continue
            # Unpacking into two names keeps the "exactly one tab" validation.
            strId, strCat = line.split('\t')
            STOP_CATS.append(int(strId))
except (IOError, ValueError):
    # IOError: the file is missing or unreadable. ValueError: a line does not
    # have exactly two fields or its id is not an integer. Anything else
    # (KeyboardInterrupt, SystemExit, programming errors) is left to propagate.
    print('Stop categories cannot be read! Please put "wiki_stop_categories.txt" file containing stop categories in this folder.')
    sys.exit(1)
# Maps a category id to the list of its immediate descendant category ids,
# as read from the cat_hier file named by the first command-line argument.
catDict = {}

args = sys.argv[1:]
if not args:
    # Say why we are stopping instead of exiting silently.
    sys.stderr.write('USAGE: %s <cat_hier output file path>\n' % sys.argv[0])
    sys.exit(1)

# 'with' closes the cat_hier file even if a malformed id raises ValueError.
with open(args[0], 'r') as f:
    # The first three lines are skipped unparsed (presumably a header -
    # TODO confirm). next() with a default tolerates shorter files and keeps
    # all reads on the iterator protocol.
    for _ in range(3):
        next(f, None)

    # Stream line by line rather than loading the whole file with readlines().
    for line in f:
        parts = line.split('\t', 1)
        if len(parts) != 2:
            # No tab separator: not a "<cat id>\t<child ids>" record.
            continue
        parent = int(parts[0])
        # split() without arguments never yields empty tokens, so each one
        # can be converted directly.
        catDict[parent] = [int(c) for c in parts[1].split()]

print('cat_hier output complete')
print('traversing category tree..')

# Worklist expansion: starting from the seed stop categories, collect every
# category reachable through catDict's parent -> children lists.
cats = set(STOP_CATS)      # categories whose children are still to be visited
outcats = set(STOP_CATS)   # every category reached so far (the final result)

while cats:
    parent = cats.pop()
    # dict.get replaces the deprecated has_key(); a category without an entry
    # in catDict simply has no children to visit.
    for c in catDict.get(parent, ()):
        # Queue a child only the first time it is seen, so cycles and
        # repeated children cannot keep the loop running forever.
        if c not in outcats:
            cats.add(c)
            outcats.add(c)
# Write the extended stop category list to 'ecat.txt' in the current working
# directory, one category id per line, in the set's iteration order.
# 'with' guarantees the file is flushed and closed even if a write fails.
with open('ecat.txt', 'w') as f:
    f.writelines(str(c) + '\n' for c in outcats)