forked from komoot/photon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_xml.py
74 lines (51 loc) · 1.79 KB
/
convert_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Script that converts a Solr XML dump into a line-delimited JSON file, used to try out other approaches.
import json

# ``xml.etree.cElementTree`` was deprecated in Python 3.3 and removed in
# Python 3.9.  Fall back to ``xml.etree.ElementTree`` (which picks up the C
# accelerator on its own in Python 3) so the script still starts on current
# interpreters.
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et

# Path of the Solr XML file to convert.
# Alternative input: '/home/photon/data/solr-131012.xml'
input_xml = 'sample_data/iceland.solr.xml'

# Base field names that may also appear with a language suffix
# (for example ``name_de``).
I18N_FIELDS = "name", "city", "country", "places"

# Language codes used to build the suffixed field names.
LANGUAGES = ["de", "en", "fr", "it", 'es']

# Every internationalised field name: the bare names first, followed by each
# ``<field>_<lang>`` combination, grouped by field.
int_fields = list(I18N_FIELDS)
for field in I18N_FIELDS:
    int_fields += [field + "_" + lang for lang in LANGUAGES]
def nested_keys(field_name):
    """Split a flat field name into the key path used in the output document.

    The name is split on underscores.  A name without any underscore gets
    ``'default'`` appended as its second component, and a leading ``'places'``
    component is renamed to ``'context'``.

    Returns a list of path components, e.g. ``'name_de'`` -> ``['name', 'de']``
    and ``'places'`` -> ``['context', 'default']``.
    """
    parts = field_name.split("_")
    head = 'context' if parts[0] == 'places' else parts[0]
    # An empty remainder means the name carried no suffix at all.
    rest = parts[1:] or ['default']
    return [head] + rest
# When true, only documents whose default country name is "Deutschland" are
# written to the output file.
ONLY_GERMANY = True

# Line written before every exported document.
# NOTE(review): looks like an Elasticsearch bulk-API action line - confirm.
INSERT_LINE = """{ "index" : { "_index" : "photon", "_type" : "place"} }\n"""

# Stream the input; with the default settings iterparse hands out each element
# once its closing tag has been read.
xml_tree = et.iterparse(input_xml)
output_xml = input_xml.replace('.xml', '.json')
count = 0

with open(output_xml, 'w') as f:
    doc = {}
    for event, elem in xml_tree:
        if elem.tag == 'doc':
            # The document is complete: decide whether to export it, then
            # start collecting the next one.
            skipped = ONLY_GERMANY and doc.get('country', {}).get('default') != "Deutschland"
            if not skipped:
                count += 1
                if count % 1000 == 0:
                    print("progress: {:,}".format(count))

                f.write(INSERT_LINE)
                json.dump(doc, f)
                f.write("\n")

            doc = {}
            # Drop the parsed content of the finished document element.
            elem.clear()
            continue

        if elem.tag != 'field':
            continue

        attr = elem.attrib['name']
        if attr not in int_fields:
            # Plain field: stored flat, with the ranking converted to an int.
            doc[attr] = int(elem.text) if attr == "ranking" else elem.text
            continue

        # Internationalised field: stored as a nested {variant: text} mapping.
        key, sub_key = nested_keys(attr)
        variants = doc.setdefault(key, {})
        # "es" values are not exported, but the (possibly empty) mapping for
        # the field is still created above.
        if sub_key != "es":
            variants[sub_key] = elem.text

    f.write("\n")