forked from mediawiki-utilities/python-mwxml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_xml.py
154 lines (113 loc) · 4.41 KB
/
parse_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import sys
from xml.etree import ElementTree
from more_itertools import peekable
def strip_tag(tag):
    """Return the local part of an element tag, without its XML namespace.

    Namespaced tags arrive in the form ``"{namespace-uri}local"``; everything
    up to and including the last ``"}"`` is dropped.  A tag that carries no
    namespace is returned unchanged.

    Args:
        tag: The tag string of an element (e.g. ``elem.tag``).

    Returns:
        The tag name with any leading ``{...}`` namespace removed.
    """
    # rpartition yields the whole string as the tail when "}" is absent, so
    # un-namespaced tags no longer raise IndexError.
    return tag.rpartition("}")[2]
class Dump:
    """A parsed XML dump: its ``siteinfo`` header plus an iterable of pages.

    Attributes are stored in ``__slots__``:
        siteinfo: The ``SiteInfo`` read from the dump, or ``None``.
        pages: An iterable of pages; an empty ``range`` when none was given.
    """
    __slots__ = ('siteinfo', 'pages')

    def __init__(self, siteinfo=None, pages=None):
        self.siteinfo = siteinfo
        # An empty range stands in for "no pages" so the attribute is always
        # iterable.
        self.pages = range(0) if pages is None else pages

    def iter(self):
        """Return the iterable of pages held by this dump."""
        return self.pages

    @classmethod
    def from_events(cls, events):
        """Build a ``Dump`` from a peekable stream of ``(event, elem)`` pairs.

        The first pair must be the ``start`` of the ``mediawiki`` element.
        Events are then skipped until a ``siteinfo`` element starts; it is
        parsed with ``SiteInfo.from_events`` and reading stops right after
        it.  Page parsing is not implemented, so ``pages`` keeps its default.

        Args:
            events: An iterator of ``(event, elem)`` pairs that also offers
                ``peek()`` and is falsy once exhausted (``from_file`` passes
                a ``peekable``).

        Returns:
            A new ``Dump``; ``siteinfo`` is ``None`` if the stream ended
            before a ``siteinfo`` element was seen.

        Raises:
            AssertionError: If the stream does not open with ``mediawiki``.
        """
        event, elem = next(events)
        assert event == "start" and strip_tag(elem.tag) == "mediawiki"

        kwargs = {}
        while events:
            event, elem = events.peek()

            if event == "start" and strip_tag(elem.tag) == "siteinfo":
                # SiteInfo.from_events consumes the whole element, including
                # its closing event, so there is nothing left to skip here.
                kwargs['siteinfo'] = SiteInfo.from_events(events)
                break

            # Advance the *stream*; the original called next() on the event
            # string, which raised TypeError.
            next(events)

        return cls(**kwargs)

    @classmethod
    def from_file(cls, f):
        """Build a ``Dump`` by incrementally parsing the XML file object *f*."""
        events = ElementTree.iterparse(f, events=('start', 'end'))
        return cls.from_events(peekable(events))


class SiteInfo:
    """Site-level metadata read from a dump's ``siteinfo`` element.

    The text attributes are coerced with ``str`` unless they are ``None``;
    ``namespaces`` is stored exactly as given.
    """
    __slots__ = ('sitename', 'dbname', 'base', 'generator', 'case',
                 'namespaces')

    def __init__(self, sitename=None, dbname=None, base=None, generator=None,
                 case=None, namespaces=None):
        self.sitename = str(sitename) if sitename is not None else None
        self.base = str(base) if base is not None else None
        self.dbname = str(dbname) if dbname is not None else None
        self.generator = str(generator) if generator is not None else None
        self.case = str(case) if case is not None else None
        self.namespaces = namespaces

    @classmethod
    def from_events(cls, events):
        """Build a ``SiteInfo`` from a peekable stream of ``(event, elem)``.

        The first pair must be the ``start`` of ``siteinfo``.  Events are
        consumed up to and including the matching ``end`` of ``siteinfo``;
        nothing after that is read.

        Args:
            events: An iterator of ``(event, elem)`` pairs that also offers
                ``peek()`` and is falsy once exhausted.

        Returns:
            A new ``SiteInfo`` populated from the child elements seen.

        Raises:
            AssertionError: If the stream does not open with ``siteinfo``.
        """
        event, elem = next(events)
        assert event == "start" and strip_tag(elem.tag) == "siteinfo"

        # Children whose text is copied verbatim into the same-named keyword.
        text_fields = ("sitename", "dbname", "base", "generator", "case")

        kwargs = {}
        while events:
            event, elem = events.peek()
            tag = strip_tag(elem.tag)

            if event == "start" and tag == "namespaces":
                # The sub-parser expects to consume the start event itself.
                kwargs['namespaces'] = Namespaces.from_events(events)
                continue

            next(events)
            if event != "end":
                continue

            # Test for the closing tag *before* the generic end-event
            # handling; previously this branch was shadowed and the loop ran
            # to the end of the stream.
            if tag == "siteinfo":
                break
            if tag in text_fields:
                # Element text is only complete once its end event is seen.
                kwargs[tag] = elem.text

        return cls(**kwargs)
class Namespaces(list):
    """A ``list`` of ``Namespace`` objects read from a ``namespaces`` element."""

    def init(self, namespaces):
        """Re-initialise this list with the items of *namespaces*.

        Args:
            namespaces: An iterable whose items become the list's contents.
        """
        # The original passed the undefined name ``namespace`` here, which
        # raised NameError on every call.
        super().__init__(namespaces)

    @classmethod
    def from_events(cls, events):
        """Build a ``Namespaces`` list from a peekable ``(event, elem)`` stream.

        The first pair must be the ``start`` of ``namespaces``.  Each child
        ``namespace`` element is parsed with ``Namespace.from_events``;
        reading stops after the matching ``end`` of ``namespaces``.

        Args:
            events: An iterator of ``(event, elem)`` pairs that also offers
                ``peek()`` and is falsy once exhausted.

        Returns:
            A new ``Namespaces`` instance, in document order.

        Raises:
            AssertionError: If the stream does not open with ``namespaces``.
        """
        event, elem = next(events)
        assert event == "start" and strip_tag(elem.tag) == "namespaces"

        namespaces = cls()
        while events:
            event, elem = events.peek()

            if event == "start" and strip_tag(elem.tag) == "namespace":
                # Namespace.from_events consumes both the start and end event.
                namespaces.append(Namespace.from_events(events))
            elif event == "end" and strip_tag(elem.tag) == "namespaces":
                next(events)
                break
            else:
                next(events)

        return namespaces
class Namespace:
    """One ``namespace`` entry: an integer key, a case rule and a text name.

    ``key`` is coerced with ``int`` and ``case``/``text`` with ``str``; any
    argument given as ``None`` is stored as ``None``.
    """
    __slots__ = ('key', 'case', 'text')

    def __init__(self, key=None, case=None, text=None):
        self.key = None if key is None else int(key)
        self.case = None if case is None else str(case)
        self.text = None if text is None else str(text)

    @classmethod
    def from_events(cls, events):
        """Build a ``Namespace`` from the next two ``(event, elem)`` pairs.

        Exactly two pairs are consumed: the ``start`` and the ``end`` of a
        ``namespace`` element.  The ``key`` and ``case`` attributes and the
        element text are taken from the element delivered with ``end``.

        Raises:
            AssertionError: If either pair is not the expected event for a
                ``namespace`` element.
        """
        opening_event, opening = next(events)
        assert opening_event == "start" and strip_tag(opening.tag) == "namespace"

        closing_event, closing = next(events)
        assert closing_event == "end" and strip_tag(closing.tag) == "namespace"

        attributes = closing.attrib
        return cls(attributes.get('key'), attributes.get('case'), closing.text)
# Script body: parse the XML dump supplied on standard input and print the
# key of every namespace listed in its siteinfo header, one per line.
dump = Dump.from_file(sys.stdin)
siteinfo = dump.siteinfo
for ns in siteinfo.namespaces:
    print(ns.key)