-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathmediawiki.py
161 lines (146 loc) · 7.78 KB
/
mediawiki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Methods for importing MediaWiki pages and images via the simplemediawiki
wrapper around the MediaWiki API.
Copyright (C) 2014 Angus Gratton
Licensed under New BSD License as described in the file LICENSE.
"""
from __future__ import print_function, unicode_literals, absolute_import, division
import simplemediawiki, simplejson
import re
from pprint import pprint
class Importer(object):
    """
    Reads pages (with revisions), images, users and site metadata from a
    MediaWiki instance through the simplemediawiki API wrapper.
    """

    def __init__(self, api_url, http_user=None, http_pass="", wiki_user=None, wiki_pass="", wiki_domain=None, verbose=False):
        """
        Connect to the MediaWiki API at api_url, log in if wiki_user is given,
        and verify that the wiki's version is supported.

        http_user / http_pass are handed to simplemediawiki as HTTP credentials,
        wiki_user / wiki_pass are used for the wiki login, and wiki_domain (if
        set) is passed to simplemediawiki as the 'domain' argument.

        Raises RuntimeError if the login fails or the version check fails.
        """
        self.verbose = verbose
        # only pass 'domain' through when one was actually supplied
        if wiki_domain:
            self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass, domain=wiki_domain)
        else:
            self.mw = simplemediawiki.MediaWiki(api_url, http_user=http_user, http_password=http_pass)
        # login if necessary
        if wiki_user is not None:
            print("Logging in as %s..." % wiki_user)
            if not self.mw.login(wiki_user, wiki_pass):
                raise RuntimeError("Mediawiki login failed. Wrong credentials?")
        # version check
        self._check_version()

    @staticmethod
    def _parse_version(generator):
        """
        Extract (major, minor) as a tuple of ints from a siteinfo 'generator'
        string such as "MediaWiki 1.19.1".

        Raises RuntimeError if no "<major>.<minor>" number can be found.
        """
        match = re.search(r'(\d+)\.(\d+)', generator)
        if match is None:
            raise RuntimeError("Failed to read Mediawiki siteinfo/generator. Is version older than 1.8? Yamdwe requires 1.13 or greater.")
        return int(match.group(1)), int(match.group(2))

    def _check_version(self):
        """
        Query the wiki's generator string, reject versions older than 1.13 and
        set self.need_rawcontinue for the queries that follow.

        Raises RuntimeError if the version is too old or cannot be parsed.
        """
        # must be set before the first _query() call, which reads it
        self.need_rawcontinue = False
        # _query() accumulates with '+=', so a string result comes back as a
        # list of characters; join them back into the generator string
        generator = "".join(self._query({'meta' : 'siteinfo'}, ['general', 'generator']))
        version = self._parse_version(generator)
        if version < (1, 13):
            raise RuntimeError("Mediawiki version is too old. Yamdwe requires 1.13 or newer. This install is %s" % generator)
        # From 1.24 on, ask for the legacy 'query-continue' format via the
        # 'rawcontinue' parameter, since _query() relies on that format.
        # see https://www.mediawiki.org/wiki/API:Query#Backwards_compatibility_of_continue
        self.need_rawcontinue = version >= (1, 24)
        print("%s meets version requirements." % generator)

    def verbose_print(self, msg):
        """
        Print msg only when the importer was created with verbose=True.
        """
        if self.verbose:
            print(msg)

    def get_all_pages(self):
        """
        Slurp all pages down from the mediawiki instance, together with all revisions including content.
        WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!!

        Returns the list of page entries from the 'allpages' query, each with
        an added "revisions" key holding that page's revisions.
        """
        query = {'list' : 'allpages'}
        print("Getting list of pages...")
        pages = self._query(query, [ 'allpages' ])
        self.verbose_print("Got %d pages." % len(pages))
        print("Query page revisions (this may take a while)...")
        for page in pages:
            self.verbose_print("Querying revisions for pageid %s (%s)..." % (page['pageid'], page['title']))
            page["revisions"] = self._get_revisions(page)
            self.verbose_print("Got %d revisions." % len(page["revisions"]))
        return pages

    def _get_revisions(self, page):
        """
        Return the list of revisions (timestamp, user, comment, content) for
        the given page entry, identified by its 'pageid'.
        """
        pageid = page['pageid']
        query = { 'prop' : 'revisions',
                  'pageids' : pageid,
                  'rvprop' : 'timestamp|user|comment|content',
                  # revisions per request; _query() follows continuations
                  # until every revision has been fetched
                  'rvlimit' : '5',
                  }
        # the response keys pages by the string form of the page id
        return self._query(query, [ 'pages', str(pageid), 'revisions' ])

    def get_all_images(self):
        """
        Slurp all images down from the mediawiki instance, latest revision of each image, only.
        WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!!
        """
        query = {'list' : 'allimages'}
        return self._query(query, [ 'allimages' ])

    def get_all_users(self):
        """
        Slurp down all usernames from the mediawiki instance.
        """
        query = {'list' : 'allusers'}
        return self._query(query, [ 'allusers' ])

    def _query(self, args, path_to_result):
        """
        Make a Mediawiki API query that returns a list of results,
        handle the possibility of making a paginated query using query-continue

        args is a dict of API parameters added on top of action=query.
        path_to_result is the list of keys to follow inside response['query']
        to reach the data; its last element is also the key looked up under
        'query-continue' to find continuation parameters.

        Returns a list accumulated (via '+=') over all continuations.
        Raises RuntimeError if the response is not JSON from its first
        character, or if path_to_result cannot be followed in the response.
        """
        query = { 'action' : 'query' }
        if self.need_rawcontinue:
            query["rawcontinue"] = ""
        query.update(args)
        result = []
        continuations = 0
        while True:
            try:
                response = self.mw.call(query)
            except simplejson.scanner.JSONDecodeError as e:
                # pos == 0 means the response was not JSON at all
                if e.pos == 0:
                    if not self.verbose:
                        raise RuntimeError("Mediawiki gave us back a non-JSON response. You may need to double-check the Mediawiki API URL you are providing (it usually ends in api.php), and also your Mediawiki permissions. To see the response content, pass the --verbose flag to yamdwe.")
                    # the document may already be text; only decode raw bytes,
                    # and never let a decoding problem hide the real error
                    doc = e.doc
                    if isinstance(doc, bytes):
                        doc = doc.decode("utf-8", "replace")
                    raise RuntimeError("Mediawiki gave us back a non-JSON response:\n\n\nInvalid response follows (%d bytes):\n%s\n\n(End of content)\nFailed to parse. You may need to double-check the Mediawiki API URL you are providing (it usually ends in api.php), and also your Mediawiki permissions." % (len(e.doc), doc))
                raise
            # fish around in the response for our actual data (location depends on query)
            try:
                inner = response['query']
                for key in path_to_result:
                    inner = inner[key]
            except KeyError:
                raise RuntimeError("Mediawiki query '%s' returned unexpected response '%s' after %d continuations" % (args, response, continuations))
            result += inner
            # if there's a warning print it out (shouldn't need a debug flag since this is of interest to any user)
            if 'warnings' in response:
                for warnkey in response['warnings']:
                    print("WARNING: %s function throws the warning %s" % (warnkey, response['warnings'][warnkey]['*']))
            # if there's a continuation, find the new arguments and follow them
            try:
                query.update(response['query-continue'][path_to_result[-1]])
                continuations += 1
            except KeyError:
                # no (further) continuation: everything has been collected
                return result

    def get_file_namespaces(self):
        """
        Return a tuple. First entry is the name used by default for the file namespace (which dokuwiki will also use.)
        Second entry is a list of all aliases used for that namespace, and aliases used for the 'media' namespace.
        """
        query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'namespaces|namespacealiases' }
        result = self.mw.call(query)['query']
        namespaces = result['namespaces'].values()
        aliases = result.get('namespacealiases', [])
        # fallbacks used when the wiki reports no File / Media namespace
        # NOTE(review): the fallback local name is 'Files' while the canonical
        # name is 'File' -- confirm this is intentional
        file_namespace = {'*' : 'Files', 'canonical' : 'File'}
        media_namespace = {'*' : 'Media', 'canonical' : 'Media'}
        # search for the File and Media namespaces by canonical name
        for namespace in namespaces:
            canonical = namespace.get('canonical', None)
            if canonical == 'File':
                file_namespace = namespace
            elif canonical == 'Media':
                media_namespace = namespace
        # alias list starts with the file & media namespace canonical values, and the media "real" value
        aliases_result = [ file_namespace['canonical'], media_namespace['canonical'], media_namespace['*'] ]
        # look for any aliases by searching the file/media namespace ids, add to the list
        ids = [ file_namespace.get('id', None), media_namespace.get('id', None) ]
        aliases_result.extend(alias['*'] for alias in aliases if alias['id'] in ids)
        return file_namespace['*'], aliases_result

    def get_main_pagetitle(self):
        """
        Return the title of the main Mediawiki page ("Main" if the wiki does
        not report one).
        """
        query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'general' }
        result = self.mw.call(query)['query']
        return result['general'].get("mainpage", "Main")