-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_xml.py
88 lines (75 loc) · 3.36 KB
/
parse_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from xml.etree import ElementTree as ET
import re
import os
from redis import Redis
r = Redis()
YEAR_P = re.compile(r"(1[0-9]{3})")
METADATA_ROOT_DIRECTORY = "/mnt/Downloads/md"
MODSXMLNS = "{http://www.loc.gov/mods/v3}"
nasmount = "/mnt/nas"
def get_filepath(identifier):
return os.path.join(METADATA_ROOT_DIRECTORY, identifier[:4], identifier, "{0}.xml".format(identifier))
def divination_for_year(*items):
for item in items:
d = YEAR_P.search(item)
if d != None:
return d.groups()[0]
return ""
def parse_xml(identifier):
fp = get_filepath(identifier)
title, author, pubplace, publisher, pubdate = "","","","",""
xmlfile = open(fp, "r")
doc = ET.fromstring(xmlfile.read())
xmlfile.close()
mods = doc[0][0][0][0] # Hacky in a sense, but effective
titlen = mods.find("{0}titleInfo/{0}title".format(MODSXMLNS))
nonsort = mods.find("{0}titleInfo/{0}nonSort".format(MODSXMLNS))
if nonsort != None and nonsort.text != None:
title += nonsort.text.strip() + " "
if titlen != None and titlen.text != None:
title += titlen.text.strip()
firstauthorn = mods.find('{0}name[@type="personal"]'.format(MODSXMLNS))
if firstauthorn != None and firstauthorn.text != None:
authorname = firstauthorn.find('{0}namePart'.format(MODSXMLNS))
if authorname != None and authorname.text != None:
author = authorname.text.strip()
authorterms = firstauthorn.find('{0}namePart[@type="termsOfAddress"]'.format(MODSXMLNS))
if authorterms != None and authorterms.text != None:
author += u" {0}".format(authorterms.text.strip())
oI = mods.find('{0}originInfo'.format(MODSXMLNS))
pubplacen = oI.find('{0}place/{0}placeTerm'.format(MODSXMLNS))
if pubplacen != None and pubplacen.text != None:
pubplace = pubplacen.text.strip()
publishern = oI.find('{0}publisher'.format(MODSXMLNS))
if publishern != None and publishern.text != None:
publisher = publishern.text.strip()
pubdaten = oI.find('{0}dateIssued'.format(MODSXMLNS))
if pubdaten != None and pubdaten.text != None:
pubdate = pubdaten.text.strip()
guesseddate = divination_for_year(pubdate, pubplace, publisher)
return identifier, title, author, pubplace, publisher, guesseddate
def generate_metadata(filename):
id, vol, page, imgno, _ = filename.split("_",4)
identifier, title, author, pubplace, publisher, guesseddate = parse_xml(id)
decoded = map(lambda x: x.encode("utf-8"), [title, author, guesseddate, pubplace, publisher, vol, str(int(page)), identifier])
# lookup direct links, if any
# eg to create http://itemviewer.bl.uk/?itemid=lsidyv3bd2589c#ark:/81055/vdc_000000054755.0x000020
rosetta = r.get("s:"+id)
if rosetta:
d, a, u = rosetta.strip().split("\t")
adjusted_ark_number = "{0:012x}".format(int(a.split("_")[1], 16) - 1)
adj_ark = "{0}_{1}".format(a.split("_")[0], adjusted_ark_number.upper())
hexpage = "0x000001"
try:
hexpage = "{0:06x}".format(int(page)).upper()
print hexpage
except:
pass
additional = """
<ul><li>Open the page in the <a href="http://itemviewer.bl.uk/?itemid={2}#{1}.0x{0}">British Library's itemViewer (page: {3})</a></li>
<li><a href="http://access.dl.bl.uk/{2}">Download the PDF for this book</a>
""".format(hexpage, adj_ark, u, page)
decoded += [additional]
else:
decoded += [u""]
return id, vol, page, imgno, identifier, title, author, pubplace, publisher, guesseddate, decoded