diff --git a/src/main/java/io/bdrc/xmltoldmigration/CommonMigration.java b/src/main/java/io/bdrc/xmltoldmigration/CommonMigration.java index eed04d18..2865c688 100644 --- a/src/main/java/io/bdrc/xmltoldmigration/CommonMigration.java +++ b/src/main/java/io/bdrc/xmltoldmigration/CommonMigration.java @@ -5,6 +5,8 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.regex.Pattern; + import javax.xml.transform.Source; import javax.xml.transform.dom.DOMSource; import javax.xml.validation.Validator; @@ -221,6 +223,18 @@ public static String normalizePropName(String toNormalize, String targetType) { res = normalizeDescription(res); } return res; + } + + public static Pattern whiteSpacePattern = Pattern.compile("[\\s\\p{Cntrl}]+", Pattern.UNICODE_CHARACTER_CLASS); + + public static String normalizeString(String toNormalize, boolean keepSpaces) { + if (keepSpaces) + return toNormalize.trim(); + return whiteSpacePattern.matcher(toNormalize).replaceAll(" ").trim(); + } + + public static String normalizeString(String toNormalize) { + return normalizeString(toNormalize, false); } public static void addNote(Model m, Element e, Resource r, int i, Property p, Literal l) { @@ -253,7 +267,7 @@ public static void addNote(Model m, Element e, Resource r, int i, Property p, Li lit = m.createLiteral(value); m.add(note, prop, lit); } - value = e.getTextContent().trim(); + value = normalizeString(e.getTextContent(), true); if (!value.isEmpty()) { prop = m.createProperty(ROOT_PREFIX+"note_content"); lit = m.createLiteral(value, "en"); @@ -332,12 +346,12 @@ public static void addLogEntry(Model m, Element e, Resource r) { addException(m, logEntry, "cannot convert log date properly, original date: '"+value+"'"); } } - value = e.getAttribute("who").trim(); + value = normalizeString(e.getAttribute("who")); if (!value.isEmpty()) { prop = m.createProperty(ROOT_PREFIX+"log_who"); m.add(logEntry, prop, m.createLiteral(value, "en")); } - value = e.getTextContent().trim(); + value = normalizeString(e.getTextContent(), true); if (!value.isEmpty()) { prop = m.createProperty(ROOT_PREFIX+"log_content"); m.add(logEntry, prop, m.createLiteral(value, "en")); @@ -382,7 +396,7 @@ public static void addDescriptions(Model m, Element e, Resource r, String XsdPre boolean labelGuessed = !guessLabel; for (int i = 0; i < nodeList.size(); i++) { Element current = (Element) nodeList.get(i); - String descriptionValue = current.getTextContent().trim(); + String descriptionValue = normalizeString(current.getTextContent()); if (descriptionValue.isEmpty()) continue; String type = current.getAttribute("type"); if (type.isEmpty()) type = "noType"; @@ -401,7 +415,7 @@ public static void addDescriptions(Model m, Element e, Resource r, String XsdPre } if (type.equals("nameLex")) { String placeId = r.getLocalName(); - descriptionValue = descriptionValue.replace(placeId, "").trim(); + descriptionValue = normalizeString(descriptionValue.replace(placeId, "")); current.setTextContent(descriptionValue); } Property prop = m.getProperty(addPrefixToDescription(type)); @@ -472,7 +486,7 @@ private static int addLocationIntOrString(Model m, Resource main, Resource loc, return res; } - public static void addLocations(Model m, Resource main, Element root, String XsdPrefix, String propname, String propname1, String propname2) { + public static void addLocations(Model m, Resource main, Element root, String XsdPrefix, String propname, String propname1, String propname2, String workId) { List nodeList = CommonMigration.getChildrenByTagName(root, XsdPrefix, "location"); int i; int volume1 = -1; @@ -487,7 +501,7 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd if (value.isEmpty()) value = "page"; value = value.equals("page") ? "LocationByPage" : "LocationByFolio"; m.add(loc, RDF.type, m.createResource(WORK_PREFIX+value)); - + String localName = main.getLocalName(); // convention: if propname2 is not null, then we're in the case where the first property // is beginsAt and the second is endsAt, we handle it accordingly if (propname1 != null && nodeList.size() > 1) { @@ -501,6 +515,7 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd case 2: addException(m, main, "too many locations, it should only have 2"); //System.err.println(main.getLocalName()+" has 3 or more locations"); + //System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") has invalid location"); default: m.add(main, m.getProperty(propname), loc); } @@ -517,13 +532,14 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd if (i == 0) volume1 = volume; if (i == 1 && propname1 != null && volume != -1 && volume1 != -1 && volume < volume1) { addException(m, main, "end location volume is before beginning location volume"); - // System.err.println(main.getLocalName()+" begins at volume "+volume1+", end at volume "+volume); + //System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") begins at volume "+volume1+", end at volume "+volume); } int page = addLocationIntOrString(m, main, loc, current, "page", "page"); if (i == 0) page1 = page; if (i == 1 && propname1 != null && page != -1 && page1 != -1 && page < page1 && volume == volume1) { addException(m, main, "end location page is before beginning location"); - // System.err.println(main.getLocalName()+" begins at page "+page1+", end at page "+page); + // System.err.println(main.getLocalName()+" begins at page "+page1+", end at page "+page);d + //System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") begins at page "+page1+", end at page "+page+" of volume "+volume1); } addLocationIntOrString(m, main, loc, current, "phrase", "phrase"); addLocationIntOrString(m, main, loc, current, "line", "line"); @@ -735,7 +751,7 @@ public static String normalizeTibetan(String s) { } public static void addCurrentString(Element e, String dflt, Model m, Resource r, Property p, boolean addLabel) { - String value = e.getTextContent().trim(); + String value = normalizeString(e.getTextContent()); if (value.isEmpty()) return; String tag = getBCP47(e, dflt, m, r); if (tag.equals("bo") && !value.isEmpty()) { diff --git a/src/main/java/io/bdrc/xmltoldmigration/LineageMigration.java b/src/main/java/io/bdrc/xmltoldmigration/LineageMigration.java index 53107851..85b7c70d 100644 --- a/src/main/java/io/bdrc/xmltoldmigration/LineageMigration.java +++ b/src/main/java/io/bdrc/xmltoldmigration/LineageMigration.java @@ -42,7 +42,7 @@ public static Model MigrateLineage(Document xmlDocument) { CommonMigration.addExternals(m, root, main, LXSDNS); CommonMigration.addDescriptions(m, root, main, LXSDNS); CommonMigration.addLog(m, root, main, LXSDNS); - CommonMigration.addLocations(m, main, root, LXSDNS, LP+"location", null, null); + CommonMigration.addLocations(m, main, root, LXSDNS, LP+"location", null, null, ""); NodeList nodeList = root.getElementsByTagNameNS(LXSDNS, "object"); for (int i = 0; i < nodeList.getLength(); i++) { diff --git a/src/main/java/io/bdrc/xmltoldmigration/OutlineMigration.java b/src/main/java/io/bdrc/xmltoldmigration/OutlineMigration.java index cc1793b1..ee6c0e7c 100644 --- a/src/main/java/io/bdrc/xmltoldmigration/OutlineMigration.java +++ b/src/main/java/io/bdrc/xmltoldmigration/OutlineMigration.java @@ -31,6 +31,7 @@ public static Model MigrateOutline(Document xmlDocument) { // fetch type in isOutlineOf NodeList nodeList = root.getElementsByTagNameNS(OXSDNS, "isOutlineOf"); String value = null; + String workId = ""; for (int i = 0; i < nodeList.getLength(); i++) { Element current = (Element) nodeList.item(i); @@ -44,8 +45,12 @@ public static Model MigrateOutline(Document xmlDocument) { m.add(main, RDF.type, m.createResource(OP + "Outline")); value = current.getAttribute("work").trim(); - if (!value.isEmpty()) + if (!value.isEmpty()) { m.add(main, m.getProperty(OP+"isOutlineOf"), m.createProperty(WP+value)); + } else { + CommonMigration.addException(m, main, "outline does not reference the corresponding work"); + } + workId = value; } value = root.getAttribute("webAccess").trim(); @@ -61,7 +66,7 @@ public static Model MigrateOutline(Document xmlDocument) { CommonMigration.addExternals(m, root, main, OXSDNS); CommonMigration.addLog(m, root, main, OXSDNS); CommonMigration.addDescriptions(m, root, main, OXSDNS); - CommonMigration.addLocations(m, main, root, OXSDNS, OP+"location", null, null); + CommonMigration.addLocations(m, main, root, OXSDNS, OP+"location", null, null, workId); addCreators(m, main, root); @@ -69,7 +74,7 @@ public static Model MigrateOutline(Document xmlDocument) { addViewIn(m, main, root); - addNodes(m, main, root); + addNodes(m, main, root, workId); return m; } @@ -87,7 +92,7 @@ public static void addCreators(Model m, Resource r, Element e) { } } - public static void addNode(Model m, Resource r, Element e, int i) { + public static void addNode(Model m, Resource r, Element e, int i, String workId) { String value = e.getAttribute("RID"); if (value.isEmpty()) value = CommonMigration.getSubResourceName(r, OP, "Node", i+1); @@ -118,7 +123,7 @@ public static void addNode(Model m, Resource r, Element e, int i) { CommonMigration.addDescriptions(m, e, node, OXSDNS); CommonMigration.addTitles(m, node, e, OXSDNS, false); - CommonMigration.addLocations(m, node, e, OXSDNS, OP+"location", OP+"beginsAt", OP+"endsAt"); + CommonMigration.addLocations(m, node, e, OXSDNS, OP+"location", OP+"beginsAt", OP+"endsAt", workId); CommonMigration.addSubjects(m, node, e, OXSDNS); addSimpleAttr(e.getAttribute("value"), "node_value", m, node); @@ -164,16 +169,16 @@ public static void addNode(Model m, Resource r, Element e, int i) { addCreators(m, node, e); // sub nodes - addNodes(m, node, e); + addNodes(m, node, e, workId); } - public static void addNodes(Model m, Resource r, Element e) { + public static void addNodes(Model m, Resource r, Element e, String workId) { List nodeList = CommonMigration.getChildrenByTagName(e, OXSDNS, "node"); for (int i = 0; i < nodeList.size(); i++) { Element current = (Element) nodeList.get(i); - addNode(m, r, current, i); + addNode(m, r, current, i, workId); } } diff --git a/src/main/resources/owl-file b/src/main/resources/owl-file index 63e1a847..0d71c37b 160000 --- a/src/main/resources/owl-file +++ b/src/main/resources/owl-file @@ -1 +1 @@ -Subproject commit 63e1a847d88376733f84be97263889886f268714 +Subproject commit 0d71c37b8fa9bf5e8be557cf9bc2a0cbb6f20404 diff --git a/src/test/java/io/bdrc/xmltoldmigration/MigrationTest.java b/src/test/java/io/bdrc/xmltoldmigration/MigrationTest.java index 387acbf3..9884de39 100644 --- a/src/test/java/io/bdrc/xmltoldmigration/MigrationTest.java +++ b/src/test/java/io/bdrc/xmltoldmigration/MigrationTest.java @@ -13,6 +13,7 @@ import static org.junit.Assert.assertTrue; import java.io.IOException; +import java.math.BigInteger; import java.util.ArrayList; import java.util.List; @@ -86,6 +87,13 @@ public void textEwts() { assertTrue(CommonMigration.normalizeTibetan("\u0F76").equals("\u0FB2\u0F80")); } + @Test + public void testNormalize() { + assertTrue(CommonMigration.normalizeString("").equals("")); + String allWhiteSpaces = " 1 \u0009 2 \n 3 \u000C 4 \r 5 \u0020 6 \u0085 7 \u00A0 8 \u1680 9 \u180E 10 \u2000 11 \u2001 12 \u2002 13 \u2003 14 \u2004 15 \u2005 16 \u2006 17 \u2007 18 \u2008 19 \u2009 20 \u200A 21 \u2028 22 \u2029 23 \u202F 24 \u205F 25 \u3000 26 \0 27 "; + assertTrue(CommonMigration.normalizeString(allWhiteSpaces).equals("1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27")); + } + @Test public void testP1583() { diff --git a/src/test/jsonld/OutlineTest.jsonld b/src/test/jsonld/OutlineTest.jsonld index 3b442db9..bb951b2a 100644 --- a/src/test/jsonld/OutlineTest.jsonld +++ b/src/test/jsonld/OutlineTest.jsonld @@ -17,10 +17,10 @@ "@value" : "མངོན་པ" }, { "@language" : "bo-x-ewts", - "@value" : "bka' brgyud pa'i gsung rab dpe tshogs las mngon pa" + "@value" : "mngon pa" }, { "@language" : "bo-x-ewts", - "@value" : "mngon pa" + "@value" : "bka' brgyud pa'i gsung rab dpe tshogs las mngon pa" } ], "out:authorship" : { "@language" : "en", @@ -32,12 +32,12 @@ "out:hasNode" : { "@id" : "out:O2DB875722DB875732DB87599", "@type" : "out:Class", - "out:location" : [ { - "@id" : "wor:O2DB875722DB875732DB87599_Location2", + "out:beginsAt" : { + "@id" : "wor:O2DB875722DB875732DB87599_Location1", "@type" : "wor:LocationByPage", "wor:page" : { "@type" : "xsd:positiveInteger", - "@value" : "18" + "@value" : "11" }, "wor:volume" : { "@type" : "xsd:positiveInteger", @@ -46,12 +46,13 @@ "wor:work" : { "@id" : "wor:W30020" } - }, { - "@id" : "wor:O2DB875722DB875732DB87599_Location1", + }, + "out:endsAt" : { + "@id" : "wor:O2DB875722DB875732DB87599_Location2", "@type" : "wor:LocationByPage", "wor:page" : { "@type" : "xsd:positiveInteger", - "@value" : "11" + "@value" : "18" }, "wor:volume" : { "@type" : "xsd:positiveInteger", @@ -60,9 +61,9 @@ "wor:work" : { "@id" : "wor:W30020" } - } ], + }, "note" : { - "@id" : "http://purl.bdrc.io/ontology/root#O2DB875722DB875732DB87599_Note1", + "@id" : ":O2DB875722DB875732DB87599_Note1", "@type" : "Note", "note_content" : { "@language" : "en", @@ -119,16 +120,12 @@ "@language" : "en", "@value" : "Another scribe" }, - "out:gser_bris_number" : { - "@language" : "en", - "@value" : "test broken property type" - }, - "out:location" : [ { - "@id" : "wor:O2DB875722DB87573_Location2", + "out:beginsAt" : { + "@id" : "wor:O2DB875722DB87573_Location1", "@type" : "wor:LocationByPage", "wor:page" : { "@type" : "xsd:positiveInteger", - "@value" : "18" + "@value" : "11" }, "wor:volume" : { "@type" : "xsd:positiveInteger", @@ -137,12 +134,13 @@ "wor:work" : { "@id" : "wor:W30020" } - }, { - "@id" : "wor:O2DB875722DB87573_Location1", + }, + "out:endsAt" : { + "@id" : "wor:O2DB875722DB87573_Location2", "@type" : "wor:LocationByPage", "wor:page" : { "@type" : "xsd:positiveInteger", - "@value" : "11" + "@value" : "18" }, "wor:volume" : { "@type" : "xsd:positiveInteger", @@ -151,7 +149,11 @@ "wor:work" : { "@id" : "wor:W30020" } - } ], + }, + "out:gser_bris_number" : { + "@language" : "en", + "@value" : "test broken property type" + }, "wor:bibliographicalTitle" : [ { "@language" : "bo", "@value" : "གླེང་གཞི།" @@ -218,46 +220,47 @@ "@type" : "LogEntry", "log_content" : { "@language" : "en", - "@value" : "vol. 1 outline images fixed" + "@value" : "zhu dag byas" }, "log_when" : { "@type" : "xsd:dateTime", - "@value" : "2014-10-03T14:40:05.896Z" + "@value" : "2009-12-28T10:00:20.456Z" }, "log_who" : { "@language" : "en", - "@value" : "Lobsang Shastri" + "@value" : "paldor" } }, { "@id" : "_:b2", "@type" : "LogEntry", + "log_content" : { + "@language" : "en", + "@value" : "vol. 1 outline images fixed" + }, "log_when" : { "@type" : "xsd:dateTime", - "@value" : "2009-09-15T14:51:25.708Z" + "@value" : "2014-10-03T14:40:05.896Z" }, "log_who" : { "@language" : "en", - "@value" : "Bumu Dega" + "@value" : "Lobsang Shastri" } }, { "@id" : "_:b0", "@type" : "LogEntry", - "log_content" : { - "@language" : "en", - "@value" : "zhu dag byas" - }, "log_when" : { "@type" : "xsd:dateTime", - "@value" : "2009-12-28T10:00:20.456Z" + "@value" : "2009-09-15T14:51:25.708Z" }, "log_who" : { "@language" : "en", - "@value" : "paldor" + "@value" : "Bumu Dega" } } ] } ], "@context" : { "@vocab" : "http://purl.bdrc.io/ontology/root#", + "" : "http://purl.bdrc.io/ontology/root#", "crp" : "http://purl.bdrc.io/ontology/corporation#", "prd" : "http://purl.bdrc.io/ontology/product#", "owl" : "http://www.w3.org/2002/07/owl#", diff --git a/src/test/xml/OutlineTest.xml b/src/test/xml/OutlineTest.xml index 09d26556..265a0c63 100644 --- a/src/test/xml/OutlineTest.xml +++ b/src/test/xml/OutlineTest.xml @@ -19,7 +19,7 @@ - 2 (smad cha) + 2 (smad
 cha) gleng gzhi/