Skip to content

Commit

Permalink
normalize strings when relevant
Browse files Browse the repository at this point in the history
removes adjacent spaces, control characters, new lines, and unifies spacing
characters. Should fix buda-base/BDRC-Lib-App#41
  • Loading branch information
eroux committed Jul 10, 2017
1 parent 9f17b2b commit 3dc7528
Show file tree
Hide file tree
Showing 7 changed files with 85 additions and 53 deletions.
36 changes: 26 additions & 10 deletions src/main/java/io/bdrc/xmltoldmigration/CommonMigration.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import javax.xml.transform.Source;
import javax.xml.transform.dom.DOMSource;
import javax.xml.validation.Validator;
Expand Down Expand Up @@ -221,6 +223,18 @@ public static String normalizePropName(String toNormalize, String targetType) {
res = normalizeDescription(res);
}
return res;
}

/**
 * Matches one or more consecutive characters that should collapse to a single
 * space: Unicode white space, control characters, and U+180E (Mongolian vowel
 * separator).
 *
 * U+180E is listed explicitly because it stopped being classified as white
 * space in Unicode 6.3 (it is now a format character), so on Java 9 and later
 * neither the white-space class nor the control class matches it. Listing it
 * keeps the normalization result identical across Java versions.
 */
public static final Pattern whiteSpacePattern =
        Pattern.compile("[\\s\\p{Cntrl}\\u180E]+", Pattern.UNICODE_CHARACTER_CLASS);

/**
 * Normalizes a string read from the source XML.
 *
 * @param toNormalize the raw string; must not be null
 * @param keepSpaces  if true, inner spacing (including new lines) is preserved
 *                    and only leading/trailing characters are trimmed; if
 *                    false, every run matched by {@link #whiteSpacePattern} is
 *                    replaced by one ASCII space before trimming
 * @return the normalized string, possibly empty
 */
public static String normalizeString(String toNormalize, boolean keepSpaces) {
    if (keepSpaces) {
        return toNormalize.trim();
    }
    // After the replacement the only possible leading/trailing white space is
    // an ASCII space, which trim() removes.
    return whiteSpacePattern.matcher(toNormalize).replaceAll(" ").trim();
}

/**
 * Normalizes a string, collapsing all white space and control characters
 * into single spaces.
 *
 * @param toNormalize the raw string; must not be null
 * @return the normalized string, possibly empty
 */
public static String normalizeString(String toNormalize) {
    return normalizeString(toNormalize, false);
}

public static void addNote(Model m, Element e, Resource r, int i, Property p, Literal l) {
Expand Down Expand Up @@ -253,7 +267,7 @@ public static void addNote(Model m, Element e, Resource r, int i, Property p, Li
lit = m.createLiteral(value);
m.add(note, prop, lit);
}
value = e.getTextContent().trim();
value = normalizeString(e.getTextContent(), true);
if (!value.isEmpty()) {
prop = m.createProperty(ROOT_PREFIX+"note_content");
lit = m.createLiteral(value, "en");
Expand Down Expand Up @@ -332,12 +346,12 @@ public static void addLogEntry(Model m, Element e, Resource r) {
addException(m, logEntry, "cannot convert log date properly, original date: '"+value+"'");
}
}
value = e.getAttribute("who").trim();
value = normalizeString(e.getAttribute("who"));
if (!value.isEmpty()) {
prop = m.createProperty(ROOT_PREFIX+"log_who");
m.add(logEntry, prop, m.createLiteral(value, "en"));
}
value = e.getTextContent().trim();
value = normalizeString(e.getTextContent(), true);
if (!value.isEmpty()) {
prop = m.createProperty(ROOT_PREFIX+"log_content");
m.add(logEntry, prop, m.createLiteral(value, "en"));
Expand Down Expand Up @@ -382,7 +396,7 @@ public static void addDescriptions(Model m, Element e, Resource r, String XsdPre
boolean labelGuessed = !guessLabel;
for (int i = 0; i < nodeList.size(); i++) {
Element current = (Element) nodeList.get(i);
String descriptionValue = current.getTextContent().trim();
String descriptionValue = normalizeString(current.getTextContent());
if (descriptionValue.isEmpty()) continue;
String type = current.getAttribute("type");
if (type.isEmpty()) type = "noType";
Expand All @@ -401,7 +415,7 @@ public static void addDescriptions(Model m, Element e, Resource r, String XsdPre
}
if (type.equals("nameLex")) {
String placeId = r.getLocalName();
descriptionValue = descriptionValue.replace(placeId, "").trim();
descriptionValue = normalizeString(descriptionValue.replace(placeId, ""));
current.setTextContent(descriptionValue);
}
Property prop = m.getProperty(addPrefixToDescription(type));
Expand Down Expand Up @@ -472,7 +486,7 @@ private static int addLocationIntOrString(Model m, Resource main, Resource loc,
return res;
}

public static void addLocations(Model m, Resource main, Element root, String XsdPrefix, String propname, String propname1, String propname2) {
public static void addLocations(Model m, Resource main, Element root, String XsdPrefix, String propname, String propname1, String propname2, String workId) {
List<Element> nodeList = CommonMigration.getChildrenByTagName(root, XsdPrefix, "location");
int i;
int volume1 = -1;
Expand All @@ -487,7 +501,7 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd
if (value.isEmpty()) value = "page";
value = value.equals("page") ? "LocationByPage" : "LocationByFolio";
m.add(loc, RDF.type, m.createResource(WORK_PREFIX+value));

String localName = main.getLocalName();
// convention: if propname2 is not null, then we're in the case where the first property
// is beginsAt and the second is endsAt, we handle it accordingly
if (propname1 != null && nodeList.size() > 1) {
Expand All @@ -501,6 +515,7 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd
case 2:
addException(m, main, "too many locations, it should only have 2");
//System.err.println(main.getLocalName()+" has 3 or more locations");
//System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") has invalid location");
default:
m.add(main, m.getProperty(propname), loc);
}
Expand All @@ -517,13 +532,14 @@ public static void addLocations(Model m, Resource main, Element root, String Xsd
if (i == 0) volume1 = volume;
if (i == 1 && propname1 != null && volume != -1 && volume1 != -1 && volume < volume1) {
addException(m, main, "end location volume is before beginning location volume");
// System.err.println(main.getLocalName()+" begins at volume "+volume1+", end at volume "+volume);
//System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") begins at volume "+volume1+", end at volume "+volume);
}
int page = addLocationIntOrString(m, main, loc, current, "page", "page");
if (i == 0) page1 = page;
if (i == 1 && propname1 != null && page != -1 && page1 != -1 && page < page1 && volume == volume1) {
addException(m, main, "end location page is before beginning location");
// System.err.println(main.getLocalName()+" begins at page "+page1+", end at page "+page);
// System.err.println(main.getLocalName()+" begins at page "+page1+", end at page "+page);d
//System.err.println("- [ ] ["+localName+"](https://www.tbrc.org/#library_work_ViewByOutline-"+localName+"|"+workId+") begins at page "+page1+", end at page "+page+" of volume "+volume1);
}
addLocationIntOrString(m, main, loc, current, "phrase", "phrase");
addLocationIntOrString(m, main, loc, current, "line", "line");
Expand Down Expand Up @@ -735,7 +751,7 @@ public static String normalizeTibetan(String s) {
}

public static void addCurrentString(Element e, String dflt, Model m, Resource r, Property p, boolean addLabel) {
String value = e.getTextContent().trim();
String value = normalizeString(e.getTextContent());
if (value.isEmpty()) return;
String tag = getBCP47(e, dflt, m, r);
if (tag.equals("bo") && !value.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public static Model MigrateLineage(Document xmlDocument) {
CommonMigration.addExternals(m, root, main, LXSDNS);
CommonMigration.addDescriptions(m, root, main, LXSDNS);
CommonMigration.addLog(m, root, main, LXSDNS);
CommonMigration.addLocations(m, main, root, LXSDNS, LP+"location", null, null);
CommonMigration.addLocations(m, main, root, LXSDNS, LP+"location", null, null, "");

NodeList nodeList = root.getElementsByTagNameNS(LXSDNS, "object");
for (int i = 0; i < nodeList.getLength(); i++) {
Expand Down
21 changes: 13 additions & 8 deletions src/main/java/io/bdrc/xmltoldmigration/OutlineMigration.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ public static Model MigrateOutline(Document xmlDocument) {
// fetch type in isOutlineOf
NodeList nodeList = root.getElementsByTagNameNS(OXSDNS, "isOutlineOf");
String value = null;
String workId = "";
for (int i = 0; i < nodeList.getLength(); i++) {
Element current = (Element) nodeList.item(i);

Expand All @@ -44,8 +45,12 @@ public static Model MigrateOutline(Document xmlDocument) {
m.add(main, RDF.type, m.createResource(OP + "Outline"));

value = current.getAttribute("work").trim();
if (!value.isEmpty())
if (!value.isEmpty()) {
m.add(main, m.getProperty(OP+"isOutlineOf"), m.createProperty(WP+value));
} else {
CommonMigration.addException(m, main, "outline does not reference the corresponding work");
}
workId = value;
}

value = root.getAttribute("webAccess").trim();
Expand All @@ -61,15 +66,15 @@ public static Model MigrateOutline(Document xmlDocument) {
CommonMigration.addExternals(m, root, main, OXSDNS);
CommonMigration.addLog(m, root, main, OXSDNS);
CommonMigration.addDescriptions(m, root, main, OXSDNS);
CommonMigration.addLocations(m, main, root, OXSDNS, OP+"location", null, null);
CommonMigration.addLocations(m, main, root, OXSDNS, OP+"location", null, null, workId);

addCreators(m, main, root);

// TODO: parent (unused?)

addViewIn(m, main, root);

addNodes(m, main, root);
addNodes(m, main, root, workId);

return m;
}
Expand All @@ -87,7 +92,7 @@ public static void addCreators(Model m, Resource r, Element e) {
}
}

public static void addNode(Model m, Resource r, Element e, int i) {
public static void addNode(Model m, Resource r, Element e, int i, String workId) {
String value = e.getAttribute("RID");
if (value.isEmpty())
value = CommonMigration.getSubResourceName(r, OP, "Node", i+1);
Expand Down Expand Up @@ -118,7 +123,7 @@ public static void addNode(Model m, Resource r, Element e, int i) {
CommonMigration.addDescriptions(m, e, node, OXSDNS);
CommonMigration.addTitles(m, node, e, OXSDNS, false);

CommonMigration.addLocations(m, node, e, OXSDNS, OP+"location", OP+"beginsAt", OP+"endsAt");
CommonMigration.addLocations(m, node, e, OXSDNS, OP+"location", OP+"beginsAt", OP+"endsAt", workId);
CommonMigration.addSubjects(m, node, e, OXSDNS);

addSimpleAttr(e.getAttribute("value"), "node_value", m, node);
Expand Down Expand Up @@ -164,16 +169,16 @@ public static void addNode(Model m, Resource r, Element e, int i) {
addCreators(m, node, e);

// sub nodes
addNodes(m, node, e);
addNodes(m, node, e, workId);
}



public static void addNodes(Model m, Resource r, Element e) {
public static void addNodes(Model m, Resource r, Element e, String workId) {
List<Element> nodeList = CommonMigration.getChildrenByTagName(e, OXSDNS, "node");
for (int i = 0; i < nodeList.size(); i++) {
Element current = (Element) nodeList.get(i);
addNode(m, r, current, i);
addNode(m, r, current, i, workId);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/owl-file
8 changes: 8 additions & 0 deletions src/test/java/io/bdrc/xmltoldmigration/MigrationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;

Expand Down Expand Up @@ -86,6 +87,13 @@ public void textEwts() {
assertTrue(CommonMigration.normalizeTibetan("\u0F76").equals("\u0FB2\u0F80"));
}

/**
 * Checks that normalizeString leaves the empty string untouched and collapses
 * every kind of spacing or control character into single ASCII spaces.
 */
@Test
public void testNormalize() {
    // Empty input must come back empty.
    String emptyResult = CommonMigration.normalizeString("");
    assertTrue(emptyResult.equals(""));

    // Numbered tokens separated by each spacing/control character under test.
    String allWhiteSpaces = " 1 \u0009 2 \n 3 \u000C 4 \r 5 \u0020 6 \u0085 7 \u00A0 8 \u1680 9 \u180E 10 \u2000 11 \u2001 12 \u2002 13 \u2003 14 \u2004 15 \u2005 16 \u2006 17 \u2007 18 \u2008 19 \u2009 20 \u200A 21 \u2028 22 \u2029 23 \u202F 24 \u205F 25 \u3000 26 \0 27 ";
    String expected = "1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27";
    String actual = CommonMigration.normalizeString(allWhiteSpaces);
    assertTrue(actual.equals(expected));
}

@Test
public void testP1583()
{
Expand Down
Loading

0 comments on commit 3dc7528

Please sign in to comment.