1 && isAdjective.apply(words.get(prepPos-1))) {
+ adjective=words.get(prepPos-1);
+ words=words.subList(0, prepPos-1);
+ } else {
+ words=words.subList(0, prepPos);
+ }
+ }
+
+ if(words.size()==0) return;
+
+ head=words.get(words.size()-1);
+ if(words.size()>1) {
+ preModifier=words.subList(0, words.size()-1).toString().replace(", ", "_");
+ preModifier=preModifier.substring(1, preModifier.length()-1);
+ }
+ }
+
+
+ /** Checks if the originals match: true iff o is a NounGroup whose original string equals this one's (false for null and for any other type) */
+ public boolean equals(Object o) {
+ return(o instanceof NounGroup && ((NounGroup)o).original.equals(original));
+ }
+
+ /** Returns the original string of this noun group (the value of the original field, unmodified) */
+ public String toString() {
+ return(original);
+ }
+
+ /** Returns all fields in a multi-line String, one labelled field per line; a non-null postModifier is appended via its own description(), a null one as the empty string */
+ public String description() {
+ return("NounGroup:\n"+
+ " Original: "+original+"\n"+
+ " Stemmed: "+stemmed()+"\n"+
+ " Determiner: "+determiner+"\n"+
+ " preModifiers: "+preModifier+"\n"+
+ " Head: "+head+"\n"+
+ " Adjective: "+adjective+"\n"+
+ " Preposition: "+preposition+"\n"+
+ " postModifier: \n"+(postModifier==null?"":postModifier.description()));
+ }
+
+ /** Test method: passes the description of the hard-coded noun group "Star_Trek_characters" to D.p (presumably a print helper -- confirm) */
+ public static void main(String[] args) throws Exception {
+ D.p("Enter a noun group and press ENTER. Press CTRL+C to abort");
+// The interactive loop below is disabled, so no input is actually read after the prompt above:
+// D.p(new NounGroup(D.r()).description());
+// }
+
+ D.p(new NounGroup("Star_Trek_characters").description());
+ }
+
+}
diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java
new file mode 100644
index 0000000..277efc0
--- /dev/null
+++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java
@@ -0,0 +1,923 @@
+package org.yago.javatools.parsers;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.util.Map;
+import java.util.Set;
+
+import org.yago.javatools.datatypes.FinalMap;
+import org.yago.javatools.datatypes.FinalSet;
+
+/**
+This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools).
+It is licensed under the Creative Commons Attribution License
+(see http://creativecommons.org/licenses/by/3.0) by
+the YAGO-NAGA team (see http://mpii.de/yago-naga).
+
+
+
+
+
+ The PlingStemmer stems an English noun (plural or singular) to its singular
+ form. It deals with "firemen"->"fireman", it knows Greek stuff like
+ "appendices"->"appendix" and yes, it was a lot of work to compile these exceptions.
+ Examples:
+
+ System.out.println(PlingStemmer.stem("boy"));
+ ----> boy
+ System.out.println(PlingStemmer.stem("boys"));
+ ----> boy
+ System.out.println(PlingStemmer.stem("biophysics"));
+ ----> biophysics
+ System.out.println(PlingStemmer.stem("automata"));
+ ----> automaton
+ System.out.println(PlingStemmer.stem("genus"));
+ ----> genus
+ System.out.println(PlingStemmer.stem("emus"));
+ ----> emu
+
+
+ There are a number of word forms that can either be plural or singular.
+ Examples include "physics" (the science or the plural of "physic" (the
+ medicine)), "quarters" (the housing or the plural of "quarter" (1/4))
+ or "people" (the singular of "peoples" or the plural of "person"). In
+ these cases, the stemmer assumes the word is a plural form and returns
+ the singular form. The methods isPlural, isSingular and isSingularAndPlural
+ can be used to differentiate the cases.
+
+ It cannot be guaranteed that the stemmer correctly stems a plural word
+ or correctly ignores a singular word -- let alone that it treats an
+ ambiguous word form in the way expected by the user.
+
+ The PlingStemmer uses material from WordNet.
+ It requires the class FinalSet from the
+ Java Tools.
+*/
+public class PlingStemmer {
+
+ /** Tells whether a word form is plural. This method just checks whether the
+ * stem method alters the word, i.e. it returns true iff stem(s) differs from s */
+ public static boolean isPlural(String s) {
+ return(!s.equals(stem(s)));
+ }
+
+ /** Tells whether a word form is singular: true if its lower-cased form is listed in singAndPlur or if isPlural(s) is false. Note that a word can be both plural and singular */
+ public static boolean isSingular(String s) {
+ return(singAndPlur.contains(s.toLowerCase()) || !isPlural(s));
+ }
+
+ /** Tells whether a word form is the singular form of one word and at
+ * the same time the plural form of another; decided solely by looking up the lower-cased word in singAndPlur.*/
+ public static boolean isSingularAndPlural(String s) {
+ return(singAndPlur.contains(s.toLowerCase()));
+ }
+
+ /** Cuts a suffix from a string (that is the number of chars given by the suffix); only the suffix length is used, s is not checked to actually end with suffix */
+ public static String cut(String s, String suffix) {
+ return(s.substring(0,s.length()-suffix.length()));
+ }
+
+ /** Returns true if a word is probably not Latin: it contains h, j, k, w, y, z, "ou", "sh" or "ch" at any index after the first character, or it ends with "aus". NOTE(review): a match at index 0 is ignored because of the ">0" tests -- confirm this is intended */
+ public static boolean noLatin(String s) {
+ return(s.indexOf('h')>0 || s.indexOf('j')>0 || s.indexOf('k')>0 ||
+ s.indexOf('w')>0 || s.indexOf('y')>0 || s.indexOf('z')>0 ||
+ s.indexOf("ou")>0 || s.indexOf("sh")>0 || s.indexOf("ch")>0 ||
+ s.endsWith("aus"));
+ }
+
+ /** Returns true if a word is probably Greek: it contains "ph" after its first character, or it contains 'y' after its first character and ends with "nges" (&& binds tighter than ||) */
+ private static boolean greek(String s) {
+ return(s.indexOf("ph")>0 || s.indexOf('y')>0 && s.endsWith("nges"));
+ }
+
+ /** Stems an English noun: returns the singular form of s, or s itself if no rule applies. The rules below are tried in order and the first match wins; lookups and suffix tests are done on s as given (no case folding) */
+ public static String stem(String s) {
+ String stem = s;
+
+ // Handle irregular ones
+ String irreg=irregular.get(s);
+ if(irreg!=null) return(stem=irreg);
+
+ // -on to -a
+ if(categoryON_A.contains(s)) return(stem=cut(s,"a")+"on");
+
+ // -um to -a
+ if(categoryUM_A.contains(s)) return(stem=cut(s,"a")+"um");
+
+ // -ix to -ices
+ if(categoryIX_ICES.contains(s)) return(stem=cut(s,"ices")+"ix");
+
+ // -o to -i
+ if(categoryO_I.contains(s)) return(stem=cut(s,"i")+"o");
+
+ // -se to ses
+ if(categorySE_SES.contains(s)) return(stem=cut(s,"s"));
+
+ // -is to -es
+ if(categoryIS_ES.contains(s) || s.endsWith("theses")) return(stem=cut(s,"es")+"is");
+
+ // -us to -i
+ if(categoryUS_I.contains(s)) return(stem=cut(s,"i")+"us");
+ //Wrong plural
+ if(s.endsWith("uses") && (categoryUS_I.contains(cut(s,"uses")+"i") ||
+ s.equals("genuses") || s.equals("corpuses"))) return(stem=cut(s,"es"));
+
+ // -ex to -ices
+ if(categoryEX_ICES.contains(s)) return(stem=cut(s,"ices")+"ex");
+
+ // Words that do not inflect in the plural
+ if(s.endsWith("ois") || s.endsWith("itis") || category00.contains(s) || categoryICS.contains(s)) return(stem=s);
+
+ // -en to -ina
+ // The whole "ina" suffix is replaced by "en" ("foramina" -> "foramen"); cutting only two chars would leave "forami"
+ if(s.endsWith("ina")) return(stem=cut(s,"ina")+"en");
+
+ // -a to -ae
+ // No other common words end in -ae
+ if(s.endsWith("ae")) return(stem=cut(s,"e"));
+
+ // -a to -ata
+ // No other common words end in -ata
+ if(s.endsWith("ata")) return(stem=cut(s,"ta"));
+
+ // trix to -trices
+ // No common word ends with -trice(s)
+ if(s.endsWith("trices")) return(stem=cut(s,"trices")+"trix");
+
+ // -us to -us
+ //No other common word ends in -us, except for false plurals of French words
+ //Catch words that are not latin or known to end in -u
+ if(s.endsWith("us") && !s.endsWith("eaus") && !s.endsWith("ieus") && !noLatin(s)
+ && !categoryU_US.contains(s)) return(stem=s);
+
+ // -tooth to -teeth
+ // -goose to -geese
+ // -foot to -feet
+ // -zoon to -zoa
+ //No other common words end with the indicated suffixes
+ if(s.endsWith("teeth")) return(stem=cut(s,"teeth")+"tooth");
+ if(s.endsWith("geese")) return(stem=cut(s,"geese")+"goose");
+ if(s.endsWith("feet")) return(stem=cut(s,"feet")+"foot");
+ if(s.endsWith("zoa")) return(stem=cut(s,"zoa")+"zoon");
+
+ // -eau to -eaux
+ //No other common words end in eaux
+ if(s.endsWith("eaux")) return(stem=cut(s,"x"));
+
+ // -ieu to -ieux
+ //No other common words end in ieux
+ if(s.endsWith("ieux")) return(stem=cut(s,"x"));
+
+ // -nx to -nges
+ // Pay attention not to kill words ending in -nge with plural -nges
+ // Take only Greek words (works fine, only a handful of exceptions)
+ if(s.endsWith("nges") && greek(s)) return(stem=cut(s,"nges")+"nx");
+
+ // -[sc]h to -[sc]hes
+ //No other common word ends with "shes", "ches" or "she(s)"
+ //Quite a lot end with "che(s)", filter them out
+ if(s.endsWith("shes") || s.endsWith("ches") && !categoryCHE_CHES.contains(s)) return(stem=cut(s,"es"));
+
+ // -ss to -sses
+ // No other common singular word ends with "sses"
+ // Filter out those ending in "sse(s)"
+ if(s.endsWith("sses") && !categorySSE_SSES.contains(s) && !s.endsWith("mousses")) return(stem=cut(s,"es"));
+
+ // -x to -xes
+ // No other common word ends with "xe(s)" except for "axe"
+ if(s.endsWith("xes") && !s.equals("axes")) return(stem=cut(s,"es"));
+
+ // -[nlw]ife to -[nlw]ives
+ //No other common word ends with "[nlw]ive(s)" except for olive
+ if(s.endsWith("nives") || s.endsWith("lives") && !s.endsWith("olives") ||
+ s.endsWith("wives")) return(stem=cut(s,"ves")+"fe");
+
+ // -[aeo]lf to -ves exceptions: valve, solve
+ // -[^d]eaf to -ves exceptions: heave, weave
+ // -arf to -ves no exception
+ if(s.endsWith("alves") && !s.endsWith("valves") ||
+ s.endsWith("olves") && !s.endsWith("solves") ||
+ s.endsWith("eaves") && !s.endsWith("heaves") && !s.endsWith("weaves") ||
+ s.endsWith("arves") ) return(stem=cut(s,"ves")+"f");
+
+ // -y to -ies
+ // -ies is very uncommon as a singular suffix
+ // but -ie is quite common, filter them out
+ if(s.endsWith("ies") && !categoryIE_IES.contains(s)) return(stem=cut(s,"ies")+"y");
+
+ // -o to -oes
+ // Some words end with -oe, so don't kill the "e"
+ if(s.endsWith("oes") && !categoryOE_OES.contains(s)) return(stem=cut(s,"es"));
+
+ // -s to -ses
+ // -z to -zes
+ // no words end with "-ses" or "-zes" in singular
+ if(s.endsWith("ses") || s.endsWith("zes") ) return(stem=cut(s,"es"));
+
+ // - to -s
+ if(s.endsWith("s") && !s.endsWith("ss") && !s.endsWith("is")) return(stem=cut(s,"s"));
+
+ return stem;
+ }
+
+ /** Words whose singular ends in "-se" (like "nurse" etc.), listed in their plural forms; stem only cuts the final "s" for these */
+ public static Set categorySE_SES=new FinalSet(
+ "nurses",
+ "cruises",
+ "premises",
+ "houses"
+ );
+
+ /** Words that do not have a distinct plural form (like "atlas" etc.)*/
+ public static Set category00=new FinalSet(
+ "alias",
+ "asbestos",
+ "atlas",
+ "barracks",
+ "bathos",
+ "bias",
+ "breeches",
+ "britches",
+ "canvas",
+ "chaos",
+ "clippers",
+ "contretemps",
+ "corps",
+ "cosmos",
+ "crossroads",
+ "diabetes",
+ "ethos",
+ "gallows",
+ "gas",
+ "graffiti",
+ "headquarters",
+ "herpes",
+ "high-jinks",
+ "innings",
+ "jackanapes",
+ "lens",
+ "means",
+ "measles",
+ "mews",
+ "mumps",
+ "news",
+ "pathos",
+ "pincers",
+ "pliers",
+ "proceedings",
+ "rabies",
+ "rhinoceros",
+ "sassafras",
+ "scissors",
+ "series",
+ "shears",
+ "species",
+ "tuna"
+ );
+
+ /** Words that change from "-um" to "-a" (like "curriculum" etc.), listed in their plural forms; stem replaces the final "a" by "um" for these */
+ public static Set categoryUM_A=new FinalSet(
+ "addenda",
+ "agenda",
+ "aquaria",
+ "bacteria",
+ "candelabra",
+ "compendia",
+ "consortia",
+ "crania",
+ "curricula",
+ "data",
+ "desiderata",
+ "dicta",
+ "emporia",
+ "encomia",
+ "errata",
+ "extrema",
+ "gymnasia",
+ "honoraria",
+ "interregna",
+ "lustra",
+ "maxima",
+ "media",
+ "memoranda",
+ "millennia",
+ "minima",
+ "momenta",
+ "optima",
+ "ova",
+ "phyla",
+ "quanta",
+ "rostra",
+ "spectra",
+ "specula",
+ "stadia",
+ "strata",
+ "symposia",
+ "trapezia",
+ "ultimata",
+ "vacua",
+ "vela"
+ );
+
+ /** Words that change from "-on" to "-a" (like "phenomenon" etc.), listed in their plural forms; stem replaces the final "a" by "on" for these*/
+ public static Set categoryON_A=new FinalSet(
+ "aphelia",
+ "asyndeta",
+ "automata",
+ "criteria",
+ "hyperbata",
+ "noumena",
+ "organa",
+ "perihelia",
+ "phenomena",
+ "prolegomena"
+ );
+
+ /** Words that change from "-o" to "-i" (like "libretto" etc.), listed in their plural forms; stem replaces the final "i" by "o" for these*/
+ public static Set categoryO_I=new FinalSet(
+ "alti",
+ "bassi",
+ "canti",
+ "contralti",
+ "crescendi",
+ "libretti",
+ "soli",
+ "soprani",
+ "tempi",
+ "virtuosi"
+ );
+
+ /** Words that change from "-us" to "-i" (like "fungus" etc.), listed in their plural forms*/
+ public static Set categoryUS_I=new FinalSet(
+ "alumni",
+ "bacilli",
+ "cacti",
+ "foci",
+ "fungi",
+ "genii",
+ "hippopotami",
+ "incubi",
+ "nimbi",
+ "nuclei",
+ "nucleoli",
+ "octopi",
+ "radii",
+ "stimuli",
+ "styli",
+ "succubi",
+ "syllabi",
+ "termini",
+ "tori",
+ "umbilici",
+ "uteri"
+ );
+
+ /** Words that change from "-ix" to "-ices" (like "appendix" etc.), listed in their plural forms; stem replaces the final "ices" by "ix" for these*/
+ public static Set categoryIX_ICES=new FinalSet(
+ "appendices",
+ "cervices"
+ );
+
+ /** Words that change from "-is" to "-es" (like "axis" etc.), listed in their plural forms*/
+ public static Set categoryIS_ES=new FinalSet(
+ // plus everybody ending in theses
+ "analyses",
+ "axes",
+ "bases",
+ "crises",
+ "diagnoses",
+ "ellipses",
+ "emphases",
+ "neuroses",
+ "oases",
+ "paralyses",
+ "synopses"
+ );
+
+ /** Words that change from "-oe" to "-oes" (like "toe" etc.), listed in their plural forms*/
+ public static Set categoryOE_OES=new FinalSet(
+ "aloes",
+ "backhoes",
+ "beroes",
+ "canoes",
+ "chigoes",
+ "cohoes",
+ "does",
+ "felloes",
+ "floes",
+ "foes",
+ "gumshoes",
+ "hammertoes",
+ "hoes",
+ "hoopoes",
+ "horseshoes",
+ "leucothoes",
+ "mahoes",
+ "mistletoes",
+ "oboes",
+ "overshoes",
+ "pahoehoes",
+ "pekoes",
+ "roes",
+ "shoes",
+ "sloes",
+ "snowshoes",
+ "throes",
+ "tic-tac-toes",
+ "tick-tack-toes",
+ "ticktacktoes",
+ "tiptoes",
+ "tit-tat-toes",
+ "toes",
+ "toetoes",
+ "tuckahoes",
+ "woes"
+ );
+
+ /** Words that change from "-ex" to "-ices" (like "index" etc.), listed in their plural forms*/
+ public static Set categoryEX_ICES=new FinalSet(
+ "apices",
+ "codices",
+ "cortices",
+ "indices",
+ "latices",
+ "murices",
+ "pontifices",
+ "silices",
+ "simplices",
+ "vertices",
+ "vortices"
+ );
+
+ /** Words that change from "-u" to "-us" (like "emu" etc.), listed in their plural forms*/
+ public static Set categoryU_US=new FinalSet(
+ "apercus",
+ "barbus",
+ "cornus",
+ "ecrus",
+ "emus",
+ "fondus",
+ "gnus",
+ "iglus",
+ "mus",
+ "nandus",
+ "napus",
+ "poilus",
+ "quipus",
+ "snafus",
+ "tabus",
+ "tamandus",
+ "tatus",
+ "timucus",
+ "tiramisus",
+ "tofus",
+ "tutus"
+ );
+
+ /** Words that change from "-sse" to "-sses" (like "finesse" etc.), listed in their plural forms*/
+ public static Set categorySSE_SSES=new FinalSet(
+ //plus those ending in mousse
+ "bouillabaisses",
+ "coulisses",
+ "crevasses",
+ "crosses",
+ "cuisses",
+ "demitasses",
+ "ecrevisses",
+ "fesses",
+ "finesses",
+ "fosses",
+ "impasses",
+ "lacrosses",
+ "largesses",
+ "masses",
+ "noblesses",
+ "palliasses",
+ "pelisses",
+ "politesses",
+ "posses",
+ "tasses",
+ "wrasses"
+ );
+
+ /** Words that change from "-che" to "-ches" (like "brioche" etc.), listed in their plural forms*/
+ public static Set categoryCHE_CHES=new FinalSet(
+ "adrenarches",
+ "attaches",
+ "avalanches",
+ "barouches",
+ "brioches",
+ "caches",
+ "caleches",
+ "caroches",
+ "cartouches",
+ "cliches",
+ "cloches",
+ "creches",
+ "demarches",
+ "douches",
+ "gouaches",
+ "guilloches",
+ "headaches",
+ "heartaches",
+ "huaraches",
+ "menarches",
+ "microfiches",
+ "moustaches",
+ "mustaches",
+ "niches",
+ "panaches",
+ "panoches",
+ "pastiches",
+ "penuches",
+ "pinches",
+ "postiches",
+ "psyches",
+ "quiches",
+ "schottisches",
+ "seiches",
+ "soutaches",
+ "synecdoches",
+ "thelarches",
+ "troches"
+ );
+
+ /** Words that end with "-ics" and do not exist as nouns without the 's' (like "aerobics" etc.)*/
+ public static Set categoryICS=new FinalSet(
+ "aerobatics",
+ "aerobics",
+ "aerodynamics",
+ "aeromechanics",
+ "aeronautics",
+ "alphanumerics",
+ "animatronics",
+ "apologetics",
+ "architectonics",
+ "astrodynamics",
+ "astronautics",
+ "astrophysics",
+ "athletics",
+ "atmospherics",
+ "autogenics",
+ "avionics",
+ "ballistics",
+ "bibliotics",
+ "bioethics",
+ "biometrics",
+ "bionics",
+ "bionomics",
+ "biophysics",
+ "biosystematics",
+ "cacogenics",
+ "calisthenics",
+ "callisthenics",
+ "catoptrics",
+ "civics",
+ "cladistics",
+ "cryogenics",
+ "cryonics",
+ "cryptanalytics",
+ "cybernetics",
+ "cytoarchitectonics",
+ "cytogenetics",
+ "diagnostics",
+ "dietetics",
+ "dramatics",
+ "dysgenics",
+ "econometrics",
+ "economics",
+ "electromagnetics",
+ "electronics",
+ "electrostatics",
+ "endodontics",
+ "enterics",
+ "ergonomics",
+ "eugenics",
+ "eurhythmics",
+ "eurythmics",
+ "exodontics",
+ "fibreoptics",
+ "futuristics",
+ "genetics",
+ "genomics",
+ "geographics",
+ "geophysics",
+ "geopolitics",
+ "geriatrics",
+ "glyptics",
+ "graphics",
+ "gymnastics",
+ "hermeneutics",
+ "histrionics",
+ "homiletics",
+ "hydraulics",
+ "hydrodynamics",
+ "hydrokinetics",
+ "hydroponics",
+ "hydrostatics",
+ "hygienics",
+ "informatics",
+ "kinematics",
+ "kinesthetics",
+ "kinetics",
+ "lexicostatistics",
+ "linguistics",
+ "lithoglyptics",
+ "liturgics",
+ "logistics",
+ "macrobiotics",
+ "macroeconomics",
+ "magnetics",
+ "magnetohydrodynamics",
+ "mathematics",
+ "metamathematics",
+ "metaphysics",
+ "microeconomics",
+ "microelectronics",
+ "mnemonics",
+ "morphophonemics",
+ "neuroethics",
+ "neurolinguistics",
+ "nucleonics",
+ "numismatics",
+ "obstetrics",
+ "onomastics",
+ "orthodontics",
+ "orthopaedics",
+ "orthopedics",
+ "orthoptics",
+ "paediatrics",
+ "patristics",
+ "patristics",
+ "pedagogics",
+ "pediatrics",
+ "periodontics",
+ "pharmaceutics",
+ "pharmacogenetics",
+ "pharmacokinetics",
+ "phonemics",
+ "phonetics",
+ "phonics",
+ "photomechanics",
+ "physiatrics",
+ "pneumatics",
+ "poetics",
+ "politics",
+ "pragmatics",
+ "prosthetics",
+ "prosthodontics",
+ "proteomics",
+ "proxemics",
+ "psycholinguistics",
+ "psychometrics",
+ "psychonomics",
+ "psychophysics",
+ "psychotherapeutics",
+ "robotics",
+ "semantics",
+ "semiotics",
+ "semitropics",
+ "sociolinguistics",
+ "stemmatics",
+ "strategics",
+ "subtropics",
+ "systematics",
+ "tectonics",
+ "telerobotics",
+ "therapeutics",
+ "thermionics",
+ "thermodynamics",
+ "thermostatics"
+ );
+
+ /** Words that change from "-ie" to "-ies" (like "auntie" etc.), listed in their plural forms; stem exempts these from its "-ies" to "-y" rule*/
+ public static Set categoryIE_IES=new FinalSet(
+ "aeries",
+ "anomies",
+ "aunties",
+ "baddies",
+ "beanies",
+ "birdies",
+ "boccies",
+ "bogies",
+ "bolshies",
+ "bombies",
+ "bonhomies",
+ "bonxies",
+ "booboisies",
+ "boogies",
+ "boogie-woogies",
+ "bookies",
+ "booties",
+ "bosies",
+ "bourgeoisies",
+ "brasseries",
+ "brassies",
+ "brownies",
+ "budgies",
+ "byrnies",
+ "caddies",
+ "calories",
+ "camaraderies",
+ "capercaillies",
+ "capercailzies",
+ "cassies",
+ "catties",
+ "causeries",
+ "charcuteries",
+ "chinoiseries",
+ "collies",
+ "commies",
+ "cookies",
+ "coolies",
+ "coonties",
+ "cooties",
+ "corries",
+ "coteries",
+ "cowpies",
+ "cowries",
+ "cozies",
+ "crappies",
+ "crossties",
+ "curies",
+ "dachsies",
+ "darkies",
+ "dassies",
+ "dearies",
+ "dickies",
+ "dies",
+ "dixies",
+ "doggies",
+ "dogies",
+ "dominies",
+ "dovekies",
+ "eyries",
+ "faeries",
+ "falsies",
+ "floozies",
+ "folies",
+ "foodies",
+ "freebies",
+ "gaucheries",
+ "gendarmeries",
+ "genies",
+ "ghillies",
+ "gillies",
+ "goalies",
+ "goonies",
+ "grannies",
+ "grotesqueries",
+ "groupies",
+ "hankies",
+ "hippies",
+ "hoagies",
+ "honkies",
+ "hymies",
+ "indies",
+ "junkies",
+ "kelpies",
+ "kilocalories",
+ "knobkerries",
+ "koppies",
+ "kylies",
+ "laddies",
+ "lassies",
+ "lies",
+ "lingeries",
+ "magpies",
+ "magpies",
+ "marqueteries",
+ "mashies",
+ "mealies",
+ "meanies",
+ "menageries",
+ "millicuries",
+ "mollies",
+ "moonies",
+ "moxies",
+ "neckties",
+ "newbies",
+ "nighties",
+ "nookies",
+ "oldies",
+ "organdies",
+ "panties",
+ "parqueteries",
+ "passementeries",
+ "patisseries",
+ "pies",
+ "pinkies",
+ "pixies",
+ "porkpies",
+ "potpies",
+ "prairies",
+ "preemies",
+ "premies",
+ "punkies",
+ "pyxies",
+ "quickies",
+ "ramies",
+ "reveries",
+ "rookies",
+ "rotisseries",
+ "scrapies",
+ "sharpies",
+ "smoothies",
+ "softies",
+ "stoolies",
+ "stymies",
+ "swaggies",
+ "sweeties",
+ "talkies",
+ "techies",
+ "ties",
+ "tooshies",
+ "toughies",
+ "townies",
+ "veggies",
+ "walkie-talkies",
+ "wedgies",
+ "weenies",
+ "weirdies",
+ "yardies",
+ "yuppies",
+ "zombies"
+ );
+
+ /** Maps irregular Germanic English plural nouns to their singular form; given as alternating plural, singular arguments and consulted first by stem */
+ public static Map<String,String> irregular=new FinalMap<String,String>(
+ "beefs","beef",
+ "beeves","beef",
+ "brethren","brother",
+ "busses","bus",
+ "cattle","cattlebeast",
+ "children","child",
+ "corpora","corpus",
+ "ephemerides","ephemeris",
+ "firemen","fireman",
+ "genera","genus",
+ "genies","genie",
+ "genii","genie",
+ "kine","cow",
+ "lice","louse",
+ "men","man",
+ "mice","mouse",
+ "mongooses","mongoose",
+ "monies","money",
+ "mythoi","mythos",
+ "octopodes","octopus",
+ "octopuses","octopus",
+ "oxen","ox",
+ "people","person",
+ "soliloquies","soliloquy",
+ "throes","throes",
+ "trilbys","trilby",
+ "women","woman"
+ );
+
+ /** Contains word forms that can either be plural or singular; consulted by isSingular and isSingularAndPlural */
+ public static Set singAndPlur=new FinalSet(
+ // lower-case forms only: isSingular and isSingularAndPlural look words up via toLowerCase()
+ "acoustics",
+ "aesthetics",
+ "aquatics",
+ "basics",
+ "ceramics",
+ "classics",
+ "cosmetics",
+ "dermatoglyphics",
+ "dialectics",
+ "dynamics",
+ "esthetics",
+ "ethics",
+ "harmonics",
+ "heroics",
+ "isometrics",
+ "mechanics",
+ "metrics",
+ "optics",
+ "people",
+ "physics",
+ "polemics",
+ "premises",
+ "propaedeutics",
+ "pyrotechnics",
+ "quadratics",
+ "quarters",
+ "statistics",
+ "tactics",
+ "tropics"
+ );
+
+ /** Test routine: reads words from standard input until an empty line or end of input, printing for each word whether it is plural and/or singular and its stem */
+ public static void main(String[] argv) throws Exception {
+ System.out.println("Enter an English word in plural form and press ENTER");
+ BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
+ while(true) {
+ String w=in.readLine();
+ if(w==null || w.length()==0) break; // readLine() returns null at end of input (e.g. piped stdin)
+ if(isPlural(w)) System.out.println("This word is plural");
+ if(isSingular(w)) System.out.println("This word is singular");
+ System.out.println("Stemmed to singular: "+stem(w));
+ }
+ }
+}
diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java
new file mode 100644
index 0000000..4c07240
--- /dev/null
+++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java
@@ -0,0 +1,288 @@
+package org.yago.javatools.parsers;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+/**
+ * This class is part of the Java Tools (see
+ * http://mpii.de/yago-naga/javatools). It is licensed under the Creative
+ * Commons Attribution License (see http://creativecommons.org/licenses/by/3.0)
+ * by the YAGO-NAGA team (see http://mpii.de/yago-naga).
+ *
+ * This class implements position change trackers that keep track of position
+ * changes within a String, e.g. caused through normalization etc.
+ * This allows for instance, given a position in the normalized string
+ * to get the corresponding position in the original non-normalized string
+ *
+ *
+ *
+ * backward position tracker -
+ * tracking several replacement/text changes allowing to trace a position in the modified
+ * text back to the corresp. position in the original text
+ * for the other direction see ForwardPositionTracker
+ *
+ * @author smetzger */
+public class PositionTracker {
+
+ // All maps below are keyed by Integer text positions and hold Integer offsets
+ private SortedMap<Integer,Integer> positionMap; // merged changes of closed runs; this is what translatePosition reads
+ private SortedMap<Integer,Integer> positionChanges; // changes of the current run: (pos+accumulatedModifier) -> negated modifier
+ private SortedMap<Integer,Integer> old2NewMap; // current run: pos -> modifier exactly as passed to addPositionChange
+ private int accumulatedModifier=0; // sum of all modifiers added in the current run
+
+ public PositionTracker(){
+ positionMap=new TreeMap<Integer,Integer>();
+ positionChanges=new TreeMap<Integer,Integer>();
+ old2NewMap=new TreeMap<Integer,Integer>();
+ }
+
+ /** Records for the current run that the text length changed by modifier at position pos; a modifier of 0 is ignored */
+ public void addPositionChange(int pos, int modifier){
+ if(modifier!=0){
+ int oldModifier=0;
+ old2NewMap.put(pos, modifier);
+ accumulatedModifier+=modifier;
+ if(positionChanges.containsKey(pos+accumulatedModifier))
+ oldModifier=positionChanges.get(pos+accumulatedModifier);
+ positionChanges.put(pos+accumulatedModifier,modifier*-1+oldModifier);
+ }
+ }
+
+
+
+ /** Closes the current changing run by merging new position changes into the existing position change map.
+ * After each round (one round=consecutive changes along the text) you need to call closeRun() before submitting more position changes from a new round,
+ * i.e. whenever you passed the string to be modified once call closeRun() before starting to run over the string again with more replacements.
+ * Do this every time you ran once over the text making changes to be tracked. Does nothing if no change was added in the current run.*/
+ public void closeRun() {
+ if(positionChanges.isEmpty())
+ return;
+
+
+ SortedMap<Integer,Integer> temp=positionChanges;
+
+ //adapt old positions to new mapping
+ while(!positionMap.isEmpty()){
+ Integer key=positionMap.firstKey();
+ Collection<Integer> modifiers=old2NewMap.headMap(key+1).values();
+ Integer newposition=key;
+ for(Iterator<Integer> it=modifiers.iterator(); it.hasNext(); newposition+=it.next()){}
+ Integer value=positionMap.get(key);
+ if(positionChanges.containsKey(newposition))
+ value+=positionChanges.get(newposition);
+ positionChanges.put(newposition, value);
+ positionMap.remove(key);
+ }
+
+ positionChanges=positionMap; // positionMap was emptied by the loop above and is reused for the next run
+ positionMap=temp;
+ old2NewMap.clear();
+ accumulatedModifier=0;
+ return;
+ }
+
+
+
+
+ /** Merges new position changes (given with the inversed old2new mapping) into the existing position change map*/
+/* private void addPositionMappings(SortedMap newPosChanges,
+ SortedMap old2NewMap) {
+
+
+ TreeMap newMap=new TreeMap();
+
+ //adapt old positions to new mapping
+ while(!positionMap.isEmpty()){
+ Integer key=positionMap.firstKey();
+ Collection modifiers=old2NewMap.headMap(key+1).values();
+ Integer newposition=key;
+ for(Iterator it=modifiers.iterator(); it.hasNext(); newposition+=it.next()){}
+ Integer value=positionMap.get(key);
+ if(newMap.containsKey(newposition))
+ value+=newMap.get(newposition);
+ newMap.put(newposition, value);
+ positionMap.remove(key);
+ }
+ while(!newPosChanges.isEmpty()){
+ Integer key=newPosChanges.firstKey();
+ Integer value=newPosChanges.get(key);
+ if(newMap.containsKey(key))
+ value+=newMap.get(key);
+ newMap.put(key, value);
+ newPosChanges.remove(key);
+ }
+ positionMap=newMap;
+ old2NewMap.clear();
+ return;
+ }
+ */
+ /** Translates pos by adding the offsets of all changes from closed runs whose key is less than or equal to pos; changes of a run not yet closed are not considered */
+ public Integer translatePosition(Integer pos) {
+ SortedMap<Integer,Integer> headMap=positionMap.headMap(pos+1);
+ Integer modifier=0;
+ for(Iterator<Integer> it=headMap.values().iterator(); it.hasNext(); modifier+=it.next()){}
+/* if(headMap.size()>1){ TODO: Possible Optimization if we assume positions are asked in ascending order
+ headMap.clear();
+ posMap.put(pos, modifier);
+ }*/
+ return pos+modifier;
+ }
+
+
+
+
+
+
+
+
+
+
+
+ /** forward position change tracking - keeping track of several rounds of text modifications allowing to trace a position in the original
+ * text along the modifications to the corresp. position in the modified text
+ * after each round (one round=consecutive changes along the text) you need to call closeRun() before submitting more position changes from a new round,
+ * i.e. whenever you passed the string to be modified once call closeRun() before starting to run over the string again with more replacements
+ * REMARK: NOT TESTED WITH MORE THAN ONE ROUND! may be ERRONEOUS with multiple rounds -> use with care (works with a single round though)
+ * @author smetzger
+ *
+ */
+ public static class ForwardPositionTracker {
+
+
+ private SortedMap<Integer,Integer> positionMap; // merged changes of closed runs, keyed by the position new2OldTracker reports for them
+ private SortedMap<Integer,Integer> positionChanges; // current run: pos -> modifier
+ //private SortedMap<Integer,Integer> new2OldMap;
+ private PositionTracker new2OldTracker=null; // backward tracker fed with the same changes
+ private int accumulatedModifier=0;
+
+ public ForwardPositionTracker(){
+ positionMap=new TreeMap<Integer,Integer>();
+ positionChanges=new TreeMap<Integer,Integer>();
+ // new2OldMap=new TreeMap();
+ new2OldTracker=new PositionTracker();
+
+ }
+
+ /** Records for the current run that the text length changed by modifier at position pos and forwards the change to the backward tracker; a modifier of 0 is ignored */
+ public void addPositionChange(int pos, int modifier){
+ if(modifier!=0){
+ positionChanges.put(pos,modifier);
+ accumulatedModifier+=modifier;
+ /*if(new2OldMap.containsKey(pos+accumulatedModifier))
+ oldModifier=new2OldMap.get(pos+accumulatedModifier);
+ new2OldMap.put(pos+accumulatedModifier, -1*modifier+oldModifier);
+ } */
+ new2OldTracker.addPositionChange(pos, modifier);
+ }
+ }
+
+
+
+
+
+
+
+ /** Closes the current changing run by merging new position changes into the existing position change map.
+ * Do this every time you ran once over the text making changes to be tracked. Does nothing if no change was added in the current run.*/
+ public void closeRun() {
+ if(positionChanges.isEmpty())
+ return;
+
+
+ for(Map.Entry<Integer,Integer> change:positionChanges.entrySet()){
+ Integer positionInOrigStream=new2OldTracker.translatePosition(change.getKey());
+ if(positionMap.containsKey(positionInOrigStream))
+ positionMap.put(positionInOrigStream, change.getValue()+positionMap.get(positionInOrigStream));
+ else
+ positionMap.put(positionInOrigStream, change.getValue());
+ }
+
+ positionChanges.clear();
+ accumulatedModifier=0;
+ new2OldTracker.closeRun();
+
+ return;
+ }
+
+
+
+ /** tells whether a position in the original stream has been cut away by some change operation,
+ * such that translating it usually would not make much sense
+ * @return true, iff the given position has been cut away, false otherwise (i.e. false if it should be mappable)
+ * TODO: current version ONLY WORKS SECURELY WHEN THERE IS ONLY ONE POSITION CHANGE RUN WITHOUT OVERLAPPING CHANGES!
+ * as soon as there are more than one change runs, or changes that overlap, we would need to check all following changes instead of only the next one */
+ public boolean hasBeenCutAway(Integer pos){
+ SortedMap<Integer,Integer> tailMap=positionMap.tailMap(pos+1);
+ if(tailMap.isEmpty())
+ return false;
+ Integer key=tailMap.firstKey();
+ Integer modifier=tailMap.get(key);
+ if(modifier<0 && key+modifier<=pos )
+ return true;
+ else
+ return false;
+ /* this does not work for the general case (had it the wrong way around), but can be used to implement it
+ Integer key=null;
+ Iterator it=tailMap.keySet().iterator();
+ while(it.hasNext()){
+ key=it.next();
+ Integer mod=tailMap.get(key);
+ if(mod<0 && key-mod>=pos)
+ return true;
+ }
+ return false;*/
+ }
+ /** Translates pos by adding the offsets of all changes from closed runs whose key is less than or equal to pos */
+ public Integer translatePosition(Integer pos) {
+ SortedMap<Integer,Integer> headMap=positionMap.headMap(pos+1);
+ Integer modifier=0;
+ for(Iterator<Integer> it=headMap.values().iterator(); it.hasNext(); modifier+=it.next()){}
+ /* if(headMap.size()>1){ Optimization if we assume positions are asked in ascending order
+ headMap.clear();
+ posMap.put(pos, modifier);
+ }*/
+ return pos+modifier;
+ }
+
+ /** also handles positions inside text parts that have been cut out properly
+ *
+ * TODO: current version ONLY WORKS SECURELY WHEN THERE IS ONLY ONE POSITION CHANGE RUN WITHOUT OVERLAPPING CHANGES!
+ * as soon as there are more than one change runs, or changes that overlap, we would need to check all following changes instead of only the next one */
+ public Integer translatePositionExactly(Integer pos) {
+
+ SortedMap<Integer,Integer> tailMap=positionMap.tailMap(pos+1);
+ if(tailMap.isEmpty())
+ return translatePosition(pos);
+ else{
+ Integer key=tailMap.firstKey();
+ Integer modifier=tailMap.get(key);
+ return translatePosition(Math.min(pos,key+modifier));
+ }
+
+/*
+ * That version does it the wrong way around
+ * SortedMap headMap=positionMap.headMap(pos+1);
+ Integer modifier=0;
+ Integer key=null, value=null;
+ Iterator it=headMap.keySet().iterator();
+ while(it.hasNext()){
+ key=it.next();
+ value=headMap.get(key);
+ if(value<0)
+ modifier+=Math.max(key-pos, value);
+ }*/
+ /* if(headMap.size()>1){ Optimization if we assume positions are asked in ascending order
+ headMap.clear();
+ posMap.put(pos, modifier);
+ }
+ return pos+modifier; */
+
+ }
+
+ }
+
+
+}
diff --git a/WikipediaCategoryProcessor/src/test/java/org/karsha/wikipediacategoryprocessor/AppTest.java b/WikipediaCategoryProcessor/src/test/java/org/karsha/wikipediacategoryprocessor/AppTest.java
new file mode 100644
index 0000000..02a30d8
--- /dev/null
+++ b/WikipediaCategoryProcessor/src/test/java/org/karsha/wikipediacategoryprocessor/AppTest.java
@@ -0,0 +1,38 @@
+package org.karsha.wikipediacategoryprocessor;
+
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+
+/**
+ * Unit test for simple App. Placeholder JUnit 3 style test case: its only test asserts true.
+ */
+public class AppTest
+ extends TestCase
+{
+ /**
+ * Create the test case
+ *
+ * @param testName name of the test case
+ */
+ public AppTest( String testName )
+ {
+ super( testName );
+ }
+
+ /**
+ * @return the suite of tests being tested
+ */
+ public static Test suite()
+ {
+ return new TestSuite( AppTest.class );
+ }
+
+ /**
+ * Rigorous Test :-) -- placeholder that always passes
+ */
+ public void testApp()
+ {
+ assertTrue( true );
+ }
+}