From 232bb04d9287ebddce460fac3eba13b42e1a3bfa Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 22 Mar 2020 17:01:12 +0100 Subject: [PATCH 01/10] Remove obsolete comment and add full link to issue --- src/main/java/org/jabref/model/cleanup/CleanupJob.java | 5 ----- .../logic/layout/format/LatexToUnicodeFormatterTest.java | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/model/cleanup/CleanupJob.java b/src/main/java/org/jabref/model/cleanup/CleanupJob.java index 7c88cf6566c..b538f78b968 100644 --- a/src/main/java/org/jabref/model/cleanup/CleanupJob.java +++ b/src/main/java/org/jabref/model/cleanup/CleanupJob.java @@ -7,10 +7,5 @@ @FunctionalInterface public interface CleanupJob { - - /** - * Cleanup the entry. - */ List cleanup(BibEntry entry); - } diff --git a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java index f48963ff923..ca03f872ecd 100644 --- a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java +++ b/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java @@ -27,7 +27,7 @@ void preserveUnknownCommand() { @Test void testFormatTextit() { - // See #1464 + // See https://github.com/JabRef/jabref/pull/1464 assertEquals("\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61", formatter.format("\\textit{text}")); } From 5be8b513e394cb1e77504182546522baa2415072 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 22 Mar 2020 17:17:51 +0100 Subject: [PATCH 02/10] WIP --- build.gradle | 12 --------- .../format/LatexToUnicodeFormatter.java | 21 ++++++++++++--- .../model/strings/LatexToUnicodeAdapter.java | 27 ------------------- 3 files changed, 18 insertions(+), 42 deletions(-) delete mode 100644 src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java diff --git a/build.gradle b/build.gradle index b3f1871f7ec..51f4b40bf1b 100644 --- a/build.gradle +++ 
b/build.gradle @@ -54,14 +54,6 @@ application { mainClassName = "$moduleName/org.jabref.JabRefLauncher" } -// TODO: Ugly workaround to temporarily ignore build errors to dependencies of latex2unicode -// These should be removed, as well as the files in the lib folder, as soon as they have valid module names -patchModules.config = [ - "test=fastparse_2.12-1.0.0.jar", - "test2=fastparse-utils_2.12-1.0.0.jar", - "test3=sourcecode_2.12-0.1.4.jar" -] - // These are the Java version requirements we will check on each start of JabRef ext.minRequiredJavaVersion = "1.8.0_171" ext.allowJava9 = true @@ -186,10 +178,6 @@ dependencies { implementation group: 'jakarta.xml.bind', name: 'jakarta.xml.bind-api', version: '2.3.2' implementation group: 'org.glassfish.jaxb', name: 'jaxb-runtime', version: '2.3.2' - implementation ('com.github.tomtung:latex2unicode_2.12:0.2.6') { - exclude module: 'fastparse_2.12' - } - implementation group: 'com.microsoft.azure', name: 'applicationinsights-core', version: '2.4.1' implementation (group: 'com.microsoft.azure', name: 'applicationinsights-logging-log4j2', version: '2.4.1') { exclude module: "log4j-core" diff --git a/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java b/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java index f16a5dd4c54..32bc2b26a52 100644 --- a/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java +++ b/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java @@ -1,9 +1,12 @@ package org.jabref.logic.layout.format; +import java.util.Map; +import java.util.Objects; + import org.jabref.logic.l10n.Localization; import org.jabref.logic.layout.LayoutFormatter; +import org.jabref.logic.util.strings.HTMLUnicodeConversionMaps; import org.jabref.model.cleanup.Formatter; -import org.jabref.model.strings.LatexToUnicodeAdapter; /** * This formatter converts LaTeX character sequences their equivalent unicode characters, @@ -22,8 +25,20 @@ public String 
getKey() { } @Override - public String format(String inField) { - return LatexToUnicodeAdapter.format(inField); + public String format(String text) { + String result = Objects.requireNonNull(text); + + if (result.isEmpty()) { + return result; + } + + // Standard symbols + for (Map.Entry unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP + .entrySet()) { + result = result.replace(unicodeLatexPair.getValue(), unicodeLatexPair.getKey()); + } + + return result; } @Override diff --git a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java b/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java deleted file mode 100644 index c1eb8dc4248..00000000000 --- a/src/main/java/org/jabref/model/strings/LatexToUnicodeAdapter.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.jabref.model.strings; - -import java.text.Normalizer; -import java.util.Objects; -import java.util.regex.Pattern; - -import com.github.tomtung.latex2unicode.LaTeX2Unicode; - -/** - * Adapter class for the latex2unicode lib. 
This is an alternative to our LatexToUnicode class - */ -public class LatexToUnicodeAdapter { - - private static Pattern underscoreMatcher = Pattern.compile("_(?!\\{)"); - - private static String replacementChar = "\uFFFD"; - - private static Pattern underscorePlaceholderMatcher = Pattern.compile(replacementChar); - - public static String format(String inField) { - Objects.requireNonNull(inField); - - String toFormat = underscoreMatcher.matcher(inField).replaceAll(replacementChar); - toFormat = Normalizer.normalize(LaTeX2Unicode.convert(toFormat), Normalizer.Form.NFC); - return underscorePlaceholderMatcher.matcher(toFormat).replaceAll("_"); - } -} From ee5722c03907d9978ef3543886b3f705e64b01fd Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 22 Mar 2020 18:58:00 +0100 Subject: [PATCH 03/10] WIP: Fix UnicodeToLaTeX to handle "dot below" --- .../bibtexfields/UnicodeToLatexFormatter.java | 1 - .../strings/HTMLUnicodeConversionMaps.java | 104 ++++++++++++++++-- .../UnicodeToLatexFormatterTest.java | 11 ++ 3 files changed, 108 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java index 41147892971..7d147b974f8 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java @@ -18,7 +18,6 @@ public class UnicodeToLatexFormatter extends Formatter implements LayoutFormatte @Override public String format(String text) { String result = Objects.requireNonNull(text); - if (result.isEmpty()) { return result; } diff --git a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java index ac4b78c0857..e13ecc9ae15 100644 --- a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java +++ 
b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java @@ -22,12 +22,33 @@ public class HTMLUnicodeConversionMaps { conforming SGML systems and applications as defined in ISO 8879, provided this notice is included in all copies. */ - // as well as http://www.w3.org/Math/characters/unicode.xml - // An array of arrays of strings in the format: - // {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"} - // Leaving a field empty is OK as it then will not be included - private static final String[][] CONVERSION_LIST = new String[][] {{"160", "nbsp", "{~}"}, // no-break space = non-breaking space, + + /** + * We need to have a lookup table, because the unicode table does not follow an easy scheme. + * For instance, there is no a with a lower dot, but a b. + * See https://www.utf8-chartable.de/unicode-utf8-table.pl + * + * An array of arrays of strings in the format: + * {"decimal number of HTML entity", "text HTML entity", "corresponding LaTeX command"} + * Leaving a field empty is OK as it then will not be included. + * + * The aim of this format is easy addition of data by a developer. + * It is not possible to create a multi-dimensional array of different content types. + * When creating an enum (e.g., Inverted_Exclamation_Mark(161, "iexcl", "{\\textexclamdown}");), one needs to assign a name to each entry. This is unnecessary overhead. + * + * We need to keep the triple together as HTML encoding closely relates to LaTeX encoding - and we want to support LaTeX to HTML as well as LaTeX to Unicode. + * + * Mappings from unicode to latex, unicode to HTML, HTML to unicode, ... are generated based on these entries. 
+ * + * Helper scripts to generate entries: + * + * - copy table from https://www.utf8-chartable.de/unicode-utf8-table.pl to input.txt + * - grep "DOT BELOW" input.txt > input-dot-below.txt + * - sed "s#..\(....\)..............\(LATIN SMALL LETTER \(.\).*\)#{\"0x\1\", \"\", \"\\\\\\\\d{\L\3}\"}, // \2#" input-dot-below.txt | grep { + */ + private static final String[][] CONVERSION_LIST = new String[][] { + {"160", "nbsp", "{~}"}, // no-break space = non-breaking space, // U+00A0 ISOnum {"161", "iexcl", "{\\textexclamdown}"}, // inverted exclamation mark, U+00A1 ISOnum {"162", "cent", "{\\textcent}"}, // cent sign, U+00A2 ISOnum @@ -293,6 +314,70 @@ public class HTMLUnicodeConversionMaps { // U+03D2 NEW {"982", "piv", "$\\varphi$"}, // greek pi symbol, U+03D6 ISOgrk3 + // Dot Below + {"7717", "", "\\d{h}"}, // ḥ, https://unicode-table.com/de/1E25/ + {"7751", "", "\\d{n}"}, // ṇ, https://unicode-table.com/de/1E47/ + + {"0x1E05", "", "\\d{b}"}, // latin small letter b with dot below + {"0x1E0D", "", "\\d{d}"}, // latin small letter d with dot below + {"0x1E25", "", "\\d{h}"}, // latin small letter h with dot below + {"0x1E33", "", "\\d{k}"}, // latin small letter k with dot below + {"0x1E37", "", "\\d{l}"}, // latin small letter l with dot below + {"0x1E39", "", "\\d{l}"}, // latin small letter l with dot below and macron + {"0x1E43", "", "\\d{m}"}, // latin small letter m with dot below + {"0x1E47", "", "\\d{n}"}, // latin small letter n with dot below + {"0x1E5B", "", "\\d{r}"}, // latin small letter r with dot below + {"0x1E5D", "", "\\d{r}"}, // latin small letter r with dot below and macron + {"0x1E63", "", "\\d{s}"}, // latin small letter s with dot below + {"0x1E69", "", "\\d{s}"}, // latin small letter s with dot below and dot above + {"0x1E6D", "", "\\d{t}"}, // latin small letter t with dot below + {"0x1E7F", "", "\\d{v}"}, // latin small letter v with dot below + {"0x1E89", "", "\\d{w}"}, // latin small letter w with dot below + {"0x1E93", "", "\\d{z}"}, 
// latin small letter z with dot below + {"0x1EA1", "", "\\d{a}"}, // latin small letter a with dot below + {"0x1EAD", "", "\\d{a}"}, // latin small letter a with circumflex and dot below + {"0x1EB7", "", "\\d{a}"}, // latin small letter a with breve and dot below + {"0x1EB9", "", "\\d{e}"}, // latin small letter e with dot below + {"0x1EC7", "", "\\d{e}"}, // latin small letter e with circumflex and dot below + {"0x1ECB", "", "\\d{i}"}, // latin small letter i with dot below + {"0x1ECD", "", "\\d{o}"}, // latin small letter o with dot below + {"0x1ED9", "", "\\d{o}"}, // latin small letter o with circumflex and dot below + {"0x1EE3", "", "\\d{o}"}, // latin small letter o with horn and dot below + {"0x1EE5", "", "\\d{u}"}, // latin small letter u with dot below + {"0x1EF1", "", "\\d{u}"}, // latin small letter u with horn and dot below + {"0x1EF5", "", "\\d{y}"}, // latin small letter y with dot below + + // TODO macrons and dots above --> special cases + + {"0x1E04", "", "\\d{B}"}, // LATIN CAPITAL LETTER B WITH DOT BELOW + {"0x1E0C", "", "\\d{D}"}, // LATIN CAPITAL LETTER D WITH DOT BELOW + {"0x1E24", "", "\\d{H}"}, // LATIN CAPITAL LETTER H WITH DOT BELOW + {"0x1E32", "", "\\d{K}"}, // LATIN CAPITAL LETTER K WITH DOT BELOW + {"0x1E36", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW + {"0x1E38", "", "\\d{L}"}, // LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON + {"0x1E42", "", "\\d{M}"}, // LATIN CAPITAL LETTER M WITH DOT BELOW + {"0x1E46", "", "\\d{N}"}, // LATIN CAPITAL LETTER N WITH DOT BELOW + {"0x1E5A", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW + {"0x1E5C", "", "\\d{R}"}, // LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON + {"0x1E62", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW + {"0x1E68", "", "\\d{S}"}, // LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE + {"0x1E6C", "", "\\d{T}"}, // LATIN CAPITAL LETTER T WITH DOT BELOW + {"0x1E7E", "", "\\d{V}"}, // LATIN CAPITAL LETTER V WITH DOT BELOW + {"0x1E88", "", 
"\\d{W}"}, // LATIN CAPITAL LETTER W WITH DOT BELOW + {"0x1E92", "", "\\d{Z}"}, // LATIN CAPITAL LETTER Z WITH DOT BELOW + {"0x1EA0", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH DOT BELOW + {"0x1EAC", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW + {"0x1EB6", "", "\\d{A}"}, // LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW + {"0x1EB8", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH DOT BELOW + {"0x1EC6", "", "\\d{E}"}, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW + {"0x1ECA", "", "\\d{I}"}, // LATIN CAPITAL LETTER I WITH DOT BELOW + {"0x1ECC", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH DOT BELOW + {"0x1ED8", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW + {"0x1EE2", "", "\\d{O}"}, // LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW + {"0x1EE4", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH DOT BELOW + {"0x1EF0", "", "\\d{U}"}, // LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW + {"0x1EF4", "", "\\d{Y}"}, // LATIN CAPITAL LETTER Y WITH DOT BELOW + /* General Punctuation */ {"8211", "ndash", "$\\textendash$"}, {"8212", "mdash", "$\\textemdash$"}, @@ -765,8 +850,13 @@ public class HTMLUnicodeConversionMaps { }; - // List of combining accents - private static final String[][] ACCENT_LIST = new String[][] {{"768", "`"}, // Grave + /** + * List of combining accents + * + * See https://de.wikibooks.org/wiki/LaTeX/_Akzente_und_Sonderzeichen for the LaTeX commands + */ + private static final String[][] ACCENT_LIST = new String[][] { + {"768", "`"}, // Grave {"769", "'"}, // Acute {"770", "^"}, // Circumflex {"771", "~"}, // Tilde diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java index 5b87c87661d..cb1850222ea 100644 --- a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java +++ 
b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java @@ -19,13 +19,24 @@ void formatWithoutUnicodeCharactersReturnsSameString() { assertEquals("abc", formatter.format("abc")); } + @Test + void formatOfMacronAIsCorrect() { + assertEquals("{\\={a}}", formatter.format("ā")); + } + @Test void formatMultipleUnicodeCharacters() { assertEquals("{{\\aa}}{\\\"{a}}{\\\"{o}}", formatter.format("\u00E5\u00E4\u00F6")); } + @Test + void testSanskrit() { + assertEquals("Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}ṭh{\\={a}}dhi-kṛtaiḥ pr{\\={a}}-ka{{\\'{s}}}yaṃ n{\\i{\\={}}}taḥ", formatter.format("Pu\\d{n}ya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ")); + } + @Test void formatExample() { assertEquals("M{\\\"{o}}nch", formatter.format(formatter.getExampleInput())); } + } From 417f05e12ebd6f80953374f6660f15672aa1c45f Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sun, 22 Mar 2020 18:58:57 +0100 Subject: [PATCH 04/10] WIP: replace latex2unicode lib --- src/main/java/org/jabref/gui/texparser/CitationsDisplay.java | 5 +++-- src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java | 4 ++-- src/main/java/org/jabref/model/entry/BibEntry.java | 5 +++-- .../logic/layout/format/LatexToUnicodeFormatterTest.java | 5 +++++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java b/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java index 8e03598787f..e591a34afea 100644 --- a/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java +++ b/src/main/java/org/jabref/gui/texparser/CitationsDisplay.java @@ -18,7 +18,6 @@ import org.jabref.gui.icon.IconTheme; import org.jabref.gui.util.ViewModelListCellFactory; -import org.jabref.model.strings.LatexToUnicodeAdapter; import org.jabref.model.texparser.Citation; public class CitationsDisplay extends ListView { @@ -44,7 +43,9 @@ private Node getDisplayGraphic(Citation item) { } Node citationIcon = 
IconTheme.JabRefIcons.LATEX_COMMENT.getGraphicNode(); - Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext())); + // FIXME + Text contextText = null; + // Text contextText = new Text(LatexToUnicodeAdapter.format(item.getContext())); contextText.wrappingWidthProperty().bind(this.widthProperty().subtract(85)); HBox contextBox = new HBox(8, citationIcon, contextText); contextBox.getStyleClass().add("contextBox"); diff --git a/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java b/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java index 69b5d1395b4..123c295b862 100644 --- a/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java +++ b/src/main/java/org/jabref/logic/citationstyle/CSLAdapter.java @@ -11,7 +11,6 @@ import org.jabref.model.entry.Month; import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; -import org.jabref.model.strings.LatexToUnicodeAdapter; import de.undercouch.citeproc.CSL; import de.undercouch.citeproc.DefaultAbbreviationProvider; @@ -98,7 +97,8 @@ private static CSLItemData bibEntryToCSLItemData(BibEntry bibEntry) { for (Field key : bibEntry.getFieldMap().keySet()) { bibEntry.getField(key) .map(removeNewlinesFormatter::format) - .map(LatexToUnicodeAdapter::format) + // FIXME + // .map(LatexToUnicodeAdapter::format) .ifPresent(value -> { if (StandardField.MONTH.equals(key)) { // Change month from #mon# to mon because CSL does not support the former format diff --git a/src/main/java/org/jabref/model/entry/BibEntry.java b/src/main/java/org/jabref/model/entry/BibEntry.java index d4c69cdfcdd..6d81c9cdd33 100644 --- a/src/main/java/org/jabref/model/entry/BibEntry.java +++ b/src/main/java/org/jabref/model/entry/BibEntry.java @@ -34,7 +34,6 @@ import org.jabref.model.entry.types.EntryType; import org.jabref.model.entry.types.IEEETranEntryType; import org.jabref.model.entry.types.StandardEntryType; -import org.jabref.model.strings.LatexToUnicodeAdapter; import 
org.jabref.model.strings.StringUtil; import org.jabref.model.util.MultiKeyMap; @@ -870,7 +869,9 @@ public Optional getLatexFreeField(Field field) { } else { Optional fieldValue = getField(field); if (fieldValue.isPresent()) { - String latexFreeField = LatexToUnicodeAdapter.format(fieldValue.get()).intern(); + // FIXME + // String latexFreeField = LatexToUnicodeAdapter.format(fieldValue.get()).intern(); + String latexFreeField = fieldValue.get(); latexFreeFields.put(field, latexFreeField); return Optional.of(latexFreeField); } else { diff --git a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java index ca03f872ecd..3b4a0d84082 100644 --- a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java +++ b/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java @@ -187,4 +187,9 @@ void testConversionOfOrdinal4th() { void testConversionOfOrdinal9th() { assertEquals("9ᵗʰ", formatter.format("9\\textsuperscript{th}")); } + + @Test + void testSanskrit() { + assertEquals("Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ", formatter.format("Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}")); + } } From 77d7d2d78af17e63d0bfec16fc5f6c2189d4f076 Mon Sep 17 00:00:00 2001 From: Carl Christian Snethlage Date: Tue, 19 May 2020 15:09:39 +0200 Subject: [PATCH 05/10] Fixed merge error --- src/main/java/module-info.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java index acb5dd6e5ef..af5e04da951 100644 --- a/src/main/java/module-info.java +++ b/src/main/java/module-info.java @@ -56,7 +56,6 @@ requires org.apache.pdfbox; requires reactfx; requires commons.cli; - requires com.github.tomtung.latex2unicode; requires jbibtex; requires citeproc.java; requires antlr.runtime; From 
406e03bad5bdb271a03e5a9936b257a9e4a68c2c Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 13 May 2023 00:26:44 +0200 Subject: [PATCH 06/10] Use withField --- .../logic/cleanup/CleanupWorkerTest.java | 90 ++++++++----------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/src/test/java/org/jabref/logic/cleanup/CleanupWorkerTest.java b/src/test/java/org/jabref/logic/cleanup/CleanupWorkerTest.java index 9f75580446b..4659008095f 100644 --- a/src/test/java/org/jabref/logic/cleanup/CleanupWorkerTest.java +++ b/src/test/java/org/jabref/logic/cleanup/CleanupWorkerTest.java @@ -85,20 +85,20 @@ void cleanupNullEntryThrowsException() { @Test void cleanupDoesNothingByDefault(@TempDir Path bibFolder) throws IOException { - BibEntry entry = new BibEntry(); - entry.setCitationKey("Toot"); - entry.setField(StandardField.PDF, "aPdfFile"); - entry.setField(new UnknownField("some"), "1st"); - entry.setField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); - entry.setField(StandardField.MONTH, "01"); - entry.setField(StandardField.PAGES, "1-2"); - entry.setField(StandardField.DATE, "01/1999"); - entry.setField(StandardField.PDF, "aPdfFile"); - entry.setField(StandardField.ISSN, "aPsFile"); - entry.setField(StandardField.FILE, "link::"); - entry.setField(StandardField.JOURNAL, "test"); - entry.setField(StandardField.TITLE, "hallo units 1 A case AlGaAs and latex $\\alpha$$\\beta$"); - entry.setField(StandardField.ABSTRACT, "Réflexions"); + BibEntry entry = new BibEntry() + .withCitationKey("Toot") + .withField(StandardField.PDF, "aPdfFile") + .withField(new UnknownField("some"), "1st") + .withField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3") + .withField(StandardField.MONTH, "01") + .withField(StandardField.PAGES, "1-2") + .withField(StandardField.DATE, "01/1999") + .withField(StandardField.PDF, "aPdfFile") + .withField(StandardField.ISSN, "aPsFile") + .withField(StandardField.FILE, "link::") + 
.withField(StandardField.JOURNAL, "test") + .withField(StandardField.TITLE, "hallo units 1 A case AlGaAs and latex $\\alpha$$\\beta$") + .withField(StandardField.ABSTRACT, "Réflexions"); Path path = bibFolder.resolve("ARandomlyNamedFile"); Files.createFile(path); LinkedFile fileField = new LinkedFile("", path.toAbsolutePath(), ""); @@ -111,8 +111,7 @@ void cleanupDoesNothingByDefault(@TempDir Path bibFolder) throws IOException { @Test void upgradeExternalLinksMoveFromPdfToFile() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_UPGRADE_EXTERNAL_LINKS); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.PDF, "aPdfFile"); + BibEntry entry = new BibEntry().withField(StandardField.PDF, "aPdfFile"); worker.cleanup(preset, entry); assertEquals(Optional.empty(), entry.getField(StandardField.PDF)); @@ -122,8 +121,7 @@ void upgradeExternalLinksMoveFromPdfToFile() { @Test void upgradeExternalLinksMoveFromPsToFile() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_UPGRADE_EXTERNAL_LINKS); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.PS, "aPsFile"); + BibEntry entry = new BibEntry().withField(StandardField.PS, "aPsFile"); worker.cleanup(preset, entry); assertEquals(Optional.empty(), entry.getField(StandardField.PDF)); @@ -133,8 +131,7 @@ void upgradeExternalLinksMoveFromPsToFile() { @Test void cleanupDoiRemovesLeadingHttp() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_DOI); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); + BibEntry entry = new BibEntry().withField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); worker.cleanup(preset, entry); assertEquals(Optional.of("10.1016/0001-8708(80)90035-3"), entry.getField(StandardField.DOI)); @@ -143,8 +140,7 @@ void cleanupDoiRemovesLeadingHttp() { @Test void 
cleanupDoiReturnsChanges() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_DOI); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); + BibEntry entry = new BibEntry().withField(StandardField.DOI, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); List changes = worker.cleanup(preset, entry); @@ -155,8 +151,7 @@ void cleanupDoiReturnsChanges() { @Test void cleanupDoiFindsDoiInURLFieldAndMoveItToDOIField() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_DOI); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.URL, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); + BibEntry entry = new BibEntry().withField(StandardField.URL, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); worker.cleanup(preset, entry); assertEquals(Optional.of("10.1016/0001-8708(80)90035-3"), entry.getField(StandardField.DOI)); @@ -166,8 +161,7 @@ void cleanupDoiFindsDoiInURLFieldAndMoveItToDOIField() { @Test void cleanupDoiReturnsChangeWhenDoiInURLField() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CLEAN_UP_DOI); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.URL, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); + BibEntry entry = new BibEntry().withField(StandardField.URL, "http://dx.doi.org/10.1016/0001-8708(80)90035-3"); List changes = worker.cleanup(preset, entry); List changeList = new ArrayList<>(); @@ -180,8 +174,7 @@ void cleanupDoiReturnsChangeWhenDoiInURLField() { void cleanupMonthChangesNumberToBibtex() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.MONTH, new NormalizeMonthFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.MONTH, "01"); + BibEntry entry = new BibEntry().withField(StandardField.MONTH, "01"); 
worker.cleanup(preset, entry); assertEquals(Optional.of("#jan#"), entry.getField(StandardField.MONTH)); @@ -191,8 +184,7 @@ void cleanupMonthChangesNumberToBibtex() { void cleanupPageNumbersConvertsSingleDashToDouble() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.PAGES, new NormalizePagesFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.PAGES, "1-2"); + BibEntry entry = new BibEntry().withField(StandardField.PAGES, "1-2"); worker.cleanup(preset, entry); assertEquals(Optional.of("1--2"), entry.getField(StandardField.PAGES)); @@ -202,8 +194,7 @@ void cleanupPageNumbersConvertsSingleDashToDouble() { void cleanupDatesConvertsToCorrectFormat() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.DATE, new NormalizeDateFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.DATE, "01/1999"); + BibEntry entry = new BibEntry().withField(StandardField.DATE, "01/1999"); worker.cleanup(preset, entry); assertEquals(Optional.of("1999-01"), entry.getField(StandardField.DATE)); @@ -212,8 +203,7 @@ void cleanupDatesConvertsToCorrectFormat() { @Test void cleanupFixFileLinksMovesSingleDescriptionToLink() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.FIX_FILE_LINKS); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.FILE, "link::"); + BibEntry entry = new BibEntry().withField(StandardField.FILE, "link::"); worker.cleanup(preset, entry); assertEquals(Optional.of(":link:"), entry.getField(StandardField.FILE)); @@ -222,13 +212,11 @@ void cleanupFixFileLinksMovesSingleDescriptionToLink() { @Test void cleanupMoveFilesMovesFileFromSubfolder(@TempDir Path bibFolder) throws IOException { CleanupPreferences preset = new 
CleanupPreferences(CleanupPreferences.CleanupStep.MOVE_PDF); - Path path = bibFolder.resolve("AnotherRandomlyNamedFolder"); Files.createDirectory(path); Path tempFile = Files.createFile(path.resolve("test.pdf")); - BibEntry entry = new BibEntry(); LinkedFile fileField = new LinkedFile("", tempFile.toAbsolutePath(), ""); - entry.setField(StandardField.FILE, FileFieldWriter.getStringRepresentation(fileField)); + BibEntry entry = new BibEntry().withField(StandardField.FILE, FileFieldWriter.getStringRepresentation(fileField)); worker.cleanup(preset, entry); LinkedFile newFileField = new LinkedFile("", tempFile.getFileName(), ""); @@ -253,13 +241,12 @@ void cleanupRelativePathsConvertAbsoluteToRelativePath() throws IOException { @Test void cleanupRenamePdfRenamesRelativeFile() throws IOException { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.RENAME_PDF); - Path path = pdfPath.resolve("AnotherRandomlyNamedFile.tmp"); Files.createFile(path); - BibEntry entry = new BibEntry() - .withCitationKey("Toot"); LinkedFile fileField = new LinkedFile("", path.toAbsolutePath(), ""); - entry.setField(StandardField.FILE, FileFieldWriter.getStringRepresentation(fileField)); + BibEntry entry = new BibEntry() + .withCitationKey("Toot") + .withField(StandardField.FILE, FileFieldWriter.getStringRepresentation(fileField)); worker.cleanup(preset, entry); LinkedFile newFileField = new LinkedFile("", Path.of("Toot.tmp"), ""); @@ -270,8 +257,7 @@ void cleanupRenamePdfRenamesRelativeFile() throws IOException { void cleanupHtmlToLatexConvertsEpsilonToLatex() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.TITLE, new HtmlToLatexFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.TITLE, "Ε"); + BibEntry entry = new BibEntry().withField(StandardField.TITLE, "Ε"); worker.cleanup(preset, entry); 
assertEquals(Optional.of("{{$\\Epsilon$}}"), entry.getField(StandardField.TITLE)); @@ -281,8 +267,7 @@ void cleanupHtmlToLatexConvertsEpsilonToLatex() { void cleanupUnitsConvertsOneAmpereToLatex() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.TITLE, new UnitsToLatexFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.TITLE, "1 A"); + BibEntry entry = new BibEntry().withField(StandardField.TITLE, "1 A"); worker.cleanup(preset, entry); assertEquals(Optional.of("1~{A}"), entry.getField(StandardField.TITLE)); @@ -296,8 +281,7 @@ void cleanupCasesAddsBracketAroundAluminiumGalliumArsenid() { assertNotEquals(Collections.emptyList(), protectedTermsLoader.getProtectedTerms()); CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections .singletonList(new FieldFormatterCleanup(StandardField.TITLE, new ProtectTermsFormatter(protectedTermsLoader))))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.TITLE, "AlGaAs"); + BibEntry entry = new BibEntry().withField(StandardField.TITLE, "AlGaAs"); worker.cleanup(preset, entry); assertEquals(Optional.of("{AlGaAs}"), entry.getField(StandardField.TITLE)); @@ -307,8 +291,7 @@ void cleanupCasesAddsBracketAroundAluminiumGalliumArsenid() { void cleanupLatexMergesTwoLatexMathEnvironments() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(true, Collections.singletonList(new FieldFormatterCleanup(StandardField.TITLE, new LatexCleanupFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.TITLE, "$\\alpha$$\\beta$"); + BibEntry entry = new BibEntry().withField(StandardField.TITLE, "$\\alpha$$\\beta$"); worker.cleanup(preset, entry); assertEquals(Optional.of("$\\alpha\\beta$"), entry.getField(StandardField.TITLE)); @@ -317,8 +300,7 @@ void cleanupLatexMergesTwoLatexMathEnvironments() { @Test void 
convertToBiblatexMovesAddressToLocation() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CONVERT_TO_BIBLATEX); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.ADDRESS, "test"); + BibEntry entry = new BibEntry().withField(StandardField.ADDRESS, "test"); worker.cleanup(preset, entry); assertEquals(Optional.empty(), entry.getField(StandardField.ADDRESS)); @@ -328,8 +310,7 @@ void convertToBiblatexMovesAddressToLocation() { @Test void convertToBiblatexMovesJournalToJournalTitle() { CleanupPreferences preset = new CleanupPreferences(CleanupPreferences.CleanupStep.CONVERT_TO_BIBLATEX); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.JOURNAL, "test"); + BibEntry entry = new BibEntry().withField(StandardField.JOURNAL, "test"); worker.cleanup(preset, entry); assertEquals(Optional.empty(), entry.getField(StandardField.JOURNAL)); @@ -340,8 +321,7 @@ void convertToBiblatexMovesJournalToJournalTitle() { void cleanupWithDisabledFieldFormatterChangesNothing() { CleanupPreferences preset = new CleanupPreferences(new FieldFormatterCleanups(false, Collections.singletonList(new FieldFormatterCleanup(StandardField.MONTH, new NormalizeMonthFormatter())))); - BibEntry entry = new BibEntry(); - entry.setField(StandardField.MONTH, "01"); + BibEntry entry = new BibEntry().withField(StandardField.MONTH, "01"); worker.cleanup(preset, entry); assertEquals(Optional.of("01"), entry.getField(StandardField.MONTH)); From eaf317cbfd2562e07ddc76a0f4a11bd281f749d5 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 13 May 2023 00:34:19 +0200 Subject: [PATCH 07/10] Fix package of LatexToUnicodeFormatter --- src/jmh/java/org/jabref/benchmarks/Benchmarks.java | 2 +- src/main/java/org/jabref/gui/groups/GroupNodeViewModel.java | 2 +- src/main/java/org/jabref/logic/bst/BstPreviewLayout.java | 2 +- .../org/jabref/logic/cleanup/FieldFormatterCleanups.java | 2 +- src/main/java/org/jabref/logic/formatter/Formatters.java | 2 +- 
.../bibtexfields}/LatexToUnicodeFormatter.java | 6 ++++-- .../formatter/bibtexfields/UnicodeToLatexFormatter.java | 3 +++ .../org/jabref/logic/importer/fetcher/INSPIREFetcher.java | 2 +- src/main/java/org/jabref/logic/layout/LayoutEntry.java | 2 +- src/main/java/org/jabref/logic/layout/format/XMLChars.java | 1 + src/main/java/org/jabref/model/entry/identifier/DOI.java | 2 +- .../bibtexfields}/LatexToUnicodeFormatterTest.java | 2 +- 12 files changed, 17 insertions(+), 11 deletions(-) rename src/main/java/org/jabref/logic/{layout/format => formatter/bibtexfields}/LatexToUnicodeFormatter.java (89%) rename src/test/java/org/jabref/logic/{layout/format => formatter/bibtexfields}/LatexToUnicodeFormatterTest.java (99%) diff --git a/src/jmh/java/org/jabref/benchmarks/Benchmarks.java b/src/jmh/java/org/jabref/benchmarks/Benchmarks.java index 37a3079492a..e80bdd6697b 100644 --- a/src/jmh/java/org/jabref/benchmarks/Benchmarks.java +++ b/src/jmh/java/org/jabref/benchmarks/Benchmarks.java @@ -15,10 +15,10 @@ import org.jabref.logic.exporter.BibtexDatabaseWriter; import org.jabref.logic.exporter.SaveConfiguration; import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.importer.fileformat.BibtexParser; import org.jabref.logic.layout.format.HTMLChars; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.logic.search.SearchQuery; import org.jabref.logic.util.OS; import org.jabref.model.database.BibDatabase; diff --git a/src/main/java/org/jabref/gui/groups/GroupNodeViewModel.java b/src/main/java/org/jabref/gui/groups/GroupNodeViewModel.java index ff9cf822a82..db6814fab8b 100644 --- a/src/main/java/org/jabref/gui/groups/GroupNodeViewModel.java +++ b/src/main/java/org/jabref/gui/groups/GroupNodeViewModel.java @@ -26,8 +26,8 @@ import org.jabref.gui.util.DefaultTaskExecutor; import 
org.jabref.gui.util.DroppingMouseLocation; import org.jabref.gui.util.TaskExecutor; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.groups.DefaultGroupsFactory; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.model.FieldChange; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; diff --git a/src/main/java/org/jabref/logic/bst/BstPreviewLayout.java b/src/main/java/org/jabref/logic/bst/BstPreviewLayout.java index a002d388e60..d6ec3cc9bcc 100644 --- a/src/main/java/org/jabref/logic/bst/BstPreviewLayout.java +++ b/src/main/java/org/jabref/logic/bst/BstPreviewLayout.java @@ -5,9 +5,9 @@ import java.util.List; import org.jabref.logic.cleanup.ConvertToBibtexCleanup; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter; import org.jabref.logic.l10n.Localization; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.logic.layout.format.RemoveLatexCommandsFormatter; import org.jabref.logic.layout.format.RemoveTilde; import org.jabref.logic.preview.PreviewLayout; diff --git a/src/main/java/org/jabref/logic/cleanup/FieldFormatterCleanups.java b/src/main/java/org/jabref/logic/cleanup/FieldFormatterCleanups.java index 17e88ab6955..3dfda03c5d1 100644 --- a/src/main/java/org/jabref/logic/cleanup/FieldFormatterCleanups.java +++ b/src/main/java/org/jabref/logic/cleanup/FieldFormatterCleanups.java @@ -16,12 +16,12 @@ import org.jabref.logic.formatter.IdentityFormatter; import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter; import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter; import 
org.jabref.logic.formatter.bibtexfields.NormalizePagesFormatter; import org.jabref.logic.formatter.bibtexfields.OrdinalsToSuperscriptFormatter; import org.jabref.logic.formatter.bibtexfields.UnicodeToLatexFormatter; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.logic.layout.format.ReplaceUnicodeLigaturesFormatter; import org.jabref.model.FieldChange; import org.jabref.model.entry.BibEntry; diff --git a/src/main/java/org/jabref/logic/formatter/Formatters.java b/src/main/java/org/jabref/logic/formatter/Formatters.java index cc89cfcfb03..f5e2594c01f 100644 --- a/src/main/java/org/jabref/logic/formatter/Formatters.java +++ b/src/main/java/org/jabref/logic/formatter/Formatters.java @@ -16,6 +16,7 @@ import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter; import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.LatexCleanupFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeDateFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeMonthFormatter; import org.jabref.logic.formatter.bibtexfields.NormalizeNamesFormatter; @@ -34,7 +35,6 @@ import org.jabref.logic.formatter.casechanger.UpperCaseFormatter; import org.jabref.logic.formatter.minifier.MinifyNameListFormatter; import org.jabref.logic.formatter.minifier.TruncateFormatter; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; public class Formatters { private static final Pattern TRUNCATE_PATTERN = Pattern.compile("\\Atruncate\\d+\\z"); diff --git a/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java similarity index 89% rename from src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java rename to 
src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java index 0222dc66f6b..22a551f5ac8 100644 --- a/src/main/java/org/jabref/logic/layout/format/LatexToUnicodeFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java @@ -1,16 +1,18 @@ -package org.jabref.logic.layout.format; +package org.jabref.logic.formatter.bibtexfields; import java.util.Map; import java.util.Objects; +import org.jabref.logic.cleanup.Formatter; import org.jabref.logic.l10n.Localization; import org.jabref.logic.layout.LayoutFormatter; import org.jabref.logic.util.strings.HTMLUnicodeConversionMaps; -import org.jabref.model.cleanup.Formatter; /** * This formatter converts LaTeX character sequences their equivalent unicode characters, * and removes other LaTeX commands without handling them. + * + * The inverse operation is {@link UnicodeToLatexFormatter}. */ public class LatexToUnicodeFormatter extends Formatter implements LayoutFormatter { diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java index 6101e3c8807..f0cf0de7f1c 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java @@ -11,6 +11,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * The inverse operation is {@link LatexToUnicodeFormatter}. 
+ */ public class UnicodeToLatexFormatter extends Formatter implements LayoutFormatter { private static final Logger LOGGER = LoggerFactory.getLogger(UnicodeToLatexFormatter.class); diff --git a/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java b/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java index d15644471ec..d18249596ee 100644 --- a/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java +++ b/src/main/java/org/jabref/logic/importer/fetcher/INSPIREFetcher.java @@ -10,6 +10,7 @@ import org.jabref.logic.cleanup.FieldFormatterCleanup; import org.jabref.logic.formatter.bibtexfields.ClearFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.RemoveBracesFormatter; import org.jabref.logic.help.HelpFile; import org.jabref.logic.importer.EntryBasedFetcher; @@ -21,7 +22,6 @@ import org.jabref.logic.importer.fetcher.transformers.DefaultLuceneQueryTransformer; import org.jabref.logic.importer.fileformat.BibtexParser; import org.jabref.logic.importer.util.MediaTypes; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.logic.net.URLDownload; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.StandardField; diff --git a/src/main/java/org/jabref/logic/layout/LayoutEntry.java b/src/main/java/org/jabref/logic/layout/LayoutEntry.java index 495795417a5..c942d669690 100644 --- a/src/main/java/org/jabref/logic/layout/LayoutEntry.java +++ b/src/main/java/org/jabref/logic/layout/LayoutEntry.java @@ -11,6 +11,7 @@ import java.util.Optional; import org.jabref.logic.formatter.bibtexfields.HtmlToLatexFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.UnicodeToLatexFormatter; import org.jabref.logic.layout.format.AuthorAbbreviator; import org.jabref.logic.layout.format.AuthorAndToSemicolonReplacer; @@ -58,7 +59,6 @@ import 
org.jabref.logic.layout.format.Iso690NamesAuthors; import org.jabref.logic.layout.format.JournalAbbreviator; import org.jabref.logic.layout.format.LastPage; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; import org.jabref.logic.layout.format.MarkdownFormatter; import org.jabref.logic.layout.format.NameFormatter; import org.jabref.logic.layout.format.NoSpaceBetweenAbbreviations; diff --git a/src/main/java/org/jabref/logic/layout/format/XMLChars.java b/src/main/java/org/jabref/logic/layout/format/XMLChars.java index b7c9f9136e0..677ad30d81e 100644 --- a/src/main/java/org/jabref/logic/layout/format/XMLChars.java +++ b/src/main/java/org/jabref/logic/layout/format/XMLChars.java @@ -3,6 +3,7 @@ import java.util.HashMap; import java.util.Map; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.layout.LayoutFormatter; import org.jabref.logic.util.strings.XmlCharsMap; diff --git a/src/main/java/org/jabref/model/entry/identifier/DOI.java b/src/main/java/org/jabref/model/entry/identifier/DOI.java index da3a80217dd..9e471b6ab3e 100644 --- a/src/main/java/org/jabref/model/entry/identifier/DOI.java +++ b/src/main/java/org/jabref/model/entry/identifier/DOI.java @@ -11,7 +11,7 @@ import java.util.regex.Pattern; import org.jabref.architecture.AllowedToUseLogic; -import org.jabref.logic.layout.format.LatexToUnicodeFormatter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; diff --git a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java similarity index 99% rename from src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java rename to src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java index 3b4a0d84082..dae3ead9dda 100644 --- 
a/src/test/java/org/jabref/logic/layout/format/LatexToUnicodeFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java @@ -1,4 +1,4 @@ -package org.jabref.logic.layout.format; +package org.jabref.logic.formatter.bibtexfields; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; From a2a242044118b7a80dc67795270b270bc4ed55c2 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 13 May 2023 00:43:18 +0200 Subject: [PATCH 08/10] Convert to parameterized tests --- .../LatexToUnicodeFormatterTest.java | 233 +++++------------- 1 file changed, 55 insertions(+), 178 deletions(-) diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java index dae3ead9dda..468ebe76fa2 100644 --- a/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java @@ -1,7 +1,8 @@ package org.jabref.logic.formatter.bibtexfields; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -9,187 +10,63 @@ class LatexToUnicodeFormatterTest { final LatexToUnicodeFormatter formatter = new LatexToUnicodeFormatter(); - @Test - void testPlainFormat() { - assertEquals("aaa", formatter.format("aaa")); - } - - @Test - void testFormatUmlaut() { - assertEquals("ä", formatter.format("{\\\"{a}}")); - assertEquals("Ä", formatter.format("{\\\"{A}}")); - } - - @Test - void preserveUnknownCommand() { - assertEquals("\\mbox{-}", formatter.format("\\mbox{-}")); - } - - @Test - void testFormatTextit() { - // See https://github.com/JabRef/jabref/pull/1464 - assertEquals("\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61", 
formatter.format("\\textit{text}")); - } - - @Test - void testEscapedDollarSign() { - assertEquals("$", formatter.format("\\$")); - } - - @Test - void testEquationsSingleSymbol() { - assertEquals("σ", formatter.format("$\\sigma$")); - } - - @Test - void testEquationsMoreComplicatedFormatting() { - assertEquals("A 32 mA ΣΔ-modulator", formatter.format("A 32~{mA} {$\\Sigma\\Delta$}-modulator")); - } - @Test void formatExample() { assertEquals("Mönch", formatter.format(formatter.getExampleInput())); } - @Test - void testChi() { - // See #1464 - assertEquals("χ", formatter.format("$\\chi$")); - } - - @Test - void testSWithCaron() { - // Bug #1264 - assertEquals("Š", formatter.format("{\\v{S}}")); - } - - @Test - void testIWithDiaresis() { - assertEquals("ï", formatter.format("\\\"{i}")); - } - - @Test - void testIWithDiaresisAndEscapedI() { - // this might look strange in the test, but is actually a correct translation and renders identically to the above example in the UI - assertEquals("ı̈", formatter.format("\\\"{\\i}")); - } - - @Test - void testIWithDiaresisAndUnnecessaryBraces() { - assertEquals("ï", formatter.format("{\\\"{i}}")); - } - - @Test - void testUpperCaseIWithDiaresis() { - assertEquals("Ï", formatter.format("\\\"{I}")); - } - - @Test - void testPolishName() { - assertEquals("Łęski", formatter.format("\\L\\k{e}ski")); - } - - @Test - void testDoubleCombiningAccents() { - assertEquals("ώ", formatter.format("$\\acute{\\omega}$")); - } - - @Test - void testCombiningAccentsCase1() { - assertEquals("ḩ", formatter.format("{\\c{h}}")); - } - - @Disabled("This is not a standard LaTeX command. 
It is debatable why we should convert this.") - @Test - void testCombiningAccentsCase2() { - assertEquals("a͍", formatter.format("\\spreadlips{a}")); - } - - @Test - void keepUnknownCommandWithoutArgument() { - assertEquals("\\aaaa", formatter.format("\\aaaa")); - } - - @Test - void keepUnknownCommandWithArgument() { - assertEquals("\\aaaa{bbbb}", formatter.format("\\aaaa{bbbb}")); - } - - @Test - void keepUnknownCommandWithEmptyArgument() { - assertEquals("\\aaaa{}", formatter.format("\\aaaa{}")); - } - - @Test - void testTildeN() { - assertEquals("Montaña", formatter.format("Monta\\~{n}a")); - } - - @Test - void testAcuteNLongVersion() { - assertEquals("Maliński", formatter.format("Mali\\'{n}ski")); - assertEquals("MaliŃski", formatter.format("Mali\\'{N}ski")); - } - - @Test - void testAcuteNShortVersion() { - assertEquals("Maliński", formatter.format("Mali\\'nski")); - assertEquals("MaliŃski", formatter.format("Mali\\'Nski")); - } - - @Test - void testApostrophN() { - assertEquals("Mali'nski", formatter.format("Mali'nski")); - assertEquals("Mali'Nski", formatter.format("Mali'Nski")); - } - - @Test - void testApostrophO() { - assertEquals("L'oscillation", formatter.format("L'oscillation")); - } - - @Test - void testApostrophC() { - assertEquals("O'Connor", formatter.format("O'Connor")); - } - - @Test - void testPreservationOfSingleUnderscore() { - assertEquals("Lorem ipsum_lorem ipsum", formatter.format("Lorem ipsum_lorem ipsum")); - } - - @Test - void testConversionOfUnderscoreWithBraces() { - assertEquals("Lorem ipsum_(lorem ipsum)", formatter.format("Lorem ipsum_{lorem ipsum}")); - } - - @Test - void testConversionOfOrdinal1st() { - assertEquals("1ˢᵗ", formatter.format("1\\textsuperscript{st}")); - } - - @Test - void testConversionOfOrdinal2nd() { - assertEquals("2ⁿᵈ", formatter.format("2\\textsuperscript{nd}")); - } - - @Test - void testConversionOfOrdinal3rd() { - assertEquals("3ʳᵈ", formatter.format("3\\textsuperscript{rd}")); - } - - @Test - void 
testConversionOfOrdinal4th() { - assertEquals("4ᵗʰ", formatter.format("4\\textsuperscript{th}")); - } - - @Test - void testConversionOfOrdinal9th() { - assertEquals("9ᵗʰ", formatter.format("9\\textsuperscript{th}")); - } - - @Test - void testSanskrit() { - assertEquals("Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ", formatter.format("Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}")); + @ParameterizedTest + @CsvSource({ + "aaa, aaa", + "ä, {\\\"{a}}", + "Ä, {\\\"{A}}", + "\\mbox{-}, \\mbox{-}", + // See https://github.com/JabRef/jabref/pull/1464 + "\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61, \\textit{text}", + "$, \\$", + "σ, $\\sigma$", + "A 32 mA ΣΔ-modulator, A 32~{mA} {$\\Sigma\\Delta$}-modulator", + // See #1464 + "χ, $\\chi$", + // Bug #1264 + "Š, {\\v{S}}", + "ï, \\\"{i}", + // this might look strange in the test, but is actually a correct translation and renders identically to the above example in the UI + "ı̈, \\\"{\\i}", + "ï, {\\\"{i}}", + "Ï, \\\"{I}", + "Łęski, \\L\\k{e}ski", + // doubleCombiningAccents + "ώ, $\\acute{\\omega}$", + "ḩ, {\\c{h}}", + // This is not a standard LaTeX command. It is debatable why we should convert this. 
+ // "a͍, \\spreadlips{a}", + // unknown command + "\\aaaa, \\aaaa", + // unknown command + "\\aaaa{bbbb}, \\aaaa{bbbb}", + // unknown command + "\\aaaa{}, \\aaaa{}", + "Montaña, Monta\\~{n}a", + "Maliński, Mali\\'{n}ski", + "MaliŃski, Mali\\'{N}ski", + "Maliński, Mali\\'nski", + "MaliŃski, Mali\\'Nski", + "Mali'nski, Mali'nski", + "Mali'Nski, Mali'Nski", + "L'oscillation, L'oscillation", + "O'Connor, O'Connor", + "Lorem ipsum_lorem ipsum, Lorem ipsum_lorem ipsum", + "Lorem ipsum_(lorem ipsum), Lorem ipsum_{lorem ipsum}", + "1ˢᵗ, 1\\textsuperscript{st}", + "2ⁿᵈ, 2\\textsuperscript{nd}", + "3ʳᵈ, 3\\textsuperscript{rd}", + "4ᵗʰ, 4\\textsuperscript{th}", + "9ᵗʰ, 9\\textsuperscript{th}", + // Sanskrit + "Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ, Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}" + }) + void test(String expected, String input) { + assertEquals(expected, formatter.format(input)); } } From 97dfa3cf05cd213e59d51300de453c6818029d70 Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 13 May 2023 01:11:55 +0200 Subject: [PATCH 09/10] Compilefix (LatexToUnicodeAdapter -> LatexToUnicodeFormatter) --- .../citationkeypattern/BracketedPattern.java | 25 ++++++++----------- .../citationstyle/JabRefItemDataProvider.java | 6 +++-- .../bibtexfields/UnicodeToLatexFormatter.java | 9 ++++--- .../java/org/jabref/model/entry/Author.java | 14 ++++++----- .../jabref/model/groups/LastNameGroup.java | 5 ++-- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java index 13fda45cedf..779f2587b3c 100644 --- a/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java +++ b/src/main/java/org/jabref/logic/citationkeypattern/BracketedPattern.java @@ -1,7 +1,6 @@ package org.jabref.logic.citationkeypattern; import 
java.math.BigInteger; -import java.text.Normalizer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -21,6 +20,7 @@ import org.jabref.logic.cleanup.Formatter; import org.jabref.logic.formatter.Formatters; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.casechanger.Word; import org.jabref.logic.layout.format.RemoveLatexCommandsFormatter; import org.jabref.model.database.BibDatabase; @@ -32,7 +32,6 @@ import org.jabref.model.entry.field.FieldFactory; import org.jabref.model.entry.field.InternalField; import org.jabref.model.entry.field.StandardField; -import org.jabref.model.strings.LatexToUnicodeAdapter; import org.jabref.model.strings.StringUtil; import org.slf4j.Logger; @@ -84,6 +83,8 @@ public class BracketedPattern { private static final Pattern WHITESPACE = Pattern.compile("\\p{javaWhitespace}"); + private static final LatexToUnicodeFormatter LATEX_TO_UNICODE_FORMATTER = new LatexToUnicodeFormatter(); + private enum Institution { SCHOOL, DEPARTMENT, @@ -513,7 +514,7 @@ public static String getFieldValue(BibEntry entry, String pattern, Character key } /** - * Parses the provided string to an {@link AuthorList}, which are then formatted by {@link LatexToUnicodeAdapter}. + * Parses the provided string to an {@link AuthorList}, which are then formatted by {@link LatexToUnicodeFormatter}. * Afterward, any institutions are formatted into an institution key. * * @param unparsedAuthors a string representation of authors or editors @@ -526,14 +527,14 @@ private static AuthorList createAuthorList(String unparsedAuthors) { String lastName = author.getLast() .map(lastPart -> isInstitution(author) ? 
generateInstitutionKey(lastPart) : - LatexToUnicodeAdapter.format(lastPart)) + LATEX_TO_UNICODE_FORMATTER.format(lastPart)) .orElse(null); return new Author( - author.getFirst().map(LatexToUnicodeAdapter::format).orElse(null), - author.getFirstAbbr().map(LatexToUnicodeAdapter::format).orElse(null), - author.getVon().map(LatexToUnicodeAdapter::format).orElse(null), + author.getFirst().map(LATEX_TO_UNICODE_FORMATTER::format).orElse(null), + author.getFirstAbbr().map(LATEX_TO_UNICODE_FORMATTER::format).orElse(null), + author.getVon().map(LATEX_TO_UNICODE_FORMATTER::format).orElse(null), lastName, - author.getJr().map(LatexToUnicodeAdapter::format).orElse(null)); + author.getJr().map(LATEX_TO_UNICODE_FORMATTER::format).orElse(null)); }) .collect(AuthorList.collect()); } @@ -1195,14 +1196,10 @@ private static String generateInstitutionKey(String content) { Matcher matcher = INLINE_ABBREVIATION.matcher(content); if (matcher.find()) { - return LatexToUnicodeAdapter.format(matcher.group()); + return LATEX_TO_UNICODE_FORMATTER.format(matcher.group()); } - Optional unicodeFormattedName = LatexToUnicodeAdapter.parse(content); - if (unicodeFormattedName.isEmpty()) { - LOGGER.warn("{} could not be converted to unicode. 
This can result in an incorrect or missing institute citation key", content); - } - String result = unicodeFormattedName.orElse(Normalizer.normalize(content, Normalizer.Form.NFC)); + String result = LATEX_TO_UNICODE_FORMATTER.format(content); // Special characters can't be allowed past this point because the citation key generator might replace them with multiple mixed-case characters result = StringUtil.replaceSpecialCharacters(result); diff --git a/src/main/java/org/jabref/logic/citationstyle/JabRefItemDataProvider.java b/src/main/java/org/jabref/logic/citationstyle/JabRefItemDataProvider.java index b15c0e4534a..d760be3bec7 100644 --- a/src/main/java/org/jabref/logic/citationstyle/JabRefItemDataProvider.java +++ b/src/main/java/org/jabref/logic/citationstyle/JabRefItemDataProvider.java @@ -7,6 +7,7 @@ import java.util.Optional; import java.util.Set; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.logic.formatter.bibtexfields.RemoveNewlinesFormatter; import org.jabref.logic.integrity.PagesChecker; import org.jabref.model.database.BibDatabaseContext; @@ -18,7 +19,6 @@ import org.jabref.model.entry.field.Field; import org.jabref.model.entry.field.StandardField; import org.jabref.model.entry.types.StandardEntryType; -import org.jabref.model.strings.LatexToUnicodeAdapter; import de.undercouch.citeproc.ItemDataProvider; import de.undercouch.citeproc.bibtex.BibTeXConverter; @@ -44,6 +44,8 @@ public class JabRefItemDataProvider implements ItemDataProvider { private BibEntryTypesManager entryTypesManager; private PagesChecker pagesChecker; + private LatexToUnicodeFormatter latexToUnicodeFormatter = new LatexToUnicodeFormatter(); + public JabRefItemDataProvider() { stringJsonBuilderFactory = new StringJsonBuilderFactory(); } @@ -152,7 +154,7 @@ private CSLItemData bibEntryToCSLItemData(BibEntry originalBibEntry, BibDatabase for (Field key : fields) { bibEntry.getResolvedFieldOrAlias(key, bibDatabaseContext.getDatabase()) 
.map(removeNewlinesFormatter::format) - .map(LatexToUnicodeAdapter::format) + .map(latexToUnicodeFormatter::format) .ifPresent(value -> { if (StandardField.MONTH == key) { // Change month from #mon# to mon because CSL does not support the former format diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java index f0cf0de7f1c..bab53cc841c 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java @@ -1,5 +1,6 @@ package org.jabref.logic.formatter.bibtexfields; +import java.text.Normalizer; import java.util.Map; import java.util.Objects; @@ -20,11 +21,13 @@ public class UnicodeToLatexFormatter extends Formatter implements LayoutFormatte @Override public String format(String text) { - String result = Objects.requireNonNull(text); - if (result.isEmpty()) { - return result; + if (Objects.requireNonNull(text).isEmpty()) { + return text; } + // normalize the unicode characters to cover more cases + String result = Normalizer.normalize(text, Normalizer.Form.NFC); + // Standard symbols for (Map.Entry unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP .entrySet()) { diff --git a/src/main/java/org/jabref/model/entry/Author.java b/src/main/java/org/jabref/model/entry/Author.java index 2fa63335acf..21251f25907 100644 --- a/src/main/java/org/jabref/model/entry/Author.java +++ b/src/main/java/org/jabref/model/entry/Author.java @@ -3,7 +3,7 @@ import java.util.Objects; import java.util.Optional; -import org.jabref.model.strings.LatexToUnicodeAdapter; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.model.strings.StringUtil; /** @@ -27,6 +27,8 @@ public class Author { private final String jrPart; private Author latexFreeAuthor; + private LatexToUnicodeFormatter 
latexToUnicodeFormatter = new LatexToUnicodeFormatter(); + /** * Creates the Author object. If any part of the name is absent, null must be passed; otherwise other methods may return erroneous results. * @@ -370,11 +372,11 @@ public String getNameForAlphabetization() { */ public Author latexFree() { if (latexFreeAuthor == null) { - String first = getFirst().map(LatexToUnicodeAdapter::format).orElse(null); - String firstabbr = getFirstAbbr().map(LatexToUnicodeAdapter::format).orElse(null); - String von = getVon().map(LatexToUnicodeAdapter::format).orElse(null); - String last = getLast().map(LatexToUnicodeAdapter::format).orElse(null); - String jr = getJr().map(LatexToUnicodeAdapter::format).orElse(null); + String first = getFirst().map(latexToUnicodeFormatter::format).orElse(null); + String firstabbr = getFirstAbbr().map(latexToUnicodeFormatter::format).orElse(null); + String von = getVon().map(latexToUnicodeFormatter::format).orElse(null); + String last = getLast().map(latexToUnicodeFormatter::format).orElse(null); + String jr = getJr().map(latexToUnicodeFormatter::format).orElse(null); latexFreeAuthor = new Author(first, firstabbr, von, last, jr); latexFreeAuthor.latexFreeAuthor = latexFreeAuthor; } diff --git a/src/main/java/org/jabref/model/groups/LastNameGroup.java b/src/main/java/org/jabref/model/groups/LastNameGroup.java index 8fa6fff938c..a12a84dbb92 100644 --- a/src/main/java/org/jabref/model/groups/LastNameGroup.java +++ b/src/main/java/org/jabref/model/groups/LastNameGroup.java @@ -5,18 +5,19 @@ import java.util.Optional; import java.util.stream.Collectors; +import org.jabref.logic.formatter.bibtexfields.LatexToUnicodeFormatter; import org.jabref.model.entry.Author; import org.jabref.model.entry.AuthorList; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.field.Field; -import org.jabref.model.strings.LatexToUnicodeAdapter; /** * Matches based on a latex free last name in a specified field. 
The field is parsed as an author list and the last names are resolved of latex. */ public class LastNameGroup extends KeywordGroup { + public LastNameGroup(String groupName, GroupHierarchyType context, Field searchField, String lastName) { - super(groupName, context, searchField, LatexToUnicodeAdapter.format(lastName), true); + super(groupName, context, searchField, new LatexToUnicodeFormatter().format(lastName), true); } static List getAsLastNamesLatexFree(Field field, BibEntry bibEntry) { From aa192215b94026003e2d5e311477df549364331b Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 13 May 2023 02:15:05 +0200 Subject: [PATCH 10/10] Restrict to "Basic Multilingual Plane" (BMP) only (and simplify some LaTeX commands) --- .../bibtexfields/LatexToUnicodeFormatter.java | 5 +- .../bibtexfields/UnicodeToLatexFormatter.java | 21 +++- .../openoffice/style/OOPreFormatter.java | 15 ++- .../strings/HTMLUnicodeConversionMaps.java | 106 ++++++++++-------- .../LatexToUnicodeFormatterTest.java | 3 + .../UnicodeToLatexFormatterTest.java | 74 ++++++++++-- 6 files changed, 151 insertions(+), 73 deletions(-) diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java index 22a551f5ac8..ccd95b4e97a 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatter.java @@ -29,15 +29,14 @@ public String getKey() { @Override public String format(String text) { String result = Objects.requireNonNull(text); - if (result.isEmpty()) { return result; } // Standard symbols - for (Map.Entry unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP + for (Map.Entry unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP .entrySet()) { - result = result.replace(unicodeLatexPair.getValue(), unicodeLatexPair.getKey()); + result = 
result.replace(unicodeLatexPair.getValue(), unicodeLatexPair.getKey().toString()); } return result; diff --git a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java index bab53cc841c..a29bff00968 100644 --- a/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java +++ b/src/main/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatter.java @@ -1,7 +1,6 @@ package org.jabref.logic.formatter.bibtexfields; import java.text.Normalizer; -import java.util.Map; import java.util.Objects; import org.jabref.logic.cleanup.Formatter; @@ -28,10 +27,20 @@ public String format(String text) { // normalize the unicode characters to cover more cases String result = Normalizer.normalize(text, Normalizer.Form.NFC); - // Standard symbols - for (Map.Entry unicodeLatexPair : HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP - .entrySet()) { - result = result.replace(unicodeLatexPair.getKey(), unicodeLatexPair.getValue()); + // Convert single Unicode characters to LaTeX commands + boolean changed = false; + StringBuilder stringBuilder = new StringBuilder(); + for (char c : result.toCharArray()) { // iterate the NFC-normalized text so the normalization above is not discarded + String lookup = HTMLUnicodeConversionMaps.UNICODE_LATEX_CONVERSION_MAP.get(c); + if (lookup == null) { + stringBuilder.append(c); + } else { + stringBuilder.append(lookup); + changed = true; + } + } + if (changed) { + result = stringBuilder.toString(); } // Combining accents @@ -65,7 +74,7 @@ public String format(String text) { for (int i = 0; i <= (result.length() - 1); i++) { int cp = result.codePointAt(i); if (cp >= 129) { - LOGGER.warn("Unicode character not converted: " + cp); + LOGGER.warn("Unicode character not converted: {}", cp); } } return result; diff --git a/src/main/java/org/jabref/logic/openoffice/style/OOPreFormatter.java b/src/main/java/org/jabref/logic/openoffice/style/OOPreFormatter.java index caeaf8cce2a..733cab3e8c0 
100644 --- a/src/main/java/org/jabref/logic/openoffice/style/OOPreFormatter.java +++ b/src/main/java/org/jabref/logic/openoffice/style/OOPreFormatter.java @@ -12,7 +12,7 @@ */ public class OOPreFormatter implements LayoutFormatter { - private static final Map CHARS = HTMLUnicodeConversionMaps.LATEX_UNICODE_CONVERSION_MAP; + private static final Map CHARS = HTMLUnicodeConversionMaps.LATEX_UNICODE_CONVERSION_MAP; @Override public String format(String field) { @@ -37,7 +37,7 @@ public String format(String field) { if (incommand) { /* Close Command */ String command = currentCommand.toString(); - String result = OOPreFormatter.CHARS.get(command); + Character result = OOPreFormatter.CHARS.get(command); sb.append(Objects.requireNonNullElse(result, command)); } escaped = true; @@ -73,8 +73,7 @@ public String format(String field) { } else { combody = finalResult.substring(i, i + 1); } - String result = OOPreFormatter.CHARS.get(command + combody); - + Character result = OOPreFormatter.CHARS.get(command + combody); if (result != null) { sb.append(result); } @@ -85,7 +84,7 @@ public String format(String field) { // Are we already at the end of the string? if ((i + 1) == finalResult.length()) { String command = currentCommand.toString(); - String result = OOPreFormatter.CHARS.get(command); + Character result = OOPreFormatter.CHARS.get(command); // If found, then use translated version. If not, then keep the text of the parameter intact. sb.append(Objects.requireNonNullElse(result, command)); } @@ -110,17 +109,17 @@ public String format(String field) { i += part.length(); argument = part; // handle common case of general latex command - String result = OOPreFormatter.CHARS.get(command + argument); + Character result = OOPreFormatter.CHARS.get(command + argument); // If found, then use translated version. If not, then keep the text of the parameter intact. sb.append(Objects.requireNonNullElse(result, argument)); } else if (c == '}') { // This end brace terminates a command. 
This can be the case in constructs like {\aa}. The // correct behaviour should be to substitute the evaluated command and swallow the brace: - String result = OOPreFormatter.CHARS.get(command); + Character result = OOPreFormatter.CHARS.get(command); // If the command is unknown, just print it: sb.append(Objects.requireNonNullElse(result, command)); } else { - String result = OOPreFormatter.CHARS.get(command); + Character result = OOPreFormatter.CHARS.get(command); sb.append(Objects.requireNonNullElse(result, command)); sb.append(' '); } diff --git a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java index de477470a02..8f93b4834fc 100644 --- a/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java +++ b/src/main/java/org/jabref/logic/util/strings/HTMLUnicodeConversionMaps.java @@ -3,8 +3,15 @@ import java.util.HashMap; import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + public class HTMLUnicodeConversionMaps { + private static final Logger LOGGER = LoggerFactory.getLogger(HTMLUnicodeConversionMaps.class); + + // We support "Basic Multilingual Plane" (BMP) only - due to speed reasons + // most of the LaTeX commands can be read at http://en.wikibooks.org/wiki/LaTeX/Accents // The symbols can be seen at http://www.fileformat.info/info/unicode/char/a4/index.htm. 
Replace "a4" with the U+ number // http://detexify.kirelabs.org/classify.html and http://www.ctan.org/tex-archive/info/symbols/comprehensive/ might help to find the right LaTeX command @@ -13,9 +20,9 @@ public class HTMLUnicodeConversionMaps { public static final Map ESCAPED_ACCENTS = new HashMap<>(); public static final Map UNICODE_ESCAPED_ACCENTS = new HashMap<>(); public static final Map NUMERICAL_LATEX_CONVERSION_MAP = new HashMap<>(); - public static final Map UNICODE_LATEX_CONVERSION_MAP = new HashMap<>(); + public static final Map UNICODE_LATEX_CONVERSION_MAP = new HashMap(); public static final Map LATEX_HTML_CONVERSION_MAP = new HashMap<>(); - public static final Map LATEX_UNICODE_CONVERSION_MAP = new HashMap<>(); + public static final Map LATEX_UNICODE_CONVERSION_MAP = new HashMap(); /* Portions © International Organization for Standardization 1986: Permission to copy in any form is granted for use with @@ -41,7 +48,7 @@ public class HTMLUnicodeConversionMaps { * * Mappings from unicode to latex, unicode to HTML, HTML to unicode, ... are generated based on these entries. 
* - * Helper scripts to gernerate entries: + * Helper scripts to generate entries: * * - copy table from https://www.utf8-chartable.de/unicode-utf8-table.pl to input.txt * - grep "DOT BELOW" input.txt > input-dot-below.txt @@ -110,31 +117,31 @@ public class HTMLUnicodeConversionMaps { // U+00C2 ISOlat1 {"195", "Atilde", "{{\\~{A}}}"}, // latin capital letter A with tilde, // U+00C3 ISOlat1 - {"196", "Auml", "{{\\\"{A}}}"}, // latin capital letter A with diaeresis, + {"196", "Auml", "\\\"{A}"}, // latin capital letter A with diaeresis, "Ä" // U+00C4 ISOlat1 - {"197", "Aring", "{{\\AA}}"}, // latin capital letter A with ring above + {"197", "Aring", "{\\AA}"}, // latin capital letter A with ring above // = latin capital letter A ring, // U+00C5 ISOlat1 - {"198", "AElig", "{{\\AE}}"}, // latin capital letter AE + {"198", "AElig", "{\\AE}"}, // latin capital letter AE // = latin capital ligature AE, // U+00C6 ISOlat1 - {"199", "Ccedil", "{{\\c{C}}}"}, // latin capital letter C with cedilla, + {"199", "Ccedil", "\\c{C}"}, // latin capital letter C with cedilla, // U+00C7 ISOlat1 - {"200", "Egrave", "{{\\`{E}}}"}, // latin capital letter E with grave, + {"200", "Egrave", "\\`{E}"}, // latin capital letter E with grave, // U+00C8 ISOlat1 - {"201", "Eacute", "{{\\'{E}}}"}, // latin capital letter E with acute, + {"201", "Eacute", "\\'{E}"}, // latin capital letter E with acute, // U+00C9 ISOlat1 - {"202", "Ecirc", "{{\\^{E}}}"}, // latin capital letter E with circumflex, + {"202", "Ecirc", "\\^{E}"}, // latin capital letter E with circumflex, // U+00CA ISOlat1 - {"203", "Euml", "{{\\\"{E}}}"}, // latin capital letter E with diaeresis, + {"203", "Euml", "\\\"{E}"}, // latin capital letter E with diaeresis, // U+00CB ISOlat1 - {"204", "Igrave", "{{\\`{I}}}"}, // latin capital letter I with grave, + {"204", "Igrave", "\\`{I}"}, // latin capital letter I with grave, // U+00CC ISOlat1 - {"205", "Iacute", "{{\\'{I}}}"}, // latin capital letter I with acute, + {"205", "Iacute", 
"\\'{I}"}, // latin capital letter I with acute, // U+00CD ISOlat1 - {"206", "Icirc", "{{\\^{I}}}"}, // latin capital letter I with circumflex, + {"206", "Icirc", "\\^{I}"}, // latin capital letter I with circumflex, // U+00CE ISOlat1 - {"207", "Iuml", "{{\\\"{I}}}"}, // latin capital letter I with diaeresis, + {"207", "Iuml", "\\\"{I}"}, // latin capital letter I with diaeresis, // U+00CF ISOlat1 {"208", "ETH", "{{\\DH}}"}, // latin capital letter ETH, U+00D0 ISOlat1 {"209", "Ntilde", "{{\\~{N}}}"}, // latin capital letter N with tilde, @@ -153,53 +160,53 @@ public class HTMLUnicodeConversionMaps { {"216", "Oslash", "{{\\O}}"}, // latin capital letter O with stroke // = latin capital letter O slash, // U+00D8 ISOlat1 - {"217", "Ugrave", "{{\\`{U}}}"}, // latin capital letter U with grave, + {"217", "Ugrave", "\\`{U}"}, // latin capital letter U with grave, // U+00D9 ISOlat1 - {"218", "Uacute", "{{\\'{U}}}"}, // latin capital letter U with acute, + {"218", "Uacute", "\\'{U}"}, // latin capital letter U with acute, // U+00DA ISOlat1 - {"219", "Ucirc", "{{\\^{U}}}"}, // latin capital letter U with circumflex, + {"219", "Ucirc", "\\^{U}"}, // latin capital letter U with circumflex, // U+00DB ISOlat1 - {"220", "Uuml", "{{\\\"{U}}}"}, // latin capital letter U with diaeresis, + {"220", "Uuml", "\\\"{U}"}, // latin capital letter U with diaeresis, // U+00DC ISOlat1 - {"221", "Yacute", "{{\\'{Y}}}"}, // latin capital letter Y with acute, + {"221", "Yacute", "\\'{Y}"}, // latin capital letter Y with acute, // U+00DD ISOlat1 - {"222", "THORN", "{{\\TH}}"}, // latin capital letter THORN, + {"222", "THORN", "{\\TH}"}, // latin capital letter THORN, // U+00DE ISOlat1 {"223", "szlig", "{\\ss}"}, // latin small letter sharp s = ess-zed, // U+00DF ISOlat1 - {"224", "agrave", "{\\`{a}}"}, // latin small letter a with grave + {"224", "agrave", "\\`{a}"}, // latin small letter a with grave // = latin small letter a grave, // U+00E0 ISOlat1 - {"225", "aacute", "{\\'{a}}"}, // latin 
small letter a with acute, + {"225", "aacute", "\\'{a}"}, // latin small letter a with acute, // U+00E1 ISOlat1 - {"226", "acirc", "{\\^{a}}"}, // latin small letter a with circumflex, + {"226", "acirc", "\\^{a}"}, // latin small letter a with circumflex, // U+00E2 ISOlat1 - {"227", "atilde", "{\\~{a}}"}, // latin small letter a with tilde, + {"227", "atilde", "\\~{a}"}, // latin small letter a with tilde, // U+00E3 ISOlat1 - {"228", "auml", "{\\\"{a}}"}, // latin small letter a with diaeresis, + {"228", "auml", "\\\"{a}"}, // latin small letter a with diaeresis, "ä" // U+00E4 ISOlat1 - {"229", "aring", "{{\\aa}}"}, // latin small letter a with ring above + {"229", "aring", "{\\aa}"}, // latin small letter a with ring above // = latin small letter a ring, // U+00E5 ISOlat1 {"230", "aelig", "{\\ae}"}, // latin small letter ae // = latin small ligature ae, U+00E6 ISOlat1 - {"231", "ccedil", "{\\c{c}}"}, // latin small letter c with cedilla, + {"231", "ccedil", "\\c{c}"}, // latin small letter c with cedilla, // U+00E7 ISOlat1 - {"232", "egrave", "{\\`{e}}"}, // latin small letter e with grave, + {"232", "egrave", "\\`{e}"}, // latin small letter e with grave, // U+00E8 ISOlat1 - {"233", "eacute", "{\\'{e}}"}, // latin small letter e with acute, + {"233", "eacute", "\\'{e}"}, // latin small letter e with acute, // U+00E9 ISOlat1 - {"234", "ecirc", "{\\^{e}}"}, // latin small letter e with circumflex, + {"234", "ecirc", "\\^{e}"}, // latin small letter e with circumflex, // U+00EA ISOlat1 - {"235", "euml", "{\\\"{e}}"}, // latin small letter e with diaeresis, + {"235", "euml", "\\\"{e}"}, // latin small letter e with diaeresis, // U+00EB ISOlat1 - {"236", "igrave", "{\\`{i}}"}, // latin small letter i with grave, + {"236", "igrave", "\\`{i}"}, // latin small letter i with grave, // U+00EC ISOlat1 - {"237", "iacute", "{\\'{i}}"}, // latin small letter i with acute, + {"237", "iacute", "\\'{i}"}, // latin small letter i with acute, // U+00ED ISOlat1 - {"238", "icirc", 
"{\\^{i}}"}, // latin small letter i with circumflex, + {"238", "icirc", "\\^{i}"}, // latin small letter i with circumflex, // U+00EE ISOlat1 - {"239", "iuml", "{\\\"{i}}"}, // latin small letter i with diaeresis, + {"239", "iuml", "\\\"{i}"}, // latin small letter i with diaeresis, // U+00EF ISOlat1 {"240", "eth", "{\\dh}"}, // latin small letter eth, U+00F0 ISOlat1 {"241", "ntilde", "{\\~{n}}"}, // latin small letter n with tilde, @@ -657,10 +664,10 @@ public class HTMLUnicodeConversionMaps { {"318", "", "{\\v{l}}"}, // small l with caron // {"319", "Lmidot", "{\\Lmidot}"}, // upper case L with mid dot // {"320", "lmidot", "{\\lmidot}"}, // lower case l with mid dot - {"321", "Lstrok", "{{\\L}}"}, // upper case L with stroke - {"322", "lstrok", "{{\\l}}"}, // lower case l with stroke - {"323", "Nacute", "{{\\'{N}}}"}, // upper case N with acute - {"324", "nacute", "{{\\'{n}}}"}, // lower case n with acute + {"321", "Lstrok", "{\\L}"}, // upper case L with stroke + {"322", "lstrok", "{\\l}"}, // lower case l with stroke + {"323", "Nacute", "\\'{N}"}, // upper case N with acute + {"324", "nacute", "\\'{n}"}, // lower case n with acute {"325", "", "{{\\c{N}}}"}, // capital N with cedilla {"326", "", "{\\c{n}}"}, // small n with cedilla {"327", "", "{{\\v{N}}}"}, // capital N with caron @@ -688,8 +695,8 @@ public class HTMLUnicodeConversionMaps { {"349", "scirc", "{\\^{s}}"}, // lower case s with circumflex {"350", "Scedil", "{{\\c{S}}}"}, // upper case S with cedilla {"351", "scedil", "{\\c{s}}"}, // lower case s with cedilla - {"352", "Scaron", "{{\\v{S}}}"}, // latin capital letter S with caron, - {"353", "scaron", "{\\v{s}}"}, // latin small letter s with caron, + {"352", "Scaron", "\\v{S}"}, // latin capital letter S with caron, + {"353", "scaron", "\\v{s}"}, // latin small letter s with caron, {"354", "", "{{\\c{T}}}"}, // upper case T with cedilla {"355", "", "{{\\c{T}}}"}, // lower case t with cedilla {"356", "", "{{\\v{T}}}"}, // latin capital letter T with 
caron, @@ -958,7 +965,12 @@ public class HTMLUnicodeConversionMaps { if (!(aConversionList[0].isEmpty())) { NUMERICAL_LATEX_CONVERSION_MAP.put(Integer.decode(aConversionList[0]), aConversionList[2]); if (Integer.decode(aConversionList[0]) > 128) { - String unicodeSymbol = String.valueOf(Character.toChars(Integer.decode(aConversionList[0]))); + // with [0] we assume BMP + char[] chars = Character.toChars(Integer.decode(aConversionList[0])); + if (chars.length > 1) { + LOGGER.error("Non-BMP unicode range"); + } + Character unicodeSymbol = chars[0]; UNICODE_LATEX_CONVERSION_MAP.put(unicodeSymbol, aConversionList[2]); if (!strippedLaTeX.isEmpty()) { LATEX_UNICODE_CONVERSION_MAP.put(strippedLaTeX, unicodeSymbol); @@ -974,19 +986,19 @@ public class HTMLUnicodeConversionMaps { } // Manually added values which are killed by cleanLaTeX LATEX_HTML_CONVERSION_MAP.put("$", "$"); - LATEX_UNICODE_CONVERSION_MAP.put("$", "$"); + LATEX_UNICODE_CONVERSION_MAP.put("$", '$'); // Manual corrections LATEX_HTML_CONVERSION_MAP.put("AA", "Å"); // Overwritten by Å which is less supported - LATEX_UNICODE_CONVERSION_MAP.put("AA", "Å"); // Overwritten by Ångstrom symbol + LATEX_UNICODE_CONVERSION_MAP.put("AA", 'Å'); // Overwritten by Ångstrom symbol // Manual additions // Support relax to the extent that it is simply removed LATEX_HTML_CONVERSION_MAP.put("relax", ""); - LATEX_UNICODE_CONVERSION_MAP.put("relax", ""); + // LATEX_UNICODE_CONVERSION_MAP.put("relax", ""); // Support a special version of apostrophe LATEX_HTML_CONVERSION_MAP.put("textquotesingle", "'"); - LATEX_UNICODE_CONVERSION_MAP.put("textquotesingle", "'"); // apostrophe, U+00027 + LATEX_UNICODE_CONVERSION_MAP.put("textquotesingle", '\''); // apostrophe, U+00027 } private HTMLUnicodeConversionMaps() { diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java index 468ebe76fa2..d8b5ec41c6d 100644 --- 
a/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/LatexToUnicodeFormatterTest.java @@ -15,6 +15,9 @@ void formatExample() { assertEquals("Mönch", formatter.format(formatter.getExampleInput())); } + /** + * In case of an update of the test data, also update {@link UnicodeToLatexFormatterTest#test(String, String)} + */ @ParameterizedTest @CsvSource({ "aaa, aaa", diff --git a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java index ffee237eced..17dc04b2e03 100644 --- a/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java +++ b/src/test/java/org/jabref/logic/formatter/bibtexfields/UnicodeToLatexFormatterTest.java @@ -2,23 +2,18 @@ import java.util.stream.Stream; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.CsvSource; import org.junit.jupiter.params.provider.MethodSource; import static org.junit.jupiter.api.Assertions.assertEquals; class UnicodeToLatexFormatterTest { - private UnicodeToLatexFormatter formatter; + private UnicodeToLatexFormatter formatter = new UnicodeToLatexFormatter(); - @BeforeEach - void setUp() { - formatter = new UnicodeToLatexFormatter(); - } - - private static Stream testCases() { + private static Stream testFormat() { return Stream.of( Arguments.of("", ""), // empty string input Arguments.of("abc", "abc"), // non unicode input @@ -31,8 +26,69 @@ private static Stream testCases() { } @ParameterizedTest() - @MethodSource("testCases") + @MethodSource void testFormat(String expectedResult, String input) { assertEquals(expectedResult, formatter.format(input)); } + + /** + * Similar data as in {@link LatexToUnicodeFormatterTest#test(String, String)}. 
+ * "Duplicate" entries are removed; there is one LaTeX presentation, not multiple for a given Unicode text. + */ + @ParameterizedTest + @CsvSource({ + "aaa, aaa", + "ä, \\\"{a}", + "Ä, \\\"{A}", + "\\mbox{-}, \\mbox{-}", + // See https://github.com/JabRef/jabref/pull/1464 + "\uD835\uDC61\uD835\uDC52\uD835\uDC65\uD835\uDC61, \\textit{text}", + // "$, \\$", + "σ, $\\sigma$", + "A 32 mA ΣΔ-modulator, A 32~{mA} {$\\Sigma\\Delta$}-modulator", + // See #1464 + "χ, $\\chi$", + // Bug #1264 + "Š, \\v{S}", + "ï, \\\"{i}", + // this might look strange in the test, but is actually a correct translation and renders identically to the above example in the UI + // this is with diatrics + "ı̈, \\\"{\\i}", + // this is the letter as is + "ï, \\\"{i}", + "Ï, \\\"{I}", + "Łęski, \\L\\k{e}ski", + // doubleCombiningAccents + "ώ, $\\acute{\\omega}$", + "ḩ, \\c{h}", + // This is not a standard LaTeX command. It is debatable why we should convert this. + // "a͍, \\spreadlips{a}", + // unknown command + "\\aaaa, \\aaaa", + // unknown command + "\\aaaa{bbbb}, \\aaaa{bbbb}", + // unknown command + "\\aaaa{}, \\aaaa{}", + "Montaña, Monta\\~{n}a", + "Maliński, Mali\\'{n}ski", + "MaliŃski, Mali\\'{N}ski", + "Maliński, Mali\\'nski", + "MaliŃski, Mali\\'Nski", + "Mali'nski, Mali'nski", + "Mali'Nski, Mali'Nski", + "L'oscillation, L'oscillation", + "O'Connor, O'Connor", + "Lorem ipsum_lorem ipsum, Lorem ipsum_lorem ipsum", + "Lorem ipsum_{lorem ipsum}, Lorem ipsum_{lorem ipsum}", + "1ˢᵗ, 1\\textsuperscript{st}", + "2ⁿᵈ, 2\\textsuperscript{nd}", + "3ʳᵈ, 3\\textsuperscript{rd}", + "4ᵗʰ, 4\\textsuperscript{th}", + "9ᵗʰ, 9\\textsuperscript{th}", + // Sanskrit + "Puṇya-pattana-vidyā-pı̄ṭhādhi-kṛtaiḥ prā-kaśyaṃ nı̄taḥ, Pu\\d{n}ya-pattana-vidy{\\={a}}-p{\\i{\\={}}}\\d{t}h{\\={a}}dhi-k\\d{r}tai\\d{h} pr{\\={a}}-ka{{\\'{s}}}ya\\d{m} n{\\i{\\={}}}ta\\d{h}" + }) + void test(String input, String expected) { + assertEquals(expected, formatter.format(input)); + } }