google · tushuhei · Mar 6, 2023 · Mar 1, 2023 · Mar 3, 2023 · Mar 3, 2023
diff --git a/.github/workflows/java-unittest.yml b/.github/workflows/java-unittest.yml
@@ -0,0 +1,35 @@
+name: Unittest for Java
+# Runs the Maven build of the Java module on every supported OS.
+on:
+  push:
+    paths:
+      - 'java/**'
+      # java/pom.xml copies these files from ../budoux into the build.
+      - 'budoux/models/*.json'
+      - 'budoux/skip_nodes.json'
+      - '.github/workflows/java-unittest.yml'
+  pull_request:
+    paths:
+      - 'java/**'
+      # java/pom.xml copies these files from ../budoux into the build.
+      - 'budoux/models/*.json'
+      - 'budoux/skip_nodes.json'
+      - '.github/workflows/java-unittest.yml'
+jobs:
+  java-unittest:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Let the other OS jobs finish even when one of them fails.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up JDK 17
+        uses: actions/setup-java@v3
+        with:
+          java-version: '17'
+          distribution: 'temurin'
+      - name: Build with Maven
+        # The package phase also runs the unit tests.
+        run: mvn --batch-mode --update-snapshots -f ./java/pom.xml package
diff --git a/.github/workflows/py-unittest.yml b/.github/workflows/py-unittest.yml
@@ -3,9 +3,11 @@ on:
   push:
     paths-ignore:
       - 'javascript/**'
+      - 'java/**'  # Java-only changes are tested by java-unittest.yml
   pull_request:
     paths-ignore:
       - 'javascript/**'
+      - 'java/**'  # Java-only changes are tested by java-unittest.yml
 jobs:
   python-unittest:
     runs-on: ${{ matrix.os }}

diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml
@@ -40,6 +40,18 @@ jobs:
         working-directory: ./javascript
       - run: npm run lint
         working-directory: ./javascript
+  java-style-check:  # verifies that the Java sources follow google-java-format
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3  # same major version as java-unittest.yml
+      - uses: actions/setup-java@v3
+        with:
+          java-version: '17'
+          distribution: 'temurin'
+      - name: Google Java Format
+        uses: axel-op/googlejavaformat-action@fe78db8a90171b6a836449f8d0e982d5d71e5c5a
+        with:
+          args: "--dry-run --set-exit-if-changed"  # report only; fail on any diff
   markdown-style-check:
     runs-on: ubuntu-latest
     steps:

diff --git a/README.md b/README.md
@@ -31,8 +31,7 @@ Last but not least, BudouX supports HTML inputs.
 
 - Python
 - [JavaScript](https://github.com/google/budoux/tree/main/javascript/)
-
-For details about the JavaScript module, please visit [JavaScript README](https://github.com/google/budoux/tree/main/javascript/README.md).
+- [Java](https://github.com/google/budoux/tree/main/java/)
 
 ## Python module
 

diff --git a/java/.gitignore b/java/.gitignore
@@ -0,0 +1,2 @@
+target
+src/main/resources
diff --git a/java/README.md b/java/README.md
@@ -0,0 +1,63 @@
+# BudouX Java Module
+
+BudouX is a standalone, small, and language-neutral phrase segmenter tool that
+provides beautiful and legible line breaks.
+
+For more details about the project, please refer to the [project README](https://github.com/google/budoux/).
+
+## Demo
+
+<https://google.github.io/budoux>
+
+## Usage
+
+### Simple usage
+
+You can get a list of phrases by feeding a sentence to the parser.
+The easiest way to get a parser is to load the default parser for each language.
+
+```java
+import com.google.budoux.Parser;
+
+public class App
+{
+    public static void main( String[] args )
+    {
+        Parser parser = Parser.loadDefaultJapaneseParser();
+        System.out.println(parser.parse("今日は良い天気ですね。"));
+        // [今日は, 良い, 天気ですね。]
+    }
+}
+```
+
+#### Supported languages and their default parsers
+
+- Japanese: `Parser.loadDefaultJapaneseParser()`
+- Simplified Chinese: `Parser.loadDefaultSimplifiedChineseParser()`
+- Traditional Chinese: `Parser.loadDefaultTraditionalChineseParser()`
+
+### Working with HTML
+
+If you want to use the result in a website, you can use the `translateHTMLString`
+method to get an HTML string with non-breaking markup to wrap phrases.
+
+```java
+System.out.println(parser.translateHTMLString("今日は<strong>良い天気</strong>ですね。"));
+//<span style="word-break: keep-all; overflow-wrap: break-word;">今日は<strong><wbr>良い<wbr>天気</strong>ですね。</span>
+```
+
+## Caveat
+
+BudouX supports HTML inputs and outputs HTML strings with markup applied to wrap
+phrases, but it's not meant to be used as an HTML sanitizer.
+**BudouX doesn't sanitize any inputs.**
+Malicious HTML inputs yield malicious HTML outputs.
+Please use it with an appropriate sanitizer library if you don't trust the input.
+
+## Author
+
+[Shuhei Iitsuka](https://tushuhei.com)
+
+## Disclaimer
+
+This is not an officially supported Google product.
diff --git a/java/pom.xml b/java/pom.xml
@@ -0,0 +1,118 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright 2023 Google LLC
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      https://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.google.budoux</groupId>
+  <artifactId>budoux</artifactId> <!-- Maven coordinates of the BudouX Java module -->
+  <version>0.5.1</version>
+
+  <name>BudouX</name>
+  <url>https://google.github.io/budoux/</url>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <maven.compiler.source>1.8</maven.compiler.source> <!-- source level: Java 8 -->
+    <maven.compiler.target>1.8</maven.compiler.target> <!-- bytecode level: Java 8 -->
+  </properties>
+  <dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.13.2</version>
+      <scope>test</scope> <!-- only on the test classpath -->
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.gson</groupId>
+      <artifactId>gson</artifactId> <!-- JSON parsing, e.g. skip_nodes.json in HTMLProcessor -->
+      <version>2.10.1</version>
+    </dependency>
+    <dependency>
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId> <!-- HTML parsing in HTMLProcessor -->
+      <version>1.15.3</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
+      <plugins>
+        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
+        <plugin>
+          <artifactId>maven-clean-plugin</artifactId>
+          <version>3.1.0</version>
+        </plugin>
+        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
+        <plugin>
+          <artifactId>maven-resources-plugin</artifactId>
+          <version>3.0.2</version>
+          <executions> <!-- NOTE(review): declared under pluginManagement; confirm this execution is picked up by the build -->
+            <execution>
+              <id>copy-data</id> <!-- copies the shared JSON data files into this module -->
+              <phase>generate-resources</phase>
+              <goals>
+                <goal>copy-resources</goal>
+              </goals>
+              <configuration>
+                <outputDirectory>${basedir}/src/main/resources</outputDirectory> <!-- git-ignored, see java/.gitignore -->
+                <resources>
+                  <resource>
+                    <directory>../budoux</directory> <!-- sibling directory of java/ in the repository -->
+                    <includes>
+                      <include>models/*.json</include>
+                      <include>skip_nodes.json</include> <!-- read by HTMLProcessor as /skip_nodes.json -->
+                    </includes>
+                  </resource>
+                </resources>
+              </configuration>
+            </execution>
+          </executions>
+        </plugin>
+        <plugin>
+          <artifactId>maven-compiler-plugin</artifactId>
+          <version>3.8.0</version>
+        </plugin>
+        <plugin>
+          <artifactId>maven-surefire-plugin</artifactId>
+          <version>2.22.1</version>
+        </plugin>
+        <plugin>
+          <artifactId>maven-jar-plugin</artifactId>
+          <version>3.0.2</version>
+        </plugin>
+        <plugin>
+          <artifactId>maven-install-plugin</artifactId>
+          <version>2.5.2</version>
+        </plugin>
+        <plugin>
+          <artifactId>maven-deploy-plugin</artifactId>
+          <version>2.8.2</version>
+        </plugin>
+        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
+        <plugin>
+          <artifactId>maven-site-plugin</artifactId>
+          <version>3.7.1</version>
+        </plugin>
+        <plugin>
+          <artifactId>maven-project-info-reports-plugin</artifactId>
+          <version>3.0.0</version>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+</project>
diff --git a/java/src/main/java/com/google/budoux/HTMLProcessor.java b/java/src/main/java/com/google/budoux/HTMLProcessor.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.budoux;
+
+import com.google.gson.Gson;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.stream.Collectors;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
+
+/** Processes phrases into an HTML string wrapping them in non-breaking markup. */
+final class HTMLProcessor {
+  /** Node names (looked up in upper case) inside which no {@code <wbr>} is inserted. */
+  private static final Set<String> skipNodes;
+
+  /** Inline style of the {@code <span>} that wraps the resolved output. */
+  private static final String STYLE = "word-break: keep-all; overflow-wrap: break-word;";
+
+  static {
+    // Decode the bundled list as UTF-8 instead of the platform default charset, and close the
+    // stream once it has been parsed.
+    String[] skipNodesStrings;
+    try (InputStream inputStream = HTMLProcessor.class.getResourceAsStream("/skip_nodes.json")) {
+      if (inputStream == null) {
+        throw new IllegalStateException("Missing classpath resource: /skip_nodes.json");
+      }
+      try (Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
+        skipNodesStrings = new Gson().fromJson(reader, String[].class);
+      }
+    } catch (IOException e) {
+      throw new UncheckedIOException(e);
+    }
+    skipNodes = new HashSet<>(Arrays.asList(skipNodesStrings));
+  }
+
+  /**
+   * Re-serializes the visited nodes into an HTML string, inserting {@code <wbr>} at the phrase
+   * boundaries found inside text nodes.
+   */
+  private static class PhraseResolvingNodeVisitor implements NodeVisitor {
+    /** Joins the phrases, so meeting it while scanning the text marks a phrase boundary. */
+    private static final char SEP = '\uFFFF';
+
+    private final String phrasesJoined;
+    private final StringBuilder output = new StringBuilder();
+    /** Index in {@code phrasesJoined} of the character expected next from the text nodes. */
+    private int scanIndex = 0;
+    /** Number of currently open elements that are listed in {@code skipNodes}. */
+    private int skipDepth = 0;
+
+    PhraseResolvingNodeVisitor(List<String> phrases) {
+      this.phrasesJoined = String.join(Character.toString(SEP), phrases);
+    }
+
+    public StringBuilder getOutput() {
+      return output;
+    }
+
+    @Override
+    public void head(Node node, int depth) {
+      // Strings must be compared with equals(); == only compares object identity.
+      if ("body".equals(node.nodeName())) {
+        return;
+      }
+      if (node instanceof Element) {
+        String attributesEncoded =
+            node.attributes().asList().stream()
+                .map(attribute -> " " + attribute.toString())
+                .collect(Collectors.joining(""));
+        output.append(String.format("<%s%s>", node.nodeName(), attributesEncoded));
+        if (isSkipNode(node)) {
+          skipDepth++;
+        }
+      } else if (node instanceof TextNode) {
+        String data = ((TextNode) node).getWholeText();
+        for (int i = 0; i < data.length(); i++) {
+          char c = data.charAt(i);
+          // A mismatch is taken to be SEP, i.e. a phrase boundary. The bounds check keeps text
+          // that runs past the joined phrases from throwing.
+          if (scanIndex < phrasesJoined.length() && c != phrasesJoined.charAt(scanIndex)) {
+            if (skipDepth == 0) {
+              output.append("<wbr>");
+            }
+            scanIndex++;
+          }
+          scanIndex++;
+          appendEscaped(c);
+        }
+      }
+    }
+
+    @Override
+    public void tail(Node node, int depth) {
+      if ("body".equals(node.nodeName())) {
+        return;
+      }
+      // Only elements were opened in head(), so only elements may emit an end tag.
+      if (!(node instanceof Element)) {
+        return;
+      }
+      if (isSkipNode(node)) {
+        // Leaving a skip element re-enables <wbr> insertion once no other one is open.
+        skipDepth--;
+      }
+      // Void elements such as <br> take no end tag; "</br>" would be parsed as a second <br>.
+      if (((Element) node).tag().isEmpty()) {
+        return;
+      }
+      output.append(String.format("</%s>", node.nodeName()));
+    }
+
+    /** Returns whether {@code node} is an element inside which no {@code <wbr>} is inserted. */
+    private static boolean isSkipNode(Node node) {
+      // Locale.ROOT keeps the case mapping independent of the default locale.
+      return skipNodes.contains(node.nodeName().toUpperCase(Locale.ROOT));
+    }
+
+    /** Appends {@code c}, escaping the characters that would otherwise be read as markup. */
+    private void appendEscaped(char c) {
+      switch (c) {
+        case '&':
+          output.append("&amp;");
+          break;
+        case '<':
+          output.append("&lt;");
+          break;
+        case '>':
+          output.append("&gt;");
+          break;
+        default:
+          output.append(c);
+      }
+    }
+  }
+
+  /**
+   * Wraps phrases in the HTML string with non-breaking markup.
+   *
+   * @param phrases the phrases included in the HTML string.
+   * @param html the HTML string to resolve.
+   * @return the HTML string of phrases wrapped in non-breaking markup.
+   */
+  public static String resolve(List<String> phrases, String html) {
+    Document doc = Jsoup.parseBodyFragment(html);
+    PhraseResolvingNodeVisitor nodeVisitor = new PhraseResolvingNodeVisitor(phrases);
+    doc.body().traverse(nodeVisitor);
+    return String.format("<span style=\"%s\">%s</span>", STYLE, nodeVisitor.getOutput());
+  }
+
+  /**
+   * Gets the text content from the input HTML string.
+   *
+   * @param html an HTML string.
+   * @return the text content.
+   */
+  public static String getText(String html) {
+    return Jsoup.parseBodyFragment(html).text();
+  }
+}