From 352f42cd2c459d7bc0a98b0a6ec7e76377cce911 Mon Sep 17 00:00:00 2001 From: Michael Griffiths Date: Wed, 31 Dec 2014 13:01:59 +0000 Subject: [PATCH] fuzzy completion wip --- project.clj | 2 + src/{ => clj}/cljs_tooling/complete.clj | 50 +++++- src/{ => clj}/cljs_tooling/info.clj | 0 src/{ => clj}/cljs_tooling/util/analysis.clj | 0 src/{ => clj}/cljs_tooling/util/misc.clj | 0 src/java/cljs_tooling/DamerauLevenshtein.java | 160 ++++++++++++++++++ src/java/cljs_tooling/LiquidMetal.java | 88 ++++++++++ test/cljs_tooling/test_complete.clj | 5 +- 8 files changed, 295 insertions(+), 10 deletions(-) rename src/{ => clj}/cljs_tooling/complete.clj (60%) rename src/{ => clj}/cljs_tooling/info.clj (100%) rename src/{ => clj}/cljs_tooling/util/analysis.clj (100%) rename src/{ => clj}/cljs_tooling/util/misc.clj (100%) create mode 100644 src/java/cljs_tooling/DamerauLevenshtein.java create mode 100644 src/java/cljs_tooling/LiquidMetal.java diff --git a/project.clj b/project.clj index 28a5213..ac9d826 100644 --- a/project.clj +++ b/project.clj @@ -4,6 +4,8 @@ :license {:name "Eclipse Public License" :url "http://www.eclipse.org/legal/epl-v10.html"} :dependencies [] + :source-paths ["src/clj"] + :java-source-paths ["src/java"] :profiles {:dev {:dependencies [[org.clojure/clojure "1.5.1"] [org.clojure/clojurescript "0.0-2202"] [org.clojure/core.async "0.1.303.0-886421-alpha"] diff --git a/src/cljs_tooling/complete.clj b/src/clj/cljs_tooling/complete.clj similarity index 60% rename from src/cljs_tooling/complete.clj rename to src/clj/cljs_tooling/complete.clj index 6ed0913..6741769 100644 --- a/src/cljs_tooling/complete.clj +++ b/src/clj/cljs_tooling/complete.clj @@ -1,7 +1,8 @@ (ns cljs-tooling.complete "Standalone auto-complete library based on cljs analyzer state" (:require [cljs-tooling.util.analysis :as a] - [cljs-tooling.util.misc :as u])) + [cljs-tooling.util.misc :as u]) + (:import [cljs_tooling DamerauLevenshtein LiquidMetal])) ;;; TODO (defn ns-classes @@ -61,14 +62,47 @@ 
(scoped-completions env sym context-ns) (unscoped-completions env context-ns))) +(def ^:private damerau-levenshtein-scorer (cljs_tooling.DamerauLevenshtein. 1 1 1 1)) + +(defn- damerau-levenshtein-score + "Returns the result of calling .execute on the shared scorer (constructed with all four costs set to 1) for source and target." + [source target] + (.execute damerau-levenshtein-scorer source target)) + +(defn damerau-levenshtein-match? + "Fuzzy prefix check. When prefix is no longer than name, truncates name to the length of prefix and returns true if the Damerau-Levenshtein score between that truncation and prefix is at most the ceiling of (prefix length / 4). Returns nil when prefix is longer than name." + [name prefix] + (when (<= (.length prefix) (.length name)) + (let [name (subs name 0 (.length prefix)) + threshold (java.lang.Math/ceil (/ (.length prefix) 4))] + (>= threshold (damerau-levenshtein-score name prefix))))) + +(defn liquidmetal-match? + "Returns true when cljs_tooling.LiquidMetal/score of name against prefix is at least 0.5." + [name prefix] + (<= 0.5 (cljs_tooling.LiquidMetal/score name prefix))) + +(defn prefix-match? + "Returns true when name starts with prefix (exact String .startsWith test)." + [name prefix] + (.startsWith name prefix)) + +(defn match? + "Default matcher used by completions: truthy when name satisfies prefix-match?, damerau-levenshtein-match? or liquidmetal-match? for prefix, tried in that order." + [name prefix] + (or (prefix-match? name prefix) + (damerau-levenshtein-match? name prefix) + (liquidmetal-match? name prefix))) + (defn completions "Return a sequence of matching completions given current namespace and a prefix string" ([env prefix] (completions env prefix nil)) - ([env prefix context-ns] - (->> (potential-completions env (u/as-sym prefix) (u/as-sym context-ns)) - distinct - (map str) - (filter #(.startsWith % prefix)) - sort))) - + ([env prefix context-ns] (completions env prefix context-ns match?)) + ([env prefix context-ns match-fn] + (->> (potential-completions env (u/as-sym prefix) (u/as-sym context-ns)) + distinct + (map str) + (filter #(match-fn % prefix)) + sort + doall))) +(defn completions-test + "Debug helper: for every potential completion whose string form passes damerau-levenshtein-match? against prefix, prints the candidate followed by the Damerau-Levenshtein score of its full string form against prefix." + [env prefix context-ns] + (doseq [candidate (potential-completions env (u/as-sym prefix) (u/as-sym context-ns))] + (when (damerau-levenshtein-match? 
(str candidate) prefix) + (println candidate (damerau-levenshtein-score (str candidate) prefix))))) diff --git a/src/cljs_tooling/info.clj b/src/clj/cljs_tooling/info.clj similarity index 100% rename from src/cljs_tooling/info.clj rename to src/clj/cljs_tooling/info.clj diff --git a/src/cljs_tooling/util/analysis.clj b/src/clj/cljs_tooling/util/analysis.clj similarity index 100% rename from src/cljs_tooling/util/analysis.clj rename to src/clj/cljs_tooling/util/analysis.clj diff --git a/src/cljs_tooling/util/misc.clj b/src/clj/cljs_tooling/util/misc.clj similarity index 100% rename from src/cljs_tooling/util/misc.clj rename to src/clj/cljs_tooling/util/misc.clj diff --git a/src/java/cljs_tooling/DamerauLevenshtein.java b/src/java/cljs_tooling/DamerauLevenshtein.java new file mode 100644 index 0000000..98e9fbf --- /dev/null +++ b/src/java/cljs_tooling/DamerauLevenshtein.java @@ -0,0 +1,160 @@ +// Adapted from https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java + +package cljs_tooling; + +import java.util.HashMap; +import java.util.Map; + +/* Copyright (c) 2012 Kevin L. Stern + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** + * The Damerau-Levenshtein Algorithm is an extension to the Levenshtein + * Algorithm which solves the edit distance problem between a source string and + * a target string with the following operations: + * + * <ul> + * <li>Character Insertion</li> + * <li>Character Deletion</li> + * <li>Character Replacement</li> + * <li>Adjacent Character Swap</li> + * </ul> + * + * Note that the adjacent character swap operation is an edit that may be + * applied when two adjacent characters in the source string match two adjacent + * characters in the target string, but in reverse order, rather than a general + * allowance for adjacent character swaps. + *

+ * + * This implementation allows the client to specify the costs of the various + * edit operations with the restriction that the cost of two swap operations + * must not be less than the cost of a delete operation followed by an insert + * operation. This restriction is required to preclude two swaps involving the + * same character being required for optimality which, in turn, enables a fast + * dynamic programming solution. + *

/**
 * Computes the Damerau-Levenshtein edit distance between a source string and
 * a target string, with caller-supplied costs for each edit operation.
 *
 * <p>The supported operations are:
 * <ul>
 * <li>character insertion,</li>
 * <li>character deletion,</li>
 * <li>character replacement, and</li>
 * <li>adjacent character swap.</li>
 * </ul>
 *
 * <p>The swap operation applies when two adjacent characters in the source
 * match two adjacent characters in the target in reverse order; it is not a
 * general allowance for arbitrary swaps.
 *
 * <p>The costs must satisfy {@code 2 * swapCost >= insertCost + deleteCost}.
 * This guarantees that two swaps involving the same character are never
 * needed for optimality, which is what makes the dynamic-programming solution
 * valid.
 *
 * <p>Running time and space are both O(n*m), where n is the length of the
 * source string and m is the length of the target string.
 *
 * @author Kevin L. Stern
 */
public class DamerauLevenshtein {
    private final int deleteCost, insertCost, replaceCost, swapCost;

    /**
     * Creates a distance calculator with the given operation costs.
     *
     * @param deleteCost
     *            the cost of deleting a character.
     * @param insertCost
     *            the cost of inserting a character.
     * @param replaceCost
     *            the cost of replacing a character.
     * @param swapCost
     *            the cost of swapping two adjacent characters.
     * @throws IllegalArgumentException
     *             if {@code 2 * swapCost < insertCost + deleteCost}.
     */
    public DamerauLevenshtein(int deleteCost, int insertCost,
                              int replaceCost, int swapCost) {
        /*
         * Required to facilitate the premise to the algorithm that two swaps
         * of the same character are never required for optimality.
         */
        if (2 * swapCost < insertCost + deleteCost) {
            throw new IllegalArgumentException("Unsupported cost assignment");
        }
        this.deleteCost = deleteCost;
        this.insertCost = insertCost;
        this.replaceCost = replaceCost;
        this.swapCost = swapCost;
    }

    /**
     * Computes the Damerau-Levenshtein distance between the specified source
     * string and the specified target string.
     *
     * @param source
     *            the string to transform; must not be {@code null}.
     * @param target
     *            the string to transform into; must not be {@code null}.
     * @return the minimum total cost of edits turning {@code source} into
     *         {@code target}.
     */
    public int execute(String source, String target) {
        // An empty side means the distance is pure insertion or pure deletion.
        if (source.length() == 0) {
            return target.length() * insertCost;
        }
        if (target.length() == 0) {
            return source.length() * deleteCost;
        }
        // table[i][j] = cost of turning source[0..i] into target[0..j].
        int[][] table = new int[source.length()][target.length()];
        // Last row index at which each source character has been seen so far.
        // Parameterized so that get() yields an Integer; with the raw Map the
        // assignment to candidateSwapIndex below does not compile.
        Map<Character, Integer> sourceIndexByCharacter =
            new HashMap<Character, Integer>();
        if (source.charAt(0) != target.charAt(0)) {
            table[0][0] = Math.min(replaceCost, deleteCost + insertCost);
        }
        sourceIndexByCharacter.put(source.charAt(0), 0);
        // First column: source prefixes against the first target character.
        for (int i = 1; i < source.length(); i++) {
            int deleteDistance = table[i - 1][0] + deleteCost;
            int insertDistance = (i + 1) * deleteCost + insertCost;
            int matchDistance = i * deleteCost
                + (source.charAt(i) == target.charAt(0) ? 0 : replaceCost);
            table[i][0] = Math.min(Math.min(deleteDistance, insertDistance),
                matchDistance);
        }
        // First row: the first source character against target prefixes.
        for (int j = 1; j < target.length(); j++) {
            int deleteDistance = (j + 1) * insertCost + deleteCost;
            int insertDistance = table[0][j - 1] + insertCost;
            int matchDistance = j * insertCost
                + (source.charAt(0) == target.charAt(j) ? 0 : replaceCost);
            table[0][j] = Math.min(Math.min(deleteDistance, insertDistance),
                matchDistance);
        }
        for (int i = 1; i < source.length(); i++) {
            // Largest column seen so far in this row whose target character
            // equals source[i]; -1 when there is none yet.
            int maxSourceLetterMatchIndex =
                source.charAt(i) == target.charAt(0) ? 0 : -1;
            for (int j = 1; j < target.length(); j++) {
                Integer candidateSwapIndex =
                    sourceIndexByCharacter.get(target.charAt(j));
                // Read before the possible update below: the swap partner
                // must come from an earlier column.
                int jSwap = maxSourceLetterMatchIndex;
                int deleteDistance = table[i - 1][j] + deleteCost;
                int insertDistance = table[i][j - 1] + insertCost;
                int matchDistance = table[i - 1][j - 1];
                if (source.charAt(i) != target.charAt(j)) {
                    matchDistance += replaceCost;
                } else {
                    maxSourceLetterMatchIndex = j;
                }
                int swapDistance;
                if (candidateSwapIndex != null && jSwap != -1) {
                    int iSwap = candidateSwapIndex;
                    int preSwapCost;
                    if (iSwap == 0 && jSwap == 0) {
                        preSwapCost = 0;
                    } else {
                        preSwapCost =
                            table[Math.max(0, iSwap - 1)][Math.max(0, jSwap - 1)];
                    }
                    // Characters strictly between the swapped pair are deleted
                    // from the source and inserted for the target.
                    swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost
                        + (j - jSwap - 1) * insertCost + swapCost;
                } else {
                    swapDistance = Integer.MAX_VALUE;
                }
                table[i][j] = Math.min(
                    Math.min(Math.min(deleteDistance, insertDistance), matchDistance),
                    swapDistance);
            }
            sourceIndexByCharacter.put(source.charAt(i), i);
        }
        return table[source.length() - 1][target.length() - 1];
    }
}
/*
 * Adapted from https://gist.github.com/rmsy/5137611
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/**
 * Abbreviation scorer: rates how well an abbreviation matches a string as a
 * case-insensitive, in-order subsequence.
 *
 * <p>Every character of the string receives a per-character score and the
 * result is their average, so scores lie between {@link #SCORE_NO_MATCH} and
 * {@link #SCORE_MATCH}.
 */
public class LiquidMetal {
    public static final double SCORE_NO_MATCH = 0.0;
    public static final double SCORE_MATCH = 1.0;
    public static final double SCORE_TRAILING = 0.8;
    public static final double SCORE_TRAILING_BUT_STARTED = 0.9;
    public static final double SCORE_BUFFER = 0.85;
    public static final String WORD_SEPARATORS = " \t_-./";

    /**
     * Scores {@code abbreviation} against {@code string}.
     *
     * @param string
     *            the candidate being matched; must not be {@code null}.
     * @param abbreviation
     *            the characters typed by the user; must not be {@code null}.
     * @return {@link #SCORE_TRAILING} for an empty abbreviation,
     *         {@link #SCORE_NO_MATCH} when the abbreviation is longer than the
     *         string or is not a subsequence of it, otherwise the mean of the
     *         per-character scores.
     */
    public static final double score(String string, String abbreviation) {
        if (abbreviation.length() == 0) {
            return SCORE_TRAILING;
        }
        if (abbreviation.length() > string.length()) {
            return SCORE_NO_MATCH;
        }

        double[] scores = buildScoreArray(string, abbreviation);
        if (scores == null) {
            // Complete miss: some abbreviation character was not found.
            return SCORE_NO_MATCH;
        }

        double sum = 0.0;
        for (double characterScore : scores) {
            sum += characterScore;
        }
        return sum / scores.length;
    }

    /**
     * Builds one score per character of {@code string}, or returns
     * {@code null} when {@code abbreviation} is not a subsequence of it.
     */
    private static final double[] buildScoreArray(String string, String abbreviation) {
        double[] scores = new double[string.length()];
        // Fold case one char at a time. String.toLowerCase() depends on the
        // default locale and may change the length (U+0130 becomes two chars),
        // which would make indices found in the lowered copy invalid for
        // `string` and `scores`.
        String lower = toLowerCasePreservingLength(string);
        String chars = toLowerCasePreservingLength(abbreviation);

        int lastIndex = -1;
        boolean started = false;
        for (int i = 0; i < chars.length(); i++) {
            char c = chars.charAt(i);
            int index = lower.indexOf(c, lastIndex + 1);

            if (index == -1) {
                return null; // signal no match
            }
            if (index == 0) {
                started = true;
            }

            if (isNewWord(string, index)) {
                // The separator just before the match counts as matched and
                // the skipped characters before it get the buffer score.
                scores[index - 1] = SCORE_MATCH;
                fillArray(scores, SCORE_BUFFER, lastIndex + 1, index - 1);
            } else if (isUpperCase(string, index)) {
                fillArray(scores, SCORE_BUFFER, lastIndex + 1, index);
            } else {
                fillArray(scores, SCORE_NO_MATCH, lastIndex + 1, index);
            }

            scores[index] = SCORE_MATCH;
            lastIndex = index;
        }

        double trailingScore = started ? SCORE_TRAILING_BUT_STARTED : SCORE_TRAILING;
        fillArray(scores, trailingScore, lastIndex + 1, scores.length);
        return scores;
    }

    /**
     * Returns whether the character at {@code index} starts a new word, i.e.
     * whether the character immediately before it is a word separator.
     * Index 0 never counts, because there is no preceding character.
     */
    private static final boolean isNewWord(String string, int index) {
        if (index == 0) {
            return false;
        }
        // Look at the PRECEDING character: the caller assigns the bonus to
        // scores[index - 1], which is the separator slot.
        char previous = string.charAt(index - 1);
        return WORD_SEPARATORS.indexOf(previous) != -1;
    }

    /** Sets {@code array[from]} up to but excluding {@code array[to]} to {@code value}. */
    private static final void fillArray(double[] array, double value, int from, int to) {
        for (int i = from; i < to; i++) {
            array[i] = value;
        }
    }

    /** Returns whether the character at {@code index} is an ASCII upper-case letter. */
    private static final boolean isUpperCase(String string, int index) {
        char c = string.charAt(index);
        return ('A' <= c && c <= 'Z');
    }

    /** Lower-cases each char independently so the result has the same length as the input. */
    private static String toLowerCasePreservingLength(String s) {
        char[] folded = s.toCharArray();
        for (int i = 0; i < folded.length; i++) {
            folded[i] = Character.toLowerCase(folded[i]);
        }
        return new String(folded);
    }
}