-
Notifications
You must be signed in to change notification settings - Fork 0
/
Helper.java
132 lines (114 loc) · 3.8 KB
/
Helper.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import java.util.ArrayList;
import java.util.Iterator;
import java.util.HashMap;
import java.util.AbstractMap;
import java.util.Collection;
import java.util.HashSet;
import java.util.Comparator;
import java.util.Set;
import java.util.PriorityQueue;
import java.util.Map;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
public class Helper {
static final float WEIGHT_DIACRITIC = 0.1f;
static final float WEIGHT_CASE = 0.5f;
static final float WEIGHT_DIACRITIC_MIX = 100f;
static final char EMPTY_CHAR = '\u00b7';
static boolean isDiacritic(Character c) {
return UCharacter.hasBinaryProperty(c, UProperty.DIACRITIC);
}
static float weight(Character c) {
if (c == EMPTY_CHAR)
return 0;
if (isDiacritic(c))
return WEIGHT_DIACRITIC;
return 1;
}
public static float distance(char c1, char c2) {
if (c1 == c2)
return 0;
if (UCharacter.toLowerCase(c1) == UCharacter.toLowerCase(c2))
return WEIGHT_CASE;
boolean isDia1 = isDiacritic(c1);
boolean isDia2 = isDiacritic(c2);
if (!isDia1 && !isDia2) {
return 1;
}
if (isDia1 && isDia2)
return WEIGHT_DIACRITIC;
return WEIGHT_DIACRITIC_MIX; // don't replace diacritic with letter, that's an error. force delete and insert instead.
}
public static String pChar(char c) {
return (isDiacritic(c) ? "\u25cc" + c : "") + c;
}
public static float distance(String s, String t) {
int m = s.length();
int n = t.length();
// for all i and j, d[i,j] will hold the Levenshtein distance between
// the first i characters of s and the first j characters of t
// note that d has (m+1)*(n+1) values
float[][] d = new float[m + 1][n + 1];
d[0][0] = 0;
// source prefixes can be transformed into empty string by dropping all characters
for (int i = 1; i <= m; i++)
d[i][0] = d[i - 1][0] + Helper.weight(s.charAt(i - 1)); // cost of deletion
// target prefixes can be reached from empty source prefix by inserting every character
for (int j = 1; j <= n; j++)
d[0][j] = d[0][j - 1] + Helper.weight(t.charAt(j - 1)); // cost of insertion
for (int j = 1; j <= n; j++) {
char tj = t.charAt(j - 1);
for (int i = 1; i <= m; i++) {
char si = s.charAt(i - 1);
float costDel = d[i - 1][j ] + Helper.weight(si);
float costIns = d[i ][j - 1] + Helper.weight(tj);
float costSbs = d[i - 1][j - 1] + Helper.distance(tj, si); // cost of substitution
d[i][j] = Float.min(costSbs, Float.min(costDel, costIns));
}
}
// dump the matrix
if (false) {
System.out.format("%12s", "");
for (int i = 1; i <= m; i++)
System.out.format("%6s", Helper.pChar(s.charAt(i - 1)));
System.out.println();
for (int j = 0; j <= n; j++) {
if (j == 0)
System.out.format("%6s", "");
else
System.out.format("%6s", Helper.pChar(t.charAt(j - 1)));
for (int i = 0; i <= m; i++)
System.out.format("%6.1f", d[i][j]);
System.out.println();
}
}
// dump the process
if (false) {
int i = m, j = n;
while ((i > 0) || (j > 0)) {
float costDel = (i > 0) ? d[i - 1][j ] : Float.POSITIVE_INFINITY;
float costIns = (j > 0) ? d[i ][j - 1] : Float.POSITIVE_INFINITY;
float costSbs = ((i > 0) && (j > 0)) ? d[i - 1][j - 1] : Float.POSITIVE_INFINITY;
if ((costSbs <= costDel) && (costSbs <= costIns)) {
i--;
j--;
char si = s.charAt(i);
char tj = t.charAt(j);
if (si != tj)
System.out.println("Substitute from '" + pChar(si) + "' to '" + pChar(tj) + "' at pos " + i + ", " + j);
}
else if (costDel <= costIns) {
i--;
char si = s.charAt(i);
System.out.println("Delete '" + pChar(si) + "' from pos " + i + ", " + j);
}
else {
j--;
char tj = t.charAt(j);
System.out.println("Insert '" + pChar(tj) + "' to pos " + i + ", " + j);
}
}
}
return d[m][n];
}
}