Skip to content

Commit

Permalink
Add Text folding to UnorderedMultiValueKey
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Sep 3, 2022
1 parent 7680c81 commit b940cd9
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 22 deletions.
9 changes: 7 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Data/Table.enso
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ polyglot java import org.enso.table.data.table.Table as Java_Table
polyglot java import org.enso.table.data.table.Column as Java_Column
polyglot java import org.enso.table.operations.OrderBuilder
polyglot java import org.enso.table.data.mask.OrderMask
polyglot java import org.enso.table.text.UnicodeNormalizedFold
polyglot java import org.enso.table.text.CaseInsensitiveFold

## Creates a new table from a vector of `[name, items]` pairs.

Expand Down Expand Up @@ -682,14 +684,17 @@ type Table
`Floating_Point_Grouping` warning.
distinct : Column_Selector -> (True|Case_Insensitive) -> Problem_Behavior -> Table
distinct self (columns = By_Name (self.columns.map .name)) case_sensitive=True on_problems=Report_Warning =
# TODO case sensitive!
warning_mapper error = case error of
No_Output_Columns -> Maybe.Some No_Input_Columns_Selected
_ -> Nothing
key_columns = Warning.map_warnings_and_errors warning_mapper <|
Table_Helpers.select_columns internal_columns=self.columns selector=columns reorder=True on_problems=on_problems
java_columns = key_columns.map .java_column
java_table = self.java_table.distinct java_columns.to_array Comparator.new
text_folding_strategy = case case_sensitive of
True -> UnicodeNormalizedFold.INSTANCE
Case_Insensitive_Data locale ->
CaseInsensitiveFold.new locale.java_locale
java_table = self.java_table.distinct java_columns.to_array text_folding_strategy
on_problems.attach_problems_after (Table_Data java_table) <|
problems = java_table.getProblems
Aggregate_Column_Helper.parse_aggregated_problems problems
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.enso.table.aggregations;

import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.index.MultiValueKeyBase;
import org.enso.table.data.index.UnorderedMultiValueKey;
import org.enso.table.data.table.Column;
import org.enso.table.data.table.problems.FloatingPointGrouping;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,21 @@ private boolean findFloats() {
return false;
}

protected static Object foldObject(Object value) {
protected Object foldObject(Object value) {
if (value instanceof Long) {
return value;
} else if (value instanceof Integer) {
return ((Integer) value).longValue();
} else if (value instanceof Byte) {
return ((Byte) value).longValue();
} else if (value instanceof Float && ((Float) value) % 1 == 0) {
return ((Float) value).longValue();
} else if (value instanceof Double && ((Double) value) % 1 == 0) {
return ((Double) value).longValue();
} else if (value instanceof Float) {
return ((Float) value).doubleValue();
} else if (value instanceof Double) {
return value;
} else if (value instanceof Integer i) {
return i.longValue();
} else if (value instanceof Byte b) {
return b.longValue();
} else if (value instanceof Float f && f % 1 == 0) {
return f.longValue();
} else if (value instanceof Double d && d % 1 == 0) {
return d.longValue();
} else if (value instanceof Float f) {
return f.doubleValue();
} else if (value instanceof Double d) {
return d;
}

return value;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
package org.enso.table.data.index;

import org.enso.table.data.column.storage.Storage;
import org.enso.table.text.TextFoldingStrategy;
import org.enso.table.text.UnicodeNormalizedFold;

import java.util.Objects;

public class UnorderedMultiValueKey extends MultiValueKeyBase {
private final int hashCodeValue;
private final TextFoldingStrategy textFoldingStrategy;

public UnorderedMultiValueKey(Storage[] storages, int rowIndex) {
this(storages, rowIndex, UnicodeNormalizedFold.INSTANCE);
}

public UnorderedMultiValueKey(Storage[] storages, int rowIndex, TextFoldingStrategy textFoldingStrategy) {
super(storages, rowIndex);
this.textFoldingStrategy = textFoldingStrategy;

// Precompute HashCode - using Apache.Commons.Collections.Map.MultiKeyMap.hash algorithm
int h = 1;
Expand All @@ -27,13 +35,24 @@ public UnorderedMultiValueKey(Storage[] storages, int rowIndex) {
floatsComputed = true;
}

/**
 * Folds a single key component before it is compared.
 *
 * <p>Text values are passed through the {@code textFoldingStrategy} this key was created with;
 * every other value is handled by the superclass implementation.
 *
 * @param value the raw key component
 * @return the folded representation of {@code value}
 */
@Override
protected Object foldObject(Object value) {
  return (value instanceof String text)
      ? textFoldingStrategy.fold(text)
      : super.foldObject(value);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof MultiValueKeyBase that)) return false;
if (storages.length != that.storages.length) return false;
for (int i = 0; i < storages.length; i++) {
if (!Objects.equals(get(i), that.get(i))) {
Object thisFolded = foldObject(this.get(i));
Object thatFolded = foldObject(that.get(i));
if (!Objects.equals(thisFolded, thatFolded)) {
return false;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.enso.table.error.NoSuchColumnException;
import org.enso.table.error.UnexpectedColumnTypeException;
import org.enso.table.operations.Distinct;
import org.enso.table.text.TextFoldingStrategy;

import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -236,12 +237,12 @@ public Table orderBy(Column[] columns, Long[] directions, Comparator<Object> obj
* Creates a new table keeping only rows with distinct key columns.
*
* @param keyColumns set of columns to use as an Index
* @param objectComparator Object comparator allowing calling back to `compare_to` when needed.
* @param textFoldingStrategy a strategy for folding text columns
* @return a table where duplicate rows with the same key are removed
*/
public Table distinct(Column[] keyColumns, Comparator<Object> objectComparator) {
public Table distinct(Column[] keyColumns, TextFoldingStrategy textFoldingStrategy) {
var problems = new AggregatedProblems();
var rowsToKeep = Distinct.buildDistinctRowsMask(rowCount(), keyColumns, objectComparator, problems);
var rowsToKeep = Distinct.buildDistinctRowsMask(rowCount(), keyColumns, textFoldingStrategy, problems);
int cardinality = rowsToKeep.cardinality();
Column[] newColumns = new Column[this.columns.length];
Index newIx = index.mask(rowsToKeep, cardinality);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,19 @@
import org.enso.table.data.table.Column;
import org.enso.table.data.table.problems.AggregatedProblems;
import org.enso.table.data.table.problems.FloatingPointGrouping;
import org.enso.table.text.TextFoldingStrategy;

import java.util.*;

public class Distinct {
public static BitSet buildDistinctRowsMask(int tableSize, Column[] keyColumns, Comparator<Object> objectComparator, AggregatedProblems problems) {
public static BitSet buildDistinctRowsMask(int tableSize, Column[] keyColumns, TextFoldingStrategy textFoldingStrategy, AggregatedProblems problems) {
var mask = new BitSet();
if (keyColumns.length != 0) {
HashSet<MultiValueKeyBase> visitedRows = new HashSet<>();
int size = keyColumns[0].getSize();
Storage[] storage = Arrays.stream(keyColumns).map(Column::getStorage).toArray(Storage[]::new);
for (int i = 0; i < size; i++) {
UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, i);
UnorderedMultiValueKey key = new UnorderedMultiValueKey(storage, i, textFoldingStrategy);

if (key.hasFloatValues()) {
problems.add(new FloatingPointGrouping("Distinct", i));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.enso.table.text;

import org.enso.base.Text_Utils;

import java.util.Locale;

/**
 * A {@link TextFoldingStrategy} that folds text by delegating to
 * {@code Text_Utils.case_insensitive_key} with a fixed locale.
 *
 * <p>NOTE(review): presumably strings that differ only by letter case fold to the same key under
 * the given locale; confirm against {@code Text_Utils}.
 */
public class CaseInsensitiveFold implements TextFoldingStrategy {

  /** Locale handed to {@code Text_Utils.case_insensitive_key} on every fold. */
  private final Locale foldingLocale;

  /**
   * Creates a folding strategy bound to the given locale.
   *
   * @param locale the locale used when computing the case-insensitive key
   */
  public CaseInsensitiveFold(Locale locale) {
    foldingLocale = locale;
  }

  /**
   * Computes the case-insensitive key of the given text.
   *
   * @param value the text to fold
   * @return the result of {@code Text_Utils.case_insensitive_key} for {@code value} and the
   *     locale supplied at construction
   */
  @Override
  public String fold(String value) {
    final String key = Text_Utils.case_insensitive_key(value, foldingLocale);
    return key;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package org.enso.table.text;

/**
 * A strategy for folding a text value into the form that should be used when comparing it with
 * other text values.
 *
 * <p>This is a single-method interface, so it may be implemented with a lambda or a method
 * reference.
 */
@FunctionalInterface
public interface TextFoldingStrategy {
  /**
   * Folds the given text.
   *
   * @param value the text to fold
   * @return the folded form of {@code value}
   */
  String fold(String value);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
package org.enso.table.text;

import org.enso.base.Text_Utils;

/**
 * A {@link TextFoldingStrategy} that folds text by delegating to {@code Text_Utils.normalize}.
 *
 * <p>The strategy holds no state; a shared instance is available as {@link #INSTANCE}.
 */
public class UnicodeNormalizedFold implements TextFoldingStrategy {

  /** Shared instance of this strategy. */
  public static final UnicodeNormalizedFold INSTANCE = new UnicodeNormalizedFold();

  /**
   * Normalizes the given text.
   *
   * @param value the text to fold
   * @return the result of {@code Text_Utils.normalize} applied to {@code value}
   */
  @Override
  public String fold(String value) {
    final String normalized = Text_Utils.normalize(value);
    return normalized;
  }
}

0 comments on commit b940cd9

Please sign in to comment.