Skip to content

Commit

Permalink
Handle mixed Date and Date Time column within Excel (#11349)
Browse files Browse the repository at this point in the history
- If a column contains both Date and DateTime in Excel, we create a DateTime column.
- If the column contains numbers or text as well then we end up with a mixed column.
![image](https://github.com/user-attachments/assets/b0b98d1c-c5c5-41db-8af5-0c946d8a5b92)
  • Loading branch information
jdunkerley authored Oct 17, 2024
1 parent fb8c492 commit 5f44c51
Show file tree
Hide file tree
Showing 10 changed files with 125 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import project.Data.Time.Time_Period.Time_Period
import project.Data.Time.Time_Zone.Time_Zone
import project.Data.Vector.Vector
import project.Error.Error
import project.Errors.Common.Missing_Argument
import project.Errors.Common.Type_Error
import project.Errors.Illegal_Argument.Illegal_Argument
import project.Errors.Time_Error.Time_Error
Expand Down Expand Up @@ -161,7 +162,7 @@ type Date_Time
@nanosecond (Widget.Numeric_Input minimum=0 maximum=999 display=Display.When_Modified)
@zone Time_Zone.default_widget
new : Integer -> Integer -> Integer -> Integer -> Integer -> Integer -> Integer -> Integer -> Integer -> Time_Zone -> Date_Time ! Time_Error
new year (month = 1) (day = 1) (hour = 0) (minute = 0) (second = 0) (millisecond = 0) (microsecond = 0) (nanosecond = 0) (zone = Time_Zone.system) =
new (year:Integer=(Missing_Argument.throw "year")) (month:Integer=1) (day:Integer=1) (hour:Integer=0) (minute:Integer=0) (second:Integer=0) (millisecond:Integer=0) (microsecond:Integer=0) (nanosecond = 0) (zone:Time_Zone=Time_Zone.system) =
total_nanoseconds = nanosecond + microsecond * 1000 + millisecond * 1000000
Panic.catch JException (new_builtin year month day hour minute second total_nanoseconds zone) cause->
Error.throw (Time_Error.Error cause.payload.getMessage)
Expand Down Expand Up @@ -834,6 +835,20 @@ type Date_Time
zone_format = if self.zone == Time_Zone.system then "" else "'['TT']'"
self.format "yyyy-MM-dd "+time_format+zone_format

## PRIVATE
Convert to an Enso code representation of this Date_Time.
The year, month and day are always emitted positionally; the remaining
components are emitted as named arguments only when they are non-zero, and
the zone only when it is not `Time_Zone.system`.
pretty : Text
pretty self = "(Date_Time.new " + self.year.to_text + " " + self.month.to_text + " " + self.day.to_text
+ (if self.hour == 0 then "" else " hour="+self.hour.to_text)
+ (if self.minute == 0 then "" else " minute="+self.minute.to_text)
+ (if self.second == 0 then "" else " second="+self.second.to_text)
+ (if self.millisecond == 0 then "" else " millisecond="+self.millisecond.to_text)
+ (if self.microsecond == 0 then "" else " microsecond="+self.microsecond.to_text)
+ (if self.nanosecond == 0 then "" else " nanosecond="+self.nanosecond.to_text)
+ (if self.zone == Time_Zone.system then "" else " zone="+self.zone.pretty)
+ ")"


## PRIVATE
Convert to a JavaScript Object representing a Date_Time.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ type Time_Of_Day
## PRIVATE
Convert to an Enso code representation of this Time_Of_Day.
pretty : Text
pretty self = "(Time_Of_Day.new "
pretty self = "(Time_Of_Day.new"
+ (if self.hour == 0 then "" else " hour="+self.hour.to_text)
+ (if self.minute == 0 then "" else " minute="+self.minute.to_text)
+ (if self.second == 0 then "" else " second="+self.second.to_text)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,12 @@ type Time_Zone
zone_names : Vector Text
zone_names = Time_Utils.getZoneNames

## PRIVATE
Convert to an Enso code representation of this Time_Zone.
The result is a `Time_Zone.parse` call on this zone's `zone_id`.
pretty : Text
pretty self = "(Time_Zone.parse '" + self.zone_id + "')"


## PRIVATE
Time_Zone.from (that:JS_Object) =
if that.get "type" == "Time_Zone" && ["id"].all that.contains_key then Time_Zone.parse (that.get "id") else
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package org.enso.table.data.column.builder;

import java.time.LocalDate;
import java.util.Objects;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.datetime.DateStorage;
import org.enso.table.data.column.storage.type.DateTimeType;
import org.enso.table.data.column.storage.type.DateType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.error.ValueTypeMismatchException;
Expand All @@ -14,8 +16,15 @@ protected LocalDate[] newArray(int size) {
return new LocalDate[size];
}

private final boolean allowDateToDateTimeConversion;

/**
 * Creates a date builder with date to date-time conversion disabled.
 *
 * @param size passed through to {@link #DateBuilder(int, boolean)}
 */
public DateBuilder(int size) {
this(size, false);
}

/**
 * Creates a date builder.
 *
 * @param size forwarded to the superclass constructor
 * @param allowDateToDateTimeConversion when true, this builder reports that it can be retyped to
 *     {@code DateTimeType.INSTANCE} and performs that retyping via a {@code DateTimeBuilder}
 */
public DateBuilder(int size, boolean allowDateToDateTimeConversion) {
super(size);
this.allowDateToDateTimeConversion = allowDateToDateTimeConversion;
}

@Override
Expand Down Expand Up @@ -45,4 +54,24 @@ public boolean accepts(Object o) {
protected Storage<LocalDate> doSeal() {
return new DateStorage(data, currentSize);
}

/**
 * Reports whether this builder can be retyped to the given storage type.
 *
 * <p>{@code DateTimeType.INSTANCE} is accepted when date to date-time conversion was enabled at
 * construction; every other case is decided by the superclass.
 *
 * @param type the requested target storage type
 * @return true if retyping to {@code type} is supported
 */
@Override
public boolean canRetypeTo(StorageType type) {
  return (allowDateToDateTimeConversion && Objects.equals(type, DateTimeType.INSTANCE))
      || super.canRetypeTo(type);
}

/**
 * Converts this builder into a builder of the given storage type.
 *
 * <p>When date to date-time conversion is enabled and the target is {@code
 * DateTimeType.INSTANCE}, the entries appended so far are re-appended to a new {@code
 * DateTimeBuilder} that has the same capacity and also has the conversion enabled. All other
 * targets are handled by the superclass.
 *
 * @param type the requested target storage type
 * @return the builder holding the converted entries
 */
@Override
public TypedBuilder retypeTo(StorageType type) {
  boolean toDateTime =
      allowDateToDateTimeConversion && Objects.equals(type, DateTimeType.INSTANCE);
  if (!toDateTime) {
    return super.retypeTo(type);
  }

  DateTimeBuilder converted = new DateTimeBuilder(data.length, true);
  int index = 0;
  while (index < currentSize) {
    converted.appendNoGrow(data[index]);
    index++;
  }
  return converted;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.BitSet;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.datetime.DateStorage;
import org.enso.table.data.column.storage.datetime.DateTimeStorage;
Expand All @@ -19,8 +20,17 @@ protected ZonedDateTime[] newArray(int size) {
return new ZonedDateTime[size];
}

private final boolean allowDateToDateTimeConversion;
private final BitSet wasLocalDate;

/**
 * Creates a date-time builder with date to date-time conversion disabled.
 *
 * @param size passed through to {@link #DateTimeBuilder(int, boolean)}
 */
public DateTimeBuilder(int size) {
this(size, false);
}

/**
 * Creates a date-time builder.
 *
 * @param size forwarded to the superclass constructor; also the initial size of the bit set that
 *     tracks converted entries
 * @param allowDateToDateTimeConversion when true, {@code LocalDate} values are accepted and
 *     converted on append, and the converted positions are recorded in {@code wasLocalDate}
 */
public DateTimeBuilder(int size, boolean allowDateToDateTimeConversion) {
super(size);
this.allowDateToDateTimeConversion = allowDateToDateTimeConversion;
// The bit set only records converted slots, so it is left null when conversion is disabled.
this.wasLocalDate = allowDateToDateTimeConversion ? new BitSet(size) : null;
}

@Override
Expand All @@ -39,7 +49,12 @@ private ZonedDateTime convertDate(LocalDate date) {
@Override
public void appendNoGrow(Object o) {
try {
data[currentSize++] = (ZonedDateTime) o;
if (allowDateToDateTimeConversion && o instanceof LocalDate localDate) {
data[currentSize++] = convertDate(localDate);
wasLocalDate.set(currentSize - 1);
} else {
data[currentSize++] = (ZonedDateTime) o;
}
} catch (ClassCastException e) {
throw new ValueTypeMismatchException(getType(), o);
}
Expand Down Expand Up @@ -75,11 +90,29 @@ public void appendBulkStorage(Storage<?> storage) {

/**
 * Reports whether {@code o} can be appended to this builder.
 */
@Override
public boolean accepts(Object o) {
// NOTE(review): this listing is a rendered diff without +/- markers. The next line looks like
// the pre-change statement; the return after it appears to be its replacement, which also
// accepts LocalDate values when allowDateToDateTimeConversion is set. Confirm against the
// repository before relying on this listing.
return o instanceof ZonedDateTime;
return o instanceof ZonedDateTime || (allowDateToDateTimeConversion && o instanceof LocalDate);
}

/**
 * Creates the {@code DateTimeStorage} for this builder from {@code data} and {@code currentSize}.
 */
@Override
protected Storage<ZonedDateTime> doSeal() {
return new DateTimeStorage(data, currentSize);
}

/**
 * Copies the entries appended so far into {@code items}.
 *
 * <p>When date to date-time conversion is enabled, every entry that was appended as a {@code
 * LocalDate} (as recorded in {@code wasLocalDate}) is written to {@code items} as a {@code
 * LocalDate} again rather than as the stored {@code ZonedDateTime}. When the conversion is
 * disabled the superclass implementation is used unchanged.
 *
 * @param items destination array receiving the first {@code currentSize} entries
 */
@Override
public void retypeToMixed(Object[] items) {
  if (!allowDateToDateTimeConversion) {
    super.retypeToMixed(items);
    return;
  }
  if (currentSize < 0) {
    return;
  }

  System.arraycopy(data, 0, items, 0, currentSize);

  // Put the LocalDate back into every slot that was converted on append.
  for (int slot = wasLocalDate.nextSetBit(0);
      slot != -1;
      slot = wasLocalDate.nextSetBit(slot + 1)) {
    items[slot] = data[slot].toLocalDate();
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,7 @@
import org.enso.base.polyglot.NumericConverter;
import org.enso.base.polyglot.Polyglot_Utils;
import org.enso.table.data.column.storage.Storage;
import org.enso.table.data.column.storage.type.BigDecimalType;
import org.enso.table.data.column.storage.type.BigIntegerType;
import org.enso.table.data.column.storage.type.BooleanType;
import org.enso.table.data.column.storage.type.DateTimeType;
import org.enso.table.data.column.storage.type.DateType;
import org.enso.table.data.column.storage.type.FloatType;
import org.enso.table.data.column.storage.type.IntegerType;
import org.enso.table.data.column.storage.type.StorageType;
import org.enso.table.data.column.storage.type.TextType;
import org.enso.table.data.column.storage.type.TimeOfDayType;
import org.enso.table.data.column.storage.type.*;
import org.enso.table.problems.ProblemAggregator;

/**
Expand All @@ -29,15 +20,31 @@ public class InferredBuilder extends Builder {
private int currentSize = 0;
private final int initialSize;
private final ProblemAggregator problemAggregator;
private final boolean allowDateToDateTimeConversion;

/**
 * Creates a new instance of this builder, with the given known result length. Date to date-time
 * conversion is disabled for builders created through this constructor.
 *
 * @param initialSize the result length
 * @param problemAggregator the problem aggregator to use
 */
public InferredBuilder(int initialSize, ProblemAggregator problemAggregator) {
this(initialSize, problemAggregator, false);
}

/**
 * Creates a new instance of this builder, with the given known result length. This is a special
 * constructor that allows for date to date-time conversion (for Excel).
 *
 * @param initialSize the result length
 * @param problemAggregator the problem aggregator to use
 * @param allowDateToDateTimeConversion whether to allow date to date-time conversion; the flag is
 *     handed on to the {@code DateBuilder} and {@code DateTimeBuilder} instances this builder
 *     creates
 */
public InferredBuilder(
int initialSize, ProblemAggregator problemAggregator, boolean allowDateToDateTimeConversion) {
this.initialSize = initialSize;
this.problemAggregator = problemAggregator;
this.allowDateToDateTimeConversion = allowDateToDateTimeConversion;
}

@Override
Expand Down Expand Up @@ -120,11 +127,11 @@ private void initBuilderFor(Object o) {
} else if (o instanceof BigDecimal) {
currentBuilder = new BigDecimalBuilder(initialCapacity);
} else if (o instanceof LocalDate) {
currentBuilder = new DateBuilder(initialCapacity);
currentBuilder = new DateBuilder(initialCapacity, allowDateToDateTimeConversion);
} else if (o instanceof LocalTime) {
currentBuilder = new TimeOfDayBuilder(initialCapacity);
} else if (o instanceof ZonedDateTime) {
currentBuilder = new DateTimeBuilder(initialCapacity);
currentBuilder = new DateTimeBuilder(initialCapacity, allowDateToDateTimeConversion);
} else {
currentBuilder = new MixedBuilder(initialCapacity);
}
Expand All @@ -149,7 +156,9 @@ private record RetypeInfo(Class<?> clazz, StorageType type) {}
new RetypeInfo(Integer.class, IntegerType.INT_64),
new RetypeInfo(Short.class, IntegerType.INT_64),
new RetypeInfo(Byte.class, IntegerType.INT_64),
new RetypeInfo(BigInteger.class, BigIntegerType.INSTANCE));
new RetypeInfo(BigInteger.class, BigIntegerType.INSTANCE),
// Will only return true if the date to date-time conversion is allowed.
new RetypeInfo(LocalDate.class, DateTimeType.INSTANCE));

private void retypeAndAppend(Object o) {
for (RetypeInfo info : retypePairs) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ private static void expandBuilders(
int rows,
ProblemAggregator problemAggregator) {
for (int i = builders.size(); i <= columnCount; i++) {
Builder builder = new InferredBuilder(size, problemAggregator);
Builder builder = new InferredBuilder(size, problemAggregator, true);
builder.appendNulls(rows);
builders.add(builder);
}
Expand Down
Binary file added test/Table_Tests/data/MixedExcel.xlsx
Binary file not shown.
15 changes: 14 additions & 1 deletion test/Table_Tests/src/IO/Excel_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import Standard.Base.Runtime.Context
import Standard.Base.Runtime.Managed_Resource.Managed_Resource
import Standard.Base.Runtime.Ref.Ref

from Standard.Table import Table, Match_Columns, Excel_Format, Excel_Range, Data_Formatter, Delimited_Format, Excel_Workbook
from Standard.Table import Table, Match_Columns, Excel_Format, Excel_Range, Data_Formatter, Delimited_Format, Excel_Workbook, Value_Type

from Standard.Table.Errors import Invalid_Column_Names, Duplicate_Output_Column_Names, Invalid_Location, Range_Exceeded, Existing_Data, Column_Count_Mismatch, Column_Name_Mismatch, Empty_Sheet

Expand Down Expand Up @@ -1012,6 +1012,19 @@ add_specs suite_builder =
workbook.close . should_equal Nothing
workbook.read "Sheet1" . should_fail_with Illegal_State

group_builder.specify "should be able to read a mixed Date and DateTime column" <|
# Reads the MixedExcel.xlsx fixture and checks the inferred type and values of every column.
table = (enso_project.data / "MixedExcel.xlsx") . read ..Sheet
table.row_count . should_equal 5
table.column_names . should_equal ["Number","Date","DateTimes","Time","MixDates","InvMixDates","Mixed"]
table.columns.map .value_type . should_equal [Value_Type.Integer, Value_Type.Date, Value_Type.Date_Time, Value_Type.Time, Value_Type.Date_Time, Value_Type.Date_Time, Value_Type.Mixed]
table.at "Number" . to_vector . should_equal [15,24,43,1,85]
table.at "Date" . to_vector . should_equal [(Date.new 1997 7 25), (Date.new 1993 5 3), (Date.new 2010 8 1), (Date.new 1988 7 12), (Date.new 2009 10 22)]
table.at "DateTimes" . to_vector . should_equal [(Date_Time.new 1997 7 25 22 23 28 millisecond=34), (Date_Time.new 1993 5 3 10 58 45 millisecond=980), (Date_Time.new 2010 8 1 17 9 29 millisecond=923), (Date_Time.new 1988 7 12 12 39 20 millisecond=185), (Date_Time.new 2009 10 22 12 33 7 millisecond=157)]
table.at "Time" . to_vector . should_equal [(Time_Of_Day.new 16 43 42 millisecond=486), (Time_Of_Day.new 1 39 30 millisecond=506), (Time_Of_Day.new 7 44 4 millisecond=567), (Time_Of_Day.new 18 39 24 millisecond=572), (Time_Of_Day.new 10 6 32 millisecond=917)]
# MixDates and InvMixDates are expected as Date_Time columns; entries given without a time part
# are compared against a Date_Time built with the default (zero) time components.
table.at "MixDates" . to_vector . should_equal [(Date_Time.new 1997 7 25 22 23 28 millisecond=34), (Date_Time.new 1993 5 3), (Date_Time.new 2010 8 1), (Date_Time.new 1988 7 12), (Date_Time.new 2009 10 22 12 33 7 millisecond=157)]
table.at "InvMixDates" . to_vector . should_equal [(Date_Time.new 1997 7 25), (Date_Time.new 1993 5 3 10 58 45 millisecond=980), (Date_Time.new 2010 8 1 17 9 29 millisecond=923), (Date_Time.new 1988 7 12 12 39 20 millisecond=185), (Date_Time.new 2009 10 22)]
# Mixed is expected to stay Value_Type.Mixed, and its first entry is expected back as a Date
# rather than a Date_Time.
table.at "Mixed" . to_vector . should_equal [(Date.new 1997 7 25), (Date_Time.new 1993 5 3 10 58 45 millisecond=980), (Date_Time.new 2010 8 1 17 9 29 millisecond=923), (Time_Of_Day.new 18 39 24 millisecond=572), 85]

ci_pending = if Environment.get "CI" != Nothing then "This test takes a lot of time so it is disabled on CI."
group_builder.specify "should be able to write and read a big XLSX file (>110MB)" pending=ci_pending <|
n = 10^6
Expand Down
2 changes: 1 addition & 1 deletion test/Table_Tests/src/In_Memory/Table_Spec.enso
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ add_specs suite_builder =

group_builder.specify "should correctly handle storage of results" <|
c_int = Column.from_vector 'year' [2022, 2000, 1999]
r = c_int . map Date_Time.new
r = c_int . map (i-> Date_Time.new i)
r.to_vector . should_equal [Date_Time.new 2022, Date_Time.new 2000, Date_Time.new 1999]
r.value_type . should_equal Value_Type.Date_Time

Expand Down

0 comments on commit 5f44c51

Please sign in to comment.