Skip to content

Commit

Permalink
Basic changes to File_Format (#3516)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored Jun 8, 2022
1 parent b1db359 commit 2af970f
Show file tree
Hide file tree
Showing 13 changed files with 169 additions and 81 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@
- [Added rank data, correlation and covariance statistics for `Vector`][3484]
- [Implemented `Table.order_by` for the SQLite backend.][3502]
- [Implemented `Table.order_by` for the PostgreSQL backend.][3514]
- [Renamed `File_Format.Text` to `Plain_Text`, updated `File_Format.Delimited`
API and added builders for customizing less common settings.][3516]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -210,6 +212,7 @@
[3484]: https://github.com/enso-org/enso/pull/3484
[3502]: https://github.com/enso-org/enso/pull/3502
[3514]: https://github.com/enso-org/enso/pull/3514
[3516]: https://github.com/enso-org/enso/pull/3516

#### Enso Compiler

Expand Down
6 changes: 6 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Data/Locale.enso
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,12 @@ type Locale
if this.variant.is_nothing.not then b.append ["variant", this.variant]
Json.from_pairs b.to_vector

## Checks whether this locale is equal to `other`.

   Arguments:
   - other: The value to compare this locale against.

   Returns the result of the Java `equals` check between the two underlying
   Java locales when `other` is a `Locale`; any other value yields `False`.
== : Any -> Boolean
== other =
    case other of
        Locale that_java_locale -> this.java_locale.equals that_java_locale
        _ -> False

## PRIVATE

Convert a java locale to an Enso locale.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ make_order_descriptor internal_column sort_direction text_ordering =
IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation=Nothing
True ->
IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation="ucs_basic"
Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
Case_Insensitive locale -> case locale == Locale.default of
False ->
Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is currently not supported. You may need to materialize the Table to perform this operation.")
True ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ type Sqlite_Dialect
IR.Order_Descriptor internal_column.expression sort_direction collation=Nothing
True ->
IR.Order_Descriptor internal_column.expression sort_direction collation="BINARY"
Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
Case_Insensitive locale -> case locale == Locale.default of
False ->
Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is not supported by the SQLite backend. You may need to materialize the Table to perform this operation.")
True ->
Expand Down
7 changes: 7 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,10 @@ type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|B
a parse is attempted anyway. If mixed types are requested, the column is not
parsed due to ambiguity.
type Duplicate_Type_Selector column:Text ambiguous:Boolean

## Indicates that the given file type is not supported by the `Auto` format.

   Arguments:
   - filename: The name of the file whose type could not be handled.
type Unsupported_File_Type filename

## Builds a human-readable description of this error that includes the
   offending file name.
Unsupported_File_Type.to_display_text : Text
Unsupported_File_Type.to_display_text =
    prefix = "The "
    suffix = " has a type that is not supported by the Auto format."
    prefix + this.filename + suffix
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,6 @@ read_from_reader format java_reader on_problems max_columns=4096 =
True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
Infer -> DelimitedReader.HeaderBehavior.INFER
False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
skip_rows = case format.skip_rows of
Nothing -> 0
Integer -> format.skip_rows
_ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.")
row_limit = case format.row_limit of
Nothing -> -1
Integer -> format.row_limit
Expand Down Expand Up @@ -127,7 +123,7 @@ read_from_reader format java_reader on_problems max_columns=4096 =
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result_with_problems = reader.read
parsing_problems = Vector.Vector (result_with_problems.problems) . map here.translate_reader_problem
on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems
Expand Down
4 changes: 2 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ type Excel_Section

## Gets the data from a specific sheet. Column names are the Excel column
names.
type Sheet (sheet:(Integer|Text)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
type Sheet (sheet:(Integer|Text)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)

## Gets a specific range (taking either a defined name or external style
address) from the workbook.
type Range (address:(Text|Excel_Range)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
type Range (address:(Text|Excel_Range)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)

type Excel_Range
## Specifies a range within an Excel Workbook.
Expand Down
57 changes: 47 additions & 10 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
import Standard.Base.Runtime.Ref
import Standard.Table.Internal.Delimited_Reader
from Standard.Table.Error as Table_Errors import Unsupported_File_Type

from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.Io.Excel as Excel_Module
import Standard.Table.Io.Quote_Style

## This type needs to be here to allow for the usage of Standard.Table
functions. Ideally, it would be an interface within Standard.Base and
Expand All @@ -24,17 +26,18 @@ type Auto
materialise file =
extension = file.extension

output = Ref.new File_Format.Bytes
if ".txt".equals_ignore_case extension then output.put File_Format.Text
if ".log".equals_ignore_case extension then output.put File_Format.Text
output = Ref.new Nothing
if ".txt".equals_ignore_case extension then output.put File_Format.Plain_Text
if ".log".equals_ignore_case extension then output.put File_Format.Plain_Text
if ".csv".equals_ignore_case extension then output.put (File_Format.Delimited ',')
if ".tsv".equals_ignore_case extension then output.put (File_Format.Delimited '\t')
if ".xlsx".equals_ignore_case extension then output.put File_Format.Excel
if ".xlsm".equals_ignore_case extension then output.put File_Format.Excel
if ".xls".equals_ignore_case extension then output.put File_Format.Excel
if ".xlt".equals_ignore_case extension then output.put File_Format.Excel

output.get
output.get.if_nothing <|
Error.throw (Unsupported_File_Type file.name)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
Expand All @@ -52,8 +55,8 @@ type Bytes
file.read_bytes

## Reads the file to a `Text` with specified encoding.
type Text
type Text (encoding:Encoding=Encoding.utf_8)
type Plain_Text
type Plain_Text (encoding:Encoding=Encoding.utf_8)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
Expand All @@ -72,6 +75,9 @@ type Delimited
- delimiter: The delimiter character to split the file into columns. An
`Illegal_Argument_Error` error is returned if this is an empty string.
- encoding: The encoding to use when reading the file.
- skip_rows: The number of rows to skip from the top of the file.
- row_limit: The maximum number of rows to read from the file. This count
does not include the header row (if applicable).
- quote: The quote character denotes the start and end of a quoted value.
No quote character is used if set to `Nothing`. Quoted items are not
split on the delimiter and can also contain newlines. Within a quoted
Expand All @@ -83,27 +89,58 @@ type Delimited
then escaping quotes is done by double quotes: `"ab""cd"` will yield
the text `ab"cd`. Another popular choice for `quote_escape` is the `\`
character. Then `"ab\"cd"` will yield the same text.
- quote_style: The style of quoting to use when writing the file.
- headers: If set to `True`, the first row is used as column names. If
set to `False`, the column names are generated by adding increasing
numeric suffixes to the base name `Column` (i.e. `Column_1`,
`Column_2` etc.). If set to `Infer`, the process tries to infer if
headers are present on the first row. If the column names are not
unique, numeric suffixes will be appended to disambiguate them.
- skip_rows: The number of rows to skip from the top of the file.
- row_limit: The maximum number of rows to read from the file. This count
does not include the header row (if applicable).
- value_formatter: Formatter to parse text values into numbers, dates,
times, etc. If `Nothing`, values are left as Text.
- keep_invalid_rows: Specifies whether rows that contain fewer or more
columns than expected should be kept (setting the missing columns to
`Nothing` or dropping the excess columns) or dropped.
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (quote_style:Quote_Style=Quote_Style.Necessary) (headers:True|False|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
read file on_problems =
Delimited_Reader.read_file this file on_problems

## PRIVATE
   Clone the instance with some properties overridden.

   Arguments (each one defaults to the value currently held by this
   instance, so only the overridden properties need to be passed):
   - quote: The quote character, or `Nothing` for no quoting.
   - quote_escape: The character used to escape quotes, or `Nothing`.
   - quote_style: The style of quoting to use when writing the file.
   - headers: `True`, `False` or `Infer`.
   - value_formatter: The `Data_Formatter` used for parsing values, or
     `Nothing` to leave values unparsed.
   - keep_invalid_rows: Whether rows with an unexpected number of columns
     are kept.

   The `delimiter`, `encoding`, `skip_rows` and `row_limit` properties are
   always copied from this instance.

   Note: This function is internal until such time as Atom cloning with
   modification is built into Enso.
clone : (Text|Nothing)->(Text|Nothing)->Quote_Style->(Boolean|Infer)->(Data_Formatter|Nothing)->Boolean->Delimited
clone (quote=this.quote) (quote_escape=this.quote_escape) (quote_style=this.quote_style) (headers=this.headers) (value_formatter=this.value_formatter) (keep_invalid_rows=this.keep_invalid_rows) =
    Delimited this.delimiter this.encoding this.skip_rows this.row_limit quote quote_escape quote_style headers value_formatter keep_invalid_rows

## Create a clone of this with the specified quoting settings.

   Arguments:
   - quote: The quote character to use.
   - quote_escape: The character used to escape quotes inside a quoted
     value. Defaults to the `quote` character itself.
   - quote_style: The style of quoting to use. Defaults to the style
     already set on this instance.
with_quotes : Text->Text->Quote_Style->Delimited
with_quotes quote quote_escape=quote quote_style=this.quote_style =
    # These are the first three parameters of `clone`, in the same order.
    this.clone quote quote_escape quote_style

## Create a clone of this in which the first row is used as the header
   (`headers` set to `True`).
with_headers : Delimited
with_headers =
    this.clone headers=True

## Create a clone of this in which the first row is treated as ordinary
   data rather than as a header (`headers` set to `False`).
without_headers : Delimited
without_headers =
    this.clone headers=False

## Create a clone of this with value parsing enabled.

   Arguments:
   - value_formatter: The `Data_Formatter` to parse values with. Pass a
     customized one to change the parser options; the default
     `Data_Formatter` is used otherwise.
with_parsing : Data_Formatter -> Delimited
with_parsing (value_formatter=Data_Formatter) = this.clone value_formatter=value_formatter

## Create a clone of this with value parsing disabled (`value_formatter`
   set to `Nothing`).
without_parsing : Delimited
without_parsing = this.clone value_formatter=Nothing

## A setting to infer the default behaviour of some option.
type Infer

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Specifies the style of quoting to use for values in a delimited file.
type Quote_Style
## Do not quote any values even if this will result in an invalid file.
type Never

## Quote text values which are empty or contain the delimiter or new lines.
type Necessary

## Quote all text values.
type Always
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ public static String[] readRangeNames(InputStream stream, boolean xls_format) th
public static Table readSheetByName(
InputStream stream,
String sheetName,
Integer skip_rows,
int skip_rows,
Integer row_limit,
boolean xls_format)
throws IOException, IllegalArgumentException {
Expand All @@ -367,7 +367,7 @@ public static Table readSheetByName(
workbook,
sheetIndex,
null,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}

Expand All @@ -383,7 +383,7 @@ public static Table readSheetByName(
* @throws IOException when the input stream cannot be read.
*/
public static Table readSheetByIndex(
InputStream stream, int index, Integer skip_rows, Integer row_limit, boolean xls_format)
InputStream stream, int index, int skip_rows, Integer row_limit, boolean xls_format)
throws IOException, IllegalArgumentException {
Workbook workbook = getWorkbook(stream, xls_format);

Expand All @@ -397,7 +397,7 @@ public static Table readSheetByIndex(
workbook,
index - 1,
null,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}

Expand All @@ -415,7 +415,7 @@ public static Table readSheetByIndex(
public static Table readRangeByName(
InputStream stream,
String rangeNameOrAddress,
Integer skip_rows,
int skip_rows,
Integer row_limit,
boolean xls_format)
throws IOException {
Expand All @@ -438,7 +438,7 @@ public static Table readRangeByName(
* @throws IOException when the input stream cannot be read.
*/
public static Table readRange(
InputStream stream, Range range, Integer skip_rows, Integer row_limit, boolean xls_format)
InputStream stream, Range range, int skip_rows, Integer row_limit, boolean xls_format)
throws IOException {
return readRange(getWorkbook(stream, xls_format), range, skip_rows, row_limit);
}
Expand All @@ -448,7 +448,7 @@ private static Workbook getWorkbook(InputStream stream, boolean xls_format) thro
}

private static Table readRange(
Workbook workbook, Range range, Integer skip_rows, Integer row_limit) {
Workbook workbook, Range range, int skip_rows, Integer row_limit) {
int sheetIndex = getSheetIndex(workbook, range.getSheetName());
if (sheetIndex == -1) {
throw new IllegalArgumentException("Unknown sheet '" + range.getSheetName() + "'.");
Expand All @@ -458,7 +458,7 @@ private static Table readRange(
workbook,
sheetIndex,
range,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}
}
Loading

0 comments on commit 2af970f

Please sign in to comment.