Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic changes to File_Format #3516

Merged
merged 8 commits into from
Jun 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@
- [Added rank data, correlation and covariance statistics for `Vector`][3484]
- [Implemented `Table.order_by` for the SQLite backend.][3502]
- [Implemented `Table.order_by` for the PostgreSQL backend.][3514]
- [Renamed `File_Format.Text` to `Plain_Text`, updated `File_Format.Delimited`
API and added builders for customizing less common settings.][3516]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -210,6 +212,7 @@
[3484]: https://github.com/enso-org/enso/pull/3484
[3502]: https://github.com/enso-org/enso/pull/3502
[3514]: https://github.com/enso-org/enso/pull/3514
[3516]: https://github.com/enso-org/enso/pull/3516

#### Enso Compiler

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,12 @@ type Locale
if this.variant.is_nothing.not then b.append ["variant", this.variant]
Json.from_pairs b.to_vector

## Checks whether this locale is equal to `other`.

   Two locales are considered equal when their underlying Java locales are
   equal according to `equals`. A value that is not a `Locale` is never
   equal to a locale.
== : Any -> Boolean
== other =
    case other of
        Locale that_java_locale -> this.java_locale.equals that_java_locale
        _ -> False

## PRIVATE

Convert a java locale to an Enso locale.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ make_order_descriptor internal_column sort_direction text_ordering =
IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation=Nothing
True ->
IR.Order_Descriptor internal_column.expression sort_direction nulls_order=nulls collation="ucs_basic"
Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
Case_Insensitive locale -> case locale == Locale.default of
False ->
Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is currently not supported. You may need to materialize the Table to perform this operation.")
True ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ type Sqlite_Dialect
IR.Order_Descriptor internal_column.expression sort_direction collation=Nothing
True ->
IR.Order_Descriptor internal_column.expression sort_direction collation="BINARY"
Case_Insensitive locale -> case Locale.default.java_locale.equals locale.java_locale of
Case_Insensitive locale -> case locale == Locale.default of
False ->
Error.throw (Unsupported_Database_Operation_Error "Case insensitive ordering with custom locale is not supported by the SQLite backend. You may need to materialize the Table to perform this operation.")
True ->
Expand Down
7 changes: 7 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
Original file line number Diff line number Diff line change
Expand Up @@ -154,3 +154,10 @@ type Leading_Zeros column:Text (datatype:(Integer|Number|Date|Time|Time_Of_Day|B
a parse is attempted anyway. If mixed types are requested, the column is not
parsed due to ambiguity.
type Duplicate_Type_Selector column:Text ambiguous:Boolean

## An error indicating that the type of the given file is not supported by
   the `Auto` format.

   Arguments:
   - filename: The name of the file whose type is not supported.
type Unsupported_File_Type filename

## Renders this error as a human-readable message.
Unsupported_File_Type.to_display_text : Text
Unsupported_File_Type.to_display_text =
    subject = "The " + this.filename
    subject + " has a type that is not supported by the Auto format."
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,6 @@ read_from_reader format java_reader on_problems max_columns=4096 =
True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
Infer -> DelimitedReader.HeaderBehavior.INFER
False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
skip_rows = case format.skip_rows of
Nothing -> 0
Integer -> format.skip_rows
_ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.")
row_limit = case format.row_limit of
Nothing -> -1
Integer -> format.row_limit
Expand Down Expand Up @@ -127,7 +123,7 @@ read_from_reader format java_reader on_problems max_columns=4096 =
cell_type_guesser = if format.headers != Infer then Nothing else
formatter = format.value_formatter.if_nothing Data_Formatter
TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
reader = DelimitedReader.new java_reader format.delimiter format.quote format.quote_escape java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors
result_with_problems = reader.read
parsing_problems = Vector.Vector (result_with_problems.problems) . map here.translate_reader_problem
on_problems.attach_problems_after (Table.Table result_with_problems.value) parsing_problems
Expand Down
4 changes: 2 additions & 2 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ type Excel_Section

## Gets the data from a specific sheet. Column names are the Excel column
names.
type Sheet (sheet:(Integer|Text)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
type Sheet (sheet:(Integer|Text)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)

## Gets a specific range (taking either a defined name or external style
address) from the workbook.
type Range (address:(Text|Excel_Range)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)
type Range (address:(Text|Excel_Range)) (skip_rows:Integer=0) (row_limit:(Integer|Nothing)=Nothing)

type Excel_Range
## Specifies a range within an Excel Workbook.
Expand Down
57 changes: 47 additions & 10 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
import Standard.Base.Runtime.Ref
import Standard.Table.Internal.Delimited_Reader
from Standard.Table.Error as Table_Errors import Unsupported_File_Type

from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter
import Standard.Table.Io.Excel as Excel_Module
import Standard.Table.Io.Quote_Style

## This type needs to be here to allow for the usage of Standard.Table
functions. Ideally, it would be an interface within Standard.Base and
Expand All @@ -24,17 +26,18 @@ type Auto
materialise file =
extension = file.extension

output = Ref.new File_Format.Bytes
if ".txt".equals_ignore_case extension then output.put File_Format.Text
if ".log".equals_ignore_case extension then output.put File_Format.Text
output = Ref.new Nothing
if ".txt".equals_ignore_case extension then output.put File_Format.Plain_Text
if ".log".equals_ignore_case extension then output.put File_Format.Plain_Text
if ".csv".equals_ignore_case extension then output.put (File_Format.Delimited ',')
if ".tsv".equals_ignore_case extension then output.put (File_Format.Delimited '\t')
if ".xlsx".equals_ignore_case extension then output.put File_Format.Excel
if ".xlsm".equals_ignore_case extension then output.put File_Format.Excel
if ".xls".equals_ignore_case extension then output.put File_Format.Excel
if ".xlt".equals_ignore_case extension then output.put File_Format.Excel

output.get
output.get.if_nothing <|
Error.throw (Unsupported_File_Type file.name)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
Expand All @@ -52,8 +55,8 @@ type Bytes
file.read_bytes

## Reads the file to a `Text` with specified encoding.
type Text
type Text (encoding:Encoding=Encoding.utf_8)
type Plain_Text
type Plain_Text (encoding:Encoding=Encoding.utf_8)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
Expand All @@ -72,6 +75,9 @@ type Delimited
- delimiter: The delimiter character to split the file into columns. An
`Illegal_Argument_Error` error is returned if this is an empty string.
- encoding: The encoding to use when reading the file.
- skip_rows: The number of rows to skip from the top of the file.
- row_limit: The maximum number of rows to read from the file. This count
does not include the header row (if applicable).
- quote: The quote character denotes the start and end of a quoted value.
No quote character is used if set to `Nothing`. Quoted items are not
split on the delimiter and can also contain newlines. Within a quoted
Expand All @@ -83,27 +89,58 @@ type Delimited
then escaping quotes is done by double quotes: `"ab""cd"` will yield
the text `ab"cd"`. Another popular choice for `quote_escape` is the `\`
character. Then `"ab\"cd"` will yield the same text.
- quote_style: The style of quoting to use when writing the file.
- headers: If set to `True`, the first row is used as column names. If
set to `False`, the column names are generated by adding increasing
numeric suffixes to the base name `Column` (i.e. `Column_1`,
`Column_2` etc.). If set to `Infer`, the process tries to infer if
headers are present on the first row. If the column names are not
unique, numeric suffixes will be appended to disambiguate them.
- skip_rows: The number of rows to skip from the top of the file.
- row_limit: The maximum number of rows to read from the file. This count
does not include the header row (if applicable).
- value_formatter: Formatter to parse text values into numbers, dates,
times, etc. If `Nothing` values are left as Text.
- keep_invalid_rows: Specifies whether rows that contain fewer or more
columns than expected should be kept (setting the missing columns to
`Nothing` or dropping the excess columns) or dropped.
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=Infer) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)
type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (quote_style:Quote_Style=Quote_Style.Necessary) (headers:True|False|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True)

## Implements the `File.read` for this `File_Format`

   Delegates to `Delimited_Reader.read_file`, passing this format as the
   settings, the file to read and the `on_problems` behavior.
read : File -> Problem_Behavior -> Any
read file on_problems =
Delimited_Reader.read_file this file on_problems

## PRIVATE
   Clone the instance with some properties overridden.

   The `delimiter`, `encoding`, `skip_rows` and `row_limit` are always copied
   from this instance. Every other setting defaults to this instance's
   current value unless it is overridden by the caller.

   Note: This function is internal until such time as Atom cloning with
   modification is built into Enso.
clone : Text->Text->Quote_Style->(Boolean|Infer)->Data_Formatter->Boolean->Delimited
clone (quote=this.quote) (quote_escape=this.quote_escape) (quote_style=this.quote_style) (headers=this.headers) (value_formatter=this.value_formatter) (keep_invalid_rows=this.keep_invalid_rows) =
    Delimited this.delimiter this.encoding this.skip_rows this.row_limit quote quote_escape quote_style headers value_formatter keep_invalid_rows

## Create a clone of this with the specified quoting settings.

   Arguments:
   - quote: The quote character to use.
   - quote_escape: The character used to escape the quote character within
     a quoted value. Defaults to `quote`.
   - quote_style: The style of quoting to use. Defaults to the current
     `quote_style` of this instance.
with_quotes : Text->Text->Quote_Style->Delimited
with_quotes quote quote_escape=quote quote_style=this.quote_style =
    this.clone quote=quote quote_escape=quote_escape quote_style=quote_style

## Create a clone of this in which the first row is treated as the header,
   i.e. `headers` is set to `True`.
with_headers : Delimited
with_headers =
    this.clone headers=True

## Create a clone of this in which the first row is treated as data rather
   than as a header, i.e. `headers` is set to `False`.
without_headers : Delimited
without_headers =
    this.clone headers=False

## Create a clone of this with value parsing enabled.

   Arguments:
   - value_formatter: The `Data_Formatter` used to parse values. A custom
     one can be provided to customize parser options; defaults to
     `Data_Formatter`.
with_parsing : Data_Formatter -> Delimited
with_parsing (value_formatter=Data_Formatter) =
    this.clone value_formatter=value_formatter

## Create a clone of this with value parsing disabled, i.e. with
   `value_formatter` set to `Nothing`.
without_parsing : Delimited
without_parsing =
    this.clone value_formatter=Nothing

## A setting to infer the default behaviour of some option.
type Infer

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## The style of quoting to use when writing a delimited file.
type Quote_Style
## Do not quote any values even if this will result in an invalid file.
type Never

## Quote text values which are empty or contain the delimiter or new lines.
type Necessary

## Quote all text values.
type Always
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ public static String[] readRangeNames(InputStream stream, boolean xls_format) th
public static Table readSheetByName(
InputStream stream,
String sheetName,
Integer skip_rows,
int skip_rows,
Integer row_limit,
boolean xls_format)
throws IOException, IllegalArgumentException {
Expand All @@ -367,7 +367,7 @@ public static Table readSheetByName(
workbook,
sheetIndex,
null,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}

Expand All @@ -383,7 +383,7 @@ public static Table readSheetByName(
* @throws IOException when the input stream cannot be read.
*/
public static Table readSheetByIndex(
InputStream stream, int index, Integer skip_rows, Integer row_limit, boolean xls_format)
InputStream stream, int index, int skip_rows, Integer row_limit, boolean xls_format)
throws IOException, IllegalArgumentException {
Workbook workbook = getWorkbook(stream, xls_format);

Expand All @@ -397,7 +397,7 @@ public static Table readSheetByIndex(
workbook,
index - 1,
null,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}

Expand All @@ -415,7 +415,7 @@ public static Table readSheetByIndex(
public static Table readRangeByName(
InputStream stream,
String rangeNameOrAddress,
Integer skip_rows,
int skip_rows,
Integer row_limit,
boolean xls_format)
throws IOException {
Expand All @@ -438,7 +438,7 @@ public static Table readRangeByName(
* @throws IOException when the input stream cannot be read.
*/
public static Table readRange(
InputStream stream, Range range, Integer skip_rows, Integer row_limit, boolean xls_format)
InputStream stream, Range range, int skip_rows, Integer row_limit, boolean xls_format)
throws IOException {
return readRange(getWorkbook(stream, xls_format), range, skip_rows, row_limit);
}
Expand All @@ -448,7 +448,7 @@ private static Workbook getWorkbook(InputStream stream, boolean xls_format) thro
}

private static Table readRange(
Workbook workbook, Range range, Integer skip_rows, Integer row_limit) {
Workbook workbook, Range range, int skip_rows, Integer row_limit) {
int sheetIndex = getSheetIndex(workbook, range.getSheetName());
if (sheetIndex == -1) {
throw new IllegalArgumentException("Unknown sheet '" + range.getSheetName() + "'.");
Expand All @@ -458,7 +458,7 @@ private static Table readRange(
workbook,
sheetIndex,
range,
skip_rows == null ? 0 : skip_rows,
skip_rows,
row_limit == null ? Integer.MAX_VALUE : row_limit);
}
}
Loading