Skip to content

Commit

Permalink
File_Format.Excel work (#3425)
Browse files Browse the repository at this point in the history
- Read in Excel files following the specification.
- Support for XLSX and XLS formats.
- Ability to select ranges and sheets.
- Skip Rows and Row Limits.

# Important Notes
- Minor fix to DelimitedReader for Windows
  • Loading branch information
jdunkerley authored May 6, 2022
1 parent 4bbabc0 commit 078c665
Show file tree
Hide file tree
Showing 13 changed files with 986 additions and 36 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@
`with_step` allowing to change the range step.][3408]
- [Aligned `Text.split` API with other methods and added `Text.lines`.][3415]
- [Implemented a basic reader for the `Delimited` file format.][3424]
- [Implemented a reader for the `Excel` file format.][3425]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -177,6 +178,7 @@
[3408]: https://github.com/enso-org/enso/pull/3408
[3415]: https://github.com/enso-org/enso/pull/3415
[3424]: https://github.com/enso-org/enso/pull/3424
[3425]: https://github.com/enso-org/enso/pull/3425

#### Enso Compiler

Expand Down
7 changes: 7 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,10 @@ type Mismatched_Quote

## Indicates an unexpected parser error.
type Parser_Error cause

## Indicates that a specified location was not valid.
type Invalid_Location (location:Text)

Invalid_Location.to_display_text : Text
Invalid_Location.to_display_text =
"The location '"+this.location+"' is not valid."
186 changes: 186 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/Excel.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
from Standard.Base import Integer, Text, Nothing, Boolean, Illegal_Argument_Error, Any, Error, Panic, File, Vector, False, IO
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior

import Standard.Table.Data.Table
from Standard.Table.Error as Error_Module import Invalid_Location

polyglot java import org.enso.table.format.xlsx.Range as Java_Range
polyglot java import org.enso.table.format.xlsx.Reader

polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import org.apache.poi.UnsupportedFileFormatException

type Excel_Section
## Gets a list of sheets within a workbook.
type Sheet_Names

## Gets a list of named ranges within a workbook.
type Range_Names

## Gets the data from a specific sheet. Column names are the Excel column
names.
type Sheet (sheet:(Integer|Text)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)

## Gets a specific range (taking either a defined name or external style
address) from the workbook.
type Range (address:(Text|Excel_Range)) (skip_rows:(Integer|Nothing)=Nothing) (row_limit:(Integer|Nothing)=Nothing)

type Excel_Range
## Specifies a range within an Excel Workbook.
type Excel_Range java_range:Java_Range

## Gets the name of the sheet.
sheet_name : Text
sheet_name = this.java_range.getSheetName

## Gets the index (1-based) of the top row of the range.
Returns `Nothing` if referring to a complete column.
top_row : Integer | Nothing
top_row = if this.java_range.isWholeColumn then Nothing else
this.java_range.getTopRow

## Gets the index (1-based) of the bottom row of the range.
Returns `Nothing` if referring to a complete column.
bottom_row : Integer | Nothing
bottom_row = if this.java_range.isWholeColumn then Nothing else
this.java_range.getBottomRow

## Gets the index (1-based) of the left column of the range.
Returns `Nothing` if referring to a complete row.
left_column : Integer | Nothing
left_column = if this.java_range.isWholeRow then Nothing else
this.java_range.getLeftColumn

## Gets the index (1-based) of the right column of the range.
Returns `Nothing` if referring to a complete row.
right_column : Integer | Nothing
right_column = if this.java_range.isWholeRow then Nothing else
this.java_range.getRightColumn

## Gets the address to this in A1 format.
address : Text
address = this.java_range.getAddress

## Displays the Excel_Range.
to_text : Text
to_text = "Excel_Range " + this.address

## Validates if a column index (1-based) is within the valid range for
Excel.

Arguments:
- column: 1-based index to check.
is_valid_column : Integer -> Boolean
is_valid_column column =
excel_2007_column_limit = 16384
(column > 0) && (column <= excel_2007_column_limit)

## Validates if a row index (1-based) is within the valid range for Excel.

Arguments:
- row: 1-based index to check.
is_valid_row : Integer -> Boolean
is_valid_row row =
excel_2007_row_limit = 1048576
(row > 0) && (row <= excel_2007_row_limit)

## Given a column name, parses to the index (1-based) or return index
unchanged.
column_index : (Text|Integer) -> Integer
column_index column =
if column.is_an Integer then column else Java_Range.parseA1Column column

## Creates a Range from an address.
from_address : Text -> Excel_Range
from_address address =
Panic.catch IllegalArgumentException (Excel_Range (Java_Range.new address)) caught_panic->
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage caught_panic.payload.cause)

## Create a Range for a single cell.
for_cell : Text -> (Text|Integer) -> Integer -> Excel_Range
for_cell sheet column row =
col_index = Excel_Range.column_index column

col_valid = here.validate (Excel_Range.is_valid_column col_index) ("Invalid column for Excel: " + column.to_text + ".")
row_valid = here.validate (Excel_Range.is_valid_row row) ("Invalid row for Excel: " + row.to_text + ".")

col_valid <| row_valid <|
Excel_Range (Java_Range.new sheet col_index row col_index row)

## Create a Range for a range of cells.
for_range : Text -> (Text|Integer) -> Integer -> (Text|Integer) -> Integer -> Excel_Range
for_range sheet left top right bottom =
left_index = Excel_Range.column_index left
right_index = Excel_Range.column_index right

left_valid = here.validate (Excel_Range.is_valid_column left_index) ("Invalid left column for Excel: " + left.to_text + ".")
right_valid = here.validate (Excel_Range.is_valid_column right_index) ("Invalid right column for Excel: " + right.to_text + ".")
top_valid = here.validate (Excel_Range.is_valid_row top) ("Invalid top row for Excel: " + top.to_text + ".")
bottom_valid = here.validate (Excel_Range.is_valid_row bottom) ("Invalid bottom row for Excel: " + bottom.to_text + ".")

left_valid <| right_valid <| top_valid <| bottom_valid <|
Excel_Range (Java_Range.new sheet left_index top right_index bottom)

## Create an Excel_Range for a set of columns.
for_columns : Text -> (Text|Integer) -> (Text|Integer) -> Excel_Range
for_columns sheet left (right=left) =
left_index = Excel_Range.column_index left
right_index = Excel_Range.column_index right

left_valid = here.validate (Excel_Range.is_valid_column left_index) ("Invalid left column for Excel: " + left.to_text + ".")
right_valid = here.validate (Excel_Range.is_valid_column right_index) ("Invalid right column for Excel: " + right.to_text + ".")

left_valid <| right_valid <|
Excel_Range (Java_Range.new sheet left_index 0 right_index 0)

## Create an Excel_Range for a set of rows.
for_rows : Text -> Integer -> Integer -> Excel_Range
for_rows sheet top (bottom=top) =
top_valid = here.validate (Excel_Range.is_valid_row top) ("Invalid top row for Excel: " + top.to_text + ".")
bottom_valid = here.validate (Excel_Range.is_valid_row bottom) ("Invalid bottom row for Excel: " + bottom.to_text + ".")

top_valid <| bottom_valid <|
Excel_Range (Java_Range.new sheet 0 top 0 bottom)


## PRIVATE
Wrapper for validation of a value prior to execution.
validate : Boolean -> Text -> Any
validate validation ~error_message ~wrapped =
if validation then wrapped else Error.throw (Illegal_Argument_Error error_message)

## PRIVATE
Reads an input Excel file according to the provided section.

Arguments:
- file: The File object to read.
- section: The part of the Excel document to read.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
- xls_format: If `True` then the file is read in using Excel 95-2003 format
otherwise reads in Excel 2007+ format.
read_excel : File -> Excel_Section -> Problem_Behavior -> Boolean -> (Table | Vector)
read_excel file section _ xls_format=False =
reader stream = case section of
Sheet_Names -> Vector.Vector (Reader.readSheetNames stream xls_format)
Range_Names -> Vector.Vector (Reader.readRangeNames stream xls_format)
Sheet sheet skip_rows row_limit ->
Table.Table <| case sheet of
Integer -> Reader.readSheetByIndex stream sheet skip_rows row_limit xls_format
Text -> Reader.readSheetByName stream sheet skip_rows row_limit xls_format
Range address skip_rows row_limit ->
Table.Table <| case address of
Excel_Range _ -> Reader.readRange stream address.java_range skip_rows row_limit xls_format
Text -> Reader.readRangeByName stream address skip_rows row_limit xls_format

bad_argument caught_panic = Error.throw (Invalid_Location caught_panic.payload.cause.getCause)
handle_bad_argument = Panic.catch IllegalArgumentException handler=bad_argument

bad_format caught_panic = Error.throw (File.Io_Error file caught_panic.payload.cause.getMessage)
handle_bad_format = Panic.catch UnsupportedFileFormatException handler=bad_format

File.handle_java_exceptions file <| handle_bad_argument <| handle_bad_format <|
file.with_input_stream [File.Option.Read] stream->(stream.with_java_stream reader)
31 changes: 31 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Prob
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
import Standard.Table.Internal.Delimited_Reader

import Standard.Table.Io.Excel as Excel_Module

## This type needs to be here to allow for the usage of Standard.Table
functions. Ideally, it would be an interface within Standard.Base and
expanded by additional implementations in Standard.Table.
Expand All @@ -25,6 +27,10 @@ type Auto
if ".log".equals_ignore_case extension then Ref.put output File_Format.Text
if ".csv".equals_ignore_case extension then Ref.put output (File_Format.Delimited ',')
if ".tsv".equals_ignore_case extension then Ref.put output (File_Format.Delimited '\t')
if ".xlsx".equals_ignore_case extension then Ref.put output File_Format.Excel
if ".xlsm".equals_ignore_case extension then Ref.put output File_Format.Excel
if ".xls".equals_ignore_case extension then Ref.put output File_Format.Excel
if ".xlt".equals_ignore_case extension then Ref.put output File_Format.Excel

Ref.get output

Expand Down Expand Up @@ -112,3 +118,28 @@ type Delimited
## A setting to infer the default behaviour of some option.
type Infer

## Read the file to a `Table` from an Excel file
type Excel
## Read Excels files into a Table or Vector.

Arguments:
- section: The `Excel_Section` to read from the workbook.
This can be one of:
- `Sheet_Names` - outputs a `Vector` of sheet names.
- `Range_Names` - outputs a `Vector` of range names.
- `Sheet` - outputs a `Table` containing the specified sheet.
- `Range` - outputs a `Table` containing the specified range.
- `xls_format`:
If set to `True`, the file is read as an Excel 95-2003 format.
If set to `False`, the file is read as an Excel 2007+ format.
`Infer` will attempt to deduce this from the extension of the filename.
type Excel (section:Excel_Section=Excel_Module.Sheet_Names) (xls_format:(True|False|Infer)=Infer)

## Implements the `File.read` for this `File_Format`
read : File -> Problem_Behavior -> Any
read file on_problems =
format = if this.xls_format != Infer then this.xls_format else
extension = file.extension
(extension.equals_ignore_case ".xls") || (extension.equals_ignore_case ".xlt")

Excel_Module.read_excel file this.section on_problems format
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from Standard.Base import all
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning

import Standard.Table.Io.File_Format

## ALIAS Read Text File, Read File
Expand Down
4 changes: 4 additions & 0 deletions distribution/lib/Standard/Table/0.0.0-dev/src/Main.enso
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ from Standard.Base import all
import Standard.Geo.Geo_Json
import Standard.Table.Io.Csv
import Standard.Table.Io.Format
import Standard.Table.Io.File_Read
import Standard.Table.Io.Excel
import Standard.Table.Io.Spreadsheet
import Standard.Table.Io.Spreadsheet_Write_Mode
import Standard.Table.Data.Table
Expand All @@ -11,12 +13,14 @@ import Standard.Table.Data.Order_Rule
import Standard.Table.Model

from Standard.Table.Io.Csv export all hiding Parser
from Standard.Table.Io.Excel export Excel_Section, Excel_Range, read_excel
from Standard.Table.Io.Spreadsheet export all hiding Reader

export Standard.Table.Io.Format
export Standard.Table.Io.Spreadsheet_Write_Mode
export Standard.Table.Data.Column
export Standard.Table.Model
export Standard.Table.Io.File_Read

from Standard.Table.Data.Table export new, from_rows, join, concat, No_Such_Column_Error, Table
from Standard.Table.Data.Order_Rule export Order_Rule
Expand Down
Loading

0 comments on commit 078c665

Please sign in to comment.