From 5aa65fd4c5c43a3eda9a87d2d9e9ad57f9e32eda Mon Sep 17 00:00:00 2001 From: Marcin Kostrzewa Date: Mon, 16 Aug 2021 17:01:33 +0200 Subject: [PATCH] Allow specifying a cell range when reading spreadsheets (#1954) --- RELEASES.md | 2 + .../0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso | 40 ++++++++++++------ .../org/enso/table/format/xlsx/Reader.java | 41 +++++++++++++++++-- test/Table_Tests/src/Spreadsheet_Spec.enso | 17 ++++++++ 4 files changed, 84 insertions(+), 16 deletions(-) diff --git a/RELEASES.md b/RELEASES.md index 794f52f833e8..5e92eac548fb 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -4,6 +4,8 @@ - Fixed a bug where reading binary and text files would be 100 times slower than expected ([#1949](https://github.com/enso-org/enso/pull/1949)). +- Added the ability to specify cell ranges for reading XLS and XLSX spreadsheets + ([#1954](https://github.com/enso-org/enso/pull/1954)). # Enso 0.2.24 (2021-08-13) diff --git a/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso b/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso index 63deae3e96e4..494c5565f128 100644 --- a/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso +++ b/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso @@ -14,6 +14,10 @@ polyglot java import org.enso.table.format.xlsx.Reader the active sheet (i.e. the one that would be displayed when the file is opened in Excel). Can be a 0-based index to specify the number of the sheet to read, or a `Text` corresponding to the sheet name. + - cell_range: specifies a cell range to read from the sheet. If not provided + (default), a range containing all non-empty cells will be selected. If + provided, this must be a valid Excel range address, e.g. `'A1:B5'`, + `'A:WX'`, or `'5:64'`. - has_header: Specifies whether the first row of the sheet should be interpreted as a header, containing storage names. If set to `False`, storage names will be automatically generated. 
@@ -37,9 +41,9 @@ polyglot java import org.enso.table.format.xlsx.Reader import Standard.Examples example_xlsx_to_table = Examples.xlsx.read_xlsx sheet='Dates' -File.File.read_xlsx : Integer | Text | Nothing -> Boolean -> Text -> Table -File.File.read_xlsx sheet=Nothing has_header=True prefix='C' = - here.from_xlsx this sheet has_header prefix +File.File.read_xlsx : Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table +File.File.read_xlsx sheet=Nothing cell_range=Nothing has_header=True prefix='C' = + here.from_xlsx this sheet cell_range has_header prefix ## Reads the contents of `this` and parses them as an XLSX dataframe. @@ -49,6 +53,10 @@ File.File.read_xlsx sheet=Nothing has_header=True prefix='C' = the active sheet (i.e. the one that would be displayed when the file is opened in Excel). Can be a 0-based index to specify the number of the sheet to read, or a `Text` corresponding to the sheet name. + - cell_range: specifies a cell range to read from the sheet. If not provided + (default), a range containing all non-empty cells will be selected. If + provided, this must be a valid Excel range address, e.g. `'A1:B5'`, + `'A:WX'`, or `'5:64'`. - has_header: Specifies whether the first row of the sheet should be interpreted as a header, containing storage names. If set to `False`, storage names will be automatically generated. 
@@ -72,11 +80,11 @@ File.File.read_xlsx sheet=Nothing has_header=True prefix='C' = import Standard.Examples example_xlsx_to_table = Table.from_xlsx Examples.xlsx sheet='Dates' -from_xlsx : File.File -> Integer | Text | Nothing -> Boolean -> Text -> Table -from_xlsx file sheet=Nothing has_header=True prefix='C' = +from_xlsx : File.File -> Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table +from_xlsx file sheet=Nothing cell_range=Nothing has_header=True prefix='C' = file.with_input_stream [File.Option.Read] stream-> stream.with_java_stream js-> - Table.Table (Reader.read_xlsx js sheet has_header prefix Date.Date) + Table.Table (Reader.read_xlsx js sheet cell_range has_header prefix Date.Date) ## Reads the contents of `this` and parses them as an XLS dataframe. @@ -85,6 +93,10 @@ from_xlsx file sheet=Nothing has_header=True prefix='C' = the active sheet (i.e. the one that would be displayed when the file is opened in Excel). Can be a 0-based index to specify the number of the sheet to read, or a `Text` corresponding to the sheet name. + - cell_range: specifies a cell range to read from the sheet. If not provided + (default), a range containing all non-empty cells will be selected. If + provided, this must be a valid Excel range address, e.g. `'A1:B5'`, + `'A:WX'`, or `'5:64'`. - has_header: Specifies whether the first row of the sheet should be interpreted as a header, containing storage names. If set to `False`, storage names will be automatically generated. 
@@ -108,9 +120,9 @@ from_xlsx file sheet=Nothing has_header=True prefix='C' = import Standard.Examples example_xls_to_table = Examples.xls.read_xls sheet='Dates' -File.File.read_xls : Integer | Text | Nothing -> Boolean -> Text -> Table -File.File.read_xls sheet=Nothing has_header=True prefix='C' = - here.from_xls this sheet has_header prefix +File.File.read_xls : Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table +File.File.read_xls sheet=Nothing cell_range=Nothing has_header=True prefix='C' = + here.from_xls this sheet cell_range has_header prefix ## Reads the contents of `this` and parses them as an XLS dataframe. @@ -120,6 +132,10 @@ File.File.read_xls sheet=Nothing has_header=True prefix='C' = the active sheet (i.e. the one that would be displayed when the file is opened in Excel). Can be a 0-based index to specify the number of the sheet to read, or a `Text` corresponding to the sheet name. + - cell_range: specifies a cell range to read from the sheet. If not provided + (default), a range containing all non-empty cells will be selected. If + provided, this must be a valid Excel range address, e.g. `'A1:B5'`, + `'A:WX'`, or `'5:64'`. - has_header: Specifies whether the first row of the sheet should be interpreted as a header, containing storage names. If set to `False`, storage names will be automatically generated. 
@@ -143,9 +159,9 @@ File.File.read_xls sheet=Nothing has_header=True prefix='C' = import Standard.Examples example_xls_to_table = Table.from_xls Examples.xls sheet='Dates' -from_xls : File.File -> Integer | Text | Nothing -> Boolean -> Text -> Table -from_xls file sheet=Nothing has_header=True prefix='C' = +from_xls : File.File -> Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table +from_xls file sheet=Nothing cell_range=Nothing has_header=True prefix='C' = file.with_input_stream [File.Option.Read] stream-> stream.with_java_stream js-> - Table.Table (Reader.read_xls js sheet has_header prefix Date.Date) + Table.Table (Reader.read_xls js sheet cell_range has_header prefix Date.Date) diff --git a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java index 8c29adb09600..eb95721c54c5 100644 --- a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java +++ b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java @@ -2,6 +2,7 @@ import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.*; +import org.apache.poi.ss.util.CellRangeAddress; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.enso.table.data.column.builder.object.Builder; import org.enso.table.data.column.builder.object.InferredBuilder; @@ -27,6 +28,9 @@ public class Reader { * @param sheetIdx specifies which sheet should be read. If the value is a {@link Long}, it is * used as a 0-based index of the sheet. If it is a {@link String}, it is used as a sheet * name. Otherwise, the active sheet is read. + * @param cellRange specifies a cell range to read from the sheet. If not provided (default), a + * range containing all non-empty cells will be selected. If provided, this must be a valid + * Excel range address. * @param hasHeaders specifies whether the first non-empty row of the sheet should be used for * column names. 
* @param unnamedColumnPrefix specifies the prefix to use for missing columns. @@ -38,12 +42,18 @@ public class Reader { public static Table read_xlsx( InputStream inputStream, Object sheetIdx, + String cellRange, boolean hasHeaders, String unnamedColumnPrefix, Function mkDate) throws IOException { return read_table( - new XSSFWorkbook(inputStream), sheetIdx, hasHeaders, unnamedColumnPrefix, mkDate); + new XSSFWorkbook(inputStream), + sheetIdx, + cellRange, + hasHeaders, + unnamedColumnPrefix, + mkDate); } /** @@ -53,6 +63,9 @@ public static Table read_xlsx( * @param sheetIdx specifies which sheet should be read. If the value is a {@link Long}, it is * used as a 0-based index of the sheet. If it is a {@link String}, it is used as a sheet * name. Otherwise, the active sheet is read. + * @param cellRange specifies a cell range to read from the sheet. If not provided (default), a + * range containing all non-empty cells will be selected. If provided, this must be a valid + * Excel range address. * @param hasHeaders specifies whether the first non-empty row of the sheet should be used for * column names. * @param unnamedColumnPrefix specifies the prefix to use for missing columns. 
@@ -64,16 +77,19 @@ public static Table read_xlsx( public static Table read_xls( InputStream is, Object sheetIdx, + String cellRange, boolean hasHeaders, String unnamedColumnPrefix, Function mkDate) throws IOException { - return read_table(new HSSFWorkbook(is), sheetIdx, hasHeaders, unnamedColumnPrefix, mkDate); + return read_table( + new HSSFWorkbook(is), sheetIdx, cellRange, hasHeaders, unnamedColumnPrefix, mkDate); } private static Table read_table( Workbook workbook, Object sheetIdx, + String cellRange, boolean hasHeaders, String unnamedColumnPrefix, Function mkDate) @@ -87,8 +103,23 @@ private static Table read_table( if (sheet == null) { sheet = workbook.getSheetAt(workbook.getActiveSheetIndex()); } - int minRow = sheet.getFirstRowNum(); - int maxRow = sheet.getLastRowNum(); + + int minRowSpecified, maxRowSpecified, minColSpecified, maxColSpecified; + if (cellRange != null) { + var range = CellRangeAddress.valueOf(cellRange); + minRowSpecified = range.getFirstRow(); + maxRowSpecified = range.getLastRow() == -1 ? Integer.MAX_VALUE : range.getLastRow(); + minColSpecified = range.getFirstColumn(); + maxColSpecified = range.getLastColumn() == -1 ? 
Integer.MAX_VALUE - 1 : range.getLastColumn(); + } else { + minRowSpecified = -1; + maxRowSpecified = Integer.MAX_VALUE; + minColSpecified = 0; + maxColSpecified = Integer.MAX_VALUE - 1; + } + + int minRow = Math.max(sheet.getFirstRowNum(), minRowSpecified); + int maxRow = Math.min(sheet.getLastRowNum(), maxRowSpecified); if (minRow == -1) { return new Table(new Column[0]); } @@ -107,6 +138,8 @@ private static Table read_table( if (minCol >= maxCol) { return new Table(new Column[0]); } + minCol = Math.max(minCol, minColSpecified); + maxCol = Math.min(maxCol, maxColSpecified + 1); List colNames = new ArrayList<>(maxCol - minCol); if (hasHeaders) { diff --git a/test/Table_Tests/src/Spreadsheet_Spec.enso b/test/Table_Tests/src/Spreadsheet_Spec.enso index dda92e279288..47bdbed44765 100644 --- a/test/Table_Tests/src/Spreadsheet_Spec.enso +++ b/test/Table_Tests/src/Spreadsheet_Spec.enso @@ -35,6 +35,23 @@ spec_fmt header file read_method = t.columns.map .name . should_equal ['Item', 'Price', 'Quantity', 'Price 1'] t.at 'Price 1' . to_vector . should_equal [20, 40, 0, 60, 0, 10] + Test.specify "should allow reading with cell range specified" <| + t_1 = read_method file sheet="Simple" cell_range="B:C" + t_1.columns.map .name . should_equal ['Quantity', 'Price'] + t_1.at 'Quantity' . to_vector . should_equal [10, 20, Nothing, 30, Nothing, 5] + t_1.at 'Price' . to_vector . should_equal [22.3, 32, 43.2, 54, 31, Nothing] + + t_2 = read_method file sheet="Simple" cell_range="3:5" has_header=False + t_2.columns.length.should_equal 3 + t_2.at 'C0' . to_vector . should_equal ['t-shirt', 'trousers', 'shoes'] + t_2.at 'C1' . to_vector . should_equal [20, Nothing, 30] + t_2.at 'C2' . to_vector . should_equal [32, 43.2, 54] + + t_3 = read_method file sheet="Simple" cell_range="B4:C5" has_header=False + t_3.columns.length.should_equal 2 + t_3.at 'C0' . to_vector . should_equal [Nothing, 30] + t_3.at 'C1' . to_vector . 
should_equal [43.2, 54] + spec = here.spec_fmt 'XLSX reading' Examples.xlsx .read_xlsx