Allow specifying a cell range when reading spreadsheets (#1954)

enso-org · Aug 18, 2021 · 5aa65fd · 5aa65fd
1 parent ebdcca8
commit 5aa65fd
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 16 deletions.
diff --git a/RELEASES.md b/RELEASES.md
@@ -4,6 +4,8 @@
 
 - Fixed a bug where reading binary and text files would be 100 times slower than
   expected ([#1949](https://github.com/enso-org/enso/pull/1949)).
+- Added the ability to specify cell ranges for reading XLS and XLSX spreadsheets
+  ([#1954](https://github.com/enso-org/enso/pull/1954)).
 
 # Enso 0.2.24 (2021-08-13)
 

diff --git a/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso b/distribution/lib/Standard/Table/0.2.25-SNAPSHOT/src/Io/Spreadsheet.enso
@@ -14,6 +14,10 @@ polyglot java import org.enso.table.format.xlsx.Reader
      the active sheet (i.e. the one that would be displayed when the file is
      opened in Excel). Can be a 0-based index to specify the number of the sheet
      to read, or a `Text` corresponding to the sheet name.
+   - cell_range: Specifies a cell range to read from the sheet. If not provided
+     (default), a range containing all non-empty cells will be selected. If
+     provided, this must be a valid Excel range address, e.g. `'A1:B5'`,
+     `'A:WX'`, or `'5:64'`.
    - has_header: Specifies whether the first row of the sheet should be
      interpreted as a header, containing storage names. If set to `False`,
      storage names will be automatically generated.
@@ -37,9 +41,9 @@ polyglot java import org.enso.table.format.xlsx.Reader
          import Standard.Examples
 
          example_xlsx_to_table = Examples.xlsx.read_xlsx sheet='Dates'
-File.File.read_xlsx : Integer | Text | Nothing -> Boolean -> Text -> Table
-File.File.read_xlsx sheet=Nothing has_header=True prefix='C' =
-    here.from_xlsx this sheet has_header prefix
+File.File.read_xlsx : Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table
+File.File.read_xlsx sheet=Nothing cell_range=Nothing has_header=True prefix='C' =
+    here.from_xlsx this sheet cell_range has_header prefix
 
 ## Reads the contents of `this` and parses them as an XLSX dataframe.
 
@@ -49,6 +53,10 @@ File.File.read_xlsx sheet=Nothing has_header=True prefix='C' =
      the active sheet (i.e. the one that would be displayed when the file is
      opened in Excel). Can be a 0-based index to specify the number of the sheet
      to read, or a `Text` corresponding to the sheet name.
+   - cell_range: Specifies a cell range to read from the sheet. If not provided
+     (default), a range containing all non-empty cells will be selected. If
+     provided, this must be a valid Excel range address, e.g. `'A1:B5'`,
+     `'A:WX'`, or `'5:64'`.
    - has_header: Specifies whether the first row of the sheet should be
      interpreted as a header, containing storage names. If set to `False`,
      storage names will be automatically generated.
@@ -72,11 +80,11 @@ File.File.read_xlsx sheet=Nothing has_header=True prefix='C' =
          import Standard.Examples
 
          example_xlsx_to_table = Table.from_xlsx Examples.xlsx sheet='Dates'
-from_xlsx : File.File -> Integer | Text | Nothing -> Boolean -> Text -> Table
-from_xlsx file sheet=Nothing has_header=True prefix='C' =
+from_xlsx : File.File -> Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table
+from_xlsx file sheet=Nothing cell_range=Nothing has_header=True prefix='C' =
     file.with_input_stream [File.Option.Read] stream->
         stream.with_java_stream js->
-            Table.Table (Reader.read_xlsx js sheet has_header prefix Date.Date)
+            Table.Table (Reader.read_xlsx js sheet cell_range has_header prefix Date.Date)
 
 ## Reads the contents of `this` and parses them as an XLS dataframe.
 
@@ -85,6 +93,10 @@ from_xlsx file sheet=Nothing has_header=True prefix='C' =
      the active sheet (i.e. the one that would be displayed when the file is
      opened in Excel). Can be a 0-based index to specify the number of the sheet
      to read, or a `Text` corresponding to the sheet name.
+   - cell_range: Specifies a cell range to read from the sheet. If not provided
+     (default), a range containing all non-empty cells will be selected. If
+     provided, this must be a valid Excel range address, e.g. `'A1:B5'`,
+     `'A:WX'`, or `'5:64'`.
    - has_header: Specifies whether the first row of the sheet should be
      interpreted as a header, containing storage names. If set to `False`,
      storage names will be automatically generated.
@@ -108,9 +120,9 @@ from_xlsx file sheet=Nothing has_header=True prefix='C' =
          import Standard.Examples
 
          example_xls_to_table = Examples.xls.read_xls sheet='Dates'
-File.File.read_xls : Integer | Text | Nothing -> Boolean -> Text -> Table
-File.File.read_xls sheet=Nothing has_header=True prefix='C' =
-    here.from_xls this sheet has_header prefix
+File.File.read_xls : Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table
+File.File.read_xls sheet=Nothing cell_range=Nothing has_header=True prefix='C' =
+    here.from_xls this sheet cell_range has_header prefix
 
 ## Reads the contents of `this` and parses them as an XLS dataframe.
 
@@ -120,6 +132,10 @@ File.File.read_xls sheet=Nothing has_header=True prefix='C' =
      the active sheet (i.e. the one that would be displayed when the file is
      opened in Excel). Can be a 0-based index to specify the number of the sheet
      to read, or a `Text` corresponding to the sheet name.
+   - cell_range: Specifies a cell range to read from the sheet. If not provided
+     (default), a range containing all non-empty cells will be selected. If
+     provided, this must be a valid Excel range address, e.g. `'A1:B5'`,
+     `'A:WX'`, or `'5:64'`.
    - has_header: Specifies whether the first row of the sheet should be
      interpreted as a header, containing storage names. If set to `False`,
      storage names will be automatically generated.
@@ -143,9 +159,9 @@ File.File.read_xls sheet=Nothing has_header=True prefix='C' =
          import Standard.Examples
 
          example_xls_to_table = Table.from_xls Examples.xls sheet='Dates'
-from_xls : File.File -> Integer | Text | Nothing -> Boolean -> Text -> Table
-from_xls file sheet=Nothing has_header=True prefix='C' =
+from_xls : File.File -> Integer | Text | Nothing -> Text | Nothing -> Boolean -> Text -> Table
+from_xls file sheet=Nothing cell_range=Nothing has_header=True prefix='C' =
     file.with_input_stream [File.Option.Read] stream->
         stream.with_java_stream js->
-            Table.Table (Reader.read_xls js sheet has_header prefix Date.Date)
+            Table.Table (Reader.read_xls js sheet cell_range has_header prefix Date.Date)
 
diff --git a/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java b/std-bits/table/src/main/java/org/enso/table/format/xlsx/Reader.java
@@ -2,6 +2,7 @@
 
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.ss.usermodel.*;
+import org.apache.poi.ss.util.CellRangeAddress;
 import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.enso.table.data.column.builder.object.Builder;
 import org.enso.table.data.column.builder.object.InferredBuilder;
@@ -27,6 +28,9 @@ public class Reader {
    * @param sheetIdx specifies which sheet should be read. If the value is a {@link Long}, it is
    *     used as a 0-based index of the sheet. If it is a {@link String}, it is used as a sheet
    *     name. Otherwise, the active sheet is read.
+   * @param cellRange specifies a cell range to read from the sheet. If {@code null}, a range
+   *     containing all non-empty cells will be selected. Otherwise, this must be a valid Excel
+   *     range address, e.g. {@code "A1:B5"}, {@code "A:WX"}, or {@code "5:64"}.
    * @param hasHeaders specifies whether the first non-empty row of the sheet should be used for
    *     column names.
    * @param unnamedColumnPrefix specifies the prefix to use for missing columns.
@@ -38,12 +42,18 @@ public class Reader {
   public static Table read_xlsx(
       InputStream inputStream,
       Object sheetIdx,
+      String cellRange,
       boolean hasHeaders,
       String unnamedColumnPrefix,
       Function<LocalDate, Value> mkDate)
       throws IOException {
     return read_table(
-        new XSSFWorkbook(inputStream), sheetIdx, hasHeaders, unnamedColumnPrefix, mkDate);
+        new XSSFWorkbook(inputStream),
+        sheetIdx,
+        cellRange,
+        hasHeaders,
+        unnamedColumnPrefix,
+        mkDate);
   }
 
   /**
@@ -53,6 +63,9 @@ public static Table read_xlsx(
    * @param sheetIdx specifies which sheet should be read. If the value is a {@link Long}, it is
    *     used as a 0-based index of the sheet. If it is a {@link String}, it is used as a sheet
    *     name. Otherwise, the active sheet is read.
+   * @param cellRange specifies a cell range to read from the sheet. If {@code null}, a range
+   *     containing all non-empty cells will be selected. Otherwise, this must be a valid Excel
+   *     range address, e.g. {@code "A1:B5"}, {@code "A:WX"}, or {@code "5:64"}.
    * @param hasHeaders specifies whether the first non-empty row of the sheet should be used for
    *     column names.
    * @param unnamedColumnPrefix specifies the prefix to use for missing columns.
@@ -64,16 +77,19 @@ public static Table read_xlsx(
   public static Table read_xls(
       InputStream is,
       Object sheetIdx,
+      String cellRange,
       boolean hasHeaders,
       String unnamedColumnPrefix,
       Function<LocalDate, Value> mkDate)
       throws IOException {
-    return read_table(new HSSFWorkbook(is), sheetIdx, hasHeaders, unnamedColumnPrefix, mkDate);
+    return read_table(
+        new HSSFWorkbook(is), sheetIdx, cellRange, hasHeaders, unnamedColumnPrefix, mkDate);
   }
 
   private static Table read_table(
       Workbook workbook,
       Object sheetIdx,
+      String cellRange,
       boolean hasHeaders,
       String unnamedColumnPrefix,
       Function<LocalDate, Value> mkDate)
@@ -87,8 +103,23 @@ private static Table read_table(
     if (sheet == null) {
       sheet = workbook.getSheetAt(workbook.getActiveSheetIndex());
     }
-    int minRow = sheet.getFirstRowNum();
-    int maxRow = sheet.getLastRowNum();
+
+    int minRowSpecified, maxRowSpecified, minColSpecified, maxColSpecified;
+    if (cellRange != null) {
+      var range = CellRangeAddress.valueOf(cellRange);
+      minRowSpecified = range.getFirstRow();
+      maxRowSpecified = range.getLastRow() == -1 ? Integer.MAX_VALUE : range.getLastRow();
+      minColSpecified = range.getFirstColumn();
+      maxColSpecified = range.getLastColumn() == -1 ? Integer.MAX_VALUE - 1 : range.getLastColumn();
+    } else {
+      minRowSpecified = -1;
+      maxRowSpecified = Integer.MAX_VALUE;
+      minColSpecified = 0;
+      maxColSpecified = Integer.MAX_VALUE - 1;
+    }
+
+    int minRow = Math.max(sheet.getFirstRowNum(), minRowSpecified);
+    int maxRow = Math.min(sheet.getLastRowNum(), maxRowSpecified);
     if (minRow == -1) {
       return new Table(new Column[0]);
     }
@@ -107,6 +138,8 @@ private static Table read_table(
     if (minCol >= maxCol) {
       return new Table(new Column[0]);
     }
+    minCol = Math.max(minCol, minColSpecified);
+    maxCol = Math.min(maxCol, maxColSpecified + 1);
 
     List<String> colNames = new ArrayList<>(maxCol - minCol);
     if (hasHeaders) {

diff --git a/test/Table_Tests/src/Spreadsheet_Spec.enso b/test/Table_Tests/src/Spreadsheet_Spec.enso
@@ -35,6 +35,23 @@ spec_fmt header file read_method =
             t.columns.map .name . should_equal ['Item', 'Price', 'Quantity', 'Price 1']
             t.at 'Price 1' . to_vector . should_equal [20, 40, 0, 60, 0, 10]
 
+        Test.specify "should allow reading with cell range specified" <|
+            t_1 = read_method file sheet="Simple" cell_range="B:C"
+            t_1.columns.map .name . should_equal ['Quantity', 'Price']
+            t_1.at 'Quantity' . to_vector . should_equal [10, 20, Nothing, 30, Nothing, 5]
+            t_1.at 'Price' . to_vector . should_equal [22.3, 32, 43.2, 54, 31, Nothing]
+
+            t_2 = read_method file sheet="Simple" cell_range="3:5" has_header=False
+            t_2.columns.length.should_equal 3
+            t_2.at 'C0' . to_vector . should_equal ['t-shirt', 'trousers', 'shoes']
+            t_2.at 'C1' . to_vector . should_equal [20, Nothing, 30]
+            t_2.at 'C2' . to_vector . should_equal [32, 43.2, 54]
+
+            t_3 = read_method file sheet="Simple" cell_range="B4:C5" has_header=False
+            t_3.columns.length.should_equal 2
+            t_3.at 'C0' . to_vector . should_equal [Nothing, 30]
+            t_3.at 'C1' . to_vector . should_equal [43.2, 54]
+
 spec =
     here.spec_fmt 'XLSX reading' Examples.xlsx .read_xlsx