feat(python): Change default engine for read_excel to "calamine"

pola-rs · Jun 29, 2024 · 4f824e9 · 4f824e9
1 parent 6a55411
commit 4f824e9
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 61 deletions.
diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -45,7 +45,7 @@ def read_excel(
     *,
     sheet_id: None = ...,
     sheet_name: str,
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -60,7 +60,7 @@ def read_excel(
     *,
     sheet_id: None = ...,
     sheet_name: None = ...,
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -75,7 +75,7 @@ def read_excel(
     *,
     sheet_id: int,
     sheet_name: str,
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -92,7 +92,7 @@ def read_excel(
     *,
     sheet_id: Literal[0] | Sequence[int],
     sheet_name: None = ...,
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -107,7 +107,7 @@ def read_excel(
     *,
     sheet_id: int,
     sheet_name: None = ...,
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -122,7 +122,7 @@ def read_excel(
     *,
     sheet_id: None,
     sheet_name: list[str] | tuple[str],
-    engine: ExcelSpreadsheetEngine | None = ...,
+    engine: ExcelSpreadsheetEngine = ...,
     engine_options: dict[str, Any] | None = ...,
     read_options: dict[str, Any] | None = ...,
     schema_overrides: SchemaDict | None = ...,
@@ -138,7 +138,7 @@ def read_excel(
     *,
     sheet_id: int | Sequence[int] | None = None,
     sheet_name: str | list[str] | tuple[str] | None = None,
-    engine: ExcelSpreadsheetEngine | None = None,
+    engine: ExcelSpreadsheetEngine = "calamine",
     engine_options: dict[str, Any] | None = None,
     read_options: dict[str, Any] | None = None,
     schema_overrides: SchemaDict | None = None,
@@ -166,34 +166,34 @@ def read_excel(
     sheet_name
         Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more
         than one is given then a `{sheetname:frame,}` dict is returned.
-    engine
-        Library used to parse the spreadsheet file; currently defaults to "xlsx2csv"
+    engine : {'calamine', 'xlsx2csv', 'openpyxl'}
+        Library used to parse the spreadsheet file; defaults to "calamine"
         if not explicitly set.
 
-        * "xlsx2csv": converts the data to an in-memory CSV before using the native
-          polars `read_csv` method to parse the result. You can pass `engine_options`
-          and `read_options` to refine the conversion.
         * "calamine": this engine can be used for reading all major types of Excel
           Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the
           other options, using the `fastexcel` module to bind the calamine reader.
+        * "xlsx2csv": converts the data to an in-memory CSV before using the native
+          polars `read_csv` method to parse the result. You can pass `engine_options`
+          and `read_options` to refine the conversion.
         * "openpyxl": this engine is significantly slower than `xlsx2csv` but supports
           additional automatic type inference; potentially useful if you are otherwise
-          unable to parse your sheet with the (default) `xlsx2csv` engine in
-          conjunction with the `schema_overrides` parameter.
+          unable to parse your sheet with the `xlsx2csv` engine in conjunction with the
+          `schema_overrides` parameter.
     engine_options
         Additional options passed to the underlying engine's primary parsing
         constructor (given below), if supported:
 
-        * "xlsx2csv": `Xlsx2csv`
         * "calamine": n/a (can only provide `read_options`)
+        * "xlsx2csv": `Xlsx2csv`
         * "openpyxl": `load_workbook`
     read_options
         Options passed to the underlying engine method that reads the sheet data.
         Where supported, this allows for additional control over parsing. The
         specific read methods associated with each engine are:
 
-        * "xlsx2csv": `pl.read_csv`
         * "calamine": `ExcelReader.load_sheet_by_name`
+        * "xlsx2csv": `pl.read_csv`
         * "openpyxl": n/a (can only provide `engine_options`)
     schema_overrides
         Support type specification or override of one or more columns.
@@ -206,26 +206,29 @@ def read_excel(
         When there is no data in the sheet,`NoDataError` is raised. If this parameter
         is set to False, an empty DataFrame (with no columns) is returned instead.
 
+    Returns
+    -------
+    DataFrame
+        If reading a single sheet.
+    dict
+        If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
+
+    See Also
+    --------
+    read_ods
+
     Notes
     -----
-    * When using the default `xlsx2csv` engine the target Excel sheet is first converted
+    * Where possible, prefer the default "calamine" engine for reading Excel Workbooks,
+      as it is significantly faster than the other options.
+    * When using the `xlsx2csv` engine the target Excel sheet is first converted
       to CSV using `xlsx2csv.Xlsx2csv(source).convert()` and then parsed with Polars'
       :func:`read_csv` function. You can pass additional options to `read_options`
       to influence this part of the parsing pipeline.
-    * Where possible, prefer the "calamine" engine for reading Excel Workbooks, as it is
-      significantly faster than the other options, and is intended to become the default
-      engine for all Excel file types in a future release.
     * If you want to read multiple sheets and set *different* options (`read_options`,
       `schema_overrides`, etc), you should make separate calls as the options are set
       globally, not on a per-sheet basis.
 
-    Returns
-    -------
-    DataFrame
-        If reading a single sheet.
-    dict
-        If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
-
     Examples
     --------
     Read the "data" worksheet from an Excel file into a DataFrame.
@@ -235,30 +238,26 @@ def read_excel(
     ...     sheet_name="data",
     ... )  # doctest: +SKIP
 
-    Read table data from sheet 3 in an Excel workbook as a DataFrame while skipping
-    empty lines in the sheet. As sheet 3 does not have a header row and the default
-    engine is `xlsx2csv` you can pass the necessary additional settings for this
-    to the "read_options" parameter; these will be passed to :func:`read_csv`.
+    If the correct dtypes can't be determined, use the `schema_overrides` parameter
+    to specify them, or increase the inference length with `infer_schema_length`.
 
     >>> pl.read_excel(
     ...     source="test.xlsx",
-    ...     sheet_id=3,
-    ...     engine_options={"skip_empty_lines": True},
-    ...     read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
+    ...     schema_overrides={"dt": pl.Date},
+    ...     infer_schema_length=None,
     ... )  # doctest: +SKIP
 
-    If the correct datatypes can't be determined you can use `schema_overrides` and/or
-    some of the :func:`read_csv` documentation to see which options you can pass to fix
-    this issue. For example, if using `xlsx2csv` or `calamine` the "infer_schema_length"
-    parameter can be set to `None` to force reading the entire dataset to infer the
-    best dtypes. If column types are known in advance, and there is no ambiguity in the
-    parsing, `schema_overrides` is typically the more efficient option.
+    Using the `xlsx2csv` engine, read table data from sheet 3 in an Excel workbook as a
+    DataFrame while skipping empty lines in the sheet. As sheet 3 does not have a header
+    row, you can pass the necessary additional settings for this to the `read_options`
+    parameter; these will be passed to :func:`read_csv`.
 
     >>> pl.read_excel(
     ...     source="test.xlsx",
-    ...     schema_overrides={"dt": pl.Date},
-    ...     infer_schema_length=None,
-    ...     engine="calamine",
+    ...     sheet_id=3,
+    ...     engine="xlsx2csv",
+    ...     engine_options={"skip_empty_lines": True},
+    ...     read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
     ... )  # doctest: +SKIP
     """
     return _read_spreadsheet(
@@ -386,6 +385,10 @@ def read_ods(
     -------
     DataFrame, or a `{sheetname: DataFrame, ...}` dict if reading multiple sheets.
 
+    See Also
+    --------
+    read_excel
+
     Examples
     --------
     Read the "data" worksheet from an OpenOffice spreadsheet file into a DataFrame.
@@ -464,30 +467,19 @@ def _read_spreadsheet(
     sheet_id: int | Sequence[int] | None,
     sheet_name: str | list[str] | tuple[str] | None,
     source: str | Path | IO[bytes] | bytes,
-    engine: ExcelSpreadsheetEngine | Literal["ods"] | None,
+    engine: ExcelSpreadsheetEngine,
     engine_options: dict[str, Any] | None = None,
     read_options: dict[str, Any] | None = None,
     schema_overrides: SchemaDict | None = None,
     infer_schema_length: int | None = N_INFER_DEFAULT,
     *,
     raise_if_empty: bool = True,
 ) -> pl.DataFrame | dict[str, pl.DataFrame]:
-    if is_file := isinstance(source, (str, Path)):
+    if isinstance(source, (str, Path)):
         source = normalize_filepath(source)
         if looks_like_url(source):
             source = process_file_url(source)
 
-    if engine is None:
-        if is_file and str(source).lower().endswith(".ods"):
-            # note: if called from "read_ods" the engine cannot be 'None', hence
-            # this check is only triggered when called from "read_excel"
-            msg = "OpenDocumentSpreadsheet files require use of `read_ods`, not `read_excel`"
-            raise ValueError(msg)
-
-        # note: eventually want 'calamine' to be the default for all extensions
-        file_type = _identify_workbook(source)
-        engine = "calamine" if file_type in ("xlsb", "xls") else "xlsx2csv"
-
     read_options = (read_options or {}).copy()
     engine_options = (engine_options or {}).copy()
 

diff --git a/py-polars/tests/unit/io/test_spreadsheet.py b/py-polars/tests/unit/io/test_spreadsheet.py
@@ -86,10 +86,8 @@ def path_ods_mixed(io_files_path: Path) -> Path:
     [
         # xls file
         (pl.read_excel, "path_xls", {"engine": "calamine"}),
-        (pl.read_excel, "path_xls", {"engine": None}),  # << autodetect
         # xlsx file
         (pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
-        (pl.read_excel, "path_xlsx", {"engine": None}),  # << autodetect
         (pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
         (pl.read_excel, "path_xlsx", {"engine": "calamine"}),
         # xlsb file (binary)
@@ -392,6 +390,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
     df2 = pl.read_excel(
         path_xlsx,
         sheet_name="test4",
+        engine="xlsx2csv",
         read_options={"schema_overrides": {"cardinality": pl.UInt16}},
     ).drop_nulls()
 
@@ -402,6 +401,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
     df3 = pl.read_excel(
         path_xlsx,
         sheet_name="test4",
+        engine="xlsx2csv",
         schema_overrides={"cardinality": pl.UInt16},
         read_options={
             "schema_overrides": {
@@ -444,6 +444,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
         pl.read_excel(
             path_xlsx,
             sheet_name="test4",
+            engine="xlsx2csv",
             schema_overrides={"cardinality": pl.UInt16},
             read_options={"schema_overrides": {"cardinality": pl.Int32}},
         )
@@ -834,9 +835,9 @@ def test_excel_empty_sheet(
     with pytest.raises(NoDataError, match="empty Excel sheet"):
         read_spreadsheet(empty_spreadsheet_path)
 
-    engine_params = [{}] if ods else [{"engine": None}, {"engine": "calamine"}]
+    engine_params = [{}] if ods else [{"engine": "calamine"}]
     for params in engine_params:
-        df = read_spreadsheet(  # type: ignore[arg-type]
+        df = read_spreadsheet(
             empty_spreadsheet_path,
             sheet_name="no_data",
             raise_if_empty=False,
@@ -845,7 +846,7 @@ def test_excel_empty_sheet(
         expected = pl.DataFrame()
         assert_frame_equal(df, expected)
 
-        df = read_spreadsheet(  # type: ignore[arg-type]
+        df = read_spreadsheet(
             empty_spreadsheet_path,
             sheet_name="no_rows",
             raise_if_empty=False,