Skip to content

Commit

Permalink
feat(python): Change default engine for read_excel to "calamine"
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored and alexander-beedie committed Jun 29, 2024
1 parent 6a55411 commit 4f824e9
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 61 deletions.
104 changes: 48 additions & 56 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def read_excel(
*,
sheet_id: None = ...,
sheet_name: str,
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -60,7 +60,7 @@ def read_excel(
*,
sheet_id: None = ...,
sheet_name: None = ...,
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -75,7 +75,7 @@ def read_excel(
*,
sheet_id: int,
sheet_name: str,
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -92,7 +92,7 @@ def read_excel(
*,
sheet_id: Literal[0] | Sequence[int],
sheet_name: None = ...,
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -107,7 +107,7 @@ def read_excel(
*,
sheet_id: int,
sheet_name: None = ...,
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -122,7 +122,7 @@ def read_excel(
*,
sheet_id: None,
sheet_name: list[str] | tuple[str],
engine: ExcelSpreadsheetEngine | None = ...,
engine: ExcelSpreadsheetEngine = ...,
engine_options: dict[str, Any] | None = ...,
read_options: dict[str, Any] | None = ...,
schema_overrides: SchemaDict | None = ...,
Expand All @@ -138,7 +138,7 @@ def read_excel(
*,
sheet_id: int | Sequence[int] | None = None,
sheet_name: str | list[str] | tuple[str] | None = None,
engine: ExcelSpreadsheetEngine | None = None,
engine: ExcelSpreadsheetEngine = "calamine",
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
schema_overrides: SchemaDict | None = None,
Expand Down Expand Up @@ -166,34 +166,34 @@ def read_excel(
sheet_name
Sheet name(s) to convert; cannot be used in conjunction with `sheet_id`. If more
than one is given then a `{sheetname:frame,}` dict is returned.
engine
Library used to parse the spreadsheet file; currently defaults to "xlsx2csv"
engine : {'calamine', 'xlsx2csv', 'openpyxl'}
Library used to parse the spreadsheet file; defaults to "calamine"
if not explicitly set.
* "xlsx2csv": converts the data to an in-memory CSV before using the native
polars `read_csv` method to parse the result. You can pass `engine_options`
and `read_options` to refine the conversion.
* "calamine": this engine can be used for reading all major types of Excel
Workbook (`.xlsx`, `.xlsb`, `.xls`) and is *dramatically* faster than the
other options, using the `fastexcel` module to bind the calamine reader.
* "xlsx2csv": converts the data to an in-memory CSV before using the native
polars `read_csv` method to parse the result. You can pass `engine_options`
and `read_options` to refine the conversion.
* "openpyxl": this engine is significantly slower than `xlsx2csv` but supports
additional automatic type inference; potentially useful if you are otherwise
unable to parse your sheet with the (default) `xlsx2csv` engine in
conjunction with the `schema_overrides` parameter.
unable to parse your sheet with the `xlsx2csv` engine in conjunction with the
`schema_overrides` parameter.
engine_options
Additional options passed to the underlying engine's primary parsing
constructor (given below), if supported:
* "xlsx2csv": `Xlsx2csv`
* "calamine": n/a (can only provide `read_options`)
* "xlsx2csv": `Xlsx2csv`
* "openpyxl": `load_workbook`
read_options
Options passed to the underlying engine method that reads the sheet data.
Where supported, this allows for additional control over parsing. The
specific read methods associated with each engine are:
* "xlsx2csv": `pl.read_csv`
* "calamine": `ExcelReader.load_sheet_by_name`
* "xlsx2csv": `pl.read_csv`
* "openpyxl": n/a (can only provide `engine_options`)
schema_overrides
Support type specification or override of one or more columns.
Expand All @@ -206,26 +206,29 @@ def read_excel(
When there is no data in the sheet, `NoDataError` is raised. If this parameter
is set to False, an empty DataFrame (with no columns) is returned instead.
Returns
-------
DataFrame
If reading a single sheet.
dict
If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
See Also
--------
read_ods
Notes
-----
* When using the default `xlsx2csv` engine the target Excel sheet is first converted
* Where possible, prefer the default "calamine" engine for reading Excel Workbooks,
as it is significantly faster than the other options.
* When using the `xlsx2csv` engine the target Excel sheet is first converted
to CSV using `xlsx2csv.Xlsx2csv(source).convert()` and then parsed with Polars'
:func:`read_csv` function. You can pass additional options to `read_options`
to influence this part of the parsing pipeline.
* Where possible, prefer the "calamine" engine for reading Excel Workbooks, as it is
significantly faster than the other options, and is intended to become the default
engine for all Excel file types in a future release.
* If you want to read multiple sheets and set *different* options (`read_options`,
`schema_overrides`, etc), you should make separate calls as the options are set
globally, not on a per-sheet basis.
Returns
-------
DataFrame
If reading a single sheet.
dict
If reading multiple sheets, a "{sheetname: DataFrame, ...}" dict is returned.
Examples
--------
Read the "data" worksheet from an Excel file into a DataFrame.
Expand All @@ -235,30 +238,26 @@ def read_excel(
... sheet_name="data",
... ) # doctest: +SKIP
Read table data from sheet 3 in an Excel workbook as a DataFrame while skipping
empty lines in the sheet. As sheet 3 does not have a header row and the default
engine is `xlsx2csv` you can pass the necessary additional settings for this
to the "read_options" parameter; these will be passed to :func:`read_csv`.
If the correct dtypes can't be determined, use the `schema_overrides` parameter
to specify them, or increase the inference length with `infer_schema_length`.
>>> pl.read_excel(
... source="test.xlsx",
... sheet_id=3,
... engine_options={"skip_empty_lines": True},
... read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
... schema_overrides={"dt": pl.Date},
... infer_schema_length=None,
... ) # doctest: +SKIP
If the correct datatypes can't be determined you can use `schema_overrides` and/or
some of the :func:`read_csv` documentation to see which options you can pass to fix
this issue. For example, if using `xlsx2csv` or `calamine` the "infer_schema_length"
parameter can be set to `None` to force reading the entire dataset to infer the
best dtypes. If column types are known in advance, and there is no ambiguity in the
parsing, `schema_overrides` is typically the more efficient option.
Using the `xlsx2csv` engine, read table data from sheet 3 in an Excel workbook as a
DataFrame while skipping empty lines in the sheet. As sheet 3 does not have a header
row, you can pass the necessary additional settings for this to the `read_options`
parameter; these will be passed to :func:`read_csv`.
>>> pl.read_excel(
... source="test.xlsx",
... schema_overrides={"dt": pl.Date},
... infer_schema_length=None,
... engine="calamine",
... sheet_id=3,
... engine="xlsx2csv",
... engine_options={"skip_empty_lines": True},
... read_options={"has_header": False, "new_columns": ["a", "b", "c"]},
... ) # doctest: +SKIP
"""
return _read_spreadsheet(
Expand Down Expand Up @@ -386,6 +385,10 @@ def read_ods(
-------
DataFrame, or a `{sheetname: DataFrame, ...}` dict if reading multiple sheets.
See Also
--------
read_excel
Examples
--------
Read the "data" worksheet from an OpenOffice spreadsheet file into a DataFrame.
Expand Down Expand Up @@ -464,30 +467,19 @@ def _read_spreadsheet(
sheet_id: int | Sequence[int] | None,
sheet_name: str | list[str] | tuple[str] | None,
source: str | Path | IO[bytes] | bytes,
engine: ExcelSpreadsheetEngine | Literal["ods"] | None,
engine: ExcelSpreadsheetEngine,
engine_options: dict[str, Any] | None = None,
read_options: dict[str, Any] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
*,
raise_if_empty: bool = True,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if is_file := isinstance(source, (str, Path)):
if isinstance(source, (str, Path)):
source = normalize_filepath(source)
if looks_like_url(source):
source = process_file_url(source)

if engine is None:
if is_file and str(source).lower().endswith(".ods"):
# note: if called from "read_ods" the engine cannot be 'None', hence
# this check is only triggered when called from "read_excel"
msg = "OpenDocumentSpreadsheet files require use of `read_ods`, not `read_excel`"
raise ValueError(msg)

# note: eventually want 'calamine' to be the default for all extensions
file_type = _identify_workbook(source)
engine = "calamine" if file_type in ("xlsb", "xls") else "xlsx2csv"

read_options = (read_options or {}).copy()
engine_options = (engine_options or {}).copy()

Expand Down
11 changes: 6 additions & 5 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,8 @@ def path_ods_mixed(io_files_path: Path) -> Path:
[
# xls file
(pl.read_excel, "path_xls", {"engine": "calamine"}),
(pl.read_excel, "path_xls", {"engine": None}), # << autodetect
# xlsx file
(pl.read_excel, "path_xlsx", {"engine": "xlsx2csv"}),
(pl.read_excel, "path_xlsx", {"engine": None}), # << autodetect
(pl.read_excel, "path_xlsx", {"engine": "openpyxl"}),
(pl.read_excel, "path_xlsx", {"engine": "calamine"}),
# xlsb file (binary)
Expand Down Expand Up @@ -392,6 +390,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
df2 = pl.read_excel(
path_xlsx,
sheet_name="test4",
engine="xlsx2csv",
read_options={"schema_overrides": {"cardinality": pl.UInt16}},
).drop_nulls()

Expand All @@ -402,6 +401,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
df3 = pl.read_excel(
path_xlsx,
sheet_name="test4",
engine="xlsx2csv",
schema_overrides={"cardinality": pl.UInt16},
read_options={
"schema_overrides": {
Expand Down Expand Up @@ -444,6 +444,7 @@ def test_schema_overrides(path_xlsx: Path, path_xlsb: Path, path_ods: Path) -> N
pl.read_excel(
path_xlsx,
sheet_name="test4",
engine="xlsx2csv",
schema_overrides={"cardinality": pl.UInt16},
read_options={"schema_overrides": {"cardinality": pl.Int32}},
)
Expand Down Expand Up @@ -834,9 +835,9 @@ def test_excel_empty_sheet(
with pytest.raises(NoDataError, match="empty Excel sheet"):
read_spreadsheet(empty_spreadsheet_path)

engine_params = [{}] if ods else [{"engine": None}, {"engine": "calamine"}]
engine_params = [{}] if ods else [{"engine": "calamine"}]
for params in engine_params:
df = read_spreadsheet( # type: ignore[arg-type]
df = read_spreadsheet(
empty_spreadsheet_path,
sheet_name="no_data",
raise_if_empty=False,
Expand All @@ -845,7 +846,7 @@ def test_excel_empty_sheet(
expected = pl.DataFrame()
assert_frame_equal(df, expected)

df = read_spreadsheet( # type: ignore[arg-type]
df = read_spreadsheet(
empty_spreadsheet_path,
sheet_name="no_rows",
raise_if_empty=False,
Expand Down

0 comments on commit 4f824e9

Please sign in to comment.