Skip to content

Commit

Permalink
DataFrame.to_excel (databricks#288)
Browse files Browse the repository at this point in the history
* DataFrame.to_excel

* Lint fix

* Lint fix

* Lint fix

* Added install of openpyxl as Excel engine

* Comparing the values of generated excel files

* Added xlrd as dependency

* Added xlrd as dependency

* Refactored the code

* Moving the libraries to test suite

* Used TestUtils functions for creating and removing files and folder

* Lint fix

* Lint fix

* Added precautionary note in inline doc of DataFrame.to_excel

* Using temp_dir function of TestUtils for creating temporary folder

* Removing redundant library

* Removing redundant library
  • Loading branch information
shril authored and athena15 committed May 13, 2019
1 parent 968d26d commit df2106b
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 3 deletions.
105 changes: 105 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,111 @@ def assign(self, **kwargs):
[name for name, _ in pairs if name not in self._metadata.column_fields]))
return DataFrame(sdf, metadata)

def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None,
             columns=None, header=True, index=True, index_label=None, startrow=0,
             startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep="inf",
             verbose=True, freeze_panes=None):
    """
    Write object to an Excel sheet.

    .. note:: This method should only be used if the resulting DataFrame is expected
              to be small, as all the data is loaded into the driver's memory.

    To write a single object to an Excel .xlsx file it is only necessary to
    specify a target file name. To write to multiple sheets it is necessary to
    create an `ExcelWriter` object with a target file name, and specify a sheet
    in the file to write to.

    Multiple sheets may be written to by specifying unique `sheet_name`.
    With all data written to the file it is necessary to save the changes.
    Note that creating an `ExcelWriter` object with a file name that already
    exists will result in the contents of the existing file being erased.

    Parameters
    ----------
    excel_writer : str or ExcelWriter object
        File path or existing ExcelWriter.
    sheet_name : str, default 'Sheet1'
        Name of sheet which will contain DataFrame.
    na_rep : str, default ''
        Missing data representation.
    float_format : str, optional
        Format string for floating point numbers. For example
        ``float_format="%.2f"`` will format 0.1234 to 0.12.
    columns : sequence or list of str, optional
        Columns to write.
    header : bool or list of str, default True
        Write out the column names. If a list of string is given it is
        assumed to be aliases for the column names.
    index : bool, default True
        Write row names (index).
    index_label : str or sequence, optional
        Column label for index column(s) if desired. If not specified, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the DataFrame uses MultiIndex.
    startrow : int, default 0
        Upper left cell row to dump data frame.
    startcol : int, default 0
        Upper left cell column to dump data frame.
    engine : str, optional
        Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
        via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
        ``io.excel.xlsm.writer``.
    merge_cells : bool, default True
        Write MultiIndex and Hierarchical Rows as merged cells.
    encoding : str, optional
        Encoding of the resulting excel file. Only necessary for xlwt,
        other writers support unicode natively.
    inf_rep : str, default 'inf'
        Representation for infinity (there is no native representation for
        infinity in Excel).
    verbose : bool, default True
        Display more information in the error logs.
    freeze_panes : tuple of int (length 2), optional
        Specifies the one-based bottommost row and rightmost column that
        is to be frozen.

    Notes
    -----
    Once a workbook has been saved it is not possible to write further data
    without rewriting the whole workbook.

    Examples
    --------
    Create, write to and save a workbook:

    >>> df1 = ks.DataFrame([['a', 'b'], ['c', 'd']],
    ...                    index=['row 1', 'row 2'],
    ...                    columns=['col 1', 'col 2'])
    >>> df1.to_excel("output.xlsx")  # doctest: +SKIP

    To specify the sheet name:

    >>> df1.to_excel("output.xlsx",
    ...              sheet_name='Sheet_name_1')  # doctest: +SKIP

    If you wish to write to more than one sheet in the workbook, it is
    necessary to specify an ExcelWriter object:

    >>> with pd.ExcelWriter('output.xlsx') as writer:  # doctest: +SKIP
    ...     df1.to_excel(writer, sheet_name='Sheet_name_1')
    ...     df2.to_excel(writer, sheet_name='Sheet_name_2')

    To set the library that is used to write the Excel file,
    you can pass the `engine` keyword (the default engine is
    automatically chosen depending on the file extension):

    >>> df1.to_excel('output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP
    """

    # Make sure locals() call is at the top of the function so we don't capture local variables.
    # At this point the snapshot holds only the call arguments (including ``self``).
    args = locals()
    kdf = self

    # The frame is converted with ``to_pandas()`` and handed over together with
    # pandas' own ``DataFrame.to_excel`` and the captured arguments.
    # NOTE(review): the helper presumably checks ``args`` against the pandas
    # signature before invoking it -- confirm against its definition.
    return validate_arguments_and_invoke_function(
        kdf.to_pandas(), self.to_excel, pd.DataFrame.to_excel, args)

@property
def loc(self):
    """Return a ``SparkDataFrameLocator`` constructed from this DataFrame."""
    locator = SparkDataFrameLocator(self)
    return locator
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ class _MissingPandasLikeDataFrame(object):
to_clipboard = unsupported_function('to_clipboard')
to_csv = unsupported_function('to_csv')
to_dense = unsupported_function('to_dense')
to_excel = unsupported_function('to_excel')
to_feather = unsupported_function('to_feather')
to_gbq = unsupported_function('to_gbq')
to_hdf = unsupported_function('to_hdf')
Expand Down
57 changes: 55 additions & 2 deletions databricks/koalas/tests/test_dataframe_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
import pandas as pd

from databricks import koalas
from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils, TestUtils


class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils):
class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils, TestUtils):

@property
def pdf(self):
Expand Down Expand Up @@ -71,6 +71,59 @@ def test_to_html(self):
got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2))
self.assert_eq(got, expected)

@staticmethod
def get_excel_dfs(koalas_location, pandas_location):
    """
    Load the two Excel files back into pandas DataFrames for comparison.

    Both files are read with their first column used as the index. The
    Koalas-written file is read first, then the pandas-written one.

    Parameters
    ----------
    koalas_location : str
        Path of the Excel file produced by the Koalas DataFrame.
    pandas_location : str
        Path of the Excel file produced by the pandas DataFrame.

    Returns
    -------
    dict
        ``{'got': <frame read from koalas_location>,
        'expected': <frame read from pandas_location>}``.
    """
    got = pd.read_excel(koalas_location, index_col=0)
    expected = pd.read_excel(pandas_location, index_col=0)
    return {'got': got, 'expected': expected}

def test_to_excel(self):
    """
    Check that Koalas' ``to_excel`` writes the same content as pandas'.

    Every scenario writes the Koalas frame and its pandas counterpart to two
    files in a temporary directory, reads both files back and asserts that
    the resulting frames are equal.
    """
    with self.temp_dir() as dirpath:
        pandas_location = dirpath + "/" + "output1.xlsx"
        koalas_location = dirpath + "/" + "output2.xlsx"

        def check_round_trip(kdf, pdf, **options):
            # Koalas writes first, then pandas, both with identical options;
            # the files are overwritten on every scenario.
            kdf.to_excel(koalas_location, **options)
            pdf.to_excel(pandas_location, **options)
            dataframes = self.get_excel_dfs(koalas_location, pandas_location)
            self.assert_eq(dataframes['got'], dataframes['expected'])

        # Scenario 1: the fixture frames with default options.
        pdf = self.pdf
        kdf = self.kdf
        check_round_trip(kdf, pdf)

        # Scenario 2: missing values rendered through ``na_rep``.
        pdf_with_missing = pd.DataFrame({
            'a': [1, None, 3],
            'b': ["one", "two", None],
        }, index=[0, 1, 3])
        kdf_with_missing = koalas.from_pandas(pdf_with_missing)
        check_round_trip(kdf_with_missing, pdf_with_missing, na_rep='null')

        # Scenarios 3-5: one float frame exercised with three option sets.
        pdf_floats = pd.DataFrame({
            'a': [1.0, 2.0, 3.0],
            'b': [4.0, 5.0, 6.0],
        }, index=[0, 1, 3])
        kdf_floats = koalas.from_pandas(pdf_floats)

        option_sets = (
            {'float_format': '%.1f'},
            {'header': False},
            {'index': False},
        )
        for options in option_sets:
            check_round_trip(kdf_floats, pdf_floats, **options)

def test_to_latex(self):
expected = self.strip_all_whitespace(r"""
\begin{tabular}{lrr}
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,4 @@ Serialization / IO / Conversion
DataFrame.to_spark
DataFrame.to_string
DataFrame.to_dict
DataFrame.to_excel
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ flake8
# Test
pytest
pytest-cov
openpyxl
xlrd

0 comments on commit df2106b

Please sign in to comment.