DataFrame.to_excel (databricks#288)

* DataFrame.to_excel * Lint fix * Lint fix * Lint fix * Added install of openpyxl as excel engine * Comparing the values of generated excel files * Added xlrd as dependency * Added xlrd as dependency * Refactored the code * Moving the libraries to test suite * Used TestUtils functions for creating and removing files and folder * Lint fix * Lint fix * Added precautionary note in inline doc of DataFrame.to_excel * Using temp_dir function of TestUtils for creating temporary folder * Removing redundant library * Removing redundant library
athena15 · May 13, 2019 · df2106b · df2106b
1 parent 968d26d
commit df2106b
Show file tree

Hide file tree

Showing 5 changed files with 163 additions and 3 deletions.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -1040,6 +1040,111 @@ def assign(self, **kwargs):
                            [name for name, _ in pairs if name not in self._metadata.column_fields]))
         return DataFrame(sdf, metadata)
 
+    def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", float_format=None,
+                 columns=None, header=True, index=True, index_label=None, startrow=0,
+                 startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep="inf",
+                 verbose=True, freeze_panes=None):
+        """
+        Write object to an Excel sheet.
+
+        .. note:: This method should only be used if the resulting DataFrame is expected
+                  to be small, as all the data is loaded into the driver's memory.
+
+        To write a single object to an Excel .xlsx file it is only necessary to
+        specify a target file name. To write to multiple sheets it is necessary to
+        create an `ExcelWriter` object with a target file name, and specify a sheet
+        in the file to write to.
+
+        Multiple sheets may be written to by specifying unique `sheet_name`.
+        With all data written to the file it is necessary to save the changes.
+        Note that creating an `ExcelWriter` object with a file name that already
+        exists will result in the contents of the existing file being erased.
+
+        Parameters
+        ----------
+        excel_writer : str or ExcelWriter object
+            File path or existing ExcelWriter.
+        sheet_name : str, default 'Sheet1'
+            Name of sheet which will contain DataFrame.
+        na_rep : str, default ''
+            Missing data representation.
+        float_format : str, optional
+            Format string for floating point numbers. For example
+            ``float_format="%.2f"`` will format 0.1234 to 0.12.
+        columns : sequence or list of str, optional
+            Columns to write.
+        header : bool or list of str, default True
+            Write out the column names. If a list of string is given it is
+            assumed to be aliases for the column names.
+        index : bool, default True
+            Write row names (index).
+        index_label : str or sequence, optional
+            Column label for index column(s) if desired. If not specified, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the DataFrame uses MultiIndex.
+        startrow : int, default 0
+            Upper left cell row to dump data frame.
+        startcol : int, default 0
+            Upper left cell column to dump data frame.
+        engine : str, optional
+            Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
+            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
+            ``io.excel.xlsm.writer``.
+        merge_cells : bool, default True
+            Write MultiIndex and Hierarchical Rows as merged cells.
+        encoding : str, optional
+            Encoding of the resulting excel file. Only necessary for xlwt,
+            other writers support unicode natively.
+        inf_rep : str, default 'inf'
+            Representation for infinity (there is no native representation for
+            infinity in Excel).
+        verbose : bool, default True
+            Display more information in the error logs.
+        freeze_panes : tuple of int (length 2), optional
+            Specifies the one-based bottommost row and rightmost column that
+            is to be frozen.
+
+        Notes
+        -----
+        Once a workbook has been saved it is not possible to write further data
+        without rewriting the whole workbook.
+
+        Examples
+        --------
+        Create, write to and save a workbook:
+
+        >>> df1 = ks.DataFrame([['a', 'b'], ['c', 'd']],
+        ...                    index=['row 1', 'row 2'],
+        ...                    columns=['col 1', 'col 2'])
+        >>> df1.to_excel("output.xlsx")  # doctest: +SKIP
+
+        To specify the sheet name:
+
+        >>> df1.to_excel("output.xlsx",
+        ...              sheet_name='Sheet_name_1')  # doctest: +SKIP
+
+        If you wish to write to more than one sheet in the workbook, it is
+        necessary to specify an ExcelWriter object:
+
+        >>> with pd.ExcelWriter('output.xlsx') as writer:  # doctest: +SKIP
+        ...      df1.to_excel(writer, sheet_name='Sheet_name_1')
+        ...      df2.to_excel(writer, sheet_name='Sheet_name_2')
+
+        To set the library that is used to write the Excel file,
+        you can pass the `engine` keyword (the default engine is
+        automatically chosen depending on the file extension):
+
+        >>> df1.to_excel('output1.xlsx', engine='xlsxwriter')  # doctest: +SKIP
+        """
+
+        # locals() must be the first statement: at this point it holds only the
+        # call arguments (plus self), so no helper locals leak into `args`.
+        args = locals()
+        kdf = self
+
+        # Collect the data to the driver as a pandas DataFrame and delegate to
+        # pandas' to_excel. The helper is given both the Koalas and the pandas
+        # function together with the captured arguments (presumably to validate
+        # them against the pandas signature before invoking -- see its definition).
+        return validate_arguments_and_invoke_function(
+            kdf.to_pandas(), self.to_excel, pd.DataFrame.to_excel, args)
+
     @property
     def loc(self):
         return SparkDataFrameLocator(self)

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -178,7 +178,6 @@ class _MissingPandasLikeDataFrame(object):
     to_clipboard = unsupported_function('to_clipboard')
     to_csv = unsupported_function('to_csv')
     to_dense = unsupported_function('to_dense')
-    to_excel = unsupported_function('to_excel')
     to_feather = unsupported_function('to_feather')
     to_gbq = unsupported_function('to_gbq')
     to_hdf = unsupported_function('to_hdf')

diff --git a/databricks/koalas/tests/test_dataframe_conversion.py b/databricks/koalas/tests/test_dataframe_conversion.py
@@ -19,10 +19,10 @@
 import pandas as pd
 
 from databricks import koalas
-from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
+from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils, TestUtils
 
 
-class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils):
+class DataFrameConversionTest(ReusedSQLTestCase, SQLTestUtils, TestUtils):
 
     @property
     def pdf(self):
@@ -71,6 +71,59 @@ def test_to_html(self):
         got = self.strip_all_whitespace(self.kdf.to_html(max_rows=2))
         self.assert_eq(got, expected)
 
+    @staticmethod
+    def get_excel_dfs(koalas_location, pandas_location):
+        """
+        Read back the two workbooks written by a ``to_excel`` comparison.
+
+        Both files are loaded with pandas, using the first sheet column as the index.
+
+        Parameters
+        ----------
+        koalas_location : str
+            Path of the workbook written through Koalas.
+        pandas_location : str
+            Path of the workbook written through pandas.
+
+        Returns
+        -------
+        dict
+            ``{'got': <frame read from koalas_location>,
+            'expected': <frame read from pandas_location>}``.
+        """
+        # Read the Koalas output first, then the pandas reference.
+        got = pd.read_excel(koalas_location, index_col=0)
+        expected = pd.read_excel(pandas_location, index_col=0)
+        return {'got': got, 'expected': expected}
+
+    def test_to_excel(self):
+        """Koalas' ``to_excel`` must produce the same sheet as pandas' for each option."""
+        with self.temp_dir() as workdir:
+            pandas_location = workdir + "/" + "output1.xlsx"
+            koalas_location = workdir + "/" + "output2.xlsx"
+
+            def check_same_output(pandas_df, koalas_df, **options):
+                # Write through Koalas first, then pandas, read both files back
+                # and compare the resulting frames.
+                koalas_df.to_excel(koalas_location, **options)
+                pandas_df.to_excel(pandas_location, **options)
+                frames = self.get_excel_dfs(koalas_location, pandas_location)
+                self.assert_eq(frames['got'], frames['expected'])
+
+            # Default options on the shared fixture frames.
+            check_same_output(self.pdf, self.kdf)
+
+            # Missing values rendered through ``na_rep``.
+            with_nulls = pd.DataFrame({
+                'a': [1, None, 3],
+                'b': ["one", "two", None],
+            }, index=[0, 1, 3])
+            check_same_output(with_nulls, koalas.from_pandas(with_nulls), na_rep='null')
+
+            # Float frame reused for the float_format / header / index variations.
+            floats = pd.DataFrame({
+                'a': [1.0, 2.0, 3.0],
+                'b': [4.0, 5.0, 6.0],
+            }, index=[0, 1, 3])
+            koalas_floats = koalas.from_pandas(floats)
+
+            check_same_output(floats, koalas_floats, float_format='%.1f')
+            check_same_output(floats, koalas_floats, header=False)
+            check_same_output(floats, koalas_floats, index=False)
+
     def test_to_latex(self):
         expected = self.strip_all_whitespace(r"""
             \begin{tabular}{lrr}

diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -124,3 +124,4 @@ Serialization / IO / Conversion
    DataFrame.to_spark
    DataFrame.to_string
    DataFrame.to_dict
+   DataFrame.to_excel
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -15,3 +15,5 @@ flake8
 # Test
 pytest
 pytest-cov
+openpyxl
+xlrd