DataFrame.to_csv (databricks#239)

Implementation of to_csv(). Part of databricks#169
athena15 · May 13, 2019 · 4845250 · 4845250
1 parent b749824
commit 4845250
Show file tree

Hide file tree

Showing 4 changed files with 127 additions and 1 deletion.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -571,6 +571,105 @@ def to_dict(self, orient='dict', into=dict):
         return validate_arguments_and_invoke_function(
             kdf.to_pandas(), self.to_dict, pd.DataFrame.to_dict, args)
 
+    def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
+               columns=None, header=True, index=True, index_label=None,
+               mode='w', encoding=None, compression='infer', quoting=None,
+               quotechar='"', line_terminator="\n", chunksize=None,
+               tupleize_cols=None, date_format=None, doublequote=True,
+               escapechar=None, decimal='.'):
+        """
+        Write object to a comma-separated values (csv) file.
+
+        .. note:: This method should only be used if the resulting CSV is expected
+            to be small, as all the data is loaded into the driver's memory.
+
+        Parameters
+        ----------
+        path_or_buf : str or file handle, default None
+            File path or object. If None is provided, the result is returned
+            as a string. If a file object is passed, it should be opened with
+            `newline=''`, disabling universal newlines.
+
+        sep : str, default ','
+            String of length 1. Field delimiter for the output file.
+        na_rep : str, default ''
+            Missing data representation.
+        float_format : str, default None
+            Format string for floating point numbers.
+        columns : sequence, optional
+            Columns to write.
+        header : bool or list of str, default True
+            Write out the column names. If a list of strings is given it is
+            assumed to be aliases for the column names.
+        index : bool, default True
+            Write row names (index).
+        index_label : str or sequence, or False, default None
+            Column label for index column(s) if desired. If None is given, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the object uses MultiIndex. If
+            False do not print fields for index names. Use index_label=False
+            for easier importing in R.
+        mode : str, default 'w'
+            Python write mode.
+        encoding : str, optional
+            A string representing the encoding to use in the output file,
+            defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
+        compression : str, default 'infer'
+            Compression mode among the following possible values: {'infer',
+            'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
+            is path-like, then detect compression from the following
+            extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
+            compression).
+        quoting : optional constant from csv module
+            Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
+            then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
+            will treat them as non-numeric.
+        quotechar : str, default '\"'
+            String of length 1. Character used to quote fields.
+        line_terminator : str, default '\\n'
+            The newline character or character sequence to use in the output
+            file. The signature default here is '\\n' on every OS; it is not
+            derived from `os.linesep`. Pass '\\r\\n' for Windows-style endings.
+        chunksize : int or None
+            Rows to write at a time.
+        tupleize_cols : bool, optional
+            Write MultiIndex columns as a list of tuples (if True) or in
+            the new, expanded format, where each MultiIndex column is a row
+            in the CSV (if False).
+        date_format : str, default None
+            Format string for datetime objects.
+        doublequote : bool, default True
+            Control quoting of `quotechar` inside a field.
+        escapechar : str, default None
+            String of length 1. Character used to escape `sep` and `quotechar`
+            when appropriate.
+        decimal : str, default '.'
+            Character recognized as decimal separator. E.g. use ',' for
+            European data.
+
+        Returns
+        -------
+        None or str
+            If path_or_buf is None, returns the resulting csv format as a
+            string. Otherwise returns None.
+
+        Examples
+        --------
+        >>> df = ks.DataFrame({'name': ['Raphael', 'Donatello'],
+        ...                    'mask': ['red', 'purple'],
+        ...                    'weapon': ['sai', 'bo staff']},
+        ...                     columns=['name', 'mask', 'weapon'])
+        >>> df.to_csv(index=False)
+        'name,mask,weapon\\nRaphael,red,sai\\nDonatello,purple,bo staff\\n'
+        """
+
+        # Make sure locals() call is at the top of the function so we don't capture local variables.
+        args = locals()
+        kdf = self
+
+        return validate_arguments_and_invoke_function(
+            kdf.to_pandas(), self.to_csv, pd.DataFrame.to_csv, args)
+
     def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=True,
                  na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True,
                  bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None,

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -176,7 +176,6 @@ class _MissingPandasLikeDataFrame(object):
     tail = unsupported_function('tail')
     take = unsupported_function('take')
     to_clipboard = unsupported_function('to_clipboard')
-    to_csv = unsupported_function('to_csv')
     to_dense = unsupported_function('to_dense')
     to_feather = unsupported_function('to_feather')
     to_gbq = unsupported_function('to_gbq')

diff --git a/databricks/koalas/tests/test_dataframe_conversion.py b/databricks/koalas/tests/test_dataframe_conversion.py
@@ -16,6 +16,7 @@
 
 import string
 
+import numpy as np
 import pandas as pd
 
 from databricks import koalas
@@ -40,6 +41,32 @@ def strip_all_whitespace(str):
         """A helper function to remove all whitespace from a string."""
         return str.translate({ord(c): None for c in string.whitespace})
 
+    def test_csv(self):
+        """Check that Koalas to_csv output matches pandas for several options."""
+        pdf = self.pdf
+        kdf = self.kdf
+        self.assert_eq(kdf.to_csv(), pdf.to_csv())
+
+        # Missing values (NaN and None) must be rendered through na_rep.
+        pdf = pd.DataFrame({
+            'a': [1, np.nan, 3],
+            'b': ["one", "two", None],
+        }, index=[0, 1, 3])
+
+        kdf = koalas.from_pandas(pdf)
+        self.assert_eq(kdf.to_csv(na_rep='null'), pdf.to_csv(na_rep='null'))
+
+        # All-float frame: exercise float_format, header and index options.
+        pdf = pd.DataFrame({
+            'a': [1.0, 2.0, 3.0],
+            'b': [4.0, 5.0, 6.0],
+        }, index=[0, 1, 3])
+        kdf = koalas.from_pandas(pdf)
+
+        self.assert_eq(kdf.to_csv(float_format='%.1f'), pdf.to_csv(float_format='%.1f'))
+        self.assert_eq(kdf.to_csv(header=False), pdf.to_csv(header=False))
+        self.assert_eq(kdf.to_csv(index=False), pdf.to_csv(index=False))
+
     def test_to_html(self):
         expected = self.strip_all_whitespace("""
             <table border="1" class="dataframe">

diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -117,6 +117,7 @@ Serialization / IO / Conversion
 .. autosummary::
    :toctree: api/
 
+   DataFrame.to_csv
    DataFrame.to_pandas
    DataFrame.to_html
    DataFrame.to_numpy