Skip to content

Commit

Permalink
DataFrame.to_csv (databricks#239)
Browse files (browse the repository at this point in the history)
Implementation of to_csv(). Part of databricks#169
  • Loading branch information
garawalid authored and athena15 committed May 13, 2019
1 parent b749824 commit 4845250
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 1 deletion.
99 changes: 99 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,105 @@ def to_dict(self, orient='dict', into=dict):
return validate_arguments_and_invoke_function(
kdf.to_pandas(), self.to_dict, pd.DataFrame.to_dict, args)

def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
           columns=None, header=True, index=True, index_label=None,
           mode='w', encoding=None, compression='infer', quoting=None,
           quotechar='"', line_terminator="\n", chunksize=None,
           tupleize_cols=None, date_format=None, doublequote=True,
           escapechar=None, decimal='.'):
    """
    Write object to a comma-separated values (csv) file.

    .. note:: This method should only be used if the resulting CSV is expected
        to be small, as all the data is loaded into the driver's memory.

    Parameters
    ----------
    path_or_buf : str or file handle, default None
        File path or object, if None is provided the result is returned as
        a string. If a file object is passed it should be opened with
        `newline=''`, disabling universal newlines.
    sep : str, default ','
        String of length 1. Field delimiter for the output file.
    na_rep : str, default ''
        Missing data representation.
    float_format : str, default None
        Format string for floating point numbers.
    columns : sequence, optional
        Columns to write.
    header : bool or list of str, default True
        Write out the column names. If a list of strings is given it is
        assumed to be aliases for the column names.
    index : bool, default True
        Write row names (index).
    index_label : str or sequence, or False, default None
        Column label for index column(s) if desired. If None is given, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the object uses MultiIndex. If
        False do not print fields for index names. Use index_label=False
        for easier importing in R.
    mode : str
        Python write mode, default 'w'.
    encoding : str, optional
        A string representing the encoding to use in the output file,
        defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
    compression : str, default 'infer'
        Compression mode among the following possible values: {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf`
        is path-like, then detect compression from the following
        extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no
        compression).
    quoting : optional constant from csv module
        Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
        then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
        will treat them as non-numeric.
    quotechar : str, default '\"'
        String of length 1. Character used to quote fields.
    line_terminator : string, default '\\n'
        The newline character or character sequence to use in the output
        file, e.g. '\\n' for linux or '\\r\\n' for Windows.
    chunksize : int or None
        Rows to write at a time.
    tupleize_cols : bool, default None
        Write MultiIndex columns as a list of tuples (if True) or in
        the new, expanded format, where each MultiIndex column is a row
        in the CSV (if False).
    date_format : str, default None
        Format string for datetime objects.
    doublequote : bool, default True
        Control quoting of `quotechar` inside a field.
    escapechar : str, default None
        String of length 1. Character used to escape `sep` and `quotechar`
        when appropriate.
    decimal : str, default '.'
        Character recognized as decimal separator. E.g. use ',' for
        European data.

    Returns
    -------
    None or str
        If path_or_buf is None, returns the resulting csv format as a
        string. Otherwise returns None.

    Examples
    --------
    >>> df = ks.DataFrame({'name': ['Raphael', 'Donatello'],
    ...                    'mask': ['red', 'purple'],
    ...                    'weapon': ['sai', 'bo staff']},
    ...                   columns=['name', 'mask', 'weapon'])
    >>> df.to_csv(index=False)
    'name,mask,weapon\\nRaphael,red,sai\\nDonatello,purple,bo staff\\n'
    """

    # locals() must be the first statement of the body: at this point the
    # mapping holds only `self` and the call arguments, with no other local
    # variables mixed in.
    args = locals()

    # The whole frame is converted to pandas first (see the note in the
    # docstring), then the call is delegated to pandas' own to_csv.
    # NOTE(review): validate_arguments_and_invoke_function is defined outside
    # this block; presumably it checks `args` against the pandas signature
    # before invoking it -- confirm against the helper.
    return validate_arguments_and_invoke_function(
        self.to_pandas(), self.to_csv, pd.DataFrame.to_csv, args)

def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=True,
na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True,
bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None,
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ class _MissingPandasLikeDataFrame(object):
tail = unsupported_function('tail')
take = unsupported_function('take')
to_clipboard = unsupported_function('to_clipboard')
to_csv = unsupported_function('to_csv')
to_dense = unsupported_function('to_dense')
to_feather = unsupported_function('to_feather')
to_gbq = unsupported_function('to_gbq')
Expand Down
27 changes: 27 additions & 0 deletions databricks/koalas/tests/test_dataframe_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import string

import numpy as np
import pandas as pd

from databricks import koalas
Expand All @@ -40,6 +41,32 @@ def strip_all_whitespace(str):
"""A helper function to remove all whitespace from a string."""
return str.translate({ord(c): None for c in string.whitespace})

def test_csv(self):
    """Check that Koalas' to_csv() output equals pandas' for the same frame.

    Covers the default call on the fixture frames, `na_rep` on a frame with
    missing values, and `float_format`, `header=False` and `index=False` on
    an all-float frame.
    """
    # Default arguments, using the frames supplied by the test class.
    expected_frame = self.pdf
    actual_frame = self.kdf
    self.assert_eq(actual_frame.to_csv(), expected_frame.to_csv())

    # Missing values in both a numeric and a string column.
    pdf_missing = pd.DataFrame(
        {
            'a': [1, np.nan, 3],
            'b': ["one", "two", None],
        },
        index=[0, 1, 3])
    kdf_missing = koalas.from_pandas(pdf_missing)
    self.assert_eq(kdf_missing.to_csv(na_rep='null'),
                   pdf_missing.to_csv(na_rep='null'))

    # Float-only frame for the formatting / header / index options.
    pdf_floats = pd.DataFrame(
        {
            'a': [1.0, 2.0, 3.0],
            'b': [4.0, 5.0, 6.0],
        },
        index=[0, 1, 3])
    kdf_floats = koalas.from_pandas(pdf_floats)

    option_sets = (
        {'float_format': '%.1f'},
        {'header': False},
        {'index': False},
    )
    for options in option_sets:
        self.assert_eq(kdf_floats.to_csv(**options), pdf_floats.to_csv(**options))

def test_to_html(self):
expected = self.strip_all_whitespace("""
<table border="1" class="dataframe">
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ Serialization / IO / Conversion
.. autosummary::
:toctree: api/

DataFrame.to_csv
DataFrame.to_pandas
DataFrame.to_html
DataFrame.to_numpy
Expand Down

0 comments on commit 4845250

Please sign in to comment.