Skip to content

Commit

Permalink
DataFrame.to_json (databricks#238)
Browse files Browse the repository at this point in the history
add to_json(). Part of databricks#169
  • Loading branch information
garawalid authored and athena15 committed May 13, 2019
1 parent 4845250 commit 1c5d312
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 1 deletion.
124 changes: 124 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,130 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True,
return validate_arguments_and_invoke_function(
kdf.to_pandas(), self.to_string, pd.DataFrame.to_string, args)

def to_json(self, path_or_buf=None, orient=None, date_format=None,
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):
    """
    Convert the object to a JSON string.

    Note NaN's and None will be converted to null and datetime objects
    will be converted to UNIX timestamps.

    .. note:: This method should only be used if the resulting JSON is expected
        to be small, as all the data is loaded into the driver's memory.

    Parameters
    ----------
    path_or_buf : string or file handle, optional
        File path or object. If not specified, the result is returned as
        a string.
    orient : string
        Indication of expected JSON string format.

        * Series

          - default is 'index'
          - allowed values are: {'split','records','index','table'}

        * DataFrame

          - default is 'columns'
          - allowed values are:
            {'split','records','index','columns','values','table'}

        * The format of the JSON string

          - 'split' : dict like {'index' -> [index],
            'columns' -> [columns], 'data' -> [values]}
          - 'records' : list like
            [{column -> value}, ... , {column -> value}]
          - 'index' : dict like {index -> {column -> value}}
          - 'columns' : dict like {column -> {index -> value}}
          - 'values' : just the values array
          - 'table' : dict like {'schema': {schema}, 'data': {data}}
            describing the data, and the data component is
            like ``orient='records'``.
    date_format : {None, 'epoch', 'iso'}
        Type of date conversion. 'epoch' = epoch milliseconds,
        'iso' = ISO8601. The default depends on the `orient`. For
        ``orient='table'``, the default is 'iso'. For all other orients,
        the default is 'epoch'.
    double_precision : int, default 10
        The number of decimal places to use when encoding
        floating point values.
    force_ascii : bool, default True
        Force encoded string to be ASCII.
    date_unit : string, default 'ms' (milliseconds)
        The time unit to encode to, governs timestamp and ISO8601
        precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
        microsecond, and nanosecond respectively.
    default_handler : callable, default None
        Handler to call if object cannot otherwise be converted to a
        suitable format for JSON. Should receive a single argument which is
        the object to convert and return a serialisable object.
    lines : bool, default False
        If 'orient' is 'records' write out line delimited json format. Will
        throw ValueError if incorrect 'orient' since others are not list
        like.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        A string representing the compression to use in the output file,
        only used when the first argument is a filename. By default, the
        compression is inferred from the filename.
    index : bool, default True
        Whether to include the index values in the JSON string. Not
        including the index (``index=False``) is only supported when
        orient is 'split' or 'table'.

    Returns
    -------
    None or str
        The result of ``pandas.DataFrame.to_json`` on the collected data:
        the JSON string if `path_or_buf` is None, otherwise None (the
        output is written to `path_or_buf`).

    Examples
    --------
    >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])
    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]}'

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'

    Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:

    >>> df.to_json(orient='columns')
    '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'

    Encoding/decoding a Dataframe using ``'values'`` formatted JSON:

    >>> df.to_json(orient='values')
    '[["a","b"],["c","d"]]'

    Encoding with Table Schema

    >>> df.to_json(orient='table')  # doctest: +SKIP
    '{"schema": {"fields":[{"name":"index","type":"string"},\
{"name":"col 1","type":"string"},\
{"name":"col 2","type":"string"}],\
"primaryKey":["index"],\
"pandas_version":"0.20.0"}, \
"data": [{"index":"row 1","col 1":"a","col 2":"b"},\
{"index":"row 2","col 1":"c","col 2":"d"}]}'
    """
    # Make sure locals() call is at the top of the function so we don't capture local variables.
    # `args` snapshots exactly the caller-supplied parameters; the helper forwards
    # them to pandas' implementation after collecting the data to the driver.
    args = locals()
    kdf = self
    return validate_arguments_and_invoke_function(
        kdf.to_pandas(), self.to_json, pd.DataFrame.to_json, args)

def to_dict(self, orient='dict', into=dict):
"""
Convert the DataFrame to a dictionary.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ class _MissingPandasLikeDataFrame(object):
to_feather = unsupported_function('to_feather')
to_gbq = unsupported_function('to_gbq')
to_hdf = unsupported_function('to_hdf')
to_json = unsupported_function('to_json')
to_msgpack = unsupported_function('to_msgpack')
to_panel = unsupported_function('to_panel')
to_parquet = unsupported_function('to_parquet')
Expand Down
15 changes: 15 additions & 0 deletions databricks/koalas/tests/test_dataframe_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ def test_to_excel(self):
dataframes = self.get_excel_dfs(koalas_location, pandas_location)
self.assert_eq(dataframes['got'], dataframes['expected'])

def test_to_json(self):
    pdf = self.pdf
    kdf = koalas.from_pandas(pdf)

    # Each entry is one keyword-argument shape to exercise; Koalas output
    # must match pandas output byte-for-byte for every shape.
    call_shapes = [
        dict(),
        dict(orient='split'),
        dict(orient='records'),
        dict(orient='index'),
        dict(orient='values'),
        dict(orient='table'),
        dict(orient='records', lines=True),
        dict(orient='split', index=False),
    ]
    for kwargs in call_shapes:
        self.assert_eq(kdf.to_json(**kwargs), pdf.to_json(**kwargs))

def test_to_latex(self):
expected = self.strip_all_whitespace(r"""
\begin{tabular}{lrr}
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,6 @@ Serialization / IO / Conversion
DataFrame.to_koalas
DataFrame.to_spark
DataFrame.to_string
DataFrame.to_json
DataFrame.to_dict
DataFrame.to_excel

0 comments on commit 1c5d312

Please sign in to comment.