From 1c5d312f9f8171fba3ad59ae7c19fc266405afe0 Mon Sep 17 00:00:00 2001 From: Walid Gara <20029252+garawalid@users.noreply.github.com> Date: Sat, 11 May 2019 15:08:19 +0200 Subject: [PATCH] DataFrame.to_json (#238) add to_json(). Part of #169 --- databricks/koalas/frame.py | 124 ++++++++++++++++++ databricks/koalas/missing/frame.py | 1 - .../koalas/tests/test_dataframe_conversion.py | 15 +++ docs/source/reference/frame.rst | 1 + 4 files changed, 140 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 1a94c2dc65..a5fa5f0b28 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -477,6 +477,130 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, return validate_arguments_and_invoke_function( kdf.to_pandas(), self.to_string, pd.DataFrame.to_string, args) + def to_json(self, path_or_buf=None, orient=None, date_format=None, + double_precision=10, force_ascii=True, date_unit='ms', + default_handler=None, lines=False, compression='infer', + index=True): + """ + Convert the object to a JSON string. + + Note NaN's and None will be converted to null and datetime objects + will be converted to UNIX timestamps. + + .. note:: This method should only be used if the resulting JSON is expected + to be small, as all the data is loaded into the driver's memory. + + Parameters + ---------- + path_or_buf : string or file handle, optional + File path or object. If not specified, the result is returned as + a string. + orient : string + Indication of expected JSON string format. + + * Series + + - default is 'index' + - allowed values are: {'split','records','index','table'} + + * DataFrame + + - default is 'columns' + - allowed values are: + {'split','records','index','columns','values','table'} + + * The format of the JSON string + + - 'split' : dict like {'index' -> [index], + 'columns' -> [columns], 'data' -> [values]} + - 'records' : list like + [{column -> value}, ... 
, {column -> value}] + - 'index' : dict like {index -> {column -> value}} + - 'columns' : dict like {column -> {index -> value}} + - 'values' : just the values array + - 'table' : dict like {'schema': {schema}, 'data': {data}} + describing the data, and the data component is + like ``orient='records'``. + date_format : {None, 'epoch', 'iso'} + Type of date conversion. 'epoch' = epoch milliseconds, + 'iso' = ISO8601. The default depends on the `orient`. For + ``orient='table'``, the default is 'iso'. For all other orients, + the default is 'epoch'. + double_precision : int, default 10 + The number of decimal places to use when encoding + floating point values. + force_ascii : bool, default True + Force encoded string to be ASCII. + date_unit : string, default 'ms' (milliseconds) + The time unit to encode to, governs timestamp and ISO8601 + precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, + microsecond, and nanosecond respectively. + default_handler : callable, default None + Handler to call if object cannot otherwise be converted to a + suitable format for JSON. Should receive a single argument which is + the object to convert and return a serialisable object. + lines : bool, default False + If 'orient' is 'records', write out line-delimited JSON format. + Raises ValueError for any other 'orient', since the other formats + are not list-like. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} + A string representing the compression to use in the output file, + only used when the first argument is a filename. By default, the + compression is inferred from the filename. + index : bool, default True + Whether to include the index values in the JSON string. Not + including the index (``index=False``) is only supported when + orient is 'split' or 'table'. + + Examples + -------- + + >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... 
columns=['col 1', 'col 2']) + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"],\ +"index":["row 1","row 2"],\ +"data":[["a","b"],["c","d"]]}' + + Encoding/decoding a DataFrame using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + + Encoding/decoding a DataFrame using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + + Encoding/decoding a DataFrame using ``'columns'`` formatted JSON: + + >>> df.to_json(orient='columns') + '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}' + + Encoding/decoding a DataFrame using ``'values'`` formatted JSON: + + >>> df.to_json(orient='values') + '[["a","b"],["c","d"]]' + + Encoding with Table Schema: + + >>> df.to_json(orient='table') # doctest: +SKIP + '{"schema": {"fields":[{"name":"index","type":"string"},\ +{"name":"col 1","type":"string"},\ +{"name":"col 2","type":"string"}],\ +"primaryKey":["index"],\ +"pandas_version":"0.20.0"}, \ +"data": [{"index":"row 1","col 1":"a","col 2":"b"},\ +{"index":"row 2","col 1":"c","col 2":"d"}]}' + """ + # Make sure locals() call is at the top of the function so we don't capture local variables. + args = locals() + kdf = self + return validate_arguments_and_invoke_function( + kdf.to_pandas(), self.to_json, pd.DataFrame.to_json, args) + def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. 
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py index 66abff1285..c9699f9628 100644 --- a/databricks/koalas/missing/frame.py +++ b/databricks/koalas/missing/frame.py @@ -180,7 +180,6 @@ class _MissingPandasLikeDataFrame(object): to_feather = unsupported_function('to_feather') to_gbq = unsupported_function('to_gbq') to_hdf = unsupported_function('to_hdf') - to_json = unsupported_function('to_json') to_msgpack = unsupported_function('to_msgpack') to_panel = unsupported_function('to_panel') to_parquet = unsupported_function('to_parquet') diff --git a/databricks/koalas/tests/test_dataframe_conversion.py b/databricks/koalas/tests/test_dataframe_conversion.py index 3706e26e9c..8fb3c3879c 100644 --- a/databricks/koalas/tests/test_dataframe_conversion.py +++ b/databricks/koalas/tests/test_dataframe_conversion.py @@ -151,6 +151,21 @@ def test_to_excel(self): dataframes = self.get_excel_dfs(koalas_location, pandas_location) self.assert_eq(dataframes['got'], dataframes['expected']) + def test_to_json(self): + pdf = self.pdf + kdf = koalas.from_pandas(pdf) + + self.assert_eq(kdf.to_json(), pdf.to_json()) + self.assert_eq(kdf.to_json(orient='split'), pdf.to_json(orient='split')) + self.assert_eq(kdf.to_json(orient='records'), pdf.to_json(orient='records')) + self.assert_eq(kdf.to_json(orient='index'), pdf.to_json(orient='index')) + self.assert_eq(kdf.to_json(orient='values'), pdf.to_json(orient='values')) + self.assert_eq(kdf.to_json(orient='table'), pdf.to_json(orient='table')) + self.assert_eq(kdf.to_json(orient='records', lines=True), + pdf.to_json(orient='records', lines=True)) + self.assert_eq(kdf.to_json(orient='split', index=False), + pdf.to_json(orient='split', index=False)) + def test_to_latex(self): expected = self.strip_all_whitespace(r""" \begin{tabular}{lrr} diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst index cd737bbb7f..c6d5ec0f03 100644 --- a/docs/source/reference/frame.rst +++ 
b/docs/source/reference/frame.rst @@ -124,5 +124,6 @@ Serialization / IO / Conversion DataFrame.to_koalas DataFrame.to_spark DataFrame.to_string + DataFrame.to_json DataFrame.to_dict DataFrame.to_excel