Skip to content

Commit

Permalink
DataFrame.to_json (databricks#238)
Browse files Browse the repository at this point in the history
add to_json(). Part of databricks#169
  • Loading branch information
garawalid authored and athena15 committed May 13, 2019
1 parent 4845250 commit 1c5d312
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 1 deletion.
124 changes: 124 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,130 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True,
return validate_arguments_and_invoke_function(
kdf.to_pandas(), self.to_string, pd.DataFrame.to_string, args)

def to_json(self, path_or_buf=None, orient=None, date_format=None,
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):
    """
    Convert the object to a JSON string.

    Note NaN's and None will be converted to null and datetime objects
    will be converted to UNIX timestamps.

    .. note:: This method should only be used if the resulting JSON is expected
        to be small, as all the data is loaded into the driver's memory.

    Parameters
    ----------
    path_or_buf : string or file handle, optional
        File path or object. If not specified, the result is returned as
        a string.
    orient : string
        Indication of expected JSON string format.

        * Series

          - default is 'index'
          - allowed values are: {'split','records','index','table'}

        * DataFrame

          - default is 'columns'
          - allowed values are:
            {'split','records','index','columns','values','table'}

        * The format of the JSON string

          - 'split' : dict like {'index' -> [index],
            'columns' -> [columns], 'data' -> [values]}
          - 'records' : list like
            [{column -> value}, ... , {column -> value}]
          - 'index' : dict like {index -> {column -> value}}
          - 'columns' : dict like {column -> {index -> value}}
          - 'values' : just the values array
          - 'table' : dict like {'schema': {schema}, 'data': {data}}
            describing the data, and the data component is
            like ``orient='records'``.
    date_format : {None, 'epoch', 'iso'}
        Type of date conversion. 'epoch' = epoch milliseconds,
        'iso' = ISO8601. The default depends on the `orient`. For
        ``orient='table'``, the default is 'iso'. For all other orients,
        the default is 'epoch'.
    double_precision : int, default 10
        The number of decimal places to use when encoding
        floating point values.
    force_ascii : bool, default True
        Force encoded string to be ASCII.
    date_unit : string, default 'ms' (milliseconds)
        The time unit to encode to, governs timestamp and ISO8601
        precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
        microsecond, and nanosecond respectively.
    default_handler : callable, default None
        Handler to call if object cannot otherwise be converted to a
        suitable format for JSON. Should receive a single argument which is
        the object to convert and return a serialisable object.
    lines : bool, default False
        If 'orient' is 'records' write out line delimited json format. Will
        throw ValueError if incorrect 'orient' since others are not list
        like.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        A string representing the compression to use in the output file,
        only used when the first argument is a filename. By default, the
        compression is inferred from the filename.
    index : bool, default True
        Whether to include the index values in the JSON string. Not
        including the index (``index=False``) is only supported when
        orient is 'split' or 'table'.

    Returns
    -------
    None or str
        The result of ``pandas.DataFrame.to_json`` on the collected data:
        the JSON string if `path_or_buf` is None, otherwise None (the
        output is written to `path_or_buf`).

    Examples
    --------
    >>> df = ks.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])
    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],\
"index":["row 1","row 2"],\
"data":[["a","b"],["c","d"]]}'

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'

    Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:

    >>> df.to_json(orient='columns')
    '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'

    Encoding/decoding a Dataframe using ``'values'`` formatted JSON:

    >>> df.to_json(orient='values')
    '[["a","b"],["c","d"]]'

    Encoding with Table Schema

    >>> df.to_json(orient='table')  # doctest: +SKIP
    '{"schema": {"fields":[{"name":"index","type":"string"},\
{"name":"col 1","type":"string"},\
{"name":"col 2","type":"string"}],\
"primaryKey":["index"],\
"pandas_version":"0.20.0"}, \
"data": [{"index":"row 1","col 1":"a","col 2":"b"},\
{"index":"row 2","col 1":"c","col 2":"d"}]}'
    """
    # Make sure locals() call is at the top of the function so we don't capture local variables.
    # `args` snapshots exactly the caller-supplied parameters; the helper forwards
    # them to pandas' implementation after collecting the data to the driver.
    args = locals()
    kdf = self
    return validate_arguments_and_invoke_function(
        kdf.to_pandas(), self.to_json, pd.DataFrame.to_json, args)

def to_dict(self, orient='dict', into=dict):
"""
Convert the DataFrame to a dictionary.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ class _MissingPandasLikeDataFrame(object):
to_feather = unsupported_function('to_feather')
to_gbq = unsupported_function('to_gbq')
to_hdf = unsupported_function('to_hdf')
to_json = unsupported_function('to_json')
to_msgpack = unsupported_function('to_msgpack')
to_panel = unsupported_function('to_panel')
to_parquet = unsupported_function('to_parquet')
Expand Down
15 changes: 15 additions & 0 deletions databricks/koalas/tests/test_dataframe_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ def test_to_excel(self):
dataframes = self.get_excel_dfs(koalas_location, pandas_location)
self.assert_eq(dataframes['got'], dataframes['expected'])

def test_to_json(self):
    pdf = self.pdf
    kdf = koalas.from_pandas(pdf)

    # Each entry is one keyword-argument shape to exercise; Koalas output
    # must match pandas output byte-for-byte for every shape.
    call_shapes = [
        dict(),
        dict(orient='split'),
        dict(orient='records'),
        dict(orient='index'),
        dict(orient='values'),
        dict(orient='table'),
        dict(orient='records', lines=True),
        dict(orient='split', index=False),
    ]
    for kwargs in call_shapes:
        self.assert_eq(kdf.to_json(**kwargs), pdf.to_json(**kwargs))

def test_to_latex(self):
expected = self.strip_all_whitespace(r"""
\begin{tabular}{lrr}
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,6 @@ Serialization / IO / Conversion
DataFrame.to_koalas
DataFrame.to_spark
DataFrame.to_string
DataFrame.to_json
DataFrame.to_dict
DataFrame.to_excel

0 comments on commit 1c5d312

Please sign in to comment.