diff --git a/lib/kino/explorer.ex b/lib/kino/explorer.ex index 5d82396..68496cf 100644 --- a/lib/kino/explorer.ex +++ b/lib/kino/explorer.ex @@ -17,40 +17,31 @@ defmodule Kino.Explorer do @type t :: Kino.JS.Live.t() + @date_types [ + :date, + {:datetime, :nanosecond}, + {:datetime, :microsecond}, + {:datetime, :millisecond} + ] + + @legacy_numeric_types [:float, :integer] + @doc """ Creates a new kino displaying a given data frame or series. """ @spec new(DataFrame.t() | Series.t(), keyword()) :: t() def new(data, opts \\ []) - # TODO: remove the fallback once we require Kino v0.11.0 - if Code.ensure_loaded?(Kino.Table) and function_exported?(Kino.Table, :new, 3) do - def new(%DataFrame{} = df, opts) do - name = Keyword.get(opts, :name, "DataFrame") - Kino.Table.new(__MODULE__, {df, name}, export: fn state -> {"text", inspect(state.df)} end) - end - - def new(%Series{} = s, opts) do - name = Keyword.get(opts, :name, "Series") - column_name = name |> String.replace(" ", "_") |> String.downcase() |> String.to_atom() - df = DataFrame.new([{column_name, s}]) - - Kino.Table.new(__MODULE__, {df, name}, - export: fn state -> {"text", inspect(state.df[0])} end - ) - end - else - def new(%DataFrame{} = df, opts) do - name = Keyword.get(opts, :name, "DataFrame") - Kino.Table.new(__MODULE__, {df, name}) - end + def new(%DataFrame{} = df, opts) do + name = Keyword.get(opts, :name, "DataFrame") + Kino.Table.new(__MODULE__, {df, name}, export: fn state -> {"text", inspect(state.df)} end) + end - def new(%Series{} = s, opts) do - name = Keyword.get(opts, :name, "Series") - column_name = name |> String.replace(" ", "_") |> String.downcase() |> String.to_atom() - df = DataFrame.new([{column_name, s}]) - Kino.Table.new(__MODULE__, {df, name}) - end + def new(%Series{} = s, opts) do + name = Keyword.get(opts, :name, "Series") + column_name = name |> String.replace(" ", "_") |> String.downcase() |> String.to_atom() + df = DataFrame.new([{column_name, s}]) + Kino.Table.new(__MODULE__, {df, name}, export: fn state -> {"text", inspect(state.df[0])} end) end @impl true @@ -59,35 +50,8 @@ defmodule Kino.Explorer do groups = df.groups df = DataFrame.ungroup(df) total_rows = if !lazy, do: DataFrame.n_rows(df) - dtypes = DataFrame.dtypes(df) - sample_data = df |> DataFrame.head(1) |> DataFrame.collect() |> DataFrame.to_columns() - summaries = if !lazy, do: summaries(df, groups) - name = if lazy, do: "Lazy - #{name}", else: name - - columns = - for name <- df.names, dtype = Map.fetch!(dtypes, name) do - %{ - key: name, - label: to_string(name), - type: type_of(dtype, sample_data[name]), - summary: summaries[name] - } - end - - has_list_column? = Enum.any?(columns, fn x -> x.type == "list" end) - - export = - if has_list_column? do - %{formats: ["NDJSON", "Parquet"]} - else - %{formats: ["CSV", "NDJSON", "Parquet"]} - end - - info = %{ - name: name, - features: [:export, :pagination, :sorting], - export: export - } + columns = columns(df, lazy, groups) + info = info(columns, lazy, name) {:ok, info, %{df: df, total_rows: total_rows, columns: columns, groups: groups}} end @@ -116,6 +80,29 @@ defmodule Kino.Explorer do %{data: data, extension: ".parquet", type: "application/x-parquet"} end + defp columns(df, lazy, groups) do + dtypes = DataFrame.dtypes(df) + sample_data = df |> DataFrame.head(1) |> DataFrame.collect() |> DataFrame.to_columns() + summaries = if !lazy, do: summaries(df, groups) + + for name <- df.names, dtype = Map.fetch!(dtypes, name) do + %{ + key: name, + label: to_string(name), + type: type_of(dtype, sample_data[name]), + summary: summaries[name] + } + end + end + + defp info(columns, lazy, name) do + name = if lazy, do: "Lazy - #{name}", else: name + has_list_column? = Enum.any?(columns, fn x -> x.type == "list" end) + formats = if has_list_column?, do: ["NDJSON", "Parquet"], else: ["CSV", "NDJSON", "Parquet"] + + %{name: name, features: [:export, :pagination, :sorting], export: %{formats: formats}} + end + defp get_records(%{df: df, groups: groups}, rows_spec) do lazy = lazy?(df) df = order_by(df, rows_spec[:order]) @@ -143,56 +130,51 @@ defmodule Kino.Explorer do if String.printable?(value, inspect_opts.limit), do: value, else: inspect(value) end - defp value_to_string("list", value) do - inspect(value) - end - - defp value_to_string(_type, value) do - to_string(value) - end + defp value_to_string("list", value), do: inspect(value) + defp value_to_string(_type, value), do: to_string(value) defp summaries(df, groups) do df_series = DataFrame.to_series(df) has_groups = length(groups) > 0 - # hacky way to provide backward compatibility for {:list, numeric} error - # https://github.com/elixir-explorer/explorer/issues/787 - exp_ver_0_7_2_gte? = Explorer.Shared.dtypes() |> Enum.member?({:s, 8}) for {column, series} <- df_series, - summary_type = summary_type(series), + type = if(numeric_type?(Series.dtype(series)), do: :numeric, else: :categorical), grouped = (column in groups) |> to_string(), nulls = Series.nil_count(series) |> to_string(), into: %{} do - cond do - summary_type == :numeric -> - mean = Series.mean(series) - mean = if is_float(mean), do: Float.round(mean, 2) |> to_string(), else: to_string(mean) - min = Series.min(series) |> to_string() - max = Series.max(series) |> to_string() - keys = ["min", "max", "mean", "nulls"] - values = [min, max, mean, nulls] - - keys = if has_groups, do: keys ++ ["grouped"], else: keys - values = if has_groups, do: values ++ [grouped], else: values - - {column, %{keys: keys, values: values}} - - summary_type == :categorical and compute_summaries?(series, exp_ver_0_7_2_gte?) -> - %{"counts" => top_freq, "values" => top} = most_frequent(series) - top_freq = top_freq |> List.first() |> to_string() - top = List.first(top) |> to_string() - unique = count_unique(series) - keys = ["unique", "top", "top freq", "nulls"] - values = [unique, top, top_freq, nulls] - - keys = if has_groups, do: keys ++ ["grouped"], else: keys - values = if has_groups, do: values ++ [grouped], else: values - - {column, %{keys: keys, values: values}} - - true -> - {column, %{keys: [], values: []}} - end + build_summary(type, column, series, has_groups, grouped, nulls) + end + end + + defp build_summary(:numeric, column, series, has_groups, grouped, nulls) do + mean = Series.mean(series) + mean = if is_float(mean), do: Float.round(mean, 2) |> to_string(), else: to_string(mean) + min = Series.min(series) |> to_string() + max = Series.max(series) |> to_string() + keys = ["min", "max", "mean", "nulls"] + values = [min, max, mean, nulls] + + keys = if has_groups, do: keys ++ ["grouped"], else: keys + values = if has_groups, do: values ++ [grouped], else: values + + {column, %{keys: keys, values: values}} + end + + defp build_summary(:categorical, column, series, has_groups, grouped, nulls) do + if compute_summaries?(series) do + %{"counts" => top_freq, "values" => top} = most_frequent(series) + top_freq = top_freq |> List.first() |> to_string() + top = List.first(top) |> to_string() + unique = series |> Series.distinct() |> Series.count() |> to_string() + keys = ["unique", "top", "top freq", "nulls"] + values = [unique, top, top_freq, nulls] + + keys = if has_groups, do: keys ++ ["grouped"], else: keys + values = if has_groups, do: values ++ [grouped], else: values + + {column, %{keys: keys, values: values}} + else + {column, %{keys: [], values: []}} end end @@ -205,33 +187,22 @@ defmodule Kino.Explorer do |> DataFrame.to_columns() end - defp compute_summaries?(series, exp_ver_0_7_2_gte?) do + defp compute_summaries?(series) do + # hacky way to provide backward compatibility for {:list, numeric} error + # https://github.com/elixir-explorer/explorer/issues/787 + # TODO: remove the check once we require Explorer v0.8 + exp_ver_0_7_2_gt? = Explorer.Shared.dtypes() |> Enum.member?({:s, 8}) + case Series.dtype(series) do {:list, dtype} -> - exp_ver_0_7_2_gte? && numeric_type?(dtype) + exp_ver_0_7_2_gt? && numeric_type?(dtype) _ -> true end end - defp summary_type(data) do - if numeric_type?(Series.dtype(data)), do: :numeric, else: :categorical - end - - defp count_unique(data) do - data |> Series.distinct() |> Series.count() |> to_string() - end - - defp type_of(dtype, _) - when dtype in [ - :date, - {:datetime, :nanosecond}, - {:datetime, :microsecond}, - {:datetime, :millisecond} - ], - do: "date" - + defp type_of(dtype, _) when dtype in @date_types, do: "date" defp type_of(:boolean, _), do: "boolean" defp type_of(:string, [data]), do: type_of_sample(data) defp type_of(:binary, _), do: "binary" @@ -245,7 +216,7 @@ defmodule Kino.Explorer do defp numeric_type?({:u, _}), do: true defp numeric_type?({:f, _}), do: true # For backwards compatibility - defp numeric_type?(other), do: other in [:float, :integer] + defp numeric_type?(other), do: other in @legacy_numeric_types defp lazy?(%DataFrame{data: %struct{}}), do: struct.lazy() == struct end diff --git a/lib/kino_explorer/data_transform_cell.ex b/lib/kino_explorer/data_transform_cell.ex index ad02868..c40cc49 100644 --- a/lib/kino_explorer/data_transform_cell.ex +++ b/lib/kino_explorer/data_transform_cell.ex @@ -855,7 +855,7 @@ defmodule KinoExplorer.DataTransformCell do data_options = case df do nil -> nil - %DataFrame{} -> DataFrame.dtypes(df) |> normalize_dtypes() + %DataFrame{} -> build_data_options(df) _ -> maybe_data_options(df) end @@ -888,10 +888,7 @@ defmodule KinoExplorer.DataTransformCell do |> Code.eval_string(binding) |> elem(0) - data_options = - DataFrame.dtypes(df) - |> normalize_dtypes() - |> Map.reject(fn {_k, v} -> v == "list" end) + data_options = build_data_options(df) Map.put(operation, "data_options", data_options) |> maybe_update_datalist(df) @@ -975,10 +972,14 @@ defmodule KinoExplorer.DataTransformCell do defp maybe_data_options(df) do try do - df |> DataFrame.new() |> DataFrame.dtypes() |> normalize_dtypes() + df |> DataFrame.new() |> build_data_options() rescue _ -> nil end end + + defp build_data_options(df) do + df |> DataFrame.dtypes() |> normalize_dtypes() |> Map.reject(fn {_k, v} -> v == "list" end) + end end diff --git a/test/kino/explorer_test.exs b/test/kino/explorer_test.exs index cc3c3c1..e65c693 100644 --- a/test/kino/explorer_test.exs +++ b/test/kino/explorer_test.exs @@ -168,10 +168,7 @@ defmodule Kino.ExplorerTest do end test "support data summary for all nils" do - df = - Explorer.DataFrame.new(%{ - id: [nil, nil, nil, nil] - }) + df = Explorer.DataFrame.new(%{id: [nil, nil, nil, nil]}) widget = Kino.Explorer.new(df) data = connect(widget) @@ -193,6 +190,21 @@ defmodule Kino.ExplorerTest do } = data end + test "does not compute summary for unsupported lists" do + df = Explorer.DataFrame.new(%{list: Explorer.Series.from_list([[1, 2], [1]])}) + + widget = Kino.Explorer.new(df) + data = connect(widget) + + assert %{ + content: %{ + columns: [ + %{key: "0", label: "list", summary: %{keys: [], values: []}, type: "list"} + ] + } + } = data + end + test "shows if a column is in a group when there are groups" do df = Explorer.DataFrame.new(%{ @@ -255,15 +267,21 @@ defmodule Kino.ExplorerTest do test "supports types" do df = Explorer.DataFrame.new( - a: ["a", "b"], - b: [1, 2], - c: ["https://elixir-lang.org", "https://www.erlang.org"] + [ + a: ["a", "b"], + b: [1, 2], + c: ["https://elixir-lang.org", "https://www.erlang.org"], + d: [<<110, 120>>, <<200, 210>>], + e: [[1, 2], [3, 4]] + ], + dtypes: [d: :binary] ) widget = Kino.Explorer.new(df) data = connect(widget) + types = ["text", "number", "uri", "binary", "list"] - assert get_in(data.content.columns, [Access.all(), :type]) == ["text", "number", "uri"] + assert get_in(data.content.columns, [Access.all(), :type]) == types end test "correctly handles empty data frames with string columns" do @@ -289,7 +307,7 @@ defmodule Kino.ExplorerTest do } = data end - test "correctly data frames with binary non-utf8 column values" do + test "correctly handles data frames with binary non-utf8 column values" do df = Explorer.DataFrame.new([x: [1, 2], y: [<<110, 120>>, <<200, 210>>]], dtypes: [y: :binary]) @@ -357,6 +375,7 @@ defmodule Kino.ExplorerTest do data = connect(widget) assert %{ + export: %{formats: ["CSV", "NDJSON", "Parquet"]}, features: [:export, :pagination, :sorting], content: %{ page: 1, @@ -381,6 +400,7 @@ defmodule Kino.ExplorerTest do data = connect(widget) assert %{ + export: %{formats: ["CSV", "NDJSON", "Parquet"]}, features: [:export, :pagination, :sorting], content: %{ page: 1, @@ -417,4 +437,19 @@ defmodule Kino.ExplorerTest do assert %{extension: ^extension} = exported end end + + test "export to for data frames with list-type columns" do + df = Explorer.DataFrame.new(%{list: Explorer.Series.from_list([[1, 2], [1]])}) + + widget = Kino.Explorer.new(df) + data = connect(widget) + + assert %{export: %{formats: ["NDJSON", "Parquet"]}} = data + + for format <- ["NDJSON", "Parquet"] do + exported = Kino.Explorer.export_data(%{df: df}, format) + extension = ".#{String.downcase(format)}" + assert %{extension: ^extension} = exported + end + end end diff --git a/test/kino_explorer/data_transform_cell_test.exs b/test/kino_explorer/data_transform_cell_test.exs index 4564178..698a5a0 100644 --- a/test/kino_explorer/data_transform_cell_test.exs +++ b/test/kino_explorer/data_transform_cell_test.exs @@ -150,6 +150,51 @@ defmodule KinoExplorer.DataTransformCellTest do }) end + test "removes list-type columns from data options" do + {kino, _source} = start_smart_cell!(DataTransformCell, %{}) + + df = + Explorer.DataFrame.new( + [ + a: ["a", "b"], + b: [1, 2], + c: ["https://elixir-lang.org", "https://www.erlang.org"], + d: [<<110, 120>>, <<200, 210>>], + e: [[1, 2], [3, 4]] + ], + dtypes: [d: :binary] + ) + + env = Code.env_for_eval([]) + DataTransformCell.scan_binding(kino.pid, binding(), env) + + data_frame_variables = %{"df" => true} + + assert_broadcast_event(kino, "set_available_data", %{ + "data_frame_variables" => ^data_frame_variables, + "fields" => %{ + operations: [ + %{ + "active" => true, + "column" => nil, + "data_options" => %{ + "a" => "string", + "b" => "integer", + "c" => "string", + "d" => "binary" + }, + "datalist" => [], + "filter" => nil, + "operation_type" => "filters", + "type" => "string", + "value" => nil + } + ], + root_fields: %{"assign_to" => nil, "data_frame" => "df"} + } + }) + end + describe "code generation" do test "source for a data frame without operations" do attrs = build_attrs(%{})