Skip to content

Commit

Permalink
Feat/infinte row (#328)
Browse files Browse the repository at this point in the history
1.  Adds DFViewerInfinite and BuckarooInfiniteWidget.  These lazily load data, 50 rows at a time.  This makes the table much more responsive and uses less browser memory.  It also avoids serializing the whole DataFrame up front, which matters because encoding JSON is very slow in Python.
2. Removes display sampling, since data is now loaded lazily.  Also removes the "displayed" status bar item, since all rows are available by default.
3. Sorting works across all rows, not just the sampled displayed set.
4. Upgrades AG-Grid from 31.0.3 to  32.3.2
5. Histograms are no longer animated; they were re-animating on each new lazy data load.
6. Convenience method for creating DOMWidget models and views
7. Fixes #255 Display "No Rows to show" appears as a cell in summary stats
  • Loading branch information
paddymul authored Nov 13, 2024
1 parent 346c564 commit c49ac56
Show file tree
Hide file tree
Showing 30 changed files with 11,693 additions and 276 deletions.
160 changes: 155 additions & 5 deletions buckaroo/buckaroo_widget.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
TODO: Add module docstring
"""

import traceback
from ipywidgets import DOMWidget
import json
import pandas as pd
Expand All @@ -23,9 +24,9 @@
from .pluggable_analysis_framework.analysis_management import DfStats
from .pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis

from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df
from .dataflow.dataflow import CustomizableDataflow, StylingAnalysis, exception_protect
from .dataflow.dataflow_extras import (Sampling)
from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj
from .dataflow.dataflow import CustomizableDataflow, StylingAnalysis
from .dataflow.dataflow_extras import (Sampling, exception_protect, merge_column_config)
from .dataflow.autocleaning import PandasAutocleaning


Expand Down Expand Up @@ -242,12 +243,13 @@ class RawDFViewerWidget(BuckarooProjectWidget):
"""

#### DOMWidget Boilerplate
# _model_name = Unicode('InfiniteViewerModel').tag(sync=True)
# _view_name = Unicode('InfiniteViewerView').tag(sync=True)
_model_name = Unicode('DFViewerModel').tag(sync=True)
_view_name = Unicode('DFViewerView').tag(sync=True)
#_model_id = Unicode('paddy').tag(sync=True)
#END DOMWidget Boilerplate



df_data = List([
{'a': 5 , 'b':20, 'c': 'Paddy'},
{'a': 58.2, 'b': 9, 'c': 'Margaret'}]).tag(sync=True)
Expand All @@ -269,4 +271,152 @@ class RawDFViewerWidget(BuckarooProjectWidget):
{ 'index': 'mean', 'a': 28, 'b': 14, 'c': 'Padarget' },
{ 'index': 'dtype', 'a': 'float64', 'b': 'int64', 'c': 'object' }]).tag(sync=True)

"""
interface PayloadArgs {
sourceName: string;
start: number;
end: number
}
interface PayloadResponse {
key: PayloadArgs;
data: DFData;
}
"""

class InfiniteViewerWidget(BuckarooProjectWidget):
    """Bare DFViewer widget that hands out DataFrame rows one window at a time.

    A very raw way of instantiating just the DFViewer; not meant for use by
    end users. Instead use DFViewer or PolarsDFViewer, which have better
    convenience methods.

    A row window is requested by assigning ``payloadArgs`` (``start``/``end``
    plus optional ``sort``/``sort_direction``); the matching rows are written
    to ``payloadResponse``. Both traits are tagged ``sync=True``.
    """

    #### DOMWidget Boilerplate
    _model_name = Unicode('InfiniteViewerModel').tag(sync=True)
    _view_name = Unicode('InfiniteViewerView').tag(sync=True)
    #END DOMWidget Boilerplate

    def __init__(self, df):
        """Keep ``df`` as the frame that every row request is sliced from."""
        super().__init__()
        self.df = df

    # Request trait: the row window (and optional sort) currently wanted.
    payloadArgs = Dict({'sourceName':'[]', 'start':0, 'end':50}).tag(sync=True)
    # Response trait: echoes the request under 'key' and carries the rows
    # under 'data'.
    payloadResponse = Dict({'key': {'sourceName':'[]', 'start':0, 'end':49},
                            'data': []}
                           ).tag(sync=True)

    @observe('payloadArgs')
    def _payloadArgsHandler(self, change):
        """Answer a row request by publishing the requested slice.

        Reads ``start``/``end`` from ``payloadArgs``. When a ``sort`` column
        is given, the whole frame is sorted first (ascending only when
        ``sort_direction`` is ``'asc'``) so the slice reflects the ordering
        of all rows, not just the rows in the window.
        """
        args = self.payloadArgs
        start, end = args['start'], args['end']

        sort_col = args.get('sort')
        if sort_col:
            ascending = args.get('sort_direction') == 'asc'
            source_df = self.df.sort_values(by=[sort_col], ascending=ascending)
        else:
            source_df = self.df

        self.payloadResponse = {'key': args, 'data': pd_to_obj(source_df[start:end])}



class InfinitePdSampling(PdSampling):
    """PdSampling variant with the serialize limit disabled."""

    # -1 turns off the "rows shown" indication in the UI
    serialize_limit = -1

class BuckarooInfiniteWidget(BuckarooWidget):
    """BuckarooWidget variant that serves table rows lazily.

    Extends CustomizableDataFlow and DOMWidget.
    Replaces generic options in CustomizableDataFlow with Pandas implementations.
    Also adds buckaroo_state object and communication to simpler
    CustomizableDataFlow implementations.

    The 'main' entry of ``df_data_dict`` is always published empty. Rows are
    delivered in windows through the ``payload_args`` / ``payload_response``
    trait pair instead.
    """

    #### DOMWidget Boilerplate
    _model_name = Unicode('BuckarooInfiniteWidgetModel').tag(sync=True)
    _view_name = Unicode('BuckarooInfiniteWidgetView').tag(sync=True)
    #END DOMWidget Boilerplate

    sampling_klass = InfinitePdSampling

    #final processing block
    @observe('widget_args_tuple')
    def _handle_widget_change(self, change):
        """Put together df_data_dict and df_display_args for the frontend.

        Does nothing while the processed DataFrame in ``widget_args_tuple``
        is still ``None``.
        """
        _unused, processed_df, merged_sd = self.widget_args_tuple
        if processed_df is None:
            return

        # df_data_dict is still hardcoded for now
        # eventually processed_df will be able to add or alter values of df_data_dict
        # correlation would be added, filtered would probably be altered

        # to expedite processing maybe future provided dfs from
        # postprocessing could default to empty until that is
        # selected, optionally

        # note: 'main' needs to be empty so that rows are fetched lazily
        # through payload_args / payload_response
        self.df_data_dict = {'main': [],
                             'all_stats': self._sd_to_jsondf(merged_sd),
                             'empty': []}

        temp_display_args = {}
        for display_name, A_Klass in self.df_display_klasses.items():
            df_viewer_config = A_Klass.style_columns(merged_sd)
            # layer the user supplied overrides on top of the styled columns
            df_viewer_config['column_config'] = merge_column_config(
                df_viewer_config['column_config'], self.column_config_overrides)
            temp_display_args[display_name] = {
                'data_key': A_Klass.data_key,
                'df_viewer_config': df_viewer_config,
                'summary_stats_key': A_Klass.summary_stats_key}

        # 'main' is only looked up when there is something to attach to it
        if self.pinned_rows is not None:
            temp_display_args['main']['df_viewer_config']['pinned_rows'] = self.pinned_rows
        if self.extra_grid_config:
            temp_display_args['main']['df_viewer_config']['extra_grid_config'] = self.extra_grid_config
        if self.component_config:
            temp_display_args['main']['df_viewer_config']['component_config'] = self.component_config

        self.df_display_args = temp_display_args

    # Request trait: the row window (and optional sort) currently wanted.
    payload_args = Dict({'sourceName':'unused', 'start':0, 'end':50}).tag(sync=True)
    # Response trait: echoes the request under 'key' and carries the rows
    # under 'data'.
    payload_response = Dict({'key': {'sourceName':'unused', 'start':0, 'end':49},
                             'data': []}
                            ).tag(sync=True)

    @observe('payload_args')
    def _payload_args_handler(self, change):
        """Answer a row request with a slice of the processed DataFrame.

        Reads ``start``/``end`` from ``payload_args``. When a ``sort`` column
        is given, the whole frame is sorted first (ascending only when
        ``sort_direction`` is ``'asc'``), so sorting applies across all rows.
        The response carries the slice under ``'data'`` and the total row
        count under ``'length'``.

        On failure an empty response with the formatted traceback under
        ``'error_info'`` is published, and the exception is re-raised.
        """
        args = self.payload_args
        start, end = args['start'], args['end']
        _unused, processed_df, _merged_sd = self.widget_args_tuple
        if processed_df is None:
            return

        try:
            sort_col = args.get('sort')
            if sort_col:
                ascending = args.get('sort_direction') == 'asc'
                source_df = processed_df.sort_values(by=[sort_col], ascending=ascending)
            else:
                source_df = processed_df
            slice_df = pd_to_obj(source_df[start:end])
            self.payload_response = {'key': args, 'data': slice_df, 'length': len(source_df)}
        except Exception:
            # publish the traceback in the response before propagating
            stack_trace = traceback.format_exc()
            self.payload_response = {'key': args, 'data': [], 'error_info': stack_trace, 'length': 0}
            raise

    def _df_to_obj(self, df:pd.DataFrame):
        """Serialize ``df`` with ``pd_to_obj``."""
        return pd_to_obj(df)

13 changes: 4 additions & 9 deletions buckaroo/widget_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import traceback
from .buckaroo_widget import BuckarooWidget
from .buckaroo_widget import BuckarooInfiniteWidget
import pandas as pd
from datetime import datetime as dtdt
import os
Expand All @@ -20,13 +20,8 @@ def is_in_ipython():
return ip


def enable(sampled=True,
summaryStats=False,
reorderdColumns=False,
showCommands=False,
auto_clean=False,
postProcessingF=None,
debug=False
def enable(buckaroo_kls=BuckarooInfiniteWidget,
debug=False,
):
"""
Automatically use buckaroo to display all DataFrames
Expand Down Expand Up @@ -63,7 +58,7 @@ def enable(sampled=True,
def _display_as_buckaroo(df):
from IPython.display import display
try:
bw = BuckarooWidget(df, debug=debug)
bw = buckaroo_kls(df, debug=debug)
return display(bw)
except:
if debug:
Expand Down
38 changes: 24 additions & 14 deletions docs/example-notebooks/Extending-pandas.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,19 @@
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": null,
"id": "4",
"metadata": {},
"outputs": [],
"source": [
"bw.df_display_args"
]
},
{
"cell_type": "markdown",
"id": "5",
"metadata": {},
"source": [
"# Using the Pluggable Analysis Framework\n",
"\n",
Expand All @@ -87,7 +97,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "5",
"id": "6",
"metadata": {
"tags": []
},
Expand All @@ -114,7 +124,7 @@
},
{
"cell_type": "markdown",
"id": "6",
"id": "7",
"metadata": {},
"source": [
"# Adding a styling analysis\n",
Expand All @@ -133,7 +143,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "7",
"id": "8",
"metadata": {
"tags": []
},
Expand Down Expand Up @@ -194,15 +204,15 @@
},
{
"cell_type": "markdown",
"id": "8",
"id": "9",
"metadata": {},
"source": [
"Let's look at pinned_rows; they can be modified by setting `pinned_rows` on Buckaroo instantiation"
]
},
{
"cell_type": "markdown",
"id": "9",
"id": "10",
"metadata": {},
"source": [
"# lets add a post processing method"
Expand All @@ -211,7 +221,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "10",
"id": "11",
"metadata": {
"tags": []
},
Expand All @@ -227,7 +237,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "11",
"id": "12",
"metadata": {
"tags": []
},
Expand Down Expand Up @@ -276,7 +286,7 @@
},
{
"cell_type": "markdown",
"id": "12",
"id": "13",
"metadata": {},
"source": [
"## Where to use PostProcessing\n",
Expand Down Expand Up @@ -315,7 +325,7 @@
},
{
"cell_type": "markdown",
"id": "13",
"id": "14",
"metadata": {},
"source": [
"# Putting it all together\n",
Expand All @@ -326,7 +336,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "14",
"id": "15",
"metadata": {
"tags": []
},
Expand Down Expand Up @@ -355,7 +365,7 @@
},
{
"cell_type": "markdown",
"id": "15",
"id": "16",
"metadata": {},
"source": [
"# Why aren't there click handlers?\n",
Expand All @@ -367,7 +377,7 @@
},
{
"cell_type": "markdown",
"id": "16",
"id": "17",
"metadata": {},
"source": [
"# What about autocleaning and the low code UI\n",
Expand All @@ -385,7 +395,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "17",
"id": "18",
"metadata": {},
"outputs": [],
"source": []
Expand Down
Binary file modified docs/example-notebooks/citibike-trips-2016-04.parq
Binary file not shown.
11 changes: 10 additions & 1 deletion docs/examples/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import { Button } from 'react-bootstrap';

import './app.css';


const examples = {
WidgetDCFCellEx: {
title: 'WigetDCFCell',
Expand All @@ -31,8 +32,14 @@ const examples = {
DFViewerEx_large: {
title: 'DFViewer large',
file: 'DFViewerEx_large',
layout: 'HBox',
layout: 'VBox',
},
DFViewerInfiniteEx_large: {
title: 'DFViewerInfinite large',
file: 'DFViewerInfiniteEx_large',
layout: 'VBox',
},

DFViewerEx_real_summary: {
title: 'DFViewer summary',
file: 'DFViewerEx_real_summary',
Expand All @@ -45,6 +52,8 @@ const examples = {
},

StatusBarEx: { title: 'StatusBar', file: 'StatusBarEx', layout: 'VBox' },
InfiniteEx: { title: 'Infinite Example', file: 'InfiniteEx', layout: 'VBox' },

HistogramEx: { title: 'Histogram', file: 'HistogramEx', layout: 'HBox' },
};

Expand Down
Loading

0 comments on commit c49ac56

Please sign in to comment.