Feat/infinite row (#328)

1. Adds DFViewerInfinite and BuckarooWidgetInfinite. These lazily load data, 50 rows at a time. This makes the table much more responsive and uses less browser memory. Encoding JSON is very slow in Python.
2. Removes display sampling, since data is loaded lazily. Also removes the "displayed" status bar item, since all rows are available by default.
3. Sorting works across all rows, not just the sampled displayed set.
4. Upgrades AG-Grid from 31.0.3 to 32.3.2.
5. Histograms are no longer animated; they were re-animating on each new lazy data load.
6. Adds a convenience method for creating DOMWidget models and views.
7. Fixes #255: Display "No Rows to show" appears as a cell in summary stats.
paddymul · Nov 13, 2024 · c49ac56 · c49ac56
1 parent 346c564
commit c49ac56
Show file tree

Hide file tree

Showing 30 changed files with 11,693 additions and 276 deletions.
diff --git a/buckaroo/buckaroo_widget.py b/buckaroo/buckaroo_widget.py
@@ -8,6 +8,7 @@
 TODO: Add module docstring
 """
 
+import traceback
 from ipywidgets import DOMWidget
 import json
 import pandas as pd
@@ -23,9 +24,9 @@
 from .pluggable_analysis_framework.analysis_management import DfStats
 from .pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis
 
-from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df
-from .dataflow.dataflow import CustomizableDataflow, StylingAnalysis, exception_protect
-from .dataflow.dataflow_extras import (Sampling)
+from .serialization_utils import EMPTY_DF_WHOLE, check_and_fix_df, pd_to_obj
+from .dataflow.dataflow import CustomizableDataflow, StylingAnalysis
+from .dataflow.dataflow_extras import (Sampling, exception_protect, merge_column_config)
 from .dataflow.autocleaning import PandasAutocleaning
 
 
@@ -242,12 +243,13 @@ class RawDFViewerWidget(BuckarooProjectWidget):
     """
 
     #### DOMWidget Boilerplate
+    # _model_name = Unicode('InfiniteViewerModel').tag(sync=True)
+    # _view_name = Unicode('InfiniteViewerView').tag(sync=True)
     _model_name = Unicode('DFViewerModel').tag(sync=True)
     _view_name = Unicode('DFViewerView').tag(sync=True)
+    #_model_id =  Unicode('paddy').tag(sync=True)
     #END DOMWidget Boilerplate
 
-
-
     df_data = List([
         {'a':  5  , 'b':20, 'c': 'Paddy'},
         {'a': 58.2, 'b': 9, 'c': 'Margaret'}]).tag(sync=True)
@@ -269,4 +271,152 @@ class RawDFViewerWidget(BuckarooProjectWidget):
         { 'index': 'mean',  'a':      28,   'b':      14, 'c': 'Padarget' },
         { 'index': 'dtype', 'a': 'float64', 'b': 'int64', 'c': 'object' }]).tag(sync=True)
 
+"""
+interface PayloadArgs {
+    sourceName: string;
+    start: number;
+    end: number
+}
+interface PayloadResponse {
+    key: PayloadArgs;
+    data: DFData;
+}
+"""
+
+class InfiniteViewerWidget(BuckarooProjectWidget):
+    """
+
+    A very raw way of instaniating just the DFViewer, not meant for use by enduers
+
+    instead use DFViewer, or PolarsDFViewer which have better convience methods
+    """
+
+    #### DOMWidget Boilerplate
+    # _model_name = Unicode('InfiniteViewerModel').tag(sync=True)
+    # _view_name = Unicode('InfiniteViewerView').tag(sync=True)
+    _model_name = Unicode('InfiniteViewerModel').tag(sync=True)
+    _view_name = Unicode('InfiniteViewerView').tag(sync=True)
+    #END DOMWidget Boilerplate
+
+
+    def __init__(self, df):
+        super().__init__()
+        print("InfiniteViewerWidget 231")
+        self.df = df
+
+    payloadArgs = Dict({'sourceName':'[]', 'start':0, 'end':50}).tag(sync=True)
+    payloadResponse = Dict({'key': {'sourceName':'[]', 'start':0, 'end':49},
+                            'data': []}
+                            ).tag(sync=True)
+
+    #    @exception_protect('payloadArgsHandler')    
+    @observe('payloadArgs')
+    def _payloadArgsHandler(self, change):
+        start, end = self.payloadArgs['start'], self.payloadArgs['end']
+        print(self.payloadArgs)
+        if self.payloadArgs.get('sort'):
+            sort_dir = self.payloadArgs.get('sort_direction')
+            ascending = sort_dir == 'asc'
+            slice_df = pd_to_obj(self.df.sort_values(by=[self.payloadArgs.get('sort')], ascending=ascending)[start:end])
+        else:
+            slice_df = pd_to_obj(self.df[start:end])
+        self.payloadResponse = {'key':self.payloadArgs, 'data':slice_df}
+
+
+
+class InfinitePdSampling(PdSampling):
+    serialize_limit = -1 #this turns off rows shown in the UI
+
+class BuckarooInfiniteWidget(BuckarooWidget):
+    """Extends CustomizableDataFlow and DOMWIdget
+
+    Replaces generic options in CustomizableDataFlow with Pandas implementations
+    Also adds buckaroo_state object and communication to simpler CustomizableDataFlow implementations
+    
+    """
+
+    #### DOMWidget Boilerplate
+    _model_name = Unicode('BuckarooInfiniteWidgetModel').tag(sync=True)
+    _view_name =  Unicode('BuckarooInfiniteWidgetView').tag(sync=True)
+    #END DOMWidget Boilerplate
+
+    sampling_klass = InfinitePdSampling
+    #final processing block
+    @observe('widget_args_tuple')
+    def _handle_widget_change(self, change):
+        """
+       put together df_dict for consumption by the frontend
+        """
+        _unused, processed_df, merged_sd = self.widget_args_tuple
+        if processed_df is None:
+            return
+
+        # df_data_dict is still hardcoded for now
+        # eventually processed_df will be able to add or alter values of df_data_dict
+        # correlation would be added, filtered would probably be altered
+
+        # to expedite processing maybe future provided dfs from
+        # postprcoessing could default to empty until that is
+        # selected, optionally
+
+        #note this needs to be empty so that we can do the infinite stuff
+        self.df_data_dict = {'main': [],
+                             'all_stats': self._sd_to_jsondf(merged_sd),
+                             'empty': []}
+
+        temp_display_args = {}
+        for display_name, A_Klass in self.df_display_klasses.items():
+            df_viewer_config = A_Klass.style_columns(merged_sd)
+            base_column_config = df_viewer_config['column_config']
+            df_viewer_config['column_config'] =  merge_column_config(
+                base_column_config, self.column_config_overrides)
+            disp_arg = {'data_key': A_Klass.data_key,
+                        #'df_viewer_config': json.loads(json.dumps(df_viewer_config)),
+                        'df_viewer_config': df_viewer_config,
+                        'summary_stats_key': A_Klass.summary_stats_key}
+            temp_display_args[display_name] = disp_arg
+
+        if self.pinned_rows is not None:
+            temp_display_args['main']['df_viewer_config']['pinned_rows'] = self.pinned_rows
+        if self.extra_grid_config:
+            temp_display_args['main']['df_viewer_config']['extra_grid_config'] = self.extra_grid_config
+        if self.component_config:
+            temp_display_args['main']['df_viewer_config']['component_config'] = self.component_config
+
+        self.df_display_args = temp_display_args
+
+    payload_args = Dict({'sourceName':'unused', 'start':0, 'end':50}).tag(sync=True)
+    payload_response = Dict({'key': {'sourceName':'unused', 'start':0, 'end':49},
+                            'data': []}
+                            ).tag(sync=True)
+
+    #    @exception_protect('payloadArgsHandler')    
+    @observe('payload_args')
+    def _payload_args_handler(self, change):
+
+        start, end = self.payload_args['start'], self.payload_args['end']
+        print("payload_args changed", start, end)
+        _unused, processed_df, merged_sd = self.widget_args_tuple
+        if processed_df is None:
+            return
+
+        print(self.payload_args)
+        try:
+            if self.payload_args.get('sort'):
+                sort_dir = self.payload_args.get('sort_direction')
+                ascending = sort_dir == 'asc'
+                sorted_df = processed_df.sort_values(by=[self.payload_args.get('sort')], ascending=ascending)
+                slice_df = pd_to_obj(sorted_df[start:end])
+                self.payload_response = {'key':self.payload_args, 'data':slice_df, 'length':len(sorted_df)}
+            else:
+                slice_df = pd_to_obj(processed_df[start:end])
+                self.payload_response = {'key':self.payload_args, 'data':slice_df, 'length':len(processed_df)}
+        except Exception as e:
+            print(e)
+            stack_trace = traceback.format_exc()
+            self.payload_response = {'key':self.payload_args, 'data':[], 'error_info':stack_trace, 'length':0}
+            raise
+
+    def _df_to_obj(self, df:pd.DataFrame):
+        return pd_to_obj(df)
 
diff --git a/buckaroo/widget_utils.py b/buckaroo/widget_utils.py
@@ -1,5 +1,5 @@
 import traceback
-from .buckaroo_widget import BuckarooWidget
+from .buckaroo_widget import BuckarooInfiniteWidget
 import pandas as pd
 from datetime import datetime as dtdt
 import os
@@ -20,13 +20,8 @@ def is_in_ipython():
     return ip
 
 
-def enable(sampled=True,
-           summaryStats=False,
-           reorderdColumns=False,
-           showCommands=False,
-           auto_clean=False,
-           postProcessingF=None,
-           debug=False
+def enable(buckaroo_kls=BuckarooInfiniteWidget,
+           debug=False,
            ):
     """
     Automatically use buckaroo to display all DataFrames
@@ -63,7 +58,7 @@ def enable(sampled=True,
     def _display_as_buckaroo(df):
         from IPython.display import display
         try:
-            bw = BuckarooWidget(df, debug=debug)
+            bw = buckaroo_kls(df, debug=debug)
             return display(bw)
         except:
             if debug:

diff --git a/docs/example-notebooks/Extending-pandas.ipynb b/docs/example-notebooks/Extending-pandas.ipynb
@@ -65,9 +65,19 @@
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "id": "4",
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "bw.df_display_args"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
    "source": [
     "# Using the Pluggable Analysis Framework\n",
     "\n",
@@ -87,7 +97,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5",
+   "id": "6",
    "metadata": {
     "tags": []
    },
@@ -114,7 +124,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6",
+   "id": "7",
    "metadata": {},
    "source": [
     "# Adding a styling analysis\n",
@@ -133,7 +143,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "7",
+   "id": "8",
    "metadata": {
     "tags": []
    },
@@ -194,15 +204,15 @@
   },
   {
    "cell_type": "markdown",
-   "id": "8",
+   "id": "9",
    "metadata": {},
    "source": [
     "Let's look at pinned_rows, they can be modified by setting `pinned_rows` on Buckaroo Instaniation"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "9",
+   "id": "10",
    "metadata": {},
    "source": [
     "# lets add a post processing method"
@@ -211,7 +221,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "10",
+   "id": "11",
    "metadata": {
     "tags": []
    },
@@ -227,7 +237,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "11",
+   "id": "12",
    "metadata": {
     "tags": []
    },
@@ -276,7 +286,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "12",
+   "id": "13",
    "metadata": {},
    "source": [
     "## Where to use PostProcessing\n",
@@ -315,7 +325,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "13",
+   "id": "14",
    "metadata": {},
    "source": [
     "# Putting it all together\n",
@@ -326,7 +336,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "14",
+   "id": "15",
    "metadata": {
     "tags": []
    },
@@ -355,7 +365,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "15",
+   "id": "16",
    "metadata": {},
    "source": [
     "# Why aren't there click handlers?\n",
@@ -367,7 +377,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "16",
+   "id": "17",
    "metadata": {},
    "source": [
     "# What about autocleaning and the low code UI\n",
@@ -385,7 +395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17",
+   "id": "18",
    "metadata": {},
    "outputs": [],
    "source": []

diff --git a/docs/example-notebooks/citibike-trips-2016-04.parq b/docs/example-notebooks/citibike-trips-2016-04.parq
diff --git a/docs/examples/App.tsx b/docs/examples/App.tsx
@@ -6,6 +6,7 @@ import { Button } from 'react-bootstrap';
 
 import './app.css';
 
+
 const examples = {
   WidgetDCFCellEx: {
     title: 'WigetDCFCell',
@@ -31,8 +32,14 @@ const examples = {
   DFViewerEx_large: {
     title: 'DFViewer large',
     file: 'DFViewerEx_large',
-    layout: 'HBox',
+    layout: 'VBox',
+  },
+  DFViewerInfiniteEx_large: {
+    title: 'DFViewerInfinite large',
+    file: 'DFViewerInfiniteEx_large',
+    layout: 'VBox',
   },
+
   DFViewerEx_real_summary: {
     title: 'DFViewer summary',
     file: 'DFViewerEx_real_summary',
@@ -45,6 +52,8 @@ const examples = {
   },
 
   StatusBarEx: { title: 'StatusBar', file: 'StatusBarEx', layout: 'VBox' },
+  InfiniteEx: { title: 'Infinite Example', file: 'InfiniteEx', layout: 'VBox' },
+
   HistogramEx: { title: 'Histogram', file: 'HistogramEx', layout: 'HBox' },
 };