Add document viewer to dataset explorer (#666)

PR that adds a document viewer to the data explorer. The viewer attempts to automatically detect the type of a string and render it in its most appropriate format. Note that a normal string value is not rendered as a PDF but rather as normal text/Markdown; I don't think it makes a lot of sense to render normal unformatted text to PDF. We can still detect whether a string is a base64-encoded PDF and render that in its original PDF format. ![Screenshot from 2023-11-22 17-32-32](https://github.com/ml6team/fondant/assets/47530815/2aa3a163-71dc-4dd2-b1d7-06f5d2d0450c)
ml6team · Nov 23, 2023 · f9b4b81 · f9b4b81
1 parent fe7e036
commit f9b4b81
Show file tree

Hide file tree

Showing 3 changed files with 128 additions and 35 deletions.
diff --git a/data_explorer/app/df_helpers/fields.py b/data_explorer/app/df_helpers/fields.py
@@ -3,35 +3,24 @@
 import typing as t
 
 
-def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
-    """Get the image fields of the dataframe.
+def get_fields_by_types(
+    fields: t.Dict[str, str],
+    field_types: t.List[str],
+) -> t.List[str]:
+    """Return the names of the fields whose type matches one of ``field_types``.
+
+    A field matches when any entry of ``field_types`` occurs as a substring of
+    the field's type string, so ``"int"`` matches both ``"int32"`` and ``"int64"``.
+
+    Args:
+        fields: dictionary mapping field names to field types
+        field_types: type name fragments to look for
+
+    Returns:
+        List of matching field names, in the iteration order of ``fields``
+    """
+    matching_fields = []
+    for name, type_name in fields.items():
+        for wanted in field_types:
+            if wanted in type_name:
+                matching_fields.append(name)
+                # One match is enough; do not add the same field twice.
+                break
+    return matching_fields
+
 
-    Args:
-        fields: dictionary with fields and field types
+def get_string_fields(fields: t.Dict[str, str]) -> t.List[str]:
+    return get_fields_by_types(fields, ["string", "utf8"])
 
-    Returns:
-        List of image fields
-    """
-    # check which of the columns contain byte data
-    image_fields = []
-    for k, v in fields.items():
-        if v == "binary":
-            image_fields.append(k)
-    return image_fields
+
+def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
+    return get_fields_by_types(fields, ["binary"])
 
 
 def get_numeric_fields(fields: t.Dict[str, str]) -> t.List[str]:
-    """Get the numeric fields of the dataframe.
-
-    Args:
-        fields: dictionary with fields and field types
-
-    Returns:
-         List of numeric fields
-    """
-    # check which of the columns contain byte data
-    numeric_fields = []
-    for k, v in fields.items():
-        if "int" in v or "float" in v:
-            numeric_fields.append(k)
-    return numeric_fields
+    return get_fields_by_types(fields, ["int", "float"])
diff --git a/data_explorer/app/pages/dataset.py b/data_explorer/app/pages/dataset.py
@@ -1,15 +1,49 @@
 """Data exploration page of the app."""
+import base64
+import json
+import typing as t
 
 import streamlit as st
-from df_helpers.fields import get_image_fields
+import streamlit.components.v1 as components
+from bs4 import BeautifulSoup
+from df_helpers.fields import get_image_fields, get_string_fields
 from df_helpers.image_render import configure_image_builder, convert_image_column
+from fpdf import FPDF
 from interfaces.dataset_interface import DatasetLoaderApp
-from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
+from st_aggrid import AgGrid, AgGridReturn, ColumnsAutoSizeMode, GridOptionsBuilder
+
+
+def is_html(text: str):
+    """Return True when parsing ``text`` as HTML yields at least one tag."""
+    first_tag = BeautifulSoup(text, "html.parser").find()
+    return bool(first_tag)
+
+
+def is_json(text: str):
+    """Return True when ``text`` parses as a JSON object or a JSON array.
+
+    Text that parses to a bare scalar (number, string, boolean, null) is
+    rejected, so a plain value such as ``"42"`` is not treated as a document.
+    """
+    try:
+        parsed = json.loads(text)
+    except ValueError:
+        # json.JSONDecodeError is a subclass of ValueError.
+        return False
+    return isinstance(parsed, (dict, list))
+
+
+def is_pdf_base64(text: str):
+    """Return True when ``text`` base64-decodes to data that starts with ``%PDF``."""
+    try:
+        decoded = base64.b64decode(text)
+    except ValueError:
+        # binascii.Error (malformed base64) is a subclass of ValueError.
+        return False
+    return decoded.startswith(b"%PDF")
+
+
+def create_pdf_from_text(raw_text):
+    """Write ``raw_text`` into a new FPDF document and return the PDF as bytes."""
+    document = FPDF()
+    document.add_page()
+    document.set_font("Arial", size=12)
+    document.multi_cell(200, 10, txt=raw_text)
+    # With dest="S" the document is returned instead of being written to a file;
+    # the result is then encoded to bytes as latin-1.
+    pdf_string = document.output(dest="S")
+    return pdf_string.encode("latin-1")
 
 
 class DatasetExplorerApp(DatasetLoaderApp):
     @staticmethod
-    def setup_app_page(dataframe, fields):
+    def setup_app_page(dataframe, fields) -> AgGridReturn:
         """Build the dataframe explorer table."""
         image_fields = get_image_fields(fields)
 
@@ -34,32 +68,100 @@ def setup_app_page(dataframe, fields):
 
         # configure builder
         options_builder = GridOptionsBuilder.from_dataframe(dataframe_explorer)
+
+        # Add tooltip hover for all fields
+        for field in fields:
+            if field not in image_fields:
+                options_builder.configure_column(
+                    field=field,
+                    tooltipField=field,
+                    max_width=400,
+                )
+
+        grid_options: t.Dict[str, t.Any] = {"rowWidth": "auto", "tooltipShowDelay": 500}
+
         if len(image_fields) > 0:
-            options_builder.configure_grid_options(rowHeight=100, rowWidth="auto")
+            grid_options["rowHeight"] = 100
+            options_builder.configure_grid_options(**grid_options)
         else:
-            options_builder.configure_grid_options(rowHeight="auto", rowWidth="auto")
+            grid_options["rowHeight"] = "auto"
+            options_builder.configure_grid_options(**grid_options)
 
         # format the image columns
         for field in image_fields:
             configure_image_builder(options_builder, field)
 
         # configure pagination and sidebar
         options_builder.configure_pagination(
+            enabled=True,
             paginationPageSize=rows_per_page,
             paginationAutoPageSize=False,
         )
-        options_builder.configure_side_bar()
+        options_builder.configure_default_column(
+            editable=False,
+            groupable=True,
+            wrapText=True,
+            resizable=True,
+            filterable=True,
+            sortable=True,
+        )
+        options_builder.configure_selection(
+            selection_mode="single",
+            use_checkbox=False,
+            pre_selected_rows=[0],
+        )
 
         # display the Ag Grid table
-        AgGrid(
+        return AgGrid(
             dataframe_explorer,
             gridOptions=options_builder.build(),
             allow_unsafe_jscode=True,
             columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW,
         )
 
+    @staticmethod
+    def render_text(text: str):
+        """Render ``text`` in the page using the most specific format detected.
+
+        Detection order:
+
+        1. JSON object/array: shown with ``st.json``.
+        2. Base64-encoded PDF: the decoded document itself is embedded.
+        3. HTML: shown as source, or rendered when the user ticks the checkbox.
+        4. Anything else: shown as Markdown.
+
+        JSON and PDF are checked before HTML because the HTML check only looks
+        for any tag in the text, so a JSON document with markup inside one of
+        its string values would otherwise be classified as HTML.
+
+        Args:
+            text: the string value of the selected cell
+        """
+        if is_json(text):
+            st.text("JSON detected")
+            st.json(json.loads(text))
+
+        elif is_pdf_base64(text):
+            st.text("PDF detected")
+            # Embed the original document. Decoding and re-encoding strips any
+            # whitespace or line breaks from the stored value, so only base64
+            # alphabet characters end up inside the HTML attribute.
+            encoded_pdf = base64.b64encode(base64.b64decode(text)).decode("ascii")
+            data = (
+                f'<embed src="data:application/pdf;base64,{encoded_pdf}" width="100%" '
+                f'height="1000" type="application/pdf">'
+            )
+            st.markdown(data, unsafe_allow_html=True)
+
+        elif is_html(text):
+            st.text("HTML detected")
+            render = st.checkbox("Render HTML")
+            if render:
+                components.html(text, height=600)
+            else:
+                st.code(text, language="html")
+
+        else:
+            st.markdown(text)
+
+    def setup_viewer_widget(self, grid_dict: AgGridReturn, fields: t.Dict[str, t.Any]):
+        """Build the "Document Viewer" expander for the row selected in the grid.
+
+        The user picks one of the string columns; the value of that column in
+        the first selected row is converted to ``str`` and passed to
+        ``render_text``. When there are no string columns an info message is
+        shown instead.
+        """
+        text_fields = get_string_fields(fields)
+        with st.expander("Document Viewer"):
+            if not text_fields:
+                st.info("No text fields found in dataframe")
+                return
+
+            selected_column = st.selectbox("View column", text_fields)
+            selected_rows = grid_dict["selected_rows"]
+            if selected_rows:
+                cell_value = selected_rows[0][selected_column]
+                self.render_text(str(cell_value))
+
 
 app = DatasetExplorerApp()
 app.create_common_interface()
 df, df_fields = app.create_loader_widget()
-app.setup_app_page(df, df_fields)
+grid_data_dict = app.setup_app_page(df, df_fields)
+app.setup_viewer_widget(grid_data_dict, df_fields)
diff --git a/data_explorer/requirements.txt b/data_explorer/requirements.txt
@@ -5,3 +5,5 @@ streamlit-extras==0.3.5
 st-pages==0.4.5
 matplotlib==3.7.1
 plotly==5.15.0
+beautifulsoup4==4.12.2
+fpdf==1.7.2