Skip to content

Commit

Permalink
Add document viewer to dataset explorer (#666)
Browse files Browse the repository at this point in the history
PR that adds a document viewer to the data explorer. The viewer attempts to
automatically detect the string type and render it in its most
appropriate format. Note that a normal string value is not rendered
as a PDF but rather as normal text/markdown.

I don't think it makes a lot of sense to render normal unformatted text
to PDF. We can still detect whether a string is a base64-encoded PDF
and render that in its original PDF format.

![Screenshot from 2023-11-22
17-32-32](https://github.com/ml6team/fondant/assets/47530815/2aa3a163-71dc-4dd2-b1d7-06f5d2d0450c)
  • Loading branch information
PhilippeMoussalli authored Nov 23, 2023
1 parent fe7e036 commit f9b4b81
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 35 deletions.
43 changes: 16 additions & 27 deletions data_explorer/app/df_helpers/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,24 @@
import typing as t


def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
"""Get the image fields of the dataframe.
def get_fields_by_types(
    fields: t.Dict[str, str],
    field_types: t.List[str],
) -> t.List[str]:
    """Select the fields whose type contains one of the given type names.

    Args:
        fields: dictionary with fields and field types
        field_types: type names to look for; a field is selected as soon as one
            of them occurs as a substring of the field's type

    Returns:
        List of matching fields, in the iteration order of `fields`
    """
    matching_fields = []
    for name, type_name in fields.items():
        for wanted in field_types:
            if wanted in type_name:
                matching_fields.append(name)
                # one hit is enough; stop so the field is listed only once
                break
    return matching_fields


Args:
fields: dictionary with fields and field types
def get_string_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the string fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of fields whose type contains "string" or "utf8"
    """
    string_type_names = ["string", "utf8"]
    return get_fields_by_types(fields, string_type_names)

Returns:
List of image fields
"""
# check which of the columns contain byte data
image_fields = []
for k, v in fields.items():
if v == "binary":
image_fields.append(k)
return image_fields

def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the image fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of fields whose type contains "binary"
    """
    image_type_names = ["binary"]
    return get_fields_by_types(fields, image_type_names)


def get_numeric_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the numeric fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of numeric fields, i.e. fields whose type contains "int" or "float"
    """
    # Delegate to the shared helper like the other field getters do, instead of
    # keeping a hand-written loop next to an unreachable second return.
    return get_fields_by_types(fields, ["int", "float"])
118 changes: 110 additions & 8 deletions data_explorer/app/pages/dataset.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,49 @@
"""Data exploration page of the app."""
import base64
import json
import typing as t

import streamlit as st
from df_helpers.fields import get_image_fields
import streamlit.components.v1 as components
from bs4 import BeautifulSoup
from df_helpers.fields import get_image_fields, get_string_fields
from df_helpers.image_render import configure_image_builder, convert_image_column
from fpdf import FPDF
from interfaces.dataset_interface import DatasetLoaderApp
from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
from st_aggrid import AgGrid, AgGridReturn, ColumnsAutoSizeMode, GridOptionsBuilder


def is_html(text: str):
    """Tell whether `text` contains at least one HTML tag.

    The text is parsed with BeautifulSoup's "html.parser"; the result is True
    when the parser finds a tag anywhere in it.
    """
    parsed = BeautifulSoup(text, "html.parser")
    first_tag = parsed.find()
    return bool(first_tag)


def is_json(text: str):
    """Tell whether `text` is a JSON document whose top level is an object or array.

    Text that fails to parse, as well as valid JSON scalars (numbers, strings,
    booleans, null), yields False.
    """
    try:
        parsed = json.loads(text)
    except ValueError:
        return False
    return isinstance(parsed, (dict, list))


def is_pdf_base64(text: str):
    """Tell whether `text` base64-decodes to data that starts with the b"%PDF" marker.

    Text that cannot be base64-decoded yields False.
    """
    try:
        decoded = base64.b64decode(text)
    except ValueError:
        return False
    return decoded.startswith(b"%PDF")


def create_pdf_from_text(raw_text):
    """Write `raw_text` into a new FPDF document and return the document as bytes.

    The text is laid out with `multi_cell` in 12pt Arial on a freshly added page.
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    document.multi_cell(200, 10, txt=raw_text)
    # NOTE(review): relies on output(dest="S") handing back a str, which is then
    # latin-1 encoded to bytes - confirm against the pinned fpdf version.
    serialized = document.output(dest="S")
    return serialized.encode("latin-1")


class DatasetExplorerApp(DatasetLoaderApp):
@staticmethod
def setup_app_page(dataframe, fields):
def setup_app_page(dataframe, fields) -> AgGridReturn:
"""Build the dataframe explorer table."""
image_fields = get_image_fields(fields)

Expand All @@ -34,32 +68,100 @@ def setup_app_page(dataframe, fields):

# configure builder
options_builder = GridOptionsBuilder.from_dataframe(dataframe_explorer)

# Add tooltip hover for all fields
for field in fields:
if field not in image_fields:
options_builder.configure_column(
field=field,
tooltipField=field,
max_width=400,
)

grid_options: t.Dict[str, t.Any] = {"rowWidth": "auto", "tooltipShowDelay": 500}

if len(image_fields) > 0:
options_builder.configure_grid_options(rowHeight=100, rowWidth="auto")
grid_options["rowHeight"] = 100
options_builder.configure_grid_options(**grid_options)
else:
options_builder.configure_grid_options(rowHeight="auto", rowWidth="auto")
grid_options["rowHeight"] = "auto"
options_builder.configure_grid_options(**grid_options)

# format the image columns
for field in image_fields:
configure_image_builder(options_builder, field)

# configure pagination and sidebar
options_builder.configure_pagination(
enabled=True,
paginationPageSize=rows_per_page,
paginationAutoPageSize=False,
)
options_builder.configure_side_bar()
options_builder.configure_default_column(
editable=False,
groupable=True,
wrapText=True,
resizable=True,
filterable=True,
sortable=True,
)
options_builder.configure_selection(
selection_mode="single",
use_checkbox=False,
pre_selected_rows=[0],
)

# display the Ag Grid table
AgGrid(
return AgGrid(
dataframe_explorer,
gridOptions=options_builder.build(),
allow_unsafe_jscode=True,
columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW,
)

@staticmethod
def render_text(text: str):
    """Render `text` in the Streamlit page using the most specific format detected.

    Detection order:
      1. JSON object/array -> interactive JSON view.
      2. HTML -> rendered in an iframe when the "Render HTML" checkbox is ticked,
         otherwise shown as highlighted source.
      3. base64 encoded PDF -> the original document, embedded as a PDF.
      4. anything else -> markdown.

    Args:
        text: the string value to display
    """
    # JSON is checked before HTML: a JSON document whose values contain markup
    # would otherwise be classified as HTML, while an HTML document can never
    # parse as a JSON object or array.
    if is_json(text):
        st.text("JSON detected")
        st.json(json.loads(text))

    elif is_html(text):
        st.text("HTML detected")
        render = st.checkbox("Render HTML")
        if render:
            components.html(text, height=600)
        else:
            st.code(text, language="html")

    elif is_pdf_base64(text):
        st.text("PDF detected")
        # `text` already holds the PDF, base64 encoded. Decode it and embed those
        # bytes; typesetting the base64 string into a new PDF would only show the
        # encoded characters. Re-encoding yields a clean payload for the data URI.
        pdf_data = base64.b64decode(text)
        encoded_pdf = base64.b64encode(pdf_data).decode("utf-8")
        data = (
            f'<embed src="data:application/pdf;base64,{encoded_pdf}" width="100%" '
            f'height="1000" type="application/pdf">'
        )
        st.markdown(data, unsafe_allow_html=True)
    else:
        st.markdown(text)

def setup_viewer_widget(self, grid_dict: AgGridReturn, fields: t.Dict[str, t.Any]):
    """Build the "Document Viewer" expander for the row selected in the grid.

    The user picks one of the string columns; when a row is selected, that
    column's value of the first selected row is rendered via `render_text`.
    An info message is shown when the dataframe has no string columns.
    """
    string_columns = get_string_fields(fields)
    with st.expander("Document Viewer"):
        if not string_columns:
            st.info("No text fields found in dataframe")
            return

        selected_column = st.selectbox("View column", string_columns)
        selection = grid_dict["selected_rows"]
        if selection:
            first_row = selection[0]
            self.render_text(str(first_row[selected_column]))


# Page assembly: common interface, dataset loader, explorer grid, document viewer.
app = DatasetExplorerApp()
app.create_common_interface()
df, df_fields = app.create_loader_widget()
# Build the grid exactly once and keep what it returns: the viewer reads the
# selected row from it. A second call would create a second grid.
grid_data_dict = app.setup_app_page(df, df_fields)
app.setup_viewer_widget(grid_data_dict, df_fields)
2 changes: 2 additions & 0 deletions data_explorer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ streamlit-extras==0.3.5
st-pages==0.4.5
matplotlib==3.7.1
plotly==5.15.0
beautifulsoup4==4.12.2
fpdf==1.7.2

0 comments on commit f9b4b81

Please sign in to comment.