Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add document viewer to dataset explorer #666

Merged
merged 14 commits into from
Nov 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 16 additions & 27 deletions data_explorer/app/df_helpers/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,24 @@
import typing as t


def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
"""Get the image fields of the dataframe.
def get_fields_by_types(
    fields: t.Dict[str, str],
    field_types: t.List[str],
) -> t.List[str]:
    """Select the fields whose type name contains one of the given substrings.

    Args:
        fields: dictionary with fields and field types
        field_types: substrings to look for inside each field's type name

    Returns:
        List of matching field names, in the iteration order of `fields`
    """
    matching_fields = []
    for name, type_name in fields.items():
        for wanted in field_types:
            # substring match, so e.g. "int" also selects "int32" / "uint8"
            if wanted in type_name:
                matching_fields.append(name)
                break
    return matching_fields


Args:
fields: dictionary with fields and field types
def get_string_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the string fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of fields whose type name contains "string" or "utf8"
    """
    string_type_markers = ["string", "utf8"]
    return get_fields_by_types(fields, string_type_markers)

Returns:
List of image fields
"""
# check which of the columns contain byte data
image_fields = []
for k, v in fields.items():
if v == "binary":
image_fields.append(k)
return image_fields

def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the image fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of fields whose type name contains "binary"
    """
    binary_type_markers = ["binary"]
    return get_fields_by_types(fields, binary_type_markers)


def get_numeric_fields(fields: t.Dict[str, str]) -> t.List[str]:
    """Get the numeric fields of the dataframe.

    Args:
        fields: dictionary with fields and field types

    Returns:
        List of numeric fields
    """
    # A field counts as numeric when its type name mentions "int" or "float";
    # the shared helper performs that substring match.
    return get_fields_by_types(fields, ["int", "float"])
118 changes: 110 additions & 8 deletions data_explorer/app/pages/dataset.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,49 @@
"""Data exploration page of the app."""
import base64
import json
import typing as t

import streamlit as st
from df_helpers.fields import get_image_fields
import streamlit.components.v1 as components
from bs4 import BeautifulSoup
from df_helpers.fields import get_image_fields, get_string_fields
from df_helpers.image_render import configure_image_builder, convert_image_column
from fpdf import FPDF
from interfaces.dataset_interface import DatasetLoaderApp
from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
from st_aggrid import AgGrid, AgGridReturn, ColumnsAutoSizeMode, GridOptionsBuilder


def is_html(text: str):
    """Return True when the HTML parser finds at least one tag in `text`."""
    soup = BeautifulSoup(text, "html.parser")
    first_tag = soup.find()
    return bool(first_tag)


def is_json(text: str):
    """Return True when `text` parses as a JSON object or a JSON array.

    Text that parses to a bare scalar (number, string, boolean, null) yields
    False, as does text that is not valid JSON at all.
    """
    try:
        parsed = json.loads(text)
    except ValueError:
        return False
    return isinstance(parsed, (dict, list))


def is_pdf_base64(text: str):
try:
_bytes = base64.b64decode(text)
return _bytes[0:4] == b"%PDF"
except ValueError:
return False


def create_pdf_from_text(raw_text):
    """Typeset `raw_text` on a single-page-start PDF and return the PDF as bytes.

    Args:
        raw_text: text to write into the document

    Returns:
        The generated document, latin-1 encoded
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    document.multi_cell(200, 10, txt=raw_text)
    # dest="S" makes output() return the document as a string instead of writing a file
    serialized = document.output(dest="S")
    return serialized.encode("latin-1")


class DatasetExplorerApp(DatasetLoaderApp):
@staticmethod
def setup_app_page(dataframe, fields):
def setup_app_page(dataframe, fields) -> AgGridReturn:
"""Build the dataframe explorer table."""
image_fields = get_image_fields(fields)

Expand All @@ -34,32 +68,100 @@ def setup_app_page(dataframe, fields):

# configure builder
options_builder = GridOptionsBuilder.from_dataframe(dataframe_explorer)

# Add tooltip hover for all fields
for field in fields:
if field not in image_fields:
options_builder.configure_column(
field=field,
tooltipField=field,
max_width=400,
)

grid_options: t.Dict[str, t.Any] = {"rowWidth": "auto", "tooltipShowDelay": 500}

if len(image_fields) > 0:
options_builder.configure_grid_options(rowHeight=100, rowWidth="auto")
grid_options["rowHeight"] = 100
options_builder.configure_grid_options(**grid_options)
else:
options_builder.configure_grid_options(rowHeight="auto", rowWidth="auto")
grid_options["rowHeight"] = "auto"
options_builder.configure_grid_options(**grid_options)

# format the image columns
for field in image_fields:
configure_image_builder(options_builder, field)

# configure pagination and sidebar
options_builder.configure_pagination(
enabled=True,
paginationPageSize=rows_per_page,
paginationAutoPageSize=False,
)
options_builder.configure_side_bar()
options_builder.configure_default_column(
editable=False,
groupable=True,
wrapText=True,
resizable=True,
filterable=True,
sortable=True,
)
options_builder.configure_selection(
selection_mode="single",
use_checkbox=False,
pre_selected_rows=[0],
)

# display the Ag Grid table
AgGrid(
return AgGrid(
dataframe_explorer,
gridOptions=options_builder.build(),
allow_unsafe_jscode=True,
columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW,
)

@staticmethod
def render_text(text: str):
    """Render a text value with a widget that fits its detected content type.

    Detection runs in this order: HTML, JSON, base64-encoded PDF. Anything else
    is rendered as markdown.

    Args:
        text: string value to display
    """
    if is_html(text):
        st.text("HTML detected")
        render = st.checkbox("Render HTML")
        if render:
            components.html(text, height=600)
        else:
            st.code(text, language="html")

    elif is_json(text):
        st.text("JSON detected")
        st.json(json.loads(text))

    elif is_pdf_base64(text):
        st.text("PDF detected")
        # `text` already is the PDF, base64-encoded. Embed that document itself
        # rather than typesetting the base64 string into a brand-new PDF.
        # Decoding and re-encoding yields a clean payload for the data URI
        # (b64decode tolerates stray characters that a data URI would not).
        encoded_pdf = base64.b64encode(base64.b64decode(text)).decode("ascii")
        data = (
            f'<embed src="data:application/pdf;base64,{encoded_pdf}" width="100%" '
            f'height="1000" type="application/pdf">'
        )
        st.markdown(data, unsafe_allow_html=True)
    else:
        st.markdown(text)

def setup_viewer_widget(self, grid_dict: AgGridReturn, fields: t.Dict[str, t.Any]):
    """Set up the viewer widget, which shows one column of the row selected in the grid.

    Args:
        grid_dict: return value of the grid; its "selected_rows" entry is read
        fields: dictionary with fields and field types
    """
    text_fields = get_string_fields(fields)
    with st.expander("Document Viewer"):
        if not text_fields:
            st.info("No text fields found in dataframe")
            return

        selected_column = st.selectbox("View column", text_fields)
        selected_rows = grid_dict["selected_rows"]
        if selected_rows:
            # only the first selected row is shown
            cell_value = selected_rows[0][selected_column]
            self.render_text(str(cell_value))


# Page script: build the app, load the dataset, render the explorer grid and
# then the document viewer.
app = DatasetExplorerApp()
app.create_common_interface()
df, df_fields = app.create_loader_widget()
# Build the grid exactly once and keep its return value: the viewer reads the
# selected row from it.
grid_data_dict = app.setup_app_page(df, df_fields)
app.setup_viewer_widget(grid_data_dict, df_fields)
2 changes: 2 additions & 0 deletions data_explorer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ streamlit-extras==0.3.5
st-pages==0.4.5
matplotlib==3.7.1
plotly==5.15.0
beautifulsoup4==4.12.2
fpdf==1.7.2
Loading