diff --git a/data_explorer/app/df_helpers/fields.py b/data_explorer/app/df_helpers/fields.py
index 4b8fd4e36..e13eab17f 100644
--- a/data_explorer/app/df_helpers/fields.py
+++ b/data_explorer/app/df_helpers/fields.py
@@ -3,35 +3,58 @@ import typing as t
 
 
 
-def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
-    """Get the image fields of the dataframe.
-
-    Args:
-        fields: dictionary with fields and field types
-
-    Returns:
-        List of image fields
-    """
-    # check which of the columns contain byte data
-    image_fields = []
-    for k, v in fields.items():
-        if v == "binary":
-            image_fields.append(k)
-    return image_fields
+def get_fields_by_types(
+    fields: t.Dict[str, str],
+    field_types: t.List[str],
+) -> t.List[str]:
+    """Get the fields whose type matches one of the given type names.
+
+    Args:
+        fields: dictionary with fields and field types
+        field_types: type names to look for; a field is selected when any of
+            them occurs as a substring of its type (e.g. "int" selects "int32")
+
+    Returns:
+        List of the matching fields
+    """
+    return [
+        name
+        for name, dtype in fields.items()
+        if any(wanted in dtype for wanted in field_types)
+    ]
+
+
+def get_string_fields(fields: t.Dict[str, str]) -> t.List[str]:
+    """Get the string fields of the dataframe.
+
+    Args:
+        fields: dictionary with fields and field types
+
+    Returns:
+        List of string fields
+    """
+    return get_fields_by_types(fields, ["string", "utf8"])
+
+
+def get_image_fields(fields: t.Dict[str, str]) -> t.List[str]:
+    """Get the image fields of the dataframe.
+
+    Args:
+        fields: dictionary with fields and field types
+
+    Returns:
+        List of image fields
+    """
+    return get_fields_by_types(fields, ["binary"])
 
 
 def get_numeric_fields(fields: t.Dict[str, str]) -> t.List[str]:
     """Get the numeric fields of the dataframe.
 
     Args:
         fields: dictionary with fields and field types
 
     Returns:
         List of numeric fields
     """
-    # check which of the columns contain byte data
-    numeric_fields = []
-    for k, v in fields.items():
-        if "int" in v or "float" in v:
-            numeric_fields.append(k)
-    return numeric_fields
+    return get_fields_by_types(fields, ["int", "float"])
diff --git a/data_explorer/app/pages/dataset.py b/data_explorer/app/pages/dataset.py
index b033adcf3..8022dd588 100644
--- a/data_explorer/app/pages/dataset.py
+++ b/data_explorer/app/pages/dataset.py
@@ -1,15 +1,61 @@
 """Data exploration page of the app."""
+import base64
+import json
+import typing as t
 
 import streamlit as st
-from df_helpers.fields import get_image_fields
+import streamlit.components.v1 as components
+from bs4 import BeautifulSoup
+from df_helpers.fields import get_image_fields, get_string_fields
 from df_helpers.image_render import configure_image_builder, convert_image_column
+from fpdf import FPDF
 from interfaces.dataset_interface import DatasetLoaderApp
-from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder
+from st_aggrid import AgGrid, AgGridReturn, ColumnsAutoSizeMode, GridOptionsBuilder
+
+
+def is_html(text: str) -> bool:
+    """Check whether the text contains at least one HTML tag."""
+    return bool(BeautifulSoup(text, "html.parser").find())
+
+
+def is_json(text: str) -> bool:
+    """Check whether the text is a JSON document holding an object or an array."""
+    try:
+        json_object = json.loads(text)
+    except ValueError:
+        return False
+    return isinstance(json_object, (dict, list))
+
+
+def is_pdf_base64(text: str) -> bool:
+    """Check whether the text is base64 data that decodes to a PDF document."""
+    try:
+        decoded = base64.b64decode(text)
+    except ValueError:
+        return False
+    # a PDF document starts with the "%PDF" marker
+    return decoded[:4] == b"%PDF"
+
+
+def create_pdf_from_text(raw_text) -> bytes:
+    """Create a PDF document that contains the given text.
+
+    Args:
+        raw_text: text to write into the document
+
+    Returns:
+        The PDF document as bytes
+    """
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    pdf.multi_cell(200, 10, txt=raw_text)
+    return pdf.output(dest="S").encode("latin-1")
 
 
 class DatasetExplorerApp(DatasetLoaderApp):
     @staticmethod
-    def setup_app_page(dataframe, fields):
+    def setup_app_page(dataframe, fields) -> AgGridReturn:
         """Build the dataframe explorer table."""
         image_fields = get_image_fields(fields)
 
@@ -34,10 +80,23 @@ def setup_app_page(dataframe, fields):
 
         # configure builder
         options_builder = GridOptionsBuilder.from_dataframe(dataframe_explorer)
-        if len(image_fields) > 0:
-            options_builder.configure_grid_options(rowHeight=100, rowWidth="auto")
-        else:
-            options_builder.configure_grid_options(rowHeight="auto", rowWidth="auto")
+
+        # Add tooltip hover for all non-image fields
+        for field in fields:
+            if field not in image_fields:
+                options_builder.configure_column(
+                    field=field,
+                    tooltipField=field,
+                    max_width=400,
+                )
+
+        # Rows are 100 high when image columns are present, "auto" otherwise
+        grid_options: t.Dict[str, t.Any] = {
+            "rowWidth": "auto",
+            "tooltipShowDelay": 500,
+            "rowHeight": 100 if image_fields else "auto",
+        }
+        options_builder.configure_grid_options(**grid_options)
 
         # format the image columns
         for field in image_fields:
@@ -45,21 +104,89 @@ def setup_app_page(dataframe, fields):
 
         # configure pagination and sidebar
         options_builder.configure_pagination(
+            enabled=True,
             paginationPageSize=rows_per_page,
             paginationAutoPageSize=False,
         )
-        options_builder.configure_side_bar()
+        options_builder.configure_default_column(
+            editable=False,
+            groupable=True,
+            wrapText=True,
+            resizable=True,
+            filterable=True,
+            sortable=True,
+        )
+        options_builder.configure_selection(
+            selection_mode="single",
+            use_checkbox=False,
+            pre_selected_rows=[0],
+        )
 
         # display the Ag Grid table
-        AgGrid(
+        return AgGrid(
             dataframe_explorer,
             gridOptions=options_builder.build(),
             allow_unsafe_jscode=True,
             columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW,
         )
 
+    @staticmethod
+    def render_text(text: str):
+        """Render the text with a viewer that fits its detected content type.
+
+        The checks run in order: HTML, JSON, base64-encoded PDF. Text matching none of
+        them is rendered as markdown.
+        """
+        if is_html(text):
+            st.text("HTML detected")
+            render = st.checkbox("Render HTML")
+            if render:
+                components.html(text, height=600)
+            else:
+                st.code(text, language="html")
+
+        elif is_json(text):
+            st.text("JSON detected")
+            st.json(json.loads(text))
+
+        elif is_pdf_base64(text):
+            st.text("PDF detected")
+            # The text is itself a base64-encoded PDF, so embed that document instead of
+            # generating a new PDF whose content is the base64 characters. Re-encoding the
+            # decoded bytes yields clean base64 for the data URI.
+            pdf_data = base64.b64decode(text)
+            encoded_pdf = base64.b64encode(pdf_data).decode("utf-8")
+            data = (
+                f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="100%" '
+                f'height="1000" type="application/pdf"></iframe>'
+            )
+            st.markdown(data, unsafe_allow_html=True)
+        else:
+            st.markdown(text)
+
+    def setup_viewer_widget(self, grid_dict: AgGridReturn, fields: t.Dict[str, t.Any]):
+        """Setup the viewer widget.
+
+        This widget lets the user pick a text column and view the value of that column
+        for the row selected in the grid.
+
+        Args:
+            grid_dict: value returned by `setup_app_page`, read for its "selected_rows"
+            fields: dictionary with fields and field types
+        """
+        text_fields = get_string_fields(fields)
+        with st.expander("Document Viewer"):
+            if text_fields:
+                selected_column = st.selectbox("View column", text_fields)
+                if grid_dict["selected_rows"]:
+                    data = str(grid_dict["selected_rows"][0][selected_column])
+                    self.render_text(data)
+            else:
+                st.info("No text fields found in dataframe")
+
 
 app = DatasetExplorerApp()
 app.create_common_interface()
 df, df_fields = app.create_loader_widget()
-app.setup_app_page(df, df_fields)
+grid_data_dict = app.setup_app_page(df, df_fields)
+app.setup_viewer_widget(grid_data_dict, df_fields)
diff --git a/data_explorer/requirements.txt b/data_explorer/requirements.txt
index be27a0a28..ad4941574 100644
--- a/data_explorer/requirements.txt
+++ b/data_explorer/requirements.txt
@@ -5,3 +5,5 @@ streamlit-extras==0.3.5
 st-pages==0.4.5
 matplotlib==3.7.1
 plotly==5.15.0
+beautifulsoup4==4.12.2
+fpdf==1.7.2