From 71214bb9de30cd4302d09af97acf117d4b0e9231 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 13:15:33 +0200 Subject: [PATCH 1/7] disalbe dask convert string --- data_explorer/app/data.py | 3 ++- data_explorer/app/main.py | 3 +++ data_explorer/app/numeric_analysis.py | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/data_explorer/app/data.py b/data_explorer/app/data.py index 8e5bb13ac..1ee14b5cd 100644 --- a/data_explorer/app/data.py +++ b/data_explorer/app/data.py @@ -4,6 +4,7 @@ from typing import List, Tuple from urllib.parse import urlparse +import dask import dask.dataframe as dd import streamlit as st from exceptions import RemoteFileNotFoundException @@ -12,7 +13,7 @@ from fondant.manifest import Manifest LOGGER = logging.getLogger(__name__) - +dask.config.set({"dataframe.convert-string": False}) def is_remote(path: str) -> bool: """Check if path is remote diff --git a/data_explorer/app/main.py b/data_explorer/app/main.py index 4c7b88e07..0d326647d 100644 --- a/data_explorer/app/main.py +++ b/data_explorer/app/main.py @@ -1,6 +1,7 @@ """Main file of the data explorer interface""" import logging +import dask import streamlit as st from data import load_dataframe from table import get_image_fields, get_numeric_fields @@ -8,6 +9,8 @@ build_numeric_analysis_plots, build_numeric_analysis_table, build_sidebar) +dask.config.set({"dataframe.convert-string": False}) + LOGGER = logging.getLogger(__name__) # streamlit wide st.set_page_config(layout="wide") diff --git a/data_explorer/app/numeric_analysis.py b/data_explorer/app/numeric_analysis.py index 6d7288dbb..ec94b3852 100644 --- a/data_explorer/app/numeric_analysis.py +++ b/data_explorer/app/numeric_analysis.py @@ -2,12 +2,14 @@ import logging from typing import List +import dask import dask.dataframe as dd import pandas as pd import streamlit as st LOGGER = logging.getLogger(__name__) +dask.config.set({"dataframe.convert-string": False}) pd.options.plotting.backend = "plotly" From d5a15fe04afb9a78dc04b84324ca9056e1b6ea44 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:03:31 +0200 Subject: [PATCH 2/7] debug --- data_explorer/app/table.py | 8 +++++++- data_explorer/requirements.txt | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/data_explorer/app/table.py b/data_explorer/app/table.py index 936f2905e..f37102d4f 100644 --- a/data_explorer/app/table.py +++ b/data_explorer/app/table.py @@ -45,8 +45,14 @@ def get_image_fields(fields: Dict[str, str]) -> List[str]: # check which of the columns contain byte data image_fields = [] for k, v in fields.items(): - if v == "object": + print("key") + print(k) + print("value") + print(v) + if v in ["object", "binary"]: image_fields.append(k) + print("image fields") + print(image_fields) return image_fields diff --git a/data_explorer/requirements.txt b/data_explorer/requirements.txt index f688d5e9f..11ffd89de 100644 --- a/data_explorer/requirements.txt +++ b/data_explorer/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/ml6team/fondant@main +git+https://github.com/ml6team/fondant@71214bb9de30cd4302d09af97acf117d4b0e9231 streamlit==1.23.1 streamlit-aggrid==0.3.4 matplotlib==3.7.1 From 741b5366543a1cd2097180591150bd4c5a7dcf6b Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:09:10 +0200 Subject: [PATCH 3/7] debug4 --- data_explorer/app/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data_explorer/app/main.py b/data_explorer/app/main.py index 0d326647d..bbb01effb 100644 --- a/data_explorer/app/main.py +++ b/data_explorer/app/main.py @@ -34,7 +34,8 @@ # extract image and numeric columns image_fields = get_image_fields(fields) numeric_fields = get_numeric_fields(fields) - + print("image fields") + print("numeric fields") # build tabs tab_explorer, tab_numeric, tab_images = st.tabs( ["Data explorer", "Numerical analysis", "Image explorer"] From 8051d09fc64de8b8dbf669f40398aa792fa391f1 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:10:24 +0200 Subject: [PATCH 4/7] debug5 --- data_explorer/app/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data_explorer/app/main.py b/data_explorer/app/main.py index bbb01effb..8948583a1 100644 --- a/data_explorer/app/main.py +++ b/data_explorer/app/main.py @@ -35,7 +35,9 @@ image_fields = get_image_fields(fields) numeric_fields = get_numeric_fields(fields) print("image fields") + print(image_fields) print("numeric fields") + print(numeric_fields) # build tabs tab_explorer, tab_numeric, tab_images = st.tabs( ["Data explorer", "Numerical analysis", "Image explorer"] From d893dfdc987c55e1f6a77e523fef49d129f019f2 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:27:18 +0200 Subject: [PATCH 5/7] polishing --- data_explorer/app/main.py | 4 ---- data_explorer/app/table.py | 8 +------- data_explorer/app/widgets.py | 36 ++++++++++++++++++++---------------- 3 files changed, 21 insertions(+), 27 deletions(-) diff --git a/data_explorer/app/main.py b/data_explorer/app/main.py index 8948583a1..85dbed909 100644 --- a/data_explorer/app/main.py +++ b/data_explorer/app/main.py @@ -34,10 +34,6 @@ # extract image and numeric columns image_fields = get_image_fields(fields) numeric_fields = get_numeric_fields(fields) - print("image fields") - print(image_fields) - print("numeric fields") - print(numeric_fields) # build tabs tab_explorer, tab_numeric, tab_images = st.tabs( ["Data explorer", "Numerical analysis", "Image explorer"] diff --git a/data_explorer/app/table.py b/data_explorer/app/table.py index f37102d4f..de2f7bc21 100644 --- a/data_explorer/app/table.py +++ b/data_explorer/app/table.py @@ -45,14 +45,8 @@ def get_image_fields(fields: Dict[str, str]) -> List[str]: # check which of the columns contain byte data image_fields = [] for k, v in fields.items(): - print("key") - print(k) - print("value") - print(v) - if v in ["object", "binary"]: + if v == "binary": image_fields.append(k) - print("image fields") - print(image_fields) return image_fields diff --git a/data_explorer/app/widgets.py b/data_explorer/app/widgets.py index c12b0675f..f9c127b62 100644 --- a/data_explorer/app/widgets.py +++ b/data_explorer/app/widgets.py @@ -78,7 +78,7 @@ def build_sidebar() -> Tuple[Optional[str], Optional[str], Optional[Dict]]: def build_explorer_table( - dataframe: Union[dd.DataFrame, pd.DataFrame], image_fields: List[str] + dataframe: Union[dd.DataFrame, pd.DataFrame], image_fields: List[str] ) -> None: """Build the dataframe explorer table. @@ -129,7 +129,7 @@ def build_explorer_table( def build_numeric_analysis_table( - dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: List[str] + dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: Union[List[str], None] ) -> None: """Build the numeric analysis table. @@ -138,7 +138,9 @@ def build_numeric_analysis_table( numeric_fields (List[str]): list of numeric fields """ # check if there are numeric fields - if len(numeric_fields) > 0: + if len(numeric_fields) == 0: + st.warning("There are no numeric fields in this subset") + else: st.write("## Numerical statistics") # make numeric statistics table @@ -159,7 +161,7 @@ def build_numeric_analysis_table( def build_numeric_analysis_plots( - dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: List[str] + dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: Union[List[str], None] ) -> None: """Build the numeric analysis plots. @@ -167,20 +169,24 @@ def build_numeric_analysis_plots( dataframe (Union[dd.DataFrame, pd.DataFrame]): dataframe to explore numeric_fields (List[str]): list of numeric fields """ - st.write("## Show numeric distributions") + # check if there are numeric fields + if len(numeric_fields) == 0: + st.warning("There are no numeric fields in this subset") + else: + st.write("## Show numeric distributions") - # choose a numeric field in dropdown - cols = st.columns(2) - with cols[0]: - numeric_field = st.selectbox("Field", numeric_fields) - with cols[1]: - plot_type = st.selectbox("Plot type", - ["histogram", "violin", "density", "categorical"]) + # choose a numeric field in dropdown + cols = st.columns(2) + with cols[0]: + numeric_field = st.selectbox("Field", numeric_fields) + with cols[1]: + plot_type = st.selectbox("Plot type", + ["histogram", "violin", "density", "categorical"]) - make_numeric_plot(dataframe, numeric_field, plot_type) + make_numeric_plot(dataframe, numeric_field, plot_type) -def build_image_explorer(dataframe: dd.DataFrame, image_fields: List[str]): +def build_image_explorer(dataframe: dd.DataFrame, image_fields: Union[List[str], None]): """Build the image explorer This explorer shows a gallery of the images in a certain column. @@ -188,8 +194,6 @@ def build_image_explorer(dataframe: dd.DataFrame, image_fields: List[str]): dataframe (dd.DataFrame): dataframe to explore image_fields (List[str]): list of image fields """ - st.write("## Image explorer") - st.write("In this table, you can explore the images") if len(image_fields) == 0: st.warning("There are no image fields in this subset") From c439e99e05cae6408d3c22bd576ccc9cd104e6a1 Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:35:58 +0200 Subject: [PATCH 6/7] remove duplicate configs --- data_explorer/app/data.py | 1 - data_explorer/app/numeric_analysis.py | 1 - 2 files changed, 2 deletions(-) diff --git a/data_explorer/app/data.py b/data_explorer/app/data.py index 1ee14b5cd..d3e88c49b 100644 --- a/data_explorer/app/data.py +++ b/data_explorer/app/data.py @@ -13,7 +13,6 @@ from fondant.manifest import Manifest LOGGER = logging.getLogger(__name__) -dask.config.set({"dataframe.convert-string": False}) def is_remote(path: str) -> bool: """Check if path is remote diff --git a/data_explorer/app/numeric_analysis.py b/data_explorer/app/numeric_analysis.py index ec94b3852..d7bd3bcaa 100644 --- a/data_explorer/app/numeric_analysis.py +++ b/data_explorer/app/numeric_analysis.py @@ -9,7 +9,6 @@ LOGGER = logging.getLogger(__name__) -dask.config.set({"dataframe.convert-string": False}) pd.options.plotting.backend = "plotly" From cef20eb0e19040645905392e32de1f520d267b6d Mon Sep 17 00:00:00 2001 From: Philippe Moussalli Date: Wed, 23 Aug 2023 14:41:49 +0200 Subject: [PATCH 7/7] more tweaks --- data_explorer/app/data.py | 4 ++-- data_explorer/app/widgets.py | 9 ++++++--- data_explorer/requirements.txt | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/data_explorer/app/data.py b/data_explorer/app/data.py index d3e88c49b..847b87bd4 100644 --- a/data_explorer/app/data.py +++ b/data_explorer/app/data.py @@ -1,10 +1,9 @@ """This file contains data loading logic""" import json import logging -from typing import List, Tuple +from typing import List from urllib.parse import urlparse -import dask import dask.dataframe as dd import streamlit as st from exceptions import RemoteFileNotFoundException @@ -14,6 +13,7 @@ LOGGER = logging.getLogger(__name__) + def is_remote(path: str) -> bool: """Check if path is remote diff --git a/data_explorer/app/widgets.py b/data_explorer/app/widgets.py index f9c127b62..8bcadc2c9 100644 --- a/data_explorer/app/widgets.py +++ b/data_explorer/app/widgets.py @@ -129,7 +129,7 @@ def build_explorer_table( def build_numeric_analysis_table( - dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: Union[List[str], None] + dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: List[str] ) -> None: """Build the numeric analysis table. @@ -161,7 +161,7 @@ def build_numeric_analysis_table( def build_numeric_analysis_plots( - dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: Union[List[str], None] + dataframe: Union[dd.DataFrame, pd.DataFrame], numeric_fields: List[str] ) -> None: """Build the numeric analysis plots. @@ -186,7 +186,7 @@ def build_numeric_analysis_plots( make_numeric_plot(dataframe, numeric_field, plot_type) -def build_image_explorer(dataframe: dd.DataFrame, image_fields: Union[List[str], None]): +def build_image_explorer(dataframe: dd.DataFrame, image_fields: List[str]): """Build the image explorer This explorer shows a gallery of the images in a certain column. @@ -198,6 +198,9 @@ def build_image_explorer(dataframe: dd.DataFrame, image_fields: Union[List[str], if len(image_fields) == 0: st.warning("There are no image fields in this subset") else: + st.write("## Image explorer") + st.write("In this table, you can explore the images") + image_field = st.selectbox("Image field", image_fields) images = dataframe[image_field].compute() diff --git a/data_explorer/requirements.txt b/data_explorer/requirements.txt index 11ffd89de..f688d5e9f 100644 --- a/data_explorer/requirements.txt +++ b/data_explorer/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/ml6team/fondant@71214bb9de30cd4302d09af97acf117d4b0e9231 +git+https://github.com/ml6team/fondant@main streamlit==1.23.1 streamlit-aggrid==0.3.4 matplotlib==3.7.1