Skip to content

Commit

Permalink
feat: adjust inferred field type
Browse files Browse the repository at this point in the history
  • Loading branch information
longxiaofei committed Sep 6, 2023
1 parent f584aad commit ba197ec
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 33 deletions.
24 changes: 22 additions & 2 deletions pygwalker/data_parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import abc
import io

import arrow

from pygwalker._typing import DataFrame


Expand Down Expand Up @@ -75,12 +77,30 @@ def _infer_prop(
s = self.example_df[col]
orig_fname = self._decode_fname(s)
field_spec = field_specs.get(orig_fname, default_field_spec)
semantic_type = self._infer_semantic(s) if field_spec.semanticType == '?' else field_spec.semanticType
analytic_type = self._infer_analytic(s) if field_spec.analyticType == '?' else field_spec.analyticType
semantic_type = self._infer_semantic(s, orig_fname) if field_spec.semanticType == '?' else field_spec.semanticType
analytic_type = self._infer_analytic(s, orig_fname) if field_spec.analyticType == '?' else field_spec.analyticType
fname = orig_fname if field_spec.display_as is None else field_spec.display_as
return {
'fid': col,
'name': fname,
'semanticType': semantic_type,
'analyticType': analytic_type,
}


def is_temporal_field(value: str) -> bool:
    """Return True when ``arrow.get`` can parse *value* as a date/time.

    Args:
        value: Sample value (normally the string form of one cell) to probe.

    Returns:
        True if ``arrow.get(value)`` succeeds, False if parsing fails.
    """
    try:
        arrow.get(value)
    except (arrow.parser.ParserError, ValueError, TypeError, OverflowError):
        # This helper is a best-effort probe used during type inference, so
        # any "cannot interpret this value" failure means "not temporal"
        # instead of aborting inference. Besides ParserError, arrow can
        # surface plain ValueError/OverflowError for out-of-range date
        # components and TypeError for unsupported argument types.
        return False
    return True


def is_geo_field(field_name: str) -> bool:
    """Return True when *field_name* is a latitude/longitude column name.

    The name is lower-cased and stripped of surrounding spaces and dots, then
    compared against a fixed set of names: ``latitude``, ``longitude``,
    ``lat`` and ``long``.

    Args:
        field_name: Column name to check.

    Returns:
        True if the normalized name is one of the recognised geo names.
    """
    normalized = field_name.lower().strip(" .")
    return normalized in {
        "latitude", "longitude",
        "lat", "long",
    }
29 changes: 19 additions & 10 deletions pygwalker/data_parsers/modin_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from modin import pandas as mpd
import duckdb

from .base import BaseDataFrameDataParser
from .base import BaseDataFrameDataParser, is_temporal_field, is_geo_field
from pygwalker.services.fname_encodings import fname_decode, fname_encode, rename_columns


Expand Down Expand Up @@ -40,19 +40,28 @@ def _init_dataframe(self, df: mpd.DataFrame) -> mpd.DataFrame:
df.columns = [fname_encode(col) for col in rename_columns(list(df.columns))]
return df

def _infer_semantic(self, s: mpd.Series, field_name: str):
    """Infer the semantic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        One of "quantitative", "temporal", "nominal" or "ordinal".
    """
    kind = s.dtype.kind
    v_cnt = len(s.value_counts())

    if (kind in "fcmiu" and v_cnt > 16) or is_geo_field(field_name):
        return "quantitative"
    if kind in "M":
        return "temporal"
    if kind in "bOSUV":
        # Probe one sample value for a date-like string. Use positional
        # access on the non-null values: label-based ``s[0]`` fails when the
        # series is empty or its index has no label 0, and a null first
        # value would hide an otherwise temporal column.
        non_null = s.dropna()
        if len(non_null) > 0 and is_temporal_field(str(non_null.iloc[0])):
            return "temporal"
        return "nominal"
    if v_cnt <= 2:
        return "nominal"
    return "ordinal"

def _infer_analytic(self, s: mpd.Series, field_name: str):
    """Infer the analytic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        "measure" or "dimension".
    """
    # Geo-named columns are always dimensions, whatever their dtype.
    if is_geo_field(field_name):
        return "dimension"

    kind = s.dtype.kind
    if kind in "fcm":
        return "measure"
    # Integer columns count as measures only when they have many distinct values.
    if kind in "iu" and len(s.value_counts()) > 16:
        return "measure"
    return "dimension"

def _decode_fname(self, s: mpd.Series):
fname = fname_decode(s.name).rsplit('_', 1)[0]
Expand Down
29 changes: 19 additions & 10 deletions pygwalker/data_parsers/pandas_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
import duckdb

from .base import BaseDataFrameDataParser
from .base import BaseDataFrameDataParser, is_temporal_field, is_geo_field
from pygwalker.services.fname_encodings import fname_decode, fname_encode, rename_columns


Expand Down Expand Up @@ -35,19 +35,28 @@ def _init_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
df.columns = [fname_encode(col) for col in rename_columns(list(df.columns))]
return df

def _infer_semantic(self, s: pd.Series, field_name: str):
    """Infer the semantic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        One of "quantitative", "temporal", "nominal" or "ordinal".
    """
    kind = s.dtype.kind
    v_cnt = len(s.value_counts())

    if (kind in "fcmiu" and v_cnt > 16) or is_geo_field(field_name):
        return "quantitative"
    if kind in "M":
        return "temporal"
    if kind in "bOSUV":
        # Probe one sample value for a date-like string. Use positional
        # access on the non-null values: label-based ``s[0]`` fails when the
        # series is empty or its index has no label 0, and a null first
        # value would hide an otherwise temporal column.
        non_null = s.dropna()
        if len(non_null) > 0 and is_temporal_field(str(non_null.iloc[0])):
            return "temporal"
        return "nominal"
    if v_cnt <= 2:
        return "nominal"
    return "ordinal"

def _infer_analytic(self, s: pd.Series, field_name: str):
    """Infer the analytic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        "measure" or "dimension".
    """
    # Geo-named columns are always dimensions, whatever their dtype.
    if is_geo_field(field_name):
        return "dimension"

    kind = s.dtype.kind
    if kind in "fcm":
        return "measure"
    # Integer columns count as measures only when they have many distinct values.
    if kind in "iu" and len(s.value_counts()) > 16:
        return "measure"
    return "dimension"

def _decode_fname(self, s: pd.Series):
fname = fname_decode(s.name).rsplit('_', 1)[0]
Expand Down
32 changes: 22 additions & 10 deletions pygwalker/data_parsers/polars_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import polars as pl
import duckdb

from .base import BaseDataFrameDataParser
from .base import BaseDataFrameDataParser, is_temporal_field, is_geo_field
from pygwalker.services.fname_encodings import fname_decode, fname_encode, rename_columns


Expand Down Expand Up @@ -37,19 +37,31 @@ def _init_dataframe(self, df: pl.DataFrame) -> pl.DataFrame:
})
return df

def _infer_semantic(self, s: pl.Series, field_name: str):
    """Infer the semantic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        One of "quantitative", "temporal", "nominal" or "ordinal".
    """
    kind = s.dtype
    v_cnt = len(s.value_counts())
    nominal_dtypes = [pl.Boolean, pl.Object, pl.Utf8, pl.Categorical, pl.Struct, pl.List]

    if (kind in pl.NUMERIC_DTYPES and v_cnt > 16) or is_geo_field(field_name):
        return "quantitative"
    if kind in pl.TEMPORAL_DTYPES:
        return "temporal"
    if kind in nominal_dtypes:
        # Only probe non-numeric columns for date-like strings, matching the
        # pandas/modin parsers; otherwise a low-cardinality integer column
        # such as years could be classified as temporal. The length guard
        # avoids indexing into an empty series.
        if len(s) > 0 and is_temporal_field(str(s[0])):
            return "temporal"
        return "nominal"
    if v_cnt <= 2:
        return "nominal"
    return "ordinal"

def _infer_analytic(self, s: pl.Series, field_name: str):
    """Infer the analytic type of a column.

    Args:
        s: Column data.
        field_name: Decoded column name, used for the geo-name check.

    Returns:
        "measure" or "dimension".
    """
    # Geo-named columns are always dimensions, whatever their dtype.
    if is_geo_field(field_name):
        return "dimension"

    kind = s.dtype
    if kind in (pl.FLOAT_DTYPES | pl.DURATION_DTYPES):
        return "measure"
    # Integer columns count as measures only when they have many distinct values.
    if kind in pl.INTEGER_DTYPES and len(s.value_counts()) > 16:
        return "measure"
    return "dimension"

def _decode_fname(self, s: pl.Series):
fname = fname_decode(s.name).rsplit('_', 1)[0]
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ dependencies = [
"duckdb",
"pyarrow",
"sqlglot",
"requests"
"requests",
"arrow"
]
[project.urls]
homepage = "https://github.com/Kanaries/pygwalker"
Expand Down

0 comments on commit ba197ec

Please sign in to comment.