From 20bf9a39be7aea47614020cd58c532a01a628e25 Mon Sep 17 00:00:00 2001 From: dding3 Date: Mon, 29 Aug 2022 14:53:06 -0700 Subject: [PATCH 1/3] merge get schema and get class name in shards into 1 rdd operation --- python/orca/src/bigdl/orca/data/shard.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/python/orca/src/bigdl/orca/data/shard.py b/python/orca/src/bigdl/orca/data/shard.py index 044fa8a9c35..d6862a23ce5 100644 --- a/python/orca/src/bigdl/orca/data/shard.py +++ b/python/orca/src/bigdl/orca/data/shard.py @@ -454,20 +454,29 @@ def _get_schema(self): if 'schema' in self.type: return self.type['schema'] else: - if self._get_class_name() == 'pandas.core.frame.DataFrame': - import pandas as pd - columns, dtypes = self.rdd.map(lambda x: (x.columns, x.dtypes)).first() - self.type['schema'] = {'columns': columns, 'dtypes': dtypes} - return self.type['schema'] - return None + class_name, schema = self._get_schema_class_name() + self.type['class_name'] = class_name + self.type['schema'] = schema + return self.type['schema'] def _get_class_name(self): if 'class_name' in self.type: return self.type['class_name'] else: - self.type['class_name'] = self._for_each(get_class_name).first() + class_name, schema = self._get_schema_class_name() + self.type['class_name'] = class_name + self.type['schema'] = schema return self.type['class_name'] + def _get_schema_class_name(self): + def func(x): + class_name = get_class_name(x) + schema = None + if class_name == 'pandas.core.frame.DataFrame': + schema = {'columns': x.columns, 'dtypes': x.dtypes} + return (class_name, schema) + return self.rdd.map(lambda x: func(x)).first() + class SharedValue(object): def __init__(self, data): From 15057367339be497f3839a67e76ae6c71e4d3e87 Mon Sep 17 00:00:00 2001 From: dding3 Date: Wed, 31 Aug 2022 09:06:00 -0700 Subject: [PATCH 2/3] fix comments --- python/orca/src/bigdl/orca/data/shard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/orca/src/bigdl/orca/data/shard.py b/python/orca/src/bigdl/orca/data/shard.py index 067c9efd5dd..da54ae3052c 100644 --- a/python/orca/src/bigdl/orca/data/shard.py +++ b/python/orca/src/bigdl/orca/data/shard.py @@ -598,7 +598,8 @@ def utility_func(x, func, *args, **kwargs): def get_schema(self): if 'schema' in self.type: return self.type['schema'] - else: + + if 'class_name' not in self.type or self.type['class_name'] == 'pandas.core.frame.DataFrame': class_name, schema = self._get_schema_class_name() self.type['class_name'] = class_name self.type['schema'] = schema From c01192944fe71d971d5e31fbb626b60b5fdabed9 Mon Sep 17 00:00:00 2001 From: dding3 Date: Wed, 31 Aug 2022 11:37:20 -0700 Subject: [PATCH 3/3] fix style check failure --- python/orca/src/bigdl/orca/data/shard.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/orca/src/bigdl/orca/data/shard.py b/python/orca/src/bigdl/orca/data/shard.py index da54ae3052c..587aebeb04a 100644 --- a/python/orca/src/bigdl/orca/data/shard.py +++ b/python/orca/src/bigdl/orca/data/shard.py @@ -599,7 +599,8 @@ def get_schema(self): if 'schema' in self.type: return self.type['schema'] - if 'class_name' not in self.type or self.type['class_name'] == 'pandas.core.frame.DataFrame': + if 'class_name' not in self.type\ + or self.type['class_name'] == 'pandas.core.frame.DataFrame': class_name, schema = self._get_schema_class_name() self.type['class_name'] = class_name self.type['schema'] = schema