[ORCA] Add xshards example (intel-analytics#5542)

* update sh * add data example 2 * add data example 2 * fix style * fix time * fix time * fix time Co-authored-by: yansu <[email protected]>
ForJadeForest · Sep 20, 2022 · 2c2cdd9 · 2c2cdd9
1 parent 404a452
commit 2c2cdd9
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 2 deletions.
diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh
@@ -26,7 +26,7 @@ ray stop -f
 cd "`dirname $0`"
 cd ../../tutorial/xshards
 
-echo "Running Xshards tests"
+echo "#1 Running run-tabular_playground_series"
 
 #timer
 start=$(date "+%s")
@@ -43,4 +43,24 @@ python tabular_playground_series.py
 now=$(date "+%s")
 time1=$((now - start))
 
-echo "Running Xshards tests time used: $time1 seconds"
+echo "#2 Running titanic"
+
+#timer
+start=$(date "+%s")
+
+if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/titanic.csv ]
+then
+    echo "titanic.csv already exists"
+else
+    wget -nv $FTP_URI/analytics-zoo-data/xshards/titanic.csv -P ${BIGDL_ROOT}/python/orca/tutorial/xshards/
+fi
+
+python titanic.py
+
+rm -rf result
+
+now=$(date "+%s")
+time2=$((now - start))
+
+echo "#1 Running run-tabular_playground_series time used: $time1 seconds"
+echo "#2 Running titanic time used: $time2 seconds"
diff --git a/python/orca/tutorial/xshards/titanic.py b/python/orca/tutorial/xshards/titanic.py
@@ -0,0 +1,59 @@
+#
+# Copyright 2016 The BigDL Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This example is adapted from
+# https://www.kaggle.com/code/chuanguy/titanic-data-processing-with-python-0-813/notebook
+
+from bigdl.orca import init_orca_context, stop_orca_context
+import bigdl.orca.data.pandas
+
+init_orca_context(cluster_mode="local", cores=4, memory="3g")
+
+file_path = "titanic.csv"
+data_shard = bigdl.orca.data.pandas.read_csv(file_path)
+
+# drop
+def drop_passenger(df):
+    df = df.drop(['PassengerId'], axis=1)
+    return df
+data_shard = data_shard.transform_shard(drop_passenger)
+
+
+# fillna, apply, replace, map
+def process_cabin(df):
+    df['Cabin'] = df['Cabin'].fillna('X')
+    df['Cabin'] = df['Cabin'].apply(lambda x: str(x)[0])
+    df['Cabin'] = df['Cabin'].replace(['A', 'D', 'E', 'T'], 'M')
+    df['Cabin'] = df['Cabin'].replace(['B', 'C'], 'H')
+    df['Cabin'] = df['Cabin'].replace(['F', 'G'], 'L')
+    df['Cabin'] = df['Cabin'].map({'X': 0, 'L': 1, 'M': 2, 'H': 3})
+    df['Cabin'] = df['Cabin'].astype(int)
+    return df
+data_shard = data_shard.transform_shard(process_cabin)
+
+
+# astype, loc
+def encode(data):
+    data['Sex'] = data['Sex'].map({'female': 1, 'male': 0})
+    data['Pclass'] = data['Pclass'].map({1: 3, 2: 2, 3: 1}).astype(int)
+    data.loc[data['Sex'] == 0, 'SexByPclass'] = data.loc[data['Sex'] == 0, 'Pclass']
+    data.loc[data['Sex'] == 1, 'SexByPclass'] = data.loc[data['Sex'] == 1, 'Pclass'] + 3
+    return data
+data_shard = data_shard.transform_shard(encode)
+
+# save
+data_shard.save_pickle('./result')
+
+stop_orca_context()