From 3e023a5391bc1a5239dec0739d7a8c4d1b6f62d6 Mon Sep 17 00:00:00 2001 From: yansu Date: Tue, 23 Aug 2022 15:02:05 +0800 Subject: [PATCH 01/14] Add Xshards test to github actions --- .../workflows/nb-orca-tutorial-xshards.yml | 92 +++++++++++++++++++ python/orca/dev/test/run-tutorial-xshards.sh | 28 ++++++ .../xshards/tabular_playground_series.py | 2 +- 3 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/nb-orca-tutorial-xshards.yml create mode 100755 python/orca/dev/test/run-tutorial-xshards.sh diff --git a/.github/workflows/nb-orca-tutorial-xshards.yml b/.github/workflows/nb-orca-tutorial-xshards.yml new file mode 100644 index 00000000000..0072652105d --- /dev/null +++ b/.github/workflows/nb-orca-tutorial-xshards.yml @@ -0,0 +1,92 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Nightly Build Orca Tutorial Xshards + +on: + #release: + # types: [published] + pull_request: + branches: [ "main" ] + paths: + - 'python/orca/src/bigdl/orca/data/**' + - 'python/orca/src/bigdl/orca/learn/metrics.py' + - 'python/orca/src/bigdl/orca/learn/util.py' + - 'python/orca/src/bigdl/orca/learn/ray_estimator.py' + - 'python/orca/src/bigdl/orca/learn/pytorch/**' + schedule: + - cron: '0 15 * * *' + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + build: + + runs-on: [ubuntu-20.04-lts] + permissions: + contents: read + packages: write + + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 8 + uses: actions/setup-java@v3 + with: + java-version: '8' + distribution: 'temurin' + #server-id: github # Value of the distributionManagement/repository/id field of the pom.xml + settings-path: ${{ github.workspace }} # location for the settings.xml file + + - name: Set up Maven + uses: stCarolas/setup-maven@v4.4 + with: + maven-version: 3.8.2 + + - name: Set up Maven Settings + uses: s4u/maven-settings-action@v2.6.0 + with: + sonatypeSnapshots: true + apacheSnapshots: true + servers: | + [{ + "id": "central", + "configuration": { + "httpConfiguration": { + "all": { + "connectionTimeout": "3600000", + "readTimeout": "3600000" + } + } + } + }] + mirrors: '[{"id": "ardaNexus", "name": "ardaNexus", "mirrorOf": "*", "url": "${NEXUS_URL}" }]' + + - name: Setup Env + run: | + apt-get update + apt-get install wget + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.7' + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade setuptools==58.0.4 + pip uninstall -y bigdl-friesian bigdl-friesian-spark3 bigdl-dllib bigdl-dllib-spark3 bigdl-orca pyspark bigdl-orca-spark3 bigdl-chronos bigdl-chronos-spark3 bigdl-friesian bigdl-friesian-spark3 + pip install -i https://pypi.org/simple --pre --upgrade bigdl-orca-spark3 + pip install numpy==1.18.5 + - name: Run Test + #run: python -m build + run: | + export SPARK_LOCAL_HOSTNAME=localhost + chmod a+x python/orca/dev/test/run-tutorial-xshards.sh + python/orca/dev/test/run-tutorial-xshards.sh + env: + BIGDL_ROOT: ${{ github.workspace }} \ No newline at end of file diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh new file mode 100755 index 00000000000..88a7df4b29d --- /dev/null +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -0,0 +1,28 @@ +# +# Copyright 2016 The BigDL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -ex + +cd "`dirname $0`" + +export PYSPARK_PYTHON=python +export PYSPARK_DRIVER_PYTHON=python + +ray stop -f + +cd ../../ +echo "Running RayOnSpark tests" +python tutorial/xshards/tabular_playground_series.py diff --git a/python/orca/tutorial/xshards/tabular_playground_series.py b/python/orca/tutorial/xshards/tabular_playground_series.py index 01ad8e797c7..7c4bad2e776 100644 --- a/python/orca/tutorial/xshards/tabular_playground_series.py +++ b/python/orca/tutorial/xshards/tabular_playground_series.py @@ -30,7 +30,7 @@ init_orca_context(cluster_mode="local", cores=4, memory="3g") # Load data -file_path = 'train.csv' +file_path = '/home/yansu/Desktop/yxy/data_example/train.csv' data_shard = bigdl.orca.data.pandas.read_csv(file_path) # Drop duplicate columns From 1920cd1a2d3a656b9bf58740c090d8c7c2e1822b Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 12:29:44 +0800 Subject: [PATCH 02/14] add file path --- .../workflows/nb-orca-tutorial-xshards.yml | 1 + python/orca/dev/test/run-tutorial-xshards.sh | 3 +- .../xshards/tabular_playground_series.py | 183 +++++++++--------- 3 files changed, 99 insertions(+), 88 deletions(-) diff --git a/.github/workflows/nb-orca-tutorial-xshards.yml b/.github/workflows/nb-orca-tutorial-xshards.yml index 0072652105d..253c51cba38 100644 --- a/.github/workflows/nb-orca-tutorial-xshards.yml +++ b/.github/workflows/nb-orca-tutorial-xshards.yml @@ -86,6 +86,7 @@ jobs: #run: python -m build run: | export SPARK_LOCAL_HOSTNAME=localhost + export FTP_URI=ftp://zoo:1234qwer@10.112.231.51 chmod a+x python/orca/dev/test/run-tutorial-xshards.sh python/orca/dev/test/run-tutorial-xshards.sh env: diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index 88a7df4b29d..be195f00fc5 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -14,6 +14,7 @@ # limitations under the License. # +export FTP_URI=$FTP_URI set -ex cd "`dirname $0`" @@ -25,4 +26,4 @@ ray stop -f cd ../../ echo "Running RayOnSpark tests" -python tutorial/xshards/tabular_playground_series.py +python tutorial/xshards/tabular_playground_series.py --path './xshards/train.csv' diff --git a/python/orca/tutorial/xshards/tabular_playground_series.py b/python/orca/tutorial/xshards/tabular_playground_series.py index 7c4bad2e776..5e5374bad60 100644 --- a/python/orca/tutorial/xshards/tabular_playground_series.py +++ b/python/orca/tutorial/xshards/tabular_playground_series.py @@ -16,6 +16,8 @@ # This example is adapted from # https://www.kaggle.com/code/remekkinas/tps-5-pytorch-nn-for-tabular-step-by-step/notebook +import argparse + from sklearn.model_selection import train_test_split import torch import torch.nn as nn @@ -27,101 +29,108 @@ from bigdl.orca.learn.pytorch import Estimator from bigdl.orca.learn.metrics import Accuracy -init_orca_context(cluster_mode="local", cores=4, memory="3g") - -# Load data -file_path = '/home/yansu/Desktop/yxy/data_example/train.csv' -data_shard = bigdl.orca.data.pandas.read_csv(file_path) - -# Drop duplicate columns -data_shard = data_shard.deduplicates() - -# Labelencode y -def change_col_name(df): - df = df.rename(columns={'id': 'id0'}) - return df -data_shard = data_shard.transform_shard(change_col_name) -encode = StringIndexer(inputCol='target') -data_shard = encode.fit_transform(data_shard) -def change_val(df): - df['target'] = df['target']-1 - return df -data_shard = data_shard.transform_shard(change_val) - -# Split train and test set -def split_train_test(data): - RANDOM_STATE = 2021 - train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE) - return train, test -train_shard, val_shard = data_shard.transform_shard(split_train_test).split() - -# Transform the feature columns -feature_list = [] -for i in range(50): - feature_list.append('feature_' + str(i)) -scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled") -train_shard = scale.fit_transform(train_shard) -val_shard = scale.transform(val_shard) - -# Change data types -def change_data_type(df): - df['x_scaled'] = df['x_scaled'].apply(lambda x: np.array(x, dtype=np.float32)) - df['target'] = df['target'].apply(lambda x: np.long(x)) - return df -train_shard = train_shard.transform_shard(change_data_type) -val_shard = val_shard.transform_shard(change_data_type) - -# Model -torch.manual_seed(0) -BATCH_SIZE = 64 -NUM_CLASSES = 4 -NUM_EPOCHS = 1 -NUM_FEATURES = 50 - -def linear_block(in_features, out_features, p_drop, *args, **kwargs): - return nn.Sequential( - nn.Linear(in_features, out_features), - nn.ReLU(), - nn.Dropout(p=p_drop) - ) - -class TPS05ClassificationSeq(nn.Module): - def __init__(self): - super(TPS05ClassificationSeq, self).__init__() - num_feature = NUM_FEATURES - num_class = NUM_CLASSES - self.linear = nn.Sequential( - linear_block(num_feature, 100, 0.3), - linear_block(100, 250, 0.3), - linear_block(250, 128, 0.3), +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--path', type=str, + default="./xshards/train.csv", + help='Training data path') + opt = parser.parse_args() + + init_orca_context(cluster_mode="local", cores=4, memory="3g") + + # Load data + # file_path = 'train.csv' + data_shard = bigdl.orca.data.pandas.read_csv(opt.path) + + # Drop duplicate columns + data_shard = data_shard.deduplicates() + + # Labelencode y + def change_col_name(df): + df = df.rename(columns={'id': 'id0'}) + return df + data_shard = data_shard.transform_shard(change_col_name) + encode = StringIndexer(inputCol='target') + data_shard = encode.fit_transform(data_shard) + def change_val(df): + df['target'] = df['target']-1 + return df + data_shard = data_shard.transform_shard(change_val) + + # Split train and test set + def split_train_test(data): + RANDOM_STATE = 2021 + train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE) + return train, test + train_shard, val_shard = data_shard.transform_shard(split_train_test).split() + + # Transform the feature columns + feature_list = [] + for i in range(50): + feature_list.append('feature_' + str(i)) + scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled") + train_shard = scale.fit_transform(train_shard) + val_shard = scale.transform(val_shard) + + # Change data types + def change_data_type(df): + df['x_scaled'] = df['x_scaled'].apply(lambda x: np.array(x, dtype=np.float32)) + df['target'] = df['target'].apply(lambda x: np.long(x)) + return df + train_shard = train_shard.transform_shard(change_data_type) + val_shard = val_shard.transform_shard(change_data_type) + + # Model + torch.manual_seed(0) + BATCH_SIZE = 64 + NUM_CLASSES = 4 + NUM_EPOCHS = 1 + NUM_FEATURES = 50 + + def linear_block(in_features, out_features, p_drop, *args, **kwargs): + return nn.Sequential( + nn.Linear(in_features, out_features), + nn.ReLU(), + nn.Dropout(p=p_drop) ) - self.out = nn.Sequential( - nn.Linear(128, num_class) - ) + class TPS05ClassificationSeq(nn.Module): + def __init__(self): + super(TPS05ClassificationSeq, self).__init__() + num_feature = NUM_FEATURES + num_class = NUM_CLASSES + self.linear = nn.Sequential( + linear_block(num_feature, 100, 0.3), + linear_block(100, 250, 0.3), + linear_block(250, 128, 0.3), + ) + + self.out = nn.Sequential( + nn.Linear(128, num_class) + ) - def forward(self, x): - x = self.linear(x) - return self.out(x) + def forward(self, x): + x = self.linear(x) + return self.out(x) -def model_creator(config): - model = TPS05ClassificationSeq() - return model + def model_creator(config): + model = TPS05ClassificationSeq() + return model -def optim_creator(model, config): - return optim.Adam(model.parameters(), lr=0.001) + def optim_creator(model, config): + return optim.Adam(model.parameters(), lr=0.001) -criterion = nn.CrossEntropyLoss() + criterion = nn.CrossEntropyLoss() -est = Estimator.from_torch(model=model_creator, optimizer=optim_creator, - loss=criterion, metrics=[Accuracy()], backend="ray") + est = Estimator.from_torch(model=model_creator, optimizer=optim_creator, + loss=criterion, metrics=[Accuracy()], backend="ray") -est.fit(data=train_shard, feature_cols=['x_scaled'], label_cols=['target'], - validation_data=val_shard, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE) + est.fit(data=train_shard, feature_cols=['x_scaled'], label_cols=['target'], + validation_data=val_shard, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE) -result = est.evaluate(data=val_shard, feature_cols=['x_scaled'], label_cols=['target'], batch_size=BATCH_SIZE) + result = est.evaluate(data=val_shard, feature_cols=['x_scaled'], label_cols=['target'], batch_size=BATCH_SIZE) -for r in result: - print(r, ":", result[r]) + for r in result: + print(r, ":", result[r]) -stop_orca_context() + stop_orca_context() From a97e614f87d70f457f116eb33555b1bacbd71a85 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 13:59:19 +0800 Subject: [PATCH 03/14] fix yml --- .github/workflows/nb-orca-tutorial-xshards.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nb-orca-tutorial-xshards.yml b/.github/workflows/nb-orca-tutorial-xshards.yml index 253c51cba38..9a392b136e4 100644 --- a/.github/workflows/nb-orca-tutorial-xshards.yml +++ b/.github/workflows/nb-orca-tutorial-xshards.yml @@ -14,11 +14,9 @@ on: pull_request: branches: [ "main" ] paths: - - 'python/orca/src/bigdl/orca/data/**' - - 'python/orca/src/bigdl/orca/learn/metrics.py' - - 'python/orca/src/bigdl/orca/learn/util.py' - - 'python/orca/src/bigdl/orca/learn/ray_estimator.py' - - 'python/orca/src/bigdl/orca/learn/pytorch/**' + - 'python/orca/src/bigdl/**' + - 'python/orca/tutorial/**' + - '.github/workflows/nb-orca-tutorial-xshards.yml' schedule: - cron: '0 15 * * *' # Allows you to run this workflow manually from the Actions tab From 468ee3e101a088d9128c2c46de0097324660e9e6 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 14:08:14 +0800 Subject: [PATCH 04/14] fix csv path --- python/orca/dev/test/run-tutorial-xshards.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index be195f00fc5..312e71847eb 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -26,4 +26,4 @@ ray stop -f cd ../../ echo "Running RayOnSpark tests" -python tutorial/xshards/tabular_playground_series.py --path './xshards/train.csv' +python tutorial/xshards/tabular_playground_series.py --path 'tutorial/xshards/train.csv' From d218498f79710744e7da908e14eac72b62bb2722 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 14:13:00 +0800 Subject: [PATCH 05/14] fix csv path --- python/orca/dev/test/run-tutorial-xshards.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index 312e71847eb..adf52770958 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -26,4 +26,4 @@ ray stop -f cd ../../ echo "Running RayOnSpark tests" -python tutorial/xshards/tabular_playground_series.py --path 'tutorial/xshards/train.csv' +python tutorial/xshards/tabular_playground_series.py --path './tutorial/xshards/train.csv' From a9473875c82944ba873c263f3e1bdf8f8ae19089 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 14:14:36 +0800 Subject: [PATCH 06/14] fix yml --- .github/workflows/nb-orca-tutorial-xshards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nb-orca-tutorial-xshards.yml b/.github/workflows/nb-orca-tutorial-xshards.yml index 9a392b136e4..14ab18efce3 100644 --- a/.github/workflows/nb-orca-tutorial-xshards.yml +++ b/.github/workflows/nb-orca-tutorial-xshards.yml @@ -25,7 +25,7 @@ on: jobs: build: - runs-on: [ubuntu-20.04-lts] + runs-on: [self-hosted, Gondolin, ubuntu-20.04-lts] permissions: contents: read packages: write From ed669197b58a8284462e9dba372f5f066bcfbd4f Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 15:00:55 +0800 Subject: [PATCH 07/14] add wget --- ...-xshards.yml => orca-tutorial-xshards.yml} | 5 +- python/orca/dev/test/run-tutorial-xshards.sh | 12 +- .../xshards/tabular_playground_series.py | 177 +++++++++--------- 3 files changed, 97 insertions(+), 97 deletions(-) rename .github/workflows/{nb-orca-tutorial-xshards.yml => orca-tutorial-xshards.yml} (95%) diff --git a/.github/workflows/nb-orca-tutorial-xshards.yml b/.github/workflows/orca-tutorial-xshards.yml similarity index 95% rename from .github/workflows/nb-orca-tutorial-xshards.yml rename to .github/workflows/orca-tutorial-xshards.yml index 14ab18efce3..935f5c3ee00 100644 --- a/.github/workflows/nb-orca-tutorial-xshards.yml +++ b/.github/workflows/orca-tutorial-xshards.yml @@ -6,7 +6,7 @@ # separate terms of service, privacy policy, and support # documentation. -name: Nightly Build Orca Tutorial Xshards +name: Orca Tutorial Xshards on: #release: @@ -16,7 +16,7 @@ on: paths: - 'python/orca/src/bigdl/**' - 'python/orca/tutorial/**' - - '.github/workflows/nb-orca-tutorial-xshards.yml' + - '.github/workflows/orca-tutorial-xshards.yml' schedule: - cron: '0 15 * * *' # Allows you to run this workflow manually from the Actions tab @@ -79,7 +79,6 @@ jobs: python -m pip install --upgrade setuptools==58.0.4 pip uninstall -y bigdl-friesian bigdl-friesian-spark3 bigdl-dllib bigdl-dllib-spark3 bigdl-orca pyspark bigdl-orca-spark3 bigdl-chronos bigdl-chronos-spark3 bigdl-friesian bigdl-friesian-spark3 pip install -i https://pypi.org/simple --pre --upgrade bigdl-orca-spark3 - pip install numpy==1.18.5 - name: Run Test #run: python -m build run: | diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index adf52770958..3f56de32f4e 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -26,4 +26,14 @@ ray stop -f cd ../../ echo "Running RayOnSpark tests" -python tutorial/xshards/tabular_playground_series.py --path './tutorial/xshards/train.csv' + +if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/train.csv ] +then + echo "train.csv already exists" +else + wget -nv $FTP_URI/xshards/train.csv -P ${BIGDL_ROOT}/python/orca/tutorial/xshards/ +fi + +#sed -i "s/train.csv/train.csv/g" ${BIGDL_ROOT}/python/orca/tutorial/xshards/tabular_playground_series.py + +python tutorial/xshards/tabular_playground_series.py diff --git a/python/orca/tutorial/xshards/tabular_playground_series.py b/python/orca/tutorial/xshards/tabular_playground_series.py index 5e5374bad60..f845d178a43 100644 --- a/python/orca/tutorial/xshards/tabular_playground_series.py +++ b/python/orca/tutorial/xshards/tabular_playground_series.py @@ -29,108 +29,99 @@ from bigdl.orca.learn.pytorch import Estimator from bigdl.orca.learn.metrics import Accuracy -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--path', type=str, - default="./xshards/train.csv", - help='Training data path') - opt = parser.parse_args() - - init_orca_context(cluster_mode="local", cores=4, memory="3g") - - # Load data - # file_path = 'train.csv' - data_shard = bigdl.orca.data.pandas.read_csv(opt.path) - - # Drop duplicate columns - data_shard = data_shard.deduplicates() - - # Labelencode y - def change_col_name(df): - df = df.rename(columns={'id': 'id0'}) - return df - data_shard = data_shard.transform_shard(change_col_name) - encode = StringIndexer(inputCol='target') - data_shard = encode.fit_transform(data_shard) - def change_val(df): - df['target'] = df['target']-1 - return df - data_shard = data_shard.transform_shard(change_val) - - # Split train and test set - def split_train_test(data): - RANDOM_STATE = 2021 - train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE) - return train, test - train_shard, val_shard = data_shard.transform_shard(split_train_test).split() - - # Transform the feature columns - feature_list = [] - for i in range(50): - feature_list.append('feature_' + str(i)) - scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled") - train_shard = scale.fit_transform(train_shard) - val_shard = scale.transform(val_shard) - - # Change data types - def change_data_type(df): - df['x_scaled'] = df['x_scaled'].apply(lambda x: np.array(x, dtype=np.float32)) - df['target'] = df['target'].apply(lambda x: np.long(x)) - return df - train_shard = train_shard.transform_shard(change_data_type) - val_shard = val_shard.transform_shard(change_data_type) - - # Model - torch.manual_seed(0) - BATCH_SIZE = 64 - NUM_CLASSES = 4 - NUM_EPOCHS = 1 - NUM_FEATURES = 50 - - def linear_block(in_features, out_features, p_drop, *args, **kwargs): - return nn.Sequential( - nn.Linear(in_features, out_features), - nn.ReLU(), - nn.Dropout(p=p_drop) +# Load data +file_path = 'train.csv' +data_shard = bigdl.orca.data.pandas.read_csv(opt.path) + +# Drop duplicate columns +data_shard = data_shard.deduplicates() + +# Labelencode y +def change_col_name(df): + df = df.rename(columns={'id': 'id0'}) + return df +data_shard = data_shard.transform_shard(change_col_name) +encode = StringIndexer(inputCol='target') +data_shard = encode.fit_transform(data_shard) +def change_val(df): + df['target'] = df['target']-1 + return df +data_shard = data_shard.transform_shard(change_val) + +# Split train and test set +def split_train_test(data): + RANDOM_STATE = 2021 + train, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE) + return train, test +train_shard, val_shard = data_shard.transform_shard(split_train_test).split() + +# Transform the feature columns +feature_list = [] +for i in range(50): + feature_list.append('feature_' + str(i)) +scale = MinMaxScaler(inputCol=feature_list, outputCol="x_scaled") +train_shard = scale.fit_transform(train_shard) +val_shard = scale.transform(val_shard) + +# Change data types +def change_data_type(df): + df['x_scaled'] = df['x_scaled'].apply(lambda x: np.array(x, dtype=np.float32)) + df['target'] = df['target'].apply(lambda x: np.long(x)) + return df +train_shard = train_shard.transform_shard(change_data_type) +val_shard = val_shard.transform_shard(change_data_type) + +# Model +torch.manual_seed(0) +BATCH_SIZE = 64 +NUM_CLASSES = 4 +NUM_EPOCHS = 1 +NUM_FEATURES = 50 + +def linear_block(in_features, out_features, p_drop, *args, **kwargs): + return nn.Sequential( + nn.Linear(in_features, out_features), + nn.ReLU(), + nn.Dropout(p=p_drop) + ) + +class TPS05ClassificationSeq(nn.Module): + def __init__(self): + super(TPS05ClassificationSeq, self).__init__() + num_feature = NUM_FEATURES + num_class = NUM_CLASSES + self.linear = nn.Sequential( + linear_block(num_feature, 100, 0.3), + linear_block(100, 250, 0.3), + linear_block(250, 128, 0.3), ) - class TPS05ClassificationSeq(nn.Module): - def __init__(self): - super(TPS05ClassificationSeq, self).__init__() - num_feature = NUM_FEATURES - num_class = NUM_CLASSES - self.linear = nn.Sequential( - linear_block(num_feature, 100, 0.3), - linear_block(100, 250, 0.3), - linear_block(250, 128, 0.3), - ) - - self.out = nn.Sequential( - nn.Linear(128, num_class) - ) + self.out = nn.Sequential( + nn.Linear(128, num_class) + ) - def forward(self, x): - x = self.linear(x) - return self.out(x) + def forward(self, x): + x = self.linear(x) + return self.out(x) - def model_creator(config): - model = TPS05ClassificationSeq() - return model +def model_creator(config): + model = TPS05ClassificationSeq() + return model - def optim_creator(model, config): - return optim.Adam(model.parameters(), lr=0.001) +def optim_creator(model, config): + return optim.Adam(model.parameters(), lr=0.001) - criterion = nn.CrossEntropyLoss() +criterion = nn.CrossEntropyLoss() - est = Estimator.from_torch(model=model_creator, optimizer=optim_creator, +est = Estimator.from_torch(model=model_creator, optimizer=optim_creator, loss=criterion, metrics=[Accuracy()], backend="ray") - est.fit(data=train_shard, feature_cols=['x_scaled'], label_cols=['target'], - validation_data=val_shard, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE) +est.fit(data=train_shard, feature_cols=['x_scaled'], label_cols=['target'], + validation_data=val_shard, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE) - result = est.evaluate(data=val_shard, feature_cols=['x_scaled'], label_cols=['target'], batch_size=BATCH_SIZE) +result = est.evaluate(data=val_shard, feature_cols=['x_scaled'], label_cols=['target'], batch_size=BATCH_SIZE) - for r in result: - print(r, ":", result[r]) +for r in result: + print(r, ":", result[r]) - stop_orca_context() +stop_orca_context() From fdf12152beb733d0041dde2d1b8c70440d06b343 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 15:04:03 +0800 Subject: [PATCH 08/14] fic typo --- python/orca/tutorial/xshards/tabular_playground_series.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/orca/tutorial/xshards/tabular_playground_series.py b/python/orca/tutorial/xshards/tabular_playground_series.py index f845d178a43..45c2e9aa34c 100644 --- a/python/orca/tutorial/xshards/tabular_playground_series.py +++ b/python/orca/tutorial/xshards/tabular_playground_series.py @@ -29,9 +29,11 @@ from bigdl.orca.learn.pytorch import Estimator from bigdl.orca.learn.metrics import Accuracy +init_orca_context(cluster_mode="local", cores=4, memory="3g") + # Load data file_path = 'train.csv' -data_shard = bigdl.orca.data.pandas.read_csv(opt.path) +data_shard = bigdl.orca.data.pandas.read_csv(file_path) # Drop duplicate columns data_shard = data_shard.deduplicates() From 2a6c76c3ff39fe43c0f97980401c847c905bf671 Mon Sep 17 00:00:00 2001 From: yansu Date: Wed, 24 Aug 2022 15:05:06 +0800 Subject: [PATCH 09/14] fix typo --- python/orca/tutorial/xshards/tabular_playground_series.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/orca/tutorial/xshards/tabular_playground_series.py b/python/orca/tutorial/xshards/tabular_playground_series.py index 45c2e9aa34c..01ad8e797c7 100644 --- a/python/orca/tutorial/xshards/tabular_playground_series.py +++ b/python/orca/tutorial/xshards/tabular_playground_series.py @@ -16,8 +16,6 @@ # This example is adapted from # https://www.kaggle.com/code/remekkinas/tps-5-pytorch-nn-for-tabular-step-by-step/notebook -import argparse - from sklearn.model_selection import train_test_split import torch import torch.nn as nn @@ -116,7 +114,7 @@ def optim_creator(model, config): criterion = nn.CrossEntropyLoss() est = Estimator.from_torch(model=model_creator, optimizer=optim_creator, - loss=criterion, metrics=[Accuracy()], backend="ray") + loss=criterion, metrics=[Accuracy()], backend="ray") est.fit(data=train_shard, feature_cols=['x_scaled'], label_cols=['target'], validation_data=val_shard, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE) From 1366e6823bdd60c3cf00bafa02c5e47e8a0b3306 Mon Sep 17 00:00:00 2001 From: Nancy <51090416+yexinyinancy@users.noreply.github.com> Date: Wed, 24 Aug 2022 16:56:32 +0800 Subject: [PATCH 10/14] change csv file path --- python/orca/dev/test/run-tutorial-xshards.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index 3f56de32f4e..362c8fb35f2 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -31,7 +31,7 @@ if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/train.csv ] then echo "train.csv already exists" else - wget -nv $FTP_URI/xshards/train.csv -P ${BIGDL_ROOT}/python/orca/tutorial/xshards/ + wget -nv $FTP_URI/analytics-zoo-data/xshards/train.csv -P ${BIGDL_ROOT}/python/orca/tutorial/xshards/ fi #sed -i "s/train.csv/train.csv/g" ${BIGDL_ROOT}/python/orca/tutorial/xshards/tabular_playground_series.py From f749dfd9a5904141e9c4bce8f8f8443d0d0518f6 Mon Sep 17 00:00:00 2001 From: Nancy <51090416+yexinyinancy@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:05:29 +0800 Subject: [PATCH 11/14] update pwd --- python/orca/dev/test/run-tutorial-xshards.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index 362c8fb35f2..bb1d27e9c80 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -24,7 +24,7 @@ export PYSPARK_DRIVER_PYTHON=python ray stop -f -cd ../../ +cd ../../tutorial/xshards echo "Running RayOnSpark tests" if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/train.csv ] @@ -36,4 +36,4 @@ fi #sed -i "s/train.csv/train.csv/g" ${BIGDL_ROOT}/python/orca/tutorial/xshards/tabular_playground_series.py -python tutorial/xshards/tabular_playground_series.py +python tabular_playground_series.py From 5966e9fb66aa3cab7da2d8c1b38b4d07eb75e0b5 Mon Sep 17 00:00:00 2001 From: Nancy <51090416+yexinyinancy@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:36:08 +0800 Subject: [PATCH 12/14] add new line --- .github/workflows/orca-tutorial-xshards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/orca-tutorial-xshards.yml b/.github/workflows/orca-tutorial-xshards.yml index 935f5c3ee00..ca3849932c2 100644 --- a/.github/workflows/orca-tutorial-xshards.yml +++ b/.github/workflows/orca-tutorial-xshards.yml @@ -87,4 +87,4 @@ jobs: chmod a+x python/orca/dev/test/run-tutorial-xshards.sh python/orca/dev/test/run-tutorial-xshards.sh env: - BIGDL_ROOT: ${{ github.workspace }} \ No newline at end of file + BIGDL_ROOT: ${{ github.workspace }} From 0e9dc41ac3690e518653df4489d10d8026c6f927 Mon Sep 17 00:00:00 2001 From: Nancy <51090416+yexinyinancy@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:40:28 +0800 Subject: [PATCH 13/14] delete a line --- python/orca/dev/test/run-tutorial-xshards.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index bb1d27e9c80..f6c884d29a5 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -14,17 +14,18 @@ # limitations under the License. # -export FTP_URI=$FTP_URI -set -ex -cd "`dirname $0`" +set -ex +export FTP_URI=$FTP_URI export PYSPARK_PYTHON=python export PYSPARK_DRIVER_PYTHON=python ray stop -f +cd "`dirname $0`" cd ../../tutorial/xshards + echo "Running RayOnSpark tests" if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/train.csv ] @@ -34,6 +35,4 @@ else wget -nv $FTP_URI/analytics-zoo-data/xshards/train.csv -P ${BIGDL_ROOT}/python/orca/tutorial/xshards/ fi -#sed -i "s/train.csv/train.csv/g" ${BIGDL_ROOT}/python/orca/tutorial/xshards/tabular_playground_series.py - python tabular_playground_series.py From d6997ab67767b0b4c07f08eff51045f736833086 Mon Sep 17 00:00:00 2001 From: Nancy <51090416+yexinyinancy@users.noreply.github.com> Date: Wed, 24 Aug 2022 17:42:55 +0800 Subject: [PATCH 14/14] add time --- python/orca/dev/test/run-tutorial-xshards.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/orca/dev/test/run-tutorial-xshards.sh b/python/orca/dev/test/run-tutorial-xshards.sh index f6c884d29a5..9d37acb3a58 100755 --- a/python/orca/dev/test/run-tutorial-xshards.sh +++ b/python/orca/dev/test/run-tutorial-xshards.sh @@ -26,7 +26,10 @@ ray stop -f cd "`dirname $0`" cd ../../tutorial/xshards -echo "Running RayOnSpark tests" +echo "Running Xshards tests" + +#timer +start=$(date "+%s") if [ -f ${BIGDL_ROOT}/python/orca/tutorial/xshards/train.csv ] then @@ -36,3 +39,8 @@ else fi python tabular_playground_series.py + +now=$(date "+%s") +time1=$((now - start)) + +echo "Running Xshards tests time used: $time1 seconds"