diff --git a/python/orca/test/bigdl/orca/ray/__init__.py b/python/orca/test/bigdl/orca/ray/__init__.py new file mode 100644 index 00000000000..5976dc4df02 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/orca/test/bigdl/orca/ray/integration/__init__.py b/python/orca/test/bigdl/orca/ray/integration/__init__.py new file mode 100644 index 00000000000..5976dc4df02 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/integration/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/orca/test/bigdl/orca/ray/integration/ray_on_yarn.py b/python/orca/test/bigdl/orca/ray/integration/ray_on_yarn.py new file mode 100644 index 00000000000..68838987a35 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/integration/ray_on_yarn.py @@ -0,0 +1,72 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + +import ray + +from bigdl.dllib.utils.nncontext import init_spark_on_yarn +from bigdl.orca.ray import RayContext + +slave_num = 2 + +sc = init_spark_on_yarn( + hadoop_conf="/opt/work/almaren-yarn-config/", + conda_name="ray_train", + num_executors=slave_num, + executor_cores=28, + executor_memory="10g", + driver_memory="2g", + driver_cores=4, + extra_executor_memory_for_ray="30g", + conf={"hello": "world"}) + +ray_ctx = RayContext(sc=sc, + object_store_memory="25g", + extra_params={"temp-dir": "/tmp/hello/"}, + env={"http_proxy": "http://child-prc.intel.com:913", + "https_proxy": "http://child-prc.intel.com:913"}) +ray_ctx.init() + + +@ray.remote +class TestRay: + def hostname(self): + import socket + return socket.gethostname() + + def check_cv2(self): + # conda install -c conda-forge opencv==3.4.2 + import cv2 + return cv2.__version__ + + def ip(self): + return ray._private.services.get_node_ip_address() + + def network(self): + from urllib.request import urlopen + try: + urlopen('http://www.baidu.com', timeout=3) + return True + except Exception: + return False + + +actors = [TestRay.remote() for i in range(0, slave_num)] +print(ray.get([actor.hostname.remote() for actor in actors])) +print(ray.get([actor.ip.remote() for actor in actors])) +# print(ray.get([actor.network.remote() for actor in actors])) + +ray_ctx.stop() diff --git a/python/orca/test/bigdl/orca/ray/integration/test_yarn_reinit_raycontext.py b/python/orca/test/bigdl/orca/ray/integration/test_yarn_reinit_raycontext.py new file mode 100644 index 00000000000..255cc510494 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/integration/test_yarn_reinit_raycontext.py @@ -0,0 +1,58 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +import time + +import numpy as np +import ray + +from bigdl.dllib.utils.nncontext import init_spark_on_yarn +from bigdl.orca.ray import RayContext + +np.random.seed(1337) # for reproducibility + + +@ray.remote +class TestRay: + def hostname(self): + import socket + return socket.gethostname() + + +node_num = 4 +sc = init_spark_on_yarn( + hadoop_conf="/opt/work/hadoop-2.7.2/etc/hadoop/", + conda_name="rayexample", + num_executors=node_num, + executor_cores=28, + executor_memory="10g", + driver_memory="2g", + driver_cores=4, + extra_executor_memory_for_ray="30g") +ray_ctx = RayContext(sc=sc, object_store_memory="2g") +ray_ctx.init() +actors = [TestRay.remote() for i in range(0, node_num)] +print(ray.get([actor.hostname.remote() for actor in actors])) +ray_ctx.stop() +# repeat +ray_ctx = RayContext(sc=sc, object_store_memory="1g") +ray_ctx.init() +actors = [TestRay.remote() for i in range(0, node_num)] +print(ray.get([actor.hostname.remote() for actor in actors])) +ray_ctx.stop() + +sc.stop() +time.sleep(3) diff --git a/python/orca/test/bigdl/orca/ray/test_ray_on_local.py b/python/orca/test/bigdl/orca/ray/test_ray_on_local.py new file mode 100644 index 00000000000..4b920af86a2 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/test_ray_on_local.py @@ -0,0 +1,45 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from unittest import TestCase + +import pytest +import ray + +from bigdl.dllib.utils.nncontext import init_spark_on_local +from bigdl.orca.ray import RayContext + + +class TestRayLocal(TestCase): + + def test_local(self): + @ray.remote + class TestRay: + def hostname(self): + import socket + return socket.gethostname() + + sc = init_spark_on_local(cores=8) + ray_ctx = RayContext(sc=sc, object_store_memory="1g", ray_node_cpu_cores=4) + address_info = ray_ctx.init() + assert "object_store_address" in address_info + actors = [TestRay.remote() for i in range(0, 4)] + print(ray.get([actor.hostname.remote() for actor in actors])) + ray_ctx.stop() + sc.stop() + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/ray/test_reinit_raycontext.py b/python/orca/test/bigdl/orca/ray/test_reinit_raycontext.py new file mode 100644 index 00000000000..2abb22efa55 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/test_reinit_raycontext.py @@ -0,0 +1,62 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import time +from unittest import TestCase + +import numpy as np +import psutil +import pytest +import ray + +from bigdl.dllib.utils.nncontext import init_spark_on_local +from bigdl.orca.ray import RayContext + +np.random.seed(1337) # for reproducibility + + +@ray.remote +class TestRay: + def hostname(self): + import socket + return socket.gethostname() + + +class TestUtil(TestCase): + + def test_local(self): + node_num = 4 + sc = init_spark_on_local(cores=node_num) + ray_ctx = RayContext(sc=sc, object_store_memory="1g") + ray_ctx.init() + actors = [TestRay.remote() for i in range(0, node_num)] + print(ray.get([actor.hostname.remote() for actor in actors])) + ray_ctx.stop() + time.sleep(3) + # repeat + print("-------------------first repeat begin!------------------") + ray_ctx = RayContext(sc=sc, object_store_memory="1g") + ray_ctx.init() + actors = [TestRay.remote() for i in range(0, node_num)] + print(ray.get([actor.hostname.remote() for actor in actors])) + ray_ctx.stop() + sc.stop() + time.sleep(3) + for process_info in ray_ctx.ray_processesMonitor.process_infos: + for pid in process_info.pids: + assert not psutil.pid_exists(pid) + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/ray/test_util.py b/python/orca/test/bigdl/orca/ray/test_util.py new file mode 100644 index 00000000000..ec7ba1ee227 --- /dev/null +++ b/python/orca/test/bigdl/orca/ray/test_util.py @@ -0,0 +1,33 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +from unittest import TestCase + +import pytest + +import bigdl.orca.ray.utils as rutils + + +class TestUtil(TestCase): + + def test_resource_to_bytes(self): + assert 10 == rutils.resource_to_bytes("10b") + assert 10000 == rutils.resource_to_bytes("10k") + assert 10000000 == rutils.resource_to_bytes("10m") + assert 10000000000 == rutils.resource_to_bytes("10g") + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/resources/bert/bert_config.json b/python/orca/test/bigdl/orca/resources/bert/bert_config.json new file mode 100644 index 00000000000..fca794a5f07 --- /dev/null +++ b/python/orca/test/bigdl/orca/resources/bert/bert_config.json @@ -0,0 +1,13 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30522 +} diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7000.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7000.jpg new file mode 100644 index 00000000000..ecea901b801 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7000.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7001.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7001.jpg new file mode 100644 index 00000000000..95b3e29367c Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7001.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7002.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7002.jpg new file mode 100644 index 00000000000..7a357a72442 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7002.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7003.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7003.jpg new file mode 100644 index 00000000000..811bb8a01dd Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7003.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7004.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7004.jpg new file mode 100644 index 00000000000..7bcdf603653 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7004.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7005.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7005.jpg new file mode 100644 index 00000000000..8dac9c4492a Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/cats/cat.7005.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7011.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7011.jpg new file mode 100644 index 00000000000..6152fa1e12f Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7011.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7012.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7012.jpg new file mode 100644 index 00000000000..a17e8e49179 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7012.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7013.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7013.jpg new file mode 100644 index 
00000000000..09928b2b7c0 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7013.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7014.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7014.jpg new file mode 100644 index 00000000000..18cfff3bd83 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7014.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7015.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7015.jpg new file mode 100644 index 00000000000..f15b98e428b Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7015.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7016.jpg b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7016.jpg new file mode 100644 index 00000000000..8d0f5250ad0 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cat_dog/dogs/dog.7016.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7000.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7000.jpg new file mode 100644 index 00000000000..ecea901b801 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7000.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7001.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7001.jpg new file mode 100644 index 00000000000..95b3e29367c Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7001.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7002.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7002.jpg new file mode 100644 index 00000000000..7a357a72442 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7002.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7003.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7003.jpg new file mode 100644 index 00000000000..811bb8a01dd Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7003.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7004.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7004.jpg new file mode 100644 index 00000000000..7bcdf603653 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7004.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/cats/cat.7005.jpg b/python/orca/test/bigdl/orca/resources/cats/cat.7005.jpg new file mode 100644 index 00000000000..8dac9c4492a Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/cats/cat.7005.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7011.jpg b/python/orca/test/bigdl/orca/resources/dogs/dog.7011.jpg new file mode 100644 index 00000000000..6152fa1e12f Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7011.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7012.jpg b/python/orca/test/bigdl/orca/resources/dogs/dog.7012.jpg new file mode 100644 index 00000000000..a17e8e49179 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7012.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7013.jpg b/python/orca/test/bigdl/orca/resources/dogs/dog.7013.jpg new file mode 100644 index 00000000000..09928b2b7c0 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7013.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7014.jpg 
b/python/orca/test/bigdl/orca/resources/dogs/dog.7014.jpg new file mode 100644 index 00000000000..18cfff3bd83 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7014.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7015.jpg b/python/orca/test/bigdl/orca/resources/dogs/dog.7015.jpg new file mode 100644 index 00000000000..f15b98e428b Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7015.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/dogs/dog.7016.jpg b/python/orca/test/bigdl/orca/resources/dogs/dog.7016.jpg new file mode 100644 index 00000000000..8d0f5250ad0 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/dogs/dog.7016.jpg differ diff --git a/python/orca/test/bigdl/orca/resources/saved-model-resource/saved_model.pb b/python/orca/test/bigdl/orca/resources/saved-model-resource/saved_model.pb new file mode 100644 index 00000000000..44ecae37bec Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/saved-model-resource/saved_model.pb differ diff --git a/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.data-00000-of-00001 b/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..4fc17e35786 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.data-00000-of-00001 differ diff --git a/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.index b/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.index new file mode 100644 index 00000000000..d96f3373062 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/saved-model-resource/variables/variables.index differ diff --git a/python/orca/test/bigdl/orca/resources/tfnet/frozen_inference_graph.pb b/python/orca/test/bigdl/orca/resources/tfnet/frozen_inference_graph.pb new file mode 100644 index 00000000000..c7f418a3b8a Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/tfnet/frozen_inference_graph.pb differ diff --git a/python/orca/test/bigdl/orca/resources/tfnet/graph_meta.json b/python/orca/test/bigdl/orca/resources/tfnet/graph_meta.json new file mode 100644 index 00000000000..42fcef2d362 --- /dev/null +++ b/python/orca/test/bigdl/orca/resources/tfnet/graph_meta.json @@ -0,0 +1 @@ +{"input_names": ["Placeholder:0"], "output_names": ["dense_1/Sigmoid:0"]} \ No newline at end of file diff --git a/python/orca/test/bigdl/orca/resources/tfrecord/mnist_test.tfrecord b/python/orca/test/bigdl/orca/resources/tfrecord/mnist_test.tfrecord new file mode 100644 index 00000000000..9aedd1bcac7 Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/tfrecord/mnist_test.tfrecord differ diff --git a/python/orca/test/bigdl/orca/resources/tfrecord/mnist_train.tfrecord b/python/orca/test/bigdl/orca/resources/tfrecord/mnist_train.tfrecord new file mode 100644 index 00000000000..a8291516a5d Binary files /dev/null and b/python/orca/test/bigdl/orca/resources/tfrecord/mnist_train.tfrecord differ diff --git a/python/orca/test/bigdl/orca/tfpark/__init__.py b/python/orca/test/bigdl/orca/tfpark/__init__.py new file mode 100644 index 00000000000..5976dc4df02 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2018 Analytics Zoo Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/python/orca/test/bigdl/orca/tfpark/test_keras_model.py b/python/orca/test/bigdl/orca/tfpark/test_keras_model.py new file mode 100644 index 00000000000..d13004fc887 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_keras_model.py @@ -0,0 +1,88 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import pytest + +from bigdl.dllib.feature.image import * +from bigdl.orca.test_zoo_utils import ZooTestCase +import tensorflow as tf +import numpy as np +import os + +from bigdl.orca.tfpark import KerasModel + +resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + + +class TestTFParkModel(ZooTestCase): + + def setup_method(self, method): + tf.keras.backend.clear_session() + super(TestTFParkModel, self).setup_method(method) + + def create_multi_input_output_model(self): + data1 = tf.keras.layers.Input(shape=[10]) + data2 = tf.keras.layers.Input(shape=[10]) + + x1 = tf.keras.layers.Flatten()(data1) + x1 = tf.keras.layers.Dense(10, activation='relu')(x1) + pred1 = tf.keras.layers.Dense(2, activation='softmax')(x1) + + x2 = tf.keras.layers.Flatten()(data2) + x2 = tf.keras.layers.Dense(10, activation='relu')(x2) + pred2 = tf.keras.layers.Dense(2)(x2) + + model = tf.keras.models.Model(inputs=[data1, data2], outputs=[pred1, pred2]) + model.compile(optimizer='rmsprop', + loss=['sparse_categorical_crossentropy', 'mse']) + return model + + def create_training_data(self): + np.random.seed(20) + x = np.random.rand(20, 10) + y = np.random.randint(0, 2, (20)) + return x, y + + def test_training_with_validation_data_distributed_multi_heads(self): + + keras_model = self.create_multi_input_output_model() + model = KerasModel(keras_model) + + x, y = self.create_training_data() + + val_x, val_y = self.create_training_data() + model.fit([x, x], [y, y], validation_data=([val_x, val_x], [val_y, val_y]), + batch_size=4, distributed=True) + + def test_invalid_data_handling(self): + keras_model = self.create_multi_input_output_model() + model = KerasModel(keras_model) + x, y = self.create_training_data() + val_x, val_y = self.create_training_data() + + # Number doesn't match + with pytest.raises(AssertionError) as excinfo: + model.fit([x, x], [y, y, y], batch_size=4, distributed=True) + + assert "model_target number does not match data number" in str(excinfo.value) + + # Dict as input + with pytest.raises(AssertionError) as excinfo: + model.fit({"input_1": x}, [y, y], batch_size=4, 
distributed=True) + + assert "all model_input names should exist in data" in str(excinfo.value) + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_text_estimators.py b/python/orca/test/bigdl/orca/tfpark/test_text_estimators.py new file mode 100644 index 00000000000..f7a66830ee8 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_text_estimators.py @@ -0,0 +1,110 @@ +# # +# # Copyright 2018 Analytics Zoo Authors. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. +# # +# +# import pytest +# +# import os +# import random +# import numpy as np +# from bigdl.orca.test_zoo_utils import ZooTestCase +# from bigdl.orca.tfpark.text.estimator import * +# +# resource_path = os.path.join(os.path.split(__file__)[0], "../resources") +# bert_config_path = os.path.join(resource_path, "bert/bert_config.json") +# +# +# class TestTextEstimators(ZooTestCase): +# +# def setup_method(self, method): +# import tensorflow as tf +# tf.keras.backend.clear_session() +# super(TestTextEstimators, self).setup_method(method) +# +# def test_bert_classifier(self): +# def gen_record(has_label=True): +# res = dict() +# res["input_ids"] = np.random.randint(10000, size=2) +# res["input_mask"] = np.array([1] * 2) +# res["token_type_ids"] = np.array([0] * 1 + [1] * 1) +# if has_label: +# return res, np.array(random.choice([0, 1])) +# else: +# return res +# import tensorflow as tf +# estimator = BERTClassifier(2, bert_config_path, optimizer=tf.train.AdamOptimizer()) +# rdd = self.sc.parallelize([gen_record() for i in range(8)]) +# # Training is too slow and memory consuming for a unit test. Skip here. Tested manually. +# # train_input_fn = bert_input_fn(rdd, 2, 4) +# # estimator.train(train_input_fn, 2) +# eval_input_fn = bert_input_fn(rdd, 2, 4) +# print(estimator.evaluate(eval_input_fn, eval_methods=["acc"])) +# test_rdd = self.sc.parallelize([gen_record(has_label=False) for i in range(4)]) +# test_input_fn = bert_input_fn(test_rdd, 2, 4) +# predictions = estimator.predict(test_input_fn) +# assert predictions.count() == 4 +# assert len(predictions.first()) == 2 +# +# def test_bert_squad(self): +# def gen_record(has_label=True): +# res = dict() +# res["input_ids"] = np.random.randint(10000, size=2) +# res["input_mask"] = np.array([1] * 2) +# res["token_type_ids"] = np.array([0] * 1 + [1] * 1) +# if has_label: +# label = dict() +# label["start_position"] = np.array(0) +# label["end_position"] = np.array(0) +# return res, label +# else: +# res["unique_ids"] = np.array(np.random.randint(100)) +# return res +# import tensorflow as tf +# estimator = BERTSQuAD(bert_config_path, optimizer=tf.train.AdamOptimizer()) +# # Training is too slow and memory consuming for a unit test. Skip here. Tested manually. 
+# # rdd = self.sc.parallelize([gen_record() for i in range(8)]) +# # train_input_fn = bert_input_fn(rdd, 2, 4, labels={"start_positions", "end_positions"}) +# # estimator.train(train_input_fn, 2) +# test_rdd = self.sc.parallelize([gen_record(has_label=False) for i in range(4)]) +# test_input_fn = bert_input_fn(test_rdd, 2, 4, extra_features={"unique_ids": (tf.int32, [])}) +# predictions = estimator.predict(test_input_fn) +# assert predictions.count() == 4 +# assert isinstance(predictions.first(), dict) +# +# def test_bert_ner(self): +# def gen_record(has_label=True): +# res = dict() +# res["input_ids"] = np.random.randint(10000, size=2) +# res["input_mask"] = np.array([1] * 2) +# res["token_type_ids"] = np.array([0] * 1 + [1] * 1) +# if has_label: +# return res, np.array(np.random.randint(10, size=2)) +# else: +# return res +# import tensorflow as tf +# estimator = BERTNER(10, bert_config_path, optimizer=tf.train.AdamOptimizer()) +# # Training is too slow and memory consuming for a unit test. Skip here. Tested manually. +# # rdd = self.sc.parallelize([gen_record() for i in range(8)]) +# # train_input_fn = bert_input_fn(rdd, 2, 4, label_size=2) +# # estimator.train(train_input_fn, 2) +# test_rdd = self.sc.parallelize([gen_record(has_label=False) for i in range(4)]) +# test_input_fn = bert_input_fn(test_rdd, 2, 4) +# predictions = estimator.predict(test_input_fn) +# assert predictions.count() == 4 +# assert len(predictions.first()) == 2 +# +# +# if __name__ == "__main__": +# pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_text_models.py b/python/orca/test/bigdl/orca/tfpark/test_text_models.py new file mode 100644 index 00000000000..a62b950cf06 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_text_models.py @@ -0,0 +1,79 @@ +# # +# # Copyright 2018 Analytics Zoo Authors. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# # +# +# import pytest +# +# import numpy as np +# from bigdl.orca.test_zoo_utils import ZooTestCase +# from bigdl.orca.tfpark.text.keras import * +# +# +# class TestTextModels(ZooTestCase): +# +# def setup_method(self, method): +# import tensorflow as tf +# tf.keras.backend.clear_session() +# super(TestTextModels, self).setup_method(method) +# +# def test_intent_entity(self): +# model = IntentEntity(num_intents=8, num_entities=5, word_length=10, +# word_vocab_size=200, char_vocab_size=50) +# input_data = [np.random.randint(200, size=(8, 30)), np.random.randint(50, size=(8, 30, 10))] +# output = model.predict(input_data, distributed=True) +# assert isinstance(output, list) and len(output) == 2 +# assert output[0].shape == (8, 8) +# assert output[1].shape == (8, 30, 5) +# self.assert_tfpark_model_save_load(model, input_data) +# +# def test_ner_crf_reg_mode(self): +# model = NER(num_entities=10, word_length=5, word_vocab_size=20, char_vocab_size=10) +# input_data = [np.random.randint(20, size=(15, 12)), np.random.randint(10, size=(15, 12, 5))] +# output = model.predict(input_data, distributed=True) +# assert output.shape == (15, 12, 10) +# self.assert_tfpark_model_save_load(model, input_data) +# +# def test_ner_crf_pad_mode(self): +# model = NER(num_entities=15, word_length=8, word_vocab_size=20, +# char_vocab_size=10, crf_mode="pad") +# input_data = [np.random.randint(20, size=(4, 12)), np.random.randint(10, size=(4, 12, 8)), +# np.random.randint(12, size=(4, 1))] +# output = model.predict(input_data, distributed=True) +# assert output.shape == (4, 12, 15) +# self.assert_tfpark_model_save_load(model, input_data) +# +# def test_sequence_tagger_softmax(self): +# model = SequenceTagger(num_pos_labels=5, num_chunk_labels=10, word_vocab_size=150) +# input_data = np.random.randint(150, size=(10, 50)) +# output = model.predict(input_data, distributed=True) +# assert isinstance(output, list) and len(output) == 2 +# assert output[0].shape == (10, 50, 5) +# assert output[1].shape == (10, 50, 10) +# self.assert_tfpark_model_save_load(model, input_data) +# +# def test_sequence_tagger_crf(self): +# model = SequenceTagger(num_pos_labels=8, num_chunk_labels=8, word_vocab_size=50, +# char_vocab_size=20, classifier="crf") +# input_data = [np.random.randint(50, size=(10, 15)), +# np.random.randint(20, size=(10, 15, 12))] +# output = model.predict(input_data, distributed=True) +# assert isinstance(output, list) and len(output) == 2 +# assert output[0].shape == (10, 15, 8) +# assert output[1].shape == (10, 15, 8) +# self.assert_tfpark_model_save_load(model, input_data) +# +# +# if __name__ == "__main__": +# pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_tf_dataset.py b/python/orca/test/bigdl/orca/tfpark/test_tf_dataset.py new file mode 100644 index 00000000000..c4715dbc1cd --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_tf_dataset.py @@ -0,0 +1,421 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import pytest +from pyspark.ml.linalg import DenseVector +from bigdl.dllib.feature.common import ChainedPreprocessing, FeatureSet +from bigdl.dllib.feature.image import * +from bigdl.orca.test_zoo_utils import ZooTestCase +from bigdl.dllib.keras.optimizers import Adam +from bigdl.orca.tfpark import TFNet, TFOptimizer +import tensorflow as tf +import numpy as np +import os + +from bigdl.orca.tfpark import KerasModel, TFDataset + +resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + + +def single_parse_fn(e): + keys_to_features = { + 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), + 'image/format': tf.FixedLenFeature((), tf.string, default_value='raw'), + 'image/class/label': tf.FixedLenFeature( + [1], tf.int64, default_value=tf.zeros([1], dtype=tf.int64)), + } + items_to_handlers = { + 'image': tf.contrib.slim.tfexample_decoder.Image(shape=[28, 28, 1], channels=1), + 'label': tf.contrib.slim.tfexample_decoder.Tensor('image/class/label', shape=[]), + } + decoder = tf.contrib.slim.tfexample_decoder.TFExampleDecoder( + keys_to_features, items_to_handlers) + results = decoder.decode(e) + if len(results[0].shape) > 0: + feature = results[0] + label = results[1] + else: + feature = results[1] + label = results[0] + return feature, label + + +def parse_fn(example): + results = tf.map_fn(single_parse_fn, example, dtype=(tf.uint8, tf.int64)) + return tf.to_float(results[0]), results[1] + + +class TestTFDataset(ZooTestCase): + + def get_raw_image_set(self, with_label): + resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + if with_label: + image_folder = os.path.join(resource_path, "cat_dog") + else: + image_folder = os.path.join(resource_path, "cat_dog/*") + from bigdl.dllib.feature.image import ImageSet + image_set = ImageSet.read(image_folder, with_label=with_label, sc=get_spark_context(), + one_based_label=False) + return image_set + + def setup_method(self, method): + tf.keras.backend.clear_session() + super(TestTFDataset, self).setup_method(method) + + def create_model(self): + data = tf.keras.layers.Input(shape=[10]) + + x = tf.keras.layers.Flatten()(data) + x = tf.keras.layers.Dense(10, activation='relu')(x) + predictions = tf.keras.layers.Dense(2, activation='softmax')(x) + + model = tf.keras.models.Model(inputs=data, outputs=predictions) + model.compile(optimizer='rmsprop', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + return model + + def create_training_dataset(self): + np.random.seed(20) + x = np.random.rand(20, 10) + y = np.random.randint(0, 2, (20)) + + rdd_x = self.sc.parallelize(x) + rdd_y = self.sc.parallelize(y) + + rdd = rdd_x.zip(rdd_y) + + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, []), + batch_size=4, + val_rdd=rdd + ) + return dataset + + def test_dataset_without_batch(self): + x = np.random.rand(20, 10) + y = np.random.randint(0, 2, (20)) + + rdd_x = self.sc.parallelize(x) + rdd_y = self.sc.parallelize(y) + + rdd = rdd_x.zip(rdd_y) + + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, []), + names=["features", "labels"], + val_rdd=rdd + ) + + keras_model = self.create_model() + model = KerasModel(keras_model) + self.intercept(lambda: model.fit(dataset), + "The batch_size of TFDataset must be" + + " specified when used in KerasModel fit.") + + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, []), + names=["features", "labels"], + ) + self.intercept(lambda: 
model.evaluate(dataset), + "The batch_per_thread of TFDataset must be " + + "specified when used in KerasModel evaluate.") + + dataset = TFDataset.from_rdd(rdd_x, + features=(tf.float32, [10]), + names=["features", "labels"], + ) + self.intercept(lambda: model.predict(dataset), + "The batch_per_thread of TFDataset must be" + + " specified when used in KerasModel predict.") + + def create_image_model(self): + + data = tf.keras.layers.Input(shape=[224, 224, 3]) + x = tf.keras.layers.Flatten()(data) + predictions = tf.keras.layers.Dense(10, activation='softmax')(x) + + model = tf.keras.models.Model(inputs=data, outputs=predictions) + model.compile(optimizer='rmsprop', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + return KerasModel(model) + + def create_image_set(self, with_label): + image_set = self.get_raw_image_set(with_label) + transformer = ChainedPreprocessing([ImageResize(256, 256), + ImageRandomCrop(224, 224, True), + ImageMatToTensor(format="NHWC"), + ImageSetToSample(input_keys=["imageTensor"], + target_keys=["label"] + if with_label else None)]) + image_set = image_set.transform(transformer) + return image_set + + def create_train_features_Set(self): + image_set = self.get_raw_image_set(with_label=True) + feature_set = FeatureSet.image_frame(image_set.to_image_frame()) + train_transformer = ChainedPreprocessing([ImageBytesToMat(), + ImageResize(256, 256), + ImageRandomCrop(224, 224), + ImageRandomPreprocessing(ImageHFlip(), 0.5), + ImageChannelNormalize( + 0.485, 0.456, 0.406, + 0.229, 0.224, 0.225), + ImageMatToTensor(to_RGB=True, format="NHWC"), + ImageSetToSample(input_keys=["imageTensor"], + target_keys=["label"]) + ]) + feature_set = feature_set.transform(train_transformer) + feature_set = feature_set.transform(ImageFeatureToSample()) + return feature_set + + def test_training_for_imageset(self): + + model = self.create_image_model() + image_set = self.create_image_set(with_label=True) + training_dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + label=(tf.int32, [1]), + batch_size=4) + model.fit(training_dataset) + + def test_training_for_feature_set(self): + model = self.create_image_model() + feature_set = self.create_train_features_Set() + training_dataset = TFDataset.from_feature_set(feature_set, + features=(tf.float32, [224, 224, 3]), + labels=(tf.int32, [1]), + batch_size=8) + model.fit(training_dataset) + + def test_evaluation_for_imageset(self): + + model = self.create_image_model() + image_set = self.create_image_set(with_label=True) + eval_dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + label=(tf.int32, [1]), + batch_per_thread=1) + + model.evaluate(eval_dataset) + + def test_predict_for_imageset(self): + model = self.create_image_model() + image_set = self.create_image_set(with_label=False) + + predict_dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + batch_per_thread=1) + results = model.predict(predict_dataset).get_predict().collect() + assert all(r[1] is not None for r in results) + + def test_gradient_clipping(self): + + data = tf.keras.layers.Input(shape=[10]) + + x = tf.keras.layers.Flatten()(data) + x = tf.keras.layers.Dense(10, activation='relu')(x) + predictions = tf.keras.layers.Dense(2, activation='softmax')(x) + + model = tf.keras.models.Model(inputs=data, outputs=predictions) + model.compile(optimizer=tf.keras.optimizers.SGD(lr=1, clipvalue=1e-8), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + model = 
KerasModel(model) + + pre_weights = model.get_weights() + + dataset = self.create_training_dataset() + + # 5 iterations + model.fit(dataset) + + current_weight = model.get_weights() + + assert np.all(np.abs(current_weight[0] - pre_weights[0]) < 1e-7) + + def test_tf_dataset_with_list_feature(self): + np.random.seed(20) + x = np.random.rand(20, 10) + y = np.random.randint(0, 2, (20)) + + rdd_x = self.sc.parallelize(x) + rdd_y = self.sc.parallelize(y) + + rdd = rdd_x.zip(rdd_y) + + dataset = TFDataset.from_rdd(rdd, + features=[(tf.float32, [10]), (tf.float32, [10])], + labels=(tf.int32, []), + batch_size=4, + val_rdd=rdd + ) + + for idx, tensor in enumerate(dataset.feature_tensors): + assert tensor.name == "list_input_" + str(idx) + ":0" + + def test_tfdataset_with_string_rdd(self): + string_rdd = self.sc.parallelize(["123", "456"], 1) + ds = TFDataset.from_string_rdd(string_rdd, batch_per_thread=1) + input_tensor = tf.placeholder(dtype=tf.string, shape=(None,)) + output_tensor = tf.string_to_number(input_tensor) + with tf.Session() as sess: + tfnet = TFNet.from_session(sess, inputs=[input_tensor], outputs=[output_tensor]) + result = tfnet.predict(ds).collect() + assert result[0] == 123 + assert result[1] == 456 + + def test_tfdataset_with_tfrecord(self): + train_path = os.path.join(resource_path, "tfrecord/mnist_train.tfrecord") + test_path = os.path.join(resource_path, "tfrecord/mnist_test.tfrecord") + dataset = TFDataset.from_tfrecord_file(self.sc, train_path, + batch_size=16, + validation_file_path=test_path) + raw_bytes = dataset.tensors[0] + images, labels = parse_fn(raw_bytes) + flat = tf.layers.flatten(images) + logits = tf.layers.dense(flat, 10) + loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, + labels=labels)) + opt = TFOptimizer.from_loss(loss, Adam()) + opt.optimize() + + def test_tfdataset_with_tf_data_dataset_which_requires_table(self): + + keys = [1, 0, -1] + dataset = tf.data.Dataset.from_tensor_slices([1, 2, -1, 5] * 40) + table = tf.contrib.lookup.HashTable( + initializer=tf.contrib.lookup.KeyValueTensorInitializer( + keys=keys, values=list(reversed(keys))), + default_value=100) + dataset = dataset.map(table.lookup) + + def transform(x): + float_x = tf.to_float(x) + return float_x, 1 + dataset = dataset.map(transform) + dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16) + seq = tf.keras.Sequential( + [tf.keras.layers.Flatten(input_shape=()), + tf.keras.layers.Dense(10, activation="softmax")]) + seq.compile(optimizer=tf.keras.optimizers.RMSprop(), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + model = KerasModel(seq) + model.fit(dataset) + + def test_tfdataset_with_tf_data_dataset_which_contains_bool(self): + dataset = tf.data.Dataset.from_tensor_slices((np.random.randn(102, 28, 28, 1), + np.random.randint(0, 10, size=(102,)), + np.ones(shape=(102, 28, 28, 1), + dtype=np.bool))) + dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16) + + feature, labels, mask = dataset.tensors + + float_mask = tf.to_float(mask) + masked_feature = tf.to_float(feature) * float_mask + flatten = tf.layers.flatten(masked_feature) + logits = tf.layers.dense(flatten, 10) + loss = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(logits=logits, + labels=labels)) + opt = TFOptimizer.from_loss(loss, Adam()) + opt.optimize() + + def test_tfdataset_with_tf_data_dataset(self): + dataset = tf.data.Dataset.from_tensor_slices((np.random.randn(102, 28, 28, 1), + np.random.randint(0, 10, size=(102,)))) + dataset = dataset.map(lambda 
feature, label: (tf.to_float(feature), label)) + dataset = TFDataset.from_tf_data_dataset(dataset, batch_size=16) + seq = tf.keras.Sequential( + [tf.keras.layers.Flatten(input_shape=(28, 28, 1)), + tf.keras.layers.Dense(10, activation="softmax")]) + + seq.compile(optimizer=tf.keras.optimizers.RMSprop(), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + model = KerasModel(seq) + model.fit(dataset) + dataset = tf.data.Dataset.from_tensor_slices((np.random.randn(102, 28, 28, 1), + np.random.randint(0, 10, size=(102,)))) + dataset = dataset.map(lambda feature, label: (tf.to_float(feature), + label)) + dataset = TFDataset.from_tf_data_dataset(dataset, batch_per_thread=16) + model.evaluate(dataset) + + def check_dataset(self, create_ds): + + seq = tf.keras.Sequential( + [tf.keras.layers.Flatten(input_shape=(20,)), + tf.keras.layers.Dense(10, activation="softmax")]) + + seq.compile(optimizer=tf.keras.optimizers.RMSprop(), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + model = KerasModel(seq) + + model.fit(create_ds("train")) + model.predict(create_ds("predict")).collect() + model.evaluate(create_ds("evaluate")) + + def make_create_ds_fn(self, train_df, val_df): + def create_ds(mode): + if mode == "train": + dataset = TFDataset.from_dataframe(train_df, + feature_cols=["feature"], + labels_cols=["label"], + batch_size=32, + validation_df=val_df) + elif mode == "predict": + dataset = TFDataset.from_dataframe(val_df, + feature_cols=["feature"], + batch_per_thread=32) + elif mode == "evaluate": + dataset = TFDataset.from_dataframe(val_df, + feature_cols=["feature"], + labels_cols=["label"], + batch_per_thread=32) + else: + raise ValueError("unrecognized mode: {}".format(mode)) + + return dataset + return create_ds + + def test_tfdataset_with_dataframe(self): + + rdd = self.sc.range(0, 1000) + df = rdd.map(lambda x: (DenseVector(np.random.rand(20).astype(np.float)), + x % 10)).toDF(["feature", "label"]) + train_df, val_df = df.randomSplit([0.7, 0.3]) + + create_ds = self.make_create_ds_fn(train_df, val_df) + + self.check_dataset(create_ds) + + def test_tfdataset_with_dataframe_arraytype(self): + rdd = self.sc.range(0, 1000) + df = rdd.map(lambda x: ([0.0] * 20, x % 10)).toDF(["feature", "label"]) + train_df, val_df = df.randomSplit([0.7, 0.3]) + create_ds = self.make_create_ds_fn(train_df, val_df) + self.check_dataset(create_ds) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_tfnet.py b/python/orca/test/bigdl/orca/tfpark/test_tfnet.py new file mode 100644 index 00000000000..5594e7f36e9 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_tfnet.py @@ -0,0 +1,106 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest + + +from bigdl.orca.test_zoo_utils import ZooTestCase +from bigdl.orca.tfpark import TFNet, TFDataset +from bigdl.dllib.utils.common import * + +np.random.seed(1337) # for reproducibility + + +class TestTF(ZooTestCase): + + resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + + def test_init_tf_net(self): + tfnet_path = os.path.join(TestTF.resource_path, "tfnet") + net = TFNet.from_export_folder(tfnet_path) + output = net.forward(np.random.rand(2, 4)) + assert output.shape == (2, 2) + + def test_for_scalar(self): + import tensorflow as tf + with tf.Graph().as_default(): + input1 = tf.placeholder(dtype=tf.float32, shape=()) + output = input1 + 1 + sess = tf.Session() + net = TFNet.from_session(sess, [input1], [output]) + sess.close() + out_value = net.forward(np.array(1.0)) + assert len(out_value.shape) == 0 + + # the following test would fail on bigdl 0.6.0 due to a bug in bigdl, + # comment it out for now + + # out_value = net.predict(np.array([1.0])).first() + # assert len(out_value.shape) == 0 + + def test_init_tfnet_from_session(self): + import tensorflow as tf + with tf.Graph().as_default(): + input1 = tf.placeholder(dtype=tf.float32, shape=(None, 2)) + label1 = tf.placeholder(dtype=tf.float32, shape=(None, 1)) + hidden = tf.layers.dense(input1, 4) + output = tf.layers.dense(hidden, 1) + loss = tf.reduce_mean(tf.square(output - label1)) + grad_inputs = tf.gradients(loss, input1) + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + data = np.random.rand(2, 2) + output_value_ref = sess.run(output, feed_dict={input1: data}) + label_value = output_value_ref - 1.0 + grad_input_value_ref = sess.run(grad_inputs[0], + feed_dict={input1: data, + label1: label_value}) + net = TFNet.from_session(sess, [input1], [output], generate_backward=True) + + output_value = net.forward(data) + + grad_input_value = net.backward(data, np.ones(shape=(2, 1))) + + self.assert_allclose(output_value, output_value_ref) + self.assert_allclose(grad_input_value, grad_input_value_ref) + + def test_init_tfnet_from_saved_model(self): + model_path = os.path.join(TestTF.resource_path, "saved-model-resource") + tfnet = TFNet.from_saved_model(model_path, inputs=["flatten_input:0"], + outputs=["dense_2/Softmax:0"]) + result = tfnet.predict(np.ones(dtype=np.float32, shape=(20, 28, 28, 1))) + result.collect() + + def test_tf_net_predict(self): + tfnet_path = os.path.join(TestTF.resource_path, "tfnet") + import tensorflow as tf + tf_session_config = tf.ConfigProto(inter_op_parallelism_threads=1, + intra_op_parallelism_threads=1) + net = TFNet.from_export_folder(tfnet_path, tf_session_config=tf_session_config) + output = net.predict(np.random.rand(16, 4), batch_per_thread=5, distributed=False) + assert output.shape == (16, 2) + + def test_tf_net_predict_dataset(self): + tfnet_path = os.path.join(TestTF.resource_path, "tfnet") + net = TFNet.from_export_folder(tfnet_path) + dataset = TFDataset.from_ndarrays((np.random.rand(16, 4),)) + output = net.predict(dataset) + output = np.stack(output.collect()) + assert output.shape == (16, 2) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_tfpark_estimator.py b/python/orca/test/bigdl/orca/tfpark/test_tfpark_estimator.py new file mode 100644 index 00000000000..9303e69f358 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_tfpark_estimator.py @@ -0,0 +1,275 @@ +# +# Copyright 2018 Analytics Zoo Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import pytest + +from bigdl.orca.test_zoo_utils import ZooTestCase +import tensorflow as tf + +from bigdl.dllib.feature.common import ChainedPreprocessing, FeatureSet +from bigdl.dllib.feature.image import * +from bigdl.orca.tfpark import TFDataset, TFEstimator +from bigdl.orca.tfpark import ZooOptimizer + + +class TestTFParkEstimator(ZooTestCase): + def get_raw_image_set(self, with_label): + resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + if with_label: + image_folder = os.path.join(resource_path, "cat_dog") + else: + image_folder = os.path.join(resource_path, "cat_dog/*") + from bigdl.dllib.feature.image import ImageSet + image_set = ImageSet.read(image_folder, with_label=with_label, sc=get_spark_context(), + one_based_label=False) + return image_set + + def setup_method(self, method): + tf.keras.backend.clear_session() + super(TestTFParkEstimator, self).setup_method(method) + + def create_model_fn(self): + def model_fn(features, labels, mode): + features = tf.layers.flatten(features) + h1 = tf.layers.dense(features, 64, activation=tf.nn.relu) + h2 = tf.layers.dense(h1, 64, activation=tf.nn.relu) + logits = tf.layers.dense(h2, 10) + + if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN: + loss = tf.reduce_mean( + tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) + train_op = ZooOptimizer(tf.train.AdamOptimizer()).minimize(loss) + return tf.estimator.EstimatorSpec(mode, train_op=train_op, + predictions=logits, loss=loss) + else: + return tf.estimator.EstimatorSpec(mode, predictions=logits) + return model_fn + + def create_input_fn(self): + + def input_fn(mode): + np.random.seed(20) + x = np.random.rand(20, 10) + y = np.random.randint(0, 10, (20)) + + rdd_x = self.sc.parallelize(x) + rdd_y = self.sc.parallelize(y) + + rdd = rdd_x.zip(rdd_y) + if mode == tf.estimator.ModeKeys.TRAIN: + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, []), + batch_size=4) + elif mode == tf.estimator.ModeKeys.EVAL: + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, []), + batch_per_thread=4) + else: + dataset = TFDataset.from_rdd(rdd_x, + features=(tf.float32, [10]), + batch_per_thread=4) + return dataset + + return input_fn + + def test_init_TFDataset_from_ndarrays(self): + + model_fn = self.create_model_fn() + + def input_fn(mode): + x = np.random.rand(20, 10) + y = np.random.randint(0, 10, (20,)) + if mode == tf.estimator.ModeKeys.TRAIN: + return TFDataset.from_ndarrays((x, y), batch_size=8) + elif mode == tf.estimator.ModeKeys.EVAL: + return TFDataset.from_ndarrays((x, y), batch_per_thread=1) + else: + return TFDataset.from_ndarrays(x, batch_per_thread=1) + + estimator = TFEstimator.from_model_fn(model_fn) + estimator.train(input_fn, 10) + estimator.evaluate(input_fn, ["acc"]) + estimator.predict(input_fn) + + def test_training(self): + model_fn = self.create_model_fn() + input_fn = self.create_input_fn() + estimator 
= TFEstimator.from_model_fn(model_fn) + estimator.train(input_fn, steps=60000 // 320) + + def test_evaluating(self): + model_fn = self.create_model_fn() + input_fn = self.create_input_fn() + estimator = TFEstimator.from_model_fn(model_fn) + eval_results = estimator.evaluate(input_fn, ["acc"]) + assert len(eval_results) > 0 + + def test_predict(self): + model_fn = self.create_model_fn() + input_fn = self.create_input_fn() + estimator = TFEstimator.from_model_fn(model_fn) + results = estimator.predict(input_fn).collect() + + def test_estimator_without_batch(self): + def model_fn(features, labels, mode): + + assert features.shape.ndims == 1 + if labels is not None: + assert labels.shape.ndims == 0 + + features = tf.expand_dims(features, axis=0) + + h1 = tf.layers.dense(features, 64, activation=tf.nn.relu) + h2 = tf.layers.dense(h1, 64, activation=tf.nn.relu) + logits = tf.layers.dense(h2, 10) + + if mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.TRAIN: + labels = tf.expand_dims(labels, axis=0) + loss = tf.reduce_mean( + tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)) + train_op = ZooOptimizer(tf.train.AdamOptimizer()).minimize(loss) + return tf.estimator.EstimatorSpec(mode, train_op=train_op, + predictions=logits, loss=loss) + else: + return tf.estimator.EstimatorSpec(mode, predictions=logits) + + def input_fn(mode): + np.random.seed(20) + x = np.random.rand(20, 10) + y = np.random.randint(0, 10, (20)) + + rdd_x = self.sc.parallelize(x) + rdd_y = self.sc.parallelize(y) + + rdd = rdd_x.zip(rdd_y) + if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL: + dataset = TFDataset.from_rdd(rdd, + features=(tf.float32, [10]), + labels=(tf.int32, [])) + else: + dataset = TFDataset.from_rdd(rdd_x, + features=(tf.float32, [10])) + return dataset + + estimator = TFEstimator.from_model_fn(model_fn) + + self.intercept(lambda: estimator.train(input_fn, steps=1), + "The batch_size of TFDataset must be specified when used for training.") + + estimator.evaluate(input_fn, ["acc"]) + estimator.predict(input_fn).collect() + + def create_imageset_input_fn(self): + def input_fn(mode): + import os + resource_path = os.path.join(os.path.split(__file__)[0], "../resources") + if mode == tf.estimator.ModeKeys.TRAIN: + image_folder = os.path.join(resource_path, "cat_dog") + image_set = ImageSet.read(image_folder, with_label=True, sc=self.sc, + one_based_label=False) + transformer = ChainedPreprocessing([ImageResize(256, 256), + ImageRandomCrop(224, 224, True), + ImageMatToTensor(format="NHWC"), + ImageSetToSample(input_keys=["imageTensor"], + target_keys=["label"])]) + image_set = image_set.transform(transformer) + dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + label=(tf.int32, [1]), + batch_size=8) + elif mode == tf.estimator.ModeKeys.EVAL: + image_folder = os.path.join(resource_path, "cat_dog") + image_set = ImageSet.read(image_folder, with_label=True, sc=self.sc, + one_based_label=False) + transformer = ChainedPreprocessing([ImageResize(256, 256), + ImageRandomCrop(224, 224, True), + ImageMatToTensor(format="NHWC"), + ImageSetToSample(input_keys=["imageTensor"], + target_keys=["label"])]) + image_set = image_set.transform(transformer) + dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + label=(tf.int32, [1]), + batch_per_thread=8) + else: + image_folder = os.path.join(resource_path, "cat_dog/*/*") + image_set = ImageSet.read(image_folder, with_label=False, sc=self.sc, + 
one_based_label=False) + transformer = ChainedPreprocessing([ImageResize(256, 256), + ImageRandomCrop(224, 224, True), + ImageMatToTensor(format="NHWC"), + ImageSetToSample( + input_keys=["imageTensor"])]) + image_set = image_set.transform(transformer) + dataset = TFDataset.from_image_set(image_set, + image=(tf.float32, [224, 224, 3]), + batch_per_thread=8) + + return dataset + return input_fn + + def test_estimator_for_imageset(self): + + model_fn = self.create_model_fn() + input_fn = self.create_imageset_input_fn() + + estimator = TFEstimator.from_model_fn(model_fn) + estimator.train(input_fn, steps=1) + estimator.evaluate(input_fn, ["acc"]) + results = estimator.predict(input_fn).get_predict().collect() + assert all(r[1] is not None for r in results) + + def create_train_feature_set_input_fn(self): + def input_fn(mode): + if mode == tf.estimator.ModeKeys.TRAIN: + image_set = self.get_raw_image_set(with_label=True) + feature_set = FeatureSet.image_frame(image_set.to_image_frame()) + train_transformer = ChainedPreprocessing([ImageBytesToMat(), + ImageResize(256, 256), + ImageRandomCrop(224, 224), + ImageRandomPreprocessing( + ImageHFlip(), 0.5), + ImageChannelNormalize( + 0.485, 0.456, 0.406, + 0.229, 0.224, 0.225), + ImageMatToTensor( + to_RGB=True, format="NHWC"), + ImageSetToSample( + input_keys=["imageTensor"], + target_keys=["label"]) + ]) + feature_set = feature_set.transform(train_transformer) + feature_set = feature_set.transform(ImageFeatureToSample()) + training_dataset = TFDataset.from_feature_set(feature_set, + features=(tf.float32, [224, 224, 3]), + labels=(tf.int32, [1]), + batch_size=8) + return training_dataset + else: + raise NotImplementedError + return input_fn + + def test_estimator_for_feature_set(self): + model_fn = self.create_model_fn() + input_fn = self.create_train_feature_set_input_fn() + + estimator = TFEstimator.from_model_fn(model_fn) + estimator.train(input_fn, steps=1) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_tfpark_model.py b/python/orca/test/bigdl/orca/tfpark/test_tfpark_model.py new file mode 100644 index 00000000000..33aed0f2526 --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_tfpark_model.py @@ -0,0 +1,325 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
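+#
+# For reference, a minimal sketch of the KerasModel workflow the tests in
+# this file exercise (hypothetical standalone usage, not part of the test
+# suite; it assumes an already-initialized SparkContext, as provided here
+# by ZooTestCase):
+#
+#     import numpy as np
+#     import tensorflow as tf
+#     from bigdl.orca.tfpark import KerasModel
+#
+#     # build and compile an ordinary tf.keras model
+#     inputs = tf.keras.layers.Input(shape=[10])
+#     outputs = tf.keras.layers.Dense(2, activation='softmax')(inputs)
+#     net = tf.keras.models.Model(inputs=inputs, outputs=outputs)
+#     net.compile(optimizer='rmsprop',
+#                 loss='sparse_categorical_crossentropy',
+#                 metrics=['accuracy'])
+#
+#     # wrap it, then train/evaluate/predict, locally or on Spark
+#     model = KerasModel(net)
+#     x = np.random.rand(20, 10)
+#     y = np.random.randint(0, 2, (20,))
+#     model.fit(x, y, batch_size=4, distributed=True)  # distributed via Spark
+#     metrics = model.evaluate(x, y)                   # dict with "loss"/"acc"
+#     predictions = model.predict(x)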
+#
+import pytest
+
+from bigdl.orca.test_zoo_utils import ZooTestCase
+import tensorflow as tf
+import numpy as np
+import os
+
+from bigdl.orca.tfpark import KerasModel, TFDataset, TFOptimizer
+
+resource_path = os.path.join(os.path.split(__file__)[0], "../resources")
+
+
+class TestTFParkModel(ZooTestCase):
+
+    def setup_method(self, method):
+        tf.keras.backend.clear_session()
+        super(TestTFParkModel, self).setup_method(method)
+
+    def create_model(self):
+        tf.set_random_seed(1)
+        data = tf.keras.layers.Input(shape=[10])
+
+        x = tf.keras.layers.Flatten()(data)
+        x = tf.keras.layers.Dense(10, activation='relu')(x)
+        predictions = tf.keras.layers.Dense(2, activation='softmax')(x)
+
+        model = tf.keras.models.Model(inputs=data, outputs=predictions)
+        model.compile(optimizer='rmsprop',
+                      loss='sparse_categorical_crossentropy',
+                      metrics=['accuracy'])
+        return model
+
+    def create_multi_input_output_model(self):
+        data1 = tf.keras.layers.Input(shape=[10])
+        data2 = tf.keras.layers.Input(shape=[10])
+
+        x1 = tf.keras.layers.Flatten()(data1)
+        x1 = tf.keras.layers.Dense(10, activation='relu')(x1)
+        pred1 = tf.keras.layers.Dense(2, activation='softmax')(x1)
+
+        x2 = tf.keras.layers.Flatten()(data2)
+        x2 = tf.keras.layers.Dense(10, activation='relu')(x2)
+        pred2 = tf.keras.layers.Dense(2)(x2)
+
+        model = tf.keras.models.Model(inputs=[data1, data2], outputs=[pred1, pred2])
+        model.compile(optimizer='rmsprop',
+                      loss=['sparse_categorical_crossentropy', 'mse'])
+        return model
+
+    def create_training_data(self):
+        np.random.seed(20)
+        x = np.random.rand(20, 10)
+        y = np.random.randint(0, 2, (20,))
+        return x, y
+
+    def create_training_dataset(self):
+        np.random.seed(20)
+        x = np.random.rand(20, 10)
+        y = np.random.randint(0, 2, (20,))
+
+        rdd_x = self.sc.parallelize(x)
+        rdd_y = self.sc.parallelize(y)
+
+        rdd = rdd_x.zip(rdd_y)
+
+        dataset = TFDataset.from_rdd(rdd,
+                                     features=(tf.float32, [10]),
+                                     labels=(tf.int32, []),
+                                     batch_size=4,
+                                     val_rdd=rdd
+                                     )
+        return dataset
+
+    def create_evaluation_dataset(self):
+        np.random.seed(20)
+        x = np.random.rand(20, 10)
+        y = np.random.randint(0, 2, (20,))
+
+        rdd_x = self.sc.parallelize(x)
+        rdd_y = self.sc.parallelize(y)
+
+        rdd = rdd_x.zip(rdd_y)
+
+        dataset = TFDataset.from_rdd(rdd,
+                                     features=(tf.float32, [10]),
+                                     labels=(tf.int32, []),
+                                     batch_per_thread=1
+                                     )
+        return dataset
+
+    def create_predict_dataset(self):
+        np.random.seed(20)
+        x = np.random.rand(20, 10)
+
+        rdd = self.sc.parallelize(x)
+
+        rdd = rdd.map(lambda x: [x])
+
+        dataset = TFDataset.from_rdd(rdd,
+                                     features=(tf.float32, [10]),
+                                     batch_per_thread=1
+                                     )
+        return dataset
+
+    def test_training_with_ndarray(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        model.fit(x, y, batch_size=2)
+
+    def test_training_with_ndarray_distributed(self):
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        model.fit(x, y, batch_size=4, distributed=True)
+
+    def test_training_with_ndarray_distributed_twice(self):
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        model.fit(x, y, batch_size=4, distributed=True)
+        model.fit(x, y, batch_size=4, distributed=True)
+
+    def test_training_with_validation_data(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        val_x, val_y = self.create_training_data()
+
+        model.fit(x, y, validation_data=(val_x, val_y), 
batch_size=4)
+
+    def test_training_with_validation_data_distributed(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        val_x, val_y = self.create_training_data()
+
+        model.fit(x, y, validation_data=(val_x, val_y), batch_size=4, distributed=True)
+
+    def test_training_and_validation_with_dataset(self):
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        dataset = self.create_training_dataset()
+
+        model.fit(dataset)
+
+    def test_evaluate_with_ndarray(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y)
+
+        model.fit(x, y, batch_size=4, epochs=10)
+
+        results_after = model.evaluate(x, y)
+
+        assert results_pre["loss"] > results_after["loss"]
+
+    def test_evaluate_with_ndarray_distributed(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y, batch_per_thread=1)
+
+        model.fit(x, y, batch_size=4, epochs=10)
+
+        results_after = model.evaluate(x, y, distributed=True, batch_per_thread=1)
+
+        assert results_pre["loss"] > results_after["loss"]
+
+    def test_evaluate_and_distributed_evaluate(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y)
+
+        results_after = model.evaluate(x, y, distributed=True)
+
+        assert np.square(results_pre["acc"] - results_after["acc Top1Accuracy"]) < 0.000001
+        assert np.square(results_pre["loss"] - results_after["loss"]) < 0.000001
+
+    def test_evaluate_with_dataset(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y)
+
+        dataset = self.create_evaluation_dataset()
+
+        results_after = model.evaluate(dataset)
+
+        assert np.square(results_pre["acc"] - results_after["acc Top1Accuracy"]) < 0.000001
+        assert np.square(results_pre["loss"] - results_after["loss"]) < 0.000001
+
+    def test_predict_with_ndarray(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y)
+
+        pred_y = np.argmax(model.predict(x), axis=1)
+
+        acc = np.average((pred_y == y))
+
+        assert np.square(acc - results_pre["acc"]) < 0.000001
+
+    def test_predict_with_ndarray_distributed(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+
+        results_pre = model.evaluate(x, y)
+
+        pred_y = np.argmax(model.predict(x, distributed=True), axis=1)
+        acc = np.average((pred_y == y))
+        print(results_pre)
+        assert np.square(acc - results_pre["acc"]) < 0.000001
+
+    def test_predict_with_dataset(self):
+
+        keras_model = self.create_model()
+        model = KerasModel(keras_model)
+
+        x, y = self.create_training_data()
+        results_pre = model.evaluate(x, y)
+
+        pred_y = np.argmax(np.array(model.predict(
+            self.create_predict_dataset()).collect()), axis=1)
+
+        acc = np.average((pred_y == y))
+
+        assert np.square(acc - results_pre["acc"]) < 0.000001
+
+    # moved here to keep the keras session from being closed before this
+    # test runs (root cause not yet understood)
+    def test_tf_optimizer_with_sparse_gradient_using_keras(self):
+        import tensorflow as tf
+
+        ids = np.random.randint(0, 10, size=[40])
+        labels = np.random.randint(0, 5, size=[40])
+        id_rdd = self.sc.parallelize(ids)
+        label_rdd = 
self.sc.parallelize(labels) + training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]]) + + dataset = TFDataset.from_rdd(training_rdd, + features=(tf.int32, []), + labels=(tf.int32, []), + batch_size=8) + words_input = tf.keras.layers.Input(shape=(), name='words_input') + embedding_layer = tf.keras.layers.Embedding(input_dim=10, + output_dim=5, name='word_embedding') + word_embeddings = embedding_layer(words_input) + embedding = tf.keras.layers.Flatten()(word_embeddings) + output = tf.keras.layers.Dense(5, activation="softmax")(embedding) + model = tf.keras.models.Model(inputs=[words_input], outputs=[output]) + model.compile(optimizer="sgd", loss="sparse_categorical_crossentropy") + + optimizer = TFOptimizer.from_keras(model, dataset) + optimizer.optimize() + + def test_tensorflow_optimizer(self): + data = tf.keras.layers.Input(shape=[10]) + + x = tf.keras.layers.Flatten()(data) + x = tf.keras.layers.Dense(10, activation='relu')(x) + predictions = tf.keras.layers.Dense(2, activation='softmax')(x) + + model = tf.keras.models.Model(inputs=data, outputs=predictions) + model.compile(optimizer=tf.train.AdamOptimizer(), + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + keras_model = KerasModel(model) + + x, y = self.create_training_data() + + keras_model.fit(x, y, batch_size=4, distributed=True) + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/python/orca/test/bigdl/orca/tfpark/test_tfpark_tfoptimizer.py b/python/orca/test/bigdl/orca/tfpark/test_tfpark_tfoptimizer.py new file mode 100644 index 00000000000..dec7671a7cd --- /dev/null +++ b/python/orca/test/bigdl/orca/tfpark/test_tfpark_tfoptimizer.py @@ -0,0 +1,176 @@ +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import pytest + +from bigdl.dllib.optim.optimizer import Adam, SGD, MaxEpoch +from bigdl.dllib.keras.metrics import Accuracy +from bigdl.orca.test_zoo_utils import ZooTestCase +import tensorflow as tf +import numpy as np +import tempfile +import os + +from bigdl.orca.tfpark import TFDataset, TFOptimizer + + +class TestTFParkTFOptimizer(ZooTestCase): + def setup_method(self, method): + tf.keras.backend.clear_session() + super(TestTFParkTFOptimizer, self).setup_method(method) + + def test_tf_optimizer_with_sparse_gradient(self): + ids = np.random.randint(0, 10, size=[40]) + labels = np.random.randint(0, 5, size=[40]) + id_rdd = self.sc.parallelize(ids) + label_rdd = self.sc.parallelize(labels) + training_rdd = id_rdd.zip(label_rdd).map(lambda x: [x[0], x[1]]) + with tf.Graph().as_default(): + dataset = TFDataset.from_rdd(training_rdd, + names=["ids", "labels"], + shapes=[[], []], + types=[tf.int32, tf.int32], + batch_size=8) + id_tensor, label_tensor = dataset.tensors + embedding_table = tf.get_variable( + name="word_embedding", + shape=[10, 5]) + + embedding = tf.nn.embedding_lookup(embedding_table, id_tensor) + loss = tf.reduce_mean(tf.losses. 
+                                  sparse_softmax_cross_entropy(logits=embedding,
+                                                               labels=label_tensor))
+            optimizer = TFOptimizer.from_loss(loss, Adam(1e-3))
+            optimizer.optimize(end_trigger=MaxEpoch(1))
+            optimizer.sess.close()
+
+    def test_tf_optimizer_metrics(self):
+
+        features = np.random.randn(20, 10)
+        labels = np.random.randint(0, 10, size=[20])
+        with tf.Graph().as_default():
+            dataset = TFDataset.from_ndarrays((features, labels),
+                                              batch_size=4,
+                                              val_tensors=(features, labels))
+            feature_tensor, label_tensor = dataset.tensors
+            features = tf.layers.dense(feature_tensor, 8)
+            output = tf.layers.dense(features, 10)
+            loss = tf.reduce_mean(tf.losses.
+                                  sparse_softmax_cross_entropy(logits=output,
+                                                               labels=label_tensor))
+            optimizer = TFOptimizer.from_loss(loss, {"dense/": Adam(1e-3), "dense_1/": SGD(0.0)},
+                                              val_outputs=[output],
+                                              val_labels=[label_tensor],
+                                              val_method=Accuracy(), metrics={"loss": loss})
+            initial_weights = optimizer.tf_model.training_helper_layer.get_weights()
+            optimizer.optimize(end_trigger=MaxEpoch(1))
+            updated_weights = optimizer.tf_model.training_helper_layer.get_weights()
+            for i in [0, 1]:  # weights and bias associated with "dense/" should be updated
+                assert not np.allclose(initial_weights[i], updated_weights[i])
+            for i in [2, 3]:  # weights and bias associated with "dense_1/" should be unchanged
+                assert np.allclose(initial_weights[i], updated_weights[i])
+            optimizer.sess.close()
+
+    def test_control_inputs(self):
+
+        features = np.random.randn(20, 10)
+        labels = np.random.randint(0, 10, size=[20])
+        with tf.Graph().as_default():
+            dataset = TFDataset.from_ndarrays((features, labels),
+                                              batch_size=4,
+                                              val_tensors=(features, labels))
+            is_training = tf.placeholder(dtype=tf.bool, shape=())
+            feature_tensor, label_tensor = dataset.tensors
+            features = tf.layers.dense(feature_tensor, 8)
+            features = tf.layers.dropout(features, training=is_training)
+            output = tf.layers.dense(features, 10)
+            loss = tf.reduce_mean(tf.losses.
+                                  sparse_softmax_cross_entropy(logits=output,
+                                                               labels=label_tensor))
+            optimizer = TFOptimizer.from_loss(loss, Adam(),
+                                              val_outputs=[output],
+                                              val_labels=[label_tensor],
+                                              val_method=Accuracy(),
+                                              tensor_with_value={is_training: (True, False)},
+                                              metrics={"loss": loss})
+            optimizer.optimize(end_trigger=MaxEpoch(1))
+            optimizer.sess.close()
+
+    def test_checkpoint(self):
+
+        features = np.random.randn(20, 10)
+        labels = np.random.randint(0, 10, size=[20])
+        with tf.Graph().as_default():
+            dataset = TFDataset.from_ndarrays((features, labels),
+                                              batch_size=4,
+                                              val_tensors=(features, labels))
+            feature_tensor, label_tensor = dataset.tensors
+            features = tf.layers.dense(feature_tensor, 8)
+            output = tf.layers.dense(features, 10)
+            loss = tf.reduce_mean(tf.losses.
+                                  sparse_softmax_cross_entropy(logits=output,
+                                                               labels=label_tensor))
+            model_dir = tempfile.mkdtemp()
+            try:
+                optimizer = TFOptimizer.from_loss(loss, Adam(),
+                                                  val_outputs=[output],
+                                                  val_labels=[label_tensor],
+                                                  val_method=Accuracy(),
+                                                  metrics={"loss": loss}, model_dir=model_dir)
+                optimizer.optimize(end_trigger=MaxEpoch(1))
+
+                first_weights = optimizer.sess.run(tf.trainable_variables()[0])
+                import re
+                ckpt_path = None
+                versions = []
+                for (root, dirs, files) in os.walk(model_dir, topdown=True):
+                    temp_versions = []
+                    for file_name in files:
+                        if re.match(r"^optimMethod-TFParkTraining\.[0-9]+$", file_name) is not None:
+                            version = int(file_name.split(".")[1])
+                            temp_versions.append(version)
+                    if temp_versions:
+                        ckpt_path = root
+                        versions = temp_versions
+                        break
+
+                assert ckpt_path is not None, "Cannot find checkpoint file"
+                optimizer.sess.run(tf.global_variables_initializer())  # reset variables
+                optimizer_load = TFOptimizer.from_loss(loss, Adam(),
+                                                       session=optimizer.sess,
+                                                       val_outputs=[output],
+                                                       val_labels=[label_tensor],
+                                                       val_method=Accuracy(),
+                                                       metrics={"loss": loss}, model_dir=model_dir)
+                optimizer_load.load_checkpoint(ckpt_path, max(versions))
+                loaded_first_weights_before_train = optimizer.sess.run(tf.trainable_variables()[0])
+                assert np.allclose(first_weights, loaded_first_weights_before_train)
+                # max epoch is still 1, so this should not train
+                optimizer_load.optimize(end_trigger=MaxEpoch(1))
+                loaded_first_weights = optimizer.sess.run(tf.trainable_variables()[0])
+                assert np.allclose(first_weights, loaded_first_weights)
+
+                # max epoch increased to 2, so this should train one more epoch
+                optimizer_load.optimize(end_trigger=MaxEpoch(2))
+                loaded_first_weights_2 = optimizer.sess.run(tf.trainable_variables()[0])
+                assert not np.allclose(first_weights, loaded_first_weights_2)
+                optimizer_load.sess.close()
+            finally:
+                import shutil
+                shutil.rmtree(model_dir)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/python/orca/test/dev/prepare_env.sh b/python/orca/test/dev/prepare_env.sh
new file mode 100755
index 00000000000..aa5fe421de9
--- /dev/null
+++ b/python/orca/test/dev/prepare_env.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
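+#
+# This script is sourced (not executed) by the run-* scripts in this
+# directory so that the exported variables reach the caller's shell.
+# A hypothetical manual invocation from the repo root, assuming
+# SPARK_HOME already points at a local Spark distribution:
+#
+#     export SPARK_HOME=/opt/spark          # example path only
+#     source python/orca/test/dev/prepare_env.sh
+#     python -m pytest -v python/orca/test/bigdl/orca/tfpark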
+# + +SCRIPT_DIR=$(dirname ${BASH_SOURCE[0]}) +echo "SCRIPT_DIR": $SCRIPT_DIR +export DL_PYTHON_HOME="$(cd ${SCRIPT_DIR}/../../../src; pwd)" +export DL_PYTHON_DLLIB_HOME="$(cd ${SCRIPT_DIR}/../../../src/../../dllib/src; pwd)" +export BIGDL_HOME="$(cd ${SCRIPT_DIR}/../../../../..; pwd)" + +echo "BIGDL_HOME: $BIGDL_HOME" +echo "SPARK_HOME": $SPARK_HOME +echo "DL_PYTHON_HOME": $DL_PYTHON_HOME + +if [ -z ${SPARK_HOME+x} ]; then echo "SPARK_HOME is unset"; exit 1; else echo "SPARK_HOME is set to '$SPARK_HOME'"; fi + +export PYSPARK_ZIP=`find $SPARK_HOME/python/lib -type f -iname '*.zip' | tr "\n" ":"` + +#export PYTHONPATH=$PYTHONPATH:$PYSPARK_ZIP:$DL_PYTHON_DLLIB_HOME:$DL_PYTHON_HOME:$DL_PYTHON_HOME/:$DL_PYTHON_HOME/test/dev:$BIGDL_HOME/scala/dllib/src/main/resources/spark-bigdl.conf +export PYTHONPATH=$PYSPARK_ZIP:/home/ding/proj/clone-ding-zoo/analytics-zoo/dist/lib/bigdl-orca-0.14.0-SNAPSHOT-python-api.zip:$DL_PYTHON_HOME/test:$DL_PYTHON_HOME/test/:$DL_PYTHON_HOME/test/dev:$BIGDL_HOME/scala/dllib/src/main/resources/spark-bigdl.conf +echo "PYTHONPATH": $PYTHONPATH + +export BIGDL_CLASSPATH=$(find $BIGDL_HOME/dist/lib/ -name "*with-dependencies.jar" | head -n 1) +echo "BIGDL_CLASSPATH": $BIGDL_CLASSPATH + +if [[ ($SPARK_HOME == *"2.2.0"*) || ($SPARK_HOME == *"2.1.1"*) || ($SPARK_HOME == *"1.6.4"*) ]]; then + export PYTHON_EXECUTABLES=("python2.7" "python3.5" "python3.6") +else + export PYTHON_EXECUTABLES=("python2.7" "python3.5") +fi + +function run_notebook() { + notebook_path=$1 + target_notebook_path=${DL_PYTHON_HOME}/tmp_${PYTHON_EXECUTABLE}.ipynb + echo "Change kernel to $PYTHON_EXECUTABLE" + sed "s/\"python.\"/\"$PYTHON_EXECUTABLE\"/g" $notebook_path > ${target_notebook_path} + jupyter nbconvert --to notebook --execute \ + --ExecutePreprocessor.timeout=360 --output ${DL_PYTHON_HOME}/tmp_out.ipynb \ + $target_notebook_path +} + +export -f run_notebook diff --git a/python/orca/test/dev/run-pytests-ray b/python/orca/test/dev/run-pytests-ray new file mode 100755 index 00000000000..fcb00983463 --- /dev/null +++ b/python/orca/test/dev/run-pytests-ray @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +# +# Copyright 2018 Analytics Zoo Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +. `dirname $0`/prepare_env.sh + +cd "`dirname $0`" + +export PYSPARK_PYTHON=python +export PYSPARK_DRIVER_PYTHON=python + +ray stop -f + +#echo "Running orca learn ray tests" +#python -m pytest -v ../test/zoo/orca/learn/ray \ +# --ignore=../test/zoo/orca/learn/ray/pytorch/test_estimator_horovod_backend.py \ +# --ignore=../test/zoo/orca/learn/ray/tf/ +#exit_status_2=$? +#if [ $exit_status_2 -ne 0 ]; +#then +# exit $exit_status_2 +#fi +# +#ray stop -f +# +#echo "Running orca auto estimator tests" +#python -m pytest -v ../test/zoo/orca/automl +#exit_status_6=$? +#if [ $exit_status_6 -ne 0 ]; +#then +# exit $exit_status_6 +#fi +# +#ray stop -f +# +#echo "Running chronos tests" +#python -m pytest -v ../test/zoo/chronos \ +# -k "not test_forecast_tcmf_distributed" +#exit_status_4=$? 
+#if [ $exit_status_4 -ne 0 ];
+#then
+#    exit $exit_status_4
+#fi
+#
+#ray stop -f
+#
+#echo "Running orca data tests"
+#python -m pytest -v ../test/zoo/orca/data \
+#    --ignore=../test/zoo/orca/data/test_read_parquet_images.py
+#exit_status_5=$?
+#if [ $exit_status_5 -ne 0 ];
+#then
+#    exit $exit_status_5
+#fi
+#
+#ray stop -f
+
+echo "Running RayOnSpark tests"
+python -m pytest -v ../bigdl/orca/ray/ \
+    --ignore=../bigdl/orca/ray/integration/ \
+    --ignore=../bigdl/orca/ray/test_reinit_raycontext.py
+exit_status_1=$?
+if [ $exit_status_1 -ne 0 ];
+then
+    exit $exit_status_1
+fi
+
+ray stop -f
diff --git a/python/orca/test/dev/run-tfpark.sh b/python/orca/test/dev/run-tfpark.sh
new file mode 100755
index 00000000000..93c231d4b25
--- /dev/null
+++ b/python/orca/test/dev/run-tfpark.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+#
+# Copyright 2018 Analytics Zoo Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+. `dirname $0`/prepare_env.sh
+
+cd "`dirname $0`"
+
+export PYSPARK_PYTHON=python
+export PYSPARK_DRIVER_PYTHON=python
+
+#python -m pytest -v --doctest-modules ../../../../orca/src/bigdl/orca/tfpark
+#exit_status_1=$?
+#if [ $exit_status_1 -ne 0 ];
+#then
+#    exit $exit_status_1
+#fi
+
+python -m pytest -v ../bigdl/orca/tfpark
+exit_status_2=$?
+if [ $exit_status_2 -ne 0 ];
+then
+    exit $exit_status_2
+fi
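+
+# Typical CI usage (hypothetical; both scripts live in python/orca/test/dev):
+#
+#     ./run-pytests-ray && ./run-tfpark.sh
+#
+# Each script propagates the first non-zero pytest exit status, so a
+# failing test suite fails the whole chain.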