From 9707e2049c26f3189a6b83ef5484ea0b61833d09 Mon Sep 17 00:00:00 2001 From: Vignesh Kothapalli Date: Tue, 5 Jan 2021 21:56:56 +0530 Subject: [PATCH] [docs] Restructure README.md content (#1257) * Refactor README.md content * bump to run ci jobs --- .github/workflows/build.yml | 6 +- CONTRIBUTING.md | 6 +- README.md | 346 +++--------------------------------- docs/development.md | 339 +++++++++++++++++++++++++++++++++++ 4 files changed, 364 insertions(+), 333 deletions(-) create mode 100644 docs/development.md diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc3700ab3..458f704ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,7 +55,7 @@ jobs: echo $PATH python3 --version python3 -c 'import site; print(site.getsitepackages())' - python3 .github/workflows/build.instruction.py --sudo=true README.md "#### macOS" > source.sh + python3 .github/workflows/build.instruction.py --sudo=true docs/development.md "#### macOS" > source.sh bash -x -e source.sh python3 -c 'import tensorflow as tf; print(tf.version.VERSION)' @@ -67,7 +67,7 @@ jobs: - run: | set -x -e bash -x -e .github/workflows/build.space.sh - python3 .github/workflows/build.instruction.py README.md "##### Ubuntu 20.04" > source.sh + python3 .github/workflows/build.instruction.py docs/development.md "##### Ubuntu 20.04" > source.sh cat source.sh docker run -i --rm -v $PWD:/v -w /v --net=host ubuntu:20.04 \ bash -x -e source.sh @@ -80,7 +80,7 @@ jobs: - run: | set -x -e bash -x -e .github/workflows/build.space.sh - python3 .github/workflows/build.instruction.py README.md "##### CentOS 7" > source.sh + python3 .github/workflows/build.instruction.py docs/development.md "##### CentOS 7" > source.sh cat source.sh docker run -i --rm -v $PWD:/v -w /v --net=host centos:7 \ bash -x -e source.sh diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e8e02ea5f..2c4cc2378 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,6 @@ # Contributing -Tensorflow I/O 
project welcomes all kinds of contributions, be it code changes, bug-fixes or documentation changes. -This guide should help you in taking care of some basic setups & code conventions. +Tensorflow I/O project welcomes all kinds of contributions, be it code changes, bug-fixes or documentation changes. This guide should help you in taking care of some basic setups & code conventions. ## Contributor License Agreement @@ -17,8 +16,7 @@ again. ## Coding Style -Tensorflow project wide code style guidelines can be followed at [TensorFlow Style Guide - Conventions](https://www.tensorflow.org/community/contribute/code_style) and Tensorflow I/O project specific -code style guidelines can be followed at [Style Guide](STYLE_GUIDE.md). +Tensorflow project wide code style guidelines can be followed at [TensorFlow Style Guide - Conventions](https://www.tensorflow.org/community/contribute/code_style) and Tensorflow I/O project specific code style guidelines can be followed at [Style Guide](STYLE_GUIDE.md). ## Code Reviews diff --git a/README.md b/README.md index 94abcdea0..a24ef1b4e 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,8 @@ People who are a little more adventurous can also try our nightly binaries: $ pip install tensorflow-io-nightly ``` +### Docker Images + In addition to the pip packages, the docker images can be used to quickly get started. For stable builds: @@ -132,324 +134,14 @@ of releases [here](https://github.com/tensorflow/io/releases). | 0.2.0 | 1.12.0 | Jan 29, 2019 | | 0.1.0 | 1.12.0 | Dec 16, 2018 | -## Development - -### IDE Setup - -For instructions on how to configure Visual Studio Code for developing TensorFlow I/O, please refer to -https://github.com/tensorflow/io/blob/master/docs/vscode.md - -### Lint - -TensorFlow I/O's code conforms to Bazel Buildifier, Clang Format, Black, and Pyupgrade. 
-Please use the following command to check the source code and identify lint issues: -``` -$ bazel run //tools/lint:check -``` - -For Bazel Buildifier and Clang Format, the following command will automatically identify -and fix any lint errors: -``` -$ bazel run //tools/lint:lint -``` - -Alternatively, if you only want to perform lint check using individual linters, -then you can selectively pass `black`, `pyupgrade`, `bazel`, or `clang` to the above commands. - -For example, a `black` specific lint check can be done using: -``` -$ bazel run //tools/lint:check -- black -``` - -Lint fix using Bazel Buildifier and Clang Format can be done using: -``` -$ bazel run //tools/lint:lint -- bazel clang -``` - -Lint check using `black` and `pyupgrade` for an individual python file can be done using: -``` -$ bazel run //tools/lint:check -- black pyupgrade -- tensorflow_io/core/python/ops/version_ops.py -``` - -Lint fix an individual python file with black and pyupgrade using: -``` -$ bazel run //tools/lint:lint -- black pyupgrade -- tensorflow_io/core/python/ops/version_ops.py -``` - -### Notebooks/Tutorials -If you are updating or creating a notebook, please refer to the tutorials and instructions mentioned [here](https://github.com/tensorflow/io/tree/master/docs/tutorials). - -### Python - -#### macOS - -On macOS Catalina 10.15.7, it is possible to build tensorflow-io with -system provided python 3.8.2. Both `tensorflow` and `bazel` are needed. - -NOTE: The system default python 3.8.2 on macOS 10.15.7 will cause `regex` installation -error caused by compiler option of `-arch arm64 -arch x86_64` (similar to the issue -mentioned in https://github.com/giampaolo/psutil/issues/1832). To overcome this issue -`export ARCHFLAGS="-arch x86_64"` will be needed to remove arm64 build option. - -```sh -#!/usr/bin/env bash - -# Disable arm64 build by specifying only x86_64 arch. 
-# Only needed for macOS's system default python 3.8.2 on macOS 10.15.7 -export ARCHFLAGS="-arch x86_64" - -# Use following command to check if Xcode is correctly installed: -xcodebuild -version - -# Show macOS's default python3 -python3 --version - -# Install Bazel version specified in .bazelversion -curl -OL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-darwin-x86_64.sh -sudo bash -x -e bazel-$(cat .bazelversion)-installer-darwin-x86_64.sh - -# Install tensorflow and configure bazel -sudo ./configure.sh - -# Build shared libraries -bazel build -s --verbose_failures //tensorflow_io/... - -# Once build is complete, shared libraries will be available in -# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible -# to run tests with `pytest`, e.g.: -sudo python3 -m pip install pytest -TFIO_DATAPATH=bazel-bin python3 -m pytest -s -v tests/test_serialization_eager.py -``` - -NOTE: When running pytest, `TFIO_DATAPATH=bazel-bin` has to be passed so that python can utilize the generated shared libraries after the build process. - -##### Troubleshoot - -If Xcode is installed, but `$ xcodebuild -version` is not displaying the expected output, you might need to enable Xcode command line with the command: - -`$ xcode-select -s /Applications/Xcode.app/Contents/Developer`. - -A terminal restart might be required for the changes to take effect. - -Sample output: - -``` -$ xcodebuild -version -Xcode 11.6 -Build version 11E708 -``` - - -#### Linux - -Development of tensorflow-io on Linux is similar to macOS. The required packages -are gcc, g++, git, bazel, and python 3. Newer versions of gcc or python, other than the default system installed -versions might be required though. - -##### Ubuntu 20.04 - -Ubuntu 20.04 requires gcc/g++, git, and python 3. 
The following will install dependencies and build -the shared libraries on Ubuntu 20.04: -```sh -#!/usr/bin/env bash - -# Install gcc/g++, git, unzip/curl (for bazel), and python3 -sudo apt-get -y -qq update -sudo apt-get -y -qq install gcc g++ git unzip curl python3-pip - -# Install Bazel version specified in .bazelversion -curl -sSOL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-linux-x86_64.sh -sudo bash -x -e bazel-$(cat .bazelversion)-installer-linux-x86_64.sh - -# Upgrade pip -sudo python3 -m pip install -U pip - -# Install tensorflow and configure bazel -sudo ./configure.sh - -# Build shared libraries -bazel build -s --verbose_failures //tensorflow_io/... - -# Once build is complete, shared libraries will be available in -# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible -# to run tests with `pytest`, e.g.: -sudo python3 -m pip install pytest -TFIO_DATAPATH=bazel-bin python3 -m pytest -s -v tests/test_serialization_eager.py -``` - -##### CentOS 8 - -The steps to build shared libraries for CentOS 8 is similiar to Ubuntu 20.04 above -excpet that -``` -sudo yum install -y python3 python3-devel gcc gcc-c++ git unzip which make -``` -should be used instead to install gcc/g++, git, unzip/which (for bazel), and python3. - -##### CentOS 7 - -On CentOS 7, the default python and gcc version are too old to build tensorflow-io's shared -libraries (.so). The gcc provided by Developer Toolset and rh-python36 should be used instead. -Also, the libstdc++ has to be linked statically to avoid discrepancy of libstdc++ installed on -CentOS vs. newer gcc version by devtoolset. - -Furthermore, a special flag `--//tensorflow_io/core:static_build` has to be passed to Bazel -in order to avoid duplication of symbols in statically linked libraries for file system -plugins. 
- -The following will install bazel, devtoolset-9, rh-python36, and build the shared libraries: -```sh -#!/usr/bin/env bash - -# Install centos-release-scl, then install gcc/g++ (devtoolset), git, and python 3 -sudo yum install -y centos-release-scl -sudo yum install -y devtoolset-9 git rh-python36 make - -# Install Bazel version specified in .bazelversion -curl -sSOL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-linux-x86_64.sh -sudo bash -x -e bazel-$(cat .bazelversion)-installer-linux-x86_64.sh - -# Upgrade pip -scl enable rh-python36 devtoolset-9 \ - 'python3 -m pip install -U pip' - -# Install tensorflow and configure bazel with rh-python36 -scl enable rh-python36 devtoolset-9 \ - './configure.sh' - -# Build shared libraries, notice the passing of --//tensorflow_io/core:static_build -BAZEL_LINKOPTS="-static-libstdc++ -static-libgcc" BAZEL_LINKLIBS="-lm -l%:libstdc++.a" \ - scl enable rh-python36 devtoolset-9 \ - 'bazel build -s --verbose_failures --//tensorflow_io/core:static_build //tensorflow_io/...' - -# Once build is complete, shared libraries will be available in -# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible -# to run tests with `pytest`, e.g.: -scl enable rh-python36 devtoolset-9 \ - 'python3 -m pip install pytest' - -TFIO_DATAPATH=bazel-bin \ - scl enable rh-python36 devtoolset-9 \ - 'python3 -m pytest -s -v tests/test_serialization_eager.py' -``` - -#### Python Wheels - -It is possible to build python wheels after bazel build is complete with the following command: -``` -$ python3 setup.py bdist_wheel --data bazel-bin -``` -The .whl file will be available in dist directory. Note the bazel binary directory `bazel-bin` -has to be passed with `--data` args in order for setup.py to locate the necessary share objects, -as `bazel-bin` is outside of the `tensorflow_io` package directory. 
- -Alternatively, source install could be done with: -``` -$ TFIO_DATAPATH=bazel-bin python3 -m pip install . -``` -with `TFIO_DATAPATH=bazel-bin` passed for the same reason. - -Note installing with `-e` is different from the above. The -``` -$ TFIO_DATAPATH=bazel-bin python3 -m pip install -e . -``` -will not install shared object automatically even with `TFIO_DATAPATH=bazel-bin`. Instead, -`TFIO_DATAPATH=bazel-bin` has to be passed everytime the program is run after the install: -``` -$ TFIO_DATAPATH=bazel-bin python3 - ->>> import tensorflow_io as tfio ->>> ... -``` - -#### Docker - -For Python development, a reference Dockerfile [here](tools/docker/devel.Dockerfile) can be -used to build the TensorFlow I/O package (`tensorflow-io`) from source. Additionally, the -pre-built devel images can be used as well: -```sh -# Pull (if necessary) and start the devel container -$ docker run -it --rm --name tfio-dev --net=host -v ${PWD}:/v -w /v tfsigio/tfio:latest-devel bash - -# Inside the docker container, ./configure.sh will install TensorFlow or use existing install -(tfio-dev) root@docker-desktop:/v$ ./configure.sh - -# Clean up exisiting bazel build's (if any) -(tfio-dev) root@docker-desktop:/v$ rm -rf bazel-* - -# Build TensorFlow I/O C++. For compilation optimization flags, the default (-march=native) -# optimizes the generated code for your machine's CPU type. -# Reference: https://www.tensorflow.orginstall/source#configuration_options). - -# NOTE: Based on the available resources, please change the number of job workers to: -# -j 4/8/16 to prevent bazel server terminations and resource oriented build errors. - -(tfio-dev) root@docker-desktop:/v$ bazel build -j 8 --copt=-msse4.2 --copt=-mavx --compilation_mode=opt --verbose_failures --test_output=errors --crosstool_top=//third_party/toolchains/gcc7_manylinux2010:toolchain //tensorflow_io/... 
- - -# Run tests with PyTest, note: some tests require launching additional containers to run (see below) -(tfio-dev) root@docker-desktop:/v$ pytest -s -v tests/ -# Build the TensorFlow I/O package -(tfio-dev) root@docker-desktop:/v$ python setup.py bdist_wheel -``` - -A package file `dist/tensorflow_io-*.whl` will be generated after a build is successful. - -NOTE: When working in the Python development container, an environment variable -`TFIO_DATAPATH` is automatically set to point tensorflow-io to the shared C++ -libraries built by Bazel to run `pytest` and build the `bdist_wheel`. Python -`setup.py` can also accept `--data [path]` as an argument, for example -`python setup.py --data bazel-bin bdist_wheel`. - -NOTE: While the tfio-dev container gives developers an easy to work with -environment, the released whl packages are built differently due to manylinux2010 -requirements. Please check [Build Status and CI] section for more details -on how the released whl packages are generated. - -#### Starting Test Containers - -Some tests require launching a test container before running. In order -to run all tests, execute the following commands: - -```sh -$ bash -x -e tests/test_ignite/start_ignite.sh -$ bash -x -e tests/test_kafka/kafka_test.sh -$ bash -x -e tests/test_kinesis/kinesis_test.sh -``` - -### R - -We provide a reference Dockerfile [here](R-package/scripts/Dockerfile) for you -so that you can use the R package directly for testing. You can build it via: -```sh -$ docker build -t tfio-r-dev -f R-package/scripts/Dockerfile . 
-``` - -Inside the container, you can start your R session, instantiate a `SequenceFileDataset` -from an example [Hadoop SequenceFile](https://wiki.apache.org/hadoop/SequenceFile) -[string.seq](R-package/tests/testthat/testdata/string.seq), and then use any [transformation functions](https://tensorflow.rstudio.com/tools/tfdatasets/articles/introduction.html#transformations) provided by [tfdatasets package](https://tensorflow.rstudio.com/tools/tfdatasets/) on the dataset like the following: - -```r -library(tfio) -dataset <- sequence_file_dataset("R-package/tests/testthat/testdata/string.seq") %>% - dataset_repeat(2) - -sess <- tf$Session() -iterator <- make_iterator_one_shot(dataset) -next_batch <- iterator_get_next(iterator) - -until_out_of_range({ - batch <- sess$run(next_batch) - print(batch) -}) -``` - ## Contributing Tensorflow I/O is a community led open source project. As such, the project -depends on public contributions, bug-fixes, and documentation. Please -see [contribution guidelines](CONTRIBUTING.md) for a guide on how to -contribute. +depends on public contributions, bug-fixes, and documentation. Please see: + +- [contribution guidelines](CONTRIBUTING.md) for a guide on how to contribute. +- [development doc](docs/development.md) for instructions on the development environment setup. +- [tutorials](docs/tutorials) for a list of tutorial notebooks and instructions on how to write one. ### Build Status and CI @@ -481,7 +173,7 @@ It takes some time to build, but once complete, there will be python `3.5`, `3.6`, `3.7` compatible whl packages available in `wheelhouse` directory. -On macOS, the same command could be used though the script expect `python` in shell +On macOS, the same command could be used. However, the script expects `python` in shell and will only generate a whl package that matches the version of `python` in shell. If you want to build a whl package for a specific python then you have to alias this version of python to `python` in shell. 
See [.github/workflows/build.yml](.github/workflows/build.yml) @@ -493,17 +185,16 @@ TensorFlow I/O uses both GitHub Workflows and Google CI (Kokoro) for continuous GitHub Workflows is used for macOS build and test. Kokoro is used for Linux build and test. Again, because of the manylinux2010 requirement, on Linux whl packages are always built with Ubuntu 16.04 + Developer Toolset 7. Tests are done on a variatiy of systems -with different python version to ensure a good coverage: +with different python3 versions to ensure a good coverage: -| Python | Ubuntu 16.04| Ubuntu 18.04 | macOS + osx9 | -| ------- | ----- | ------- | ------- | -| 2.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| 3.5 | :heavy_check_mark: | N/A | :heavy_check_mark: | -| 3.6 | N/A | :heavy_check_mark: | :heavy_check_mark: | -| 3.7 | N/A | :heavy_check_mark: | N/A | +| Python | Ubuntu 18.04| Ubuntu 20.04 | macOS + osx9 | Windows-2019 | +| ------- | ----- | ------- | ------- | --------- | +| 2.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | N/A | +| 3.7 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| 3.8 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -TensorFlow I/O has integrations with may systems and cloud vendors such as +TensorFlow I/O has integrations with many systems and cloud vendors such as Prometheus, Apache Kafka, Apache Ignite, Google Cloud PubSub, AWS Kinesis, Microsoft Azure Storage, Alibaba Cloud OSS etc. @@ -526,8 +217,11 @@ level of coverage as live systems or emulators. 
| AWS Kinesis | | :heavy_check_mark: |:heavy_check_mark:| | | Alibaba Cloud OSS | | | | :heavy_check_mark: | | Google BigTable/BigQuery | | to be added | | | +| Elasticsearch (experimental) | :heavy_check_mark: | |:heavy_check_mark:| | +| MongoDB (experimental) | :heavy_check_mark: | |:heavy_check_mark:| | + -Note: +References for emulators: - Official [PubSub Emulator](https://cloud.google.com/sdk/gcloud/reference/beta/emulators/pubsub/) by Google Cloud for Cloud PubSub. - Official [Azurite Emulator](https://github.com/Azure/Azurite) by Azure for Azure Storage. - None-official [LocalStack emulator](https://github.com/localstack/localstack) by LocalStack for AWS Kinesis. @@ -539,7 +233,7 @@ Note: * SIG IO [Monthly Meeting Notes](https://docs.google.com/document/d/1CB51yJxns5WA4Ylv89D-a5qReiGTC0GYum6DU-9nKGo/edit) * Gitter room: [tensorflow/sig-io](https://gitter.im/tensorflow/sig-io) -## More Information +## Additional Information * [Streaming Machine Learning with Tiered Storage and Without a Data Lake](https://www.confluent.io/blog/streaming-machine-learning-with-tiered-storage/) - [Kai Waehner](https://github.com/kaiwaehner) * [TensorFlow with Apache Arrow Datasets](https://medium.com/tensorflow/tensorflow-with-apache-arrow-datasets-cdbcfe80a59f) - [Bryan Cutler](https://github.com/BryanCutler) diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 000000000..2e690f46a --- /dev/null +++ b/docs/development.md @@ -0,0 +1,339 @@ + +## Development + +This document contains the necessary information for setting up the development environment +and building the `tensorflow-io` package from source on various platforms. + +### IDE Setup + +For instructions on how to configure Visual Studio Code for developing TensorFlow I/O, please refer to this [doc](https://github.com/tensorflow/io/blob/master/docs/vscode.md). + +### Lint + +TensorFlow I/O's code conforms to Bazel Buildifier, Clang Format, Black, and Pyupgrade. 
+Please use the following command to check the source code and identify lint issues: +``` +$ bazel run //tools/lint:check +``` + +For Bazel Buildifier and Clang Format, the following command will automatically identify +and fix any lint errors: +``` +$ bazel run //tools/lint:lint +``` + +Alternatively, if you only want to perform lint check using individual linters, +then you can selectively pass `black`, `pyupgrade`, `bazel`, or `clang` to the above commands. + +For example, a `black` specific lint check can be done using: +``` +$ bazel run //tools/lint:check -- black +``` + +Lint fix using Bazel Buildifier and Clang Format can be done using: +``` +$ bazel run //tools/lint:lint -- bazel clang +``` + +Lint check using `black` and `pyupgrade` for an individual python file can be done using: +``` +$ bazel run //tools/lint:check -- black pyupgrade -- tensorflow_io/core/python/ops/version_ops.py +``` + +Lint fix an individual python file with black and pyupgrade using: +``` +$ bazel run //tools/lint:lint -- black pyupgrade -- tensorflow_io/core/python/ops/version_ops.py +``` + +### Python + +#### macOS + +On macOS Catalina 10.15.7, it is possible to build tensorflow-io with +system provided python 3.8.2. Both `tensorflow` and `bazel` are needed to do so. + +NOTE: The system default python 3.8.2 on macOS 10.15.7 will cause `regex` installation +error caused by compiler option of `-arch arm64 -arch x86_64` (similar to the issue +mentioned in https://github.com/giampaolo/psutil/issues/1832). To overcome this issue +`export ARCHFLAGS="-arch x86_64"` will be needed to remove arm64 build option. + +```sh +#!/usr/bin/env bash + +# Disable arm64 build by specifying only x86_64 arch. 
+# Only needed for macOS's system default python 3.8.2 on macOS 10.15.7 +export ARCHFLAGS="-arch x86_64" + +# Use following command to check if Xcode is correctly installed: +xcodebuild -version + +# Show macOS's default python3 +python3 --version + +# Install Bazel version specified in .bazelversion +curl -OL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-darwin-x86_64.sh +sudo bash -x -e bazel-$(cat .bazelversion)-installer-darwin-x86_64.sh + +# Install tensorflow and configure bazel +sudo ./configure.sh + +# Build shared libraries +bazel build -s --verbose_failures //tensorflow_io/... + +# Once build is complete, shared libraries will be available in +# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible +# to run tests with `pytest`, e.g.: +sudo python3 -m pip install pytest +TFIO_DATAPATH=bazel-bin python3 -m pytest -s -v tests/test_serialization_eager.py +``` + +NOTE: When running pytest, `TFIO_DATAPATH=bazel-bin` has to be passed so that python can utilize the generated shared libraries after the build process. + +##### Troubleshoot + +If Xcode is installed, but `$ xcodebuild -version` is not displaying the expected output, you might need to enable Xcode command line with the command: + +`$ xcode-select -s /Applications/Xcode.app/Contents/Developer`. + +A terminal restart might be required for the changes to take effect. + +Sample output: + +``` +$ xcodebuild -version +Xcode 12.2 +Build version 12B45b +``` + +#### Linux + +Development of tensorflow-io on Linux is similar to macOS. The required packages +are gcc, g++, git, bazel, and python 3. Newer versions of gcc or python, other than the default system installed +versions might be required though. + +##### Ubuntu 20.04 + +Ubuntu 20.04 requires gcc/g++, git, and python 3. 
The following will install dependencies and build +the shared libraries on Ubuntu 20.04: +```sh +#!/usr/bin/env bash + +# Install gcc/g++, git, unzip/curl (for bazel), and python3 +sudo apt-get -y -qq update +sudo apt-get -y -qq install gcc g++ git unzip curl python3-pip + +# Install Bazel version specified in .bazelversion +curl -sSOL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-linux-x86_64.sh +sudo bash -x -e bazel-$(cat .bazelversion)-installer-linux-x86_64.sh + +# Upgrade pip +sudo python3 -m pip install -U pip + +# Install tensorflow and configure bazel +sudo ./configure.sh + +# Build shared libraries +bazel build -s --verbose_failures //tensorflow_io/... + +# Once build is complete, shared libraries will be available in +# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible +# to run tests with `pytest`, e.g.: +sudo python3 -m pip install pytest +TFIO_DATAPATH=bazel-bin python3 -m pytest -s -v tests/test_serialization_eager.py +``` + +##### CentOS 8 + +The steps to build shared libraries for CentOS 8 are similar to Ubuntu 20.04 above +except that +``` +sudo yum install -y python3 python3-devel gcc gcc-c++ git unzip which make +``` +should be used instead to install gcc/g++, git, unzip/which (for bazel), and python3. + +##### CentOS 7 + +On CentOS 7, the default python and gcc versions are too old to build tensorflow-io's shared +libraries (.so). The gcc provided by Developer Toolset and rh-python36 should be used instead. +Also, the libstdc++ has to be linked statically to avoid discrepancy of libstdc++ installed on +CentOS vs. newer gcc version by devtoolset. + +Furthermore, a special flag `--//tensorflow_io/core:static_build` has to be passed to Bazel +in order to avoid duplication of symbols in statically linked libraries for file system +plugins. 
+ +The following will install bazel, devtoolset-9, rh-python36, and build the shared libraries: +```sh +#!/usr/bin/env bash + +# Install centos-release-scl, then install gcc/g++ (devtoolset), git, and python 3 +sudo yum install -y centos-release-scl +sudo yum install -y devtoolset-9 git rh-python36 make + +# Install Bazel version specified in .bazelversion +curl -sSOL https://github.com/bazelbuild/bazel/releases/download/$(cat .bazelversion)/bazel-$(cat .bazelversion)-installer-linux-x86_64.sh +sudo bash -x -e bazel-$(cat .bazelversion)-installer-linux-x86_64.sh + +# Upgrade pip +scl enable rh-python36 devtoolset-9 \ + 'python3 -m pip install -U pip' + +# Install tensorflow and configure bazel with rh-python36 +scl enable rh-python36 devtoolset-9 \ + './configure.sh' + +# Build shared libraries, notice the passing of --//tensorflow_io/core:static_build +BAZEL_LINKOPTS="-static-libstdc++ -static-libgcc" BAZEL_LINKLIBS="-lm -l%:libstdc++.a" \ + scl enable rh-python36 devtoolset-9 \ + 'bazel build -s --verbose_failures --//tensorflow_io/core:static_build //tensorflow_io/...' + +# Once build is complete, shared libraries will be available in +# `bazel-bin/tensorflow_io/core/python/ops/` and it is possible +# to run tests with `pytest`, e.g.: +scl enable rh-python36 devtoolset-9 \ + 'python3 -m pip install pytest' + +TFIO_DATAPATH=bazel-bin \ + scl enable rh-python36 devtoolset-9 \ + 'python3 -m pytest -s -v tests/test_serialization_eager.py' +``` + +#### Python Wheels + +It is possible to build python wheels after bazel build is complete with the following command: +``` +$ python3 setup.py bdist_wheel --data bazel-bin +``` +The .whl file will be available in dist directory. Note the bazel binary directory `bazel-bin` +has to be passed with `--data` args in order for setup.py to locate the necessary shared objects, +as `bazel-bin` is outside of the `tensorflow_io` package directory. 
+ +Alternatively, source install could be done with: +``` +$ TFIO_DATAPATH=bazel-bin python3 -m pip install . +``` +with `TFIO_DATAPATH=bazel-bin` passed for the same reason. + +Note installing with `-e` is different from the above. The +``` +$ TFIO_DATAPATH=bazel-bin python3 -m pip install -e . +``` +will not install shared objects automatically even with `TFIO_DATAPATH=bazel-bin`. Instead, +`TFIO_DATAPATH=bazel-bin` has to be passed every time the program is run after the install: +``` +$ TFIO_DATAPATH=bazel-bin python3 + +>>> import tensorflow_io as tfio +>>> ... +``` + +#### Docker + +For Python development, a reference Dockerfile [here](../tools/docker/devel.Dockerfile) can be +used to build the TensorFlow I/O package (`tensorflow-io`) from source. Additionally, the +pre-built devel images can be used as well: +```sh +# Pull (if necessary) and start the devel container +$ docker run -it --rm --name tfio-dev --net=host -v ${PWD}:/v -w /v tfsigio/tfio:latest-devel bash + +# Inside the docker container, ./configure.sh will install TensorFlow or use existing install +(tfio-dev) root@docker-desktop:/v$ ./configure.sh + +# Clean up existing bazel builds (if any) +(tfio-dev) root@docker-desktop:/v$ rm -rf bazel-* + +# Build TensorFlow I/O C++. For compilation optimization flags, the default (-march=native) +# optimizes the generated code for your machine's CPU type. +# Reference: https://www.tensorflow.org/install/source#configuration_options + +# NOTE: Based on the available resources, please change the number of job workers to: +# -j 4/8/16 to prevent bazel server terminations and resource oriented build errors. + +(tfio-dev) root@docker-desktop:/v$ bazel build -j 8 --copt=-msse4.2 --copt=-mavx --compilation_mode=opt --verbose_failures --test_output=errors --crosstool_top=//third_party/toolchains/gcc7_manylinux2010:toolchain //tensorflow_io/... 
+ + +# Run tests with PyTest, note: some tests require launching additional containers to run (see below) +(tfio-dev) root@docker-desktop:/v$ pytest -s -v tests/ +# Build the TensorFlow I/O package +(tfio-dev) root@docker-desktop:/v$ python setup.py bdist_wheel +``` + +A package file `dist/tensorflow_io-*.whl` will be generated after a build is successful. + +NOTE: When working in the Python development container, an environment variable +`TFIO_DATAPATH` is automatically set to point tensorflow-io to the shared C++ +libraries built by Bazel to run `pytest` and build the `bdist_wheel`. Python +`setup.py` can also accept `--data [path]` as an argument, for example +`python setup.py --data bazel-bin bdist_wheel`. + +NOTE: While the tfio-dev container gives developers an easy to work with +environment, the released whl packages are built differently due to manylinux2010 +requirements. Please check the [Build Status and CI](../README.md#build-status-and-ci) section for more details +on how the released whl packages are generated. + +#### Testing + +Some tests require launching a test container or starting a local instance +of the associated tool before running. For example, to run kafka +related tests which will start a local instance of kafka, zookeeper and schema-registry, +use: + +```sh +# Start the local instances of kafka, zookeeper and schema-registry +$ bash -x -e tests/test_kafka/kafka_test.sh + +# Run the tests +$ TFIO_DATAPATH=bazel-bin pytest -s -vv tests/test_kafka_eager.py +``` + +Testing `Datasets` associated with tools such as `Elasticsearch` or `MongoDB` +requires docker to be available on the system. 
In such scenarios, use: + + +```sh +# Start elasticsearch within docker container +$ bash tests/test_elasticsearch/elasticsearch_test.sh start + +# Run the tests +$ TFIO_DATAPATH=bazel-bin pytest -s -vv tests/test_elasticsearch_eager.py + +# Stop and remove the container +$ bash tests/test_elasticsearch/elasticsearch_test.sh stop +``` + +Additionally, testing some features of `tensorflow-io` doesn't require you to spin up +any additional tools as the data has been provided in the `tests` directory itself. +For example, to run tests related to `parquet` datasets, use: + +```sh +# Just run the test +$ TFIO_DATAPATH=bazel-bin pytest -s -vv tests/test_parquet_eager.py +``` + + +### R + +We provide a reference Dockerfile [here](../R-package/scripts/Dockerfile) for you +so that you can use the R package directly for testing. You can build it via: +```sh +$ docker build -t tfio-r-dev -f R-package/scripts/Dockerfile . +``` + +Inside the container, you can start your R session, instantiate a `SequenceFileDataset` +from an example [Hadoop SequenceFile](https://wiki.apache.org/hadoop/SequenceFile) +[string.seq](../R-package/tests/testthat/testdata/string.seq), and then use any [transformation functions](https://tensorflow.rstudio.com/tools/tfdatasets/articles/introduction.html#transformations) provided by [tfdatasets package](https://tensorflow.rstudio.com/tools/tfdatasets/) on the dataset like the following: + +```r +library(tfio) +dataset <- sequence_file_dataset("R-package/tests/testthat/testdata/string.seq") %>% + dataset_repeat(2) + +sess <- tf$Session() +iterator <- make_iterator_one_shot(dataset) +next_batch <- iterator_get_next(iterator) + +until_out_of_range({ + batch <- sess$run(next_batch) + print(batch) +}) +``` \ No newline at end of file