diff --git a/.gitignore b/.gitignore
index 736424f85..3903d2d61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,9 @@ build
 build-*
 # pymarian wheels
 dist/
+tmp
+tmp-*
+tmp.*
 
 # Examples
 examples/*/*.gz
diff --git a/CHANGELOG.md b/CHANGELOG.md
index afa4465ce..887f76bb2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Fixed compilation with clang 16.0.6
 - Added Threads::Threads to `EXT_LIBS`
 - Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace
+- Scripts for building _manylinux_ compatible wheel files (a requirement for publishing wheels on PyPI)
 
 ### Added
 - Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size.
diff --git a/cmake/PythonModules.cmake b/cmake/PythonModules.cmake
index 062155647..0cd168911 100644
--- a/cmake/PythonModules.cmake
+++ b/cmake/PythonModules.cmake
@@ -34,10 +34,8 @@ macro(py_exec)
     endif()
 endmacro()
 
-set(PYBIND11_NOPYTHON On)
-# this wont work if pybind11 is git submodule
-#find_package(pybind11 REQUIRED)
-
+# NOTE: PYBIND11_NOPYTHON must be set before pybind11 is included;
+# it is now set in src/CMakeLists.txt, right before add_subdirectory(3rd_party/pybind11)
 ## =====================
 set(PYTHON_SEARCH_VERSIONS 3.7 3.8 3.9 3.10 3.11 3.12 3.13)
 set(PYTHON_DISABLE_VERSIONS "" CACHE STRING "")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fb5bdca98..f94643500 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -292,7 +292,9 @@ endif(GENERATE_MARIAN_INSTALL_TARGETS)
 
 
 if(PYMARIAN)
-  # python libs which use different version of tcmalloc (e.g. pandas) can cause segfaults, so we disable it
+  # PYBIND11_NOPYTHON must be set **before** adding the pybind11 subdirectory,
+  # otherwise pybind11 will interfere with our own python version detection
+  set(PYBIND11_NOPYTHON On)
   include_directories(3rd_party/pybind11/include)
   add_subdirectory(3rd_party/pybind11)
   include(PythonModules)
diff --git a/src/python/build-manylinux.sh b/src/python/build-manylinux.sh
new file mode 100755
index 000000000..bfddd7876
--- /dev/null
+++ b/src/python/build-manylinux.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+
+# DO NOT call this script directly (unless you know what you are doing).
+# Use the build.sh script instead.
+# this script builds pymarian wheels for multiple python versions
+# it uses mamba to create python environments and builds the wheels
+# it also creates manylinux wheels using auditwheel
+
+set -eu
+MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+MARIAN_ROOT="$( cd "$MYDIR/../.." && pwd )"
+# assume this directory is mounted in the docker container
+cd "$MARIAN_ROOT"
+
+#MKL is not in docker image
+# yum-config-manager --add-repo https://yum.repos.intel.com/setup/intelproducts.repo
+yum-config-manager --add-repo https://yum.repos.intel.com/mkl/setup/intel-mkl.repo
+rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB
+yum install -y intel-mkl-64bit-2020.4-912
+
+# TODO: build a docker image with MKL and mamba installed
+
+COMPILE_CUDA=1
+PY_VERSIONS="$(echo 3.{12,11,10,9,8})"
+
+# quick testing: compile for only one version and for CPU only
+#COMPILE_CUDA=0
+#PY_VERSIONS="3.10"
+
+# GLIBC we use for compiling marian should be compatible for newer platforms
+# So we use an old GLIBC that works (e.g. 2.17), thus ensuring maximum compatibility
+PY_PLATFORM="manylinux_2_17_x86_64" # GLIBC must be 2.17 (or older) for this platform
+echo "$(ldd --version | head -1); platform=$PY_PLATFORM"
+which mamba >& /dev/null || {
+    name=Miniforge3-$(uname)-$(uname -m).sh
+    mambadir=tmp/mamba-$(uname)-$(uname -m)
+    mkdir -p tmp/
+    [[ -s $mambadir/bin/activate ]] || {
+        [[ -s tmp/$name ]] || {
+            rm -f tmp/$name.tmp
+            wget -q "https://github.com/conda-forge/miniforge/releases/latest/download/$name" -O tmp/$name.tmp \
+                && mv tmp/$name{.tmp,}
+        }
+        bash tmp/$name -b -u -p $mambadir/
+        $mambadir/bin/mamba init bash
+    }
+    source $mambadir/etc/profile.d/mamba.sh
+    source $mambadir/bin/activate
+}
+
+# check if mamba is available
+which mamba || {
+    echo "mamba not found. Exiting."
+    exit 1
+}
+
+# create environment for each version
+
+for v in $PY_VERSIONS; do
+    mamba env list | grep -q "^py${v}[[:space:]]" || {
+        echo "Creating python $v environment"
+        mamba create -q -y -n py${v} python=${v}
+    }
+done
+
+# stack all environments
+for v in $PY_VERSIONS; do mamba activate py${v} --stack; done
+# check if all python versions are available
+for v in $PY_VERSIONS; do which python$v; done
+
+
+# Build as usual
+build_dir=$MARIAN_ROOT/build-pymarian
+fresh_build=1
+if [[ $fresh_build -eq 1 && -d $build_dir ]]; then
+    backup_dir=$build_dir.$(date +%y%m%d%H%M%S)
+    echo "Moving existing build directory to $backup_dir"
+    mv "$build_dir" "$backup_dir"
+fi
+
+mkdir -p "$build_dir"
+cd "$build_dir"
+
+#CMAKE_FLAGS="-DPYMARIAN=on -DCMAKE_BUILD_TYPE=Release -DUSE_STATIC_LIBS=on -DUSE_FBGEMM=on"
+CMAKE_FLAGS="-DPYMARIAN=on -DCMAKE_BUILD_TYPE=Slim -DUSE_STATIC_LIBS=on -DUSE_FBGEMM=on"
+# for cuda support
+if [[ $COMPILE_CUDA -eq 1 ]]; then
+    CMAKE_FLAGS+=" -DCOMPILE_CUDA=on -DCOMPILE_PASCAL=ON -DCOMPILE_VOLTA=ON -DCOMPILE_TURING=ON -DCOMPILE_AMPERE=ON -DCOMPILE_AMPERE_RTX=ON"
+else
+    CMAKE_FLAGS+=" -DCOMPILE_CUDA=off -DCOMPILE_CPU=on"
+fi
+
+cmake .. $CMAKE_FLAGS
+make -j "$(nproc)"
+ls -lh pymarian*.whl
+
+echo "=== Generating manylinux wheels ==="
+# make the wheels manylinux compatible
+auditwheel repair --plat $PY_PLATFORM *.whl -w manylinux/
+ls -lh manylinux/
+
+echo "=== Done ==="
diff --git a/src/python/build.sh b/src/python/build.sh
new file mode 100755
index 000000000..22e90c311
--- /dev/null
+++ b/src/python/build.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# This script is used to build the Python wheels.
+# A requirement is that we have to use older GLIBC versions to ensure maximum compatibility.
+# Python folks call it "manylinux" wheels and recommend using docker images to build them.
+# official manylinux docs: https://github.com/pypa/manylinux
+# But the official manylinux images do not have CUDA support.
+# So we use the "pytorch/manylinux-builder" image which has CUDA support.
+# Available tags: https://hub.docker.com/r/pytorch/manylinux-builder/tags
+
+set -eu
+LINUX_IMAGE="pytorch/manylinux-builder:cuda12.1"
+MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+MARIAN_ROOT="$( cd "$MYDIR/../.." && pwd )"
+
+set -x
+LINUX_BUILDER="src/python/build-manylinux.sh"
+MOUNT="/work"
+docker run --rm -it -v "$MARIAN_ROOT:$MOUNT" "$LINUX_IMAGE" "$MOUNT/$LINUX_BUILDER"
diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml
index 30eb16f36..cb39417b6 100644
--- a/src/python/pyproject.toml
+++ b/src/python/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
     "pyyaml",
     "tqdm",
     "requests",
-    "huggingface-hub==0.23.1",
+    "huggingface-hub",
 ]
 
 [project.scripts]
@@ -47,7 +47,7 @@ demos = [
     "flask",
     "sacremoses",
     "pyqt5",
-    "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter",
+#    "sentence-splitter@git+https://github.com/mediacloud/sentence-splitter",
 ]