Skip to content

Commit

Permalink
Rework RedisAI build to allow for GCC-14 testing (#518)
Browse files Browse the repository at this point in the history
The RedisAI build needed to be tweaked slightly to allow compilation
with GCC-14. Additionally, bumping the testing container to Ubuntu
24.04 (needed to add gcc-14) required switching from MPICH to
OpenMPI because the default MPICH packages on that platform
do not work properly.

Lastly, the database launch script was tweaked slightly to provide
more debugging output, and the versions of some GitHub actions
were bumped to avoid warnings.

[ committed by @ashao ]
[ reviewed by @MattToast ]
  • Loading branch information
ashao authored Oct 21, 2024
1 parent 734fb52 commit bebcd9c
Show file tree
Hide file tree
Showing 7 changed files with 74 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04, macos-12]
os: [ubuntu-24.04, macos-12]
gcc_v: [11] # Version of GFortran we want to use.
env:
FC: gfortran-${{ matrix.gcc_v }}
Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/run_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,15 @@ env:
jobs:

run_tests:
name: Run smartredis tests using ${{ matrix.os }}, Python ${{ matrix.py_v }}, and ${{ matrix.compiler }}
name: Python ${{ matrix.py_v }}, ${{ matrix.compiler }}, ${{ matrix.link_type }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-22.04] # cannot test on macOS as docker isn't supported on Mac
os: [ubuntu-24.04] # cannot test on macOS as docker isn't supported on Mac
rai_v: [1.2.7] # versions of RedisAI
py_v: ['3.9.x', '3.10.x', '3.11.x'] # versions of Python
compiler: [nvhpc-23-11, intel-2024.0, gcc-11, gcc-12] # intel compiler, and versions of GNU compiler
compiler: [nvhpc-24-5, intel-2024.0, gcc-11, gcc-12, gcc-13, gcc-14] # intel compiler, and versions of GNU compiler
link_type: [shared, static]
env:
COMPILER: ${{ matrix.compiler }} # used when the compiler is gcc/gfortran
Expand All @@ -76,18 +76,18 @@ jobs:
- name: Maximize build space
uses: easimon/maximize-build-space@master
with:
root-reserve-mb: 30720
root-reserve-mb: 40960
remove-dotnet: true
remove-android: true
remove-haskell: true
remove-codeql: true
remove-docker-images: true

# download a copy of SmartRedis before running CI tests
- uses: actions/checkout@v3
- uses: actions/checkout@v4

# Setup python within the container
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.py_v }}

Expand All @@ -99,7 +99,7 @@ jobs:
sudo apt-get -y update &&
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test &&
sudo apt-get -y update &&
sudo apt-get -y install -y gcc-${GCC_V} gfortran-${GCC_V} g++-${GCC_V} mpich &&
sudo apt-get -y install -y gcc-${GCC_V} gfortran-${GCC_V} g++-${GCC_V} openmpi-bin libopenmpi-dev &&
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${GCC_V} 100 \
--slave /usr/bin/gfortran gfortran /usr/bin/gfortran-${GCC_V} \
--slave /usr/bin/g++ g++ /usr/bin/g++-${GCC_V} \
Expand Down
59 changes: 39 additions & 20 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,20 @@ SR_TEST_PORT := 6379
SR_TEST_NODES := 3
SR_TEST_REDISAI_VER := v1.2.7
SR_TEST_DEVICE := cpu
SR_TEST_DEVICE_UPPER := $(shell echo $(SR_TEST_DEVICE) | tr '[:lower:]' '[:upper:]')
SR_TEST_PYTEST_FLAGS := -vv -s
ifeq ($(LINK_TYPE), shared)
BUILD_SHARED_LIBS=on
else
BUILD_SHARED_LIBS=off
endif
SR_TEST_INSTALL_PREFIX = $(CWD)/install/$(BUILD_TYPE)/$(LINK_TYPE)

LIBTORCH_CPU_URL = https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcpu.zip
LIBTORCH_CUDA11_URL = https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.3.1%2Bcu118.zip

TORCH_CPU_PIP = pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cpu
TORCH_CUDA11_PIP = pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118
# Do not remove this block. It is used by the 'help' rule when
# constructing the help output.
# help:
Expand Down Expand Up @@ -349,7 +356,7 @@ define run_smartredis_tests_with_standalone_server
(testresult=$$?; \
echo "Shutting down standalone Redis server" && \
python utils/launch_redis.py --port $(SR_TEST_PORT) --nodes 1 --stop && \
test $$testresult -eq 0 || echo "Standalone tests failed"; exit $$testresult) && \
test $$testresult -eq 0 || cat *.log; exit $$testresult) && \
echo "Standalone tests complete"
endef

Expand All @@ -370,7 +377,7 @@ define run_smartredis_tests_with_clustered_server
echo "Shutting down clustered Redis server" && \
python utils/launch_redis.py --port $(SR_TEST_PORT) \
--nodes $(SR_TEST_NODES) --stop; \
test $$testresult -eq 0 || echo "Clustered tests failed"; exit $$testresult) && \
test $$testresult -eq 0 || cat *.log; exit $$testresult) && \
echo "Clustered tests complete"
endef

Expand All @@ -393,7 +400,7 @@ define run_smartredis_tests_with_uds_server
echo "Shutting down standalone Redis server with Unix Domain Socket support" && \
python utils/launch_redis.py --port $(SR_TEST_PORT) --nodes 1 \
--udsport $(SR_TEST_UDS_FILE) --stop; \
test $$testresult -eq 0 || echo "UDS tests failed"; exit $$testresult) && \
test $$testresult -eq 0 || cat *.log; exit $$testresult) && \
echo "UDS tests complete"
endef

Expand Down Expand Up @@ -503,10 +510,10 @@ test-examples:
############################################################################
# hidden build targets for third-party software

# cudann-check (hidden test target)
# cudnn-check (hidden test target)
# checks cuda dependencies for GPU build
.PHONY: cudann-check
cudann-check:
.PHONY: cudnn-check
cudnn-check:
ifeq ($(SR_TEST_DEVICE),gpu)
ifndef CUDA_HOME
$(error ERROR: CUDA_HOME is not set)
Expand Down Expand Up @@ -543,25 +550,26 @@ third-party/RedisAI:
@mkdir -p third-party
@cd third-party && \
rm -rf RedisAI/$(SR_TEST_REDISAI_VER) && \
GIT_LFS_SKIP_SMUDGE=1 git clone --recursive $(REDISAI_URL) RedisAI/$(SR_TEST_REDISAI_VER) \
GIT_LFS_SKIP_SMUDGE=1 git clone $(REDISAI_URL) RedisAI/$(SR_TEST_REDISAI_VER) \
--branch $(SR_TEST_REDISAI_VER) --depth=1

.PHONY: redisAI
redisAI: cudann-check
redisAI: cudnn-check
redisAI: pytorch
redisAI: dlpack
redisAI: third-party/RedisAI/$(SR_TEST_REDISAI_VER)/install-$(SR_TEST_DEVICE)/redisai.so
third-party/RedisAI/$(SR_TEST_REDISAI_VER)/install-$(SR_TEST_DEVICE)/redisai.so: third-party/RedisAI
@echo in third-party/RedisAI/$(SR_TEST_REDISAI_VER)/install-$(SR_TEST_DEVICE)/redisai.so:
$(eval DEVICE_IS_GPU := $(shell test $(SR_TEST_DEVICE) == "cpu"; echo $$?))
@cd third-party/RedisAI/$(SR_TEST_REDISAI_VER) && \
WITH_PT=1 WITH_TF=1 WITH_TFLITE=0 WITH_ORT=0 bash get_deps.sh \
$(SR_TEST_DEVICE)
@cd third-party/RedisAI/$(SR_TEST_REDISAI_VER) && \
GPU=$(DEVICE_IS_GPU) WITH_PT=1 WITH_TF=1 WITH_TFLITE=0 WITH_ORT=0 \
WITH_PT=1 WITH_TF=1 WITH_TFLITE=0 WITH_ORT=0 bash get_deps.sh \
WITH_UNIT_TESTS=0 make CC=$(DEP_CC) CXX=$(DEP_CXX) -j $(NPROC) -C opt clean
@cd third-party/RedisAI/$(SR_TEST_REDISAI_VER) && \
GPU=$(DEVICE_IS_GPU) WITH_PT=1 WITH_TF=1 WITH_TFLITE=0 WITH_ORT=0 \
WITH_UNIT_TESTS=0 make CC=$(DEP_CC) CXX=$(DEP_CXX) -C opt && \
mkdir -p build-$(SR_TEST_DEVICE) && cd build-$(SR_TEST_DEVICE) && \
sed -E -i "s/CXX_STANDARD (11|14)/CXX_STANDARD 17/g" ../src/backends/libtorch_c/CMakeLists.txt && \
cmake -DBUILD_TF=0 -DBUILD_ORT=0 -DBUILD_TORCH=1 -DBUILD_TFLITE=0 \
-DDEPS_PATH=$(CWD)/third-party/backends \
-DDEVICE=$(SR_TEST_DEVICE) \
-DINSTALL_PATH=../install-$(SR_TEST_DEVICE) \
-DCMAKE_C_COMPILER=$(DEP_CC) \
-DCMAKE_CXX_COMPILER=$(DEP_CXX) ../ && \
make -j install && \
chmod +x ../install-$(SR_TEST_DEVICE)/redisai.so
echo "Finished installing RedisAI"

# Catch2 (hidden test target)
Expand All @@ -586,4 +594,15 @@ third-party/lcov/install/bin/lcov:
make CC=$(DEP_CC) CXX=$(DEP_CXX) PREFIX=$(CWD)/third-party/lcov/install/ install && \
echo "Finished installing LCOV"


# dlpack (hidden test target)
# Clones the RedisAI fork of dlpack (ref v0.5_RAI) into
# third-party/backends/dlpack. The rule is keyed on the checkout directory,
# so the clone only happens when that directory does not exist yet.
# The clone is shallow, consistent with the third-party/RedisAI clone;
# NOTE(review): assumes nothing downstream needs dlpack's git history.
.PHONY: dlpack
dlpack: third-party/backends/dlpack
third-party/backends/dlpack:
	git clone --branch v0.5_RAI --depth=1 https://github.com/RedisAI/dlpack.git $@

# pytorch (hidden test target)
# Downloads and unpacks the libtorch archive selected by SR_TEST_DEVICE
# (via LIBTORCH_<DEVICE>_URL) into third-party/backends, and runs the
# matching TORCH_<DEVICE>_PIP command to install the Python packages.
#
# Recipe notes:
#  - The first recipe line aborts with a clear message when no download URL
#    is defined for the requested device, instead of letting wget fail on an
#    empty URL.
#  - The pip install runs before the archive is extracted. The two steps are
#    independent, and extracting last means a failed pip install does not
#    leave the libtorch directory behind looking like an up-to-date target.
.PHONY: pytorch
pytorch: third-party/backends/libtorch
third-party/backends/libtorch:
	$(if $(strip $(LIBTORCH_$(SR_TEST_DEVICE_UPPER)_URL)),,$(error ERROR: LIBTORCH_$(SR_TEST_DEVICE_UPPER)_URL is not set for SR_TEST_DEVICE=$(SR_TEST_DEVICE)))
	@mkdir -p $(@D)
	$(TORCH_$(SR_TEST_DEVICE_UPPER)_PIP)
	wget -O $(@D)/libtorch.zip $(LIBTORCH_$(SR_TEST_DEVICE_UPPER)_URL)
	cd $(@D) && unzip libtorch.zip && rm libtorch.zip
5 changes: 5 additions & 0 deletions doc/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,15 @@ Released on 27 September, 2024

Description

- Fix RedisAI build to allow for compilation with GCC-14
- Fix a memory leak in the Fortran Dataset implementation

Detailed Notes

- Fix the RedisAI build to allow compilation with GCC-14. Also,
only the Torch backend is now built, and RedisAI is now compiled
with CMake (as is done in SmartSim)
([PR518](https://github.com/CrayLabs/SmartRedis/pull/518))
- The dataset object, if used in a loop, would leave memory dangling.
To alleviate this, a final procedure has been implemented. Fortran
compilers, however, are notoriously bad at detecting when an object
Expand Down
15 changes: 9 additions & 6 deletions examples/parallel/fortran/smartredis_put_get_3D.F90
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ program main

#include "enum_fortran.inc"

integer, parameter :: dim1 = 10
integer, parameter :: dim2 = 20
integer, parameter :: dim3 = 30
integer, parameter :: dim1 = 2
integer, parameter :: dim2 = 3
integer, parameter :: dim3 = 4

real(kind=8), dimension(dim1, dim2, dim3) :: recv_array_real_64
real(kind=c_double), dimension(dim1, dim2, dim3) :: recv_array_real_64

real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64

Expand All @@ -49,9 +49,10 @@ program main
character(len=9) :: key_prefix

! Initialize MPI
call MPI_init( err_code )
call MPI_comm_rank( MPI_COMM_WORLD, pe_id, err_code)
call MPI_init(err_code)
call MPI_comm_rank(MPI_COMM_WORLD, pe_id, err_code)
write(key_prefix, "(A,I6.6)") "pe_",pe_id
print *, "Key Prefix: ", key_prefix

call random_number(true_array_real_64)
call random_number(recv_array_real_64)
Expand All @@ -65,6 +66,8 @@ program main
if (result .ne. SRNoError) error stop 'client%put_tensor failed'
result = client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64, shape(recv_array_real_64))
if (result .ne. SRNoError) error stop 'client%unpack_tensor failed'
print *, "Sent: ", true_array_real_64
print *, "Received: ", recv_array_real_64
if (.not. all(true_array_real_64 == recv_array_real_64)) error stop 'true_array_real_64: FAILED'

! Shut down MPI
Expand Down
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ dev =
black==23.3.0
isort==5.6.4
pylint>=2.10.0,<3.2.0
torch<=2.0.1
mypy>=1.4.0
typing_extensions
jinja2==3.0.3
Expand Down
18 changes: 13 additions & 5 deletions utils/launch_redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,8 @@ def create_db(n_nodes, port, device, rai_ver, udsport):
/ f"third-party/RedisAI/{rai_ver}/install-{test_device}"
).resolve()
redisai = redisai_dir / "redisai.so"
tf_loc = redisai_dir / "backends/redisai_tensorflow/redisai_tensorflow.so"
torch_loc = redisai_dir / "backends/redisai_torch/redisai_torch.so"
rai_clause = f"--loadmodule {redisai} TF {tf_loc} TORCH {torch_loc}"
rai_clause = f"--loadmodule {redisai} TORCH {torch_loc}"
uds_clause = ""
if is_uds:
prepare_uds_socket(udsport)
Expand Down Expand Up @@ -198,20 +197,29 @@ def create_db(n_nodes, port, device, rai_ver, udsport):

# Make sure that all servers are up
# Let exceptions propagate to the caller
check_availability(n_nodes, port, udsport)
for proc in procs:
_ = proc.communicate(timeout=15)
out, err = proc.communicate(timeout=15)
if proc.returncode != 0:
print("STDERR:")
print(err)
print("STDOUT:")
print(out)
raise RuntimeError("Failed to launch Redis server!")
check_availability(n_nodes, port, udsport)

# Create cluster for clustered Redis request
if n_nodes > 1:
sleep(5)
cluster_str = " ".join(f"127.0.0.1:{port + i}" for i in range(n_nodes))
cmd = f"{rediscli} --cluster create {cluster_str} --cluster-replicas 0 --cluster-yes"
print(cmd)
proc = run(cmd.split(), encoding="utf-8", shell=False)
proc = run(cmd.split(), encoding="utf-8", shell=False, capture_output=True)
if proc.returncode != 0:
print(f'{rediscli} returncode: {proc.returncode}')
print("STDOUT:")
print(proc.stdout)
print("STDERR:")
print(proc.stderr)
raise SubprocessError("Cluster could not be created!")
sleep(2)
print("Cluster has been setup!")
Expand Down

0 comments on commit bebcd9c

Please sign in to comment.