diff --git a/cmake/modules/GpuCppLibrary.cmake b/cmake/modules/GpuCppLibrary.cmake index 6afcfb9fc..f89b60aaf 100644 --- a/cmake/modules/GpuCppLibrary.cmake +++ b/cmake/modules/GpuCppLibrary.cmake @@ -8,6 +8,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake) function(prepare_target_sources) # This function does the following: + # # 1. Take all the specified project sources for a target # 1. Filter files out based on CPU-only, CUDA, and HIP build modes # 1. Bucketize them into sets of CXX, CU, and HIP files @@ -134,14 +135,20 @@ endfunction() function(gpu_cpp_library) # This function does the following: + # # 1. Take all the target sources and select relevant sources based on build type (CPU-only, CUDA, HIP) # 1. Apply source file properties as needed - # 1. HIPify files as needed - # 1. Build the .SO file + # 1. Fetch the HIPified versions of the files as needed (presumes that `hipify()` has already been run) + # 1. Build the .SO file, either as STATIC or MODULE + # + # Building as STATIC allows the target to be linked to other library targets: + # https://www.reddit.com/r/cpp_questions/comments/120p0ey/how_to_create_a_composite_shared_library_out_of + # https://github.com/ROCm/hipDNN/blob/master/Examples/hipdnn-training/cmake/FindHIP.cmake set(flags) set(singleValueArgs - PREFIX # Desired name prefix for the library target + PREFIX # Desired name for the library target (and by extension, the prefix for naming intermediate targets) + TYPE # Target type, e.g., MODULE, OBJECT. See https://cmake.org/cmake/help/latest/command/add_library.html ) set(multiValueArgs CPU_SRCS # Sources for CPU-only build @@ -151,6 +158,7 @@ function(gpu_cpp_library) OTHER_SRCS # Sources from third-party libraries GPU_FLAGS # Compile flags for GPU builds INCLUDE_DIRS # Include directories for compilation + DEPS # Target dependencies, i.e. 
built STATIC targets ) cmake_parse_arguments( @@ -162,6 +170,8 @@ function(gpu_cpp_library) # Prepare CXX and CU sources ############################################################################ + # Take all the sources, and filter them into CPU and GPU buckets depending + # on the source type and build mode prepare_target_sources( PREFIX ${args_PREFIX} CPU_SRCS ${args_CPU_SRCS} @@ -172,15 +182,25 @@ function(gpu_cpp_library) INCLUDE_DIRS ${args_INCLUDE_DIRS}) set(lib_sources ${${args_PREFIX}_sources}) + ############################################################################ + # Prepare Target Deps + ############################################################################ + + # Convert target dependency references into CMake target-dependent expressions + # See https://cmake.org/cmake/help/latest/manual/cmake-generator-expressions.7.html#id34 + set(target_deps) + foreach(dep ${args_DEPS}) + list(APPEND target_deps "$<TARGET_OBJECTS:${dep}>") + endforeach() ############################################################################ # Build the Library ############################################################################ - set(lib_name ${args_PREFIX}_py) + set(lib_name ${args_PREFIX}) if(USE_ROCM) # Fetch the equivalent HIPified sources if available. - # This presumes that hipify() has already been run. + # This presumes that `hipify()` has already been run. 
get_hipified_list("${lib_sources}" lib_sources_hipified) # Set properties for the HIPified sources @@ -191,9 +211,10 @@ function(gpu_cpp_library) hip_include_directories("${args_INCLUDE_DIRS}") # Create the HIP library - hip_add_library(${lib_name} SHARED + hip_add_library(${lib_name} ${args_TYPE} ${lib_sources_hipified} ${args_OTHER_SRCS} + ${target_deps} ${FBGEMM_HIP_HCC_LIBRARIES} HIPCC_OPTIONS ${HIP_HCC_FLAGS}) @@ -206,10 +227,11 @@ function(gpu_cpp_library) ${args_INCLUDE_DIRS}) else() - # Create the C++/CUDA library - add_library(${lib_name} MODULE + # Create the CPU-only / CUDA library + add_library(${lib_name} ${args_TYPE} ${lib_sources} - ${args_OTHER_SRCS}) + ${args_OTHER_SRCS} + ${target_deps}) endif() ############################################################################ @@ -221,9 +243,14 @@ function(gpu_cpp_library) ${TORCH_INCLUDE_DIRS} ${NCCL_INCLUDE_DIRS}) - # Remove `lib` from the output artifact name, i.e. `libfoo.so` -> `foo.so` - set_target_properties(${lib_name} - PROPERTIES PREFIX "") + # Set additional target properties + set_target_properties(${lib_name} PROPERTIES + # Remove `lib` prefix from the output artifact name, e.g. 
`libfoo.so` -> `foo.so` + PREFIX "" + # Enforce -fPIC for STATIC library option, since they are to be + # integrated into other libraries down the line + # https://stackoverflow.com/questions/3961446/why-does-gcc-not-implicitly-supply-the-fpic-flag-when-compiling-static-librarie + POSITION_INDEPENDENT_CODE ON) # Link to PyTorch target_link_libraries(${lib_name} @@ -236,7 +263,7 @@ function(gpu_cpp_library) target_link_libraries(${lib_name} ${NVML_LIB_PATH}) endif() - # Silence warnings (in asmjit) + # Silence compiler warnings (in asmjit) target_compile_options(${lib_name} PRIVATE -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-declarations) @@ -251,18 +278,17 @@ function(gpu_cpp_library) WORKING_DIRECTORY ${OUTPUT_DIR} COMMAND bash ${FBGEMM}/.github/scripts/fbgemm_gpu_postbuild.bash) - # Run the post-build steps AFTER the build itself + # Set the post-build steps to run AFTER the build completes add_dependencies(${lib_name}_postbuild ${lib_name}) ############################################################################ # Set the Output Variable(s) ############################################################################ - # PREFIX = `foo` --> Target Library = `foo_py` - set(${args_PREFIX}_py ${lib_name} PARENT_SCOPE) + set(${args_PREFIX} ${lib_name} PARENT_SCOPE) BLOCK_PRINT( - "GPU CPP Library Target: ${args_PREFIX}" + "GPU CPP Library Target: ${args_PREFIX} (${args_TYPE})" " " "CPU_SRCS:" "${args_CPU_SRCS}" @@ -291,6 +317,9 @@ function(gpu_cpp_library) "HIPified Source Files:" "${lib_sources_hipified}" " " + "Target Dependencies:" + "${target_deps}" + " " "Output Library:" "${lib_name}" ) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 77a57a964..09d57d672 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -151,7 +151,10 @@ if(USE_ROCM) ${CMAKE_CURRENT_SOURCE_DIR}/experimental/gen_ai) # HIPify all .CU and .CUH sources under the current directory (`/fbgemm_gpu`) - # .H sources are not automatically 
HIPified, so they need #ifdef USE_ROCM guards + # + # Note that .H sources are not automatically HIPified, so if they reference + # CUDA-specific code, e.g. `#include <c10/cuda/CUDAStream.h>`, they will need + # to be updated with `#ifdef USE_ROCM` guards. hipify( CUDA_SOURCE_DIR ${PROJECT_SOURCE_DIR} diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake index 74172ae3d..8c4e751b5 100644 --- a/fbgemm_gpu/FbgemmGpu.cmake +++ b/fbgemm_gpu/FbgemmGpu.cmake @@ -308,53 +308,6 @@ list(APPEND gen_defused_optim_py_files ${CMAKE_BINARY_DIR}/optimizer_args.py) -################################################################################ -# FBGEMM_GPU Generated Sources -################################################################################ - -if(CXX_AVX2_FOUND) - set_source_files_properties(${gen_cpu_source_files} - PROPERTIES COMPILE_OPTIONS "${AVX2_FLAGS}") -else() - set_source_files_properties(${gen_cpu_source_files} - PROPERTIES COMPILE_OPTIONS "-fopenmp") -endif() - -set_source_files_properties(${gen_cpu_source_files} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") - -set_source_files_properties(${gen_gpu_host_source_files} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") - -set_source_files_properties(${gen_gpu_kernel_source_files} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") - -set_source_files_properties(${gen_gpu_kernel_source_files} - PROPERTIES COMPILE_OPTIONS - "${TORCH_CUDA_OPTIONS}") - -set_source_files_properties(${gen_defused_optim_source_files} - PROPERTIES INCLUDE_DIRECTORIES - "${fbgemm_sources_include_directories}") - -if(NOT FBGEMM_CPU_ONLY) - set(fbgemm_gpu_sources_gen - ${gen_gpu_kernel_source_files} - ${gen_gpu_host_source_files} - ${gen_cpu_source_files} - ${gen_defused_optim_source_files}) -else() - set(fbgemm_gpu_sources_gen - ${gen_cpu_source_files} - # To force generate_embedding_optimizer to generate Python files - 
${gen_defused_optim_py_files} - ) -endif() 
- - ################################################################################ # FBGEMM (not FBGEMM_GPU) Sources ################################################################################ @@ -437,7 +390,7 @@ set(fbgemm_gpu_sources_cpu_static src/sparse_ops/sparse_async_cumsum.cpp src/sparse_ops/sparse_ops_cpu.cpp src/sparse_ops/sparse_ops_meta.cpp - src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp + # src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp src/split_embeddings_cache/linearize_cache_indices.cpp src/split_embeddings_cache/lfu_cache_populate_byte.cpp src/split_embeddings_cache/lru_cache_populate_byte.cpp @@ -459,7 +412,7 @@ if(NOT FBGEMM_CPU_ONLY) src/sparse_ops/sparse_ops_gpu.cpp src/split_embeddings_utils/split_embeddings_utils.cpp src/metric_ops/metric_ops_host.cpp - src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + # src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp src/input_combine_ops/input_combine_gpu.cpp codegen/training/index_select/batch_index_select_dim0_host.cpp) @@ -478,7 +431,7 @@ if(NOT FBGEMM_CPU_ONLY) codegen/utils/embedding_bounds_check_v1.cu codegen/utils/embedding_bounds_check_v2.cu codegen/inference/embedding_forward_quantized_split_lookup.cu - src/embedding_inplace_ops/embedding_inplace_update.cu + # src/embedding_inplace_ops/embedding_inplace_update.cu src/histogram_binning_calibration_ops.cu src/input_combine_ops/input_combine.cu src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu @@ -552,7 +505,7 @@ endif() ################################################################################ -# FBGEMM_GPU HIP Code Generation +# FBGEMM_GPU Generated Sources Organized ################################################################################ set(fbgemm_gpu_sources_cpu_gen @@ -580,36 +533,42 @@ endif() # FBGEMM_GPU C++ Modules ################################################################################ +# Test target to demonstrate that target deps works as 
intended gpu_cpp_library( PREFIX - fbgemm_gpu + embedding_inplace_ops + TYPE + STATIC INCLUDE_DIRS ${fbgemm_sources_include_directories} CPU_SRCS - ${fbgemm_gpu_sources_cpu_static} - ${fbgemm_gpu_sources_cpu_gen} + src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp GPU_SRCS - ${fbgemm_gpu_sources_gpu_static} - ${fbgemm_gpu_sources_gpu_gen} - OTHER_SRCS - ${asmjit_sources} - ${fbgemm_sources} + src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp + src/embedding_inplace_ops/embedding_inplace_update.cu GPU_FLAGS ${TORCH_CUDA_OPTIONS}) -# TODO: Test target, need to properly integrate into FBGEMM_GPU main build gpu_cpp_library( PREFIX - embedding_inplace_ops + fbgemm_gpu_py + TYPE + MODULE INCLUDE_DIRS ${fbgemm_sources_include_directories} CPU_SRCS - src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp + ${fbgemm_gpu_sources_cpu_static} + ${fbgemm_gpu_sources_cpu_gen} GPU_SRCS - src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp - src/embedding_inplace_ops/embedding_inplace_update.cu + ${fbgemm_gpu_sources_gpu_static} + ${fbgemm_gpu_sources_gpu_gen} + OTHER_SRCS + ${asmjit_sources} + ${fbgemm_sources} GPU_FLAGS - ${TORCH_CUDA_OPTIONS}) + ${TORCH_CUDA_OPTIONS} + DEPS + embedding_inplace_ops) ################################################################################ diff --git a/fbgemm_gpu/experimental/example/CMakeLists.txt b/fbgemm_gpu/experimental/example/CMakeLists.txt index 1b831b8c1..592845078 100644 --- a/fbgemm_gpu/experimental/example/CMakeLists.txt +++ b/fbgemm_gpu/experimental/example/CMakeLists.txt @@ -24,7 +24,9 @@ set(experimental_example_python_source_files gpu_cpp_library( PREFIX - fbgemm_gpu_experimental_example + fbgemm_gpu_experimental_example_py + TYPE + MODULE INCLUDE_DIRS ${fbgemm_sources_include_directories} GPU_SRCS diff --git a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt index 3e402ae2e..42e806518 100644 --- a/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt +++ 
b/fbgemm_gpu/experimental/gen_ai/CMakeLists.txt @@ -52,7 +52,9 @@ file(GLOB_RECURSE experimental_gen_ai_python_source_files gpu_cpp_library( PREFIX - fbgemm_gpu_experimental_gen_ai + fbgemm_gpu_experimental_gen_ai_py + TYPE + MODULE INCLUDE_DIRS ${fbgemm_sources_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize