Add Radeon VII (gfx906) support.

- Makefile: detect GPU targets with amdgpu-arch and add a gfx906 branch that
  enables WAVEFRONTSIZE64 and USE_HIPBLAS.
- README.md: list the devices the fork has been tested on.
- llmc/mfu.h: add GCN5 throughput data and a gpu_db entry for the Radeon VII.
  The GCN5 TFLOPS figures are quoted at the same 1750 MHz clock stored in the
  row, so the clock scaling in get_flops_promised() stays consistent.

diff --git a/Makefile b/Makefile
index 3fb08a1b8..a24d29658 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ endif
 
 # AMD flags
 ROCM_PATH ?= /opt/rocm
-AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-offload-arch -a)
+AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 HIPCC := $(shell which hipcc 2>/dev/null)
 HIPIFY := $(shell which hipify-perl 2>/dev/null)
 HIPCC_FLAGS = -O3 -march=native -I$(BUILD_DIR)/hip -fno-strict-aliasing
@@ -69,6 +69,10 @@ ifneq ($(filter gfx1100,$(AMDGPU_TARGETS)),)
   USE_HIPBLAS ?= 1
   USE_CK ?= 1
   AMDGPU_TARGETS := gfx1100
+else ifneq ($(filter gfx906,$(AMDGPU_TARGETS)),)
+  WAVEFRONTSIZE64 ?= 1
+  USE_HIPBLAS ?= 1
+  AMDGPU_TARGETS := gfx906
 else ifneq ($(filter gfx90a,$(AMDGPU_TARGETS)),)
   WAVEFRONTSIZE64 ?= 1
   BUILD_XDL ?= 1
diff --git a/README.md b/README.md
index d61b7604a..db1090fab 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # llm.c for AMD devices
-This is a fork of [Andrej Karpathy's llm.c](https://github.com/karpathy/llm.c) with support for AMD's RDNA and CDNA devices.
+This is a fork of [Andrej Karpathy's llm.c](https://github.com/karpathy/llm.c) with support for AMD devices.
+
+It has been tested on Radeon VII (aka gfx906), MI250X (aka gfx90a), and 7900 XTX (aka gfx1100).
 
 ## Performance
 
diff --git a/llmc/mfu.h b/llmc/mfu.h
index 21f878af8..6043d27cb 100644
--- a/llmc/mfu.h
+++ b/llmc/mfu.h
@@ -29,6 +29,7 @@ static const PerfData HOPPER = {378.f, 756.f, 756.f, 756.f, 1513.f, 1513.f, 1620
 static const PerfData ADA = {82.6f, 165.2f, 165.2f, 330.3f, 330.3f, 660.6f, 2520.f, 512.f};
 static const PerfData RDNA3 = {61.42f, 122.8f, 122.8f, -1.f, -1.f, -1.f, 2500.f, 384.f};
 static const PerfData CDNA2 = {95.7f, 383.0f, 383.0f, -1.f, -1.f, -1.f, 1690.f, 208.f};
+static const PerfData GCN5 = {13.44f, 13.44f, 26.88f, -1.f, -1.f, -1.f, 1750.f, 60.f}; // at 1750 MHz: 60 CUs * 64 lanes * 2 FLOP/clk
 
 typedef struct {
     const char* name;
@@ -79,6 +80,7 @@ static GPUEntry gpu_db[] = {
     {"NVIDIA H100 80GB HBM3", &HOPPER, 528, 1830}, // HBM3 = SXM5
     {"Radeon RX 7900 XTX", &RDNA3, 384, 2500},
     {"AMD Instinct MI250X/MI250", &CDNA2, 208, 1690},
+    {"AMD Radeon VII", &GCN5, 60, 1750},
 };
 
 float get_flops_promised(const char* device, int precision_mode) {