
[HIP][Perf] Simple data conversion kernel is significantly slower on AMD #7195

Closed
densamoilov opened this issue Oct 27, 2022 · 4 comments
Labels: bug (Something isn't working), hip (Issues related to execution on HIP backend.)

Comments

densamoilov commented Oct 27, 2022

I implemented a simple kernel that converts data between different data types. The kernel demonstrates reasonably good bandwidth utilization on an NVIDIA TITAN RTX GPU (250-550 GB/s depending on the data types), which is similar to what clpeak shows.
However, when I run it on an AMD MI200, bandwidth utilization is very poor (~45 GB/s), even though clpeak reports ~1100 GB/s.
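
To put those numbers in perspective (rough arithmetic, using the f32 -> f32 case from the benchmark below): each launch moves 2 * 4 B * 268435456 elements ≈ 2.15 GB, so ~45 GB/s corresponds to roughly 48 ms per kernel, while ~1100 GB/s would be under 2 ms.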

I used the latest commit, 599b1b9, on Linux.

Kernel:
#include <chrono>
#include <cstdint>
#include <iostream>
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>

using sycl::ext::oneapi::experimental::bfloat16;

using namespace std::chrono;

using namespace std;
using namespace sycl;

enum class dt_t : int { bf16, f16, f32, s32, s8, u8 };

template <dt_t dt>
struct dt_traits;

#define DECLARE_TRAIT(dt, t, n) \
    template <> \
    struct dt_traits<dt_t::dt> { \
        using type = t; \
        static constexpr size_t size = sizeof(t); \
        static constexpr const char *name = n; \
    };

DECLARE_TRAIT(bf16, bfloat16, "bf16")
DECLARE_TRAIT(f16, half, "f16")
DECLARE_TRAIT(f32, float, "f32")
DECLARE_TRAIT(s32, int32_t, "s32")
DECLARE_TRAIT(s8, int8_t, "s8")
DECLARE_TRAIT(u8, uint8_t, "u8")

#undef DECLARE_TRAIT

template <dt_t sdt, dt_t ddt>
struct transform_kernel_t {
    using src_type = typename dt_traits<sdt>::type;
    using dst_type = typename dt_traits<ddt>::type;

    transform_kernel_t(const void *src, void *dst, int nelems)
        : src(src), dst(dst), nelems(nelems) {}

    void operator()(nd_item<1> id) const {
        // Global linear index of the work-item (not a work-group local id).
        const auto idx = id.get_global_id(0);
        if (idx < nelems) {
            const auto *src_typed = reinterpret_cast<const src_type *>(src);
            auto *dst_typed = reinterpret_cast<dst_type *>(dst);
            dst_typed[idx] = static_cast<dst_type>(src_typed[idx]);
        }
    }

    const void *src;
    void *dst;
    int nelems;
};

void *allocate_buffer(dt_t dt, int nelems, queue q) {
#define CASE(dt) \
    case dt_t::dt: \
        return malloc_shared(nelems * sizeof(dt_traits<dt_t::dt>::type), q);

    switch (dt) {
        CASE(f32)
        CASE(f16)
        CASE(bf16)
        CASE(s32)
        CASE(s8)
        CASE(u8)
        default: throw std::runtime_error("unexpected dt");
    }
#undef CASE
}

void init_buffer(void *ptr, dt_t dt, int nelems) {
#define CASE(dt) \
    case dt_t::dt: \
        reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr)[i] \
                = (i % 2 == 0) ? 4 : 9; \
        break;

    for (int i = 0; i < nelems; i++) {
        switch (dt) {
            CASE(f32)
            CASE(f16)
            CASE(bf16)
            CASE(s32)
            CASE(s8)
            CASE(u8)
            default: throw std::runtime_error("unexpected dt");
        }
    }
#undef CASE
}

void check_buffer(void *ptr, dt_t dt, int nelems) {
#define CASE(dt) \
    case dt_t::dt: { \
        auto *p = reinterpret_cast<dt_traits<dt_t::dt>::type *>(ptr); \
        const auto exp = (i % 2 == 0) ? 4 : 9; \
        if (p[i] != exp) throw std::runtime_error("result mismatch"); \
        break; \
    }

    for (int i = 0; i < nelems; i++) {
        switch (dt) {
            CASE(f32)
            CASE(f16)
            CASE(bf16)
            CASE(s32)
            CASE(s8)
            CASE(u8)
            default: throw std::runtime_error("unexpected dt");
        }
    }
    printf("Test passed\n");
#undef CASE
}

sycl::nd_range<1> get_nd_range(const device &dev, int nelems) {
    const size_t max_wg_size
            = dev.get_info<sycl::info::device::max_work_group_size>();
    const size_t max_work_item
            = dev.get_info<sycl::info::device::max_work_item_sizes>()[0];
    const size_t optimal_ls = std::min(max_wg_size, max_work_item);

    const size_t ls = std::min((size_t)nelems, optimal_ls);
    const size_t gs = nelems % ls ? (nelems / ls + 1) * ls : nelems;
    printf("ls:%lu, gs:%lu\n", ls, gs);
    return {{gs}, {ls}};
}

int main() {
    constexpr int64_t nelems = 1024 * 1024 * 256;
    constexpr int64_t niters = 10;

    auto dev = device(gpu_selector_v);
    auto q = queue(dev);


#define SUBMIT_CASE(sdt, ddt) \
    { \
        printf("%s -> %s\n", dt_traits<dt_t::sdt>::name, \
                dt_traits<dt_t::ddt>::name); \
        void *src = allocate_buffer(dt_t::sdt, nelems, q); \
        void *dst = allocate_buffer(dt_t::ddt, nelems, q); \
        init_buffer(src, dt_t::sdt, nelems); \
        const auto nd_range = get_nd_range(dev, nelems); \
        /* Warm-up run */ \
        auto e = q.submit([&](handler &cgh) { \
            transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
            cgh.parallel_for(nd_range, tk); \
        }); \
        q.wait_and_throw(); \
\
        auto start = high_resolution_clock::now(); \
        for (int i = 0; i < niters; i++) { \
            e = q.submit([&](handler &cgh) { \
                cgh.depends_on({e}); \
                transform_kernel_t<dt_t::sdt, dt_t::ddt> tk(src, dst, nelems); \
                cgh.parallel_for(nd_range, tk); \
            }); \
        } \
        q.wait_and_throw(); \
        auto end = high_resolution_clock::now(); \
\
        try { \
            check_buffer(dst, dt_t::ddt, nelems); \
        } catch (std::exception & e) { \
            std::cout << e.what() << std::endl; \
            return 1; \
        } \
        /* Time in seconds */ \
        double time = (duration_cast<microseconds>(end - start)).count() \
                / niters / 1e6f; \
        /* Size in GB */ \
        double size = ((dt_traits<dt_t::sdt>::size * nelems) \
                             + (dt_traits<dt_t::ddt>::size * nelems)) \
                / 1e9; \
        printf("size(gb):%.2f, time(sec):%f, BW:%f\n", size, time, \
                size / time); \
        free(src, q); \
        free(dst, q); \
    }

    //-----------------------
    SUBMIT_CASE(f16, f16)
    SUBMIT_CASE(f16, f32)
    SUBMIT_CASE(f16, s32)
    SUBMIT_CASE(f16, s8)
    SUBMIT_CASE(f16, u8)
    //-----------------------
    SUBMIT_CASE(f32, f32)
    SUBMIT_CASE(f32, f16)
    SUBMIT_CASE(f32, s32)
    SUBMIT_CASE(f32, s8)
    SUBMIT_CASE(f32, u8)
    //-----------------------
    SUBMIT_CASE(s32, s32)
    SUBMIT_CASE(s32, f32)
    SUBMIT_CASE(s32, f16)
    SUBMIT_CASE(s32, s8)
    SUBMIT_CASE(s32, u8)
    //-----------------------
    SUBMIT_CASE(s8, s8)
    SUBMIT_CASE(s8, f32)
    SUBMIT_CASE(s8, f16)
    SUBMIT_CASE(s8, s32)
    SUBMIT_CASE(s8, u8)
    //-----------------------
    SUBMIT_CASE(u8, u8)
    SUBMIT_CASE(u8, f32)
    SUBMIT_CASE(u8, f16)
    SUBMIT_CASE(u8, s32)
    SUBMIT_CASE(u8, s8)
    //-----------------------

#undef SUBMIT_CASE
    return 0;
}
densamoilov added the bug (Something isn't working) label on Oct 27, 2022
densamoilov changed the title from "[HIP] Simple data conversion kernel is significantly slower on AMD" to "[HIP][Perf] Simple data conversion kernel is significantly slower on AMD" on Oct 27, 2022
abagusetty (Contributor) commented

I can confirm the same behavior on MI250 as well (~40 GB/s).

bader added the hip (Issues related to execution on HIP backend.) label on Oct 27, 2022
zjin-lcf (Contributor) commented

I'd like to add the following sub-issue:

lld: error: undefined hidden symbol: __spirv_ConvertBF16ToFINTEL(unsigned short)
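
For reference, a minimal sketch of the kind of bf16 -> f32 conversion that runs into this symbol when targeting the HIP backend; the file name, element count, and fill values are illustrative, not the exact reproducer:

// bf16_convert_repro.cpp -- illustrative minimal reproducer sketch.
// The bfloat16 -> float conversion in the kernel is the operation that
// lowers to __spirv_ConvertBF16ToFINTEL on some targets.
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/bfloat16.hpp>

using sycl::ext::oneapi::experimental::bfloat16;

int main() {
    sycl::queue q{sycl::gpu_selector_v};
    constexpr int n = 1024;

    auto *src = sycl::malloc_shared<bfloat16>(n, q);
    auto *dst = sycl::malloc_shared<float>(n, q);
    for (int i = 0; i < n; i++)
        src[i] = bfloat16(float(i % 7));

    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
         dst[i] = static_cast<float>(src[i]); // bf16 -> f32
     }).wait();

    sycl::free(src, q);
    sycl::free(dst, q);
    return 0;
}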

zjin-lcf (Contributor) commented

Plain HIP and SYCL programs suggest that the issue may lie in AMD ROCm itself:

https://github.com/zjin-lcf/HeCBench/tree/master/conversion-hip
https://github.com/zjin-lcf/HeCBench/tree/master/conversion-sycl
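
For readers who do not follow the links, here is an illustrative sketch (not the HeCBench code itself) of what the plain HIP side of such a comparison looks like, for an f16 -> f32 conversion; the block size, iteration count, and lack of result checking are simplifications:

// hip_convert_sketch.cpp -- illustrative HIP analogue of transform_kernel_t.
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdio>

__global__ void convert_f16_to_f32(const __half *src, float *dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = __half2float(src[i]); // f16 -> f32
}

int main() {
    const int n = 1024 * 1024 * 256;
    __half *src;
    float *dst;
    // Buffers are left uninitialized: only the bandwidth is timed here.
    hipMalloc(&src, n * sizeof(__half));
    hipMalloc(&dst, n * sizeof(float));

    const int block = 256;
    const int grid = (n + block - 1) / block;
    const int niters = 10;

    // Warm-up launch, then timed launches bracketed by HIP events.
    convert_f16_to_f32<<<grid, block>>>(src, dst, n);
    hipDeviceSynchronize();

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    for (int it = 0; it < niters; it++)
        convert_f16_to_f32<<<grid, block>>>(src, dst, n);
    hipEventRecord(stop);
    hipEventSynchronize(stop);

    float ms = 0.f;
    hipEventElapsedTime(&ms, start, stop);
    double gb = double(n) * (sizeof(__half) + sizeof(float)) / 1e9;
    printf("BW: %f GB/s\n", gb / (ms / niters / 1e3));

    hipFree(src);
    hipFree(dst);
    return 0;
}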

JackAKirk (Contributor) commented

Please see ROCm/HIP#3043 for the solution.
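
The linked HIP issue is not quoted here. As one hedged illustration of a direction sometimes taken for this class of problem (kernels touching shared USM allocations on MI200), the reproducer can be restructured to keep its buffers in device USM and stage data with explicit copies; whether this matches the linked solution is an assumption, and the element count below is reduced just to keep the sketch light:

// Hypothetical variant of the conversion reproducer that uses device USM
// instead of shared USM. Assumption: the slowdown is tied to how kernels
// access malloc_shared allocations on MI200.
#include <CL/sycl.hpp>
#include <vector>

int main() {
    sycl::queue q{sycl::gpu_selector_v};
    constexpr int n = 1024 * 1024 * 64; // smaller than the original 256M elements

    std::vector<float> host_src(n, 1.0f);
    auto *src = sycl::malloc_device<float>(n, q);
    auto *dst = sycl::malloc_device<sycl::half>(n, q);

    // Stage the input through an explicit copy rather than relying on
    // shared-allocation migration.
    q.memcpy(src, host_src.data(), n * sizeof(float)).wait();

    q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) {
         dst[i] = static_cast<sycl::half>(src[i]); // f32 -> f16
     }).wait();

    std::vector<sycl::half> host_dst(n);
    q.memcpy(host_dst.data(), dst, n * sizeof(sycl::half)).wait();

    sycl::free(src, q);
    sycl::free(dst, q);
    return 0;
}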
