diff --git a/scripts/amd/gemm/bench_gemm.py b/scripts/amd/gemm/bench_gemm.py index 39252680436d..c3b16cc6e2e9 100644 --- a/scripts/amd/gemm/bench_gemm.py +++ b/scripts/amd/gemm/bench_gemm.py @@ -62,7 +62,7 @@ def run_hipblaslt_bench(hipblaslt_bench, M, N, K, transA, transB, dtype): hipBLASLt_bench_args += f" -i {ITER} -j {WARMUP} --print_kernel_info" SED_WINNER = "sed -n '/Winner:/, $p'" - print(f"Tuning hipblaslt with {hipBLASLt_bench_args}") + print(f"Tuning hipblaslt with {hipBLASLt_bench_args}", flush=True) winner = run_bash_command( f"HIP_FORCE_DEV_KERNARG=1 {hipblaslt_bench} {hipBLASLt_bench_args} | {SED_WINNER}" @@ -83,7 +83,7 @@ def run_hipblaslt_bench(hipblaslt_bench, M, N, K, transA, transB, dtype): def run_triton_tuning(input, output, dtype_a): - print(f"Tuning gemm sizes from {input} with Triton") + print(f"Tuning gemm sizes from {input} with Triton", flush=True) run_bash_command( f"./tune_gemm.py --gemm_size_file {input} -dtype_a {dtype_a} -dtype_b {dtype_a} --ngpus 8 --jobs 32 --o {output}", False) @@ -93,7 +93,7 @@ def run_triton_bench(input, dtype_a): if not os.path.exists(input): print(f"{input} does not exist, please run tuning first") sys.exit(1) - print(f"Benchmarking gemms from {input} with Triton") + print(f"Benchmarking gemms from {input} with Triton", flush=True) triton_output = run_bash_command( f"./tune_gemm.py --gemm_size_file {input} -dtype_a {dtype_a} -dtype_b {dtype_a} --benchmark" ) diff --git a/scripts/amd/gemm/configs/beautiful.yaml b/scripts/amd/gemm/configs/beautiful.yaml index 3c7a5cfac9ce..5d085e028018 100644 --- a/scripts/amd/gemm/configs/beautiful.yaml +++ b/scripts/amd/gemm/configs/beautiful.yaml @@ -1,9 +1,17 @@ ## TN -# The best gemm size that provides the best perf number -- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N'} -# K % 256 == 0, it has some cache conflict issue -- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N'} +#- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N'} +#- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N'} +#- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N'} +#- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N'} + +- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N'} +- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N'} +- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N'} +- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N'} +- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N'} +- {'M': 4864, 'N': 4096, 'K': 4128, 'rowMajorA': 'T', 'rowMajorB': 'N'} + ## TT replace the same config except TT -- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'T'} -- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T'} +#- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'T'} +#- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T'}