From 44e7122c15c155de0873f7a6b3bd6f2e133c211a Mon Sep 17 00:00:00 2001 From: William Moses Date: Mon, 7 Feb 2022 11:33:32 -0500 Subject: [PATCH] Correct and simplify sdot/ddot (#498) --- enzyme/Enzyme/AdjointGenerator.h | 45 ++-- enzyme/Enzyme/EnzymeLogic.cpp | 11 +- .../Enzyme/ReverseMode/blas/cblas_ddot.ll | 198 ++++++++++++++++++ .../blas/cblas_ddot_inactive_first.ll | 85 -------- .../blas/cblas_ddot_inactive_mod1.ll | 91 -------- .../blas/cblas_ddot_inactive_second.ll | 85 -------- .../blas/cblas_ddot_inactive_stride.ll | 104 --------- .../ReverseMode/blas/cblas_ddot_mod1.ll | 117 ----------- .../blas/cblas_ddot_mod1_stride.ll | 127 ----------- .../blas/cblas_ddot_mod1_stride_split.ll | 182 ---------------- .../ReverseMode/blas/cblas_ddot_mod2_split.ll | 179 ---------------- .../ReverseMode/blas/cblas_ddot_nomod.ll | 90 -------- .../Enzyme/ReverseMode/blas/cblas_sdot.ll | 198 ++++++++++++++++++ .../blas/cblas_sdot_inactive_first.ll | 85 -------- .../blas/cblas_sdot_inactive_mod1.ll | 91 -------- .../blas/cblas_sdot_inactive_second.ll | 85 -------- .../blas/cblas_sdot_inactive_stride.ll | 92 -------- .../ReverseMode/blas/cblas_sdot_mod1.ll | 117 ----------- .../blas/cblas_sdot_mod1_stride.ll | 115 ---------- .../blas/cblas_sdot_mod1_stride_split.ll | 170 --------------- .../ReverseMode/blas/cblas_sdot_mod2_split.ll | 139 ------------ .../ReverseMode/blas/cblas_sdot_nomod.ll | 90 -------- 22 files changed, 435 insertions(+), 2061 deletions(-) create mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_first.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_mod1.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_second.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_stride.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride_split.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod2_split.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_nomod.ll create mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_first.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_mod1.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_second.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_stride.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride_split.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod2_split.ll delete mode 100644 enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_nomod.ll diff --git a/enzyme/Enzyme/AdjointGenerator.h b/enzyme/Enzyme/AdjointGenerator.h index 2d61ebf5ad996..91b1228abf2ad 100644 --- a/enzyme/Enzyme/AdjointGenerator.h +++ b/enzyme/Enzyme/AdjointGenerator.h @@ -4652,6 +4652,11 @@ class AdjointGenerator bool handleBLAS(llvm::CallInst &call, Function *called, StringRef funcName, const std::map &uncacheable_args) { + // Forward Mode not handled yet + assert(Mode != DerivativeMode::ForwardMode && + Mode != DerivativeMode::ForwardModeSplit); + // Vector Mode not handled yet + assert(gutils->getWidth() == 1); CallInst *const newCall = cast(gutils->getNewFromOriginal(&call)); IRBuilder<> BuilderZ(newCall); BuilderZ.setFastMathFlags(getFast()); @@ -4671,9 +4676,6 @@ class AdjointGenerator } Type *castvals[2] = {call.getArgOperand(1)->getType(), call.getArgOperand(3)->getType()}; - auto *cachetype = - StructType::get(call.getContext(), ArrayRef(castvals)); - Value *undefinit = UndefValue::get(cachetype); Value *cacheval; auto in_arg = call.getCalledFunction()->arg_begin(); in_arg++; @@ -4694,15 +4696,16 @@ class AdjointGenerator if (xcache) { auto dmemcpy = getOrInsertMemcpyStrided(*gutils->oldFunc->getParent(), - PointerType::getUnqual(innerType), 0, 0); + cast(castvals[0]), 0, 0); auto malins = CallInst::CreateMalloc( gutils->getNewFromOriginal(&call), size->getType(), innerType, - size, call.getArgOperand(0), nullptr, ""); - arg1 = - BuilderZ.CreateBitCast(malins, call.getArgOperand(1)->getType()); + size, gutils->getNewFromOriginal(call.getArgOperand(0)), nullptr, + ""); + arg1 = BuilderZ.CreateBitCast(malins, castvals[0]); Value *args[4] = {arg1, gutils->getNewFromOriginal(call.getArgOperand(1)), - call.getArgOperand(0), call.getArgOperand(2)}; + gutils->getNewFromOriginal(call.getArgOperand(0)), + gutils->getNewFromOriginal(call.getArgOperand(2))}; BuilderZ.CreateCall( dmemcpy, args, @@ -4715,15 +4718,16 @@ class AdjointGenerator if (ycache) { auto dmemcpy = getOrInsertMemcpyStrided(*gutils->oldFunc->getParent(), - PointerType::getUnqual(innerType), 0, 0); + cast(castvals[1]), 0, 0); auto malins = CallInst::CreateMalloc( gutils->getNewFromOriginal(&call), size->getType(), innerType, - size, call.getArgOperand(0), nullptr, ""); - arg2 = - BuilderZ.CreateBitCast(malins, call.getArgOperand(3)->getType()); + size, gutils->getNewFromOriginal(call.getArgOperand(0)), nullptr, + ""); + arg2 = BuilderZ.CreateBitCast(malins, castvals[1]); Value *args[4] = {arg2, gutils->getNewFromOriginal(call.getArgOperand(3)), - call.getArgOperand(0), call.getArgOperand(4)}; + gutils->getNewFromOriginal(call.getArgOperand(0)), + gutils->getNewFromOriginal(call.getArgOperand(4))}; BuilderZ.CreateCall( dmemcpy, args, gutils->getInvertedBundles(&call, @@ -4733,7 +4737,10 @@ class AdjointGenerator BuilderZ, /*lookup*/ false)); } if (xcache && ycache) { - auto valins1 = BuilderZ.CreateInsertValue(undefinit, arg1, 0); + Type *cachetype = + StructType::get(call.getContext(), ArrayRef(castvals)); + auto valins1 = + BuilderZ.CreateInsertValue(UndefValue::get(cachetype), arg1, 0); cacheval = BuilderZ.CreateInsertValue(valins1, arg2, 1); } else if (xcache) cacheval = arg1; @@ -4758,6 +4765,16 @@ class AdjointGenerator if (Mode == DerivativeMode::ReverseModeGradient && (!gutils->isConstantValue(call.getArgOperand(1)) || !gutils->isConstantValue(call.getArgOperand(3)))) { + Type *cachetype = nullptr; + if (xcache && ycache) + cachetype = StructType::get(call.getContext(), + ArrayRef(castvals)); + else if (xcache) + cachetype = castvals[0]; + else { + assert(ycache); + cachetype = castvals[1]; + } cacheval = BuilderZ.CreatePHI(cachetype, 0); } cacheval = diff --git a/enzyme/Enzyme/EnzymeLogic.cpp b/enzyme/Enzyme/EnzymeLogic.cpp index 87ea0ccf9b0d8..cf3261d874a18 100644 --- a/enzyme/Enzyme/EnzymeLogic.cpp +++ b/enzyme/Enzyme/EnzymeLogic.cpp @@ -3266,9 +3266,14 @@ Function *EnzymeLogic::CreatePrimalAndGradient( "return or non-constant"); } - if (key.todiff->empty() && CustomErrorHandler) { - std::string s = ("No derivative found for " + key.todiff->getName()).str(); - CustomErrorHandler(s.c_str()); + if (key.todiff->empty()) { + std::string str = + ("No derivative found for " + key.todiff->getName()).str(); + if (CustomErrorHandler) { + CustomErrorHandler(str.c_str()); + } else { + llvm_unreachable(str.c_str()); + } } assert(!key.todiff->empty()); diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot.ll new file mode 100644 index 0000000000000..d1af40180ca44 --- /dev/null +++ b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot.ll @@ -0,0 +1,198 @@ +;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare dso_local void @__enzyme_autodiff(...) + +declare double @cblas_ddot(i32, double*, i32, double*, i32) + +define void @active(i32 %len, double* noalias %m, double* %dm, i32 %incm, double* noalias %n, double* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @f, i32 %len, double* noalias %m, double* %dm, i32 %incm, double* noalias %n, double* %dn, i32 %incn) + ret void +} + +define void @inactiveFirst(i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @f, i32 %len, metadata !"enzyme_const", double* noalias %m, i32 %incm, double* noalias %n, double* %dn, i32 %incn) + ret void +} + +define void @inactiveSecond(i32 %len, double* noalias %m, double* noalias %dm, i32 %incm, double* noalias %n, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @f, i32 %len, double* noalias %m, double* noalias %dm, i32 %incm, metadata !"enzyme_const", double* noalias %n, i32 %incn) + ret void +} + +define void @activeMod(i32 %len, double* noalias %m, double* %dm, i32 %incm, double* noalias %n, double* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @modf, i32 %len, double* noalias %m, double* %dm, i32 %incm, double* noalias %n, double* %dn, i32 %incn) + ret void +} + +define void @inactiveModFirst(i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @modf, i32 %len, metadata !"enzyme_const", double* noalias %m, i32 %incm, double* noalias %n, double* %dn, i32 %incn) + ret void +} + +define void @inactiveModSecond(i32 %len, double* noalias %m, double* noalias %dm, i32 %incm, double* noalias %n, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(double (i32, double*, i32, double*, i32)* @modf, i32 %len, double* noalias %m, double* noalias %dm, i32 %incm, metadata !"enzyme_const", double* noalias %n, i32 %incn) + ret void +} + +define double @f(i32 %len, double* noalias %m, i32 %incm, double* noalias %n, i32 %incn) { +entry: + %call = call double @cblas_ddot(i32 %len, double* %m, i32 %incm, double* %n, i32 %incn) + ret double %call +} + +define double @modf(i32 %len, double* noalias %m, i32 %incm, double* noalias %n, i32 %incn) { +entry: + %call = call double @f(i32 %len, double* %m, i32 %incm, double* %n, i32 %incn) + store double 0.000000e+00, double* %m + store double 0.000000e+00, double* %n + ret double %call +} + + +; CHECK: define void @active +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[active:.+]]( + +; CHECK: define void @inactiveFirst +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveFirst:.+]]( + +; CHECK: define void @inactiveSecond +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveSecond:.+]]( + + +; CHECK: define void @activeMod +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[activeMod:.+]]( + +; CHECK: define void @inactiveModFirst +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveModFirst:.+]]( + +; CHECK: define void @inactiveModSecond +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveModSecond:.+]]( + + +; CHECK: define internal void @[[active]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %m, i32 %incm, double* %"n'", i32 %incn) +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %n, i32 %incn, double* %"m'", i32 %incm) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveFirst]](i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %m, i32 %incm, double* %"n'", i32 %incn) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveSecond]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %n, i32 %incn, double* %"m'", i32 %incm) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[activeMod]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call { double*, double* } @[[augMod:.+]](i32 %len, double* %m, double* %"m'", i32 %incm, double* %n, double* %"n'", i32 %incn) +; CHECK: call void @[[revMod:.+]](i32 %len, double* %m, double* %"m'", i32 %incm, double* %n, double* %"n'", i32 %incn, double %differeturn, { double*, double* } %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal { double*, double* } @[[augMod]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, double* %"n'", i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (double* getelementptr (double, double* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to double* +; CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %1, double* %m, i32 %len, i32 %incm) +; CHECK-NEXT: %2 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize1 = mul i64 %2, ptrtoint (double* getelementptr (double, double* null, i32 1) to i64) +; CHECK-NEXT: %malloccall2 = tail call i8* @malloc(i64 %mallocsize1) +; CHECK-NEXT: %3 = bitcast i8* %malloccall2 to double* +; CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %3, double* %n, i32 %len, i32 %incn) +; CHECK-NEXT: %4 = insertvalue { double*, double* } undef, double* %1, 0 +; CHECK-NEXT: %5 = insertvalue { double*, double* } %4, double* %3, 1 +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret { double*, double* } %5 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revMod]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn, { double*, double* } +; CHECK-NEXT: entry: +; CHECK-NEXT: %1 = extractvalue { double*, double* } %0, 0 +; CHECK-NEXT: %2 = extractvalue { double*, double* } %0, 1 +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %1, i32 1, double* %"n'", i32 %incn) +; CHECK-NEXT: %3 = bitcast double* %1 to i8* +; CHECK-NEXT: tail call void @free(i8* %3) +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %2, i32 1, double* %"m'", i32 %incm) +; CHECK-NEXT: %4 = bitcast double* %2 to i8* +; CHECK-NEXT: tail call void @free(i8* %4) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveModFirst]](i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call double* @[[augModFirst:.+]](i32 %len, double* %m, i32 %incm, double* %n, double* %"n'", i32 %incn) +; CHECK: call void @[[revModFirst:.+]](i32 %len, double* %m, i32 %incm, double* %n, double* %"n'", i32 %incn, double %differeturn, double* %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal double* @[[augModFirst]](i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %"n'", i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (double* getelementptr (double, double* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to double* +; CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %1, double* %m, i32 %len, i32 %incm) +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret double* %1 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revModFirst]](i32 %len, double* noalias %m, i32 %incm, double* noalias %n, double* %"n'", i32 %incn, double %differeturn, double* +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %0, i32 1, double* %"n'", i32 %incn) +; CHECK-NEXT: %1 = bitcast double* %0 to i8* +; CHECK-NEXT: tail call void @free(i8* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveModSecond]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, i32 %incn, double %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call double* @[[augModSecond:.+]](i32 %len, double* %m, double* %"m'", i32 %incm, double* %n, i32 %incn) +; CHECK: call void @[[revModSecond:.+]](i32 %len, double* %m, double* %"m'", i32 %incm, double* %n, i32 %incn, double %differeturn, double* %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal double* @[[augModSecond]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (double* getelementptr (double, double* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to double* +; CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %1, double* %n, i32 %len, i32 %incn) +; CHECK-NEXT: %call = call double @cblas_ddot(i32 %len, double* nocapture readonly %m, i32 %incm, double* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret double* %1 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revModSecond]](i32 %len, double* noalias %m, double* %"m'", i32 %incm, double* noalias %n, i32 %incn, double %differeturn, double* +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @cblas_daxpy(i32 %len, double %differeturn, double* %0, i32 1, double* %"m'", i32 %incm) +; CHECK-NEXT: %1 = bitcast double* %0 to i8* +; CHECK-NEXT: tail call void @free(i8* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_first.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_first.ll deleted file mode 100644 index 28930a219f373..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_first.ll +++ /dev/null @@ -1,85 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(double*, double*, double*); -; -;double g(double *restrict m) { -; double n[3] = {4, 5, 6}; -; double x = cblas_ddot(3, m, 1, n, 1); -; double y = x*x; -; return y; -;} -; -;int main() { -; double m[3] = {1, 2, 3}; -; double m1[3] = {0.}; -; double z = __enzyme_autodiff((double*)g, m, m1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 -@__const.main.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 - -define dso_local double @g(double* noalias %m) { -entry: - %m.addr = alloca double*, align 8 - %n = alloca [3 x double], align 16 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - %0 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.n to i8*), i64 24, i1 false) - %1 = load double*, double** %m.addr, align 8 - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %call = call double @cblas_ddot(i32 3, double* %1, i32 1, double* %arraydecay, i32 1) - store double %call, double* %x, align 8 - %2 = load double, double* %x, align 8 - %3 = load double, double* %x, align 8 - %mul = fmul double %2, %3 - store double %mul, double* %y, align 8 - %4 = load double, double* %y, align 8 - ret double %4 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x double], align 16 - %m1 = alloca [3 x double], align 16 - %z = alloca double, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %m1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(double* bitcast (double (double*)* @g to double*), double* %arraydecay, double* %arraydecay1) - store double %call, double* %z, align 8 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %n = alloca [3 x double], align 16 -;CHECK-NEXT: %0 = bitcast [3 x double]* %n to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.n to i8*), i64 24, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 1, double* nocapture readonly %arraydecay, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %1 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %arraydecay, i32 1, double* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK: declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_mod1.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_mod1.ll deleted file mode 100644 index f2de7ab4707a8..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_mod1.ll +++ /dev/null @@ -1,91 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(double*, double*, double*); -; -;double g(double *restrict m) { -; double n[3] = {4, 5, 6}; -; double x = cblas_ddot(3, m, 1, n, 1); -; m[0] = 10; -; double y = x*x; -; return y; -;} -; -;int main() { -; double m[3] = {1, 2, 3}; -; double m1[3] = {0.}; -; double z = __enzyme_autodiff((double*)g, m, m1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 -@__const.main.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 - -define dso_local double @g(double* noalias %m) { -entry: - %m.addr = alloca double*, align 8 - %n = alloca [3 x double], align 16 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - %0 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.n to i8*), i64 24, i1 false) - %1 = load double*, double** %m.addr, align 8 - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %call = call double @cblas_ddot(i32 3, double* %1, i32 1, double* %arraydecay, i32 1) - store double %call, double* %x, align 8 - %2 = load double*, double** %m.addr, align 8 - %arrayidx = getelementptr inbounds double, double* %2, i64 0 - store double 1.000000e+01, double* %arrayidx, align 8 - %3 = load double, double* %x, align 8 - %4 = load double, double* %x, align 8 - %mul = fmul double %3, %4 - store double %mul, double* %y, align 8 - %5 = load double, double* %y, align 8 - ret double %5 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x double], align 16 - %m1 = alloca [3 x double], align 16 - %z = alloca double, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %m1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(double* bitcast (double (double*)* @g to double*), double* %arraydecay, double* %arraydecay1) - store double %call, double* %z, align 8 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %n = alloca [3 x double], align 16 -;CHECK-NEXT: %0 = bitcast [3 x double]* %n to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.n to i8*), i64 24, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 1, double* nocapture readonly %arraydecay, i32 1) -;CHECK-NEXT: store double 1.000000e+01, double* %m, align 8 -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %1 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: store double 0.000000e+00, double* %"m'", align 8 -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %arraydecay, i32 1, double* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK: declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_second.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_second.ll deleted file mode 100644 index 759dda82c3b8d..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_second.ll +++ /dev/null @@ -1,85 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(double*, double*, double*); -; -;double g(double *restrict n) { -; double m[3] = {1, 2, 3}; -; double x = cblas_ddot(3, m, 1, n, 1); -; double y = x*x; -; return y; -;} -; -;int main() { -; double n[3] = {4, 5, 6}; -; double n1[3] = {0.}; -; double z = __enzyme_autodiff((double*)g, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 -@__const.main.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 - -define dso_local double @g(double* noalias %n) { -entry: - %n.addr = alloca double*, align 8 - %m = alloca [3 x double], align 16 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %n, double** %n.addr, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.m to i8*), i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %1 = load double*, double** %n.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %arraydecay, i32 1, double* %1, i32 1) - store double %call, double* %x, align 8 - %2 = load double, double* %x, align 8 - %3 = load double, double* %x, align 8 - %mul = fmul double %2, %3 - store double %mul, double* %y, align 8 - %4 = load double, double* %y, align 8 - ret double %4 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %n = alloca [3 x double], align 16 - %n1 = alloca [3 x double], align 16 - %z = alloca double, align 8 - %0 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.n to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(double* bitcast (double (double*)* @g to double*), double* %arraydecay, double* %arraydecay1) - store double %call, double* %z, align 8 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %m = alloca [3 x double], align 16 -;CHECK-NEXT: %0 = bitcast [3 x double]* %m to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.g.m to i8*), i64 24, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %arraydecay, i32 1, double* nocapture readonly %n, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %1 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %arraydecay, i32 1, double* %"n'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK: declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_stride.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_stride.ll deleted file mode 100644 index 0ba43f57491ad..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_inactive_stride.ll +++ /dev/null @@ -1,104 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(void *, double *, double *, double *, -; double *, int); -; -;double g(double *restrict m, double *restrict n, int stride) { -; double x = cblas_ddot(3, m, 2, n, stride); -; double y = x * x; -; return y; -;} -; -;int main() { -; double m[6] = {1, 2, 3, 101, 102, 103}; -; double m1[6] = {0, 0, 0, 0, 0, 0}; -; double n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; double n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1, 3); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.n = private unnamed_addr constant [9 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00, double 1.040000e+02, double 1.050000e+02, double 1.060000e+02, double 7.000000e+00, double 8.000000e+00, double 9.000000e+00], align 16 - -define dso_local double @g(double* noalias %m, double* noalias %n, i32 %stride) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %stride.addr = alloca i32, align 4 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - store i32 %stride, i32* %stride.addr, align 4 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - %2 = load i32, i32* %stride.addr, align 4 - %call = call double @cblas_ddot(i32 3, double* %0, i32 2, double* %1, i32 %2) - store double %call, double* %x, align 8 - %3 = load double, double* %x, align 8 - %4 = load double, double* %x, align 8 - %mul = fmul double %3, %4 - store double %mul, double* %y, align 8 - %5 = load double, double* %y, align 8 - ret double %5 -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [6 x double], align 16 - %m1 = alloca [6 x double], align 16 - %n = alloca [9 x double], align 16 - %n1 = alloca [9 x double], align 16 - %0 = bitcast [6 x double]* %m to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %0, i8 0, i64 48, i1 false) - %1 = bitcast i8* %0 to [6 x double]* - %2 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 0 - store double 1.000000e+00, double* %2, align 16 - %3 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 1 - store double 2.000000e+00, double* %3, align 8 - %4 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 2 - store double 3.000000e+00, double* %4, align 16 - %5 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 3 - store double 1.010000e+02, double* %5, align 8 - %6 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 4 - store double 1.020000e+02, double* %6, align 16 - %7 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 5 - store double 1.030000e+02, double* %7, align 8 - %8 = bitcast [6 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %8, i8 0, i64 48, i1 false) - %9 = bitcast [9 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %9, i8* align 16 bitcast ([9 x double]* @__const.main.n to i8*), i64 72, i1 false) - %10 = bitcast [9 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %10, i8 0, i64 72, i1 false) - %arraydecay = getelementptr inbounds [6 x double], [6 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x double], [6 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x double], [9 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x double], [9 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(i8* bitcast (double (double*, double*, i32)* @g to i8*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3, i32 3) - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @__enzyme_autodiff(i8*, double*, double*, double*, double*, i32) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double* noalias %n, double* %"n'", i32 %stride, double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 2, double* nocapture readonly %n, i32 %stride) -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %0 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %0, double* %m, i32 2, double* %"n'", i32 %stride) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %0, double* %n, i32 %stride, double* %"m'", i32 2) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1.ll deleted file mode 100644 index af8a8681d602d..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1.ll +++ /dev/null @@ -1,117 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(void *, double *, double *, double *, -; double *); -; -;double g(double *restrict m, double *restrict n) { -; double x = cblas_ddot(3, m, 1, n, 1); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; double y = x * x; -; return y; -;} -; -;int main() { -; double m[3] = {1, 2, 3}; -; double m1[3] = {0, 0, 0}; -; double n[3] = {4, 5, 6}; -; double n1[3] = {0, 0, 0}; -; double val = __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 -@__const.main.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 - -define dso_local double @g(double* noalias %m, double* noalias %n) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %0, i32 1, double* %1, i32 1) - store double %call, double* %x, align 8 - %2 = load double*, double** %m.addr, align 8 - %arrayidx = getelementptr inbounds double, double* %2, i64 0 - store double 1.100000e+01, double* %arrayidx, align 8 - %3 = load double*, double** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds double, double* %3, i64 1 - store double 1.200000e+01, double* %arrayidx1, align 8 - %4 = load double*, double** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds double, double* %4, i64 2 - store double 1.300000e+01, double* %arrayidx2, align 8 - %5 = load double, double* %x, align 8 - %6 = load double, double* %x, align 8 - %mul = fmul double %5, %6 - store double %mul, double* %y, align 8 - %7 = load double, double* %y, align 8 - ret double %7 -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x double], align 16 - %m1 = alloca [3 x double], align 16 - %n = alloca [3 x double], align 16 - %n1 = alloca [3 x double], align 16 - %val = alloca double, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %2 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %2, i8* align 16 bitcast ([3 x double]* @__const.main.n to i8*), i64 24, i1 false) - %3 = bitcast [3 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x double], [3 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(i8* bitcast (double (double*, double*)* @g to i8*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3) - store double %call, double* %val, align 8 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(i8*, double*, double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double* noalias %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (double* getelementptr (double, double* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to double* -;CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %0, double* %m, i32 3, i32 1) -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 1, double* nocapture readonly %n, i32 1) -;CHECK-NEXT: store double 1.100000e+01, double* %m, align 8 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds double, double* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds double, double* %m, i64 1 -;CHECK-NEXT: store double 1.200000e+01, double* %arrayidx1, align 8 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds double, double* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds double, double* %m, i64 2 -;CHECK-NEXT: store double 1.300000e+01, double* %arrayidx2, align 8 -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %1 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx2'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx1'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"m'", align 8 -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %0, i32 1, double* %"n'", i32 1) -;CHECK-NEXT: tail call void @free(i8* %malloccall) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %n, i32 1, double* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride.ll deleted file mode 100644 index d4d2c513c9f61..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride.ll +++ /dev/null @@ -1,127 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(void *, double *, double *, double *, -; double *); -; -;double g(double *restrict m, double *restrict n) { -; double x = cblas_ddot(3, m, 2, n, 3); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; double y = x * x; -; return y; -;} -; -;int main() { -; double m[6] = {1, 2, 3, 101, 102, 103}; -; double m1[6] = {0, 0, 0, 0, 0, 0}; -; double n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; double n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.n = private unnamed_addr constant [9 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00, double 1.040000e+02, double 1.050000e+02, double 1.060000e+02, double 7.000000e+00, double 8.000000e+00, double 9.000000e+00], align 16 - -define dso_local double @g(double* noalias %m, double* noalias %n) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %0, i32 2, double* %1, i32 3) - store double %call, double* %x, align 8 - %2 = load double*, double** %m.addr, align 8 - %arrayidx = getelementptr inbounds double, double* %2, i64 0 - store double 1.100000e+01, double* %arrayidx, align 8 - %3 = load double*, double** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds double, double* %3, i64 1 - store double 1.200000e+01, double* %arrayidx1, align 8 - %4 = load double*, double** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds double, double* %4, i64 2 - store double 1.300000e+01, double* %arrayidx2, align 8 - %5 = load double, double* %x, align 8 - %6 = load double, double* %x, align 8 - %mul = fmul double %5, %6 - store double %mul, double* %y, align 8 - %7 = load double, double* %y, align 8 - ret double %7 -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [6 x double], align 16 - %m1 = alloca [6 x double], align 16 - %n = alloca [9 x double], align 16 - %n1 = alloca [9 x double], align 16 - %0 = bitcast [6 x double]* %m to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %0, i8 0, i64 48, i1 false) - %1 = bitcast i8* %0 to [6 x double]* - %2 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 0 - store double 1.000000e+00, double* %2, align 16 - %3 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 1 - store double 2.000000e+00, double* %3, align 8 - %4 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 2 - store double 3.000000e+00, double* %4, align 16 - %5 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 3 - store double 1.010000e+02, double* %5, align 8 - %6 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 4 - store double 1.020000e+02, double* %6, align 16 - %7 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 5 - store double 1.030000e+02, double* %7, align 8 - %8 = bitcast [6 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %8, i8 0, i64 48, i1 false) - %9 = bitcast [9 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %9, i8* align 16 bitcast ([9 x double]* @__const.main.n to i8*), i64 72, i1 false) - %10 = bitcast [9 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %10, i8 0, i64 72, i1 false) - %arraydecay = getelementptr inbounds [6 x double], [6 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x double], [6 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x double], [9 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x double], [9 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(i8* bitcast (double (double*, double*)* @g to i8*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3) - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @__enzyme_autodiff(i8*, double*, double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double* noalias %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (double* getelementptr (double, double* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to double* -;CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %0, double* %m, i32 3, i32 2) -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 2, double* nocapture readonly %n, i32 3) -;CHECK-NEXT: store double 1.100000e+01, double* %m, align 8 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds double, double* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds double, double* %m, i64 1 -;CHECK-NEXT: store double 1.200000e+01, double* %arrayidx1, align 8 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds double, double* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds double, double* %m, i64 2 -;CHECK-NEXT: store double 1.300000e+01, double* %arrayidx2, align 8 -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %1 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx2'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx1'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"m'", align 8 -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %0, i32 1, double* %"n'", i32 3) -;CHECK-NEXT: tail call void @free(i8* %malloccall) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %1, double* %n, i32 3, double* %"m'", i32 2) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride_split.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride_split.ll deleted file mode 100644 index 4a1c324b6dc6c..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod1_stride_split.ll +++ /dev/null @@ -1,182 +0,0 @@ -;RUN: if [ %llvmver -ge 8 ]; then %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s; fi - -;#include -; -;extern double __enzyme_autodiff(void *, double *, double *, double *, -; double *); -; -;void outer(double* out, double *a, double *b) { -; *out = cblas_ddot(3, a, 2, b, 3); -;} -; -;double g(double *restrict m, double *restrict n) { -; double x; -; outer(&x, m, n); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; double y = x * x; -; return y; -;} -; -;int main() { -; double m[6] = {1, 2, 3, 101, 102, 103}; -; double m1[6] = {0, 0, 0, 0, 0, 0}; -; double n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; double n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.n = private unnamed_addr constant [9 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00, double 1.040000e+02, double 1.050000e+02, double 1.060000e+02, double 7.000000e+00, double 8.000000e+00, double 9.000000e+00], align 16 - -define dso_local void @outer(double* %out, double* %a, double* %b) { -entry: - %out.addr = alloca double*, align 8 - %a.addr = alloca double*, align 8 - %b.addr = alloca double*, align 8 - store double* %out, double** %out.addr, align 8 - store double* %a, double** %a.addr, align 8 - store double* %b, double** %b.addr, align 8 - %0 = load double*, double** %a.addr, align 8 - %1 = load double*, double** %b.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %0, i32 2, double* %1, i32 3) - %2 = load double*, double** %out.addr, align 8 - store double %call, double* %2, align 8 - ret void -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local double @g(double* noalias %m, double* noalias %n) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - call void @outer(double* %x, double* %0, double* %1) - %2 = load double*, double** %m.addr, align 8 - %arrayidx = getelementptr inbounds double, double* %2, i64 0 - store double 1.100000e+01, double* %arrayidx, align 8 - %3 = load double*, double** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds double, double* %3, i64 1 - store double 1.200000e+01, double* %arrayidx1, align 8 - %4 = load double*, double** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds double, double* %4, i64 2 - store double 1.300000e+01, double* %arrayidx2, align 8 - %5 = load double, double* %x, align 8 - %6 = load double, double* %x, align 8 - %mul = fmul double %5, %6 - store double %mul, double* %y, align 8 - %7 = load double, double* %y, align 8 - ret double %7 -} - -define dso_local i32 @main() { -entry: - %m = alloca [6 x double], align 16 - %m1 = alloca [6 x double], align 16 - %n = alloca [9 x double], align 16 - %n1 = alloca [9 x double], align 16 - %0 = bitcast [6 x double]* %m to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %0, i8 0, i64 48, i1 false) - %1 = bitcast i8* %0 to [6 x double]* - %2 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 0 - store double 1.000000e+00, double* %2, align 16 - %3 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 1 - store double 2.000000e+00, double* %3, align 8 - %4 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 2 - store double 3.000000e+00, double* %4, align 16 - %5 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 3 - store double 1.010000e+02, double* %5, align 8 - %6 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 4 - store double 1.020000e+02, double* %6, align 16 - %7 = getelementptr inbounds [6 x double], [6 x double]* %1, i32 0, i32 5 - store double 1.030000e+02, double* %7, align 8 - %8 = bitcast [6 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %8, i8 0, i64 48, i1 false) - %9 = bitcast [9 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %9, i8* align 16 bitcast ([9 x double]* @__const.main.n to i8*), i64 72, i1 false) - %10 = bitcast [9 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %10, i8 0, i64 72, i1 false) - %arraydecay = getelementptr inbounds [6 x double], [6 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x double], [6 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x double], [9 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x double], [9 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(i8* bitcast (double (double*, double*)* @g to i8*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3) - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local double @__enzyme_autodiff(i8*, double*, double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double* noalias %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %"x'ipa" = alloca double, align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"x'ipa", align 8 -;CHECK-NEXT: %x = alloca double, align 8 -;CHECK-NEXT: %_augmented = call { double*, double* } @augmented_outer(double* %x, double* %"x'ipa", double* %m, double* %"m'", double* %n, double* %"n'") -;CHECK-NEXT: store double 1.100000e+01, double* %m, align 8 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds double, double* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds double, double* %m, i64 1 -;CHECK-NEXT: store double 1.200000e+01, double* %arrayidx1, align 8 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds double, double* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds double, double* %m, i64 2 -;CHECK-NEXT: store double 1.300000e+01, double* %arrayidx2, align 8 -;CHECK-NEXT: %0 = load double, double* %x, align 8 -;CHECK-NEXT: %1 = load double, double* %x, align 8 -;CHECK-NEXT: %m0diffe = fmul fast double %differeturn, %1 -;CHECK-NEXT: %m1diffe = fmul fast double %differeturn, %0 -;CHECK-NEXT: %2 = load double, double* %"x'ipa", align 8 -;CHECK-NEXT: %3 = fadd fast double %2, %m1diffe -;CHECK-NEXT: store double %3, double* %"x'ipa", align 8 -;CHECK-NEXT: %4 = load double, double* %"x'ipa", align 8 -;CHECK-NEXT: %5 = fadd fast double %4, %m0diffe -;CHECK-NEXT: store double %5, double* %"x'ipa", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx2'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx1'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"m'", align 8 -;CHECK-NEXT: call void @diffeouter(double* %x, double* %"x'ipa", double* %m, double* %"m'", double* %n, double* %"n'", { double*, double* } %_augmented) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:define internal { double*, double* } @augmented_outer(double* %out, double* %"out'", double* %a, double* %"a'", double* %b, double* %"b'") -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (double* getelementptr (double, double* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to double* -;CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %0, double* %a, i32 3, i32 2) -;CHECK-NEXT: %malloccall2 = tail call i8* @malloc(i64 mul (i64 ptrtoint (double* getelementptr (double, double* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %1 = bitcast i8* %malloccall2 to double* -;CHECK-NEXT: call void @__enzyme_memcpy_doubleda0sa0stride(double* %1, double* %b, i32 3, i32 3) -;CHECK-NEXT: %2 = insertvalue { double*, double* } undef, double* %0, 0 -;CHECK-NEXT: %3 = insertvalue { double*, double* } %2, double* %1, 1 -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %a, i32 2, double* nocapture readonly %b, i32 3) -;CHECK-NEXT: store double %call, double* %out, align 8 -;CHECK-NEXT: ret { double*, double* } %3 -;CHECK-NEXT:} - -;CHECK:define internal void @diffeouter(double* %out, double* %"out'", double* %a, double* %"a'", double* %b, double* %"b'", { double*, double* } -;CHECK-NEXT:entry: -;CHECK-NEXT: %1 = extractvalue { double*, double* } %0, 0 -;CHECK-NEXT: %2 = extractvalue { double*, double* } %0, 1 -;CHECK-NEXT: %3 = load double, double* %"out'", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"out'", align 8 -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %3, double* %1, i32 1, double* %"b'", i32 3) -;CHECK-NEXT: %4 = bitcast double* %1 to i8* -;CHECK-NEXT: tail call void @free(i8* %4) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %3, double* %2, i32 1, double* %"a'", i32 2) -;CHECK-NEXT: %5 = bitcast double* %2 to i8* -;CHECK-NEXT: tail call void @free(i8* %5) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod2_split.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod2_split.ll deleted file mode 100644 index 32bf798f53a2b..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_mod2_split.ll +++ /dev/null @@ -1,179 +0,0 @@ -;RUN: if [ %llvmver -ge 8 ]; then %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s; fi - -;#include -; -;extern double __enzyme_autodiff(void *, double *, double *, double *, -; double *); -; -;void outer(double* out, double *a, double *b) { -; *out = cblas_ddot(3, a, 1, b, 1); -;} -; -;double g(double *m, double *n) { -; double x; -; outer(&x, m, n); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; n[0] = 21.0; -; n[1] = 22.0; -; n[2] = 23.0; -; double y = x * x; -; return y; -;} -; -;int main() { -; double m[3] = {1, 2, 3}; -; double m1[3] = {0, 0, 0}; -; double n[3] = {4, 5, 6}; -; double n1[3] = {0, 0, 0}; -; double val = __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 -@__const.main.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 - -define dso_local void @outer(double* %out, double* %a, double* %b) { -entry: - %out.addr = alloca double*, align 8 - %a.addr = alloca double*, align 8 - %b.addr = alloca double*, align 8 - store double* %out, double** %out.addr, align 8 - store double* %a, double** %a.addr, align 8 - store double* %b, double** %b.addr, align 8 - %0 = load double*, double** %a.addr, align 8 - %1 = load double*, double** %b.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %0, i32 1, double* %1, i32 1) - %2 = load double*, double** %out.addr, align 8 - store double %call, double* %2, align 8 - ret void -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local double @g(double* %m, double* %n) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - call void @outer(double* %x, double* %0, double* %1) - %2 = load double*, double** %m.addr, align 8 - %arrayidx = getelementptr inbounds double, double* %2, i64 0 - store double 1.100000e+01, double* %arrayidx, align 8 - %3 = load double*, double** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds double, double* %3, i64 1 - store double 1.200000e+01, double* %arrayidx1, align 8 - %4 = load double*, double** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds double, double* %4, i64 2 - store double 1.300000e+01, double* %arrayidx2, align 8 - %5 = load double*, double** %n.addr, align 8 - %arrayidx3 = getelementptr inbounds double, double* %5, i64 0 - store double 2.100000e+01, double* %arrayidx3, align 8 - %6 = load double*, double** %n.addr, align 8 - %arrayidx4 = getelementptr inbounds double, double* %6, i64 1 - store double 2.200000e+01, double* %arrayidx4, align 8 - %7 = load double*, double** %n.addr, align 8 - %arrayidx5 = getelementptr inbounds double, double* %7, i64 2 - store double 2.300000e+01, double* %arrayidx5, align 8 - %8 = load double, double* %x, align 8 - %9 = load double, double* %x, align 8 - %mul = fmul double %8, %9 - store double %mul, double* %y, align 8 - %10 = load double, double* %y, align 8 - ret double %10 -} - -define dso_local i32 @main() { -entry: - %m = alloca [3 x double], align 16 - %m1 = alloca [3 x double], align 16 - %n = alloca [3 x double], align 16 - %n1 = alloca [3 x double], align 16 - %val = alloca double, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %2 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %2, i8* align 16 bitcast ([3 x double]* @__const.main.n to i8*), i64 24, i1 false) - %3 = bitcast [3 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x double], [3 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(i8* bitcast (double (double*, double*)* @g to i8*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3) - store double %call, double* %val, align 8 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(i8*, double*, double*, double*, double*) - -;CHECK:define internal void @diffeg(double* %m, double* %"m'", double* %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %"x'ipa" = alloca double, align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"x'ipa", align 8 -;CHECK-NEXT: %x = alloca double, align 8 -;CHECK-NEXT: %_augmented = call { double*, double* } @augmented_outer(double* %x, double* %"x'ipa", double* %m, double* %"m'", double* %n, double* %"n'") -;CHECK-NEXT: store double 1.100000e+01, double* %m, align 8 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds double, double* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds double, double* %m, i64 1 -;CHECK-NEXT: store double 1.200000e+01, double* %arrayidx1, align 8 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds double, double* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds double, double* %m, i64 2 -;CHECK-NEXT: store double 1.300000e+01, double* %arrayidx2, align 8 -;CHECK-NEXT: store double 2.100000e+01, double* %n, align 8 -;CHECK-NEXT: %"arrayidx4'ipg" = getelementptr inbounds double, double* %"n'", i64 1 -;CHECK-NEXT: %arrayidx4 = getelementptr inbounds double, double* %n, i64 1 -;CHECK-NEXT: store double 2.200000e+01, double* %arrayidx4, align 8 -;CHECK-NEXT: %"arrayidx5'ipg" = getelementptr inbounds double, double* %"n'", i64 2 -;CHECK-NEXT: %arrayidx5 = getelementptr inbounds double, double* %n, i64 2 -;CHECK-NEXT: store double 2.300000e+01, double* %arrayidx5, align 8 -;CHECK-NEXT: %0 = load double, double* %x, align 8 -;CHECK-NEXT: %1 = load double, double* %x, align 8 -;CHECK-NEXT: %m0diffe = fmul fast double %differeturn, %1 -;CHECK-NEXT: %m1diffe = fmul fast double %differeturn, %0 -;CHECK-NEXT: %2 = load double, double* %"x'ipa", align 8 -;CHECK-NEXT: %3 = fadd fast double %2, %m1diffe -;CHECK-NEXT: store double %3, double* %"x'ipa", align 8 -;CHECK-NEXT: %4 = load double, double* %"x'ipa", align 8 -;CHECK-NEXT: %5 = fadd fast double %4, %m0diffe -;CHECK-NEXT: store double %5, double* %"x'ipa", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx5'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx4'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"n'", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx2'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"arrayidx1'ipg", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"m'", align 8 -;CHECK-NEXT: call void @diffeouter(double* %x, double* %"x'ipa", double* %m, double* %"m'", double* %n, double* %"n'", { double*, double* } %_augmented) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:define internal void @diffeouter(double* %out, double* %"out'", double* %a, double* %"a'", double* %b, double* %"b'", { double*, double* } -;CHECK-NEXT:entry: -;CHECK-NEXT: %1 = extractvalue { double*, double* } %0, 0 -;CHECK-NEXT: %2 = extractvalue { double*, double* } %0, 1 -;CHECK-NEXT: %3 = load double, double* %"out'", align 8 -;CHECK-NEXT: store double 0.000000e+00, double* %"out'", align 8 -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %3, double* %1, i32 1, double* %"b'", i32 1) -;CHECK-NEXT: %4 = bitcast double* %1 to i8* -;CHECK-NEXT: tail call void @free(i8* %4) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %3, double* %2, i32 1, double* %"a'", i32 1) -;CHECK-NEXT: %5 = bitcast double* %2 to i8* -;CHECK-NEXT: tail call void @free(i8* %5) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_nomod.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_nomod.ll deleted file mode 100644 index 8cab8cfd0f9b5..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_ddot_nomod.ll +++ /dev/null @@ -1,90 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern double __enzyme_autodiff(double*, double*, double*, double*, double*); -; -;double g(double *restrict m, double *restrict n) { -; double x = cblas_ddot(3, m, 1, n, 1); -; double y = x*x; -; return y; -;} -; -;int main() { -; double m[3] = {1, 2, 3}; -; double n[3] = {4, 5, 6}; -; double m1[3] = {0.}; -; double n1[3] = {0.}; -; double z = __enzyme_autodiff((double*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x double] [double 1.000000e+00, double 2.000000e+00, double 3.000000e+00], align 16 -@__const.main.n = private unnamed_addr constant [3 x double] [double 4.000000e+00, double 5.000000e+00, double 6.000000e+00], align 16 - -define dso_local double @g(double* noalias %m, double* noalias %n) { -entry: - %m.addr = alloca double*, align 8 - %n.addr = alloca double*, align 8 - %x = alloca double, align 8 - %y = alloca double, align 8 - store double* %m, double** %m.addr, align 8 - store double* %n, double** %n.addr, align 8 - %0 = load double*, double** %m.addr, align 8 - %1 = load double*, double** %n.addr, align 8 - %call = call double @cblas_ddot(i32 3, double* %0, i32 1, double* %1, i32 1) - store double %call, double* %x, align 8 - %2 = load double, double* %x, align 8 - %3 = load double, double* %x, align 8 - %mul = fmul double %2, %3 - store double %mul, double* %y, align 8 - %4 = load double, double* %y, align 8 - ret double %4 -} - -declare dso_local double @cblas_ddot(i32, double*, i32, double*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x double], align 16 - %n = alloca [3 x double], align 16 - %m1 = alloca [3 x double], align 16 - %n1 = alloca [3 x double], align 16 - %z = alloca double, align 8 - %0 = bitcast [3 x double]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([3 x double]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [3 x double]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %1, i8* align 16 bitcast ([3 x double]* @__const.main.n to i8*), i64 24, i1 false) - %2 = bitcast [3 x double]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %2, i8 0, i64 24, i1 false) - %3 = bitcast [3 x double]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 24, i1 false) - %arraydecay = getelementptr inbounds [3 x double], [3 x double]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x double], [3 x double]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x double], [3 x double]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x double], [3 x double]* %n1, i32 0, i32 0 - %call = call double @__enzyme_autodiff(double* bitcast (double (double*, double*)* @g to double*), double* %arraydecay, double* %arraydecay1, double* %arraydecay2, double* %arraydecay3) - store double %call, double* %z, align 8 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local double @__enzyme_autodiff(double*, double*, double*, double*, double*) - -;CHECK:define internal void @diffeg(double* noalias %m, double* %"m'", double* noalias %n, double* %"n'", double %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %call = call double @cblas_ddot(i32 3, double* nocapture readonly %m, i32 1, double* nocapture readonly %n, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast double %differeturn, %call -;CHECK-NEXT: %0 = fadd fast double %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %0, double* %m, i32 1, double* %"n'", i32 1) -;CHECK-NEXT: call void @cblas_daxpy(i32 3, double %0, double* %n, i32 1, double* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_daxpy(i32, double, double*, i32, double*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot.ll new file mode 100644 index 0000000000000..3de2e52d87850 --- /dev/null +++ b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot.ll @@ -0,0 +1,198 @@ +;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare dso_local void @__enzyme_autodiff(...) + +declare float @cblas_sdot(i32, float*, i32, float*, i32) + +define void @active(i32 %len, float* noalias %m, float* %dm, i32 %incm, float* noalias %n, float* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @f, i32 %len, float* noalias %m, float* %dm, i32 %incm, float* noalias %n, float* %dn, i32 %incn) + ret void +} + +define void @inactiveFirst(i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @f, i32 %len, metadata !"enzyme_const", float* noalias %m, i32 %incm, float* noalias %n, float* %dn, i32 %incn) + ret void +} + +define void @inactiveSecond(i32 %len, float* noalias %m, float* noalias %dm, i32 %incm, float* noalias %n, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @f, i32 %len, float* noalias %m, float* noalias %dm, i32 %incm, metadata !"enzyme_const", float* noalias %n, i32 %incn) + ret void +} + +define void @activeMod(i32 %len, float* noalias %m, float* %dm, i32 %incm, float* noalias %n, float* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @modf, i32 %len, float* noalias %m, float* %dm, i32 %incm, float* noalias %n, float* %dn, i32 %incn) + ret void +} + +define void @inactiveModFirst(i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %dn, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @modf, i32 %len, metadata !"enzyme_const", float* noalias %m, i32 %incm, float* noalias %n, float* %dn, i32 %incn) + ret void +} + +define void @inactiveModSecond(i32 %len, float* noalias %m, float* noalias %dm, i32 %incm, float* noalias %n, i32 %incn) { +entry: + call void (...) @__enzyme_autodiff(float (i32, float*, i32, float*, i32)* @modf, i32 %len, float* noalias %m, float* noalias %dm, i32 %incm, metadata !"enzyme_const", float* noalias %n, i32 %incn) + ret void +} + +define float @f(i32 %len, float* noalias %m, i32 %incm, float* noalias %n, i32 %incn) { +entry: + %call = call float @cblas_sdot(i32 %len, float* %m, i32 %incm, float* %n, i32 %incn) + ret float %call +} + +define float @modf(i32 %len, float* noalias %m, i32 %incm, float* noalias %n, i32 %incn) { +entry: + %call = call float @f(i32 %len, float* %m, i32 %incm, float* %n, i32 %incn) + store float 0.000000e+00, float* %m + store float 0.000000e+00, float* %n + ret float %call +} + + +; CHECK: define void @active +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[active:.+]]( + +; CHECK: define void @inactiveFirst +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveFirst:.+]]( + +; CHECK: define void @inactiveSecond +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveSecond:.+]]( + + +; CHECK: define void @activeMod +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[activeMod:.+]]( + +; CHECK: define void @inactiveModFirst +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveModFirst:.+]]( + +; CHECK: define void @inactiveModSecond +; CHECK-NEXT: entry +; CHECK-NEXT: call void @[[inactiveModSecond:.+]]( + + +; CHECK: define internal void @[[active]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %m, i32 %incm, float* %"n'", i32 %incn) +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %n, i32 %incn, float* %"m'", i32 %incm) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %m, i32 %incm, float* %"n'", i32 %incn) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %n, i32 %incn, float* %"m'", i32 %incm) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[activeMod]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call { float*, float* } @[[augMod:.+]](i32 %len, float* %m, float* %"m'", i32 %incm, float* %n, float* %"n'", i32 %incn) +; CHECK: call void @[[revMod:.+]](i32 %len, float* %m, float* %"m'", i32 %incm, float* %n, float* %"n'", i32 %incn, float %differeturn, { float*, float* } %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal { float*, float* } @[[augMod]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (float* getelementptr (float, float* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to float* +; CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %1, float* %m, i32 %len, i32 %incm) +; CHECK-NEXT: %2 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize1 = mul i64 %2, ptrtoint (float* getelementptr (float, float* null, i32 1) to i64) +; CHECK-NEXT: %malloccall2 = tail call i8* @malloc(i64 %mallocsize1) +; CHECK-NEXT: %3 = bitcast i8* %malloccall2 to float* +; CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %3, float* %n, i32 %len, i32 %incn) +; CHECK-NEXT: %4 = insertvalue { float*, float* } undef, float* %1, 0 +; CHECK-NEXT: %5 = insertvalue { float*, float* } %4, float* %3, 1 +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret { float*, float* } %5 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revMod]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn, { float*, float* } +; CHECK-NEXT: entry: +; CHECK-NEXT: %1 = extractvalue { float*, float* } %0, 0 +; CHECK-NEXT: %2 = extractvalue { float*, float* } %0, 1 +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %1, i32 1, float* %"n'", i32 %incn) +; CHECK-NEXT: %3 = bitcast float* %1 to i8* +; CHECK-NEXT: tail call void @free(i8* %3) +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %2, i32 1, float* %"m'", i32 %incm) +; CHECK-NEXT: %4 = bitcast float* %2 to i8* +; CHECK-NEXT: tail call void @free(i8* %4) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveModFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call float* @[[augModFirst:.+]](i32 %len, float* %m, i32 %incm, float* %n, float* %"n'", i32 %incn) +; CHECK: call void @[[revModFirst:.+]](i32 %len, float* %m, i32 %incm, float* %n, float* %"n'", i32 %incn, float %differeturn, float* %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal float* @[[augModFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (float* getelementptr (float, float* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to float* +; CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %1, float* %m, i32 %len, i32 %incm) +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret float* %1 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revModFirst]](i32 %len, float* noalias %m, i32 %incm, float* noalias %n, float* %"n'", i32 %incn, float %differeturn, float* +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %0, i32 1, float* %"n'", i32 %incn) +; CHECK-NEXT: %1 = bitcast float* %0 to i8* +; CHECK-NEXT: tail call void @free(i8* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal void @[[inactiveModSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn, float %differeturn) +; CHECK-NEXT: entry: +; CHECK: %call_augmented = call float* @[[augModSecond:.+]](i32 %len, float* %m, float* %"m'", i32 %incm, float* %n, i32 %incn) +; CHECK: call void @[[revModSecond:.+]](i32 %len, float* %m, float* %"m'", i32 %incm, float* %n, i32 %incn, float %differeturn, float* %call_augmented) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define internal float* @[[augModSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn) +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = zext i32 %len to i64 +; CHECK-NEXT: %mallocsize = mul i64 %0, ptrtoint (float* getelementptr (float, float* null, i32 1) to i64) +; CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 %mallocsize) +; CHECK-NEXT: %1 = bitcast i8* %malloccall to float* +; CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %1, float* %n, i32 %len, i32 %incn) +; CHECK-NEXT: %call = call float @cblas_sdot(i32 %len, float* nocapture readonly %m, i32 %incm, float* nocapture readonly %n, i32 %incn) +; CHECK-NEXT: ret float* %1 +; CHECK-NEXT: } + +; CHECK: define internal void @[[revModSecond]](i32 %len, float* noalias %m, float* %"m'", i32 %incm, float* noalias %n, i32 %incn, float %differeturn, float* +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @cblas_saxpy(i32 %len, float %differeturn, float* %0, i32 1, float* %"m'", i32 %incm) +; CHECK-NEXT: %1 = bitcast float* %0 to i8* +; CHECK-NEXT: tail call void @free(i8* %1) +; CHECK-NEXT: ret void +; CHECK-NEXT: } + diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_first.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_first.ll deleted file mode 100644 index 7c145cd9618d0..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_first.ll +++ /dev/null @@ -1,85 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(float*, float*, float*); -; -;float g(float *restrict m) { -; float n[3] = {4, 5, 6}; -; float x = cblas_sdot(3, m, 1, n, 1); -; float y = x*x; -; return y; -;} -; -;int main() { -; float m[3] = {1, 2, 3}; -; float m1[3] = {0.}; -; float z = __enzyme_autodiff((float*)g, m, m1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 -@__const.main.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 - -define dso_local float @g(float* noalias %m) { -entry: - %m.addr = alloca float*, align 8 - %n = alloca [3 x float], align 4 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - %0 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.n to i8*), i64 12, i1 false) - %1 = load float*, float** %m.addr, align 8 - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %call = call float @cblas_sdot(i32 3, float* %1, i32 1, float* %arraydecay, i32 1) - store float %call, float* %x, align 4 - %2 = load float, float* %x, align 4 - %3 = load float, float* %x, align 4 - %mul = fmul float %2, %3 - store float %mul, float* %y, align 4 - %4 = load float, float* %y, align 4 - ret float %4 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x float], align 4 - %m1 = alloca [3 x float], align 4 - %z = alloca float, align 4 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.m to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %m1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(float* bitcast (float (float*)* @g to float*), float* %arraydecay, float* %arraydecay1) - store float %call, float* %z, align 4 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %n = alloca [3 x float], align 4 -;CHECK-NEXT: %0 = bitcast [3 x float]* %n to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.n to i8*), i64 12, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 1, float* nocapture readonly %arraydecay, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %1 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %arraydecay, i32 1, float* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_mod1.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_mod1.ll deleted file mode 100644 index c23efebbc4900..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_mod1.ll +++ /dev/null @@ -1,91 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(float*, float*, float*); -; -;float g(float *restrict m) { -; float n[3] = {4, 5, 6}; -; float x = cblas_sdot(3, m, 1, n, 1); -; m[0] = 10; -; float y = x*x; -; return y; -;} -; -;int main() { -; float m[3] = {1, 2, 3}; -; float m1[3] = {0.}; -; float z = __enzyme_autodiff((float*)g, m, m1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 -@__const.main.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 - -define dso_local float @g(float* noalias %m) { -entry: - %m.addr = alloca float*, align 8 - %n = alloca [3 x float], align 4 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - %0 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.n to i8*), i64 12, i1 false) - %1 = load float*, float** %m.addr, align 8 - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %call = call float @cblas_sdot(i32 3, float* %1, i32 1, float* %arraydecay, i32 1) - store float %call, float* %x, align 4 - %2 = load float*, float** %m.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %2, i64 0 - store float 1.000000e+01, float* %arrayidx, align 4 - %3 = load float, float* %x, align 4 - %4 = load float, float* %x, align 4 - %mul = fmul float %3, %4 - store float %mul, float* %y, align 4 - %5 = load float, float* %y, align 4 - ret float %5 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x float], align 4 - %m1 = alloca [3 x float], align 4 - %z = alloca float, align 4 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.m to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %m1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(float* bitcast (float (float*)* @g to float*), float* %arraydecay, float* %arraydecay1) - store float %call, float* %z, align 4 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %n = alloca [3 x float], align 4 -;CHECK-NEXT: %0 = bitcast [3 x float]* %n to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.n to i8*), i64 12, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 1, float* nocapture readonly %arraydecay, i32 1) -;CHECK-NEXT: store float 1.000000e+01, float* %m, align 4 -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %1 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: store float 0.000000e+00, float* %"m'", align 4 -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %arraydecay, i32 1, float* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_second.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_second.ll deleted file mode 100644 index d90cf07bd2db3..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_second.ll +++ /dev/null @@ -1,85 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(float*, float*, float*); -; -;float g(float *restrict n) { -; float m[3] = {1, 2, 3}; -; float x = cblas_sdot(3, m, 1, n, 1); -; float y = x*x; -; return y; -;} -; -;int main() { -; float n[3] = {4, 5, 6}; -; float n1[3] = {0.}; -; float z = __enzyme_autodiff((float*)g, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.g.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 -@__const.main.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 - -define dso_local float @g(float* noalias %n) { -entry: - %n.addr = alloca float*, align 8 - %m = alloca [3 x float], align 4 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %n, float** %n.addr, align 8 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.m to i8*), i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %1 = load float*, float** %n.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %arraydecay, i32 1, float* %1, i32 1) - store float %call, float* %x, align 4 - %2 = load float, float* %x, align 4 - %3 = load float, float* %x, align 4 - %mul = fmul float %2, %3 - store float %mul, float* %y, align 4 - %4 = load float, float* %y, align 4 - ret float %4 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %n = alloca [3 x float], align 4 - %n1 = alloca [3 x float], align 4 - %z = alloca float, align 4 - %0 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.n to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(float* bitcast (float (float*)* @g to float*), float* %arraydecay, float* %arraydecay1) - store float %call, float* %z, align 4 - ret i32 0 -} - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %n, float* %"n'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %m = alloca [3 x float], align 4 -;CHECK-NEXT: %0 = bitcast [3 x float]* %m to i8* -;CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.g.m to i8*), i64 12, i1 false) -;CHECK-NEXT: %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %arraydecay, i32 1, float* nocapture readonly %n, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %1 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %arraydecay, i32 1, float* %"n'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_stride.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_stride.ll deleted file mode 100644 index ea16c53a7dc89..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_inactive_stride.ll +++ /dev/null @@ -1,92 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(void *, float *, float *, float *, -; float *, int); -; -;float g(float *restrict m, float *restrict n, int stride) { -; float x = cblas_sdot(3, m, 2, n, stride); -; float y = x * x; -; return y; -;} -; -;int main() { -; float m[6] = {1, 2, 3, 101, 102, 103}; -; float m1[6] = {0, 0, 0, 0, 0, 0}; -; float n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; float n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1, 3); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [6 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 1.010000e+02, float 1.020000e+02, float 1.030000e+02], align 16 -@__const.main.n = private unnamed_addr constant [9 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 1.040000e+02, float 1.050000e+02, float 1.060000e+02, float 7.000000e+00, float 8.000000e+00, float 9.000000e+00], align 16 - -define dso_local float @g(float* noalias %m, float* noalias %n, i32 %stride) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %stride.addr = alloca i32, align 4 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - store i32 %stride, i32* %stride.addr, align 4 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - %2 = load i32, i32* %stride.addr, align 4 - %call = call float @cblas_sdot(i32 3, float* %0, i32 2, float* %1, i32 %2) - store float %call, float* %x, align 4 - %3 = load float, float* %x, align 4 - %4 = load float, float* %x, align 4 - %mul = fmul float %3, %4 - store float %mul, float* %y, align 4 - %5 = load float, float* %y, align 4 - ret float %5 -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [6 x float], align 16 - %m1 = alloca [6 x float], align 16 - %n = alloca [9 x float], align 16 - %n1 = alloca [9 x float], align 16 - %0 = bitcast [6 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([6 x float]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [6 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %2 = bitcast [9 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %2, i8* align 16 bitcast ([9 x float]* @__const.main.n to i8*), i64 36, i1 false) - %3 = bitcast [9 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 36, i1 false) - %arraydecay = getelementptr inbounds [6 x float], [6 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x float], [6 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x float], [9 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x float], [9 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(i8* bitcast (float (float*, float*, i32)* @g to i8*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3, i32 3) - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(i8*, float*, float*, float*, float*, i32) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float* noalias %n, float* %"n'", i32 %stride, float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 2, float* nocapture readonly %n, i32 %stride) -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %0 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %0, float* %m, i32 2, float* %"n'", i32 %stride) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %0, float* %n, i32 %stride, float* %"m'", i32 2) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1.ll deleted file mode 100644 index e9c0a4cfa5aab..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1.ll +++ /dev/null @@ -1,117 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(void *, float *, float *, float *, -; float *); -; -;float g(float *restrict m, float *restrict n) { -; float x = cblas_sdot(3, m, 1, n, 1); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; float y = x * x; -; return y; -;} -; -;int main() { -; float m[3] = {1, 2, 3}; -; float m1[3] = {0, 0, 0}; -; float n[3] = {4, 5, 6}; -; float n1[3] = {0, 0, 0}; -; float val = __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 -@__const.main.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 - -define dso_local float @g(float* noalias %m, float* noalias %n) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %0, i32 1, float* %1, i32 1) - store float %call, float* %x, align 4 - %2 = load float*, float** %m.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %2, i64 0 - store float 1.100000e+01, float* %arrayidx, align 4 - %3 = load float*, float** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds float, float* %3, i64 1 - store float 1.200000e+01, float* %arrayidx1, align 4 - %4 = load float*, float** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds float, float* %4, i64 2 - store float 1.300000e+01, float* %arrayidx2, align 4 - %5 = load float, float* %x, align 4 - %6 = load float, float* %x, align 4 - %mul = fmul float %5, %6 - store float %mul, float* %y, align 4 - %7 = load float, float* %y, align 4 - ret float %7 -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) #1 - -define dso_local i32 @main() { -entry: - %m = alloca [3 x float], align 4 - %m1 = alloca [3 x float], align 4 - %n = alloca [3 x float], align 4 - %n1 = alloca [3 x float], align 4 - %val = alloca float, align 4 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.m to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 12, i1 false) - %2 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %2, i8* align 4 bitcast ([3 x float]* @__const.main.n to i8*), i64 12, i1 false) - %3 = bitcast [3 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x float], [3 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(i8* bitcast (float (float*, float*)* @g to i8*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3) - store float %call, float* %val, align 4 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(i8*, float*, float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float* noalias %n, float* %"n'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (float* getelementptr (float, float* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to float* -;CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %0, float* %m, i32 3, i32 1) -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 1, float* nocapture readonly %n, i32 1) -;CHECK-NEXT: store float 1.100000e+01, float* %m, align 4 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds float, float* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %m, i64 1 -;CHECK-NEXT: store float 1.200000e+01, float* %arrayidx1, align 4 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds float, float* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds float, float* %m, i64 2 -;CHECK-NEXT: store float 1.300000e+01, float* %arrayidx2, align 4 -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %1 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx2'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx1'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"m'", align 4 -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %0, i32 1, float* %"n'", i32 1) -;CHECK-NEXT: tail call void @free(i8* %malloccall) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %n, i32 1, float* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride.ll deleted file mode 100644 index 12d923b2f9a75..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride.ll +++ /dev/null @@ -1,115 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(void *, float *, float *, float *, -; float *); -; -;float g(float *restrict m, float *restrict n) { -; float x = cblas_sdot(3, m, 2, n, 3); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; float y = x * x; -; return y; -;} -; -;int main() { -; float m[6] = {1, 2, 3, 101, 102, 103}; -; float m1[6] = {0, 0, 0, 0, 0, 0}; -; float n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; float n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [6 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 1.010000e+02, float 1.020000e+02, float 1.030000e+02], align 16 -@__const.main.n = private unnamed_addr constant [9 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 1.040000e+02, float 1.050000e+02, float 1.060000e+02, float 7.000000e+00, float 8.000000e+00, float 9.000000e+00], align 16 - -define dso_local float @g(float* noalias %m, float* noalias %n) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %0, i32 2, float* %1, i32 3) - store float %call, float* %x, align 4 - %2 = load float*, float** %m.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %2, i64 0 - store float 1.100000e+01, float* %arrayidx, align 4 - %3 = load float*, float** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds float, float* %3, i64 1 - store float 1.200000e+01, float* %arrayidx1, align 4 - %4 = load float*, float** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds float, float* %4, i64 2 - store float 1.300000e+01, float* %arrayidx2, align 4 - %5 = load float, float* %x, align 4 - %6 = load float, float* %x, align 4 - %mul = fmul float %5, %6 - store float %mul, float* %y, align 4 - %7 = load float, float* %y, align 4 - ret float %7 -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [6 x float], align 16 - %m1 = alloca [6 x float], align 16 - %n = alloca [9 x float], align 16 - %n1 = alloca [9 x float], align 16 - %0 = bitcast [6 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([6 x float]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [6 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %2 = bitcast [9 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %2, i8* align 16 bitcast ([9 x float]* @__const.main.n to i8*), i64 36, i1 false) - %3 = bitcast [9 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 36, i1 false) - %arraydecay = getelementptr inbounds [6 x float], [6 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x float], [6 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x float], [9 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x float], [9 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(i8* bitcast (float (float*, float*)* @g to i8*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3) - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(i8*, float*, float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float* noalias %n, float* %"n'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (float* getelementptr (float, float* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to float* -;CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %0, float* %m, i32 3, i32 2) -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 2, float* nocapture readonly %n, i32 3) -;CHECK-NEXT: store float 1.100000e+01, float* %m, align 4 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds float, float* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %m, i64 1 -;CHECK-NEXT: store float 1.200000e+01, float* %arrayidx1, align 4 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds float, float* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds float, float* %m, i64 2 -;CHECK-NEXT: store float 1.300000e+01, float* %arrayidx2, align 4 -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %1 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx2'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx1'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"m'", align 4 -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %0, i32 1, float* %"n'", i32 3) -;CHECK-NEXT: tail call void @free(i8* %malloccall) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %1, float* %n, i32 3, float* %"m'", i32 2) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride_split.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride_split.ll deleted file mode 100644 index 5bd74bfb68046..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod1_stride_split.ll +++ /dev/null @@ -1,170 +0,0 @@ -;RUN: if [ %llvmver -ge 8 ]; then %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s; fi - -;#include -; -;extern float __enzyme_autodiff(void *, float *, float *, float *, -; float *); -; -;void outer(float *out, float *a, float *b) { -; *out = cblas_sdot(3, a, 2, b, 3); -;} -; -;float g(float *m, float *n) { -; float x; -; outer(&x, m, n); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; float y = x * x; -; return y; -;} -; -;int main() { -; float m[6] = {1, 2, 3, 101, 102, 103}; -; float m1[6] = {0, 0, 0, 0, 0, 0}; -; float n[9] = {4, 5, 6, 104, 105, 106, 7, 8, 9}; -; float n1[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -; __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [6 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 1.010000e+02, float 1.020000e+02, float 1.030000e+02], align 16 -@__const.main.n = private unnamed_addr constant [9 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 1.040000e+02, float 1.050000e+02, float 1.060000e+02, float 7.000000e+00, float 8.000000e+00, float 9.000000e+00], align 16 - -define dso_local void @outer(float* %out, float* %a, float* %b) { -entry: - %out.addr = alloca float*, align 8 - %a.addr = alloca float*, align 8 - %b.addr = alloca float*, align 8 - store float* %out, float** %out.addr, align 8 - store float* %a, float** %a.addr, align 8 - store float* %b, float** %b.addr, align 8 - %0 = load float*, float** %a.addr, align 8 - %1 = load float*, float** %b.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %0, i32 2, float* %1, i32 3) - %2 = load float*, float** %out.addr, align 8 - store float %call, float* %2, align 4 - ret void -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local float @g(float* %m, float* %n) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - call void @outer(float* %x, float* %0, float* %1) - %2 = load float*, float** %m.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %2, i64 0 - store float 1.100000e+01, float* %arrayidx, align 4 - %3 = load float*, float** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds float, float* %3, i64 1 - store float 1.200000e+01, float* %arrayidx1, align 4 - %4 = load float*, float** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds float, float* %4, i64 2 - store float 1.300000e+01, float* %arrayidx2, align 4 - %5 = load float, float* %x, align 4 - %6 = load float, float* %x, align 4 - %mul = fmul float %5, %6 - store float %mul, float* %y, align 4 - %7 = load float, float* %y, align 4 - ret float %7 -} - -define dso_local i32 @main() { -entry: - %m = alloca [6 x float], align 16 - %m1 = alloca [6 x float], align 16 - %n = alloca [9 x float], align 16 - %n1 = alloca [9 x float], align 16 - %0 = bitcast [6 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %0, i8* align 16 bitcast ([6 x float]* @__const.main.m to i8*), i64 24, i1 false) - %1 = bitcast [6 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %1, i8 0, i64 24, i1 false) - %2 = bitcast [9 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %2, i8* align 16 bitcast ([9 x float]* @__const.main.n to i8*), i64 36, i1 false) - %3 = bitcast [9 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 16 %3, i8 0, i64 36, i1 false) - %arraydecay = getelementptr inbounds [6 x float], [6 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [6 x float], [6 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [9 x float], [9 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [9 x float], [9 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(i8* bitcast (float (float*, float*)* @g to i8*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3) - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(i8*, float*, float*, float*, float*) - -;CHECK:define internal void @diffeg(float* %m, float* %"m'", float* %n, float* %"n'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %"x'ipa" = alloca float, align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"x'ipa", align 4 -;CHECK-NEXT: %x = alloca float, align 4 -;CHECK-NEXT: %_augmented = call { float*, float* } @augmented_outer(float* %x, float* %"x'ipa", float* %m, float* %"m'", float* %n, float* %"n'") -;CHECK-NEXT: store float 1.100000e+01, float* %m, align 4 -;CHECK-NEXT: %"arrayidx1'ipg" = getelementptr inbounds float, float* %"m'", i64 1 -;CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %m, i64 1 -;CHECK-NEXT: store float 1.200000e+01, float* %arrayidx1, align 4 -;CHECK-NEXT: %"arrayidx2'ipg" = getelementptr inbounds float, float* %"m'", i64 2 -;CHECK-NEXT: %arrayidx2 = getelementptr inbounds float, float* %m, i64 2 -;CHECK-NEXT: store float 1.300000e+01, float* %arrayidx2, align 4 -;CHECK-NEXT: %0 = load float, float* %x, align 4 -;CHECK-NEXT: %1 = load float, float* %x, align 4 -;CHECK-NEXT: %m0diffe = fmul fast float %differeturn, %1 -;CHECK-NEXT: %m1diffe = fmul fast float %differeturn, %0 -;CHECK-NEXT: %2 = load float, float* %"x'ipa", align 4 -;CHECK-NEXT: %3 = fadd fast float %2, %m1diffe -;CHECK-NEXT: store float %3, float* %"x'ipa", align 4 -;CHECK-NEXT: %4 = load float, float* %"x'ipa", align 4 -;CHECK-NEXT: %5 = fadd fast float %4, %m0diffe -;CHECK-NEXT: store float %5, float* %"x'ipa", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx2'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"arrayidx1'ipg", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"m'", align 4 -;CHECK-NEXT: call void @diffeouter(float* %x, float* %"x'ipa", float* %m, float* %"m'", float* %n, float* %"n'", { float*, float* } %_augmented) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:define internal { float*, float* } @augmented_outer(float* %out, float* %"out'", float* %a, float* %"a'", float* %b, float* %"b'") -;CHECK-NEXT:entry: -;CHECK-NEXT: %malloccall = tail call i8* @malloc(i64 mul (i64 ptrtoint (float* getelementptr (float, float* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %0 = bitcast i8* %malloccall to float* -;CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %0, float* %a, i32 3, i32 2) -;CHECK-NEXT: %malloccall2 = tail call i8* @malloc(i64 mul (i64 ptrtoint (float* getelementptr (float, float* null, i32 1) to i64), i64 3)) -;CHECK-NEXT: %1 = bitcast i8* %malloccall2 to float* -;CHECK-NEXT: call void @__enzyme_memcpy_floatda0sa0stride(float* %1, float* %b, i32 3, i32 3) -;CHECK-NEXT: %2 = insertvalue { float*, float* } undef, float* %0, 0 -;CHECK-NEXT: %3 = insertvalue { float*, float* } %2, float* %1, 1 -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %a, i32 2, float* nocapture readonly %b, i32 3) -;CHECK-NEXT: store float %call, float* %out, align 4 -;CHECK-NEXT: ret { float*, float* } %3 -;CHECK-NEXT:} - -;CHECK:define internal void @diffeouter(float* %out, float* %"out'", float* %a, float* %"a'", float* %b, float* %"b'", { float*, float* } -;CHECK-NEXT:entry: -;CHECK-NEXT: %1 = extractvalue { float*, float* } %0, 0 -;CHECK-NEXT: %2 = extractvalue { float*, float* } %0, 1 -;CHECK-NEXT: %3 = load float, float* %"out'", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"out'", align 4 -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %3, float* %1, i32 1, float* %"b'", i32 3) -;CHECK-NEXT: %4 = bitcast float* %1 to i8* -;CHECK-NEXT: tail call void @free(i8* %4) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %3, float* %2, i32 1, float* %"a'", i32 2) -;CHECK-NEXT: %5 = bitcast float* %2 to i8* -;CHECK-NEXT: tail call void @free(i8* %5) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod2_split.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod2_split.ll deleted file mode 100644 index 0b531b9c8c710..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_mod2_split.ll +++ /dev/null @@ -1,139 +0,0 @@ -;RUN: if [ %llvmver -ge 8 ]; then %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s; fi - -;#include -; -;extern float __enzyme_autodiff(void *, float *, float *, float *, -; float *); -; -;void outer(float* out, float *a, float *b) { -; *out = cblas_sdot(3, a, 1, b, 1); -;} -; -;float g(float *m, float *n) { -; float x; -; outer(&x, m, n); -; m[0] = 11.0; -; m[1] = 12.0; -; m[2] = 13.0; -; n[0] = 21.0; -; n[1] = 22.0; -; n[2] = 23.0; -; float y = x * x; -; return y; -;} -; -;int main() { -; float m[3] = {1, 2, 3}; -; float m1[3] = {0, 0, 0}; -; float n[3] = {4, 5, 6}; -; float n1[3] = {0, 0, 0}; -; float val = __enzyme_autodiff((void*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 -@__const.main.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 - -define dso_local void @outer(float* %out, float* %a, float* %b) { -entry: - %out.addr = alloca float*, align 8 - %a.addr = alloca float*, align 8 - %b.addr = alloca float*, align 8 - store float* %out, float** %out.addr, align 8 - store float* %a, float** %a.addr, align 8 - store float* %b, float** %b.addr, align 8 - %0 = load float*, float** %a.addr, align 8 - %1 = load float*, float** %b.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %0, i32 1, float* %1, i32 1) - %2 = load float*, float** %out.addr, align 8 - store float %call, float* %2, align 4 - ret void -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local float @g(float* %m, float* %n) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - call void @outer(float* %x, float* %0, float* %1) - %2 = load float*, float** %m.addr, align 8 - %arrayidx = getelementptr inbounds float, float* %2, i64 0 - store float 1.100000e+01, float* %arrayidx, align 4 - %3 = load float*, float** %m.addr, align 8 - %arrayidx1 = getelementptr inbounds float, float* %3, i64 1 - store float 1.200000e+01, float* %arrayidx1, align 4 - %4 = load float*, float** %m.addr, align 8 - %arrayidx2 = getelementptr inbounds float, float* %4, i64 2 - store float 1.300000e+01, float* %arrayidx2, align 4 - %5 = load float*, float** %n.addr, align 8 - %arrayidx3 = getelementptr inbounds float, float* %5, i64 0 - store float 2.100000e+01, float* %arrayidx3, align 4 - %6 = load float*, float** %n.addr, align 8 - %arrayidx4 = getelementptr inbounds float, float* %6, i64 1 - store float 2.200000e+01, float* %arrayidx4, align 4 - %7 = load float*, float** %n.addr, align 8 - %arrayidx5 = getelementptr inbounds float, float* %7, i64 2 - store float 2.300000e+01, float* %arrayidx5, align 4 - %8 = load float, float* %x, align 4 - %9 = load float, float* %x, align 4 - %mul = fmul float %8, %9 - store float %mul, float* %y, align 4 - %10 = load float, float* %y, align 4 - ret float %10 -} - -define dso_local i32 @main() { -entry: - %m = alloca [3 x float], align 4 - %m1 = alloca [3 x float], align 4 - %n = alloca [3 x float], align 4 - %n1 = alloca [3 x float], align 4 - %val = alloca float, align 4 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.m to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %1, i8 0, i64 12, i1 false) - %2 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %2, i8* align 4 bitcast ([3 x float]* @__const.main.n to i8*), i64 12, i1 false) - %3 = bitcast [3 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x float], [3 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(i8* bitcast (float (float*, float*)* @g to i8*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3) - store float %call, float* %val, align 4 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(i8*, float*, float*, float*, float*) - -;CHECK:define internal void @diffeouter(float* %out, float* %"out'", float* %a, float* %"a'", float* %b, float* %"b'", { float*, float* } -;CHECK-NEXT:entry: -;CHECK-NEXT: %1 = extractvalue { float*, float* } %0, 0 -;CHECK-NEXT: %2 = extractvalue { float*, float* } %0, 1 -;CHECK-NEXT: %3 = load float, float* %"out'", align 4 -;CHECK-NEXT: store float 0.000000e+00, float* %"out'", align 4 -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %3, float* %1, i32 1, float* %"b'", i32 1) -;CHECK-NEXT: %4 = bitcast float* %1 to i8* -;CHECK-NEXT: tail call void @free(i8* %4) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %3, float* %2, i32 1, float* %"a'", i32 1) -;CHECK-NEXT: %5 = bitcast float* %2 to i8* -;CHECK-NEXT: tail call void @free(i8* %5) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32) diff --git a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_nomod.ll b/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_nomod.ll deleted file mode 100644 index 296e56ec8409f..0000000000000 --- a/enzyme/test/Enzyme/ReverseMode/blas/cblas_sdot_nomod.ll +++ /dev/null @@ -1,90 +0,0 @@ -;RUN: %opt < %s %loadEnzyme -enzyme -mem2reg -instsimplify -simplifycfg -S | FileCheck %s - -;#include -; -;extern float __enzyme_autodiff(float*, float*, float*, float*, float*); -; -;float g(float *restrict m, float *restrict n) { -; float x = cblas_sdot(3, m, 1, n, 1); -; float y = x*x; -; return y; -;} -; -;int main() { -; float m[3] = {1, 2, 3}; -; float n[3] = {4, 5, 6}; -; float m1[3] = {0.}; -; float n1[3] = {0.}; -; float z = __enzyme_autodiff((float*)g, m, m1, n, n1); -;} - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -@__const.main.m = private unnamed_addr constant [3 x float] [float 1.000000e+00, float 2.000000e+00, float 3.000000e+00], align 4 -@__const.main.n = private unnamed_addr constant [3 x float] [float 4.000000e+00, float 5.000000e+00, float 6.000000e+00], align 4 - -define dso_local float @g(float* noalias %m, float* noalias %n) { -entry: - %m.addr = alloca float*, align 8 - %n.addr = alloca float*, align 8 - %x = alloca float, align 4 - %y = alloca float, align 4 - store float* %m, float** %m.addr, align 8 - store float* %n, float** %n.addr, align 8 - %0 = load float*, float** %m.addr, align 8 - %1 = load float*, float** %n.addr, align 8 - %call = call float @cblas_sdot(i32 3, float* %0, i32 1, float* %1, i32 1) - store float %call, float* %x, align 4 - %2 = load float, float* %x, align 4 - %3 = load float, float* %x, align 4 - %mul = fmul float %2, %3 - store float %mul, float* %y, align 4 - %4 = load float, float* %y, align 4 - ret float %4 -} - -declare dso_local float @cblas_sdot(i32, float*, i32, float*, i32) - -define dso_local i32 @main() { -entry: - %m = alloca [3 x float], align 4 - %n = alloca [3 x float], align 4 - %m1 = alloca [3 x float], align 4 - %n1 = alloca [3 x float], align 4 - %z = alloca float, align 4 - %0 = bitcast [3 x float]* %m to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast ([3 x float]* @__const.main.m to i8*), i64 12, i1 false) - %1 = bitcast [3 x float]* %n to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %1, i8* align 4 bitcast ([3 x float]* @__const.main.n to i8*), i64 12, i1 false) - %2 = bitcast [3 x float]* %m1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %2, i8 0, i64 12, i1 false) - %3 = bitcast [3 x float]* %n1 to i8* - call void @llvm.memset.p0i8.i64(i8* align 4 %3, i8 0, i64 12, i1 false) - %arraydecay = getelementptr inbounds [3 x float], [3 x float]* %m, i32 0, i32 0 - %arraydecay1 = getelementptr inbounds [3 x float], [3 x float]* %m1, i32 0, i32 0 - %arraydecay2 = getelementptr inbounds [3 x float], [3 x float]* %n, i32 0, i32 0 - %arraydecay3 = getelementptr inbounds [3 x float], [3 x float]* %n1, i32 0, i32 0 - %call = call float @__enzyme_autodiff(float* bitcast (float (float*, float*)* @g to float*), float* %arraydecay, float* %arraydecay1, float* %arraydecay2, float* %arraydecay3) - store float %call, float* %z, align 4 - ret i32 0 -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) - -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) - -declare dso_local float @__enzyme_autodiff(float*, float*, float*, float*, float*) - -;CHECK:define internal void @diffeg(float* noalias %m, float* %"m'", float* noalias %n, float* %"n'", float %differeturn) -;CHECK-NEXT:entry: -;CHECK-NEXT: %call = call float @cblas_sdot(i32 3, float* nocapture readonly %m, i32 1, float* nocapture readonly %n, i32 1) -;CHECK-NEXT: %m0diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %m1diffecall = fmul fast float %differeturn, %call -;CHECK-NEXT: %0 = fadd fast float %m0diffecall, %m1diffecall -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %0, float* %m, i32 1, float* %"n'", i32 1) -;CHECK-NEXT: call void @cblas_saxpy(i32 3, float %0, float* %n, i32 1, float* %"m'", i32 1) -;CHECK-NEXT: ret void -;CHECK-NEXT:} - -;CHECK:declare void @cblas_saxpy(i32, float, float*, i32, float*, i32)