From f46f3da0dbf8a027e1100edbab3b4542985d60a0 Mon Sep 17 00:00:00 2001
From: gbaraldi <baraldigabriel@gmail.com>
Date: Wed, 18 Sep 2024 12:27:30 -0300
Subject: [PATCH 1/2] Remove llvm-muladd pass and move its functionality to
 llvm-simdloop

---
 doc/src/devdocs/llvm-passes.md |  12 ----
 doc/src/devdocs/llvm.md        |   1 -
 src/jl_exported_funcs.inc      |   1 -
 src/llvm-julia-passes.inc      |   1 -
 src/llvm-muladd.cpp            | 117 ---------------------------------
 src/llvm-simdloop.cpp          |  60 +++++++++++++++++
 src/passes.h                   |   4 --
 src/pipeline.cpp               |   1 -
 test/llvmpasses/parsing.ll     |   2 +-
 9 files changed, 61 insertions(+), 138 deletions(-)
 delete mode 100644 src/llvm-muladd.cpp

diff --git a/doc/src/devdocs/llvm-passes.md b/doc/src/devdocs/llvm-passes.md
index 36383acaef512..736faf54c219b 100644
--- a/doc/src/devdocs/llvm-passes.md
+++ b/doc/src/devdocs/llvm-passes.md
@@ -114,18 +114,6 @@ This pass is used to verify Julia's invariants about LLVM IR. This includes thin
 
 These passes are used to perform transformations on LLVM IR that LLVM will not perform itself, e.g. fast math flag propagation, escape analysis, and optimizations on Julia-specific internal functions. They use knowledge about Julia's semantics to perform these optimizations.
 
-### CombineMulAdd
-
-* Filename: `llvm-muladd.cpp`
-* Class Name: `CombineMulAddPass`
-* Opt Name: `function(CombineMulAdd)`
-
-This pass serves to optimize the particular combination of a regular `fmul` with a fast `fadd` into a contract `fmul` with a fast `fadd`. This is later optimized by the backend to a [fused multiply-add](https://en.wikipedia.org/wiki/Multiply%E2%80%93accumulate_operation#Fused_multiply%E2%80%93add) instruction, which can provide significantly faster operations at the cost of more [unpredictable semantics](https://simonbyrne.github.io/notes/fastmath/).
-
-!!! note
-
-    This optimization only occurs when the `fmul` has a single use, which is the fast `fadd`.
-
 ### AllocOpt
 
 * Filename: `llvm-alloc-opt.cpp`
diff --git a/doc/src/devdocs/llvm.md b/doc/src/devdocs/llvm.md
index ab8f7dde50022..f69a2287d0e10 100644
--- a/doc/src/devdocs/llvm.md
+++ b/doc/src/devdocs/llvm.md
@@ -30,7 +30,6 @@ The code for lowering Julia AST to LLVM IR or interpreting it directly is in dir
 | `llvm-julia-licm.cpp`            | Custom LLVM pass to hoist/sink Julia-specific intrinsics           |
 | `llvm-late-gc-lowering.cpp`      | Custom LLVM pass to root GC-tracked values                         |
 | `llvm-lower-handlers.cpp`        | Custom LLVM pass to lower try-catch blocks                         |
-| `llvm-muladd.cpp`                | Custom LLVM pass for fast-match FMA                                |
 | `llvm-multiversioning.cpp`       | Custom LLVM pass to generate sysimg code on multiple architectures |
 | `llvm-propagate-addrspaces.cpp`  | Custom LLVM pass to canonicalize addrspaces                        |
 | `llvm-ptls.cpp`                  | Custom LLVM pass to lower TLS operations                           |
diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
index 7abf2b055bb8c..d4446b58a51c2 100644
--- a/src/jl_exported_funcs.inc
+++ b/src/jl_exported_funcs.inc
@@ -550,7 +550,6 @@
     YY(LLVMExtraMPMAddRemoveAddrspacesPass) \
     YY(LLVMExtraMPMAddLowerPTLSPass) \
     YY(LLVMExtraFPMAddDemoteFloat16Pass) \
-    YY(LLVMExtraFPMAddCombineMulAddPass) \
     YY(LLVMExtraFPMAddLateLowerGCPass) \
     YY(LLVMExtraFPMAddAllocOptPass) \
     YY(LLVMExtraFPMAddPropagateJuliaAddrspacesPass) \
diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc
index bd89c01c6fdfe..c41ecbba87b6a 100644
--- a/src/llvm-julia-passes.inc
+++ b/src/llvm-julia-passes.inc
@@ -11,7 +11,6 @@ MODULE_PASS("LowerPTLSPass", LowerPTLSPass, LowerPTLSPass())
 //Function passes
 #ifdef FUNCTION_PASS
 FUNCTION_PASS("DemoteFloat16", DemoteFloat16Pass, DemoteFloat16Pass())
-FUNCTION_PASS("CombineMulAdd", CombineMulAddPass, CombineMulAddPass())
 FUNCTION_PASS("LateLowerGCFrame", LateLowerGCPass, LateLowerGCPass())
 FUNCTION_PASS("AllocOpt", AllocOptPass, AllocOptPass())
 FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass, PropagateJuliaAddrspacesPass())
diff --git a/src/llvm-muladd.cpp b/src/llvm-muladd.cpp
deleted file mode 100644
index 12f1c8ad765d9..0000000000000
--- a/src/llvm-muladd.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-// This file is a part of Julia. License is MIT: https://julialang.org/license
-
-#include "llvm-version.h"
-#include "passes.h"
-
-#include <llvm-c/Core.h>
-#include <llvm-c/Types.h>
-
-#include <llvm/ADT/Statistic.h>
-#include <llvm/Analysis/OptimizationRemarkEmitter.h>
-#include <llvm/IR/Value.h>
-#include <llvm/IR/PassManager.h>
-#include <llvm/IR/Function.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/IntrinsicInst.h>
-#include <llvm/IR/Module.h>
-#include <llvm/IR/Operator.h>
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/Verifier.h>
-#include <llvm/Pass.h>
-#include <llvm/Support/Debug.h>
-
-#include "julia.h"
-#include "julia_assert.h"
-
-#define DEBUG_TYPE "combine-muladd"
-#undef DEBUG
-
-using namespace llvm;
-STATISTIC(TotalContracted, "Total number of multiplies marked for FMA");
-
-#ifndef __clang_gcanalyzer__
-#define REMARK(remark) ORE.emit(remark)
-#else
-#define REMARK(remark) (void) 0;
-#endif
-
-/**
- * Combine
- * ```
- * %v0 = fmul ... %a, %b
- * %v = fadd contract ... %v0, %c
- * ```
- * to
- * `%v = call contract @llvm.fmuladd.<...>(... %a, ... %b, ... %c)`
- * when `%v0` has no other use
- */
-
-// Return true if we changed the mulOp
-static bool checkCombine(Value *maybeMul, OptimizationRemarkEmitter &ORE) JL_NOTSAFEPOINT
-{
-    auto mulOp = dyn_cast<Instruction>(maybeMul);
-    if (!mulOp || mulOp->getOpcode() != Instruction::FMul)
-        return false;
-    if (!mulOp->hasOneUse()) {
-        LLVM_DEBUG(dbgs() << "mulOp has multiple uses: " << *maybeMul << "\n");
-        REMARK([&](){
-            return OptimizationRemarkMissed(DEBUG_TYPE, "Multiuse FMul", mulOp)
-                << "fmul had multiple uses " << ore::NV("fmul", mulOp);
-        });
-        return false;
-    }
-    // On 5.0+ we only need to mark the mulOp as contract and the backend will do the work for us.
-    auto fmf = mulOp->getFastMathFlags();
-    if (!fmf.allowContract()) {
-        LLVM_DEBUG(dbgs() << "Marking mulOp for FMA: " << *maybeMul << "\n");
-        REMARK([&](){
-            return OptimizationRemark(DEBUG_TYPE, "Marked for FMA", mulOp)
-                << "marked for fma " << ore::NV("fmul", mulOp);
-        });
-        ++TotalContracted;
-        fmf.setAllowContract(true);
-        mulOp->copyFastMathFlags(fmf);
-        return true;
-    }
-    return false;
-}
-
-static bool combineMulAdd(Function &F) JL_NOTSAFEPOINT
-{
-    OptimizationRemarkEmitter ORE(&F);
-    bool modified = false;
-    for (auto &BB: F) {
-        for (auto it = BB.begin(); it != BB.end();) {
-            auto &I = *it;
-            it++;
-            switch (I.getOpcode()) {
-            case Instruction::FAdd: {
-                if (!I.hasAllowContract())
-                    continue;
-                modified |= checkCombine(I.getOperand(0), ORE) || checkCombine(I.getOperand(1), ORE);
-                break;
-            }
-            case Instruction::FSub: {
-                if (!I.hasAllowContract())
-                    continue;
-                modified |= checkCombine(I.getOperand(0), ORE) || checkCombine(I.getOperand(1), ORE);
-                break;
-            }
-            default:
-                break;
-            }
-        }
-    }
-#ifdef JL_VERIFY_PASSES
-    assert(!verifyLLVMIR(F));
-#endif
-    return modified;
-}
-
-PreservedAnalyses CombineMulAddPass::run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT
-{
-    if (combineMulAdd(F)) {
-        return PreservedAnalyses::allInSet<CFGAnalyses>();
-    }
-    return PreservedAnalyses::all();
-}
diff --git a/src/llvm-simdloop.cpp b/src/llvm-simdloop.cpp
index 07afa8c930deb..830868c2b24ce 100644
--- a/src/llvm-simdloop.cpp
+++ b/src/llvm-simdloop.cpp
@@ -41,6 +41,7 @@ STATISTIC(ReductionChainLength, "Total sum of instructions folded from reduction
 STATISTIC(MaxChainLength, "Max length of reduction chain");
 STATISTIC(AddChains, "Addition reduction chains");
 STATISTIC(MulChains, "Multiply reduction chains");
+STATISTIC(TotalContracted, "Total number of multiplies marked for FMA");
 
 #ifndef __clang_gcanalyzer__
 #define REMARK(remark) ORE.emit(remark)
@@ -49,6 +50,49 @@ STATISTIC(MulChains, "Multiply reduction chains");
 #endif
 namespace {
 
+/**
+ * Set the `contract` flag on a single-use fmul operand, i.e. turn
+ * ```
+ * %v0 = fmul ... %a, %b
+ * %v = fadd contract ... %v0, %c
+ * ```
+ * into
+ * %v0 = fmul contract ... %a, %b
+ * %v = fadd contract ... %v0, %c
+ * when `%v0` has no other use. Returns true iff the fmul's flags were changed.
+ */
+
+static bool checkCombine(Value *maybeMul, Loop &L, OptimizationRemarkEmitter &ORE) JL_NOTSAFEPOINT
+{
+    auto mulOp = dyn_cast<Instruction>(maybeMul);
+    if (!mulOp || mulOp->getOpcode() != Instruction::FMul)
+        return false;
+    if (L.contains(mulOp))
+        return false;
+    if (!mulOp->hasOneUse()) {
+        LLVM_DEBUG(dbgs() << "mulOp has multiple uses: " << *maybeMul << "\n");
+        REMARK([&](){
+            return OptimizationRemarkMissed(DEBUG_TYPE, "Multiuse FMul", mulOp)
+                << "fmul had multiple uses " << ore::NV("fmul", mulOp);
+        });
+        return false;
+    }
+    // On LLVM 5.0+ it is enough to mark the fmul as contract; the backend then does the FMA fusion.
+    auto fmf = mulOp->getFastMathFlags();
+    if (!fmf.allowContract()) {
+        LLVM_DEBUG(dbgs() << "Marking mulOp for FMA: " << *maybeMul << "\n");
+        REMARK([&](){
+            return OptimizationRemark(DEBUG_TYPE, "Marked for FMA", mulOp)
+                << "marked for fma " << ore::NV("fmul", mulOp);
+        });
+        ++TotalContracted;
+        fmf.setAllowContract(true);
+        mulOp->copyFastMathFlags(fmf);
+        return true;
+    }
+    return false;
+}
+
 static unsigned getReduceOpcode(Instruction *J, Instruction *operand) JL_NOTSAFEPOINT
 {
     switch (J->getOpcode()) {
@@ -150,6 +194,22 @@ static void enableUnsafeAlgebraIfReduction(PHINode *Phi, Loop &L, OptimizationRe
         });
         (*K)->setHasAllowReassoc(true);
         (*K)->setHasAllowContract(true);
+        switch ((*K)->getOpcode()) {
+            case Instruction::FAdd: {
+                if (!(*K)->hasAllowContract())
+                    continue;
+                checkCombine((*K)->getOperand(0), L, ORE) || checkCombine((*K)->getOperand(1), L, ORE);
+                break;
+            }
+            case Instruction::FSub: {
+                if (!(*K)->hasAllowContract())
+                    continue;
+                checkCombine((*K)->getOperand(0), L, ORE) || checkCombine((*K)->getOperand(1), L, ORE);
+                break;
+            }
+            default:
+                break;
+            }
         if (SE)
             SE->forgetValue(*K);
         ++length;
diff --git a/src/passes.h b/src/passes.h
index 6557a5813063d..4c9cba164d049 100644
--- a/src/passes.h
+++ b/src/passes.h
@@ -15,10 +15,6 @@ struct DemoteFloat16Pass : PassInfoMixin<DemoteFloat16Pass> {
     static bool isRequired() { return true; }
 };
 
-struct CombineMulAddPass : PassInfoMixin<CombineMulAddPass> {
-    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
-};
-
 struct LateLowerGCPass : PassInfoMixin<LateLowerGCPass> {
     PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT;
     static bool isRequired() { return true; }
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index 236be179e12c9..f300e4d7757b2 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -568,7 +568,6 @@ static void buildCleanupPipeline(ModulePassManager &MPM, PassBuilder *PB, Optimi
     if (options.cleanup) {
         if (O.getSpeedupLevel() >= 2) {
             FunctionPassManager FPM;
-            JULIA_PASS(FPM.addPass(CombineMulAddPass()));
             FPM.addPass(DivRemPairsPass());
             MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
         }
diff --git a/test/llvmpasses/parsing.ll b/test/llvmpasses/parsing.ll
index e0a726176b225..b8aec5ee2fa71 100644
--- a/test/llvmpasses/parsing.ll
+++ b/test/llvmpasses/parsing.ll
@@ -1,6 +1,6 @@
 ; COM: NewPM-only test, tests for ability to parse Julia passes
 
-; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='module(CPUFeatures,RemoveNI,JuliaMultiVersioning,RemoveJuliaAddrspaces,LowerPTLSPass,function(DemoteFloat16,CombineMulAdd,LateLowerGCFrame,FinalLowerGC,AllocOpt,PropagateJuliaAddrspaces,LowerExcHandlers,GCInvariantVerifier,loop(LowerSIMDLoop,JuliaLICM),GCInvariantVerifier<strong>,GCInvariantVerifier<no-strong>),LowerPTLSPass<imaging>,LowerPTLSPass<no-imaging>,JuliaMultiVersioning<external>,JuliaMultiVersioning<no-external>)' -S %s -o /dev/null
+; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='module(CPUFeatures,RemoveNI,JuliaMultiVersioning,RemoveJuliaAddrspaces,LowerPTLSPass,function(DemoteFloat16,LateLowerGCFrame,FinalLowerGC,AllocOpt,PropagateJuliaAddrspaces,LowerExcHandlers,GCInvariantVerifier,loop(LowerSIMDLoop,JuliaLICM),GCInvariantVerifier<strong>,GCInvariantVerifier<no-strong>),LowerPTLSPass<imaging>,LowerPTLSPass<no-imaging>,JuliaMultiVersioning<external>,JuliaMultiVersioning<no-external>)' -S %s -o /dev/null
 ; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes="julia<level=3;llvm_only>" -S %s -o /dev/null
 ; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes="julia<level=3;no_llvm_only>" -S %s -o /dev/null
 ; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes="julia<level=3;no_enable_vector_pipeline>" -S %s -o /dev/null

From 2547b1d2fd488d9f689e512e170d2090349bb447 Mon Sep 17 00:00:00 2001
From: gbaraldi <baraldigabriel@gmail.com>
Date: Wed, 18 Sep 2024 15:28:51 -0300
Subject: [PATCH 2/2] Add tests and fix behaviour slightly

---
 src/Makefile                      |  2 +-
 src/llvm-simdloop.cpp             | 12 ++++--
 test/llvmpasses/julia-simdloop.ll | 69 +++++++++++++++++++++++++------
 test/llvmpasses/muladd.ll         | 62 ---------------------------
 4 files changed, 67 insertions(+), 78 deletions(-)
 delete mode 100644 test/llvmpasses/muladd.ll

diff --git a/src/Makefile b/src/Makefile
index 52e673aa6cc1a..9dd7060795d6c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -52,7 +52,7 @@ RT_LLVMLINK :=
 CG_LLVMLINK :=
 
 ifeq ($(JULIACODEGEN),LLVM)
-CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop llvm-muladd \
+CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \
 	llvm-final-gc-lowering llvm-pass-helpers llvm-late-gc-lowering llvm-ptls \
 	llvm-lower-handlers llvm-gc-invariant-verifier llvm-propagate-addrspaces \
 	llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \
diff --git a/src/llvm-simdloop.cpp b/src/llvm-simdloop.cpp
index 830868c2b24ce..353799dad2beb 100644
--- a/src/llvm-simdloop.cpp
+++ b/src/llvm-simdloop.cpp
@@ -67,7 +67,7 @@ static bool checkCombine(Value *maybeMul, Loop &L, OptimizationRemarkEmitter &OR
     auto mulOp = dyn_cast<Instruction>(maybeMul);
     if (!mulOp || mulOp->getOpcode() != Instruction::FMul)
         return false;
-    if (L.contains(mulOp))
+    if (!L.contains(mulOp))
         return false;
     if (!mulOp->hasOneUse()) {
         LLVM_DEBUG(dbgs() << "mulOp has multiple uses: " << *maybeMul << "\n");
@@ -198,13 +198,19 @@ static void enableUnsafeAlgebraIfReduction(PHINode *Phi, Loop &L, OptimizationRe
             case Instruction::FAdd: {
                 if (!(*K)->hasAllowContract())
                     continue;
-                checkCombine((*K)->getOperand(0), L, ORE) || checkCombine((*K)->getOperand(1), L, ORE);
+                // Check both operands unconditionally (no `||` short-circuit):
+                // each one may be a single-use fmul inside the loop.
+                checkCombine((*K)->getOperand(0), L, ORE);
+                checkCombine((*K)->getOperand(1), L, ORE);
                 break;
             }
             case Instruction::FSub: {
                 if (!(*K)->hasAllowContract())
                     continue;
-                checkCombine((*K)->getOperand(0), L, ORE) || checkCombine((*K)->getOperand(1), L, ORE);
+                // Same for fsub: the fmul may be the minuend or the subtrahend,
+                // so both operands are always checked.
+                checkCombine((*K)->getOperand(0), L, ORE);
+                checkCombine((*K)->getOperand(1), L, ORE);
                 break;
             }
             default:
diff --git a/test/llvmpasses/julia-simdloop.ll b/test/llvmpasses/julia-simdloop.ll
index a8d5ea3342b20..9a23a2826da70 100644
--- a/test/llvmpasses/julia-simdloop.ll
+++ b/test/llvmpasses/julia-simdloop.ll
@@ -3,18 +3,18 @@
 ; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='loop(LowerSIMDLoop)' -S %s | FileCheck %s
 
 ; CHECK-LABEL: @simd_test(
-define void @simd_test(double *%a, double *%b) {
+define void @simd_test(ptr %a, ptr %b) {
 top:
   br label %loop
 loop:
   %i = phi i64 [0, %top], [%nexti, %loop]
-  %aptr = getelementptr double, double *%a, i64 %i
-  %bptr = getelementptr double, double *%b, i64 %i
+  %aptr = getelementptr double, ptr %a, i64 %i
+  %bptr = getelementptr double, ptr %b, i64 %i
 ; CHECK: llvm.mem.parallel_loop_access
-  %aval = load double, double *%aptr
-  %bval = load double, double *%aptr
+  %aval = load double, ptr %aptr
+  %bval = load double, ptr %aptr
   %cval = fadd double %aval, %bval
-  store double %cval, double *%bptr
+  store double %cval, ptr %bptr
   %nexti = add i64 %i, 1
   %done = icmp sgt i64 %nexti, 500
   br i1 %done, label %loopdone, label %loop, !llvm.loop !1
@@ -23,15 +23,15 @@ loopdone:
 }
 
 ; CHECK-LABEL: @simd_test_sub(
-define double @simd_test_sub(double *%a) {
+define double @simd_test_sub(ptr %a) {
 top:
   br label %loop
 loop:
   %i = phi i64 [0, %top], [%nexti, %loop]
   %v = phi double [0.000000e+00, %top], [%nextv, %loop]
-  %aptr = getelementptr double, double *%a, i64 %i
+  %aptr = getelementptr double, ptr %a, i64 %i
 ; CHECK: llvm.mem.parallel_loop_access
-  %aval = load double, double *%aptr
+  %aval = load double, ptr %aptr
   %nextv = fsub double %v, %aval
 ; CHECK: fsub reassoc contract double %v, %aval
   %nexti = add i64 %i, 1
@@ -42,14 +42,14 @@ loopdone:
 }
 
 ; CHECK-LABEL: @simd_test_sub2(
-define double @simd_test_sub2(double *%a) {
+define double @simd_test_sub2(ptr %a) {
 top:
   br label %loop
 loop:
   %i = phi i64 [0, %top], [%nexti, %loop]
   %v = phi double [0.000000e+00, %top], [%nextv, %loop]
-  %aptr = getelementptr double, double *%a, i64 %i
-  %aval = load double, double *%aptr
+  %aptr = getelementptr double, ptr %a, i64 %i
+  %aval = load double, ptr %aptr
   %nextv = fsub double %v, %aval
 ; CHECK: fsub reassoc contract double %v, %aval
   %nexti = add i64 %i, 1
@@ -59,6 +59,26 @@ loopdone:
   ret double %nextv
 }
 
+; CHECK-LABEL: @simd_test_sub4(
+define double @simd_test_sub4(ptr %a) {
+top:
+  br label %loop
+loop:
+  %i = phi i64 [0, %top], [%nexti, %loop]
+  %v = phi double [0.000000e+00, %top], [%nextv, %loop]
+  %aptr = getelementptr double, ptr %a, i64 %i
+  %aval = load double, ptr %aptr
+  %nextv2 = fmul double %aval, %aval ; single-use fmul feeding the fsub reduction below
+  ; CHECK: fmul contract double %aval, %aval
+  %nextv = fsub double %v, %nextv2
+; CHECK: fsub reassoc contract double %v, %nextv2
+  %nexti = add i64 %i, 1
+  %done = icmp sgt i64 %nexti, 500
+  br i1 %done, label %loopdone, label %loop, !llvm.loop !0
+loopdone:
+  ret double %nextv
+}
+
 ; Tests if we correctly pass through other metadata
 ; CHECK-LABEL: @disabled(
 define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
@@ -82,6 +102,31 @@ for.end:                                          ; preds = %for.body
   ret i32 %1
 }
 
+; Check that we don't add contract to fmuls that are not inside a loop
+; CHECK-LABEL: @dont_add_no_loop(
+define double @dont_add_no_loop(ptr nocapture noundef nonnull readonly align 8 dereferenceable(72) %"a::Tuple", ptr nocapture noundef nonnull readonly align 8 dereferenceable(24) %"b::Tuple") #0 {
+top:
+   %"a::Tuple[9]_ptr" = getelementptr inbounds i8, ptr %"a::Tuple", i64 64
+   %"b::Tuple[3]_ptr" = getelementptr inbounds i8, ptr %"b::Tuple", i64 16
+   %"a::Tuple[6]_ptr" = getelementptr inbounds i8, ptr %"a::Tuple", i64 40
+   %"b::Tuple[2]_ptr" = getelementptr inbounds i8, ptr %"b::Tuple", i64 8
+   %"a::Tuple[3]_ptr" = getelementptr inbounds i8, ptr %"a::Tuple", i64 16
+   %"a::Tuple[3]_ptr.unbox" = load double, ptr %"a::Tuple[3]_ptr", align 8
+   %"b::Tuple.unbox" = load double, ptr %"b::Tuple", align 8
+   %0 = fmul double %"a::Tuple[3]_ptr.unbox", %"b::Tuple.unbox"
+; CHECK: fmul double %
+   %"a::Tuple[6]_ptr.unbox" = load double, ptr %"a::Tuple[6]_ptr", align 8
+   %"b::Tuple[2]_ptr.unbox" = load double, ptr %"b::Tuple[2]_ptr", align 8
+   %1 = fmul contract double %"a::Tuple[6]_ptr.unbox", %"b::Tuple[2]_ptr.unbox"
+   %2 = fadd contract double %0, %1
+   %"a::Tuple[9]_ptr.unbox" = load double, ptr %"a::Tuple[9]_ptr", align 8
+   %"b::Tuple[3]_ptr.unbox" = load double, ptr %"b::Tuple[3]_ptr", align 8
+   %3 = fmul contract double %"a::Tuple[9]_ptr.unbox", %"b::Tuple[3]_ptr.unbox"
+   %4 = fadd contract double %2, %3
+   ret double %4
+}
+
+
 !0 = distinct !{!0, !"julia.simdloop"}
 !1 = distinct !{!1, !"julia.simdloop", !"julia.ivdep"}
 !2 = distinct !{!2, !"julia.simdloop", !"julia.ivdep", !3}
diff --git a/test/llvmpasses/muladd.ll b/test/llvmpasses/muladd.ll
deleted file mode 100644
index 079582305ee72..0000000000000
--- a/test/llvmpasses/muladd.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; This file is a part of Julia. License is MIT: https://julialang.org/license
-
-; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='CombineMulAdd' -S %s | FileCheck %s
-
-
-; CHECK-LABEL: @fast_muladd1
-define double @fast_muladd1(double %a, double %b, double %c) {
-top:
-; CHECK: {{contract|fmuladd}}
-  %v1 = fmul double %a, %b
-  %v2 = fadd fast double %v1, %c
-; CHECK: ret double
-  ret double %v2
-}
-
-; CHECK-LABEL: @fast_mulsub1
-define double @fast_mulsub1(double %a, double %b, double %c) {
-top:
-; CHECK: {{contract|fmuladd}}
-  %v1 = fmul double %a, %b
-  %v2 = fsub fast double %v1, %c
-; CHECK: ret double
-  ret double %v2
-}
-
-; CHECK-LABEL: @fast_mulsub_vec1
-define <2 x double> @fast_mulsub_vec1(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
-top:
-; CHECK: {{contract|fmuladd}}
-  %v1 = fmul <2 x double> %a, %b
-  %v2 = fsub fast <2 x double> %c, %v1
-; CHECK: ret <2 x double>
-  ret <2 x double> %v2
-}
-
-; COM: Should not mark fmul as contract when multiple uses of fmul exist
-; CHECK-LABEL: @slow_muladd1
-define double @slow_muladd1(double %a, double %b, double %c) {
-top:
-; CHECK: %v1 = fmul double %a, %b
-  %v1 = fmul double %a, %b
-; CHECK: %v2 = fadd fast double %v1, %c
-  %v2 = fadd fast double %v1, %c
-; CHECK: %v3 = fadd fast double %v1, %b
-  %v3 = fadd fast double %v1, %b
-; CHECK: %v4 = fadd fast double %v3, %v2
-  %v4 = fadd fast double %v3, %v2
-; CHECK: ret double %v4
-  ret double %v4
-}
-
-; COM: Should not mark fadd->fadd fast as contract
-; CHECK-LABEL: @slow_addadd1
-define double @slow_addadd1(double %a, double %b, double %c) {
-top:
-; CHECK: %v1 = fadd double %a, %b
-  %v1 = fadd double %a, %b
-; CHECK: %v2 = fadd fast double %v1, %c
-  %v2 = fadd fast double %v1, %c
-; CHECK: ret double %v2
-  ret double %v2
-}