Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement @fastmath #9406

Merged
merged 7 commits into from
Jan 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions base/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,24 @@ macro inbounds(blk)
:(@boundscheck false $(esc(blk)))
end

# Recursively rewrite an expression, replacing supported arithmetic
# operator symbols with their *_fast counterparts (which may relax
# strict IEEE semantics). Anything unrecognized passes through unchanged.
function make_fastmath(expr)
    if isa(expr, Expr)
        # rebuild the node, transforming the head and every argument
        return Expr(make_fastmath(expr.head),
                    [make_fastmath(arg) for arg in expr.args]...)
    end
    expr == :+   && return :add_fast
    expr == :-   && return :sub_fast
    expr == :*   && return :mul_fast
    expr == :/   && return :div_fast
    expr == :rem && return :rem_fast
    expr == :mod && return :mod_fast
    expr == :cmp && return :cmp_fast
    return expr
end

# @fastmath rewrites arithmetic operators in `expr` into their *_fast
# counterparts (see make_fastmath above), which are allowed to relax
# strict IEEE floating-point semantics for speed.
# The expression is escaped first so the rewritten symbols resolve in
# the caller's scope.
macro fastmath(expr)
    make_fastmath(esc(expr))
end

macro label(name::Symbol)
Expr(:symboliclabel, name)
end
Expand Down
10 changes: 10 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,15 @@ export
√,
∛,

# fast math
add_fast,
sub_fast,
mul_fast,
div_fast,
rem_fast,
mod_fast,
cmp_fast,

# specfun
airy,
airyai,
Expand Down Expand Up @@ -1417,6 +1426,7 @@ export
@deprecate,
@boundscheck,
@inbounds,
@fastmath,
@simd,
@label,
@goto,
Expand Down
42 changes: 42 additions & 0 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,35 @@ widen(::Type{Float32}) = Float64
rem(x::Float32, y::Float32) = box(Float32,rem_float(unbox(Float32,x),unbox(Float32,y)))
rem(x::Float64, y::Float64) = box(Float64,rem_float(unbox(Float64,x),unbox(Float64,y)))

# fast versions that may violate strict IEEE semantics
# TODO: provide isnan_fast and friends
# Generic entry points for each fast op: anything without a specialized
# fast method below falls back to the exact (slow) operation.
for (op_fast, op) in ((:add_fast, :+), (:sub_fast, :-),
                      (:mul_fast, :*), (:div_fast, :/),
                      (:rem_fast, :rem), (:mod_fast, :mod),
                      (:cmp_fast, :cmp))
    @eval begin
        # fall-back implementation for non-numeric types
        ($op_fast)(xs...) = ($op)(xs...)
        # type promotion: bring mixed Number arguments to a common type,
        # then retry so a specialized method can apply
        ($op_fast)(x::Number, y::Number, zs::Number...) =
            ($op_fast)(promote(x,y,zs...)...)
        # fall-back implementation that applies after promotion
        # (hit when no Float32/Float64 specialization exists for T)
        ($op_fast){T<:Number}(x::T,ys::T...) = ($op)(x,ys...)
    end
end
# Float32/Float64 specializations backed by the *_float_fast intrinsics,
# which permit LLVM's unsafe-algebra optimizations (see intrinsics.cpp).
for T in (Float32, Float64)
    @eval begin
        # unary negation
        sub_fast(x::$T) = box($T,neg_float_fast(unbox($T,x)))
        add_fast(x::$T, y::$T) = box($T,add_float_fast(unbox($T,x),unbox($T,y)))
        sub_fast(x::$T, y::$T) = box($T,sub_float_fast(unbox($T,x),unbox($T,y)))
        mul_fast(x::$T, y::$T) = box($T,mul_float_fast(unbox($T,x),unbox($T,y)))
        div_fast(x::$T, y::$T) = box($T,div_float_fast(unbox($T,x),unbox($T,y)))
        rem_fast(x::$T, y::$T) = box($T,rem_float_fast(unbox($T,x),unbox($T,y)))
        # n-ary forms: left-associative folds over the binary ops
        add_fast(x::$T, y::$T, zs::$T...) = add_fast(add_fast(x, y), zs...)
        mul_fast(x::$T, y::$T, zs::$T...) = mul_fast(mul_fast(x, y), zs...)
    end
end

cld{T<:FloatingPoint}(x::T, y::T) = -fld(-x,y)

function mod{T<:FloatingPoint}(x::T, y::T)
Expand All @@ -217,6 +246,16 @@ function mod{T<:FloatingPoint}(x::T, y::T)
end
end

# Fast flooring modulus built on the fast remainder; the result carries
# the sign of the divisor y.
function mod_fast{T<:FloatingPoint}(x::T, y::T)
    remdr = rem_fast(x,y)
    # keep y's sign on a zero result (signed-zero handling)
    remdr == 0 && return copysign(remdr,y)
    # when remainder and divisor have opposite signs (xor of the two
    # comparisons), shift the remainder into y's half-line
    ((remdr > 0) $ (y > 0)) ? remdr+y : remdr
end

## floating point comparisons ##
==(x::Float32, y::Float32) = eq_float(unbox(Float32,x),unbox(Float32,y))
Expand Down Expand Up @@ -248,6 +287,9 @@ function cmp(x::FloatingPoint, y::Real)
ifelse(x<y, -1, ifelse(x>y, 1, 0))
end

# Fast three-way float comparison: -1 if x < y, 1 if x > y, otherwise 0.
# NaN operands satisfy neither comparison and therefore yield 0.
for T in (Float32, Float64)
    @eval cmp_fast(x::$T, y::$T) = ifelse(x < y, -1, ifelse(x > y, 1, 0))
end

for Ti in (Int64,UInt64,Int128,UInt128)
for Tf in (Float32,Float64)
@eval begin
Expand Down
1 change: 1 addition & 0 deletions base/inference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ immutable JLCompilerOpts
opt_level::Int8
depwarn::Int8
can_inline::Int8
fast_math::Int8
end

compileropts() = unsafe_load(cglobal(:jl_compileropts, JLCompilerOpts))
Expand Down
5 changes: 5 additions & 0 deletions doc/man/julia.1
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ Enable or disable color text
--check-bounds={yes|no}
Emit bounds checks always or never (ignoring declarations)

.TP
--math-mode={ieee|user}
Always use IEEE semantics for math (ignoring declarations),
or adhere to declarations in source code

.TP
--int-literals={32|64}
Select integer literal size independent of platform
Expand Down
2 changes: 2 additions & 0 deletions doc/manual/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ those available for the ``perl`` and ``ruby`` programs::
--track-allocation={none|user|all}
Count bytes allocated by each source line
--check-bounds={yes|no} Emit bounds checks always or never (ignoring declarations)
--math-mode={ieee|user} Always use IEEE semantics for math (ignoring declarations),
or adhere to declarations in source code
-O, --optimize Run time-intensive code optimizations
--int-literals={32|64} Select integer literal size independent of platform
--dump-bitcode={yes|no} Dump bitcode for the system image (used with --build)
Expand Down
95 changes: 94 additions & 1 deletion doc/manual/performance-tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -546,11 +546,15 @@ properties.
- Use :obj:`@inbounds` to eliminate array bounds checking within expressions.
Be certain before doing this. If the subscripts are ever out of bounds,
you may suffer crashes or silent corruption.
- Use :obj:`@fastmath` to allow floating point optimizations that are
correct for real numbers, but lead to differences for IEEE numbers.
Be careful when doing this, as this may change numerical results.
This corresponds to the ``-ffast-math`` option of clang.
- Write :obj:`@simd` in front of ``for`` loops that are amenable to vectorization.
**This feature is experimental** and could change or disappear in future
versions of Julia.

Here is an example with both forms of markup::
Here is an example with both :obj:`@inbounds` and :obj:`@simd` markup::

function inner( x, y )
s = zero(eltype(x))
Expand Down Expand Up @@ -621,6 +625,95 @@ properties:
LLVM auto-vectorization may kick in automatically, leading to no further
speedup with :obj:`@simd`.

Here is an example with all three kinds of markup. This program first
calculates the finite difference of a one-dimensional array, and then
evaluates the L2-norm of the result::

function init!(u)
n = length(u)
dx = 1.0 / (n-1)
@fastmath @inbounds @simd for i in 1:n
u[i] = sin(2pi*dx*i)
end
end

function deriv!(u, du)
n = length(u)
dx = 1.0 / (n-1)
@fastmath @inbounds du[1] = (u[2] - u[1]) / dx
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if it's worth showing the trick of wrapping several statements in a begin...end pair so that the @fastmath @inbounds only has to be written once.

@fastmath @inbounds @simd for i in 2:n-1
du[i] = (u[i+1] - u[i-1]) / (2*dx)
end
@fastmath @inbounds du[n] = (u[n] - u[n-1]) / dx
end

function norm(u)
n = length(u)
T = eltype(u)
s = zero(T)
@fastmath @inbounds @simd for i in 1:n
s += u[i]^2
end
@fastmath @inbounds return sqrt(s/n)
end

function main()
n = 2000
u = Array(Float64, n)
init!(u)
du = similar(u)

deriv!(u, du)
nu = norm(du)

@time for i in 1:10^6
deriv!(u, du)
nu = norm(du)
end

println(nu)
end

main()

On a computer with a 2.7 GHz Intel Core i7 processor, this produces::

$ julia wave.jl
elapsed time: 1.207814709 seconds (0 bytes allocated)
4.443986180758243

$ julia --math-mode=ieee wave.jl
elapsed time: 4.487083643 seconds (0 bytes allocated)
4.443986180758243

Here, the option ``--math-mode=ieee`` disables the :obj:`@fastmath`
macro, so that we can compare results.

In this case, the speedup due to :obj:`@fastmath` is a factor of about
3.7. This is unusually large -- in general, the speedup will be
smaller. (In this particular example, the working set of the benchmark
is small enough to fit into the L1 cache of the processor, so that
memory access latency does not play a role, and computing time is
dominated by CPU usage. In many real world programs this is not the
case.) Also, in this case this optimization does not change the result
-- in general, the result will be slightly different. In some cases,
especially for numerically unstable algorithms, the result can be very
different.

The annotation :obj:`@fastmath` re-arranges floating point
expressions, e.g. changing the order of evaluation, or assuming that
certain special cases (inf, nan) cannot occur. In this case (and on
this particular computer), the main difference is that the expression
``1 / (2*dx)`` in the function ``deriv!`` is hoisted out of the loop
(i.e. calculated outside the loop), as if one had written ``idx = 1 /
(2*dx)``. In the loop, the expression ``... / (2*dx)`` then becomes
``... * idx``, which is much faster to evaluate. Of course, both the
actual optimization that is applied by the compiler as well as the
resulting speedup depend very much on the hardware. You can examine
the change in generated code by using Julia's :obj:`code_native`
function.


.. _man-code-warntype:

:obj:`@code_warntype`
Expand Down
1 change: 1 addition & 0 deletions src/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ jl_sym_t *compositetype_sym; jl_sym_t *type_goto_sym;
jl_sym_t *global_sym; jl_sym_t *tuple_sym;
jl_sym_t *dot_sym; jl_sym_t *newvar_sym;
jl_sym_t *boundscheck_sym; jl_sym_t *copyast_sym;
jl_sym_t *fastmath_sym;
jl_sym_t *simdloop_sym; jl_sym_t *meta_sym;
jl_sym_t *arrow_sym; jl_sym_t *ldots_sym;

Expand Down
3 changes: 2 additions & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ jl_compileropts_t jl_compileropts = { NULL, // julia_home
JL_COMPILEROPT_COMPILE_DEFAULT,
0, // opt_level
1, // depwarn
1 // can_inline
1, // can_inline
JL_COMPILEROPT_FAST_MATH_DEFAULT
};

int jl_boot_file_loaded = 0;
Expand Down
3 changes: 3 additions & 0 deletions src/interpreter.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,9 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl)
else if (ex->head == boundscheck_sym) {
return (jl_value_t*)jl_nothing;
}
else if (ex->head == fastmath_sym) {
return (jl_value_t*)jl_nothing;
}
else if (ex->head == simdloop_sym) {
return (jl_value_t*)jl_nothing;
}
Expand Down
48 changes: 41 additions & 7 deletions src/intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ namespace JL_I {
neg_int, add_int, sub_int, mul_int,
sdiv_int, udiv_int, srem_int, urem_int, smod_int,
neg_float, add_float, sub_float, mul_float, div_float, rem_float,
// fast arithmetic
neg_float_fast, add_float_fast, sub_float_fast,
mul_float_fast, div_float_fast, rem_float_fast,
// same-type comparisons
eq_int, ne_int,
slt_int, ult_int,
Expand Down Expand Up @@ -718,6 +721,26 @@ static Value *emit_srem(Value *x, Value *den, jl_codectx_t *ctx)
return ret;
}

// Temporarily switch the builder to fast-math mode if requested.
// RAII guard: the constructor saves the global IRBuilder's current
// fast-math flags and the destructor restores them, so unsafe-algebra
// IR is only emitted during the guard's lifetime.
struct math_builder {
    FastMathFlags old_fmf;  // flags to restore on destruction
    math_builder(jl_codectx_t *ctx, bool always_fast = false):
        old_fmf(builder.getFastMathFlags())
    {
        // Enable unsafe algebra unless fast math is globally disabled
        // (--math-mode=ieee); it turns on when either this call site
        // demands it (always_fast, used by the *_fast intrinsics) or
        // fast math is globally forced on.
        if (jl_compileropts.fast_math != JL_COMPILEROPT_FAST_MATH_OFF &&
            (always_fast ||
             jl_compileropts.fast_math == JL_COMPILEROPT_FAST_MATH_ON)) {
            FastMathFlags fmf;
            fmf.setUnsafeAlgebra();
            builder.SetFastMathFlags(fmf);
        }
    }
    // access the (possibly fast-math-enabled) shared builder
    IRBuilder<>& operator()() const { return builder; }
    ~math_builder() {
        builder.SetFastMathFlags(old_fmf);
    }
};

static Value *emit_smod(Value *x, Value *den, jl_codectx_t *ctx)
{
Type *t = den->getType();
Expand Down Expand Up @@ -926,15 +949,24 @@ static Value *emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
// that do the correct thing on LLVM <= 3.3 and >= 3.5 respectively.
// See issue #7868
#ifdef LLVM35
HANDLE(neg_float,1) return builder.CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
HANDLE(neg_float,1) return math_builder(ctx)().CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
HANDLE(neg_float_fast,1) return math_builder(ctx, true)().CreateFNeg(FP(x));
#else
HANDLE(neg_float,1) return builder.CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
HANDLE(neg_float,1)
return math_builder(ctx)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
HANDLE(neg_float_fast,1)
return math_builder(ctx, true)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
#endif
HANDLE(add_float,2) return builder.CreateFAdd(FP(x), FP(y));
HANDLE(sub_float,2) return builder.CreateFSub(FP(x), FP(y));
HANDLE(mul_float,2) return builder.CreateFMul(FP(x), FP(y));
HANDLE(div_float,2) return builder.CreateFDiv(FP(x), FP(y));
HANDLE(rem_float,2) return builder.CreateFRem(FP(x), FP(y));
HANDLE(add_float,2) return math_builder(ctx)().CreateFAdd(FP(x), FP(y));
HANDLE(sub_float,2) return math_builder(ctx)().CreateFSub(FP(x), FP(y));
HANDLE(mul_float,2) return math_builder(ctx)().CreateFMul(FP(x), FP(y));
HANDLE(div_float,2) return math_builder(ctx)().CreateFDiv(FP(x), FP(y));
HANDLE(rem_float,2) return math_builder(ctx)().CreateFRem(FP(x), FP(y));
HANDLE(add_float_fast,2) return math_builder(ctx, true)().CreateFAdd(FP(x), FP(y));
HANDLE(sub_float_fast,2) return math_builder(ctx, true)().CreateFSub(FP(x), FP(y));
HANDLE(mul_float_fast,2) return math_builder(ctx, true)().CreateFMul(FP(x), FP(y));
HANDLE(div_float_fast,2) return math_builder(ctx, true)().CreateFDiv(FP(x), FP(y));
HANDLE(rem_float_fast,2) return math_builder(ctx, true)().CreateFRem(FP(x), FP(y));

HANDLE(checked_sadd,2)
HANDLE(checked_uadd,2)
Expand Down Expand Up @@ -1262,6 +1294,8 @@ extern "C" void jl_init_intrinsic_functions(void)
ADD_I(smod_int);
ADD_I(neg_float); ADD_I(add_float); ADD_I(sub_float); ADD_I(mul_float);
ADD_I(div_float); ADD_I(rem_float);
ADD_I(neg_float_fast); ADD_I(add_float_fast); ADD_I(sub_float_fast);
ADD_I(mul_float_fast); ADD_I(div_float_fast); ADD_I(rem_float_fast);
ADD_I(eq_int); ADD_I(ne_int);
ADD_I(slt_int); ADD_I(ult_int);
ADD_I(sle_int); ADD_I(ule_int);
Expand Down
1 change: 1 addition & 0 deletions src/jltypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -3279,6 +3279,7 @@ void jl_init_types(void)
kw_sym = jl_symbol("kw");
dot_sym = jl_symbol(".");
boundscheck_sym = jl_symbol("boundscheck");
fastmath_sym = jl_symbol("fastmath");
newvar_sym = jl_symbol("newvar");
copyast_sym = jl_symbol("copyast");
simdloop_sym = jl_symbol("simdloop");
Expand Down
Loading