Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement @fastmath #9406

Merged
merged 7 commits into from
Jan 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions base/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,24 @@ macro inbounds(blk)
:(@boundscheck false $(esc(blk)))
end

# Recursively rewrite an expression, replacing supported arithmetic
# operator symbols with their *_fast counterparts (which may relax
# strict IEEE semantics). Anything unrecognized passes through unchanged.
function make_fastmath(expr)
    if isa(expr, Expr)
        # rebuild the node, transforming the head and every argument
        return Expr(make_fastmath(expr.head),
                    [make_fastmath(arg) for arg in expr.args]...)
    end
    expr == :+   && return :add_fast
    expr == :-   && return :sub_fast
    expr == :*   && return :mul_fast
    expr == :/   && return :div_fast
    expr == :rem && return :rem_fast
    expr == :mod && return :mod_fast
    expr == :cmp && return :cmp_fast
    return expr
end

# @fastmath rewrites arithmetic operators in `expr` into their *_fast
# counterparts (see make_fastmath above), which are allowed to relax
# strict IEEE floating-point semantics for speed.
# The expression is escaped first so the rewritten symbols resolve in
# the caller's scope.
macro fastmath(expr)
    make_fastmath(esc(expr))
end

macro label(name::Symbol)
Expr(:symboliclabel, name)
end
Expand Down
10 changes: 10 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,15 @@ export
√,
∛,

# fast math
add_fast,
sub_fast,
mul_fast,
div_fast,
rem_fast,
mod_fast,
cmp_fast,

# specfun
airy,
airyai,
Expand Down Expand Up @@ -1417,6 +1426,7 @@ export
@deprecate,
@boundscheck,
@inbounds,
@fastmath,
@simd,
@label,
@goto,
Expand Down
42 changes: 42 additions & 0 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,35 @@ widen(::Type{Float32}) = Float64
rem(x::Float32, y::Float32) = box(Float32,rem_float(unbox(Float32,x),unbox(Float32,y)))
rem(x::Float64, y::Float64) = box(Float64,rem_float(unbox(Float64,x),unbox(Float64,y)))

# fast versions that may violate strict IEEE semantics
# TODO: provide isnan_fast and friends
# Generic entry points for each fast op: anything without a specialized
# fast method below falls back to the exact (slow) operation.
for (op_fast, op) in ((:add_fast, :+), (:sub_fast, :-),
                      (:mul_fast, :*), (:div_fast, :/),
                      (:rem_fast, :rem), (:mod_fast, :mod),
                      (:cmp_fast, :cmp))
    @eval begin
        # fall-back implementation for non-numeric types
        ($op_fast)(xs...) = ($op)(xs...)
        # type promotion: bring mixed Number arguments to a common type,
        # then retry so a specialized method can apply
        ($op_fast)(x::Number, y::Number, zs::Number...) =
            ($op_fast)(promote(x,y,zs...)...)
        # fall-back implementation that applies after promotion
        # (hit when no Float32/Float64 specialization exists for T)
        ($op_fast){T<:Number}(x::T,ys::T...) = ($op)(x,ys...)
    end
end
# Float32/Float64 specializations backed by the *_float_fast intrinsics,
# which permit LLVM's unsafe-algebra optimizations (see intrinsics.cpp).
for T in (Float32, Float64)
    @eval begin
        # unary negation
        sub_fast(x::$T) = box($T,neg_float_fast(unbox($T,x)))
        add_fast(x::$T, y::$T) = box($T,add_float_fast(unbox($T,x),unbox($T,y)))
        sub_fast(x::$T, y::$T) = box($T,sub_float_fast(unbox($T,x),unbox($T,y)))
        mul_fast(x::$T, y::$T) = box($T,mul_float_fast(unbox($T,x),unbox($T,y)))
        div_fast(x::$T, y::$T) = box($T,div_float_fast(unbox($T,x),unbox($T,y)))
        rem_fast(x::$T, y::$T) = box($T,rem_float_fast(unbox($T,x),unbox($T,y)))
        # n-ary forms: left-associative folds over the binary ops
        add_fast(x::$T, y::$T, zs::$T...) = add_fast(add_fast(x, y), zs...)
        mul_fast(x::$T, y::$T, zs::$T...) = mul_fast(mul_fast(x, y), zs...)
    end
end

cld{T<:FloatingPoint}(x::T, y::T) = -fld(-x,y)

function mod{T<:FloatingPoint}(x::T, y::T)
Expand All @@ -217,6 +246,16 @@ function mod{T<:FloatingPoint}(x::T, y::T)
end
end

# Fast flooring modulus built on the fast remainder; the result carries
# the sign of the divisor y.
function mod_fast{T<:FloatingPoint}(x::T, y::T)
    remdr = rem_fast(x,y)
    # keep y's sign on a zero result (signed-zero handling)
    remdr == 0 && return copysign(remdr,y)
    # when remainder and divisor have opposite signs (xor of the two
    # comparisons), shift the remainder into y's half-line
    ((remdr > 0) $ (y > 0)) ? remdr+y : remdr
end

## floating point comparisons ##
==(x::Float32, y::Float32) = eq_float(unbox(Float32,x),unbox(Float32,y))
Expand Down Expand Up @@ -248,6 +287,9 @@ function cmp(x::FloatingPoint, y::Real)
ifelse(x<y, -1, ifelse(x>y, 1, 0))
end

# Fast three-way float comparison: -1 if x < y, 1 if x > y, otherwise 0.
# NaN operands satisfy neither comparison and therefore yield 0.
for T in (Float32, Float64)
    @eval cmp_fast(x::$T, y::$T) = ifelse(x < y, -1, ifelse(x > y, 1, 0))
end

for Ti in (Int64,UInt64,Int128,UInt128)
for Tf in (Float32,Float64)
@eval begin
Expand Down
1 change: 1 addition & 0 deletions base/inference.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ immutable JLCompilerOpts
opt_level::Int8
depwarn::Int8
can_inline::Int8
fast_math::Int8
end

compileropts() = unsafe_load(cglobal(:jl_compileropts, JLCompilerOpts))
Expand Down
5 changes: 5 additions & 0 deletions doc/man/julia.1
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ Enable or disable color text
--check-bounds={yes|no}
Emit bounds checks always or never (ignoring declarations)

.TP
--math-mode={ieee|user}
Always use IEEE semantics for math (ignoring declarations),
or adhere to declarations in source code

.TP
--int-literals={32|64}
Select integer literal size independent of platform
Expand Down
2 changes: 2 additions & 0 deletions doc/manual/getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ those available for the ``perl`` and ``ruby`` programs::
--track-allocation={none|user|all}
Count bytes allocated by each source line
--check-bounds={yes|no} Emit bounds checks always or never (ignoring declarations)
--math-mode={ieee|user} Always use IEEE semantics for math (ignoring declarations),
or adhere to declarations in source code
-O, --optimize Run time-intensive code optimizations
--int-literals={32|64} Select integer literal size independent of platform
--dump-bitcode={yes|no} Dump bitcode for the system image (used with --build)
Expand Down
95 changes: 94 additions & 1 deletion doc/manual/performance-tips.rst
Original file line number Diff line number Diff line change
Expand Up @@ -546,11 +546,15 @@ properties.
- Use :obj:`@inbounds` to eliminate array bounds checking within expressions.
Be certain before doing this. If the subscripts are ever out of bounds,
you may suffer crashes or silent corruption.
- Use :obj:`@fastmath` to allow floating point optimizations that are
correct for real numbers, but lead to differences for IEEE numbers.
Be careful when doing this, as this may change numerical results.
This corresponds to the ``-ffast-math`` option of clang.
- Write :obj:`@simd` in front of ``for`` loops that are amenable to vectorization.
**This feature is experimental** and could change or disappear in future
versions of Julia.

Here is an example with both forms of markup::
Here is an example with both :obj:`@inbounds` and :obj:`@simd` markup::

function inner( x, y )
s = zero(eltype(x))
Expand Down Expand Up @@ -621,6 +625,95 @@ properties:
LLVM auto-vectorization may kick in automatically, leading to no further
speedup with :obj:`@simd`.

Here is an example with all three kinds of markup. This program first
calculates the finite difference of a one-dimensional array, and then
evaluates the L2-norm of the result::

function init!(u)
n = length(u)
dx = 1.0 / (n-1)
@fastmath @inbounds @simd for i in 1:n
u[i] = sin(2pi*dx*i)
end
end

function deriv!(u, du)
n = length(u)
dx = 1.0 / (n-1)
@fastmath @inbounds du[1] = (u[2] - u[1]) / dx
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering if it's worth showing the trick of wrapping several statements in a begin...end pair so that the @fastmath @inbounds only has to be written once.

@fastmath @inbounds @simd for i in 2:n-1
du[i] = (u[i+1] - u[i-1]) / (2*dx)
end
@fastmath @inbounds du[n] = (u[n] - u[n-1]) / dx
end

function norm(u)
n = length(u)
T = eltype(u)
s = zero(T)
@fastmath @inbounds @simd for i in 1:n
s += u[i]^2
end
@fastmath @inbounds return sqrt(s/n)
end

function main()
n = 2000
u = Array(Float64, n)
init!(u)
du = similar(u)

deriv!(u, du)
nu = norm(du)

@time for i in 1:10^6
deriv!(u, du)
nu = norm(du)
end

println(nu)
end

main()

On a computer with a 2.7 GHz Intel Core i7 processor, this produces::

$ julia wave.jl
elapsed time: 1.207814709 seconds (0 bytes allocated)
4.443986180758243

$ julia --math-mode=ieee wave.jl
elapsed time: 4.487083643 seconds (0 bytes allocated)
4.443986180758243

Here, the option ``--math-mode=ieee`` disables the :obj:`@fastmath`
macro, so that we can compare results.

In this case, the speedup due to :obj:`@fastmath` is a factor of about
3.7. This is unusually large -- in general, the speedup will be
smaller. (In this particular example, the working set of the benchmark
is small enough to fit into the L1 cache of the processor, so that
memory access latency does not play a role, and computing time is
dominated by CPU usage. In many real world programs this is not the
case.) Also, in this case this optimization does not change the result
-- in general, the result will be slightly different. In some cases,
especially for numerically unstable algorithms, the result can be very
different.

The annotation :obj:`@fastmath` re-arranges floating point
expressions, e.g. changing the order of evaluation, or assuming that
certain special cases (inf, nan) cannot occur. In this case (and on
this particular computer), the main difference is that the expression
``1 / (2*dx)`` in the function ``deriv!`` is hoisted out of the loop
(i.e. calculated outside the loop), as if one had written ``idx = 1 /
(2*dx)``. In the loop, the expression ``... / (2*dx)`` then becomes
``... * idx``, which is much faster to evaluate. Of course, both the
actual optimization that is applied by the compiler as well as the
resulting speedup depend very much on the hardware. You can examine
the change in generated code by using Julia's :obj:`code_native`
function.


.. _man-code-warntype:

:obj:`@code_warntype`
Expand Down
1 change: 1 addition & 0 deletions src/alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ jl_sym_t *compositetype_sym; jl_sym_t *type_goto_sym;
jl_sym_t *global_sym; jl_sym_t *tuple_sym;
jl_sym_t *dot_sym; jl_sym_t *newvar_sym;
jl_sym_t *boundscheck_sym; jl_sym_t *copyast_sym;
jl_sym_t *fastmath_sym;
jl_sym_t *simdloop_sym; jl_sym_t *meta_sym;
jl_sym_t *arrow_sym; jl_sym_t *ldots_sym;

Expand Down
3 changes: 2 additions & 1 deletion src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ jl_compileropts_t jl_compileropts = { NULL, // julia_home
JL_COMPILEROPT_COMPILE_DEFAULT,
0, // opt_level
1, // depwarn
1 // can_inline
1, // can_inline
JL_COMPILEROPT_FAST_MATH_DEFAULT
};

int jl_boot_file_loaded = 0;
Expand Down
3 changes: 3 additions & 0 deletions src/interpreter.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,9 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl)
else if (ex->head == boundscheck_sym) {
return (jl_value_t*)jl_nothing;
}
else if (ex->head == fastmath_sym) {
return (jl_value_t*)jl_nothing;
}
else if (ex->head == simdloop_sym) {
return (jl_value_t*)jl_nothing;
}
Expand Down
48 changes: 41 additions & 7 deletions src/intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ namespace JL_I {
neg_int, add_int, sub_int, mul_int,
sdiv_int, udiv_int, srem_int, urem_int, smod_int,
neg_float, add_float, sub_float, mul_float, div_float, rem_float,
// fast arithmetic
neg_float_fast, add_float_fast, sub_float_fast,
mul_float_fast, div_float_fast, rem_float_fast,
// same-type comparisons
eq_int, ne_int,
slt_int, ult_int,
Expand Down Expand Up @@ -718,6 +721,26 @@ static Value *emit_srem(Value *x, Value *den, jl_codectx_t *ctx)
return ret;
}

// Temporarily switch the builder to fast-math mode if requested.
// RAII guard: the constructor saves the global IRBuilder's current
// fast-math flags and the destructor restores them, so unsafe-algebra
// IR is only emitted during the guard's lifetime.
struct math_builder {
    FastMathFlags old_fmf;  // flags to restore on destruction
    math_builder(jl_codectx_t *ctx, bool always_fast = false):
        old_fmf(builder.getFastMathFlags())
    {
        // Enable unsafe algebra unless fast math is globally disabled
        // (--math-mode=ieee); it turns on when either this call site
        // demands it (always_fast, used by the *_fast intrinsics) or
        // fast math is globally forced on.
        if (jl_compileropts.fast_math != JL_COMPILEROPT_FAST_MATH_OFF &&
            (always_fast ||
             jl_compileropts.fast_math == JL_COMPILEROPT_FAST_MATH_ON)) {
            FastMathFlags fmf;
            fmf.setUnsafeAlgebra();
            builder.SetFastMathFlags(fmf);
        }
    }
    // access the (possibly fast-math-enabled) shared builder
    IRBuilder<>& operator()() const { return builder; }
    ~math_builder() {
        builder.SetFastMathFlags(old_fmf);
    }
};

static Value *emit_smod(Value *x, Value *den, jl_codectx_t *ctx)
{
Type *t = den->getType();
Expand Down Expand Up @@ -926,15 +949,24 @@ static Value *emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
// that do the correct thing on LLVM <= 3.3 and >= 3.5 respectively.
// See issue #7868
#ifdef LLVM35
HANDLE(neg_float,1) return builder.CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
HANDLE(neg_float,1) return math_builder(ctx)().CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
HANDLE(neg_float_fast,1) return math_builder(ctx, true)().CreateFNeg(FP(x));
#else
HANDLE(neg_float,1) return builder.CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
HANDLE(neg_float,1)
return math_builder(ctx)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
HANDLE(neg_float_fast,1)
return math_builder(ctx, true)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
#endif
HANDLE(add_float,2) return builder.CreateFAdd(FP(x), FP(y));
HANDLE(sub_float,2) return builder.CreateFSub(FP(x), FP(y));
HANDLE(mul_float,2) return builder.CreateFMul(FP(x), FP(y));
HANDLE(div_float,2) return builder.CreateFDiv(FP(x), FP(y));
HANDLE(rem_float,2) return builder.CreateFRem(FP(x), FP(y));
HANDLE(add_float,2) return math_builder(ctx)().CreateFAdd(FP(x), FP(y));
HANDLE(sub_float,2) return math_builder(ctx)().CreateFSub(FP(x), FP(y));
HANDLE(mul_float,2) return math_builder(ctx)().CreateFMul(FP(x), FP(y));
HANDLE(div_float,2) return math_builder(ctx)().CreateFDiv(FP(x), FP(y));
HANDLE(rem_float,2) return math_builder(ctx)().CreateFRem(FP(x), FP(y));
HANDLE(add_float_fast,2) return math_builder(ctx, true)().CreateFAdd(FP(x), FP(y));
HANDLE(sub_float_fast,2) return math_builder(ctx, true)().CreateFSub(FP(x), FP(y));
HANDLE(mul_float_fast,2) return math_builder(ctx, true)().CreateFMul(FP(x), FP(y));
HANDLE(div_float_fast,2) return math_builder(ctx, true)().CreateFDiv(FP(x), FP(y));
HANDLE(rem_float_fast,2) return math_builder(ctx, true)().CreateFRem(FP(x), FP(y));

HANDLE(checked_sadd,2)
HANDLE(checked_uadd,2)
Expand Down Expand Up @@ -1262,6 +1294,8 @@ extern "C" void jl_init_intrinsic_functions(void)
ADD_I(smod_int);
ADD_I(neg_float); ADD_I(add_float); ADD_I(sub_float); ADD_I(mul_float);
ADD_I(div_float); ADD_I(rem_float);
ADD_I(neg_float_fast); ADD_I(add_float_fast); ADD_I(sub_float_fast);
ADD_I(mul_float_fast); ADD_I(div_float_fast); ADD_I(rem_float_fast);
ADD_I(eq_int); ADD_I(ne_int);
ADD_I(slt_int); ADD_I(ult_int);
ADD_I(sle_int); ADD_I(ule_int);
Expand Down
1 change: 1 addition & 0 deletions src/jltypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -3279,6 +3279,7 @@ void jl_init_types(void)
kw_sym = jl_symbol("kw");
dot_sym = jl_symbol(".");
boundscheck_sym = jl_symbol("boundscheck");
fastmath_sym = jl_symbol("fastmath");
newvar_sym = jl_symbol("newvar");
copyast_sym = jl_symbol("copyast");
simdloop_sym = jl_symbol("simdloop");
Expand Down
Loading