From 5a1f97198c30914249fcbcf984dd92440b267066 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 22 Feb 2017 15:00:06 -0500 Subject: [PATCH 1/4] update DomainError showerror handling to ignoring inlining --- base/replutil.jl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/base/replutil.jl b/base/replutil.jl index 4f60d3140e15e..dd771314b02e1 100644 --- a/base/replutil.jl +++ b/base/replutil.jl @@ -219,28 +219,29 @@ showerror(io::IO, ex::InitError) = showerror(io, ex, []) function showerror(io::IO, ex::DomainError, bt; backtrace=true) print(io, "DomainError:") for b in bt - code = StackTraces.lookup(b)[1] - if !code.from_c - if code.func == :nan_dom_err + for code in StackTraces.lookup(b) + if code.from_c + continue + elseif code.func === :nan_dom_err continue elseif code.func in (:log, :log2, :log10, :sqrt) print(io, "\n$(code.func) will only return a complex result if called ", "with a complex argument. Try $(string(code.func))(complex(x)).") - elseif (code.func == :^ && code.file == Symbol("intfuncs.jl")) || - code.func == :power_by_squaring #3024 + elseif (code.func === :^ && + (code.file === Symbol("intfuncs.jl") || code.file === Symbol(joinpath(".", "intfuncs.jl")))) || + code.func === :power_by_squaring #3024 print(io, "\nCannot raise an integer x to a negative power -n. ", "\nMake x a float by adding a zero decimal (e.g. 2.0^-n instead ", "of 2^-n), or write 1/x^n, float(x)^-n, or (x//1)^-n.") - elseif code.func == :^ && - (code.file == Symbol("promotion.jl") || code.file == Symbol("math.jl") || - code.file == Symbol(joinpath(".","promotion.jl")) || - code.file == Symbol(joinpath(".","math.jl"))) + elseif code.func === :^ && + (code.file === Symbol("math.jl") || code.file === Symbol(joinpath(".", "math.jl"))) print(io, "\nExponentiation yielding a complex result requires a complex ", "argument.\nReplace x^y with (x+0im)^y, Complex(x)^y, or similar.") end - break + @goto showbacktrace end end + @label showbacktrace backtrace && show_backtrace(io, bt) nothing end From 1c5d7339e72b64ee9289c3645eb651cc611c18b1 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 22 Feb 2017 14:58:46 -0500 Subject: [PATCH 2/4] only use accurate powf function The powi intrinsic optimization over calling powf is that it is inaccurate. We don't need that. When it is equally accurate (e.g. tiny constant powers), LLVM will already recognize and optimize any call to a function named `powf`, and produce the same speedup. 
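(Editorial sketch, not part of the patch itself: the accuracy problem motivating this change can be seen directly from Julia, mirroring the new test added to test/math.jl further down in this patch; the exact numeric bounds assume IEEE Float64 semantics.)

    # Sketch of issue #19872: with the old powi_llvm lowering, a large negative
    # integer power could lose the subnormal result, while the plain pow call
    # used after this change preserves it.
    x = 2.0
    y = x ^ -1024           # dispatches through the Float64 pow path after this patch
    @assert issubnormal(y)  # 2.0^-1024 ≈ 5.6e-309, a subnormal rather than 0.0
    @assert 0 < y < 1e-300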
fix #19872 --- base/fastmath.jl | 6 +++--- base/inference.jl | 1 - base/math.jl | 23 +++++++++++------------ src/codegen.cpp | 23 ----------------------- src/intrinsics.cpp | 28 ---------------------------- src/intrinsics.h | 1 - src/julia_internal.h | 1 - src/runtime_intrinsics.c | 25 ------------------------- test/math.jl | 12 ++++++++++++ 9 files changed, 26 insertions(+), 94 deletions(-) diff --git a/base/fastmath.jl b/base/fastmath.jl index f99e44b0f50e1..33c6b9884a7ac 100644 --- a/base/fastmath.jl +++ b/base/fastmath.jl @@ -23,7 +23,7 @@ module FastMath export @fastmath -import Core.Intrinsics: powi_llvm, sqrt_llvm_fast, neg_float_fast, +import Core.Intrinsics: sqrt_llvm_fast, neg_float_fast, add_float_fast, sub_float_fast, mul_float_fast, div_float_fast, rem_float_fast, eq_float_fast, ne_float_fast, lt_float_fast, le_float_fast @@ -243,8 +243,8 @@ end # builtins -pow_fast(x::FloatTypes, y::Integer) = pow_fast(x, Int32(y)) -pow_fast(x::FloatTypes, y::Int32) = Base.powi_llvm(x, y) +pow_fast(x::Float32, y::Integer) = ccall("llvm.powi.f32", llvmcall, Float32, (Float32, Int32), x, y) +pow_fast(x::Float64, y::Integer) = ccall("llvm.powi.f64", llvmcall, Float64, (Float64, Int32), x, y) # TODO: Change sqrt_llvm intrinsic to avoid nan checking; add nan # checking to sqrt in math.jl; remove sqrt_llvm_fast intrinsic diff --git a/base/inference.jl b/base/inference.jl index eb52cbde5f204..83a4a5d7f2998 100644 --- a/base/inference.jl +++ b/base/inference.jl @@ -468,7 +468,6 @@ add_tfunc(floor_llvm, 1, 1, math_tfunc) add_tfunc(trunc_llvm, 1, 1, math_tfunc) add_tfunc(rint_llvm, 1, 1, math_tfunc) add_tfunc(sqrt_llvm, 1, 1, math_tfunc) -add_tfunc(powi_llvm, 2, 2, math_tfunc) add_tfunc(sqrt_llvm_fast, 1, 1, math_tfunc) ## same-type comparisons ## cmp_tfunc(x::ANY, y::ANY) = Bool diff --git a/base/math.jl b/base/math.jl index 9fae027458b00..c75740809c400 100644 --- a/base/math.jl +++ b/base/math.jl @@ -24,7 +24,7 @@ using Base: sign_mask, exponent_mask, exponent_one, exponent_bias, exponent_half, exponent_max, exponent_raw_max, fpinttype, significand_mask, significand_bits, exponent_bits -using Core.Intrinsics: sqrt_llvm, powi_llvm +using Core.Intrinsics: sqrt_llvm const IEEEFloat = Union{Float16,Float32,Float64} # non-type specific math functions @@ -286,6 +286,8 @@ exp10(x::Float32) = 10.0f0^x exp10(x::Integer) = exp10(float(x)) # utility for converting NaN return to DomainError +# the branch in nan_dom_err prevents its callers from inlining, so be sure to force it +# until the heuristics can be improved @inline nan_dom_err(f, x) = isnan(f) & !isnan(x) ? 
throw(DomainError()) : f # functions that return NaN on non-NaN argument for domain error @@ -403,9 +405,9 @@ log1p(x) for f in (:sin, :cos, :tan, :asin, :acos, :acosh, :atanh, :log, :log2, :log10, :lgamma, :log1p) @eval begin - ($f)(x::Float64) = nan_dom_err(ccall(($(string(f)),libm), Float64, (Float64,), x), x) - ($f)(x::Float32) = nan_dom_err(ccall(($(string(f,"f")),libm), Float32, (Float32,), x), x) - ($f)(x::Real) = ($f)(float(x)) + @inline ($f)(x::Float64) = nan_dom_err(ccall(($(string(f)), libm), Float64, (Float64,), x), x) + @inline ($f)(x::Float32) = nan_dom_err(ccall(($(string(f, "f")), libm), Float32, (Float32,), x), x) + @inline ($f)(x::Real) = ($f)(float(x)) end end @@ -683,14 +685,11 @@ function modf(x::Float64) f, _modf_temp[] end -^(x::Float64, y::Float64) = nan_dom_err(ccall((:pow,libm), Float64, (Float64,Float64), x, y), x+y) -^(x::Float32, y::Float32) = nan_dom_err(ccall((:powf,libm), Float32, (Float32,Float32), x, y), x+y) - -^(x::Float64, y::Integer) = x^Int32(y) -^(x::Float64, y::Int32) = powi_llvm(x, y) -^(x::Float32, y::Integer) = x^Int32(y) -^(x::Float32, y::Int32) = powi_llvm(x, y) -^(x::Float16, y::Integer) = Float16(Float32(x)^y) +@inline ^(x::Float64, y::Float64) = nan_dom_err(ccall("llvm.pow.f64", llvmcall, Float64, (Float64, Float64), x, y), x + y) +@inline ^(x::Float32, y::Float32) = nan_dom_err(ccall("llvm.pow.f32", llvmcall, Float32, (Float32, Float32), x, y), x + y) +@inline ^(x::Float64, y::Integer) = x ^ Float64(y) +@inline ^(x::Float32, y::Integer) = x ^ Float32(y) +@inline ^(x::Float16, y::Integer) = Float16(Float32(x) ^ Float32(y)) ^{p}(x::Float16, ::Type{Val{p}}) = Float16(Float32(x)^Val{p}) function angle_restrict_symm(theta) diff --git a/src/codegen.cpp b/src/codegen.cpp index 26c67da603b17..4d4ec294e5f0e 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -397,10 +397,6 @@ static Function *jldlsym_func; static Function *jlnewbits_func; static Function *jltypeassert_func; static Function *jldepwarnpi_func; -#if JL_LLVM_VERSION < 30600 -static Function *jlpow_func; -static Function *jlpowf_func; -#endif //static Function *jlgetnthfield_func; static Function *jlgetnthfieldchecked_func; //static Function *jlsetnthfield_func; @@ -6857,25 +6853,6 @@ static void init_julia_llvm_env(Module *m) "jl_gc_diff_total_bytes", m); add_named_global(diff_gc_total_bytes_func, *jl_gc_diff_total_bytes); -#if JL_LLVM_VERSION < 30600 - Type *powf_type[2] = { T_float32, T_float32 }; - jlpowf_func = Function::Create(FunctionType::get(T_float32, powf_type, false), - Function::ExternalLinkage, - "powf", m); - add_named_global(jlpowf_func, &powf, false); - - Type *pow_type[2] = { T_float64, T_float64 }; - jlpow_func = Function::Create(FunctionType::get(T_float64, pow_type, false), - Function::ExternalLinkage, - "pow", m); - add_named_global(jlpow_func, -#ifdef _COMPILER_MICROSOFT_ - static_cast(&pow), -#else - &pow, -#endif - false); -#endif std::vector array_owner_args(0); array_owner_args.push_back(T_pjlvalue); jlarray_data_owner_func = diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index 78e4eca282bcb..060422a10a222 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -71,7 +71,6 @@ static void jl_init_intrinsic_functions_codegen(Module *m) float_func[rint_llvm] = true; float_func[sqrt_llvm] = true; float_func[sqrt_llvm_fast] = true; - float_func[powi_llvm] = true; } extern "C" @@ -851,33 +850,6 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs, return mark_julia_type(ans, false, x.typ, ctx); } - case powi_llvm: { - const 
jl_cgval_t &x = argv[0]; - const jl_cgval_t &y = argv[1]; - if (!jl_is_primitivetype(x.typ) || !jl_is_primitivetype(y.typ) || jl_datatype_size(y.typ) != 4) - return emit_runtime_call(f, argv, nargs, ctx); - Type *xt = FLOATT(bitstype_to_llvm(x.typ)); - Type *yt = T_int32; - if (!xt) - return emit_runtime_call(f, argv, nargs, ctx); - - Value *xv = emit_unbox(xt, x, x.typ); - Value *yv = emit_unbox(yt, y, y.typ); -#if JL_LLVM_VERSION >= 30600 - Value *powi = Intrinsic::getDeclaration(jl_Module, Intrinsic::powi, makeArrayRef(xt)); -#if JL_LLVM_VERSION >= 30700 - Value *ans = builder.CreateCall(powi, {xv, yv}); -#else - Value *ans = builder.CreateCall2(powi, xv, yv); -#endif -#else - // issue #6506 - Value *ans = builder.CreateCall2(prepare_call(xt == T_float64 ? jlpow_func : jlpowf_func), - xv, builder.CreateSIToFP(yv, xt)); -#endif - return mark_julia_type(ans, false, x.typ, ctx); - } - default: { assert(nargs >= 1 && "invalid nargs for intrinsic call"); const jl_cgval_t &xinfo = argv[0]; diff --git a/src/intrinsics.h b/src/intrinsics.h index 058ccc68faaa8..15ea45dc2d73d 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -91,7 +91,6 @@ ADD_I(trunc_llvm, 1) \ ADD_I(rint_llvm, 1) \ ADD_I(sqrt_llvm, 1) \ - ADD_I(powi_llvm, 2) \ ALIAS(sqrt_llvm_fast, sqrt_llvm) \ /* pointer access */ \ ADD_I(pointerref, 3) \ diff --git a/src/julia_internal.h b/src/julia_internal.h index fc0ede8097841..135d6c5655021 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -680,7 +680,6 @@ JL_DLLEXPORT jl_value_t *jl_floor_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_trunc_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_rint_llvm(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_sqrt_llvm(jl_value_t *a); -JL_DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_abs_float(jl_value_t *a); JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b); JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b); diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index d107a3a898294..f391588c12d6b 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -925,31 +925,6 @@ un_fintrinsic(trunc_float,trunc_llvm) un_fintrinsic(rint_float,rint_llvm) un_fintrinsic(sqrt_float,sqrt_llvm) -JL_DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b) -{ - jl_ptls_t ptls = jl_get_ptls_states(); - jl_value_t *ty = jl_typeof(a); - if (!jl_is_primitivetype(ty)) - jl_error("powi_llvm: a is not a primitive type"); - if (!jl_is_primitivetype(jl_typeof(b)) || jl_datatype_size(jl_typeof(b)) != 4) - jl_error("powi_llvm: b is not a 32-bit primitive type"); - int sz = jl_datatype_size(ty); - jl_value_t *newv = jl_gc_alloc(ptls, sz, ty); - void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv); - switch (sz) { - /* choose the right size c-type operation */ - case 4: - *(float*)pr = powf(*(float*)pa, (float)jl_unbox_int32(b)); - break; - case 8: - *(double*)pr = pow(*(double*)pa, (double)jl_unbox_int32(b)); - break; - default: - jl_error("powi_llvm: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); - } - return newv; -} - JL_DLLEXPORT jl_value_t *jl_select_value(jl_value_t *isfalse, jl_value_t *a, jl_value_t *b) { JL_TYPECHK(isfalse, bool, isfalse); diff --git a/test/math.jl b/test/math.jl index 9c43e610a9bc9..ca5b2912cfc94 100644 --- a/test/math.jl +++ b/test/math.jl @@ -590,6 +590,18 @@ end end end +@testset "issue #19872" begin + f19872a(x) = x ^ 5 + f19872b(x) = x ^ (-1024) + @test 0 < 
f19872b(2.0) < 1e-300 + @test issubnormal(2.0 ^ (-1024)) + @test issubnormal(f19872b(2.0)) + @test !issubnormal(f19872b(0.0)) + @test f19872a(2.0) === 32.0 + @test !issubnormal(f19872a(2.0)) + @test !issubnormal(0.0) +end + # no domain error is thrown for negative values @test invoke(cbrt, Tuple{AbstractFloat}, -1.0) == -1.0 From cc7990b1936440e1b4bbe33895aa4dcdd6e2a754 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 23 Feb 2017 13:10:52 -0500 Subject: [PATCH 3/4] make several pure-marked math functions actually pure --- base/float.jl | 8 -------- base/math.jl | 21 +++++++++++++++------ base/special/exp.jl | 8 ++++---- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/base/float.jl b/base/float.jl index f3b7abdfa5c4c..612e9360c08a7 100644 --- a/base/float.jl +++ b/base/float.jl @@ -812,14 +812,6 @@ fpinttype(::Type{Float64}) = UInt64 fpinttype(::Type{Float32}) = UInt32 fpinttype(::Type{Float16}) = UInt16 -@pure significand_bits{T<:AbstractFloat}(::Type{T}) = trailing_ones(significand_mask(T)) -@pure exponent_bits{T<:AbstractFloat}(::Type{T}) = sizeof(T)*8 - significand_bits(T) - 1 -@pure exponent_bias{T<:AbstractFloat}(::Type{T}) = Int(exponent_one(T) >> significand_bits(T)) -# maximum float exponent -@pure exponent_max{T<:AbstractFloat}(::Type{T}) = Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T) -# maximum float exponent without bias -@pure exponent_raw_max{T<:AbstractFloat}(::Type{T}) = Int(exponent_mask(T) >> significand_bits(T)) - ## TwicePrecision utilities # The numeric constants are half the number of bits in the mantissa for (F, T, n) in ((Float16, UInt16, 5), (Float32, UInt32, 12), (Float64, UInt64, 26)) diff --git a/base/math.jl b/base/math.jl index c75740809c400..d4db49a661030 100644 --- a/base/math.jl +++ b/base/math.jl @@ -20,13 +20,23 @@ import Base: log, exp, sin, cos, tan, sinh, cosh, tanh, asin, max, min, minmax, ^, exp2, muladd, rem, exp10, expm1, log1p -using Base: sign_mask, exponent_mask, exponent_one, exponent_bias, - exponent_half, exponent_max, exponent_raw_max, fpinttype, - significand_mask, significand_bits, exponent_bits +using Base: sign_mask, exponent_mask, exponent_one, + exponent_half, fpinttype, significand_mask using Core.Intrinsics: sqrt_llvm -const IEEEFloat = Union{Float16,Float32,Float64} +const IEEEFloat = Union{Float16, Float32, Float64} + +for T in (Float16, Float32, Float64) + @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T))) + @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1) + @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T))) + # maximum float exponent + @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T)) + # maximum float exponent without bias + @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T))) +end + # non-type specific math functions """ @@ -229,8 +239,6 @@ for f in (:cbrt, :sinh, :cosh, :tanh, :atan, :asinh, :exp2, :expm1) ($f)(x::Real) = ($f)(float(x)) end end -# pure julia exp function -include("special/exp.jl") exp(x::Real) = exp(float(x)) # fallback definitions to prevent infinite loop from $f(x::Real) def above @@ -950,6 +958,7 @@ end cbrt(a::Float16) = Float16(cbrt(Float32(a))) # More special functions +include("special/exp.jl") include("special/trig.jl") include("special/gamma.jl") diff --git a/base/special/exp.jl b/base/special/exp.jl index 9d7012a98034f..7f21519c79619 100644 --- a/base/special/exp.jl +++ b/base/special/exp.jl @@ -56,8 
+56,8 @@ MINEXP(::Type{Float32}) = -103.97207708f0 # log 2^-150 @inline exp_kernel(x::Float32) = @horner(x, 1.6666625440f-1, -2.7667332906f-3) # for values smaller than this threshold just use a Taylor expansion -exp_small_thres(::Type{Float64}) = 2.0^-28 -exp_small_thres(::Type{Float32}) = 2.0f0^-13 +@eval exp_small_thres(::Type{Float64}) = $(2.0^-28) +@eval exp_small_thres(::Type{Float32}) = $(2.0f0^-13) """ exp(x) @@ -114,13 +114,13 @@ function exp{T<:Union{Float32,Float64}}(x::T) # scale back if k > -significand_bits(T) # multiply by 2.0 first to prevent overflow, which helps extends the range - k == exponent_max(T) && return y*T(2.0)*T(2.0)^(exponent_max(T) - 1) + k == exponent_max(T) && return y * T(2.0) * T(2.0)^(exponent_max(T) - 1) twopk = reinterpret(T, rem(exponent_bias(T) + k, fpinttype(T)) << significand_bits(T)) return y*twopk else # add significand_bits(T) + 1 to lift the range outside the subnormals twopk = reinterpret(T, rem(exponent_bias(T) + significand_bits(T) + 1 + k, fpinttype(T)) << significand_bits(T)) - return y*twopk*T(2.0)^(-significand_bits(T) - 1) + return y * twopk * T(2.0)^(-significand_bits(T) - 1) end elseif xa < reinterpret(Unsigned, exp_small_thres(T)) # |x| < exp_small_thres # Taylor approximation for small x From 0c5eac2bc5603b82c9ac114bdc8999831cba2f6f Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 23 Feb 2017 14:10:23 -0500 Subject: [PATCH 4/4] make sure that the indirection through the `Val{p}` type doesn't stop inlining of `^` --- base/intfuncs.jl | 17 ++++++++++------- base/math.jl | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/base/intfuncs.jl b/base/intfuncs.jl index d299a8da27944..69c0df7d11c17 100644 --- a/base/intfuncs.jl +++ b/base/intfuncs.jl @@ -199,16 +199,19 @@ end # to enable compile-time optimizations specialized to p. # However, we still need a fallback that calls the general ^. # To avoid ambiguities for methods that dispatch on the -# first argument, we dispatch the fallback via internal_pow: -^(x, p) = internal_pow(x, p) -internal_pow{p}(x, ::Type{Val{p}}) = x^p +# first argument, we dispatch the fallback via internal_pow. +# We mark these @inline since if the target is marked @inline, +# we want to make sure that gets propagated, +# even if it is over the inlining threshold. +@inline ^(x, p) = internal_pow(x, p) +@inline internal_pow{p}(x, ::Type{Val{p}}) = x^p # inference.jl has complicated logic to inline x^2 and x^3 for # numeric types. In terms of Val we can do it much more simply: -internal_pow(x::Number, ::Type{Val{0}}) = one(x) -internal_pow(x::Number, ::Type{Val{1}}) = x -internal_pow(x::Number, ::Type{Val{2}}) = x*x -internal_pow(x::Number, ::Type{Val{3}}) = x*x*x +@inline internal_pow(x::Number, ::Type{Val{0}}) = one(x) +@inline internal_pow(x::Number, ::Type{Val{1}}) = x +@inline internal_pow(x::Number, ::Type{Val{2}}) = x*x +@inline internal_pow(x::Number, ::Type{Val{3}}) = x*x*x # b^p mod m diff --git a/base/math.jl b/base/math.jl index d4db49a661030..944514da5e736 100644 --- a/base/math.jl +++ b/base/math.jl @@ -698,7 +698,7 @@ end @inline ^(x::Float64, y::Integer) = x ^ Float64(y) @inline ^(x::Float32, y::Integer) = x ^ Float32(y) @inline ^(x::Float16, y::Integer) = Float16(Float32(x) ^ Float32(y)) -^{p}(x::Float16, ::Type{Val{p}}) = Float16(Float32(x)^Val{p}) +@inline ^{p}(x::Float16, ::Type{Val{p}}) = Float16(Float32(x) ^ Val{p}) function angle_restrict_symm(theta) const P1 = 4 * 7.8539812564849853515625e-01