Skip to content

Commit

Permalink
cmd/compile: add intrinsic HasCPUFeature for checking cpu features
Browse files Browse the repository at this point in the history
Before using some CPU instructions, we must check for their presence.
We use global variables in the runtime package to record features.

Prior to this CL, we issued a regular memory load for these features.
The downside is that a regular memory load
cannot be hoisted out of loops or otherwise reordered with other loads.

This CL introduces a new intrinsic just for checking cpu features.
It still ends up resulting in a memory load, but that memory load can
now be floated to the entry block and rematerialized as needed.

One downside is that the regular load could be combined with the comparison
into a CMPBconstload+NE. The new intrinsic's load cannot be combined this way; it generates MOVB+TESTB+NE.
(It is possible that MOVBQZX+TESTQ+NE would be better.)

This CL implements the intrinsic only for amd64. It is easy to extend to other architectures.

For the benchmark in #36196, on my machine, this offers a mild speedup.

name      old time/op  new time/op  delta
FMA-8     1.39ns ± 6%  1.29ns ± 9%  -7.19%  (p=0.000 n=97+96)
NonFMA-8  2.03ns ±11%  2.04ns ±12%    ~     (p=0.618 n=99+98)

Updates #15808
Updates #36196

Change-Id: I75e2fcfcf5a6df1bdb80657a7143bed69fca6deb
Reviewed-on: https://go-review.googlesource.com/c/go/+/212360
Run-TryBot: Josh Bleecher Snyder <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Reviewed-by: Giovanni Bajo <[email protected]>
  • Loading branch information
josharian committed Apr 4, 2020
1 parent ed7a833 commit fff7509
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 10 deletions.
6 changes: 6 additions & 0 deletions src/cmd/compile/internal/amd64/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,12 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
gc.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredHasCPUFeature:
p := s.Prog(x86.AMOVB)
p.From.Type = obj.TYPE_MEM
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64LoweredGetClosurePtr:
// Closure pointer is DX.
gc.CheckLoweredGetClosurePtr(v)
Expand Down
9 changes: 3 additions & 6 deletions src/cmd/compile/internal/gc/ssa.go
Original file line number Diff line number Diff line change
Expand Up @@ -3595,8 +3595,7 @@ func init() {
s.vars[n] = s.load(types.Types[TFLOAT64], a)
return s.variable(n, types.Types[TFLOAT64])
}
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasFMA, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasFMA)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
Expand Down Expand Up @@ -3661,8 +3660,7 @@ func init() {

makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasSSE41, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasSSE41)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
Expand Down Expand Up @@ -3869,8 +3867,7 @@ func init() {

makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasPOPCNT, s.sb)
v := s.load(types.Types[TBOOL], addr)
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[TBOOL], x86HasPOPCNT)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
Expand Down
1 change: 1 addition & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64.rules
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,7 @@
(GetClosurePtr ...) -> (LoweredGetClosurePtr ...)
(GetCallerPC ...) -> (LoweredGetCallerPC ...)
(GetCallerSP ...) -> (LoweredGetCallerSP ...)
(HasCPUFeature ...) -> (LoweredHasCPUFeature ...)
(Addr ...) -> (LEAQ ...)
(LocalAddr {sym} base _) -> (LEAQ {sym} base)

Expand Down
2 changes: 2 additions & 0 deletions src/cmd/compile/internal/ssa/gen/AMD64Ops.go
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,8 @@ func init() {
// It saves all GP registers if necessary, but may clobber others.
{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), buildReg("AX CX DX BX BP SI R8 R9")}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},

{name: "LoweredHasCPUFeature", argLength: 0, reg: gp01, rematerializeable: true, typ: "bool", aux: "Sym", symEffect: "None"},

// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
Expand Down
2 changes: 2 additions & 0 deletions src/cmd/compile/internal/ssa/gen/genericOps.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,8 @@ var genericOps = []opData{
// arch-dependent), and is not a safe-point.
{name: "WB", argLength: 3, typ: "Mem", aux: "Sym", symEffect: "None"}, // arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier

{name: "HasCPUFeature", argLength: 0, typ: "bool", aux: "Sym", symEffect: "None"}, // aux=place that this feature flag can be loaded from

// PanicBounds and PanicExtend generate a runtime panic.
// Their arguments provide index values to use in panic messages.
// Both PanicBounds and PanicExtend have an AuxInt value from the BoundsKind type (in ../op.go).
Expand Down
21 changes: 21 additions & 0 deletions src/cmd/compile/internal/ssa/opGen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/cmd/compile/internal/ssa/rewriteAMD64.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 8 additions & 4 deletions test/codegen/mathbits.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,9 @@ func Len8(n uint8) int {
// bits.OnesCount //
// -------------------- //

// amd64:".*x86HasPOPCNT"
func OnesCount(n uint) int {
// amd64:"POPCNTQ",".*x86HasPOPCNT"
// amd64:"POPCNTQ"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
// ppc64:"POPCNTD"
Expand All @@ -120,8 +121,9 @@ func OnesCount(n uint) int {
return bits.OnesCount(n)
}

// amd64:".*x86HasPOPCNT"
func OnesCount64(n uint64) int {
// amd64:"POPCNTQ",".*x86HasPOPCNT"
// amd64:"POPCNTQ"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
// ppc64:"POPCNTD"
Expand All @@ -130,8 +132,9 @@ func OnesCount64(n uint64) int {
return bits.OnesCount64(n)
}

// amd64:".*x86HasPOPCNT"
func OnesCount32(n uint32) int {
// amd64:"POPCNTL",".*x86HasPOPCNT"
// amd64:"POPCNTL"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
// ppc64:"POPCNTW"
Expand All @@ -140,8 +143,9 @@ func OnesCount32(n uint32) int {
return bits.OnesCount32(n)
}

// amd64:".*x86HasPOPCNT"
func OnesCount16(n uint16) int {
// amd64:"POPCNTL",".*x86HasPOPCNT"
// amd64:"POPCNTL"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
// ppc64:"POPCNTW"
Expand Down

0 comments on commit fff7509

Please sign in to comment.