Skip to content

Commit

Permalink
cmd/compile: enable carry chain scheduling for arm64
Browse files Browse the repository at this point in the history
This is a follow up of CL 393656 on arm64.

This CL puts ScoreCarryChainTail before ScoreMemory and after
ScoreReadFlags, so that the scheduling of the carry chain will not
break the scheduling of ScoreVarDef.

Benchmarks:
name                                  old time/op    new time/op    delta
ScalarMult/P256-8                       42.0µs ± 0%    42.0µs ± 0%   -0.13%  (p=0.032 n=5+5)
ScalarMult/P224-8                        135µs ± 0%      96µs ± 0%  -29.04%  (p=0.008 n=5+5)
ScalarMult/P384-8                        573µs ± 1%     355µs ± 0%  -38.05%  (p=0.008 n=5+5)
ScalarMult/P521-8                       1.50ms ± 4%    0.77ms ± 0%  -48.78%  (p=0.008 n=5+5)
MarshalUnmarshal/P256/Uncompressed-8     505ns ± 1%     506ns ± 0%     ~     (p=0.460 n=5+5)
MarshalUnmarshal/P256/Compressed-8      6.75µs ± 0%    6.73µs ± 0%   -0.27%  (p=0.016 n=5+5)
MarshalUnmarshal/P224/Uncompressed-8     927ns ± 0%     818ns ± 0%  -11.76%  (p=0.008 n=5+5)
MarshalUnmarshal/P224/Compressed-8       136µs ± 0%      96µs ± 0%  -29.58%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Uncompressed-8    1.77µs ± 0%    1.36µs ± 1%  -23.14%  (p=0.008 n=5+5)
MarshalUnmarshal/P384/Compressed-8      56.5µs ± 0%    31.9µs ± 0%  -43.59%  (p=0.016 n=5+4)
MarshalUnmarshal/P521/Uncompressed-8    2.91µs ± 0%    2.03µs ± 1%  -30.32%  (p=0.008 n=5+5)
MarshalUnmarshal/P521/Compressed-8       148µs ± 0%      68µs ± 1%  -54.28%  (p=0.008 n=5+5)

Change-Id: I4bf4e3265d7e1ee85765ff2bf006ca5a794d4979
Reviewed-on: https://go-review.googlesource.com/c/go/+/432275
Reviewed-by: Carlos Amedee <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
Run-TryBot: Eric Fang <[email protected]>
  • Loading branch information
eric fang committed Oct 8, 2022
1 parent 40c7e94 commit 669ec54
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 29 deletions.
89 changes: 60 additions & 29 deletions src/cmd/compile/internal/ssa/schedule.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ const (
ScorePhi = iota // towards top of block
ScoreArg
ScoreNilCheck
ScoreCarryChainTail
ScoreReadTuple
ScoreVarDef
ScoreMemory
ScoreCarryChainTail
ScoreReadFlags
ScoreDefault
ScoreFlags
Expand Down Expand Up @@ -155,16 +155,16 @@ func schedule(f *Func) {
// VARDEF ops are scheduled before the corresponding LEA.
score[v.ID] = ScoreMemory
case v.Op == OpSelect0 || v.Op == OpSelect1 || v.Op == OpSelectN:
if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].Op.isCarry() || v.Type.IsFlags()) {
if (v.Op == OpSelect1 || v.Op == OpSelect0) && (v.Args[0].isCarry() || v.Type.IsFlags()) {
// When the Select pseudo op is being used for a carry or flag from
// a tuple then score it as ScoreFlags so it happens later. This
// prevents the bit from being clobbered before it is used.
score[v.ID] = ScoreFlags
} else {
score[v.ID] = ScoreReadTuple
}
case v.Op.isCarry():
if w := v.getCarryProducer(); w != nil {
case v.isCarry():
if w := v.getCarryInput(); w != nil && w.Block == b {
// The producing op is not the final user of the carry bit. Its
// current score is one of unscored, Flags, or CarryChainTail.
// These occur if the producer has not been scored, another user
Expand All @@ -183,7 +183,7 @@ func schedule(f *Func) {
// one chain to be scheduled, if possible.
score[v.ID] = ScoreCarryChainTail
}
case v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags():
case v.isFlagOp():
// Schedule flag register generation as late as possible.
// This makes sure that we only have one live flags
// value at a time.
Expand All @@ -192,7 +192,7 @@ func schedule(f *Func) {
score[v.ID] = ScoreDefault
// If we're reading flags, schedule earlier to keep flag lifetime short.
for _, a := range v.Args {
if a.Type.IsFlags() {
if a.isFlagOp() {
score[v.ID] = ScoreReadFlags
}
}
Expand Down Expand Up @@ -263,7 +263,6 @@ func schedule(f *Func) {
}
}
}

}

// To put things into a priority queue
Expand All @@ -287,7 +286,7 @@ func schedule(f *Func) {

v := heap.Pop(priq).(*Value)

if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.Op.isCarry() {
if f.pass.debug > 1 && score[v.ID] == ScoreCarryChainTail && v.isCarry() {
// Add some debugging noise if the chain of carrying ops will not
// likely be scheduled without potential carry flag clobbers.
if !isCarryChainReady(v, uses) {
Expand Down Expand Up @@ -551,42 +550,74 @@ func storeOrder(values []*Value, sset *sparseSet, storeNumber []int32) []*Value
return order
}

// Return whether all dependent carry ops can be scheduled after this.
// isFlagOp reports if v is an OP with the flag type.
func (v *Value) isFlagOp() bool {
return v.Type.IsFlags() || v.Type.IsTuple() && v.Type.FieldType(1).IsFlags()
}

// isCarryChainReady reports whether all dependent carry ops can be scheduled after this.
func isCarryChainReady(v *Value, uses []int32) bool {
// A chain can be scheduled in it's entirety if
// the use count of each dependent op is 1. If none,
// schedule the first.
j := 1 // The first op uses[k.ID] == 0. Dependent ops are always >= 1.
for k := v; k != nil; k = k.getCarryProducer() {
for k := v; k != nil; k = k.getCarryInput() {
j += int(uses[k.ID]) - 1
}
return j == 0
}

// Return whether op is an operation which produces a carry bit value, but does not consume it.
func (op Op) isCarryCreator() bool {
switch op {
case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
return true
}
return false
// isCarryInput reports whether v accepts a carry value as input.
func (v *Value) isCarryInput() bool {
return v.getCarryInput() != nil
}

// Return whether op consumes or creates a carry a bit value.
func (op Op) isCarry() bool {
switch op {
case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
return true
// isCarryOutput reports whether v generates a carry as output.
func (v *Value) isCarryOutput() bool {
// special cases for PPC64 which put their carry values in XER instead of flags
switch v.Block.Func.Config.arch {
case "ppc64", "ppc64le":
switch v.Op {
case OpPPC64SUBC, OpPPC64ADDC, OpPPC64SUBCconst, OpPPC64ADDCconst:
return true
}
return false
}
return op.isCarryCreator()
return v.isFlagOp() && v.Op != OpSelect1
}

// isCarryCreator reports whether op is an operation which produces a carry bit value,
// but does not consume it.
func (v *Value) isCarryCreator() bool {
return v.isCarryOutput() && !v.isCarryInput()
}

// isCarry reports whether op consumes or creates a carry a bit value.
func (v *Value) isCarry() bool {
return v.isCarryOutput() || v.isCarryInput()
}

// Return the producing *Value of the carry bit of this op, or nil if none.
func (v *Value) getCarryProducer() *Value {
if v.Op.isCarry() && !v.Op.isCarryCreator() {
// PPC64 carry dependencies are conveyed through their final argument.
// Likewise, there is always an OpSelect1 between them.
return v.Args[len(v.Args)-1].Args[0]
// getCarryInput returns the producing *Value of the carry bit of this op, or nil if none.
func (v *Value) getCarryInput() *Value {
// special cases for PPC64 which put their carry values in XER instead of flags
switch v.Block.Func.Config.arch {
case "ppc64", "ppc64le":
switch v.Op {
case OpPPC64SUBE, OpPPC64ADDE, OpPPC64SUBZEzero, OpPPC64ADDZEzero:
// PPC64 carry dependencies are conveyed through their final argument.
// Likewise, there is always an OpSelect1 between them.
return v.Args[len(v.Args)-1].Args[0]
}
return nil
}
for _, a := range v.Args {
if !a.isFlagOp() {
continue
}
if a.Op == OpSelect1 {
a = a.Args[0]
}
return a
}
return nil
}
Expand Down
59 changes: 59 additions & 0 deletions src/cmd/compile/internal/ssa/schedule_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,62 @@ func TestStoreOrder(t *testing.T) {
t.Errorf("store order is wrong: got %v, want v2 v3 v4 after v5", order)
}
}

func TestCarryChainOrder(t *testing.T) {
// In the function below, there are two carry chains that have no dependencies on each other,
// one is A1 -> A1carry -> A1Carryvalue, the other is A2 -> A2carry -> A2Carryvalue. If they
// are not scheduled properly, the carry will be clobbered, causing the carry to be regenerated.
c := testConfigARM64(t)
fun := c.Fun("entry",
Bloc("entry",
Valu("mem0", OpInitMem, types.TypeMem, 0, nil),
Valu("x", OpARM64MOVDconst, c.config.Types.UInt64, 5, nil),
Valu("y", OpARM64MOVDconst, c.config.Types.UInt64, 6, nil),
Valu("z", OpARM64MOVDconst, c.config.Types.UInt64, 7, nil),
Valu("A1", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "x", "z"), // x+z, set flags
Valu("A1carry", OpSelect1, types.TypeFlags, 0, nil, "A1"),
Valu("A2", OpARM64ADDSflags, types.NewTuple(c.config.Types.UInt64, types.TypeFlags), 0, nil, "y", "z"), // y+z, set flags
Valu("A2carry", OpSelect1, types.TypeFlags, 0, nil, "A2"),
Valu("A1value", OpSelect0, c.config.Types.UInt64, 0, nil, "A1"),
Valu("A1Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A1carry"), // 0+0+A1carry
Valu("A2value", OpSelect0, c.config.Types.UInt64, 0, nil, "A2"),
Valu("A2Carryvalue", OpARM64ADCzerocarry, c.config.Types.UInt64, 0, nil, "A2carry"), // 0+0+A2carry
Valu("ValueSum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1value", "A2value"),
Valu("CarrySum", OpARM64ADD, c.config.Types.UInt64, 0, nil, "A1Carryvalue", "A2Carryvalue"),
Valu("Sum", OpARM64AND, c.config.Types.UInt64, 0, nil, "ValueSum", "CarrySum"),
Goto("exit")),
Bloc("exit",
Exit("mem0")),
)

CheckFunc(fun.f)
schedule(fun.f)

// The expected order is A1 < A1carry < A1Carryvalue < A2 < A2carry < A2Carryvalue.
// There is no dependency between the two carry chains, so it doesn't matter which
// comes first and which comes after, but the unsorted position of A1 is before A2,
// so A1Carryvalue < A2.
var ai, bi, ci, di, ei, fi int
for i, v := range fun.f.Blocks[0].Values {
switch {
case fun.values["A1"] == v:
ai = i
case fun.values["A1carry"] == v:
bi = i
case fun.values["A1Carryvalue"] == v:
ci = i
case fun.values["A2"] == v:
di = i
case fun.values["A2carry"] == v:
ei = i
case fun.values["A2Carryvalue"] == v:
fi = i
}
}
if !(ai < bi && bi < ci && ci < di && di < ei && ei < fi) {
t.Logf("Func: %s", fun.f)
t.Errorf("carry chain order is wrong: got %v, want V%d after V%d after V%d after V%d after V%d after V%d,",
fun.f.Blocks[0], fun.values["A1"].ID, fun.values["A1carry"].ID, fun.values["A1Carryvalue"].ID,
fun.values["A2"].ID, fun.values["A2carry"].ID, fun.values["A2Carryvalue"].ID)
}
}

0 comments on commit 669ec54

Please sign in to comment.