Skip to content

Commit

Permalink
runtime, cmd/internal/obj/arm: improve arm function prologue
Browse files Browse the repository at this point in the history
When stack growth is not needed, as it usually is not,
execute only a single conditional branch
rather than three conditional instructions.
This adds 4 bytes to every function,
but might speed up execution in the common case.

Sample disassembly for

func f() {
	_ = [128]byte{}
}

Before:

TEXT main.f(SB) x.go
	x.go:3	0x2000	e59a1008	MOVW 0x8(R10), R1
	x.go:3	0x2004	e59fb028	MOVW 0x28(R15), R11
	x.go:3	0x2008	e08d200b	ADD R11, R13, R2
	x.go:3	0x200c	e1520001	CMP R1, R2
	x.go:3	0x2010	91a0300e	MOVW.LS R14, R3
	x.go:3	0x2014	9b0118a9	BL.LS runtime.morestack_noctxt(SB)
	x.go:3	0x2018	9afffff8	B.LS main.f(SB)
	x.go:3	0x201c	e52de084	MOVW.W R14, -0x84(R13)
	x.go:4	0x2020	e28d1004	ADD $4, R13, R1
	x.go:4	0x2024	e3a00000	MOVW $0, R0
	x.go:4	0x2028	eb012255	BL 0x4a984
	x.go:5	0x202c	e49df084	RET #132
	x.go:5	0x2030	eafffffe	B 0x2030
	x.go:5	0x2034	ffffff7c	?

After:

TEXT main.f(SB) x.go
	x.go:3	0x2000	e59a1008	MOVW 0x8(R10), R1
	x.go:3	0x2004	e59fb02c	MOVW 0x2c(R15), R11
	x.go:3	0x2008	e08d200b	ADD R11, R13, R2
	x.go:3	0x200c	e1520001	CMP R1, R2
	x.go:3	0x2010	9a000004	B.LS 0x2028
	x.go:3	0x2014	e52de084	MOVW.W R14, -0x84(R13)
	x.go:4	0x2018	e28d1004	ADD $4, R13, R1
	x.go:4	0x201c	e3a00000	MOVW $0, R0
	x.go:4	0x2020	eb0124dc	BL 0x4b398
	x.go:5	0x2024	e49df084	RET #132
	x.go:5	0x2028	e1a0300e	MOVW R14, R3
	x.go:5	0x202c	eb011b0d	BL runtime.morestack_noctxt(SB)
	x.go:5	0x2030	eafffff2	B main.f(SB)
	x.go:5	0x2034	eafffffe	B 0x2034
	x.go:5	0x2038	ffffff7c	?

Updates #10587.

package sort benchmarks on an iPhone 6:

name            old time/op  new time/op  delta
SortString1K     569µs ± 0%   565µs ± 1%  -0.75%  (p=0.000 n=23+24)
StableString1K   872µs ± 1%   870µs ± 1%  -0.16%  (p=0.009 n=23+24)
SortInt1K        317µs ± 2%   316µs ± 2%    ~     (p=0.410 n=26+26)
StableInt1K      343µs ± 1%   339µs ± 1%  -1.07%  (p=0.000 n=22+23)
SortInt64K      30.0ms ± 1%  30.0ms ± 1%    ~     (p=0.091 n=25+24)
StableInt64K    30.2ms ± 0%  30.0ms ± 0%  -0.69%  (p=0.000 n=22+22)
Sort1e2          147µs ± 1%   146µs ± 0%  -0.48%  (p=0.000 n=25+24)
Stable1e2        290µs ± 1%   286µs ± 1%  -1.30%  (p=0.000 n=23+24)
Sort1e4         29.5ms ± 2%  29.7ms ± 1%  +0.71%  (p=0.000 n=23+23)
Stable1e4       88.7ms ± 4%  88.6ms ± 8%  -0.07%  (p=0.022 n=26+26)
Sort1e6          4.81s ± 7%   4.83s ± 7%    ~     (p=0.192 n=26+26)
Stable1e6        18.3s ± 1%   18.1s ± 1%  -0.76%  (p=0.000 n=25+23)
SearchWrappers   318ns ± 1%   344ns ± 1%  +8.14%  (p=0.000 n=23+26)

package sort benchmarks on a first generation rpi:

name            old time/op  new time/op  delta
SearchWrappers  4.13µs ± 0%  3.95µs ± 0%   -4.42%  (p=0.000 n=15+13)
SortString1K    5.81ms ± 1%  5.82ms ± 2%     ~     (p=0.400 n=14+15)
StableString1K  9.69ms ± 1%  9.73ms ± 0%     ~     (p=0.121 n=15+11)
SortInt1K       3.30ms ± 2%  3.66ms ±19%  +10.82%  (p=0.000 n=15+14)
StableInt1K     5.97ms ±15%  4.17ms ± 8%  -30.05%  (p=0.000 n=15+15)
SortInt64K       319ms ± 1%   295ms ± 1%   -7.65%  (p=0.000 n=15+15)
StableInt64K     343ms ± 0%   332ms ± 0%   -3.26%  (p=0.000 n=12+13)
Sort1e2         3.36ms ± 2%  3.22ms ± 4%   -4.10%  (p=0.000 n=15+15)
Stable1e2       6.74ms ± 1%  6.43ms ± 2%   -4.67%  (p=0.000 n=15+15)
Sort1e4          247ms ± 1%   247ms ± 1%     ~     (p=0.331 n=15+14)
Stable1e4        864ms ± 0%   820ms ± 0%   -5.15%  (p=0.000 n=14+15)
Sort1e6          41.2s ± 0%   41.2s ± 0%   +0.15%  (p=0.000 n=13+14)
Stable1e6         192s ± 0%    182s ± 0%   -5.07%  (p=0.000 n=14+14)

Change-Id: I8a9db77e1d4ea1956575895893bc9d04bd81204b
Reviewed-on: https://go-review.googlesource.com/10497
Reviewed-by: Russ Cox <[email protected]>
  • Loading branch information
josharian committed Jun 4, 2015
1 parent f4b48de commit 5353cde
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 29 deletions.
61 changes: 34 additions & 27 deletions src/cmd/internal/obj/arm/obj5.go
Original file line number Diff line number Diff line change
Expand Up @@ -795,38 +795,45 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
p.Scond = C_SCOND_NE
}

// MOVW.LS R14, R3
p = obj.Appendp(ctxt, p)

p.As = AMOVW
p.Scond = C_SCOND_LS
p.From.Type = obj.TYPE_REG
p.From.Reg = REGLINK
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R3

// BL.LS runtime.morestack(SB) // modifies LR, returns with LO still asserted
p = obj.Appendp(ctxt, p)
// BLS call-to-morestack
bls := obj.Appendp(ctxt, p)
bls.As = ABLS
bls.To.Type = obj.TYPE_BRANCH

p.As = ABL
p.Scond = C_SCOND_LS
p.To.Type = obj.TYPE_BRANCH
if ctxt.Cursym.Cfunc != 0 {
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestackc", 0)
} else if ctxt.Cursym.Text.From3.Offset&obj.NEEDCTXT == 0 {
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestack_noctxt", 0)
} else {
p.To.Sym = obj.Linklookup(ctxt, "runtime.morestack", 0)
var last *obj.Prog
for last = ctxt.Cursym.Text; last.Link != nil; last = last.Link {
}

// BLS start
p = obj.Appendp(ctxt, p)
// MOVW LR, R3
movw := obj.Appendp(ctxt, last)
movw.As = AMOVW
movw.From.Type = obj.TYPE_REG
movw.From.Reg = REGLINK
movw.To.Type = obj.TYPE_REG
movw.To.Reg = REG_R3

bls.Pcond = movw

// BL runtime.morestack
call := obj.Appendp(ctxt, movw)
call.As = obj.ACALL
call.To.Type = obj.TYPE_BRANCH
morestack := "runtime.morestack"
switch {
case ctxt.Cursym.Cfunc != 0:
morestack = "runtime.morestackc"
case ctxt.Cursym.Text.From3.Offset&obj.NEEDCTXT == 0:
morestack = "runtime.morestack_noctxt"
}
call.To.Sym = obj.Linklookup(ctxt, morestack, 0)

p.As = ABLS
p.To.Type = obj.TYPE_BRANCH
p.Pcond = ctxt.Cursym.Text.Link
// B start
b := obj.Appendp(ctxt, call)
b.As = obj.AJMP
b.To.Type = obj.TYPE_BRANCH
b.Pcond = ctxt.Cursym.Text.Link

return p
return bls
}

func initdiv(ctxt *obj.Link) {
Expand Down
1 change: 0 additions & 1 deletion src/runtime/asm_arm.s
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,6 @@ noswitch:

// Called during function prolog when more stack is needed.
// R1 frame size
// R2 arg size
// R3 prolog's LR
// NB. we do not save R0 because we've forced 5c to pass all arguments
// on the stack.
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/sys_arm.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func rewindmorestack(buf *gobuf) {
var inst uint32
if buf.pc&3 == 0 && buf.pc != 0 {
inst = *(*uint32)(unsafe.Pointer(buf.pc))
if inst>>24 == 0x9a {
if inst>>24 == 0x9a || inst>>24 == 0xea {
buf.pc += uintptr(int32(inst<<8)>>6) + 8
return
}
Expand Down

0 comments on commit 5353cde

Please sign in to comment.