From f38e968abafde345fa470cb14d55b6f092af569f Mon Sep 17 00:00:00 2001 From: Cherry Mui Date: Mon, 8 Dec 2025 12:14:24 -0500 Subject: [PATCH] [dev.simd] cmd/compile: zero only low 128-bit of X15 Zeroing the upper part of X15 may make the CPU think it is "dirty" and slow down SSE operations. For now, just don't zero the upper part, and construct a zero value on the fly if we need a 256- or 512-bit zero value. Maybe VZEROUPPER works better than explicitly zeroing X15, but we need to evaluate. Long term, we probably want to move more things from SSE to AVX. This essentially undoes CL 698237 and CL 698238, except that we keep using X15 for 128-bit zeroing for SIMD. Change-Id: I1564e6332c4c57f9721397c92c7c734c5497534c Reviewed-on: https://go-review.googlesource.com/c/go/+/728240 LUCI-TryBot-Result: Go LUCI Reviewed-by: David Chase --- src/cmd/compile/internal/amd64/ssa.go | 40 +++++-------------- src/cmd/compile/internal/ssa/_gen/AMD64Ops.go | 9 ++++- src/cmd/compile/internal/ssa/opGen.go | 18 ++++----- src/runtime/asm_amd64.s | 10 ----- src/runtime/race_amd64.s | 5 --- src/runtime/sys_darwin_amd64.s | 5 --- src/runtime/sys_dragonfly_amd64.s | 5 --- src/runtime/sys_freebsd_amd64.s | 10 ----- src/runtime/sys_linux_amd64.s | 10 ----- src/runtime/sys_netbsd_amd64.s | 5 --- src/runtime/sys_openbsd_amd64.s | 5 --- src/runtime/sys_windows_amd64.s | 5 --- 12 files changed, 25 insertions(+), 102 deletions(-) diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 9a0fa27470..5ddcb84c59 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -18,7 +18,6 @@ import ( "cmd/internal/obj" "cmd/internal/obj/x86" "internal/abi" - "internal/buildcfg" ) // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags. 
@@ -1718,7 +1717,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL: s.Prog(v.Op.Asm()) - case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted + case ssa.OpAMD64Zero128: // no code emitted + + case ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = simdReg(v) + p.AddRestSourceReg(simdReg(v)) + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v) case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v: // These are for initializing the least 32/64 bits of a SIMD register from a "float". @@ -1871,34 +1878,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { // zeroX15 zeroes the X15 register. func zeroX15(s *ssagen.State) { - if !buildcfg.Experiment.SIMD { - opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) - return - } - vxorps := func(s *ssagen.State) { - p := s.Prog(x86.AVXORPS) - p.From.Type = obj.TYPE_REG - p.From.Reg = x86.REG_X15 - p.AddRestSourceReg(x86.REG_X15) - p.To.Type = obj.TYPE_REG - p.To.Reg = x86.REG_X15 - } - if buildcfg.GOAMD64 >= 3 { - vxorps(s) - return - } - // AVX may not be available, check before zeroing the high bits. 
- p := s.Prog(x86.ACMPB) - p.From.Type = obj.TYPE_MEM - p.From.Name = obj.NAME_EXTERN - p.From.Sym = ir.Syms.X86HasAVX - p.To.Type = obj.TYPE_CONST - p.To.Offset = 1 - jmp := s.Prog(x86.AJNE) - jmp.To.Type = obj.TYPE_BRANCH - vxorps(s) - sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) - jmp.To.SetTarget(sse) + opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) } // Example instruction: VRSQRTPS X1, X1 diff --git a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go index e77f55ab5e..2fb4fdfc96 100644 --- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go @@ -214,6 +214,7 @@ func init() { vloadk = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly} vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}} + v01 = regInfo{inputs: nil, outputs: vonly} v11 = regInfo{inputs: vonly, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 v21 = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15 vk = regInfo{inputs: vzonly, outputs: maskonly} @@ -232,6 +233,7 @@ func init() { gpv = regInfo{inputs: []regMask{gp}, outputs: vonly} v2flags = regInfo{inputs: []regMask{vz, vz}} + w01 = regInfo{inputs: nil, outputs: wonly} w11 = regInfo{inputs: wonly, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15 w21 = regInfo{inputs: []regMask{wz, wz}, outputs: wonly} wk = regInfo{inputs: wzonly, outputs: maskonly} @@ -1398,12 +1400,15 @@ func init() { {name: "VPMOVVec64x4ToM", argLength: 1, reg: vk, asm: "VPMOVQ2M"}, {name: "VPMOVVec64x8ToM", argLength: 1, reg: wk, asm: "VPMOVQ2M"}, + // X15 is the zero register only up to 128 bits. For 256- and 512-bit zero values, we zero a separately allocated register (not X15) on the fly. 
{name: "Zero128", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, - {name: "Zero256", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, - {name: "Zero512", argLength: 0, reg: x15only, zeroWidth: true, fixedReg: true}, + {name: "Zero256", argLength: 0, reg: v01, asm: "VPXOR"}, + {name: "Zero512", argLength: 0, reg: w01, asm: "VPXORQ"}, + // Move a 32/64 bit float to a 128-bit SIMD register. {name: "VMOVSDf2v", argLength: 1, reg: fpv, asm: "VMOVSD"}, {name: "VMOVSSf2v", argLength: 1, reg: fpv, asm: "VMOVSS"}, + {name: "VMOVQ", argLength: 1, reg: gpv, asm: "VMOVQ"}, {name: "VMOVD", argLength: 1, reg: gpv, asm: "VMOVD"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 83e7959218..00d581ec9a 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -20365,24 +20365,22 @@ var opcodeTable = [...]opInfo{ }, }, { - name: "Zero256", - argLen: 0, - zeroWidth: true, - fixedReg: true, + name: "Zero256", + argLen: 0, + asm: x86.AVPXOR, reg: regInfo{ outputs: []outputInfo{ - {0, 2147483648}, // X15 + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 }, }, }, { - name: "Zero512", - argLen: 0, - zeroWidth: true, - fixedReg: true, + name: "Zero512", + argLen: 0, + asm: x86.AVPXORQ, reg: regInfo{ outputs: []outputInfo{ - {0, 2147483648}, // X15 + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 }, }, }, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index bf208a4d29..391d9bcd22 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1093,11 +1093,6 @@ needm: // there's no need to handle that. Clear R14 so that there's // a bad value in there, in case needm tries to use it. 
XORPS X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif XORQ R14, R14 MOVQ $runtime·needAndBindM(SB), AX CALL AX @@ -1795,11 +1790,6 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0 get_tls(R14) MOVQ g(R14), R14 XORPS X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif JMP ·sigpanic(SB) // gcWriteBarrier informs the GC about heap pointer writes. diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index ade29bc5f1..e19118bd54 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -456,11 +456,6 @@ call: // Back to Go world, set special registers. // The g register (R14) is preserved in C. XORPS X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif RET // C->Go callback thunk that allows to call runtime·racesymbolize from C code. diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index e033e8b702..99d67a9cfd 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -177,11 +177,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index e417d4b8a8..a223c2cf76 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -228,11 +228,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index bab275cc72..977ea093d2 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -265,11 +265,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -295,11 +290,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 618553b196..878f834748 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -352,11 +352,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -382,11 +377,6 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 946b1fbe22..2f1ddcdc89 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -310,11 +310,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index 7766fa5194..ff0bc2416a 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -64,11 +64,6 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index 52a21ba89b..e438599910 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -32,11 +32,6 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0 // R14 is cleared in case there's a non-zero value in there // if called from a non-go thread. XORPS X15, X15 -#ifdef GOEXPERIMENT_simd - CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 - JNE 2(PC) - VXORPS X15, X15, X15 -#endif XORQ R14, R14 get_tls(AX) -- 2.43.0