func isDebugFn(fn *ir.Func) bool {
// if n := fn.Nname; n != nil {
- // if n.Sym().Name == "Int32x8.Transpose8" && n.Sym().Pkg.Path == "simd" {
+ // if n.Sym().Name == "Int32x8.Transpose8" && n.Sym().Pkg.Path == "simd/archsimd" {
// fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
// return true
// }
// Only enable these intrinsics if the SIMD experiment is enabled.
simdIntrinsics(addF)
- addF("simd", "ClearAVXUpperBits",
+ addF(simdPackage, "ClearAVXUpperBits",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue1(ssa.OpAMD64VZEROUPPER, types.TypeMem, s.mem())
return nil
addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+ // sfp4 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
- addF("simd", method,
+ addF(simdPackage, method,
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5]
if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 {
- return select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
- } else {
- return s.callResult(n, callNormal)
+ z := select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
+ if z != nil {
+ return z
+ }
}
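+ // If the indices are not compile-time constants, or select4FromPair declines them, fall back to the Go implementation via a normal call.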
+ return s.callResult(n, callNormal)
},
sys.AMD64)
}
sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512)
sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512)
+ // sfp2 is intrinsic-if-constant, but otherwise it's complicated enough to just implement in Go.
sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) {
- addF("simd", method,
+ addF(simdPackage, method,
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x, a, b, y := args[0], args[1], args[2], args[3]
if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 {
- return select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
- } else {
- return s.callResult(n, callNormal)
+ z := select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
+ if z != nil {
+ return z
+ }
}
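+ // As above, fall back to the Go implementation for non-constant or rejected indices.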
+ return s.callResult(n, callNormal)
},
sys.AMD64)
}
func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value {
a, b := uint8(_a.AuxInt8()), uint8(_b.AuxInt8())
+ if a > 3 || b > 3 {
+ return nil
+ }
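+ // Bit 1 of each index selects which vector of the pair the element comes from;
+ // pack those bits into pattern, then keep only the index within the 2-element group.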
pattern := (a&2)>>1 + (b & 2)
a, b = a&1, b&1
func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value {
a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8())
+ if a > 7 || b > 7 || c > 7 || d > 7 {
+ return nil
+ }
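+ // Bit 2 of each index selects which vector of the pair the element comes from;
+ // pack those bits into pattern, then keep only the index within the 4-element group.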
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
a, b, c, d = a&3, b&3, c&3, d&3
fn := sym.Name
if ssa.IntrinsicsDisable {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr") ||
- pkg == "internal/simd" || pkg == "simd" { // TODO after simd has been moved to package simd, remove internal/simd
+ pkg == simdPackage {
// These runtime functions don't have definitions; they must be intrinsics.
} else {
return nil
gotIntrinsics[testIntrinsicKey{ik.arch.Name, ik.pkg, ik.fn}] = struct{}{}
}
for ik, _ := range gotIntrinsics {
- if _, found := wantIntrinsics[ik]; !found && (ik.pkg != "simd" || *simd) {
+ if _, found := wantIntrinsics[ik]; !found && (ik.pkg != "simd/archsimd" || *simd) {
t.Errorf("Got unwanted intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn)
}
}
for ik, _ := range wantIntrinsics {
- if _, found := gotIntrinsics[ik]; !found && (ik.pkg != "simd" || *simd) {
+ if _, found := gotIntrinsics[ik]; !found && (ik.pkg != "simd/archsimd" || *simd) {
t.Errorf("Want missing intrinsic %v %v.%v", ik.archName, ik.pkg, ik.fn)
}
}
"cmd/internal/sys"
)
-const simdPackage = "simd"
+const simdPackage = "simd/archsimd"
func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) {
addF(simdPackage, "Uint8x16.AESDecryptLastRound", opLen2(ssa.OpAESDecryptLastRoundUint8x16, types.TypeVec128), sys.AMD64)
} else {
st.floatRegs = 1
}
- // if st.Sym() != nil {
- // base.Warn("Simdify %s, %v, %d", st.Sym().Name, isTag, st.width)
- // } else {
- // base.Warn("Simdify %v, %v, %d", st, isTag, st.width)
- // }
}
// CalcStructSize calculates the size of t,
case sym.Name == "align64" && isAtomicStdPkg(sym.Pkg):
maxAlign = 8
- case buildcfg.Experiment.SIMD && (sym.Pkg.Path == "internal/simd" || sym.Pkg.Path == "simd") && len(t.Fields()) >= 1:
+ case buildcfg.Experiment.SIMD && (sym.Pkg.Path == "simd/archsimd") && len(t.Fields()) >= 1:
// This gates the experiment -- without it, no user-visible types can be "simd".
// The SSA-visible SIMD types remain.
- // TODO after simd has been moved to package simd, remove internal/simd.
switch sym.Name {
case "v128":
simdify(t, true)
"builtin": true,
"cmd/compile/internal/ssa/_gen": true,
"runtime/_mkmalloc": true,
- "simd/_gen/simdgen": true,
- "simd/_gen/unify": true,
+ "simd/archsimd/_gen/simdgen": true,
+ "simd/archsimd/_gen/unify": true,
}
// printPackageMu synchronizes the printing of type-checked package files in
internal/byteorder, internal/cpu, internal/goarch < internal/chacha8rand;
internal/goarch, math/bits < internal/strconv;
- internal/cpu, internal/strconv < simd;
+ internal/cpu, internal/strconv < simd/archsimd;
# RUNTIME is the core runtime group of packages, all of them very light-weight.
internal/abi,
< testing;
testing, math
- < simd/internal/test_helpers;
+ < simd/archsimd/internal/test_helpers;
log/slog, testing
< testing/slogtest;
"builtin": true,
"cmd/compile/internal/ssa/_gen": true,
"runtime/_mkmalloc": true,
- "simd/_gen/simdgen": true,
- "simd/_gen/unify": true,
+ "simd/archsimd/_gen/simdgen": true,
+ "simd/archsimd/_gen/unify": true,
}
// printPackageMu synchronizes the printing of type-checked package files in
+++ /dev/null
-module simd/_gen
-
-go 1.24
-
-require (
- golang.org/x/arch v0.20.0
- gopkg.in/yaml.v3 v3.0.1
-)
+++ /dev/null
-golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c=
-golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
-gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Run all SIMD-related code generators.
-package main
-
-import (
- "flag"
- "fmt"
- "os"
- "os/exec"
- "path/filepath"
- "strings"
-)
-
-const defaultXedPath = "$XEDPATH" + string(filepath.ListSeparator) + "./simdgen/xeddata" + string(filepath.ListSeparator) + "$HOME/xed/obj/dgen"
-
-var (
- flagTmplgen = flag.Bool("tmplgen", true, "run tmplgen generator")
- flagSimdgen = flag.Bool("simdgen", true, "run simdgen generator")
-
- flagN = flag.Bool("n", false, "dry run")
- flagXedPath = flag.String("xedPath", defaultXedPath, "load XED datafile from `path`, which must be the XED obj/dgen directory")
-)
-
-var goRoot string
-
-func main() {
- flag.Parse()
- if flag.NArg() > 0 {
- flag.Usage()
- os.Exit(1)
- }
-
- if *flagXedPath == defaultXedPath {
- // In general we want the shell to do variable expansion, but for the
- // default value we don't get that, so do it ourselves.
- *flagXedPath = os.ExpandEnv(defaultXedPath)
- }
-
- var err error
- goRoot, err = resolveGOROOT()
- if err != nil {
- fmt.Fprintln(os.Stderr, err)
- os.Exit(1)
- }
-
- if *flagTmplgen {
- doTmplgen()
- }
- if *flagSimdgen {
- doSimdgen()
- }
-}
-
-func doTmplgen() {
- goRun("-C", "tmplgen", ".")
-}
-
-func doSimdgen() {
- xedPath, err := resolveXEDPath(*flagXedPath)
- if err != nil {
- fmt.Fprintln(os.Stderr, err)
- os.Exit(1)
- }
-
- // Regenerate the XED-derived SIMD files
- goRun("-C", "simdgen", ".", "-o", "godefs", "-goroot", goRoot, "-xedPath", prettyPath("./simdgen", xedPath), "go.yaml", "types.yaml", "categories.yaml")
-
- // simdgen produces SSA rule files, so update the SSA files
- goRun("-C", prettyPath(".", filepath.Join(goRoot, "src", "cmd", "compile", "internal", "ssa", "_gen")), ".")
-}
-
-func resolveXEDPath(pathList string) (xedPath string, err error) {
- for _, path := range filepath.SplitList(pathList) {
- if path == "" {
- // Probably an unknown shell variable. Ignore.
- continue
- }
- if _, err := os.Stat(filepath.Join(path, "all-dec-instructions.txt")); err == nil {
- return filepath.Abs(path)
- }
- }
- return "", fmt.Errorf("set $XEDPATH or -xedPath to the XED obj/dgen directory")
-}
-
-func resolveGOROOT() (goRoot string, err error) {
- cmd := exec.Command("go", "env", "GOROOT")
- cmd.Stderr = os.Stderr
- out, err := cmd.Output()
- if err != nil {
- return "", fmt.Errorf("%s: %s", cmd, err)
- }
- goRoot = strings.TrimSuffix(string(out), "\n")
- return goRoot, nil
-}
-
-func goRun(args ...string) {
- exe := filepath.Join(goRoot, "bin", "go")
- cmd := exec.Command(exe, append([]string{"run"}, args...)...)
- run(cmd)
-}
-
-func run(cmd *exec.Cmd) {
- cmd.Stdout = os.Stdout
- cmd.Stderr = os.Stderr
- fmt.Fprintf(os.Stderr, "%s\n", cmdString(cmd))
- if *flagN {
- return
- }
- if err := cmd.Run(); err != nil {
- fmt.Fprintf(os.Stderr, "%s failed: %s\n", cmd, err)
- }
-}
-
-func prettyPath(base, path string) string {
- base, err := filepath.Abs(base)
- if err != nil {
- return path
- }
- p, err := filepath.Rel(base, path)
- if err != nil {
- return path
- }
- return p
-}
-
-func cmdString(cmd *exec.Cmd) string {
- // TODO: Shell quoting?
- // TODO: Environment.
-
- var buf strings.Builder
-
- cmdPath, err := exec.LookPath(filepath.Base(cmd.Path))
- if err == nil && cmdPath == cmd.Path {
- cmdPath = filepath.Base(cmdPath)
- } else {
- cmdPath = prettyPath(".", cmd.Path)
- }
- buf.WriteString(cmdPath)
-
- for _, arg := range cmd.Args[1:] {
- buf.WriteByte(' ')
- buf.WriteString(arg)
- }
-
- return buf.String()
-}
+++ /dev/null
-testdata/*
-.gemini/*
-.gemini*
+++ /dev/null
-!import ops/*/categories.yaml
+++ /dev/null
-#!/bin/bash
-
-# This is an end-to-end test of Go SIMD. It updates all generated
-# files in this repo and then runs several tests.
-
-XEDDATA="${XEDDATA:-xeddata}"
-if [[ ! -d "$XEDDATA" ]]; then
- echo >&2 "Must either set \$XEDDATA or symlink xeddata/ to the XED obj/dgen directory."
- exit 1
-fi
-
-which go >/dev/null || exit 1
-goroot="$(go env GOROOT)"
-if [[ ! ../../../.. -ef "$goroot" ]]; then
- # We might be able to make this work but it's SO CONFUSING.
- echo >&2 "go command in path has GOROOT $goroot"
- exit 1
-fi
-
-if [[ $(go env GOEXPERIMENT) != simd ]]; then
- echo >&2 "GOEXPERIMENT=$(go env GOEXPERIMENT), expected simd"
- exit 1
-fi
-
-set -ex
-
-# Regenerate SIMD files
-go run . -o godefs -goroot "$goroot" -xedPath "$XEDDATA" go.yaml types.yaml categories.yaml
-# Regenerate SSA files from SIMD rules
-go run -C "$goroot"/src/cmd/compile/internal/ssa/_gen .
-
-# Rebuild compiler
-cd "$goroot"/src
-go install cmd/compile
-
-# Tests
-GOARCH=amd64 go run -C simd/testdata .
-GOARCH=amd64 go test -v simd
-go test go/doc go/build
-go test cmd/api -v -check -run ^TestCheck$
-go test cmd/compile/internal/ssagen -simd=0
-
-# Check tests without the GOEXPERIMENT
-GOEXPERIMENT= go test go/doc go/build
-GOEXPERIMENT= go test cmd/api -v -check -run ^TestCheck$
-GOEXPERIMENT= go test cmd/compile/internal/ssagen -simd=0
-
-# TODO: Add some tests of SIMD itself
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "fmt"
- "sort"
-)
-
-const simdGenericOpsTmpl = `
-package main
-
-func simdGenericOps() []opData {
- return []opData{
-{{- range .Ops }}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, commutative: {{.Comm}}},
-{{- end }}
-{{- range .OpsImm }}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, commutative: {{.Comm}}, aux: "UInt8"},
-{{- end }}
- }
-}
-`
-
-// writeSIMDGenericOps generates the generic ops and writes it to simdAMD64ops.go
-// within the specified directory.
-func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
- t := templateOf(simdGenericOpsTmpl, "simdgenericOps")
- buffer := new(bytes.Buffer)
- buffer.WriteString(generatedHeader)
-
- type genericOpsData struct {
- OpName string
- OpInLen int
- Comm bool
- }
- type opData struct {
- Ops []genericOpsData
- OpsImm []genericOpsData
- }
- var opsData opData
- for _, op := range ops {
- if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
- continue
- }
- if op.SkipMaskedMethod() {
- continue
- }
- _, _, _, immType, gOp := op.shape()
- gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
- if immType == VarImm || immType == ConstVarImm {
- opsData.OpsImm = append(opsData.OpsImm, gOpData)
- } else {
- opsData.Ops = append(opsData.Ops, gOpData)
- }
- }
- sort.Slice(opsData.Ops, func(i, j int) bool {
- return compareNatural(opsData.Ops[i].OpName, opsData.Ops[j].OpName) < 0
- })
- sort.Slice(opsData.OpsImm, func(i, j int) bool {
- return compareNatural(opsData.OpsImm[i].OpName, opsData.OpsImm[j].OpName) < 0
- })
-
- err := t.Execute(buffer, opsData)
- if err != nil {
- panic(fmt.Errorf("failed to execute template: %w", err))
- }
-
- return buffer
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "fmt"
- "slices"
-)
-
-const simdIntrinsicsTmpl = `
-{{define "header"}}
-package ssagen
-
-import (
- "cmd/compile/internal/ir"
- "cmd/compile/internal/ssa"
- "cmd/compile/internal/types"
- "cmd/internal/sys"
-)
-
-const simdPackage = "` + simdPackage + `"
-
-func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) {
-{{end}}
-
-{{define "op1"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen1(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op2"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen2(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op2_21"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen2_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op2_21Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op3"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen3(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op3_21"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen3_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op3_21Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op3_231Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3_231(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op3_31Zero3"}} addF(simdPackage, "{{(index .In 2).Go}}.{{.Go}}", opLen3_31Zero3(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op4"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen4(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op4_231Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen4_231(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op4_31"}} addF(simdPackage, "{{(index .In 2).Go}}.{{.Go}}", opLen4_31(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
-{{end}}
-{{define "op1Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen1Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op2Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op2Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op2Imm8_SHA1RNDS4"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-{{define "op4Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen4Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
-{{end}}
-
-{{define "vectorConversion"}} addF(simdPackage, "{{.Tsrc.Name}}.As{{.Tdst.Name}}", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
-{{end}}
-
-{{define "loadStore"}} addF(simdPackage, "Load{{.Name}}", simdLoad(), sys.AMD64)
- addF(simdPackage, "{{.Name}}.Store", simdStore(), sys.AMD64)
-{{end}}
-
-{{define "maskedLoadStore"}} addF(simdPackage, "LoadMasked{{.Name}}", simdMaskedLoad(ssa.OpLoadMasked{{.ElemBits}}), sys.AMD64)
- addF(simdPackage, "{{.Name}}.StoreMasked", simdMaskedStore(ssa.OpStoreMasked{{.ElemBits}}), sys.AMD64)
-{{end}}
-
-{{define "mask"}} addF(simdPackage, "{{.Name}}.As{{.VectorCounterpart}}", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
- addF(simdPackage, "{{.VectorCounterpart}}.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
- addF(simdPackage, "{{.Name}}.And", opLen2(ssa.OpAnd{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
- addF(simdPackage, "{{.Name}}.Or", opLen2(ssa.OpOr{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
- addF(simdPackage, "{{.Name}}FromBits", simdCvtVToMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
- addF(simdPackage, "{{.Name}}.ToBits", simdCvtMaskToV({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
-{{end}}
-
-{{define "footer"}}}
-{{end}}
-`
-
-// writeSIMDIntrinsics generates the intrinsic mappings and writes it to simdintrinsics.go
-// within the specified directory.
-func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
- t := templateOf(simdIntrinsicsTmpl, "simdintrinsics")
- buffer := new(bytes.Buffer)
- buffer.WriteString(generatedHeader)
-
- if err := t.ExecuteTemplate(buffer, "header", nil); err != nil {
- panic(fmt.Errorf("failed to execute header template: %w", err))
- }
-
- slices.SortFunc(ops, compareOperations)
-
- for _, op := range ops {
- if op.NoTypes != nil && *op.NoTypes == "true" {
- continue
- }
- if op.SkipMaskedMethod() {
- continue
- }
- if s, op, err := classifyOp(op); err == nil {
- if err := t.ExecuteTemplate(buffer, s, op); err != nil {
- panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
- }
-
- } else {
- panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err))
- }
- }
-
- for _, conv := range vConvertFromTypeMap(typeMap) {
- if err := t.ExecuteTemplate(buffer, "vectorConversion", conv); err != nil {
- panic(fmt.Errorf("failed to execute vectorConversion template: %w", err))
- }
- }
-
- for _, typ := range typesFromTypeMap(typeMap) {
- if typ.Type != "mask" {
- if err := t.ExecuteTemplate(buffer, "loadStore", typ); err != nil {
- panic(fmt.Errorf("failed to execute loadStore template: %w", err))
- }
- }
- }
-
- for _, typ := range typesFromTypeMap(typeMap) {
- if typ.MaskedLoadStoreFilter() {
- if err := t.ExecuteTemplate(buffer, "maskedLoadStore", typ); err != nil {
- panic(fmt.Errorf("failed to execute maskedLoadStore template: %w", err))
- }
- }
- }
-
- for _, mask := range masksFromTypeMap(typeMap) {
- if err := t.ExecuteTemplate(buffer, "mask", mask); err != nil {
- panic(fmt.Errorf("failed to execute mask template: %w", err))
- }
- }
-
- if err := t.ExecuteTemplate(buffer, "footer", nil); err != nil {
- panic(fmt.Errorf("failed to execute footer template: %w", err))
- }
-
- return buffer
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "fmt"
- "log"
- "sort"
- "strings"
-)
-
-const simdMachineOpsTmpl = `
-package main
-
-func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
- wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
- return []opData{
-{{- range .OpsData }}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
-{{- end }}
-{{- range .OpsDataImm }}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", aux: "UInt8", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
-{{- end }}
-{{- range .OpsDataLoad}}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", aux: "SymOff", symEffect: "Read", resultInArg0: {{.ResultInArg0}}},
-{{- end}}
-{{- range .OpsDataImmLoad}}
- {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", aux: "SymValAndOff", symEffect: "Read", resultInArg0: {{.ResultInArg0}}},
-{{- end}}
-{{- range .OpsDataMerging }}
- {name: "{{.OpName}}Merging", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: false, typ: "{{.Type}}", resultInArg0: true},
-{{- end }}
-{{- range .OpsDataImmMerging }}
- {name: "{{.OpName}}Merging", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", aux: "UInt8", commutative: false, typ: "{{.Type}}", resultInArg0: true},
-{{- end }}
- }
-}
-`
-
-// writeSIMDMachineOps generates the machine ops and writes it to simdAMD64ops.go
-// within the specified directory.
-func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
- t := templateOf(simdMachineOpsTmpl, "simdAMD64Ops")
- buffer := new(bytes.Buffer)
- buffer.WriteString(generatedHeader)
-
- type opData struct {
- OpName string
- Asm string
- OpInLen int
- RegInfo string
- Comm bool
- Type string
- ResultInArg0 bool
- }
- type machineOpsData struct {
- OpsData []opData
- OpsDataImm []opData
- OpsDataLoad []opData
- OpsDataImmLoad []opData
- OpsDataMerging []opData
- OpsDataImmMerging []opData
- }
-
- regInfoSet := map[string]bool{
- "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
- "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
- "wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
- "w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
- opsData := make([]opData, 0)
- opsDataImm := make([]opData, 0)
- opsDataLoad := make([]opData, 0)
- opsDataImmLoad := make([]opData, 0)
- opsDataMerging := make([]opData, 0)
- opsDataImmMerging := make([]opData, 0)
-
- // Determine the "best" version of an instruction to use
- best := make(map[string]Operation)
- var mOpOrder []string
- countOverrides := func(s []Operand) int {
- a := 0
- for _, o := range s {
- if o.OverwriteBase != nil {
- a++
- }
- }
- return a
- }
- for _, op := range ops {
- _, _, maskType, _, gOp := op.shape()
- asm := machineOpName(maskType, gOp)
- other, ok := best[asm]
- if !ok {
- best[asm] = op
- mOpOrder = append(mOpOrder, asm)
- continue
- }
- if !op.Commutative && other.Commutative { // if there's a non-commutative version of the op, it wins.
- best[asm] = op
- continue
- }
- // see if "op" is better than "other"
- if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) {
- best[asm] = op
- }
- }
-
- regInfoErrs := make([]error, 0)
- regInfoMissing := make(map[string]bool, 0)
- for _, asm := range mOpOrder {
- op := best[asm]
- shapeIn, shapeOut, maskType, _, gOp := op.shape()
-
- // TODO: all our masked operations are now zeroing, we need to generate machine ops with merging masks, maybe copy
- // one here with a name suffix "Merging". The rewrite rules will need them.
- makeRegInfo := func(op Operation, mem memShape) (string, error) {
- regInfo, err := op.regShape(mem)
- if err != nil {
- panic(err)
- }
- regInfo, err = rewriteVecAsScalarRegInfo(op, regInfo)
- if err != nil {
- if mem == NoMem || mem == InvalidMem {
- panic(err)
- }
- return "", err
- }
- if regInfo == "v01load" {
- regInfo = "vload"
- }
- // Makes AVX512 operations use upper registers
- if strings.Contains(op.CPUFeature, "AVX512") {
- regInfo = strings.ReplaceAll(regInfo, "v", "w")
- }
- if _, ok := regInfoSet[regInfo]; !ok {
- regInfoErrs = append(regInfoErrs, fmt.Errorf("unsupported register constraint, please update the template and AMD64Ops.go: %s. Op is %s", regInfo, op))
- regInfoMissing[regInfo] = true
- }
- return regInfo, nil
- }
- regInfo, err := makeRegInfo(op, NoMem)
- if err != nil {
- panic(err)
- }
- var outType string
- if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
- // If class overwrite is happening, that's not really a mask but a vreg.
- outType = fmt.Sprintf("Vec%d", *gOp.Out[0].Bits)
- } else if shapeOut == OneGregOut {
- outType = gOp.GoType() // this is a straight Go type, not a VecNNN type
- } else if shapeOut == OneKmaskOut {
- outType = "Mask"
- } else {
- panic(fmt.Errorf("simdgen does not recognize this output shape: %d", shapeOut))
- }
- resultInArg0 := false
- if shapeOut == OneVregOutAtIn {
- resultInArg0 = true
- }
- var memOpData *opData
- regInfoMerging := regInfo
- hasMerging := false
- if op.MemFeatures != nil && *op.MemFeatures == "vbcst" {
- // Right now we only have vbcst case
- // Make a full vec memory variant.
- opMem := rewriteLastVregToMem(op)
- regInfo, err := makeRegInfo(opMem, VregMemIn)
- if err != nil {
- // Just skip it if it's non nill.
- // an error could be triggered by [checkVecAsScalar].
- // TODO: make [checkVecAsScalar] aware of mem ops.
- if *Verbose {
- log.Printf("Seen error: %e", err)
- }
- } else {
- memOpData = &opData{asm + "load", gOp.Asm, len(gOp.In) + 1, regInfo, false, outType, resultInArg0}
- }
- }
- hasMerging = gOp.hasMaskedMerging(maskType, shapeOut)
- if hasMerging && !resultInArg0 {
- // We have to copy the slice here becasue the sort will be visible from other
- // aliases when no reslicing is happening.
- newIn := make([]Operand, len(op.In), len(op.In)+1)
- copy(newIn, op.In)
- op.In = newIn
- op.In = append(op.In, op.Out[0])
- op.sortOperand()
- regInfoMerging, err = makeRegInfo(op, NoMem)
- if err != nil {
- panic(err)
- }
- }
-
- if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
- opsDataImm = append(opsDataImm, opData{asm, gOp.Asm, len(gOp.In), regInfo, gOp.Commutative, outType, resultInArg0})
- if memOpData != nil {
- if *op.MemFeatures != "vbcst" {
- panic("simdgen only knows vbcst for mem ops for now")
- }
- opsDataImmLoad = append(opsDataImmLoad, *memOpData)
- }
- if hasMerging {
- mergingLen := len(gOp.In)
- if !resultInArg0 {
- mergingLen++
- }
- opsDataImmMerging = append(opsDataImmMerging, opData{asm, gOp.Asm, mergingLen, regInfoMerging, gOp.Commutative, outType, resultInArg0})
- }
- } else {
- opsData = append(opsData, opData{asm, gOp.Asm, len(gOp.In), regInfo, gOp.Commutative, outType, resultInArg0})
- if memOpData != nil {
- if *op.MemFeatures != "vbcst" {
- panic("simdgen only knows vbcst for mem ops for now")
- }
- opsDataLoad = append(opsDataLoad, *memOpData)
- }
- if hasMerging {
- mergingLen := len(gOp.In)
- if !resultInArg0 {
- mergingLen++
- }
- opsDataMerging = append(opsDataMerging, opData{asm, gOp.Asm, mergingLen, regInfoMerging, gOp.Commutative, outType, resultInArg0})
- }
- }
- }
- if len(regInfoErrs) != 0 {
- for _, e := range regInfoErrs {
- log.Printf("Errors: %e\n", e)
- }
- panic(fmt.Errorf("these regInfo unseen: %v", regInfoMissing))
- }
- sort.Slice(opsData, func(i, j int) bool {
- return compareNatural(opsData[i].OpName, opsData[j].OpName) < 0
- })
- sort.Slice(opsDataImm, func(i, j int) bool {
- return compareNatural(opsDataImm[i].OpName, opsDataImm[j].OpName) < 0
- })
- sort.Slice(opsDataLoad, func(i, j int) bool {
- return compareNatural(opsDataLoad[i].OpName, opsDataLoad[j].OpName) < 0
- })
- sort.Slice(opsDataImmLoad, func(i, j int) bool {
- return compareNatural(opsDataImmLoad[i].OpName, opsDataImmLoad[j].OpName) < 0
- })
- sort.Slice(opsDataMerging, func(i, j int) bool {
- return compareNatural(opsDataMerging[i].OpName, opsDataMerging[j].OpName) < 0
- })
- sort.Slice(opsDataImmMerging, func(i, j int) bool {
- return compareNatural(opsDataImmMerging[i].OpName, opsDataImmMerging[j].OpName) < 0
- })
- err := t.Execute(buffer, machineOpsData{opsData, opsDataImm, opsDataLoad, opsDataImmLoad,
- opsDataMerging, opsDataImmMerging})
- if err != nil {
- panic(fmt.Errorf("failed to execute template: %w", err))
- }
-
- return buffer
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "cmp"
- "fmt"
- "maps"
- "slices"
- "sort"
- "strings"
- "unicode"
-)
-
-type simdType struct {
- Name string // The go type name of this simd type, for example Int32x4.
- Lanes int // The number of elements in this vector/mask.
- Base string // The element's type, like for Int32x4 it will be int32.
- Fields string // The struct fields, it should be right formatted.
- Type string // Either "mask" or "vreg"
- VectorCounterpart string // For mask use only: just replacing the "Mask" in [simdType.Name] with "Int"
- ReshapedVectorWithAndOr string // For mask use only: vector AND and OR are only available in some shape with element width 32.
- Size int // The size of the vector type
-}
-
-func (x simdType) ElemBits() int {
- return x.Size / x.Lanes
-}
-
-// LanesContainer returns the smallest int/uint bit size that is
-// large enough to hold one bit for each lane. E.g., Mask32x4
-// is 4 lanes, and a uint8 is the smallest uint that has 4 bits.
-func (x simdType) LanesContainer() int {
- if x.Lanes > 64 {
- panic("too many lanes")
- }
- if x.Lanes > 32 {
- return 64
- }
- if x.Lanes > 16 {
- return 32
- }
- if x.Lanes > 8 {
- return 16
- }
- return 8
-}
-
-// MaskedLoadStoreFilter encodes which simd type type currently
-// get masked loads/stores generated, it is used in two places,
-// this forces coordination.
-func (x simdType) MaskedLoadStoreFilter() bool {
- return x.Size == 512 || x.ElemBits() >= 32 && x.Type != "mask"
-}
-
-func (x simdType) IntelSizeSuffix() string {
- switch x.ElemBits() {
- case 8:
- return "B"
- case 16:
- return "W"
- case 32:
- return "D"
- case 64:
- return "Q"
- }
- panic("oops")
-}
-
-func (x simdType) MaskedLoadDoc() string {
- if x.Size == 512 || x.ElemBits() < 32 {
- return fmt.Sprintf("// Asm: VMOVDQU%d.Z, CPU Feature: AVX512", x.ElemBits())
- } else {
- return fmt.Sprintf("// Asm: VMASKMOV%s, CPU Feature: AVX2", x.IntelSizeSuffix())
- }
-}
-
-func (x simdType) MaskedStoreDoc() string {
- if x.Size == 512 || x.ElemBits() < 32 {
- return fmt.Sprintf("// Asm: VMOVDQU%d, CPU Feature: AVX512", x.ElemBits())
- } else {
- return fmt.Sprintf("// Asm: VMASKMOV%s, CPU Feature: AVX2", x.IntelSizeSuffix())
- }
-}
-
-func compareSimdTypes(x, y simdType) int {
- // "vreg" then "mask"
- if c := -compareNatural(x.Type, y.Type); c != 0 {
- return c
- }
- // want "flo" < "int" < "uin" (and then 8 < 16 < 32 < 64),
- // not "int16" < "int32" < "int64" < "int8")
- // so limit comparison to first 3 bytes in string.
- if c := compareNatural(x.Base[:3], y.Base[:3]); c != 0 {
- return c
- }
- // base type size, 8 < 16 < 32 < 64
- if c := x.ElemBits() - y.ElemBits(); c != 0 {
- return c
- }
- // vector size last
- return x.Size - y.Size
-}
-
-type simdTypeMap map[int][]simdType
-
-type simdTypePair struct {
- Tsrc simdType
- Tdst simdType
-}
-
-func compareSimdTypePairs(x, y simdTypePair) int {
- c := compareSimdTypes(x.Tsrc, y.Tsrc)
- if c != 0 {
- return c
- }
- return compareSimdTypes(x.Tdst, y.Tdst)
-}
-
-const simdPackageHeader = generatedHeader + `
-//go:build goexperiment.simd
-
-package simd
-`
-
-const simdTypesTemplates = `
-{{define "sizeTmpl"}}
-// v{{.}} is a tag type that tells the compiler that this is really {{.}}-bit SIMD
-type v{{.}} struct {
- _{{.}} [0]func() // uncomparable
-}
-{{end}}
-
-{{define "typeTmpl"}}
-// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}}
-type {{.Name}} struct {
-{{.Fields}}
-}
-
-{{end}}
-`
-
-const simdFeaturesTemplate = `
-import "internal/cpu"
-
-type X86Features struct {}
-
-var X86 X86Features
-
-{{range .}}
-{{- if eq .Feature "AVX512"}}
-// {{.Feature}} returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
-//
-// These five CPU features are bundled together, and no use of AVX-512
-// is allowed unless all of these features are supported together.
-// Nearly every CPU that has shipped with any support for AVX-512 has
-// supported all five of these features.
-{{- else -}}
-// {{.Feature}} returns whether the CPU supports the {{.Feature}} feature.
-{{- end}}
-//
-// {{.Feature}} is defined on all GOARCHes, but will only return true on
-// GOARCH {{.GoArch}}.
-func (X86Features) {{.Feature}}() bool {
- return cpu.X86.Has{{.Feature}}
-}
-{{end}}
-`
-
-const simdLoadStoreTemplate = `
-// Len returns the number of elements in a {{.Name}}
-func (x {{.Name}}) Len() int { return {{.Lanes}} }
-
-// Load{{.Name}} loads a {{.Name}} from an array
-//
-//go:noescape
-func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}}
-
-// Store stores a {{.Name}} to an array
-//
-//go:noescape
-func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}})
-`
-
-const simdMaskFromValTemplate = `
-// {{.Name}}FromBits constructs a {{.Name}} from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-{{- if ne .Lanes .LanesContainer}}
-// Only the lower {{.Lanes}} bits of y are used.
-{{- end}}
-//
-// Asm: KMOV{{.IntelSizeSuffix}}, CPU Feature: AVX512
-func {{.Name}}FromBits(y uint{{.LanesContainer}}) {{.Name}}
-
-// ToBits constructs a bitmap from a {{.Name}}, where 1 means set for the indexed element, 0 means unset.
-{{- if ne .Lanes .LanesContainer}}
-// Only the lower {{.Lanes}} bits of y are used.
-{{- end}}
-//
-// Asm: KMOV{{.IntelSizeSuffix}}, CPU Features: AVX512
-func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
-`
-
-const simdMaskedLoadStoreTemplate = `
-// LoadMasked{{.Name}} loads a {{.Name}} from an array,
-// at those elements enabled by mask
-//
-{{.MaskedLoadDoc}}
-//
-//go:noescape
-func LoadMasked{{.Name}}(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}}) {{.Name}}
-
-// StoreMasked stores a {{.Name}} to an array,
-// at those elements enabled by mask
-//
-{{.MaskedStoreDoc}}
-//
-//go:noescape
-func (x {{.Name}}) StoreMasked(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}})
-`
-
-const simdStubsTmpl = `
-{{define "op1"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}() {{.GoType}}
-{{end}}
-
-{{define "op2"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op2_21"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op2_21Type1"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op3"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-{{define "op3_31Zero3"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op2NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op3_21"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-{{define "op3_21Type1"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-{{define "op3_231Type1"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.Op0NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-{{define "op2VecAsScalar"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}(y uint{{(index .In 1).TreatLikeAScalarOfSize}}) {{(index .Out 0).Go}}
-{{end}}
-
-{{define "op3VecAsScalar"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}(y uint{{(index .In 1).TreatLikeAScalarOfSize}}, {{.Op2NameAndType "z"}}) {{(index .Out 0).Go}}
-{{end}}
-
-{{define "op4"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op2NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
-{{end}}
-
-{{define "op4_231Type1"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.Op0NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
-{{end}}
-
-{{define "op4_31"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op2NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op0NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
-{{end}}
-
-{{define "op1Imm8"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8) {{.GoType}}
-{{end}}
-
-{{define "op2Imm8"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op2Imm8_2I"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}}
-{{end}}
-
-{{define "op2Imm8_II"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table.
-// {{.ImmName}} should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op2Imm8_SHA1RNDS4"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
-{{end}}
-
-{{define "op3Imm8"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}, {{.Op3NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-{{define "op3Imm8_2I"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8, {{.Op3NameAndType "z"}}) {{.GoType}}
-{{end}}
-
-
-{{define "op4Imm8"}}
-{{if .Documentation}}{{.Documentation}}
-//{{end}}
-// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
-func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}, {{.Op3NameAndType "z"}}, {{.Op4NameAndType "u"}}) {{.GoType}}
-{{end}}
-
-{{define "vectorConversion"}}
-// {{.Tdst.Name}} converts from {{.Tsrc.Name}} to {{.Tdst.Name}}
-func (from {{.Tsrc.Name}}) As{{.Tdst.Name}}() (to {{.Tdst.Name}})
-{{end}}
-
-{{define "mask"}}
-// As{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}}
-func (from {{.Name}}) As{{.VectorCounterpart}}() (to {{.VectorCounterpart}})
-
-// asMask converts from {{.VectorCounterpart}} to {{.Name}}
-func (from {{.VectorCounterpart}}) asMask() (to {{.Name}})
-
-func (x {{.Name}}) And(y {{.Name}}) {{.Name}}
-
-func (x {{.Name}}) Or(y {{.Name}}) {{.Name}}
-{{end}}
-`
-
-// parseSIMDTypes groups go simd types by their vector sizes, and
-// returns a map whose key is the vector size, value is the simd type.
-func parseSIMDTypes(ops []Operation) simdTypeMap {
- // TODO: maybe instead of going over ops, let's try go over types.yaml.
- ret := map[int][]simdType{}
- seen := map[string]struct{}{}
- processArg := func(arg Operand) {
- if arg.Class == "immediate" || arg.Class == "greg" {
- // Immediates are not encoded as vector types.
- return
- }
- if _, ok := seen[*arg.Go]; ok {
- return
- }
- seen[*arg.Go] = struct{}{}
-
- lanes := *arg.Lanes
- base := fmt.Sprintf("%s%d", *arg.Base, *arg.ElemBits)
- tagFieldNameS := fmt.Sprintf("%sx%d", base, lanes)
- tagFieldS := fmt.Sprintf("%s v%d", tagFieldNameS, *arg.Bits)
- valFieldS := fmt.Sprintf("vals%s[%d]%s", strings.Repeat(" ", len(tagFieldNameS)-3), lanes, base)
- fields := fmt.Sprintf("\t%s\n\t%s", tagFieldS, valFieldS)
- if arg.Class == "mask" {
- vectorCounterpart := strings.ReplaceAll(*arg.Go, "Mask", "Int")
- reshapedVectorWithAndOr := fmt.Sprintf("Int32x%d", *arg.Bits/32)
- ret[*arg.Bits] = append(ret[*arg.Bits], simdType{*arg.Go, lanes, base, fields, arg.Class, vectorCounterpart, reshapedVectorWithAndOr, *arg.Bits})
- // In case the vector counterpart of a mask is not present, put its vector counterpart typedef into the map as well.
- if _, ok := seen[vectorCounterpart]; !ok {
- seen[vectorCounterpart] = struct{}{}
- ret[*arg.Bits] = append(ret[*arg.Bits], simdType{vectorCounterpart, lanes, base, fields, "vreg", "", "", *arg.Bits})
- }
- } else {
- ret[*arg.Bits] = append(ret[*arg.Bits], simdType{*arg.Go, lanes, base, fields, arg.Class, "", "", *arg.Bits})
- }
- }
- for _, op := range ops {
- for _, arg := range op.In {
- processArg(arg)
- }
- for _, arg := range op.Out {
- processArg(arg)
- }
- }
- return ret
-}
-
-func vConvertFromTypeMap(typeMap simdTypeMap) []simdTypePair {
- v := []simdTypePair{}
- for _, ts := range typeMap {
- for i, tsrc := range ts {
- for j, tdst := range ts {
- if i != j && tsrc.Type == tdst.Type && tsrc.Type == "vreg" &&
- tsrc.Lanes > 1 && tdst.Lanes > 1 {
- v = append(v, simdTypePair{tsrc, tdst})
- }
- }
- }
- }
- slices.SortFunc(v, compareSimdTypePairs)
- return v
-}
-
-func masksFromTypeMap(typeMap simdTypeMap) []simdType {
- m := []simdType{}
- for _, ts := range typeMap {
- for _, tsrc := range ts {
- if tsrc.Type == "mask" {
- m = append(m, tsrc)
- }
- }
- }
- slices.SortFunc(m, compareSimdTypes)
- return m
-}
-
-func typesFromTypeMap(typeMap simdTypeMap) []simdType {
- m := []simdType{}
- for _, ts := range typeMap {
- for _, tsrc := range ts {
- if tsrc.Lanes > 1 {
- m = append(m, tsrc)
- }
- }
- }
- slices.SortFunc(m, compareSimdTypes)
- return m
-}
-
-// writeSIMDTypes generates the simd vector types into a bytes.Buffer
-func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
- t := templateOf(simdTypesTemplates, "types_amd64")
- loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64")
- maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64")
- maskFromVal := templateOf(simdMaskFromValTemplate, "maskFromVal_amd64")
-
- buffer := new(bytes.Buffer)
- buffer.WriteString(simdPackageHeader)
-
- sizes := make([]int, 0, len(typeMap))
- for size, types := range typeMap {
- slices.SortFunc(types, compareSimdTypes)
- sizes = append(sizes, size)
- }
- sort.Ints(sizes)
-
- for _, size := range sizes {
- if size <= 64 {
- // these are scalar
- continue
- }
- if err := t.ExecuteTemplate(buffer, "sizeTmpl", size); err != nil {
- panic(fmt.Errorf("failed to execute size template for size %d: %w", size, err))
- }
- for _, typeDef := range typeMap[size] {
- if typeDef.Lanes == 1 {
- continue
- }
- if err := t.ExecuteTemplate(buffer, "typeTmpl", typeDef); err != nil {
- panic(fmt.Errorf("failed to execute type template for type %s: %w", typeDef.Name, err))
- }
- if typeDef.Type != "mask" {
- if err := loadStore.ExecuteTemplate(buffer, "loadstore_amd64", typeDef); err != nil {
- panic(fmt.Errorf("failed to execute loadstore template for type %s: %w", typeDef.Name, err))
- }
- // restrict to AVX2 masked loads/stores first.
- if typeDef.MaskedLoadStoreFilter() {
- if err := maskedLoadStore.ExecuteTemplate(buffer, "maskedloadstore_amd64", typeDef); err != nil {
- panic(fmt.Errorf("failed to execute maskedloadstore template for type %s: %w", typeDef.Name, err))
- }
- }
- } else {
- if err := maskFromVal.ExecuteTemplate(buffer, "maskFromVal_amd64", typeDef); err != nil {
- panic(fmt.Errorf("failed to execute maskFromVal template for type %s: %w", typeDef.Name, err))
- }
- }
- }
- }
-
- return buffer
-}
-
-func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
- // Gather all features
- type featureKey struct {
- GoArch string
- Feature string
- }
- featureSet := make(map[featureKey]struct{})
- for _, op := range ops {
- // Generate a feature check for each independant feature in a
- // composite feature.
- for feature := range strings.SplitSeq(op.CPUFeature, ",") {
- feature = strings.TrimSpace(feature)
- featureSet[featureKey{op.GoArch, feature}] = struct{}{}
- }
- }
- features := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
- if c := cmp.Compare(a.GoArch, b.GoArch); c != 0 {
- return c
- }
- return compareNatural(a.Feature, b.Feature)
- })
-
- // If we ever have the same feature name on more than one GOARCH, we'll have
- // to be more careful about this.
- t := templateOf(simdFeaturesTemplate, "features")
-
- buffer := new(bytes.Buffer)
- buffer.WriteString(simdPackageHeader)
-
- if err := t.Execute(buffer, features); err != nil {
- panic(fmt.Errorf("failed to execute features template: %w", err))
- }
-
- return buffer
-}
-
-// writeSIMDStubs returns two bytes.Buffers containing the declarations for the public
-// and internal-use vector intrinsics.
-func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) {
- t := templateOf(simdStubsTmpl, "simdStubs")
- f = new(bytes.Buffer)
- fI = new(bytes.Buffer)
- f.WriteString(simdPackageHeader)
- fI.WriteString(simdPackageHeader)
-
- slices.SortFunc(ops, compareOperations)
-
- for i, op := range ops {
- if op.NoTypes != nil && *op.NoTypes == "true" {
- continue
- }
- if op.SkipMaskedMethod() {
- continue
- }
- idxVecAsScalar, err := checkVecAsScalar(op)
- if err != nil {
- panic(err)
- }
- if s, op, err := classifyOp(op); err == nil {
- if idxVecAsScalar != -1 {
- if s == "op2" || s == "op3" {
- s += "VecAsScalar"
- } else {
- panic(fmt.Errorf("simdgen only supports op2 or op3 with TreatLikeAScalarOfSize"))
- }
- }
- if i == 0 || op.Go != ops[i-1].Go {
- if unicode.IsUpper([]rune(op.Go)[0]) {
- fmt.Fprintf(f, "\n/* %s */\n", op.Go)
- } else {
- fmt.Fprintf(fI, "\n/* %s */\n", op.Go)
- }
- }
- if unicode.IsUpper([]rune(op.Go)[0]) {
- if err := t.ExecuteTemplate(f, s, op); err != nil {
- panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
- }
- } else {
- if err := t.ExecuteTemplate(fI, s, op); err != nil {
- panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
- }
- }
- } else {
- panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err))
- }
- }
-
- vectorConversions := vConvertFromTypeMap(typeMap)
- for _, conv := range vectorConversions {
- if err := t.ExecuteTemplate(f, "vectorConversion", conv); err != nil {
- panic(fmt.Errorf("failed to execute vectorConversion template: %w", err))
- }
- }
-
- masks := masksFromTypeMap(typeMap)
- for _, mask := range masks {
- if err := t.ExecuteTemplate(f, "mask", mask); err != nil {
- panic(fmt.Errorf("failed to execute mask template for mask %s: %w", mask.Name, err))
- }
- }
-
- return
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "fmt"
- "slices"
- "strings"
- "text/template"
-)
-
-type tplRuleData struct {
- tplName string // e.g. "sftimm"
- GoOp string // e.g. "ShiftAllLeft"
- GoType string // e.g. "Uint32x8"
- Args string // e.g. "x y"
- Asm string // e.g. "VPSLLD256"
- ArgsOut string // e.g. "x y"
- MaskInConvert string // e.g. "VPMOVVec32x8ToM"
- MaskOutConvert string // e.g. "VPMOVMToVec32x8"
- ElementSize int // e.g. 32
- Size int // e.g. 128
- ArgsLoadAddr string // [Args] with its last vreg arg being a concrete "(VMOVDQUload* ptr mem)", and might contain mask.
- ArgsAddr string // [Args] with its last vreg arg being replaced by "ptr", and might contain mask, and with a "mem" at the end.
- FeatCheck string // e.g. "v.Block.CPUfeatures.hasFeature(CPUavx512)" -- for a ssa/_gen rules file.
-}
-
-var (
- ruleTemplates = template.Must(template.New("simdRules").Parse(`
-{{define "pureVreg"}}({{.GoOp}}{{.GoType}} {{.Args}}) => ({{.Asm}} {{.ArgsOut}})
-{{end}}
-{{define "maskIn"}}({{.GoOp}}{{.GoType}} {{.Args}} mask) => ({{.Asm}} {{.ArgsOut}} ({{.MaskInConvert}} <types.TypeMask> mask))
-{{end}}
-{{define "maskOut"}}({{.GoOp}}{{.GoType}} {{.Args}}) => ({{.MaskOutConvert}} ({{.Asm}} {{.ArgsOut}}))
-{{end}}
-{{define "maskInMaskOut"}}({{.GoOp}}{{.GoType}} {{.Args}} mask) => ({{.MaskOutConvert}} ({{.Asm}} {{.ArgsOut}} ({{.MaskInConvert}} <types.TypeMask> mask)))
-{{end}}
-{{define "sftimm"}}({{.Asm}} x (MOVQconst [c])) => ({{.Asm}}const [uint8(c)] x)
-{{end}}
-{{define "masksftimm"}}({{.Asm}} x (MOVQconst [c]) mask) => ({{.Asm}}const [uint8(c)] x mask)
-{{end}}
-{{define "vregMem"}}({{.Asm}} {{.ArgsLoadAddr}}) && canMergeLoad(v, l) && clobber(l) => ({{.Asm}}load {{.ArgsAddr}})
-{{end}}
-{{define "vregMemFeatCheck"}}({{.Asm}} {{.ArgsLoadAddr}}) && {{.FeatCheck}} && canMergeLoad(v, l) && clobber(l)=> ({{.Asm}}load {{.ArgsAddr}})
-{{end}}
-`))
-)
-
-func (d tplRuleData) MaskOptimization(asmCheck map[string]bool) string {
- asmNoMask := d.Asm
- if i := strings.Index(asmNoMask, "Masked"); i == -1 {
- return ""
- }
- asmNoMask = strings.ReplaceAll(asmNoMask, "Masked", "")
- if asmCheck[asmNoMask] == false {
- return ""
- }
-
- for _, nope := range []string{"VMOVDQU", "VPCOMPRESS", "VCOMPRESS", "VPEXPAND", "VEXPAND", "VPBLENDM", "VMOVUP"} {
- if strings.HasPrefix(asmNoMask, nope) {
- return ""
- }
- }
-
- size := asmNoMask[len(asmNoMask)-3:]
- if strings.HasSuffix(asmNoMask, "const") {
- sufLen := len("128const")
- size = asmNoMask[len(asmNoMask)-sufLen:][:3]
- }
- switch size {
- case "128", "256", "512":
- default:
- panic("Unexpected operation size on " + d.Asm)
- }
-
- switch d.ElementSize {
- case 8, 16, 32, 64:
- default:
- panic(fmt.Errorf("Unexpected operation width %d on %v", d.ElementSize, d.Asm))
- }
-
- return fmt.Sprintf("(VMOVDQU%dMasked%s (%s %s) mask) => (%s %s mask)\n", d.ElementSize, size, asmNoMask, d.Args, d.Asm, d.Args)
-}
-
-// SSA rewrite rules need to appear in a most-to-least-specific order. This works for that.
-var tmplOrder = map[string]int{
- "masksftimm": 0,
- "sftimm": 1,
- "maskInMaskOut": 2,
- "maskOut": 3,
- "maskIn": 4,
- "pureVreg": 5,
- "vregMem": 6,
-}
-
-func compareTplRuleData(x, y tplRuleData) int {
- if c := compareNatural(x.GoOp, y.GoOp); c != 0 {
- return c
- }
- if c := compareNatural(x.GoType, y.GoType); c != 0 {
- return c
- }
- if c := compareNatural(x.Args, y.Args); c != 0 {
- return c
- }
- if x.tplName == y.tplName {
- return 0
- }
- xo, xok := tmplOrder[x.tplName]
- yo, yok := tmplOrder[y.tplName]
- if !xok {
- panic(fmt.Errorf("Unexpected template name %s, please add to tmplOrder", x.tplName))
- }
- if !yok {
- panic(fmt.Errorf("Unexpected template name %s, please add to tmplOrder", y.tplName))
- }
- return xo - yo
-}
-
-// writeSIMDRules generates the lowering and rewrite rules for ssa and writes it to simdAMD64.rules
-// within the specified directory.
-func writeSIMDRules(ops []Operation) *bytes.Buffer {
- buffer := new(bytes.Buffer)
- buffer.WriteString(generatedHeader + "\n")
-
- // asm -> masked merging rules
- maskedMergeOpts := make(map[string]string)
- s2n := map[int]string{8: "B", 16: "W", 32: "D", 64: "Q"}
- asmCheck := map[string]bool{}
- var allData []tplRuleData
- var optData []tplRuleData // for mask peephole optimizations, and other misc
- var memOptData []tplRuleData // for memory peephole optimizations
- memOpSeen := make(map[string]bool)
-
- for _, opr := range ops {
- opInShape, opOutShape, maskType, immType, gOp := opr.shape()
- asm := machineOpName(maskType, gOp)
- vregInCnt := len(gOp.In)
- if maskType == OneMask {
- vregInCnt--
- }
-
- data := tplRuleData{
- GoOp: gOp.Go,
- Asm: asm,
- }
-
- if vregInCnt == 1 {
- data.Args = "x"
- data.ArgsOut = data.Args
- } else if vregInCnt == 2 {
- data.Args = "x y"
- data.ArgsOut = data.Args
- } else if vregInCnt == 3 {
- data.Args = "x y z"
- data.ArgsOut = data.Args
- } else {
- panic(fmt.Errorf("simdgen does not support more than 3 vreg in inputs"))
- }
- if immType == ConstImm {
- data.ArgsOut = fmt.Sprintf("[%s] %s", *opr.In[0].Const, data.ArgsOut)
- } else if immType == VarImm {
- data.Args = fmt.Sprintf("[a] %s", data.Args)
- data.ArgsOut = fmt.Sprintf("[a] %s", data.ArgsOut)
- } else if immType == ConstVarImm {
- data.Args = fmt.Sprintf("[a] %s", data.Args)
- data.ArgsOut = fmt.Sprintf("[a+%s] %s", *opr.In[0].Const, data.ArgsOut)
- }
-
- goType := func(op Operation) string {
- if op.OperandOrder != nil {
- switch *op.OperandOrder {
- case "21Type1", "231Type1":
- // Permute uses operand[1] for method receiver.
- return *op.In[1].Go
- }
- }
- return *op.In[0].Go
- }
- var tplName string
- // If class overwrite is happening, that's not really a mask but a vreg.
- if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
- switch opInShape {
- case OneImmIn:
- tplName = "pureVreg"
- data.GoType = goType(gOp)
- case PureVregIn:
- tplName = "pureVreg"
- data.GoType = goType(gOp)
- case OneKmaskImmIn:
- fallthrough
- case OneKmaskIn:
- tplName = "maskIn"
- data.GoType = goType(gOp)
- rearIdx := len(gOp.In) - 1
- // Mask is at the end.
- width := *gOp.In[rearIdx].ElemBits
- data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", width, *gOp.In[rearIdx].Lanes)
- data.ElementSize = width
- case PureKmaskIn:
- panic(fmt.Errorf("simdgen does not support pure k mask instructions, they should be generated by compiler optimizations"))
- }
- } else if opOutShape == OneGregOut {
- tplName = "pureVreg" // TODO this will be wrong
- data.GoType = goType(gOp)
- } else {
- // OneKmaskOut case
- data.MaskOutConvert = fmt.Sprintf("VPMOVMToVec%dx%d", *gOp.Out[0].ElemBits, *gOp.In[0].Lanes)
- switch opInShape {
- case OneImmIn:
- fallthrough
- case PureVregIn:
- tplName = "maskOut"
- data.GoType = goType(gOp)
- case OneKmaskImmIn:
- fallthrough
- case OneKmaskIn:
- tplName = "maskInMaskOut"
- data.GoType = goType(gOp)
- rearIdx := len(gOp.In) - 1
- data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", *gOp.In[rearIdx].ElemBits, *gOp.In[rearIdx].Lanes)
- case PureKmaskIn:
- panic(fmt.Errorf("simdgen does not support pure k mask instructions, they should be generated by compiler optimizations"))
- }
- }
-
- if gOp.SpecialLower != nil {
- if *gOp.SpecialLower == "sftimm" {
- if data.GoType[0] == 'I' {
- // only do these for signed types, it is a duplicate rewrite for unsigned
- sftImmData := data
- if tplName == "maskIn" {
- sftImmData.tplName = "masksftimm"
- } else {
- sftImmData.tplName = "sftimm"
- }
- allData = append(allData, sftImmData)
- asmCheck[sftImmData.Asm+"const"] = true
- }
- } else {
- panic("simdgen sees unknwon special lower " + *gOp.SpecialLower + ", maybe implement it?")
- }
- }
- if gOp.MemFeatures != nil && *gOp.MemFeatures == "vbcst" {
- // sanity check
- selected := true
- for _, a := range gOp.In {
- if a.TreatLikeAScalarOfSize != nil {
- selected = false
- break
- }
- }
- if _, ok := memOpSeen[data.Asm]; ok {
- selected = false
- }
- if selected {
- memOpSeen[data.Asm] = true
- lastVreg := gOp.In[vregInCnt-1]
- // sanity check
- if lastVreg.Class != "vreg" {
- panic(fmt.Errorf("simdgen expects vbcst replaced operand to be a vreg, but %v found", lastVreg))
- }
- memOpData := data
- // Remove the last vreg from the arg and change it to a load.
- origArgs := data.Args[:len(data.Args)-1]
- // Prepare imm args.
- immArg := ""
- immArgCombineOff := " [off] "
- if immType != NoImm && immType != InvalidImm {
- _, after, found := strings.Cut(origArgs, "]")
- if found {
- origArgs = after
- }
- immArg = "[c] "
- immArgCombineOff = " [makeValAndOff(int32(int8(c)),off)] "
- }
- memOpData.ArgsLoadAddr = immArg + origArgs + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits)
- // Remove the last vreg from the arg and change it to "ptr".
- memOpData.ArgsAddr = "{sym}" + immArgCombineOff + origArgs + "ptr"
- if maskType == OneMask {
- memOpData.ArgsAddr += " mask"
- memOpData.ArgsLoadAddr += " mask"
- }
- memOpData.ArgsAddr += " mem"
- if gOp.MemFeaturesData != nil {
- _, feat2 := getVbcstData(*gOp.MemFeaturesData)
- knownFeatChecks := map[string]string{
- "AVX": "v.Block.CPUfeatures.hasFeature(CPUavx)",
- "AVX2": "v.Block.CPUfeatures.hasFeature(CPUavx2)",
- "AVX512": "v.Block.CPUfeatures.hasFeature(CPUavx512)",
- }
- memOpData.FeatCheck = knownFeatChecks[feat2]
- memOpData.tplName = "vregMemFeatCheck"
- } else {
- memOpData.tplName = "vregMem"
- }
- memOptData = append(memOptData, memOpData)
- asmCheck[memOpData.Asm+"load"] = true
- }
- }
- // Generate the masked merging optimization rules
- if gOp.hasMaskedMerging(maskType, opOutShape) {
- // TODO: handle customized operand order and special lower.
- maskElem := gOp.In[len(gOp.In)-1]
- if maskElem.Bits == nil {
- panic("mask has no bits")
- }
- if maskElem.ElemBits == nil {
- panic("mask has no elemBits")
- }
- if maskElem.Lanes == nil {
- panic("mask has no lanes")
- }
- switch *maskElem.Bits {
- case 128, 256:
- // VPBLENDVB cases.
- noMaskName := machineOpName(NoMask, gOp)
- ruleExisting, ok := maskedMergeOpts[noMaskName]
- rule := fmt.Sprintf("(VPBLENDVB%d dst (%s %s) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (%sMerging dst %s (VPMOVVec%dx%dToM <types.TypeMask> mask))\n",
- *maskElem.Bits, noMaskName, data.Args, data.Asm, data.Args, *maskElem.ElemBits, *maskElem.Lanes)
- if ok && ruleExisting != rule {
- panic(fmt.Sprintf("multiple masked merge rules for one op:\n%s\n%s\n", ruleExisting, rule))
- } else {
- maskedMergeOpts[noMaskName] = rule
- }
- case 512:
- // VPBLENDM[BWDQ] cases.
- noMaskName := machineOpName(NoMask, gOp)
- ruleExisting, ok := maskedMergeOpts[noMaskName]
- rule := fmt.Sprintf("(VPBLENDM%sMasked%d dst (%s %s) mask) => (%sMerging dst %s mask)\n",
- s2n[*maskElem.ElemBits], *maskElem.Bits, noMaskName, data.Args, data.Asm, data.Args)
- if ok && ruleExisting != rule {
- panic(fmt.Sprintf("multiple masked merge rules for one op:\n%s\n%s\n", ruleExisting, rule))
- } else {
- maskedMergeOpts[noMaskName] = rule
- }
- }
- }
-
- if tplName == "pureVreg" && data.Args == data.ArgsOut {
- data.Args = "..."
- data.ArgsOut = "..."
- }
- data.tplName = tplName
- if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
- opr.SkipMaskedMethod() {
- optData = append(optData, data)
- continue
- }
- allData = append(allData, data)
- asmCheck[data.Asm] = true
- }
-
- slices.SortFunc(allData, compareTplRuleData)
-
- for _, data := range allData {
- if err := ruleTemplates.ExecuteTemplate(buffer, data.tplName, data); err != nil {
- panic(fmt.Errorf("failed to execute template %s for %s: %w", data.tplName, data.GoOp+data.GoType, err))
- }
- }
-
- seen := make(map[string]bool)
-
- for _, data := range optData {
- if data.tplName == "maskIn" {
- rule := data.MaskOptimization(asmCheck)
- if seen[rule] {
- continue
- }
- seen[rule] = true
- buffer.WriteString(rule)
- }
- }
-
- maskedMergeOptsRules := []string{}
- for asm, rule := range maskedMergeOpts {
- if !asmCheck[asm] {
- continue
- }
- maskedMergeOptsRules = append(maskedMergeOptsRules, rule)
- }
- slices.Sort(maskedMergeOptsRules)
- for _, rule := range maskedMergeOptsRules {
- buffer.WriteString(rule)
- }
-
- for _, data := range memOptData {
- if err := ruleTemplates.ExecuteTemplate(buffer, data.tplName, data); err != nil {
- panic(fmt.Errorf("failed to execute template %s for %s: %w", data.tplName, data.Asm, err))
- }
- }
-
- return buffer
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bytes"
- "fmt"
- "log"
- "strings"
- "text/template"
-)
-
-var (
- ssaTemplates = template.Must(template.New("simdSSA").Parse(`
-{{define "header"}}// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
-package amd64
-
-import (
- "cmd/compile/internal/ssa"
- "cmd/compile/internal/ssagen"
- "cmd/internal/obj"
- "cmd/internal/obj/x86"
-)
-
-func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
- var p *obj.Prog
- switch v.Op {{"{"}}{{end}}
-{{define "case"}}
- case {{.Cases}}:
- p = {{.Helper}}(s, v)
-{{end}}
-{{define "footer"}}
- default:
- // Unknown reg shape
- return false
- }
-{{end}}
-{{define "zeroing"}}
- // Masked operations are always compiled with zeroing.
- switch v.Op {
- case {{.}}:
- x86.ParseSuffix(p, "Z")
- }
-{{end}}
-{{define "ending"}}
- return true
-}
-{{end}}`))
-)
-
-type tplSSAData struct {
- Cases string
- Helper string
-}
-
-// writeSIMDSSA generates the ssa to prog lowering codes and writes it to simdssa.go
-// within the specified directory.
-func writeSIMDSSA(ops []Operation) *bytes.Buffer {
- var ZeroingMask []string
- regInfoKeys := []string{
- "v11",
- "v21",
- "v2k",
- "v2kv",
- "v2kk",
- "vkv",
- "v31",
- "v3kv",
- "v11Imm8",
- "vkvImm8",
- "v21Imm8",
- "v2kImm8",
- "v2kkImm8",
- "v31ResultInArg0",
- "v3kvResultInArg0",
- "vfpv",
- "vfpkv",
- "vgpvImm8",
- "vgpImm8",
- "v2kvImm8",
- "vkvload",
- "v21load",
- "v31loadResultInArg0",
- "v3kvloadResultInArg0",
- "v2kvload",
- "v2kload",
- "v11load",
- "v11loadImm8",
- "vkvloadImm8",
- "v21loadImm8",
- "v2kloadImm8",
- "v2kkloadImm8",
- "v2kvloadImm8",
- "v31ResultInArg0Imm8",
- "v31loadResultInArg0Imm8",
- "v21ResultInArg0",
- "v21ResultInArg0Imm8",
- "v31x0AtIn2ResultInArg0",
- "v2kvResultInArg0",
- }
- regInfoSet := map[string][]string{}
- for _, key := range regInfoKeys {
- regInfoSet[key] = []string{}
- }
-
- seen := map[string]struct{}{}
- allUnseen := make(map[string][]Operation)
- allUnseenCaseStr := make(map[string][]string)
- classifyOp := func(op Operation, maskType maskShape, shapeIn inShape, shapeOut outShape, caseStr string, mem memShape) error {
- regShape, err := op.regShape(mem)
- if err != nil {
- return err
- }
- if regShape == "v01load" {
- regShape = "vload"
- }
- if shapeOut == OneVregOutAtIn {
- regShape += "ResultInArg0"
- }
- if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
- regShape += "Imm8"
- }
- regShape, err = rewriteVecAsScalarRegInfo(op, regShape)
- if err != nil {
- return err
- }
- if _, ok := regInfoSet[regShape]; !ok {
- allUnseen[regShape] = append(allUnseen[regShape], op)
- allUnseenCaseStr[regShape] = append(allUnseenCaseStr[regShape], caseStr)
- }
- regInfoSet[regShape] = append(regInfoSet[regShape], caseStr)
- if mem == NoMem && op.hasMaskedMerging(maskType, shapeOut) {
- regShapeMerging := regShape
- if shapeOut != OneVregOutAtIn {
- // We have to copy the slice here because otherwise the sort would be
- // visible through other aliases of the backing array when no reslicing happens.
- newIn := make([]Operand, len(op.In), len(op.In)+1)
- copy(newIn, op.In)
- op.In = newIn
- op.In = append(op.In, op.Out[0])
- op.sortOperand()
- regShapeMerging, err = op.regShape(mem)
- regShapeMerging += "ResultInArg0"
- }
- if err != nil {
- return err
- }
- if _, ok := regInfoSet[regShapeMerging]; !ok {
- allUnseen[regShapeMerging] = append(allUnseen[regShapeMerging], op)
- allUnseenCaseStr[regShapeMerging] = append(allUnseenCaseStr[regShapeMerging], caseStr+"Merging")
- }
- regInfoSet[regShapeMerging] = append(regInfoSet[regShapeMerging], caseStr+"Merging")
- }
- return nil
- }
- for _, op := range ops {
- shapeIn, shapeOut, maskType, _, gOp := op.shape()
- asm := machineOpName(maskType, gOp)
- if _, ok := seen[asm]; ok {
- continue
- }
- seen[asm] = struct{}{}
- caseStr := fmt.Sprintf("ssa.OpAMD64%s", asm)
- isZeroMasking := false
- if shapeIn == OneKmaskIn || shapeIn == OneKmaskImmIn {
- if gOp.Zeroing == nil || *gOp.Zeroing {
- ZeroingMask = append(ZeroingMask, caseStr)
- isZeroMasking = true
- }
- }
- if err := classifyOp(op, maskType, shapeIn, shapeOut, caseStr, NoMem); err != nil {
- panic(err)
- }
- if op.MemFeatures != nil && *op.MemFeatures == "vbcst" {
- // Make a full vec memory variant
- op = rewriteLastVregToMem(op)
- // Ignore the error; it could be triggered by [checkVecAsScalar].
- // TODO: make [checkVecAsScalar] aware of mem ops.
- if err := classifyOp(op, maskType, shapeIn, shapeOut, caseStr+"load", VregMemIn); err != nil {
- if *Verbose {
- log.Printf("Seen error: %e", err)
- }
- } else if isZeroMasking {
- ZeroingMask = append(ZeroingMask, caseStr+"load")
- }
- }
- }
- if len(allUnseen) != 0 {
- allKeys := make([]string, 0)
- for k := range allUnseen {
- allKeys = append(allKeys, k)
- }
- panic(fmt.Errorf("unsupported register constraint for prog, please update gen_simdssa.go and amd64/ssa.go: %+v\nAll keys: %v\n, cases: %v\n", allUnseen, allKeys, allUnseenCaseStr))
- }
-
- buffer := new(bytes.Buffer)
-
- if err := ssaTemplates.ExecuteTemplate(buffer, "header", nil); err != nil {
- panic(fmt.Errorf("failed to execute header template: %w", err))
- }
-
- for _, regShape := range regInfoKeys {
- // Stable traversal of regInfoSet
- cases := regInfoSet[regShape]
- if len(cases) == 0 {
- continue
- }
- data := tplSSAData{
- Cases: strings.Join(cases, ",\n\t\t"),
- Helper: "simd" + capitalizeFirst(regShape),
- }
- if err := ssaTemplates.ExecuteTemplate(buffer, "case", data); err != nil {
- panic(fmt.Errorf("failed to execute case template for %s: %w", regShape, err))
- }
- }
-
- if err := ssaTemplates.ExecuteTemplate(buffer, "footer", nil); err != nil {
- panic(fmt.Errorf("failed to execute footer template: %w", err))
- }
-
- if len(ZeroingMask) != 0 {
- if err := ssaTemplates.ExecuteTemplate(buffer, "zeroing", strings.Join(ZeroingMask, ",\n\t\t")); err != nil {
- panic(fmt.Errorf("failed to execute footer template: %w", err))
- }
- }
-
- if err := ssaTemplates.ExecuteTemplate(buffer, "ending", nil); err != nil {
- panic(fmt.Errorf("failed to execute footer template: %w", err))
- }
-
- return buffer
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "bufio"
- "bytes"
- "fmt"
- "go/format"
- "log"
- "os"
- "path/filepath"
- "reflect"
- "slices"
- "sort"
- "strings"
- "text/template"
- "unicode"
-)
-
-func templateOf(temp, name string) *template.Template {
- t, err := template.New(name).Parse(temp)
- if err != nil {
- panic(fmt.Errorf("failed to parse template %s: %w", name, err))
- }
- return t
-}
-
-func createPath(goroot string, file string) (*os.File, error) {
- fp := filepath.Join(goroot, file)
- dir := filepath.Dir(fp)
- err := os.MkdirAll(dir, 0755)
- if err != nil {
- return nil, fmt.Errorf("failed to create directory %s: %w", dir, err)
- }
- f, err := os.Create(fp)
- if err != nil {
- return nil, fmt.Errorf("failed to create file %s: %w", fp, err)
- }
- return f, nil
-}
-
-func formatWriteAndClose(out *bytes.Buffer, goroot string, file string) {
- b, err := format.Source(out.Bytes())
- if err != nil {
- fmt.Fprintf(os.Stderr, "%v\n", err)
- fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
- fmt.Fprintf(os.Stderr, "%v\n", err)
- panic(err)
- } else {
- writeAndClose(b, goroot, file)
- }
-}
-
-func writeAndClose(b []byte, goroot string, file string) {
- ofile, err := createPath(goroot, file)
- if err != nil {
- panic(err)
- }
- ofile.Write(b)
- ofile.Close()
-}
-
-// numberLines takes a slice of bytes, and returns a string where each line
-// is numbered, starting from 1.
-func numberLines(data []byte) string {
- var buf bytes.Buffer
- r := bytes.NewReader(data)
- s := bufio.NewScanner(r)
- for i := 1; s.Scan(); i++ {
- fmt.Fprintf(&buf, "%d: %s\n", i, s.Text())
- }
- return buf.String()
-}
-
-type inShape uint8
-type outShape uint8
-type maskShape uint8
-type immShape uint8
-type memShape uint8
-
-const (
- InvalidIn inShape = iota
- PureVregIn // vector register input only
- OneKmaskIn // vector and kmask input
- OneImmIn // vector and immediate input
- OneKmaskImmIn // vector, kmask, and immediate inputs
- PureKmaskIn // only mask inputs.
-)
-
-const (
- InvalidOut outShape = iota
- NoOut // no output
- OneVregOut // (one) vector register output
- OneGregOut // (one) general register output
- OneKmaskOut // mask output
- OneVregOutAtIn // the first input is also the output
-)
-
-const (
- InvalidMask maskShape = iota
- NoMask // no mask
- OneMask // with mask (K1 to K7)
- AllMasks // a K mask instruction (K0-K7)
-)
-
-const (
- InvalidImm immShape = iota
- NoImm // no immediate
- ConstImm // const only immediate
- VarImm // pure imm argument provided by the users
- ConstVarImm // a combination of user arg and const
-)
-
-const (
- InvalidMem memShape = iota
- NoMem
- VregMemIn // The instruction contains a mem input which is loading a vreg.
-)
-
-// shape returns several enum values describing the shape of the operation,
-// together with a modified copy of the op:
-//
-// opNoImm is op with the immediate input (if any) removed.
-//
-// This function does not modify op.
-func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskShape, immType immShape,
- opNoImm Operation) {
- if len(op.Out) > 1 {
- panic(fmt.Errorf("simdgen only supports 1 output: %s", op))
- }
- var outputReg int
- if len(op.Out) == 1 {
- outputReg = op.Out[0].AsmPos
- if op.Out[0].Class == "vreg" {
- shapeOut = OneVregOut
- } else if op.Out[0].Class == "greg" {
- shapeOut = OneGregOut
- } else if op.Out[0].Class == "mask" {
- shapeOut = OneKmaskOut
- } else {
- panic(fmt.Errorf("simdgen only supports output of class vreg or mask: %s", op))
- }
- } else {
- shapeOut = NoOut
- // TODO: are these only Load/Stores?
- // We manually support Load and Store; are those enough?
- panic(fmt.Errorf("simdgen only supports 1 output: %s", op))
- }
- hasImm := false
- maskCount := 0
- hasVreg := false
- for _, in := range op.In {
- if in.AsmPos == outputReg {
- if shapeOut != OneVregOutAtIn && in.AsmPos == 0 && in.Class == "vreg" {
- shapeOut = OneVregOutAtIn
- } else {
- panic(fmt.Errorf("simdgen only support output and input sharing the same position case of \"the first input is vreg and the only output\": %s", op))
- }
- }
- if in.Class == "immediate" {
- // A manual check of the XED data found that AMD64 SIMD instructions have
- // at most one immediate, so we don't need to check that here.
- if *in.Bits != 8 {
- panic(fmt.Errorf("simdgen only supports immediates of 8 bits: %s", op))
- }
- hasImm = true
- } else if in.Class == "mask" {
- maskCount++
- } else {
- hasVreg = true
- }
- }
- opNoImm = *op
-
- removeImm := func(o *Operation) {
- o.In = o.In[1:]
- }
- if hasImm {
- removeImm(&opNoImm)
- if op.In[0].Const != nil {
- if op.In[0].ImmOffset != nil {
- immType = ConstVarImm
- } else {
- immType = ConstImm
- }
- } else if op.In[0].ImmOffset != nil {
- immType = VarImm
- } else {
- panic(fmt.Errorf("simdgen requires imm to have at least one of ImmOffset or Const set: %s", op))
- }
- } else {
- immType = NoImm
- }
- if maskCount == 0 {
- maskType = NoMask
- } else {
- maskType = OneMask
- }
- checkPureMask := func() bool {
- if hasImm {
- panic(fmt.Errorf("simdgen does not support immediates in pure mask operations: %s", op))
- }
- if hasVreg {
- panic(fmt.Errorf("simdgen does not support more than 1 masks in non-pure mask operations: %s", op))
- }
- return false
- }
- if !hasImm && maskCount == 0 {
- shapeIn = PureVregIn
- } else if !hasImm && maskCount > 0 {
- if maskCount == 1 {
- shapeIn = OneKmaskIn
- } else {
- if checkPureMask() {
- return
- }
- shapeIn = PureKmaskIn
- maskType = AllMasks
- }
- } else if hasImm && maskCount == 0 {
- shapeIn = OneImmIn
- } else {
- if maskCount == 1 {
- shapeIn = OneKmaskImmIn
- } else {
- checkPureMask()
- return
- }
- }
- return
-}
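// Illustrative sketch (not from the original source; assumes it lives in this
// package, and the operand values are hypothetical): classifying a masked
// compare-with-immediate operation via shape().
func exampleShape() {
	i := func(v int) *int { return &v }
	s := func(v string) *string { return &v }
	var op Operation
	op.Asm = "VPCMPD"
	op.Out = []Operand{{Class: "mask", AsmPos: 4, Bits: i(512), ElemBits: i(32)}}
	op.In = []Operand{
		{Class: "immediate", AsmPos: 0, Bits: i(8), ImmOffset: s("0")},
		{Class: "vreg", AsmPos: 1, Bits: i(512), ElemBits: i(32)},
		{Class: "vreg", AsmPos: 2, Bits: i(512), ElemBits: i(32)},
		{Class: "mask", AsmPos: 3, Bits: i(512), ElemBits: i(32)},
	}
	shapeIn, shapeOut, maskType, immType, opNoImm := op.shape()
	// Expect OneKmaskImmIn / OneKmaskOut / OneMask / VarImm, with the
	// immediate dropped from opNoImm.In.
	fmt.Println(shapeIn == OneKmaskImmIn, shapeOut == OneKmaskOut,
		maskType == OneMask, immType == VarImm, len(opNoImm.In) == 3)
}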
-
-// regShape returns a string representation of the register shape.
-func (op *Operation) regShape(mem memShape) (string, error) {
- _, _, _, _, gOp := op.shape()
- var regInfo, fixedName string
- var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
- for i, in := range gOp.In {
- switch in.Class {
- case "vreg":
- vRegInCnt++
- case "greg":
- gRegInCnt++
- case "mask":
- kMaskInCnt++
- case "memory":
- if mem != VregMemIn {
- panic("simdgen only knows VregMemIn in regShape")
- }
- memInCnt++
- vRegInCnt++
- }
- if in.FixedReg != nil {
- fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
- }
- }
- for i, out := range gOp.Out {
- // If class overwrite is happening, that's not really a mask but a vreg.
- if out.Class == "vreg" || out.OverwriteClass != nil {
- vRegOutCnt++
- } else if out.Class == "greg" {
- gRegOutCnt++
- } else if out.Class == "mask" {
- kMaskOutCnt++
- } else if out.Class == "memory" {
- if mem != VregMemIn {
- panic("simdgen only knows VregMemIn in regShape")
- }
- vRegOutCnt++
- memOutCnt++
- }
- if out.FixedReg != nil {
- fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
- }
- }
- var inRegs, inMasks, outRegs, outMasks string
-
- rmAbbrev := func(s string, i int) string {
- if i == 0 {
- return ""
- }
- if i == 1 {
- return s
- }
- return fmt.Sprintf("%s%d", s, i)
-
- }
-
- inRegs = rmAbbrev("v", vRegInCnt)
- inRegs += rmAbbrev("gp", gRegInCnt)
- inMasks = rmAbbrev("k", kMaskInCnt)
-
- outRegs = rmAbbrev("v", vRegOutCnt)
- outRegs += rmAbbrev("gp", gRegOutCnt)
- outMasks = rmAbbrev("k", kMaskOutCnt)
-
- if kMaskInCnt == 0 && kMaskOutCnt == 0 && gRegInCnt == 0 && gRegOutCnt == 0 {
- // For pure v we can abbreviate it as v%d%d.
- regInfo = fmt.Sprintf("v%d%d", vRegInCnt, vRegOutCnt)
- } else if kMaskInCnt == 0 && kMaskOutCnt == 0 {
- regInfo = fmt.Sprintf("%s%s", inRegs, outRegs)
- } else {
- regInfo = fmt.Sprintf("%s%s%s%s", inRegs, inMasks, outRegs, outMasks)
- }
- if memInCnt > 0 {
- if memInCnt == 1 {
- regInfo += "load"
- } else {
- panic("simdgen does not understand more than 1 mem op as of now")
- }
- }
- if memOutCnt > 0 {
- panic("simdgen does not understand memory as output as of now")
- }
- regInfo += fixedName
- return regInfo, nil
-}
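// A small sketch (not from the original source; same-package assumption,
// made-up operands): a typical register-shape string produced by regShape.
func exampleRegShape() {
	var op Operation
	op.Asm = "VPADDD"
	op.In = []Operand{
		{Class: "vreg", AsmPos: 1},
		{Class: "vreg", AsmPos: 2},
		{Class: "mask", AsmPos: 3},
	}
	op.Out = []Operand{{Class: "vreg", AsmPos: 0}}
	shape, err := op.regShape(NoMem)
	fmt.Println(shape, err) // "v2kv <nil>": two vregs and a mask in, one vreg out
}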
-
-// sortOperand sorts op.In by putting immediates first, then vregs, and masks last.
-// TODO: verify that this is a safe assumption about the prog structure.
-// From observation, in asm the immediates are always first and the masks are
-// always last, with vregs in between.
-func (op *Operation) sortOperand() {
- priority := map[string]int{"immediate": 0, "vreg": 1, "greg": 1, "mask": 2}
- sort.SliceStable(op.In, func(i, j int) bool {
- pi := priority[op.In[i].Class]
- pj := priority[op.In[j].Class]
- if pi != pj {
- return pi < pj
- }
- return op.In[i].AsmPos < op.In[j].AsmPos
- })
-}
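// Sketch (not from the original source; same-package assumption): sortOperand
// reorders hypothetical operands so immediates come first and masks last, with
// registers ordered by assembly position in between.
func exampleSortOperand() {
	var op Operation
	op.In = []Operand{
		{Class: "mask", AsmPos: 3},
		{Class: "vreg", AsmPos: 2},
		{Class: "immediate", AsmPos: 0},
		{Class: "vreg", AsmPos: 1},
	}
	op.sortOperand()
	for _, in := range op.In {
		fmt.Print(in.Class, " ")
	}
	fmt.Println() // immediate vreg vreg mask
}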
-
-// adjustAsm adjusts the asm to make it align with Go's assembler.
-func (op *Operation) adjustAsm() {
- if op.Asm == "VCVTTPD2DQ" || op.Asm == "VCVTTPD2UDQ" ||
- op.Asm == "VCVTQQ2PS" || op.Asm == "VCVTUQQ2PS" ||
- op.Asm == "VCVTPD2PS" {
- switch *op.In[0].Bits {
- case 128:
- op.Asm += "X"
- case 256:
- op.Asm += "Y"
- }
- }
-}
-
-// goNormalType returns the Go type name for the result of an Op that
-// does not return a vector, i.e., that returns a result in a general
-// register. Currently there's only one family of Ops in Go's simd library
-// that does this (GetElem), and so this is specialized to work for that,
-// but the problem (mismatch between hardware register width and Go type
-// width) seems likely to recur if there are any other cases.
-func (op Operation) goNormalType() string {
- if op.Go == "GetElem" {
- // GetElem returns an element of the vector into a general register
- // but as far as the hardware is concerned, that result is either 32
- // or 64 bits wide, no matter what the vector element width is.
- // This is not "wrong" but it is not the right answer for Go source code.
- // To get the Go type right, combine the base type ("int", "uint", "float"),
- // with the input vector element width in bits (8,16,32,64).
-
- at := 0 // proper value of at depends on whether immediate was stripped or not
- if op.In[at].Class == "immediate" {
- at++
- }
- return fmt.Sprintf("%s%d", *op.Out[0].Base, *op.In[at].ElemBits)
- }
- panic(fmt.Errorf("Implement goNormalType for %v", op))
-}
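// Sketch (not from the original source; same-package assumption, hypothetical
// GetElem-style operation): the Go result type comes from the input element
// width, not the hardware register width.
func exampleGoNormalType() {
	i := func(v int) *int { return &v }
	s := func(v string) *string { return &v }
	var op Operation
	op.Go = "GetElem"
	op.In = []Operand{
		{Class: "immediate", Bits: i(8)},
		{Class: "vreg", ElemBits: i(32), Bits: i(128)},
	}
	op.Out = []Operand{{Class: "greg", Base: s("int")}}
	fmt.Println(op.goNormalType()) // "int32"
}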
-
-// SSAType returns the string for the type reference in SSA generation,
-// for example in the intrinsics generating template.
-func (op Operation) SSAType() string {
- if op.Out[0].Class == "greg" {
- return fmt.Sprintf("types.Types[types.T%s]", strings.ToUpper(op.goNormalType()))
- }
- return fmt.Sprintf("types.TypeVec%d", *op.Out[0].Bits)
-}
-
-// GoType returns the Go type returned by this operation (relative to the simd package),
-// for example "int32" or "Int8x16". This is used in a template.
-func (op Operation) GoType() string {
- if op.Out[0].Class == "greg" {
- return op.goNormalType()
- }
- return *op.Out[0].Go
-}
-
-// ImmName returns the name to use for an operation's immediate operand.
-// This can be overriden in the yaml with "name" on an operand,
-// otherwise, for now, "constant"
-func (op Operation) ImmName() string {
- return op.Op0Name("constant")
-}
-
-func (o Operand) OpName(s string) string {
- if n := o.Name; n != nil {
- return *n
- }
- if o.Class == "mask" {
- return "mask"
- }
- return s
-}
-
-func (o Operand) OpNameAndType(s string) string {
- return o.OpName(s) + " " + *o.Go
-}
-
-// GoExported returns [Go] with first character capitalized.
-func (op Operation) GoExported() string {
- return capitalizeFirst(op.Go)
-}
-
-// DocumentationExported returns [Documentation] with method name capitalized.
-func (op Operation) DocumentationExported() string {
- return strings.ReplaceAll(op.Documentation, op.Go, op.GoExported())
-}
-
-// Op0Name returns the name to use for the 0 operand,
-// if any is present, otherwise the parameter is used.
-func (op Operation) Op0Name(s string) string {
- return op.In[0].OpName(s)
-}
-
-// Op1Name returns the name to use for the 1 operand,
-// if any is present, otherwise the parameter is used.
-func (op Operation) Op1Name(s string) string {
- return op.In[1].OpName(s)
-}
-
-// Op2Name returns the name to use for the 2 operand,
-// if any is present, otherwise the parameter is used.
-func (op Operation) Op2Name(s string) string {
- return op.In[2].OpName(s)
-}
-
-// Op3Name returns the name to use for the 3 operand,
-// if any is present, otherwise the parameter is used.
-func (op Operation) Op3Name(s string) string {
- return op.In[3].OpName(s)
-}
-
-// Op0NameAndType returns the name and type to use for
-// the 0 operand, if a name is provided, otherwise
-// the parameter value is used as the default.
-func (op Operation) Op0NameAndType(s string) string {
- return op.In[0].OpNameAndType(s)
-}
-
-// Op1NameAndType returns the name and type to use for
-// the 1 operand, if a name is provided, otherwise
-// the parameter value is used as the default.
-func (op Operation) Op1NameAndType(s string) string {
- return op.In[1].OpNameAndType(s)
-}
-
-// Op2NameAndType returns the name and type to use for
-// the 2 operand, if a name is provided, otherwise
-// the parameter value is used as the default.
-func (op Operation) Op2NameAndType(s string) string {
- return op.In[2].OpNameAndType(s)
-}
-
-// Op3NameAndType returns the name and type to use for
-// the 3 operand, if a name is provided, otherwise
-// the parameter value is used as the default.
-func (op Operation) Op3NameAndType(s string) string {
- return op.In[3].OpNameAndType(s)
-}
-
-// Op4NameAndType returns the name and type to use for
-// the 4 operand, if a name is provided, otherwise
-// the parameter value is used as the default.
-func (op Operation) Op4NameAndType(s string) string {
- return op.In[4].OpNameAndType(s)
-}
-
-var immClasses []string = []string{"BAD0Imm", "BAD1Imm", "op1Imm8", "op2Imm8", "op3Imm8", "op4Imm8"}
-var classes []string = []string{"BAD0", "op1", "op2", "op3", "op4"}
-
-// classifyOp returns a classification string, a modified operation, and possibly an error,
-// based on the stub and intrinsic shape of the operation.
-// The classification string is in the regular expression set "op[1234](Imm8)?(_<order>)?"
-// where the "<order>" suffix is optionally attached to the Operation in its input yaml.
-// The classification string is used to select a template or a clause of a template
-// for intrinsics declaration and the ssagen intrinsics glue code in the compiler.
-func classifyOp(op Operation) (string, Operation, error) {
- _, _, _, immType, gOp := op.shape()
-
- var class string
-
- if immType == VarImm || immType == ConstVarImm {
- switch l := len(op.In); l {
- case 1:
- return "", op, fmt.Errorf("simdgen does not recognize this operation of only immediate input: %s", op)
- case 2, 3, 4, 5:
- class = immClasses[l]
- default:
- return "", op, fmt.Errorf("simdgen does not recognize this operation of input length %d: %s", len(op.In), op)
- }
- if order := op.OperandOrder; order != nil {
- class += "_" + *order
- }
- return class, op, nil
- } else {
- switch l := len(gOp.In); l {
- case 1, 2, 3, 4:
- class = classes[l]
- default:
- return "", op, fmt.Errorf("simdgen does not recognize this operation of input length %d: %s", len(op.In), op)
- }
- if order := op.OperandOrder; order != nil {
- class += "_" + *order
- }
- return class, gOp, nil
- }
-}
-
-func checkVecAsScalar(op Operation) (idx int, err error) {
- idx = -1
- sSize := 0
- for i, o := range op.In {
- if o.TreatLikeAScalarOfSize != nil {
- if idx == -1 {
- idx = i
- sSize = *o.TreatLikeAScalarOfSize
- } else {
- err = fmt.Errorf("simdgen only supports one TreatLikeAScalarOfSize in the arg list: %s", op)
- return
- }
- }
- }
- if idx >= 0 {
- if sSize != 8 && sSize != 16 && sSize != 32 && sSize != 64 {
- err = fmt.Errorf("simdgen does not recognize this uint size: %d, %s", sSize, op)
- return
- }
- }
- return
-}
-
-func rewriteVecAsScalarRegInfo(op Operation, regInfo string) (string, error) {
- idx, err := checkVecAsScalar(op)
- if err != nil {
- return "", err
- }
- if idx != -1 {
- if regInfo == "v21" {
- regInfo = "vfpv"
- } else if regInfo == "v2kv" {
- regInfo = "vfpkv"
- } else if regInfo == "v31" {
- regInfo = "v2fpv"
- } else if regInfo == "v3kv" {
- regInfo = "v2fpkv"
- } else {
- return "", fmt.Errorf("simdgen does not recognize uses of treatLikeAScalarOfSize with op regShape %s in op: %s", regInfo, op)
- }
- }
- return regInfo, nil
-}
-
-func rewriteLastVregToMem(op Operation) Operation {
- newIn := make([]Operand, len(op.In))
- lastVregIdx := -1
- for i := range len(op.In) {
- newIn[i] = op.In[i]
- if op.In[i].Class == "vreg" {
- lastVregIdx = i
- }
- }
- // vbcst operations always place their mem operand where the last vreg is.
- if lastVregIdx == -1 {
- panic("simdgen cannot find one vreg in the mem op vreg original")
- }
- newIn[lastVregIdx].Class = "memory"
- op.In = newIn
-
- return op
-}
-
-// dedup deduplicates operations at the whole-structure level.
-func dedup(ops []Operation) (deduped []Operation) {
- for _, op := range ops {
- seen := false
- for _, dop := range deduped {
- if reflect.DeepEqual(op, dop) {
- seen = true
- break
- }
- }
- if !seen {
- deduped = append(deduped, op)
- }
- }
- return
-}
-
-func (op Operation) GenericName() string {
- if op.OperandOrder != nil {
- switch *op.OperandOrder {
- case "21Type1", "231Type1":
- // Permute uses operand[1] for method receiver.
- return op.Go + *op.In[1].Go
- }
- }
- if op.In[0].Class == "immediate" {
- return op.Go + *op.In[1].Go
- }
- return op.Go + *op.In[0].Go
-}
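// Sketch (not from the original source; same-package assumption, hypothetical
// operands): the generic op name is the Go method name plus the receiver type.
func exampleGenericName() {
	s := func(v string) *string { return &v }
	var op Operation
	op.Go = "AddMasked"
	op.In = []Operand{
		{Class: "vreg", Go: s("Int32x8")},
		{Class: "vreg", Go: s("Int32x8")},
		{Class: "mask", Go: s("Mask32x8")},
	}
	fmt.Println(op.GenericName()) // "AddMaskedInt32x8"
}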
-
-// dedupGodef deduplicates operations at the [Op.Go]+[*Op.In[0].Go] level.
-// Deduplication picks the least advanced architecture that satisfies the requirement;
-// AVX512 is least preferred.
-// If FlagReportDup is set, it reports the duplicates to the console instead of deduplicating.
-func dedupGodef(ops []Operation) ([]Operation, error) {
- seen := map[string][]Operation{}
- for _, op := range ops {
- _, _, _, _, gOp := op.shape()
-
- gN := gOp.GenericName()
- seen[gN] = append(seen[gN], op)
- }
- if *FlagReportDup {
- for gName, dup := range seen {
- if len(dup) > 1 {
- log.Printf("Duplicate for %s:\n", gName)
- for _, op := range dup {
- log.Printf("%s\n", op)
- }
- }
- }
- return ops, nil
- }
- isAVX512 := func(op Operation) bool {
- return strings.Contains(op.CPUFeature, "AVX512")
- }
- deduped := []Operation{}
- for _, dup := range seen {
- if len(dup) > 1 {
- slices.SortFunc(dup, func(i, j Operation) int {
- // Put non-AVX512 candidates at the beginning
- if !isAVX512(i) && isAVX512(j) {
- return -1
- }
- if isAVX512(i) && !isAVX512(j) {
- return 1
- }
- if i.CPUFeature != j.CPUFeature {
- return strings.Compare(i.CPUFeature, j.CPUFeature)
- }
- // Intel sometimes has duplicate definitions for the same instruction, which
- // confuses the XED mem-op merge logic: [MemFeatures] is attached to only one
- // of the essentially-duplicate instructions, so only that one has the proper
- // [MemFeatures] set. We have to make this sort deterministic for [MemFeatures].
- if i.MemFeatures != nil && j.MemFeatures == nil {
- return -1
- }
- if i.MemFeatures == nil && j.MemFeatures != nil {
- return 1
- }
- if i.Commutative != j.Commutative {
- if j.Commutative {
- return -1
- }
- return 1
- }
- // Their order does not matter anymore, at least for now.
- return 0
- })
- }
- deduped = append(deduped, dup[0])
- }
- slices.SortFunc(deduped, compareOperations)
- return deduped, nil
-}
-
-// Copy op.ConstImm to op.In[0].Const
-// This is a hack to reduce the size of defs we need for const imm operations.
-func copyConstImm(ops []Operation) error {
- for _, op := range ops {
- if op.ConstImm == nil {
- continue
- }
- _, _, _, immType, _ := op.shape()
-
- if immType == ConstImm || immType == ConstVarImm {
- op.In[0].Const = op.ConstImm
- }
- // Otherwise, just don't port it - e.g. {VPCMP[BWDQ] imm=0} and {VPCMPEQ[BWDQ]} are
- // the same operation "Equal"; [dedupGodef] should be able to distinguish them.
- }
- return nil
-}
-
-func capitalizeFirst(s string) string {
- if s == "" {
- return ""
- }
- // Convert the string to a slice of runes to handle multi-byte characters correctly.
- r := []rune(s)
- r[0] = unicode.ToUpper(r[0])
- return string(r)
-}
-
-// overwrite corrects some errors due to:
-// - The XED data being wrong
-// - Go's SIMD API requirements, for example AVX2 compares should also produce masks.
-// This rewrite has strict constraints, please see the error messages.
-// These constraints are also exploited in [writeSIMDRules], [writeSIMDMachineOps]
-// and [writeSIMDSSA], so please be careful when updating them.
-func overwrite(ops []Operation) error {
- hasClassOverwrite := false
- overwrite := func(op []Operand, idx int, o Operation) error {
- if op[idx].OverwriteElementBits != nil {
- if op[idx].ElemBits == nil {
- panic(fmt.Errorf("ElemBits is nil at operand %d of %v", idx, o))
- }
- *op[idx].ElemBits = *op[idx].OverwriteElementBits
- *op[idx].Lanes = *op[idx].Bits / *op[idx].ElemBits
- *op[idx].Go = fmt.Sprintf("%s%dx%d", capitalizeFirst(*op[idx].Base), *op[idx].ElemBits, *op[idx].Lanes)
- }
- if op[idx].OverwriteClass != nil {
- if op[idx].OverwriteBase == nil {
- panic(fmt.Errorf("simdgen: [OverwriteClass] must be set together with [OverwriteBase]: %s", op[idx]))
- }
- oBase := *op[idx].OverwriteBase
- oClass := *op[idx].OverwriteClass
- if oClass != "mask" {
- panic(fmt.Errorf("simdgen: [Class] overwrite only supports overwritting to mask: %s", op[idx]))
- }
- if oBase != "int" {
- panic(fmt.Errorf("simdgen: [Class] overwrite must set [OverwriteBase] to int: %s", op[idx]))
- }
- if op[idx].Class != "vreg" {
- panic(fmt.Errorf("simdgen: [Class] overwrite must be overwriting [Class] from vreg: %s", op[idx]))
- }
- hasClassOverwrite = true
- *op[idx].Base = oBase
- op[idx].Class = oClass
- *op[idx].Go = fmt.Sprintf("Mask%dx%d", *op[idx].ElemBits, *op[idx].Lanes)
- } else if op[idx].OverwriteBase != nil {
- oBase := *op[idx].OverwriteBase
- *op[idx].Go = strings.ReplaceAll(*op[idx].Go, capitalizeFirst(*op[idx].Base), capitalizeFirst(oBase))
- if op[idx].Class == "greg" {
- *op[idx].Go = strings.ReplaceAll(*op[idx].Go, *op[idx].Base, oBase)
- }
- *op[idx].Base = oBase
- }
- return nil
- }
- for i, o := range ops {
- hasClassOverwrite = false
- for j := range ops[i].In {
- if err := overwrite(ops[i].In, j, o); err != nil {
- return err
- }
- if hasClassOverwrite {
- return fmt.Errorf("simdgen does not support [OverwriteClass] in inputs: %s", ops[i])
- }
- }
- for j := range ops[i].Out {
- if err := overwrite(ops[i].Out, j, o); err != nil {
- return err
- }
- }
- if hasClassOverwrite {
- for _, in := range ops[i].In {
- if in.Class == "mask" {
- return fmt.Errorf("simdgen only supports [OverwriteClass] for operations without mask inputs")
- }
- }
- }
- }
- return nil
-}
-
-// reportXEDInconsistency reports potential XED inconsistencies.
-// We can add more fields to [Operation] to enable more checks and implement it here.
-// Supported checks:
-// [NameAndSizeCheck]: NAME[BWDQ] should set the elemBits accordingly.
-// This check is useful for finding inconsistencies; we can then add overwrite fields to
-// those defs to correct them manually.
-func reportXEDInconsistency(ops []Operation) error {
- for _, o := range ops {
- if o.NameAndSizeCheck != nil {
- suffixSizeMap := map[byte]int{'B': 8, 'W': 16, 'D': 32, 'Q': 64}
- checkOperand := func(opr Operand) error {
- if opr.ElemBits == nil {
- return fmt.Errorf("simdgen expects elemBits to be set when performing NameAndSizeCheck")
- }
- if v, ok := suffixSizeMap[o.Asm[len(o.Asm)-1]]; !ok {
- return fmt.Errorf("simdgen expects asm to end with [BWDQ] when performing NameAndSizeCheck")
- } else {
- if v != *opr.ElemBits {
- return fmt.Errorf("simdgen finds NameAndSizeCheck inconsistency in def: %s", o)
- }
- }
- return nil
- }
- for _, in := range o.In {
- if in.Class != "vreg" && in.Class != "mask" {
- continue
- }
- if in.TreatLikeAScalarOfSize != nil {
- // This is an irregular operand, don't check it.
- continue
- }
- if err := checkOperand(in); err != nil {
- return err
- }
- }
- for _, out := range o.Out {
- if err := checkOperand(out); err != nil {
- return err
- }
- }
- }
- }
- return nil
-}
-
-func (o *Operation) hasMaskedMerging(maskType maskShape, outType outShape) bool {
- // BLEND and VMOVDQU are not user-facing ops so we should filter them out.
- return o.OperandOrder == nil && o.SpecialLower == nil && maskType == OneMask && outType == OneVregOut &&
- len(o.InVariant) == 1 && !strings.Contains(o.Asm, "BLEND") && !strings.Contains(o.Asm, "VMOVDQU")
-}
-
-func getVbcstData(s string) (feat1Match, feat2Match string) {
- _, err := fmt.Sscanf(s, "feat1=%[^;];feat2=%s", &feat1Match, &feat2Match)
- if err != nil {
- panic(err)
- }
- return
-}
-
-func (o Operation) String() string {
- return pprints(o)
-}
-
-func (op Operand) String() string {
- return pprints(op)
-}
+++ /dev/null
-!import ops/*/go.yaml
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "fmt"
- "log"
- "regexp"
- "slices"
- "strconv"
- "strings"
- "unicode"
-
- "simd/_gen/unify"
-)
-
-type Operation struct {
- rawOperation
-
- // Go is the Go method name of this operation.
- //
- // It is derived from the raw Go method name by adding optional suffixes.
- // Currently, "Masked" is the only suffix.
- Go string
-
- // Documentation is the doc string for this API.
- //
- // It is computed from the raw documentation:
- //
- // - "NAME" is replaced by the Go method name.
- //
-// - For masked operations, a sentence about masking is added.
- Documentation string
-
- // In is the sequence of parameters to the Go method.
- //
- // For masked operations, this will have the mask operand appended.
- In []Operand
-}
-
-// rawOperation is the unifier representation of an [Operation]. It is
-// translated into a more parsed form after unifier decoding.
-type rawOperation struct {
- Go string // Base Go method name
-
- GoArch string // GOARCH for this definition
- Asm string // Assembly mnemonic
- OperandOrder *string // optional Operand order for better Go declarations
- // Optional tag to indicate this operation is paired with special generic->machine ssa lowering rules.
- // Should be paired with special templates in gen_simdrules.go
- SpecialLower *string
-
- In []Operand // Parameters
- InVariant []Operand // Optional parameters
- Out []Operand // Results
- MemFeatures *string // The memory operand feature this operation supports
- MemFeaturesData *string // Additional data associated with MemFeatures
- Commutative bool // Commutativity
- CPUFeature string // CPUID/Has* feature name
- Zeroing *bool // nil => use asm suffix ".Z"; false => do not use asm suffix ".Z"
- Documentation *string // Documentation will be appended to the stubs comments.
- AddDoc *string // Additional doc to be appended.
- // ConstImm is a hack to reduce the size of defs the user writes for const-immediate operations.
- // If present, it will be copied to [In[0].Const].
- ConstImm *string
- // NameAndSizeCheck is used to check [BWDQ] maps to (8|16|32|64) elemBits.
- NameAndSizeCheck *bool
- // If non-nil, all generation in gen_simdTypes.go and gen_intrinsics will be skipped.
- NoTypes *string
- // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped.
- NoGenericOps *string
- // If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
- SSAVariant *string
- // If true, do not emit method declarations, generic ops, or intrinsics for masked variants
- // DO emit the architecture-specific opcodes and optimizations.
- HideMaskMethods *bool
-}
-
-func (o *Operation) IsMasked() bool {
- if len(o.InVariant) == 0 {
- return false
- }
- if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
- return true
- }
- panic(fmt.Errorf("unknown inVariant"))
-}
-
-func (o *Operation) SkipMaskedMethod() bool {
- if o.HideMaskMethods == nil {
- return false
- }
- if *o.HideMaskMethods && o.IsMasked() {
- return true
- }
- return false
-}
-
-var reForName = regexp.MustCompile(`\bNAME\b`)
-
-func (o *Operation) DecodeUnified(v *unify.Value) error {
- if err := v.Decode(&o.rawOperation); err != nil {
- return err
- }
-
- isMasked := o.IsMasked()
-
- // Compute full Go method name.
- o.Go = o.rawOperation.Go
- if isMasked {
- o.Go += "Masked"
- }
-
- // Compute doc string.
- if o.rawOperation.Documentation != nil {
- o.Documentation = *o.rawOperation.Documentation
- } else {
- o.Documentation = "// UNDOCUMENTED"
- }
- o.Documentation = reForName.ReplaceAllString(o.Documentation, o.Go)
- if isMasked {
- o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
- // Suppress generic op and method declaration for exported methods, if a mask is present.
- if unicode.IsUpper([]rune(o.Go)[0]) {
- trueVal := "true"
- o.NoGenericOps = &trueVal
- o.NoTypes = &trueVal
- }
- }
- if o.rawOperation.AddDoc != nil {
- o.Documentation += "\n" + reForName.ReplaceAllString(*o.rawOperation.AddDoc, o.Go)
- }
-
- o.In = append(o.rawOperation.In, o.rawOperation.InVariant...)
-
- return nil
-}
-
-func (o *Operation) VectorWidth() int {
- out := o.Out[0]
- if out.Class == "vreg" {
- return *out.Bits
- } else if out.Class == "greg" || out.Class == "mask" {
- for i := range o.In {
- if o.In[i].Class == "vreg" {
- return *o.In[i].Bits
- }
- }
- }
- panic(fmt.Errorf("Figure out what the vector width is for %v and implement it", *o))
-}
-
-// Right now simdgen computes the machine op name for most instructions
-// as $Name$OutputSize; under that naming scheme, some instructions are "overloaded",
-// for example:
-// (Uint16x8) ConvertToInt8
-// (Uint16x16) ConvertToInt8
-// are both VPMOVWB128.
-// To make them distinguishable we need to append the input size to them as well.
-// TODO: document them well in the generated code.
-var demotingConvertOps = map[string]bool{
- "VPMOVQD128": true, "VPMOVSQD128": true, "VPMOVUSQD128": true, "VPMOVQW128": true, "VPMOVSQW128": true,
- "VPMOVUSQW128": true, "VPMOVDW128": true, "VPMOVSDW128": true, "VPMOVUSDW128": true, "VPMOVQB128": true,
- "VPMOVSQB128": true, "VPMOVUSQB128": true, "VPMOVDB128": true, "VPMOVSDB128": true, "VPMOVUSDB128": true,
- "VPMOVWB128": true, "VPMOVSWB128": true, "VPMOVUSWB128": true,
- "VPMOVQDMasked128": true, "VPMOVSQDMasked128": true, "VPMOVUSQDMasked128": true, "VPMOVQWMasked128": true, "VPMOVSQWMasked128": true,
- "VPMOVUSQWMasked128": true, "VPMOVDWMasked128": true, "VPMOVSDWMasked128": true, "VPMOVUSDWMasked128": true, "VPMOVQBMasked128": true,
- "VPMOVSQBMasked128": true, "VPMOVUSQBMasked128": true, "VPMOVDBMasked128": true, "VPMOVSDBMasked128": true, "VPMOVUSDBMasked128": true,
- "VPMOVWBMasked128": true, "VPMOVSWBMasked128": true, "VPMOVUSWBMasked128": true,
-}
-
-func machineOpName(maskType maskShape, gOp Operation) string {
- asm := gOp.Asm
- if maskType == OneMask {
- asm += "Masked"
- }
- asm = fmt.Sprintf("%s%d", asm, gOp.VectorWidth())
- if gOp.SSAVariant != nil {
- asm += *gOp.SSAVariant
- }
- if demotingConvertOps[asm] {
- // Need to append the size of the source as well.
- // TODO: should be "%sto%d".
- asm = fmt.Sprintf("%s_%d", asm, *gOp.In[0].Bits)
- }
- return asm
-}
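// Sketch (not from the original source; same-package assumption, hypothetical
// operation): machine op names are the mnemonic, an optional "Masked" suffix,
// and the vector width.
func exampleMachineOpName() {
	i := func(v int) *int { return &v }
	var op Operation
	op.Asm = "VPADDD"
	op.Out = []Operand{{Class: "vreg", Bits: i(512)}}
	fmt.Println(machineOpName(OneMask, op)) // "VPADDDMasked512"
	fmt.Println(machineOpName(NoMask, op))  // "VPADDD512"
}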
-
-func compareStringPointers(x, y *string) int {
- if x != nil && y != nil {
- return compareNatural(*x, *y)
- }
- if x == nil && y == nil {
- return 0
- }
- if x == nil {
- return -1
- }
- return 1
-}
-
-func compareIntPointers(x, y *int) int {
- if x != nil && y != nil {
- return *x - *y
- }
- if x == nil && y == nil {
- return 0
- }
- if x == nil {
- return -1
- }
- return 1
-}
-
-func compareOperations(x, y Operation) int {
- if c := compareNatural(x.Go, y.Go); c != 0 {
- return c
- }
- xIn, yIn := x.In, y.In
-
- if len(xIn) > len(yIn) && xIn[len(xIn)-1].Class == "mask" {
- xIn = xIn[:len(xIn)-1]
- } else if len(xIn) < len(yIn) && yIn[len(yIn)-1].Class == "mask" {
- yIn = yIn[:len(yIn)-1]
- }
-
- if len(xIn) < len(yIn) {
- return -1
- }
- if len(xIn) > len(yIn) {
- return 1
- }
- if len(x.Out) < len(y.Out) {
- return -1
- }
- if len(x.Out) > len(y.Out) {
- return 1
- }
- for i := range xIn {
- ox, oy := &xIn[i], &yIn[i]
- if c := compareOperands(ox, oy); c != 0 {
- return c
- }
- }
- return 0
-}
-
-func compareOperands(x, y *Operand) int {
- if c := compareNatural(x.Class, y.Class); c != 0 {
- return c
- }
- if x.Class == "immediate" {
- return compareStringPointers(x.ImmOffset, y.ImmOffset)
- } else {
- if c := compareStringPointers(x.Base, y.Base); c != 0 {
- return c
- }
- if c := compareIntPointers(x.ElemBits, y.ElemBits); c != 0 {
- return c
- }
- if c := compareIntPointers(x.Bits, y.Bits); c != 0 {
- return c
- }
- return 0
- }
-}
-
-type Operand struct {
- Class string // One of "mask", "immediate", "vreg", "greg", and "mem"
-
- Go *string // Go type of this operand
- AsmPos int // Position of this operand in the assembly instruction
-
- Base *string // Base Go type ("int", "uint", "float")
- ElemBits *int // Element bit width
- Bits *int // Total vector bit width
-
- Const *string // Optional constant value for immediates.
- // Optional immediate arg offsets. If this field is non-nil,
- // This operand will be an immediate operand:
- // The compiler will right-shift the user-passed value by ImmOffset and set it as the AuxInt
- // field of the operation.
- ImmOffset *string
- Name *string // optional name in the Go intrinsic declaration
- Lanes *int // *Lanes equals Bits/ElemBits except for scalars, when *Lanes == 1
- // TreatLikeAScalarOfSize means only the lower $TreatLikeAScalarOfSize bits of the vector
- // are used, so at the API level we can make it just a scalar value of this size; then we
- // can rewrite it to a vector of the right size during the intrinsics stage.
- TreatLikeAScalarOfSize *int
- // If non-nil, it means the [Class] field is overwritten here, right now this is used to
- // overwrite the results of AVX2 compares to masks.
- OverwriteClass *string
- // If non-nil, it means the [Base] field is overwritten here. This field exists solely
- // because Intel's XED data is inconsistent. e.g. VANDNP[SD] marks its operand int.
- OverwriteBase *string
- // If non-nil, it means the [ElemBits] field is overwritten. This field exists solely
- // because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
- // elemBits 16, which should be 8.
- OverwriteElementBits *int
- // FixedReg is the name of the fixed register, if any.
- FixedReg *string
-}
-
-// isDigit returns true if the byte is an ASCII digit.
-func isDigit(b byte) bool {
- return b >= '0' && b <= '9'
-}
-
-// compareNatural performs a "natural sort" comparison of two strings.
-// It compares non-digit sections lexicographically and digit sections
-// numerically. In the case of string-unequal "equal" strings like
-// "a01b" and "a1b", strings.Compare breaks the tie.
-//
-// It returns:
-//
-// -1 if s1 < s2
-// 0 if s1 == s2
-// +1 if s1 > s2
-func compareNatural(s1, s2 string) int {
- i, j := 0, 0
- len1, len2 := len(s1), len(s2)
-
- for i < len1 && j < len2 {
- // Find a non-digit segment or a number segment in both strings.
- if isDigit(s1[i]) && isDigit(s2[j]) {
- // Number segment comparison.
- numStart1 := i
- for i < len1 && isDigit(s1[i]) {
- i++
- }
- num1, _ := strconv.Atoi(s1[numStart1:i])
-
- numStart2 := j
- for j < len2 && isDigit(s2[j]) {
- j++
- }
- num2, _ := strconv.Atoi(s2[numStart2:j])
-
- if num1 < num2 {
- return -1
- }
- if num1 > num2 {
- return 1
- }
- // If numbers are equal, continue to the next segment.
- } else {
- // Non-digit comparison.
- if s1[i] < s2[j] {
- return -1
- }
- if s1[i] > s2[j] {
- return 1
- }
- i++
- j++
- }
- }
-
- // deal with a01b vs a1b; there needs to be an order.
- return strings.Compare(s1, s2)
-}
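// A minimal illustration (not from the original source; same-package
// assumption) of the natural ordering used by the generators: digit runs
// compare numerically, so width suffixes sort as numbers.
func exampleCompareNatural() {
	fmt.Println(compareNatural("Int8x16", "Int8x32") < 0) // true: 16 < 32
	fmt.Println(compareNatural("v2", "v10") < 0)          // true: 2 < 10
	fmt.Println(compareNatural("a01b", "a1b") != 0)       // true: tie broken by strings.Compare
}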
-
-const generatedHeader = `// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-`
-
-func writeGoDefs(path string, cl unify.Closure) error {
- // TODO: Merge operations with the same signature but multiple
- // implementations (e.g., SSE vs AVX)
- var ops []Operation
- for def := range cl.All() {
- var op Operation
- if !def.Exact() {
- continue
- }
- if err := def.Decode(&op); err != nil {
- log.Println(err.Error())
- log.Println(def)
- continue
- }
- // TODO: verify that this is safe.
- op.sortOperand()
- op.adjustAsm()
- ops = append(ops, op)
- }
- slices.SortFunc(ops, compareOperations)
- // The parsed XED data might contain duplicates, like
- // a 512-bit VPADDP.
- deduped := dedup(ops)
- slices.SortFunc(deduped, compareOperations)
-
- if *Verbose {
- log.Printf("dedup len: %d\n", len(ops))
- }
- var err error
- if err = overwrite(deduped); err != nil {
- return err
- }
- if *Verbose {
- log.Printf("dedup len: %d\n", len(deduped))
- }
- if !*FlagNoDedup {
- // TODO: This can hide mistakes in the API definitions, especially when
- // multiple patterns result in the same API unintentionally. Make it stricter.
- if deduped, err = dedupGodef(deduped); err != nil {
- return err
- }
- }
- if *Verbose {
- log.Printf("dedup len: %d\n", len(deduped))
- }
- if !*FlagNoConstImmPorting {
- if err = copyConstImm(deduped); err != nil {
- return err
- }
- }
- if *Verbose {
- log.Printf("dedup len: %d\n", len(deduped))
- }
- if err = reportXEDInconsistency(deduped); err != nil {
- return err
- }
- typeMap := parseSIMDTypes(deduped)
-
- formatWriteAndClose(writeSIMDTypes(typeMap), path, "src/"+simdPackage+"/types_amd64.go")
- formatWriteAndClose(writeSIMDFeatures(deduped), path, "src/"+simdPackage+"/cpu.go")
- f, fI := writeSIMDStubs(deduped, typeMap)
- formatWriteAndClose(f, path, "src/"+simdPackage+"/ops_amd64.go")
- formatWriteAndClose(fI, path, "src/"+simdPackage+"/ops_internal_amd64.go")
- formatWriteAndClose(writeSIMDIntrinsics(deduped, typeMap), path, "src/cmd/compile/internal/ssagen/simdintrinsics.go")
- formatWriteAndClose(writeSIMDGenericOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdgenericOps.go")
- formatWriteAndClose(writeSIMDMachineOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go")
- formatWriteAndClose(writeSIMDSSA(deduped), path, "src/cmd/compile/internal/amd64/simdssa.go")
- writeAndClose(writeSIMDRules(deduped).Bytes(), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64.rules")
-
- return nil
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// simdgen is an experiment in generating Go <-> asm SIMD mappings.
-//
-// Usage: simdgen [-xedPath=path] [-q=query] input.yaml...
-//
-// If -xedPath is provided, one of the inputs is a sum of op-code definitions
-// generated from the Intel XED data at path.
-//
-// If input YAML files are provided, each file is read as an input value. See
-// [unify.Closure.UnmarshalYAML] or "go doc unify.Closure.UnmarshalYAML" for the
-// format of these files.
-//
-// TODO: Example definitions and values.
-//
-// The command unifies across all of the inputs and prints all possible results
-// of this unification.
-//
-// If the -q flag is provided, its string value is parsed as a value and treated
-// as another input to unification. This is intended as a way to "query" the
-// result, typically by narrowing it down to a small subset of results.
-//
-// Typical usage:
-//
-// go run . -xedPath $XEDPATH *.yaml
-//
-// To see just the definitions generated from XED, run:
-//
-// go run . -xedPath $XEDPATH
-//
-// (This works because if there's only one input, there's nothing to unify it
-// with, so the result is simply itself.)
-//
-// To see just the definitions for VPADDQ:
-//
-// go run . -xedPath $XEDPATH -q '{asm: VPADDQ}'
-//
-// simdgen can also generate Go definitions of SIMD mappings:
-// To generate go files to the go root, run:
-//
-// go run . -xedPath $XEDPATH -o godefs -goroot $PATH/TO/go go.yaml categories.yaml types.yaml
-//
-// types.yaml is already written, it specifies the shapes of vectors.
-// categories.yaml and go.yaml contains definitions that unifies with types.yaml and XED
-// data, you can find an example in ops/AddSub/.
-//
-// When generating Go definitions, simdgen performs 3 "magics":
-// - It splits masked operations (with the op's [Masked] field set) into const and non-const variants:
-// - One is a normal masked operation, the original
-// - The other has its mask operand's [Const] fields set to "K0".
-// - This way the user does not need to provide a separate "K0"-masked operation def.
-//
-// - It deduplicates intrinsic names that have duplicates:
-// - If two operations share the same signature and one is AVX512 while the
-// other predates AVX512, the pre-AVX512 one is selected.
-// - This happens often when some operations are defined both before AVX512 and after.
-// This way the user does not need to provide a separate "K0" operation for the
-// AVX512 counterpart.
-//
-// - It copies the op's [ConstImm] field to its immediate operand's [Const] field.
-// - This way the user does not need to provide verbose op definition while only
-// the const immediate field is different. This is useful to reduce verbosity of
-// compares with imm control predicates.
-//
-// These 3 magics could be disabled by enabling -nosplitmask, -nodedup or
-// -noconstimmporting flags.
-//
-// simdgen right now only supports amd64, -arch=$OTHERARCH will trigger a fatal error.
-package main
-
-// Big TODOs:
-//
-// - This can produce duplicates, which can also lead to less efficient
-// environment merging. Add hashing and use it for deduplication. Be careful
-// about how this shows up in debug traces, since it could make things
-// confusing if we don't show it happening.
-//
-// - Do I need Closure, Value, and Domain? It feels like I should only need two
-// types.
-
-import (
- "cmp"
- "flag"
- "fmt"
- "log"
- "maps"
- "os"
- "path/filepath"
- "runtime/pprof"
- "slices"
- "strings"
-
- "simd/_gen/unify"
-
- "gopkg.in/yaml.v3"
-)
-
-var (
- xedPath = flag.String("xedPath", "", "load XED datafiles from `path`")
- flagQ = flag.String("q", "", "query: read `def` as another input (skips final validation)")
- flagO = flag.String("o", "yaml", "output type: yaml, godefs (generate definitions into a Go source tree")
- flagGoDefRoot = flag.String("goroot", ".", "the path to the Go dev directory that will receive the generated files")
- FlagNoDedup = flag.Bool("nodedup", false, "disable deduplicating godefs of 2 qualifying operations from different extensions")
- FlagNoConstImmPorting = flag.Bool("noconstimmporting", false, "disable const immediate porting from op to imm operand")
- FlagArch = flag.String("arch", "amd64", "the target architecture")
-
- Verbose = flag.Bool("v", false, "verbose")
-
- flagDebugXED = flag.Bool("debug-xed", false, "show XED instructions")
- flagDebugUnify = flag.Bool("debug-unify", false, "print unification trace")
- flagDebugHTML = flag.String("debug-html", "", "write unification trace to `file.html`")
- FlagReportDup = flag.Bool("reportdup", false, "report the duplicate godefs")
-
- flagCPUProfile = flag.String("cpuprofile", "", "write CPU profile to `file`")
- flagMemProfile = flag.String("memprofile", "", "write memory profile to `file`")
-)
-
-const simdPackage = "simd"
-
-func main() {
- flag.Parse()
-
- if *flagCPUProfile != "" {
- f, err := os.Create(*flagCPUProfile)
- if err != nil {
- log.Fatalf("-cpuprofile: %s", err)
- }
- defer f.Close()
- pprof.StartCPUProfile(f)
- defer pprof.StopCPUProfile()
- }
- if *flagMemProfile != "" {
- f, err := os.Create(*flagMemProfile)
- if err != nil {
- log.Fatalf("-memprofile: %s", err)
- }
- defer func() {
- pprof.WriteHeapProfile(f)
- f.Close()
- }()
- }
-
- var inputs []unify.Closure
-
- if *FlagArch != "amd64" {
- log.Fatalf("simdgen only supports amd64")
- }
-
- // Load XED into a defs set.
- if *xedPath != "" {
- xedDefs := loadXED(*xedPath)
- inputs = append(inputs, unify.NewSum(xedDefs...))
- }
-
- // Load query.
- if *flagQ != "" {
- r := strings.NewReader(*flagQ)
- def, err := unify.Read(r, "<query>", unify.ReadOpts{})
- if err != nil {
- log.Fatalf("parsing -q: %s", err)
- }
- inputs = append(inputs, def)
- }
-
- // Load defs files.
- must := make(map[*unify.Value]struct{})
- for _, path := range flag.Args() {
- defs, err := unify.ReadFile(path, unify.ReadOpts{})
- if err != nil {
- log.Fatal(err)
- }
- inputs = append(inputs, defs)
-
- if filepath.Base(path) == "go.yaml" {
- // These must all be used in the final result
- for def := range defs.Summands() {
- must[def] = struct{}{}
- }
- }
- }
-
- // Prepare for unification
- if *flagDebugUnify {
- unify.Debug.UnifyLog = os.Stderr
- }
- if *flagDebugHTML != "" {
- f, err := os.Create(*flagDebugHTML)
- if err != nil {
- log.Fatal(err)
- }
- unify.Debug.HTML = f
- defer f.Close()
- }
-
- // Unify!
- unified, err := unify.Unify(inputs...)
- if err != nil {
- log.Fatal(err)
- }
-
- // Validate results.
- //
- // Don't validate if this is a command-line query because that tends to
- // eliminate lots of required defs and is used in cases where maybe defs
- // aren't enumerable anyway.
- if *flagQ == "" && len(must) > 0 {
- validate(unified, must)
- }
-
- // Print results.
- switch *flagO {
- case "yaml":
- // Produce a result that looks like encoding a slice, but stream it.
- fmt.Println("!sum")
- var val1 [1]*unify.Value
- for val := range unified.All() {
- val1[0] = val
- // We have to make a new encoder each time or it'll print a document
- // separator between each object.
- enc := yaml.NewEncoder(os.Stdout)
- if err := enc.Encode(val1); err != nil {
- log.Fatal(err)
- }
- enc.Close()
- }
- case "godefs":
- if err := writeGoDefs(*flagGoDefRoot, unified); err != nil {
- log.Fatalf("Failed writing godefs: %+v", err)
- }
- }
-
- if !*Verbose && *xedPath != "" {
- if operandRemarks == 0 {
- fmt.Fprintf(os.Stderr, "XED decoding generated no errors, which is unusual.\n")
- } else {
- fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
- }
- }
-}
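The per-value encoder in the -o yaml loop above is deliberate. Here is a small standalone sketch of the difference it makes, using the same gopkg.in/yaml.v3 package; the example values are made up.

```go
package main

import (
	"log"
	"os"

	"gopkg.in/yaml.v3"
)

func main() {
	vals := []map[string]int{{"a": 1}, {"b": 2}}

	// One shared encoder: yaml.v3 emits a "---" document separator
	// before every document after the first.
	shared := yaml.NewEncoder(os.Stdout)
	for _, v := range vals {
		if err := shared.Encode(v); err != nil {
			log.Fatal(err)
		}
	}
	shared.Close()

	// A fresh encoder per value: no separators, so the stream reads
	// like the encoding of a single slice.
	for _, v := range vals {
		enc := yaml.NewEncoder(os.Stdout)
		if err := enc.Encode(v); err != nil {
			log.Fatal(err)
		}
		enc.Close()
	}
}
```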
-
-func validate(cl unify.Closure, required map[*unify.Value]struct{}) {
- // Validate that:
- // 1. All final defs are exact
- // 2. All required defs are used
- for def := range cl.All() {
- if _, ok := def.Domain.(unify.Def); !ok {
- fmt.Fprintf(os.Stderr, "%s: expected Def, got %T\n", def.PosString(), def.Domain)
- continue
- }
-
- if !def.Exact() {
- fmt.Fprintf(os.Stderr, "%s: def not reduced to an exact value, why is %s:\n", def.PosString(), def.WhyNotExact())
- fmt.Fprintf(os.Stderr, "\t%s\n", strings.ReplaceAll(def.String(), "\n", "\n\t"))
- }
-
- for root := range def.Provenance() {
- delete(required, root)
- }
- }
- // Report unused defs
- unused := slices.SortedFunc(maps.Keys(required),
- func(a, b *unify.Value) int {
- return cmp.Or(
- cmp.Compare(a.Pos().Path, b.Pos().Path),
- cmp.Compare(a.Pos().Line, b.Pos().Line),
- )
- })
- for _, def := range unused {
- // TODO: Can we say anything more actionable? This is always a problem
- // with unification: if it fails, it's very hard to point a finger at
- // any particular reason. We could go back and try unifying this again
- // with each subset of the inputs (starting with individual inputs) to
- // at least say "it doesn't unify with anything in x.yaml". That's a lot
- // of work, but if we have trouble debugging unification failure it may
- // be worth it.
- fmt.Fprintf(os.Stderr, "%s: def required, but did not unify (%v)\n",
- def.PosString(), def)
- }
-}
+++ /dev/null
-!sum
-- go: Add
- commutative: true
- documentation: !string |-
- // NAME adds corresponding elements of two vectors.
-- go: AddSaturated
- commutative: true
- documentation: !string |-
- // NAME adds corresponding elements of two vectors with saturation.
-- go: Sub
- commutative: false
- documentation: !string |-
- // NAME subtracts corresponding elements of two vectors.
-- go: SubSaturated
- commutative: false
- documentation: !string |-
- // NAME subtracts corresponding elements of two vectors with saturation.
-- go: AddPairs
- commutative: false
- documentation: !string |-
- // NAME horizontally adds adjacent pairs of elements.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-- go: SubPairs
- commutative: false
- documentation: !string |-
- // NAME horizontally subtracts adjacent pairs of elements.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-- go: AddPairsSaturated
- commutative: false
- documentation: !string |-
- // NAME horizontally adds adjacent pairs of elements with saturation.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-- go: SubPairsSaturated
- commutative: false
- documentation: !string |-
- // NAME horizontally subtracts adjacent pairs of elements with saturation.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
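The AddPairs/SubPairs doc strings above place the pair results of y before those of x. A plain-Go model for 4-element vectors makes the layout concrete; addPairs is a hypothetical helper mirroring the doc string, not part of any generated API.

```go
package main

import "fmt"

// addPairs models the documented AddPairs layout on 4-element vectors:
// the result is [y0+y1, y2+y3, x0+x1, x2+x3].
func addPairs(x, y [4]int32) [4]int32 {
	return [4]int32{y[0] + y[1], y[2] + y[3], x[0] + x[1], x[2] + x[3]}
}

func main() {
	fmt.Println(addPairs([4]int32{1, 2, 3, 4}, [4]int32{10, 20, 30, 40})) // [30 70 3 7]
}
```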
+++ /dev/null
-!sum
-# Add
-- go: Add
- asm: "VPADD[BWDQ]|VADDP[SD]"
- in:
- - &any
- go: $t
- - *any
- out:
- - *any
-# Add Saturated
-- go: AddSaturated
- asm: "VPADDS[BWDQ]"
- in:
- - &int
- go: $t
- base: int
- - *int
- out:
- - *int
-- go: AddSaturated
- asm: "VPADDUS[BWDQ]"
- in:
- - &uint
- go: $t
- base: uint
- - *uint
- out:
- - *uint
-
-# Sub
-- go: Sub
- asm: "VPSUB[BWDQ]|VSUBP[SD]"
- in: &2any
- - *any
- - *any
- out: &1any
- - *any
-# Sub Saturated
-- go: SubSaturated
- asm: "VPSUBS[BWDQ]"
- in: &2int
- - *int
- - *int
- out: &1int
- - *int
-- go: SubSaturated
- asm: "VPSUBUS[BWDQ]"
- in:
- - *uint
- - *uint
- out:
- - *uint
-- go: AddPairs
- asm: "VPHADD[DW]"
- in: *2any
- out: *1any
-- go: SubPairs
- asm: "VPHSUB[DW]"
- in: *2any
- out: *1any
-- go: AddPairs
- asm: "VHADDP[SD]" # floats
- in: *2any
- out: *1any
-- go: SubPairs
- asm: "VHSUBP[SD]" # floats
- in: *2any
- out: *1any
-- go: AddPairsSaturated
- asm: "VPHADDS[DW]"
- in: *2int
- out: *1int
-- go: SubPairsSaturated
- asm: "VPHSUBS[DW]"
- in: *2int
- out: *1int
+++ /dev/null
-!sum
-- go: And
- commutative: true
- documentation: !string |-
- // NAME performs a bitwise AND operation between two vectors.
-- go: Or
- commutative: true
- documentation: !string |-
- // NAME performs a bitwise OR operation between two vectors.
-- go: AndNot
- commutative: false
- documentation: !string |-
- // NAME performs a bitwise x &^ y.
-- go: Xor
- commutative: true
- documentation: !string |-
- // NAME performs a bitwise XOR operation between two vectors.
-- go: tern
- commutative: false
- documentation: !string |-
- // NAME performs a logical operation on three vectors based on the 8-bit truth table.
- // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-
-# We also have PTEST and VPTERNLOG, those should be hidden from the users
-# and only appear in rewrite rules.
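The tern doc above gives the truth-table formula directly. The sketch below only evaluates that formula to build the 8-bit immediate for a chosen boolean function; ternTable is a hypothetical helper, not part of simdgen.

```go
package main

import "fmt"

// ternTable builds the 8-bit truth table described above: for input
// bits x, y, z, the result bit is 1 & (table >> (x<<2 + y<<1 + z)).
func ternTable(f func(x, y, z uint8) uint8) uint8 {
	var table uint8
	for x := uint8(0); x < 2; x++ {
		for y := uint8(0); y < 2; y++ {
			for z := uint8(0); z < 2; z++ {
				table |= (f(x, y, z) & 1) << (x<<2 + y<<1 + z)
			}
		}
	}
	return table
}

func main() {
	// Table for the function (x AND y) OR z.
	fmt.Printf("%#02x\n", ternTable(func(x, y, z uint8) uint8 { return (x & y) | z })) // 0xea
}
```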
+++ /dev/null
-!sum
-# In the XED data, *all* floating point bitwise logic operations have their
-# operand types marked as uint. We are not trying to understand why Intel
-# decided that they want FP bit-wise logic operations, but this irregularity
-# has to be dealt with in separate rules with some overwrites.
-
-# For many bit-wise operations, we have the following non-orthogonal
-# choices:
-#
-# - Non-masked AVX operations have no element width (because it
-# doesn't matter), but only cover 128 and 256 bit vectors.
-#
-# - Masked AVX-512 operations have an element width (because it needs
-# to know how to interpret the mask), and cover 128, 256, and 512 bit
-# vectors. These only cover 32- and 64-bit element widths.
-#
-# - Non-masked AVX-512 operations still have an element width (because
-# they're just the masked operations with an implicit K0 mask) but it
-# doesn't matter! This is the only option for non-masked 512 bit
-# operations, and we can pick any of the element widths.
-#
-# We unify with ALL of these operations and the compiler generator
-# picks when there are multiple options.
-
-# TODO: We don't currently generate unmasked bit-wise operations on 512 bit
-# vectors of 8- or 16-bit elements. AVX-512 only has *masked* bit-wise
-# operations for 32- and 64-bit elements; while the element width doesn't matter
-# for unmasked operations, right now we don't realize that we can just use the
-# 32- or 64-bit version for the unmasked form. Maybe in the XED decoder we
-# should recognize bit-wise operations when generating unmasked versions and
-# omit the element width.
-
-# For binary operations, we constrain their two inputs and one output to the
-# same Go type using a variable.
-
-- go: And
- asm: "VPAND[DQ]?"
- in:
- - &any
- go: $t
- - *any
- out:
- - *any
-
-- go: And
- asm: "VPANDD" # Fill in the gap, And is missing for Uint8x64 and Int8x64
- inVariant: []
- in: &twoI8x64
- - &i8x64
- go: $t
- overwriteElementBits: 8
- - *i8x64
- out: &oneI8x64
- - *i8x64
-
-- go: And
- asm: "VPANDD" # Fill in the gap, And is missing for Uint16x32 and Int16x32
- inVariant: []
- in: &twoI16x32
- - &i16x32
- go: $t
- overwriteElementBits: 16
- - *i16x32
- out: &oneI16x32
- - *i16x32
-
-- go: AndNot
- asm: "VPANDN[DQ]?"
- operandOrder: "21" # switch the arg order
- in:
- - *any
- - *any
- out:
- - *any
-
-- go: AndNot
- asm: "VPANDND" # Fill in the gap, AndNot is missing for Uint8x64 and Int8x64
- operandOrder: "21" # switch the arg order
- inVariant: []
- in: *twoI8x64
- out: *oneI8x64
-
-- go: AndNot
- asm: "VPANDND" # Fill in the gap, AndNot is missing for Uint16x32 and Int16x32
- operandOrder: "21" # switch the arg order
- inVariant: []
- in: *twoI16x32
- out: *oneI16x32
-
-- go: Or
- asm: "VPOR[DQ]?"
- in:
- - *any
- - *any
- out:
- - *any
-
-- go: Or
- asm: "VPORD" # Fill in the gap, Or is missing for Uint8x64 and Int8x64
- inVariant: []
- in: *twoI8x64
- out: *oneI8x64
-
-- go: Or
- asm: "VPORD" # Fill in the gap, Or is missing for Uint16x32 and Int16x32
- inVariant: []
- in: *twoI16x32
- out: *oneI16x32
-
-- go: Xor
- asm: "VPXOR[DQ]?"
- in:
- - *any
- - *any
- out:
- - *any
-
-- go: Xor
- asm: "VPXORD" # Fill in the gap, Or is missing for Uint8x64 and Int8x64
- inVariant: []
- in: *twoI8x64
- out: *oneI8x64
-
-- go: Xor
- asm: "VPXORD" # Fill in the gap, Or is missing for Uint16x32 and Int16x32
- inVariant: []
- in: *twoI16x32
- out: *oneI16x32
-
-- go: tern
- asm: "VPTERNLOGD|VPTERNLOGQ"
- in:
- - &tern_op
- go: $t
- - *tern_op
- - *tern_op
- - class: immediate
- immOffset: 0
- name: table
- inVariant: []
- out:
- - *tern_op
+++ /dev/null
-!sum
-# const imm predicate(holds for both float and int|uint):
-# 0: Equal
-# 1: Less
-# 2: LessEqual
-# 4: NotEqual
-# 5: GreaterEqual
-# 6: Greater
-- go: Equal
- constImm: 0
- commutative: true
- documentation: !string |-
- // NAME returns x equals y, elementwise.
-- go: Less
- constImm: 1
- commutative: false
- documentation: !string |-
- // NAME returns x less-than y, elementwise.
-- go: LessEqual
- constImm: 2
- commutative: false
- documentation: !string |-
- // NAME returns x less-than-or-equals y, elementwise.
-- go: IsNan # For float only.
- constImm: 3
- commutative: true
- documentation: !string |-
- // NAME checks if elements are NaN. Use as x.IsNan(x).
-- go: NotEqual
- constImm: 4
- commutative: true
- documentation: !string |-
- // NAME returns x not-equals y, elementwise.
-- go: GreaterEqual
- constImm: 13
- commutative: false
- documentation: !string |-
- // NAME returns x greater-than-or-equals y, elementwise.
-- go: Greater
- constImm: 14
- commutative: false
- documentation: !string |-
- // NAME returns x greater-than y, elementwise.
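Per these defs a compare yields a per-element mask. The scalar model below illustrates the documented x.IsNan(x) idiom; isNaNMask is a made-up stand-in, not the generated API.

```go
package main

import (
	"fmt"
	"math"
)

// isNaNMask models x.IsNan(x): element n of the result reports
// whether x[n] is NaN (an element compared unordered with itself).
func isNaNMask(x [4]float64) [4]bool {
	var m [4]bool
	for i, v := range x {
		m[i] = math.IsNaN(v)
	}
	return m
}

func main() {
	fmt.Println(isNaNMask([4]float64{1, math.NaN(), 3, math.Inf(1)})) // [false true false false]
}
```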
+++ /dev/null
-!sum
-# Ints
-- go: Equal
- asm: "V?PCMPEQ[BWDQ]"
- in:
- - &any
- go: $t
- - *any
- out:
- - &anyvregToMask
- go: $t
- overwriteBase: int
- overwriteClass: mask
-- go: Greater
- asm: "V?PCMPGT[BWDQ]"
- in:
- - &int
- go: $t
- base: int
- - *int
- out:
- - *anyvregToMask
-# The 256-bit VPCMPGTQ's output elemBits is marked 32-bit in the XED data; we
-# believe this is an error, so this definition overwrites it.
-- go: Greater
- asm: "VPCMPGTQ"
- in:
- - &int64
- go: $t
- base: int
- elemBits: 64
- - *int64
- out:
- - base: int
- elemBits: 32
- overwriteElementBits: 64
- overwriteClass: mask
- overwriteBase: int
-
-# TODO these are redundant with VPCMP operations.
-# AVX-512 compares produce masks.
-- go: Equal
- asm: "V?PCMPEQ[BWDQ]"
- in:
- - *any
- - *any
- out:
- - class: mask
-- go: Greater
- asm: "V?PCMPGT[BWDQ]"
- in:
- - *int
- - *int
- out:
- - class: mask
-
-# MASKED signed comparisons for X/Y registers
-# unmasked would clash with emulations on AVX2
-- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
- regexpTag: "compares"
- asm: "VPCMP[BWDQ]"
- in:
- - &int
- bits: (128|256)
- go: $t
- base: int
- - *int
- - class: immediate
- const: 0 # Just a placeholder, will be overwritten by const imm porting.
- inVariant:
- - class: mask
- out:
- - class: mask
-
-# MASKED unsigned comparisons for X/Y registers
-# unmasked would clash with emulations on AVX2
-- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
- regexpTag: "compares"
- asm: "VPCMPU[BWDQ]"
- in:
- - &uint
- bits: (128|256)
- go: $t
- base: uint
- - *uint
- - class: immediate
- const: 0
- inVariant:
- - class: mask
- out:
- - class: mask
-
-# masked/unmasked signed comparisons for Z registers
-- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
- regexpTag: "compares"
- asm: "VPCMP[BWDQ]"
- in:
- - &int
- bits: 512
- go: $t
- base: int
- - *int
- - class: immediate
- const: 0 # Just a placeholder, will be overwritten by const imm porting.
- out:
- - class: mask
-
-# masked/unmasked unsigned comparisons for Z registers
-- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
- regexpTag: "compares"
- asm: "VPCMPU[BWDQ]"
- in:
- - &uint
- bits: 512
- go: $t
- base: uint
- - *uint
- - class: immediate
- const: 0
- out:
- - class: mask
-
-# Floats
-- go: Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan
- regexpTag: "compares"
- asm: "VCMPP[SD]"
- in:
- - &float
- go: $t
- base: float
- - *float
- - class: immediate
- const: 0
- out:
- - go: $t
- overwriteBase: int
- overwriteClass: mask
-- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan)
- regexpTag: "compares"
- asm: "VCMPP[SD]"
- in:
- - *float
- - *float
- - class: immediate
- const: 0
- out:
- - class: mask
\ No newline at end of file
+++ /dev/null
-!sum
-# Float <-> Int conversions
-- go: "ConvertToInt32"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int32.
- // When a conversion is inexact, a truncated (round toward zero) value is returned.
- // If a converted result cannot be represented in int32, an implementation-defined
- // architecture-specific value is returned.
-- go: "ConvertToUint32"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to uint32.
- // When a conversion is inexact, a truncated (round toward zero) value is returned.
- // If a converted result cannot be represented in uint32, an implementation-defined
- // architecture-specific value is returned.
-- go: "ConvertToInt64"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int64.
- // When a conversion is inexact, a truncated (round toward zero) value is returned.
- // If a converted result cannot be represented in int64, an implementation-defined
- // architecture-specific value is returned.
-- go: "ConvertToUint64"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to uint64.
- // When a conversion is inexact, a truncated (round toward zero) value is returned.
- // If a converted result cannot be represented in uint64, an implementation-defined
- // architecture-specific value is returned.
-- go: "ConvertToFloat32" # Also float64 -> float32
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to float32.
-- go: "ConvertToFloat64" # Also float32 -> float64
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to float64.
-
-# Int <-> Int conversions
-- go: "(Extend|Saturate|Truncate)?ToInt8"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int8.
-- go: "(Extend|Saturate|Truncate)?ToInt16(Concat)?"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int16.
-- go: "(Extend|Saturate|Truncate)?ToInt32"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int32.
-- go: "(Extend|Saturate|Truncate)?ToInt64"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to int64.
-- go: "(Extend|Saturate|Truncate)?ToUint8"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to uint8.
-- go: "(Extend|Saturate|Truncate)?ToUint16(Concat)?"
- commutative: false
- regexpTag: "convert"
- documentation: !string |-
- // NAME converts element values to uint16.
-- go: "(Extend|Saturate|Truncate)?ToUint32"
- regexpTag: "convert"
- commutative: false
- documentation: !string |-
- // NAME converts element values to uint32.
-- go: "(Extend|Saturate|Truncate)?ToUint64"
- regexpTag: "convert"
- commutative: false
- documentation: !string |-
- // NAME converts element values to uint64.
-# low-part only Int <-> Int conversions
-- go: ExtendLo8ToUint16x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to uint16.
-- go: ExtendLo8ToInt16x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to int16.
-- go: ExtendLo4ToUint32x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to uint32.
-- go: ExtendLo4ToInt32x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to int32.
-- go: ExtendLo2ToUint64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to uint64.
-- go: ExtendLo2ToInt64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to int64.
-- go: ExtendLo2ToUint64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to uint64.
-- go: ExtendLo4ToUint64x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to uint64.
-- go: ExtendLo2ToInt64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to int64.
-- go: ExtendLo4ToInt64x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to int64.
-- go: ExtendLo4ToUint32x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to uint32.
-- go: ExtendLo8ToUint32x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to uint32.
-- go: ExtendLo4ToInt32x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to int32.
-- go: ExtendLo8ToInt32x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to int32.
-- go: ExtendLo2ToUint64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to uint64.
-- go: ExtendLo4ToUint64x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to uint64.
-- go: ExtendLo8ToUint64x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to uint64.
-- go: ExtendLo2ToInt64x2
- commutative: false
- documentation: !string |-
- // NAME converts 2 lowest vector element values to int64.
-- go: ExtendLo4ToInt64x4
- commutative: false
- documentation: !string |-
- // NAME converts 4 lowest vector element values to int64.
-- go: ExtendLo8ToInt64x8
- commutative: false
- documentation: !string |-
- // NAME converts 8 lowest vector element values to int64.
\ No newline at end of file
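The "truncated (round toward zero)" wording above matches Go's own scalar float-to-int conversion, which can serve as a mental model; the out-of-range case (documented as architecture-specific for the vector ops) is not modeled here.

```go
package main

import "fmt"

func main() {
	// Inexact conversions drop the fractional part toward zero,
	// for both positive and negative inputs.
	for _, x := range []float32{3.7, -2.9, 0.5, -0.5} {
		fmt.Printf("%v -> %d\n", x, int32(x)) // 3, -2, 0, 0
	}
}
```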
+++ /dev/null
-!sum
-# Float <-> Int conversions
-# float32 -> int32
-- go: ConvertToInt32
- regexpTag: "convert"
- asm: "VCVTTP[SD]2DQ"
- in:
- - &fp
- go: $t
- base: float
- out:
- - &i32
- go: $u
- base: int
- elemBits: 32
-# float32 -> uint32
-- go: ConvertToUint32
- regexpTag: "convert"
- asm: "VCVTTP[SD]2UDQ"
- in:
- - *fp
- out:
- - &u32
- go: $u
- base: uint
- elemBits: 32
-# float32|float64 -> int64
-- go: ConvertToInt64
- regexpTag: "convert"
- asm: "VCVTTPD2QQ"
- in:
- - *fp
- out:
- - &i64
- go: $u
- base: int
- elemBits: 64
-- go: ConvertToInt64
- regexpTag: "convert"
- asm: "VCVTTPS2QQ"
- in:
- - *fp
- out:
- - go: $u
- base: int
- elemBits: 64
- bits: 256|512
-# float32|float64 -> uint64
-- go: ConvertToUint64
- regexpTag: "convert"
- asm: "VCVTTPD2UQQ"
- in:
- - *fp
- out:
- - &u64
- go: $u
- base: uint
- elemBits: 64
-- go: ConvertToUint64
- regexpTag: "convert"
- asm: "VCVTTPS2UQQ"
- in:
- - *fp
- out:
- - go: $u
- base: uint
- elemBits: 64
- bits: 256|512
-# int -> float32
-- go: ConvertToFloat32
- regexpTag: "convert"
- asm: "VCVT[DQ]Q2PS"
- in: &int
- - go: $i
- base: int
- out:
- - *fp
-# int -> float64
-- go: ConvertToFloat64
- regexpTag: "convert"
- asm: "VCVTQQ2PD"
- in: *int
- out:
- - *fp
-- go: ConvertToFloat64
- regexpTag: "convert"
- asm: "VCVTDQ2PD"
- in: *int
- out:
- - base: float
- bits: 256|512
-# uint -> float32
-- go: ConvertToFloat32
- regexpTag: "convert"
- asm: "VCVTU[DQ]Q2PS"
- in: &uint
- - go: $u
- base: uint
- out:
- - *fp
-# uint -> float64
-- go: ConvertToFloat64
- regexpTag: "convert"
- asm: "VCVTUQQ2PD"
- in: *uint
- out:
- - *fp
-- go: ConvertToFloat64
- regexpTag: "convert"
- asm: "VCVTUDQ2PD"
- in: *uint
- out:
- - base: float
- bits: 256|512
-# float64 -> float32
-- go: ConvertToFloat32
- regexpTag: "convert"
- asm: "VCVTPD2PS"
- addDoc:
- !string |-
- // The result vector's elements are rounded to the nearest value.
- in: &fp64
- - base: float
- elemBits: 64
- out: &fp32
- - base: float
- elemBits: 32
-# float32 -> float64
-- go: ConvertToFloat64
- regexpTag: "convert"
- asm: "VCVTPS2PD"
- in: *fp32
- out:
- - base: float
- elemBits: 64
- bits: 256|512
-
-# Widening integer conversions.
-# uint8 -> uint16
-- go: ExtendToUint16
- addDoc: &zeroExtendDoc
- !string |-
- // The result vector's elements are zero-extended.
- regexpTag: "convert"
- asm: "VPMOVZXBW"
- in:
- - &u8x16
- base: uint
- elemBits: 8
- bits: 128
- out:
- - &u16x16
- base: uint
- elemBits: 16
- bits: 256
-- go: ExtendToUint16
- regexpTag: "convert"
- asm: "VPMOVZXBW"
- addDoc: *zeroExtendDoc
- in:
- - &u8x32
- base: uint
- elemBits: 8
- bits: 256
- out:
- - &u16x32
- base: uint
- elemBits: 16
- bits: 512
-# int8 -> int16
-- go: ExtendToInt16
- regexpTag: "convert"
- asm: "VPMOVSXBW"
- addDoc: &signExtendDoc
- !string |-
- // The result vector's elements are sign-extended.
- in:
- - &i8x16
- base: int
- elemBits: 8
- bits: 128
- out:
- - &i16x16
- base: int
- elemBits: 16
- bits: 256
-- go: ExtendToInt16
- regexpTag: "convert"
- asm: "VPMOVSXBW"
- addDoc: *signExtendDoc
- in:
- - &i8x32
- base: int
- elemBits: 8
- bits: 256
- out:
- - &i16x32
- base: int
- elemBits: 16
- bits: 512
-# uint16->uint32
-- go: ExtendToUint32
- regexpTag: "convert"
- asm: "VPMOVZXWD"
- addDoc: *zeroExtendDoc
- in:
- - &u16x8
- base: uint
- elemBits: 16
- bits: 128
- out:
- - &u32x8
- base: uint
- elemBits: 32
- bits: 256
-- go: ExtendToUint32
- regexpTag: "convert"
- asm: "VPMOVZXWD"
- addDoc: *zeroExtendDoc
- in:
- - *u16x16
- out:
- - &u32x16
- base: uint
- elemBits: 32
- bits: 512
-# int16->int32
-- go: ExtendToInt32
- regexpTag: "convert"
- asm: "VPMOVSXWD"
- addDoc: *signExtendDoc
- in:
- - &i16x8
- base: int
- elemBits: 16
- bits: 128
- out:
- - &i32x8
- base: int
- elemBits: 32
- bits: 256
-- go: ExtendToInt32
- regexpTag: "convert"
- asm: "VPMOVSXWD"
- addDoc: *signExtendDoc
- in:
- - *i16x16
- out:
- - &i32x16
- base: int
- elemBits: 32
- bits: 512
-# uint32 -> uint64
-- go: ExtendToUint64
- regexpTag: "convert"
- asm: "VPMOVZXDQ"
- addDoc: *zeroExtendDoc
- in:
- - &u32x4
- base: uint
- elemBits: 32
- bits: 128
- out:
- - &u64x4
- base: uint
- elemBits: 64
- bits: 256
-- go: ExtendToUint64
- regexpTag: "convert"
- asm: "VPMOVZXDQ"
- addDoc: *zeroExtendDoc
- in:
- - *u32x8
- out:
- - &u64x8
- base: uint
- elemBits: 64
- bits: 512
-# int32 -> int64
-- go: ExtendToInt64
- regexpTag: "convert"
- asm: "VPMOVSXDQ"
- addDoc: *signExtendDoc
- in:
- - &i32x4
- base: int
- elemBits: 32
- bits: 128
- out:
- - &i64x4
- base: int
- elemBits: 64
- bits: 256
-- go: ExtendToInt64
- regexpTag: "convert"
- asm: "VPMOVSXDQ"
- addDoc: *signExtendDoc
- in:
- - *i32x8
- out:
- - &i64x8
- base: int
- elemBits: 64
- bits: 512
-# uint16 -> uint64
-- go: ExtendToUint64
- regexpTag: "convert"
- asm: "VPMOVZXWQ"
- addDoc: *zeroExtendDoc
- in:
- - *u16x8
- out:
- - *u64x8
-# int16 -> int64
-- go: ExtendToInt64
- regexpTag: "convert"
- asm: "VPMOVSXWQ"
- addDoc: *signExtendDoc
- in:
- - *i16x8
- out:
- - *i64x8
-# uint8 -> uint32
-- go: ExtendToUint32
- regexpTag: "convert"
- asm: "VPMOVZXBD"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u32x16
-# int8 -> int32
-- go: ExtendToInt32
- regexpTag: "convert"
- asm: "VPMOVSXBD"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i32x16
-# Truncating conversions
-- go: TruncateToInt8
- regexpTag: "convert"
- asm: "VPMOV[WDQ]B"
- addDoc: &truncDocZeroUpper
- !string |-
- // Conversion is done with truncation on the vector elements.
- // Results are packed to the low elements of the returned vector; its upper elements are zero-cleared.
- in:
- - base: int
- out:
- - base: int
- bits: 128
-- go: TruncateToUint8
- regexpTag: "convert"
- asm: "VPMOV[WDQ]B"
- addDoc: *truncDocZeroUpper
- in:
- - base: uint
- out:
- - base: uint
- bits: 128
-- go: TruncateToInt8
- regexpTag: "convert"
- asm: "VPMOV[WDQ]B"
- addDoc: &truncDoc
- !string |-
- // Conversion is done with truncation on the vector elements.
- in:
- - base: int
- out:
- - base: int
- bits: 256|512
-- go: TruncateToUint8
- regexpTag: "convert"
- asm: "VPMOV[WDQ]B"
- addDoc: *truncDoc
- in:
- - base: uint
- out:
- - base: uint
- bits: 256|512
-- go: TruncateToInt16
- regexpTag: "convert"
- asm: "VPMOV[DQ]W"
- addDoc: *truncDoc
- in:
- - base: int
- out:
- - base: int
-- go: TruncateToUint16
- regexpTag: "convert"
- asm: "VPMOV[DQ]W"
- addDoc: *truncDoc
- in:
- - base: uint
- out:
- - base: uint
-- go: TruncateToInt32
- regexpTag: "convert"
- asm: "VPMOVQD"
- addDoc: *truncDoc
- in:
- - base: int
- out:
- - base: int
-- go: TruncateToUint32
- regexpTag: "convert"
- asm: "VPMOVQD"
- addDoc: *truncDoc
- in:
- - base: uint
- out:
- - base: uint
-# Saturated conversions.
-- go: SaturateToInt8
- regexpTag: "convert"
- asm: "VPMOVS[WDQ]B"
- addDoc: &satDocZeroUpper
- !string |-
- // Conversion is done with saturation on the vector elements.
- // Results are packed to the low elements of the returned vector; its upper elements are zero-cleared.
- in:
- - base: int
- out:
- - base: int
- bits: 128
-- go: SaturateToUint8
- regexpTag: "convert"
- asm: "VPMOVS[WDQ]B"
- addDoc: *satDocZeroUpper
- in:
- - base: int
- out:
- - base: int
- bits: 128
-- go: SaturateToInt8
- regexpTag: "convert"
- asm: "VPMOVS[WDQ]B"
- addDoc: &satDoc
- !string |-
- // Conversion is done with saturation on the vector elements.
- in:
- - base: int
- out:
- - base: int
- bits: 256|512
-- go: SaturateToUint8
- regexpTag: "convert"
- asm: "VPMOVUS[WDQ]B"
- addDoc: *satDoc
- in:
- - base: uint
- out:
- - base: uint
- bits: 256|512
-- go: SaturateToInt16
- regexpTag: "convert"
- asm: "VPMOVS[DQ]W"
- addDoc: *satDoc
- in:
- - base: int
- out:
- - base: int
-- go: SaturateToUint16
- regexpTag: "convert"
- asm: "VPMOVUS[DQ]W"
- addDoc: *satDoc
- in:
- - base: uint
- out:
- - base: uint
-- go: SaturateToInt32
- regexpTag: "convert"
- asm: "VPMOVSQD"
- addDoc: *satDoc
- in:
- - base: int
- out:
- - base: int
-- go: SaturateToUint32
- regexpTag: "convert"
- asm: "VPMOVUSQD"
- addDoc: *satDoc
- in:
- - base: uint
- out:
- - base: uint
-# Truncating saturated packed
-- go: SaturateToInt16Concat
- regexpTag: "convert"
- asm: "VPACKSSDW"
- addDoc: &satDocConcat
- !string |-
- // With each 128-bit as a group:
- // The converted group from the first input vector will be packed to the lower part of the result vector,
- // the converted group from the second input vector will be packed to the upper part of the result vector.
- // Conversion is done with saturation on the vector elements.
- in:
- - base: int
- - base: int
- out:
- - base: int
-- go: SaturateToUint16Concat
- regexpTag: "convert"
- asm: "VPACKUSDW"
- addDoc: *satDocConcat
- in:
- - base: uint
- - base: uint
- out:
- - base: uint
-
-# low-part only conversions.
-# uint8->uint16
-- go: ExtendLo8ToUint16x8
- regexpTag: "convert"
- asm: "VPMOVZXBW"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u16x8
-# int8->int16
-- go: ExtendLo8ToInt16x8
- regexpTag: "convert"
- asm: "VPMOVSXBW"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i16x8
-# uint16->uint32
-- go: ExtendLo4ToUint32x4
- regexpTag: "convert"
- asm: "VPMOVZXWD"
- addDoc: *zeroExtendDoc
- in:
- - *u16x8
- out:
- - *u32x4
-# int16->int32
-- go: ExtendLo4ToInt32x4
- regexpTag: "convert"
- asm: "VPMOVSXWD"
- addDoc: *signExtendDoc
- in:
- - *i16x8
- out:
- - *i32x4
-# uint32 -> uint64
-- go: ExtendLo2ToUint64x2
- regexpTag: "convert"
- asm: "VPMOVZXDQ"
- addDoc: *zeroExtendDoc
- in:
- - *u32x4
- out:
- - &u64x2
- base: uint
- elemBits: 64
- bits: 128
-# int32 -> int64
-- go: ExtendLo2ToInt64x2
- regexpTag: "convert"
- asm: "VPMOVSXDQ"
- addDoc: *signExtendDoc
- in:
- - *i32x4
- out:
- - &i64x2
- base: int
- elemBits: 64
- bits: 128
-# uint16 -> uint64
-- go: ExtendLo2ToUint64x2
- regexpTag: "convert"
- asm: "VPMOVZXWQ"
- addDoc: *zeroExtendDoc
- in:
- - *u16x8
- out:
- - *u64x2
-- go: ExtendLo4ToUint64x4
- regexpTag: "convert"
- asm: "VPMOVZXWQ"
- addDoc: *zeroExtendDoc
- in:
- - *u16x8
- out:
- - *u64x4
-# int16 -> int64
-- go: ExtendLo2ToInt64x2
- regexpTag: "convert"
- asm: "VPMOVSXWQ"
- addDoc: *signExtendDoc
- in:
- - *i16x8
- out:
- - *i64x2
-- go: ExtendLo4ToInt64x4
- regexpTag: "convert"
- asm: "VPMOVSXWQ"
- addDoc: *signExtendDoc
- in:
- - *i16x8
- out:
- - *i64x4
-# uint8 -> uint32
-- go: ExtendLo4ToUint32x4
- regexpTag: "convert"
- asm: "VPMOVZXBD"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u32x4
-- go: ExtendLo8ToUint32x8
- regexpTag: "convert"
- asm: "VPMOVZXBD"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u32x8
-# int8 -> int32
-- go: ExtendLo4ToInt32x4
- regexpTag: "convert"
- asm: "VPMOVSXBD"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i32x4
-- go: ExtendLo8ToInt32x8
- regexpTag: "convert"
- asm: "VPMOVSXBD"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i32x8
-# uint8 -> uint64
-- go: ExtendLo2ToUint64x2
- regexpTag: "convert"
- asm: "VPMOVZXBQ"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u64x2
-- go: ExtendLo4ToUint64x4
- regexpTag: "convert"
- asm: "VPMOVZXBQ"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u64x4
-- go: ExtendLo8ToUint64x8
- regexpTag: "convert"
- asm: "VPMOVZXBQ"
- addDoc: *zeroExtendDoc
- in:
- - *u8x16
- out:
- - *u64x8
-# int8 -> int64
-- go: ExtendLo2ToInt64x2
- regexpTag: "convert"
- asm: "VPMOVSXBQ"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i64x2
-- go: ExtendLo4ToInt64x4
- regexpTag: "convert"
- asm: "VPMOVSXBQ"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i64x4
-- go: ExtendLo8ToInt64x8
- regexpTag: "convert"
- asm: "VPMOVSXBQ"
- addDoc: *signExtendDoc
- in:
- - *i8x16
- out:
- - *i64x8
\ No newline at end of file
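The Extend* defs above pair VPMOVZX* (zero-extend) with unsigned sources and VPMOVSX* (sign-extend) with signed sources. Scalar conversions show why the distinction matters for the byte 0xFF.

```go
package main

import "fmt"

func main() {
	b := byte(0xFF)
	fmt.Printf("zero-extend: %#06x\n", uint16(b))              // 0x00ff
	fmt.Printf("sign-extend: %#06x\n", uint16(int16(int8(b)))) // 0xffff
}
```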
+++ /dev/null
-!sum
-- go: Div
- commutative: false
- documentation: !string |-
- // NAME divides elements of two vectors.
-- go: Sqrt
- commutative: false
- documentation: !string |-
- // NAME computes the square root of each element.
-- go: Reciprocal
- commutative: false
- documentation: !string |-
- // NAME computes an approximate reciprocal of each element.
-- go: ReciprocalSqrt
- commutative: false
- documentation: !string |-
- // NAME computes an approximate reciprocal of the square root of each element.
-- go: Scale
- commutative: false
- documentation: !string |-
- // NAME multiplies elements by a power of 2.
-- go: RoundToEven
- commutative: false
- constImm: 0
- documentation: !string |-
- // NAME rounds elements to the nearest integer.
-- go: RoundToEvenScaled
- commutative: false
- constImm: 0
- documentation: !string |-
- // NAME rounds elements with specified precision.
-- go: RoundToEvenScaledResidue
- commutative: false
- constImm: 0
- documentation: !string |-
- // NAME computes the difference after rounding with specified precision.
-- go: Floor
- commutative: false
- constImm: 1
- documentation: !string |-
- // NAME rounds elements down to the nearest integer.
-- go: FloorScaled
- commutative: false
- constImm: 1
- documentation: !string |-
- // NAME rounds elements down with specified precision.
-- go: FloorScaledResidue
- commutative: false
- constImm: 1
- documentation: !string |-
- // NAME computes the difference after flooring with specified precision.
-- go: Ceil
- commutative: false
- constImm: 2
- documentation: !string |-
- // NAME rounds elements up to the nearest integer.
-- go: CeilScaled
- commutative: false
- constImm: 2
- documentation: !string |-
- // NAME rounds elements up with specified precision.
-- go: CeilScaledResidue
- commutative: false
- constImm: 2
- documentation: !string |-
- // NAME computes the difference after ceiling with specified precision.
-- go: Trunc
- commutative: false
- constImm: 3
- documentation: !string |-
- // NAME truncates elements towards zero.
-- go: TruncScaled
- commutative: false
- constImm: 3
- documentation: !string |-
- // NAME truncates elements with specified precision.
-- go: TruncScaledResidue
- commutative: false
- constImm: 3
- documentation: !string |-
- // NAME computes the difference after truncating with specified precision.
-- go: AddSub
- commutative: false
- documentation: !string |-
- // NAME subtracts even elements and adds odd elements of two vectors.
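A scalar model of the *Scaled rounding family, assuming the precision immediate means "M binary digits after the point" as the VRNDSCALE definitions below describe; floorScaled is a hypothetical helper, not the generated API.

```go
package main

import (
	"fmt"
	"math"
)

// floorScaled rounds x down to a value with prec binary digits after
// the point, i.e. floor(x * 2^prec) / 2^prec.
func floorScaled(x float64, prec uint) float64 {
	s := math.Ldexp(1, int(prec)) // 2^prec
	return math.Floor(x*s) / s
}

func main() {
	fmt.Println(floorScaled(2.80, 0)) // 2     (whole numbers)
	fmt.Println(floorScaled(2.80, 1)) // 2.5   (multiples of 1/2)
	fmt.Println(floorScaled(2.80, 2)) // 2.75  (multiples of 1/4)
}
```

The *ScaledResidue variants, per their doc strings, return the difference between the input and this rounded value.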
+++ /dev/null
-!sum
-- go: Div
- asm: "V?DIVP[SD]"
- in: &2fp
- - &fp
- go: $t
- base: float
- - *fp
- out: &1fp
- - *fp
-- go: Sqrt
- asm: "V?SQRTP[SD]"
- in: *1fp
- out: *1fp
-# TODO: Provide separate methods for 12-bit precision and 14-bit precision?
-- go: Reciprocal
- asm: "VRCP(14)?P[SD]"
- in: *1fp
- out: *1fp
-- go: ReciprocalSqrt
- asm: "V?RSQRT(14)?P[SD]"
- in: *1fp
- out: *1fp
-- go: Scale
- asm: "VSCALEFP[SD]"
- in: *2fp
- out: *1fp
-
-- go: "RoundToEven|Ceil|Floor|Trunc"
- regexpTag: "fp"
- asm: "VROUNDP[SD]"
- in:
- - *fp
- - class: immediate
- const: 0 # place holder
- out: *1fp
-
-- go: "(RoundToEven|Ceil|Floor|Trunc)Scaled"
- regexpTag: "fp"
- asm: "VRNDSCALEP[SD]"
- in:
- - *fp
- - class: immediate
- const: 0 # place holder
- immOffset: 4 # "M": round to numbers with M binary digits after the point.
- name: prec
- out: *1fp
-- go: "(RoundToEven|Ceil|Floor|Trunc)ScaledResidue"
- regexpTag: "fp"
- asm: "VREDUCEP[SD]"
- in:
- - *fp
- - class: immediate
- const: 0 # place holder
- immOffset: 4 # "M": round to numbers with M binary digits after the point.
- name: prec
- out: *1fp
-
-- go: "AddSub"
- asm: "VADDSUBP[SD]"
- in:
- - *fp
- - *fp
- out:
- - *fp
+++ /dev/null
-!sum
-- go: GaloisFieldAffineTransform
- commutative: false
- documentation: !string |-
- // NAME computes an affine transformation in GF(2^8):
- // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
- // b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
- // corresponding to a group of 8 elements in x.
-- go: GaloisFieldAffineTransformInverse
- commutative: false
- documentation: !string |-
- // NAME computes an affine transformation in GF(2^8),
- // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
- // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
- // b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
- // corresponding to a group of 8 elements in x.
-- go: GaloisFieldMul
- commutative: false
- documentation: !string |-
- // NAME computes element-wise GF(2^8) multiplication with
- // reduction polynomial x^8 + x^4 + x^3 + x + 1.
-- go: carrylessMultiply
- commutative: false
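GaloisFieldMul's doc above names the reduction polynomial x^8 + x^4 + x^3 + x + 1. A scalar model of the byte-wise arithmetic follows; gfMul is a hypothetical helper, not the generated method.

```go
package main

import "fmt"

// gfMul multiplies two bytes in GF(2^8) with reduction polynomial
// x^8 + x^4 + x^3 + x + 1 (0x11b), as in the GaloisFieldMul doc.
func gfMul(a, b uint8) uint8 {
	var p uint8
	for b != 0 {
		if b&1 != 0 {
			p ^= a
		}
		carry := a & 0x80
		a <<= 1
		if carry != 0 {
			a ^= 0x1b // subtract (xor) the reduction polynomial
		}
		b >>= 1
	}
	return p
}

func main() {
	fmt.Printf("%#02x\n", gfMul(0x57, 0x83)) // 0xc1
}
```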
+++ /dev/null
-!sum
-- go: GaloisFieldAffineTransform
- asm: VGF2P8AFFINEQB
- operandOrder: 2I # 2nd operand, then immediate
- in: &AffineArgs
- - &uint8
- go: $t
- base: uint
- - &uint8x8
- go: $t2
- base: uint
- - &pureImmVar
- class: immediate
- immOffset: 0
- name: b
- out:
- - *uint8
-
-- go: GaloisFieldAffineTransformInverse
- asm: VGF2P8AFFINEINVQB
- operandOrder: 2I # 2nd operand, then immediate
- in: *AffineArgs
- out:
- - *uint8
-
-- go: GaloisFieldMul
- asm: VGF2P8MULB
- in:
- - *uint8
- - *uint8
- out:
- - *uint8
-
-- go: carrylessMultiply
- documentation: !string |-
- // NAME computes one of four possible Galois polynomial
- // products of selected high and low halves of x and y,
- // depending on the value of xyHiLo, returning the 128-bit
- // product in the concatenated two elements of the result.
- // Bit 0 selects the low (0) or high (1) element of x and
- // bit 4 selects the low (0x00) or high (0x10) element of y.
- asm: V?PCLMULQDQ
- in:
- - go: Uint64x2
- - go: Uint64x2
- - class: immediate
- immOffset: 0
- name: xyHiLo
- out:
- - go: Uint64x2
- overwriteElementBits: 64
- hideMaskMethods: true
-
-- go: carrylessMultiply
- documentation: !string |-
- // NAME computes one of four possible Galois polynomial
- // products of selected high and low halves of each of the two
- // 128-bit lanes of x and y, depending on the value of xyHiLo,
- // and returns the two 128-bit products in the result's lanes.
- // Bit 0 selects the low (0) or high (1) elements of x's lanes and
- // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
- asm: V?PCLMULQDQ
- in:
- - go: Uint64x4
- - go: Uint64x4
- - class: immediate
- immOffset: 0
- name: xyHiLo
- out:
- - go: Uint64x4
- overwriteElementBits: 64
- hideMaskMethods: true
-
-- go: carrylessMultiply
- documentation: !string |-
- // NAME computes one of four possible Galois polynomial
- // products of selected high and low halves of each of the four
- // 128-bit lanes of x and y, depending on the value of xyHiLo,
- // and returns the four 128-bit products in the result's lanes.
- // Bit 0 selects the low (0) or high (1) elements of x's lanes and
- // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
- asm: V?PCLMULQDQ
- in:
- - go: Uint64x8
- - go: Uint64x8
- - class: immediate
- immOffset: 0
- name: xyHiLo
- out:
- - go: Uint64x8
- overwriteElementBits: 64
- hideMaskMethods: true
+++ /dev/null
-!sum
-- go: Average
- commutative: true
- documentation: !string |-
- // NAME computes the rounded average of corresponding elements.
-- go: Abs
- commutative: false
- # Unary operation, not commutative
- documentation: !string |-
- // NAME computes the absolute value of each element.
-- go: CopySign
- # Applies sign of second operand to first: sign(val, sign_src)
- commutative: false
- documentation: !string |-
- // NAME returns the product of the first operand with -1, 0, or 1,
- // whichever constant is nearest to the value of the second operand.
- # Sign does not have masked version
-- go: OnesCount
- commutative: false
- documentation: !string |-
- // NAME counts the number of set bits in each element.
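The "rounded average" above rounds halves up for the underlying VPAVG instructions, i.e. (a + b + 1) / 2 computed without overflow; that detail comes from the instruction behavior rather than the doc string, so treat this as an assumption-labeled scalar model.

```go
package main

import "fmt"

// roundedAvg models Average on unsigned bytes: (a + b + 1) / 2,
// widened so the intermediate sum cannot overflow.
func roundedAvg(a, b uint8) uint8 {
	return uint8((uint16(a) + uint16(b) + 1) / 2)
}

func main() {
	fmt.Println(roundedAvg(3, 4))     // 4 (3.5 rounds up)
	fmt.Println(roundedAvg(255, 255)) // 255
}
```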
+++ /dev/null
-!sum
-# Average (unsigned byte, unsigned word)
-# Instructions: VPAVGB, VPAVGW
-- go: Average
- asm: "VPAVG[BW]" # Matches VPAVGB (byte) and VPAVGW (word)
- in:
- - &uint_t # $t will be Uint8xN for VPAVGB, Uint16xN for VPAVGW
- go: $t
- base: uint
- - *uint_t
- out:
- - *uint_t
-
-# Absolute Value (signed byte, word, dword, qword)
-# Instructions: VPABSB, VPABSW, VPABSD, VPABSQ
-- go: Abs
- asm: "VPABS[BWDQ]" # Matches VPABSB, VPABSW, VPABSD, VPABSQ
- in:
- - &int_t # $t will be Int8xN, Int16xN, Int32xN, Int64xN
- go: $t
- base: int
- out:
- - *int_t # Output is magnitude, fits in the same signed type
-
-# Sign Operation (signed byte, word, dword)
-# Applies sign of second operand to the first.
-# Instructions: VPSIGNB, VPSIGNW, VPSIGND
-- go: CopySign
- asm: "VPSIGN[BWD]" # Matches VPSIGNB, VPSIGNW, VPSIGND
- in:
- - *int_t # value to apply sign to
- - *int_t # value from which to take the sign
- out:
- - *int_t
-
-# Population Count (count set bits in each element)
-# Instructions: VPOPCNTB, VPOPCNTW (AVX512_BITALG)
-# VPOPCNTD, VPOPCNTQ (AVX512_VPOPCNTDQ)
-- go: OnesCount
- asm: "VPOPCNT[BWDQ]"
- in:
- - &any
- go: $t
- out:
- - *any
+++ /dev/null
-!sum
-- go: DotProductPairs
- commutative: false
- documentation: !string |-
- // NAME multiplies the elements and adds the pairs together,
- // yielding a vector of half as many elements with twice the input element size.
-# TODO: maybe simplify this name within the receiver-type + method-naming scheme we use.
-- go: DotProductPairsSaturated
- commutative: false
- documentation: !string |-
- // NAME multiplies the elements and adds the pairs together with saturation,
- // yielding a vector of half as many elements with twice the input element size.
-# QuadDotProduct, i.e. VPDPBUSD(S), are operations with src/dst on the same register; we are not supporting this as of now.
-# - go: DotProductBroadcast
-# commutative: true
-# # documentation: !string |-
-# // NAME multiplies all elements and broadcasts the sum.
-- go: DotProductQuadruple
- commutative: false
- documentation: !string |-
- // NAME performs dot products on groups of 4 elements of x and y.
- // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-- go: DotProductQuadrupleSaturated
- commutative: false
- documentation: !string |-
- // NAME performs dot products on groups of 4 elements of x and y with saturation.
- // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-- go: AddDotProductPairs
- commutative: false
- noTypes: "true"
- noGenericOps: "true"
- documentation: !string |-
- // NAME performs dot products on pairs of elements of y and z and then adds x.
-- go: MulAdd
- commutative: false
- documentation: !string |-
- // NAME performs a fused (x * y) + z.
-- go: MulAddSub
- commutative: false
- documentation: !string |-
- // NAME performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-- go: MulSubAdd
- commutative: false
- documentation: !string |-
- // NAME performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-- go: SumAbsDiff
- commutative: false
- documentation: !string |-
- // NAME sums the absolute differences of the two input vectors, treating each adjacent 8 bytes as a group. The output
- // is a vector of word-sized elements in which each 4*n-th element contains the sum for the n-th input group; the other elements in the result vector are zeroed.
- // This method can be seen as computing the L1 distance of each adjacent 8-byte group of the two input vectors.
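A plain-Go model of the SumAbsDiff layout for 16-byte inputs; sumAbsDiff is a made-up helper that mirrors the doc string, not the generated method.

```go
package main

import "fmt"

// sumAbsDiff treats each adjacent 8 bytes as a group, sums the
// absolute differences within each group, and stores the n-th
// group's sum in word element 4*n; other word elements stay zero.
func sumAbsDiff(x, y [16]uint8) [8]uint16 {
	var out [8]uint16
	for g := 0; g < 2; g++ {
		var sum uint16
		for i := 0; i < 8; i++ {
			a, b := x[g*8+i], y[g*8+i]
			if a >= b {
				sum += uint16(a - b)
			} else {
				sum += uint16(b - a)
			}
		}
		out[4*g] = sum
	}
	return out
}

func main() {
	var x, y [16]uint8
	for i := range x {
		x[i], y[i] = uint8(i), uint8(2*i)
	}
	fmt.Println(sumAbsDiff(x, y)) // [28 0 0 0 92 0 0 0]
}
```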
+++ /dev/null
-!sum
-- go: DotProductPairs
- asm: VPMADDWD
- in:
- - &int
- go: $t
- base: int
- - *int
- out:
- - &int2 # The elemBits are different
- go: $t2
- base: int
-- go: DotProductPairsSaturated
- asm: VPMADDUBSW
- in:
- - &uint
- go: $t
- base: uint
- overwriteElementBits: 8
- - &int3
- go: $t3
- base: int
- overwriteElementBits: 8
- out:
- - *int2
-# - go: DotProductBroadcast
-# asm: VDPP[SD]
-# in:
-# - &dpb_src
-# go: $t
-# - *dpb_src
-# - class: immediate
-# const: 127
-# out:
-# - *dpb_src
-- go: DotProductQuadruple
- asm: "VPDPBUSD"
- operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
- in:
- - &qdpa_acc
- go: $t_acc
- base: int
- elemBits: 32
- - &qdpa_src1
- go: $t_src1
- base: uint
- overwriteElementBits: 8
- - &qdpa_src2
- go: $t_src2
- base: int
- overwriteElementBits: 8
- out:
- - *qdpa_acc
-- go: DotProductQuadrupleSaturated
- asm: "VPDPBUSDS"
- operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
- in:
- - *qdpa_acc
- - *qdpa_src1
- - *qdpa_src2
- out:
- - *qdpa_acc
-- go: AddDotProductPairs
- asm: "VPDPWSSD"
- in:
- - &pdpa_acc
- go: $t_acc
- base: int
- elemBits: 32
- - &pdpa_src1
- go: $t_src1
- base: int
- overwriteElementBits: 16
- - &pdpa_src2
- go: $t_src2
- base: int
- overwriteElementBits: 16
- out:
- - *pdpa_acc
-- go: MulAdd
- asm: "VFMADD213PS|VFMADD213PD"
- in:
- - &fma_op
- go: $t
- base: float
- - *fma_op
- - *fma_op
- out:
- - *fma_op
-- go: MulAddSub
- asm: "VFMADDSUB213PS|VFMADDSUB213PD"
- in:
- - *fma_op
- - *fma_op
- - *fma_op
- out:
- - *fma_op
-- go: MulSubAdd
- asm: "VFMSUBADD213PS|VFMSUBADD213PD"
- in:
- - *fma_op
- - *fma_op
- - *fma_op
- out:
- - *fma_op
-- go: SumAbsDiff
- asm: "VPSADBW"
- in:
- - go: $t
- base: uint
- - go: $t
- base: uint
- out:
- - go: $t2
- base: uint
\ No newline at end of file
+++ /dev/null
-!sum
-- go: Max
- commutative: true
- documentation: !string |-
- // NAME computes the maximum of corresponding elements.
-- go: Min
- commutative: true
- documentation: !string |-
- // NAME computes the minimum of corresponding elements.
+++ /dev/null
-!sum
-- go: Max
- asm: "V?PMAXS[BWDQ]"
- in: &2int
- - &int
- go: $t
- base: int
- - *int
- out: &1int
- - *int
-- go: Max
- asm: "V?PMAXU[BWDQ]"
- in: &2uint
- - &uint
- go: $t
- base: uint
- - *uint
- out: &1uint
- - *uint
-
-- go: Min
- asm: "V?PMINS[BWDQ]"
- in: *2int
- out: *1int
-- go: Min
- asm: "V?PMINU[BWDQ]"
- in: *2uint
- out: *1uint
-
-- go: Max
- asm: "V?MAXP[SD]"
- in: &2float
- - &float
- go: $t
- base: float
- - *float
- out: &1float
- - *float
-- go: Min
- asm: "V?MINP[SD]"
- in: *2float
- out: *1float
+++ /dev/null
-!sum
-- go: SetElem
- commutative: false
- documentation: !string |-
- // NAME sets a single constant-indexed element's value.
-- go: GetElem
- commutative: false
- documentation: !string |-
- // NAME retrieves a single constant-indexed element's value.
-- go: SetLo
- commutative: false
- constImm: 0
- documentation: !string |-
- // NAME returns x with its lower half set to y.
-- go: GetLo
- commutative: false
- constImm: 0
- documentation: !string |-
- // NAME returns the lower half of x.
-- go: SetHi
- commutative: false
- constImm: 1
- documentation: !string |-
- // NAME returns x with its upper half set to y.
-- go: GetHi
- commutative: false
- constImm: 1
- documentation: !string |-
- // NAME returns the upper half of x.
-- go: PermuteOrZero
- commutative: false
- documentation: !string |-
- // NAME performs a full permutation of vector x using indices:
- // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-- go: Permute
- commutative: false
- documentation: !string |-
- // NAME performs a full permutation of vector x using indices:
- // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-- go: ConcatPermute # ConcatPermute is only available on or after AVX512
- commutative: false
- documentation: !string |-
- // NAME performs a full permutation of vector x, y using indices:
- // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
- // where xy is the concatenation of x (lower half) and y (upper half).
- // Only the bits needed to represent an index into xy are used from each element of indices.
-- go: Compress
- commutative: false
- documentation: !string |-
- // NAME performs a compression on vector x using mask by
- // selecting the elements indicated by mask and packing them into the lower-indexed elements.
-- go: blend
- commutative: false
- documentation: !string |-
- // NAME blends two vectors based on mask values, choosing either
- // the first or the second based on whether the third is false or true
-- go: move
- commutative: false
- noTypes: "true"
- noGenericOps: "true"
-- go: Expand
- commutative: false
- documentation: !string |-
- // NAME performs an expansion of a vector x whose elements are packed into its lower part.
- // The expansion distributes those elements to the positions selected by mask, from lower mask elements to upper, in order.
-- go: Broadcast128
- commutative: false
- documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 128-bit output vector.
-- go: Broadcast256
- commutative: false
- documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 256-bit output vector.
-- go: Broadcast512
- commutative: false
- documentation: !string |-
- // NAME copies element zero of its (128-bit) input to all elements of
- // the 512-bit output vector.
-- go: PermuteOrZeroGrouped
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a grouped permutation of vector x using indices:
-- go: PermuteGrouped
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a grouped permutation of vector x using indices:
-- go: permuteScalars
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a permutation of vector x using constant indices:
-- go: permuteScalarsGrouped
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a grouped permutation of vector x using constant indices:
-- go: permuteScalarsLo
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a permutation of vector x using constant indices:
-- go: permuteScalarsLoGrouped
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a grouped permutation of vector x using constant indices:
-- go: permuteScalarsHi
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a permutation of vector x using constant indices:
-- go: permuteScalarsHiGrouped
- commutative: false
- documentation: !string |- # Detailed documentation will rely on the specific ops.
- // NAME performs a grouped permutation of vector x using constant indices:
-- go: InterleaveHi
- commutative: false
- documentation: !string |-
- // NAME interleaves the elements of the high halves of x and y.
-- go: InterleaveLo
- commutative: false
- documentation: !string |-
- // NAME interleaves the elements of the low halves of x and y.
-- go: InterleaveHiGrouped
- commutative: false
- documentation: !string |-
- // NAME interleaves the elements of the high half of each 128-bit subvector of x and y.
-- go: InterleaveLoGrouped
- commutative: false
- documentation: !string |-
- // NAME interleaves the elements of the low half of each 128-bit subvector of x and y.
-
-- go: concatSelectedConstant
- commutative: false
- out:
- - elemBits: 32
- documentation: !string |-
- // NAME concatenates selected elements from x and y into the lower and upper
- // halves of the output. The selection is chosen by the constant parameter h1h0l1l0
- // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
- // For example, {0,1,2,3}.NAME(0b_11_01_00_10, {4,5,6,7}) returns
- // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-
-- go: concatSelectedConstant
- commutative: false
- out:
- - elemBits: 64
- documentation: !string |-
- // NAME concatenates selected elements from x and y into the lower and upper
- // halves of the output. The selection is chosen by the constant parameter hilo
- // where hi and lo are each one bit specifying which 64-bit element to select
- // from y and x. For example {4,5}.NAME(0b10, {6,7})
- // returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
- // selecting from y, is 1, and selects 7.
-
-- go: concatSelectedConstantGrouped
- commutative: false
- out:
- - elemBits: 32
- bits: 256
- documentation: !string |-
- // NAME concatenates selected elements from 128-bit subvectors of x and y
- // into the lower and upper halves of corresponding subvectors of the output.
- // The selection is chosen by the constant parameter h1h0l1l0
- // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
- // For example,
- // {0,1,2,3,8,9,10,11}.NAME(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
- // returns {2,0,5,7,10,8,13,15}
- // (don't forget that the binary constant is written big-endian).
-
-- go: concatSelectedConstantGrouped
- commutative: false
- out:
- - elemBits: 64
- bits: 256
- documentation: !string |-
- // NAME concatenates selected elements from 128-bit subvectors of x and y
- // into the lower and upper halves of corresponding subvectors of the output.
- // The selections are specified by the constant parameter hilos where each
- // hi and lo pair selects 64-bit elements from the corresponding 128-bit
- // subvectors of x and y.
- //
- // For example {4,5,8,9}.NAME(0b_11_10, {6,7,10,11})
- // returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
- // 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
- // then 1, selecting element 1 from x's upper 128 bits (9), then 1,
- // selecting element 1 from y's upper 128 bits (11).
- // This differs from the same method applied to a 32x8 vector, where
- // the 8-bit constant performs the same selection on both subvectors.
-
-- go: concatSelectedConstantGrouped
- commutative: false
- out:
- - elemBits: 32
- bits: 512
- documentation: !string |-
- // NAME concatenates selected elements from 128-bit subvectors of x and y
- // into the lower and upper halves of corresponding subvectors of the output.
- // The selection is chosen by the constant parameter h1h0l1l0
- // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
- // For example,
- //
- // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME(
- // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
- //
- // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
- //
- // (don't forget that the binary constant is written big-endian).
-
-- go: concatSelectedConstantGrouped
- commutative: false
- out:
- - elemBits: 64
- bits: 512
- documentation: !string |-
- // NAME concatenates selected elements from 128-bit subvectors of x and y
- // into the lower and upper halves of corresponding subvectors of the output.
- // The selections are specified by the constant parameter hilos where each
- // hi and lo pair selects 64-bit elements from the corresponding 128-bit
- // subvectors of x and y.
- //
- // For example {4,5,8,9,12,13,16,17}.NAME(0b11_00_11_10, {6,7,10,11,14,15,18,19})
- // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
- // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
- // then 1, selecting element 1 from x's next 128 bits (9), then 1,
- // selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
- // the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
- // 1 bits select the upper elements from x and y's last 128 bits (17, 19).
- // This differs from the same method applied to a 32x8 or 32x16 vector, where
- // the 8-bit constant performs the same selection on all the subvectors.
-
-- go: Select128FromPair
- commutative: false
- documentation: !string |-
- // NAME treats the 256-bit vectors x and y as a single vector of four
- // 128-bit elements, and returns a 256-bit result formed by
- // concatenating the two elements specified by lo and hi.
-
-- go: ConcatShiftBytesRight
- commutative: false
- documentation: !string |-
- // NAME concatenates x and y and shifts the result right by a constant number of bytes.
- // The result vector is the lower half of the shifted concatenation.
-
-- go: ConcatShiftBytesRightGrouped
- commutative: false
- documentation: !string |-
- // NAME concatenates x and y and shifts the result right by a constant number of bytes.
- // The result vector is the lower half of the shifted concatenation.
- // This operation is performed separately on each 16-byte group.
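The worked example in the 32-bit concatSelectedConstant doc above can be checked with a scalar model; concatSelected below is hypothetical and simply follows the documented h1h0l1l0 decoding.

```go
package main

import "fmt"

// concatSelected fills the low half of the result from x[l0], x[l1]
// and the high half from y[h0], y[h1], where imm is h1h0l1l0 with
// two bits per selector.
func concatSelected(x [4]uint32, imm uint8, y [4]uint32) [4]uint32 {
	l0 := imm & 3
	l1 := (imm >> 2) & 3
	h0 := (imm >> 4) & 3
	h1 := (imm >> 6) & 3
	return [4]uint32{x[l0], x[l1], y[h0], y[h1]}
}

func main() {
	x := [4]uint32{0, 1, 2, 3}
	y := [4]uint32{4, 5, 6, 7}
	fmt.Println(concatSelected(x, 0b_11_01_00_10, y)) // [2 0 5 7], matching the doc example
}
```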
+++ /dev/null
-!sum
-- go: SetElem
- asm: "VPINSR[BWDQ]"
- in:
- - &t
- class: vreg
- base: $b
- - class: greg
- base: $b
- lanes: 1 # Scalar, darn it!
- - &imm
- class: immediate
- immOffset: 0
- name: index
- out:
- - *t
-
-- go: SetElem
- asm: "VPINSR[DQ]"
- in:
- - &t
- class: vreg
- base: int
- OverwriteBase: float
- - class: greg
- base: int
- OverwriteBase: float
- lanes: 1 # Scalar, darn it!
- - &imm
- class: immediate
- immOffset: 0
- name: index
- out:
- - *t
-
-- go: GetElem
- asm: "VPEXTR[BWDQ]"
- in:
- - class: vreg
- base: $b
- elemBits: $e
- - *imm
- out:
- - class: greg
- base: $b
- bits: $e
-
-- go: GetElem
- asm: "VPEXTR[DQ]"
- in:
- - class: vreg
- base: int
- elemBits: $e
- OverwriteBase: float
- - *imm
- out:
- - class: greg
- base: int
- bits: $e
- OverwriteBase: float
-
-- go: "SetHi|SetLo"
- regexpTag: "move"
- asm: "VINSERTI128|VINSERTI64X4"
- inVariant: []
- in:
- - &i8x2N
- class: vreg
- base: $t
- OverwriteElementBits: 8
- - &i8xN
- class: vreg
- base: $t
- OverwriteElementBits: 8
- - &imm01 # This immediate should be only 0 or 1
- class: immediate
- const: 0 # place holder
- name: index
- out:
- - *i8x2N
-
-- go: "GetHi|GetLo"
- asm: "VEXTRACTI128|VEXTRACTI64X4"
- regexpTag: "move"
- inVariant: []
- in:
- - *i8x2N
- - *imm01
- out:
- - *i8xN
-
-- go: "SetHi|SetLo"
- asm: "VINSERTI128|VINSERTI64X4"
- regexpTag: "move"
- inVariant: []
- in:
- - &i16x2N
- class: vreg
- base: $t
- OverwriteElementBits: 16
- - &i16xN
- class: vreg
- base: $t
- OverwriteElementBits: 16
- - *imm01
- out:
- - *i16x2N
-
-- go: "GetHi|GetLo"
- regexpTag: "move"
- asm: "VEXTRACTI128|VEXTRACTI64X4"
- inVariant: []
- in:
- - *i16x2N
- - *imm01
- out:
- - *i16xN
-
-- go: "SetHi|SetLo"
- regexpTag: "move"
- asm: "VINSERTI128|VINSERTI64X4"
- inVariant: []
- in:
- - &i32x2N
- class: vreg
- base: $t
- OverwriteElementBits: 32
- - &i32xN
- class: vreg
- base: $t
- OverwriteElementBits: 32
- - *imm01
- out:
- - *i32x2N
-
-- go: "GetHi|GetLo"
- regexpTag: "move"
- asm: "VEXTRACTI128|VEXTRACTI64X4"
- inVariant: []
- in:
- - *i32x2N
- - *imm01
- out:
- - *i32xN
-
-- go: "SetHi|SetLo"
- regexpTag: "move"
- asm: "VINSERTI128|VINSERTI64X4"
- inVariant: []
- in:
- - &i64x2N
- class: vreg
- base: $t
- OverwriteElementBits: 64
- - &i64xN
- class: vreg
- base: $t
- OverwriteElementBits: 64
- - *imm01
- out:
- - *i64x2N
-
-- go: "GetHi|GetLo"
- regexpTag: "move"
- asm: "VEXTRACTI128|VEXTRACTI64X4"
- inVariant: []
- in:
- - *i64x2N
- - *imm01
- out:
- - *i64xN
-
-- go: "SetHi|SetLo"
- regexpTag: "move"
- asm: "VINSERTF128|VINSERTF64X4"
- inVariant: []
- in:
- - &f32x2N
- class: vreg
- base: $t
- OverwriteElementBits: 32
- - &f32xN
- class: vreg
- base: $t
- OverwriteElementBits: 32
- - *imm01
- out:
- - *f32x2N
-
-- go: "GetHi|GetLo"
- regexpTag: "move"
- asm: "VEXTRACTF128|VEXTRACTF64X4"
- inVariant: []
- in:
- - *f32x2N
- - *imm01
- out:
- - *f32xN
-
-- go: "SetHi|SetLo"
- regexpTag: "move"
- asm: "VINSERTF128|VINSERTF64X4"
- inVariant: []
- in:
- - &f64x2N
- class: vreg
- base: $t
- OverwriteElementBits: 64
- - &f64xN
- class: vreg
- base: $t
- OverwriteElementBits: 64
- - *imm01
- out:
- - *f64x2N
-
-- go: "GetHi|GetLo"
- regexpTag: "move"
- asm: "VEXTRACTF128|VEXTRACTF64X4"
- inVariant: []
- in:
- - *f64x2N
- - *imm01
- out:
- - *f64xN
-
-- go: Permute
- asm: "VPERMQ|VPERMPD"
- addDoc: !string |-
- // The low 2 bits (values 0-3) of each element of indices are used
- operandOrder: "21Type1"
- in:
- - &anyindices
- go: $t
- name: indices
- overwriteBase: uint
- - &any4
- go: $t
- lanes: 4
- out:
- - &any
- go: $t
-
-- go: Permute
- asm: "VPERM[WDQ]|VPERMP[SD]"
- addDoc: !string |-
- // The low 3 bits (values 0-7) of each element of indices are used
- operandOrder: "21Type1"
- in:
- - *anyindices
- - &any8
- go: $t
- lanes: 8
- out:
- - *any
-
-- go: Permute
- asm: "VPERM[BWD]|VPERMPS"
- addDoc: !string |-
- // The low 4 bits (values 0-15) of each element of indices are used
- operandOrder: "21Type1"
- in:
- - *anyindices
- - &any16
- go: $t
- lanes: 16
- out:
- - *any
-
-- go: Permute
- asm: "VPERM[BW]"
- addDoc: !string |-
- // The low 5 bits (values 0-31) of each element of indices are used
- operandOrder: "21Type1"
- in:
- - *anyindices
- - &any32
- go: $t
- lanes: 32
- out:
- - *any
-
-- go: Permute
- asm: "VPERMB"
- addDoc: !string |-
- // The low 6 bits (values 0-63) of each element of indices are used
- operandOrder: "21Type1"
- in:
- - *anyindices
- - &any64
- go: $t
- lanes: 64
- out:
- - *any
-
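For the Permute family above, a scalar sketch (the slice types and the helper name are illustrative only) captures the "only the low log2(lanes) bits of each index are used" rule:

// Each output lane i is x[indices[i] mod len(x)]; for the power-of-two lane
// counts used here that is the same as masking off the index's low bits.
func permute(indices, x []uint32) []uint32 {
	out := make([]uint32, len(x))
	for i := range out {
		out[i] = x[int(indices[i])&(len(x)-1)]
	}
	return out
}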
-- go: ConcatPermute
- asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
- # Because we are overwriting the receiver's type, we
- # have to move the receiver to be a parameter so that
- # we can have no duplication.
- operandOrder: "231Type1"
- in:
- - *anyindices # result in arg 0
- - *any
- - *any
- out:
- - *any
-
-- go: Compress
- asm: "VPCOMPRESS[BWDQ]|VCOMPRESSP[SD]"
- in:
- # The mask in Compress is a control mask rather than a write mask, so it's not optional.
- - class: mask
- - *any
- out:
- - *any
-
-# For now a non-public method because
-# (1) [OverwriteClass] must be set together with [OverwriteBase]
-# (2) "simdgen does not support [OverwriteClass] in inputs".
-# That means the signature is wrong.
-- go: blend
- asm: VPBLENDVB
- zeroing: false
- in:
- - &v
- go: $t
- class: vreg
- base: int
- - *v
- -
- class: vreg
- base: int
- name: mask
- out:
- - *v
-
-# For AVX512
-- go: blend
- asm: VPBLENDM[BWDQ]
- zeroing: false
- in:
- - &v
- go: $t
- bits: 512
- class: vreg
- base: int
- - *v
- inVariant:
- -
- class: mask
- out:
- - *v
-
- # For AVX512
-- go: move
- asm: VMOVDQU(8|16|32|64)
- zeroing: true
- in:
- - &v
- go: $t
- class: vreg
- base: int|uint
- inVariant:
- -
- class: mask
- out:
- - *v
-
-- go: Expand
- asm: "VPEXPAND[BWDQ]|VEXPANDP[SD]"
- in:
- # The mask in Expand is a control mask rather than a write mask, so it's not optional.
- - class: mask
- - *any
- out:
- - *any
-
-- go: Broadcast128
- asm: VPBROADCAST[BWDQ]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
-
-# weirdly, this one case on AVX2 is memory-operand-only
-- go: Broadcast128
- asm: VPBROADCASTQ
- in:
- - class: vreg
- bits: 128
- elemBits: 64
- base: int
- OverwriteBase: float
- out:
- - class: vreg
- bits: 128
- elemBits: 64
- base: int
- OverwriteBase: float
-
-- go: Broadcast256
- asm: VPBROADCAST[BWDQ]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 256
- elemBits: $e
- base: $b
-
-- go: Broadcast512
- asm: VPBROADCAST[BWDQ]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 512
- elemBits: $e
- base: $b
-
-- go: Broadcast128
- asm: VBROADCASTS[SD]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
-
-- go: Broadcast256
- asm: VBROADCASTS[SD]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 256
- elemBits: $e
- base: $b
-
-- go: Broadcast512
- asm: VBROADCASTS[SD]
- in:
- - class: vreg
- bits: 128
- elemBits: $e
- base: $b
- out:
- - class: vreg
- bits: 512
- elemBits: $e
- base: $b
-
-# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
-- go: PermuteOrZero
- asm: VPSHUFB
- addDoc: !string |-
- // The lower four bits of each byte-sized index in indices select an element from x,
- // unless the index's sign bit is set in which case zero is used instead.
- in:
- - &128any
- bits: 128
- go: $t
- - bits: 128
- name: indices
- base: int # always signed
- out:
- - *128any
-
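A minimal scalar model of the 128-bit PermuteOrZero semantics (VPSHUFB), with illustrative fixed-size array types:

// If an index's sign bit is set the output byte stays zero; otherwise the
// low four bits of the index select a byte of x.
func permuteOrZero(x, indices [16]byte) [16]byte {
	var out [16]byte // zero-initialized, so the sign-bit case is implicit
	for i, idx := range indices {
		if idx&0x80 == 0 {
			out[i] = x[idx&0x0f]
		}
	}
	return out
}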
-- go: PermuteOrZeroGrouped
- asm: VPSHUFB
- addDoc: !string |-
- // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
- // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
- // unless the index's sign bit is set in which case zero is used instead.
- // Each group is of size 128-bit.
- in:
- - &256Or512any
- bits: "256|512"
- go: $t
- - bits: "256|512"
- base: int
- name: indices
- out:
- - *256Or512any
-
-- go: permuteScalars
- asm: VPSHUFD
- addDoc: !string |-
- // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- in:
- - *128any
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *128any
-
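The packed-index decoding used by permuteScalars can be sketched as follows (illustrative types; the real operation is the VPSHUFD immediate form):

// indices packs four 2-bit lane selectors, lowest bits first: indices[1:0]
// selects out[0], indices[3:2] selects out[1], and so on.
func permuteScalars(x [4]uint32, indices uint8) [4]uint32 {
	var out [4]uint32
	for i := range out {
		out[i] = x[(indices>>(2*i))&3]
	}
	return out
}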
-- go: permuteScalarsGrouped
- asm: VPSHUFD
- addDoc: !string |-
- // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- // Each group is of size 128-bit.
- in:
- - *256Or512any
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *256Or512any
-
-- go: permuteScalarsLo
- asm: VPSHUFLW
- addDoc: !string |-
- // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- in:
- - &128lanes8
- bits: 128
- go: $t
- elemBits: 16
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *128lanes8
-
-- go: permuteScalarsLoGrouped
- asm: VPSHUFLW
- addDoc: !string |-
- //
- // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
- // x_group1[indices[0:2]], ...}
- //
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- // Each group is of size 128-bit.
- in:
- - &256Or512lanes8
- bits: "256|512"
- go: $t
- elemBits: 16
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *256Or512lanes8
-
-- go: permuteScalarsHi
- asm: VPSHUFHW
- addDoc: !string |-
- // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- in:
- - *128lanes8
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *128lanes8
-
-- go: permuteScalarsHiGrouped
- asm: VPSHUFHW
- addDoc: !string |-
- // result =
- //
- // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
- // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
- //
- // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
- // Each group is of size 128-bit.
- in:
- - *256Or512lanes8
- - class: immediate
- immOffset: 0
- name: indices
- hideMaskMethods: true
- out:
- - *256Or512lanes8
-
-- go: InterleaveHi
- asm: VPUNPCKH(QDQ|DQ|WD|WB)
- in:
- - *128any
- - *128any
- inVariant: []
- out:
- - *128any
-
-- go: InterleaveLo
- asm: VPUNPCKL(QDQ|DQ|WD|WB)
- in:
- - *128any
- - *128any
- inVariant: []
- out:
- - *128any
-
-- go: InterleaveHiGrouped
- asm: VPUNPCKH(QDQ|DQ|WD|WB)
- in:
- - *256Or512any
- - *256Or512any
- inVariant: []
- out:
- - *256Or512any
-
-- go: InterleaveLoGrouped
- asm: VPUNPCKL(QDQ|DQ|WD|WB)
- in:
- - *256Or512any
- - *256Or512any
- inVariant: []
- out:
- - *256Or512any
-
-# These are all described separately to carry the name of the constant parameter
-
-- go: concatSelectedConstant
- asm: VSHUFPS
- width: 32
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstant
- asm: VSHUFPS
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- OverwriteBase: int
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstant
- asm: VSHUFPS
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- OverwriteBase: uint
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPS
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPS
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- OverwriteBase: int
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPS
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- OverwriteBase: uint
- - *v
- - class: immediate
- immOffset: 0
- name: h1h0l1l0
- inVariant: []
- out:
- - *v
-
-
- # 64 bit versions
-
-- go: concatSelectedConstant
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- - *v
- - class: immediate
- immOffset: 0
- name: hilo
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstant
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- OverwriteBase: int
- - *v
- - class: immediate
- immOffset: 0
- name: hilo
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstant
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 128
- OverwriteBase: uint
- - *v
- - class: immediate
- immOffset: 0
- name: hilo
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- - *v
- - class: immediate
- immOffset: 0
- name: hilos
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- OverwriteBase: int
- - *v
- - class: immediate
- immOffset: 0
- name: hilos
- inVariant: []
- out:
- - *v
-
-- go: concatSelectedConstantGrouped
- asm: VSHUFPD
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: "256|512"
- OverwriteBase: uint
- - *v
- - class: immediate
- immOffset: 0
- name: hilos
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2F128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
- //
- // returns {70, 71, 40, 41}.
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 256
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2F128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
- //
- // returns {70, 71, 72, 73, 40, 41, 42, 43}.
- in:
- - &v
- go: $t
- class: vreg
- base: float
- bits: 256
- OverwriteElementBits: 32
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2I128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
- //
- // returns {70, 71, 40, 41}.
- in:
- - &v
- go: $t
- class: vreg
- base: int|uint
- bits: 256
- OverwriteElementBits: 64
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2I128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
- //
- // returns {70, 71, 72, 73, 40, 41, 42, 43}.
- in:
- - &v
- go: $t
- class: vreg
- base: int|uint
- bits: 256
- OverwriteElementBits: 32
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2I128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.NAME(3, 0,
- // {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
- //
- // returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
- in:
- - &v
- go: $t
- class: vreg
- base: int|uint
- bits: 256
- OverwriteElementBits: 16
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: Select128FromPair
- asm: VPERM2I128
- operandOrder: II
- addDoc: !string |-
- // For example,
- //
- // {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.NAME(3, 0,
- // {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
- //
- // returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
- in:
- - &v
- go: $t
- class: vreg
- base: int|uint
- bits: 256
- OverwriteElementBits: 8
- - *v
- - class: immediate
- immOffset: 0
- name: "lo, hi"
- inVariant: []
- out:
- - *v
-
-- go: ConcatShiftBytesRight
- asm: VPALIGNR
- in:
- - &uint128
- go: $t
- base: uint
- bits: 128
- - *uint128
- - class: immediate
- immOffset: 0
- out:
- - *uint128
-
-- go: ConcatShiftBytesRightGrouped
- asm: VPALIGNR
- in:
- - &uint256512
- go: $t
- base: uint
- bits: 256|512
- - *uint256512
- - class: immediate
- immOffset: 0
- out:
- - *uint256512
+++ /dev/null
-!sum
-- go: Mul
- commutative: true
- documentation: !string |-
- // NAME multiplies corresponding elements of two vectors.
-- go: MulEvenWiden
- commutative: true
- documentation: !string |-
- // NAME multiplies even-indexed elements, widening the result.
- // Result[i] = v1.Even[i] * v2.Even[i].
-- go: MulHigh
- commutative: true
- documentation: !string |-
- // NAME multiplies elements and stores the high part of the result.
+++ /dev/null
-!sum
-# "Normal" multiplication is only available for floats.
-# This only covers the single and double precision.
-- go: Mul
- asm: "VMULP[SD]"
- in:
- - &fp
- go: $t
- base: float
- - *fp
- out:
- - *fp
-
-# Integer multiplications.
-
-# MulEvenWiden
-# Dword only.
-- go: MulEvenWiden
- asm: "VPMULDQ"
- in:
- - &intNot64
- go: $t
- elemBits: 8|16|32
- base: int
- - *intNot64
- out:
- - &int2
- go: $t2
- base: int
-- go: MulEvenWiden
- asm: "VPMULUDQ"
- in:
- - &uintNot64
- go: $t
- elemBits: 8|16|32
- base: uint
- - *uintNot64
- out:
- - &uint2
- go: $t2
- base: uint
-
-# MulHigh
-# Word only.
-- go: MulHigh
- asm: "VPMULHW"
- in:
- - &int
- go: $t
- base: int
- - *int
- out:
- - *int
-- go: MulHigh
- asm: "VPMULHUW"
- in:
- - &uint
- go: $t
- base: uint
- - *uint
- out:
- - *uint
-
-# MulLow
-# signed and unsigned are the same for lower bits.
-- go: Mul
- asm: "VPMULL[WDQ]"
- in:
- - &any
- go: $t
- - *any
- out:
- - *any
+++ /dev/null
-!sum
-- go: LeadingZeros
- commutative: false
- documentation: !string |-
- // NAME counts the leading zeros of each element in x.
-- go: AESEncryptOneRound
- commutative: false
- documentation: !string |-
- // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
- // x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
- // y is the chunk of w array in use.
- // result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
-- go: AESEncryptLastRound
- commutative: false
- documentation: !string |-
- // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
- // x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
- // y is the chunk of w array in use.
- // result = AddRoundKey((ShiftRows(SubBytes(x))), y)
-- go: AESRoundKeyGenAssist
- commutative: false
- documentation: !string |-
- // NAME performs some components of KeyExpansion in AES cipher algorithm defined in FIPS 197.
- // x is an array of AES words, but only x[0] and x[2] are used.
- // r is a value from the Rcon constant array.
- // result[0] = XOR(SubWord(RotWord(x[0])), r)
- // result[1] = SubWord(x[1])
- // result[2] = XOR(SubWord(RotWord(x[2])), r)
- // result[3] = SubWord(x[3])
-- go: AESDecryptOneRound
- commutative: false
- documentation: !string |-
- // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
- // x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
- // y is the chunk of dw array in use.
- // result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
-- go: AESDecryptLastRound
- commutative: false
- documentation: !string |-
- // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
- // x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
- // y is the chunk of dw array in use.
- // result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
-- go: AESInvMixColumns
- commutative: false
- documentation: !string |-
- // NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
- // x is the chunk of w array in use.
- // result = InvMixColumns(x)
-- go: SHA1FourRounds
- commutative: false
- documentation: !string |-
- // NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
- // x contains the state variables a, b, c and d from upper to lower order.
- // y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
- // result = the state variables a', b', c', d' updated after 4 rounds.
- // constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
-- go: SHA1NextE
- commutative: false
- documentation: !string |-
- // NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
- // x contains the state variable a (before the 4 rounds), placed in the upper element.
- // y is the elements of W array for next 4 rounds from upper to lower order.
- // result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
- // from upper to lower order.
- // For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
- // for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
- // computation of the value of e'.)
-- go: SHA1Message1
- commutative: false
- documentation: !string |-
- // NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
- // x = {W3, W2, W1, W0}
- // y = {0, 0, W5, W4}
- // result = {W3^W5, W2^W4, W1^W3, W0^W2}.
-- go: SHA1Message2
- commutative: false
- documentation: !string |-
- // NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
- // x = result of 2.
- // y = {W15, W14, W13}
- // result = {W19, W18, W17, W16}
-- go: SHA256TwoRounds
- commutative: false
- documentation: !string |-
- // NAME does 2 rounds of the B loop to calculate updated state variables in the SHA-256 algorithm defined in FIPS 180-4.
- // x = {h, g, d, c}
- // y = {f, e, b, a}
- // z = {W0+K0, W1+K1}
- // result = {f', e', b', a'}
- // The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
- // the corresponding element of the W array to make the input data z.
- // The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
- // y (the state variables a, b, e, f before the 2 rounds).
-- go: SHA256Message1
- commutative: false
- documentation: !string |-
- // NAME does the sigma and addition of 1 in the SHA-256 algorithm defined in FIPS 180-4.
- // x = {W0, W1, W2, W3}
- // y = {W4, 0, 0, 0}
- // result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
-- go: SHA256Message2
- commutative: false
- documentation: !string |-
- // NAME does the sigma and addition of 3 in the SHA-256 algorithm defined in FIPS 180-4.
- // x = result of 2
- // y = {0, 0, W14, W15}
- // result = {W16, W17, W18, W19}
\ No newline at end of file
+++ /dev/null
-!sum
-- go: LeadingZeros
- asm: "VPLZCNT[DQ]"
- in:
- - &any
- go: $t
- out:
- - *any
-- go: AESEncryptOneRound
- asm: VAESENC
- in:
- - &uint8s
- base: uint
- overwriteElementBits: 8
- - &uint32s
- base: uint
- overwriteElementBits: 32
- out:
- - *uint8s
-- go: AESEncryptLastRound
- asm: VAESENCLAST
- in:
- - *uint8s
- - *uint32s
- out:
- - *uint8s
-- go: AESRoundKeyGenAssist
- asm: VAESKEYGENASSIST
- in:
- - *uint32s
- - class: immediate
- immOffset: 0
- name: rconVal
- out:
- - *uint32s
-- go: AESDecryptOneRound
- asm: VAESDEC
- in:
- - *uint8s
- - *uint32s
- out:
- - *uint8s
-- go: AESDecryptLastRound
- asm: VAESDECLAST
- in:
- - *uint8s
- - *uint32s
- out:
- - *uint8s
-- go: AESInvMixColumns
- asm: VAESIMC
- in:
- - *uint32s
- out:
- - *uint32s
-- go: SHA1FourRounds
- asm: SHA1RNDS4
- operandOrder: "SHA1RNDS4"
- in: &2uint1imm
- - &uint
- go: $t
- base: uint
- - *uint
- - class: immediate
- immOffset: 0
- out: &1uint
- - *uint
-- go: SHA1NextE
- asm: SHA1NEXTE
- in: &2uint
- - *uint
- - *uint
- out: *1uint
-- go: SHA1Message1
- asm: SHA1MSG1
- in: *2uint
- out: *1uint
-- go: SHA1Message2
- asm: SHA1MSG2
- in: *2uint
- out: *1uint
-- go: SHA256TwoRounds
- asm: SHA256RNDS2
- in:
- - base: uint
- - base: uint
- - base: uint
- overwriteElementBits: 32
- out:
- - base: uint
-- go: SHA256Message1
- asm: SHA256MSG1
- in: *2uint
- out: *1uint
-- go: SHA256Message2
- asm: SHA256MSG2
- in: *2uint
- out: *1uint
+++ /dev/null
-!sum
-- go: ShiftAllLeft
- nameAndSizeCheck: true
- specialLower: sftimm
- commutative: false
- documentation: !string |-
- // NAME shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-- go: ShiftAllRight
- signed: false
- nameAndSizeCheck: true
- specialLower: sftimm
- commutative: false
- documentation: !string |-
- // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-- go: ShiftAllRight
- signed: true
- specialLower: sftimm
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-- go: shiftAllLeftConst # no APIs, only ssa ops.
- noTypes: "true"
- noGenericOps: "true"
- SSAVariant: "const" # to avoid its name colliding with reg version of this instruction, amend this to its ssa op name.
- nameAndSizeCheck: true
- commutative: false
-- go: shiftAllRightConst # no APIs, only ssa ops.
- noTypes: "true"
- noGenericOps: "true"
- SSAVariant: "const"
- signed: false
- nameAndSizeCheck: true
- commutative: false
-- go: shiftAllRightConst # no APIs, only ssa ops.
- noTypes: "true"
- noGenericOps: "true"
- SSAVariant: "const"
- signed: true
- nameAndSizeCheck: true
- commutative: false
-
-- go: ShiftLeft
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-- go: ShiftRight
- signed: false
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-- go: ShiftRight
- signed: true
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-- go: RotateAllLeft
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME rotates each element to the left by the number of bits specified by the immediate.
-- go: RotateLeft
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-- go: RotateAllRight
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME rotates each element to the right by the number of bits specified by the immediate.
-- go: RotateRight
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-- go: ShiftAllLeftConcat
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element of x to the left by the number of bits specified by the
- // immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-- go: ShiftAllRightConcat
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element of x to the right by the number of bits specified by the
- // immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-- go: ShiftLeftConcat
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element of x to the left by the number of bits specified by the
- // corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-- go: ShiftRightConcat
- nameAndSizeCheck: true
- commutative: false
- documentation: !string |-
- // NAME shifts each element of x to the right by the number of bits specified by the
- // corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
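To make the concat-shift semantics above concrete, here is a scalar sketch for 32-bit lanes (the name and the slice representation are illustrative; the hardware form is the VPSHLDD-style instruction):

// Each output lane is the high 32 bits of the 64-bit value x[i]:y[i]
// shifted left by shift (taken mod 32, i.e. only the low 5 bits matter).
func shiftAllLeftConcat32(x, y []uint32, shift uint) []uint32 {
	shift &= 31
	out := make([]uint32, len(x))
	for i := range x {
		wide := uint64(x[i])<<32 | uint64(y[i])
		out[i] = uint32((wide << shift) >> 32)
	}
	return out
}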
+++ /dev/null
-!sum
-# Integers
-# ShiftAll*
-- go: ShiftAllLeft
- asm: "VPSLL[WDQ]"
- in:
- - &any
- go: $t
- - &vecAsScalar64
- go: "Uint.*"
- treatLikeAScalarOfSize: 64
- out:
- - *any
-- go: ShiftAllRight
- signed: false
- asm: "VPSRL[WDQ]"
- in:
- - &uint
- go: $t
- base: uint
- - *vecAsScalar64
- out:
- - *uint
-- go: ShiftAllRight
- signed: true
- asm: "VPSRA[WDQ]"
- in:
- - &int
- go: $t
- base: int
- - *vecAsScalar64
- out:
- - *int
-
-- go: shiftAllLeftConst
- asm: "VPSLL[WDQ]"
- in:
- - *any
- - &imm
- class: immediate
- immOffset: 0
- out:
- - *any
-- go: shiftAllRightConst
- asm: "VPSRL[WDQ]"
- in:
- - *uint
- - *imm
- out:
- - *uint
-- go: shiftAllRightConst
- asm: "VPSRA[WDQ]"
- in:
- - *int
- - *imm
- out:
- - *int
-
-# Shift* (variable)
-- go: ShiftLeft
- asm: "VPSLLV[WD]"
- in:
- - *any
- - *any
- out:
- - *any
-# XED data for VPSLLVQ marks the element bits as 32, which does not match the actual semantics; we need to
-# overwrite it to 64.
-- go: ShiftLeft
- asm: "VPSLLVQ"
- in:
- - &anyOverwriteElemBits
- go: $t
- overwriteElementBits: 64
- - *anyOverwriteElemBits
- out:
- - *anyOverwriteElemBits
-- go: ShiftRight
- signed: false
- asm: "VPSRLV[WD]"
- in:
- - *uint
- - *uint
- out:
- - *uint
-# XED data of VPSRLVQ needs the same overwrite as VPSLLVQ.
-- go: ShiftRight
- signed: false
- asm: "VPSRLVQ"
- in:
- - &uintOverwriteElemBits
- go: $t
- base: uint
- overwriteElementBits: 64
- - *uintOverwriteElemBits
- out:
- - *uintOverwriteElemBits
-- go: ShiftRight
- signed: true
- asm: "VPSRAV[WDQ]"
- in:
- - *int
- - *int
- out:
- - *int
-
-# Rotate
-- go: RotateAllLeft
- asm: "VPROL[DQ]"
- in:
- - *any
- - &pureImm
- class: immediate
- immOffset: 0
- name: shift
- out:
- - *any
-- go: RotateAllRight
- asm: "VPROR[DQ]"
- in:
- - *any
- - *pureImm
- out:
- - *any
-- go: RotateLeft
- asm: "VPROLV[DQ]"
- in:
- - *any
- - *any
- out:
- - *any
-- go: RotateRight
- asm: "VPRORV[DQ]"
- in:
- - *any
- - *any
- out:
- - *any
-
-# Bizarre shifts.
-- go: ShiftAllLeftConcat
- asm: "VPSHLD[WDQ]"
- in:
- - *any
- - *any
- - *pureImm
- out:
- - *any
-- go: ShiftAllRightConcat
- asm: "VPSHRD[WDQ]"
- in:
- - *any
- - *any
- - *pureImm
- out:
- - *any
-- go: ShiftLeftConcat
- asm: "VPSHLDV[WDQ]"
- in:
- - *any
- - *any
- - *any
- out:
- - *any
-- go: ShiftRightConcat
- asm: "VPSHRDV[WDQ]"
- in:
- - *any
- - *any
- - *any
- out:
- - *any
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "fmt"
- "reflect"
- "strconv"
-)
-
-func pprints(v any) string {
- var pp pprinter
- pp.val(reflect.ValueOf(v), 0)
- return string(pp.buf)
-}
-
-type pprinter struct {
- buf []byte
-}
-
-func (p *pprinter) indent(by int) {
- for range by {
- p.buf = append(p.buf, '\t')
- }
-}
-
-func (p *pprinter) val(v reflect.Value, indent int) {
- switch v.Kind() {
- default:
- p.buf = fmt.Appendf(p.buf, "unsupported kind %v", v.Kind())
-
- case reflect.Bool:
- p.buf = strconv.AppendBool(p.buf, v.Bool())
-
- case reflect.Int, reflect.Int16, reflect.Int32, reflect.Int64:
- p.buf = strconv.AppendInt(p.buf, v.Int(), 10)
-
- case reflect.String:
- p.buf = strconv.AppendQuote(p.buf, v.String())
-
- case reflect.Pointer:
- if v.IsNil() {
- p.buf = append(p.buf, "nil"...)
- } else {
- p.buf = append(p.buf, "&"...)
- p.val(v.Elem(), indent)
- }
-
- case reflect.Slice, reflect.Array:
- p.buf = append(p.buf, "[\n"...)
- for i := range v.Len() {
- p.indent(indent + 1)
- p.val(v.Index(i), indent+1)
- p.buf = append(p.buf, ",\n"...)
- }
- p.indent(indent)
- p.buf = append(p.buf, ']')
-
- case reflect.Struct:
- vt := v.Type()
- p.buf = append(append(p.buf, vt.String()...), "{\n"...)
- for f := range v.NumField() {
- p.indent(indent + 1)
- p.buf = append(append(p.buf, vt.Field(f).Name...), ": "...)
- p.val(v.Field(f), indent+1)
- p.buf = append(p.buf, ",\n"...)
- }
- p.indent(indent)
- p.buf = append(p.buf, '}')
- }
-}
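A small usage sketch for pprints; the example type below is made up purely for illustration, and the commented output is only indicative of the format the printer produces (tab indentation, trailing commas):

type example struct {
	Name  string
	Count int
	Tags  []string
}

func examplePprints() {
	fmt.Println(pprints(example{Name: "x", Count: 2, Tags: []string{"a", "b"}}))
	// Expected shape of the output (tabs shown as spaces here):
	// main.example{
	//         Name: "x",
	//         Count: 2,
	//         Tags: [
	//                 "a",
	//                 "b",
	//         ],
	// }
}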
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import "testing"
-
-func TestSort(t *testing.T) {
- testCases := []struct {
- s1, s2 string
- want int
- }{
- {"a1", "a2", -1},
- {"a11a", "a11b", -1},
- {"a01a1", "a1a01", -1},
- {"a2", "a1", 1},
- {"a10", "a2", 1},
- {"a1", "a10", -1},
- {"z11", "z2", 1},
- {"z2", "z11", -1},
- {"abc", "abd", -1},
- {"123", "45", 1},
- {"file1", "file1", 0},
- {"file", "file1", -1},
- {"file1", "file", 1},
- {"a01", "a1", -1},
- {"a1a", "a1b", -1},
- }
-
- for _, tc := range testCases {
- got := compareNatural(tc.s1, tc.s2)
- result := "✅"
- if got != tc.want {
- result = "❌"
- t.Errorf("%s CompareNatural(\"%s\", \"%s\") -> got %2d, want %2d\n", result, tc.s1, tc.s2, got, tc.want)
- } else {
- t.Logf("%s CompareNatural(\"%s\", \"%s\") -> got %2d, want %2d\n", result, tc.s1, tc.s2, got, tc.want)
- }
- }
-}
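compareNatural itself is not part of this hunk; as a point of reference, a comparator consistent with these test cases could look roughly like the sketch below (naturalCompare, its helper, and every detail here are assumptions, not the actual implementation). It requires "strconv" and "strings".

// Sketch only; not the actual compareNatural from this package.
func isDigit(c byte) bool { return '0' <= c && c <= '9' }

func naturalCompare(a, b string) int {
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if isDigit(a[i]) && isDigit(b[j]) {
			// Compare whole digit runs numerically ("a2" < "a10").
			si, sj := i, j
			for i < len(a) && isDigit(a[i]) {
				i++
			}
			for j < len(b) && isDigit(b[j]) {
				j++
			}
			na, _ := strconv.Atoi(a[si:i])
			nb, _ := strconv.Atoi(b[sj:j])
			if na != nb {
				if na < nb {
					return -1
				}
				return 1
			}
			continue
		}
		if a[i] != b[j] {
			if a[i] < b[j] {
				return -1
			}
			return 1
		}
		i++
		j++
	}
	if rest := (len(a) - i) - (len(b) - j); rest != 0 {
		if rest < 0 {
			return -1
		}
		return 1
	}
	// Numerically equal throughout; fall back to plain string order so that
	// "a01" sorts before "a1".
	return strings.Compare(a, b)
}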
+++ /dev/null
-# This file defines the possible types of each operand and result.
-#
-# In general, we're able to narrow this down on some attributes directly from
-# the machine instruction descriptions, but the Go mappings need to further
-# constrain them and how they relate. For example, on x86 we can't distinguish
-# int and uint, though we can distinguish these from float.
-
-in: !repeat
-- !sum &types
- - {class: vreg, go: Int8x16, base: "int", elemBits: 8, bits: 128, lanes: 16}
- - {class: vreg, go: Uint8x16, base: "uint", elemBits: 8, bits: 128, lanes: 16}
- - {class: vreg, go: Int16x8, base: "int", elemBits: 16, bits: 128, lanes: 8}
- - {class: vreg, go: Uint16x8, base: "uint", elemBits: 16, bits: 128, lanes: 8}
- - {class: vreg, go: Int32x4, base: "int", elemBits: 32, bits: 128, lanes: 4}
- - {class: vreg, go: Uint32x4, base: "uint", elemBits: 32, bits: 128, lanes: 4}
- - {class: vreg, go: Int64x2, base: "int", elemBits: 64, bits: 128, lanes: 2}
- - {class: vreg, go: Uint64x2, base: "uint", elemBits: 64, bits: 128, lanes: 2}
- - {class: vreg, go: Float32x4, base: "float", elemBits: 32, bits: 128, lanes: 4}
- - {class: vreg, go: Float64x2, base: "float", elemBits: 64, bits: 128, lanes: 2}
- - {class: vreg, go: Int8x32, base: "int", elemBits: 8, bits: 256, lanes: 32}
- - {class: vreg, go: Uint8x32, base: "uint", elemBits: 8, bits: 256, lanes: 32}
- - {class: vreg, go: Int16x16, base: "int", elemBits: 16, bits: 256, lanes: 16}
- - {class: vreg, go: Uint16x16, base: "uint", elemBits: 16, bits: 256, lanes: 16}
- - {class: vreg, go: Int32x8, base: "int", elemBits: 32, bits: 256, lanes: 8}
- - {class: vreg, go: Uint32x8, base: "uint", elemBits: 32, bits: 256, lanes: 8}
- - {class: vreg, go: Int64x4, base: "int", elemBits: 64, bits: 256, lanes: 4}
- - {class: vreg, go: Uint64x4, base: "uint", elemBits: 64, bits: 256, lanes: 4}
- - {class: vreg, go: Float32x8, base: "float", elemBits: 32, bits: 256, lanes: 8}
- - {class: vreg, go: Float64x4, base: "float", elemBits: 64, bits: 256, lanes: 4}
- - {class: vreg, go: Int8x64, base: "int", elemBits: 8, bits: 512, lanes: 64}
- - {class: vreg, go: Uint8x64, base: "uint", elemBits: 8, bits: 512, lanes: 64}
- - {class: vreg, go: Int16x32, base: "int", elemBits: 16, bits: 512, lanes: 32}
- - {class: vreg, go: Uint16x32, base: "uint", elemBits: 16, bits: 512, lanes: 32}
- - {class: vreg, go: Int32x16, base: "int", elemBits: 32, bits: 512, lanes: 16}
- - {class: vreg, go: Uint32x16, base: "uint", elemBits: 32, bits: 512, lanes: 16}
- - {class: vreg, go: Int64x8, base: "int", elemBits: 64, bits: 512, lanes: 8}
- - {class: vreg, go: Uint64x8, base: "uint", elemBits: 64, bits: 512, lanes: 8}
- - {class: vreg, go: Float32x16, base: "float", elemBits: 32, bits: 512, lanes: 16}
- - {class: vreg, go: Float64x8, base: "float", elemBits: 64, bits: 512, lanes: 8}
-
- - {class: mask, go: Mask8x16, base: "int", elemBits: 8, bits: 128, lanes: 16}
- - {class: mask, go: Mask16x8, base: "int", elemBits: 16, bits: 128, lanes: 8}
- - {class: mask, go: Mask32x4, base: "int", elemBits: 32, bits: 128, lanes: 4}
- - {class: mask, go: Mask64x2, base: "int", elemBits: 64, bits: 128, lanes: 2}
- - {class: mask, go: Mask8x32, base: "int", elemBits: 8, bits: 256, lanes: 32}
- - {class: mask, go: Mask16x16, base: "int", elemBits: 16, bits: 256, lanes: 16}
- - {class: mask, go: Mask32x8, base: "int", elemBits: 32, bits: 256, lanes: 8}
- - {class: mask, go: Mask64x4, base: "int", elemBits: 64, bits: 256, lanes: 4}
- - {class: mask, go: Mask8x64, base: "int", elemBits: 8, bits: 512, lanes: 64}
- - {class: mask, go: Mask16x32, base: "int", elemBits: 16, bits: 512, lanes: 32}
- - {class: mask, go: Mask32x16, base: "int", elemBits: 32, bits: 512, lanes: 16}
- - {class: mask, go: Mask64x8, base: "int", elemBits: 64, bits: 512, lanes: 8}
-
-
- - {class: greg, go: float64, base: "float", bits: 64, lanes: 1}
- - {class: greg, go: float32, base: "float", bits: 32, lanes: 1}
- - {class: greg, go: int64, base: "int", bits: 64, lanes: 1}
- - {class: greg, go: int32, base: "int", bits: 32, lanes: 1}
- - {class: greg, go: int16, base: "int", bits: 16, lanes: 1}
- - {class: greg, go: int8, base: "int", bits: 8, lanes: 1}
- - {class: greg, go: uint64, base: "uint", bits: 64, lanes: 1}
- - {class: greg, go: uint32, base: "uint", bits: 32, lanes: 1}
- - {class: greg, go: uint16, base: "uint", bits: 16, lanes: 1}
- - {class: greg, go: uint8, base: "uint", bits: 8, lanes: 1}
-
-# Special shapes just to make INSERT[IF]128 work.
-# The elemBits field of these shapes is wrong; it will be overwritten by overwriteElemBits.
- - {class: vreg, go: Int8x16, base: "int", elemBits: 128, bits: 128, lanes: 16}
- - {class: vreg, go: Uint8x16, base: "uint", elemBits: 128, bits: 128, lanes: 16}
- - {class: vreg, go: Int16x8, base: "int", elemBits: 128, bits: 128, lanes: 8}
- - {class: vreg, go: Uint16x8, base: "uint", elemBits: 128, bits: 128, lanes: 8}
- - {class: vreg, go: Int32x4, base: "int", elemBits: 128, bits: 128, lanes: 4}
- - {class: vreg, go: Uint32x4, base: "uint", elemBits: 128, bits: 128, lanes: 4}
- - {class: vreg, go: Int64x2, base: "int", elemBits: 128, bits: 128, lanes: 2}
- - {class: vreg, go: Uint64x2, base: "uint", elemBits: 128, bits: 128, lanes: 2}
-
- - {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 256, lanes: 32}
- - {class: vreg, go: Uint8x32, base: "uint", elemBits: 128, bits: 256, lanes: 32}
- - {class: vreg, go: Int16x16, base: "int", elemBits: 128, bits: 256, lanes: 16}
- - {class: vreg, go: Uint16x16, base: "uint", elemBits: 128, bits: 256, lanes: 16}
- - {class: vreg, go: Int32x8, base: "int", elemBits: 128, bits: 256, lanes: 8}
- - {class: vreg, go: Uint32x8, base: "uint", elemBits: 128, bits: 256, lanes: 8}
- - {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4}
- - {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4}
-
-# Special for carryless multiply
- - {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
-
-# Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
-# The elemBits field of these shapes is wrong; it will be overwritten by overwriteElemBits.
- - {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32}
- - {class: vreg, go: Uint8x32, base: "uint", elemBits: 128, bits: 512, lanes: 32}
- - {class: vreg, go: Int16x16, base: "int", elemBits: 128, bits: 512, lanes: 16}
- - {class: vreg, go: Uint16x16, base: "uint", elemBits: 128, bits: 512, lanes: 16}
- - {class: vreg, go: Int32x8, base: "int", elemBits: 128, bits: 512, lanes: 8}
- - {class: vreg, go: Uint32x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
- - {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 512, lanes: 4}
- - {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 512, lanes: 4}
-
- - {class: immediate, go: Immediate} # TODO: we only support imms that are not used as values -- usually as an instruction semantic predicate, as in VPCMP, for now.
-inVariant: !repeat
-- *types
-out: !repeat
-- *types
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "cmp"
- "fmt"
- "log"
- "maps"
- "reflect"
- "regexp"
- "slices"
- "strconv"
- "strings"
-
- "simd/_gen/unify"
-
- "golang.org/x/arch/x86/xeddata"
- "gopkg.in/yaml.v3"
-)
-
-const (
- NOT_REG_CLASS = iota // not a register
- VREG_CLASS // classify as a vector register; see
- GREG_CLASS // classify as a general register
-)
-
-// instVariant is a bitmap indicating a variant of an instruction that has
-// optional parameters.
-type instVariant uint8
-
-const (
- instVariantNone instVariant = 0
-
- // instVariantMasked indicates that this is the masked variant of an
- // optionally-masked instruction.
- instVariantMasked instVariant = 1 << iota
-)
-
-var operandRemarks int
-
-// TODO: Doc. Returns Values with Def domains.
-func loadXED(xedPath string) []*unify.Value {
- // TODO: Obviously a bunch more to do here.
-
- db, err := xeddata.NewDatabase(xedPath)
- if err != nil {
- log.Fatalf("open database: %v", err)
- }
-
- var defs []*unify.Value
- type opData struct {
- inst *xeddata.Inst
- ops []operand
- mem string
- }
- // Maps from opcode to opdata(s).
- memOps := make(map[string][]opData, 0)
- otherOps := make(map[string][]opData, 0)
- appendDefs := func(inst *xeddata.Inst, ops []operand, addFields map[string]string) {
- applyQuirks(inst, ops)
-
- defsPos := len(defs)
- defs = append(defs, instToUVal(inst, ops, addFields)...)
-
- if *flagDebugXED {
- for i := defsPos; i < len(defs); i++ {
- y, _ := yaml.Marshal(defs[i])
- fmt.Printf("==>\n%s\n", y)
- }
- }
- }
- err = xeddata.WalkInsts(xedPath, func(inst *xeddata.Inst) {
- inst.Pattern = xeddata.ExpandStates(db, inst.Pattern)
-
- switch {
- case inst.RealOpcode == "N":
- return // Skip unstable instructions
- case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA")):
- // We're only interested in AVX and SHA instructions.
- return
- }
-
- if *flagDebugXED {
- fmt.Printf("%s:\n%+v\n", inst.Pos, inst)
- }
-
- ops, err := decodeOperands(db, strings.Fields(inst.Operands))
- if err != nil {
- operandRemarks++
- if *Verbose {
- log.Printf("%s: [%s] %s", inst.Pos, inst.Opcode(), err)
- }
- return
- }
- var data map[string][]opData
- mem := checkMem(ops)
- if mem == "vbcst" {
- // A pure vreg variant might exist, wait for later to see if we can
- // merge them
- data = memOps
- } else {
- data = otherOps
- }
- opcode := inst.Opcode()
- if _, ok := data[opcode]; !ok {
- s := make([]opData, 1)
- s[0] = opData{inst, ops, mem}
- data[opcode] = s
- } else {
- data[opcode] = append(data[opcode], opData{inst, ops, mem})
- }
- })
- for _, s := range otherOps {
- for _, o := range s {
- addFields := map[string]string{}
- if o.mem == "noMem" {
- opcode := o.inst.Opcode()
- // Check whether a vbcst variant of this operation exists.
- // First check the opcode.
- // Keep this logic in sync with [decodeOperands]
- if ms, ok := memOps[opcode]; ok {
- feat1, ok1 := decodeCPUFeature(o.inst)
- // Then check whether there exists such an operation whose vreg
- // shapes all match at the corresponding indices.
- var feat1Match, feat2Match string
- matchIdx := -1
- var featMismatchCnt int
- outer:
- for i, m := range ms {
- // Their CPU feature should match first
- var featMismatch bool
- feat2, ok2 := decodeCPUFeature(m.inst)
- if !ok1 || !ok2 {
- continue
- }
- if feat1 != feat2 {
- featMismatch = true
- featMismatchCnt++
- }
- if len(o.ops) == len(m.ops) {
- for j := range o.ops {
- if reflect.TypeOf(o.ops[j]) == reflect.TypeOf(m.ops[j]) {
- v1, ok3 := o.ops[j].(operandVReg)
- v2, _ := m.ops[j].(operandVReg)
- if !ok3 {
- continue
- }
- if v1.vecShape != v2.vecShape {
- // A mismatch, skip this memOp
- continue outer
- }
- } else {
- _, ok3 := o.ops[j].(operandVReg)
- _, ok4 := m.ops[j].(operandMem)
- // The only difference must be the vreg and mem, no other cases.
- if !ok3 || !ok4 {
- // A mismatch, skip this memOp
- continue outer
- }
- }
- }
- // Found a match, break early
- matchIdx = i
- feat1Match = feat1
- feat2Match = feat2
- if featMismatchCnt > 1 {
- panic("multiple feature mismatch vbcst memops detected, simdgen failed to distinguish")
- }
- if !featMismatch {
- // A feature mismatch is OK, but exact matches take priority.
- break
- }
- }
- }
- // Remove the match from memOps, it's now merged to this pure vreg operation
- if matchIdx != -1 {
- memOps[opcode] = append(memOps[opcode][:matchIdx], memOps[opcode][matchIdx+1:]...)
- // Merge is done by adding a new field
- // Right now we only have vbcst
- addFields["memFeatures"] = "vbcst"
- if feat1Match != feat2Match {
- addFields["memFeaturesData"] = fmt.Sprintf("feat1=%s;feat2=%s", feat1Match, feat2Match)
- }
- }
- }
- }
- appendDefs(o.inst, o.ops, addFields)
- }
- }
- for _, ms := range memOps {
- for _, m := range ms {
- if *Verbose {
- log.Printf("mem op not merged: %s, %v\n", m.inst.Opcode(), m)
- }
- appendDefs(m.inst, m.ops, nil)
- }
- }
- if err != nil {
- log.Fatalf("walk insts: %v", err)
- }
-
- if len(unknownFeatures) > 0 {
- if !*Verbose {
- nInst := 0
- for _, insts := range unknownFeatures {
- nInst += len(insts)
- }
- log.Printf("%d unhandled CPU features for %d instructions (use -v for details)", len(unknownFeatures), nInst)
- } else {
- keys := slices.SortedFunc(maps.Keys(unknownFeatures), func(a, b cpuFeatureKey) int {
- return cmp.Or(cmp.Compare(a.Extension, b.Extension),
- cmp.Compare(a.ISASet, b.ISASet))
- })
- for _, key := range keys {
- if key.ISASet == "" || key.ISASet == key.Extension {
- log.Printf("unhandled Extension %s", key.Extension)
- } else {
- log.Printf("unhandled Extension %s and ISASet %s", key.Extension, key.ISASet)
- }
- log.Printf(" opcodes: %s", slices.Sorted(maps.Keys(unknownFeatures[key])))
- }
- }
- }
-
- return defs
-}
-
-var (
- maskRequiredRe = regexp.MustCompile(`VPCOMPRESS[BWDQ]|VCOMPRESSP[SD]|VPEXPAND[BWDQ]|VEXPANDP[SD]`)
- maskOptionalRe = regexp.MustCompile(`VPCMP(EQ|GT|U)?[BWDQ]|VCMPP[SD]`)
-)
-
-func applyQuirks(inst *xeddata.Inst, ops []operand) {
- opc := inst.Opcode()
- switch {
- case maskRequiredRe.MatchString(opc):
- // The mask on these instructions is marked optional, but the
- // instruction is pointless without the mask.
- for i, op := range ops {
- if op, ok := op.(operandMask); ok {
- op.optional = false
- ops[i] = op
- }
- }
-
- case maskOptionalRe.MatchString(opc):
- // Conversely, these masks should be marked optional and aren't.
- for i, op := range ops {
- if op, ok := op.(operandMask); ok && op.action.r {
- op.optional = true
- ops[i] = op
- }
- }
- }
-}
-
-type operandCommon struct {
- action operandAction
-}
-
-// operandAction defines whether this operand is read and/or written.
-//
-// TODO: Should this live in [xeddata.Operand]?
-type operandAction struct {
- r bool // Read
- w bool // Written
- cr bool // Read is conditional (implies r==true)
- cw bool // Write is conditional (implies w==true)
-}
-
-type operandMem struct {
- operandCommon
- vecShape
- elemBaseType scalarBaseType
- // The following fields are not flushed to the final output
- // Supports full-vector broadcasting; implies that the operand has a "vv" (vector vector) type specified in width and
- // that the instruction carries the attribute TXT=BCASTSTR.
- vbcst bool
- unknown bool // unknown kind
-}
-
-type vecShape struct {
- elemBits int // Element size in bits
- bits int // Register width in bits (total vector bits)
- fixedName string // the fixed register name
-}
-
-type operandVReg struct { // Vector register
- operandCommon
- vecShape
- elemBaseType scalarBaseType
-}
-
-type operandGReg struct { // General-purpose register
- operandCommon
- vecShape
- elemBaseType scalarBaseType
-}
-
-// operandMask is a vector mask.
-//
-// Regardless of the actual mask representation, the [vecShape] of this operand
-// corresponds to the "bit for bit" type of mask. That is, elemBits gives the
-// element width covered by each mask element, and bits/elemBits gives the total
-// number of mask elements. (bits gives the total number of bits as if this were
-// a bit-for-bit mask, which may be meaningless on its own.)
-type operandMask struct {
- operandCommon
- vecShape
- // Bits in the mask is w/bits.
-
- allMasks bool // If set, size cannot be inferred because all operands are masks.
-
- // Mask can be omitted, in which case it defaults to K0/"no mask"
- optional bool
-}
-
-type operandImm struct {
- operandCommon
- bits int // Immediate size in bits
-}
-
-type operand interface {
- common() operandCommon
- addToDef(b *unify.DefBuilder)
-}
-
-func strVal(s any) *unify.Value {
- return unify.NewValue(unify.NewStringExact(fmt.Sprint(s)))
-}
-
-func (o operandCommon) common() operandCommon {
- return o
-}
-
-func (o operandMem) addToDef(b *unify.DefBuilder) {
- b.Add("class", strVal("memory"))
- if o.unknown {
- return
- }
- baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
- if err != nil {
- panic("parsing baseRe: " + err.Error())
- }
- b.Add("base", unify.NewValue(baseDomain))
- b.Add("bits", strVal(o.bits))
- if o.elemBits != o.bits {
- b.Add("elemBits", strVal(o.elemBits))
- }
-}
-
-func (o operandVReg) addToDef(b *unify.DefBuilder) {
- baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
- if err != nil {
- panic("parsing baseRe: " + err.Error())
- }
- b.Add("class", strVal("vreg"))
- b.Add("bits", strVal(o.bits))
- b.Add("base", unify.NewValue(baseDomain))
- // If elemBits == bits, then the vector can be ANY shape. This happens with,
- // for example, logical ops.
- if o.elemBits != o.bits {
- b.Add("elemBits", strVal(o.elemBits))
- }
- if o.fixedName != "" {
- b.Add("fixedReg", strVal(o.fixedName))
- }
-}
-
-func (o operandGReg) addToDef(b *unify.DefBuilder) {
- baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
- if err != nil {
- panic("parsing baseRe: " + err.Error())
- }
- b.Add("class", strVal("greg"))
- b.Add("bits", strVal(o.bits))
- b.Add("base", unify.NewValue(baseDomain))
- if o.elemBits != o.bits {
- b.Add("elemBits", strVal(o.elemBits))
- }
- if o.fixedName != "" {
- b.Add("fixedReg", strVal(o.fixedName))
- }
-}
-
-func (o operandMask) addToDef(b *unify.DefBuilder) {
- b.Add("class", strVal("mask"))
- if o.allMasks {
- // If all operands are masks, omit sizes and let unification determine mask sizes.
- return
- }
- b.Add("elemBits", strVal(o.elemBits))
- b.Add("bits", strVal(o.bits))
- if o.fixedName != "" {
- b.Add("fixedReg", strVal(o.fixedName))
- }
-}
-
-func (o operandImm) addToDef(b *unify.DefBuilder) {
- b.Add("class", strVal("immediate"))
- b.Add("bits", strVal(o.bits))
-}
-
-var actionEncoding = map[string]operandAction{
- "r": {r: true},
- "cr": {r: true, cr: true},
- "w": {w: true},
- "cw": {w: true, cw: true},
- "rw": {r: true, w: true},
- "crw": {r: true, w: true, cr: true},
- "rcw": {r: true, w: true, cw: true},
-}
-
-func decodeOperand(db *xeddata.Database, operand string) (operand, error) {
- op, err := xeddata.NewOperand(db, operand)
- if err != nil {
- log.Fatalf("parsing operand %q: %v", operand, err)
- }
- if *flagDebugXED {
- fmt.Printf(" %+v\n", op)
- }
-
- if strings.HasPrefix(op.Name, "EMX_BROADCAST") {
- // This refers to a set of macros defined in all-state.txt that set a
- // BCAST operand to various fixed values. But the BCAST operand is
- // itself suppressed and "internal", so I think we can just ignore this
- // operand.
- return nil, nil
- }
-
- // TODO: See xed_decoded_inst_operand_action. This might need to be more
- // complicated.
- action, ok := actionEncoding[op.Action]
- if !ok {
- return nil, fmt.Errorf("unknown action %q", op.Action)
- }
- common := operandCommon{action: action}
-
- lhs := op.NameLHS()
- if strings.HasPrefix(lhs, "MEM") {
- // looks like XED data has an inconsistency on VPADDD, marking attribute
- // VPBROADCASTD instead of the canonical BCASTSTR.
- if op.Width == "vv" && (op.Attributes["TXT=BCASTSTR"] ||
- op.Attributes["TXT=VPBROADCASTD"]) {
- baseType, elemBits, ok := decodeType(op)
- if !ok {
- return nil, fmt.Errorf("failed to decode memory width %q", operand)
- }
- // This operand has two possible widths ([bits]):
- // 1. the same width as the other operands
- // 2. the element width of the other operands (broadcasting)
- // We leave it defaulting to 2; later we will set a new field in the operation
- // to indicate this dual-width property.
- shape := vecShape{elemBits: elemBits, bits: elemBits}
- return operandMem{
- operandCommon: common,
- vecShape: shape,
- elemBaseType: baseType,
- vbcst: true,
- unknown: false,
- }, nil
- }
- // TODO: parse op.Width better to handle all cases
- // Right now this will at least miss VPBROADCAST.
- return operandMem{
- operandCommon: common,
- unknown: true,
- }, nil
- } else if strings.HasPrefix(lhs, "REG") {
- if op.Width == "mskw" {
- // The mask operand doesn't specify a width. We have to infer it.
- //
- // XED uses the marker ZEROSTR to indicate that a mask operand is
- // optional and, if omitted, implies K0, aka "no mask".
- return operandMask{
- operandCommon: common,
- optional: op.Attributes["TXT=ZEROSTR"],
- }, nil
- } else {
- class, regBits, fixedReg := decodeReg(op)
- if class == NOT_REG_CLASS {
- return nil, fmt.Errorf("failed to decode register %q", operand)
- }
- baseType, elemBits, ok := decodeType(op)
- if !ok {
- return nil, fmt.Errorf("failed to decode register width %q", operand)
- }
- shape := vecShape{elemBits: elemBits, bits: regBits, fixedName: fixedReg}
- if class == VREG_CLASS {
- return operandVReg{
- operandCommon: common,
- vecShape: shape,
- elemBaseType: baseType,
- }, nil
- }
- // general register
- m := min(shape.bits, shape.elemBits)
- shape.bits, shape.elemBits = m, m
- return operandGReg{
- operandCommon: common,
- vecShape: shape,
- elemBaseType: baseType,
- }, nil
-
- }
- } else if strings.HasPrefix(lhs, "IMM") {
- _, bits, ok := decodeType(op)
- if !ok {
- return nil, fmt.Errorf("failed to decode register width %q", operand)
- }
- return operandImm{
- operandCommon: common,
- bits: bits,
- }, nil
- }
-
- // TODO: BASE and SEG
- return nil, fmt.Errorf("unknown operand LHS %q in %q", lhs, operand)
-}
-
-func decodeOperands(db *xeddata.Database, operands []string) (ops []operand, err error) {
- // Decode the XED operand descriptions.
- for _, o := range operands {
- op, err := decodeOperand(db, o)
- if err != nil {
- return nil, err
- }
- if op != nil {
- ops = append(ops, op)
- }
- }
-
- // XED doesn't encode the size of mask operands. If there are mask operands,
- // try to infer their sizes from other operands.
- if err := inferMaskSizes(ops); err != nil {
- return nil, fmt.Errorf("%w in operands %+v", err, operands)
- }
-
- return ops, nil
-}
-
-func inferMaskSizes(ops []operand) error {
- // This is a heuristic and it falls apart in some cases:
- //
- // - Mask operations like KAND[BWDQ] have *nothing* in the XED to indicate
- // mask size.
- //
- // - VINSERT*, VPSLL*, VPSRA*, and VPSRL* and some others naturally have
- // mixed input sizes and the XED doesn't indicate which operands the mask
- // applies to.
- //
- // - VPDP* and VP4DP* have really complex mixed operand patterns.
- //
- // I think for these we may just have to hand-write a table of which
- // operands each mask applies to.
- inferMask := func(r, w bool) error {
- var masks []int
- var rSizes, wSizes, sizes []vecShape
- allMasks := true
- hasWMask := false
- for i, op := range ops {
- action := op.common().action
- if _, ok := op.(operandMask); ok {
- if action.r && action.w {
- return fmt.Errorf("unexpected rw mask")
- }
- if action.r == r || action.w == w {
- masks = append(masks, i)
- }
- if action.w {
- hasWMask = true
- }
- } else {
- allMasks = false
- if reg, ok := op.(operandVReg); ok {
- if action.r {
- rSizes = append(rSizes, reg.vecShape)
- }
- if action.w {
- wSizes = append(wSizes, reg.vecShape)
- }
- }
- }
- }
- if len(masks) == 0 {
- return nil
- }
-
- if r {
- sizes = rSizes
- if len(sizes) == 0 {
- sizes = wSizes
- }
- }
- if w {
- sizes = wSizes
- if len(sizes) == 0 {
- sizes = rSizes
- }
- }
-
- if len(sizes) == 0 {
- // If all operands are masks, leave the mask inference to the users.
- if allMasks {
- for _, i := range masks {
- m := ops[i].(operandMask)
- m.allMasks = true
- ops[i] = m
- }
- return nil
- }
- return fmt.Errorf("cannot infer mask size: no register operands")
- }
- shape, ok := singular(sizes)
- if !ok {
- if !hasWMask && len(wSizes) == 1 && len(masks) == 1 {
-				// This pattern looks like a predicate mask, so its shape should align
-				// with the output. TODO: verify this is a safe assumption.
- shape = wSizes[0]
- } else {
- return fmt.Errorf("cannot infer mask size: multiple register sizes %v", sizes)
- }
- }
- for _, i := range masks {
- m := ops[i].(operandMask)
- m.vecShape = shape
- ops[i] = m
- }
- return nil
- }
- if err := inferMask(true, false); err != nil {
- return err
- }
- if err := inferMask(false, true); err != nil {
- return err
- }
- return nil
-}
-
-// addOperandsToDef adds "in", "inVariant", and "out" to an instruction Def.
-//
-// Optional mask input operands are added to the inVariant field if
-// variant&instVariantMasked is set, and omitted otherwise.
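-//
-// As an illustrative sketch (the operand layout here is hypothetical, not a
-// specific instruction): for operands [dst (w), src1 (r), src2 (r), optional
-// mask (r)], "in" becomes [src1, src2], "out" becomes [dst], and the mask is
-// placed in "inVariant" only when variant&instVariantMasked is set; otherwise
-// it does not appear among the assembly operands at all.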
-func addOperandsToDef(ops []operand, instDB *unify.DefBuilder, variant instVariant) {
- var inVals, inVar, outVals []*unify.Value
- asmPos := 0
- for _, op := range ops {
- var db unify.DefBuilder
- op.addToDef(&db)
- db.Add("asmPos", unify.NewValue(unify.NewStringExact(fmt.Sprint(asmPos))))
-
- action := op.common().action
- asmCount := 1 // # of assembly operands; 0 or 1
- if action.r {
- inVal := unify.NewValue(db.Build())
- // If this is an optional mask, put it in the input variant tuple.
- if mask, ok := op.(operandMask); ok && mask.optional {
- if variant&instVariantMasked != 0 {
- inVar = append(inVar, inVal)
- } else {
- // This operand doesn't appear in the assembly at all.
- asmCount = 0
- }
- } else {
- // Just a regular input operand.
- inVals = append(inVals, inVal)
- }
- }
- if action.w {
- outVal := unify.NewValue(db.Build())
- outVals = append(outVals, outVal)
- }
-
- asmPos += asmCount
- }
-
- instDB.Add("in", unify.NewValue(unify.NewTuple(inVals...)))
- instDB.Add("inVariant", unify.NewValue(unify.NewTuple(inVar...)))
- instDB.Add("out", unify.NewValue(unify.NewTuple(outVals...)))
- memFeatures := checkMem(ops)
- if memFeatures != "noMem" {
- instDB.Add("memFeatures", unify.NewValue(unify.NewStringExact(memFeatures)))
- }
-}
-
-// checkMem checks the shape of the memory operands in the operation and returns it.
-// Keep this function in sync with [decodeOperand].
-func checkMem(ops []operand) string {
- memState := "noMem"
- var mem *operandMem
- memCnt := 0
- for _, op := range ops {
- if m, ok := op.(operandMem); ok {
- mem = &m
- memCnt++
- }
- }
- if mem != nil {
- if mem.unknown {
- memState = "unknown"
- } else if memCnt > 1 {
- memState = "tooManyMem"
- } else {
-			// We only have the vbcst case for now.
-			// This shape indicates that the [bits] field has two possible values:
-			// 1. The element broadcast width, i.e. its peer vreg operand's [elemBits] (the default value in the parsed XED data).
-			// 2. The full vector width, i.e. its peer vreg operand's [bits] (godefs should be aware of this).
- memState = "vbcst"
- }
- }
- return memState
-}
-
-func instToUVal(inst *xeddata.Inst, ops []operand, addFields map[string]string) []*unify.Value {
- feature, ok := decodeCPUFeature(inst)
- if !ok {
- return nil
- }
-
- var vals []*unify.Value
- vals = append(vals, instToUVal1(inst, ops, feature, instVariantNone, addFields))
- if hasOptionalMask(ops) {
- vals = append(vals, instToUVal1(inst, ops, feature, instVariantMasked, addFields))
- }
- return vals
-}
-
-func instToUVal1(inst *xeddata.Inst, ops []operand, feature string, variant instVariant, addFields map[string]string) *unify.Value {
- var db unify.DefBuilder
- db.Add("goarch", unify.NewValue(unify.NewStringExact("amd64")))
- db.Add("asm", unify.NewValue(unify.NewStringExact(inst.Opcode())))
- addOperandsToDef(ops, &db, variant)
- db.Add("cpuFeature", unify.NewValue(unify.NewStringExact(feature)))
- for k, v := range addFields {
- db.Add(k, unify.NewValue(unify.NewStringExact(v)))
- }
-
- if strings.Contains(inst.Pattern, "ZEROING=0") {
- // This is an EVEX instruction, but the ".Z" (zero-merging)
- // instruction flag is NOT valid. EVEX.z must be zero.
- //
- // This can mean a few things:
- //
- // - The output of an instruction is a mask, so merging modes don't
- // make any sense. E.g., VCMPPS.
- //
- // - There are no masks involved anywhere. (Maybe MASK=0 is also set
- // in this case?) E.g., VINSERTPS.
- //
- // - The operation inherently performs merging. E.g., VCOMPRESSPS
- // with a mem operand.
- //
- // There may be other reasons.
- db.Add("zeroing", unify.NewValue(unify.NewStringExact("false")))
- }
- pos := unify.Pos{Path: inst.Pos.Path, Line: inst.Pos.Line}
- return unify.NewValuePos(db.Build(), pos)
-}
-
-// decodeCPUFeature returns the CPU feature name required by inst. These match
-// the names of the "Has*" feature checks in the simd package.
-func decodeCPUFeature(inst *xeddata.Inst) (string, bool) {
- key := cpuFeatureKey{
- Extension: inst.Extension,
- ISASet: isaSetStrip.ReplaceAllLiteralString(inst.ISASet, ""),
- }
- feat, ok := cpuFeatureMap[key]
- if !ok {
- imap := unknownFeatures[key]
- if imap == nil {
- imap = make(map[string]struct{})
- unknownFeatures[key] = imap
- }
- imap[inst.Opcode()] = struct{}{}
- return "", false
- }
- if feat == "ignore" {
- return "", false
- }
- return feat, true
-}
-
-var isaSetStrip = regexp.MustCompile("_(128N?|256N?|512)$")
-
-type cpuFeatureKey struct {
- Extension, ISASet string
-}
-
-// cpuFeatureMap maps from XED's "EXTENSION" and "ISA_SET" to a CPU feature name
-// that can be used in the SIMD API.
-var cpuFeatureMap = map[cpuFeatureKey]string{
- {"SHA", "SHA"}: "SHA",
-
- {"AVX", ""}: "AVX",
- {"AVX_VNNI", "AVX_VNNI"}: "AVXVNNI",
- {"AVX2", ""}: "AVX2",
- {"AVXAES", ""}: "AVX, AES",
-
- // AVX-512 foundational features. We combine all of these into one "AVX512" feature.
- {"AVX512EVEX", "AVX512F"}: "AVX512",
- {"AVX512EVEX", "AVX512CD"}: "AVX512",
- {"AVX512EVEX", "AVX512BW"}: "AVX512",
- {"AVX512EVEX", "AVX512DQ"}: "AVX512",
- // AVX512VL doesn't appear explicitly in the ISASet. I guess it's implied by
- // the vector length suffix.
-
- // AVX-512 extension features
- {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
- {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
- {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
- {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
- {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
- {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
- {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
- {"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
-
- // AVX 10.2 (not yet supported)
- {"AVX512EVEX", "AVX10_2_RC"}: "ignore",
-}
-
-var unknownFeatures = map[cpuFeatureKey]map[string]struct{}{}
-
-// hasOptionalMask returns whether there is an optional mask operand in ops.
-func hasOptionalMask(ops []operand) bool {
- for _, op := range ops {
- if op, ok := op.(operandMask); ok && op.optional {
- return true
- }
- }
- return false
-}
-
-func singular[T comparable](xs []T) (T, bool) {
- if len(xs) == 0 {
- return *new(T), false
- }
- for _, x := range xs[1:] {
- if x != xs[0] {
- return *new(T), false
- }
- }
- return xs[0], true
-}
-
-type fixedReg struct {
- class int
- name string
- width int
-}
-
-var fixedRegMap = map[string]fixedReg{
- "XED_REG_XMM0": {VREG_CLASS, "x0", 128},
-}
-
-// decodeReg returns the class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,
-// GREG_CLASS_FIXED), the width in bits, and the register name (if fixed).
-// If the operand cannot be decoded as a register, the class is NOT_REG_CLASS.
-func decodeReg(op *xeddata.Operand) (class, width int, name string) {
- // op.Width tells us the total width, e.g.,:
- //
- // dq => 128 bits (XMM)
- // qq => 256 bits (YMM)
- // mskw => K
- // z[iuf?](8|16|32|...) => 512 bits (ZMM)
- //
- // But the encoding is really weird and it's not clear if these *always*
- // mean XMM/YMM/ZMM or if other irregular things can use these large widths.
- // Hence, we dig into the register sets themselves.
-
- if !strings.HasPrefix(op.NameLHS(), "REG") {
- return NOT_REG_CLASS, 0, ""
- }
- // TODO: We shouldn't be relying on the macro naming conventions. We should
- // use all-dec-patterns.txt, but xeddata doesn't support that table right now.
- rhs := op.NameRHS()
- if !strings.HasSuffix(rhs, "()") {
- if fixedReg, ok := fixedRegMap[rhs]; ok {
- return fixedReg.class, fixedReg.width, fixedReg.name
- }
- return NOT_REG_CLASS, 0, ""
- }
- switch {
- case strings.HasPrefix(rhs, "XMM_"):
- return VREG_CLASS, 128, ""
- case strings.HasPrefix(rhs, "YMM_"):
- return VREG_CLASS, 256, ""
- case strings.HasPrefix(rhs, "ZMM_"):
- return VREG_CLASS, 512, ""
- case strings.HasPrefix(rhs, "GPR64_"), strings.HasPrefix(rhs, "VGPR64_"):
- return GREG_CLASS, 64, ""
- case strings.HasPrefix(rhs, "GPR32_"), strings.HasPrefix(rhs, "VGPR32_"):
- return GREG_CLASS, 32, ""
- }
- return NOT_REG_CLASS, 0, ""
-}
-
-var xtypeRe = regexp.MustCompile(`^([iuf])([0-9]+)$`)
-
-// scalarBaseType describes the base type of a scalar element. This is a Go
-// type, but without the bit width suffix (with the exception of
-// scalarBaseIntOrUint).
-type scalarBaseType int
-
-const (
- scalarBaseInt scalarBaseType = iota
- scalarBaseUint
- scalarBaseIntOrUint // Signed or unsigned is unspecified
- scalarBaseFloat
- scalarBaseComplex
- scalarBaseBFloat
- scalarBaseHFloat
-)
-
-func (s scalarBaseType) regex() string {
- switch s {
- case scalarBaseInt:
- return "int"
- case scalarBaseUint:
- return "uint"
- case scalarBaseIntOrUint:
- return "int|uint"
- case scalarBaseFloat:
- return "float"
- case scalarBaseComplex:
- return "complex"
- case scalarBaseBFloat:
- return "BFloat"
- case scalarBaseHFloat:
- return "HFloat"
- }
- panic(fmt.Sprintf("unknown scalar base type %d", s))
-}
-
-func decodeType(op *xeddata.Operand) (base scalarBaseType, bits int, ok bool) {
- // The xtype tells you the element type. i8, i16, i32, i64, f32, etc.
- //
- // TODO: Things like AVX2 VPAND have an xtype of u256 because they're
- // element-width agnostic. Do I map that to all widths, or just omit the
- // element width and let unification flesh it out? There's no u512
- // (presumably those are all masked, so elem width matters). These are all
- // Category: LOGICAL, so maybe we could use that info?
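-	//
-	// For reference, the common cases decode like this (illustrative):
-	//
-	//	i32  -> (scalarBaseIntOrUint, 32)
-	//	f64  -> (scalarBaseFloat, 64)
-	//	bf16 -> (scalarBaseBFloat, 16)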
-
- // Handle some weird ones.
- switch op.Xtype {
- // 8-bit float formats as defined by Open Compute Project "OCP 8-bit
- // Floating Point Specification (OFP8)".
- case "bf8": // E5M2 float
- return scalarBaseBFloat, 8, true
- case "hf8": // E4M3 float
- return scalarBaseHFloat, 8, true
- case "bf16": // bfloat16 float
- return scalarBaseBFloat, 16, true
- case "2f16":
- // Complex consisting of 2 float16s. Doesn't exist in Go, but we can say
- // what it would be.
- return scalarBaseComplex, 32, true
- case "2i8", "2I8":
- // These just use the lower INT8 in each 16 bit field.
- // As far as I can tell, "2I8" is a typo.
- return scalarBaseInt, 8, true
- case "2u16", "2U16":
- // some VPDP* has it
-		// TODO: does "z" mean it has zeroing?
- return scalarBaseUint, 16, true
- case "2i16", "2I16":
- // some VPDP* has it
- return scalarBaseInt, 16, true
- case "4u8", "4U8":
- // some VPDP* has it
- return scalarBaseUint, 8, true
- case "4i8", "4I8":
- // some VPDP* has it
- return scalarBaseInt, 8, true
- }
-
- // The rest follow a simple pattern.
- m := xtypeRe.FindStringSubmatch(op.Xtype)
- if m == nil {
- // TODO: Report unrecognized xtype
- return 0, 0, false
- }
- bits, _ = strconv.Atoi(m[2])
- switch m[1] {
- case "i", "u":
- // XED is rather inconsistent about what's signed, unsigned, or doesn't
- // matter, so merge them together and let the Go definitions narrow as
- // appropriate. Maybe there's a better way to do this.
- return scalarBaseIntOrUint, bits, true
- case "f":
- return scalarBaseFloat, bits, true
- default:
- panic("unreachable")
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-// this generates type-instantiated boilerplate code for
-// slice operations and tests
-
-import (
- "bufio"
- "bytes"
- "flag"
- "fmt"
- "go/format"
- "io"
- "os"
- "strings"
- "text/template"
-)
-
-type resultTypeFunc func(t string, w, c int) (ot string, ow int, oc int)
-
-// shapes describes a combination of vector widths and various element types
-type shapes struct {
- vecs []int // Vector bit width for this shape.
- ints []int // Int element bit width(s) for this shape
- uints []int // Unsigned int element bit width(s) for this shape
- floats []int // Float element bit width(s) for this shape
- output resultTypeFunc
-}
-
-// shapeAndTemplate is a template and the set of shapes on which it will be expanded
-type shapeAndTemplate struct {
- s *shapes
- t *template.Template
-}
-
-func (sat shapeAndTemplate) target(outType string, width int) shapeAndTemplate {
- newSat := sat
- newShape := *sat.s
- newShape.output = func(t string, w, c int) (ot string, ow int, oc int) {
- return outType, width, c
- }
- newSat.s = &newShape
- return newSat
-}
-
-func (sat shapeAndTemplate) shrinkTo(outType string, by int) shapeAndTemplate {
- newSat := sat
- newShape := *sat.s
- newShape.output = func(t string, w, c int) (ot string, ow int, oc int) {
- return outType, w / by, c * by
- }
- newSat.s = &newShape
- return newSat
-}
-
-func (s *shapes) forAllShapes(f func(seq int, t, upperT string, w, c int, out io.Writer), out io.Writer) {
- vecs := s.vecs
- ints := s.ints
- uints := s.uints
- floats := s.floats
- seq := 0
- for _, v := range vecs {
- for _, w := range ints {
- c := v / w
- f(seq, "int", "Int", w, c, out)
- seq++
- }
- for _, w := range uints {
- c := v / w
- f(seq, "uint", "Uint", w, c, out)
- seq++
- }
- for _, w := range floats {
- c := v / w
- f(seq, "float", "Float", w, c, out)
- seq++
- }
- }
-}
-
-var allShapes = &shapes{
- vecs: []int{128, 256, 512},
- ints: []int{8, 16, 32, 64},
- uints: []int{8, 16, 32, 64},
- floats: []int{32, 64},
-}
-
-var intShapes = &shapes{
- vecs: []int{128, 256, 512},
- ints: []int{8, 16, 32, 64},
-}
-
-var uintShapes = &shapes{
- vecs: []int{128, 256, 512},
- uints: []int{8, 16, 32, 64},
-}
-
-var avx512Shapes = &shapes{
- vecs: []int{512},
- ints: []int{8, 16, 32, 64},
- uints: []int{8, 16, 32, 64},
- floats: []int{32, 64},
-}
-
-var avx2Shapes = &shapes{
- vecs: []int{128, 256},
- ints: []int{8, 16, 32, 64},
- uints: []int{8, 16, 32, 64},
- floats: []int{32, 64},
-}
-
-var avx2MaskedLoadShapes = &shapes{
- vecs: []int{128, 256},
- ints: []int{32, 64},
- uints: []int{32, 64},
- floats: []int{32, 64},
-}
-
-var avx2SmallLoadPunShapes = &shapes{
- // ints are done by hand, these are type-punned to int.
- vecs: []int{128, 256},
- uints: []int{8, 16},
-}
-
-var unaryFlaky = &shapes{ // for tests that support flaky equality
- vecs: []int{128, 256, 512},
- floats: []int{32, 64},
-}
-
-var ternaryFlaky = &shapes{ // for tests that support flaky equality
- vecs: []int{128, 256, 512},
- floats: []int{32},
-}
-
-var avx2SignedComparisons = &shapes{
- vecs: []int{128, 256},
- ints: []int{8, 16, 32, 64},
-}
-
-var avx2UnsignedComparisons = &shapes{
- vecs: []int{128, 256},
- uints: []int{8, 16, 32, 64},
-}
-
-type templateData struct {
- VType string // the type of the vector, e.g. Float32x4
- AOrAn string // for documentation, the article "a" or "an"
- EWidth int // the bit width of the element type, e.g. 32
- Vwidth int // the width of the vector type, e.g. 128
- Count int // the number of elements, e.g. 4
-	WxC    string // the width-by-count string, e.g., "32x4"
-	BxC    string // the same shape expressed in bytes, e.g., "8x16" for a 128-bit vector
- Base string // the title-case Base Type of the vector, e.g., "Float"
- Etype string // the element type, e.g. "float32"
- OxFF string // a mask for the lowest 'count' bits
-
- OVType string // type of output vector
- OEtype string // output element type
- OEType string // output element type, title-case
- OCount int // output element count
-}
-
-func (t templateData) As128BitVec() string {
- return fmt.Sprintf("%s%dx%d", t.Base, t.EWidth, 128/t.EWidth)
-}
-
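-// oneTemplate expands the template t once for the vector shape described by
-// baseType, width, and count, skipping shapes outside 128..512 bits.
-// For illustration (values derived from the code below): baseType "float",
-// width 32, count 8 yields VType "Float32x8", Vwidth 256, WxC "32x8",
-// BxC "8x32", OxFF "0xff", and AOrAn "a".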
-func oneTemplate(t *template.Template, baseType string, width, count int, out io.Writer, rtf resultTypeFunc) {
- b := width * count
- if b < 128 || b > 512 {
- return
- }
-
- ot, ow, oc := baseType, width, count
- if rtf != nil {
- ot, ow, oc = rtf(ot, ow, oc)
- if ow*oc > 512 || ow*oc < 128 || ow < 8 || ow > 64 {
- return
- }
- // TODO someday we will support conversions to 16-bit floats
- if ot == "float" && ow < 32 {
- return
- }
- }
- ovType := fmt.Sprintf("%s%dx%d", strings.ToUpper(ot[:1])+ot[1:], ow, oc)
- oeType := fmt.Sprintf("%s%d", ot, ow)
- oEType := fmt.Sprintf("%s%d", strings.ToUpper(ot[:1])+ot[1:], ow)
-
- wxc := fmt.Sprintf("%dx%d", width, count)
- BaseType := strings.ToUpper(baseType[:1]) + baseType[1:]
- vType := fmt.Sprintf("%s%s", BaseType, wxc)
- eType := fmt.Sprintf("%s%d", baseType, width)
-
- bxc := fmt.Sprintf("%dx%d", 8, count*(width/8))
- aOrAn := "a"
- if strings.Contains("aeiou", baseType[:1]) {
- aOrAn = "an"
- }
- oxFF := fmt.Sprintf("0x%x", uint64((1<<count)-1))
- t.Execute(out, templateData{
- VType: vType,
- AOrAn: aOrAn,
- EWidth: width,
- Vwidth: b,
- Count: count,
- WxC: wxc,
- BxC: bxc,
- Base: BaseType,
- Etype: eType,
- OxFF: oxFF,
- OVType: ovType,
- OEtype: oeType,
- OCount: oc,
- OEType: oEType,
- })
-}
-
-// forTemplates expands the template sat.t for each shape
-// in sat.s, writing to out.
-func (sat shapeAndTemplate) forTemplates(out io.Writer) {
- t, s := sat.t, sat.s
- vecs := s.vecs
- ints := s.ints
- uints := s.uints
- floats := s.floats
- for _, v := range vecs {
- for _, w := range ints {
- c := v / w
- oneTemplate(t, "int", w, c, out, sat.s.output)
- }
- for _, w := range uints {
- c := v / w
- oneTemplate(t, "uint", w, c, out, sat.s.output)
- }
- for _, w := range floats {
- c := v / w
- oneTemplate(t, "float", w, c, out, sat.s.output)
- }
- }
-}
-
-func prologue(s string, out io.Writer) {
- fmt.Fprintf(out,
- `// Code generated by '%s'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-`, s)
-}
-
-func ssaPrologue(s string, out io.Writer) {
- fmt.Fprintf(out,
- `// Code generated by '%s'; DO NOT EDIT.
-
-package ssa
-
-`, s)
-}
-
-func unsafePrologue(s string, out io.Writer) {
- fmt.Fprintf(out,
- `// Code generated by '%s'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-import "unsafe"
-
-`, s)
-}
-
-func testPrologue(t, s string, out io.Writer) {
- fmt.Fprintf(out,
- `// Code generated by '%s'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing %s.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-`, s, t)
-}
-
-func curryTestPrologue(t string) func(s string, out io.Writer) {
- return func(s string, out io.Writer) {
- testPrologue(t, s, out)
- }
-}
-
-func templateOf(name, temp string) shapeAndTemplate {
- return shapeAndTemplate{s: allShapes,
- t: template.Must(template.New(name).Parse(temp))}
-}
-
-func shapedTemplateOf(s *shapes, name, temp string) shapeAndTemplate {
- return shapeAndTemplate{s: s,
- t: template.Must(template.New(name).Parse(temp))}
-}
-
-var sliceTemplate = templateOf("slice", `
-// Load{{.VType}}Slice loads {{.AOrAn}} {{.VType}} from a slice of at least {{.Count}} {{.Etype}}s
-func Load{{.VType}}Slice(s []{{.Etype}}) {{.VType}} {
- return Load{{.VType}}((*[{{.Count}}]{{.Etype}})(s))
-}
-
-// StoreSlice stores x into a slice of at least {{.Count}} {{.Etype}}s
-func (x {{.VType}}) StoreSlice(s []{{.Etype}}) {
- x.Store((*[{{.Count}}]{{.Etype}})(s))
-}
-`)
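-
-// For one shape, e.g. Int32x4, sliceTemplate expands to roughly the following
-// (a sketch of the substitution, not the generated file itself):
-//
-//	// LoadInt32x4Slice loads an Int32x4 from a slice of at least 4 int32s
-//	func LoadInt32x4Slice(s []int32) Int32x4 {
-//		return LoadInt32x4((*[4]int32)(s))
-//	}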
-
-var unaryTemplate = templateOf("unary_helpers", `
-// test{{.VType}}Unary tests the simd unary method f against the expected behavior generated by want
-func test{{.VType}}Unary(t *testing.T, f func(_ simd.{{.VType}}) simd.{{.VType}}, want func(_ []{{.Etype}}) []{{.Etype}}) {
- n := {{.Count}}
- t.Helper()
- forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- g := make([]{{.Etype}}, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)})
- })
-}
-`)
-
-var unaryFlakyTemplate = shapedTemplateOf(unaryFlaky, "unary_flaky_helpers", `
-// test{{.VType}}UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
-func test{{.VType}}UnaryFlaky(t *testing.T, f func(x simd.{{.VType}}) simd.{{.VType}}, want func(x []{{.Etype}}) []{{.Etype}}, flakiness float64) {
- n := {{.Count}}
- t.Helper()
- forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- g := make([]{{.Etype}}, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() {t.Helper(); t.Logf("x=%v", x)})
- })
-}
-`)
-
-var convertTemplate = templateOf("convert_helpers", `
-// test{{.VType}}ConvertTo{{.OEType}} tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func test{{.VType}}ConvertTo{{.OEType}}(t *testing.T, f func(x simd.{{.VType}}) simd.{{.OVType}}, want func(x []{{.Etype}}) []{{.OEtype}}) {
- n := {{.Count}}
- t.Helper()
- forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- g := make([]{{.OEtype}}, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)})
- })
-}
-`)
-
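-// The following adapters force the output element type via target above.
-// For example (illustrative): unaryToInt32 keeps the input shape but sets the
-// output element type to int32, so a Float32x8 input pairs with an Int32x8
-// output in the expanded conversion test helper.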
-var unaryToInt32 = convertTemplate.target("int", 32)
-var unaryToUint32 = convertTemplate.target("uint", 32)
-var unaryToUint16 = convertTemplate.target("uint", 16)
-
-var binaryTemplate = templateOf("binary_helpers", `
-// test{{.VType}}Binary tests the simd binary method f against the expected behavior generated by want
-func test{{.VType}}Binary(t *testing.T, f func(_, _ simd.{{.VType}}) simd.{{.VType}}, want func(_, _ []{{.Etype}}) []{{.Etype}}) {
- n := {{.Count}}
- t.Helper()
- forSlicePair(t, {{.Etype}}s, n, func(x, y []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- b := simd.Load{{.VType}}Slice(y)
- g := make([]{{.Etype}}, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); })
- })
-}
-`)
-
-var ternaryTemplate = templateOf("ternary_helpers", `
-// test{{.VType}}Ternary tests the simd ternary method f against the expected behavior generated by want
-func test{{.VType}}Ternary(t *testing.T, f func(_, _, _ simd.{{.VType}}) simd.{{.VType}}, want func(_, _, _ []{{.Etype}}) []{{.Etype}}) {
- n := {{.Count}}
- t.Helper()
- forSliceTriple(t, {{.Etype}}s, n, func(x, y, z []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- b := simd.Load{{.VType}}Slice(y)
- c := simd.Load{{.VType}}Slice(z)
- g := make([]{{.Etype}}, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z); })
- })
-}
-`)
-
-var ternaryFlakyTemplate = shapedTemplateOf(ternaryFlaky, "ternary_helpers", `
-// test{{.VType}}TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
-// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
-func test{{.VType}}TernaryFlaky(t *testing.T, f func(x, y, z simd.{{.VType}}) simd.{{.VType}}, want func(x, y, z []{{.Etype}}) []{{.Etype}}, flakiness float64) {
- n := {{.Count}}
- t.Helper()
- forSliceTriple(t, {{.Etype}}s, n, func(x, y, z []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- b := simd.Load{{.VType}}Slice(y)
- c := simd.Load{{.VType}}Slice(z)
- g := make([]{{.Etype}}, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, flakiness, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z); })
- })
-}
-`)
-
-var compareTemplate = templateOf("compare_helpers", `
-// test{{.VType}}Compare tests the simd comparison method f against the expected behavior generated by want
-func test{{.VType}}Compare(t *testing.T, f func(_, _ simd.{{.VType}}) simd.Mask{{.WxC}}, want func(_, _ []{{.Etype}}) []int64) {
- n := {{.Count}}
- t.Helper()
- forSlicePair(t, {{.Etype}}s, n, func(x, y []{{.Etype}}) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- b := simd.Load{{.VType}}Slice(y)
- g := make([]int{{.EWidth}}, n)
- f(a, b).AsInt{{.WxC}}().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); })
- })
-}
-`)
-
-// TODO this has not been tested yet.
-var compareMaskedTemplate = templateOf("comparemasked_helpers", `
-// test{{.VType}}CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func test{{.VType}}CompareMasked(t *testing.T,
- f func(_, _ simd.{{.VType}}, m simd.Mask{{.WxC}}) simd.Mask{{.WxC}},
- want func(_, _ []{{.Etype}}) []int64) {
- n := {{.Count}}
- t.Helper()
- forSlicePairMasked(t, {{.Etype}}s, n, func(x, y []{{.Etype}}, m []bool) bool {
- t.Helper()
- a := simd.Load{{.VType}}Slice(x)
- b := simd.Load{{.VType}}Slice(y)
- k := simd.LoadInt{{.WxC}}Slice(toVect[int{{.EWidth}}](m)).ToMask()
- g := make([]int{{.EWidth}}, n)
- f(a, b, k).AsInt{{.WxC}}().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m); })
- })
-}
-`)
-
-var avx512MaskedLoadSlicePartTemplate = shapedTemplateOf(avx512Shapes, "avx 512 load slice part", `
-// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
-// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
-// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
-func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
- l := len(s)
- if l >= {{.Count}} {
- return Load{{.VType}}Slice(s)
- }
- if l == 0 {
- var x {{.VType}}
- return x
- }
- mask := Mask{{.WxC}}FromBits({{.OxFF}} >> ({{.Count}} - l))
- return LoadMasked{{.VType}}(pa{{.VType}}(s), mask)
-}
-
-// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
-func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
- l := len(s)
- if l >= {{.Count}} {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask{{.WxC}}FromBits({{.OxFF}} >> ({{.Count}} - l))
- x.StoreMasked(pa{{.VType}}(s), mask)
-}
-`)
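-
-// A worked example of the bit-mask arithmetic above (for illustration):
-// with Count=8 and a 3-element slice, OxFF is 0xff and 0xff >> (8-3) = 0x07,
-// which enables exactly the low 3 lanes for the masked load or store.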
-
-var avx2MaskedLoadSlicePartTemplate = shapedTemplateOf(avx2MaskedLoadShapes, "avx 2 load slice part", `
-// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
-// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
-// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
-func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
- l := len(s)
- if l >= {{.Count}} {
- return Load{{.VType}}Slice(s)
- }
- if l == 0 {
- var x {{.VType}}
- return x
- }
- mask := vecMask{{.EWidth}}[len(vecMask{{.EWidth}})/2-l:]
- return LoadMasked{{.VType}}(pa{{.VType}}(s), LoadInt{{.WxC}}Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
-func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
- l := len(s)
- if l >= {{.Count}} {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask{{.EWidth}}[len(vecMask{{.EWidth}})/2-l:]
- x.StoreMasked(pa{{.VType}}(s), LoadInt{{.WxC}}Slice(mask).asMask())
-}
-`)
-
-var avx2SmallLoadSlicePartTemplate = shapedTemplateOf(avx2SmallLoadPunShapes, "avx 2 small load slice part", `
-// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
-// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
-// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
-func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
- if len(s) == 0 {
- var zero {{.VType}}
- return zero
- }
- t := unsafe.Slice((*int{{.EWidth}})(unsafe.Pointer(&s[0])), len(s))
- return LoadInt{{.WxC}}SlicePart(t).As{{.VType}}()
-}
-
-// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
-func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
- if len(s) == 0 {
- return
- }
- t := unsafe.Slice((*int{{.EWidth}})(unsafe.Pointer(&s[0])), len(s))
- x.AsInt{{.WxC}}().StoreSlicePart(t)
-}
-`)
-
-func (t templateData) CPUfeature() string {
- switch t.Vwidth {
- case 128:
- return "AVX"
- case 256:
- return "AVX2"
- case 512:
- return "AVX512"
- }
- panic(fmt.Errorf("unexpected vector width %d", t.Vwidth))
-}
-
-var avx2SignedComparisonsTemplate = shapedTemplateOf(avx2SignedComparisons, "avx2 signed comparisons", `
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} {
- ones := x.Equal(x).AsInt{{.WxC}}()
- return y.Greater(x).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} {
- ones := x.Equal(x).AsInt{{.WxC}}()
- return x.Greater(y).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} {
- ones := x.Equal(x).AsInt{{.WxC}}()
- return x.Equal(y).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-`)
-
-var bitWiseIntTemplate = shapedTemplateOf(intShapes, "bitwise int complement", `
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) Not() {{.VType}} {
- return x.Xor(x.Equal(x).As{{.VType}}())
-}
-`)
-
-var bitWiseUintTemplate = shapedTemplateOf(uintShapes, "bitwise uint complement", `
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) Not() {{.VType}} {
- return x.Xor(x.Equal(x).AsInt{{.WxC}}().As{{.VType}}())
-}
-`)
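-
-// The Not emulations above rely on x.Equal(x) producing an all-ones vector for
-// integer elements (every lane compares equal to itself), so XORing with it
-// flips every bit. For one uint8 lane (illustration): 0xca ^ 0xff = 0x35.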
-
-// CPUfeatureAVX2if8 returns AVX2 if the element width is 8;
-// otherwise it returns CPUfeature(). This is the CPU feature
-// for the unsigned comparison emulation, which uses shifts for
-// all the sizes > 8 (shifts are AVX) but must use broadcast
-// (AVX2) for bytes.
-func (t templateData) CPUfeatureAVX2if8() string {
- if t.EWidth == 8 {
- return "AVX2"
- }
- return t.CPUfeature()
-}
-
-var avx2UnsignedComparisonsTemplate = shapedTemplateOf(avx2UnsignedComparisons, "avx2 unsigned comparisons", `
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
-func (x {{.VType}}) Greater(y {{.VType}}) Mask{{.WxC}} {
- a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
-{{- if eq .EWidth 8}}
- signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
-{{- else}}
- ones := x.Equal(x).AsInt{{.WxC}}()
- signs := ones.ShiftAllLeft({{.EWidth}}-1)
-{{- end }}
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
-func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} {
- a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
-{{- if eq .EWidth 8}}
- signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
-{{- else}}
- ones := x.Equal(x).AsInt{{.WxC}}()
- signs := ones.ShiftAllLeft({{.EWidth}}-1)
-{{- end }}
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
-func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} {
- a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
- ones := x.Equal(x).AsInt{{.WxC}}()
-{{- if eq .EWidth 8}}
- signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
-{{- else}}
- signs := ones.ShiftAllLeft({{.EWidth}}-1)
-{{- end }}
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
-func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} {
- a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
- ones := x.Equal(x).AsInt{{.WxC}}()
-{{- if eq .EWidth 8}}
- signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
-{{- else}}
- signs := ones.ShiftAllLeft({{.EWidth}}-1)
-{{- end }}
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature {{.CPUfeature}}
-func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} {
- a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
- ones := x.Equal(x).AsInt{{.WxC}}()
- return a.Equal(b).AsInt{{.WxC}}().Xor(ones).asMask()
-}
-`)
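-
-// The unsigned comparisons above use the classic sign-flip trick: XORing both
-// operands with the sign bit maps unsigned order onto signed order. A worked
-// example for one uint8 lane (illustration): 200 > 100 unsigned; 200^0x80 = 72,
-// 100^0x80 = 228 (-28 as int8), and 72 > -28 signed, so the results agree.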
-
-var unsafePATemplate = templateOf("unsafe PA helper", `
-// pa{{.VType}} returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func pa{{.VType}}(s []{{.Etype}}) *[{{.Count}}]{{.Etype}} {
- return (*[{{.Count}}]{{.Etype}})(unsafe.Pointer(&s[0]))
-}
-`)
-
-var avx2MaskedTemplate = shapedTemplateOf(avx2Shapes, "avx2 .Masked methods", `
-// Masked returns x but with elements zeroed where mask is false.
-func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
- im := mask.AsInt{{.WxC}}()
-{{- if eq .Base "Int" }}
- return im.And(x)
-{{- else}}
- return x.AsInt{{.WxC}}().And(im).As{{.VType}}()
-{{- end -}}
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
-{{- if eq .BxC .WxC -}}
- im := mask.AsInt{{.BxC}}()
-{{- else}}
- im := mask.AsInt{{.WxC}}().AsInt{{.BxC}}()
-{{- end -}}
-{{- if and (eq .Base "Int") (eq .BxC .WxC) }}
- return y.blend(x, im)
-{{- else}}
- ix := x.AsInt{{.BxC}}()
- iy := y.AsInt{{.BxC}}()
- return iy.blend(ix, im).As{{.VType}}()
-{{- end -}}
-}
-`)
-
-// TODO perhaps write these in ways that work better on AVX512
-var avx512MaskedTemplate = shapedTemplateOf(avx512Shapes, "avx512 .Masked methods", `
-// Masked returns x but with elements zeroed where mask is false.
-func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
- im := mask.AsInt{{.WxC}}()
-{{- if eq .Base "Int" }}
- return im.And(x)
-{{- else}}
- return x.AsInt{{.WxC}}().And(im).As{{.VType}}()
-{{- end -}}
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
-{{- if eq .Base "Int" }}
- return y.blendMasked(x, mask)
-{{- else}}
- ix := x.AsInt{{.WxC}}()
- iy := y.AsInt{{.WxC}}()
- return iy.blendMasked(ix, mask).As{{.VType}}()
-{{- end -}}
-}
-`)
-
-func (t templateData) CPUfeatureBC() string {
- switch t.Vwidth {
- case 128:
- return "AVX2"
- case 256:
- return "AVX2"
- case 512:
- if t.EWidth <= 16 {
- return "AVX512BW"
- }
- return "AVX512F"
- }
- panic(fmt.Errorf("unexpected vector width %d", t.Vwidth))
-}
-
-var broadcastTemplate = templateOf("Broadcast functions", `
-// Broadcast{{.VType}} returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature {{.CPUfeatureBC}}
-func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
- var z {{.As128BitVec }}
- return z.SetElem(0, x).Broadcast{{.Vwidth}}()
-}
-`)
-
-var maskCvtTemplate = templateOf("Mask conversions", `
-// ToMask converts from {{.Base}}{{.WxC}} to Mask{{.WxC}}; a mask element is set
-// to true when the corresponding vector element is non-zero.
-func (from {{.Base}}{{.WxC}}) ToMask() (to Mask{{.WxC}}) {
- return from.NotEqual({{.Base}}{{.WxC}}{})
-}
-`)
-
-var stringTemplate = shapedTemplateOf(allShapes, "String methods", `
-// String returns a string representation of SIMD vector x
-func (x {{.VType}}) String() string {
- var s [{{.Count}}]{{.Etype}}
- x.Store(&s)
- return sliceToString(s[:])
-}
-`)
-
-const SIMD = "../../"
-const TD = "../../internal/simd_test/"
-const SSA = "../../../cmd/compile/internal/ssa/"
-
-func main() {
- sl := flag.String("sl", SIMD+"slice_gen_amd64.go", "file name for slice operations")
- cm := flag.String("cm", SIMD+"compare_gen_amd64.go", "file name for comparison operations")
- mm := flag.String("mm", SIMD+"maskmerge_gen_amd64.go", "file name for mask/merge operations")
- op := flag.String("op", SIMD+"other_gen_amd64.go", "file name for other operations")
- ush := flag.String("ush", SIMD+"unsafe_helpers.go", "file name for unsafe helpers")
- bh := flag.String("bh", TD+"binary_helpers_test.go", "file name for binary test helpers")
- uh := flag.String("uh", TD+"unary_helpers_test.go", "file name for unary test helpers")
- th := flag.String("th", TD+"ternary_helpers_test.go", "file name for ternary test helpers")
- ch := flag.String("ch", TD+"compare_helpers_test.go", "file name for compare test helpers")
- cmh := flag.String("cmh", TD+"comparemasked_helpers_test.go", "file name for compare-masked test helpers")
- flag.Parse()
-
- if *sl != "" {
- one(*sl, unsafePrologue,
- sliceTemplate,
- avx512MaskedLoadSlicePartTemplate,
- avx2MaskedLoadSlicePartTemplate,
- avx2SmallLoadSlicePartTemplate,
- )
- }
- if *cm != "" {
- one(*cm, prologue,
- avx2SignedComparisonsTemplate,
- avx2UnsignedComparisonsTemplate,
- )
- }
- if *mm != "" {
- one(*mm, prologue,
- avx2MaskedTemplate,
- avx512MaskedTemplate,
- )
- }
- if *op != "" {
- one(*op, prologue,
- broadcastTemplate,
- maskCvtTemplate,
- bitWiseIntTemplate,
- bitWiseUintTemplate,
- stringTemplate,
- )
- }
- if *ush != "" {
- one(*ush, unsafePrologue, unsafePATemplate)
- }
- if *uh != "" {
- one(*uh, curryTestPrologue("unary simd methods"), unaryTemplate, unaryToInt32, unaryToUint32, unaryToUint16, unaryFlakyTemplate)
- }
- if *bh != "" {
- one(*bh, curryTestPrologue("binary simd methods"), binaryTemplate)
- }
- if *th != "" {
- one(*th, curryTestPrologue("ternary simd methods"), ternaryTemplate, ternaryFlakyTemplate)
- }
- if *ch != "" {
- one(*ch, curryTestPrologue("simd methods that compare two operands"), compareTemplate)
- }
- if *cmh != "" {
- one(*cmh, curryTestPrologue("simd methods that compare two operands under a mask"), compareMaskedTemplate)
- }
-
- nonTemplateRewrites(SSA+"tern_helpers.go", ssaPrologue, classifyBooleanSIMD, ternOpForLogical)
-
-}
-
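-// ternOpForLogical emits a Go switch that maps vector And/Or/Xor/AndNot ops to
-// the corresponding tern (VPTERNLOG) op, punning element widths below 32 bits
-// to 32-bit shapes. An illustrative sample of the generated output:
-//
-//	case OpAndInt8x16, OpOrInt8x16, OpXorInt8x16, OpAndNotInt8x16:
-//		return OpternInt32x4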
-func ternOpForLogical(out io.Writer) {
- fmt.Fprintf(out, `
-func ternOpForLogical(op Op) Op {
- switch op {
-`)
-
- intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
- wt, ct := w, c
- if wt < 32 {
- wt = 32
- ct = (w * c) / wt
- }
- fmt.Fprintf(out, "case OpAndInt%[1]dx%[2]d, OpOrInt%[1]dx%[2]d, OpXorInt%[1]dx%[2]d,OpAndNotInt%[1]dx%[2]d: return OpternInt%dx%d\n", w, c, wt, ct)
- fmt.Fprintf(out, "case OpAndUint%[1]dx%[2]d, OpOrUint%[1]dx%[2]d, OpXorUint%[1]dx%[2]d,OpAndNotUint%[1]dx%[2]d: return OpternUint%dx%d\n", w, c, wt, ct)
- }, out)
-
- fmt.Fprintf(out, `
- }
- return op
-}
-`)
-
-}
-
-func classifyBooleanSIMD(out io.Writer) {
- fmt.Fprintf(out, `
-type SIMDLogicalOP uint8
-const (
-	// boolean simd operations, for reducing expressions to VPTERNLOG* instructions
- // sloInterior is set for non-root nodes in logical-op expression trees.
- // the operations are even-numbered.
- sloInterior SIMDLogicalOP = 1
- sloNone SIMDLogicalOP = 2 * iota
- sloAnd
- sloOr
- sloAndNot
- sloXor
- sloNot
-)
-func classifyBooleanSIMD(v *Value) SIMDLogicalOP {
- switch v.Op {
- case `)
- intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
- op := "And"
- if seq > 0 {
- fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
- } else {
- fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
- }
- seq++
- }, out)
-
- fmt.Fprintf(out, `:
- return sloAnd
-
- case `)
- intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
- op := "Or"
- if seq > 0 {
- fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
- } else {
- fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
- }
- seq++
- }, out)
-
- fmt.Fprintf(out, `:
- return sloOr
-
- case `)
- intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
- op := "AndNot"
- if seq > 0 {
- fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
- } else {
- fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
- }
- seq++
- }, out)
-
- fmt.Fprintf(out, `:
- return sloAndNot
-`)
-
- // "Not" is encoded as x.Xor(x.Equal(x).AsInt8x16())
- // i.e. xor.Args[0] == x, xor.Args[1].Op == As...
- // but AsInt8x16 is a pun/passthrough.
-
- intShapes.forAllShapes(
- func(seq int, t, upperT string, w, c int, out io.Writer) {
- fmt.Fprintf(out, "case OpXor%s%dx%d: ", upperT, w, c)
- fmt.Fprintf(out, `
- if y := v.Args[1]; y.Op == OpEqual%s%dx%d &&
- y.Args[0] == y.Args[1] {
- return sloNot
- }
- `, upperT, w, c)
- fmt.Fprintf(out, "return sloXor\n")
- }, out)
-
- fmt.Fprintf(out, `
- }
- return sloNone
-}
-`)
-}
-
-// numberLines takes a slice of bytes, and returns a string where each line
-// is numbered, starting from 1.
-func numberLines(data []byte) string {
- var buf bytes.Buffer
- r := bytes.NewReader(data)
- s := bufio.NewScanner(r)
- for i := 1; s.Scan(); i++ {
- fmt.Fprintf(&buf, "%d: %s\n", i, s.Text())
- }
- return buf.String()
-}
-
-func nonTemplateRewrites(filename string, prologue func(s string, out io.Writer), rewrites ...func(out io.Writer)) {
- if filename == "" {
- return
- }
-
- ofile := os.Stdout
-
- if filename != "-" {
- var err error
- ofile, err = os.Create(filename)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Could not create the output file %s for the generated code, %v", filename, err)
- os.Exit(1)
- }
- }
-
- out := new(bytes.Buffer)
-
- prologue("go run genfiles.go", out)
- for _, rewrite := range rewrites {
- rewrite(out)
- }
-
- b, err := format.Source(out.Bytes())
- if err != nil {
- fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
- fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
- fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
- os.Exit(1)
- } else {
- ofile.Write(b)
- ofile.Close()
- }
-
-}
-
-func one(filename string, prologue func(s string, out io.Writer), sats ...shapeAndTemplate) {
- if filename == "" {
- return
- }
-
- ofile := os.Stdout
-
- if filename != "-" {
- var err error
- ofile, err = os.Create(filename)
- if err != nil {
- fmt.Fprintf(os.Stderr, "Could not create the output file %s for the generated code, %v", filename, err)
- os.Exit(1)
- }
- }
-
- out := new(bytes.Buffer)
-
- prologue("go run genfiles.go", out)
- for _, sat := range sats {
- sat.forTemplates(out)
- }
-
- b, err := format.Source(out.Bytes())
- if err != nil {
- fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
- fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
- fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
- os.Exit(1)
- } else {
- ofile.Write(b)
- ofile.Close()
- }
-
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "iter"
- "maps"
- "slices"
-)
-
-type Closure struct {
- val *Value
- env envSet
-}
-
-func NewSum(vs ...*Value) Closure {
- id := &ident{name: "sum"}
- return Closure{NewValue(Var{id}), topEnv.bind(id, vs...)}
-}
-
-// IsBottom returns whether c consists of no values.
-func (c Closure) IsBottom() bool {
- return c.val.Domain == nil
-}
-
-// Summands returns the top-level Values of c. This assumes the top-level of c
-// was constructed as a sum, and is mostly useful for debugging.
-func (c Closure) Summands() iter.Seq[*Value] {
- return func(yield func(*Value) bool) {
- var rec func(v *Value, env envSet) bool
- rec = func(v *Value, env envSet) bool {
- switch d := v.Domain.(type) {
- case Var:
- parts := env.partitionBy(d.id)
- for _, part := range parts {
- // It may be a sum of sums. Walk into this value.
- if !rec(part.value, part.env) {
- return false
- }
- }
- return true
- default:
- return yield(v)
- }
- }
- rec(c.val, c.env)
- }
-}
-
-// All enumerates all possible concrete values of c by substituting variables
-// from the environment.
-//
-// E.g., enumerating this Value
-//
-// a: !sum [1, 2]
-// b: !sum [3, 4]
-//
-// results in
-//
-// - {a: 1, b: 3}
-// - {a: 1, b: 4}
-// - {a: 2, b: 3}
-// - {a: 2, b: 4}
-func (c Closure) All() iter.Seq[*Value] {
-	// To enumerate all concrete values under all possible variable bindings,
-	// we use a "non-deterministic continuation-passing style": we traverse the
-	// Value tree in CPS, threading the (possibly narrowing) environment
-	// through the continuations along an Euler tour. Where the environment
-	// permits multiple choices, we invoke the same continuation for each
-	// choice. Similar to a yield function, the continuation can return false
-	// to stop the non-deterministic walk.
- return func(yield func(*Value) bool) {
- c.val.all1(c.env, func(v *Value, e envSet) bool {
- return yield(v)
- })
- }
-}
-
-func (v *Value) all1(e envSet, cont func(*Value, envSet) bool) bool {
- switch d := v.Domain.(type) {
- default:
- panic(fmt.Sprintf("unknown domain type %T", d))
-
- case nil:
- return true
-
- case Top, String:
- return cont(v, e)
-
- case Def:
- fields := d.keys()
- // We can reuse this parts slice because we're doing a DFS through the
- // state space. (Otherwise, we'd have to do some messy threading of an
- // immutable slice-like value through allElt.)
- parts := make(map[string]*Value, len(fields))
-
- // TODO: If there are no Vars or Sums under this Def, then nothing can
- // change the Value or env, so we could just cont(v, e).
- var allElt func(elt int, e envSet) bool
- allElt = func(elt int, e envSet) bool {
- if elt == len(fields) {
- // Build a new Def from the concrete parts. Clone parts because
- // we may reuse it on other non-deterministic branches.
- nVal := newValueFrom(Def{maps.Clone(parts)}, v)
- return cont(nVal, e)
- }
-
- return d.fields[fields[elt]].all1(e, func(v *Value, e envSet) bool {
- parts[fields[elt]] = v
- return allElt(elt+1, e)
- })
- }
- return allElt(0, e)
-
- case Tuple:
- // Essentially the same as Def.
- if d.repeat != nil {
- // There's nothing we can do with this.
- return cont(v, e)
- }
- parts := make([]*Value, len(d.vs))
- var allElt func(elt int, e envSet) bool
- allElt = func(elt int, e envSet) bool {
- if elt == len(d.vs) {
- // Build a new tuple from the concrete parts. Clone parts because
- // we may reuse it on other non-deterministic branches.
- nVal := newValueFrom(Tuple{vs: slices.Clone(parts)}, v)
- return cont(nVal, e)
- }
-
- return d.vs[elt].all1(e, func(v *Value, e envSet) bool {
- parts[elt] = v
- return allElt(elt+1, e)
- })
- }
- return allElt(0, e)
-
- case Var:
- // Go each way this variable can be bound.
- for _, ePart := range e.partitionBy(d.id) {
-			// d.id is no longer bound in this environment partition. We may
- // need it later in the Euler tour, so bind it back to this single
- // value.
- env := ePart.env.bind(d.id, ePart.value)
- if !ePart.value.all1(env, cont) {
- return false
- }
- }
- return true
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "iter"
- "maps"
- "reflect"
- "regexp"
- "slices"
- "strconv"
- "strings"
-)
-
-// A Domain is a non-empty set of values, all of the same kind.
-//
-// Domain may be a scalar:
-//
-// - [String] - Represents string-typed values.
-//
-// Or a composite:
-//
-// - [Def] - A mapping from fixed keys to [Domain]s.
-//
-// - [Tuple] - A fixed-length sequence of [Domain]s, or a single [Domain]
-// repeated for all possible lengths.
-//
-// Or top or bottom:
-//
-// - [Top] - Represents all possible values of all kinds.
-//
-// - nil - Represents no values.
-//
-// Or a variable:
-//
-// - [Var] - A value captured in the environment.
-type Domain interface {
- Exact() bool
- WhyNotExact() string
-
- // decode stores this value in a Go value. If this value is not exact, this
- // returns a potentially wrapped *inexactError.
- decode(reflect.Value) error
-}
-
-type inexactError struct {
- valueType string
- goType string
-}
-
-func (e *inexactError) Error() string {
- return fmt.Sprintf("cannot store inexact %s value in %s", e.valueType, e.goType)
-}
-
-type decodeError struct {
- path string
- err error
-}
-
-func newDecodeError(path string, err error) *decodeError {
- if err, ok := err.(*decodeError); ok {
- return &decodeError{path: path + "." + err.path, err: err.err}
- }
- return &decodeError{path: path, err: err}
-}
-
-func (e *decodeError) Unwrap() error {
- return e.err
-}
-
-func (e *decodeError) Error() string {
- return fmt.Sprintf("%s: %s", e.path, e.err)
-}
-
-// Top represents all possible values of all possible types.
-type Top struct{}
-
-func (t Top) Exact() bool { return false }
-func (t Top) WhyNotExact() string { return "is top" }
-
-func (t Top) decode(rv reflect.Value) error {
- // We can decode Top into a pointer-typed value as nil.
- if rv.Kind() != reflect.Pointer {
- return &inexactError{"top", rv.Type().String()}
- }
- rv.SetZero()
- return nil
-}
-
-// A Def is a mapping from field names to [Value]s. Any fields not explicitly
-// listed have [Value] [Top].
-type Def struct {
- fields map[string]*Value
-}
-
-// A DefBuilder builds a [Def] one field at a time. The zero value is an empty
-// [Def].
-type DefBuilder struct {
- fields map[string]*Value
-}
-
-func (b *DefBuilder) Add(name string, v *Value) {
- if b.fields == nil {
- b.fields = make(map[string]*Value)
- }
- if old, ok := b.fields[name]; ok {
- panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old))
- }
- b.fields[name] = v
-}
-
-// Build constructs a [Def] from the fields added to this builder.
-func (b *DefBuilder) Build() Def {
- return Def{maps.Clone(b.fields)}
-}
-
-// Exact returns true if all field Values are exact.
-func (d Def) Exact() bool {
- for _, v := range d.fields {
- if !v.Exact() {
- return false
- }
- }
- return true
-}
-
-// WhyNotExact returns why the value is not exact, or "" if it is exact.
-func (d Def) WhyNotExact() string {
- for s, v := range d.fields {
- if !v.Exact() {
- w := v.WhyNotExact()
- return "field " + s + ": " + w
- }
- }
- return ""
-}
-
-func (d Def) decode(rv reflect.Value) error {
- if rv.Kind() != reflect.Struct {
- return fmt.Errorf("cannot decode Def into %s", rv.Type())
- }
-
- var lowered map[string]string // Lower case -> canonical for d.fields.
- rt := rv.Type()
- for fi := range rv.NumField() {
- fType := rt.Field(fi)
- if fType.PkgPath != "" {
- continue
- }
- v := d.fields[fType.Name]
- if v == nil {
- v = topValue
-
- // Try a case-insensitive match
- canon, ok := d.fields[strings.ToLower(fType.Name)]
- if ok {
- v = canon
- } else {
- if lowered == nil {
- lowered = make(map[string]string, len(d.fields))
- for k := range d.fields {
- l := strings.ToLower(k)
- if k != l {
- lowered[l] = k
- }
- }
- }
- canon, ok := lowered[strings.ToLower(fType.Name)]
- if ok {
- v = d.fields[canon]
- }
- }
- }
- if err := decodeReflect(v, rv.Field(fi)); err != nil {
- return newDecodeError(fType.Name, err)
- }
- }
- return nil
-}
-
-func (d Def) keys() []string {
- return slices.Sorted(maps.Keys(d.fields))
-}
-
-func (d Def) All() iter.Seq2[string, *Value] {
- // TODO: We call All fairly often. It's probably bad to sort this every
- // time.
- keys := slices.Sorted(maps.Keys(d.fields))
- return func(yield func(string, *Value) bool) {
- for _, k := range keys {
- if !yield(k, d.fields[k]) {
- return
- }
- }
- }
-}
-
-// A Tuple is a sequence of Values in one of two forms: 1. a fixed-length tuple,
-// where each Value can be different or 2. a "repeated tuple", which is a Value
-// repeated 0 or more times.
-type Tuple struct {
- vs []*Value
-
- // repeat, if non-nil, means this Tuple consists of an element repeated 0 or
- // more times. If repeat is non-nil, vs must be nil. This is a generator
- // function because we don't necessarily want *exactly* the same Value
- // repeated. For example, in YAML encoding, a !sum in a repeated tuple needs
- // a fresh variable in each instance.
- repeat []func(envSet) (*Value, envSet)
-}
-
-func NewTuple(vs ...*Value) Tuple {
- return Tuple{vs: vs}
-}
-
-func NewRepeat(gens ...func(envSet) (*Value, envSet)) Tuple {
- return Tuple{repeat: gens}
-}
-
-func (d Tuple) Exact() bool {
- if d.repeat != nil {
- return false
- }
- for _, v := range d.vs {
- if !v.Exact() {
- return false
- }
- }
- return true
-}
-
-func (d Tuple) WhyNotExact() string {
- if d.repeat != nil {
- return "d.repeat is not nil"
- }
- for i, v := range d.vs {
- if !v.Exact() {
- w := v.WhyNotExact()
- return "index " + strconv.FormatInt(int64(i), 10) + ": " + w
- }
- }
- return ""
-}
-
-func (d Tuple) decode(rv reflect.Value) error {
- if d.repeat != nil {
- return &inexactError{"repeated tuple", rv.Type().String()}
- }
- // TODO: We could also do arrays.
- if rv.Kind() != reflect.Slice {
- return fmt.Errorf("cannot decode Tuple into %s", rv.Type())
- }
- if rv.IsNil() || rv.Cap() < len(d.vs) {
- rv.Set(reflect.MakeSlice(rv.Type(), len(d.vs), len(d.vs)))
- } else {
- rv.SetLen(len(d.vs))
- }
- for i, v := range d.vs {
- if err := decodeReflect(v, rv.Index(i)); err != nil {
- return newDecodeError(fmt.Sprintf("%d", i), err)
- }
- }
- return nil
-}
-
-// A String represents a set of strings. It can represent the intersection of a
-// set of regexps, or a single exact string. In general, the domain of a String
-// is non-empty, but we do not attempt to prove emptiness of a regexp value.
-type String struct {
- kind stringKind
- re []*regexp.Regexp // Intersection of regexps
- exact string
-}
-
-type stringKind int
-
-const (
- stringRegex stringKind = iota
- stringExact
-)
-
-func NewStringRegex(exprs ...string) (String, error) {
- if len(exprs) == 0 {
- exprs = []string{""}
- }
- v := String{kind: -1}
- for _, expr := range exprs {
- if expr == "" {
- // Skip constructing the regexp. It won't have a "literal prefix"
- // and so we wind up thinking this is a regexp instead of an exact
- // (empty) string.
- v = String{kind: stringExact, exact: ""}
- continue
- }
-
- re, err := regexp.Compile(`\A(?:` + expr + `)\z`)
- if err != nil {
- return String{}, fmt.Errorf("parsing value: %s", err)
- }
-
-		// An exact value narrows the whole domain to exact, so we're done, but we
-		// keep parsing so that errors in the remaining expressions are still reported.
- if v.kind == stringExact {
- continue
- }
-
- if exact, complete := re.LiteralPrefix(); complete {
- v = String{kind: stringExact, exact: exact}
- } else {
- v.kind = stringRegex
- v.re = append(v.re, re)
- }
- }
- return v, nil
-}
-
-func NewStringExact(s string) String {
- return String{kind: stringExact, exact: s}
-}
-
-// Exact returns whether this Value is known to consist of a single string.
-func (d String) Exact() bool {
- return d.kind == stringExact
-}
-
-func (d String) WhyNotExact() string {
- if d.kind == stringExact {
- return ""
- }
- return "string is not exact"
-}
-
-func (d String) decode(rv reflect.Value) error {
- if d.kind != stringExact {
- return &inexactError{"regex", rv.Type().String()}
- }
- switch rv.Kind() {
- default:
- return fmt.Errorf("cannot decode String into %s", rv.Type())
- case reflect.String:
- rv.SetString(d.exact)
- case reflect.Int:
- i, err := strconv.Atoi(d.exact)
- if err != nil {
- return fmt.Errorf("cannot decode String into %s: %s", rv.Type(), err)
- }
- rv.SetInt(int64(i))
- case reflect.Bool:
- b, err := strconv.ParseBool(d.exact)
- if err != nil {
- return fmt.Errorf("cannot decode String into %s: %s", rv.Type(), err)
- }
- rv.SetBool(b)
- }
- return nil
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "bytes"
- "fmt"
- "html"
- "io"
- "os"
- "os/exec"
- "strings"
-)
-
-const maxNodes = 30
-
-type dotEncoder struct {
- w *bytes.Buffer
-
- idGen int // Node name generation
- valLimit int // Limit the number of Values in a subgraph
-
- idp identPrinter
-}
-
-func newDotEncoder() *dotEncoder {
- return &dotEncoder{
- w: new(bytes.Buffer),
- }
-}
-
-func (enc *dotEncoder) clear() {
- enc.w.Reset()
- enc.idGen = 0
-}
-
-func (enc *dotEncoder) writeTo(w io.Writer) {
- fmt.Fprintln(w, "digraph {")
- // Use the "new" ranking algorithm, which lets us put nodes from different
- // clusters in the same rank.
- fmt.Fprintln(w, "newrank=true;")
- fmt.Fprintln(w, "node [shape=box, ordering=out];")
-
- w.Write(enc.w.Bytes())
- fmt.Fprintln(w, "}")
-}
-
-func (enc *dotEncoder) writeSvg(w io.Writer) error {
- cmd := exec.Command("dot", "-Tsvg")
- in, err := cmd.StdinPipe()
- if err != nil {
- return err
- }
- var out bytes.Buffer
- cmd.Stdout = &out
- cmd.Stderr = os.Stderr
- if err := cmd.Start(); err != nil {
- return err
- }
- enc.writeTo(in)
- in.Close()
- if err := cmd.Wait(); err != nil {
- return err
- }
- // Trim SVG header so the result can be embedded
- //
- // TODO: In Graphviz 10.0.1, we could use -Tsvg_inline.
- svg := out.Bytes()
- if i := bytes.Index(svg, []byte("<svg ")); i >= 0 {
- svg = svg[i:]
- }
- _, err = w.Write(svg)
- return err
-}
-
-func (enc *dotEncoder) newID(f string) string {
- id := fmt.Sprintf(f, enc.idGen)
- enc.idGen++
- return id
-}
-
-func (enc *dotEncoder) node(label, sublabel string) string {
- id := enc.newID("n%d")
- l := html.EscapeString(label)
- if sublabel != "" {
- l += fmt.Sprintf("<BR ALIGN=\"CENTER\"/><FONT POINT-SIZE=\"10\">%s</FONT>", html.EscapeString(sublabel))
- }
- fmt.Fprintf(enc.w, "%s [label=<%s>];\n", id, l)
- return id
-}
-
-func (enc *dotEncoder) edge(from, to string, label string, args ...any) {
- l := fmt.Sprintf(label, args...)
- fmt.Fprintf(enc.w, "%s -> %s [label=%q];\n", from, to, l)
-}
-
-func (enc *dotEncoder) valueSubgraph(v *Value) {
- enc.valLimit = maxNodes
- cID := enc.newID("cluster_%d")
- fmt.Fprintf(enc.w, "subgraph %s {\n", cID)
- fmt.Fprintf(enc.w, "style=invis;")
- vID := enc.value(v)
- fmt.Fprintf(enc.w, "}\n")
- // We don't need the IDs right now.
- _, _ = cID, vID
-}
-
-func (enc *dotEncoder) value(v *Value) string {
- if enc.valLimit <= 0 {
- id := enc.newID("n%d")
- fmt.Fprintf(enc.w, "%s [label=\"...\", shape=triangle];\n", id)
- return id
- }
- enc.valLimit--
-
- switch vd := v.Domain.(type) {
- default:
- panic(fmt.Sprintf("unknown domain type %T", vd))
-
- case nil:
- return enc.node("_|_", "")
-
- case Top:
- return enc.node("_", "")
-
- // TODO: Like in YAML, figure out if this is just a sum. In dot, we
- // could say any unentangled variable is a sum, and if it has more than
- // one reference just share the node.
-
- // case Sum:
- // node := enc.node("Sum", "")
- // for i, elt := range vd.vs {
- // enc.edge(node, enc.value(elt), "%d", i)
- // if enc.valLimit <= 0 {
- // break
- // }
- // }
- // return node
-
- case Def:
- node := enc.node("Def", "")
- for k, v := range vd.All() {
- enc.edge(node, enc.value(v), "%s", k)
- if enc.valLimit <= 0 {
- break
- }
- }
- return node
-
- case Tuple:
- if vd.repeat == nil {
- label := "Tuple"
- node := enc.node(label, "")
- for i, elt := range vd.vs {
- enc.edge(node, enc.value(elt), "%d", i)
- if enc.valLimit <= 0 {
- break
- }
- }
- return node
- } else {
- // TODO
- return enc.node("TODO: Repeat", "")
- }
-
- case String:
- switch vd.kind {
- case stringExact:
- return enc.node(fmt.Sprintf("%q", vd.exact), "")
- case stringRegex:
- var parts []string
- for _, re := range vd.re {
- parts = append(parts, fmt.Sprintf("%q", re))
- }
- return enc.node(strings.Join(parts, "&"), "")
- }
- panic("bad String kind")
-
- case Var:
- return enc.node(fmt.Sprintf("Var %s", enc.idp.unique(vd.id)), "")
- }
-}
-
-func (enc *dotEncoder) envSubgraph(e envSet) {
- enc.valLimit = maxNodes
- cID := enc.newID("cluster_%d")
- fmt.Fprintf(enc.w, "subgraph %s {\n", cID)
- fmt.Fprintf(enc.w, "style=invis;")
- vID := enc.env(e.root)
- fmt.Fprintf(enc.w, "}\n")
- _, _ = cID, vID
-}
-
-func (enc *dotEncoder) env(e *envExpr) string {
- switch e.kind {
- default:
- panic("bad kind")
- case envZero:
- return enc.node("0", "")
- case envUnit:
- return enc.node("1", "")
- case envBinding:
- node := enc.node(fmt.Sprintf("%q :", enc.idp.unique(e.id)), "")
- enc.edge(node, enc.value(e.val), "")
- return node
- case envProduct:
- node := enc.node("⨯", "")
- for _, op := range e.operands {
- enc.edge(node, enc.env(op), "")
- }
- return node
- case envSum:
- node := enc.node("+", "")
- for _, op := range e.operands {
- enc.edge(node, enc.env(op), "")
- }
- return node
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "iter"
- "reflect"
- "strings"
-)
-
-// An envSet is an immutable set of environments, where each environment is a
-// mapping from [ident]s to [Value]s.
-//
-// To keep this compact, we use an algebraic representation similar to
-// relational algebra. The atoms are zero, unit, or a singular binding:
-//
-// - A singular binding {x: v} is an environment set consisting of a single
-// environment that binds a single ident x to a single value v.
-//
-// - Zero (0) is the empty set.
-//
-// - Unit (1) is an environment set consisting of a single, empty environment
-// (no bindings).
-//
-// From these, we build up more complex sets of environments using sums and
-// cross products:
-//
-// - A sum, E + F, is simply the union of the two environment sets: E ∪ F
-//
-// - A cross product, E ⨯ F, is the Cartesian product of the two environment
-// sets, followed by joining each pair of environments: {e ⊕ f | (e, f) ∊ E ⨯ F}
-//
-// The join of two environments, e ⊕ f, is an environment that contains all of
-// the bindings in either e or f. To detect bugs, it is an error if an
-// identifier is bound in both e and f (however, see below for what we could do
-// differently).
-//
-// Environment sets form a commutative semiring and thus obey the usual
-// commutative semiring rules:
-//
-// e + 0 = e
-// e ⨯ 0 = 0
-// e ⨯ 1 = e
-// e + f = f + e
-// e ⨯ f = f ⨯ e
-//
-// Furthermore, environment sets are additively and multiplicatively idempotent
-// because + and ⨯ are themselves defined in terms of sets:
-//
-// e + e = e
-// e ⨯ e = e
-//
-// # Examples
-//
-// To represent {{x: 1, y: 1}, {x: 2, y: 2}}, we build the two environments and
-// sum them:
-//
-// ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})
-//
-// If we add a third variable z that can be 1 or 2, independent of x and y, we
-// get four logical environments:
-//
-// {x: 1, y: 1, z: 1}
-// {x: 2, y: 2, z: 1}
-// {x: 1, y: 1, z: 2}
-// {x: 2, y: 2, z: 2}
-//
-// This could be represented as a sum of all four environments, but because z is
-// independent, we can use a more compact representation:
-//
-// (({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2})
-//
-// # Generalized cross product
-//
-// While cross-product is currently restricted to disjoint environments, we
-// could generalize the definition of joining two environments to:
-//
-// {xₖ: vₖ} ⊕ {xₖ: wₖ} = {xₖ: vₖ ∩ wₖ} (where unbound idents are bound to the [Top] value, ⟙)
-//
-// where v ∩ w is the unification of v and w. This itself could be coarsened to
-//
-// v ∩ w = v if w = ⟙
-// = w if v = ⟙
-// = v if v = w
-// = 0 otherwise
-//
-// We could use this rule to implement substitution. For example, E ⨯ {x: 1}
-// narrows environment set E to only environments in which x is bound to 1. But
-// we currently don't do this.
-type envSet struct {
- root *envExpr
-}
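-
-// As a minimal sketch using only this package's helpers, the example above,
-//
-//	(({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2})
-//
-// could be built as follows (the idents and values are illustrative):
-//
-//	x, y, z := &ident{name: "x"}, &ident{name: "y"}, &ident{name: "z"}
-//	one, two := NewValue(NewStringExact("1")), NewValue(NewStringExact("2"))
-//	xy := unionEnvs(
-//		topEnv.bind(x, one).bind(y, one),
-//		topEnv.bind(x, two).bind(y, two),
-//	)
-//	e := crossEnvs(xy, topEnv.bind(z, one, two))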
-
-type envExpr struct {
- // TODO: A tree-based data structure for this may not be ideal, since it
- // involves a lot of walking to find things and we often have to do deep
- // rewrites anyway for partitioning. Would some flattened array-style
- // representation be better, possibly combined with an index of ident uses?
- // We could even combine that with an immutable array abstraction (ala
- // Clojure) that could enable more efficient construction operations.
-
- kind envExprKind
-
- // For envBinding
- id *ident
- val *Value
-
- // For sum or product. Len must be >= 2 and none of the elements can have
- // the same kind as this node.
- operands []*envExpr
-}
-
-type envExprKind byte
-
-const (
- envZero envExprKind = iota
- envUnit
- envProduct
- envSum
- envBinding
-)
-
-var (
-	// topEnv is the unit value (multiplicative identity) of an [envSet].
-	topEnv = envSet{envExprUnit}
-	// bottomEnv is the zero value (additive identity) of an [envSet].
- bottomEnv = envSet{envExprZero}
-
- envExprZero = &envExpr{kind: envZero}
- envExprUnit = &envExpr{kind: envUnit}
-)
-
-// bind binds id to each of vals in e.
-//
-// It panics if id is already bound in e.
-//
-// Environments are typically initially constructed by starting with [topEnv]
-// and calling bind one or more times.
-func (e envSet) bind(id *ident, vals ...*Value) envSet {
- if e.isEmpty() {
- return bottomEnv
- }
-
- // TODO: If any of vals are _, should we just drop that val? We're kind of
- // inconsistent about whether an id missing from e means id is invalid or
- // means id is _.
-
- // Check that id isn't present in e.
- for range e.root.bindings(id) {
- panic("id " + id.name + " already present in environment")
- }
-
- // Create a sum of all the values.
- bindings := make([]*envExpr, 0, 1)
- for _, val := range vals {
- bindings = append(bindings, &envExpr{kind: envBinding, id: id, val: val})
- }
-
- // Multiply it in.
- return envSet{newEnvExprProduct(e.root, newEnvExprSum(bindings...))}
-}
-
-func (e envSet) isEmpty() bool {
- return e.root.kind == envZero
-}
-
-// bindings yields all [envBinding] nodes in e with the given id. If id is nil,
-// it yields all binding nodes.
-func (e *envExpr) bindings(id *ident) iter.Seq[*envExpr] {
- // This is just a pre-order walk and it happens this is the only thing we
- // need a pre-order walk for.
- return func(yield func(*envExpr) bool) {
- var rec func(e *envExpr) bool
- rec = func(e *envExpr) bool {
- if e.kind == envBinding && (id == nil || e.id == id) {
- if !yield(e) {
- return false
- }
- }
- for _, o := range e.operands {
- if !rec(o) {
- return false
- }
- }
- return true
- }
- rec(e)
- }
-}
-
-// newEnvExprProduct constructs a product node from exprs, performing
-// simplifications. It does NOT check that bindings are disjoint.
-func newEnvExprProduct(exprs ...*envExpr) *envExpr {
- factors := make([]*envExpr, 0, 2)
- for _, expr := range exprs {
- switch expr.kind {
- case envZero:
- return envExprZero
- case envUnit:
- // No effect on product
- case envProduct:
- factors = append(factors, expr.operands...)
- default:
- factors = append(factors, expr)
- }
- }
-
- if len(factors) == 0 {
- return envExprUnit
- } else if len(factors) == 1 {
- return factors[0]
- }
- return &envExpr{kind: envProduct, operands: factors}
-}
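-
-// A quick sketch of the simplifications, given binding nodes bx, by, and bz
-// (each an *envExpr with kind envBinding):
-//
-//	newEnvExprProduct(envExprZero, bx) // envExprZero: a zero factor annihilates the product
-//	newEnvExprProduct(envExprUnit, bx) // bx: unit factors are dropped
-//	p := newEnvExprProduct(bx, by)     // product node with operands bx, by
-//	newEnvExprProduct(p, bz)           // flattened into a single product: bx, by, bz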
-
-// newEnvExprSum constructs a sum node from exprs, performing simplifications.
-func newEnvExprSum(exprs ...*envExpr) *envExpr {
- // TODO: If all of envs are products (or bindings), factor any common terms.
- // E.g., x * y + x * z ==> x * (y + z). This is easy to do for binding
- // terms, but harder to do for more general terms.
-
- var have smallSet[*envExpr]
- terms := make([]*envExpr, 0, 2)
- for _, expr := range exprs {
- switch expr.kind {
- case envZero:
- // No effect on sum
- case envSum:
- for _, expr1 := range expr.operands {
- if have.Add(expr1) {
- terms = append(terms, expr1)
- }
- }
- default:
- if have.Add(expr) {
- terms = append(terms, expr)
- }
- }
- }
-
- if len(terms) == 0 {
- return envExprZero
- } else if len(terms) == 1 {
- return terms[0]
- }
- return &envExpr{kind: envSum, operands: terms}
-}
-
-func crossEnvs(env1, env2 envSet) envSet {
- // Confirm that envs have disjoint idents.
- var ids1 smallSet[*ident]
- for e := range env1.root.bindings(nil) {
- ids1.Add(e.id)
- }
- for e := range env2.root.bindings(nil) {
- if ids1.Has(e.id) {
- panic(fmt.Sprintf("%s bound on both sides of cross-product", e.id.name))
- }
- }
-
- return envSet{newEnvExprProduct(env1.root, env2.root)}
-}
-
-func unionEnvs(envs ...envSet) envSet {
- exprs := make([]*envExpr, len(envs))
- for i := range envs {
- exprs[i] = envs[i].root
- }
- return envSet{newEnvExprSum(exprs...)}
-}
-
-// envPartition is a subset of an env where id is bound to value in all
-// deterministic environments.
-type envPartition struct {
- id *ident
- value *Value
- env envSet
-}
-
-// partitionBy splits e by distinct bindings of id and removes id from each
-// partition.
-//
-// If there are environments in e where id is not bound, they will not be
-// reflected in any partition.
-//
-// It panics if e is bottom, since attempting to partition an empty environment
-// set almost certainly indicates a bug.
-func (e envSet) partitionBy(id *ident) []envPartition {
- if e.isEmpty() {
- // We could return zero partitions, but getting here at all almost
- // certainly indicates a bug.
- panic("cannot partition empty environment set")
- }
-
- // Emit a partition for each value of id.
- var seen smallSet[*Value]
- var parts []envPartition
- for n := range e.root.bindings(id) {
- if !seen.Add(n.val) {
- // Already emitted a partition for this value.
- continue
- }
-
- parts = append(parts, envPartition{
- id: id,
- value: n.val,
- env: envSet{e.root.substitute(id, n.val)},
- })
- }
-
- return parts
-}
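-
-// Continuing the sketch from the envSet example above: partitioning the sum
-// xy = ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2}) by x yields two partitions,
-// because substituting each binding of x with 1 (or 0) collapses the sum to
-// the remaining y binding:
-//
-//	parts := xy.partitionBy(x)
-//	// parts[0]: id x, value 1, env {y: 1}
-//	// parts[1]: id x, value 2, env {y: 2}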
-
-// substitute replaces bindings of id to val with 1 and bindings of id to any
-// other value with 0 and simplifies the result.
-func (e *envExpr) substitute(id *ident, val *Value) *envExpr {
- switch e.kind {
- default:
- panic("bad kind")
-
- case envZero, envUnit:
- return e
-
- case envBinding:
- if e.id != id {
- return e
- } else if e.val != val {
- return envExprZero
- } else {
- return envExprUnit
- }
-
- case envProduct, envSum:
- // Substitute each operand. Sometimes, this won't change anything, so we
- // build the new operands list lazily.
- var nOperands []*envExpr
- for i, op := range e.operands {
- nOp := op.substitute(id, val)
- if nOperands == nil && op != nOp {
- // Operand diverged; initialize nOperands.
- nOperands = make([]*envExpr, 0, len(e.operands))
- nOperands = append(nOperands, e.operands[:i]...)
- }
- if nOperands != nil {
- nOperands = append(nOperands, nOp)
- }
- }
- if nOperands == nil {
- // Nothing changed.
- return e
- }
- if e.kind == envProduct {
- return newEnvExprProduct(nOperands...)
- } else {
- return newEnvExprSum(nOperands...)
- }
- }
-}
-
-// A smallSet is a set optimized for stack allocation when small.
-type smallSet[T comparable] struct {
- array [32]T
- n int
-
- m map[T]struct{}
-}
-
-// Has returns whether val is in set.
-func (s *smallSet[T]) Has(val T) bool {
- arr := s.array[:s.n]
- for i := range arr {
- if arr[i] == val {
- return true
- }
- }
- _, ok := s.m[val]
- return ok
-}
-
-// Add adds val to the set and returns true if it was added (not already
-// present).
-func (s *smallSet[T]) Add(val T) bool {
- // Test for presence.
- if s.Has(val) {
- return false
- }
-
- // Add it
- if s.n < len(s.array) {
- s.array[s.n] = val
- s.n++
- } else {
- if s.m == nil {
- s.m = make(map[T]struct{})
- }
- s.m[val] = struct{}{}
- }
- return true
-}
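-
-// A minimal usage sketch: the first 32 distinct values live in the inline
-// array, and anything beyond that spills into the lazily allocated map.
-//
-//	var s smallSet[int]
-//	s.Add(1) // true: newly added
-//	s.Add(1) // false: already present
-//	s.Has(2) // false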
-
-type ident struct {
- _ [0]func() // Not comparable (only compare *ident)
- name string
-}
-
-type Var struct {
- id *ident
-}
-
-func (d Var) Exact() bool {
- // These can't appear in concrete Values.
- panic("Exact called on non-concrete Value")
-}
-
-func (d Var) WhyNotExact() string {
- // These can't appear in concrete Values.
- return "WhyNotExact called on non-concrete Value"
-}
-
-func (d Var) decode(rv reflect.Value) error {
- return &inexactError{"var", rv.Type().String()}
-}
-
-func (d Var) unify(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
- // TODO: Vars from !sums in the input can have a huge number of values.
- // Unifying these could be way more efficient with some indexes over any
- // exact values we can pull out, like Def fields that are exact Strings.
- // Maybe we try to produce an array of yes/no/maybe matches and then we only
- // have to do deeper evaluation of the maybes. We could probably cache this
- // on an envTerm. It may also help to special-case Var/Var unification to
- // pick which one to index versus enumerate.
-
- if vd, ok := w.Domain.(Var); ok && d.id == vd.id {
- // Unifying $x with $x results in $x. If we descend into this we'll have
- // problems because we strip $x out of the environment to keep ourselves
- // honest and then can't find it on the other side.
- //
- // TODO: I'm not positive this is the right fix.
- return vd, e, nil
- }
-
- // We need to unify w with the value of d in each possible environment. We
- // can save some work by grouping environments by the value of d, since
- // there will be a lot of redundancy here.
- var nEnvs []envSet
- envParts := e.partitionBy(d.id)
- for i, envPart := range envParts {
- exit := uf.enterVar(d.id, i)
- // Each branch logically gets its own copy of the initial environment
- // (narrowed down to just this binding of the variable), and each branch
- // may result in different changes to that starting environment.
- res, e2, err := w.unify(envPart.value, envPart.env, swap, uf)
- exit.exit()
- if err != nil {
- return nil, envSet{}, err
- }
- if res.Domain == nil {
- // This branch entirely failed to unify, so it's gone.
- continue
- }
- nEnv := e2.bind(d.id, res)
- nEnvs = append(nEnvs, nEnv)
- }
-
- if len(nEnvs) == 0 {
- // All branches failed
- return nil, bottomEnv, nil
- }
-
- // The effect of this is entirely captured in the environment. We can return
- // back the same Bind node.
- return d, unionEnvs(nEnvs...), nil
-}
-
-// An identPrinter maps [ident]s to unique string names.
-type identPrinter struct {
- ids map[*ident]string
- idGen map[string]int
-}
-
-func (p *identPrinter) unique(id *ident) string {
- if p.ids == nil {
- p.ids = make(map[*ident]string)
- p.idGen = make(map[string]int)
- }
-
- name, ok := p.ids[id]
- if !ok {
- gen := p.idGen[id.name]
- p.idGen[id.name]++
- if gen == 0 {
- name = id.name
- } else {
- name = fmt.Sprintf("%s#%d", id.name, gen)
- }
- p.ids[id] = name
- }
-
- return name
-}
-
-func (p *identPrinter) slice(ids []*ident) string {
- var strs []string
- for _, id := range ids {
- strs = append(strs, p.unique(id))
- }
- return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
-}
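-
-// A small sketch of the naming scheme: distinct idents that share a string
-// name get #N suffixes, and repeated lookups of the same ident are stable.
-//
-//	var p identPrinter
-//	a, b := &ident{name: "x"}, &ident{name: "x"}
-//	p.unique(a) // "x"
-//	p.unique(b) // "x#1"
-//	p.unique(a) // "x"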
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "html"
- "io"
- "strings"
-)
-
-func (t *tracer) writeHTML(w io.Writer) {
- if !t.saveTree {
- panic("writeHTML called without tracer.saveTree")
- }
-
- fmt.Fprintf(w, "<html><head><style>%s</style></head>", htmlCSS)
- for _, root := range t.trees {
- dot := newDotEncoder()
- html := htmlTracer{w: w, dot: dot}
- html.writeTree(root)
- }
- fmt.Fprintf(w, "</html>\n")
-}
-
-const htmlCSS = `
-.unify {
- display: grid;
- grid-auto-columns: min-content;
- text-align: center;
-}
-
-.header {
- grid-row: 1;
- font-weight: bold;
- padding: 0.25em;
- position: sticky;
- top: 0;
- background: white;
-}
-
-.envFactor {
- display: grid;
- grid-auto-rows: min-content;
- grid-template-columns: subgrid;
- text-align: center;
-}
-`
-
-type htmlTracer struct {
- w io.Writer
- dot *dotEncoder
- svgs map[any]string
-}
-
-func (t *htmlTracer) writeTree(node *traceTree) {
- // TODO: This could be really nice.
- //
- // - Put nodes that were unified on the same rank with {rank=same; a; b}
- //
- // - On hover, highlight nodes that node was unified with and the result. If
- // it's a variable, highlight it in the environment, too.
- //
- // - On click, show the details of unifying that node.
- //
- // This could be the only way to navigate, without necessarily needing the
- // whole nest of <detail> nodes.
-
- // TODO: It might be possible to write this out on the fly.
-
- t.emit([]*Value{node.v, node.w}, []string{"v", "w"}, node.envIn)
-
- // Render children.
- for i, child := range node.children {
- if i >= 10 {
- fmt.Fprintf(t.w, `<div style="margin-left: 4em">...</div>`)
- break
- }
- fmt.Fprintf(t.w, `<details style="margin-left: 4em"><summary>%s</summary>`, html.EscapeString(child.label))
- t.writeTree(child)
- fmt.Fprintf(t.w, "</details>\n")
- }
-
- // Render result.
- if node.err != nil {
- fmt.Fprintf(t.w, "Error: %s\n", html.EscapeString(node.err.Error()))
- } else {
- t.emit([]*Value{node.res}, []string{"res"}, node.env)
- }
-}
-
-func htmlSVG[Key comparable](t *htmlTracer, f func(Key), arg Key) string {
- if s, ok := t.svgs[arg]; ok {
- return s
- }
- var buf strings.Builder
- f(arg)
- t.dot.writeSvg(&buf)
- t.dot.clear()
- svg := buf.String()
- if t.svgs == nil {
- t.svgs = make(map[any]string)
- }
- t.svgs[arg] = svg
- buf.Reset()
- return svg
-}
-
-func (t *htmlTracer) emit(vs []*Value, labels []string, env envSet) {
- fmt.Fprintf(t.w, `<div class="unify">`)
- for i, v := range vs {
- fmt.Fprintf(t.w, `<div class="header" style="grid-column: %d">%s</div>`, i+1, html.EscapeString(labels[i]))
- fmt.Fprintf(t.w, `<div style="grid-area: 2 / %d">%s</div>`, i+1, htmlSVG(t, t.dot.valueSubgraph, v))
- }
- col := len(vs)
-
- fmt.Fprintf(t.w, `<div class="header" style="grid-column: %d">in</div>`, col+1)
- fmt.Fprintf(t.w, `<div style="grid-area: 2 / %d">%s</div>`, col+1, htmlSVG(t, t.dot.envSubgraph, env))
-
- fmt.Fprintf(t.w, `</div>`)
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
-)
-
-type Pos struct {
- Path string
- Line int
-}
-
-func (p Pos) String() string {
- var b []byte
- b, _ = p.AppendText(b)
- return string(b)
-}
-
-func (p Pos) AppendText(b []byte) ([]byte, error) {
- if p.Line == 0 {
- if p.Path == "" {
- return append(b, "?:?"...), nil
- } else {
- return append(b, p.Path...), nil
- }
- } else if p.Path == "" {
- return fmt.Appendf(b, "?:%d", p.Line), nil
- }
- return fmt.Appendf(b, "%s:%d", p.Path, p.Line), nil
-}
+++ /dev/null
-# In the original representation of environments, this caused an exponential
-# blowup in time and allocation. With that representation, this took about 20
-# seconds on my laptop and had a max RSS of ~12 GB. Big enough to be really
-# noticeable, but not so big it's likely to crash a developer machine. With the
-# better environment representation, it runs almost instantly and has an RSS of
-# ~90 MB.
-unify:
-- !sum
- - !sum [1, 2]
- - !sum [3, 4]
- - !sum [5, 6]
- - !sum [7, 8]
- - !sum [9, 10]
- - !sum [11, 12]
- - !sum [13, 14]
- - !sum [15, 16]
- - !sum [17, 18]
- - !sum [19, 20]
- - !sum [21, 22]
-- !sum
- - !sum [1, 2]
- - !sum [3, 4]
- - !sum [5, 6]
- - !sum [7, 8]
- - !sum [9, 10]
- - !sum [11, 12]
- - !sum [13, 14]
- - !sum [15, 16]
- - !sum [17, 18]
- - !sum [19, 20]
- - !sum [21, 22]
-all:
- [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
+++ /dev/null
-# Basic tests of unification
-
-#
-# Terminals
-#
-
-unify:
-- _
-- _
-want:
- _
----
-unify:
-- _
-- test
-want:
- test
----
-unify:
-- test
-- t?est
-want:
- test
----
-unify:
-- 1
-- 1
-want:
- 1
----
-unify:
-- test
-- foo
-want:
- _|_
-
-#
-# Tuple
-#
-
----
-unify:
-- [a, b]
-- [a, b]
-want:
- [a, b]
----
-unify:
-- [a, _]
-- [_, b]
-want:
- [a, b]
----
-unify:
-- ["ab?c", "de?f"]
-- [ac, def]
-want:
- [ac, def]
-
-#
-# Repeats
-#
-
----
-unify:
-- !repeat [a]
-- [_]
-want:
- [a]
----
-unify:
-- !repeat [a]
-- [_, _]
-want:
- [a, a]
----
-unify:
-- !repeat [a]
-- [b]
-want:
- _|_
----
-unify:
-- !repeat [xy*]
-- [x, xy, xyy]
-want:
- [x, xy, xyy]
----
-unify:
-- !repeat [xy*]
-- !repeat ["xz?y*"]
-- [x, xy, xyy]
-want:
- [x, xy, xyy]
----
-unify:
-- !repeat [!sum [a, b]]
-- [a, b, a]
-all:
-- [a, b, a]
----
-unify:
-- !repeat [!sum [a, b]]
-- !repeat [!sum [b, c]]
-- [b, b, b]
-all:
-- [b, b, b]
----
-unify:
-- !repeat [!sum [a, b]]
-- !repeat [!sum [b, c]]
-- [a]
-all: []
-
-#
-# Def
-#
-
----
-unify:
-- {a: a, b: b}
-- {a: a, b: b}
-want:
- {a: a, b: b}
----
-unify:
-- {a: a}
-- {b: b}
-want:
- {a: a, b: b}
-
-#
-# Sum
-#
-
----
-unify:
-- !sum [1, 2]
-- !sum [2, 3]
-all:
-- 2
----
-unify:
-- !sum [{label: a, value: abc}, {label: b, value: def}]
-- !sum [{value: "ab?c", extra: d}, {value: "def?", extra: g}]
-all:
-- {extra: d, label: a, value: abc}
-- {extra: g, label: b, value: def}
----
-# A sum of repeats must deal with different dynamically-created variables in
-# each branch.
-unify:
-- !sum [!repeat [a], !repeat [b]]
-- [a, a, a]
-all:
-- [a, a, a]
----
-unify:
-- !sum [!repeat [a], !repeat [b]]
-- [a, a, b]
-all: []
----
-# Exercise sumEnvs with more than one result
-unify:
-- !sum
- - [a|b, c|d]
- - [e, g]
-- [!sum [a, b, e, f], !sum [c, d, g, h]]
-all:
-- [a, c]
-- [a, d]
-- [b, c]
-- [b, d]
-- [e, g]
+++ /dev/null
-#
-# Basic tests
-#
-
-name: "basic string"
-unify:
-- $x
-- test
-all:
-- test
----
-name: "basic tuple"
-unify:
-- [$x, $x]
-- [test, test]
-all:
-- [test, test]
----
-name: "three tuples"
-unify:
-- [$x, $x]
-- [test, _]
-- [_, test]
-all:
-- [test, test]
----
-name: "basic def"
-unify:
-- {a: $x, b: $x}
-- {a: test, b: test}
-all:
-- {a: test, b: test}
----
-name: "three defs"
-unify:
-- {a: $x, b: $x}
-- {a: test}
-- {b: test}
-all:
-- {a: test, b: test}
-
-#
-# Bottom tests
-#
-
----
-name: "basic bottom"
-unify:
-- [$x, $x]
-- [test, foo]
-all: []
----
-name: "three-way bottom"
-unify:
-- [$x, $x]
-- [test, _]
-- [_, foo]
-all: []
-
-#
-# Basic sum tests
-#
-
----
-name: "basic sum"
-unify:
-- $x
-- !sum [a, b]
-all:
-- a
-- b
----
-name: "sum of tuples"
-unify:
-- [$x]
-- !sum [[a], [b]]
-all:
-- [a]
-- [b]
----
-name: "acausal sum"
-unify:
-- [_, !sum [a, b]]
-- [$x, $x]
-all:
-- [a, a]
-- [b, b]
-
-#
-# Transitivity tests
-#
-
----
-name: "transitivity"
-unify:
-- [_, _, _, test]
-- [$x, $x, _, _]
-- [ _, $x, $x, _]
-- [ _, _, $x, $x]
-all:
-- [test, test, test, test]
-
-#
-# Multiple vars
-#
-
----
-name: "basic uncorrelated vars"
-unify:
-- - !sum [1, 2]
- - !sum [3, 4]
-- - $a
- - $b
-all:
-- [1, 3]
-- [1, 4]
-- [2, 3]
-- [2, 4]
----
-name: "uncorrelated vars"
-unify:
-- - !sum [1, 2]
- - !sum [3, 4]
- - !sum [1, 2]
-- - $a
- - $b
- - $a
-all:
-- [1, 3, 1]
-- [1, 4, 1]
-- [2, 3, 2]
-- [2, 4, 2]
----
-name: "entangled vars"
-unify:
-- - !sum [[1,2],[3,4]]
- - !sum [[2,1],[3,4],[4,3]]
-- - [$a, $b]
- - [$b, $a]
-all:
-- - [1, 2]
- - [2, 1]
-- - [3, 4]
- - [4, 3]
-
-#
-# End-to-end examples
-#
-
----
-name: "end-to-end"
-unify:
-- go: Add
- in:
- - go: $t
- - go: $t
-- in: !repeat
- - !sum
- - go: Int32x4
- base: int
- - go: Uint32x4
- base: uint
-all:
-- go: Add
- in:
- - base: int
- go: Int32x4
- - base: int
- go: Int32x4
-- go: Add
- in:
- - base: uint
- go: Uint32x4
- - base: uint
- go: Uint32x4
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "io"
- "strings"
-
- "gopkg.in/yaml.v3"
-)
-
-// debugDotInHTML, if true, includes dot code for all graphs in the HTML. Useful
-// for debugging the dot output itself.
-const debugDotInHTML = false
-
-var Debug struct {
- // UnifyLog, if non-nil, receives a streaming text trace of unification.
- UnifyLog io.Writer
-
-	// HTML, if non-nil, receives an HTML trace of unification.
- HTML io.Writer
-}
-
-type tracer struct {
- logw io.Writer
-
- enc yamlEncoder // Print consistent idents throughout
-
- saveTree bool // if set, record tree; required for HTML output
-
- path []string
-
- node *traceTree
- trees []*traceTree
-}
-
-type traceTree struct {
- label string // Identifies this node as a child of parent
- v, w *Value // Unification inputs
- envIn envSet
- res *Value // Unification result
- env envSet
- err error // or error
-
- parent *traceTree
- children []*traceTree
-}
-
-type tracerExit struct {
- t *tracer
- len int
- node *traceTree
-}
-
-func (t *tracer) enter(pat string, vals ...any) tracerExit {
- if t == nil {
- return tracerExit{}
- }
-
- label := fmt.Sprintf(pat, vals...)
-
- var p *traceTree
- if t.saveTree {
- p = t.node
- if p != nil {
- t.node = &traceTree{label: label, parent: p}
- p.children = append(p.children, t.node)
- }
- }
-
- t.path = append(t.path, label)
- return tracerExit{t, len(t.path) - 1, p}
-}
-
-func (t *tracer) enterVar(id *ident, branch int) tracerExit {
- if t == nil {
- return tracerExit{}
- }
-
- // Use the tracer's ident printer
- return t.enter("Var %s br %d", t.enc.idp.unique(id), branch)
-}
-
-func (te tracerExit) exit() {
- if te.t == nil {
- return
- }
- te.t.path = te.t.path[:te.len]
- te.t.node = te.node
-}
-
-func indentf(prefix string, pat string, vals ...any) string {
- s := fmt.Sprintf(pat, vals...)
- if len(prefix) == 0 {
- return s
- }
- if !strings.Contains(s, "\n") {
- return prefix + s
- }
-
- indent := prefix
- if strings.TrimLeft(prefix, " ") != "" {
-		// Prefix has non-space characters in it. Construct an all-space indent.
- indent = strings.Repeat(" ", len(prefix))
- }
- return prefix + strings.ReplaceAll(s, "\n", "\n"+indent)
-}
-
-func yamlf(prefix string, node *yaml.Node) string {
- b, err := yaml.Marshal(node)
- if err != nil {
- return fmt.Sprintf("<marshal failed: %s>", err)
- }
- return strings.TrimRight(indentf(prefix, "%s", b), " \n")
-}
-
-func (t *tracer) logf(pat string, vals ...any) {
- if t == nil || t.logw == nil {
- return
- }
- prefix := fmt.Sprintf("[%s] ", strings.Join(t.path, "/"))
- s := indentf(prefix, pat, vals...)
- s = strings.TrimRight(s, " \n")
- fmt.Fprintf(t.logw, "%s\n", s)
-}
-
-func (t *tracer) traceUnify(v, w *Value, e envSet) {
- if t == nil {
- return
- }
-
- t.enc.e = e // Interpret values w.r.t. e
- t.logf("Unify\n%s\nwith\n%s\nin\n%s",
- yamlf(" ", t.enc.value(v)),
- yamlf(" ", t.enc.value(w)),
- yamlf(" ", t.enc.env(e)))
- t.enc.e = envSet{}
-
- if t.saveTree {
- if t.node == nil {
- t.node = &traceTree{}
- t.trees = append(t.trees, t.node)
- }
- t.node.v, t.node.w, t.node.envIn = v, w, e
- }
-}
-
-func (t *tracer) traceDone(res *Value, e envSet, err error) {
- if t == nil {
- return
- }
-
- if err != nil {
- t.logf("==> %s", err)
- } else {
- t.logf("==>\n%s", yamlf(" ", t.enc.closure(Closure{res, e})))
- }
-
- if t.saveTree {
- node := t.node
- if node == nil {
- panic("popped top of trace stack")
- }
- node.res, node.err = res, err
- node.env = e
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package unify implements unification of structured values.
-//
-// A [Value] represents a possibly infinite set of concrete values, where a
-// value is either a string ([String]), a tuple of values ([Tuple]), or a
-// string-keyed map of values called a "def" ([Def]). These sets can be further
-// constrained by variables ([Var]). A [Value] combined with bindings of
-// variables is a [Closure].
-//
-// [Unify] finds a [Closure] that satisfies two or more other [Closure]s. This
-// can be thought of as intersecting the sets represented by these Closures'
-// values, or as the greatest lower bound/infimum of these Closures. If no such
-// Closure exists, the result of unification is "bottom", or the empty set.
-//
-// # Examples
-//
-// The regular expression "a*" is the infinite set of strings of zero or more
-// "a"s. "a*" can be unified with "a" or "aa" or "aaa", and the result is just
-// "a", "aa", or "aaa", respectively. However, unifying "a*" with "b" fails
-// because there are no values that satisfy both.
-//
-// Sums express sets directly. For example, !sum [a, b] is the set consisting of
-// "a" and "b". Unifying this with !sum [b, c] results in just "b". This also
-// makes it easy to demonstrate that unification isn't necessarily a single
-// concrete value. For example, unifying !sum [a, b, c] with !sum [b, c, d]
-// results in two concrete values: "b" and "c".
-//
-// The special value _ or "top" represents all possible values. Unifying _ with
-// any value x results in x.
-//
-// Unifying composite values—tuples and defs—unifies their elements.
-//
-// The value [a*, aa] is an infinite set of tuples. If we unify that with the
-// value [aaa, a*], the only possible value that satisfies both is [aaa, aa].
-// Likewise, this is the intersection of the sets described by these two values.
-//
-// Defs are similar to tuples, but they are indexed by strings and don't have a
-// fixed length. For example, {x: a, y: b} is a def with two fields. Any field
-// not mentioned in a def is implicitly top. Thus, unifying this with {y: b, z:
-// c} results in {x: a, y: b, z: c}.
-//
-// Variables constrain values. For example, the value [$x, $x] represents all
-// tuples whose first and second values are the same, but doesn't otherwise
-// constrain that value. Thus, this set includes [a, a] as well as [[b, c, d],
-// [b, c, d]], but it doesn't include [a, b].
-//
-// Sums are internally implemented as fresh variables that are simultaneously
-// bound to all values of the sum. That is, !sum [a, b] is actually $var (where
-// var is some fresh name), closed under the environment $var=a | $var=b.
-package unify
-
-import (
- "errors"
- "fmt"
- "slices"
-)
-
-// Unify computes a Closure that satisfies each input Closure. If no such
-// Closure exists, it returns bottom.
-func Unify(closures ...Closure) (Closure, error) {
- if len(closures) == 0 {
- return Closure{topValue, topEnv}, nil
- }
-
- var trace *tracer
- if Debug.UnifyLog != nil || Debug.HTML != nil {
- trace = &tracer{
- logw: Debug.UnifyLog,
- saveTree: Debug.HTML != nil,
- }
- }
-
- unified := closures[0]
- for _, c := range closures[1:] {
- var err error
- uf := newUnifier()
- uf.tracer = trace
- e := crossEnvs(unified.env, c.env)
- unified.val, unified.env, err = unified.val.unify(c.val, e, false, uf)
- if Debug.HTML != nil {
- uf.writeHTML(Debug.HTML)
- }
- if err != nil {
- return Closure{}, err
- }
- }
-
- return unified, nil
-}
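-
-// A minimal usage sketch, assuming two inputs in the YAML form accepted by
-// [Read] (file names and error handling are illustrative only):
-//
-//	a, _ := Read(strings.NewReader("[$x, $x]"), "a.yaml", ReadOpts{})
-//	b, _ := Read(strings.NewReader("[test, _]"), "b.yaml", ReadOpts{})
-//	c, _ := Unify(a, b)
-//	results := slices.Collect(c.All()) // one concrete Value: [test, test]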
-
-type unifier struct {
- *tracer
-}
-
-func newUnifier() *unifier {
- return &unifier{}
-}
-
-// errDomains is a sentinel error used between unify and unify1 to indicate that
-// unify1 could not unify the domains of the two values.
-var errDomains = errors.New("cannot unify domains")
-
-func (v *Value) unify(w *Value, e envSet, swap bool, uf *unifier) (*Value, envSet, error) {
- if swap {
- // Put the values in order. This just happens to be a handy choke-point
- // to do this at.
- v, w = w, v
- }
-
- uf.traceUnify(v, w, e)
-
- d, e2, err := v.unify1(w, e, false, uf)
- if err == errDomains {
- // Try the other order.
- d, e2, err = w.unify1(v, e, true, uf)
- if err == errDomains {
- // Okay, we really can't unify these.
- err = fmt.Errorf("cannot unify %T (%s) and %T (%s): kind mismatch", v.Domain, v.PosString(), w.Domain, w.PosString())
- }
- }
- if err != nil {
- uf.traceDone(nil, envSet{}, err)
- return nil, envSet{}, err
- }
- res := unified(d, v, w)
- uf.traceDone(res, e2, nil)
- if d == nil {
- // Double check that a bottom Value also has a bottom env.
- if !e2.isEmpty() {
- panic("bottom Value has non-bottom environment")
- }
- }
-
- return res, e2, nil
-}
-
-func (v *Value) unify1(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
- // TODO: If there's an error, attach position information to it.
-
- vd, wd := v.Domain, w.Domain
-
- // Bottom returns bottom, and eliminates all possible environments.
- if vd == nil || wd == nil {
- return nil, bottomEnv, nil
- }
-
- // Top always returns the other.
- if _, ok := vd.(Top); ok {
- return wd, e, nil
- }
-
- // Variables
- if vd, ok := vd.(Var); ok {
- return vd.unify(w, e, swap, uf)
- }
-
- // Composite values
- if vd, ok := vd.(Def); ok {
- if wd, ok := wd.(Def); ok {
- return vd.unify(wd, e, swap, uf)
- }
- }
- if vd, ok := vd.(Tuple); ok {
- if wd, ok := wd.(Tuple); ok {
- return vd.unify(wd, e, swap, uf)
- }
- }
-
- // Scalar values
- if vd, ok := vd.(String); ok {
- if wd, ok := wd.(String); ok {
- res := vd.unify(wd)
- if res == nil {
- e = bottomEnv
- }
- return res, e, nil
- }
- }
-
- return nil, envSet{}, errDomains
-}
-
-func (d Def) unify(o Def, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
- out := Def{fields: make(map[string]*Value)}
-
- // Check keys of d against o.
- for key, dv := range d.All() {
- ov, ok := o.fields[key]
- if !ok {
- // ov is implicitly Top. Bypass unification.
- out.fields[key] = dv
- continue
- }
- exit := uf.enter("%s", key)
- res, e2, err := dv.unify(ov, e, swap, uf)
- exit.exit()
- if err != nil {
- return nil, envSet{}, err
- } else if res.Domain == nil {
- // No match.
- return nil, bottomEnv, nil
- }
- out.fields[key] = res
- e = e2
- }
- // Check keys of o that we didn't already check. These all implicitly match
- // because we know the corresponding fields in d are all Top.
- for key, dv := range o.All() {
- if _, ok := d.fields[key]; !ok {
- out.fields[key] = dv
- }
- }
- return out, e, nil
-}
-
-func (v Tuple) unify(w Tuple, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
- if v.repeat != nil && w.repeat != nil {
- // Since we generate the content of these lazily, there's not much we
- // can do but just stick them on a list to unify later.
- return Tuple{repeat: concat(v.repeat, w.repeat)}, e, nil
- }
-
- // Expand any repeated tuples.
- tuples := make([]Tuple, 0, 2)
- if v.repeat == nil {
- tuples = append(tuples, v)
- } else {
- v2, e2 := v.doRepeat(e, len(w.vs))
- tuples = append(tuples, v2...)
- e = e2
- }
- if w.repeat == nil {
- tuples = append(tuples, w)
- } else {
- w2, e2 := w.doRepeat(e, len(v.vs))
- tuples = append(tuples, w2...)
- e = e2
- }
-
- // Now unify all of the tuples (usually this will be just 2 tuples)
- out := tuples[0]
- for _, t := range tuples[1:] {
- if len(out.vs) != len(t.vs) {
- uf.logf("tuple length mismatch")
- return nil, bottomEnv, nil
- }
- zs := make([]*Value, len(out.vs))
- for i, v1 := range out.vs {
- exit := uf.enter("%d", i)
- z, e2, err := v1.unify(t.vs[i], e, swap, uf)
- exit.exit()
- if err != nil {
- return nil, envSet{}, err
- } else if z.Domain == nil {
- return nil, bottomEnv, nil
- }
- zs[i] = z
- e = e2
- }
- out = Tuple{vs: zs}
- }
-
- return out, e, nil
-}
-
-// doRepeat creates a fixed-length tuple from a repeated tuple. The caller is
-// expected to unify the returned tuples.
-func (v Tuple) doRepeat(e envSet, n int) ([]Tuple, envSet) {
- res := make([]Tuple, len(v.repeat))
- for i, gen := range v.repeat {
- res[i].vs = make([]*Value, n)
- for j := range n {
- res[i].vs[j], e = gen(e)
- }
- }
- return res, e
-}
-
-// unify intersects the domains of two [String]s. If it can prove that this
-// domain is empty, it returns nil (bottom).
-//
-// TODO: Consider splitting literals and regexps into two domains.
-func (v String) unify(w String) Domain {
- // Unification is symmetric, so put them in order of string kind so we only
- // have to deal with half the cases.
- if v.kind > w.kind {
- v, w = w, v
- }
-
- switch v.kind {
- case stringRegex:
- switch w.kind {
- case stringRegex:
- // Construct a match against all of the regexps
- return String{kind: stringRegex, re: slices.Concat(v.re, w.re)}
- case stringExact:
- for _, re := range v.re {
- if !re.MatchString(w.exact) {
- return nil
- }
- }
- return w
- }
- case stringExact:
- if v.exact != w.exact {
- return nil
- }
- return v
- }
- panic("bad string kind")
-}
-
-func concat[T any](s1, s2 []T) []T {
- // Reuse s1 or s2 if possible.
- if len(s1) == 0 {
- return s2
- }
- return append(s1[:len(s1):len(s1)], s2...)
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "bytes"
- "fmt"
- "io"
- "os"
- "path/filepath"
- "slices"
- "strings"
- "testing"
-
- "gopkg.in/yaml.v3"
-)
-
-func TestUnify(t *testing.T) {
- paths, err := filepath.Glob("testdata/*")
- if err != nil {
- t.Fatal(err)
- }
- if len(paths) == 0 {
- t.Fatal("no testdata found")
- }
- for _, path := range paths {
- // Skip paths starting with _ so experimental files can be added.
- base := filepath.Base(path)
- if base[0] == '_' {
- continue
- }
- if !strings.HasSuffix(base, ".yaml") {
- t.Errorf("non-.yaml file in testdata: %s", base)
- continue
- }
- base = strings.TrimSuffix(base, ".yaml")
-
- t.Run(base, func(t *testing.T) {
- testUnify(t, path)
- })
- }
-}
-
-func testUnify(t *testing.T, path string) {
- f, err := os.Open(path)
- if err != nil {
- t.Fatal(err)
- }
- defer f.Close()
-
- type testCase struct {
- Skip bool
- Name string
- Unify []Closure
- Want yaml.Node
- All yaml.Node
- }
- dec := yaml.NewDecoder(f)
-
- for i := 0; ; i++ {
- var tc testCase
- err := dec.Decode(&tc)
- if err == io.EOF {
- break
- }
- if err != nil {
- t.Fatal(err)
- }
-
- name := tc.Name
- if name == "" {
- name = fmt.Sprint(i)
- }
-
- t.Run(name, func(t *testing.T) {
- if tc.Skip {
- t.Skip("skip: true set in test case")
- }
-
- defer func() {
- p := recover()
- if p != nil || t.Failed() {
- // Redo with a trace
- //
- // TODO: Use t.Output() in Go 1.25.
- var buf bytes.Buffer
- Debug.UnifyLog = &buf
- func() {
- defer func() {
- // If the original unify panicked, the second one
- // probably will, too. Ignore it and let the first panic
- // bubble.
- recover()
- }()
- Unify(tc.Unify...)
- }()
- Debug.UnifyLog = nil
- t.Logf("Trace:\n%s", buf.String())
- }
- if p != nil {
- panic(p)
- }
- }()
-
- // Unify the test cases
- //
- // TODO: Try reordering the inputs also
- c, err := Unify(tc.Unify...)
- if err != nil {
- // TODO: Tests of errors
- t.Fatal(err)
- }
-
- // Encode the result back to YAML so we can check if it's structurally
- // equal.
- clean := func(val any) *yaml.Node {
- var node yaml.Node
- node.Encode(val)
- for n := range allYamlNodes(&node) {
- // Canonicalize the style. There may be other style flags we need to
- // muck with.
- n.Style &^= yaml.FlowStyle
- n.HeadComment = ""
- n.LineComment = ""
- n.FootComment = ""
- }
- return &node
- }
- check := func(gotVal any, wantNode *yaml.Node) {
- got, err := yaml.Marshal(clean(gotVal))
- if err != nil {
- t.Fatalf("Encoding Value back to yaml failed: %s", err)
- }
- want, err := yaml.Marshal(clean(wantNode))
- if err != nil {
- t.Fatalf("Encoding Want back to yaml failed: %s", err)
- }
-
- if !bytes.Equal(got, want) {
- t.Errorf("%s:%d:\nwant:\n%sgot\n%s", f.Name(), wantNode.Line, want, got)
- }
- }
- if tc.Want.Kind != 0 {
- check(c.val, &tc.Want)
- }
- if tc.All.Kind != 0 {
- fVal := slices.Collect(c.All())
- check(fVal, &tc.All)
- }
- })
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "fmt"
- "iter"
- "reflect"
-)
-
-// A Value represents a structured, non-deterministic value consisting of
-// strings, tuples of Values, and string-keyed maps of Values. A
-// non-deterministic Value will also contain variables, which are resolved via
-// an environment as part of a [Closure].
-//
-// For debugging, a Value can also track the source position it was read from in
-// an input file, and its provenance from other Values.
-type Value struct {
- Domain Domain
-
- // A Value has either a pos or parents (or neither).
- pos *Pos
- parents *[2]*Value
-}
-
-var (
- topValue = &Value{Domain: Top{}}
- bottomValue = &Value{Domain: nil}
-)
-
-// NewValue returns a new [Value] with the given domain and no position
-// information.
-func NewValue(d Domain) *Value {
- return &Value{Domain: d}
-}
-
-// NewValuePos returns a new [Value] with the given domain at position p.
-func NewValuePos(d Domain, p Pos) *Value {
- return &Value{Domain: d, pos: &p}
-}
-
-// newValueFrom returns a new [Value] with the given domain that copies the
-// position information of p.
-func newValueFrom(d Domain, p *Value) *Value {
- return &Value{Domain: d, pos: p.pos, parents: p.parents}
-}
-
-func unified(d Domain, p1, p2 *Value) *Value {
- return &Value{Domain: d, parents: &[2]*Value{p1, p2}}
-}
-
-func (v *Value) Pos() Pos {
- if v.pos == nil {
- return Pos{}
- }
- return *v.pos
-}
-
-func (v *Value) PosString() string {
- var b []byte
- for root := range v.Provenance() {
- if len(b) > 0 {
- b = append(b, ' ')
- }
- b, _ = root.pos.AppendText(b)
- }
- return string(b)
-}
-
-func (v *Value) WhyNotExact() string {
- if v.Domain == nil {
- return "v.Domain is nil"
- }
- return v.Domain.WhyNotExact()
-}
-
-func (v *Value) Exact() bool {
- if v.Domain == nil {
- return false
- }
- return v.Domain.Exact()
-}
-
-// Decode decodes v into a Go value.
-//
-// v must be exact, except that it can include Top. into must be a pointer.
-// [Def]s are decoded into structs. [Tuple]s are decoded into slices. [String]s
-// are decoded into strings or ints. Any field can itself be a pointer to one of
-// these types. Top can be decoded into a pointer-typed field and will set the
-// field to nil. Anything else will allocate a value if necessary.
-//
-// Any type may implement [Decoder], in which case its DecodeUnified method will
-// be called instead of using the default decoding scheme.
-func (v *Value) Decode(into any) error {
- rv := reflect.ValueOf(into)
- if rv.Kind() != reflect.Pointer {
- return fmt.Errorf("cannot decode into non-pointer %T", into)
- }
- return decodeReflect(v, rv.Elem())
-}
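-
-// A minimal sketch, assuming v is an exact Value for the tuple [1, 2, 3]
-// (for example, the result of unifying exact YAML inputs):
-//
-//	var got []int
-//	if err := v.Decode(&got); err != nil {
-//		// handle error
-//	}
-//	// got == []int{1, 2, 3}: the Tuple fills the slice and each exact
-//	// String is parsed into an int.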
-
-func decodeReflect(v *Value, rv reflect.Value) error {
- var ptr reflect.Value
- if rv.Kind() == reflect.Pointer {
- if rv.IsNil() {
- // Transparently allocate through pointers, *except* for Top, which
- // wants to set the pointer to nil.
- //
- // TODO: Drop this condition if I switch to an explicit Optional[T]
- // or move the Top logic into Def.
- if _, ok := v.Domain.(Top); !ok {
- // Allocate the value to fill in, but don't actually store it in
- // the pointer until we successfully decode.
- ptr = rv
- rv = reflect.New(rv.Type().Elem()).Elem()
- }
- } else {
- rv = rv.Elem()
- }
- }
-
- var err error
- if reflect.PointerTo(rv.Type()).Implements(decoderType) {
- // Use the custom decoder.
- err = rv.Addr().Interface().(Decoder).DecodeUnified(v)
- } else {
- err = v.Domain.decode(rv)
- }
- if err == nil && ptr.IsValid() {
- ptr.Set(rv.Addr())
- }
- return err
-}
-
-// Decoder can be implemented by types as a custom implementation of [Decode]
-// for that type.
-type Decoder interface {
- DecodeUnified(v *Value) error
-}
-
-var decoderType = reflect.TypeOf((*Decoder)(nil)).Elem()
-
-// Provenance iterates over all of the source Values that have contributed to
-// this Value.
-func (v *Value) Provenance() iter.Seq[*Value] {
- return func(yield func(*Value) bool) {
- var rec func(d *Value) bool
- rec = func(d *Value) bool {
- if d.pos != nil {
- if !yield(d) {
- return false
- }
- }
- if d.parents != nil {
- for _, p := range d.parents {
- if !rec(p) {
- return false
- }
- }
- }
- return true
- }
- rec(v)
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "reflect"
- "slices"
- "testing"
-)
-
-func ExampleClosure_All_tuple() {
- v := mustParse(`
-- !sum [1, 2]
-- !sum [3, 4]
-`)
- printYaml(slices.Collect(v.All()))
-
- // Output:
- // - [1, 3]
- // - [1, 4]
- // - [2, 3]
- // - [2, 4]
-}
-
-func ExampleClosure_All_def() {
- v := mustParse(`
-a: !sum [1, 2]
-b: !sum [3, 4]
-c: 5
-`)
- printYaml(slices.Collect(v.All()))
-
- // Output:
- // - {a: 1, b: 3, c: 5}
- // - {a: 1, b: 4, c: 5}
- // - {a: 2, b: 3, c: 5}
- // - {a: 2, b: 4, c: 5}
-}
-
-func checkDecode[T any](t *testing.T, got *Value, want T) {
- var gotT T
- if err := got.Decode(&gotT); err != nil {
- t.Fatalf("Decode failed: %v", err)
- }
- if !reflect.DeepEqual(&gotT, &want) {
- t.Fatalf("got:\n%s\nwant:\n%s", prettyYaml(gotT), prettyYaml(want))
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "errors"
- "fmt"
- "io"
- "io/fs"
- "os"
- "path/filepath"
- "regexp"
- "strings"
-
- "gopkg.in/yaml.v3"
-)
-
-// ReadOpts provides options to [Read] and related functions. The zero value is
-// the default options.
-type ReadOpts struct {
- // FS, if non-nil, is the file system from which to resolve !import file
- // names.
- FS fs.FS
-}
-
-// Read reads a [Closure] in YAML format from r, using path for error messages.
-//
-// It maps YAML nodes into terminal Values as follows:
-//
-// - "_" or !top _ is the top value ([Top]).
-//
-// - "_|_" or !bottom _ is the bottom value. This is an error during
-// unmarshaling, but can appear in marshaled values.
-//
-// - "$<name>" or !var <name> is a variable ([Var]). Everywhere the same name
-// appears within a single unmarshal operation, it is mapped to the same
-// variable. Different unmarshal operations get different variables, even if
-// they have the same string name.
-//
-// - !regex "x" is a regular expression ([String]), as is any string that
-// doesn't match "_", "_|_", or "$...". Regular expressions are implicitly
-// anchored at the beginning and end. If the string doesn't contain any
-// meta-characters (that is, it's a "literal" regular expression), then it's
-// treated as an exact string.
-//
-// - !string "x", or any int, float, bool, or binary value is an exact string
-// ([String]).
-//
-// - !regex [x, y, ...] is an intersection of regular expressions ([String]).
-//
-// It maps YAML nodes into non-terminal Values as follows:
-//
-// - Sequence nodes like [x, y, z] are tuples ([Tuple]).
-//
-// - !repeat [x] is a repeated tuple ([Tuple]), which is 0 or more instances of
-// x. There must be exactly one element in the list.
-//
-// - Mapping nodes like {a: x, b: y} are defs ([Def]). Any fields not listed are
-// implicitly top.
-//
-// - !sum [x, y, z] is a sum of its children. This can be thought of as a union
-// of the values x, y, and z, or as a non-deterministic choice between x, y, and
-// z. If a variable appears both inside the sum and outside of it, only the
-// non-deterministic choice view really works. The unifier does not directly
-// implement sums; instead, this is decoded as a fresh variable that's
-// simultaneously bound to x, y, and z.
-//
-// - !import glob is like a !sum, but its children are read from all files
-// matching the given glob pattern, which is interpreted relative to the current
-// file path. Each file gets its own variable scope.
-func Read(r io.Reader, path string, opts ReadOpts) (Closure, error) {
- dec := yamlDecoder{opts: opts, path: path, env: topEnv}
- v, err := dec.read(r)
- if err != nil {
- return Closure{}, err
- }
- return dec.close(v), nil
-}
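-
-// A minimal sketch tying the forms above together (the keys and file name are
-// only illustrative): a def with a variable, a repeated sum, and an explicit
-// top field.
-//
-//	src := "{go: $t, in: !repeat [!sum [a, b]], out: _}"
-//	c, err := Read(strings.NewReader(src), "example.yaml", ReadOpts{})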
-
-// ReadFile reads a [Closure] in YAML format from a file.
-//
-// The file must consist of a single YAML document.
-//
-// If opts.FS is not set, this sets it to a FS rooted at path's directory.
-//
-// See [Read] for details.
-func ReadFile(path string, opts ReadOpts) (Closure, error) {
- f, err := os.Open(path)
- if err != nil {
- return Closure{}, err
- }
- defer f.Close()
-
- if opts.FS == nil {
- opts.FS = os.DirFS(filepath.Dir(path))
- }
-
- return Read(f, path, opts)
-}
-
-// UnmarshalYAML implements [yaml.Unmarshaler].
-//
-// Since there is no way to pass [ReadOpts] to this function, it assumes default
-// options.
-func (c *Closure) UnmarshalYAML(node *yaml.Node) error {
- dec := yamlDecoder{path: "<yaml.Node>", env: topEnv}
- v, err := dec.root(node)
- if err != nil {
- return err
- }
- *c = dec.close(v)
- return nil
-}
-
-type yamlDecoder struct {
- opts ReadOpts
- path string
-
- vars map[string]*ident
- nSums int
-
- env envSet
-}
-
-func (dec *yamlDecoder) read(r io.Reader) (*Value, error) {
- n, err := readOneNode(r)
- if err != nil {
- return nil, fmt.Errorf("%s: %w", dec.path, err)
- }
-
- // Decode YAML node to a Value
- v, err := dec.root(n)
- if err != nil {
- return nil, fmt.Errorf("%s: %w", dec.path, err)
- }
-
- return v, nil
-}
-
-// readOneNode reads a single YAML document from r and returns an error if there
-// are more documents in r.
-func readOneNode(r io.Reader) (*yaml.Node, error) {
- yd := yaml.NewDecoder(r)
-
- // Decode as a YAML node
- var node yaml.Node
- if err := yd.Decode(&node); err != nil {
- return nil, err
- }
- np := &node
- if np.Kind == yaml.DocumentNode {
- np = node.Content[0]
- }
-
- // Ensure there are no more YAML docs in this file
- if err := yd.Decode(nil); err == nil {
- return nil, fmt.Errorf("must not contain multiple documents")
- } else if err != io.EOF {
- return nil, err
- }
-
- return np, nil
-}
-
-// root parses the root of a file.
-func (dec *yamlDecoder) root(node *yaml.Node) (*Value, error) {
- // Prepare for variable name resolution in this file. This may be a nested
- // root, so restore the current values when we're done.
- oldVars, oldNSums := dec.vars, dec.nSums
- defer func() {
- dec.vars, dec.nSums = oldVars, oldNSums
- }()
- dec.vars = make(map[string]*ident, 0)
- dec.nSums = 0
-
- return dec.value(node)
-}
-
-// close wraps a decoded [Value] into a [Closure].
-func (dec *yamlDecoder) close(v *Value) Closure {
- return Closure{v, dec.env}
-}
-
-func (dec *yamlDecoder) value(node *yaml.Node) (vOut *Value, errOut error) {
- pos := &Pos{Path: dec.path, Line: node.Line}
-
- // Resolve alias nodes.
- if node.Kind == yaml.AliasNode {
- node = node.Alias
- }
-
- mk := func(d Domain) (*Value, error) {
- v := &Value{Domain: d, pos: pos}
- return v, nil
- }
- mk2 := func(d Domain, err error) (*Value, error) {
- if err != nil {
- return nil, err
- }
- return mk(d)
- }
-
- // is tests the kind and long tag of node.
- is := func(kind yaml.Kind, tag string) bool {
- return node.Kind == kind && node.LongTag() == tag
- }
- isExact := func() bool {
- if node.Kind != yaml.ScalarNode {
- return false
- }
- // We treat any string-ish YAML node as a string.
- switch node.LongTag() {
- case "!string", "tag:yaml.org,2002:int", "tag:yaml.org,2002:float", "tag:yaml.org,2002:bool", "tag:yaml.org,2002:binary":
- return true
- }
- return false
- }
-
- // !!str nodes provide a short-hand syntax for several leaf domains that are
- // also available under explicit tags. To simplify checking below, we set
- // strVal to non-"" only for !!str nodes.
- strVal := ""
- isStr := is(yaml.ScalarNode, "tag:yaml.org,2002:str")
- if isStr {
- strVal = node.Value
- }
-
- switch {
- case is(yaml.ScalarNode, "!var"):
- strVal = "$" + node.Value
- fallthrough
- case strings.HasPrefix(strVal, "$"):
- id, ok := dec.vars[strVal]
- if !ok {
- // We encode different idents with the same string name by adding a
- // #N suffix. Strip that off so it doesn't accumulate. This isn't
- // meant to be used in user-written input, though nothing stops that.
- name, _, _ := strings.Cut(strVal, "#")
- id = &ident{name: name}
- dec.vars[strVal] = id
- dec.env = dec.env.bind(id, topValue)
- }
- return mk(Var{id: id})
-
- case strVal == "_" || is(yaml.ScalarNode, "!top"):
- return mk(Top{})
-
- case strVal == "_|_" || is(yaml.ScalarNode, "!bottom"):
- return nil, errors.New("found bottom")
-
- case isExact():
- val := node.Value
- return mk(NewStringExact(val))
-
- case isStr || is(yaml.ScalarNode, "!regex"):
- // Any other string we treat as a regex. This will produce an exact
- // string anyway if the regex is literal.
- val := node.Value
- return mk2(NewStringRegex(val))
-
- case is(yaml.SequenceNode, "!regex"):
- var vals []string
- if err := node.Decode(&vals); err != nil {
- return nil, err
- }
- return mk2(NewStringRegex(vals...))
-
- case is(yaml.MappingNode, "tag:yaml.org,2002:map"):
- var db DefBuilder
- for i := 0; i < len(node.Content); i += 2 {
- key := node.Content[i]
- if key.Kind != yaml.ScalarNode {
- return nil, fmt.Errorf("non-scalar key %q", key.Value)
- }
- val, err := dec.value(node.Content[i+1])
- if err != nil {
- return nil, err
- }
- db.Add(key.Value, val)
- }
- return mk(db.Build())
-
- case is(yaml.SequenceNode, "tag:yaml.org,2002:seq"):
- elts := node.Content
- vs := make([]*Value, 0, len(elts))
- for _, elt := range elts {
- v, err := dec.value(elt)
- if err != nil {
- return nil, err
- }
- vs = append(vs, v)
- }
- return mk(NewTuple(vs...))
-
- case is(yaml.SequenceNode, "!repeat") || is(yaml.SequenceNode, "!repeat-unify"):
- // !repeat must have one child. !repeat-unify is used internally for
- // delayed unification, and is the same, it's just allowed to have more
- // than one child.
- if node.LongTag() == "!repeat" && len(node.Content) != 1 {
- return nil, fmt.Errorf("!repeat must have exactly one child")
- }
-
- // Decode the children to make sure they're well-formed, but otherwise
- // discard that decoding and do it again every time we need a new
- // element.
- var gen []func(e envSet) (*Value, envSet)
- origEnv := dec.env
- elts := node.Content
- for i, elt := range elts {
- _, err := dec.value(elt)
- if err != nil {
- return nil, err
- }
- // Undo any effects on the environment. We *do* keep any named
- // variables that were added to the vars map in case they were
- // introduced within the element.
- //
- // TODO: If we change how we implement repeat nodes, we might be
- // able to drop yamlEncoder.env and yamlDecoder.env.
- dec.env = origEnv
- // Add a generator function
- gen = append(gen, func(e envSet) (*Value, envSet) {
- dec.env = e
- // TODO: If this is in a sum, this tends to generate a ton of
- // fresh variables that are different on each branch of the
- // parent sum. Does it make sense to hold on to the i'th value
- // of the tuple after we've generated it?
- v, err := dec.value(elts[i])
- if err != nil {
- // It worked the first time, so this really shouldn't hapen.
- panic("decoding repeat element failed")
- }
- return v, dec.env
- })
- }
- return mk(NewRepeat(gen...))
-
- case is(yaml.SequenceNode, "!sum"):
- vs := make([]*Value, 0, len(node.Content))
- for _, elt := range node.Content {
- v, err := dec.value(elt)
- if err != nil {
- return nil, err
- }
- vs = append(vs, v)
- }
- if len(vs) == 1 {
- return vs[0], nil
- }
-
- // A sum is implemented as a fresh variable that's simultaneously bound
- // to each of the descendants.
- id := &ident{name: fmt.Sprintf("sum%d", dec.nSums)}
- dec.nSums++
- dec.env = dec.env.bind(id, vs...)
- return mk(Var{id: id})
-
- case is(yaml.ScalarNode, "!import"):
- if dec.opts.FS == nil {
- return nil, fmt.Errorf("!import not allowed (ReadOpts.FS not set)")
- }
- pat := node.Value
-
- if !fs.ValidPath(pat) {
- // This will result in Glob returning no results. Give a more useful
- // error message for this case.
- return nil, fmt.Errorf("!import path must not contain '.' or '..'")
- }
-
- ms, err := fs.Glob(dec.opts.FS, pat)
- if err != nil {
- return nil, fmt.Errorf("resolving !import: %w", err)
- }
- if len(ms) == 0 {
- return nil, fmt.Errorf("!import did not match any files")
- }
-
- // Parse each file
- vs := make([]*Value, 0, len(ms))
- for _, m := range ms {
- v, err := dec.import1(m)
- if err != nil {
- return nil, err
- }
- vs = append(vs, v)
- }
-
- // Create a sum.
- if len(vs) == 1 {
- return vs[0], nil
- }
- id := &ident{name: "import"}
- dec.env = dec.env.bind(id, vs...)
- return mk(Var{id: id})
- }
-
- return nil, fmt.Errorf("unknown node kind %d %v", node.Kind, node.Tag)
-}
-
-func (dec *yamlDecoder) import1(path string) (*Value, error) {
- // Make sure we can open the path first.
- f, err := dec.opts.FS.Open(path)
- if err != nil {
- return nil, fmt.Errorf("!import failed: %w", err)
- }
- defer f.Close()
-
- // Prepare the enter path.
- oldFS, oldPath := dec.opts.FS, dec.path
- defer func() {
- dec.opts.FS, dec.path = oldFS, oldPath
- }()
-
- // Enter path, which is relative to the current path's directory.
- newPath := filepath.Join(filepath.Dir(dec.path), path)
- subFS, err := fs.Sub(dec.opts.FS, filepath.Dir(path))
- if err != nil {
- return nil, err
- }
- dec.opts.FS, dec.path = subFS, newPath
-
- // Parse the file.
- return dec.read(f)
-}
-
-type yamlEncoder struct {
- idp identPrinter
- e envSet // We track the environment for !repeat nodes.
-}
-
-// TODO: Switch some Value marshaling to Closure?
-
-func (c Closure) MarshalYAML() (any, error) {
- // TODO: If the environment is trivial, just marshal the value.
- enc := &yamlEncoder{}
- return enc.closure(c), nil
-}
-
-func (c Closure) String() string {
- b, err := yaml.Marshal(c)
- if err != nil {
- return fmt.Sprintf("marshal failed: %s", err)
- }
- return string(b)
-}
-
-func (v *Value) MarshalYAML() (any, error) {
- enc := &yamlEncoder{e: topEnv}
- return enc.value(v), nil
-}
-
-func (v *Value) String() string {
- b, err := yaml.Marshal(v)
- if err != nil {
- return fmt.Sprintf("marshal failed: %s", err)
- }
- return string(b)
-}
-
-func (enc *yamlEncoder) closure(c Closure) *yaml.Node {
- enc.e = c.env
- var n yaml.Node
- n.Kind = yaml.MappingNode
- n.Tag = "!closure"
- n.Content = make([]*yaml.Node, 4)
- n.Content[0] = new(yaml.Node)
- n.Content[0].SetString("env")
- n.Content[2] = new(yaml.Node)
- n.Content[2].SetString("in")
- n.Content[3] = enc.value(c.val)
- // Fill in the env after we've written the value in case value encoding
- // affects the env.
- n.Content[1] = enc.env(enc.e)
- enc.e = envSet{} // Allow GC'ing the env
- return &n
-}
-
-func (enc *yamlEncoder) env(e envSet) *yaml.Node {
- var encode func(e *envExpr) *yaml.Node
- encode = func(e *envExpr) *yaml.Node {
- var n yaml.Node
- switch e.kind {
- default:
- panic("bad kind")
- case envZero:
- n.SetString("0")
- case envUnit:
- n.SetString("1")
- case envBinding:
- var id yaml.Node
- id.SetString(enc.idp.unique(e.id))
- n.Kind = yaml.MappingNode
- n.Content = []*yaml.Node{&id, enc.value(e.val)}
- case envProduct, envSum:
- n.Kind = yaml.SequenceNode
- if e.kind == envProduct {
- n.Tag = "!product"
- } else {
- n.Tag = "!sum"
- }
- for _, e2 := range e.operands {
- n.Content = append(n.Content, encode(e2))
- }
- }
- return &n
- }
- return encode(e.root)
-}
-
-var yamlIntRe = regexp.MustCompile(`^-?[0-9]+$`)
-
-func (enc *yamlEncoder) value(v *Value) *yaml.Node {
- var n yaml.Node
- switch d := v.Domain.(type) {
- case nil:
- // Not allowed by unmarshaler, but useful for understanding when
- // something goes horribly wrong.
- //
- // TODO: We might be able to track useful provenance for this, which
- // would really help with debugging unexpected bottoms.
- n.SetString("_|_")
- return &n
-
- case Top:
- n.SetString("_")
- return &n
-
- case Def:
- n.Kind = yaml.MappingNode
- for k, elt := range d.All() {
- var kn yaml.Node
- kn.SetString(k)
- n.Content = append(n.Content, &kn, enc.value(elt))
- }
- n.HeadComment = v.PosString()
- return &n
-
- case Tuple:
- n.Kind = yaml.SequenceNode
- if d.repeat == nil {
- for _, elt := range d.vs {
- n.Content = append(n.Content, enc.value(elt))
- }
- } else {
- if len(d.repeat) == 1 {
- n.Tag = "!repeat"
- } else {
- n.Tag = "!repeat-unify"
- }
- // TODO: I'm not positive this will round-trip everything correctly.
- for _, gen := range d.repeat {
- v, e := gen(enc.e)
- enc.e = e
- n.Content = append(n.Content, enc.value(v))
- }
- }
- return &n
-
- case String:
- switch d.kind {
- case stringExact:
- n.SetString(d.exact)
- switch {
- // Make this into a "nice" !!int node if I can.
- case yamlIntRe.MatchString(d.exact):
- n.Tag = "tag:yaml.org,2002:int"
-
- // Or a "nice" !!bool node.
- case d.exact == "false" || d.exact == "true":
- n.Tag = "tag:yaml.org,2002:bool"
-
- // If this doesn't require escaping, leave it as a str node to avoid
- // the annoying YAML tags. Otherwise, mark it as an exact string.
- // Alternatively, we could always emit a str node with regexp
- // quoting.
- case d.exact != regexp.QuoteMeta(d.exact):
- n.Tag = "!string"
- }
- return &n
- case stringRegex:
- o := make([]string, 0, 1)
- for _, re := range d.re {
- s := re.String()
- s = strings.TrimSuffix(strings.TrimPrefix(s, `\A(?:`), `)\z`)
- o = append(o, s)
- }
- if len(o) == 1 {
- n.SetString(o[0])
- return &n
- }
- n.Encode(o)
- n.Tag = "!regex"
- return &n
- }
- panic("bad String kind")
-
- case Var:
- // TODO: If Var only appears once in the whole Value and is independent
- // in the environment (part of a term that is only over Var), then emit
- // this as a !sum instead.
- if false {
- var vs []*Value // TODO: Get values of this var.
- if len(vs) == 1 {
- return enc.value(vs[0])
- }
- n.Kind = yaml.SequenceNode
- n.Tag = "!sum"
- for _, elt := range vs {
- n.Content = append(n.Content, enc.value(elt))
- }
- return &n
- }
- n.SetString(enc.idp.unique(d.id))
- if !strings.HasPrefix(d.id.name, "$") {
- n.Tag = "!var"
- }
- return &n
- }
- panic(fmt.Sprintf("unknown domain type %T", v.Domain))
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package unify
-
-import (
- "bytes"
- "fmt"
- "iter"
- "log"
- "strings"
- "testing"
- "testing/fstest"
-
- "gopkg.in/yaml.v3"
-)
-
-func mustParse(expr string) Closure {
- var c Closure
- if err := yaml.Unmarshal([]byte(expr), &c); err != nil {
- panic(err)
- }
- return c
-}
-
-func oneValue(t *testing.T, c Closure) *Value {
- t.Helper()
- var v *Value
- var i int
- for v = range c.All() {
- i++
- }
- if i != 1 {
- t.Fatalf("expected 1 value, got %d", i)
- }
- return v
-}
-
-func printYaml(val any) {
- fmt.Println(prettyYaml(val))
-}
-
-func prettyYaml(val any) string {
- b, err := yaml.Marshal(val)
- if err != nil {
- panic(err)
- }
- var node yaml.Node
- if err := yaml.Unmarshal(b, &node); err != nil {
- panic(err)
- }
-
- // Map lines to start offsets. We'll use this to figure out when nodes are
- // "small" and should use inline style.
- lines := []int{-1, 0}
- for pos := 0; pos < len(b); {
- next := bytes.IndexByte(b[pos:], '\n')
- if next == -1 {
- break
- }
- pos += next + 1
- lines = append(lines, pos)
- }
- lines = append(lines, len(b))
-
- // Strip comments and switch small nodes to inline style
- cleanYaml(&node, lines, len(b))
-
- b, err = yaml.Marshal(&node)
- if err != nil {
- panic(err)
- }
- return string(b)
-}
-
-func cleanYaml(node *yaml.Node, lines []int, endPos int) {
- node.HeadComment = ""
- node.FootComment = ""
- node.LineComment = ""
-
- for i, n2 := range node.Content {
- end2 := endPos
- if i < len(node.Content)-1 {
- end2 = lines[node.Content[i+1].Line]
- }
- cleanYaml(n2, lines, end2)
- }
-
- // Use inline style?
- switch node.Kind {
- case yaml.MappingNode, yaml.SequenceNode:
- if endPos-lines[node.Line] < 40 {
- node.Style = yaml.FlowStyle
- }
- }
-}
-
-func allYamlNodes(n *yaml.Node) iter.Seq[*yaml.Node] {
- return func(yield func(*yaml.Node) bool) {
- if !yield(n) {
- return
- }
- for _, n2 := range n.Content {
- for n3 := range allYamlNodes(n2) {
- if !yield(n3) {
- return
- }
- }
- }
- }
-}
-
-func TestRoundTripString(t *testing.T) {
- // Check that we can round-trip a string with regexp meta-characters in it.
- const y = `!string test*`
- t.Logf("input:\n%s", y)
-
- v1 := oneValue(t, mustParse(y))
- var buf1 strings.Builder
- enc := yaml.NewEncoder(&buf1)
- if err := enc.Encode(v1); err != nil {
- log.Fatal(err)
- }
- enc.Close()
- t.Logf("after parse 1:\n%s", buf1.String())
-
- v2 := oneValue(t, mustParse(buf1.String()))
- var buf2 strings.Builder
- enc = yaml.NewEncoder(&buf2)
- if err := enc.Encode(v2); err != nil {
- log.Fatal(err)
- }
- enc.Close()
- t.Logf("after parse 2:\n%s", buf2.String())
-
- if buf1.String() != buf2.String() {
- t.Fatal("parse 1 and parse 2 differ")
- }
-}
-
-func TestEmptyString(t *testing.T) {
- // Regression test. Make sure an empty string is parsed as an exact string,
- // not a regexp.
- const y = `""`
- t.Logf("input:\n%s", y)
-
- v1 := oneValue(t, mustParse(y))
- if !v1.Exact() {
- t.Fatal("expected exact string")
- }
-}
-
-func TestImport(t *testing.T) {
- // Test a basic import
- main := strings.NewReader("!import x/y.yaml")
- fs := fstest.MapFS{
- // Test a glob import with a relative path
- "x/y.yaml": {Data: []byte("!import y/*.yaml")},
- "x/y/z.yaml": {Data: []byte("42")},
- }
- cl, err := Read(main, "x.yaml", ReadOpts{FS: fs})
- if err != nil {
- t.Fatal(err)
- }
- x := 42
- checkDecode(t, oneValue(t, cl), &x)
-}
-
-func TestImportEscape(t *testing.T) {
- // Make sure an import can't escape its subdirectory.
- main := strings.NewReader("!import x/y.yaml")
- fs := fstest.MapFS{
- "x/y.yaml": {Data: []byte("!import ../y/*.yaml")},
- "y/z.yaml": {Data: []byte("42")},
- }
- _, err := Read(main, "x.yaml", ReadOpts{FS: fs})
- if err == nil {
- t.Fatal("relative !import should have failed")
- }
- if !strings.Contains(err.Error(), "must not contain") {
- t.Fatalf("unexpected error %v", err)
- }
-}
-
-func TestImportScope(t *testing.T) {
- // Test that imports have different variable scopes.
- main := strings.NewReader("[!import y.yaml, !import y.yaml]")
- fs := fstest.MapFS{
- "y.yaml": {Data: []byte("$v")},
- }
- cl1, err := Read(main, "x.yaml", ReadOpts{FS: fs})
- if err != nil {
- t.Fatal(err)
- }
- cl2 := mustParse("[1, 2]")
- res, err := Unify(cl1, cl2)
- if err != nil {
- t.Fatal(err)
- }
- checkDecode(t, oneValue(t, res), []int{1, 2})
-}
--- /dev/null
+module simd/archsimd/_gen
+
+go 1.24
+
+require (
+ golang.org/x/arch v0.20.0
+ gopkg.in/yaml.v3 v3.0.1
+)
--- /dev/null
+golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c=
+golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Run all SIMD-related code generators.
+package main
+
+import (
+ "flag"
+ "fmt"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "strings"
+)
+
+const defaultXedPath = "$XEDPATH" + string(filepath.ListSeparator) + "./simdgen/xeddata" + string(filepath.ListSeparator) + "$HOME/xed/obj/dgen"
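+
+// For illustration only: filepath.ListSeparator is ':' on Unix-like systems
+// (';' on Windows), so the unexpanded default is roughly
+//
+//	$XEDPATH:./simdgen/xeddata:$HOME/xed/obj/dgen
+//
+// main applies os.ExpandEnv to this default before resolveXEDPath walks the entries.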
+
+var (
+ flagTmplgen = flag.Bool("tmplgen", true, "run tmplgen generator")
+ flagSimdgen = flag.Bool("simdgen", true, "run simdgen generator")
+
+ flagN = flag.Bool("n", false, "dry run")
+ flagXedPath = flag.String("xedPath", defaultXedPath, "load XED datafile from `path`, which must be the XED obj/dgen directory")
+)
+
+var goRoot string
+
+func main() {
+ flag.Parse()
+ if flag.NArg() > 0 {
+ flag.Usage()
+ os.Exit(1)
+ }
+
+ if *flagXedPath == defaultXedPath {
+ // In general we want the shell to do variable expansion, but for the
+ // default value we don't get that, so do it ourselves.
+ *flagXedPath = os.ExpandEnv(defaultXedPath)
+ }
+
+ var err error
+ goRoot, err = resolveGOROOT()
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(1)
+ }
+
+ if *flagTmplgen {
+ doTmplgen()
+ }
+ if *flagSimdgen {
+ doSimdgen()
+ }
+}
+
+func doTmplgen() {
+ goRun("-C", "tmplgen", ".")
+}
+
+func doSimdgen() {
+ xedPath, err := resolveXEDPath(*flagXedPath)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(1)
+ }
+
+ // Regenerate the XED-derived SIMD files
+ goRun("-C", "simdgen", ".", "-o", "godefs", "-goroot", goRoot, "-xedPath", prettyPath("./simdgen", xedPath), "go.yaml", "types.yaml", "categories.yaml")
+
+ // simdgen produces SSA rule files, so update the SSA files
+ goRun("-C", prettyPath(".", filepath.Join(goRoot, "src", "cmd", "compile", "internal", "ssa", "_gen")), ".")
+}
+
+func resolveXEDPath(pathList string) (xedPath string, err error) {
+ for _, path := range filepath.SplitList(pathList) {
+ if path == "" {
+ // Probably an unknown shell variable. Ignore.
+ continue
+ }
+ if _, err := os.Stat(filepath.Join(path, "all-dec-instructions.txt")); err == nil {
+ return filepath.Abs(path)
+ }
+ }
+ return "", fmt.Errorf("set $XEDPATH or -xedPath to the XED obj/dgen directory")
+}
+
+func resolveGOROOT() (goRoot string, err error) {
+ cmd := exec.Command("go", "env", "GOROOT")
+ cmd.Stderr = os.Stderr
+ out, err := cmd.Output()
+ if err != nil {
+ return "", fmt.Errorf("%s: %s", cmd, err)
+ }
+ goRoot = strings.TrimSuffix(string(out), "\n")
+ return goRoot, nil
+}
+
+func goRun(args ...string) {
+ exe := filepath.Join(goRoot, "bin", "go")
+ cmd := exec.Command(exe, append([]string{"run"}, args...)...)
+ run(cmd)
+}
+
+func run(cmd *exec.Cmd) {
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ fmt.Fprintf(os.Stderr, "%s\n", cmdString(cmd))
+ if *flagN {
+ return
+ }
+ if err := cmd.Run(); err != nil {
+ fmt.Fprintf(os.Stderr, "%s failed: %s\n", cmd, err)
+ }
+}
+
+func prettyPath(base, path string) string {
+ base, err := filepath.Abs(base)
+ if err != nil {
+ return path
+ }
+ p, err := filepath.Rel(base, path)
+ if err != nil {
+ return path
+ }
+ return p
+}
+
+func cmdString(cmd *exec.Cmd) string {
+ // TODO: Shell quoting?
+ // TODO: Environment.
+
+ var buf strings.Builder
+
+ cmdPath, err := exec.LookPath(filepath.Base(cmd.Path))
+ if err == nil && cmdPath == cmd.Path {
+ cmdPath = filepath.Base(cmdPath)
+ } else {
+ cmdPath = prettyPath(".", cmd.Path)
+ }
+ buf.WriteString(cmdPath)
+
+ for _, arg := range cmd.Args[1:] {
+ buf.WriteByte(' ')
+ buf.WriteString(arg)
+ }
+
+ return buf.String()
+}
--- /dev/null
+!import ops/*/categories.yaml
--- /dev/null
+#!/bin/bash
+
+# This is an end-to-end test of Go SIMD. It updates all generated
+# files in this repo and then runs several tests.
+
+XEDDATA="${XEDDATA:-xeddata}"
+if [[ ! -d "$XEDDATA" ]]; then
+ echo >&2 "Must either set \$XEDDATA or symlink xeddata/ to the XED obj/dgen directory."
+ exit 1
+fi
+
+# Ensure that goroot is the appropriate ancestor of this directory
+which go >/dev/null || exit 1
+goroot="$(go env GOROOT)"
+ancestor="../../../../.."
+if [[ ! $ancestor -ef "$goroot" ]]; then
+ # We might be able to make this work but it's SO CONFUSING.
+ echo >&2 "go command in path has GOROOT $goroot instead of" `(cd $ancestor; pwd)`
+ exit 1
+fi
+
+set -ex
+
+# Regenerate SIMD files
+go run . -o godefs -goroot "$goroot" -xedPath "$XEDDATA" go.yaml types.yaml categories.yaml
+# Regenerate SSA files from SIMD rules
+go run -C "$goroot"/src/cmd/compile/internal/ssa/_gen .
+
+# Rebuild compiler
+cd "$goroot"/src
+go install cmd/compile
+
+# Tests
+# Set the GOEXPERIMENT explicitly.
+GOEXPERIMENT=simd GOARCH=amd64 go run -C simd/archsimd/testdata .
+GOEXPERIMENT=simd GOARCH=amd64 go test -v simd/archsimd
+GOEXPERIMENT=simd GOARCH=amd64 go test go/doc go/build
+GOEXPERIMENT=simd GOARCH=amd64 go test cmd/api -v -check -run ^TestCheck$
+GOEXPERIMENT=simd GOARCH=amd64 go test cmd/compile/internal/ssagen -simd=0
+
+# Check tests without the GOEXPERIMENT
+GOEXPERIMENT= go test go/doc go/build
+GOEXPERIMENT= go test cmd/api -v -check -run ^TestCheck$
+GOEXPERIMENT= go test cmd/compile/internal/ssagen -simd=0
+
+# TODO: Add some tests of SIMD itself
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+)
+
+const simdGenericOpsTmpl = `
+package main
+
+func simdGenericOps() []opData {
+ return []opData{
+{{- range .Ops }}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, commutative: {{.Comm}}},
+{{- end }}
+{{- range .OpsImm }}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, commutative: {{.Comm}}, aux: "UInt8"},
+{{- end }}
+ }
+}
+`
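+
+// For illustration only (actual output depends on the parsed Operations), one
+// entry rendered by the template above would look roughly like
+//
+//	{name: "AddInt32x8", argLength: 2, commutative: true},
+//
+// where "AddInt32x8" is a hypothetical generic op name.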
+
+// writeSIMDGenericOps generates the generic op definitions and returns them
+// in a buffer for the caller to write out.
+func writeSIMDGenericOps(ops []Operation) *bytes.Buffer {
+ t := templateOf(simdGenericOpsTmpl, "simdgenericOps")
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(generatedHeader)
+
+ type genericOpsData struct {
+ OpName string
+ OpInLen int
+ Comm bool
+ }
+ type opData struct {
+ Ops []genericOpsData
+ OpsImm []genericOpsData
+ }
+ var opsData opData
+ for _, op := range ops {
+ if op.NoGenericOps != nil && *op.NoGenericOps == "true" {
+ continue
+ }
+ if op.SkipMaskedMethod() {
+ continue
+ }
+ _, _, _, immType, gOp := op.shape()
+ gOpData := genericOpsData{gOp.GenericName(), len(gOp.In), op.Commutative}
+ if immType == VarImm || immType == ConstVarImm {
+ opsData.OpsImm = append(opsData.OpsImm, gOpData)
+ } else {
+ opsData.Ops = append(opsData.Ops, gOpData)
+ }
+ }
+ sort.Slice(opsData.Ops, func(i, j int) bool {
+ return compareNatural(opsData.Ops[i].OpName, opsData.Ops[j].OpName) < 0
+ })
+ sort.Slice(opsData.OpsImm, func(i, j int) bool {
+ return compareNatural(opsData.OpsImm[i].OpName, opsData.OpsImm[j].OpName) < 0
+ })
+
+ err := t.Execute(buffer, opsData)
+ if err != nil {
+ panic(fmt.Errorf("failed to execute template: %w", err))
+ }
+
+ return buffer
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "slices"
+)
+
+const simdIntrinsicsTmpl = `
+{{define "header"}}
+package ssagen
+
+import (
+ "cmd/compile/internal/ir"
+ "cmd/compile/internal/ssa"
+ "cmd/compile/internal/types"
+ "cmd/internal/sys"
+)
+
+const simdPackage = "` + simdPackage + `"
+
+func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) {
+{{end}}
+
+{{define "op1"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen1(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op2"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen2(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op2_21"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen2_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op2_21Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op3"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen3(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op3_21"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen3_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op3_21Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3_21(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op3_231Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3_231(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op3_31Zero3"}} addF(simdPackage, "{{(index .In 2).Go}}.{{.Go}}", opLen3_31Zero3(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op4"}} addF(simdPackage, "{{(index .In 0).Go}}.{{.Go}}", opLen4(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op4_231Type1"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen4_231(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op4_31"}} addF(simdPackage, "{{(index .In 2).Go}}.{{.Go}}", opLen4_31(ssa.Op{{.GenericName}}, {{.SSAType}}), sys.AMD64)
+{{end}}
+{{define "op1Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen1Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op2Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op2Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op2Imm8_SHA1RNDS4"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+{{define "op4Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen4Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
+
+{{define "vectorConversion"}} addF(simdPackage, "{{.Tsrc.Name}}.As{{.Tdst.Name}}", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
+{{end}}
+
+{{define "loadStore"}} addF(simdPackage, "Load{{.Name}}", simdLoad(), sys.AMD64)
+ addF(simdPackage, "{{.Name}}.Store", simdStore(), sys.AMD64)
+{{end}}
+
+{{define "maskedLoadStore"}} addF(simdPackage, "LoadMasked{{.Name}}", simdMaskedLoad(ssa.OpLoadMasked{{.ElemBits}}), sys.AMD64)
+ addF(simdPackage, "{{.Name}}.StoreMasked", simdMaskedStore(ssa.OpStoreMasked{{.ElemBits}}), sys.AMD64)
+{{end}}
+
+{{define "mask"}} addF(simdPackage, "{{.Name}}.As{{.VectorCounterpart}}", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
+ addF(simdPackage, "{{.VectorCounterpart}}.asMask", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
+ addF(simdPackage, "{{.Name}}.And", opLen2(ssa.OpAnd{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
+ addF(simdPackage, "{{.Name}}.Or", opLen2(ssa.OpOr{{.ReshapedVectorWithAndOr}}, types.TypeVec{{.Size}}), sys.AMD64)
+ addF(simdPackage, "{{.Name}}FromBits", simdCvtVToMask({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
+ addF(simdPackage, "{{.Name}}.ToBits", simdCvtMaskToV({{.ElemBits}}, {{.Lanes}}), sys.AMD64)
+{{end}}
+
+{{define "footer"}}}
+{{end}}
+`
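+
+// For illustration only: with hypothetical names, the "op2" template above
+// renders a registration roughly like
+//
+//	addF(simdPackage, "Int32x8.Add", opLen2(ssa.OpAddInt32x8, types.TypeVec256), sys.AMD64)
+//
+// where the receiver type, method, generic op, and SSA type all come from the
+// classified Operation.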
+
+// writeSIMDIntrinsics generates the intrinsic mappings (simdintrinsics.go) and
+// returns them in a buffer for the caller to write out.
+func writeSIMDIntrinsics(ops []Operation, typeMap simdTypeMap) *bytes.Buffer {
+ t := templateOf(simdIntrinsicsTmpl, "simdintrinsics")
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(generatedHeader)
+
+ if err := t.ExecuteTemplate(buffer, "header", nil); err != nil {
+ panic(fmt.Errorf("failed to execute header template: %w", err))
+ }
+
+ slices.SortFunc(ops, compareOperations)
+
+ for _, op := range ops {
+ if op.NoTypes != nil && *op.NoTypes == "true" {
+ continue
+ }
+ if op.SkipMaskedMethod() {
+ continue
+ }
+ if s, op, err := classifyOp(op); err == nil {
+ if err := t.ExecuteTemplate(buffer, s, op); err != nil {
+ panic(fmt.Errorf("failed to execute template %s for op %s: %w", s, op.Go, err))
+ }
+
+ } else {
+ panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err))
+ }
+ }
+
+ for _, conv := range vConvertFromTypeMap(typeMap) {
+ if err := t.ExecuteTemplate(buffer, "vectorConversion", conv); err != nil {
+ panic(fmt.Errorf("failed to execute vectorConversion template: %w", err))
+ }
+ }
+
+ for _, typ := range typesFromTypeMap(typeMap) {
+ if typ.Type != "mask" {
+ if err := t.ExecuteTemplate(buffer, "loadStore", typ); err != nil {
+ panic(fmt.Errorf("failed to execute loadStore template: %w", err))
+ }
+ }
+ }
+
+ for _, typ := range typesFromTypeMap(typeMap) {
+ if typ.MaskedLoadStoreFilter() {
+ if err := t.ExecuteTemplate(buffer, "maskedLoadStore", typ); err != nil {
+ panic(fmt.Errorf("failed to execute maskedLoadStore template: %w", err))
+ }
+ }
+ }
+
+ for _, mask := range masksFromTypeMap(typeMap) {
+ if err := t.ExecuteTemplate(buffer, "mask", mask); err != nil {
+ panic(fmt.Errorf("failed to execute mask template: %w", err))
+ }
+ }
+
+ if err := t.ExecuteTemplate(buffer, "footer", nil); err != nil {
+ panic(fmt.Errorf("failed to execute footer template: %w", err))
+ }
+
+ return buffer
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "log"
+ "sort"
+ "strings"
+)
+
+const simdMachineOpsTmpl = `
+package main
+
+func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
+ wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
+ return []opData{
+{{- range .OpsData }}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
+{{- end }}
+{{- range .OpsDataImm }}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", aux: "UInt8", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
+{{- end }}
+{{- range .OpsDataLoad}}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", aux: "SymOff", symEffect: "Read", resultInArg0: {{.ResultInArg0}}},
+{{- end}}
+{{- range .OpsDataImmLoad}}
+ {name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", aux: "SymValAndOff", symEffect: "Read", resultInArg0: {{.ResultInArg0}}},
+{{- end}}
+{{- range .OpsDataMerging }}
+ {name: "{{.OpName}}Merging", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: false, typ: "{{.Type}}", resultInArg0: true},
+{{- end }}
+{{- range .OpsDataImmMerging }}
+ {name: "{{.OpName}}Merging", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", aux: "UInt8", commutative: false, typ: "{{.Type}}", resultInArg0: true},
+{{- end }}
+ }
+}
+`
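+
+// For illustration only: an entry in the OpsData range above renders roughly as
+// (all names hypothetical)
+//
+//	{name: "VPADDD256", argLength: 2, reg: v21, asm: "VPADDD", commutative: true, typ: "Vec256", resultInArg0: false},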
+
+// writeSIMDMachineOps generates the machine op definitions (simdAMD64ops.go)
+// and returns them in a buffer for the caller to write out.
+func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
+ t := templateOf(simdMachineOpsTmpl, "simdAMD64Ops")
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(generatedHeader)
+
+ type opData struct {
+ OpName string
+ Asm string
+ OpInLen int
+ RegInfo string
+ Comm bool
+ Type string
+ ResultInArg0 bool
+ }
+ type machineOpsData struct {
+ OpsData []opData
+ OpsDataImm []opData
+ OpsDataLoad []opData
+ OpsDataImmLoad []opData
+ OpsDataMerging []opData
+ OpsDataImmMerging []opData
+ }
+
+ regInfoSet := map[string]bool{
+ "v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
+ "w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
+ "wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
+ "w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
+ opsData := make([]opData, 0)
+ opsDataImm := make([]opData, 0)
+ opsDataLoad := make([]opData, 0)
+ opsDataImmLoad := make([]opData, 0)
+ opsDataMerging := make([]opData, 0)
+ opsDataImmMerging := make([]opData, 0)
+
+ // Determine the "best" version of an instruction to use
+ best := make(map[string]Operation)
+ var mOpOrder []string
+ countOverrides := func(s []Operand) int {
+ a := 0
+ for _, o := range s {
+ if o.OverwriteBase != nil {
+ a++
+ }
+ }
+ return a
+ }
+ for _, op := range ops {
+ _, _, maskType, _, gOp := op.shape()
+ asm := machineOpName(maskType, gOp)
+ other, ok := best[asm]
+ if !ok {
+ best[asm] = op
+ mOpOrder = append(mOpOrder, asm)
+ continue
+ }
+ if !op.Commutative && other.Commutative { // if there's a non-commutative version of the op, it wins.
+ best[asm] = op
+ continue
+ }
+ // see if "op" is better than "other"
+ if countOverrides(op.In)+countOverrides(op.Out) < countOverrides(other.In)+countOverrides(other.Out) {
+ best[asm] = op
+ }
+ }
+
+ regInfoErrs := make([]error, 0)
+ regInfoMissing := make(map[string]bool, 0)
+ for _, asm := range mOpOrder {
+ op := best[asm]
+ shapeIn, shapeOut, maskType, _, gOp := op.shape()
+
+		// TODO: all our masked operations are now zeroing; we need to generate machine ops
+		// with merging masks, perhaps by copying one here with a name suffix "Merging".
+		// The rewrite rules will need them.
+ makeRegInfo := func(op Operation, mem memShape) (string, error) {
+ regInfo, err := op.regShape(mem)
+ if err != nil {
+ panic(err)
+ }
+ regInfo, err = rewriteVecAsScalarRegInfo(op, regInfo)
+ if err != nil {
+ if mem == NoMem || mem == InvalidMem {
+ panic(err)
+ }
+ return "", err
+ }
+ if regInfo == "v01load" {
+ regInfo = "vload"
+ }
+ // Makes AVX512 operations use upper registers
+ if strings.Contains(op.CPUFeature, "AVX512") {
+ regInfo = strings.ReplaceAll(regInfo, "v", "w")
+ }
+ if _, ok := regInfoSet[regInfo]; !ok {
+ regInfoErrs = append(regInfoErrs, fmt.Errorf("unsupported register constraint, please update the template and AMD64Ops.go: %s. Op is %s", regInfo, op))
+ regInfoMissing[regInfo] = true
+ }
+ return regInfo, nil
+ }
+ regInfo, err := makeRegInfo(op, NoMem)
+ if err != nil {
+ panic(err)
+ }
+ var outType string
+ if shapeOut == OneVregOut || shapeOut == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+ // If class overwrite is happening, that's not really a mask but a vreg.
+ outType = fmt.Sprintf("Vec%d", *gOp.Out[0].Bits)
+ } else if shapeOut == OneGregOut {
+ outType = gOp.GoType() // this is a straight Go type, not a VecNNN type
+ } else if shapeOut == OneKmaskOut {
+ outType = "Mask"
+ } else {
+ panic(fmt.Errorf("simdgen does not recognize this output shape: %d", shapeOut))
+ }
+ resultInArg0 := false
+ if shapeOut == OneVregOutAtIn {
+ resultInArg0 = true
+ }
+ var memOpData *opData
+ regInfoMerging := regInfo
+ hasMerging := false
+ if op.MemFeatures != nil && *op.MemFeatures == "vbcst" {
+ // Right now we only have vbcst case
+ // Make a full vec memory variant.
+ opMem := rewriteLastVregToMem(op)
+ regInfo, err := makeRegInfo(opMem, VregMemIn)
+ if err != nil {
+				// Just skip the memory variant if the error is non-nil; an error
+				// could be triggered by [checkVecAsScalar].
+				// TODO: make [checkVecAsScalar] aware of mem ops.
+				if *Verbose {
+					log.Printf("Seen error: %v", err)
+ }
+ } else {
+ memOpData = &opData{asm + "load", gOp.Asm, len(gOp.In) + 1, regInfo, false, outType, resultInArg0}
+ }
+ }
+ hasMerging = gOp.hasMaskedMerging(maskType, shapeOut)
+ if hasMerging && !resultInArg0 {
+			// We have to copy the slice here because the sort would otherwise be
+			// visible through other aliases when no reslicing is happening.
+ newIn := make([]Operand, len(op.In), len(op.In)+1)
+ copy(newIn, op.In)
+ op.In = newIn
+ op.In = append(op.In, op.Out[0])
+ op.sortOperand()
+ regInfoMerging, err = makeRegInfo(op, NoMem)
+ if err != nil {
+ panic(err)
+ }
+ }
+
+ if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
+ opsDataImm = append(opsDataImm, opData{asm, gOp.Asm, len(gOp.In), regInfo, gOp.Commutative, outType, resultInArg0})
+ if memOpData != nil {
+ if *op.MemFeatures != "vbcst" {
+ panic("simdgen only knows vbcst for mem ops for now")
+ }
+ opsDataImmLoad = append(opsDataImmLoad, *memOpData)
+ }
+ if hasMerging {
+ mergingLen := len(gOp.In)
+ if !resultInArg0 {
+ mergingLen++
+ }
+ opsDataImmMerging = append(opsDataImmMerging, opData{asm, gOp.Asm, mergingLen, regInfoMerging, gOp.Commutative, outType, resultInArg0})
+ }
+ } else {
+ opsData = append(opsData, opData{asm, gOp.Asm, len(gOp.In), regInfo, gOp.Commutative, outType, resultInArg0})
+ if memOpData != nil {
+ if *op.MemFeatures != "vbcst" {
+ panic("simdgen only knows vbcst for mem ops for now")
+ }
+ opsDataLoad = append(opsDataLoad, *memOpData)
+ }
+ if hasMerging {
+ mergingLen := len(gOp.In)
+ if !resultInArg0 {
+ mergingLen++
+ }
+ opsDataMerging = append(opsDataMerging, opData{asm, gOp.Asm, mergingLen, regInfoMerging, gOp.Commutative, outType, resultInArg0})
+ }
+ }
+ }
+ if len(regInfoErrs) != 0 {
+ for _, e := range regInfoErrs {
+			log.Printf("Error: %v", e)
+ }
+		panic(fmt.Errorf("unsupported regInfo shapes: %v", regInfoMissing))
+ }
+ sort.Slice(opsData, func(i, j int) bool {
+ return compareNatural(opsData[i].OpName, opsData[j].OpName) < 0
+ })
+ sort.Slice(opsDataImm, func(i, j int) bool {
+ return compareNatural(opsDataImm[i].OpName, opsDataImm[j].OpName) < 0
+ })
+ sort.Slice(opsDataLoad, func(i, j int) bool {
+ return compareNatural(opsDataLoad[i].OpName, opsDataLoad[j].OpName) < 0
+ })
+ sort.Slice(opsDataImmLoad, func(i, j int) bool {
+ return compareNatural(opsDataImmLoad[i].OpName, opsDataImmLoad[j].OpName) < 0
+ })
+ sort.Slice(opsDataMerging, func(i, j int) bool {
+ return compareNatural(opsDataMerging[i].OpName, opsDataMerging[j].OpName) < 0
+ })
+ sort.Slice(opsDataImmMerging, func(i, j int) bool {
+ return compareNatural(opsDataImmMerging[i].OpName, opsDataImmMerging[j].OpName) < 0
+ })
+ err := t.Execute(buffer, machineOpsData{opsData, opsDataImm, opsDataLoad, opsDataImmLoad,
+ opsDataMerging, opsDataImmMerging})
+ if err != nil {
+ panic(fmt.Errorf("failed to execute template: %w", err))
+ }
+
+ return buffer
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "cmp"
+ "fmt"
+ "maps"
+ "slices"
+ "sort"
+ "strings"
+ "unicode"
+)
+
+type simdType struct {
+ Name string // The go type name of this simd type, for example Int32x4.
+ Lanes int // The number of elements in this vector/mask.
+ Base string // The element's type, like for Int32x4 it will be int32.
+ Fields string // The struct fields, it should be right formatted.
+ Type string // Either "mask" or "vreg"
+ VectorCounterpart string // For mask use only: just replacing the "Mask" in [simdType.Name] with "Int"
+ ReshapedVectorWithAndOr string // For mask use only: vector AND and OR are only available in some shape with element width 32.
+ Size int // The size of the vector type
+}
+
+func (x simdType) ElemBits() int {
+ return x.Size / x.Lanes
+}
+
+// LanesContainer returns the smallest int/uint bit size that is
+// large enough to hold one bit for each lane. E.g., Mask32x4
+// is 4 lanes, and a uint8 is the smallest uint that has 4 bits.
+func (x simdType) LanesContainer() int {
+ if x.Lanes > 64 {
+ panic("too many lanes")
+ }
+ if x.Lanes > 32 {
+ return 64
+ }
+ if x.Lanes > 16 {
+ return 32
+ }
+ if x.Lanes > 8 {
+ return 16
+ }
+ return 8
+}
+
+// MaskedLoadStoreFilter reports whether this simd type currently gets masked
+// loads/stores generated. It is used in two places, which forces them to stay
+// coordinated.
+func (x simdType) MaskedLoadStoreFilter() bool {
+ return x.Size == 512 || x.ElemBits() >= 32 && x.Type != "mask"
+}
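+
+// For example, under MaskedLoadStoreFilter above: Int32x8 (32-bit elements) and
+// Int8x64 (a 512-bit vector) qualify, while Int8x16 does not.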
+
+func (x simdType) IntelSizeSuffix() string {
+ switch x.ElemBits() {
+ case 8:
+ return "B"
+ case 16:
+ return "W"
+ case 32:
+ return "D"
+ case 64:
+ return "Q"
+ }
+	panic(fmt.Sprintf("unexpected element width: %d bits", x.ElemBits()))
+}
+
+func (x simdType) MaskedLoadDoc() string {
+ if x.Size == 512 || x.ElemBits() < 32 {
+ return fmt.Sprintf("// Asm: VMOVDQU%d.Z, CPU Feature: AVX512", x.ElemBits())
+ } else {
+ return fmt.Sprintf("// Asm: VMASKMOV%s, CPU Feature: AVX2", x.IntelSizeSuffix())
+ }
+}
+
+func (x simdType) MaskedStoreDoc() string {
+ if x.Size == 512 || x.ElemBits() < 32 {
+ return fmt.Sprintf("// Asm: VMOVDQU%d, CPU Feature: AVX512", x.ElemBits())
+ } else {
+ return fmt.Sprintf("// Asm: VMASKMOV%s, CPU Feature: AVX2", x.IntelSizeSuffix())
+ }
+}
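+
+// For illustration only, MaskedLoadDoc yields comments such as
+//
+//	// Asm: VMASKMOVD, CPU Feature: AVX2    (256-bit vector, 32-bit elements)
+//	// Asm: VMOVDQU8.Z, CPU Feature: AVX512 (512-bit vector, 8-bit elements)
+//
+// and MaskedStoreDoc produces the same shapes without the ".Z" zeroing suffix.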
+
+func compareSimdTypes(x, y simdType) int {
+ // "vreg" then "mask"
+ if c := -compareNatural(x.Type, y.Type); c != 0 {
+ return c
+ }
+	// We want "flo" < "int" < "uin" (and then 8 < 16 < 32 < 64,
+	// not "int16" < "int32" < "int64" < "int8"),
+	// so limit the comparison to the first 3 bytes of the string.
+ if c := compareNatural(x.Base[:3], y.Base[:3]); c != 0 {
+ return c
+ }
+ // base type size, 8 < 16 < 32 < 64
+ if c := x.ElemBits() - y.ElemBits(); c != 0 {
+ return c
+ }
+ // vector size last
+ return x.Size - y.Size
+}
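+
+// For illustration, this ordering puts vregs before masks and, within vregs,
+// sorts roughly Float32x4 < Float64x2 < Int8x16 < Int32x4 < Int32x8 < Uint32x4.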
+
+type simdTypeMap map[int][]simdType
+
+type simdTypePair struct {
+ Tsrc simdType
+ Tdst simdType
+}
+
+func compareSimdTypePairs(x, y simdTypePair) int {
+ c := compareSimdTypes(x.Tsrc, y.Tsrc)
+ if c != 0 {
+ return c
+ }
+ return compareSimdTypes(x.Tdst, y.Tdst)
+}
+
+const simdPackageHeader = generatedHeader + `
+//go:build goexperiment.simd
+
+package archsimd
+`
+
+const simdTypesTemplates = `
+{{define "sizeTmpl"}}
+// v{{.}} is a tag type that tells the compiler that this is really {{.}}-bit SIMD
+type v{{.}} struct {
+ _{{.}} [0]func() // uncomparable
+}
+{{end}}
+
+{{define "typeTmpl"}}
+// {{.Name}} is a {{.Size}}-bit SIMD vector of {{.Lanes}} {{.Base}}
+type {{.Name}} struct {
+{{.Fields}}
+}
+
+{{end}}
+`
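+
+// For illustration only, for a hypothetical 256-bit Int32x8 entry the templates
+// above expand to roughly:
+//
+//	// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
+//	type v256 struct {
+//		_256 [0]func() // uncomparable
+//	}
+//
+//	// Int32x8 is a 256-bit SIMD vector of 8 int32
+//	type Int32x8 struct {
+//		int32x8 v256
+//		vals    [8]int32
+//	}
+//
+// where the struct fields come from parseSIMDTypes below.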
+
+const simdFeaturesTemplate = `
+import "internal/cpu"
+
+type X86Features struct {}
+
+var X86 X86Features
+
+{{range .}}
+{{- if eq .Feature "AVX512"}}
+// {{.Feature}} returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
+//
+// These five CPU features are bundled together, and no use of AVX-512
+// is allowed unless all of these features are supported together.
+// Nearly every CPU that has shipped with any support for AVX-512 has
+// supported all five of these features.
+{{- else -}}
+// {{.Feature}} returns whether the CPU supports the {{.Feature}} feature.
+{{- end}}
+//
+// {{.Feature}} is defined on all GOARCHes, but will only return true on
+// GOARCH {{.GoArch}}.
+func (X86Features) {{.Feature}}() bool {
+ return cpu.X86.Has{{.Feature}}
+}
+{{end}}
+`
+
+const simdLoadStoreTemplate = `
+// Len returns the number of elements in a {{.Name}}
+func (x {{.Name}}) Len() int { return {{.Lanes}} }
+
+// Load{{.Name}} loads a {{.Name}} from an array
+//
+//go:noescape
+func Load{{.Name}}(y *[{{.Lanes}}]{{.Base}}) {{.Name}}
+
+// Store stores a {{.Name}} to an array
+//
+//go:noescape
+func (x {{.Name}}) Store(y *[{{.Lanes}}]{{.Base}})
+`
+
+const simdMaskFromValTemplate = `
+// {{.Name}}FromBits constructs a {{.Name}} from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+{{- if ne .Lanes .LanesContainer}}
+// Only the lower {{.Lanes}} bits of y are used.
+{{- end}}
+//
+// Asm: KMOV{{.IntelSizeSuffix}}, CPU Feature: AVX512
+func {{.Name}}FromBits(y uint{{.LanesContainer}}) {{.Name}}
+
+// ToBits constructs a bitmap from a {{.Name}}, where 1 means set for the indexed element, 0 means unset.
+{{- if ne .Lanes .LanesContainer}}
+// Only the lower {{.Lanes}} bits of the result are used.
+{{- end}}
+//
+// Asm: KMOV{{.IntelSizeSuffix}}, CPU Feature: AVX512
+func (x {{.Name}}) ToBits() uint{{.LanesContainer}}
+`
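+
+// For illustration only, for a hypothetical Mask32x4 (4 lanes, LanesContainer 8)
+// the template above produces declarations roughly like
+//
+//	// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//	// Only the lower 4 bits of y are used.
+//	func Mask32x4FromBits(y uint8) Mask32x4
+//
+// plus the corresponding ToBits method returning a uint8.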
+
+const simdMaskedLoadStoreTemplate = `
+// LoadMasked{{.Name}} loads a {{.Name}} from an array,
+// at those elements enabled by mask
+//
+{{.MaskedLoadDoc}}
+//
+//go:noescape
+func LoadMasked{{.Name}}(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}}) {{.Name}}
+
+// StoreMasked stores a {{.Name}} to an array,
+// at those elements enabled by mask
+//
+{{.MaskedStoreDoc}}
+//
+//go:noescape
+func (x {{.Name}}) StoreMasked(y *[{{.Lanes}}]{{.Base}}, mask Mask{{.ElemBits}}x{{.Lanes}})
+`
+
+const simdStubsTmpl = `
+{{define "op1"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}() {{.GoType}}
+{{end}}
+
+{{define "op2"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op2_21"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op2_21Type1"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op3"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+{{define "op3_31Zero3"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op2NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op3_21"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+{{define "op3_21Type1"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op0NameAndType "y"}}, {{.Op2NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+{{define "op3_231Type1"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.Op0NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+{{define "op2VecAsScalar"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}(y uint{{(index .In 1).TreatLikeAScalarOfSize}}) {{(index .Out 0).Go}}
+{{end}}
+
+{{define "op3VecAsScalar"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}(y uint{{(index .In 1).TreatLikeAScalarOfSize}}, {{.Op2NameAndType "z"}}) {{(index .Out 0).Go}}
+{{end}}
+
+{{define "op4"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op0NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op2NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
+{{end}}
+
+{{define "op4_231Type1"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.Op0NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
+{{end}}
+
+{{define "op4_31"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op2NameAndType "x"}}) {{.Go}}({{.Op1NameAndType "y"}}, {{.Op0NameAndType "z"}}, {{.Op3NameAndType "u"}}) {{.GoType}}
+{{end}}
+
+{{define "op1Imm8"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8) {{.GoType}}
+{{end}}
+
+{{define "op2Imm8"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op2Imm8_2I"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}}
+{{end}}
+
+{{define "op2Imm8_II"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} result in better performance when they are constants; non-constant values will be translated into a jump table.
+// {{.ImmName}} should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op2Imm8_SHA1RNDS4"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
+{{define "op3Imm8"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}, {{.Op3NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+{{define "op3Imm8_2I"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8, {{.Op3NameAndType "z"}}) {{.GoType}}
+{{end}}
+
+
+{{define "op4Imm8"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}, {{.Op3NameAndType "z"}}, {{.Op4NameAndType "u"}}) {{.GoType}}
+{{end}}
+
+{{define "vectorConversion"}}
+// As{{.Tdst.Name}} converts from {{.Tsrc.Name}} to {{.Tdst.Name}}
+func (from {{.Tsrc.Name}}) As{{.Tdst.Name}}() (to {{.Tdst.Name}})
+{{end}}
+
+{{define "mask"}}
+// As{{.VectorCounterpart}} converts from {{.Name}} to {{.VectorCounterpart}}
+func (from {{.Name}}) As{{.VectorCounterpart}}() (to {{.VectorCounterpart}})
+
+// asMask converts from {{.VectorCounterpart}} to {{.Name}}
+func (from {{.VectorCounterpart}}) asMask() (to {{.Name}})
+
+func (x {{.Name}}) And(y {{.Name}}) {{.Name}}
+
+func (x {{.Name}}) Or(y {{.Name}}) {{.Name}}
+{{end}}
+`
+
+// parseSIMDTypes groups Go simd types by their vector sizes and returns a map
+// from vector size to the simd types of that size.
+func parseSIMDTypes(ops []Operation) simdTypeMap {
+	// TODO: maybe instead of going over ops, let's try going over types.yaml.
+ ret := map[int][]simdType{}
+ seen := map[string]struct{}{}
+ processArg := func(arg Operand) {
+ if arg.Class == "immediate" || arg.Class == "greg" {
+ // Immediates are not encoded as vector types.
+ return
+ }
+ if _, ok := seen[*arg.Go]; ok {
+ return
+ }
+ seen[*arg.Go] = struct{}{}
+
+ lanes := *arg.Lanes
+ base := fmt.Sprintf("%s%d", *arg.Base, *arg.ElemBits)
+ tagFieldNameS := fmt.Sprintf("%sx%d", base, lanes)
+ tagFieldS := fmt.Sprintf("%s v%d", tagFieldNameS, *arg.Bits)
+ valFieldS := fmt.Sprintf("vals%s[%d]%s", strings.Repeat(" ", len(tagFieldNameS)-3), lanes, base)
+ fields := fmt.Sprintf("\t%s\n\t%s", tagFieldS, valFieldS)
+ if arg.Class == "mask" {
+ vectorCounterpart := strings.ReplaceAll(*arg.Go, "Mask", "Int")
+ reshapedVectorWithAndOr := fmt.Sprintf("Int32x%d", *arg.Bits/32)
+ ret[*arg.Bits] = append(ret[*arg.Bits], simdType{*arg.Go, lanes, base, fields, arg.Class, vectorCounterpart, reshapedVectorWithAndOr, *arg.Bits})
+ // In case the vector counterpart of a mask is not present, put its vector counterpart typedef into the map as well.
+ if _, ok := seen[vectorCounterpart]; !ok {
+ seen[vectorCounterpart] = struct{}{}
+ ret[*arg.Bits] = append(ret[*arg.Bits], simdType{vectorCounterpart, lanes, base, fields, "vreg", "", "", *arg.Bits})
+ }
+ } else {
+ ret[*arg.Bits] = append(ret[*arg.Bits], simdType{*arg.Go, lanes, base, fields, arg.Class, "", "", *arg.Bits})
+ }
+ }
+ for _, op := range ops {
+ for _, arg := range op.In {
+ processArg(arg)
+ }
+ for _, arg := range op.Out {
+ processArg(arg)
+ }
+ }
+ return ret
+}
+
+func vConvertFromTypeMap(typeMap simdTypeMap) []simdTypePair {
+ v := []simdTypePair{}
+ for _, ts := range typeMap {
+ for i, tsrc := range ts {
+ for j, tdst := range ts {
+ if i != j && tsrc.Type == tdst.Type && tsrc.Type == "vreg" &&
+ tsrc.Lanes > 1 && tdst.Lanes > 1 {
+ v = append(v, simdTypePair{tsrc, tdst})
+ }
+ }
+ }
+ }
+ slices.SortFunc(v, compareSimdTypePairs)
+ return v
+}
+
+func masksFromTypeMap(typeMap simdTypeMap) []simdType {
+ m := []simdType{}
+ for _, ts := range typeMap {
+ for _, tsrc := range ts {
+ if tsrc.Type == "mask" {
+ m = append(m, tsrc)
+ }
+ }
+ }
+ slices.SortFunc(m, compareSimdTypes)
+ return m
+}
+
+func typesFromTypeMap(typeMap simdTypeMap) []simdType {
+ m := []simdType{}
+ for _, ts := range typeMap {
+ for _, tsrc := range ts {
+ if tsrc.Lanes > 1 {
+ m = append(m, tsrc)
+ }
+ }
+ }
+ slices.SortFunc(m, compareSimdTypes)
+ return m
+}
+
+// writeSIMDTypes generates the simd vector types into a bytes.Buffer
+func writeSIMDTypes(typeMap simdTypeMap) *bytes.Buffer {
+ t := templateOf(simdTypesTemplates, "types_amd64")
+ loadStore := templateOf(simdLoadStoreTemplate, "loadstore_amd64")
+ maskedLoadStore := templateOf(simdMaskedLoadStoreTemplate, "maskedloadstore_amd64")
+ maskFromVal := templateOf(simdMaskFromValTemplate, "maskFromVal_amd64")
+
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(simdPackageHeader)
+
+ sizes := make([]int, 0, len(typeMap))
+ for size, types := range typeMap {
+ slices.SortFunc(types, compareSimdTypes)
+ sizes = append(sizes, size)
+ }
+ sort.Ints(sizes)
+
+ for _, size := range sizes {
+ if size <= 64 {
+ // these are scalar
+ continue
+ }
+ if err := t.ExecuteTemplate(buffer, "sizeTmpl", size); err != nil {
+ panic(fmt.Errorf("failed to execute size template for size %d: %w", size, err))
+ }
+ for _, typeDef := range typeMap[size] {
+ if typeDef.Lanes == 1 {
+ continue
+ }
+ if err := t.ExecuteTemplate(buffer, "typeTmpl", typeDef); err != nil {
+ panic(fmt.Errorf("failed to execute type template for type %s: %w", typeDef.Name, err))
+ }
+ if typeDef.Type != "mask" {
+ if err := loadStore.ExecuteTemplate(buffer, "loadstore_amd64", typeDef); err != nil {
+ panic(fmt.Errorf("failed to execute loadstore template for type %s: %w", typeDef.Name, err))
+ }
+ // restrict to AVX2 masked loads/stores first.
+ if typeDef.MaskedLoadStoreFilter() {
+ if err := maskedLoadStore.ExecuteTemplate(buffer, "maskedloadstore_amd64", typeDef); err != nil {
+ panic(fmt.Errorf("failed to execute maskedloadstore template for type %s: %w", typeDef.Name, err))
+ }
+ }
+ } else {
+ if err := maskFromVal.ExecuteTemplate(buffer, "maskFromVal_amd64", typeDef); err != nil {
+ panic(fmt.Errorf("failed to execute maskFromVal template for type %s: %w", typeDef.Name, err))
+ }
+ }
+ }
+ }
+
+ return buffer
+}
+
+func writeSIMDFeatures(ops []Operation) *bytes.Buffer {
+ // Gather all features
+ type featureKey struct {
+ GoArch string
+ Feature string
+ }
+ featureSet := make(map[featureKey]struct{})
+ for _, op := range ops {
+		// Generate a feature check for each independent feature in a
+ // composite feature.
+ for feature := range strings.SplitSeq(op.CPUFeature, ",") {
+ feature = strings.TrimSpace(feature)
+ featureSet[featureKey{op.GoArch, feature}] = struct{}{}
+ }
+ }
+ features := slices.SortedFunc(maps.Keys(featureSet), func(a, b featureKey) int {
+ if c := cmp.Compare(a.GoArch, b.GoArch); c != 0 {
+ return c
+ }
+ return compareNatural(a.Feature, b.Feature)
+ })
+
+ // If we ever have the same feature name on more than one GOARCH, we'll have
+ // to be more careful about this.
+ t := templateOf(simdFeaturesTemplate, "features")
+
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(simdPackageHeader)
+
+ if err := t.Execute(buffer, features); err != nil {
+ panic(fmt.Errorf("failed to execute features template: %w", err))
+ }
+
+ return buffer
+}
+
+// writeSIMDStubs returns two bytes.Buffers containing the declarations for the public
+// and internal-use vector intrinsics.
+func writeSIMDStubs(ops []Operation, typeMap simdTypeMap) (f, fI *bytes.Buffer) {
+ t := templateOf(simdStubsTmpl, "simdStubs")
+ f = new(bytes.Buffer)
+ fI = new(bytes.Buffer)
+ f.WriteString(simdPackageHeader)
+ fI.WriteString(simdPackageHeader)
+
+ slices.SortFunc(ops, compareOperations)
+
+ for i, op := range ops {
+ if op.NoTypes != nil && *op.NoTypes == "true" {
+ continue
+ }
+ if op.SkipMaskedMethod() {
+ continue
+ }
+ idxVecAsScalar, err := checkVecAsScalar(op)
+ if err != nil {
+ panic(err)
+ }
+ if s, op, err := classifyOp(op); err == nil {
+ if idxVecAsScalar != -1 {
+ if s == "op2" || s == "op3" {
+ s += "VecAsScalar"
+ } else {
+ panic(fmt.Errorf("simdgen only supports op2 or op3 with TreatLikeAScalarOfSize"))
+ }
+ }
+ if i == 0 || op.Go != ops[i-1].Go {
+ if unicode.IsUpper([]rune(op.Go)[0]) {
+ fmt.Fprintf(f, "\n/* %s */\n", op.Go)
+ } else {
+ fmt.Fprintf(fI, "\n/* %s */\n", op.Go)
+ }
+ }
+ if unicode.IsUpper([]rune(op.Go)[0]) {
+ if err := t.ExecuteTemplate(f, s, op); err != nil {
+ panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
+ }
+ } else {
+ if err := t.ExecuteTemplate(fI, s, op); err != nil {
+ panic(fmt.Errorf("failed to execute template %s for op %v: %w", s, op, err))
+ }
+ }
+ } else {
+ panic(fmt.Errorf("failed to classify op %v: %w", op.Go, err))
+ }
+ }
+
+ vectorConversions := vConvertFromTypeMap(typeMap)
+ for _, conv := range vectorConversions {
+ if err := t.ExecuteTemplate(f, "vectorConversion", conv); err != nil {
+ panic(fmt.Errorf("failed to execute vectorConversion template: %w", err))
+ }
+ }
+
+ masks := masksFromTypeMap(typeMap)
+ for _, mask := range masks {
+ if err := t.ExecuteTemplate(f, "mask", mask); err != nil {
+ panic(fmt.Errorf("failed to execute mask template for mask %s: %w", mask.Name, err))
+ }
+ }
+
+ return
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "slices"
+ "strings"
+ "text/template"
+)
+
+type tplRuleData struct {
+ tplName string // e.g. "sftimm"
+ GoOp string // e.g. "ShiftAllLeft"
+ GoType string // e.g. "Uint32x8"
+ Args string // e.g. "x y"
+ Asm string // e.g. "VPSLLD256"
+ ArgsOut string // e.g. "x y"
+ MaskInConvert string // e.g. "VPMOVVec32x8ToM"
+ MaskOutConvert string // e.g. "VPMOVMToVec32x8"
+ ElementSize int // e.g. 32
+ Size int // e.g. 128
+ ArgsLoadAddr string // [Args] with its last vreg arg being a concrete "(VMOVDQUload* ptr mem)", and might contain mask.
+ ArgsAddr string // [Args] with its last vreg arg being replaced by "ptr", and might contain mask, and with a "mem" at the end.
+ FeatCheck string // e.g. "v.Block.CPUfeatures.hasFeature(CPUavx512)" -- for a ssa/_gen rules file.
+}
+
+var (
+ ruleTemplates = template.Must(template.New("simdRules").Parse(`
+{{define "pureVreg"}}({{.GoOp}}{{.GoType}} {{.Args}}) => ({{.Asm}} {{.ArgsOut}})
+{{end}}
+{{define "maskIn"}}({{.GoOp}}{{.GoType}} {{.Args}} mask) => ({{.Asm}} {{.ArgsOut}} ({{.MaskInConvert}} <types.TypeMask> mask))
+{{end}}
+{{define "maskOut"}}({{.GoOp}}{{.GoType}} {{.Args}}) => ({{.MaskOutConvert}} ({{.Asm}} {{.ArgsOut}}))
+{{end}}
+{{define "maskInMaskOut"}}({{.GoOp}}{{.GoType}} {{.Args}} mask) => ({{.MaskOutConvert}} ({{.Asm}} {{.ArgsOut}} ({{.MaskInConvert}} <types.TypeMask> mask)))
+{{end}}
+{{define "sftimm"}}({{.Asm}} x (MOVQconst [c])) => ({{.Asm}}const [uint8(c)] x)
+{{end}}
+{{define "masksftimm"}}({{.Asm}} x (MOVQconst [c]) mask) => ({{.Asm}}const [uint8(c)] x mask)
+{{end}}
+{{define "vregMem"}}({{.Asm}} {{.ArgsLoadAddr}}) && canMergeLoad(v, l) && clobber(l) => ({{.Asm}}load {{.ArgsAddr}})
+{{end}}
+{{define "vregMemFeatCheck"}}({{.Asm}} {{.ArgsLoadAddr}}) && {{.FeatCheck}} && canMergeLoad(v, l) && clobber(l)=> ({{.Asm}}load {{.ArgsAddr}})
+{{end}}
+`))
+)
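+
+// For illustration only (the concrete op names below are hypothetical): with
+// data {GoOp: "Add", GoType: "Int32x8", Args: "x y", Asm: "VPADDD256"}, the
+// "pureVreg" template expands to
+//
+//	(AddInt32x8 x y) => (VPADDD256 x y)
+//
+// (in practice [writeSIMDRules] collapses identical Args/ArgsOut to "..."),
+// and with {GoOp: "AddMasked", Asm: "VPADDDMasked256", MaskInConvert:
+// "VPMOVVec32x8ToM"} the "maskIn" template expands to
+//
+//	(AddMaskedInt32x8 x y mask) => (VPADDDMasked256 x y (VPMOVVec32x8ToM <types.TypeMask> mask))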
+
+func (d tplRuleData) MaskOptimization(asmCheck map[string]bool) string {
+ asmNoMask := d.Asm
+	if !strings.Contains(asmNoMask, "Masked") {
+ return ""
+ }
+ asmNoMask = strings.ReplaceAll(asmNoMask, "Masked", "")
+	if !asmCheck[asmNoMask] {
+ return ""
+ }
+
+ for _, nope := range []string{"VMOVDQU", "VPCOMPRESS", "VCOMPRESS", "VPEXPAND", "VEXPAND", "VPBLENDM", "VMOVUP"} {
+ if strings.HasPrefix(asmNoMask, nope) {
+ return ""
+ }
+ }
+
+ size := asmNoMask[len(asmNoMask)-3:]
+ if strings.HasSuffix(asmNoMask, "const") {
+ sufLen := len("128const")
+ size = asmNoMask[len(asmNoMask)-sufLen:][:3]
+ }
+ switch size {
+ case "128", "256", "512":
+ default:
+ panic("Unexpected operation size on " + d.Asm)
+ }
+
+ switch d.ElementSize {
+ case 8, 16, 32, 64:
+ default:
+ panic(fmt.Errorf("Unexpected operation width %d on %v", d.ElementSize, d.Asm))
+ }
+
+ return fmt.Sprintf("(VMOVDQU%dMasked%s (%s %s) mask) => (%s %s mask)\n", d.ElementSize, size, asmNoMask, d.Args, d.Asm, d.Args)
+}
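+
+// For illustration only (the opcode names are hypothetical): for d.Asm
+// "VPADDDMasked256" with ElementSize 32 and Args "x y", MaskOptimization
+// emits
+//
+//	(VMOVDQU32Masked256 (VPADDD256 x y) mask) => (VPADDDMasked256 x y mask)
+//
+// i.e. a masked copy of an unmasked op's result folds into the masked form
+// of the op.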
+
+// SSA rewrite rules need to appear in a most-to-least-specific order. This works for that.
+var tmplOrder = map[string]int{
+ "masksftimm": 0,
+ "sftimm": 1,
+ "maskInMaskOut": 2,
+ "maskOut": 3,
+ "maskIn": 4,
+ "pureVreg": 5,
+ "vregMem": 6,
+}
+
+func compareTplRuleData(x, y tplRuleData) int {
+ if c := compareNatural(x.GoOp, y.GoOp); c != 0 {
+ return c
+ }
+ if c := compareNatural(x.GoType, y.GoType); c != 0 {
+ return c
+ }
+ if c := compareNatural(x.Args, y.Args); c != 0 {
+ return c
+ }
+ if x.tplName == y.tplName {
+ return 0
+ }
+ xo, xok := tmplOrder[x.tplName]
+ yo, yok := tmplOrder[y.tplName]
+ if !xok {
+ panic(fmt.Errorf("Unexpected template name %s, please add to tmplOrder", x.tplName))
+ }
+ if !yok {
+ panic(fmt.Errorf("Unexpected template name %s, please add to tmplOrder", y.tplName))
+ }
+ return xo - yo
+}
+
+// writeSIMDRules generates the lowering and rewrite rules for ssa and writes it to simdAMD64.rules
+// within the specified directory.
+func writeSIMDRules(ops []Operation) *bytes.Buffer {
+ buffer := new(bytes.Buffer)
+ buffer.WriteString(generatedHeader + "\n")
+
+ // asm -> masked merging rules
+ maskedMergeOpts := make(map[string]string)
+ s2n := map[int]string{8: "B", 16: "W", 32: "D", 64: "Q"}
+ asmCheck := map[string]bool{}
+ var allData []tplRuleData
+ var optData []tplRuleData // for mask peephole optimizations, and other misc
+ var memOptData []tplRuleData // for memory peephole optimizations
+ memOpSeen := make(map[string]bool)
+
+ for _, opr := range ops {
+ opInShape, opOutShape, maskType, immType, gOp := opr.shape()
+ asm := machineOpName(maskType, gOp)
+ vregInCnt := len(gOp.In)
+ if maskType == OneMask {
+ vregInCnt--
+ }
+
+ data := tplRuleData{
+ GoOp: gOp.Go,
+ Asm: asm,
+ }
+
+ if vregInCnt == 1 {
+ data.Args = "x"
+ data.ArgsOut = data.Args
+ } else if vregInCnt == 2 {
+ data.Args = "x y"
+ data.ArgsOut = data.Args
+ } else if vregInCnt == 3 {
+ data.Args = "x y z"
+ data.ArgsOut = data.Args
+ } else {
+			panic(fmt.Errorf("simdgen does not support more than 3 vreg inputs"))
+ }
+ if immType == ConstImm {
+ data.ArgsOut = fmt.Sprintf("[%s] %s", *opr.In[0].Const, data.ArgsOut)
+ } else if immType == VarImm {
+ data.Args = fmt.Sprintf("[a] %s", data.Args)
+ data.ArgsOut = fmt.Sprintf("[a] %s", data.ArgsOut)
+ } else if immType == ConstVarImm {
+ data.Args = fmt.Sprintf("[a] %s", data.Args)
+ data.ArgsOut = fmt.Sprintf("[a+%s] %s", *opr.In[0].Const, data.ArgsOut)
+ }
+
+ goType := func(op Operation) string {
+ if op.OperandOrder != nil {
+ switch *op.OperandOrder {
+ case "21Type1", "231Type1":
+ // Permute uses operand[1] for method receiver.
+ return *op.In[1].Go
+ }
+ }
+ return *op.In[0].Go
+ }
+ var tplName string
+ // If class overwrite is happening, that's not really a mask but a vreg.
+ if opOutShape == OneVregOut || opOutShape == OneVregOutAtIn || gOp.Out[0].OverwriteClass != nil {
+ switch opInShape {
+ case OneImmIn:
+ tplName = "pureVreg"
+ data.GoType = goType(gOp)
+ case PureVregIn:
+ tplName = "pureVreg"
+ data.GoType = goType(gOp)
+ case OneKmaskImmIn:
+ fallthrough
+ case OneKmaskIn:
+ tplName = "maskIn"
+ data.GoType = goType(gOp)
+ rearIdx := len(gOp.In) - 1
+ // Mask is at the end.
+ width := *gOp.In[rearIdx].ElemBits
+ data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", width, *gOp.In[rearIdx].Lanes)
+ data.ElementSize = width
+ case PureKmaskIn:
+ panic(fmt.Errorf("simdgen does not support pure k mask instructions, they should be generated by compiler optimizations"))
+ }
+ } else if opOutShape == OneGregOut {
+ tplName = "pureVreg" // TODO this will be wrong
+ data.GoType = goType(gOp)
+ } else {
+ // OneKmaskOut case
+ data.MaskOutConvert = fmt.Sprintf("VPMOVMToVec%dx%d", *gOp.Out[0].ElemBits, *gOp.In[0].Lanes)
+ switch opInShape {
+ case OneImmIn:
+ fallthrough
+ case PureVregIn:
+ tplName = "maskOut"
+ data.GoType = goType(gOp)
+ case OneKmaskImmIn:
+ fallthrough
+ case OneKmaskIn:
+ tplName = "maskInMaskOut"
+ data.GoType = goType(gOp)
+ rearIdx := len(gOp.In) - 1
+ data.MaskInConvert = fmt.Sprintf("VPMOVVec%dx%dToM", *gOp.In[rearIdx].ElemBits, *gOp.In[rearIdx].Lanes)
+ case PureKmaskIn:
+ panic(fmt.Errorf("simdgen does not support pure k mask instructions, they should be generated by compiler optimizations"))
+ }
+ }
+
+ if gOp.SpecialLower != nil {
+ if *gOp.SpecialLower == "sftimm" {
+ if data.GoType[0] == 'I' {
+ // only do these for signed types, it is a duplicate rewrite for unsigned
+ sftImmData := data
+ if tplName == "maskIn" {
+ sftImmData.tplName = "masksftimm"
+ } else {
+ sftImmData.tplName = "sftimm"
+ }
+ allData = append(allData, sftImmData)
+ asmCheck[sftImmData.Asm+"const"] = true
+ }
+ } else {
+				panic("simdgen sees unknown special lower " + *gOp.SpecialLower + ", maybe implement it?")
+ }
+ }
+ if gOp.MemFeatures != nil && *gOp.MemFeatures == "vbcst" {
+ // sanity check
+ selected := true
+ for _, a := range gOp.In {
+ if a.TreatLikeAScalarOfSize != nil {
+ selected = false
+ break
+ }
+ }
+ if _, ok := memOpSeen[data.Asm]; ok {
+ selected = false
+ }
+ if selected {
+ memOpSeen[data.Asm] = true
+ lastVreg := gOp.In[vregInCnt-1]
+ // sanity check
+ if lastVreg.Class != "vreg" {
+ panic(fmt.Errorf("simdgen expects vbcst replaced operand to be a vreg, but %v found", lastVreg))
+ }
+ memOpData := data
+ // Remove the last vreg from the arg and change it to a load.
+ origArgs := data.Args[:len(data.Args)-1]
+ // Prepare imm args.
+ immArg := ""
+ immArgCombineOff := " [off] "
+ if immType != NoImm && immType != InvalidImm {
+ _, after, found := strings.Cut(origArgs, "]")
+ if found {
+ origArgs = after
+ }
+ immArg = "[c] "
+ immArgCombineOff = " [makeValAndOff(int32(int8(c)),off)] "
+ }
+ memOpData.ArgsLoadAddr = immArg + origArgs + fmt.Sprintf("l:(VMOVDQUload%d {sym} [off] ptr mem)", *lastVreg.Bits)
+ // Remove the last vreg from the arg and change it to "ptr".
+ memOpData.ArgsAddr = "{sym}" + immArgCombineOff + origArgs + "ptr"
+ if maskType == OneMask {
+ memOpData.ArgsAddr += " mask"
+ memOpData.ArgsLoadAddr += " mask"
+ }
+ memOpData.ArgsAddr += " mem"
+ if gOp.MemFeaturesData != nil {
+ _, feat2 := getVbcstData(*gOp.MemFeaturesData)
+ knownFeatChecks := map[string]string{
+ "AVX": "v.Block.CPUfeatures.hasFeature(CPUavx)",
+ "AVX2": "v.Block.CPUfeatures.hasFeature(CPUavx2)",
+ "AVX512": "v.Block.CPUfeatures.hasFeature(CPUavx512)",
+ }
+ memOpData.FeatCheck = knownFeatChecks[feat2]
+ memOpData.tplName = "vregMemFeatCheck"
+ } else {
+ memOpData.tplName = "vregMem"
+ }
+ memOptData = append(memOptData, memOpData)
+ asmCheck[memOpData.Asm+"load"] = true
+ }
+ }
+ // Generate the masked merging optimization rules
+ if gOp.hasMaskedMerging(maskType, opOutShape) {
+ // TODO: handle customized operand order and special lower.
+ maskElem := gOp.In[len(gOp.In)-1]
+ if maskElem.Bits == nil {
+ panic("mask has no bits")
+ }
+ if maskElem.ElemBits == nil {
+ panic("mask has no elemBits")
+ }
+ if maskElem.Lanes == nil {
+ panic("mask has no lanes")
+ }
+ switch *maskElem.Bits {
+ case 128, 256:
+ // VPBLENDVB cases.
+ noMaskName := machineOpName(NoMask, gOp)
+ ruleExisting, ok := maskedMergeOpts[noMaskName]
+ rule := fmt.Sprintf("(VPBLENDVB%d dst (%s %s) mask) && v.Block.CPUfeatures.hasFeature(CPUavx512) => (%sMerging dst %s (VPMOVVec%dx%dToM <types.TypeMask> mask))\n",
+ *maskElem.Bits, noMaskName, data.Args, data.Asm, data.Args, *maskElem.ElemBits, *maskElem.Lanes)
+ if ok && ruleExisting != rule {
+ panic(fmt.Sprintf("multiple masked merge rules for one op:\n%s\n%s\n", ruleExisting, rule))
+ } else {
+ maskedMergeOpts[noMaskName] = rule
+ }
+ case 512:
+ // VPBLENDM[BWDQ] cases.
+ noMaskName := machineOpName(NoMask, gOp)
+ ruleExisting, ok := maskedMergeOpts[noMaskName]
+ rule := fmt.Sprintf("(VPBLENDM%sMasked%d dst (%s %s) mask) => (%sMerging dst %s mask)\n",
+ s2n[*maskElem.ElemBits], *maskElem.Bits, noMaskName, data.Args, data.Asm, data.Args)
+ if ok && ruleExisting != rule {
+ panic(fmt.Sprintf("multiple masked merge rules for one op:\n%s\n%s\n", ruleExisting, rule))
+ } else {
+ maskedMergeOpts[noMaskName] = rule
+ }
+ }
+ }
+
+ if tplName == "pureVreg" && data.Args == data.ArgsOut {
+ data.Args = "..."
+ data.ArgsOut = "..."
+ }
+ data.tplName = tplName
+ if opr.NoGenericOps != nil && *opr.NoGenericOps == "true" ||
+ opr.SkipMaskedMethod() {
+ optData = append(optData, data)
+ continue
+ }
+ allData = append(allData, data)
+ asmCheck[data.Asm] = true
+ }
+
+ slices.SortFunc(allData, compareTplRuleData)
+
+ for _, data := range allData {
+ if err := ruleTemplates.ExecuteTemplate(buffer, data.tplName, data); err != nil {
+ panic(fmt.Errorf("failed to execute template %s for %s: %w", data.tplName, data.GoOp+data.GoType, err))
+ }
+ }
+
+ seen := make(map[string]bool)
+
+ for _, data := range optData {
+ if data.tplName == "maskIn" {
+ rule := data.MaskOptimization(asmCheck)
+ if seen[rule] {
+ continue
+ }
+ seen[rule] = true
+ buffer.WriteString(rule)
+ }
+ }
+
+ maskedMergeOptsRules := []string{}
+ for asm, rule := range maskedMergeOpts {
+ if !asmCheck[asm] {
+ continue
+ }
+ maskedMergeOptsRules = append(maskedMergeOptsRules, rule)
+ }
+ slices.Sort(maskedMergeOptsRules)
+ for _, rule := range maskedMergeOptsRules {
+ buffer.WriteString(rule)
+ }
+
+ for _, data := range memOptData {
+ if err := ruleTemplates.ExecuteTemplate(buffer, data.tplName, data); err != nil {
+ panic(fmt.Errorf("failed to execute template %s for %s: %w", data.tplName, data.Asm, err))
+ }
+ }
+
+ return buffer
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "log"
+ "strings"
+ "text/template"
+)
+
+var (
+ ssaTemplates = template.Must(template.New("simdSSA").Parse(`
+{{define "header"}}// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+package amd64
+
+import (
+ "cmd/compile/internal/ssa"
+ "cmd/compile/internal/ssagen"
+ "cmd/internal/obj"
+ "cmd/internal/obj/x86"
+)
+
+func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
+ var p *obj.Prog
+ switch v.Op {{"{"}}{{end}}
+{{define "case"}}
+ case {{.Cases}}:
+ p = {{.Helper}}(s, v)
+{{end}}
+{{define "footer"}}
+ default:
+ // Unknown reg shape
+ return false
+ }
+{{end}}
+{{define "zeroing"}}
+	// Masked operations are always compiled with zeroing.
+ switch v.Op {
+ case {{.}}:
+ x86.ParseSuffix(p, "Z")
+ }
+{{end}}
+{{define "ending"}}
+ return true
+}
+{{end}}`))
+)
+
+type tplSSAData struct {
+ Cases string
+ Helper string
+}
+
+// writeSIMDSSA generates the ssa to prog lowering codes and writes it to simdssa.go
+// within the specified directory.
+func writeSIMDSSA(ops []Operation) *bytes.Buffer {
+ var ZeroingMask []string
+ regInfoKeys := []string{
+ "v11",
+ "v21",
+ "v2k",
+ "v2kv",
+ "v2kk",
+ "vkv",
+ "v31",
+ "v3kv",
+ "v11Imm8",
+ "vkvImm8",
+ "v21Imm8",
+ "v2kImm8",
+ "v2kkImm8",
+ "v31ResultInArg0",
+ "v3kvResultInArg0",
+ "vfpv",
+ "vfpkv",
+ "vgpvImm8",
+ "vgpImm8",
+ "v2kvImm8",
+ "vkvload",
+ "v21load",
+ "v31loadResultInArg0",
+ "v3kvloadResultInArg0",
+ "v2kvload",
+ "v2kload",
+ "v11load",
+ "v11loadImm8",
+ "vkvloadImm8",
+ "v21loadImm8",
+ "v2kloadImm8",
+ "v2kkloadImm8",
+ "v2kvloadImm8",
+ "v31ResultInArg0Imm8",
+ "v31loadResultInArg0Imm8",
+ "v21ResultInArg0",
+ "v21ResultInArg0Imm8",
+ "v31x0AtIn2ResultInArg0",
+ "v2kvResultInArg0",
+ }
+ regInfoSet := map[string][]string{}
+ for _, key := range regInfoKeys {
+ regInfoSet[key] = []string{}
+ }
+
+ seen := map[string]struct{}{}
+ allUnseen := make(map[string][]Operation)
+ allUnseenCaseStr := make(map[string][]string)
+ classifyOp := func(op Operation, maskType maskShape, shapeIn inShape, shapeOut outShape, caseStr string, mem memShape) error {
+ regShape, err := op.regShape(mem)
+ if err != nil {
+ return err
+ }
+ if regShape == "v01load" {
+ regShape = "vload"
+ }
+ if shapeOut == OneVregOutAtIn {
+ regShape += "ResultInArg0"
+ }
+ if shapeIn == OneImmIn || shapeIn == OneKmaskImmIn {
+ regShape += "Imm8"
+ }
+ regShape, err = rewriteVecAsScalarRegInfo(op, regShape)
+ if err != nil {
+ return err
+ }
+ if _, ok := regInfoSet[regShape]; !ok {
+ allUnseen[regShape] = append(allUnseen[regShape], op)
+ allUnseenCaseStr[regShape] = append(allUnseenCaseStr[regShape], caseStr)
+ }
+ regInfoSet[regShape] = append(regInfoSet[regShape], caseStr)
+ if mem == NoMem && op.hasMaskedMerging(maskType, shapeOut) {
+ regShapeMerging := regShape
+ if shapeOut != OneVregOutAtIn {
+				// We have to copy the slice here because the sort will be visible to other
+				// aliases when no reslicing is happening.
+ newIn := make([]Operand, len(op.In), len(op.In)+1)
+ copy(newIn, op.In)
+ op.In = newIn
+ op.In = append(op.In, op.Out[0])
+ op.sortOperand()
+ regShapeMerging, err = op.regShape(mem)
+ regShapeMerging += "ResultInArg0"
+ }
+ if err != nil {
+ return err
+ }
+ if _, ok := regInfoSet[regShapeMerging]; !ok {
+ allUnseen[regShapeMerging] = append(allUnseen[regShapeMerging], op)
+ allUnseenCaseStr[regShapeMerging] = append(allUnseenCaseStr[regShapeMerging], caseStr+"Merging")
+ }
+ regInfoSet[regShapeMerging] = append(regInfoSet[regShapeMerging], caseStr+"Merging")
+ }
+ return nil
+ }
+ for _, op := range ops {
+ shapeIn, shapeOut, maskType, _, gOp := op.shape()
+ asm := machineOpName(maskType, gOp)
+ if _, ok := seen[asm]; ok {
+ continue
+ }
+ seen[asm] = struct{}{}
+ caseStr := fmt.Sprintf("ssa.OpAMD64%s", asm)
+ isZeroMasking := false
+ if shapeIn == OneKmaskIn || shapeIn == OneKmaskImmIn {
+ if gOp.Zeroing == nil || *gOp.Zeroing {
+ ZeroingMask = append(ZeroingMask, caseStr)
+ isZeroMasking = true
+ }
+ }
+ if err := classifyOp(op, maskType, shapeIn, shapeOut, caseStr, NoMem); err != nil {
+ panic(err)
+ }
+ if op.MemFeatures != nil && *op.MemFeatures == "vbcst" {
+ // Make a full vec memory variant
+ op = rewriteLastVregToMem(op)
+			// Ignore the error; it could be triggered by [checkVecAsScalar].
+ // TODO: make [checkVecAsScalar] aware of mem ops.
+ if err := classifyOp(op, maskType, shapeIn, shapeOut, caseStr+"load", VregMemIn); err != nil {
+ if *Verbose {
+					log.Printf("Saw error: %v", err)
+ }
+ } else if isZeroMasking {
+ ZeroingMask = append(ZeroingMask, caseStr+"load")
+ }
+ }
+ }
+ if len(allUnseen) != 0 {
+ allKeys := make([]string, 0)
+ for k := range allUnseen {
+ allKeys = append(allKeys, k)
+ }
+ panic(fmt.Errorf("unsupported register constraint for prog, please update gen_simdssa.go and amd64/ssa.go: %+v\nAll keys: %v\n, cases: %v\n", allUnseen, allKeys, allUnseenCaseStr))
+ }
+
+ buffer := new(bytes.Buffer)
+
+ if err := ssaTemplates.ExecuteTemplate(buffer, "header", nil); err != nil {
+ panic(fmt.Errorf("failed to execute header template: %w", err))
+ }
+
+ for _, regShape := range regInfoKeys {
+ // Stable traversal of regInfoSet
+ cases := regInfoSet[regShape]
+ if len(cases) == 0 {
+ continue
+ }
+ data := tplSSAData{
+ Cases: strings.Join(cases, ",\n\t\t"),
+ Helper: "simd" + capitalizeFirst(regShape),
+ }
+ if err := ssaTemplates.ExecuteTemplate(buffer, "case", data); err != nil {
+ panic(fmt.Errorf("failed to execute case template for %s: %w", regShape, err))
+ }
+ }
+
+ if err := ssaTemplates.ExecuteTemplate(buffer, "footer", nil); err != nil {
+ panic(fmt.Errorf("failed to execute footer template: %w", err))
+ }
+
+ if len(ZeroingMask) != 0 {
+ if err := ssaTemplates.ExecuteTemplate(buffer, "zeroing", strings.Join(ZeroingMask, ",\n\t\t")); err != nil {
+			panic(fmt.Errorf("failed to execute zeroing template: %w", err))
+ }
+ }
+
+ if err := ssaTemplates.ExecuteTemplate(buffer, "ending", nil); err != nil {
+		panic(fmt.Errorf("failed to execute ending template: %w", err))
+ }
+
+ return buffer
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "go/format"
+ "log"
+ "os"
+ "path/filepath"
+ "reflect"
+ "slices"
+ "sort"
+ "strings"
+ "text/template"
+ "unicode"
+)
+
+func templateOf(temp, name string) *template.Template {
+ t, err := template.New(name).Parse(temp)
+ if err != nil {
+ panic(fmt.Errorf("failed to parse template %s: %w", name, err))
+ }
+ return t
+}
+
+func createPath(goroot string, file string) (*os.File, error) {
+ fp := filepath.Join(goroot, file)
+ dir := filepath.Dir(fp)
+ err := os.MkdirAll(dir, 0755)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create directory %s: %w", dir, err)
+ }
+ f, err := os.Create(fp)
+ if err != nil {
+ return nil, fmt.Errorf("failed to create file %s: %w", fp, err)
+ }
+ return f, nil
+}
+
+func formatWriteAndClose(out *bytes.Buffer, goroot string, file string) {
+ b, err := format.Source(out.Bytes())
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "%v\n", err)
+ fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
+ fmt.Fprintf(os.Stderr, "%v\n", err)
+ panic(err)
+ } else {
+ writeAndClose(b, goroot, file)
+ }
+}
+
+func writeAndClose(b []byte, goroot string, file string) {
+ ofile, err := createPath(goroot, file)
+ if err != nil {
+ panic(err)
+ }
+ ofile.Write(b)
+ ofile.Close()
+}
+
+// numberLines takes a slice of bytes, and returns a string where each line
+// is numbered, starting from 1.
+func numberLines(data []byte) string {
+ var buf bytes.Buffer
+ r := bytes.NewReader(data)
+ s := bufio.NewScanner(r)
+ for i := 1; s.Scan(); i++ {
+ fmt.Fprintf(&buf, "%d: %s\n", i, s.Text())
+ }
+ return buf.String()
+}
+
+type inShape uint8
+type outShape uint8
+type maskShape uint8
+type immShape uint8
+type memShape uint8
+
+const (
+ InvalidIn inShape = iota
+ PureVregIn // vector register input only
+ OneKmaskIn // vector and kmask input
+ OneImmIn // vector and immediate input
+ OneKmaskImmIn // vector, kmask, and immediate inputs
+ PureKmaskIn // only mask inputs.
+)
+
+const (
+ InvalidOut outShape = iota
+ NoOut // no output
+ OneVregOut // (one) vector register output
+ OneGregOut // (one) general register output
+ OneKmaskOut // mask output
+ OneVregOutAtIn // the first input is also the output
+)
+
+const (
+ InvalidMask maskShape = iota
+ NoMask // no mask
+ OneMask // with mask (K1 to K7)
+ AllMasks // a K mask instruction (K0-K7)
+)
+
+const (
+ InvalidImm immShape = iota
+ NoImm // no immediate
+ ConstImm // const only immediate
+ VarImm // pure imm argument provided by the users
+ ConstVarImm // a combination of user arg and const
+)
+
+const (
+ InvalidMem memShape = iota
+ NoMem
+ VregMemIn // The instruction contains a mem input which is loading a vreg.
+)
+
+// shape returns several values describing the shape of the operation,
+// along with a modified copy of the op:
+//
+// opNoImm is op with its inputs excluding the const imm.
+//
+// This function does not modify op.
+func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskShape, immType immShape,
+ opNoImm Operation) {
+ if len(op.Out) > 1 {
+ panic(fmt.Errorf("simdgen only supports 1 output: %s", op))
+ }
+ var outputReg int
+ if len(op.Out) == 1 {
+ outputReg = op.Out[0].AsmPos
+ if op.Out[0].Class == "vreg" {
+ shapeOut = OneVregOut
+ } else if op.Out[0].Class == "greg" {
+ shapeOut = OneGregOut
+ } else if op.Out[0].Class == "mask" {
+ shapeOut = OneKmaskOut
+ } else {
+ panic(fmt.Errorf("simdgen only supports output of class vreg or mask: %s", op))
+ }
+ } else {
+ shapeOut = NoOut
+ // TODO: are these only Load/Stores?
+		// We support Load and Store manually; are those enough?
+		panic(fmt.Errorf("simdgen requires exactly 1 output: %s", op))
+ }
+ hasImm := false
+ maskCount := 0
+ hasVreg := false
+ for _, in := range op.In {
+ if in.AsmPos == outputReg {
+ if shapeOut != OneVregOutAtIn && in.AsmPos == 0 && in.Class == "vreg" {
+ shapeOut = OneVregOutAtIn
+ } else {
+				panic(fmt.Errorf("simdgen only supports output and input sharing the same position in the case \"the first input is a vreg and the only output\": %s", op))
+ }
+ }
+ if in.Class == "immediate" {
+			// A manual check of the XED data found that AMD64 SIMD instructions have at
+			// most 1 immediate, so we don't need to check that here.
+ if *in.Bits != 8 {
+ panic(fmt.Errorf("simdgen only supports immediates of 8 bits: %s", op))
+ }
+ hasImm = true
+ } else if in.Class == "mask" {
+ maskCount++
+ } else {
+ hasVreg = true
+ }
+ }
+ opNoImm = *op
+
+ removeImm := func(o *Operation) {
+ o.In = o.In[1:]
+ }
+ if hasImm {
+ removeImm(&opNoImm)
+ if op.In[0].Const != nil {
+ if op.In[0].ImmOffset != nil {
+ immType = ConstVarImm
+ } else {
+ immType = ConstImm
+ }
+ } else if op.In[0].ImmOffset != nil {
+ immType = VarImm
+ } else {
+ panic(fmt.Errorf("simdgen requires imm to have at least one of ImmOffset or Const set: %s", op))
+ }
+ } else {
+ immType = NoImm
+ }
+ if maskCount == 0 {
+ maskType = NoMask
+ } else {
+ maskType = OneMask
+ }
+ checkPureMask := func() bool {
+ if hasImm {
+ panic(fmt.Errorf("simdgen does not support immediates in pure mask operations: %s", op))
+ }
+ if hasVreg {
+			panic(fmt.Errorf("simdgen does not support more than 1 mask in non-pure mask operations: %s", op))
+ }
+ return false
+ }
+ if !hasImm && maskCount == 0 {
+ shapeIn = PureVregIn
+ } else if !hasImm && maskCount > 0 {
+ if maskCount == 1 {
+ shapeIn = OneKmaskIn
+ } else {
+ if checkPureMask() {
+ return
+ }
+ shapeIn = PureKmaskIn
+ maskType = AllMasks
+ }
+ } else if hasImm && maskCount == 0 {
+ shapeIn = OneImmIn
+ } else {
+ if maskCount == 1 {
+ shapeIn = OneKmaskImmIn
+ } else {
+ checkPureMask()
+ return
+ }
+ }
+ return
+}
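+
+// For illustration only: a masked binary op with inputs (vreg, vreg, mask)
+// and a vreg output is classified as shapeIn=OneKmaskIn, shapeOut=OneVregOut,
+// maskType=OneMask, immType=NoImm. An op taking an 8-bit immediate plus one
+// vreg is shapeIn=OneImmIn, maskType=NoMask, and its immType is ConstImm,
+// VarImm, or ConstVarImm depending on which of Const and ImmOffset are set
+// on the immediate operand.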
+
+// regShape returns a string representation of the register shape.
+func (op *Operation) regShape(mem memShape) (string, error) {
+ _, _, _, _, gOp := op.shape()
+ var regInfo, fixedName string
+ var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
+ for i, in := range gOp.In {
+ switch in.Class {
+ case "vreg":
+ vRegInCnt++
+ case "greg":
+ gRegInCnt++
+ case "mask":
+ kMaskInCnt++
+ case "memory":
+ if mem != VregMemIn {
+ panic("simdgen only knows VregMemIn in regShape")
+ }
+ memInCnt++
+ vRegInCnt++
+ }
+ if in.FixedReg != nil {
+ fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
+ }
+ }
+ for i, out := range gOp.Out {
+ // If class overwrite is happening, that's not really a mask but a vreg.
+ if out.Class == "vreg" || out.OverwriteClass != nil {
+ vRegOutCnt++
+ } else if out.Class == "greg" {
+ gRegOutCnt++
+ } else if out.Class == "mask" {
+ kMaskOutCnt++
+ } else if out.Class == "memory" {
+ if mem != VregMemIn {
+ panic("simdgen only knows VregMemIn in regShape")
+ }
+ vRegOutCnt++
+ memOutCnt++
+ }
+ if out.FixedReg != nil {
+ fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
+ }
+ }
+ var inRegs, inMasks, outRegs, outMasks string
+
+ rmAbbrev := func(s string, i int) string {
+ if i == 0 {
+ return ""
+ }
+ if i == 1 {
+ return s
+ }
+ return fmt.Sprintf("%s%d", s, i)
+ }
+
+ inRegs = rmAbbrev("v", vRegInCnt)
+ inRegs += rmAbbrev("gp", gRegInCnt)
+ inMasks = rmAbbrev("k", kMaskInCnt)
+
+ outRegs = rmAbbrev("v", vRegOutCnt)
+ outRegs += rmAbbrev("gp", gRegOutCnt)
+ outMasks = rmAbbrev("k", kMaskOutCnt)
+
+ if kMaskInCnt == 0 && kMaskOutCnt == 0 && gRegInCnt == 0 && gRegOutCnt == 0 {
+ // For pure v we can abbreviate it as v%d%d.
+ regInfo = fmt.Sprintf("v%d%d", vRegInCnt, vRegOutCnt)
+ } else if kMaskInCnt == 0 && kMaskOutCnt == 0 {
+ regInfo = fmt.Sprintf("%s%s", inRegs, outRegs)
+ } else {
+ regInfo = fmt.Sprintf("%s%s%s%s", inRegs, inMasks, outRegs, outMasks)
+ }
+ if memInCnt > 0 {
+ if memInCnt == 1 {
+ regInfo += "load"
+ } else {
+ panic("simdgen does not understand more than 1 mem op as of now")
+ }
+ }
+ if memOutCnt > 0 {
+ panic("simdgen does not understand memory as output as of now")
+ }
+ regInfo += fixedName
+ return regInfo, nil
+}
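+
+// For illustration only: two vreg inputs and one vreg output yield "v21";
+// two vreg inputs, one mask input, and one vreg output yield "v2kv". The
+// "load" suffix is appended here when a memory input is present; suffixes
+// such as "Imm8" and "ResultInArg0" are appended by callers such as
+// [writeSIMDSSA].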
+
+// sortOperand sorts op.In by putting immediates first, then vregs, with masks last.
+// TODO: verify that this is a safe assumption about the prog structure.
+// From observation, in asm the immediates always come first and the masks always
+// come last, with vregs in between.
+func (op *Operation) sortOperand() {
+ priority := map[string]int{"immediate": 0, "vreg": 1, "greg": 1, "mask": 2}
+ sort.SliceStable(op.In, func(i, j int) bool {
+ pi := priority[op.In[i].Class]
+ pj := priority[op.In[j].Class]
+ if pi != pj {
+ return pi < pj
+ }
+ return op.In[i].AsmPos < op.In[j].AsmPos
+ })
+}
+
+// adjustAsm adjusts the asm to make it align with Go's assembler.
+func (op *Operation) adjustAsm() {
+ if op.Asm == "VCVTTPD2DQ" || op.Asm == "VCVTTPD2UDQ" ||
+ op.Asm == "VCVTQQ2PS" || op.Asm == "VCVTUQQ2PS" ||
+ op.Asm == "VCVTPD2PS" {
+ switch *op.In[0].Bits {
+ case 128:
+ op.Asm += "X"
+ case 256:
+ op.Asm += "Y"
+ }
+ }
+}
+
+// goNormalType returns the Go type name for the result of an Op that
+// does not return a vector, i.e., that returns a result in a general
+// register. Currently there's only one family of Ops in Go's simd library
+// that does this (GetElem), and so this is specialized to work for that,
+// but the problem (mismatch between hardware register width and Go type
+// width) seems likely to recur if there are any other cases.
+func (op Operation) goNormalType() string {
+ if op.Go == "GetElem" {
+ // GetElem returns an element of the vector into a general register
+ // but as far as the hardware is concerned, that result is either 32
+ // or 64 bits wide, no matter what the vector element width is.
+ // This is not "wrong" but it is not the right answer for Go source code.
+ // To get the Go type right, combine the base type ("int", "uint", "float"),
+ // with the input vector element width in bits (8,16,32,64).
+
+ at := 0 // proper value of at depends on whether immediate was stripped or not
+ if op.In[at].Class == "immediate" {
+ at++
+ }
+ return fmt.Sprintf("%s%d", *op.Out[0].Base, *op.In[at].ElemBits)
+ }
+ panic(fmt.Errorf("Implement goNormalType for %v", op))
+}
+
+// SSAType returns the string for the type reference in SSA generation,
+// for example in the intrinsics generating template.
+func (op Operation) SSAType() string {
+ if op.Out[0].Class == "greg" {
+ return fmt.Sprintf("types.Types[types.T%s]", strings.ToUpper(op.goNormalType()))
+ }
+ return fmt.Sprintf("types.TypeVec%d", *op.Out[0].Bits)
+}
+
+// GoType returns the Go type returned by this operation (relative to the simd package),
+// for example "int32" or "Int8x16". This is used in a template.
+func (op Operation) GoType() string {
+ if op.Out[0].Class == "greg" {
+ return op.goNormalType()
+ }
+ return *op.Out[0].Go
+}
+
+// ImmName returns the name to use for an operation's immediate operand.
+// This can be overridden in the yaml with "name" on an operand;
+// otherwise, for now, it is "constant".
+func (op Operation) ImmName() string {
+ return op.Op0Name("constant")
+}
+
+func (o Operand) OpName(s string) string {
+ if n := o.Name; n != nil {
+ return *n
+ }
+ if o.Class == "mask" {
+ return "mask"
+ }
+ return s
+}
+
+func (o Operand) OpNameAndType(s string) string {
+ return o.OpName(s) + " " + *o.Go
+}
+
+// GoExported returns [Go] with first character capitalized.
+func (op Operation) GoExported() string {
+ return capitalizeFirst(op.Go)
+}
+
+// DocumentationExported returns [Documentation] with method name capitalized.
+func (op Operation) DocumentationExported() string {
+ return strings.ReplaceAll(op.Documentation, op.Go, op.GoExported())
+}
+
+// Op0Name returns the name to use for the 0 operand,
+// if any is present, otherwise the parameter is used.
+func (op Operation) Op0Name(s string) string {
+ return op.In[0].OpName(s)
+}
+
+// Op1Name returns the name to use for the 1 operand,
+// if any is present, otherwise the parameter is used.
+func (op Operation) Op1Name(s string) string {
+ return op.In[1].OpName(s)
+}
+
+// Op2Name returns the name to use for the 2 operand,
+// if any is present, otherwise the parameter is used.
+func (op Operation) Op2Name(s string) string {
+ return op.In[2].OpName(s)
+}
+
+// Op3Name returns the name to use for the 3 operand,
+// if any is present, otherwise the parameter is used.
+func (op Operation) Op3Name(s string) string {
+ return op.In[3].OpName(s)
+}
+
+// Op0NameAndType returns the name and type to use for
+// the 0 operand, if a name is provided, otherwise
+// the parameter value is used as the default.
+func (op Operation) Op0NameAndType(s string) string {
+ return op.In[0].OpNameAndType(s)
+}
+
+// Op1NameAndType returns the name and type to use for
+// the 1 operand, if a name is provided, otherwise
+// the parameter value is used as the default.
+func (op Operation) Op1NameAndType(s string) string {
+ return op.In[1].OpNameAndType(s)
+}
+
+// Op2NameAndType returns the name and type to use for
+// the 2 operand, if a name is provided, otherwise
+// the parameter value is used as the default.
+func (op Operation) Op2NameAndType(s string) string {
+ return op.In[2].OpNameAndType(s)
+}
+
+// Op3NameAndType returns the name and type to use for
+// the 3 operand, if a name is provided, otherwise
+// the parameter value is used as the default.
+func (op Operation) Op3NameAndType(s string) string {
+ return op.In[3].OpNameAndType(s)
+}
+
+// Op4NameAndType returns the name and type to use for
+// the 4 operand, if a name is provided, otherwise
+// the parameter value is used as the default.
+func (op Operation) Op4NameAndType(s string) string {
+ return op.In[4].OpNameAndType(s)
+}
+
+var immClasses = []string{"BAD0Imm", "BAD1Imm", "op1Imm8", "op2Imm8", "op3Imm8", "op4Imm8"}
+var classes = []string{"BAD0", "op1", "op2", "op3", "op4"}
+
+// classifyOp returns a classification string, modified operation, and perhaps error based
+// on the stub and intrinsic shape for the operation.
+// The classification string is in the regular expression set "op[1234](Imm8)?(_<order>)?"
+// where the "<order>" suffix is optionally attached to the Operation in its input yaml.
+// The classification string is used to select a template or a clause of a template
+// for intrinsics declaration and the ssagen intrinsics glue code in the compiler.
+func classifyOp(op Operation) (string, Operation, error) {
+ _, _, _, immType, gOp := op.shape()
+
+ var class string
+
+ if immType == VarImm || immType == ConstVarImm {
+ switch l := len(op.In); l {
+ case 1:
+ return "", op, fmt.Errorf("simdgen does not recognize this operation of only immediate input: %s", op)
+ case 2, 3, 4, 5:
+ class = immClasses[l]
+ default:
+ return "", op, fmt.Errorf("simdgen does not recognize this operation of input length %d: %s", len(op.In), op)
+ }
+ if order := op.OperandOrder; order != nil {
+ class += "_" + *order
+ }
+ return class, op, nil
+ } else {
+ switch l := len(gOp.In); l {
+ case 1, 2, 3, 4:
+ class = classes[l]
+ default:
+ return "", op, fmt.Errorf("simdgen does not recognize this operation of input length %d: %s", len(op.In), op)
+ }
+ if order := op.OperandOrder; order != nil {
+ class += "_" + *order
+ }
+ return class, gOp, nil
+ }
+}
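+
+// For illustration only: a plain two-operand method (e.g. a binary add)
+// classifies as "op2"; a method whose In is [immediate, vreg] with a
+// user-supplied imm8 classifies as "op1Imm8"; an OperandOrder of "21Type1"
+// would turn those into "op2_21Type1" and "op1Imm8_21Type1" respectively.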
+
+func checkVecAsScalar(op Operation) (idx int, err error) {
+ idx = -1
+ sSize := 0
+ for i, o := range op.In {
+ if o.TreatLikeAScalarOfSize != nil {
+ if idx == -1 {
+ idx = i
+ sSize = *o.TreatLikeAScalarOfSize
+ } else {
+ err = fmt.Errorf("simdgen only supports one TreatLikeAScalarOfSize in the arg list: %s", op)
+ return
+ }
+ }
+ }
+ if idx >= 0 {
+ if sSize != 8 && sSize != 16 && sSize != 32 && sSize != 64 {
+ err = fmt.Errorf("simdgen does not recognize this uint size: %d, %s", sSize, op)
+ return
+ }
+ }
+ return
+}
+
+func rewriteVecAsScalarRegInfo(op Operation, regInfo string) (string, error) {
+ idx, err := checkVecAsScalar(op)
+ if err != nil {
+ return "", err
+ }
+ if idx != -1 {
+ if regInfo == "v21" {
+ regInfo = "vfpv"
+ } else if regInfo == "v2kv" {
+ regInfo = "vfpkv"
+ } else if regInfo == "v31" {
+ regInfo = "v2fpv"
+ } else if regInfo == "v3kv" {
+ regInfo = "v2fpkv"
+ } else {
+ return "", fmt.Errorf("simdgen does not recognize uses of treatLikeAScalarOfSize with op regShape %s in op: %s", regInfo, op)
+ }
+ }
+ return regInfo, nil
+}
+
+func rewriteLastVregToMem(op Operation) Operation {
+ newIn := make([]Operand, len(op.In))
+ lastVregIdx := -1
+ for i := range len(op.In) {
+ newIn[i] = op.In[i]
+ if op.In[i].Class == "vreg" {
+ lastVregIdx = i
+ }
+ }
+	// vbcst operations always put their mem op in place of the last vreg.
+	if lastVregIdx == -1 {
+		panic("simdgen cannot find a vreg to rewrite to a mem operand")
+ }
+ newIn[lastVregIdx].Class = "memory"
+ op.In = newIn
+
+ return op
+}
+
+// dedup removes operations that are duplicates at the full-structure level.
+func dedup(ops []Operation) (deduped []Operation) {
+ for _, op := range ops {
+ seen := false
+ for _, dop := range deduped {
+ if reflect.DeepEqual(op, dop) {
+ seen = true
+ break
+ }
+ }
+ if !seen {
+ deduped = append(deduped, op)
+ }
+ }
+ return
+}
+
+func (op Operation) GenericName() string {
+ if op.OperandOrder != nil {
+ switch *op.OperandOrder {
+ case "21Type1", "231Type1":
+ // Permute uses operand[1] for method receiver.
+ return op.Go + *op.In[1].Go
+ }
+ }
+ if op.In[0].Class == "immediate" {
+ return op.Go + *op.In[1].Go
+ }
+ return op.Go + *op.In[0].Go
+}
+
+// dedupGodef dedups operations at the [Op.Go]+[*Op.In[0].Go] level.
+// Deduping means picking the least advanced architecture that satisfies the
+// requirement: AVX512 is least preferred.
+// If FlagReportDup is set, it instead reports the duplicates to the console
+// and returns ops unchanged.
+func dedupGodef(ops []Operation) ([]Operation, error) {
+ seen := map[string][]Operation{}
+ for _, op := range ops {
+ _, _, _, _, gOp := op.shape()
+
+ gN := gOp.GenericName()
+ seen[gN] = append(seen[gN], op)
+ }
+ if *FlagReportDup {
+ for gName, dup := range seen {
+ if len(dup) > 1 {
+ log.Printf("Duplicate for %s:\n", gName)
+ for _, op := range dup {
+ log.Printf("%s\n", op)
+ }
+ }
+ }
+ return ops, nil
+ }
+ isAVX512 := func(op Operation) bool {
+ return strings.Contains(op.CPUFeature, "AVX512")
+ }
+ deduped := []Operation{}
+ for _, dup := range seen {
+ if len(dup) > 1 {
+ slices.SortFunc(dup, func(i, j Operation) int {
+ // Put non-AVX512 candidates at the beginning
+ if !isAVX512(i) && isAVX512(j) {
+ return -1
+ }
+ if isAVX512(i) && !isAVX512(j) {
+ return 1
+ }
+ if i.CPUFeature != j.CPUFeature {
+ return strings.Compare(i.CPUFeature, j.CPUFeature)
+ }
+				// Weirdly, Intel sometimes has duplicated definitions for the same instruction.
+				// This confuses the XED mem-op merge logic: [MemFeatures] is only attached to an
+				// instruction once, which means that of essentially duplicated instructions only
+				// one will have the proper [MemFeatures] set. We have to make this sort
+				// deterministic for [MemFeatures].
+ if i.MemFeatures != nil && j.MemFeatures == nil {
+ return -1
+ }
+ if i.MemFeatures == nil && j.MemFeatures != nil {
+ return 1
+ }
+ if i.Commutative != j.Commutative {
+ if j.Commutative {
+ return -1
+ }
+ return 1
+ }
+ // Their order does not matter anymore, at least for now.
+ return 0
+ })
+ }
+ deduped = append(deduped, dup[0])
+ }
+ slices.SortFunc(deduped, compareOperations)
+ return deduped, nil
+}
+
+// copyConstImm copies op.ConstImm to op.In[0].Const.
+// This is a hack to reduce the size of defs we need for const imm operations.
+func copyConstImm(ops []Operation) error {
+ for _, op := range ops {
+ if op.ConstImm == nil {
+ continue
+ }
+ _, _, _, immType, _ := op.shape()
+
+ if immType == ConstImm || immType == ConstVarImm {
+ op.In[0].Const = op.ConstImm
+ }
+		// Otherwise, just don't port it - e.g. {VPCMP[BWDQ] imm=0} and {VPCMPEQ[BWDQ]} are
+		// the same operation "Equal"; [dedupGodef] should be able to distinguish them.
+ }
+ return nil
+}
+
+func capitalizeFirst(s string) string {
+ if s == "" {
+ return ""
+ }
+ // Convert the string to a slice of runes to handle multi-byte characters correctly.
+ r := []rune(s)
+ r[0] = unicode.ToUpper(r[0])
+ return string(r)
+}
+
+// overwrite corrects some errors due to:
+// - The XED data being wrong.
+// - Go's SIMD API requirements, for example AVX2 compares should also produce masks.
+// This rewrite has strict constraints; please see the error messages.
+// These constraints are also exploited in [writeSIMDRules], [writeSIMDMachineOps],
+// and [writeSIMDSSA], so please be careful when updating these constraints.
+func overwrite(ops []Operation) error {
+ hasClassOverwrite := false
+ overwrite := func(op []Operand, idx int, o Operation) error {
+ if op[idx].OverwriteElementBits != nil {
+ if op[idx].ElemBits == nil {
+ panic(fmt.Errorf("ElemBits is nil at operand %d of %v", idx, o))
+ }
+ *op[idx].ElemBits = *op[idx].OverwriteElementBits
+ *op[idx].Lanes = *op[idx].Bits / *op[idx].ElemBits
+ *op[idx].Go = fmt.Sprintf("%s%dx%d", capitalizeFirst(*op[idx].Base), *op[idx].ElemBits, *op[idx].Lanes)
+ }
+ if op[idx].OverwriteClass != nil {
+ if op[idx].OverwriteBase == nil {
+ panic(fmt.Errorf("simdgen: [OverwriteClass] must be set together with [OverwriteBase]: %s", op[idx]))
+ }
+ oBase := *op[idx].OverwriteBase
+ oClass := *op[idx].OverwriteClass
+ if oClass != "mask" {
+			panic(fmt.Errorf("simdgen: [Class] overwrite only supports overwriting to mask: %s", op[idx]))
+ }
+ if oBase != "int" {
+ panic(fmt.Errorf("simdgen: [Class] overwrite must set [OverwriteBase] to int: %s", op[idx]))
+ }
+ if op[idx].Class != "vreg" {
+ panic(fmt.Errorf("simdgen: [Class] overwrite must be overwriting [Class] from vreg: %s", op[idx]))
+ }
+ hasClassOverwrite = true
+ *op[idx].Base = oBase
+ op[idx].Class = oClass
+ *op[idx].Go = fmt.Sprintf("Mask%dx%d", *op[idx].ElemBits, *op[idx].Lanes)
+ } else if op[idx].OverwriteBase != nil {
+ oBase := *op[idx].OverwriteBase
+ *op[idx].Go = strings.ReplaceAll(*op[idx].Go, capitalizeFirst(*op[idx].Base), capitalizeFirst(oBase))
+ if op[idx].Class == "greg" {
+ *op[idx].Go = strings.ReplaceAll(*op[idx].Go, *op[idx].Base, oBase)
+ }
+ *op[idx].Base = oBase
+ }
+ return nil
+ }
+ for i, o := range ops {
+ hasClassOverwrite = false
+ for j := range ops[i].In {
+ if err := overwrite(ops[i].In, j, o); err != nil {
+ return err
+ }
+ if hasClassOverwrite {
+ return fmt.Errorf("simdgen does not support [OverwriteClass] in inputs: %s", ops[i])
+ }
+ }
+ for j := range ops[i].Out {
+ if err := overwrite(ops[i].Out, j, o); err != nil {
+ return err
+ }
+ }
+ if hasClassOverwrite {
+ for _, in := range ops[i].In {
+ if in.Class == "mask" {
+ return fmt.Errorf("simdgen only supports [OverwriteClass] for operations without mask inputs")
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// reportXEDInconsistency reports potential XED inconsistencies.
+// We can add more fields to [Operation] to enable more checks and implement it here.
+// Supported checks:
+// [NameAndSizeCheck]: NAME[BWDQ] should set the elemBits accordingly.
+// This check is useful for finding inconsistencies; we can then add overwrite fields to
+// those defs to correct them manually.
+func reportXEDInconsistency(ops []Operation) error {
+ for _, o := range ops {
+ if o.NameAndSizeCheck != nil {
+ suffixSizeMap := map[byte]int{'B': 8, 'W': 16, 'D': 32, 'Q': 64}
+ checkOperand := func(opr Operand) error {
+ if opr.ElemBits == nil {
+ return fmt.Errorf("simdgen expects elemBits to be set when performing NameAndSizeCheck")
+ }
+ if v, ok := suffixSizeMap[o.Asm[len(o.Asm)-1]]; !ok {
+ return fmt.Errorf("simdgen expects asm to end with [BWDQ] when performing NameAndSizeCheck")
+ } else {
+ if v != *opr.ElemBits {
+ return fmt.Errorf("simdgen finds NameAndSizeCheck inconsistency in def: %s", o)
+ }
+ }
+ return nil
+ }
+ for _, in := range o.In {
+ if in.Class != "vreg" && in.Class != "mask" {
+ continue
+ }
+ if in.TreatLikeAScalarOfSize != nil {
+ // This is an irregular operand, don't check it.
+ continue
+ }
+ if err := checkOperand(in); err != nil {
+ return err
+ }
+ }
+ for _, out := range o.Out {
+ if err := checkOperand(out); err != nil {
+ return err
+ }
+ }
+ }
+ }
+ return nil
+}
+
+func (o *Operation) hasMaskedMerging(maskType maskShape, outType outShape) bool {
+ // BLEND and VMOVDQU are not user-facing ops so we should filter them out.
+ return o.OperandOrder == nil && o.SpecialLower == nil && maskType == OneMask && outType == OneVregOut &&
+ len(o.InVariant) == 1 && !strings.Contains(o.Asm, "BLEND") && !strings.Contains(o.Asm, "VMOVDQU")
+}
+
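+// getVbcstData parses a MemFeaturesData string of the form
+// "feat1=<feature>;feat2=<feature>" (for example, hypothetically,
+// "feat1=AVX;feat2=AVX2") into its two feature components.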
+func getVbcstData(s string) (feat1Match, feat2Match string) {
+ _, err := fmt.Sscanf(s, "feat1=%[^;];feat2=%s", &feat1Match, &feat2Match)
+ if err != nil {
+ panic(err)
+ }
+ return
+}
+
+func (o Operation) String() string {
+ return pprints(o)
+}
+
+func (op Operand) String() string {
+ return pprints(op)
+}
--- /dev/null
+!import ops/*/go.yaml
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "log"
+ "regexp"
+ "slices"
+ "strconv"
+ "strings"
+ "unicode"
+
+ "simd/archsimd/_gen/unify"
+)
+
+type Operation struct {
+ rawOperation
+
+ // Go is the Go method name of this operation.
+ //
+ // It is derived from the raw Go method name by adding optional suffixes.
+ // Currently, "Masked" is the only suffix.
+ Go string
+
+ // Documentation is the doc string for this API.
+ //
+ // It is computed from the raw documentation:
+ //
+ // - "NAME" is replaced by the Go method name.
+ //
+ // - For masked operation, a sentence about masking is added.
+ Documentation string
+
+ // In is the sequence of parameters to the Go method.
+ //
+ // For masked operations, this will have the mask operand appended.
+ In []Operand
+}
+
+// rawOperation is the unifier representation of an [Operation]. It is
+// translated into a more parsed form after unifier decoding.
+type rawOperation struct {
+ Go string // Base Go method name
+
+ GoArch string // GOARCH for this definition
+ Asm string // Assembly mnemonic
+ OperandOrder *string // optional Operand order for better Go declarations
+ // Optional tag to indicate this operation is paired with special generic->machine ssa lowering rules.
+ // Should be paired with special templates in gen_simdrules.go
+ SpecialLower *string
+
+ In []Operand // Parameters
+ InVariant []Operand // Optional parameters
+ Out []Operand // Results
+ MemFeatures *string // The memory operand feature this operation supports
+ MemFeaturesData *string // Additional data associated with MemFeatures
+ Commutative bool // Commutativity
+ CPUFeature string // CPUID/Has* feature name
+ Zeroing *bool // nil => use asm suffix ".Z"; false => do not use asm suffix ".Z"
+ Documentation *string // Documentation will be appended to the stubs comments.
+ AddDoc *string // Additional doc to be appended.
+	// ConstImm is a hack to reduce the size of the defs the user writes for const-immediate
+	// operations. If present, it will be copied to [In[0].Const].
+ ConstImm *string
+ // NameAndSizeCheck is used to check [BWDQ] maps to (8|16|32|64) elemBits.
+ NameAndSizeCheck *bool
+ // If non-nil, all generation in gen_simdTypes.go and gen_intrinsics will be skipped.
+ NoTypes *string
+ // If non-nil, all generation in gen_simdGenericOps and gen_simdrules will be skipped.
+ NoGenericOps *string
+ // If non-nil, this string will be attached to the machine ssa op name. E.g. "const"
+ SSAVariant *string
+	// If true, do not emit method declarations, generic ops, or intrinsics for masked variants;
+ // DO emit the architecture-specific opcodes and optimizations.
+ HideMaskMethods *bool
+}
+
+func (o *Operation) IsMasked() bool {
+ if len(o.InVariant) == 0 {
+ return false
+ }
+ if len(o.InVariant) == 1 && o.InVariant[0].Class == "mask" {
+ return true
+ }
+ panic(fmt.Errorf("unknown inVariant"))
+}
+
+func (o *Operation) SkipMaskedMethod() bool {
+ if o.HideMaskMethods == nil {
+ return false
+ }
+ if *o.HideMaskMethods && o.IsMasked() {
+ return true
+ }
+ return false
+}
+
+var reForName = regexp.MustCompile(`\bNAME\b`)
+
+func (o *Operation) DecodeUnified(v *unify.Value) error {
+ if err := v.Decode(&o.rawOperation); err != nil {
+ return err
+ }
+
+ isMasked := o.IsMasked()
+
+ // Compute full Go method name.
+ o.Go = o.rawOperation.Go
+ if isMasked {
+ o.Go += "Masked"
+ }
+
+ // Compute doc string.
+ if o.rawOperation.Documentation != nil {
+ o.Documentation = *o.rawOperation.Documentation
+ } else {
+ o.Documentation = "// UNDOCUMENTED"
+ }
+ o.Documentation = reForName.ReplaceAllString(o.Documentation, o.Go)
+ if isMasked {
+ o.Documentation += "\n//\n// This operation is applied selectively under a write mask."
+ // Suppress generic op and method declaration for exported methods, if a mask is present.
+ if unicode.IsUpper([]rune(o.Go)[0]) {
+ trueVal := "true"
+ o.NoGenericOps = &trueVal
+ o.NoTypes = &trueVal
+ }
+ }
+ if o.rawOperation.AddDoc != nil {
+ o.Documentation += "\n" + reForName.ReplaceAllString(*o.rawOperation.AddDoc, o.Go)
+ }
+
+ o.In = append(o.rawOperation.In, o.rawOperation.InVariant...)
+
+ return nil
+}
+
+func (o *Operation) VectorWidth() int {
+ out := o.Out[0]
+ if out.Class == "vreg" {
+ return *out.Bits
+ } else if out.Class == "greg" || out.Class == "mask" {
+ for i := range o.In {
+ if o.In[i].Class == "vreg" {
+ return *o.In[i].Bits
+ }
+ }
+ }
+ panic(fmt.Errorf("Figure out what the vector width is for %v and implement it", *o))
+}
+
+// Right now simdgen computes the machine op name for most instructions
+// as $Name$OutputSize; under this naming, these instructions are "overloaded",
+// for example:
+// (Uint16x8) ConvertToInt8
+// (Uint16x16) ConvertToInt8
+// are both VPMOVWB128.
+// To make them distinguishable we need to append the input size to them as well.
+// TODO: document them well in the generated code.
+var demotingConvertOps = map[string]bool{
+ "VPMOVQD128": true, "VPMOVSQD128": true, "VPMOVUSQD128": true, "VPMOVQW128": true, "VPMOVSQW128": true,
+ "VPMOVUSQW128": true, "VPMOVDW128": true, "VPMOVSDW128": true, "VPMOVUSDW128": true, "VPMOVQB128": true,
+ "VPMOVSQB128": true, "VPMOVUSQB128": true, "VPMOVDB128": true, "VPMOVSDB128": true, "VPMOVUSDB128": true,
+ "VPMOVWB128": true, "VPMOVSWB128": true, "VPMOVUSWB128": true,
+ "VPMOVQDMasked128": true, "VPMOVSQDMasked128": true, "VPMOVUSQDMasked128": true, "VPMOVQWMasked128": true, "VPMOVSQWMasked128": true,
+ "VPMOVUSQWMasked128": true, "VPMOVDWMasked128": true, "VPMOVSDWMasked128": true, "VPMOVUSDWMasked128": true, "VPMOVQBMasked128": true,
+ "VPMOVSQBMasked128": true, "VPMOVUSQBMasked128": true, "VPMOVDBMasked128": true, "VPMOVSDBMasked128": true, "VPMOVUSDBMasked128": true,
+ "VPMOVWBMasked128": true, "VPMOVSWBMasked128": true, "VPMOVUSWBMasked128": true,
+}
+
+func machineOpName(maskType maskShape, gOp Operation) string {
+ asm := gOp.Asm
+ if maskType == OneMask {
+ asm += "Masked"
+ }
+ asm = fmt.Sprintf("%s%d", asm, gOp.VectorWidth())
+ if gOp.SSAVariant != nil {
+ asm += *gOp.SSAVariant
+ }
+ if demotingConvertOps[asm] {
+ // Need to append the size of the source as well.
+ // TODO: should be "%sto%d".
+ asm = fmt.Sprintf("%s_%d", asm, *gOp.In[0].Bits)
+ }
+ return asm
+}
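+
+// For illustration only: asm "VPADDD" with maskType OneMask and a 256-bit
+// output yields "VPADDDMasked256"; the demoting converts listed above
+// additionally get the source width appended, e.g. "VPMOVWB128" with a
+// 256-bit input becomes "VPMOVWB128_256".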
+
+func compareStringPointers(x, y *string) int {
+ if x != nil && y != nil {
+ return compareNatural(*x, *y)
+ }
+ if x == nil && y == nil {
+ return 0
+ }
+ if x == nil {
+ return -1
+ }
+ return 1
+}
+
+func compareIntPointers(x, y *int) int {
+ if x != nil && y != nil {
+ return *x - *y
+ }
+ if x == nil && y == nil {
+ return 0
+ }
+ if x == nil {
+ return -1
+ }
+ return 1
+}
+
+func compareOperations(x, y Operation) int {
+ if c := compareNatural(x.Go, y.Go); c != 0 {
+ return c
+ }
+ xIn, yIn := x.In, y.In
+
+ if len(xIn) > len(yIn) && xIn[len(xIn)-1].Class == "mask" {
+ xIn = xIn[:len(xIn)-1]
+ } else if len(xIn) < len(yIn) && yIn[len(yIn)-1].Class == "mask" {
+ yIn = yIn[:len(yIn)-1]
+ }
+
+ if len(xIn) < len(yIn) {
+ return -1
+ }
+ if len(xIn) > len(yIn) {
+ return 1
+ }
+ if len(x.Out) < len(y.Out) {
+ return -1
+ }
+ if len(x.Out) > len(y.Out) {
+ return 1
+ }
+ for i := range xIn {
+ ox, oy := &xIn[i], &yIn[i]
+ if c := compareOperands(ox, oy); c != 0 {
+ return c
+ }
+ }
+ return 0
+}
+
+func compareOperands(x, y *Operand) int {
+ if c := compareNatural(x.Class, y.Class); c != 0 {
+ return c
+ }
+ if x.Class == "immediate" {
+ return compareStringPointers(x.ImmOffset, y.ImmOffset)
+ } else {
+ if c := compareStringPointers(x.Base, y.Base); c != 0 {
+ return c
+ }
+ if c := compareIntPointers(x.ElemBits, y.ElemBits); c != 0 {
+ return c
+ }
+ if c := compareIntPointers(x.Bits, y.Bits); c != 0 {
+ return c
+ }
+ return 0
+ }
+}
+
+type Operand struct {
+ Class string // One of "mask", "immediate", "vreg", "greg", and "mem"
+
+ Go *string // Go type of this operand
+ AsmPos int // Position of this operand in the assembly instruction
+
+ Base *string // Base Go type ("int", "uint", "float")
+ ElemBits *int // Element bit width
+ Bits *int // Total vector bit width
+
+ Const *string // Optional constant value for immediates.
+ // Optional immediate arg offsets. If this field is non-nil,
+ // This operand will be an immediate operand:
+ // The compiler will right-shift the user-passed value by ImmOffset and set it as the AuxInt
+ // field of the operation.
+ ImmOffset *string
+ Name *string // optional name in the Go intrinsic declaration
+ Lanes *int // *Lanes equals Bits/ElemBits except for scalars, when *Lanes == 1
+ // TreatLikeAScalarOfSize means only the lower $TreatLikeAScalarOfSize bits of the vector
+ // is used, so at the API level we can make it just a scalar value of this size; Then we
+ // can overwrite it to a vector of the right size during intrinsics stage.
+ TreatLikeAScalarOfSize *int
+ // If non-nil, it means the [Class] field is overwritten here, right now this is used to
+ // overwrite the results of AVX2 compares to masks.
+ OverwriteClass *string
+	// If non-nil, it means the [Base] field is overwritten here. This field exists solely
+ // because Intel's XED data is inconsistent. e.g. VANDNP[SD] marks its operand int.
+ OverwriteBase *string
+	// If non-nil, it means the [ElemBits] field is overwritten. This field exists solely
+ // because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
+ // elemBits 16, which should be 8.
+ OverwriteElementBits *int
+	// FixedReg is the name of the fixed register, if any.
+ FixedReg *string
+}
+
+// isDigit returns true if the byte is an ASCII digit.
+func isDigit(b byte) bool {
+ return b >= '0' && b <= '9'
+}
+
+// compareNatural performs a "natural sort" comparison of two strings.
+// It compares non-digit sections lexicographically and digit sections
+// numerically. In the case of string-unequal "equal" strings like
+// "a01b" and "a1b", strings.Compare breaks the tie.
+//
+// It returns:
+//
+// -1 if s1 < s2
+// 0 if s1 == s2
+// +1 if s1 > s2
+func compareNatural(s1, s2 string) int {
+ i, j := 0, 0
+ len1, len2 := len(s1), len(s2)
+
+ for i < len1 && j < len2 {
+ // Find a non-digit segment or a number segment in both strings.
+ if isDigit(s1[i]) && isDigit(s2[j]) {
+ // Number segment comparison.
+ numStart1 := i
+ for i < len1 && isDigit(s1[i]) {
+ i++
+ }
+ num1, _ := strconv.Atoi(s1[numStart1:i])
+
+ numStart2 := j
+ for j < len2 && isDigit(s2[j]) {
+ j++
+ }
+ num2, _ := strconv.Atoi(s2[numStart2:j])
+
+ if num1 < num2 {
+ return -1
+ }
+ if num1 > num2 {
+ return 1
+ }
+ // If numbers are equal, continue to the next segment.
+ } else {
+ // Non-digit comparison.
+ if s1[i] < s2[j] {
+ return -1
+ }
+ if s1[i] > s2[j] {
+ return 1
+ }
+ i++
+ j++
+ }
+ }
+
+ // deal with a01b vs a1b; there needs to be an order.
+ return strings.Compare(s1, s2)
+}
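+
+// For example, compareNatural("Uint32x4", "Uint32x16") < 0 because 4 < 16
+// numerically, whereas plain strings.Compare would order "Uint32x16" first
+// because '1' < '4'.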
+
+const generatedHeader = `// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+`
+
+func writeGoDefs(path string, cl unify.Closure) error {
+ // TODO: Merge operations with the same signature but multiple
+ // implementations (e.g., SSE vs AVX)
+ var ops []Operation
+ for def := range cl.All() {
+ var op Operation
+ if !def.Exact() {
+ continue
+ }
+ if err := def.Decode(&op); err != nil {
+ log.Println(err.Error())
+ log.Println(def)
+ continue
+ }
+ // TODO: verify that this is safe.
+ op.sortOperand()
+ op.adjustAsm()
+ ops = append(ops, op)
+ }
+ slices.SortFunc(ops, compareOperations)
+ // The parsed XED data might contain duplicates, like
+ // 512 bits VPADDP.
+ deduped := dedup(ops)
+ slices.SortFunc(deduped, compareOperations)
+
+ if *Verbose {
+ log.Printf("dedup len: %d\n", len(ops))
+ }
+ var err error
+ if err = overwrite(deduped); err != nil {
+ return err
+ }
+ if *Verbose {
+ log.Printf("dedup len: %d\n", len(deduped))
+ }
+ if !*FlagNoDedup {
+ // TODO: This can hide mistakes in the API definitions, especially when
+ // multiple patterns result in the same API unintentionally. Make it stricter.
+ if deduped, err = dedupGodef(deduped); err != nil {
+ return err
+ }
+ }
+ if *Verbose {
+ log.Printf("dedup len: %d\n", len(deduped))
+ }
+ if !*FlagNoConstImmPorting {
+ if err = copyConstImm(deduped); err != nil {
+ return err
+ }
+ }
+ if *Verbose {
+ log.Printf("dedup len: %d\n", len(deduped))
+ }
+ reportXEDInconsistency(deduped)
+ typeMap := parseSIMDTypes(deduped)
+
+ formatWriteAndClose(writeSIMDTypes(typeMap), path, "src/"+simdPackage+"/types_amd64.go")
+ formatWriteAndClose(writeSIMDFeatures(deduped), path, "src/"+simdPackage+"/cpu.go")
+ f, fI := writeSIMDStubs(deduped, typeMap)
+ formatWriteAndClose(f, path, "src/"+simdPackage+"/ops_amd64.go")
+ formatWriteAndClose(fI, path, "src/"+simdPackage+"/ops_internal_amd64.go")
+ formatWriteAndClose(writeSIMDIntrinsics(deduped, typeMap), path, "src/cmd/compile/internal/ssagen/simdintrinsics.go")
+ formatWriteAndClose(writeSIMDGenericOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdgenericOps.go")
+ formatWriteAndClose(writeSIMDMachineOps(deduped), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go")
+ formatWriteAndClose(writeSIMDSSA(deduped), path, "src/cmd/compile/internal/amd64/simdssa.go")
+ writeAndClose(writeSIMDRules(deduped).Bytes(), path, "src/cmd/compile/internal/ssa/_gen/simdAMD64.rules")
+
+ return nil
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// simdgen is an experiment in generating Go <-> asm SIMD mappings.
+//
+// Usage: simdgen [-xedPath=path] [-q=query] input.yaml...
+//
+// If -xedPath is provided, one of the inputs is a sum of op-code definitions
+// generated from the Intel XED data at path.
+//
+// If input YAML files are provided, each file is read as an input value. See
+// [unify.Closure.UnmarshalYAML] or "go doc unify.Closure.UnmarshalYAML" for the
+// format of these files.
+//
+// TODO: Example definitions and values.
+//
+// The command unifies across all of the inputs and prints all possible results
+// of this unification.
+//
+// If the -q flag is provided, its string value is parsed as a value and treated
+// as another input to unification. This is intended as a way to "query" the
+// result, typically by narrowing it down to a small subset of results.
+//
+// Typical usage:
+//
+// go run . -xedPath $XEDPATH *.yaml
+//
+// To see just the definitions generated from XED, run:
+//
+// go run . -xedPath $XEDPATH
+//
+// (This works because if there's only one input, there's nothing to unify it
+// with, so the result is simply itself.)
+//
+// To see just the definitions for VPADDQ:
+//
+// go run . -xedPath $XEDPATH -q '{asm: VPADDQ}'
+//
+// simdgen can also generate Go definitions of SIMD mappings.
+// To generate Go files into the Go root, run:
+//
+// go run . -xedPath $XEDPATH -o godefs -goroot $PATH/TO/go go.yaml categories.yaml types.yaml
+//
+// types.yaml is already written; it specifies the shapes of vectors.
+// categories.yaml and go.yaml contain definitions that unify with types.yaml and the XED
+// data; you can find an example in ops/AddSub/.
+//
+// When generating Go definitions, simdgen performs 3 "magics":
+// - It splits masked operations (those with the op's [Masked] field set) into const and non-const variants:
+//   - One is a normal masked operation, the original.
+//   - The other has its mask operand's [Const] field set to "K0".
+//   - This way the user does not need to provide a separate "K0"-masked operation def.
+//
+// - It deduplicates intrinsic names that have duplicates:
+//   - If two operations share the same signature, one from AVX512 and the other
+//     from before AVX512, the pre-AVX512 one is selected.
+//   - This happens often when some operations are defined both before and after AVX512.
+//     This way the user does not need to provide a separate "K0" operation for the
+//     AVX512 counterpart.
+//
+// - It copies the op's [ConstImm] field to its immediate operand's [Const] field
+//   (see the sketch below this list):
+//   - This way the user does not need to provide verbose op definitions that differ only
+//     in the const immediate field. This is useful for reducing the verbosity of
+//     compares with imm control predicates.
+//
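+// For example (an illustrative sketch; see the compare category definitions for
+// real instances), a category definition like
+//
+//	- go: Greater
+//	  constImm: 14
+//
+// unified with an instruction definition whose immediate operand carries
+// "const: 0" as a placeholder yields an operation whose immediate operand ends up
+// with its [Const] field set to 14.
+//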
+// These 3 magics can be disabled with the -nosplitmask, -nodedup, or
+// -noconstimmporting flags.
+//
+// simdgen currently only supports amd64; -arch=$OTHERARCH will trigger a fatal error.
+package main
+
+// Big TODOs:
+//
+// - This can produce duplicates, which can also lead to less efficient
+// environment merging. Add hashing and use it for deduplication. Be careful
+// about how this shows up in debug traces, since it could make things
+// confusing if we don't show it happening.
+//
+// - Do I need Closure, Value, and Domain? It feels like I should only need two
+// types.
+
+import (
+ "cmp"
+ "flag"
+ "fmt"
+ "log"
+ "maps"
+ "os"
+ "path/filepath"
+ "runtime/pprof"
+ "slices"
+ "strings"
+
+ "simd/archsimd/_gen/unify"
+
+ "gopkg.in/yaml.v3"
+)
+
+var (
+ xedPath = flag.String("xedPath", "", "load XED datafiles from `path`")
+ flagQ = flag.String("q", "", "query: read `def` as another input (skips final validation)")
+ flagO = flag.String("o", "yaml", "output type: yaml, godefs (generate definitions into a Go source tree)")
+ flagGoDefRoot = flag.String("goroot", ".", "the path to the Go dev directory that will receive the generated files")
+ FlagNoDedup = flag.Bool("nodedup", false, "disable deduplicating godefs of 2 qualifying operations from different extensions")
+ FlagNoConstImmPorting = flag.Bool("noconstimmporting", false, "disable const immediate porting from op to imm operand")
+ FlagArch = flag.String("arch", "amd64", "the target architecture")
+
+ Verbose = flag.Bool("v", false, "verbose")
+
+ flagDebugXED = flag.Bool("debug-xed", false, "show XED instructions")
+ flagDebugUnify = flag.Bool("debug-unify", false, "print unification trace")
+ flagDebugHTML = flag.String("debug-html", "", "write unification trace to `file.html`")
+ FlagReportDup = flag.Bool("reportdup", false, "report the duplicate godefs")
+
+ flagCPUProfile = flag.String("cpuprofile", "", "write CPU profile to `file`")
+ flagMemProfile = flag.String("memprofile", "", "write memory profile to `file`")
+)
+
+const simdPackage = "simd/archsimd"
+
+func main() {
+ flag.Parse()
+
+ if *flagCPUProfile != "" {
+ f, err := os.Create(*flagCPUProfile)
+ if err != nil {
+ log.Fatalf("-cpuprofile: %s", err)
+ }
+ defer f.Close()
+ pprof.StartCPUProfile(f)
+ defer pprof.StopCPUProfile()
+ }
+ if *flagMemProfile != "" {
+ f, err := os.Create(*flagMemProfile)
+ if err != nil {
+ log.Fatalf("-memprofile: %s", err)
+ }
+ defer func() {
+ pprof.WriteHeapProfile(f)
+ f.Close()
+ }()
+ }
+
+ var inputs []unify.Closure
+
+ if *FlagArch != "amd64" {
+ log.Fatalf("simdgen only supports amd64")
+ }
+
+ // Load XED into a defs set.
+ if *xedPath != "" {
+ xedDefs := loadXED(*xedPath)
+ inputs = append(inputs, unify.NewSum(xedDefs...))
+ }
+
+ // Load query.
+ if *flagQ != "" {
+ r := strings.NewReader(*flagQ)
+ def, err := unify.Read(r, "<query>", unify.ReadOpts{})
+ if err != nil {
+ log.Fatalf("parsing -q: %s", err)
+ }
+ inputs = append(inputs, def)
+ }
+
+ // Load defs files.
+ must := make(map[*unify.Value]struct{})
+ for _, path := range flag.Args() {
+ defs, err := unify.ReadFile(path, unify.ReadOpts{})
+ if err != nil {
+ log.Fatal(err)
+ }
+ inputs = append(inputs, defs)
+
+ if filepath.Base(path) == "go.yaml" {
+ // These must all be used in the final result
+ for def := range defs.Summands() {
+ must[def] = struct{}{}
+ }
+ }
+ }
+
+ // Prepare for unification
+ if *flagDebugUnify {
+ unify.Debug.UnifyLog = os.Stderr
+ }
+ if *flagDebugHTML != "" {
+ f, err := os.Create(*flagDebugHTML)
+ if err != nil {
+ log.Fatal(err)
+ }
+ unify.Debug.HTML = f
+ defer f.Close()
+ }
+
+ // Unify!
+ unified, err := unify.Unify(inputs...)
+ if err != nil {
+ log.Fatal(err)
+ }
+
+ // Validate results.
+ //
+ // Don't validate if this is a command-line query because that tends to
+ // eliminate lots of required defs and is used in cases where maybe defs
+ // aren't enumerable anyway.
+ if *flagQ == "" && len(must) > 0 {
+ validate(unified, must)
+ }
+
+ // Print results.
+ switch *flagO {
+ case "yaml":
+ // Produce a result that looks like encoding a slice, but stream it.
+ fmt.Println("!sum")
+ var val1 [1]*unify.Value
+ for val := range unified.All() {
+ val1[0] = val
+ // We have to make a new encoder each time or it'll print a document
+ // separator between each object.
+ enc := yaml.NewEncoder(os.Stdout)
+ if err := enc.Encode(val1); err != nil {
+ log.Fatal(err)
+ }
+ enc.Close()
+ }
+ case "godefs":
+ if err := writeGoDefs(*flagGoDefRoot, unified); err != nil {
+ log.Fatalf("Failed writing godefs: %+v", err)
+ }
+ }
+
+ if !*Verbose && *xedPath != "" {
+ if operandRemarks == 0 {
+ fmt.Fprintf(os.Stderr, "XED decoding generated no errors, which is unusual.\n")
+ } else {
+ fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
+ }
+ }
+}
+
+func validate(cl unify.Closure, required map[*unify.Value]struct{}) {
+ // Validate that:
+ // 1. All final defs are exact
+ // 2. All required defs are used
+ for def := range cl.All() {
+ if _, ok := def.Domain.(unify.Def); !ok {
+ fmt.Fprintf(os.Stderr, "%s: expected Def, got %T\n", def.PosString(), def.Domain)
+ continue
+ }
+
+ if !def.Exact() {
+ fmt.Fprintf(os.Stderr, "%s: def not reduced to an exact value, why is %s:\n", def.PosString(), def.WhyNotExact())
+ fmt.Fprintf(os.Stderr, "\t%s\n", strings.ReplaceAll(def.String(), "\n", "\n\t"))
+ }
+
+ for root := range def.Provenance() {
+ delete(required, root)
+ }
+ }
+ // Report unused defs
+ unused := slices.SortedFunc(maps.Keys(required),
+ func(a, b *unify.Value) int {
+ return cmp.Or(
+ cmp.Compare(a.Pos().Path, b.Pos().Path),
+ cmp.Compare(a.Pos().Line, b.Pos().Line),
+ )
+ })
+ for _, def := range unused {
+ // TODO: Can we say anything more actionable? This is always a problem
+ // with unification: if it fails, it's very hard to point a finger at
+ // any particular reason. We could go back and try unifying this again
+ // with each subset of the inputs (starting with individual inputs) to
+ // at least say "it doesn't unify with anything in x.yaml". That's a lot
+ // of work, but if we have trouble debugging unification failure it may
+ // be worth it.
+ fmt.Fprintf(os.Stderr, "%s: def required, but did not unify (%v)\n",
+ def.PosString(), def)
+ }
+}
--- /dev/null
+!sum
+- go: Add
+ commutative: true
+ documentation: !string |-
+ // NAME adds corresponding elements of two vectors.
+- go: AddSaturated
+ commutative: true
+ documentation: !string |-
+ // NAME adds corresponding elements of two vectors with saturation.
+- go: Sub
+ commutative: false
+ documentation: !string |-
+ // NAME subtracts corresponding elements of two vectors.
+- go: SubSaturated
+ commutative: false
+ documentation: !string |-
+ // NAME subtracts corresponding elements of two vectors with saturation.
+- go: AddPairs
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements.
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+- go: SubPairs
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements.
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+- go: AddPairsSaturated
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements with saturation.
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+- go: SubPairsSaturated
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements with saturation.
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
--- /dev/null
+!sum
+# Add
+- go: Add
+ asm: "VPADD[BWDQ]|VADDP[SD]"
+ in:
+ - &any
+ go: $t
+ - *any
+ out:
+ - *any
+# Add Saturated
+- go: AddSaturated
+ asm: "VPADDS[BWDQ]"
+ in:
+ - &int
+ go: $t
+ base: int
+ - *int
+ out:
+ - *int
+- go: AddSaturated
+ asm: "VPADDUS[BWDQ]"
+ in:
+ - &uint
+ go: $t
+ base: uint
+ - *uint
+ out:
+ - *uint
+
+# Sub
+- go: Sub
+ asm: "VPSUB[BWDQ]|VSUBP[SD]"
+ in: &2any
+ - *any
+ - *any
+ out: &1any
+ - *any
+# Sub Saturated
+- go: SubSaturated
+ asm: "VPSUBS[BWDQ]"
+ in: &2int
+ - *int
+ - *int
+ out: &1int
+ - *int
+- go: SubSaturated
+ asm: "VPSUBUS[BWDQ]"
+ in:
+ - *uint
+ - *uint
+ out:
+ - *uint
+- go: AddPairs
+ asm: "VPHADD[DW]"
+ in: *2any
+ out: *1any
+- go: SubPairs
+ asm: "VPHSUB[DW]"
+ in: *2any
+ out: *1any
+- go: AddPairs
+ asm: "VHADDP[SD]" # floats
+ in: *2any
+ out: *1any
+- go: SubPairs
+ asm: "VHSUBP[SD]" # floats
+ in: *2any
+ out: *1any
+- go: AddPairsSaturated
+ asm: "VPHADDS[DW]"
+ in: *2int
+ out: *1int
+- go: SubPairsSaturated
+ asm: "VPHSUBS[DW]"
+ in: *2int
+ out: *1int
--- /dev/null
+!sum
+- go: And
+ commutative: true
+ documentation: !string |-
+ // NAME performs a bitwise AND operation between two vectors.
+- go: Or
+ commutative: true
+ documentation: !string |-
+ // NAME performs a bitwise OR operation between two vectors.
+- go: AndNot
+ commutative: false
+ documentation: !string |-
+ // NAME performs a bitwise x &^ y.
+- go: Xor
+ commutative: true
+ documentation: !string |-
+ // NAME performs a bitwise XOR operation between two vectors.
+- go: tern
+ commutative: false
+ documentation: !string |-
+ // NAME performs a logical operation on three vectors based on the 8-bit truth table.
+ // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
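+# For example (illustrative): a truth table of 0xC0 (bits 6 and 7 set) yields x AND y,
+# since 1 & (0xC0 >> (x<<2 + y<<1 + z)) is 1 exactly when both x and y are 1.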
+
+# We also have PTEST and VPTERNLOG; those should be hidden from users
+# and only appear in rewrite rules.
--- /dev/null
+!sum
+# In the XED data, *all* floating point bitwise logic operations have their
+# operand type marked as uint. We are not trying to understand why Intel
+# decided that they want FP bit-wise logic operations, but this irregularity
+# has to be dealt with in separate rules with some overwrites.
+
+# For many bit-wise operations, we have the following non-orthogonal
+# choices:
+#
+# - Non-masked AVX operations have no element width (because it
+# doesn't matter), but only cover 128 and 256 bit vectors.
+#
+# - Masked AVX-512 operations have an element width (because it needs
+# to know how to interpret the mask), and cover 128, 256, and 512 bit
+# vectors. These only cover 32- and 64-bit element widths.
+#
+# - Non-masked AVX-512 operations still have an element width (because
+# they're just the masked operations with an implicit K0 mask) but it
+# doesn't matter! This is the only option for non-masked 512 bit
+# operations, and we can pick any of the element widths.
+#
+# We unify with ALL of these operations and the compiler generator
+# picks when there are multiple options.
+
+# TODO: We don't currently generate unmasked bit-wise operations on 512 bit
+# vectors of 8- or 16-bit elements. AVX-512 only has *masked* bit-wise
+# operations for 32- and 64-bit elements; while the element width doesn't matter
+# for unmasked operations, right now we don't realize that we can just use the
+# 32- or 64-bit version for the unmasked form. Maybe in the XED decoder we
+# should recognize bit-wise operations when generating unmasked versions and
+# omit the element width.
+
+# For binary operations, we constrain their two inputs and one output to the
+# same Go type using a variable.
+
+- go: And
+ asm: "VPAND[DQ]?"
+ in:
+ - &any
+ go: $t
+ - *any
+ out:
+ - *any
+
+- go: And
+ asm: "VPANDD" # Fill in the gap, And is missing for Uint8x64 and Int8x64
+ inVariant: []
+ in: &twoI8x64
+ - &i8x64
+ go: $t
+ overwriteElementBits: 8
+ - *i8x64
+ out: &oneI8x64
+ - *i8x64
+
+- go: And
+ asm: "VPANDD" # Fill in the gap, And is missing for Uint16x32 and Int16x32
+ inVariant: []
+ in: &twoI16x32
+ - &i16x32
+ go: $t
+ overwriteElementBits: 16
+ - *i16x32
+ out: &oneI16x32
+ - *i16x32
+
+- go: AndNot
+ asm: "VPANDN[DQ]?"
+ operandOrder: "21" # switch the arg order
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+
+- go: AndNot
+ asm: "VPANDND" # Fill in the gap, AndNot is missing for Uint8x64 and Int8x64
+ operandOrder: "21" # switch the arg order
+ inVariant: []
+ in: *twoI8x64
+ out: *oneI8x64
+
+- go: AndNot
+ asm: "VPANDND" # Fill in the gap, AndNot is missing for Uint16x32 and Int16x32
+ operandOrder: "21" # switch the arg order
+ inVariant: []
+ in: *twoI16x32
+ out: *oneI16x32
+
+- go: Or
+ asm: "VPOR[DQ]?"
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+
+- go: Or
+ asm: "VPORD" # Fill in the gap, Or is missing for Uint8x64 and Int8x64
+ inVariant: []
+ in: *twoI8x64
+ out: *oneI8x64
+
+- go: Or
+ asm: "VPORD" # Fill in the gap, Or is missing for Uint16x32 and Int16x32
+ inVariant: []
+ in: *twoI16x32
+ out: *oneI16x32
+
+- go: Xor
+ asm: "VPXOR[DQ]?"
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+
+- go: Xor
+ asm: "VPXORD" # Fill in the gap, Or is missing for Uint8x64 and Int8x64
+ inVariant: []
+ in: *twoI8x64
+ out: *oneI8x64
+
+- go: Xor
+ asm: "VPXORD" # Fill in the gap, Or is missing for Uint16x32 and Int16x32
+ inVariant: []
+ in: *twoI16x32
+ out: *oneI16x32
+
+- go: tern
+ asm: "VPTERNLOGD|VPTERNLOGQ"
+ in:
+ - &tern_op
+ go: $t
+ - *tern_op
+ - *tern_op
+ - class: immediate
+ immOffset: 0
+ name: table
+ inVariant: []
+ out:
+ - *tern_op
--- /dev/null
+!sum
+# const imm predicate(holds for both float and int|uint):
+# 0: Equal
+# 1: Less
+# 2: LessEqual
+# 4: NotEqual
+# 5: GreaterEqual
+# 6: Greater
+- go: Equal
+ constImm: 0
+ commutative: true
+ documentation: !string |-
+ // NAME returns x equals y, elementwise.
+- go: Less
+ constImm: 1
+ commutative: false
+ documentation: !string |-
+ // NAME returns x less-than y, elementwise.
+- go: LessEqual
+ constImm: 2
+ commutative: false
+ documentation: !string |-
+ // NAME returns x less-than-or-equals y, elementwise.
+- go: IsNan # For float only.
+ constImm: 3
+ commutative: true
+ documentation: !string |-
+ // NAME checks if elements are NaN. Use as x.IsNan(x).
+- go: NotEqual
+ constImm: 4
+ commutative: true
+ documentation: !string |-
+ // NAME returns x not-equals y, elementwise.
+- go: GreaterEqual
+ constImm: 13
+ commutative: false
+ documentation: !string |-
+ // NAME returns x greater-than-or-equals y, elementwise.
+- go: Greater
+ constImm: 14
+ commutative: false
+ documentation: !string |-
+ // NAME returns x greater-than y, elementwise.
--- /dev/null
+!sum
+# Ints
+- go: Equal
+ asm: "V?PCMPEQ[BWDQ]"
+ in:
+ - &any
+ go: $t
+ - *any
+ out:
+ - &anyvregToMask
+ go: $t
+ overwriteBase: int
+ overwriteClass: mask
+- go: Greater
+ asm: "V?PCMPGT[BWDQ]"
+ in:
+ - &int
+ go: $t
+ base: int
+ - *int
+ out:
+ - *anyvregToMask
+# 256-bit VPCMPGTQ's output elemBits is marked 32-bit in the XED data; we
+# believe this is an error, so we add this definition to overwrite it.
+- go: Greater
+ asm: "VPCMPGTQ"
+ in:
+ - &int64
+ go: $t
+ base: int
+ elemBits: 64
+ - *int64
+ out:
+ - base: int
+ elemBits: 32
+ overwriteElementBits: 64
+ overwriteClass: mask
+ overwriteBase: int
+
+# TODO these are redundant with VPCMP operations.
+# AVX-512 compares produce masks.
+- go: Equal
+ asm: "V?PCMPEQ[BWDQ]"
+ in:
+ - *any
+ - *any
+ out:
+ - class: mask
+- go: Greater
+ asm: "V?PCMPGT[BWDQ]"
+ in:
+ - *int
+ - *int
+ out:
+ - class: mask
+
+# MASKED signed comparisons for X/Y registers
+# unmasked would clash with emulations on AVX2
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
+ regexpTag: "compares"
+ asm: "VPCMP[BWDQ]"
+ in:
+ - &int
+ bits: (128|256)
+ go: $t
+ base: int
+ - *int
+ - class: immediate
+ const: 0 # Just a placeholder, will be overwritten by const imm porting.
+ inVariant:
+ - class: mask
+ out:
+ - class: mask
+
+# MASKED unsigned comparisons for X/Y registers
+# unmasked would clash with emulations on AVX2
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
+ regexpTag: "compares"
+ asm: "VPCMPU[BWDQ]"
+ in:
+ - &uint
+ bits: (128|256)
+ go: $t
+ base: uint
+ - *uint
+ - class: immediate
+ const: 0
+ inVariant:
+ - class: mask
+ out:
+ - class: mask
+
+# masked/unmasked signed comparisons for Z registers
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
+ regexpTag: "compares"
+ asm: "VPCMP[BWDQ]"
+ in:
+ - &int
+ bits: 512
+ go: $t
+ base: int
+ - *int
+ - class: immediate
+ const: 0 # Just a placeholder, will be overwritten by const imm porting.
+ out:
+ - class: mask
+
+# masked/unmasked unsigned comparisons for Z registers
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual)
+ regexpTag: "compares"
+ asm: "VPCMPU[BWDQ]"
+ in:
+ - &uint
+ bits: 512
+ go: $t
+ base: uint
+ - *uint
+ - class: immediate
+ const: 0
+ out:
+ - class: mask
+
+# Floats
+- go: Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan
+ regexpTag: "compares"
+ asm: "VCMPP[SD]"
+ in:
+ - &float
+ go: $t
+ base: float
+ - *float
+ - class: immediate
+ const: 0
+ out:
+ - go: $t
+ overwriteBase: int
+ overwriteClass: mask
+- go: (Equal|Greater|Less|LessEqual|GreaterEqual|NotEqual|IsNan)
+ regexpTag: "compares"
+ asm: "VCMPP[SD]"
+ in:
+ - *float
+ - *float
+ - class: immediate
+ const: 0
+ out:
+ - class: mask
\ No newline at end of file
--- /dev/null
+!sum
+# Float <-> Int conversions
+- go: "ConvertToInt32"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int32.
+ // When a conversion is inexact, a truncated (round toward zero) value is returned.
+ // If a converted result cannot be represented in int32, an implementation-defined
+ // architecture-specific value is returned.
+- go: "ConvertToUint32"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to uint32.
+ // When a conversion is inexact, a truncated (round toward zero) value is returned.
+ // If a converted result cannot be represented in uint32, an implementation-defined
+ // architecture-specific value is returned.
+- go: "ConvertToInt64"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int64.
+ // When a conversion is inexact, a truncated (round toward zero) value is returned.
+ // If a converted result cannot be represented in int64, an implementation-defined
+ // architecture-specific value is returned.
+- go: "ConvertToUint64"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to uint64.
+ // When a conversion is inexact, a truncated (round toward zero) value is returned.
+ // If a converted result cannot be represented in uint64, an implementation-defined
+ // architecture-specific value is returned.
+- go: "ConvertToFloat32" # Also float64 -> float32
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to float32.
+- go: "ConvertToFloat64" # Also float32 -> float64
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to float64.
+
+# Int <-> Int conversions
+- go: "(Extend|Saturate|Truncate)?ToInt8"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int8.
+- go: "(Extend|Saturate|Truncate)?ToInt16(Concat)?"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int16.
+- go: "(Extend|Saturate|Truncate)?ToInt32"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int32.
+- go: "(Extend|Saturate|Truncate)?ToInt64"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to int64.
+- go: "(Extend|Saturate|Truncate)?ToUint8"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to uint8.
+- go: "(Extend|Saturate|Truncate)?ToUint16(Concat)?"
+ commutative: false
+ regexpTag: "convert"
+ documentation: !string |-
+ // NAME converts element values to uint16.
+- go: "(Extend|Saturate|Truncate)?ToUint32"
+ regexpTag: "convert"
+ commutative: false
+ documentation: !string |-
+ // NAME converts element values to uint32.
+- go: "(Extend|Saturate|Truncate)?ToUint64"
+ regexpTag: "convert"
+ commutative: false
+ documentation: !string |-
+ // NAME converts element values to uint64.
+# low-part only Int <-> Int conversions
+- go: ExtendLo8ToUint16x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to uint16.
+- go: ExtendLo8ToInt16x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to int16.
+- go: ExtendLo4ToUint32x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to uint32.
+- go: ExtendLo4ToInt32x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to int32.
+- go: ExtendLo2ToUint64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to int64.
+- go: ExtendLo2ToUint64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to uint64.
+- go: ExtendLo4ToUint64x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to int64.
+- go: ExtendLo4ToInt64x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to int64.
+- go: ExtendLo4ToUint32x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to uint32.
+- go: ExtendLo8ToUint32x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to uint32.
+- go: ExtendLo4ToInt32x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to int32.
+- go: ExtendLo8ToInt32x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to int32.
+- go: ExtendLo2ToUint64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to uint64.
+- go: ExtendLo4ToUint64x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to uint64.
+- go: ExtendLo8ToUint64x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to uint64.
+- go: ExtendLo2ToInt64x2
+ commutative: false
+ documentation: !string |-
+ // NAME converts 2 lowest vector element values to int64.
+- go: ExtendLo4ToInt64x4
+ commutative: false
+ documentation: !string |-
+ // NAME converts 4 lowest vector element values to int64.
+- go: ExtendLo8ToInt64x8
+ commutative: false
+ documentation: !string |-
+ // NAME converts 8 lowest vector element values to int64.
\ No newline at end of file
--- /dev/null
+!sum
+# Float <-> Int conversions
+# float32 -> int32
+- go: ConvertToInt32
+ regexpTag: "convert"
+ asm: "VCVTTP[SD]2DQ"
+ in:
+ - &fp
+ go: $t
+ base: float
+ out:
+ - &i32
+ go: $u
+ base: int
+ elemBits: 32
+# float32 -> uint32
+- go: ConvertToUint32
+ regexpTag: "convert"
+ asm: "VCVTTP[SD]2UDQ"
+ in:
+ - *fp
+ out:
+ - &u32
+ go: $u
+ base: uint
+ elemBits: 32
+# float32|float64 -> int64
+- go: ConvertToInt64
+ regexpTag: "convert"
+ asm: "VCVTTPD2QQ"
+ in:
+ - *fp
+ out:
+ - &i64
+ go: $u
+ base: int
+ elemBits: 64
+- go: ConvertToInt64
+ regexpTag: "convert"
+ asm: "VCVTTPS2QQ"
+ in:
+ - *fp
+ out:
+ - go: $u
+ base: int
+ elemBits: 64
+ bits: 256|512
+# float32|float64 -> uint64
+- go: ConvertToUint64
+ regexpTag: "convert"
+ asm: "VCVTTPD2UQQ"
+ in:
+ - *fp
+ out:
+ - &u64
+ go: $u
+ base: uint
+ elemBits: 64
+- go: ConvertToUint64
+ regexpTag: "convert"
+ asm: "VCVTTPS2UQQ"
+ in:
+ - *fp
+ out:
+ - go: $u
+ base: uint
+ elemBits: 64
+ bits: 256|512
+# int -> float32
+- go: ConvertToFloat32
+ regexpTag: "convert"
+ asm: "VCVT[DQ]Q2PS"
+ in: &int
+ - go: $i
+ base: int
+ out:
+ - *fp
+# int -> float64
+- go: ConvertToFloat64
+ regexpTag: "convert"
+ asm: "VCVTQQ2PD"
+ in: *int
+ out:
+ - *fp
+- go: ConvertToFloat64
+ regexpTag: "convert"
+ asm: "VCVTDQ2PD"
+ in: *int
+ out:
+ - base: float
+ bits: 256|512
+# uint -> float32
+- go: ConvertToFloat32
+ regexpTag: "convert"
+ asm: "VCVTU[DQ]Q2PS"
+ in: &uint
+ - go: $u
+ base: uint
+ out:
+ - *fp
+# uint -> float64
+- go: ConvertToFloat64
+ regexpTag: "convert"
+ asm: "VCVTUQQ2PD"
+ in: *uint
+ out:
+ - *fp
+- go: ConvertToFloat64
+ regexpTag: "convert"
+ asm: "VCVTUDQ2PD"
+ in: *uint
+ out:
+ - base: float
+ bits: 256|512
+# float64 -> float32
+- go: ConvertToFloat32
+ regexpTag: "convert"
+ asm: "VCVTPD2PS"
+ addDoc:
+ !string |-
+ // The result vector's elements are rounded to the nearest value.
+ in: &fp64
+ - base: float
+ elemBits: 64
+ out: &fp32
+ - base: float
+ elemBits: 32
+# float32 -> float64
+- go: ConvertToFloat64
+ regexpTag: "convert"
+ asm: "VCVTPS2PD"
+ in: *fp32
+ out:
+ - base: float
+ elemBits: 64
+ bits: 256|512
+
+# Widening integer conversions.
+# uint8 -> uint16
+- go: ExtendToUint16
+ addDoc: &zeroExtendDoc
+ !string |-
+ // The result vector's elements are zero-extended.
+ regexpTag: "convert"
+ asm: "VPMOVZXBW"
+ in:
+ - &u8x16
+ base: uint
+ elemBits: 8
+ bits: 128
+ out:
+ - &u16x16
+ base: uint
+ elemBits: 16
+ bits: 256
+- go: ExtendToUint16
+ regexpTag: "convert"
+ asm: "VPMOVZXBW"
+ addDoc: *zeroExtendDoc
+ in:
+ - &u8x32
+ base: uint
+ elemBits: 8
+ bits: 256
+ out:
+ - &u16x32
+ base: uint
+ elemBits: 16
+ bits: 512
+# int8 -> int16
+- go: ExtendToInt16
+ regexpTag: "convert"
+ asm: "VPMOVSXBW"
+ addDoc: &signExtendDoc
+ !string |-
+ // The result vector's elements are sign-extended.
+ in:
+ - &i8x16
+ base: int
+ elemBits: 8
+ bits: 128
+ out:
+ - &i16x16
+ base: int
+ elemBits: 16
+ bits: 256
+- go: ExtendToInt16
+ regexpTag: "convert"
+ asm: "VPMOVSXBW"
+ addDoc: *signExtendDoc
+ in:
+ - &i8x32
+ base: int
+ elemBits: 8
+ bits: 256
+ out:
+ - &i16x32
+ base: int
+ elemBits: 16
+ bits: 512
+# uint16->uint32
+- go: ExtendToUint32
+ regexpTag: "convert"
+ asm: "VPMOVZXWD"
+ addDoc: *zeroExtendDoc
+ in:
+ - &u16x8
+ base: uint
+ elemBits: 16
+ bits: 128
+ out:
+ - &u32x8
+ base: uint
+ elemBits: 32
+ bits: 256
+- go: ExtendToUint32
+ regexpTag: "convert"
+ asm: "VPMOVZXWD"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u16x16
+ out:
+ - &u32x16
+ base: uint
+ elemBits: 32
+ bits: 512
+# int16->int32
+- go: ExtendToInt32
+ regexpTag: "convert"
+ asm: "VPMOVSXWD"
+ addDoc: *signExtendDoc
+ in:
+ - &i16x8
+ base: int
+ elemBits: 16
+ bits: 128
+ out:
+ - &i32x8
+ base: int
+ elemBits: 32
+ bits: 256
+- go: ExtendToInt32
+ regexpTag: "convert"
+ asm: "VPMOVSXWD"
+ addDoc: *signExtendDoc
+ in:
+ - *i16x16
+ out:
+ - &i32x16
+ base: int
+ elemBits: 32
+ bits: 512
+# uint32 -> uint64
+- go: ExtendToUint64
+ regexpTag: "convert"
+ asm: "VPMOVZXDQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - &u32x4
+ base: uint
+ elemBits: 32
+ bits: 128
+ out:
+ - &u64x4
+ base: uint
+ elemBits: 64
+ bits: 256
+- go: ExtendToUint64
+ regexpTag: "convert"
+ asm: "VPMOVZXDQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u32x8
+ out:
+ - &u64x8
+ base: uint
+ elemBits: 64
+ bits: 512
+# int32 -> int64
+- go: ExtendToInt64
+ regexpTag: "convert"
+ asm: "VPMOVSXDQ"
+ addDoc: *signExtendDoc
+ in:
+ - &i32x4
+ base: int
+ elemBits: 32
+ bits: 128
+ out:
+ - &i64x4
+ base: int
+ elemBits: 64
+ bits: 256
+- go: ExtendToInt64
+ regexpTag: "convert"
+ asm: "VPMOVSXDQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i32x8
+ out:
+ - &i64x8
+ base: int
+ elemBits: 64
+ bits: 512
+# uint16 -> uint64
+- go: ExtendToUint64
+ regexpTag: "convert"
+ asm: "VPMOVZXWQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u16x8
+ out:
+ - *u64x8
+# int16 -> int64
+- go: ExtendToInt64
+ regexpTag: "convert"
+ asm: "VPMOVSXWQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i16x8
+ out:
+ - *i64x8
+# uint8 -> uint32
+- go: ExtendToUint32
+ regexpTag: "convert"
+ asm: "VPMOVZXBD"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u32x16
+# int8 -> int32
+- go: ExtendToInt32
+ regexpTag: "convert"
+ asm: "VPMOVSXBD"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i32x16
+# Truncating conversions
+- go: TruncateToInt8
+ regexpTag: "convert"
+ asm: "VPMOV[WDQ]B"
+ addDoc: &truncDocZeroUpper
+ !string |-
+ // Conversion is done with truncation on the vector elements.
+ // Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+ in:
+ - base: int
+ out:
+ - base: int
+ bits: 128
+- go: TruncateToUint8
+ regexpTag: "convert"
+ asm: "VPMOV[WDQ]B"
+ addDoc: *truncDocZeroUpper
+ in:
+ - base: uint
+ out:
+ - base: uint
+ bits: 128
+- go: TruncateToInt8
+ regexpTag: "convert"
+ asm: "VPMOV[WDQ]B"
+ addDoc: &truncDoc
+ !string |-
+ // Conversion is done with truncation on the vector elements.
+ in:
+ - base: int
+ out:
+ - base: int
+ bits: 256|512
+- go: TruncateToUint8
+ regexpTag: "convert"
+ asm: "VPMOV[WDQ]B"
+ addDoc: *truncDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+ bits: 256|512
+- go: TruncateToInt16
+ regexpTag: "convert"
+ asm: "VPMOV[DQ]W"
+ addDoc: *truncDoc
+ in:
+ - base: int
+ out:
+ - base: int
+- go: TruncateToUint16
+ regexpTag: "convert"
+ asm: "VPMOV[DQ]W"
+ addDoc: *truncDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+- go: TruncateToInt32
+ regexpTag: "convert"
+ asm: "VPMOVQD"
+ addDoc: *truncDoc
+ in:
+ - base: int
+ out:
+ - base: int
+- go: TruncateToUint32
+ regexpTag: "convert"
+ asm: "VPMOVQD"
+ addDoc: *truncDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+# Saturated conversions.
+- go: SaturateToInt8
+ regexpTag: "convert"
+ asm: "VPMOVS[WDQ]B"
+ addDoc: &satDocZeroUpper
+ !string |-
+ // Conversion is done with saturation on the vector elements.
+ // Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+ in:
+ - base: int
+ out:
+ - base: int
+ bits: 128
+- go: SaturateToUint8
+ regexpTag: "convert"
+ asm: "VPMOVS[WDQ]B"
+ addDoc: *satDocZeroUpper
+ in:
+ - base: int
+ out:
+ - base: int
+ bits: 128
+- go: SaturateToInt8
+ regexpTag: "convert"
+ asm: "VPMOVS[WDQ]B"
+ addDoc: &satDoc
+ !string |-
+ // Conversion is done with saturation on the vector elements.
+ in:
+ - base: int
+ out:
+ - base: int
+ bits: 256|512
+- go: SaturateToUint8
+ regexpTag: "convert"
+ asm: "VPMOVUS[WDQ]B"
+ addDoc: *satDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+ bits: 256|512
+- go: SaturateToInt16
+ regexpTag: "convert"
+ asm: "VPMOVS[DQ]W"
+ addDoc: *satDoc
+ in:
+ - base: int
+ out:
+ - base: int
+- go: SaturateToUint16
+ regexpTag: "convert"
+ asm: "VPMOVUS[DQ]W"
+ addDoc: *satDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+- go: SaturateToInt32
+ regexpTag: "convert"
+ asm: "VPMOVSQD"
+ addDoc: *satDoc
+ in:
+ - base: int
+ out:
+ - base: int
+- go: SaturateToUint32
+ regexpTag: "convert"
+ asm: "VPMOVUSQD"
+ addDoc: *satDoc
+ in:
+ - base: uint
+ out:
+ - base: uint
+# Truncating saturated packed
+- go: SaturateToInt16Concat
+ regexpTag: "convert"
+ asm: "VPACKSSDW"
+ addDoc: &satDocConcat
+ !string |-
+ // Treating each 128 bits as a group:
+ // the converted group from the first input vector is packed into the lower part of the result vector, and
+ // the converted group from the second input vector is packed into the upper part of the result vector.
+ // Conversion is done with saturation on the vector elements.
+ in:
+ - base: int
+ - base: int
+ out:
+ - base: int
+- go: SaturateToUint16Concat
+ regexpTag: "convert"
+ asm: "VPACKUSDW"
+ addDoc: *satDocConcat
+ in:
+ - base: uint
+ - base: uint
+ out:
+ - base: uint
+
+# low-part only conversions.
+# uint8->uint16
+- go: ExtendLo8ToUint16x8
+ regexpTag: "convert"
+ asm: "VPMOVZXBW"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u16x8
+# int8->int16
+- go: ExtendLo8ToInt16x8
+ regexpTag: "convert"
+ asm: "VPMOVSXBW"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i16x8
+# uint16->uint32
+- go: ExtendLo4ToUint32x4
+ regexpTag: "convert"
+ asm: "VPMOVZXWD"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u16x8
+ out:
+ - *u32x4
+# int16->int32
+- go: ExtendLo4ToInt32x4
+ regexpTag: "convert"
+ asm: "VPMOVSXWD"
+ addDoc: *signExtendDoc
+ in:
+ - *i16x8
+ out:
+ - *i32x4
+# uint32 -> uint64
+- go: ExtendLo2ToUint64x2
+ regexpTag: "convert"
+ asm: "VPMOVZXDQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u32x4
+ out:
+ - &u64x2
+ base: uint
+ elemBits: 64
+ bits: 128
+# int32 -> int64
+- go: ExtendLo2ToInt64x2
+ regexpTag: "convert"
+ asm: "VPMOVSXDQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i32x4
+ out:
+ - &i64x2
+ base: int
+ elemBits: 64
+ bits: 128
+# uint16 -> uint64
+- go: ExtendLo2ToUint64x2
+ regexpTag: "convert"
+ asm: "VPMOVZXWQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u16x8
+ out:
+ - *u64x2
+- go: ExtendLo4ToUint64x4
+ regexpTag: "convert"
+ asm: "VPMOVZXWQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u16x8
+ out:
+ - *u64x4
+# int16 -> int64
+- go: ExtendLo2ToInt64x2
+ regexpTag: "convert"
+ asm: "VPMOVSXWQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i16x8
+ out:
+ - *i64x2
+- go: ExtendLo4ToInt64x4
+ regexpTag: "convert"
+ asm: "VPMOVSXWQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i16x8
+ out:
+ - *i64x4
+# uint8 -> uint32
+- go: ExtendLo4ToUint32x4
+ regexpTag: "convert"
+ asm: "VPMOVZXBD"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u32x4
+- go: ExtendLo8ToUint32x8
+ regexpTag: "convert"
+ asm: "VPMOVZXBD"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u32x8
+# int8 -> int32
+- go: ExtendLo4ToInt32x4
+ regexpTag: "convert"
+ asm: "VPMOVSXBD"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i32x4
+- go: ExtendLo8ToInt32x8
+ regexpTag: "convert"
+ asm: "VPMOVSXBD"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i32x8
+# uint8 -> uint64
+- go: ExtendLo2ToUint64x2
+ regexpTag: "convert"
+ asm: "VPMOVZXBQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u64x2
+- go: ExtendLo4ToUint64x4
+ regexpTag: "convert"
+ asm: "VPMOVZXBQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u64x4
+- go: ExtendLo8ToUint64x8
+ regexpTag: "convert"
+ asm: "VPMOVZXBQ"
+ addDoc: *zeroExtendDoc
+ in:
+ - *u8x16
+ out:
+ - *u64x8
+# int8 -> int64
+- go: ExtendLo2ToInt64x2
+ regexpTag: "convert"
+ asm: "VPMOVSXBQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i64x2
+- go: ExtendLo4ToInt64x4
+ regexpTag: "convert"
+ asm: "VPMOVSXBQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i64x4
+- go: ExtendLo8ToInt64x8
+ regexpTag: "convert"
+ asm: "VPMOVSXBQ"
+ addDoc: *signExtendDoc
+ in:
+ - *i8x16
+ out:
+ - *i64x8
\ No newline at end of file
--- /dev/null
+!sum
+- go: Div
+ commutative: false
+ documentation: !string |-
+ // NAME divides elements of two vectors.
+- go: Sqrt
+ commutative: false
+ documentation: !string |-
+ // NAME computes the square root of each element.
+- go: Reciprocal
+ commutative: false
+ documentation: !string |-
+ // NAME computes an approximate reciprocal of each element.
+- go: ReciprocalSqrt
+ commutative: false
+ documentation: !string |-
+ // NAME computes an approximate reciprocal of the square root of each element.
+- go: Scale
+ commutative: false
+ documentation: !string |-
+ // NAME multiplies elements by a power of 2.
+- go: RoundToEven
+ commutative: false
+ constImm: 0
+ documentation: !string |-
+ // NAME rounds elements to the nearest integer.
+- go: RoundToEvenScaled
+ commutative: false
+ constImm: 0
+ documentation: !string |-
+ // NAME rounds elements with specified precision.
+- go: RoundToEvenScaledResidue
+ commutative: false
+ constImm: 0
+ documentation: !string |-
+ // NAME computes the difference after rounding with specified precision.
+- go: Floor
+ commutative: false
+ constImm: 1
+ documentation: !string |-
+ // NAME rounds elements down to the nearest integer.
+- go: FloorScaled
+ commutative: false
+ constImm: 1
+ documentation: !string |-
+ // NAME rounds elements down with specified precision.
+- go: FloorScaledResidue
+ commutative: false
+ constImm: 1
+ documentation: !string |-
+ // NAME computes the difference after flooring with specified precision.
+- go: Ceil
+ commutative: false
+ constImm: 2
+ documentation: !string |-
+ // NAME rounds elements up to the nearest integer.
+- go: CeilScaled
+ commutative: false
+ constImm: 2
+ documentation: !string |-
+ // NAME rounds elements up with specified precision.
+- go: CeilScaledResidue
+ commutative: false
+ constImm: 2
+ documentation: !string |-
+ // NAME computes the difference after ceiling with specified precision.
+- go: Trunc
+ commutative: false
+ constImm: 3
+ documentation: !string |-
+ // NAME truncates elements towards zero.
+- go: TruncScaled
+ commutative: false
+ constImm: 3
+ documentation: !string |-
+ // NAME truncates elements with specified precision.
+- go: TruncScaledResidue
+ commutative: false
+ constImm: 3
+ documentation: !string |-
+ // NAME computes the difference after truncating with specified precision.
+- go: AddSub
+ commutative: false
+ documentation: !string |-
+ // NAME subtracts even elements and adds odd elements of two vectors.
--- /dev/null
+!sum
+- go: Div
+ asm: "V?DIVP[SD]"
+ in: &2fp
+ - &fp
+ go: $t
+ base: float
+ - *fp
+ out: &1fp
+ - *fp
+- go: Sqrt
+ asm: "V?SQRTP[SD]"
+ in: *1fp
+ out: *1fp
+# TODO: Provide separate methods for 12-bit precision and 14-bit precision?
+- go: Reciprocal
+ asm: "VRCP(14)?P[SD]"
+ in: *1fp
+ out: *1fp
+- go: ReciprocalSqrt
+ asm: "V?RSQRT(14)?P[SD]"
+ in: *1fp
+ out: *1fp
+- go: Scale
+ asm: "VSCALEFP[SD]"
+ in: *2fp
+ out: *1fp
+
+- go: "RoundToEven|Ceil|Floor|Trunc"
+ regexpTag: "fp"
+ asm: "VROUNDP[SD]"
+ in:
+ - *fp
+ - class: immediate
+ const: 0 # placeholder
+ out: *1fp
+
+- go: "(RoundToEven|Ceil|Floor|Trunc)Scaled"
+ regexpTag: "fp"
+ asm: "VRNDSCALEP[SD]"
+ in:
+ - *fp
+ - class: immediate
+ const: 0 # placeholder
+ immOffset: 4 # "M": round to numbers with M binary digits after the dot.
+ name: prec
+ out: *1fp
+- go: "(RoundToEven|Ceil|Floor|Trunc)ScaledResidue"
+ regexpTag: "fp"
+ asm: "VREDUCEP[SD]"
+ in:
+ - *fp
+ - class: immediate
+ const: 0 # placeholder
+ immOffset: 4 # "M": round to numbers with M binary digits after the dot.
+ name: prec
+ out: *1fp
+
+- go: "AddSub"
+ asm: "VADDSUBP[SD]"
+ in:
+ - *fp
+ - *fp
+ out:
+ - *fp
--- /dev/null
+!sum
+- go: GaloisFieldAffineTransform
+ commutative: false
+ documentation: !string |-
+ // NAME computes an affine transformation in GF(2^8):
+ // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrices;
+ // b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+ // corresponding to a group of 8 elements in x.
+- go: GaloisFieldAffineTransformInverse
+ commutative: false
+ documentation: !string |-
+ // NAME computes an affine transformation in GF(2^8),
+ // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
+ // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrices;
+ // b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+ // corresponding to a group of 8 elements in x.
+- go: GaloisFieldMul
+ commutative: false
+ documentation: !string |-
+ // NAME computes element-wise GF(2^8) multiplication with
+ // reduction polynomial x^8 + x^4 + x^3 + x + 1.
+- go: carrylessMultiply
+ commutative: false
--- /dev/null
+!sum
+- go: GaloisFieldAffineTransform
+ asm: VGF2P8AFFINEQB
+ operandOrder: 2I # 2nd operand, then immediate
+ in: &AffineArgs
+ - &uint8
+ go: $t
+ base: uint
+ - &uint8x8
+ go: $t2
+ base: uint
+ - &pureImmVar
+ class: immediate
+ immOffset: 0
+ name: b
+ out:
+ - *uint8
+
+- go: GaloisFieldAffineTransformInverse
+ asm: VGF2P8AFFINEINVQB
+ operandOrder: 2I # 2nd operand, then immediate
+ in: *AffineArgs
+ out:
+ - *uint8
+
+- go: GaloisFieldMul
+ asm: VGF2P8MULB
+ in:
+ - *uint8
+ - *uint8
+ out:
+ - *uint8
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of x and y,
+ // depending on the value of xyHiLo, returning the 128-bit
+ // product in the concatenated two elements of the result.
+ // Bit 0 selects the low (0) or high (1) element of x and
+ // bit 4 selects the low (0x00) or high (0x10) element of y.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x2
+ - go: Uint64x2
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x2
+ overwriteElementBits: 64
+ hideMaskMethods: true
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of each of the two
+ // 128-bit lanes of x and y, depending on the value of xyHiLo,
+ // and returns the two 128-bit products in the result's lanes.
+ // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+ // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x4
+ - go: Uint64x4
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x4
+ overwriteElementBits: 64
+ hideMaskMethods: true
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of each of the four
+ // 128-bit lanes of x and y, depending on the value of xyHiLo,
+ // and returns the four 128-bit products in the result's lanes.
+ // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+ // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x8
+ - go: Uint64x8
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x8
+ overwriteElementBits: 64
+ hideMaskMethods: true
--- /dev/null
+!sum
+- go: Average
+ commutative: true
+ documentation: !string |-
+ // NAME computes the rounded average of corresponding elements.
+- go: Abs
+ commutative: false
+ # Unary operation, not commutative
+ documentation: !string |-
+ // NAME computes the absolute value of each element.
+- go: CopySign
+ # Applies sign of second operand to first: sign(val, sign_src)
+ commutative: false
+ documentation: !string |-
+ // NAME returns the product of the first operand with -1, 0, or 1,
+ // whichever constant is nearest to the value of the second operand.
+ # Sign does not have masked version
+- go: OnesCount
+ commutative: false
+ documentation: !string |-
+ // NAME counts the number of set bits in each element.
--- /dev/null
+!sum
+# Average (unsigned byte, unsigned word)
+# Instructions: VPAVGB, VPAVGW
+- go: Average
+ asm: "VPAVG[BW]" # Matches VPAVGB (byte) and VPAVGW (word)
+ in:
+ - &uint_t # $t will be Uint8xN for VPAVGB, Uint16xN for VPAVGW
+ go: $t
+ base: uint
+ - *uint_t
+ out:
+ - *uint_t
+
+# Absolute Value (signed byte, word, dword, qword)
+# Instructions: VPABSB, VPABSW, VPABSD, VPABSQ
+- go: Abs
+ asm: "VPABS[BWDQ]" # Matches VPABSB, VPABSW, VPABSD, VPABSQ
+ in:
+ - &int_t # $t will be Int8xN, Int16xN, Int32xN, Int64xN
+ go: $t
+ base: int
+ out:
+ - *int_t # Output is magnitude, fits in the same signed type
+
+# Sign Operation (signed byte, word, dword)
+# Applies sign of second operand to the first.
+# Instructions: VPSIGNB, VPSIGNW, VPSIGND
+- go: CopySign
+ asm: "VPSIGN[BWD]" # Matches VPSIGNB, VPSIGNW, VPSIGND
+ in:
+ - *int_t # value to apply sign to
+ - *int_t # value from which to take the sign
+ out:
+ - *int_t
+
+# Population Count (count set bits in each element)
+# Instructions: VPOPCNTB, VPOPCNTW (AVX512_BITALG)
+# VPOPCNTD, VPOPCNTQ (AVX512_VPOPCNTDQ)
+- go: OnesCount
+ asm: "VPOPCNT[BWDQ]"
+ in:
+ - &any
+ go: $t
+ out:
+ - *any
--- /dev/null
+!sum
+- go: DotProductPairs
+ commutative: false
+ documentation: !string |-
+ // NAME multiplies the elements and adds the pairs together,
+ // yielding a vector of half as many elements with twice the input element size.
+# TODO: maybe simplify this name within the receiver-type + method-naming scheme we use.
+- go: DotProductPairsSaturated
+ commutative: false
+ documentation: !string |-
+ // NAME multiplies the elements and adds the pairs together with saturation,
+ // yielding a vector of half as many elements with twice the input element size.
+# QuadDotProduct, i.e. VPDPBUSD(S), are operations with src/dst on the same register; we are not supporting this as of now.
+# - go: DotProductBroadcast
+# commutative: true
+# # documentation: !string |-
+# // NAME multiplies all elements and broadcasts the sum.
+- go: DotProductQuadruple
+ commutative: false
+ documentation: !string |-
+ // NAME performs dot products on groups of 4 elements of x and y.
+ // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+- go: DotProductQuadrupleSaturated
+ commutative: false
+ documentation: !string |-
+ // NAME performs dot products on groups of 4 elements of x and y with saturation.
+ // NAME(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+- go: AddDotProductPairs
+ commutative: false
+ noTypes: "true"
+ noGenericOps: "true"
+ documentation: !string |-
+ // NAME performs dot products on pairs of elements of y and z and then adds x.
+- go: MulAdd
+ commutative: false
+ documentation: !string |-
+ // NAME performs a fused (x * y) + z.
+- go: MulAddSub
+ commutative: false
+ documentation: !string |-
+ // NAME performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+- go: MulSubAdd
+ commutative: false
+ documentation: !string |-
+ // NAME performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+- go: SumAbsDiff
+ commutative: false
+ documentation: !string |-
+ // NAME sums the absolute differences of the two input vectors, treating each adjacent 8 bytes as a group. The output
+ // is a vector of word-sized elements in which each 4*n-th element contains the sum for the n-th input group. The other elements in the result vector are zeroed.
+ // This method can be seen as computing, per adjacent 8-byte group, the L1 distance between the two input vectors.
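+# For example (illustrative, following the description above): for 16-byte inputs the
+# result has 8 word-sized elements, where elements 0 and 4 hold the sums for byte
+# groups 0-7 and 8-15 respectively, and the remaining elements are zero.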
--- /dev/null
+!sum
+- go: DotProductPairs
+ asm: VPMADDWD
+ in:
+ - &int
+ go: $t
+ base: int
+ - *int
+ out:
+ - &int2 # The elemBits are different
+ go: $t2
+ base: int
+- go: DotProductPairsSaturated
+ asm: VPMADDUBSW
+ in:
+ - &uint
+ go: $t
+ base: uint
+ overwriteElementBits: 8
+ - &int3
+ go: $t3
+ base: int
+ overwriteElementBits: 8
+ out:
+ - *int2
+# - go: DotProductBroadcast
+# asm: VDPP[SD]
+# in:
+# - &dpb_src
+# go: $t
+# - *dpb_src
+# - class: immediate
+# const: 127
+# out:
+# - *dpb_src
+- go: DotProductQuadruple
+ asm: "VPDPBUSD"
+ operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
+ in:
+ - &qdpa_acc
+ go: $t_acc
+ base: int
+ elemBits: 32
+ - &qdpa_src1
+ go: $t_src1
+ base: uint
+ overwriteElementBits: 8
+ - &qdpa_src2
+ go: $t_src2
+ base: int
+ overwriteElementBits: 8
+ out:
+ - *qdpa_acc
+- go: DotProductQuadrupleSaturated
+ asm: "VPDPBUSDS"
+ operandOrder: "31Zero3" # switch operand 3 and 1, and make 3 always 0
+ in:
+ - *qdpa_acc
+ - *qdpa_src1
+ - *qdpa_src2
+ out:
+ - *qdpa_acc
+- go: AddDotProductPairs
+ asm: "VPDPWSSD"
+ in:
+ - &pdpa_acc
+ go: $t_acc
+ base: int
+ elemBits: 32
+ - &pdpa_src1
+ go: $t_src1
+ base: int
+ overwriteElementBits: 16
+ - &pdpa_src2
+ go: $t_src2
+ base: int
+ overwriteElementBits: 16
+ out:
+ - *pdpa_acc
+- go: MulAdd
+ asm: "VFMADD213PS|VFMADD213PD"
+ in:
+ - &fma_op
+ go: $t
+ base: float
+ - *fma_op
+ - *fma_op
+ out:
+ - *fma_op
+- go: MulAddSub
+ asm: "VFMADDSUB213PS|VFMADDSUB213PD"
+ in:
+ - *fma_op
+ - *fma_op
+ - *fma_op
+ out:
+ - *fma_op
+- go: MulSubAdd
+ asm: "VFMSUBADD213PS|VFMSUBADD213PD"
+ in:
+ - *fma_op
+ - *fma_op
+ - *fma_op
+ out:
+ - *fma_op
+- go: SumAbsDiff
+ asm: "VPSADBW"
+ in:
+ - go: $t
+ base: uint
+ - go: $t
+ base: uint
+ out:
+ - go: $t2
+ base: uint
\ No newline at end of file
--- /dev/null
+!sum
+- go: Max
+ commutative: true
+ documentation: !string |-
+ // NAME computes the maximum of corresponding elements.
+- go: Min
+ commutative: true
+ documentation: !string |-
+ // NAME computes the minimum of corresponding elements.
--- /dev/null
+!sum
+- go: Max
+ asm: "V?PMAXS[BWDQ]"
+ in: &2int
+ - &int
+ go: $t
+ base: int
+ - *int
+ out: &1int
+ - *int
+- go: Max
+ asm: "V?PMAXU[BWDQ]"
+ in: &2uint
+ - &uint
+ go: $t
+ base: uint
+ - *uint
+ out: &1uint
+ - *uint
+
+- go: Min
+ asm: "V?PMINS[BWDQ]"
+ in: *2int
+ out: *1int
+- go: Min
+ asm: "V?PMINU[BWDQ]"
+ in: *2uint
+ out: *1uint
+
+- go: Max
+ asm: "V?MAXP[SD]"
+ in: &2float
+ - &float
+ go: $t
+ base: float
+ - *float
+ out: &1float
+ - *float
+- go: Min
+ asm: "V?MINP[SD]"
+ in: *2float
+ out: *1float
--- /dev/null
+!sum
+- go: SetElem
+ commutative: false
+ documentation: !string |-
+ // NAME sets a single constant-indexed element's value.
+- go: GetElem
+ commutative: false
+ documentation: !string |-
+ // NAME retrieves a single constant-indexed element's value.
+- go: SetLo
+ commutative: false
+ constImm: 0
+ documentation: !string |-
+ // NAME returns x with its lower half set to y.
+- go: GetLo
+ commutative: false
+ constImm: 0
+ documentation: !string |-
+ // NAME returns the lower half of x.
+- go: SetHi
+ commutative: false
+ constImm: 1
+ documentation: !string |-
+ // NAME returns x with its upper half set to y.
+- go: GetHi
+ commutative: false
+ constImm: 1
+ documentation: !string |-
+ // NAME returns the upper half of x.
+- go: PermuteOrZero
+ commutative: false
+ documentation: !string |-
+ // NAME performs a full permutation of vector x using indices:
+ // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+- go: Permute
+ commutative: false
+ documentation: !string |-
+ // NAME performs a full permutation of vector x using indices:
+ // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+- go: ConcatPermute # ConcatPermute is only available on or after AVX512
+ commutative: false
+ documentation: !string |-
+ // NAME performs a full permutation of vector x, y using indices:
+ // result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+ // where xy is the concatenation of x (lower half) and y (upper half).
+ // Only the needed bits to represent xy's index are used in indices' elements.
+- go: Compress
+ commutative: false
+ documentation: !string |-
+ // NAME performs a compression on vector x using mask by
+ // selecting elements as indicated by mask and packing them into the lower-indexed elements.
+- go: blend
+ commutative: false
+ documentation: !string |-
+ // NAME blends two vectors based on mask values, choosing either
+ // the first or the second based on whether the third is false or true.
+- go: move
+ commutative: false
+ noTypes: "true"
+ noGenericOps: "true"
+- go: Expand
+ commutative: false
+ documentation: !string |-
+ // NAME performs an expansion of vector x, whose elements are packed into its lower part.
+ // The expansion distributes these elements to the positions indicated by mask, from lower mask elements to upper, in order.
+- go: Broadcast128
+ commutative: false
+ documentation: !string |-
+ // NAME copies element zero of its (128-bit) input to all elements of
+ // the 128-bit output vector.
+- go: Broadcast256
+ commutative: false
+ documentation: !string |-
+ // NAME copies element zero of its (128-bit) input to all elements of
+ // the 256-bit output vector.
+- go: Broadcast512
+ commutative: false
+ documentation: !string |-
+ // NAME copies element zero of its (128-bit) input to all elements of
+ // the 512-bit output vector.
+- go: PermuteOrZeroGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using indices:
+- go: PermuteGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using indices:
+- go: permuteScalars
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a permutation of vector x using constant indices:
+- go: permuteScalarsGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using constant indices:
+- go: permuteScalarsLo
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a permutation of vector x using constant indices:
+- go: permuteScalarsLoGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using constant indices:
+- go: permuteScalarsHi
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a permutation of vector x using constant indices:
+- go: permuteScalarsHiGrouped
+ commutative: false
+ documentation: !string |- # Detailed documentation will rely on the specific ops.
+ // NAME performs a grouped permutation of vector x using constant indices:
+- go: InterleaveHi
+ commutative: false
+ documentation: !string |-
+ // NAME interleaves the elements of the high halves of x and y.
+- go: InterleaveLo
+ commutative: false
+ documentation: !string |-
+ // NAME interleaves the elements of the low halves of x and y.
+- go: InterleaveHiGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME interleaves the elements of the high half of each 128-bit subvector of x and y.
+- go: InterleaveLoGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME interleaves the elements of the low half of each 128-bit subvector of x and y.
+
+- go: concatSelectedConstant
+ commutative: false
+ out:
+ - elemBits: 32
+ documentation: !string |-
+ // NAME concatenates selected elements from x and y into the lower and upper
+ // halves of the output. The selection is chosen by the constant parameter h1h0l1l0
+ // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+ // For example, {0,1,2,3}.NAME(0b_11_01_00_10, {4,5,6,7}) returns
+ // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+
+- go: concatSelectedConstant
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME concatenates selected elements from x and y into the lower and upper
+ // halves of the output. The selection is chosen by the constant parameter hilo
+ // where hi and lo are each one bit specifying which 64-bit element to select
+ // from y and x. For example {4,5}.NAME(0b10, {6,7})
+ // returns {4,7}: bit 0, which selects from x, is 0 and selects 4, and bit 1,
+ // which selects from y, is 1 and selects 7.
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 32
+ bits: 256
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selection is chosen by the constant parameter h1h0l1l0
+ // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+ // For example,
+ // {0,1,2,3,8,9,10,11}.NAME(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+ // returns {2,0,5,7,10,8,13,15}
+ // (don't forget that the binary constant is written big-endian).
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 64
+ bits: 256
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selections are specified by the constant parameter hilos where each
+ // hi and lo pair selects 64-bit elements from the corresponding 128-bit
+ // subvectors of x and y.
+ //
+ // For example {4,5,8,9}.NAME(0b_11_10, {6,7,10,11})
+ // returns {4,7,9,11}: bit 0 is 0, selecting element 0 from x's lower
+ // 128 bits (4); bit 1 is 1, selecting element 1 from y's lower 128 bits (7);
+ // bit 2 is 1, selecting element 1 from x's upper 128 bits (9); and bit 3 is 1,
+ // selecting element 1 from y's upper 128 bits (11).
+ // This differs from the same method applied to a 32x8 vector, where
+ // the 8-bit constant performs the same selection on both subvectors.
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 32
+ bits: 512
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selection is chosen by the constant parameter h1h0l1l0
+ // where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+ // For example,
+ //
+ // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME(
+ // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+ //
+ // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+ //
+ // (don't forget that the binary constant is written big-endian).
+
+- go: concatSelectedConstantGrouped
+ commutative: false
+ out:
+ - elemBits: 64
+ bits: 512
+ documentation: !string |-
+ // NAME concatenates selected elements from 128-bit subvectors of x and y
+ // into the lower and upper halves of corresponding subvectors of the output.
+ // The selections are specified by the constant parameter hilos where each
+ // hi and lo pair selects 64-bit elements from the corresponding 128-bit
+ // subvectors of x and y.
+ //
+ // For example {4,5,8,9,12,13,16,17}.NAME(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+ // returns {4,7,9,11,12,14,17,19}: bit 0 is 0, selecting element 0 from x's
+ // lowest 128 bits (4); bit 1 is 1, selecting element 1 from y's lowest 128 bits (7);
+ // bit 2 is 1, selecting element 1 from x's second 128 bits (9); and bit 3 is 1,
+ // selecting element 1 from y's second 128 bits (11). The next two 0 bits select
+ // the lower elements from x's and y's third 128-bit groups (12, 14), and the last two
+ // 1 bits select the upper elements from x's and y's last 128 bits (17, 19).
+ // This differs from the same method applied to a 32x8 or 32x16 vector, where
+ // the 8-bit constant performs the same selection on all the subvectors.
+
+- go: Select128FromPair
+ commutative: false
+ documentation: !string |-
+ // NAME treats the 256-bit vectors x and y as a single vector of four
+ // 128-bit elements, and returns a 256-bit result formed by
+ // concatenating the two elements specified by lo and hi.
+
+- go: ConcatShiftBytesRight
+ commutative: false
+ documentation: !string |-
+ // NAME concatenates x and y and shifts the result right by a constant number of bytes.
+ // The result vector is the lower half of the shifted concatenation.
+
+- go: ConcatShiftBytesRightGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME concatenates x and y and shifts the result right by a constant number of bytes.
+ // The result vector is the lower half of the shifted concatenation.
+ // This operation is performed independently within each 16-byte group.
--- /dev/null
+!sum
+- go: SetElem
+ asm: "VPINSR[BWDQ]"
+ in:
+ - &t
+ class: vreg
+ base: $b
+ - class: greg
+ base: $b
+ lanes: 1 # Scalar, darn it!
+ - &imm
+ class: immediate
+ immOffset: 0
+ name: index
+ out:
+ - *t
+
+- go: SetElem
+ asm: "VPINSR[DQ]"
+ in:
+ - &t
+ class: vreg
+ base: int
+ OverwriteBase: float
+ - class: greg
+ base: int
+ OverwriteBase: float
+ lanes: 1 # Scalar, darn it!
+ - &imm
+ class: immediate
+ immOffset: 0
+ name: index
+ out:
+ - *t
+
+- go: GetElem
+ asm: "VPEXTR[BWDQ]"
+ in:
+ - class: vreg
+ base: $b
+ elemBits: $e
+ - *imm
+ out:
+ - class: greg
+ base: $b
+ bits: $e
+
+- go: GetElem
+ asm: "VPEXTR[DQ]"
+ in:
+ - class: vreg
+ base: int
+ elemBits: $e
+ OverwriteBase: float
+ - *imm
+ out:
+ - class: greg
+ base: int
+ bits: $e
+ OverwriteBase: float
+
+- go: "SetHi|SetLo"
+ regexpTag: "move"
+ asm: "VINSERTI128|VINSERTI64X4"
+ inVariant: []
+ in:
+ - &i8x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 8
+ - &i8xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 8
+ - &imm01 # This immediate should be only 0 or 1
+ class: immediate
+ const: 0 # placeholder
+ name: index
+ out:
+ - *i8x2N
+
+- go: "GetHi|GetLo"
+ asm: "VEXTRACTI128|VEXTRACTI64X4"
+ regexpTag: "move"
+ inVariant: []
+ in:
+ - *i8x2N
+ - *imm01
+ out:
+ - *i8xN
+
+- go: "SetHi|SetLo"
+ asm: "VINSERTI128|VINSERTI64X4"
+ regexpTag: "move"
+ inVariant: []
+ in:
+ - &i16x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 16
+ - &i16xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 16
+ - *imm01
+ out:
+ - *i16x2N
+
+- go: "GetHi|GetLo"
+ regexpTag: "move"
+ asm: "VEXTRACTI128|VEXTRACTI64X4"
+ inVariant: []
+ in:
+ - *i16x2N
+ - *imm01
+ out:
+ - *i16xN
+
+- go: "SetHi|SetLo"
+ regexpTag: "move"
+ asm: "VINSERTI128|VINSERTI64X4"
+ inVariant: []
+ in:
+ - &i32x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 32
+ - &i32xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 32
+ - *imm01
+ out:
+ - *i32x2N
+
+- go: "GetHi|GetLo"
+ regexpTag: "move"
+ asm: "VEXTRACTI128|VEXTRACTI64X4"
+ inVariant: []
+ in:
+ - *i32x2N
+ - *imm01
+ out:
+ - *i32xN
+
+- go: "SetHi|SetLo"
+ regexpTag: "move"
+ asm: "VINSERTI128|VINSERTI64X4"
+ inVariant: []
+ in:
+ - &i64x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 64
+ - &i64xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 64
+ - *imm01
+ out:
+ - *i64x2N
+
+- go: "GetHi|GetLo"
+ regexpTag: "move"
+ asm: "VEXTRACTI128|VEXTRACTI64X4"
+ inVariant: []
+ in:
+ - *i64x2N
+ - *imm01
+ out:
+ - *i64xN
+
+- go: "SetHi|SetLo"
+ regexpTag: "move"
+ asm: "VINSERTF128|VINSERTF64X4"
+ inVariant: []
+ in:
+ - &f32x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 32
+ - &f32xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 32
+ - *imm01
+ out:
+ - *f32x2N
+
+- go: "GetHi|GetLo"
+ regexpTag: "move"
+ asm: "VEXTRACTF128|VEXTRACTF64X4"
+ inVariant: []
+ in:
+ - *f32x2N
+ - *imm01
+ out:
+ - *f32xN
+
+- go: "SetHi|SetLo"
+ regexpTag: "move"
+ asm: "VINSERTF128|VINSERTF64X4"
+ inVariant: []
+ in:
+ - &f64x2N
+ class: vreg
+ base: $t
+ OverwriteElementBits: 64
+ - &f64xN
+ class: vreg
+ base: $t
+ OverwriteElementBits: 64
+ - *imm01
+ out:
+ - *f64x2N
+
+- go: "GetHi|GetLo"
+ regexpTag: "move"
+ asm: "VEXTRACTF128|VEXTRACTF64X4"
+ inVariant: []
+ in:
+ - *f64x2N
+ - *imm01
+ out:
+ - *f64xN
+
+- go: Permute
+ asm: "VPERMQ|VPERMPD"
+ addDoc: !string |-
+ // The low 2 bits (values 0-3) of each element of indices are used
+ operandOrder: "21Type1"
+ in:
+ - &anyindices
+ go: $t
+ name: indices
+ overwriteBase: uint
+ - &any4
+ go: $t
+ lanes: 4
+ out:
+ - &any
+ go: $t
+
+- go: Permute
+ asm: "VPERM[WDQ]|VPERMP[SD]"
+ addDoc: !string |-
+ // The low 3 bits (values 0-7) of each element of indices are used
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any8
+ go: $t
+ lanes: 8
+ out:
+ - *any
+
+- go: Permute
+ asm: "VPERM[BWD]|VPERMPS"
+ addDoc: !string |-
+ // The low 4 bits (values 0-15) of each element of indices are used
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any16
+ go: $t
+ lanes: 16
+ out:
+ - *any
+
+- go: Permute
+ asm: "VPERM[BW]"
+ addDoc: !string |-
+ // The low 5 bits (values 0-31) of each element of indices are used
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any32
+ go: $t
+ lanes: 32
+ out:
+ - *any
+
+- go: Permute
+ asm: "VPERMB"
+ addDoc: !string |-
+ // The low 6 bits (values 0-63) of each element of indices are used
+ operandOrder: "21Type1"
+ in:
+ - *anyindices
+ - &any64
+ go: $t
+ lanes: 64
+ out:
+ - *any
+
+- go: ConcatPermute
+ asm: "VPERMI2[BWDQ]|VPERMI2P[SD]"
+ # Because we are overwriting the receiver's type, we
+ # have to move the receiver to be a parameter so that
+ # there is no duplication.
+ operandOrder: "231Type1"
+ in:
+ - *anyindices # result in arg 0
+ - *any
+ - *any
+ out:
+ - *any
+
+- go: Compress
+ asm: "VPCOMPRESS[BWDQ]|VCOMPRESSP[SD]"
+ in:
+ # The mask in Compress is a control mask rather than a write mask, so it's not optional.
+ - class: mask
+ - *any
+ out:
+ - *any
+
+# For now a non-public method because
+# (1) [OverwriteClass] must be set together with [OverwriteBase]
+# (2) "simdgen does not support [OverwriteClass] in inputs".
+# That means the signature is wrong.
+- go: blend
+ asm: VPBLENDVB
+ zeroing: false
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int
+ - *v
+ -
+ class: vreg
+ base: int
+ name: mask
+ out:
+ - *v
+
+# For AVX512
+- go: blend
+ asm: VPBLENDM[BWDQ]
+ zeroing: false
+ in:
+ - &v
+ go: $t
+ bits: 512
+ class: vreg
+ base: int
+ - *v
+ inVariant:
+ -
+ class: mask
+ out:
+ - *v
+
+ # For AVX512
+- go: move
+ asm: VMOVDQU(8|16|32|64)
+ zeroing: true
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int|uint
+ inVariant:
+ -
+ class: mask
+ out:
+ - *v
+
+- go: Expand
+ asm: "VPEXPAND[BWDQ]|VEXPANDP[SD]"
+ in:
+ # The mask in Expand is a control mask rather than a write mask, so it's not optional.
+ - class: mask
+ - *any
+ out:
+ - *any
+
+- go: Broadcast128
+ asm: VPBROADCAST[BWDQ]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+
+# weirdly, this one case on AVX2 is memory-operand-only
+- go: Broadcast128
+ asm: VPBROADCASTQ
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: 64
+ base: int
+ OverwriteBase: float
+ out:
+ - class: vreg
+ bits: 128
+ elemBits: 64
+ base: int
+ OverwriteBase: float
+
+- go: Broadcast256
+ asm: VPBROADCAST[BWDQ]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 256
+ elemBits: $e
+ base: $b
+
+- go: Broadcast512
+ asm: VPBROADCAST[BWDQ]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 512
+ elemBits: $e
+ base: $b
+
+- go: Broadcast128
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+
+- go: Broadcast256
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 256
+ elemBits: $e
+ base: $b
+
+- go: Broadcast512
+ asm: VBROADCASTS[SD]
+ in:
+ - class: vreg
+ bits: 128
+ elemBits: $e
+ base: $b
+ out:
+ - class: vreg
+ bits: 512
+ elemBits: $e
+ base: $b
+
+# VPSHUFB for 128-bit byte shuffles will be picked with higher priority than VPERMB, given its lower CPU feature requirement. (It's AVX)
+- go: PermuteOrZero
+ asm: VPSHUFB
+ addDoc: !string |-
+ // The lower four bits of each byte-sized index in indices select an element from x,
+ // unless the index's sign bit is set in which case zero is used instead.
+ in:
+ - &128any
+ bits: 128
+ go: $t
+ - bits: 128
+ name: indices
+ base: int # always signed
+ out:
+ - *128any
+
+- go: PermuteOrZeroGrouped
+ asm: VPSHUFB
+ addDoc: !string |-
+ // result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+ // The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+ // unless the index's sign bit is set in which case zero is used instead.
+ // Each group is of size 128-bit.
+ in:
+ - &256Or512any
+ bits: "256|512"
+ go: $t
+ - bits: "256|512"
+ base: int
+ name: indices
+ out:
+ - *256Or512any
+
+- go: permuteScalars
+ asm: VPSHUFD
+ addDoc: !string |-
+ // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ in:
+ - *128any
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *128any
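+ # permuteScalars illustration (using the bit-slice notation above): with
+ # indices = 0b_11_00_01_10 the selected positions are 2, 1, 0, 3, so
+ # result = {x[2], x[1], x[0], x[3]}.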
+
+- go: permuteScalarsGrouped
+ asm: VPSHUFD
+ addDoc: !string |-
+ // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ // Each group is of size 128-bit.
+ in:
+ - *256Or512any
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *256Or512any
+
+- go: permuteScalarsLo
+ asm: VPSHUFLW
+ addDoc: !string |-
+ // result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ in:
+ - &128lanes8
+ bits: 128
+ go: $t
+ elemBits: 16
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *128lanes8
+
+- go: permuteScalarsLoGrouped
+ asm: VPSHUFLW
+ addDoc: !string |-
+ //
+ // result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x[4], x[5], x[6], x[7],
+ // x_group1[indices[0:2]], ...}
+ //
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ // Each group is of size 128-bit.
+ in:
+ - &256Or512lanes8
+ bits: "256|512"
+ go: $t
+ elemBits: 16
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *256Or512lanes8
+
+- go: permuteScalarsHi
+ asm: VPSHUFHW
+ addDoc: !string |-
+ // result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ in:
+ - *128lanes8
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *128lanes8
+
+- go: permuteScalarsHiGrouped
+ asm: VPSHUFHW
+ addDoc: !string |-
+ // result =
+ //
+ // {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+ // x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+ //
+ // Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+ // Each group is of size 128-bit.
+ in:
+ - *256Or512lanes8
+ - class: immediate
+ immOffset: 0
+ name: indices
+ hideMaskMethods: true
+ out:
+ - *256Or512lanes8
+
+- go: InterleaveHi
+ asm: VPUNPCKH(QDQ|DQ|WD|WB)
+ in:
+ - *128any
+ - *128any
+ inVariant: []
+ out:
+ - *128any
+
+- go: InterleaveLo
+ asm: VPUNPCKL(QDQ|DQ|WD|WB)
+ in:
+ - *128any
+ - *128any
+ inVariant: []
+ out:
+ - *128any
+
+- go: InterleaveHiGrouped
+ asm: VPUNPCKH(QDQ|DQ|WD|WB)
+ in:
+ - *256Or512any
+ - *256Or512any
+ inVariant: []
+ out:
+ - *256Or512any
+
+- go: InterleaveLoGrouped
+ asm: VPUNPCKL(QDQ|DQ|WD|WB)
+ in:
+ - *256Or512any
+ - *256Or512any
+ inVariant: []
+ out:
+ - *256Or512any
+
+# These are all described separately to carry the name of the constant parameter
+
+- go: concatSelectedConstant
+ asm: VSHUFPS
+ width: 32
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstant
+ asm: VSHUFPS
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ OverwriteBase: int
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstant
+ asm: VSHUFPS
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ OverwriteBase: uint
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPS
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPS
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ OverwriteBase: int
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPS
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ OverwriteBase: uint
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: h1h0l1l0
+ inVariant: []
+ out:
+ - *v
+
+
+ # 64 bit versions
+
+- go: concatSelectedConstant
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilo
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstant
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ OverwriteBase: int
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilo
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstant
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 128
+ OverwriteBase: uint
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilo
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilos
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ OverwriteBase: int
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilos
+ inVariant: []
+ out:
+ - *v
+
+- go: concatSelectedConstantGrouped
+ asm: VSHUFPD
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: "256|512"
+ OverwriteBase: uint
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: hilos
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2F128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
+ //
+ // returns {70, 71, 40, 41}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 256
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2F128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+ //
+ // returns {70, 71, 72, 73, 40, 41, 42, 43}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: float
+ bits: 256
+ OverwriteElementBits: 32
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2I128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71})
+ //
+ // returns {70, 71, 40, 41}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int|uint
+ bits: 256
+ OverwriteElementBits: 64
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2I128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+ //
+ // returns {70, 71, 72, 73, 40, 41, 42, 43}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int|uint
+ bits: 256
+ OverwriteElementBits: 32
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2I128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.NAME(3, 0,
+ // {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+ //
+ // returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int|uint
+ bits: 256
+ OverwriteElementBits: 16
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: Select128FromPair
+ asm: VPERM2I128
+ operandOrder: II
+ addDoc: !string |-
+ // For example,
+ //
+ // {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.NAME(3, 0,
+ // {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+ //
+ // returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+ in:
+ - &v
+ go: $t
+ class: vreg
+ base: int|uint
+ bits: 256
+ OverwriteElementBits: 8
+ - *v
+ - class: immediate
+ immOffset: 0
+ name: "lo, hi"
+ inVariant: []
+ out:
+ - *v
+
+- go: ConcatShiftBytesRight
+ asm: VPALIGNR
+ in:
+ - &uint128
+ go: $t
+ base: uint
+ bits: 128
+ - *uint128
+ - class: immediate
+ immOffset: 0
+ out:
+ - *uint128
+
+- go: ConcatShiftBytesRightGrouped
+ asm: VPALIGNR
+ in:
+ - &uint256512
+ go: $t
+ base: uint
+ bits: 256|512
+ - *uint256512
+ - class: immediate
+ immOffset: 0
+ out:
+ - *uint256512
--- /dev/null
+!sum
+- go: Mul
+ commutative: true
+ documentation: !string |-
+ // NAME multiplies corresponding elements of two vectors.
+- go: MulEvenWiden
+ commutative: true
+ documentation: !string |-
+ // NAME multiplies even-indexed elements, widening the result.
+ // Result[i] = v1.Even[i] * v2.Even[i].
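+ # MulEvenWiden illustration: for 32-bit inputs {1, 2, 3, 4} and {5, 6, 7, 8},
+ # the even-indexed elements are multiplied, giving the widened result {1*5, 3*7}.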
+- go: MulHigh
+ commutative: true
+ documentation: !string |-
+ // NAME multiplies elements and stores the high part of the result.
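+ # MulHigh illustration: for 16-bit elements 0x4000 and 0x0008, the full product
+ # is 0x20000, so the stored high half is 0x0002.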
--- /dev/null
+!sum
+# "Normal" multiplication is only available for floats.
+# This covers only single and double precision.
+- go: Mul
+ asm: "VMULP[SD]"
+ in:
+ - &fp
+ go: $t
+ base: float
+ - *fp
+ out:
+ - *fp
+
+# Integer multiplications.
+
+# MulEvenWiden
+# Dword only.
+- go: MulEvenWiden
+ asm: "VPMULDQ"
+ in:
+ - &intNot64
+ go: $t
+ elemBits: 8|16|32
+ base: int
+ - *intNot64
+ out:
+ - &int2
+ go: $t2
+ base: int
+- go: MulEvenWiden
+ asm: "VPMULUDQ"
+ in:
+ - &uintNot64
+ go: $t
+ elemBits: 8|16|32
+ base: uint
+ - *uintNot64
+ out:
+ - &uint2
+ go: $t2
+ base: uint
+
+# MulHigh
+# Word only.
+- go: MulHigh
+ asm: "VPMULHW"
+ in:
+ - &int
+ go: $t
+ base: int
+ - *int
+ out:
+ - *int
+- go: MulHigh
+ asm: "VPMULHUW"
+ in:
+ - &uint
+ go: $t
+ base: uint
+ - *uint
+ out:
+ - *uint
+
+# MulLow
+# signed and unsigned are the same for lower bits.
+- go: Mul
+ asm: "VPMULL[WDQ]"
+ in:
+ - &any
+ go: $t
+ - *any
+ out:
+ - *any
--- /dev/null
+!sum
+- go: LeadingZeros
+ commutative: false
+ documentation: !string |-
+ // NAME counts the leading zeros of each element in x.
+- go: AESEncryptOneRound
+ commutative: false
+ documentation: !string |-
+ // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
+ // x is the state array; its elements from low index to high are s00, s10, s20, s30, s01, ..., s33.
+ // y is the chunk of w array in use.
+ // result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
+- go: AESEncryptLastRound
+ commutative: false
+ documentation: !string |-
+ // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
+ // x is the state array; its elements from low index to high are s00, s10, s20, s30, s01, ..., s33.
+ // y is the chunk of w array in use.
+ // result = AddRoundKey((ShiftRows(SubBytes(x))), y)
+- go: AESRoundKeyGenAssist
+ commutative: false
+ documentation: !string |-
+ // NAME performs some components of KeyExpansion in AES cipher algorithm defined in FIPS 197.
+ // x is an array of AES words, but only x[0] and x[2] are used.
+ // r is a value from the Rcon constant array.
+ // result[0] = XOR(SubWord(RotWord(x[0])), r)
+ // result[1] = SubWord(x[1])
+ // result[2] = XOR(SubWord(RotWord(x[2])), r)
+ // result[3] = SubWord(x[3])
+- go: AESDecryptOneRound
+ commutative: false
+ documentation: !string |-
+ // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
+ // x is the state array; its elements from low index to high are s00, s10, s20, s30, s01, ..., s33.
+ // y is the chunk of dw array in use.
+ // result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
+- go: AESDecryptLastRound
+ commutative: false
+ documentation: !string |-
+ // NAME performs a series of operations in AES cipher algorithm defined in FIPS 197.
+ // x is the state array; its elements from low index to high are s00, s10, s20, s30, s01, ..., s33.
+ // y is the chunk of dw array in use.
+ // result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
+- go: AESInvMixColumns
+ commutative: false
+ documentation: !string |-
+ // NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
+ // x is the chunk of w array in use.
+ // result = InvMixColumns(x)
+- go: SHA1FourRounds
+ commutative: false
+ documentation: !string |-
+ // NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+ // x contains the state variables a, b, c and d from upper to lower order.
+ // y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+ // result = the state variables a', b', c', d' updated after 4 rounds.
+ // constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+- go: SHA1NextE
+ commutative: false
+ documentation: !string |-
+ // NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+ // x contains the state variable a (before the 4 rounds), placed in the upper element.
+ // y is the elements of W array for next 4 rounds from upper to lower order.
+ // result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+ // from upper to lower order.
+ // For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
+ // for y to get e' added to H4. (Note that the value of e' is computed only from x; the values of y don't affect the
+ // computation of e'.)
+- go: SHA1Message1
+ commutative: false
+ documentation: !string |-
+ // NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+ // x = {W3, W2, W1, W0}
+ // y = {0, 0, W5, W4}
+ // result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+- go: SHA1Message2
+ commutative: false
+ documentation: !string |-
+ // NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+ // x = result of 2.
+ // y = {W15, W14, W13}
+ // result = {W19, W18, W17, W16}
+- go: SHA256TwoRounds
+ commutative: false
+ documentation: !string |-
+ // NAME does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+ // x = {h, g, d, c}
+ // y = {f, e, b, a}
+ // z = {W0+K0, W1+K1}
+ // result = {f', e', b', a'}
+ // The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+ // the corresponding element of the W array to make the input data z.
+ // The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+ // y (the state variables a, b, e, f before the 2 rounds).
+- go: SHA256Message1
+ commutative: false
+ documentation: !string |-
+ // NAME does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+ // x = {W0, W1, W2, W3}
+ // y = {W4, 0, 0, 0}
+ // result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+- go: SHA256Message2
+ commutative: false
+ documentation: !string |-
+ // NAME does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+ // x = result of 2
+ // y = {0, 0, W14, W15}
+ // result = {W16, W17, W18, W19}
\ No newline at end of file
--- /dev/null
+!sum
+- go: LeadingZeros
+ asm: "VPLZCNT[DQ]"
+ in:
+ - &any
+ go: $t
+ out:
+ - *any
+- go: AESEncryptOneRound
+ asm: VAESENC
+ in:
+ - &uint8s
+ base: uint
+ overwriteElementBits: 8
+ - &uint32s
+ base: uint
+ overwriteElementBits: 32
+ out:
+ - *uint8s
+- go: AESEncryptLastRound
+ asm: VAESENCLAST
+ in:
+ - *uint8s
+ - *uint32s
+ out:
+ - *uint8s
+- go: AESRoundKeyGenAssist
+ asm: VAESKEYGENASSIST
+ in:
+ - *uint32s
+ - class: immediate
+ immOffset: 0
+ name: rconVal
+ out:
+ - *uint32s
+- go: AESDecryptOneRound
+ asm: VAESDEC
+ in:
+ - *uint8s
+ - *uint32s
+ out:
+ - *uint8s
+- go: AESDecryptLastRound
+ asm: VAESDECLAST
+ in:
+ - *uint8s
+ - *uint32s
+ out:
+ - *uint8s
+- go: AESInvMixColumns
+ asm: VAESIMC
+ in:
+ - *uint32s
+ out:
+ - *uint32s
+- go: SHA1FourRounds
+ asm: SHA1RNDS4
+ operandOrder: "SHA1RNDS4"
+ in: &2uint1imm
+ - &uint
+ go: $t
+ base: uint
+ - *uint
+ - class: immediate
+ immOffset: 0
+ out: &1uint
+ - *uint
+- go: SHA1NextE
+ asm: SHA1NEXTE
+ in: &2uint
+ - *uint
+ - *uint
+ out: *1uint
+- go: SHA1Message1
+ asm: SHA1MSG1
+ in: *2uint
+ out: *1uint
+- go: SHA1Message2
+ asm: SHA1MSG2
+ in: *2uint
+ out: *1uint
+- go: SHA256TwoRounds
+ asm: SHA256RNDS2
+ in:
+ - base: uint
+ - base: uint
+ - base: uint
+ overwriteElementBits: 32
+ out:
+ - base: uint
+- go: SHA256Message1
+ asm: SHA256MSG1
+ in: *2uint
+ out: *1uint
+- go: SHA256Message2
+ asm: SHA256MSG2
+ in: *2uint
+ out: *1uint
--- /dev/null
+!sum
+- go: ShiftAllLeft
+ nameAndSizeCheck: true
+ specialLower: sftimm
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+- go: ShiftAllRight
+ signed: false
+ nameAndSizeCheck: true
+ specialLower: sftimm
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+- go: ShiftAllRight
+ signed: true
+ specialLower: sftimm
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+- go: shiftAllLeftConst # no APIs, only ssa ops.
+ noTypes: "true"
+ noGenericOps: "true"
+ SSAVariant: "const" # to avoid its name colliding with reg version of this instruction, amend this to its ssa op name.
+ nameAndSizeCheck: true
+ commutative: false
+- go: shiftAllRightConst # no APIs, only ssa ops.
+ noTypes: "true"
+ noGenericOps: "true"
+ SSAVariant: "const"
+ signed: false
+ nameAndSizeCheck: true
+ commutative: false
+- go: shiftAllRightConst # no APIs, only ssa ops.
+ noTypes: "true"
+ noGenericOps: "true"
+ SSAVariant: "const"
+ signed: true
+ nameAndSizeCheck: true
+ commutative: false
+
+- go: ShiftLeft
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+- go: ShiftRight
+ signed: false
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+- go: ShiftRight
+ signed: true
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+- go: RotateAllLeft
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME rotates each element to the left by the number of bits specified by the immediate.
+- go: RotateLeft
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+- go: RotateAllRight
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME rotates each element to the right by the number of bits specified by the immediate.
+- go: RotateRight
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+- go: ShiftAllLeftConcat
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element of x to the left by the number of bits specified by the
+ // immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
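+ # ShiftAllLeftConcat, stated as a formula (per the description above), for N-bit
+ # elements and a shift s with 0 < s < N, values truncated to N bits:
+ # result[i] = (x[i] << s) | (y[i] >> (N - s))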
+- go: ShiftAllRightConcat
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element of x to the right by the number of bits specified by the
+ // immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
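+ # ShiftAllRightConcat, stated as a formula (per the description above), for N-bit
+ # elements and a shift s with 0 < s < N, values truncated to N bits:
+ # result[i] = (x[i] >> s) | (y[i] << (N - s))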
+- go: ShiftLeftConcat
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element of x to the left by the number of bits specified by the
+ // corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+- go: ShiftRightConcat
+ nameAndSizeCheck: true
+ commutative: false
+ documentation: !string |-
+ // NAME shifts each element of x to the right by the number of bits specified by the
+ // corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
--- /dev/null
+!sum
+# Integers
+# ShiftAll*
+- go: ShiftAllLeft
+ asm: "VPSLL[WDQ]"
+ in:
+ - &any
+ go: $t
+ - &vecAsScalar64
+ go: "Uint.*"
+ treatLikeAScalarOfSize: 64
+ out:
+ - *any
+- go: ShiftAllRight
+ signed: false
+ asm: "VPSRL[WDQ]"
+ in:
+ - &uint
+ go: $t
+ base: uint
+ - *vecAsScalar64
+ out:
+ - *uint
+- go: ShiftAllRight
+ signed: true
+ asm: "VPSRA[WDQ]"
+ in:
+ - &int
+ go: $t
+ base: int
+ - *vecAsScalar64
+ out:
+ - *int
+
+- go: shiftAllLeftConst
+ asm: "VPSLL[WDQ]"
+ in:
+ - *any
+ - &imm
+ class: immediate
+ immOffset: 0
+ out:
+ - *any
+- go: shiftAllRightConst
+ asm: "VPSRL[WDQ]"
+ in:
+ - *uint
+ - *imm
+ out:
+ - *uint
+- go: shiftAllRightConst
+ asm: "VPSRA[WDQ]"
+ in:
+ - *int
+ - *imm
+ out:
+ - *int
+
+# Shift* (variable)
+- go: ShiftLeft
+ asm: "VPSLLV[WD]"
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+# XED data for VPSLLVQ marks the element bits as 32, which does not match the actual semantics; we need to overwrite
+# it to 64.
+- go: ShiftLeft
+ asm: "VPSLLVQ"
+ in:
+ - &anyOverwriteElemBits
+ go: $t
+ overwriteElementBits: 64
+ - *anyOverwriteElemBits
+ out:
+ - *anyOverwriteElemBits
+- go: ShiftRight
+ signed: false
+ asm: "VPSRLV[WD]"
+ in:
+ - *uint
+ - *uint
+ out:
+ - *uint
+# XED data of VPSRLVQ needs the same overwrite as VPSLLVQ.
+- go: ShiftRight
+ signed: false
+ asm: "VPSRLVQ"
+ in:
+ - &uintOverwriteElemBits
+ go: $t
+ base: uint
+ overwriteElementBits: 64
+ - *uintOverwriteElemBits
+ out:
+ - *uintOverwriteElemBits
+- go: ShiftRight
+ signed: true
+ asm: "VPSRAV[WDQ]"
+ in:
+ - *int
+ - *int
+ out:
+ - *int
+
+# Rotate
+- go: RotateAllLeft
+ asm: "VPROL[DQ]"
+ in:
+ - *any
+ - &pureImm
+ class: immediate
+ immOffset: 0
+ name: shift
+ out:
+ - *any
+- go: RotateAllRight
+ asm: "VPROR[DQ]"
+ in:
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: RotateLeft
+ asm: "VPROLV[DQ]"
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+- go: RotateRight
+ asm: "VPRORV[DQ]"
+ in:
+ - *any
+ - *any
+ out:
+ - *any
+
+# Bizarre shifts.
+- go: ShiftAllLeftConcat
+ asm: "VPSHLD[WDQ]"
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftAllRightConcat
+ asm: "VPSHRD[WDQ]"
+ in:
+ - *any
+ - *any
+ - *pureImm
+ out:
+ - *any
+- go: ShiftLeftConcat
+ asm: "VPSHLDV[WDQ]"
+ in:
+ - *any
+ - *any
+ - *any
+ out:
+ - *any
+- go: ShiftRightConcat
+ asm: "VPSHRDV[WDQ]"
+ in:
+ - *any
+ - *any
+ - *any
+ out:
+ - *any
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "reflect"
+ "strconv"
+)
+
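+// pprints returns an indented, human-readable representation of v, built via
+// reflection; see pprinter.val for the kinds it supports.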
+func pprints(v any) string {
+ var pp pprinter
+ pp.val(reflect.ValueOf(v), 0)
+ return string(pp.buf)
+}
+
+type pprinter struct {
+ buf []byte
+}
+
+func (p *pprinter) indent(by int) {
+ for range by {
+ p.buf = append(p.buf, '\t')
+ }
+}
+
+func (p *pprinter) val(v reflect.Value, indent int) {
+ switch v.Kind() {
+ default:
+ p.buf = fmt.Appendf(p.buf, "unsupported kind %v", v.Kind())
+
+ case reflect.Bool:
+ p.buf = strconv.AppendBool(p.buf, v.Bool())
+
+ case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
+ p.buf = strconv.AppendInt(p.buf, v.Int(), 10)
+
+ case reflect.String:
+ p.buf = strconv.AppendQuote(p.buf, v.String())
+
+ case reflect.Pointer:
+ if v.IsNil() {
+ p.buf = append(p.buf, "nil"...)
+ } else {
+ p.buf = append(p.buf, "&"...)
+ p.val(v.Elem(), indent)
+ }
+
+ case reflect.Slice, reflect.Array:
+ p.buf = append(p.buf, "[\n"...)
+ for i := range v.Len() {
+ p.indent(indent + 1)
+ p.val(v.Index(i), indent+1)
+ p.buf = append(p.buf, ",\n"...)
+ }
+ p.indent(indent)
+ p.buf = append(p.buf, ']')
+
+ case reflect.Struct:
+ vt := v.Type()
+ p.buf = append(append(p.buf, vt.String()...), "{\n"...)
+ for f := range v.NumField() {
+ p.indent(indent + 1)
+ p.buf = append(append(p.buf, vt.Field(f).Name...), ": "...)
+ p.val(v.Field(f), indent+1)
+ p.buf = append(p.buf, ",\n"...)
+ }
+ p.indent(indent)
+ p.buf = append(p.buf, '}')
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "testing"
+
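+// TestSort exercises compareNatural, which is expected to order strings with
+// embedded numbers numerically (for example, "a2" before "a10").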
+func TestSort(t *testing.T) {
+ testCases := []struct {
+ s1, s2 string
+ want int
+ }{
+ {"a1", "a2", -1},
+ {"a11a", "a11b", -1},
+ {"a01a1", "a1a01", -1},
+ {"a2", "a1", 1},
+ {"a10", "a2", 1},
+ {"a1", "a10", -1},
+ {"z11", "z2", 1},
+ {"z2", "z11", -1},
+ {"abc", "abd", -1},
+ {"123", "45", 1},
+ {"file1", "file1", 0},
+ {"file", "file1", -1},
+ {"file1", "file", 1},
+ {"a01", "a1", -1},
+ {"a1a", "a1b", -1},
+ }
+
+ for _, tc := range testCases {
+ got := compareNatural(tc.s1, tc.s2)
+ result := "✅"
+ if got != tc.want {
+ result = "❌"
+ t.Errorf("%s CompareNatural(\"%s\", \"%s\") -> got %2d, want %2d\n", result, tc.s1, tc.s2, got, tc.want)
+ } else {
+ t.Logf("%s CompareNatural(\"%s\", \"%s\") -> got %2d, want %2d\n", result, tc.s1, tc.s2, got, tc.want)
+ }
+ }
+}
--- /dev/null
+# This file defines the possible types of each operand and result.
+#
+# In general, we're able to narrow this down on some attributes directly from
+# the machine instruction descriptions, but the Go mappings need to further
+# constrain them and how they relate. For example, on x86 we can't distinguish
+# int and uint, though we can distinguish these from float.
+
+in: !repeat
+- !sum &types
+ - {class: vreg, go: Int8x16, base: "int", elemBits: 8, bits: 128, lanes: 16}
+ - {class: vreg, go: Uint8x16, base: "uint", elemBits: 8, bits: 128, lanes: 16}
+ - {class: vreg, go: Int16x8, base: "int", elemBits: 16, bits: 128, lanes: 8}
+ - {class: vreg, go: Uint16x8, base: "uint", elemBits: 16, bits: 128, lanes: 8}
+ - {class: vreg, go: Int32x4, base: "int", elemBits: 32, bits: 128, lanes: 4}
+ - {class: vreg, go: Uint32x4, base: "uint", elemBits: 32, bits: 128, lanes: 4}
+ - {class: vreg, go: Int64x2, base: "int", elemBits: 64, bits: 128, lanes: 2}
+ - {class: vreg, go: Uint64x2, base: "uint", elemBits: 64, bits: 128, lanes: 2}
+ - {class: vreg, go: Float32x4, base: "float", elemBits: 32, bits: 128, lanes: 4}
+ - {class: vreg, go: Float64x2, base: "float", elemBits: 64, bits: 128, lanes: 2}
+ - {class: vreg, go: Int8x32, base: "int", elemBits: 8, bits: 256, lanes: 32}
+ - {class: vreg, go: Uint8x32, base: "uint", elemBits: 8, bits: 256, lanes: 32}
+ - {class: vreg, go: Int16x16, base: "int", elemBits: 16, bits: 256, lanes: 16}
+ - {class: vreg, go: Uint16x16, base: "uint", elemBits: 16, bits: 256, lanes: 16}
+ - {class: vreg, go: Int32x8, base: "int", elemBits: 32, bits: 256, lanes: 8}
+ - {class: vreg, go: Uint32x8, base: "uint", elemBits: 32, bits: 256, lanes: 8}
+ - {class: vreg, go: Int64x4, base: "int", elemBits: 64, bits: 256, lanes: 4}
+ - {class: vreg, go: Uint64x4, base: "uint", elemBits: 64, bits: 256, lanes: 4}
+ - {class: vreg, go: Float32x8, base: "float", elemBits: 32, bits: 256, lanes: 8}
+ - {class: vreg, go: Float64x4, base: "float", elemBits: 64, bits: 256, lanes: 4}
+ - {class: vreg, go: Int8x64, base: "int", elemBits: 8, bits: 512, lanes: 64}
+ - {class: vreg, go: Uint8x64, base: "uint", elemBits: 8, bits: 512, lanes: 64}
+ - {class: vreg, go: Int16x32, base: "int", elemBits: 16, bits: 512, lanes: 32}
+ - {class: vreg, go: Uint16x32, base: "uint", elemBits: 16, bits: 512, lanes: 32}
+ - {class: vreg, go: Int32x16, base: "int", elemBits: 32, bits: 512, lanes: 16}
+ - {class: vreg, go: Uint32x16, base: "uint", elemBits: 32, bits: 512, lanes: 16}
+ - {class: vreg, go: Int64x8, base: "int", elemBits: 64, bits: 512, lanes: 8}
+ - {class: vreg, go: Uint64x8, base: "uint", elemBits: 64, bits: 512, lanes: 8}
+ - {class: vreg, go: Float32x16, base: "float", elemBits: 32, bits: 512, lanes: 16}
+ - {class: vreg, go: Float64x8, base: "float", elemBits: 64, bits: 512, lanes: 8}
+
+ - {class: mask, go: Mask8x16, base: "int", elemBits: 8, bits: 128, lanes: 16}
+ - {class: mask, go: Mask16x8, base: "int", elemBits: 16, bits: 128, lanes: 8}
+ - {class: mask, go: Mask32x4, base: "int", elemBits: 32, bits: 128, lanes: 4}
+ - {class: mask, go: Mask64x2, base: "int", elemBits: 64, bits: 128, lanes: 2}
+ - {class: mask, go: Mask8x32, base: "int", elemBits: 8, bits: 256, lanes: 32}
+ - {class: mask, go: Mask16x16, base: "int", elemBits: 16, bits: 256, lanes: 16}
+ - {class: mask, go: Mask32x8, base: "int", elemBits: 32, bits: 256, lanes: 8}
+ - {class: mask, go: Mask64x4, base: "int", elemBits: 64, bits: 256, lanes: 4}
+ - {class: mask, go: Mask8x64, base: "int", elemBits: 8, bits: 512, lanes: 64}
+ - {class: mask, go: Mask16x32, base: "int", elemBits: 16, bits: 512, lanes: 32}
+ - {class: mask, go: Mask32x16, base: "int", elemBits: 32, bits: 512, lanes: 16}
+ - {class: mask, go: Mask64x8, base: "int", elemBits: 64, bits: 512, lanes: 8}
+
+
+ - {class: greg, go: float64, base: "float", bits: 64, lanes: 1}
+ - {class: greg, go: float32, base: "float", bits: 32, lanes: 1}
+ - {class: greg, go: int64, base: "int", bits: 64, lanes: 1}
+ - {class: greg, go: int32, base: "int", bits: 32, lanes: 1}
+ - {class: greg, go: int16, base: "int", bits: 16, lanes: 1}
+ - {class: greg, go: int8, base: "int", bits: 8, lanes: 1}
+ - {class: greg, go: uint64, base: "uint", bits: 64, lanes: 1}
+ - {class: greg, go: uint32, base: "uint", bits: 32, lanes: 1}
+ - {class: greg, go: uint16, base: "uint", bits: 16, lanes: 1}
+ - {class: greg, go: uint8, base: "uint", bits: 8, lanes: 1}
+
+# Special shapes just to make INSERT[IF]128 work.
+# The elemBits field of these shapes is wrong; it is overwritten by overwriteElemBits.
+ - {class: vreg, go: Int8x16, base: "int", elemBits: 128, bits: 128, lanes: 16}
+ - {class: vreg, go: Uint8x16, base: "uint", elemBits: 128, bits: 128, lanes: 16}
+ - {class: vreg, go: Int16x8, base: "int", elemBits: 128, bits: 128, lanes: 8}
+ - {class: vreg, go: Uint16x8, base: "uint", elemBits: 128, bits: 128, lanes: 8}
+ - {class: vreg, go: Int32x4, base: "int", elemBits: 128, bits: 128, lanes: 4}
+ - {class: vreg, go: Uint32x4, base: "uint", elemBits: 128, bits: 128, lanes: 4}
+ - {class: vreg, go: Int64x2, base: "int", elemBits: 128, bits: 128, lanes: 2}
+ - {class: vreg, go: Uint64x2, base: "uint", elemBits: 128, bits: 128, lanes: 2}
+
+ - {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 256, lanes: 32}
+ - {class: vreg, go: Uint8x32, base: "uint", elemBits: 128, bits: 256, lanes: 32}
+ - {class: vreg, go: Int16x16, base: "int", elemBits: 128, bits: 256, lanes: 16}
+ - {class: vreg, go: Uint16x16, base: "uint", elemBits: 128, bits: 256, lanes: 16}
+ - {class: vreg, go: Int32x8, base: "int", elemBits: 128, bits: 256, lanes: 8}
+ - {class: vreg, go: Uint32x8, base: "uint", elemBits: 128, bits: 256, lanes: 8}
+ - {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4}
+ - {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4}
+
+# Special for carryless multiply
+ - {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
+
+# Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
+# The elemBits field of these shapes is wrong; it is overwritten by overwriteElemBits.
+ - {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32}
+ - {class: vreg, go: Uint8x32, base: "uint", elemBits: 128, bits: 512, lanes: 32}
+ - {class: vreg, go: Int16x16, base: "int", elemBits: 128, bits: 512, lanes: 16}
+ - {class: vreg, go: Uint16x16, base: "uint", elemBits: 128, bits: 512, lanes: 16}
+ - {class: vreg, go: Int32x8, base: "int", elemBits: 128, bits: 512, lanes: 8}
+ - {class: vreg, go: Uint32x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
+ - {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 512, lanes: 4}
+ - {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 512, lanes: 4}
+
+ - {class: immediate, go: Immediate} # TODO: we only support imms that are not used as value -- usually as instruction semantic predicate like VPCMP as of now.
+inVariant: !repeat
+- *types
+out: !repeat
+- *types
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "cmp"
+ "fmt"
+ "log"
+ "maps"
+ "reflect"
+ "regexp"
+ "slices"
+ "strconv"
+ "strings"
+
+ "simd/archsimd/_gen/unify"
+
+ "golang.org/x/arch/x86/xeddata"
+ "gopkg.in/yaml.v3"
+)
+
+const (
+ NOT_REG_CLASS = iota // not a register
+ VREG_CLASS // classify as a vector register
+ GREG_CLASS // classify as a general register
+)
+
+// instVariant is a bitmap indicating a variant of an instruction that has
+// optional parameters.
+type instVariant uint8
+
+const (
+ instVariantNone instVariant = 0
+
+ // instVariantMasked indicates that this is the masked variant of an
+ // optionally-masked instruction.
+ instVariantMasked instVariant = 1 << iota
+)
+
+var operandRemarks int
+
+// TODO: Doc. Returns Values with Def domains.
+func loadXED(xedPath string) []*unify.Value {
+ // TODO: Obviously a bunch more to do here.
+
+ db, err := xeddata.NewDatabase(xedPath)
+ if err != nil {
+ log.Fatalf("open database: %v", err)
+ }
+
+ var defs []*unify.Value
+ type opData struct {
+ inst *xeddata.Inst
+ ops []operand
+ mem string
+ }
+ // Maps from opcode to opdata(s).
+ memOps := make(map[string][]opData, 0)
+ otherOps := make(map[string][]opData, 0)
+ appendDefs := func(inst *xeddata.Inst, ops []operand, addFields map[string]string) {
+ applyQuirks(inst, ops)
+
+ defsPos := len(defs)
+ defs = append(defs, instToUVal(inst, ops, addFields)...)
+
+ if *flagDebugXED {
+ for i := defsPos; i < len(defs); i++ {
+ y, _ := yaml.Marshal(defs[i])
+ fmt.Printf("==>\n%s\n", y)
+ }
+ }
+ }
+ err = xeddata.WalkInsts(xedPath, func(inst *xeddata.Inst) {
+ inst.Pattern = xeddata.ExpandStates(db, inst.Pattern)
+
+ switch {
+ case inst.RealOpcode == "N":
+ return // Skip unstable instructions
+ case !(strings.HasPrefix(inst.Extension, "AVX") || strings.HasPrefix(inst.Extension, "SHA")):
+ // We're only interested in AVX and SHA instructions.
+ return
+ }
+
+ if *flagDebugXED {
+ fmt.Printf("%s:\n%+v\n", inst.Pos, inst)
+ }
+
+ ops, err := decodeOperands(db, strings.Fields(inst.Operands))
+ if err != nil {
+ operandRemarks++
+ if *Verbose {
+ log.Printf("%s: [%s] %s", inst.Pos, inst.Opcode(), err)
+ }
+ return
+ }
+ var data map[string][]opData
+ mem := checkMem(ops)
+ if mem == "vbcst" {
+ // A pure vreg variant might exist, wait for later to see if we can
+ // merge them
+ data = memOps
+ } else {
+ data = otherOps
+ }
+ opcode := inst.Opcode()
+ if _, ok := data[opcode]; !ok {
+ s := make([]opData, 1)
+ s[0] = opData{inst, ops, mem}
+ data[opcode] = s
+ } else {
+ data[opcode] = append(data[opcode], opData{inst, ops, mem})
+ }
+ })
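+ // Emit the non-broadcast operations. For each pure-register (noMem) operation,
+ // look for a broadcast-memory (vbcst) variant with matching vreg shapes; if one
+ // is found, merge it in by tagging this definition with memFeatures=vbcst
+ // instead of emitting the memory form separately.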
+ for _, s := range otherOps {
+ for _, o := range s {
+ addFields := map[string]string{}
+ if o.mem == "noMem" {
+ opcode := o.inst.Opcode()
+ // Check whether a vbcst variant of this operation exists.
+ // First check the opcode.
+ // Keep this logic in sync with [decodeOperands]
+ if ms, ok := memOps[opcode]; ok {
+ feat1, ok1 := decodeCPUFeature(o.inst)
+ // Then check whether there is such an operation whose vreg
+ // shapes all match at the same operand indices.
+ var feat1Match, feat2Match string
+ matchIdx := -1
+ var featMismatchCnt int
+ outer:
+ for i, m := range ms {
+ // Their CPU feature should match first
+ var featMismatch bool
+ feat2, ok2 := decodeCPUFeature(m.inst)
+ if !ok1 || !ok2 {
+ continue
+ }
+ if feat1 != feat2 {
+ featMismatch = true
+ featMismatchCnt++
+ }
+ if len(o.ops) == len(m.ops) {
+ for j := range o.ops {
+ if reflect.TypeOf(o.ops[j]) == reflect.TypeOf(m.ops[j]) {
+ v1, ok3 := o.ops[j].(operandVReg)
+ v2, _ := m.ops[j].(operandVReg)
+ if !ok3 {
+ continue
+ }
+ if v1.vecShape != v2.vecShape {
+ // A mismatch, skip this memOp
+ continue outer
+ }
+ } else {
+ _, ok3 := o.ops[j].(operandVReg)
+ _, ok4 := m.ops[j].(operandMem)
+ // The only difference must be the vreg and mem, no other cases.
+ if !ok3 || !ok4 {
+ // A mismatch, skip this memOp
+ continue outer
+ }
+ }
+ }
+ // Found a match; record it.
+ matchIdx = i
+ feat1Match = feat1
+ feat2Match = feat2
+ if featMismatchCnt > 1 {
+ panic("multiple feature mismatch vbcst memops detected, simdgen failed to distinguish")
+ }
+ if !featMismatch {
+ // A CPU-feature mismatch is acceptable, but keep looking for an exact feature match.
+ break
+ }
+ }
+ }
+ // Remove the match from memOps; it's now merged into this pure-vreg operation.
+ if matchIdx != -1 {
+ memOps[opcode] = append(memOps[opcode][:matchIdx], memOps[opcode][matchIdx+1:]...)
+ // Merge is done by adding a new field
+ // Right now we only have vbcst
+ addFields["memFeatures"] = "vbcst"
+ if feat1Match != feat2Match {
+ addFields["memFeaturesData"] = fmt.Sprintf("feat1=%s;feat2=%s", feat1Match, feat2Match)
+ }
+ }
+ }
+ }
+ appendDefs(o.inst, o.ops, addFields)
+ }
+ }
+ for _, ms := range memOps {
+ for _, m := range ms {
+ if *Verbose {
+ log.Printf("mem op not merged: %s, %v\n", m.inst.Opcode(), m)
+ }
+ appendDefs(m.inst, m.ops, nil)
+ }
+ }
+ if err != nil {
+ log.Fatalf("walk insts: %v", err)
+ }
+
+ if len(unknownFeatures) > 0 {
+ if !*Verbose {
+ nInst := 0
+ for _, insts := range unknownFeatures {
+ nInst += len(insts)
+ }
+ log.Printf("%d unhandled CPU features for %d instructions (use -v for details)", len(unknownFeatures), nInst)
+ } else {
+ keys := slices.SortedFunc(maps.Keys(unknownFeatures), func(a, b cpuFeatureKey) int {
+ return cmp.Or(cmp.Compare(a.Extension, b.Extension),
+ cmp.Compare(a.ISASet, b.ISASet))
+ })
+ for _, key := range keys {
+ if key.ISASet == "" || key.ISASet == key.Extension {
+ log.Printf("unhandled Extension %s", key.Extension)
+ } else {
+ log.Printf("unhandled Extension %s and ISASet %s", key.Extension, key.ISASet)
+ }
+ log.Printf(" opcodes: %s", slices.Sorted(maps.Keys(unknownFeatures[key])))
+ }
+ }
+ }
+
+ return defs
+}
+
+var (
+ maskRequiredRe = regexp.MustCompile(`VPCOMPRESS[BWDQ]|VCOMPRESSP[SD]|VPEXPAND[BWDQ]|VEXPANDP[SD]`)
+ maskOptionalRe = regexp.MustCompile(`VPCMP(EQ|GT|U)?[BWDQ]|VCMPP[SD]`)
+)
+
+func applyQuirks(inst *xeddata.Inst, ops []operand) {
+ opc := inst.Opcode()
+ switch {
+ case maskRequiredRe.MatchString(opc):
+ // The mask on these instructions is marked optional, but the
+ // instruction is pointless without the mask.
+ for i, op := range ops {
+ if op, ok := op.(operandMask); ok {
+ op.optional = false
+ ops[i] = op
+ }
+ }
+
+ case maskOptionalRe.MatchString(opc):
+ // Conversely, these masks should be marked optional and aren't.
+ for i, op := range ops {
+ if op, ok := op.(operandMask); ok && op.action.r {
+ op.optional = true
+ ops[i] = op
+ }
+ }
+ }
+}
+
+type operandCommon struct {
+ action operandAction
+}
+
+// operandAction defines whether this operand is read and/or written.
+//
+// TODO: Should this live in [xeddata.Operand]?
+type operandAction struct {
+ r bool // Read
+ w bool // Written
+ cr bool // Read is conditional (implies r==true)
+ cw bool // Write is conditional (implies w==true)
+}
+
+type operandMem struct {
+ operandCommon
+ vecShape
+ elemBaseType scalarBaseType
+ // The following fields are not flushed to the final output
+ // Supports full-vector broadcasting; implies the operand having a "vv"(vector vector) type specified in width and
+ // the instruction is with attribute TXT=BCASTSTR.
+ vbcst bool
+ unknown bool // unknown kind
+}
+
+type vecShape struct {
+ elemBits int // Element size in bits
+ bits int // Register width in bits (total vector bits)
+ fixedName string // the fixed register name
+}
+
+type operandVReg struct { // Vector register
+ operandCommon
+ vecShape
+ elemBaseType scalarBaseType
+}
+
+type operandGReg struct { // General register
+ operandCommon
+ vecShape
+ elemBaseType scalarBaseType
+}
+
+// operandMask is a vector mask.
+//
+// Regardless of the actual mask representation, the [vecShape] of this operand
+// corresponds to the "bit for bit" type of mask. That is, elemBits gives the
+// element width covered by each mask element, and bits/elemBits gives the total
+// number of mask elements. (bits gives the total number of bits as if this were
+// a bit-for-bit mask, which may be meaningless on its own.)
+type operandMask struct {
+ operandCommon
+ vecShape
+ // The number of mask elements is bits/elemBits.
+
+ allMasks bool // If set, size cannot be inferred because all operands are masks.
+
+ // Mask can be omitted, in which case it defaults to K0/"no mask"
+ optional bool
+}
+
+type operandImm struct {
+ operandCommon
+ bits int // Immediate size in bits
+}
+
+type operand interface {
+ common() operandCommon
+ addToDef(b *unify.DefBuilder)
+}
+
+func strVal(s any) *unify.Value {
+ return unify.NewValue(unify.NewStringExact(fmt.Sprint(s)))
+}
+
+func (o operandCommon) common() operandCommon {
+ return o
+}
+
+func (o operandMem) addToDef(b *unify.DefBuilder) {
+ b.Add("class", strVal("memory"))
+ if o.unknown {
+ return
+ }
+ baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
+ if err != nil {
+ panic("parsing baseRe: " + err.Error())
+ }
+ b.Add("base", unify.NewValue(baseDomain))
+ b.Add("bits", strVal(o.bits))
+ if o.elemBits != o.bits {
+ b.Add("elemBits", strVal(o.elemBits))
+ }
+}
+
+func (o operandVReg) addToDef(b *unify.DefBuilder) {
+ baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
+ if err != nil {
+ panic("parsing baseRe: " + err.Error())
+ }
+ b.Add("class", strVal("vreg"))
+ b.Add("bits", strVal(o.bits))
+ b.Add("base", unify.NewValue(baseDomain))
+ // If elemBits == bits, then the vector can be ANY shape. This happens with,
+ // for example, logical ops.
+ if o.elemBits != o.bits {
+ b.Add("elemBits", strVal(o.elemBits))
+ }
+ if o.fixedName != "" {
+ b.Add("fixedReg", strVal(o.fixedName))
+ }
+}
+
+func (o operandGReg) addToDef(b *unify.DefBuilder) {
+ baseDomain, err := unify.NewStringRegex(o.elemBaseType.regex())
+ if err != nil {
+ panic("parsing baseRe: " + err.Error())
+ }
+ b.Add("class", strVal("greg"))
+ b.Add("bits", strVal(o.bits))
+ b.Add("base", unify.NewValue(baseDomain))
+ if o.elemBits != o.bits {
+ b.Add("elemBits", strVal(o.elemBits))
+ }
+ if o.fixedName != "" {
+ b.Add("fixedReg", strVal(o.fixedName))
+ }
+}
+
+func (o operandMask) addToDef(b *unify.DefBuilder) {
+ b.Add("class", strVal("mask"))
+ if o.allMasks {
+ // If all operands are masks, omit sizes and let unification determine mask sizes.
+ return
+ }
+ b.Add("elemBits", strVal(o.elemBits))
+ b.Add("bits", strVal(o.bits))
+ if o.fixedName != "" {
+ b.Add("fixedReg", strVal(o.fixedName))
+ }
+}
+
+func (o operandImm) addToDef(b *unify.DefBuilder) {
+ b.Add("class", strVal("immediate"))
+ b.Add("bits", strVal(o.bits))
+}
+
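+// actionEncoding maps XED operand action strings to their read/write
+// semantics.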
+var actionEncoding = map[string]operandAction{
+ "r": {r: true},
+ "cr": {r: true, cr: true},
+ "w": {w: true},
+ "cw": {w: true, cw: true},
+ "rw": {r: true, w: true},
+ "crw": {r: true, w: true, cr: true},
+ "rcw": {r: true, w: true, cw: true},
+}
+
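+// decodeOperand decodes a single XED operand description into an operand
+// value. It returns a nil operand (and nil error) for operands that should
+// be ignored.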
+func decodeOperand(db *xeddata.Database, operand string) (operand, error) {
+ op, err := xeddata.NewOperand(db, operand)
+ if err != nil {
+ log.Fatalf("parsing operand %q: %v", operand, err)
+ }
+ if *flagDebugXED {
+ fmt.Printf(" %+v\n", op)
+ }
+
+ if strings.HasPrefix(op.Name, "EMX_BROADCAST") {
+ // This refers to a set of macros defined in all-state.txt that set a
+ // BCAST operand to various fixed values. But the BCAST operand is
+ // itself suppressed and "internal", so I think we can just ignore this
+ // operand.
+ return nil, nil
+ }
+
+ // TODO: See xed_decoded_inst_operand_action. This might need to be more
+ // complicated.
+ action, ok := actionEncoding[op.Action]
+ if !ok {
+ return nil, fmt.Errorf("unknown action %q", op.Action)
+ }
+ common := operandCommon{action: action}
+
+ lhs := op.NameLHS()
+ if strings.HasPrefix(lhs, "MEM") {
+		// The XED data appears to have an inconsistency for VPADDD, marking the
+		// attribute TXT=VPBROADCASTD instead of the canonical TXT=BCASTSTR.
+ if op.Width == "vv" && (op.Attributes["TXT=BCASTSTR"] ||
+ op.Attributes["TXT=VPBROADCASTD"]) {
+ baseType, elemBits, ok := decodeType(op)
+ if !ok {
+ return nil, fmt.Errorf("failed to decode memory width %q", operand)
+ }
+			// This operand has two possible widths ([bits]):
+			// 1. the same width as the other operands, or
+			// 2. the element width of the other operands (broadcasting).
+			// We default to 2 here; later we set a field on the operation
+			// to indicate this dual-width property.
+ shape := vecShape{elemBits: elemBits, bits: elemBits}
+ return operandMem{
+ operandCommon: common,
+ vecShape: shape,
+ elemBaseType: baseType,
+ vbcst: true,
+ unknown: false,
+ }, nil
+ }
+ // TODO: parse op.Width better to handle all cases
+ // Right now this will at least miss VPBROADCAST.
+ return operandMem{
+ operandCommon: common,
+ unknown: true,
+ }, nil
+ } else if strings.HasPrefix(lhs, "REG") {
+ if op.Width == "mskw" {
+ // The mask operand doesn't specify a width. We have to infer it.
+ //
+ // XED uses the marker ZEROSTR to indicate that a mask operand is
+ // optional and, if omitted, implies K0, aka "no mask".
+ return operandMask{
+ operandCommon: common,
+ optional: op.Attributes["TXT=ZEROSTR"],
+ }, nil
+ } else {
+ class, regBits, fixedReg := decodeReg(op)
+ if class == NOT_REG_CLASS {
+ return nil, fmt.Errorf("failed to decode register %q", operand)
+ }
+ baseType, elemBits, ok := decodeType(op)
+ if !ok {
+ return nil, fmt.Errorf("failed to decode register width %q", operand)
+ }
+ shape := vecShape{elemBits: elemBits, bits: regBits, fixedName: fixedReg}
+ if class == VREG_CLASS {
+ return operandVReg{
+ operandCommon: common,
+ vecShape: shape,
+ elemBaseType: baseType,
+ }, nil
+ }
+ // general register
+ m := min(shape.bits, shape.elemBits)
+ shape.bits, shape.elemBits = m, m
+ return operandGReg{
+ operandCommon: common,
+ vecShape: shape,
+ elemBaseType: baseType,
+ }, nil
+
+ }
+ } else if strings.HasPrefix(lhs, "IMM") {
+ _, bits, ok := decodeType(op)
+ if !ok {
+ return nil, fmt.Errorf("failed to decode register width %q", operand)
+ }
+ return operandImm{
+ operandCommon: common,
+ bits: bits,
+ }, nil
+ }
+
+ // TODO: BASE and SEG
+ return nil, fmt.Errorf("unknown operand LHS %q in %q", lhs, operand)
+}
+
+func decodeOperands(db *xeddata.Database, operands []string) (ops []operand, err error) {
+ // Decode the XED operand descriptions.
+ for _, o := range operands {
+ op, err := decodeOperand(db, o)
+ if err != nil {
+ return nil, err
+ }
+ if op != nil {
+ ops = append(ops, op)
+ }
+ }
+
+ // XED doesn't encode the size of mask operands. If there are mask operands,
+ // try to infer their sizes from other operands.
+ if err := inferMaskSizes(ops); err != nil {
+ return nil, fmt.Errorf("%w in operands %+v", err, operands)
+ }
+
+ return ops, nil
+}
+
+func inferMaskSizes(ops []operand) error {
+ // This is a heuristic and it falls apart in some cases:
+ //
+ // - Mask operations like KAND[BWDQ] have *nothing* in the XED to indicate
+ // mask size.
+ //
+ // - VINSERT*, VPSLL*, VPSRA*, and VPSRL* and some others naturally have
+ // mixed input sizes and the XED doesn't indicate which operands the mask
+ // applies to.
+ //
+ // - VPDP* and VP4DP* have really complex mixed operand patterns.
+ //
+ // I think for these we may just have to hand-write a table of which
+ // operands each mask applies to.
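+	//
+	// For example, if the only non-mask register operands are 512-bit vectors
+	// of 32-bit elements, each mask operand is inferred to cover 16 elements
+	// of 32 bits each.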
+ inferMask := func(r, w bool) error {
+ var masks []int
+ var rSizes, wSizes, sizes []vecShape
+ allMasks := true
+ hasWMask := false
+ for i, op := range ops {
+ action := op.common().action
+ if _, ok := op.(operandMask); ok {
+ if action.r && action.w {
+ return fmt.Errorf("unexpected rw mask")
+ }
+ if action.r == r || action.w == w {
+ masks = append(masks, i)
+ }
+ if action.w {
+ hasWMask = true
+ }
+ } else {
+ allMasks = false
+ if reg, ok := op.(operandVReg); ok {
+ if action.r {
+ rSizes = append(rSizes, reg.vecShape)
+ }
+ if action.w {
+ wSizes = append(wSizes, reg.vecShape)
+ }
+ }
+ }
+ }
+ if len(masks) == 0 {
+ return nil
+ }
+
+ if r {
+ sizes = rSizes
+ if len(sizes) == 0 {
+ sizes = wSizes
+ }
+ }
+ if w {
+ sizes = wSizes
+ if len(sizes) == 0 {
+ sizes = rSizes
+ }
+ }
+
+ if len(sizes) == 0 {
+			// If all operands are masks, leave the mask inference to the users.
+ if allMasks {
+ for _, i := range masks {
+ m := ops[i].(operandMask)
+ m.allMasks = true
+ ops[i] = m
+ }
+ return nil
+ }
+ return fmt.Errorf("cannot infer mask size: no register operands")
+ }
+ shape, ok := singular(sizes)
+ if !ok {
+ if !hasWMask && len(wSizes) == 1 && len(masks) == 1 {
+				// This pattern looks like a predicate mask, so its shape should align with the
+ // output. TODO: verify this is a safe assumption.
+ shape = wSizes[0]
+ } else {
+ return fmt.Errorf("cannot infer mask size: multiple register sizes %v", sizes)
+ }
+ }
+ for _, i := range masks {
+ m := ops[i].(operandMask)
+ m.vecShape = shape
+ ops[i] = m
+ }
+ return nil
+ }
+ if err := inferMask(true, false); err != nil {
+ return err
+ }
+ if err := inferMask(false, true); err != nil {
+ return err
+ }
+ return nil
+}
+
+// addOperandsToDef adds "in", "inVariant", and "out" to an instruction Def.
+//
+// Optional mask input operands are added to the inVariant field if
+// variant&instVariantMasked is set, and omitted otherwise.
+func addOperandsToDef(ops []operand, instDB *unify.DefBuilder, variant instVariant) {
+ var inVals, inVar, outVals []*unify.Value
+ asmPos := 0
+ for _, op := range ops {
+ var db unify.DefBuilder
+ op.addToDef(&db)
+ db.Add("asmPos", unify.NewValue(unify.NewStringExact(fmt.Sprint(asmPos))))
+
+ action := op.common().action
+ asmCount := 1 // # of assembly operands; 0 or 1
+ if action.r {
+ inVal := unify.NewValue(db.Build())
+ // If this is an optional mask, put it in the input variant tuple.
+ if mask, ok := op.(operandMask); ok && mask.optional {
+ if variant&instVariantMasked != 0 {
+ inVar = append(inVar, inVal)
+ } else {
+ // This operand doesn't appear in the assembly at all.
+ asmCount = 0
+ }
+ } else {
+ // Just a regular input operand.
+ inVals = append(inVals, inVal)
+ }
+ }
+ if action.w {
+ outVal := unify.NewValue(db.Build())
+ outVals = append(outVals, outVal)
+ }
+
+ asmPos += asmCount
+ }
+
+ instDB.Add("in", unify.NewValue(unify.NewTuple(inVals...)))
+ instDB.Add("inVariant", unify.NewValue(unify.NewTuple(inVar...)))
+ instDB.Add("out", unify.NewValue(unify.NewTuple(outVals...)))
+ memFeatures := checkMem(ops)
+ if memFeatures != "noMem" {
+ instDB.Add("memFeatures", unify.NewValue(unify.NewStringExact(memFeatures)))
+ }
+}
+
+// checkMem examines the memory operands of the operation and returns a string
+// describing their shape: "noMem", "unknown", "tooManyMem", or "vbcst".
+// Keep this function in sync with [decodeOperand].
+func checkMem(ops []operand) string {
+ memState := "noMem"
+ var mem *operandMem
+ memCnt := 0
+ for _, op := range ops {
+ if m, ok := op.(operandMem); ok {
+ mem = &m
+ memCnt++
+ }
+ }
+ if mem != nil {
+ if mem.unknown {
+ memState = "unknown"
+ } else if memCnt > 1 {
+ memState = "tooManyMem"
+ } else {
+			// We only have the vbcst case as of now.
+			// This shape indicates that the [bits] field has two possible values:
+			// 1. The element broadcast width, which is its peer vreg operand's [elemBits] (the default in the parsed XED data).
+			// 2. The full vector width, which is its peer vreg operand's [bits] (godefs should be aware of this).
+ memState = "vbcst"
+ }
+ }
+ return memState
+}
+
+func instToUVal(inst *xeddata.Inst, ops []operand, addFields map[string]string) []*unify.Value {
+ feature, ok := decodeCPUFeature(inst)
+ if !ok {
+ return nil
+ }
+
+ var vals []*unify.Value
+ vals = append(vals, instToUVal1(inst, ops, feature, instVariantNone, addFields))
+ if hasOptionalMask(ops) {
+ vals = append(vals, instToUVal1(inst, ops, feature, instVariantMasked, addFields))
+ }
+ return vals
+}
+
+func instToUVal1(inst *xeddata.Inst, ops []operand, feature string, variant instVariant, addFields map[string]string) *unify.Value {
+ var db unify.DefBuilder
+ db.Add("goarch", unify.NewValue(unify.NewStringExact("amd64")))
+ db.Add("asm", unify.NewValue(unify.NewStringExact(inst.Opcode())))
+ addOperandsToDef(ops, &db, variant)
+ db.Add("cpuFeature", unify.NewValue(unify.NewStringExact(feature)))
+ for k, v := range addFields {
+ db.Add(k, unify.NewValue(unify.NewStringExact(v)))
+ }
+
+ if strings.Contains(inst.Pattern, "ZEROING=0") {
+ // This is an EVEX instruction, but the ".Z" (zero-merging)
+ // instruction flag is NOT valid. EVEX.z must be zero.
+ //
+ // This can mean a few things:
+ //
+ // - The output of an instruction is a mask, so merging modes don't
+ // make any sense. E.g., VCMPPS.
+ //
+ // - There are no masks involved anywhere. (Maybe MASK=0 is also set
+ // in this case?) E.g., VINSERTPS.
+ //
+ // - The operation inherently performs merging. E.g., VCOMPRESSPS
+ // with a mem operand.
+ //
+ // There may be other reasons.
+ db.Add("zeroing", unify.NewValue(unify.NewStringExact("false")))
+ }
+ pos := unify.Pos{Path: inst.Pos.Path, Line: inst.Pos.Line}
+ return unify.NewValuePos(db.Build(), pos)
+}
+
+// decodeCPUFeature returns the CPU feature name required by inst. These match
+// the names of the "Has*" feature checks in the simd package.
+func decodeCPUFeature(inst *xeddata.Inst) (string, bool) {
+ key := cpuFeatureKey{
+ Extension: inst.Extension,
+ ISASet: isaSetStrip.ReplaceAllLiteralString(inst.ISASet, ""),
+ }
+ feat, ok := cpuFeatureMap[key]
+ if !ok {
+ imap := unknownFeatures[key]
+ if imap == nil {
+ imap = make(map[string]struct{})
+ unknownFeatures[key] = imap
+ }
+ imap[inst.Opcode()] = struct{}{}
+ return "", false
+ }
+ if feat == "ignore" {
+ return "", false
+ }
+ return feat, true
+}
+
+var isaSetStrip = regexp.MustCompile("_(128N?|256N?|512)$")
+
+type cpuFeatureKey struct {
+ Extension, ISASet string
+}
+
+// cpuFeatureMap maps from XED's "EXTENSION" and "ISA_SET" to a CPU feature name
+// that can be used in the SIMD API.
+var cpuFeatureMap = map[cpuFeatureKey]string{
+ {"SHA", "SHA"}: "SHA",
+
+ {"AVX", ""}: "AVX",
+ {"AVX_VNNI", "AVX_VNNI"}: "AVXVNNI",
+ {"AVX2", ""}: "AVX2",
+ {"AVXAES", ""}: "AVX, AES",
+
+ // AVX-512 foundational features. We combine all of these into one "AVX512" feature.
+ {"AVX512EVEX", "AVX512F"}: "AVX512",
+ {"AVX512EVEX", "AVX512CD"}: "AVX512",
+ {"AVX512EVEX", "AVX512BW"}: "AVX512",
+ {"AVX512EVEX", "AVX512DQ"}: "AVX512",
+ // AVX512VL doesn't appear explicitly in the ISASet. I guess it's implied by
+ // the vector length suffix.
+
+ // AVX-512 extension features
+ {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
+ {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
+ {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
+ {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
+ {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
+ {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
+ {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
+ {"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
+
+ // AVX 10.2 (not yet supported)
+ {"AVX512EVEX", "AVX10_2_RC"}: "ignore",
+}
+
+var unknownFeatures = map[cpuFeatureKey]map[string]struct{}{}
+
+// hasOptionalMask returns whether there is an optional mask operand in ops.
+func hasOptionalMask(ops []operand) bool {
+ for _, op := range ops {
+ if op, ok := op.(operandMask); ok && op.optional {
+ return true
+ }
+ }
+ return false
+}
+
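+// singular returns the value shared by every element of xs and true; if xs is
+// empty or its elements differ, it returns the zero value and false. For
+// example, singular([]int{3, 3}) returns (3, true) and singular([]int{3, 4})
+// returns (0, false).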
+func singular[T comparable](xs []T) (T, bool) {
+ if len(xs) == 0 {
+ return *new(T), false
+ }
+ for _, x := range xs[1:] {
+ if x != xs[0] {
+ return *new(T), false
+ }
+ }
+ return xs[0], true
+}
+
+type fixedReg struct {
+ class int
+ name string
+ width int
+}
+
+var fixedRegMap = map[string]fixedReg{
+ "XED_REG_XMM0": {VREG_CLASS, "x0", 128},
+}
+
+// decodeReg returns the register class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,
+// GREG_CLASS_FIXED), the width in bits, and the register name (if fixed).
+// If the operand cannot be decoded as a register, the class is NOT_REG_CLASS.
+func decodeReg(op *xeddata.Operand) (class, width int, name string) {
+ // op.Width tells us the total width, e.g.,:
+ //
+ // dq => 128 bits (XMM)
+ // qq => 256 bits (YMM)
+ // mskw => K
+ // z[iuf?](8|16|32|...) => 512 bits (ZMM)
+ //
+ // But the encoding is really weird and it's not clear if these *always*
+ // mean XMM/YMM/ZMM or if other irregular things can use these large widths.
+ // Hence, we dig into the register sets themselves.
+
+ if !strings.HasPrefix(op.NameLHS(), "REG") {
+ return NOT_REG_CLASS, 0, ""
+ }
+ // TODO: We shouldn't be relying on the macro naming conventions. We should
+ // use all-dec-patterns.txt, but xeddata doesn't support that table right now.
+ rhs := op.NameRHS()
+ if !strings.HasSuffix(rhs, "()") {
+ if fixedReg, ok := fixedRegMap[rhs]; ok {
+ return fixedReg.class, fixedReg.width, fixedReg.name
+ }
+ return NOT_REG_CLASS, 0, ""
+ }
+ switch {
+ case strings.HasPrefix(rhs, "XMM_"):
+ return VREG_CLASS, 128, ""
+ case strings.HasPrefix(rhs, "YMM_"):
+ return VREG_CLASS, 256, ""
+ case strings.HasPrefix(rhs, "ZMM_"):
+ return VREG_CLASS, 512, ""
+ case strings.HasPrefix(rhs, "GPR64_"), strings.HasPrefix(rhs, "VGPR64_"):
+ return GREG_CLASS, 64, ""
+ case strings.HasPrefix(rhs, "GPR32_"), strings.HasPrefix(rhs, "VGPR32_"):
+ return GREG_CLASS, 32, ""
+ }
+ return NOT_REG_CLASS, 0, ""
+}
+
+var xtypeRe = regexp.MustCompile(`^([iuf])([0-9]+)$`)
+
+// scalarBaseType describes the base type of a scalar element. This is a Go
+// type, but without the bit width suffix (with the exception of
+// scalarBaseIntOrUint).
+type scalarBaseType int
+
+const (
+ scalarBaseInt scalarBaseType = iota
+ scalarBaseUint
+ scalarBaseIntOrUint // Signed or unsigned is unspecified
+ scalarBaseFloat
+ scalarBaseComplex
+ scalarBaseBFloat
+ scalarBaseHFloat
+)
+
+func (s scalarBaseType) regex() string {
+ switch s {
+ case scalarBaseInt:
+ return "int"
+ case scalarBaseUint:
+ return "uint"
+ case scalarBaseIntOrUint:
+ return "int|uint"
+ case scalarBaseFloat:
+ return "float"
+ case scalarBaseComplex:
+ return "complex"
+ case scalarBaseBFloat:
+ return "BFloat"
+ case scalarBaseHFloat:
+ return "HFloat"
+ }
+ panic(fmt.Sprintf("unknown scalar base type %d", s))
+}
+
+func decodeType(op *xeddata.Operand) (base scalarBaseType, bits int, ok bool) {
+ // The xtype tells you the element type. i8, i16, i32, i64, f32, etc.
+ //
+ // TODO: Things like AVX2 VPAND have an xtype of u256 because they're
+ // element-width agnostic. Do I map that to all widths, or just omit the
+ // element width and let unification flesh it out? There's no u512
+ // (presumably those are all masked, so elem width matters). These are all
+ // Category: LOGICAL, so maybe we could use that info?
+
+ // Handle some weird ones.
+ switch op.Xtype {
+ // 8-bit float formats as defined by Open Compute Project "OCP 8-bit
+ // Floating Point Specification (OFP8)".
+ case "bf8": // E5M2 float
+ return scalarBaseBFloat, 8, true
+ case "hf8": // E4M3 float
+ return scalarBaseHFloat, 8, true
+ case "bf16": // bfloat16 float
+ return scalarBaseBFloat, 16, true
+ case "2f16":
+ // Complex consisting of 2 float16s. Doesn't exist in Go, but we can say
+ // what it would be.
+ return scalarBaseComplex, 32, true
+ case "2i8", "2I8":
+ // These just use the lower INT8 in each 16 bit field.
+ // As far as I can tell, "2I8" is a typo.
+ return scalarBaseInt, 8, true
+ case "2u16", "2U16":
+		// Some VPDP* instructions have it.
+		// TODO: does "z" mean it has zeroing?
+ return scalarBaseUint, 16, true
+ case "2i16", "2I16":
+		// Some VPDP* instructions have it.
+ return scalarBaseInt, 16, true
+ case "4u8", "4U8":
+		// Some VPDP* instructions have it.
+ return scalarBaseUint, 8, true
+ case "4i8", "4I8":
+		// Some VPDP* instructions have it.
+ return scalarBaseInt, 8, true
+ }
+
+ // The rest follow a simple pattern.
+ m := xtypeRe.FindStringSubmatch(op.Xtype)
+ if m == nil {
+ // TODO: Report unrecognized xtype
+ return 0, 0, false
+ }
+ bits, _ = strconv.Atoi(m[2])
+ switch m[1] {
+ case "i", "u":
+ // XED is rather inconsistent about what's signed, unsigned, or doesn't
+ // matter, so merge them together and let the Go definitions narrow as
+ // appropriate. Maybe there's a better way to do this.
+ return scalarBaseIntOrUint, bits, true
+ case "f":
+ return scalarBaseFloat, bits, true
+ default:
+ panic("unreachable")
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// This program generates type-instantiated boilerplate code for
+// slice operations and tests.
+
+import (
+ "bufio"
+ "bytes"
+ "flag"
+ "fmt"
+ "go/format"
+ "io"
+ "os"
+ "strings"
+ "text/template"
+)
+
+type resultTypeFunc func(t string, w, c int) (ot string, ow int, oc int)
+
+// shapes describes a combination of vector widths and various element types
+type shapes struct {
+ vecs []int // Vector bit width for this shape.
+ ints []int // Int element bit width(s) for this shape
+ uints []int // Unsigned int element bit width(s) for this shape
+ floats []int // Float element bit width(s) for this shape
+ output resultTypeFunc
+}
+
+// shapeAndTemplate is a template and the set of shapes on which it will be expanded
+type shapeAndTemplate struct {
+ s *shapes
+ t *template.Template
+}
+
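+// target returns a copy of sat whose output element type and width are fixed
+// to outType and width; the element count is carried over from the input
+// shape.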
+func (sat shapeAndTemplate) target(outType string, width int) shapeAndTemplate {
+ newSat := sat
+ newShape := *sat.s
+ newShape.output = func(t string, w, c int) (ot string, ow int, oc int) {
+ return outType, width, c
+ }
+ newSat.s = &newShape
+ return newSat
+}
+
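+// shrinkTo returns a copy of sat whose output element width is divided by
+// "by" and whose element count is multiplied by "by", keeping the total
+// vector width; for example, shrinking a 32x4 shape by 2 yields a 16x8
+// output of type outType.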
+func (sat shapeAndTemplate) shrinkTo(outType string, by int) shapeAndTemplate {
+ newSat := sat
+ newShape := *sat.s
+ newShape.output = func(t string, w, c int) (ot string, ow int, oc int) {
+ return outType, w / by, c * by
+ }
+ newSat.s = &newShape
+ return newSat
+}
+
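+// forAllShapes calls f for every (element type, width, count) combination
+// described by s, passing a running sequence number and writing to out.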
+func (s *shapes) forAllShapes(f func(seq int, t, upperT string, w, c int, out io.Writer), out io.Writer) {
+ vecs := s.vecs
+ ints := s.ints
+ uints := s.uints
+ floats := s.floats
+ seq := 0
+ for _, v := range vecs {
+ for _, w := range ints {
+ c := v / w
+ f(seq, "int", "Int", w, c, out)
+ seq++
+ }
+ for _, w := range uints {
+ c := v / w
+ f(seq, "uint", "Uint", w, c, out)
+ seq++
+ }
+ for _, w := range floats {
+ c := v / w
+ f(seq, "float", "Float", w, c, out)
+ seq++
+ }
+ }
+}
+
+var allShapes = &shapes{
+ vecs: []int{128, 256, 512},
+ ints: []int{8, 16, 32, 64},
+ uints: []int{8, 16, 32, 64},
+ floats: []int{32, 64},
+}
+
+var intShapes = &shapes{
+ vecs: []int{128, 256, 512},
+ ints: []int{8, 16, 32, 64},
+}
+
+var uintShapes = &shapes{
+ vecs: []int{128, 256, 512},
+ uints: []int{8, 16, 32, 64},
+}
+
+var avx512Shapes = &shapes{
+ vecs: []int{512},
+ ints: []int{8, 16, 32, 64},
+ uints: []int{8, 16, 32, 64},
+ floats: []int{32, 64},
+}
+
+var avx2Shapes = &shapes{
+ vecs: []int{128, 256},
+ ints: []int{8, 16, 32, 64},
+ uints: []int{8, 16, 32, 64},
+ floats: []int{32, 64},
+}
+
+var avx2MaskedLoadShapes = &shapes{
+ vecs: []int{128, 256},
+ ints: []int{32, 64},
+ uints: []int{32, 64},
+ floats: []int{32, 64},
+}
+
+var avx2SmallLoadPunShapes = &shapes{
+ // ints are done by hand, these are type-punned to int.
+ vecs: []int{128, 256},
+ uints: []int{8, 16},
+}
+
+var unaryFlaky = &shapes{ // for tests that support flaky equality
+ vecs: []int{128, 256, 512},
+ floats: []int{32, 64},
+}
+
+var ternaryFlaky = &shapes{ // for tests that support flaky equality
+ vecs: []int{128, 256, 512},
+ floats: []int{32},
+}
+
+var avx2SignedComparisons = &shapes{
+ vecs: []int{128, 256},
+ ints: []int{8, 16, 32, 64},
+}
+
+var avx2UnsignedComparisons = &shapes{
+ vecs: []int{128, 256},
+ uints: []int{8, 16, 32, 64},
+}
+
+type templateData struct {
+ VType string // the type of the vector, e.g. Float32x4
+ AOrAn string // for documentation, the article "a" or "an"
+ EWidth int // the bit width of the element type, e.g. 32
+ Vwidth int // the width of the vector type, e.g. 128
+ Count int // the number of elements, e.g. 4
+ WxC string // the width-by-type string, e.g., "32x4"
+ BxC string // as if bytes, in the proper count, e.g., "8x16" (W==8)
+ Base string // the title-case Base Type of the vector, e.g., "Float"
+ Etype string // the element type, e.g. "float32"
+ OxFF string // a mask for the lowest 'count' bits
+
+ OVType string // type of output vector
+ OEtype string // output element type
+ OEType string // output element type, title-case
+ OCount int // output element count
+}
+
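+// As128BitVec returns the name of the 128-bit vector type with the same base
+// and element width, e.g. Float32x4 for a Float element of width 32.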
+func (t templateData) As128BitVec() string {
+ return fmt.Sprintf("%s%dx%d", t.Base, t.EWidth, 128/t.EWidth)
+}
+
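+// oneTemplate executes t for a single shape (baseType, width, count),
+// skipping shapes whose total width falls outside 128-512 bits. If rtf is
+// non-nil it derives the output shape, and unsupported output shapes are
+// skipped.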
+func oneTemplate(t *template.Template, baseType string, width, count int, out io.Writer, rtf resultTypeFunc) {
+ b := width * count
+ if b < 128 || b > 512 {
+ return
+ }
+
+ ot, ow, oc := baseType, width, count
+ if rtf != nil {
+ ot, ow, oc = rtf(ot, ow, oc)
+ if ow*oc > 512 || ow*oc < 128 || ow < 8 || ow > 64 {
+ return
+ }
+ // TODO someday we will support conversions to 16-bit floats
+ if ot == "float" && ow < 32 {
+ return
+ }
+ }
+ ovType := fmt.Sprintf("%s%dx%d", strings.ToUpper(ot[:1])+ot[1:], ow, oc)
+ oeType := fmt.Sprintf("%s%d", ot, ow)
+ oEType := fmt.Sprintf("%s%d", strings.ToUpper(ot[:1])+ot[1:], ow)
+
+ wxc := fmt.Sprintf("%dx%d", width, count)
+ BaseType := strings.ToUpper(baseType[:1]) + baseType[1:]
+ vType := fmt.Sprintf("%s%s", BaseType, wxc)
+ eType := fmt.Sprintf("%s%d", baseType, width)
+
+ bxc := fmt.Sprintf("%dx%d", 8, count*(width/8))
+ aOrAn := "a"
+ if strings.Contains("aeiou", baseType[:1]) {
+ aOrAn = "an"
+ }
+ oxFF := fmt.Sprintf("0x%x", uint64((1<<count)-1))
+ t.Execute(out, templateData{
+ VType: vType,
+ AOrAn: aOrAn,
+ EWidth: width,
+ Vwidth: b,
+ Count: count,
+ WxC: wxc,
+ BxC: bxc,
+ Base: BaseType,
+ Etype: eType,
+ OxFF: oxFF,
+ OVType: ovType,
+ OEtype: oeType,
+ OCount: oc,
+ OEType: oEType,
+ })
+}
+
+// forTemplates expands the template sat.t for each shape
+// in sat.s, writing to out.
+func (sat shapeAndTemplate) forTemplates(out io.Writer) {
+ t, s := sat.t, sat.s
+ vecs := s.vecs
+ ints := s.ints
+ uints := s.uints
+ floats := s.floats
+ for _, v := range vecs {
+ for _, w := range ints {
+ c := v / w
+ oneTemplate(t, "int", w, c, out, sat.s.output)
+ }
+ for _, w := range uints {
+ c := v / w
+ oneTemplate(t, "uint", w, c, out, sat.s.output)
+ }
+ for _, w := range floats {
+ c := v / w
+ oneTemplate(t, "float", w, c, out, sat.s.output)
+ }
+ }
+}
+
+func prologue(s string, out io.Writer) {
+ fmt.Fprintf(out,
+ `// Code generated by '%s'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+`, s)
+}
+
+func ssaPrologue(s string, out io.Writer) {
+ fmt.Fprintf(out,
+ `// Code generated by '%s'; DO NOT EDIT.
+
+package ssa
+
+`, s)
+}
+
+func unsafePrologue(s string, out io.Writer) {
+ fmt.Fprintf(out,
+ `// Code generated by '%s'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "unsafe"
+
+`, s)
+}
+
+func testPrologue(t, s string, out io.Writer) {
+ fmt.Fprintf(out,
+ `// Code generated by '%s'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing %s.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+`, s, t)
+}
+
+func curryTestPrologue(t string) func(s string, out io.Writer) {
+ return func(s string, out io.Writer) {
+ testPrologue(t, s, out)
+ }
+}
+
+func templateOf(name, temp string) shapeAndTemplate {
+ return shapeAndTemplate{s: allShapes,
+ t: template.Must(template.New(name).Parse(temp))}
+}
+
+func shapedTemplateOf(s *shapes, name, temp string) shapeAndTemplate {
+ return shapeAndTemplate{s: s,
+ t: template.Must(template.New(name).Parse(temp))}
+}
+
+var sliceTemplate = templateOf("slice", `
+// Load{{.VType}}Slice loads {{.AOrAn}} {{.VType}} from a slice of at least {{.Count}} {{.Etype}}s
+func Load{{.VType}}Slice(s []{{.Etype}}) {{.VType}} {
+ return Load{{.VType}}((*[{{.Count}}]{{.Etype}})(s))
+}
+
+// StoreSlice stores x into a slice of at least {{.Count}} {{.Etype}}s
+func (x {{.VType}}) StoreSlice(s []{{.Etype}}) {
+ x.Store((*[{{.Count}}]{{.Etype}})(s))
+}
+`)
+
+var unaryTemplate = templateOf("unary_helpers", `
+// test{{.VType}}Unary tests the simd unary method f against the expected behavior generated by want
+func test{{.VType}}Unary(t *testing.T, f func(_ archsimd.{{.VType}}) archsimd.{{.VType}}, want func(_ []{{.Etype}}) []{{.Etype}}) {
+ n := {{.Count}}
+ t.Helper()
+ forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ g := make([]{{.Etype}}, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)})
+ })
+}
+`)
+
+var unaryFlakyTemplate = shapedTemplateOf(unaryFlaky, "unary_flaky_helpers", `
+// test{{.VType}}UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
+func test{{.VType}}UnaryFlaky(t *testing.T, f func(x archsimd.{{.VType}}) archsimd.{{.VType}}, want func(x []{{.Etype}}) []{{.Etype}}, flakiness float64) {
+ n := {{.Count}}
+ t.Helper()
+ forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ g := make([]{{.Etype}}, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() {t.Helper(); t.Logf("x=%v", x)})
+ })
+}
+`)
+
+var convertTemplate = templateOf("convert_helpers", `
+// test{{.VType}}ConvertTo{{.OEType}} tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func test{{.VType}}ConvertTo{{.OEType}}(t *testing.T, f func(x archsimd.{{.VType}}) archsimd.{{.OVType}}, want func(x []{{.Etype}}) []{{.OEtype}}) {
+ n := {{.Count}}
+ t.Helper()
+ forSlice(t, {{.Etype}}s, n, func(x []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ g := make([]{{.OEtype}}, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x)})
+ })
+}
+`)
+
+var unaryToInt32 = convertTemplate.target("int", 32)
+var unaryToUint32 = convertTemplate.target("uint", 32)
+var unaryToUint16 = convertTemplate.target("uint", 16)
+
+var binaryTemplate = templateOf("binary_helpers", `
+// test{{.VType}}Binary tests the simd binary method f against the expected behavior generated by want
+func test{{.VType}}Binary(t *testing.T, f func(_, _ archsimd.{{.VType}}) archsimd.{{.VType}}, want func(_, _ []{{.Etype}}) []{{.Etype}}) {
+ n := {{.Count}}
+ t.Helper()
+ forSlicePair(t, {{.Etype}}s, n, func(x, y []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ b := archsimd.Load{{.VType}}Slice(y)
+ g := make([]{{.Etype}}, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); })
+ })
+}
+`)
+
+var ternaryTemplate = templateOf("ternary_helpers", `
+// test{{.VType}}Ternary tests the simd ternary method f against the expected behavior generated by want
+func test{{.VType}}Ternary(t *testing.T, f func(_, _, _ archsimd.{{.VType}}) archsimd.{{.VType}}, want func(_, _, _ []{{.Etype}}) []{{.Etype}}) {
+ n := {{.Count}}
+ t.Helper()
+ forSliceTriple(t, {{.Etype}}s, n, func(x, y, z []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ b := archsimd.Load{{.VType}}Slice(y)
+ c := archsimd.Load{{.VType}}Slice(z)
+ g := make([]{{.Etype}}, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z); })
+ })
+}
+`)
+
+var ternaryFlakyTemplate = shapedTemplateOf(ternaryFlaky, "ternary_helpers", `
+// test{{.VType}}TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
+// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
+func test{{.VType}}TernaryFlaky(t *testing.T, f func(x, y, z archsimd.{{.VType}}) archsimd.{{.VType}}, want func(x, y, z []{{.Etype}}) []{{.Etype}}, flakiness float64) {
+ n := {{.Count}}
+ t.Helper()
+ forSliceTriple(t, {{.Etype}}s, n, func(x, y, z []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ b := archsimd.Load{{.VType}}Slice(y)
+ c := archsimd.Load{{.VType}}Slice(z)
+ g := make([]{{.Etype}}, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, flakiness, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z); })
+ })
+}
+`)
+
+var compareTemplate = templateOf("compare_helpers", `
+// test{{.VType}}Compare tests the simd comparison method f against the expected behavior generated by want
+func test{{.VType}}Compare(t *testing.T, f func(_, _ archsimd.{{.VType}}) archsimd.Mask{{.WxC}}, want func(_, _ []{{.Etype}}) []int64) {
+ n := {{.Count}}
+ t.Helper()
+ forSlicePair(t, {{.Etype}}s, n, func(x, y []{{.Etype}}) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ b := archsimd.Load{{.VType}}Slice(y)
+ g := make([]int{{.EWidth}}, n)
+ f(a, b).AsInt{{.WxC}}().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); })
+ })
+}
+`)
+
+// TODO this has not been tested yet.
+var compareMaskedTemplate = templateOf("comparemasked_helpers", `
+// test{{.VType}}CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func test{{.VType}}CompareMasked(t *testing.T,
+ f func(_, _ archsimd.{{.VType}}, m archsimd.Mask{{.WxC}}) archsimd.Mask{{.WxC}},
+ want func(_, _ []{{.Etype}}) []int64) {
+ n := {{.Count}}
+ t.Helper()
+ forSlicePairMasked(t, {{.Etype}}s, n, func(x, y []{{.Etype}}, m []bool) bool {
+ t.Helper()
+ a := archsimd.Load{{.VType}}Slice(x)
+ b := archsimd.Load{{.VType}}Slice(y)
+ k := archsimd.LoadInt{{.WxC}}Slice(toVect[int{{.EWidth}}](m)).ToMask()
+ g := make([]int{{.EWidth}}, n)
+ f(a, b, k).AsInt{{.WxC}}().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() {t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m); })
+ })
+}
+`)
+
+var avx512MaskedLoadSlicePartTemplate = shapedTemplateOf(avx512Shapes, "avx 512 load slice part", `
+// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
+// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
+// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
+func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
+ l := len(s)
+ if l >= {{.Count}} {
+ return Load{{.VType}}Slice(s)
+ }
+ if l == 0 {
+ var x {{.VType}}
+ return x
+ }
+ mask := Mask{{.WxC}}FromBits({{.OxFF}} >> ({{.Count}} - l))
+ return LoadMasked{{.VType}}(pa{{.VType}}(s), mask)
+}
+
+// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
+func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
+ l := len(s)
+ if l >= {{.Count}} {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask{{.WxC}}FromBits({{.OxFF}} >> ({{.Count}} - l))
+ x.StoreMasked(pa{{.VType}}(s), mask)
+}
+`)
+
+var avx2MaskedLoadSlicePartTemplate = shapedTemplateOf(avx2MaskedLoadShapes, "avx 2 load slice part", `
+// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
+// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
+// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
+func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
+ l := len(s)
+ if l >= {{.Count}} {
+ return Load{{.VType}}Slice(s)
+ }
+ if l == 0 {
+ var x {{.VType}}
+ return x
+ }
+ mask := vecMask{{.EWidth}}[len(vecMask{{.EWidth}})/2-l:]
+ return LoadMasked{{.VType}}(pa{{.VType}}(s), LoadInt{{.WxC}}Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
+func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
+ l := len(s)
+ if l >= {{.Count}} {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask{{.EWidth}}[len(vecMask{{.EWidth}})/2-l:]
+ x.StoreMasked(pa{{.VType}}(s), LoadInt{{.WxC}}Slice(mask).asMask())
+}
+`)
+
+var avx2SmallLoadSlicePartTemplate = shapedTemplateOf(avx2SmallLoadPunShapes, "avx 2 small load slice part", `
+// Load{{.VType}}SlicePart loads a {{.VType}} from the slice s.
+// If s has fewer than {{.Count}} elements, the remaining elements of the vector are filled with zeroes.
+// If s has {{.Count}} or more elements, the function is equivalent to Load{{.VType}}Slice.
+func Load{{.VType}}SlicePart(s []{{.Etype}}) {{.VType}} {
+ if len(s) == 0 {
+ var zero {{.VType}}
+ return zero
+ }
+ t := unsafe.Slice((*int{{.EWidth}})(unsafe.Pointer(&s[0])), len(s))
+ return LoadInt{{.WxC}}SlicePart(t).As{{.VType}}()
+}
+
+// StoreSlicePart stores the {{.Count}} elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has {{.Count}} or more elements, the method is equivalent to x.StoreSlice.
+func (x {{.VType}}) StoreSlicePart(s []{{.Etype}}) {
+ if len(s) == 0 {
+ return
+ }
+ t := unsafe.Slice((*int{{.EWidth}})(unsafe.Pointer(&s[0])), len(s))
+ x.AsInt{{.WxC}}().StoreSlicePart(t)
+}
+`)
+
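+// CPUfeature returns the CPU feature name implied by the vector width:
+// AVX for 128 bits, AVX2 for 256 bits, AVX512 for 512 bits.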
+func (t templateData) CPUfeature() string {
+ switch t.Vwidth {
+ case 128:
+ return "AVX"
+ case 256:
+ return "AVX2"
+ case 512:
+ return "AVX512"
+ }
+ panic(fmt.Errorf("unexpected vector width %d", t.Vwidth))
+}
+
+var avx2SignedComparisonsTemplate = shapedTemplateOf(avx2SignedComparisons, "avx2 signed comparisons", `
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} {
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ return y.Greater(x).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} {
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ return x.Greater(y).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} {
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ return x.Equal(y).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+`)
+
+var bitWiseIntTemplate = shapedTemplateOf(intShapes, "bitwise int complement", `
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) Not() {{.VType}} {
+ return x.Xor(x.Equal(x).As{{.VType}}())
+}
+`)
+
+var bitWiseUintTemplate = shapedTemplateOf(uintShapes, "bitwise uint complement", `
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) Not() {{.VType}} {
+ return x.Xor(x.Equal(x).AsInt{{.WxC}}().As{{.VType}}())
+}
+`)
+
+// CPUfeatureAVX2if8 returns AVX2 if the element width is 8;
+// otherwise, it returns CPUfeature. This is the CPU feature
+// for the unsigned comparison emulation, which uses shifts for all
+// sizes > 8 (shifts are AVX) but must use broadcast (AVX2)
+// for bytes.
+func (t templateData) CPUfeatureAVX2if8() string {
+ if t.EWidth == 8 {
+ return "AVX2"
+ }
+ return t.CPUfeature()
+}
+
+var avx2UnsignedComparisonsTemplate = shapedTemplateOf(avx2UnsignedComparisons, "avx2 unsigned comparisons", `
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
+func (x {{.VType}}) Greater(y {{.VType}}) Mask{{.WxC}} {
+ a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
+{{- if eq .EWidth 8}}
+ signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
+{{- else}}
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ signs := ones.ShiftAllLeft({{.EWidth}}-1)
+{{- end }}
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
+func (x {{.VType}}) Less(y {{.VType}}) Mask{{.WxC}} {
+ a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
+{{- if eq .EWidth 8}}
+ signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
+{{- else}}
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ signs := ones.ShiftAllLeft({{.EWidth}}-1)
+{{- end }}
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
+func (x {{.VType}}) GreaterEqual(y {{.VType}}) Mask{{.WxC}} {
+ a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
+ ones := x.Equal(x).AsInt{{.WxC}}()
+{{- if eq .EWidth 8}}
+ signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
+{{- else}}
+ signs := ones.ShiftAllLeft({{.EWidth}}-1)
+{{- end }}
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature {{.CPUfeatureAVX2if8}}
+func (x {{.VType}}) LessEqual(y {{.VType}}) Mask{{.WxC}} {
+ a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
+ ones := x.Equal(x).AsInt{{.WxC}}()
+{{- if eq .EWidth 8}}
+ signs := BroadcastInt{{.WxC}}(-1 << ({{.EWidth}}-1))
+{{- else}}
+ signs := ones.ShiftAllLeft({{.EWidth}}-1)
+{{- end }}
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature {{.CPUfeature}}
+func (x {{.VType}}) NotEqual(y {{.VType}}) Mask{{.WxC}} {
+ a, b := x.AsInt{{.WxC}}(), y.AsInt{{.WxC}}()
+ ones := x.Equal(x).AsInt{{.WxC}}()
+ return a.Equal(b).AsInt{{.WxC}}().Xor(ones).asMask()
+}
+`)
+
+var unsafePATemplate = templateOf("unsafe PA helper", `
+// pa{{.VType}} returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func pa{{.VType}}(s []{{.Etype}}) *[{{.Count}}]{{.Etype}} {
+ return (*[{{.Count}}]{{.Etype}})(unsafe.Pointer(&s[0]))
+}
+`)
+
+var avx2MaskedTemplate = shapedTemplateOf(avx2Shapes, "avx2 .Masked methods", `
+// Masked returns x but with elements zeroed where mask is false.
+func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
+ im := mask.AsInt{{.WxC}}()
+{{- if eq .Base "Int" }}
+ return im.And(x)
+{{- else}}
+ return x.AsInt{{.WxC}}().And(im).As{{.VType}}()
+{{- end -}}
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
+{{- if eq .BxC .WxC -}}
+ im := mask.AsInt{{.BxC}}()
+{{- else}}
+ im := mask.AsInt{{.WxC}}().AsInt{{.BxC}}()
+{{- end -}}
+{{- if and (eq .Base "Int") (eq .BxC .WxC) }}
+ return y.blend(x, im)
+{{- else}}
+ ix := x.AsInt{{.BxC}}()
+ iy := y.AsInt{{.BxC}}()
+ return iy.blend(ix, im).As{{.VType}}()
+{{- end -}}
+}
+`)
+
+// TODO perhaps write these in ways that work better on AVX512
+var avx512MaskedTemplate = shapedTemplateOf(avx512Shapes, "avx512 .Masked methods", `
+// Masked returns x but with elements zeroed where mask is false.
+func (x {{.VType}}) Masked(mask Mask{{.WxC}}) {{.VType}} {
+ im := mask.AsInt{{.WxC}}()
+{{- if eq .Base "Int" }}
+ return im.And(x)
+{{- else}}
+ return x.AsInt{{.WxC}}().And(im).As{{.VType}}()
+{{- end -}}
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x {{.VType}}) Merge(y {{.VType}}, mask Mask{{.WxC}}) {{.VType}} {
+{{- if eq .Base "Int" }}
+ return y.blendMasked(x, mask)
+{{- else}}
+ ix := x.AsInt{{.WxC}}()
+ iy := y.AsInt{{.WxC}}()
+ return iy.blendMasked(ix, mask).As{{.VType}}()
+{{- end -}}
+}
+`)
+
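+// CPUfeatureBC returns the CPU feature used for the generated broadcast
+// helpers: AVX2 for 128- and 256-bit vectors; for 512-bit vectors, AVX512BW
+// when the element width is 16 bits or less, otherwise AVX512F.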
+func (t templateData) CPUfeatureBC() string {
+ switch t.Vwidth {
+ case 128:
+ return "AVX2"
+ case 256:
+ return "AVX2"
+ case 512:
+ if t.EWidth <= 16 {
+ return "AVX512BW"
+ }
+ return "AVX512F"
+ }
+ panic(fmt.Errorf("unexpected vector width %d", t.Vwidth))
+}
+
+var broadcastTemplate = templateOf("Broadcast functions", `
+// Broadcast{{.VType}} returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature {{.CPUfeatureBC}}
+func Broadcast{{.VType}}(x {{.Etype}}) {{.VType}} {
+ var z {{.As128BitVec }}
+ return z.SetElem(0, x).Broadcast{{.Vwidth}}()
+}
+`)
+
+var maskCvtTemplate = templateOf("Mask conversions", `
+// ToMask converts from {{.Base}}{{.WxC}} to Mask{{.WxC}}; a mask element is set to true when the corresponding vector element is non-zero.
+func (from {{.Base}}{{.WxC}}) ToMask() (to Mask{{.WxC}}) {
+ return from.NotEqual({{.Base}}{{.WxC}}{})
+}
+`)
+
+var stringTemplate = shapedTemplateOf(allShapes, "String methods", `
+// String returns a string representation of SIMD vector x
+func (x {{.VType}}) String() string {
+ var s [{{.Count}}]{{.Etype}}
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+`)
+
+const SIMD = "../../"
+const TD = "../../internal/simd_test/"
+const SSA = "../../../../cmd/compile/internal/ssa/"
+
+func main() {
+ sl := flag.String("sl", SIMD+"slice_gen_amd64.go", "file name for slice operations")
+ cm := flag.String("cm", SIMD+"compare_gen_amd64.go", "file name for comparison operations")
+ mm := flag.String("mm", SIMD+"maskmerge_gen_amd64.go", "file name for mask/merge operations")
+ op := flag.String("op", SIMD+"other_gen_amd64.go", "file name for other operations")
+ ush := flag.String("ush", SIMD+"unsafe_helpers.go", "file name for unsafe helpers")
+ bh := flag.String("bh", TD+"binary_helpers_test.go", "file name for binary test helpers")
+ uh := flag.String("uh", TD+"unary_helpers_test.go", "file name for unary test helpers")
+ th := flag.String("th", TD+"ternary_helpers_test.go", "file name for ternary test helpers")
+ ch := flag.String("ch", TD+"compare_helpers_test.go", "file name for compare test helpers")
+ cmh := flag.String("cmh", TD+"comparemasked_helpers_test.go", "file name for compare-masked test helpers")
+ flag.Parse()
+
+ if *sl != "" {
+ one(*sl, unsafePrologue,
+ sliceTemplate,
+ avx512MaskedLoadSlicePartTemplate,
+ avx2MaskedLoadSlicePartTemplate,
+ avx2SmallLoadSlicePartTemplate,
+ )
+ }
+ if *cm != "" {
+ one(*cm, prologue,
+ avx2SignedComparisonsTemplate,
+ avx2UnsignedComparisonsTemplate,
+ )
+ }
+ if *mm != "" {
+ one(*mm, prologue,
+ avx2MaskedTemplate,
+ avx512MaskedTemplate,
+ )
+ }
+ if *op != "" {
+ one(*op, prologue,
+ broadcastTemplate,
+ maskCvtTemplate,
+ bitWiseIntTemplate,
+ bitWiseUintTemplate,
+ stringTemplate,
+ )
+ }
+ if *ush != "" {
+ one(*ush, unsafePrologue, unsafePATemplate)
+ }
+ if *uh != "" {
+ one(*uh, curryTestPrologue("unary simd methods"), unaryTemplate, unaryToInt32, unaryToUint32, unaryToUint16, unaryFlakyTemplate)
+ }
+ if *bh != "" {
+ one(*bh, curryTestPrologue("binary simd methods"), binaryTemplate)
+ }
+ if *th != "" {
+ one(*th, curryTestPrologue("ternary simd methods"), ternaryTemplate, ternaryFlakyTemplate)
+ }
+ if *ch != "" {
+ one(*ch, curryTestPrologue("simd methods that compare two operands"), compareTemplate)
+ }
+ if *cmh != "" {
+ one(*cmh, curryTestPrologue("simd methods that compare two operands under a mask"), compareMaskedTemplate)
+ }
+
+ nonTemplateRewrites(SSA+"tern_helpers.go", ssaPrologue, classifyBooleanSIMD, ternOpForLogical)
+
+}
+
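+// ternOpForLogical writes the generated ssa helper that maps each integer
+// And/Or/Xor/AndNot vector op to the tern op with the same total width,
+// widening element widths below 32 bits to 32 and adjusting the count.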
+func ternOpForLogical(out io.Writer) {
+ fmt.Fprintf(out, `
+func ternOpForLogical(op Op) Op {
+ switch op {
+`)
+
+ intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
+ wt, ct := w, c
+ if wt < 32 {
+ wt = 32
+ ct = (w * c) / wt
+ }
+ fmt.Fprintf(out, "case OpAndInt%[1]dx%[2]d, OpOrInt%[1]dx%[2]d, OpXorInt%[1]dx%[2]d,OpAndNotInt%[1]dx%[2]d: return OpternInt%dx%d\n", w, c, wt, ct)
+ fmt.Fprintf(out, "case OpAndUint%[1]dx%[2]d, OpOrUint%[1]dx%[2]d, OpXorUint%[1]dx%[2]d,OpAndNotUint%[1]dx%[2]d: return OpternUint%dx%d\n", w, c, wt, ct)
+ }, out)
+
+ fmt.Fprintf(out, `
+ }
+ return op
+}
+`)
+
+}
+
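+// classifyBooleanSIMD writes the generated ssa helper that classifies a
+// Value's op as one of the boolean SIMD operations (And, Or, AndNot, Xor,
+// Not); an Xor whose second argument is an Equal of a value with itself
+// (all ones) is classified as Not.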
+func classifyBooleanSIMD(out io.Writer) {
+ fmt.Fprintf(out, `
+type SIMDLogicalOP uint8
+const (
+	// Boolean simd operations, for reducing expressions to VPTERNLOG* instructions.
+	// sloInterior is set for non-root nodes in logical-op expression trees;
+	// the operations themselves are even-numbered.
+ sloInterior SIMDLogicalOP = 1
+ sloNone SIMDLogicalOP = 2 * iota
+ sloAnd
+ sloOr
+ sloAndNot
+ sloXor
+ sloNot
+)
+func classifyBooleanSIMD(v *Value) SIMDLogicalOP {
+ switch v.Op {
+ case `)
+ intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
+ op := "And"
+ if seq > 0 {
+ fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
+ } else {
+ fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
+ }
+ seq++
+ }, out)
+
+ fmt.Fprintf(out, `:
+ return sloAnd
+
+ case `)
+ intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
+ op := "Or"
+ if seq > 0 {
+ fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
+ } else {
+ fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
+ }
+ seq++
+ }, out)
+
+ fmt.Fprintf(out, `:
+ return sloOr
+
+ case `)
+ intShapes.forAllShapes(func(seq int, t, upperT string, w, c int, out io.Writer) {
+ op := "AndNot"
+ if seq > 0 {
+ fmt.Fprintf(out, ",Op%s%s%dx%d", op, upperT, w, c)
+ } else {
+ fmt.Fprintf(out, "Op%s%s%dx%d", op, upperT, w, c)
+ }
+ seq++
+ }, out)
+
+ fmt.Fprintf(out, `:
+ return sloAndNot
+`)
+
+ // "Not" is encoded as x.Xor(x.Equal(x).AsInt8x16())
+ // i.e. xor.Args[0] == x, xor.Args[1].Op == As...
+ // but AsInt8x16 is a pun/passthrough.
+
+ intShapes.forAllShapes(
+ func(seq int, t, upperT string, w, c int, out io.Writer) {
+ fmt.Fprintf(out, "case OpXor%s%dx%d: ", upperT, w, c)
+ fmt.Fprintf(out, `
+ if y := v.Args[1]; y.Op == OpEqual%s%dx%d &&
+ y.Args[0] == y.Args[1] {
+ return sloNot
+ }
+ `, upperT, w, c)
+ fmt.Fprintf(out, "return sloXor\n")
+ }, out)
+
+ fmt.Fprintf(out, `
+ }
+ return sloNone
+}
+`)
+}
+
+// numberLines takes a slice of bytes, and returns a string where each line
+// is numbered, starting from 1.
+func numberLines(data []byte) string {
+ var buf bytes.Buffer
+ r := bytes.NewReader(data)
+ s := bufio.NewScanner(r)
+ for i := 1; s.Scan(); i++ {
+ fmt.Fprintf(&buf, "%d: %s\n", i, s.Text())
+ }
+ return buf.String()
+}
+
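+// nonTemplateRewrites generates filename by writing the prologue followed by
+// the output of each rewrite function, then gofmt-ing the result. A filename
+// of "-" writes to standard output; an empty filename is a no-op.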
+func nonTemplateRewrites(filename string, prologue func(s string, out io.Writer), rewrites ...func(out io.Writer)) {
+ if filename == "" {
+ return
+ }
+
+ ofile := os.Stdout
+
+ if filename != "-" {
+ var err error
+ ofile, err = os.Create(filename)
+ if err != nil {
+			fmt.Fprintf(os.Stderr, "Could not create the output file %s for the generated code, %v\n", filename, err)
+ os.Exit(1)
+ }
+ }
+
+ out := new(bytes.Buffer)
+
+ prologue("go run genfiles.go", out)
+ for _, rewrite := range rewrites {
+ rewrite(out)
+ }
+
+ b, err := format.Source(out.Bytes())
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
+ fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
+ fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
+ os.Exit(1)
+ } else {
+ ofile.Write(b)
+ ofile.Close()
+ }
+
+}
+
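+// one generates filename by writing the prologue followed by the expansion of
+// each shaped template, then gofmt-ing the result. A filename of "-" writes
+// to standard output; an empty filename is a no-op.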
+func one(filename string, prologue func(s string, out io.Writer), sats ...shapeAndTemplate) {
+ if filename == "" {
+ return
+ }
+
+ ofile := os.Stdout
+
+ if filename != "-" {
+ var err error
+ ofile, err = os.Create(filename)
+ if err != nil {
+			fmt.Fprintf(os.Stderr, "Could not create the output file %s for the generated code, %v\n", filename, err)
+ os.Exit(1)
+ }
+ }
+
+ out := new(bytes.Buffer)
+
+ prologue("go run genfiles.go", out)
+ for _, sat := range sats {
+ sat.forTemplates(out)
+ }
+
+ b, err := format.Source(out.Bytes())
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
+ fmt.Fprintf(os.Stderr, "%s\n", numberLines(out.Bytes()))
+ fmt.Fprintf(os.Stderr, "There was a problem formatting the generated code for %s, %v\n", filename, err)
+ os.Exit(1)
+ } else {
+ ofile.Write(b)
+ ofile.Close()
+ }
+
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "iter"
+ "maps"
+ "slices"
+)
+
+type Closure struct {
+ val *Value
+ env envSet
+}
+
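+// NewSum returns a Closure representing the sum (union) of vs: a fresh
+// variable bound to each of the given values in the top environment.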
+func NewSum(vs ...*Value) Closure {
+ id := &ident{name: "sum"}
+ return Closure{NewValue(Var{id}), topEnv.bind(id, vs...)}
+}
+
+// IsBottom returns whether c consists of no values.
+func (c Closure) IsBottom() bool {
+ return c.val.Domain == nil
+}
+
+// Summands returns the top-level Values of c. This assumes the top-level of c
+// was constructed as a sum, and is mostly useful for debugging.
+func (c Closure) Summands() iter.Seq[*Value] {
+ return func(yield func(*Value) bool) {
+ var rec func(v *Value, env envSet) bool
+ rec = func(v *Value, env envSet) bool {
+ switch d := v.Domain.(type) {
+ case Var:
+ parts := env.partitionBy(d.id)
+ for _, part := range parts {
+ // It may be a sum of sums. Walk into this value.
+ if !rec(part.value, part.env) {
+ return false
+ }
+ }
+ return true
+ default:
+ return yield(v)
+ }
+ }
+ rec(c.val, c.env)
+ }
+}
+
+// All enumerates all possible concrete values of c by substituting variables
+// from the environment.
+//
+// E.g., enumerating this Value
+//
+// a: !sum [1, 2]
+// b: !sum [3, 4]
+//
+// results in
+//
+// - {a: 1, b: 3}
+// - {a: 1, b: 4}
+// - {a: 2, b: 3}
+// - {a: 2, b: 4}
+func (c Closure) All() iter.Seq[*Value] {
+ // In order to enumerate all concrete values under all possible variable
+ // bindings, we use a "non-deterministic continuation passing style" to
+ // implement this. We use CPS to traverse the Value tree, threading the
+ // (possibly narrowing) environment through that CPS following an Euler
+ // tour. Where the environment permits multiple choices, we invoke the same
+ // continuation for each choice. Similar to a yield function, the
+ // continuation can return false to stop the non-deterministic walk.
+ return func(yield func(*Value) bool) {
+ c.val.all1(c.env, func(v *Value, e envSet) bool {
+ return yield(v)
+ })
+ }
+}
+
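+// all1 is the CPS core of [Closure.All]: it invokes cont with each concrete
+// value of v reachable under some variable binding in e, threading the
+// (possibly narrowed) environment. cont may return false to stop the
+// enumeration; all1 reports whether the walk ran to completion.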
+func (v *Value) all1(e envSet, cont func(*Value, envSet) bool) bool {
+ switch d := v.Domain.(type) {
+ default:
+ panic(fmt.Sprintf("unknown domain type %T", d))
+
+ case nil:
+ return true
+
+ case Top, String:
+ return cont(v, e)
+
+ case Def:
+ fields := d.keys()
+ // We can reuse this parts slice because we're doing a DFS through the
+ // state space. (Otherwise, we'd have to do some messy threading of an
+ // immutable slice-like value through allElt.)
+ parts := make(map[string]*Value, len(fields))
+
+ // TODO: If there are no Vars or Sums under this Def, then nothing can
+ // change the Value or env, so we could just cont(v, e).
+ var allElt func(elt int, e envSet) bool
+ allElt = func(elt int, e envSet) bool {
+ if elt == len(fields) {
+ // Build a new Def from the concrete parts. Clone parts because
+ // we may reuse it on other non-deterministic branches.
+ nVal := newValueFrom(Def{maps.Clone(parts)}, v)
+ return cont(nVal, e)
+ }
+
+ return d.fields[fields[elt]].all1(e, func(v *Value, e envSet) bool {
+ parts[fields[elt]] = v
+ return allElt(elt+1, e)
+ })
+ }
+ return allElt(0, e)
+
+ case Tuple:
+ // Essentially the same as Def.
+ if d.repeat != nil {
+ // There's nothing we can do with this.
+ return cont(v, e)
+ }
+ parts := make([]*Value, len(d.vs))
+ var allElt func(elt int, e envSet) bool
+ allElt = func(elt int, e envSet) bool {
+ if elt == len(d.vs) {
+ // Build a new tuple from the concrete parts. Clone parts because
+ // we may reuse it on other non-deterministic branches.
+ nVal := newValueFrom(Tuple{vs: slices.Clone(parts)}, v)
+ return cont(nVal, e)
+ }
+
+ return d.vs[elt].all1(e, func(v *Value, e envSet) bool {
+ parts[elt] = v
+ return allElt(elt+1, e)
+ })
+ }
+ return allElt(0, e)
+
+ case Var:
+ // Go each way this variable can be bound.
+ for _, ePart := range e.partitionBy(d.id) {
+			// d.id is no longer bound in this environment partition. We may
+			// need it later in the Euler tour, so bind it back to this single
+			// value.
+ env := ePart.env.bind(d.id, ePart.value)
+ if !ePart.value.all1(env, cont) {
+ return false
+ }
+ }
+ return true
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "iter"
+ "maps"
+ "reflect"
+ "regexp"
+ "slices"
+ "strconv"
+ "strings"
+)
+
+// A Domain is a non-empty set of values, all of the same kind.
+//
+// Domain may be a scalar:
+//
+// - [String] - Represents string-typed values.
+//
+// Or a composite:
+//
+// - [Def] - A mapping from fixed keys to [Domain]s.
+//
+//   - [Tuple] - A fixed-length sequence of [Domain]s, or a single
+//     [Domain] repeated for all possible lengths.
+//
+// Or top or bottom:
+//
+// - [Top] - Represents all possible values of all kinds.
+//
+// - nil - Represents no values.
+//
+// Or a variable:
+//
+// - [Var] - A value captured in the environment.
+type Domain interface {
+ Exact() bool
+ WhyNotExact() string
+
+ // decode stores this value in a Go value. If this value is not exact, this
+ // returns a potentially wrapped *inexactError.
+ decode(reflect.Value) error
+}
+
+type inexactError struct {
+ valueType string
+ goType string
+}
+
+func (e *inexactError) Error() string {
+ return fmt.Sprintf("cannot store inexact %s value in %s", e.valueType, e.goType)
+}
+
+type decodeError struct {
+ path string
+ err error
+}
+
+func newDecodeError(path string, err error) *decodeError {
+ if err, ok := err.(*decodeError); ok {
+ return &decodeError{path: path + "." + err.path, err: err.err}
+ }
+ return &decodeError{path: path, err: err}
+}
+
+func (e *decodeError) Unwrap() error {
+ return e.err
+}
+
+func (e *decodeError) Error() string {
+ return fmt.Sprintf("%s: %s", e.path, e.err)
+}
+
+// Top represents all possible values of all possible types.
+type Top struct{}
+
+func (t Top) Exact() bool { return false }
+func (t Top) WhyNotExact() string { return "is top" }
+
+func (t Top) decode(rv reflect.Value) error {
+ // We can decode Top into a pointer-typed value as nil.
+ if rv.Kind() != reflect.Pointer {
+ return &inexactError{"top", rv.Type().String()}
+ }
+ rv.SetZero()
+ return nil
+}
+
+// A Def is a mapping from field names to [Value]s. Any fields not explicitly
+// listed have [Value] [Top].
+type Def struct {
+ fields map[string]*Value
+}
+
+// A DefBuilder builds a [Def] one field at a time. The zero value of
+// DefBuilder is ready to use and builds an empty [Def].
+type DefBuilder struct {
+ fields map[string]*Value
+}
+
+func (b *DefBuilder) Add(name string, v *Value) {
+ if b.fields == nil {
+ b.fields = make(map[string]*Value)
+ }
+ if old, ok := b.fields[name]; ok {
+ panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old))
+ }
+ b.fields[name] = v
+}
+
+// Build constructs a [Def] from the fields added to this builder.
+func (b *DefBuilder) Build() Def {
+ return Def{maps.Clone(b.fields)}
+}
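+
+// Illustrative sketch of DefBuilder use (the field names are hypothetical;
+// NewValue and NewStringExact are defined in this package):
+//
+//	var b DefBuilder
+//	b.Add("go", NewValue(NewStringExact("Add")))
+//	b.Add("base", NewValue(NewStringExact("int")))
+//	d := b.Build() // {go: Add, base: int}; unlisted fields are implicitly Top.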
+
+// Exact returns true if all field Values are exact.
+func (d Def) Exact() bool {
+ for _, v := range d.fields {
+ if !v.Exact() {
+ return false
+ }
+ }
+ return true
+}
+
+// WhyNotExact returns a description of why the value is not exact, or "" if
+// it is exact.
+func (d Def) WhyNotExact() string {
+ for s, v := range d.fields {
+ if !v.Exact() {
+ w := v.WhyNotExact()
+ return "field " + s + ": " + w
+ }
+ }
+ return ""
+}
+
+func (d Def) decode(rv reflect.Value) error {
+ if rv.Kind() != reflect.Struct {
+ return fmt.Errorf("cannot decode Def into %s", rv.Type())
+ }
+
+ var lowered map[string]string // Lower case -> canonical for d.fields.
+ rt := rv.Type()
+ for fi := range rv.NumField() {
+ fType := rt.Field(fi)
+ if fType.PkgPath != "" {
+ continue
+ }
+ v := d.fields[fType.Name]
+ if v == nil {
+ v = topValue
+
+ // Try a case-insensitive match
+			lv, ok := d.fields[strings.ToLower(fType.Name)]
+			if ok {
+				v = lv
+ } else {
+ if lowered == nil {
+ lowered = make(map[string]string, len(d.fields))
+ for k := range d.fields {
+ l := strings.ToLower(k)
+ if k != l {
+ lowered[l] = k
+ }
+ }
+ }
+ canon, ok := lowered[strings.ToLower(fType.Name)]
+ if ok {
+ v = d.fields[canon]
+ }
+ }
+ }
+ if err := decodeReflect(v, rv.Field(fi)); err != nil {
+ return newDecodeError(fType.Name, err)
+ }
+ }
+ return nil
+}
+
+func (d Def) keys() []string {
+ return slices.Sorted(maps.Keys(d.fields))
+}
+
+func (d Def) All() iter.Seq2[string, *Value] {
+ // TODO: We call All fairly often. It's probably bad to sort this every
+ // time.
+ keys := slices.Sorted(maps.Keys(d.fields))
+ return func(yield func(string, *Value) bool) {
+ for _, k := range keys {
+ if !yield(k, d.fields[k]) {
+ return
+ }
+ }
+ }
+}
+
+// A Tuple is a sequence of Values in one of two forms: 1. a fixed-length tuple,
+// where each Value can be different or 2. a "repeated tuple", which is a Value
+// repeated 0 or more times.
+type Tuple struct {
+ vs []*Value
+
+ // repeat, if non-nil, means this Tuple consists of an element repeated 0 or
+ // more times. If repeat is non-nil, vs must be nil. This is a generator
+ // function because we don't necessarily want *exactly* the same Value
+ // repeated. For example, in YAML encoding, a !sum in a repeated tuple needs
+ // a fresh variable in each instance.
+ repeat []func(envSet) (*Value, envSet)
+}
+
+func NewTuple(vs ...*Value) Tuple {
+ return Tuple{vs: vs}
+}
+
+func NewRepeat(gens ...func(envSet) (*Value, envSet)) Tuple {
+ return Tuple{repeat: gens}
+}
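+
+// Illustrative sketch of the two Tuple forms (the values are hypothetical).
+// The repeat generator here reuses a single Value and leaves the environment
+// unchanged; generators for sums would instead mint fresh variables.
+//
+//	pair := NewTuple(NewValue(NewStringExact("a")), NewValue(NewStringExact("b")))
+//	rep := NewRepeat(func(e envSet) (*Value, envSet) {
+//		return NewValue(NewStringExact("a")), e
+//	})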
+
+func (d Tuple) Exact() bool {
+ if d.repeat != nil {
+ return false
+ }
+ for _, v := range d.vs {
+ if !v.Exact() {
+ return false
+ }
+ }
+ return true
+}
+
+func (d Tuple) WhyNotExact() string {
+ if d.repeat != nil {
+ return "d.repeat is not nil"
+ }
+ for i, v := range d.vs {
+ if !v.Exact() {
+ w := v.WhyNotExact()
+ return "index " + strconv.FormatInt(int64(i), 10) + ": " + w
+ }
+ }
+ return ""
+}
+
+func (d Tuple) decode(rv reflect.Value) error {
+ if d.repeat != nil {
+ return &inexactError{"repeated tuple", rv.Type().String()}
+ }
+ // TODO: We could also do arrays.
+ if rv.Kind() != reflect.Slice {
+ return fmt.Errorf("cannot decode Tuple into %s", rv.Type())
+ }
+ if rv.IsNil() || rv.Cap() < len(d.vs) {
+ rv.Set(reflect.MakeSlice(rv.Type(), len(d.vs), len(d.vs)))
+ } else {
+ rv.SetLen(len(d.vs))
+ }
+ for i, v := range d.vs {
+ if err := decodeReflect(v, rv.Index(i)); err != nil {
+ return newDecodeError(fmt.Sprintf("%d", i), err)
+ }
+ }
+ return nil
+}
+
+// A String represents a set of strings. It can represent the intersection of a
+// set of regexps, or a single exact string. In general, the domain of a String
+// is non-empty, but we do not attempt to prove emptiness of a regexp value.
+type String struct {
+ kind stringKind
+ re []*regexp.Regexp // Intersection of regexps
+ exact string
+}
+
+type stringKind int
+
+const (
+ stringRegex stringKind = iota
+ stringExact
+)
+
+func NewStringRegex(exprs ...string) (String, error) {
+ if len(exprs) == 0 {
+ exprs = []string{""}
+ }
+	v := String{kind: -1} // -1 marks "no kind decided yet"
+ for _, expr := range exprs {
+ if expr == "" {
+ // Skip constructing the regexp. It won't have a "literal prefix"
+ // and so we wind up thinking this is a regexp instead of an exact
+ // (empty) string.
+ v = String{kind: stringExact, exact: ""}
+ continue
+ }
+
+ re, err := regexp.Compile(`\A(?:` + expr + `)\z`)
+ if err != nil {
+ return String{}, fmt.Errorf("parsing value: %s", err)
+ }
+
+		// An exact value narrows the whole domain to exact, so we're done, but
+		// we keep parsing the remaining expressions to surface any syntax errors.
+ if v.kind == stringExact {
+ continue
+ }
+
+ if exact, complete := re.LiteralPrefix(); complete {
+ v = String{kind: stringExact, exact: exact}
+ } else {
+ v.kind = stringRegex
+ v.re = append(v.re, re)
+ }
+ }
+ return v, nil
+}
+
+func NewStringExact(s string) String {
+ return String{kind: stringExact, exact: s}
+}
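+
+// Illustrative sketch of the two constructors (error handling elided):
+//
+//	s1 := NewStringExact("abc")     // exactly the string "abc"
+//	s2, _ := NewStringRegex("ab?c") // the set {"ac", "abc"}
+//	// s1.Exact() == true, s2.Exact() == false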
+
+// Exact returns whether this Value is known to consist of a single string.
+func (d String) Exact() bool {
+ return d.kind == stringExact
+}
+
+func (d String) WhyNotExact() string {
+ if d.kind == stringExact {
+ return ""
+ }
+ return "string is not exact"
+}
+
+func (d String) decode(rv reflect.Value) error {
+ if d.kind != stringExact {
+ return &inexactError{"regex", rv.Type().String()}
+ }
+ switch rv.Kind() {
+ default:
+ return fmt.Errorf("cannot decode String into %s", rv.Type())
+ case reflect.String:
+ rv.SetString(d.exact)
+ case reflect.Int:
+ i, err := strconv.Atoi(d.exact)
+ if err != nil {
+ return fmt.Errorf("cannot decode String into %s: %s", rv.Type(), err)
+ }
+ rv.SetInt(int64(i))
+ case reflect.Bool:
+ b, err := strconv.ParseBool(d.exact)
+ if err != nil {
+ return fmt.Errorf("cannot decode String into %s: %s", rv.Type(), err)
+ }
+ rv.SetBool(b)
+ }
+ return nil
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "bytes"
+ "fmt"
+ "html"
+ "io"
+ "os"
+ "os/exec"
+ "strings"
+)
+
+const maxNodes = 30
+
+type dotEncoder struct {
+ w *bytes.Buffer
+
+ idGen int // Node name generation
+ valLimit int // Limit the number of Values in a subgraph
+
+ idp identPrinter
+}
+
+func newDotEncoder() *dotEncoder {
+ return &dotEncoder{
+ w: new(bytes.Buffer),
+ }
+}
+
+func (enc *dotEncoder) clear() {
+ enc.w.Reset()
+ enc.idGen = 0
+}
+
+func (enc *dotEncoder) writeTo(w io.Writer) {
+ fmt.Fprintln(w, "digraph {")
+ // Use the "new" ranking algorithm, which lets us put nodes from different
+ // clusters in the same rank.
+ fmt.Fprintln(w, "newrank=true;")
+ fmt.Fprintln(w, "node [shape=box, ordering=out];")
+
+ w.Write(enc.w.Bytes())
+ fmt.Fprintln(w, "}")
+}
+
+func (enc *dotEncoder) writeSvg(w io.Writer) error {
+ cmd := exec.Command("dot", "-Tsvg")
+ in, err := cmd.StdinPipe()
+ if err != nil {
+ return err
+ }
+ var out bytes.Buffer
+ cmd.Stdout = &out
+ cmd.Stderr = os.Stderr
+ if err := cmd.Start(); err != nil {
+ return err
+ }
+ enc.writeTo(in)
+ in.Close()
+ if err := cmd.Wait(); err != nil {
+ return err
+ }
+ // Trim SVG header so the result can be embedded
+ //
+ // TODO: In Graphviz 10.0.1, we could use -Tsvg_inline.
+ svg := out.Bytes()
+ if i := bytes.Index(svg, []byte("<svg ")); i >= 0 {
+ svg = svg[i:]
+ }
+ _, err = w.Write(svg)
+ return err
+}
+
+func (enc *dotEncoder) newID(f string) string {
+ id := fmt.Sprintf(f, enc.idGen)
+ enc.idGen++
+ return id
+}
+
+func (enc *dotEncoder) node(label, sublabel string) string {
+ id := enc.newID("n%d")
+ l := html.EscapeString(label)
+ if sublabel != "" {
+ l += fmt.Sprintf("<BR ALIGN=\"CENTER\"/><FONT POINT-SIZE=\"10\">%s</FONT>", html.EscapeString(sublabel))
+ }
+ fmt.Fprintf(enc.w, "%s [label=<%s>];\n", id, l)
+ return id
+}
+
+func (enc *dotEncoder) edge(from, to string, label string, args ...any) {
+ l := fmt.Sprintf(label, args...)
+ fmt.Fprintf(enc.w, "%s -> %s [label=%q];\n", from, to, l)
+}
+
+func (enc *dotEncoder) valueSubgraph(v *Value) {
+ enc.valLimit = maxNodes
+ cID := enc.newID("cluster_%d")
+ fmt.Fprintf(enc.w, "subgraph %s {\n", cID)
+ fmt.Fprintf(enc.w, "style=invis;")
+ vID := enc.value(v)
+ fmt.Fprintf(enc.w, "}\n")
+ // We don't need the IDs right now.
+ _, _ = cID, vID
+}
+
+func (enc *dotEncoder) value(v *Value) string {
+ if enc.valLimit <= 0 {
+ id := enc.newID("n%d")
+ fmt.Fprintf(enc.w, "%s [label=\"...\", shape=triangle];\n", id)
+ return id
+ }
+ enc.valLimit--
+
+ switch vd := v.Domain.(type) {
+ default:
+ panic(fmt.Sprintf("unknown domain type %T", vd))
+
+ case nil:
+ return enc.node("_|_", "")
+
+ case Top:
+ return enc.node("_", "")
+
+ // TODO: Like in YAML, figure out if this is just a sum. In dot, we
+ // could say any unentangled variable is a sum, and if it has more than
+ // one reference just share the node.
+
+ // case Sum:
+ // node := enc.node("Sum", "")
+ // for i, elt := range vd.vs {
+ // enc.edge(node, enc.value(elt), "%d", i)
+ // if enc.valLimit <= 0 {
+ // break
+ // }
+ // }
+ // return node
+
+ case Def:
+ node := enc.node("Def", "")
+ for k, v := range vd.All() {
+ enc.edge(node, enc.value(v), "%s", k)
+ if enc.valLimit <= 0 {
+ break
+ }
+ }
+ return node
+
+ case Tuple:
+ if vd.repeat == nil {
+ label := "Tuple"
+ node := enc.node(label, "")
+ for i, elt := range vd.vs {
+ enc.edge(node, enc.value(elt), "%d", i)
+ if enc.valLimit <= 0 {
+ break
+ }
+ }
+ return node
+ } else {
+ // TODO
+ return enc.node("TODO: Repeat", "")
+ }
+
+ case String:
+ switch vd.kind {
+ case stringExact:
+ return enc.node(fmt.Sprintf("%q", vd.exact), "")
+ case stringRegex:
+ var parts []string
+ for _, re := range vd.re {
+ parts = append(parts, fmt.Sprintf("%q", re))
+ }
+ return enc.node(strings.Join(parts, "&"), "")
+ }
+ panic("bad String kind")
+
+ case Var:
+ return enc.node(fmt.Sprintf("Var %s", enc.idp.unique(vd.id)), "")
+ }
+}
+
+func (enc *dotEncoder) envSubgraph(e envSet) {
+ enc.valLimit = maxNodes
+ cID := enc.newID("cluster_%d")
+ fmt.Fprintf(enc.w, "subgraph %s {\n", cID)
+ fmt.Fprintf(enc.w, "style=invis;")
+ vID := enc.env(e.root)
+ fmt.Fprintf(enc.w, "}\n")
+ _, _ = cID, vID
+}
+
+func (enc *dotEncoder) env(e *envExpr) string {
+ switch e.kind {
+ default:
+ panic("bad kind")
+ case envZero:
+ return enc.node("0", "")
+ case envUnit:
+ return enc.node("1", "")
+ case envBinding:
+ node := enc.node(fmt.Sprintf("%q :", enc.idp.unique(e.id)), "")
+ enc.edge(node, enc.value(e.val), "")
+ return node
+ case envProduct:
+ node := enc.node("⨯", "")
+ for _, op := range e.operands {
+ enc.edge(node, enc.env(op), "")
+ }
+ return node
+ case envSum:
+ node := enc.node("+", "")
+ for _, op := range e.operands {
+ enc.edge(node, enc.env(op), "")
+ }
+ return node
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "iter"
+ "reflect"
+ "strings"
+)
+
+// An envSet is an immutable set of environments, where each environment is a
+// mapping from [ident]s to [Value]s.
+//
+// To keep this compact, we use an algebraic representation similar to
+// relational algebra. The atoms are zero, unit, or a singular binding:
+//
+// - A singular binding {x: v} is an environment set consisting of a single
+// environment that binds a single ident x to a single value v.
+//
+// - Zero (0) is the empty set.
+//
+// - Unit (1) is an environment set consisting of a single, empty environment
+// (no bindings).
+//
+// From these, we build up more complex sets of environments using sums and
+// cross products:
+//
+// - A sum, E + F, is simply the union of the two environment sets: E ∪ F
+//
+// - A cross product, E ⨯ F, is the Cartesian product of the two environment
+// sets, followed by joining each pair of environments: {e ⊕ f | (e, f) ∊ E ⨯ F}
+//
+// The join of two environments, e ⊕ f, is an environment that contains all of
+// the bindings in either e or f. To detect bugs, it is an error if an
+// identifier is bound in both e and f (however, see below for what we could do
+// differently).
+//
+// Environment sets form a commutative semiring and thus obey the usual
+// commutative semiring rules:
+//
+// e + 0 = e
+// e ⨯ 0 = 0
+// e ⨯ 1 = e
+// e + f = f + e
+// e ⨯ f = f ⨯ e
+//
+// Furthermore, environment sets are additively and multiplicatively idempotent
+// because + and ⨯ are themselves defined in terms of sets:
+//
+// e + e = e
+// e ⨯ e = e
+//
+// # Examples
+//
+// To represent {{x: 1, y: 1}, {x: 2, y: 2}}, we build the two environments and
+// sum them:
+//
+// ({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})
+//
+// If we add a third variable z that can be 1 or 2, independent of x and y, we
+// get four logical environments:
+//
+// {x: 1, y: 1, z: 1}
+// {x: 2, y: 2, z: 1}
+// {x: 1, y: 1, z: 2}
+// {x: 2, y: 2, z: 2}
+//
+// This could be represented as a sum of all four environments, but because z is
+// independent, we can use a more compact representation:
+//
+// (({x: 1} ⨯ {y: 1}) + ({x: 2} ⨯ {y: 2})) ⨯ ({z: 1} + {z: 2})
+//
+// # Generalized cross product
+//
+// While cross-product is currently restricted to disjoint environments, we
+// could generalize the definition of joining two environments to:
+//
+// {xₖ: vₖ} ⊕ {xₖ: wₖ} = {xₖ: vₖ ∩ wₖ} (where unbound idents are bound to the [Top] value, ⟙)
+//
+// where v ∩ w is the unification of v and w. This itself could be coarsened to
+//
+// v ∩ w = v if w = ⟙
+// = w if v = ⟙
+// = v if v = w
+// = 0 otherwise
+//
+// We could use this rule to implement substitution. For example, E ⨯ {x: 1}
+// narrows environment set E to only environments in which x is bound to 1. But
+// we currently don't do this.
+type envSet struct {
+ root *envExpr
+}
+
+type envExpr struct {
+ // TODO: A tree-based data structure for this may not be ideal, since it
+ // involves a lot of walking to find things and we often have to do deep
+ // rewrites anyway for partitioning. Would some flattened array-style
+ // representation be better, possibly combined with an index of ident uses?
+ // We could even combine that with an immutable array abstraction (ala
+ // Clojure) that could enable more efficient construction operations.
+
+ kind envExprKind
+
+ // For envBinding
+ id *ident
+ val *Value
+
+ // For sum or product. Len must be >= 2 and none of the elements can have
+ // the same kind as this node.
+ operands []*envExpr
+}
+
+type envExprKind byte
+
+const (
+ envZero envExprKind = iota
+ envUnit
+ envProduct
+ envSum
+ envBinding
+)
+
+var (
+	// topEnv is the unit value (multiplicative identity) of an [envSet].
+	topEnv = envSet{envExprUnit}
+	// bottomEnv is the zero value (additive identity) of an [envSet].
+	bottomEnv = envSet{envExprZero}
+
+ envExprZero = &envExpr{kind: envZero}
+ envExprUnit = &envExpr{kind: envUnit}
+)
+
+// bind binds id to each of vals in e.
+//
+// It panics if id is already bound in e.
+//
+// Environments are typically initially constructed by starting with [topEnv]
+// and calling bind one or more times.
+func (e envSet) bind(id *ident, vals ...*Value) envSet {
+ if e.isEmpty() {
+ return bottomEnv
+ }
+
+ // TODO: If any of vals are _, should we just drop that val? We're kind of
+ // inconsistent about whether an id missing from e means id is invalid or
+ // means id is _.
+
+ // Check that id isn't present in e.
+ for range e.root.bindings(id) {
+ panic("id " + id.name + " already present in environment")
+ }
+
+ // Create a sum of all the values.
+ bindings := make([]*envExpr, 0, 1)
+ for _, val := range vals {
+ bindings = append(bindings, &envExpr{kind: envBinding, id: id, val: val})
+ }
+
+ // Multiply it in.
+ return envSet{newEnvExprProduct(e.root, newEnvExprSum(bindings...))}
+}
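+
+// Illustrative sketch (package-internal; the ident name is hypothetical):
+// starting from topEnv, bind $x to two alternative values, yielding the
+// environment set {x: a} + {x: b}.
+//
+//	x := &ident{name: "x"}
+//	e := topEnv.bind(x,
+//		NewValue(NewStringExact("a")),
+//		NewValue(NewStringExact("b")))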
+
+func (e envSet) isEmpty() bool {
+ return e.root.kind == envZero
+}
+
+// bindings yields all [envBinding] nodes in e with the given id. If id is nil,
+// it yields all binding nodes.
+func (e *envExpr) bindings(id *ident) iter.Seq[*envExpr] {
+ // This is just a pre-order walk and it happens this is the only thing we
+ // need a pre-order walk for.
+ return func(yield func(*envExpr) bool) {
+ var rec func(e *envExpr) bool
+ rec = func(e *envExpr) bool {
+ if e.kind == envBinding && (id == nil || e.id == id) {
+ if !yield(e) {
+ return false
+ }
+ }
+ for _, o := range e.operands {
+ if !rec(o) {
+ return false
+ }
+ }
+ return true
+ }
+ rec(e)
+ }
+}
+
+// newEnvExprProduct constructs a product node from exprs, performing
+// simplifications. It does NOT check that bindings are disjoint.
+func newEnvExprProduct(exprs ...*envExpr) *envExpr {
+ factors := make([]*envExpr, 0, 2)
+ for _, expr := range exprs {
+ switch expr.kind {
+ case envZero:
+ return envExprZero
+ case envUnit:
+ // No effect on product
+ case envProduct:
+ factors = append(factors, expr.operands...)
+ default:
+ factors = append(factors, expr)
+ }
+ }
+
+ if len(factors) == 0 {
+ return envExprUnit
+ } else if len(factors) == 1 {
+ return factors[0]
+ }
+ return &envExpr{kind: envProduct, operands: factors}
+}
+
+// newEnvExprSum constructs a sum node from exprs, performing simplifications.
+func newEnvExprSum(exprs ...*envExpr) *envExpr {
+ // TODO: If all of envs are products (or bindings), factor any common terms.
+ // E.g., x * y + x * z ==> x * (y + z). This is easy to do for binding
+ // terms, but harder to do for more general terms.
+
+ var have smallSet[*envExpr]
+ terms := make([]*envExpr, 0, 2)
+ for _, expr := range exprs {
+ switch expr.kind {
+ case envZero:
+ // No effect on sum
+ case envSum:
+ for _, expr1 := range expr.operands {
+ if have.Add(expr1) {
+ terms = append(terms, expr1)
+ }
+ }
+ default:
+ if have.Add(expr) {
+ terms = append(terms, expr)
+ }
+ }
+ }
+
+ if len(terms) == 0 {
+ return envExprZero
+ } else if len(terms) == 1 {
+ return terms[0]
+ }
+ return &envExpr{kind: envSum, operands: terms}
+}
+
+func crossEnvs(env1, env2 envSet) envSet {
+ // Confirm that envs have disjoint idents.
+ var ids1 smallSet[*ident]
+ for e := range env1.root.bindings(nil) {
+ ids1.Add(e.id)
+ }
+ for e := range env2.root.bindings(nil) {
+ if ids1.Has(e.id) {
+ panic(fmt.Sprintf("%s bound on both sides of cross-product", e.id.name))
+ }
+ }
+
+ return envSet{newEnvExprProduct(env1.root, env2.root)}
+}
+
+func unionEnvs(envs ...envSet) envSet {
+ exprs := make([]*envExpr, len(envs))
+ for i := range envs {
+ exprs[i] = envs[i].root
+ }
+ return envSet{newEnvExprSum(exprs...)}
+}
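+
+// Illustrative sketch: two independent bindings combine with a cross product,
+// matching the compact representation described in the envSet comment above
+// (the idents and values are hypothetical).
+//
+//	x, z := &ident{name: "x"}, &ident{name: "z"}
+//	ex := topEnv.bind(x, NewValue(NewStringExact("1")), NewValue(NewStringExact("2")))
+//	ez := topEnv.bind(z, NewValue(NewStringExact("1")), NewValue(NewStringExact("2")))
+//	e := crossEnvs(ex, ez) // ({x:1} + {x:2}) ⨯ ({z:1} + {z:2})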
+
+// An envPartition is the subset of an envSet in which id is bound to value.
+// The partition's env no longer binds id.
+type envPartition struct {
+ id *ident
+ value *Value
+ env envSet
+}
+
+// partitionBy splits e by distinct bindings of id and removes id from each
+// partition.
+//
+// If there are environments in e where id is not bound, they will not be
+// reflected in any partition.
+//
+// It panics if e is bottom, since attempting to partition an empty environment
+// set almost certainly indicates a bug.
+func (e envSet) partitionBy(id *ident) []envPartition {
+ if e.isEmpty() {
+ // We could return zero partitions, but getting here at all almost
+ // certainly indicates a bug.
+ panic("cannot partition empty environment set")
+ }
+
+ // Emit a partition for each value of id.
+ var seen smallSet[*Value]
+ var parts []envPartition
+ for n := range e.root.bindings(id) {
+ if !seen.Add(n.val) {
+ // Already emitted a partition for this value.
+ continue
+ }
+
+ parts = append(parts, envPartition{
+ id: id,
+ value: n.val,
+ env: envSet{e.root.substitute(id, n.val)},
+ })
+ }
+
+ return parts
+}
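+
+// Illustrative sketch, assuming e and x from the bind sketch above:
+// partitioning by x yields one envPartition per distinct value of x, with x
+// removed from each partition's environments.
+//
+//	for _, p := range e.partitionBy(x) {
+//		_ = p.value // "a" in one partition, "b" in the other
+//		_ = p.env   // the environments that had x bound to p.value
+//	}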
+
+// substitute replaces bindings of id to val with 1 and bindings of id to any
+// other value with 0 and simplifies the result.
+func (e *envExpr) substitute(id *ident, val *Value) *envExpr {
+ switch e.kind {
+ default:
+ panic("bad kind")
+
+ case envZero, envUnit:
+ return e
+
+ case envBinding:
+ if e.id != id {
+ return e
+ } else if e.val != val {
+ return envExprZero
+ } else {
+ return envExprUnit
+ }
+
+ case envProduct, envSum:
+ // Substitute each operand. Sometimes, this won't change anything, so we
+ // build the new operands list lazily.
+ var nOperands []*envExpr
+ for i, op := range e.operands {
+ nOp := op.substitute(id, val)
+ if nOperands == nil && op != nOp {
+ // Operand diverged; initialize nOperands.
+ nOperands = make([]*envExpr, 0, len(e.operands))
+ nOperands = append(nOperands, e.operands[:i]...)
+ }
+ if nOperands != nil {
+ nOperands = append(nOperands, nOp)
+ }
+ }
+ if nOperands == nil {
+ // Nothing changed.
+ return e
+ }
+ if e.kind == envProduct {
+ return newEnvExprProduct(nOperands...)
+ } else {
+ return newEnvExprSum(nOperands...)
+ }
+ }
+}
+
+// A smallSet is a set optimized for stack allocation when small.
+type smallSet[T comparable] struct {
+ array [32]T
+ n int
+
+ m map[T]struct{}
+}
+
+// Has returns whether val is in set.
+func (s *smallSet[T]) Has(val T) bool {
+ arr := s.array[:s.n]
+ for i := range arr {
+ if arr[i] == val {
+ return true
+ }
+ }
+ _, ok := s.m[val]
+ return ok
+}
+
+// Add adds val to the set and returns true if it was added (not already
+// present).
+func (s *smallSet[T]) Add(val T) bool {
+ // Test for presence.
+ if s.Has(val) {
+ return false
+ }
+
+ // Add it
+ if s.n < len(s.array) {
+ s.array[s.n] = val
+ s.n++
+ } else {
+ if s.m == nil {
+ s.m = make(map[T]struct{})
+ }
+ s.m[val] = struct{}{}
+ }
+ return true
+}
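+
+// Illustrative sketch: a smallSet de-duplicates comparable values without
+// allocating until it outgrows its inline array.
+//
+//	var s smallSet[string]
+//	_ = s.Add("x") // true: newly added
+//	_ = s.Add("x") // false: already present
+//	_ = s.Has("x") // true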
+
+type ident struct {
+ _ [0]func() // Not comparable (only compare *ident)
+ name string
+}
+
+type Var struct {
+ id *ident
+}
+
+func (d Var) Exact() bool {
+ // These can't appear in concrete Values.
+ panic("Exact called on non-concrete Value")
+}
+
+func (d Var) WhyNotExact() string {
+ // These can't appear in concrete Values.
+ return "WhyNotExact called on non-concrete Value"
+}
+
+func (d Var) decode(rv reflect.Value) error {
+ return &inexactError{"var", rv.Type().String()}
+}
+
+func (d Var) unify(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
+ // TODO: Vars from !sums in the input can have a huge number of values.
+ // Unifying these could be way more efficient with some indexes over any
+ // exact values we can pull out, like Def fields that are exact Strings.
+ // Maybe we try to produce an array of yes/no/maybe matches and then we only
+ // have to do deeper evaluation of the maybes. We could probably cache this
+	// on an envExpr. It may also help to special-case Var/Var unification to
+ // pick which one to index versus enumerate.
+
+ if vd, ok := w.Domain.(Var); ok && d.id == vd.id {
+ // Unifying $x with $x results in $x. If we descend into this we'll have
+ // problems because we strip $x out of the environment to keep ourselves
+ // honest and then can't find it on the other side.
+ //
+ // TODO: I'm not positive this is the right fix.
+ return vd, e, nil
+ }
+
+ // We need to unify w with the value of d in each possible environment. We
+ // can save some work by grouping environments by the value of d, since
+ // there will be a lot of redundancy here.
+ var nEnvs []envSet
+ envParts := e.partitionBy(d.id)
+ for i, envPart := range envParts {
+ exit := uf.enterVar(d.id, i)
+ // Each branch logically gets its own copy of the initial environment
+ // (narrowed down to just this binding of the variable), and each branch
+ // may result in different changes to that starting environment.
+ res, e2, err := w.unify(envPart.value, envPart.env, swap, uf)
+ exit.exit()
+ if err != nil {
+ return nil, envSet{}, err
+ }
+ if res.Domain == nil {
+ // This branch entirely failed to unify, so it's gone.
+ continue
+ }
+ nEnv := e2.bind(d.id, res)
+ nEnvs = append(nEnvs, nEnv)
+ }
+
+ if len(nEnvs) == 0 {
+ // All branches failed
+ return nil, bottomEnv, nil
+ }
+
+	// The effect of this is entirely captured in the environment, so we can
+	// return the same Var.
+ return d, unionEnvs(nEnvs...), nil
+}
+
+// An identPrinter maps [ident]s to unique string names.
+type identPrinter struct {
+ ids map[*ident]string
+ idGen map[string]int
+}
+
+func (p *identPrinter) unique(id *ident) string {
+ if p.ids == nil {
+ p.ids = make(map[*ident]string)
+ p.idGen = make(map[string]int)
+ }
+
+ name, ok := p.ids[id]
+ if !ok {
+ gen := p.idGen[id.name]
+ p.idGen[id.name]++
+ if gen == 0 {
+ name = id.name
+ } else {
+ name = fmt.Sprintf("%s#%d", id.name, gen)
+ }
+ p.ids[id] = name
+ }
+
+ return name
+}
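+
+// Illustrative sketch: distinct idents that share a name print distinctly.
+//
+//	var p identPrinter
+//	a, b := &ident{name: "x"}, &ident{name: "x"}
+//	_ = p.unique(a) // "x"
+//	_ = p.unique(b) // "x#1"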
+
+func (p *identPrinter) slice(ids []*ident) string {
+ var strs []string
+ for _, id := range ids {
+ strs = append(strs, p.unique(id))
+ }
+ return fmt.Sprintf("[%s]", strings.Join(strs, ", "))
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "html"
+ "io"
+ "strings"
+)
+
+func (t *tracer) writeHTML(w io.Writer) {
+ if !t.saveTree {
+ panic("writeHTML called without tracer.saveTree")
+ }
+
+ fmt.Fprintf(w, "<html><head><style>%s</style></head>", htmlCSS)
+ for _, root := range t.trees {
+ dot := newDotEncoder()
+		ht := htmlTracer{w: w, dot: dot}
+		ht.writeTree(root)
+ }
+ fmt.Fprintf(w, "</html>\n")
+}
+
+const htmlCSS = `
+.unify {
+ display: grid;
+ grid-auto-columns: min-content;
+ text-align: center;
+}
+
+.header {
+ grid-row: 1;
+ font-weight: bold;
+ padding: 0.25em;
+ position: sticky;
+ top: 0;
+ background: white;
+}
+
+.envFactor {
+ display: grid;
+ grid-auto-rows: min-content;
+ grid-template-columns: subgrid;
+ text-align: center;
+}
+`
+
+type htmlTracer struct {
+ w io.Writer
+ dot *dotEncoder
+ svgs map[any]string
+}
+
+func (t *htmlTracer) writeTree(node *traceTree) {
+ // TODO: This could be really nice.
+ //
+ // - Put nodes that were unified on the same rank with {rank=same; a; b}
+ //
+ // - On hover, highlight nodes that node was unified with and the result. If
+ // it's a variable, highlight it in the environment, too.
+ //
+ // - On click, show the details of unifying that node.
+ //
+ // This could be the only way to navigate, without necessarily needing the
+ // whole nest of <detail> nodes.
+
+ // TODO: It might be possible to write this out on the fly.
+
+ t.emit([]*Value{node.v, node.w}, []string{"v", "w"}, node.envIn)
+
+ // Render children.
+ for i, child := range node.children {
+ if i >= 10 {
+ fmt.Fprintf(t.w, `<div style="margin-left: 4em">...</div>`)
+ break
+ }
+ fmt.Fprintf(t.w, `<details style="margin-left: 4em"><summary>%s</summary>`, html.EscapeString(child.label))
+ t.writeTree(child)
+ fmt.Fprintf(t.w, "</details>\n")
+ }
+
+ // Render result.
+ if node.err != nil {
+ fmt.Fprintf(t.w, "Error: %s\n", html.EscapeString(node.err.Error()))
+ } else {
+ t.emit([]*Value{node.res}, []string{"res"}, node.env)
+ }
+}
+
+func htmlSVG[Key comparable](t *htmlTracer, f func(Key), arg Key) string {
+ if s, ok := t.svgs[arg]; ok {
+ return s
+ }
+ var buf strings.Builder
+ f(arg)
+ t.dot.writeSvg(&buf)
+ t.dot.clear()
+ svg := buf.String()
+ if t.svgs == nil {
+ t.svgs = make(map[any]string)
+ }
+ t.svgs[arg] = svg
+ buf.Reset()
+ return svg
+}
+
+func (t *htmlTracer) emit(vs []*Value, labels []string, env envSet) {
+ fmt.Fprintf(t.w, `<div class="unify">`)
+ for i, v := range vs {
+ fmt.Fprintf(t.w, `<div class="header" style="grid-column: %d">%s</div>`, i+1, html.EscapeString(labels[i]))
+ fmt.Fprintf(t.w, `<div style="grid-area: 2 / %d">%s</div>`, i+1, htmlSVG(t, t.dot.valueSubgraph, v))
+ }
+ col := len(vs)
+
+ fmt.Fprintf(t.w, `<div class="header" style="grid-column: %d">in</div>`, col+1)
+ fmt.Fprintf(t.w, `<div style="grid-area: 2 / %d">%s</div>`, col+1, htmlSVG(t, t.dot.envSubgraph, env))
+
+ fmt.Fprintf(t.w, `</div>`)
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+)
+
+type Pos struct {
+ Path string
+ Line int
+}
+
+func (p Pos) String() string {
+ var b []byte
+ b, _ = p.AppendText(b)
+ return string(b)
+}
+
+func (p Pos) AppendText(b []byte) ([]byte, error) {
+ if p.Line == 0 {
+ if p.Path == "" {
+ return append(b, "?:?"...), nil
+ } else {
+ return append(b, p.Path...), nil
+ }
+ } else if p.Path == "" {
+ return fmt.Appendf(b, "?:%d", p.Line), nil
+ }
+ return fmt.Appendf(b, "%s:%d", p.Path, p.Line), nil
+}
--- /dev/null
+# In the original representation of environments, this caused an exponential
+# blowup in time and allocation. With that representation, this took about 20
+# seconds on my laptop and had a max RSS of ~12 GB. Big enough to be really
+# noticeable, but not so big it's likely to crash a developer machine. With the
+# better environment representation, it runs almost instantly and has an RSS of
+# ~90 MB.
+unify:
+- !sum
+ - !sum [1, 2]
+ - !sum [3, 4]
+ - !sum [5, 6]
+ - !sum [7, 8]
+ - !sum [9, 10]
+ - !sum [11, 12]
+ - !sum [13, 14]
+ - !sum [15, 16]
+ - !sum [17, 18]
+ - !sum [19, 20]
+ - !sum [21, 22]
+- !sum
+ - !sum [1, 2]
+ - !sum [3, 4]
+ - !sum [5, 6]
+ - !sum [7, 8]
+ - !sum [9, 10]
+ - !sum [11, 12]
+ - !sum [13, 14]
+ - !sum [15, 16]
+ - !sum [17, 18]
+ - !sum [19, 20]
+ - !sum [21, 22]
+all:
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
--- /dev/null
+# Basic tests of unification
+
+#
+# Terminals
+#
+
+unify:
+- _
+- _
+want:
+ _
+---
+unify:
+- _
+- test
+want:
+ test
+---
+unify:
+- test
+- t?est
+want:
+ test
+---
+unify:
+- 1
+- 1
+want:
+ 1
+---
+unify:
+- test
+- foo
+want:
+ _|_
+
+#
+# Tuple
+#
+
+---
+unify:
+- [a, b]
+- [a, b]
+want:
+ [a, b]
+---
+unify:
+- [a, _]
+- [_, b]
+want:
+ [a, b]
+---
+unify:
+- ["ab?c", "de?f"]
+- [ac, def]
+want:
+ [ac, def]
+
+#
+# Repeats
+#
+
+---
+unify:
+- !repeat [a]
+- [_]
+want:
+ [a]
+---
+unify:
+- !repeat [a]
+- [_, _]
+want:
+ [a, a]
+---
+unify:
+- !repeat [a]
+- [b]
+want:
+ _|_
+---
+unify:
+- !repeat [xy*]
+- [x, xy, xyy]
+want:
+ [x, xy, xyy]
+---
+unify:
+- !repeat [xy*]
+- !repeat ["xz?y*"]
+- [x, xy, xyy]
+want:
+ [x, xy, xyy]
+---
+unify:
+- !repeat [!sum [a, b]]
+- [a, b, a]
+all:
+- [a, b, a]
+---
+unify:
+- !repeat [!sum [a, b]]
+- !repeat [!sum [b, c]]
+- [b, b, b]
+all:
+- [b, b, b]
+---
+unify:
+- !repeat [!sum [a, b]]
+- !repeat [!sum [b, c]]
+- [a]
+all: []
+
+#
+# Def
+#
+
+---
+unify:
+- {a: a, b: b}
+- {a: a, b: b}
+want:
+ {a: a, b: b}
+---
+unify:
+- {a: a}
+- {b: b}
+want:
+ {a: a, b: b}
+
+#
+# Sum
+#
+
+---
+unify:
+- !sum [1, 2]
+- !sum [2, 3]
+all:
+- 2
+---
+unify:
+- !sum [{label: a, value: abc}, {label: b, value: def}]
+- !sum [{value: "ab?c", extra: d}, {value: "def?", extra: g}]
+all:
+- {extra: d, label: a, value: abc}
+- {extra: g, label: b, value: def}
+---
+# A sum of repeats must deal with different dynamically-created variables in
+# each branch.
+unify:
+- !sum [!repeat [a], !repeat [b]]
+- [a, a, a]
+all:
+- [a, a, a]
+---
+unify:
+- !sum [!repeat [a], !repeat [b]]
+- [a, a, b]
+all: []
+---
+# Exercise sumEnvs with more than one result
+unify:
+- !sum
+ - [a|b, c|d]
+ - [e, g]
+- [!sum [a, b, e, f], !sum [c, d, g, h]]
+all:
+- [a, c]
+- [a, d]
+- [b, c]
+- [b, d]
+- [e, g]
--- /dev/null
+#
+# Basic tests
+#
+
+name: "basic string"
+unify:
+- $x
+- test
+all:
+- test
+---
+name: "basic tuple"
+unify:
+- [$x, $x]
+- [test, test]
+all:
+- [test, test]
+---
+name: "three tuples"
+unify:
+- [$x, $x]
+- [test, _]
+- [_, test]
+all:
+- [test, test]
+---
+name: "basic def"
+unify:
+- {a: $x, b: $x}
+- {a: test, b: test}
+all:
+- {a: test, b: test}
+---
+name: "three defs"
+unify:
+- {a: $x, b: $x}
+- {a: test}
+- {b: test}
+all:
+- {a: test, b: test}
+
+#
+# Bottom tests
+#
+
+---
+name: "basic bottom"
+unify:
+- [$x, $x]
+- [test, foo]
+all: []
+---
+name: "three-way bottom"
+unify:
+- [$x, $x]
+- [test, _]
+- [_, foo]
+all: []
+
+#
+# Basic sum tests
+#
+
+---
+name: "basic sum"
+unify:
+- $x
+- !sum [a, b]
+all:
+- a
+- b
+---
+name: "sum of tuples"
+unify:
+- [$x]
+- !sum [[a], [b]]
+all:
+- [a]
+- [b]
+---
+name: "acausal sum"
+unify:
+- [_, !sum [a, b]]
+- [$x, $x]
+all:
+- [a, a]
+- [b, b]
+
+#
+# Transitivity tests
+#
+
+---
+name: "transitivity"
+unify:
+- [_, _, _, test]
+- [$x, $x, _, _]
+- [ _, $x, $x, _]
+- [ _, _, $x, $x]
+all:
+- [test, test, test, test]
+
+#
+# Multiple vars
+#
+
+---
+name: "basic uncorrelated vars"
+unify:
+- - !sum [1, 2]
+ - !sum [3, 4]
+- - $a
+ - $b
+all:
+- [1, 3]
+- [1, 4]
+- [2, 3]
+- [2, 4]
+---
+name: "uncorrelated vars"
+unify:
+- - !sum [1, 2]
+ - !sum [3, 4]
+ - !sum [1, 2]
+- - $a
+ - $b
+ - $a
+all:
+- [1, 3, 1]
+- [1, 4, 1]
+- [2, 3, 2]
+- [2, 4, 2]
+---
+name: "entangled vars"
+unify:
+- - !sum [[1,2],[3,4]]
+ - !sum [[2,1],[3,4],[4,3]]
+- - [$a, $b]
+ - [$b, $a]
+all:
+- - [1, 2]
+ - [2, 1]
+- - [3, 4]
+ - [4, 3]
+
+#
+# End-to-end examples
+#
+
+---
+name: "end-to-end"
+unify:
+- go: Add
+ in:
+ - go: $t
+ - go: $t
+- in: !repeat
+ - !sum
+ - go: Int32x4
+ base: int
+ - go: Uint32x4
+ base: uint
+all:
+- go: Add
+ in:
+ - base: int
+ go: Int32x4
+ - base: int
+ go: Int32x4
+- go: Add
+ in:
+ - base: uint
+ go: Uint32x4
+ - base: uint
+ go: Uint32x4
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "io"
+ "strings"
+
+ "gopkg.in/yaml.v3"
+)
+
+// debugDotInHTML, if true, includes dot code for all graphs in the HTML. Useful
+// for debugging the dot output itself.
+const debugDotInHTML = false
+
+var Debug struct {
+ // UnifyLog, if non-nil, receives a streaming text trace of unification.
+ UnifyLog io.Writer
+
+	// HTML, if non-nil, receives an HTML trace of unification.
+ HTML io.Writer
+}
+
+type tracer struct {
+ logw io.Writer
+
+ enc yamlEncoder // Print consistent idents throughout
+
+ saveTree bool // if set, record tree; required for HTML output
+
+ path []string
+
+ node *traceTree
+ trees []*traceTree
+}
+
+type traceTree struct {
+ label string // Identifies this node as a child of parent
+ v, w *Value // Unification inputs
+ envIn envSet
+ res *Value // Unification result
+ env envSet
+ err error // or error
+
+ parent *traceTree
+ children []*traceTree
+}
+
+type tracerExit struct {
+ t *tracer
+ len int
+ node *traceTree
+}
+
+func (t *tracer) enter(pat string, vals ...any) tracerExit {
+ if t == nil {
+ return tracerExit{}
+ }
+
+ label := fmt.Sprintf(pat, vals...)
+
+ var p *traceTree
+ if t.saveTree {
+ p = t.node
+ if p != nil {
+ t.node = &traceTree{label: label, parent: p}
+ p.children = append(p.children, t.node)
+ }
+ }
+
+ t.path = append(t.path, label)
+ return tracerExit{t, len(t.path) - 1, p}
+}
+
+func (t *tracer) enterVar(id *ident, branch int) tracerExit {
+ if t == nil {
+ return tracerExit{}
+ }
+
+ // Use the tracer's ident printer
+ return t.enter("Var %s br %d", t.enc.idp.unique(id), branch)
+}
+
+func (te tracerExit) exit() {
+ if te.t == nil {
+ return
+ }
+ te.t.path = te.t.path[:te.len]
+ te.t.node = te.node
+}
+
+func indentf(prefix string, pat string, vals ...any) string {
+ s := fmt.Sprintf(pat, vals...)
+ if len(prefix) == 0 {
+ return s
+ }
+ if !strings.Contains(s, "\n") {
+ return prefix + s
+ }
+
+ indent := prefix
+ if strings.TrimLeft(prefix, " ") != "" {
+ // Prefix has non-space characters in it. Construct an all space-indent.
+ indent = strings.Repeat(" ", len(prefix))
+ }
+ return prefix + strings.ReplaceAll(s, "\n", "\n"+indent)
+}
+
+func yamlf(prefix string, node *yaml.Node) string {
+ b, err := yaml.Marshal(node)
+ if err != nil {
+ return fmt.Sprintf("<marshal failed: %s>", err)
+ }
+ return strings.TrimRight(indentf(prefix, "%s", b), " \n")
+}
+
+func (t *tracer) logf(pat string, vals ...any) {
+ if t == nil || t.logw == nil {
+ return
+ }
+ prefix := fmt.Sprintf("[%s] ", strings.Join(t.path, "/"))
+ s := indentf(prefix, pat, vals...)
+ s = strings.TrimRight(s, " \n")
+ fmt.Fprintf(t.logw, "%s\n", s)
+}
+
+func (t *tracer) traceUnify(v, w *Value, e envSet) {
+ if t == nil {
+ return
+ }
+
+ t.enc.e = e // Interpret values w.r.t. e
+ t.logf("Unify\n%s\nwith\n%s\nin\n%s",
+ yamlf(" ", t.enc.value(v)),
+ yamlf(" ", t.enc.value(w)),
+ yamlf(" ", t.enc.env(e)))
+ t.enc.e = envSet{}
+
+ if t.saveTree {
+ if t.node == nil {
+ t.node = &traceTree{}
+ t.trees = append(t.trees, t.node)
+ }
+ t.node.v, t.node.w, t.node.envIn = v, w, e
+ }
+}
+
+func (t *tracer) traceDone(res *Value, e envSet, err error) {
+ if t == nil {
+ return
+ }
+
+ if err != nil {
+ t.logf("==> %s", err)
+ } else {
+ t.logf("==>\n%s", yamlf(" ", t.enc.closure(Closure{res, e})))
+ }
+
+ if t.saveTree {
+ node := t.node
+ if node == nil {
+ panic("popped top of trace stack")
+ }
+ node.res, node.err = res, err
+ node.env = e
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package unify implements unification of structured values.
+//
+// A [Value] represents a possibly infinite set of concrete values, where a
+// value is either a string ([String]), a tuple of values ([Tuple]), or a
+// string-keyed map of values called a "def" ([Def]). These sets can be further
+// constrained by variables ([Var]). A [Value] combined with bindings of
+// variables is a [Closure].
+//
+// [Unify] finds a [Closure] that satisfies two or more other [Closure]s. This
+// can be thought of as intersecting the sets represented by these Closures'
+// values, or as the greatest lower bound/infimum of these Closures. If no such
+// Closure exists, the result of unification is "bottom", or the empty set.
+//
+// # Examples
+//
+// The regular expression "a*" is the infinite set of strings of zero or more
+// "a"s. "a*" can be unified with "a" or "aa" or "aaa", and the result is just
+// "a", "aa", or "aaa", respectively. However, unifying "a*" with "b" fails
+// because there are no values that satisfy both.
+//
+// Sums express sets directly. For example, !sum [a, b] is the set consisting of
+// "a" and "b". Unifying this with !sum [b, c] results in just "b". This also
+// makes it easy to demonstrate that unification isn't necessarily a single
+// concrete value. For example, unifying !sum [a, b, c] with !sum [b, c, d]
+// results in two concrete values: "b" and "c".
+//
+// The special value _ or "top" represents all possible values. Unifying _ with
+// any value x results in x.
+//
+// Unifying composite values—tuples and defs—unifies their elements.
+//
+// The value [a*, aa] is an infinite set of tuples. If we unify that with the
+// value [aaa, a*], the only possible value that satisfies both is [aaa, aa].
+// Likewise, this is the intersection of the sets described by these two values.
+//
+// Defs are similar to tuples, but they are indexed by strings and don't have a
+// fixed length. For example, {x: a, y: b} is a def with two fields. Any field
+// not mentioned in a def is implicitly top. Thus, unifying this with {y: b, z:
+// c} results in {x: a, y: b, z: c}.
+//
+// Variables constrain values. For example, the value [$x, $x] represents all
+// tuples whose first and second values are the same, but doesn't otherwise
+// constrain that value. Thus, this set includes [a, a] as well as [[b, c, d],
+// [b, c, d]], but it doesn't include [a, b].
+//
+// Sums are internally implemented as fresh variables that are simultaneously
+// bound to all values of the sum. That is, !sum [a, b] is actually $var (where
+// var is some fresh name), closed under the environment $var=a | $var=b.
+package unify
+
+import (
+ "errors"
+ "fmt"
+ "slices"
+)
+
+// Unify computes a Closure that satisfies each input Closure. If no such
+// Closure exists, it returns bottom.
+func Unify(closures ...Closure) (Closure, error) {
+ if len(closures) == 0 {
+ return Closure{topValue, topEnv}, nil
+ }
+
+ var trace *tracer
+ if Debug.UnifyLog != nil || Debug.HTML != nil {
+ trace = &tracer{
+ logw: Debug.UnifyLog,
+ saveTree: Debug.HTML != nil,
+ }
+ }
+
+ unified := closures[0]
+ for _, c := range closures[1:] {
+ var err error
+ uf := newUnifier()
+ uf.tracer = trace
+ e := crossEnvs(unified.env, c.env)
+ unified.val, unified.env, err = unified.val.unify(c.val, e, false, uf)
+ if Debug.HTML != nil {
+ uf.writeHTML(Debug.HTML)
+ }
+ if err != nil {
+ return Closure{}, err
+ }
+ }
+
+ return unified, nil
+}
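+
+// The following is an illustrative sketch of end-to-end use (the YAML inputs
+// and file names are hypothetical; error handling elided):
+//
+//	c1, _ := Read(strings.NewReader("!sum [a, b, c]"), "left.yaml", ReadOpts{})
+//	c2, _ := Read(strings.NewReader("!sum [b, c, d]"), "right.yaml", ReadOpts{})
+//	u, _ := Unify(c1, c2)
+//	for v := range u.All() {
+//		_ = v // the concrete values "b" and "c"
+//	}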
+
+type unifier struct {
+ *tracer
+}
+
+func newUnifier() *unifier {
+ return &unifier{}
+}
+
+// errDomains is a sentinel error used between unify and unify1 to indicate that
+// unify1 could not unify the domains of the two values.
+var errDomains = errors.New("cannot unify domains")
+
+func (v *Value) unify(w *Value, e envSet, swap bool, uf *unifier) (*Value, envSet, error) {
+ if swap {
+ // Put the values in order. This just happens to be a handy choke-point
+ // to do this at.
+ v, w = w, v
+ }
+
+ uf.traceUnify(v, w, e)
+
+ d, e2, err := v.unify1(w, e, false, uf)
+ if err == errDomains {
+ // Try the other order.
+ d, e2, err = w.unify1(v, e, true, uf)
+ if err == errDomains {
+ // Okay, we really can't unify these.
+ err = fmt.Errorf("cannot unify %T (%s) and %T (%s): kind mismatch", v.Domain, v.PosString(), w.Domain, w.PosString())
+ }
+ }
+ if err != nil {
+ uf.traceDone(nil, envSet{}, err)
+ return nil, envSet{}, err
+ }
+ res := unified(d, v, w)
+ uf.traceDone(res, e2, nil)
+ if d == nil {
+ // Double check that a bottom Value also has a bottom env.
+ if !e2.isEmpty() {
+ panic("bottom Value has non-bottom environment")
+ }
+ }
+
+ return res, e2, nil
+}
+
+func (v *Value) unify1(w *Value, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
+ // TODO: If there's an error, attach position information to it.
+
+ vd, wd := v.Domain, w.Domain
+
+ // Bottom returns bottom, and eliminates all possible environments.
+ if vd == nil || wd == nil {
+ return nil, bottomEnv, nil
+ }
+
+ // Top always returns the other.
+ if _, ok := vd.(Top); ok {
+ return wd, e, nil
+ }
+
+ // Variables
+ if vd, ok := vd.(Var); ok {
+ return vd.unify(w, e, swap, uf)
+ }
+
+ // Composite values
+ if vd, ok := vd.(Def); ok {
+ if wd, ok := wd.(Def); ok {
+ return vd.unify(wd, e, swap, uf)
+ }
+ }
+ if vd, ok := vd.(Tuple); ok {
+ if wd, ok := wd.(Tuple); ok {
+ return vd.unify(wd, e, swap, uf)
+ }
+ }
+
+ // Scalar values
+ if vd, ok := vd.(String); ok {
+ if wd, ok := wd.(String); ok {
+ res := vd.unify(wd)
+ if res == nil {
+ e = bottomEnv
+ }
+ return res, e, nil
+ }
+ }
+
+ return nil, envSet{}, errDomains
+}
+
+func (d Def) unify(o Def, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
+ out := Def{fields: make(map[string]*Value)}
+
+ // Check keys of d against o.
+ for key, dv := range d.All() {
+ ov, ok := o.fields[key]
+ if !ok {
+ // ov is implicitly Top. Bypass unification.
+ out.fields[key] = dv
+ continue
+ }
+ exit := uf.enter("%s", key)
+ res, e2, err := dv.unify(ov, e, swap, uf)
+ exit.exit()
+ if err != nil {
+ return nil, envSet{}, err
+ } else if res.Domain == nil {
+ // No match.
+ return nil, bottomEnv, nil
+ }
+ out.fields[key] = res
+ e = e2
+ }
+ // Check keys of o that we didn't already check. These all implicitly match
+ // because we know the corresponding fields in d are all Top.
+	for key, ov := range o.All() {
+		if _, ok := d.fields[key]; !ok {
+			out.fields[key] = ov
+ }
+ }
+ return out, e, nil
+}
+
+func (v Tuple) unify(w Tuple, e envSet, swap bool, uf *unifier) (Domain, envSet, error) {
+ if v.repeat != nil && w.repeat != nil {
+ // Since we generate the content of these lazily, there's not much we
+ // can do but just stick them on a list to unify later.
+ return Tuple{repeat: concat(v.repeat, w.repeat)}, e, nil
+ }
+
+ // Expand any repeated tuples.
+ tuples := make([]Tuple, 0, 2)
+ if v.repeat == nil {
+ tuples = append(tuples, v)
+ } else {
+ v2, e2 := v.doRepeat(e, len(w.vs))
+ tuples = append(tuples, v2...)
+ e = e2
+ }
+ if w.repeat == nil {
+ tuples = append(tuples, w)
+ } else {
+ w2, e2 := w.doRepeat(e, len(v.vs))
+ tuples = append(tuples, w2...)
+ e = e2
+ }
+
+ // Now unify all of the tuples (usually this will be just 2 tuples)
+ out := tuples[0]
+ for _, t := range tuples[1:] {
+ if len(out.vs) != len(t.vs) {
+ uf.logf("tuple length mismatch")
+ return nil, bottomEnv, nil
+ }
+ zs := make([]*Value, len(out.vs))
+ for i, v1 := range out.vs {
+ exit := uf.enter("%d", i)
+ z, e2, err := v1.unify(t.vs[i], e, swap, uf)
+ exit.exit()
+ if err != nil {
+ return nil, envSet{}, err
+ } else if z.Domain == nil {
+ return nil, bottomEnv, nil
+ }
+ zs[i] = z
+ e = e2
+ }
+ out = Tuple{vs: zs}
+ }
+
+ return out, e, nil
+}
+
+// doRepeat expands a repeated tuple into fixed-length tuples of length n, one
+// per generator. The caller is expected to unify the returned tuples.
+func (v Tuple) doRepeat(e envSet, n int) ([]Tuple, envSet) {
+ res := make([]Tuple, len(v.repeat))
+ for i, gen := range v.repeat {
+ res[i].vs = make([]*Value, n)
+ for j := range n {
+ res[i].vs[j], e = gen(e)
+ }
+ }
+ return res, e
+}
+
+// unify intersects the domains of two [String]s. If it can prove that this
+// domain is empty, it returns nil (bottom).
+//
+// TODO: Consider splitting literals and regexps into two domains.
+func (v String) unify(w String) Domain {
+ // Unification is symmetric, so put them in order of string kind so we only
+ // have to deal with half the cases.
+ if v.kind > w.kind {
+ v, w = w, v
+ }
+
+ switch v.kind {
+ case stringRegex:
+ switch w.kind {
+ case stringRegex:
+ // Construct a match against all of the regexps
+ return String{kind: stringRegex, re: slices.Concat(v.re, w.re)}
+ case stringExact:
+ for _, re := range v.re {
+ if !re.MatchString(w.exact) {
+ return nil
+ }
+ }
+ return w
+ }
+ case stringExact:
+ if v.exact != w.exact {
+ return nil
+ }
+ return v
+ }
+ panic("bad string kind")
+}
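+
+// Illustrative sketch (package-internal): unifying a regexp String with an
+// exact String keeps the exact string when it matches and yields nil (bottom)
+// otherwise.
+//
+//	re, _ := NewStringRegex("ab?c")
+//	_ = re.unify(NewStringExact("abc")) // the exact String "abc"
+//	_ = re.unify(NewStringExact("xyz")) // nil: the intersection is empty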
+
+func concat[T any](s1, s2 []T) []T {
+ // Reuse s1 or s2 if possible.
+ if len(s1) == 0 {
+ return s2
+ }
+ return append(s1[:len(s1):len(s1)], s2...)
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "os"
+ "path/filepath"
+ "slices"
+ "strings"
+ "testing"
+
+ "gopkg.in/yaml.v3"
+)
+
+func TestUnify(t *testing.T) {
+ paths, err := filepath.Glob("testdata/*")
+ if err != nil {
+ t.Fatal(err)
+ }
+ if len(paths) == 0 {
+ t.Fatal("no testdata found")
+ }
+ for _, path := range paths {
+ // Skip paths starting with _ so experimental files can be added.
+ base := filepath.Base(path)
+ if base[0] == '_' {
+ continue
+ }
+ if !strings.HasSuffix(base, ".yaml") {
+ t.Errorf("non-.yaml file in testdata: %s", base)
+ continue
+ }
+ base = strings.TrimSuffix(base, ".yaml")
+
+ t.Run(base, func(t *testing.T) {
+ testUnify(t, path)
+ })
+ }
+}
+
+func testUnify(t *testing.T, path string) {
+ f, err := os.Open(path)
+ if err != nil {
+ t.Fatal(err)
+ }
+ defer f.Close()
+
+ type testCase struct {
+ Skip bool
+ Name string
+ Unify []Closure
+ Want yaml.Node
+ All yaml.Node
+ }
+ dec := yaml.NewDecoder(f)
+
+ for i := 0; ; i++ {
+ var tc testCase
+ err := dec.Decode(&tc)
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ name := tc.Name
+ if name == "" {
+ name = fmt.Sprint(i)
+ }
+
+ t.Run(name, func(t *testing.T) {
+ if tc.Skip {
+ t.Skip("skip: true set in test case")
+ }
+
+ defer func() {
+ p := recover()
+ if p != nil || t.Failed() {
+ // Redo with a trace
+ //
+ // TODO: Use t.Output() in Go 1.25.
+ var buf bytes.Buffer
+ Debug.UnifyLog = &buf
+ func() {
+ defer func() {
+ // If the original unify panicked, the second one
+ // probably will, too. Ignore it and let the first panic
+ // bubble.
+ recover()
+ }()
+ Unify(tc.Unify...)
+ }()
+ Debug.UnifyLog = nil
+ t.Logf("Trace:\n%s", buf.String())
+ }
+ if p != nil {
+ panic(p)
+ }
+ }()
+
+ // Unify the test cases
+ //
+ // TODO: Try reordering the inputs also
+ c, err := Unify(tc.Unify...)
+ if err != nil {
+ // TODO: Tests of errors
+ t.Fatal(err)
+ }
+
+ // Encode the result back to YAML so we can check if it's structurally
+ // equal.
+ clean := func(val any) *yaml.Node {
+ var node yaml.Node
+ node.Encode(val)
+ for n := range allYamlNodes(&node) {
+ // Canonicalize the style. There may be other style flags we need to
+ // muck with.
+ n.Style &^= yaml.FlowStyle
+ n.HeadComment = ""
+ n.LineComment = ""
+ n.FootComment = ""
+ }
+ return &node
+ }
+ check := func(gotVal any, wantNode *yaml.Node) {
+ got, err := yaml.Marshal(clean(gotVal))
+ if err != nil {
+ t.Fatalf("Encoding Value back to yaml failed: %s", err)
+ }
+ want, err := yaml.Marshal(clean(wantNode))
+ if err != nil {
+ t.Fatalf("Encoding Want back to yaml failed: %s", err)
+ }
+
+ if !bytes.Equal(got, want) {
+ t.Errorf("%s:%d:\nwant:\n%sgot\n%s", f.Name(), wantNode.Line, want, got)
+ }
+ }
+ if tc.Want.Kind != 0 {
+ check(c.val, &tc.Want)
+ }
+ if tc.All.Kind != 0 {
+ fVal := slices.Collect(c.All())
+ check(fVal, &tc.All)
+ }
+ })
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "fmt"
+ "iter"
+ "reflect"
+)
+
+// A Value represents a structured, non-deterministic value consisting of
+// strings, tuples of Values, and string-keyed maps of Values. A
+// non-deterministic Value will also contain variables, which are resolved via
+// an environment as part of a [Closure].
+//
+// For debugging, a Value can also track the source position it was read from in
+// an input file, and its provenance from other Values.
+type Value struct {
+ Domain Domain
+
+ // A Value has either a pos or parents (or neither).
+ pos *Pos
+ parents *[2]*Value
+}
+
+var (
+ topValue = &Value{Domain: Top{}}
+ bottomValue = &Value{Domain: nil}
+)
+
+// NewValue returns a new [Value] with the given domain and no position
+// information.
+func NewValue(d Domain) *Value {
+ return &Value{Domain: d}
+}
+
+// NewValuePos returns a new [Value] with the given domain at position p.
+func NewValuePos(d Domain, p Pos) *Value {
+ return &Value{Domain: d, pos: &p}
+}
+
+// newValueFrom returns a new [Value] with the given domain that copies the
+// position information of p.
+func newValueFrom(d Domain, p *Value) *Value {
+ return &Value{Domain: d, pos: p.pos, parents: p.parents}
+}
+
+func unified(d Domain, p1, p2 *Value) *Value {
+ return &Value{Domain: d, parents: &[2]*Value{p1, p2}}
+}
+
+func (v *Value) Pos() Pos {
+ if v.pos == nil {
+ return Pos{}
+ }
+ return *v.pos
+}
+
+func (v *Value) PosString() string {
+ var b []byte
+ for root := range v.Provenance() {
+ if len(b) > 0 {
+ b = append(b, ' ')
+ }
+ b, _ = root.pos.AppendText(b)
+ }
+ return string(b)
+}
+
+func (v *Value) WhyNotExact() string {
+ if v.Domain == nil {
+ return "v.Domain is nil"
+ }
+ return v.Domain.WhyNotExact()
+}
+
+func (v *Value) Exact() bool {
+ if v.Domain == nil {
+ return false
+ }
+ return v.Domain.Exact()
+}
+
+// Decode decodes v into a Go value.
+//
+// v must be exact, except that it can include Top. into must be a pointer.
+// [Def]s are decoded into structs. [Tuple]s are decoded into slices. [String]s
+// are decoded into strings or ints. Any field can itself be a pointer to one of
+// these types. Top can be decoded into a pointer-typed field and will set the
+// field to nil. Anything else will allocate a value if necessary.
+//
+// Any type may implement [Decoder], in which case its DecodeUnified method will
+// be called instead of using the default decoding scheme.
+func (v *Value) Decode(into any) error {
+ rv := reflect.ValueOf(into)
+ if rv.Kind() != reflect.Pointer {
+ return fmt.Errorf("cannot decode into non-pointer %T", into)
+ }
+ return decodeReflect(v, rv.Elem())
+}
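+
+// The following is an illustrative sketch of decoding an exact Def into a
+// struct (the struct and its fields are hypothetical; v is assumed to be an
+// exact *Value such as {go: Add, base: int}):
+//
+//	var op struct {
+//		Go   string
+//		Base string
+//	}
+//	if err := v.Decode(&op); err != nil {
+//		// v was not exact enough to decode, or the shapes did not match.
+//	}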
+
+func decodeReflect(v *Value, rv reflect.Value) error {
+ var ptr reflect.Value
+ if rv.Kind() == reflect.Pointer {
+ if rv.IsNil() {
+ // Transparently allocate through pointers, *except* for Top, which
+ // wants to set the pointer to nil.
+ //
+ // TODO: Drop this condition if I switch to an explicit Optional[T]
+ // or move the Top logic into Def.
+ if _, ok := v.Domain.(Top); !ok {
+ // Allocate the value to fill in, but don't actually store it in
+ // the pointer until we successfully decode.
+ ptr = rv
+ rv = reflect.New(rv.Type().Elem()).Elem()
+ }
+ } else {
+ rv = rv.Elem()
+ }
+ }
+
+ var err error
+ if reflect.PointerTo(rv.Type()).Implements(decoderType) {
+ // Use the custom decoder.
+ err = rv.Addr().Interface().(Decoder).DecodeUnified(v)
+ } else {
+ err = v.Domain.decode(rv)
+ }
+ if err == nil && ptr.IsValid() {
+ ptr.Set(rv.Addr())
+ }
+ return err
+}
+
+// Decoder can be implemented by a type to provide a custom implementation of
+// [Decode] for that type.
+type Decoder interface {
+ DecodeUnified(v *Value) error
+}
+
+var decoderType = reflect.TypeOf((*Decoder)(nil)).Elem()
+
+// Provenance iterates over all of the source Values that have contributed to
+// this Value.
+func (v *Value) Provenance() iter.Seq[*Value] {
+ return func(yield func(*Value) bool) {
+ var rec func(d *Value) bool
+ rec = func(d *Value) bool {
+ if d.pos != nil {
+ if !yield(d) {
+ return false
+ }
+ }
+ if d.parents != nil {
+ for _, p := range d.parents {
+ if !rec(p) {
+ return false
+ }
+ }
+ }
+ return true
+ }
+ rec(v)
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "reflect"
+ "slices"
+ "testing"
+)
+
+func ExampleClosure_All_tuple() {
+ v := mustParse(`
+- !sum [1, 2]
+- !sum [3, 4]
+`)
+ printYaml(slices.Collect(v.All()))
+
+ // Output:
+ // - [1, 3]
+ // - [1, 4]
+ // - [2, 3]
+ // - [2, 4]
+}
+
+func ExampleClosure_All_def() {
+ v := mustParse(`
+a: !sum [1, 2]
+b: !sum [3, 4]
+c: 5
+`)
+ printYaml(slices.Collect(v.All()))
+
+ // Output:
+ // - {a: 1, b: 3, c: 5}
+ // - {a: 1, b: 4, c: 5}
+ // - {a: 2, b: 3, c: 5}
+ // - {a: 2, b: 4, c: 5}
+}
+
+func checkDecode[T any](t *testing.T, got *Value, want T) {
+ var gotT T
+ if err := got.Decode(&gotT); err != nil {
+ t.Fatalf("Decode failed: %v", err)
+ }
+ if !reflect.DeepEqual(&gotT, &want) {
+ t.Fatalf("got:\n%s\nwant:\n%s", prettyYaml(gotT), prettyYaml(want))
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "io/fs"
+ "os"
+ "path/filepath"
+ "regexp"
+ "strings"
+
+ "gopkg.in/yaml.v3"
+)
+
+// ReadOpts provides options to [Read] and related functions. The zero value is
+// the default options.
+type ReadOpts struct {
+ // FS, if non-nil, is the file system from which to resolve !import file
+ // names.
+ FS fs.FS
+}
+
+// Read reads a [Closure] in YAML format from r, using path for error messages.
+//
+// It maps YAML nodes into terminal Values as follows:
+//
+// - "_" or !top _ is the top value ([Top]).
+//
+// - "_|_" or !bottom _ is the bottom value. This is an error during
+// unmarshaling, but can appear in marshaled values.
+//
+// - "$<name>" or !var <name> is a variable ([Var]). Everywhere the same name
+// appears within a single unmarshal operation, it is mapped to the same
+// variable. Different unmarshal operations get different variables, even if
+// they have the same string name.
+//
+// - !regex "x" is a regular expression ([String]), as is any string that
+// doesn't match "_", "_|_", or "$...". Regular expressions are implicitly
+// anchored at the beginning and end. If the string doesn't contain any
+// meta-characters (that is, it's a "literal" regular expression), then it's
+// treated as an exact string.
+//
+// - !string "x", or any int, float, bool, or binary value is an exact string
+// ([String]).
+//
+// - !regex [x, y, ...] is an intersection of regular expressions ([String]).
+//
+// It maps YAML nodes into non-terminal Values as follows:
+//
+// - Sequence nodes like [x, y, z] are tuples ([Tuple]).
+//
+// - !repeat [x] is a repeated tuple ([Tuple]), which is 0 or more instances of
+// x. There must be exactly one element in the list.
+//
+// - Mapping nodes like {a: x, b: y} are defs ([Def]). Any fields not listed are
+// implicitly top.
+//
+// - !sum [x, y, z] is a sum of its children. This can be thought of as a union
+// of the values x, y, and z, or as a non-deterministic choice between x, y, and
+// z. If a variable appears both inside the sum and outside of it, only the
+// non-deterministic choice view really works. The unifier does not directly
+// implement sums; instead, this is decoded as a fresh variable that's
+// simultaneously bound to x, y, and z.
+//
+// - !import glob is like a !sum, but its children are read from all files
+// matching the given glob pattern, which is interpreted relative to the current
+// file path. Each file gets its own variable scope.
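+//
+// For example (an illustrative sketch), reading a small document and
+// enumerating its unified values:
+//
+//	cl, err := unify.Read(strings.NewReader("a: !sum [1, 2]"), "doc.yaml", unify.ReadOpts{})
+//	if err != nil {
+//		// handle the error
+//	}
+//	for v := range cl.All() {
+//		fmt.Println(v)
+//	}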
+func Read(r io.Reader, path string, opts ReadOpts) (Closure, error) {
+ dec := yamlDecoder{opts: opts, path: path, env: topEnv}
+ v, err := dec.read(r)
+ if err != nil {
+ return Closure{}, err
+ }
+ return dec.close(v), nil
+}
+
+// ReadFile reads a [Closure] in YAML format from a file.
+//
+// The file must consist of a single YAML document.
+//
+// If opts.FS is not set, this sets it to an FS rooted at path's directory.
+//
+// See [Read] for details.
+func ReadFile(path string, opts ReadOpts) (Closure, error) {
+ f, err := os.Open(path)
+ if err != nil {
+ return Closure{}, err
+ }
+ defer f.Close()
+
+ if opts.FS == nil {
+ opts.FS = os.DirFS(filepath.Dir(path))
+ }
+
+ return Read(f, path, opts)
+}
+
+// UnmarshalYAML implements [yaml.Unmarshaler].
+//
+// Since there is no way to pass [ReadOpts] to this function, it assumes default
+// options.
+func (c *Closure) UnmarshalYAML(node *yaml.Node) error {
+ dec := yamlDecoder{path: "<yaml.Node>", env: topEnv}
+ v, err := dec.root(node)
+ if err != nil {
+ return err
+ }
+ *c = dec.close(v)
+ return nil
+}
+
+type yamlDecoder struct {
+ opts ReadOpts
+ path string
+
+ vars map[string]*ident
+ nSums int
+
+ env envSet
+}
+
+func (dec *yamlDecoder) read(r io.Reader) (*Value, error) {
+ n, err := readOneNode(r)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %w", dec.path, err)
+ }
+
+ // Decode YAML node to a Value
+ v, err := dec.root(n)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %w", dec.path, err)
+ }
+
+ return v, nil
+}
+
+// readOneNode reads a single YAML document from r and returns an error if there
+// are more documents in r.
+func readOneNode(r io.Reader) (*yaml.Node, error) {
+ yd := yaml.NewDecoder(r)
+
+ // Decode as a YAML node
+ var node yaml.Node
+ if err := yd.Decode(&node); err != nil {
+ return nil, err
+ }
+ np := &node
+ if np.Kind == yaml.DocumentNode {
+ np = node.Content[0]
+ }
+
+ // Ensure there are no more YAML docs in this file
+ if err := yd.Decode(nil); err == nil {
+ return nil, fmt.Errorf("must not contain multiple documents")
+ } else if err != io.EOF {
+ return nil, err
+ }
+
+ return np, nil
+}
+
+// root parses the root of a file.
+func (dec *yamlDecoder) root(node *yaml.Node) (*Value, error) {
+ // Prepare for variable name resolution in this file. This may be a nested
+ // root, so restore the current values when we're done.
+ oldVars, oldNSums := dec.vars, dec.nSums
+ defer func() {
+ dec.vars, dec.nSums = oldVars, oldNSums
+ }()
+	dec.vars = make(map[string]*ident)
+ dec.nSums = 0
+
+ return dec.value(node)
+}
+
+// close wraps a decoded [Value] into a [Closure].
+func (dec *yamlDecoder) close(v *Value) Closure {
+ return Closure{v, dec.env}
+}
+
+func (dec *yamlDecoder) value(node *yaml.Node) (vOut *Value, errOut error) {
+ pos := &Pos{Path: dec.path, Line: node.Line}
+
+ // Resolve alias nodes.
+ if node.Kind == yaml.AliasNode {
+ node = node.Alias
+ }
+
+ mk := func(d Domain) (*Value, error) {
+ v := &Value{Domain: d, pos: pos}
+ return v, nil
+ }
+ mk2 := func(d Domain, err error) (*Value, error) {
+ if err != nil {
+ return nil, err
+ }
+ return mk(d)
+ }
+
+ // is tests the kind and long tag of node.
+ is := func(kind yaml.Kind, tag string) bool {
+ return node.Kind == kind && node.LongTag() == tag
+ }
+ isExact := func() bool {
+ if node.Kind != yaml.ScalarNode {
+ return false
+ }
+ // We treat any string-ish YAML node as a string.
+ switch node.LongTag() {
+ case "!string", "tag:yaml.org,2002:int", "tag:yaml.org,2002:float", "tag:yaml.org,2002:bool", "tag:yaml.org,2002:binary":
+ return true
+ }
+ return false
+ }
+
+ // !!str nodes provide a short-hand syntax for several leaf domains that are
+ // also available under explicit tags. To simplify checking below, we set
+ // strVal to non-"" only for !!str nodes.
+ strVal := ""
+ isStr := is(yaml.ScalarNode, "tag:yaml.org,2002:str")
+ if isStr {
+ strVal = node.Value
+ }
+
+ switch {
+ case is(yaml.ScalarNode, "!var"):
+ strVal = "$" + node.Value
+ fallthrough
+ case strings.HasPrefix(strVal, "$"):
+ id, ok := dec.vars[strVal]
+ if !ok {
+ // We encode different idents with the same string name by adding a
+ // #N suffix. Strip that off so it doesn't accumulate. This isn't
+ // meant to be used in user-written input, though nothing stops that.
+ name, _, _ := strings.Cut(strVal, "#")
+ id = &ident{name: name}
+ dec.vars[strVal] = id
+ dec.env = dec.env.bind(id, topValue)
+ }
+ return mk(Var{id: id})
+
+ case strVal == "_" || is(yaml.ScalarNode, "!top"):
+ return mk(Top{})
+
+ case strVal == "_|_" || is(yaml.ScalarNode, "!bottom"):
+ return nil, errors.New("found bottom")
+
+ case isExact():
+ val := node.Value
+ return mk(NewStringExact(val))
+
+ case isStr || is(yaml.ScalarNode, "!regex"):
+ // Any other string we treat as a regex. This will produce an exact
+ // string anyway if the regex is literal.
+ val := node.Value
+ return mk2(NewStringRegex(val))
+
+ case is(yaml.SequenceNode, "!regex"):
+ var vals []string
+ if err := node.Decode(&vals); err != nil {
+ return nil, err
+ }
+ return mk2(NewStringRegex(vals...))
+
+ case is(yaml.MappingNode, "tag:yaml.org,2002:map"):
+ var db DefBuilder
+ for i := 0; i < len(node.Content); i += 2 {
+ key := node.Content[i]
+ if key.Kind != yaml.ScalarNode {
+ return nil, fmt.Errorf("non-scalar key %q", key.Value)
+ }
+ val, err := dec.value(node.Content[i+1])
+ if err != nil {
+ return nil, err
+ }
+ db.Add(key.Value, val)
+ }
+ return mk(db.Build())
+
+ case is(yaml.SequenceNode, "tag:yaml.org,2002:seq"):
+ elts := node.Content
+ vs := make([]*Value, 0, len(elts))
+ for _, elt := range elts {
+ v, err := dec.value(elt)
+ if err != nil {
+ return nil, err
+ }
+ vs = append(vs, v)
+ }
+ return mk(NewTuple(vs...))
+
+ case is(yaml.SequenceNode, "!repeat") || is(yaml.SequenceNode, "!repeat-unify"):
+		// !repeat must have exactly one child. !repeat-unify is used internally
+		// for delayed unification and is the same, except that it may have more
+		// than one child.
+ if node.LongTag() == "!repeat" && len(node.Content) != 1 {
+ return nil, fmt.Errorf("!repeat must have exactly one child")
+ }
+
+ // Decode the children to make sure they're well-formed, but otherwise
+ // discard that decoding and do it again every time we need a new
+ // element.
+ var gen []func(e envSet) (*Value, envSet)
+ origEnv := dec.env
+ elts := node.Content
+ for i, elt := range elts {
+ _, err := dec.value(elt)
+ if err != nil {
+ return nil, err
+ }
+ // Undo any effects on the environment. We *do* keep any named
+ // variables that were added to the vars map in case they were
+ // introduced within the element.
+ //
+ // TODO: If we change how we implement repeat nodes, we might be
+ // able to drop yamlEncoder.env and yamlDecoder.env.
+ dec.env = origEnv
+ // Add a generator function
+ gen = append(gen, func(e envSet) (*Value, envSet) {
+ dec.env = e
+ // TODO: If this is in a sum, this tends to generate a ton of
+ // fresh variables that are different on each branch of the
+ // parent sum. Does it make sense to hold on to the i'th value
+ // of the tuple after we've generated it?
+ v, err := dec.value(elts[i])
+ if err != nil {
+					// It worked the first time, so this really shouldn't happen.
+ panic("decoding repeat element failed")
+ }
+ return v, dec.env
+ })
+ }
+ return mk(NewRepeat(gen...))
+
+ case is(yaml.SequenceNode, "!sum"):
+ vs := make([]*Value, 0, len(node.Content))
+ for _, elt := range node.Content {
+ v, err := dec.value(elt)
+ if err != nil {
+ return nil, err
+ }
+ vs = append(vs, v)
+ }
+ if len(vs) == 1 {
+ return vs[0], nil
+ }
+
+ // A sum is implemented as a fresh variable that's simultaneously bound
+ // to each of the descendants.
+ id := &ident{name: fmt.Sprintf("sum%d", dec.nSums)}
+ dec.nSums++
+ dec.env = dec.env.bind(id, vs...)
+ return mk(Var{id: id})
+
+ case is(yaml.ScalarNode, "!import"):
+ if dec.opts.FS == nil {
+ return nil, fmt.Errorf("!import not allowed (ReadOpts.FS not set)")
+ }
+ pat := node.Value
+
+ if !fs.ValidPath(pat) {
+ // This will result in Glob returning no results. Give a more useful
+ // error message for this case.
+ return nil, fmt.Errorf("!import path must not contain '.' or '..'")
+ }
+
+ ms, err := fs.Glob(dec.opts.FS, pat)
+ if err != nil {
+ return nil, fmt.Errorf("resolving !import: %w", err)
+ }
+ if len(ms) == 0 {
+ return nil, fmt.Errorf("!import did not match any files")
+ }
+
+ // Parse each file
+ vs := make([]*Value, 0, len(ms))
+ for _, m := range ms {
+ v, err := dec.import1(m)
+ if err != nil {
+ return nil, err
+ }
+ vs = append(vs, v)
+ }
+
+ // Create a sum.
+ if len(vs) == 1 {
+ return vs[0], nil
+ }
+ id := &ident{name: "import"}
+ dec.env = dec.env.bind(id, vs...)
+ return mk(Var{id: id})
+ }
+
+ return nil, fmt.Errorf("unknown node kind %d %v", node.Kind, node.Tag)
+}
+
+func (dec *yamlDecoder) import1(path string) (*Value, error) {
+ // Make sure we can open the path first.
+ f, err := dec.opts.FS.Open(path)
+ if err != nil {
+ return nil, fmt.Errorf("!import failed: %w", err)
+ }
+ defer f.Close()
+
+	// Save the current FS and path so we can restore them when we're done.
+ oldFS, oldPath := dec.opts.FS, dec.path
+ defer func() {
+ dec.opts.FS, dec.path = oldFS, oldPath
+ }()
+
+ // Enter path, which is relative to the current path's directory.
+ newPath := filepath.Join(filepath.Dir(dec.path), path)
+ subFS, err := fs.Sub(dec.opts.FS, filepath.Dir(path))
+ if err != nil {
+ return nil, err
+ }
+ dec.opts.FS, dec.path = subFS, newPath
+
+ // Parse the file.
+ return dec.read(f)
+}
+
+type yamlEncoder struct {
+ idp identPrinter
+ e envSet // We track the environment for !repeat nodes.
+}
+
+// TODO: Switch some Value marshaling to Closure?
+
+func (c Closure) MarshalYAML() (any, error) {
+ // TODO: If the environment is trivial, just marshal the value.
+ enc := &yamlEncoder{}
+ return enc.closure(c), nil
+}
+
+func (c Closure) String() string {
+ b, err := yaml.Marshal(c)
+ if err != nil {
+ return fmt.Sprintf("marshal failed: %s", err)
+ }
+ return string(b)
+}
+
+func (v *Value) MarshalYAML() (any, error) {
+ enc := &yamlEncoder{e: topEnv}
+ return enc.value(v), nil
+}
+
+func (v *Value) String() string {
+ b, err := yaml.Marshal(v)
+ if err != nil {
+ return fmt.Sprintf("marshal failed: %s", err)
+ }
+ return string(b)
+}
+
+func (enc *yamlEncoder) closure(c Closure) *yaml.Node {
+ enc.e = c.env
+ var n yaml.Node
+ n.Kind = yaml.MappingNode
+ n.Tag = "!closure"
+ n.Content = make([]*yaml.Node, 4)
+ n.Content[0] = new(yaml.Node)
+ n.Content[0].SetString("env")
+ n.Content[2] = new(yaml.Node)
+ n.Content[2].SetString("in")
+ n.Content[3] = enc.value(c.val)
+ // Fill in the env after we've written the value in case value encoding
+ // affects the env.
+ n.Content[1] = enc.env(enc.e)
+ enc.e = envSet{} // Allow GC'ing the env
+ return &n
+}
+
+func (enc *yamlEncoder) env(e envSet) *yaml.Node {
+ var encode func(e *envExpr) *yaml.Node
+ encode = func(e *envExpr) *yaml.Node {
+ var n yaml.Node
+ switch e.kind {
+ default:
+ panic("bad kind")
+ case envZero:
+ n.SetString("0")
+ case envUnit:
+ n.SetString("1")
+ case envBinding:
+ var id yaml.Node
+ id.SetString(enc.idp.unique(e.id))
+ n.Kind = yaml.MappingNode
+ n.Content = []*yaml.Node{&id, enc.value(e.val)}
+ case envProduct, envSum:
+ n.Kind = yaml.SequenceNode
+ if e.kind == envProduct {
+ n.Tag = "!product"
+ } else {
+ n.Tag = "!sum"
+ }
+ for _, e2 := range e.operands {
+ n.Content = append(n.Content, encode(e2))
+ }
+ }
+ return &n
+ }
+ return encode(e.root)
+}
+
+var yamlIntRe = regexp.MustCompile(`^-?[0-9]+$`)
+
+func (enc *yamlEncoder) value(v *Value) *yaml.Node {
+ var n yaml.Node
+ switch d := v.Domain.(type) {
+ case nil:
+ // Not allowed by unmarshaler, but useful for understanding when
+ // something goes horribly wrong.
+ //
+ // TODO: We might be able to track useful provenance for this, which
+ // would really help with debugging unexpected bottoms.
+ n.SetString("_|_")
+ return &n
+
+ case Top:
+ n.SetString("_")
+ return &n
+
+ case Def:
+ n.Kind = yaml.MappingNode
+ for k, elt := range d.All() {
+ var kn yaml.Node
+ kn.SetString(k)
+ n.Content = append(n.Content, &kn, enc.value(elt))
+ }
+ n.HeadComment = v.PosString()
+ return &n
+
+ case Tuple:
+ n.Kind = yaml.SequenceNode
+ if d.repeat == nil {
+ for _, elt := range d.vs {
+ n.Content = append(n.Content, enc.value(elt))
+ }
+ } else {
+ if len(d.repeat) == 1 {
+ n.Tag = "!repeat"
+ } else {
+ n.Tag = "!repeat-unify"
+ }
+ // TODO: I'm not positive this will round-trip everything correctly.
+ for _, gen := range d.repeat {
+ v, e := gen(enc.e)
+ enc.e = e
+ n.Content = append(n.Content, enc.value(v))
+ }
+ }
+ return &n
+
+ case String:
+ switch d.kind {
+ case stringExact:
+ n.SetString(d.exact)
+ switch {
+ // Make this into a "nice" !!int node if I can.
+ case yamlIntRe.MatchString(d.exact):
+ n.Tag = "tag:yaml.org,2002:int"
+
+ // Or a "nice" !!bool node.
+ case d.exact == "false" || d.exact == "true":
+ n.Tag = "tag:yaml.org,2002:bool"
+
+ // If this doesn't require escaping, leave it as a str node to avoid
+ // the annoying YAML tags. Otherwise, mark it as an exact string.
+ // Alternatively, we could always emit a str node with regexp
+ // quoting.
+ case d.exact != regexp.QuoteMeta(d.exact):
+ n.Tag = "!string"
+ }
+ return &n
+ case stringRegex:
+ o := make([]string, 0, 1)
+ for _, re := range d.re {
+ s := re.String()
+ s = strings.TrimSuffix(strings.TrimPrefix(s, `\A(?:`), `)\z`)
+ o = append(o, s)
+ }
+ if len(o) == 1 {
+ n.SetString(o[0])
+ return &n
+ }
+ n.Encode(o)
+ n.Tag = "!regex"
+ return &n
+ }
+ panic("bad String kind")
+
+ case Var:
+ // TODO: If Var only appears once in the whole Value and is independent
+ // in the environment (part of a term that is only over Var), then emit
+ // this as a !sum instead.
+ if false {
+ var vs []*Value // TODO: Get values of this var.
+ if len(vs) == 1 {
+ return enc.value(vs[0])
+ }
+ n.Kind = yaml.SequenceNode
+ n.Tag = "!sum"
+ for _, elt := range vs {
+ n.Content = append(n.Content, enc.value(elt))
+ }
+ return &n
+ }
+ n.SetString(enc.idp.unique(d.id))
+ if !strings.HasPrefix(d.id.name, "$") {
+ n.Tag = "!var"
+ }
+ return &n
+ }
+ panic(fmt.Sprintf("unknown domain type %T", v.Domain))
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package unify
+
+import (
+ "bytes"
+ "fmt"
+ "iter"
+ "strings"
+ "testing"
+ "testing/fstest"
+
+ "gopkg.in/yaml.v3"
+)
+
+func mustParse(expr string) Closure {
+ var c Closure
+ if err := yaml.Unmarshal([]byte(expr), &c); err != nil {
+ panic(err)
+ }
+ return c
+}
+
+func oneValue(t *testing.T, c Closure) *Value {
+ t.Helper()
+ var v *Value
+ var i int
+ for v = range c.All() {
+ i++
+ }
+ if i != 1 {
+ t.Fatalf("expected 1 value, got %d", i)
+ }
+ return v
+}
+
+func printYaml(val any) {
+ fmt.Println(prettyYaml(val))
+}
+
+func prettyYaml(val any) string {
+ b, err := yaml.Marshal(val)
+ if err != nil {
+ panic(err)
+ }
+ var node yaml.Node
+ if err := yaml.Unmarshal(b, &node); err != nil {
+ panic(err)
+ }
+
+ // Map lines to start offsets. We'll use this to figure out when nodes are
+ // "small" and should use inline style.
+ lines := []int{-1, 0}
+ for pos := 0; pos < len(b); {
+ next := bytes.IndexByte(b[pos:], '\n')
+ if next == -1 {
+ break
+ }
+ pos += next + 1
+ lines = append(lines, pos)
+ }
+ lines = append(lines, len(b))
+
+ // Strip comments and switch small nodes to inline style
+ cleanYaml(&node, lines, len(b))
+
+ b, err = yaml.Marshal(&node)
+ if err != nil {
+ panic(err)
+ }
+ return string(b)
+}
+
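+// cleanYaml strips comments and switches "small" mappings and sequences (those
+// spanning fewer than about 40 bytes of marshaled text) to YAML flow style.
+// For example, a short mapping that would otherwise render as
+//
+//	a: 1
+//	b: 3
+//
+// is re-rendered inline as {a: 1, b: 3}, which is the form the Example outputs
+// in this package use.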
+func cleanYaml(node *yaml.Node, lines []int, endPos int) {
+ node.HeadComment = ""
+ node.FootComment = ""
+ node.LineComment = ""
+
+ for i, n2 := range node.Content {
+ end2 := endPos
+ if i < len(node.Content)-1 {
+ end2 = lines[node.Content[i+1].Line]
+ }
+ cleanYaml(n2, lines, end2)
+ }
+
+ // Use inline style?
+ switch node.Kind {
+ case yaml.MappingNode, yaml.SequenceNode:
+ if endPos-lines[node.Line] < 40 {
+ node.Style = yaml.FlowStyle
+ }
+ }
+}
+
+func allYamlNodes(n *yaml.Node) iter.Seq[*yaml.Node] {
+ return func(yield func(*yaml.Node) bool) {
+ if !yield(n) {
+ return
+ }
+ for _, n2 := range n.Content {
+ for n3 := range allYamlNodes(n2) {
+ if !yield(n3) {
+ return
+ }
+ }
+ }
+ }
+}
+
+func TestRoundTripString(t *testing.T) {
+ // Check that we can round-trip a string with regexp meta-characters in it.
+ const y = `!string test*`
+ t.Logf("input:\n%s", y)
+
+ v1 := oneValue(t, mustParse(y))
+ var buf1 strings.Builder
+ enc := yaml.NewEncoder(&buf1)
+ if err := enc.Encode(v1); err != nil {
+		t.Fatal(err)
+ }
+ enc.Close()
+ t.Logf("after parse 1:\n%s", buf1.String())
+
+ v2 := oneValue(t, mustParse(buf1.String()))
+ var buf2 strings.Builder
+ enc = yaml.NewEncoder(&buf2)
+ if err := enc.Encode(v2); err != nil {
+		t.Fatal(err)
+ }
+ enc.Close()
+ t.Logf("after parse 2:\n%s", buf2.String())
+
+ if buf1.String() != buf2.String() {
+ t.Fatal("parse 1 and parse 2 differ")
+ }
+}
+
+func TestEmptyString(t *testing.T) {
+ // Regression test. Make sure an empty string is parsed as an exact string,
+ // not a regexp.
+ const y = `""`
+ t.Logf("input:\n%s", y)
+
+ v1 := oneValue(t, mustParse(y))
+ if !v1.Exact() {
+ t.Fatal("expected exact string")
+ }
+}
+
+func TestImport(t *testing.T) {
+ // Test a basic import
+ main := strings.NewReader("!import x/y.yaml")
+ fs := fstest.MapFS{
+ // Test a glob import with a relative path
+ "x/y.yaml": {Data: []byte("!import y/*.yaml")},
+ "x/y/z.yaml": {Data: []byte("42")},
+ }
+ cl, err := Read(main, "x.yaml", ReadOpts{FS: fs})
+ if err != nil {
+ t.Fatal(err)
+ }
+ x := 42
+ checkDecode(t, oneValue(t, cl), &x)
+}
+
+func TestImportEscape(t *testing.T) {
+ // Make sure an import can't escape its subdirectory.
+ main := strings.NewReader("!import x/y.yaml")
+ fs := fstest.MapFS{
+ "x/y.yaml": {Data: []byte("!import ../y/*.yaml")},
+ "y/z.yaml": {Data: []byte("42")},
+ }
+ _, err := Read(main, "x.yaml", ReadOpts{FS: fs})
+ if err == nil {
+ t.Fatal("relative !import should have failed")
+ }
+ if !strings.Contains(err.Error(), "must not contain") {
+ t.Fatalf("unexpected error %v", err)
+ }
+}
+
+func TestImportScope(t *testing.T) {
+ // Test that imports have different variable scopes.
+ main := strings.NewReader("[!import y.yaml, !import y.yaml]")
+ fs := fstest.MapFS{
+ "y.yaml": {Data: []byte("$v")},
+ }
+ cl1, err := Read(main, "x.yaml", ReadOpts{FS: fs})
+ if err != nil {
+ t.Fatal(err)
+ }
+ cl2 := mustParse("[1, 2]")
+ res, err := Unify(cl1, cl2)
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkDecode(t, oneValue(t, res), []int{1, 2})
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Int8x16) Less(y Int8x16) Mask8x16 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Int8x16) GreaterEqual(y Int8x16) Mask8x16 {
+ ones := x.Equal(x).AsInt8x16()
+ return y.Greater(x).AsInt8x16().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Int8x16) LessEqual(y Int8x16) Mask8x16 {
+ ones := x.Equal(x).AsInt8x16()
+ return x.Greater(y).AsInt8x16().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Int8x16) NotEqual(y Int8x16) Mask8x16 {
+ ones := x.Equal(x).AsInt8x16()
+ return x.Equal(y).AsInt8x16().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Int16x8) Less(y Int16x8) Mask16x8 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Int16x8) GreaterEqual(y Int16x8) Mask16x8 {
+ ones := x.Equal(x).AsInt16x8()
+ return y.Greater(x).AsInt16x8().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Int16x8) LessEqual(y Int16x8) Mask16x8 {
+ ones := x.Equal(x).AsInt16x8()
+ return x.Greater(y).AsInt16x8().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Int16x8) NotEqual(y Int16x8) Mask16x8 {
+ ones := x.Equal(x).AsInt16x8()
+ return x.Equal(y).AsInt16x8().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Int32x4) Less(y Int32x4) Mask32x4 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Int32x4) GreaterEqual(y Int32x4) Mask32x4 {
+ ones := x.Equal(x).AsInt32x4()
+ return y.Greater(x).AsInt32x4().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Int32x4) LessEqual(y Int32x4) Mask32x4 {
+ ones := x.Equal(x).AsInt32x4()
+ return x.Greater(y).AsInt32x4().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Int32x4) NotEqual(y Int32x4) Mask32x4 {
+ ones := x.Equal(x).AsInt32x4()
+ return x.Equal(y).AsInt32x4().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Int64x2) Less(y Int64x2) Mask64x2 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Int64x2) GreaterEqual(y Int64x2) Mask64x2 {
+ ones := x.Equal(x).AsInt64x2()
+ return y.Greater(x).AsInt64x2().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Int64x2) LessEqual(y Int64x2) Mask64x2 {
+ ones := x.Equal(x).AsInt64x2()
+ return x.Greater(y).AsInt64x2().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Int64x2) NotEqual(y Int64x2) Mask64x2 {
+ ones := x.Equal(x).AsInt64x2()
+ return x.Equal(y).AsInt64x2().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Int8x32) Less(y Int8x32) Mask8x32 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int8x32) GreaterEqual(y Int8x32) Mask8x32 {
+ ones := x.Equal(x).AsInt8x32()
+ return y.Greater(x).AsInt8x32().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int8x32) LessEqual(y Int8x32) Mask8x32 {
+ ones := x.Equal(x).AsInt8x32()
+ return x.Greater(y).AsInt8x32().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Int8x32) NotEqual(y Int8x32) Mask8x32 {
+ ones := x.Equal(x).AsInt8x32()
+ return x.Equal(y).AsInt8x32().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Int16x16) Less(y Int16x16) Mask16x16 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int16x16) GreaterEqual(y Int16x16) Mask16x16 {
+ ones := x.Equal(x).AsInt16x16()
+ return y.Greater(x).AsInt16x16().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int16x16) LessEqual(y Int16x16) Mask16x16 {
+ ones := x.Equal(x).AsInt16x16()
+ return x.Greater(y).AsInt16x16().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Int16x16) NotEqual(y Int16x16) Mask16x16 {
+ ones := x.Equal(x).AsInt16x16()
+ return x.Equal(y).AsInt16x16().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Int32x8) Less(y Int32x8) Mask32x8 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int32x8) GreaterEqual(y Int32x8) Mask32x8 {
+ ones := x.Equal(x).AsInt32x8()
+ return y.Greater(x).AsInt32x8().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int32x8) LessEqual(y Int32x8) Mask32x8 {
+ ones := x.Equal(x).AsInt32x8()
+ return x.Greater(y).AsInt32x8().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Int32x8) NotEqual(y Int32x8) Mask32x8 {
+ ones := x.Equal(x).AsInt32x8()
+ return x.Equal(y).AsInt32x8().Xor(ones).asMask()
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Int64x4) Less(y Int64x4) Mask64x4 {
+ return y.Greater(x)
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int64x4) GreaterEqual(y Int64x4) Mask64x4 {
+ ones := x.Equal(x).AsInt64x4()
+ return y.Greater(x).AsInt64x4().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Int64x4) LessEqual(y Int64x4) Mask64x4 {
+ ones := x.Equal(x).AsInt64x4()
+ return x.Greater(y).AsInt64x4().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Int64x4) NotEqual(y Int64x4) Mask64x4 {
+ ones := x.Equal(x).AsInt64x4()
+ return x.Equal(y).AsInt64x4().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x16) Greater(y Uint8x16) Mask8x16 {
+ a, b := x.AsInt8x16(), y.AsInt8x16()
+ signs := BroadcastInt8x16(-1 << (8 - 1))
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x16) Less(y Uint8x16) Mask8x16 {
+ a, b := x.AsInt8x16(), y.AsInt8x16()
+ signs := BroadcastInt8x16(-1 << (8 - 1))
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x16) GreaterEqual(y Uint8x16) Mask8x16 {
+ a, b := x.AsInt8x16(), y.AsInt8x16()
+ ones := x.Equal(x).AsInt8x16()
+ signs := BroadcastInt8x16(-1 << (8 - 1))
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt8x16().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x16) LessEqual(y Uint8x16) Mask8x16 {
+ a, b := x.AsInt8x16(), y.AsInt8x16()
+ ones := x.Equal(x).AsInt8x16()
+ signs := BroadcastInt8x16(-1 << (8 - 1))
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt8x16().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Uint8x16) NotEqual(y Uint8x16) Mask8x16 {
+ a, b := x.AsInt8x16(), y.AsInt8x16()
+ ones := x.Equal(x).AsInt8x16()
+ return a.Equal(b).AsInt8x16().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) Greater(y Uint16x8) Mask16x8 {
+ a, b := x.AsInt16x8(), y.AsInt16x8()
+ ones := x.Equal(x).AsInt16x8()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) Less(y Uint16x8) Mask16x8 {
+ a, b := x.AsInt16x8(), y.AsInt16x8()
+ ones := x.Equal(x).AsInt16x8()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) GreaterEqual(y Uint16x8) Mask16x8 {
+ a, b := x.AsInt16x8(), y.AsInt16x8()
+ ones := x.Equal(x).AsInt16x8()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt16x8().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) LessEqual(y Uint16x8) Mask16x8 {
+ a, b := x.AsInt16x8(), y.AsInt16x8()
+ ones := x.Equal(x).AsInt16x8()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt16x8().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) NotEqual(y Uint16x8) Mask16x8 {
+ a, b := x.AsInt16x8(), y.AsInt16x8()
+ ones := x.Equal(x).AsInt16x8()
+ return a.Equal(b).AsInt16x8().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) Greater(y Uint32x4) Mask32x4 {
+ a, b := x.AsInt32x4(), y.AsInt32x4()
+ ones := x.Equal(x).AsInt32x4()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) Less(y Uint32x4) Mask32x4 {
+ a, b := x.AsInt32x4(), y.AsInt32x4()
+ ones := x.Equal(x).AsInt32x4()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) GreaterEqual(y Uint32x4) Mask32x4 {
+ a, b := x.AsInt32x4(), y.AsInt32x4()
+ ones := x.Equal(x).AsInt32x4()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt32x4().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) LessEqual(y Uint32x4) Mask32x4 {
+ a, b := x.AsInt32x4(), y.AsInt32x4()
+ ones := x.Equal(x).AsInt32x4()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt32x4().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) NotEqual(y Uint32x4) Mask32x4 {
+ a, b := x.AsInt32x4(), y.AsInt32x4()
+ ones := x.Equal(x).AsInt32x4()
+ return a.Equal(b).AsInt32x4().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) Greater(y Uint64x2) Mask64x2 {
+ a, b := x.AsInt64x2(), y.AsInt64x2()
+ ones := x.Equal(x).AsInt64x2()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) Less(y Uint64x2) Mask64x2 {
+ a, b := x.AsInt64x2(), y.AsInt64x2()
+ ones := x.Equal(x).AsInt64x2()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) GreaterEqual(y Uint64x2) Mask64x2 {
+ a, b := x.AsInt64x2(), y.AsInt64x2()
+ ones := x.Equal(x).AsInt64x2()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt64x2().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) LessEqual(y Uint64x2) Mask64x2 {
+ a, b := x.AsInt64x2(), y.AsInt64x2()
+ ones := x.Equal(x).AsInt64x2()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt64x2().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) NotEqual(y Uint64x2) Mask64x2 {
+ a, b := x.AsInt64x2(), y.AsInt64x2()
+ ones := x.Equal(x).AsInt64x2()
+ return a.Equal(b).AsInt64x2().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) Greater(y Uint8x32) Mask8x32 {
+ a, b := x.AsInt8x32(), y.AsInt8x32()
+ signs := BroadcastInt8x32(-1 << (8 - 1))
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) Less(y Uint8x32) Mask8x32 {
+ a, b := x.AsInt8x32(), y.AsInt8x32()
+ signs := BroadcastInt8x32(-1 << (8 - 1))
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) GreaterEqual(y Uint8x32) Mask8x32 {
+ a, b := x.AsInt8x32(), y.AsInt8x32()
+ ones := x.Equal(x).AsInt8x32()
+ signs := BroadcastInt8x32(-1 << (8 - 1))
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt8x32().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) LessEqual(y Uint8x32) Mask8x32 {
+ a, b := x.AsInt8x32(), y.AsInt8x32()
+ ones := x.Equal(x).AsInt8x32()
+ signs := BroadcastInt8x32(-1 << (8 - 1))
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt8x32().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) NotEqual(y Uint8x32) Mask8x32 {
+ a, b := x.AsInt8x32(), y.AsInt8x32()
+ ones := x.Equal(x).AsInt8x32()
+ return a.Equal(b).AsInt8x32().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) Greater(y Uint16x16) Mask16x16 {
+ a, b := x.AsInt16x16(), y.AsInt16x16()
+ ones := x.Equal(x).AsInt16x16()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) Less(y Uint16x16) Mask16x16 {
+ a, b := x.AsInt16x16(), y.AsInt16x16()
+ ones := x.Equal(x).AsInt16x16()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) GreaterEqual(y Uint16x16) Mask16x16 {
+ a, b := x.AsInt16x16(), y.AsInt16x16()
+ ones := x.Equal(x).AsInt16x16()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt16x16().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) LessEqual(y Uint16x16) Mask16x16 {
+ a, b := x.AsInt16x16(), y.AsInt16x16()
+ ones := x.Equal(x).AsInt16x16()
+ signs := ones.ShiftAllLeft(16 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt16x16().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) NotEqual(y Uint16x16) Mask16x16 {
+ a, b := x.AsInt16x16(), y.AsInt16x16()
+ ones := x.Equal(x).AsInt16x16()
+ return a.Equal(b).AsInt16x16().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) Greater(y Uint32x8) Mask32x8 {
+ a, b := x.AsInt32x8(), y.AsInt32x8()
+ ones := x.Equal(x).AsInt32x8()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) Less(y Uint32x8) Mask32x8 {
+ a, b := x.AsInt32x8(), y.AsInt32x8()
+ ones := x.Equal(x).AsInt32x8()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) GreaterEqual(y Uint32x8) Mask32x8 {
+ a, b := x.AsInt32x8(), y.AsInt32x8()
+ ones := x.Equal(x).AsInt32x8()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt32x8().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) LessEqual(y Uint32x8) Mask32x8 {
+ a, b := x.AsInt32x8(), y.AsInt32x8()
+ ones := x.Equal(x).AsInt32x8()
+ signs := ones.ShiftAllLeft(32 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt32x8().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) NotEqual(y Uint32x8) Mask32x8 {
+ a, b := x.AsInt32x8(), y.AsInt32x8()
+ ones := x.Equal(x).AsInt32x8()
+ return a.Equal(b).AsInt32x8().Xor(ones).asMask()
+}
+
+// Greater returns a mask whose elements indicate whether x > y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) Greater(y Uint64x4) Mask64x4 {
+ a, b := x.AsInt64x4(), y.AsInt64x4()
+ ones := x.Equal(x).AsInt64x4()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs))
+}
+
+// Less returns a mask whose elements indicate whether x < y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) Less(y Uint64x4) Mask64x4 {
+ a, b := x.AsInt64x4(), y.AsInt64x4()
+ ones := x.Equal(x).AsInt64x4()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs))
+}
+
+// GreaterEqual returns a mask whose elements indicate whether x >= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) GreaterEqual(y Uint64x4) Mask64x4 {
+ a, b := x.AsInt64x4(), y.AsInt64x4()
+ ones := x.Equal(x).AsInt64x4()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return b.Xor(signs).Greater(a.Xor(signs)).AsInt64x4().Xor(ones).asMask()
+}
+
+// LessEqual returns a mask whose elements indicate whether x <= y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) LessEqual(y Uint64x4) Mask64x4 {
+ a, b := x.AsInt64x4(), y.AsInt64x4()
+ ones := x.Equal(x).AsInt64x4()
+ signs := ones.ShiftAllLeft(64 - 1)
+ return a.Xor(signs).Greater(b.Xor(signs)).AsInt64x4().Xor(ones).asMask()
+}
+
+// NotEqual returns a mask whose elements indicate whether x != y
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) NotEqual(y Uint64x4) Mask64x4 {
+ a, b := x.AsInt64x4(), y.AsInt64x4()
+ ones := x.Equal(x).AsInt64x4()
+ return a.Equal(b).AsInt64x4().Xor(ones).asMask()
+}
--- /dev/null
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "internal/cpu"
+
+type X86Features struct{}
+
+var X86 X86Features
+
+// AES returns whether the CPU supports the AES feature.
+//
+// AES is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AES() bool {
+ return cpu.X86.HasAES
+}
+
+// AVX returns whether the CPU supports the AVX feature.
+//
+// AVX is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX() bool {
+ return cpu.X86.HasAVX
+}
+
+// AVX2 returns whether the CPU supports the AVX2 feature.
+//
+// AVX2 is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX2() bool {
+ return cpu.X86.HasAVX2
+}
+
+// AVX512 returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
+//
+// These five CPU features are bundled together, and no use of AVX-512
+// is allowed unless all of these features are supported together.
+// Nearly every CPU that has shipped with any support for AVX-512 has
+// supported all five of these features.
+//
+// AVX512 is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512() bool {
+ return cpu.X86.HasAVX512
+}
+
+// AVX512BITALG returns whether the CPU supports the AVX512BITALG feature.
+//
+// AVX512BITALG is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512BITALG() bool {
+ return cpu.X86.HasAVX512BITALG
+}
+
+// AVX512GFNI returns whether the CPU supports the AVX512GFNI feature.
+//
+// AVX512GFNI is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512GFNI() bool {
+ return cpu.X86.HasAVX512GFNI
+}
+
+// AVX512VAES returns whether the CPU supports the AVX512VAES feature.
+//
+// AVX512VAES is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VAES() bool {
+ return cpu.X86.HasAVX512VAES
+}
+
+// AVX512VBMI returns whether the CPU supports the AVX512VBMI feature.
+//
+// AVX512VBMI is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VBMI() bool {
+ return cpu.X86.HasAVX512VBMI
+}
+
+// AVX512VBMI2 returns whether the CPU supports the AVX512VBMI2 feature.
+//
+// AVX512VBMI2 is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VBMI2() bool {
+ return cpu.X86.HasAVX512VBMI2
+}
+
+// AVX512VNNI returns whether the CPU supports the AVX512VNNI feature.
+//
+// AVX512VNNI is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VNNI() bool {
+ return cpu.X86.HasAVX512VNNI
+}
+
+// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature.
+//
+// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VPCLMULQDQ() bool {
+ return cpu.X86.HasAVX512VPCLMULQDQ
+}
+
+// AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
+//
+// AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VPOPCNTDQ() bool {
+ return cpu.X86.HasAVX512VPOPCNTDQ
+}
+
+// AVXVNNI returns whether the CPU supports the AVXVNNI feature.
+//
+// AVXVNNI is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVXVNNI() bool {
+ return cpu.X86.HasAVXVNNI
+}
+
+// SHA returns whether the CPU supports the SHA feature.
+//
+// SHA is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) SHA() bool {
+ return cpu.X86.HasSHA
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64
+
+// Empty file to allow bodyless functions.
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+// This exposes some internal interfaces to simd_test.
+
+package archsimd
+
+func (x Int64x2) ExportTestConcatSelectedConstant(indices uint8, y Int64x2) Int64x2 {
+ return x.concatSelectedConstant(indices, y)
+}
+
+func (x Float64x4) ExportTestConcatSelectedConstantGrouped(indices uint8, y Float64x4) Float64x4 {
+ return x.concatSelectedConstantGrouped(indices, y)
+}
+
+func (x Float32x4) ExportTestConcatSelectedConstant(indices uint8, y Float32x4) Float32x4 {
+ return x.concatSelectedConstant(indices, y)
+}
+
+func (x Int32x4) ExportTestConcatSelectedConstant(indices uint8, y Int32x4) Int32x4 {
+ return x.concatSelectedConstant(indices, y)
+}
+
+func (x Uint32x8) ExportTestConcatSelectedConstantGrouped(indices uint8, y Uint32x8) Uint32x8 {
+ return x.concatSelectedConstantGrouped(indices, y)
+}
+
+func (x Int32x8) ExportTestConcatSelectedConstantGrouped(indices uint8, y Int32x8) Int32x8 {
+ return x.concatSelectedConstantGrouped(indices, y)
+}
+
+func (x Int32x8) ExportTestTern(table uint8, y Int32x8, z Int32x8) Int32x8 {
+ return x.tern(table, y, z)
+}
+
+func (x Int32x4) ExportTestTern(table uint8, y Int32x4, z Int32x4) Int32x4 {
+ return x.tern(table, y, z)
+}
+
+func ExportTestCscImm4(a, b, c, d uint8) uint8 {
+ return cscimm4(a, b, c, d)
+}
+
+const (
+ LLLL = _LLLL
+ HLLL = _HLLL
+ LHLL = _LHLL
+ HHLL = _HHLL
+ LLHL = _LLHL
+ HLHL = _HLHL
+ LHHL = _LHHL
+ HHHL = _HHHL
+ LLLH = _LLLH
+ HLLH = _HLLH
+ LHLH = _LHLH
+ HHLH = _HHLH
+ LLHH = _LLHH
+ HLHH = _HLHH
+ LHHH = _LHHH
+ HHHH = _HHHH
+)
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd
+
+// ClearAVXUpperBits clears the high bits of Y0-Y15 and Z0-Z15 registers.
+// It is intended for transitioning from AVX to SSE, eliminating the
+// performance penalties caused by false dependencies.
+//
+// Note: in the future the compiler may automatically generate the
+// instruction, making this function unnecessary.
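+//
+// A typical use (illustrative sketch; a and b are assumed to be []int32
+// slices) is to call it once after a block of 256-bit vector work, before
+// returning to scalar or SSE-only code:
+//
+//	x := archsimd.LoadInt32x8Slice(a)
+//	y := archsimd.LoadInt32x8Slice(b)
+//	equal := x.Xor(y).IsZero() // 256-bit AVX work
+//	archsimd.ClearAVXUpperBits()
+//	return equal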
+//
+// Asm: VZEROUPPER, CPU Feature: AVX
+func ClearAVXUpperBits()
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int8x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int8x32) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int16x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int16x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int32x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int32x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int64x2) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int64x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint8x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint8x32) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint16x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint16x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint32x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint32x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint64x2) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint64x4) IsZero() bool
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// Invoke code generators.
+
+//go:generate go run -C _gen . -tmplgen -simdgen
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing binary simd methods.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// testInt8x16Binary tests the simd binary method f against the expected behavior generated by want
+func testInt8x16Binary(t *testing.T, f func(_, _ archsimd.Int8x16) archsimd.Int8x16, want func(_, _ []int8) []int8) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ b := archsimd.LoadInt8x16Slice(y)
+ g := make([]int8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x8Binary tests the simd binary method f against the expected behavior generated by want
+func testInt16x8Binary(t *testing.T, f func(_, _ archsimd.Int16x8) archsimd.Int16x8, want func(_, _ []int16) []int16) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ b := archsimd.LoadInt16x8Slice(y)
+ g := make([]int16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x4Binary tests the simd binary method f against the expected behavior generated by want
+func testInt32x4Binary(t *testing.T, f func(_, _ archsimd.Int32x4) archsimd.Int32x4, want func(_, _ []int32) []int32) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ b := archsimd.LoadInt32x4Slice(y)
+ g := make([]int32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x2Binary tests the simd binary method f against the expected behavior generated by want
+func testInt64x2Binary(t *testing.T, f func(_, _ archsimd.Int64x2) archsimd.Int64x2, want func(_, _ []int64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x2Slice(x)
+ b := archsimd.LoadInt64x2Slice(y)
+ g := make([]int64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x16Binary tests the simd binary method f against the expected behavior generated by want
+func testUint8x16Binary(t *testing.T, f func(_, _ archsimd.Uint8x16) archsimd.Uint8x16, want func(_, _ []uint8) []uint8) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ b := archsimd.LoadUint8x16Slice(y)
+ g := make([]uint8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x8Binary tests the simd binary method f against the expected behavior generated by want
+func testUint16x8Binary(t *testing.T, f func(_, _ archsimd.Uint16x8) archsimd.Uint16x8, want func(_, _ []uint16) []uint16) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ b := archsimd.LoadUint16x8Slice(y)
+ g := make([]uint16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x4Binary tests the simd binary method f against the expected behavior generated by want
+func testUint32x4Binary(t *testing.T, f func(_, _ archsimd.Uint32x4) archsimd.Uint32x4, want func(_, _ []uint32) []uint32) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ b := archsimd.LoadUint32x4Slice(y)
+ g := make([]uint32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x2Binary tests the simd binary method f against the expected behavior generated by want
+func testUint64x2Binary(t *testing.T, f func(_, _ archsimd.Uint64x2) archsimd.Uint64x2, want func(_, _ []uint64) []uint64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x2Slice(x)
+ b := archsimd.LoadUint64x2Slice(y)
+ g := make([]uint64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x4Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat32x4Binary(t *testing.T, f func(_, _ archsimd.Float32x4) archsimd.Float32x4, want func(_, _ []float32) []float32) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ b := archsimd.LoadFloat32x4Slice(y)
+ g := make([]float32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x2Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat64x2Binary(t *testing.T, f func(_, _ archsimd.Float64x2) archsimd.Float64x2, want func(_, _ []float64) []float64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ b := archsimd.LoadFloat64x2Slice(y)
+ g := make([]float64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt8x32Binary tests the simd binary method f against the expected behavior generated by want
+func testInt8x32Binary(t *testing.T, f func(_, _ archsimd.Int8x32) archsimd.Int8x32, want func(_, _ []int8) []int8) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ b := archsimd.LoadInt8x32Slice(y)
+ g := make([]int8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x16Binary tests the simd binary method f against the expected behavior generated by want
+func testInt16x16Binary(t *testing.T, f func(_, _ archsimd.Int16x16) archsimd.Int16x16, want func(_, _ []int16) []int16) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ b := archsimd.LoadInt16x16Slice(y)
+ g := make([]int16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x8Binary tests the simd binary method f against the expected behavior generated by want
+func testInt32x8Binary(t *testing.T, f func(_, _ archsimd.Int32x8) archsimd.Int32x8, want func(_, _ []int32) []int32) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ b := archsimd.LoadInt32x8Slice(y)
+ g := make([]int32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x4Binary tests the simd binary method f against the expected behavior generated by want
+func testInt64x4Binary(t *testing.T, f func(_, _ archsimd.Int64x4) archsimd.Int64x4, want func(_, _ []int64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ b := archsimd.LoadInt64x4Slice(y)
+ g := make([]int64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x32Binary tests the simd binary method f against the expected behavior generated by want
+func testUint8x32Binary(t *testing.T, f func(_, _ archsimd.Uint8x32) archsimd.Uint8x32, want func(_, _ []uint8) []uint8) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ b := archsimd.LoadUint8x32Slice(y)
+ g := make([]uint8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x16Binary tests the simd binary method f against the expected behavior generated by want
+func testUint16x16Binary(t *testing.T, f func(_, _ archsimd.Uint16x16) archsimd.Uint16x16, want func(_, _ []uint16) []uint16) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ b := archsimd.LoadUint16x16Slice(y)
+ g := make([]uint16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x8Binary tests the simd binary method f against the expected behavior generated by want
+func testUint32x8Binary(t *testing.T, f func(_, _ archsimd.Uint32x8) archsimd.Uint32x8, want func(_, _ []uint32) []uint32) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ b := archsimd.LoadUint32x8Slice(y)
+ g := make([]uint32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x4Binary tests the simd binary method f against the expected behavior generated by want
+func testUint64x4Binary(t *testing.T, f func(_, _ archsimd.Uint64x4) archsimd.Uint64x4, want func(_, _ []uint64) []uint64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ b := archsimd.LoadUint64x4Slice(y)
+ g := make([]uint64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x8Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat32x8Binary(t *testing.T, f func(_, _ archsimd.Float32x8) archsimd.Float32x8, want func(_, _ []float32) []float32) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ b := archsimd.LoadFloat32x8Slice(y)
+ g := make([]float32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x4Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat64x4Binary(t *testing.T, f func(_, _ archsimd.Float64x4) archsimd.Float64x4, want func(_, _ []float64) []float64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ b := archsimd.LoadFloat64x4Slice(y)
+ g := make([]float64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt8x64Binary tests the simd binary method f against the expected behavior generated by want
+func testInt8x64Binary(t *testing.T, f func(_, _ archsimd.Int8x64) archsimd.Int8x64, want func(_, _ []int8) []int8) {
+ n := 64
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x64Slice(x)
+ b := archsimd.LoadInt8x64Slice(y)
+ g := make([]int8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x32Binary tests the simd binary method f against the expected behavior generated by want
+func testInt16x32Binary(t *testing.T, f func(_, _ archsimd.Int16x32) archsimd.Int16x32, want func(_, _ []int16) []int16) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ b := archsimd.LoadInt16x32Slice(y)
+ g := make([]int16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x16Binary tests the simd binary method f against the expected behavior generated by want
+func testInt32x16Binary(t *testing.T, f func(_, _ archsimd.Int32x16) archsimd.Int32x16, want func(_, _ []int32) []int32) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ b := archsimd.LoadInt32x16Slice(y)
+ g := make([]int32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x8Binary tests the simd binary method f against the expected behavior generated by want
+func testInt64x8Binary(t *testing.T, f func(_, _ archsimd.Int64x8) archsimd.Int64x8, want func(_, _ []int64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ b := archsimd.LoadInt64x8Slice(y)
+ g := make([]int64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x64Binary tests the simd binary method f against the expected behavior generated by want
+func testUint8x64Binary(t *testing.T, f func(_, _ archsimd.Uint8x64) archsimd.Uint8x64, want func(_, _ []uint8) []uint8) {
+ n := 64
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x64Slice(x)
+ b := archsimd.LoadUint8x64Slice(y)
+ g := make([]uint8, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x32Binary tests the simd binary method f against the expected behavior generated by want
+func testUint16x32Binary(t *testing.T, f func(_, _ archsimd.Uint16x32) archsimd.Uint16x32, want func(_, _ []uint16) []uint16) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ b := archsimd.LoadUint16x32Slice(y)
+ g := make([]uint16, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x16Binary tests the simd binary method f against the expected behavior generated by want
+func testUint32x16Binary(t *testing.T, f func(_, _ archsimd.Uint32x16) archsimd.Uint32x16, want func(_, _ []uint32) []uint32) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ b := archsimd.LoadUint32x16Slice(y)
+ g := make([]uint32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x8Binary tests the simd binary method f against the expected behavior generated by want
+func testUint64x8Binary(t *testing.T, f func(_, _ archsimd.Uint64x8) archsimd.Uint64x8, want func(_, _ []uint64) []uint64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ b := archsimd.LoadUint64x8Slice(y)
+ g := make([]uint64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x16Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat32x16Binary(t *testing.T, f func(_, _ archsimd.Float32x16) archsimd.Float32x16, want func(_, _ []float32) []float32) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ b := archsimd.LoadFloat32x16Slice(y)
+ g := make([]float32, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x8Binary tests the simd binary method f against the expected behavior generated by want
+func testFloat64x8Binary(t *testing.T, f func(_, _ archsimd.Float64x8) archsimd.Float64x8, want func(_, _ []float64) []float64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ b := archsimd.LoadFloat64x8Slice(y)
+ g := make([]float64, n)
+ f(a, b).StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
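+
+// The addSlice/subSlice/... references used throughout these tests are
+// defined elsewhere in this package. As a purely illustrative sketch
+// (hypothetical names, not the actual helpers), such a reference is just
+// the scalar operation applied element-wise to two equal-length slices:
+type numberSketch interface {
+	~int8 | ~int16 | ~int32 | ~int64 |
+		~uint8 | ~uint16 | ~uint32 | ~uint64 | ~float32 | ~float64
+}
+
+func addSliceSketch[T numberSketch](x, y []T) []T {
+	z := make([]T, len(x))
+	for i := range z {
+		z[i] = x[i] + y[i]
+	}
+	return z
+}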
+
+func TestAdd(t *testing.T) {
+ testFloat32x4Binary(t, archsimd.Float32x4.Add, addSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.Add, addSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.Add, addSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.Add, addSlice[float64])
+
+ testInt16x16Binary(t, archsimd.Int16x16.Add, addSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Add, addSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Add, addSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Add, addSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.Add, addSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Add, addSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.Add, addSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Add, addSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Add, addSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Add, addSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Add, addSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Add, addSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.Add, addSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Add, addSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.Add, addSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Add, addSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Binary(t, archsimd.Float32x16.Add, addSlice[float32])
+ testFloat64x8Binary(t, archsimd.Float64x8.Add, addSlice[float64])
+ testInt8x64Binary(t, archsimd.Int8x64.Add, addSlice[int8])
+ testInt16x32Binary(t, archsimd.Int16x32.Add, addSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.Add, addSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Add, addSlice[int64])
+ testUint8x64Binary(t, archsimd.Uint8x64.Add, addSlice[uint8])
+ testUint16x32Binary(t, archsimd.Uint16x32.Add, addSlice[uint16])
+ testUint32x16Binary(t, archsimd.Uint32x16.Add, addSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Add, addSlice[uint64])
+ }
+}
+
+func TestSub(t *testing.T) {
+ testFloat32x4Binary(t, archsimd.Float32x4.Sub, subSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.Sub, subSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.Sub, subSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.Sub, subSlice[float64])
+
+ testInt16x16Binary(t, archsimd.Int16x16.Sub, subSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Sub, subSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Sub, subSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Sub, subSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.Sub, subSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Sub, subSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.Sub, subSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Sub, subSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Sub, subSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Sub, subSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Sub, subSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Sub, subSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.Sub, subSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Sub, subSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.Sub, subSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Sub, subSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Binary(t, archsimd.Float32x16.Sub, subSlice[float32])
+ testFloat64x8Binary(t, archsimd.Float64x8.Sub, subSlice[float64])
+ testInt8x64Binary(t, archsimd.Int8x64.Sub, subSlice[int8])
+ testInt16x32Binary(t, archsimd.Int16x32.Sub, subSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.Sub, subSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Sub, subSlice[int64])
+ testUint8x64Binary(t, archsimd.Uint8x64.Sub, subSlice[uint8])
+ testUint16x32Binary(t, archsimd.Uint16x32.Sub, subSlice[uint16])
+ testUint32x16Binary(t, archsimd.Uint32x16.Sub, subSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Sub, subSlice[uint64])
+ }
+}
+
+func TestMax(t *testing.T) {
+ // testFloat32x4Binary(t, archsimd.Float32x4.Max, maxSlice[float32]) // nan is wrong
+ // testFloat32x8Binary(t, archsimd.Float32x8.Max, maxSlice[float32]) // nan is wrong
+ // testFloat64x2Binary(t, archsimd.Float64x2.Max, maxSlice[float64]) // nan is wrong
+ // testFloat64x4Binary(t, archsimd.Float64x4.Max, maxSlice[float64]) // nan is wrong
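+	// The float cases above are disabled because the hardware and the Go
+	// reference disagree on NaN: x86 MAXPS/MAXPD return the second operand
+	// whenever either input is NaN (roughly "a > b ? a : b"), which the
+	// pure-Go maxSlice reference presumably does not reproduce.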
+
+ testInt16x16Binary(t, archsimd.Int16x16.Max, maxSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Max, maxSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Max, maxSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Max, maxSlice[int32])
+
+ if archsimd.X86.AVX512() {
+ testInt64x2Binary(t, archsimd.Int64x2.Max, maxSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Max, maxSlice[int64])
+ }
+
+ testInt8x16Binary(t, archsimd.Int8x16.Max, maxSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Max, maxSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Max, maxSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Max, maxSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Max, maxSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Max, maxSlice[uint32])
+
+ if archsimd.X86.AVX512() {
+ testUint64x2Binary(t, archsimd.Uint64x2.Max, maxSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Max, maxSlice[uint64])
+ }
+
+ testUint8x16Binary(t, archsimd.Uint8x16.Max, maxSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Max, maxSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ // testFloat32x16Binary(t, archsimd.Float32x16.Max, maxSlice[float32]) // nan is wrong
+ // testFloat64x8Binary(t, archsimd.Float64x8.Max, maxSlice[float64]) // nan is wrong
+ testInt8x64Binary(t, archsimd.Int8x64.Max, maxSlice[int8])
+ testInt16x32Binary(t, archsimd.Int16x32.Max, maxSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.Max, maxSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Max, maxSlice[int64])
+ testUint8x64Binary(t, archsimd.Uint8x64.Max, maxSlice[uint8])
+ testUint16x32Binary(t, archsimd.Uint16x32.Max, maxSlice[uint16])
+ testUint32x16Binary(t, archsimd.Uint32x16.Max, maxSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Max, maxSlice[uint64])
+ }
+}
+
+func TestMin(t *testing.T) {
+ // testFloat32x4Binary(t, archsimd.Float32x4.Min, minSlice[float32]) // nan is wrong
+ // testFloat32x8Binary(t, archsimd.Float32x8.Min, minSlice[float32]) // nan is wrong
+ // testFloat64x2Binary(t, archsimd.Float64x2.Min, minSlice[float64]) // nan is wrong
+ // testFloat64x4Binary(t, archsimd.Float64x4.Min, minSlice[float64]) // nan is wrong
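+	// See the NaN note in TestMax above; MINPS/MINPD behave analogously,
+	// returning the second operand when either input is NaN.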
+
+ testInt16x16Binary(t, archsimd.Int16x16.Min, minSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Min, minSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Min, minSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Min, minSlice[int32])
+
+ if archsimd.X86.AVX512() {
+ testInt64x2Binary(t, archsimd.Int64x2.Min, minSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Min, minSlice[int64])
+ }
+
+ testInt8x16Binary(t, archsimd.Int8x16.Min, minSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Min, minSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Min, minSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Min, minSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Min, minSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Min, minSlice[uint32])
+
+ if archsimd.X86.AVX512() {
+ testUint64x2Binary(t, archsimd.Uint64x2.Min, minSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Min, minSlice[uint64])
+ }
+
+ testUint8x16Binary(t, archsimd.Uint8x16.Min, minSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Min, minSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ // testFloat32x16Binary(t, archsimd.Float32x16.Min, minSlice[float32]) // nan is wrong
+ // testFloat64x8Binary(t, archsimd.Float64x8.Min, minSlice[float64]) // nan is wrong
+ testInt8x64Binary(t, archsimd.Int8x64.Min, minSlice[int8])
+ testInt16x32Binary(t, archsimd.Int16x32.Min, minSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.Min, minSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Min, minSlice[int64])
+ testUint8x64Binary(t, archsimd.Uint8x64.Min, minSlice[uint8])
+ testUint16x32Binary(t, archsimd.Uint16x32.Min, minSlice[uint16])
+ testUint32x16Binary(t, archsimd.Uint32x16.Min, minSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Min, minSlice[uint64])
+ }
+}
+
+func TestAnd(t *testing.T) {
+ testInt16x16Binary(t, archsimd.Int16x16.And, andSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.And, andSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.And, andSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.And, andSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.And, andSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.And, andSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.And, andSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.And, andSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.And, andSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.And, andSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.And, andSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.And, andSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.And, andSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.And, andSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.And, andSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.And, andSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+		// testInt8x64Binary(t, archsimd.Int8x64.And, andSlice[int8]) // missing
+		// testInt16x32Binary(t, archsimd.Int16x32.And, andSlice[int16]) // missing
+ testInt32x16Binary(t, archsimd.Int32x16.And, andSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.And, andSlice[int64])
+		// testUint8x64Binary(t, archsimd.Uint8x64.And, andSlice[uint8]) // missing
+		// testUint16x32Binary(t, archsimd.Uint16x32.And, andSlice[uint16]) // missing
+ testUint32x16Binary(t, archsimd.Uint32x16.And, andSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.And, andSlice[uint64])
+ }
+}
+
+func TestAndNot(t *testing.T) {
+ testInt16x16Binary(t, archsimd.Int16x16.AndNot, andNotSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.AndNot, andNotSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.AndNot, andNotSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.AndNot, andNotSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.AndNot, andNotSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.AndNot, andNotSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.AndNot, andNotSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.AndNot, andNotSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.AndNot, andNotSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.AndNot, andNotSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.AndNot, andNotSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.AndNot, andNotSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.AndNot, andNotSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.AndNot, andNotSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.AndNot, andNotSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.AndNot, andNotSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testInt8x64Binary(t, archsimd.Int8x64.AndNot, andNotSlice[int8])
+ testInt16x32Binary(t, archsimd.Int16x32.AndNot, andNotSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.AndNot, andNotSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.AndNot, andNotSlice[int64])
+ testUint8x64Binary(t, archsimd.Uint8x64.AndNot, andNotSlice[uint8])
+ testUint16x32Binary(t, archsimd.Uint16x32.AndNot, andNotSlice[uint16])
+ testUint32x16Binary(t, archsimd.Uint32x16.AndNot, andNotSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.AndNot, andNotSlice[uint64])
+ }
+}
+
+func TestXor(t *testing.T) {
+ testInt16x16Binary(t, archsimd.Int16x16.Xor, xorSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Xor, xorSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Xor, xorSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Xor, xorSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.Xor, xorSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Xor, xorSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.Xor, xorSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Xor, xorSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Xor, xorSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Xor, xorSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Xor, xorSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Xor, xorSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.Xor, xorSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Xor, xorSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.Xor, xorSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Xor, xorSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+		// testInt8x64Binary(t, archsimd.Int8x64.Xor, xorSlice[int8]) // missing
+		// testInt16x32Binary(t, archsimd.Int16x32.Xor, xorSlice[int16]) // missing
+ testInt32x16Binary(t, archsimd.Int32x16.Xor, xorSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Xor, xorSlice[int64])
+		// testUint8x64Binary(t, archsimd.Uint8x64.Xor, xorSlice[uint8]) // missing
+		// testUint16x32Binary(t, archsimd.Uint16x32.Xor, xorSlice[uint16]) // missing
+ testUint32x16Binary(t, archsimd.Uint32x16.Xor, xorSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Xor, xorSlice[uint64])
+ }
+}
+
+func TestOr(t *testing.T) {
+ testInt16x16Binary(t, archsimd.Int16x16.Or, orSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Or, orSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Or, orSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Or, orSlice[int32])
+ testInt64x2Binary(t, archsimd.Int64x2.Or, orSlice[int64])
+ testInt64x4Binary(t, archsimd.Int64x4.Or, orSlice[int64])
+ testInt8x16Binary(t, archsimd.Int8x16.Or, orSlice[int8])
+ testInt8x32Binary(t, archsimd.Int8x32.Or, orSlice[int8])
+
+ testUint16x16Binary(t, archsimd.Uint16x16.Or, orSlice[uint16])
+ testUint16x8Binary(t, archsimd.Uint16x8.Or, orSlice[uint16])
+ testUint32x4Binary(t, archsimd.Uint32x4.Or, orSlice[uint32])
+ testUint32x8Binary(t, archsimd.Uint32x8.Or, orSlice[uint32])
+ testUint64x2Binary(t, archsimd.Uint64x2.Or, orSlice[uint64])
+ testUint64x4Binary(t, archsimd.Uint64x4.Or, orSlice[uint64])
+ testUint8x16Binary(t, archsimd.Uint8x16.Or, orSlice[uint8])
+ testUint8x32Binary(t, archsimd.Uint8x32.Or, orSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+		// testInt8x64Binary(t, archsimd.Int8x64.Or, orSlice[int8]) // missing
+		// testInt16x32Binary(t, archsimd.Int16x32.Or, orSlice[int16]) // missing
+ testInt32x16Binary(t, archsimd.Int32x16.Or, orSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Or, orSlice[int64])
+		// testUint8x64Binary(t, archsimd.Uint8x64.Or, orSlice[uint8]) // missing
+		// testUint16x32Binary(t, archsimd.Uint16x32.Or, orSlice[uint16]) // missing
+ testUint32x16Binary(t, archsimd.Uint32x16.Or, orSlice[uint32])
+ testUint64x8Binary(t, archsimd.Uint64x8.Or, orSlice[uint64])
+ }
+}
+
+func TestMul(t *testing.T) {
+ testFloat32x4Binary(t, archsimd.Float32x4.Mul, mulSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.Mul, mulSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.Mul, mulSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.Mul, mulSlice[float64])
+
+ testInt16x16Binary(t, archsimd.Int16x16.Mul, mulSlice[int16])
+ testInt16x8Binary(t, archsimd.Int16x8.Mul, mulSlice[int16])
+ testInt32x4Binary(t, archsimd.Int32x4.Mul, mulSlice[int32])
+ testInt32x8Binary(t, archsimd.Int32x8.Mul, mulSlice[int32])
+
+ // testInt8x16Binary(t, archsimd.Int8x16.Mul, mulSlice[int8]) // nope
+ // testInt8x32Binary(t, archsimd.Int8x32.Mul, mulSlice[int8])
+
+ // TODO we should be able to do these, there's no difference between signed/unsigned Mul
+ // testUint16x16Binary(t, archsimd.Uint16x16.Mul, mulSlice[uint16])
+ // testUint16x8Binary(t, archsimd.Uint16x8.Mul, mulSlice[uint16])
+ // testUint32x4Binary(t, archsimd.Uint32x4.Mul, mulSlice[uint32])
+ // testUint32x8Binary(t, archsimd.Uint32x8.Mul, mulSlice[uint32])
+ // testUint64x2Binary(t, archsimd.Uint64x2.Mul, mulSlice[uint64])
+ // testUint64x4Binary(t, archsimd.Uint64x4.Mul, mulSlice[uint64])
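+	// A truncating multiply of two's-complement values yields the same bit
+	// pattern regardless of signedness, so the unsigned cases above should
+	// be able to reuse the signed machinery once they are wired up.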
+
+ // testUint8x16Binary(t, archsimd.Uint8x16.Mul, mulSlice[uint8]) // nope
+ // testUint8x32Binary(t, archsimd.Uint8x32.Mul, mulSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testInt64x2Binary(t, archsimd.Int64x2.Mul, mulSlice[int64]) // avx512 only
+ testInt64x4Binary(t, archsimd.Int64x4.Mul, mulSlice[int64])
+
+ testFloat32x16Binary(t, archsimd.Float32x16.Mul, mulSlice[float32])
+ testFloat64x8Binary(t, archsimd.Float64x8.Mul, mulSlice[float64])
+
+ // testInt8x64Binary(t, archsimd.Int8x64.Mul, mulSlice[int8]) // nope
+ testInt16x32Binary(t, archsimd.Int16x32.Mul, mulSlice[int16])
+ testInt32x16Binary(t, archsimd.Int32x16.Mul, mulSlice[int32])
+ testInt64x8Binary(t, archsimd.Int64x8.Mul, mulSlice[int64])
+ // testUint8x64Binary(t, archsimd.Uint8x64.Mul, mulSlice[uint8]) // nope
+
+ // TODO signed should do the job
+ // testUint16x32Binary(t, archsimd.Uint16x32.Mul, mulSlice[uint16])
+ // testUint32x16Binary(t, archsimd.Uint32x16.Mul, mulSlice[uint32])
+ // testUint64x8Binary(t, archsimd.Uint64x8.Mul, mulSlice[uint64])
+ }
+}
+
+func TestDiv(t *testing.T) {
+ testFloat32x4Binary(t, archsimd.Float32x4.Div, divSlice[float32])
+ testFloat32x8Binary(t, archsimd.Float32x8.Div, divSlice[float32])
+ testFloat64x2Binary(t, archsimd.Float64x2.Div, divSlice[float64])
+ testFloat64x4Binary(t, archsimd.Float64x4.Div, divSlice[float64])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Binary(t, archsimd.Float32x16.Div, divSlice[float32])
+ testFloat64x8Binary(t, archsimd.Float64x8.Div, divSlice[float64])
+ }
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing simd methods that compare two operands.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// testInt8x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt8x16Compare(t *testing.T, f func(_, _ archsimd.Int8x16) archsimd.Mask8x16, want func(_, _ []int8) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ b := archsimd.LoadInt8x16Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt16x8Compare(t *testing.T, f func(_, _ archsimd.Int16x8) archsimd.Mask16x8, want func(_, _ []int16) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ b := archsimd.LoadInt16x8Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt32x4Compare(t *testing.T, f func(_, _ archsimd.Int32x4) archsimd.Mask32x4, want func(_, _ []int32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ b := archsimd.LoadInt32x4Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x2Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt64x2Compare(t *testing.T, f func(_, _ archsimd.Int64x2) archsimd.Mask64x2, want func(_, _ []int64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x2Slice(x)
+ b := archsimd.LoadInt64x2Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint8x16Compare(t *testing.T, f func(_, _ archsimd.Uint8x16) archsimd.Mask8x16, want func(_, _ []uint8) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ b := archsimd.LoadUint8x16Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint16x8Compare(t *testing.T, f func(_, _ archsimd.Uint16x8) archsimd.Mask16x8, want func(_, _ []uint16) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ b := archsimd.LoadUint16x8Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint32x4Compare(t *testing.T, f func(_, _ archsimd.Uint32x4) archsimd.Mask32x4, want func(_, _ []uint32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ b := archsimd.LoadUint32x4Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x2Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint64x2Compare(t *testing.T, f func(_, _ archsimd.Uint64x2) archsimd.Mask64x2, want func(_, _ []uint64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x2Slice(x)
+ b := archsimd.LoadUint64x2Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat32x4Compare(t *testing.T, f func(_, _ archsimd.Float32x4) archsimd.Mask32x4, want func(_, _ []float32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ b := archsimd.LoadFloat32x4Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x2Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat64x2Compare(t *testing.T, f func(_, _ archsimd.Float64x2) archsimd.Mask64x2, want func(_, _ []float64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ b := archsimd.LoadFloat64x2Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt8x32Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt8x32Compare(t *testing.T, f func(_, _ archsimd.Int8x32) archsimd.Mask8x32, want func(_, _ []int8) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ b := archsimd.LoadInt8x32Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x32().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt16x16Compare(t *testing.T, f func(_, _ archsimd.Int16x16) archsimd.Mask16x16, want func(_, _ []int16) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ b := archsimd.LoadInt16x16Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt32x8Compare(t *testing.T, f func(_, _ archsimd.Int32x8) archsimd.Mask32x8, want func(_, _ []int32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ b := archsimd.LoadInt32x8Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt64x4Compare(t *testing.T, f func(_, _ archsimd.Int64x4) archsimd.Mask64x4, want func(_, _ []int64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ b := archsimd.LoadInt64x4Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x32Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint8x32Compare(t *testing.T, f func(_, _ archsimd.Uint8x32) archsimd.Mask8x32, want func(_, _ []uint8) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ b := archsimd.LoadUint8x32Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x32().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint16x16Compare(t *testing.T, f func(_, _ archsimd.Uint16x16) archsimd.Mask16x16, want func(_, _ []uint16) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ b := archsimd.LoadUint16x16Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint32x8Compare(t *testing.T, f func(_, _ archsimd.Uint32x8) archsimd.Mask32x8, want func(_, _ []uint32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ b := archsimd.LoadUint32x8Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint64x4Compare(t *testing.T, f func(_, _ archsimd.Uint64x4) archsimd.Mask64x4, want func(_, _ []uint64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ b := archsimd.LoadUint64x4Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat32x8Compare(t *testing.T, f func(_, _ archsimd.Float32x8) archsimd.Mask32x8, want func(_, _ []float32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ b := archsimd.LoadFloat32x8Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x4Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat64x4Compare(t *testing.T, f func(_, _ archsimd.Float64x4) archsimd.Mask64x4, want func(_, _ []float64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ b := archsimd.LoadFloat64x4Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt8x64Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt8x64Compare(t *testing.T, f func(_, _ archsimd.Int8x64) archsimd.Mask8x64, want func(_, _ []int8) []int64) {
+ n := 64
+ t.Helper()
+ forSlicePair(t, int8s, n, func(x, y []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x64Slice(x)
+ b := archsimd.LoadInt8x64Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x64().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt16x32Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt16x32Compare(t *testing.T, f func(_, _ archsimd.Int16x32) archsimd.Mask16x32, want func(_, _ []int16) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, int16s, n, func(x, y []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ b := archsimd.LoadInt16x32Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x32().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt32x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt32x16Compare(t *testing.T, f func(_, _ archsimd.Int32x16) archsimd.Mask32x16, want func(_, _ []int32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, int32s, n, func(x, y []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ b := archsimd.LoadInt32x16Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testInt64x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testInt64x8Compare(t *testing.T, f func(_, _ archsimd.Int64x8) archsimd.Mask64x8, want func(_, _ []int64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, int64s, n, func(x, y []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ b := archsimd.LoadInt64x8Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint8x64Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint8x64Compare(t *testing.T, f func(_, _ archsimd.Uint8x64) archsimd.Mask8x64, want func(_, _ []uint8) []int64) {
+ n := 64
+ t.Helper()
+ forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x64Slice(x)
+ b := archsimd.LoadUint8x64Slice(y)
+ g := make([]int8, n)
+ f(a, b).AsInt8x64().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint16x32Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint16x32Compare(t *testing.T, f func(_, _ archsimd.Uint16x32) archsimd.Mask16x32, want func(_, _ []uint16) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ b := archsimd.LoadUint16x32Slice(y)
+ g := make([]int16, n)
+ f(a, b).AsInt16x32().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint32x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint32x16Compare(t *testing.T, f func(_, _ archsimd.Uint32x16) archsimd.Mask32x16, want func(_, _ []uint32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ b := archsimd.LoadUint32x16Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testUint64x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testUint64x8Compare(t *testing.T, f func(_, _ archsimd.Uint64x8) archsimd.Mask64x8, want func(_, _ []uint64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ b := archsimd.LoadUint64x8Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat32x16Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat32x16Compare(t *testing.T, f func(_, _ archsimd.Float32x16) archsimd.Mask32x16, want func(_, _ []float32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePair(t, float32s, n, func(x, y []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ b := archsimd.LoadFloat32x16Slice(y)
+ g := make([]int32, n)
+ f(a, b).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
+
+// testFloat64x8Compare tests the simd comparison method f against the expected behavior generated by want
+func testFloat64x8Compare(t *testing.T, f func(_, _ archsimd.Float64x8) archsimd.Mask64x8, want func(_, _ []float64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePair(t, float64s, n, func(x, y []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ b := archsimd.LoadFloat64x8Slice(y)
+ g := make([]int64, n)
+ f(a, b).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
+ })
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// AVX2 lacks most of these comparisons natively, but they can be
+// synthesized from > and ==.
+var comparisonFixed bool = archsimd.X86.AVX512()
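+
+// The scalar identities behind such a synthesis, as a purely illustrative
+// sketch (hypothetical helpers, not how archsimd implements it): a signed
+// a < b is just b > a, and an unsigned comparison can reuse the signed one
+// after flipping the sign bit of both operands.
+func lessI32Sketch(a, b int32) bool { return b > a }
+
+func lessU32Sketch(a, b uint32) bool {
+	const signBit = uint32(1) << 31
+	return lessI32Sketch(int32(a^signBit), int32(b^signBit))
+}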
+
+func TestLess(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.Less, lessSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.Less, lessSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.Less, lessSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.Less, lessSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.Less, lessSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.Less, lessSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.Less, lessSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.Less, lessSlice[int32])
+ testInt64x2Compare(t, archsimd.Int64x2.Less, lessSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.Less, lessSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.Less, lessSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.Less, lessSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.Less, lessSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.Less, lessSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.Less, lessSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.Less, lessSlice[uint32])
+ testUint64x2Compare(t, archsimd.Uint64x2.Less, lessSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.Less, lessSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.Less, lessSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.Less, lessSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t, archsimd.Float32x16.Less, lessSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.Less, lessSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.Less, lessSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.Less, lessSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.Less, lessSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.Less, lessSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.Less, lessSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.Less, lessSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.Less, lessSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.Less, lessSlice[uint64])
+ }
+}
+
+func TestLessEqual(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.LessEqual, lessEqualSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.LessEqual, lessEqualSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.LessEqual, lessEqualSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.LessEqual, lessEqualSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.LessEqual, lessEqualSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.LessEqual, lessEqualSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.LessEqual, lessEqualSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.LessEqual, lessEqualSlice[int32])
+ testInt64x2Compare(t, archsimd.Int64x2.LessEqual, lessEqualSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.LessEqual, lessEqualSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.LessEqual, lessEqualSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.LessEqual, lessEqualSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.LessEqual, lessEqualSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.LessEqual, lessEqualSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.LessEqual, lessEqualSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.LessEqual, lessEqualSlice[uint32])
+ testUint64x2Compare(t, archsimd.Uint64x2.LessEqual, lessEqualSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.LessEqual, lessEqualSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.LessEqual, lessEqualSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.LessEqual, lessEqualSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t, archsimd.Float32x16.LessEqual, lessEqualSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.LessEqual, lessEqualSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.LessEqual, lessEqualSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.LessEqual, lessEqualSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.LessEqual, lessEqualSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.LessEqual, lessEqualSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.LessEqual, lessEqualSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.LessEqual, lessEqualSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.LessEqual, lessEqualSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.LessEqual, lessEqualSlice[uint64])
+ }
+}
+
+func TestGreater(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.Greater, greaterSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.Greater, greaterSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.Greater, greaterSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.Greater, greaterSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.Greater, greaterSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.Greater, greaterSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.Greater, greaterSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.Greater, greaterSlice[int32])
+
+ testInt64x2Compare(t, archsimd.Int64x2.Greater, greaterSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.Greater, greaterSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.Greater, greaterSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.Greater, greaterSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.Greater, greaterSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.Greater, greaterSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.Greater, greaterSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.Greater, greaterSlice[uint32])
+
+ testUint64x2Compare(t, archsimd.Uint64x2.Greater, greaterSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.Greater, greaterSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.Greater, greaterSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.Greater, greaterSlice[uint8])
+
+	if archsimd.X86.AVX512() {
+		testFloat32x16Compare(t, archsimd.Float32x16.Greater, greaterSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.Greater, greaterSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.Greater, greaterSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.Greater, greaterSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.Greater, greaterSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.Greater, greaterSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.Greater, greaterSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.Greater, greaterSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.Greater, greaterSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.Greater, greaterSlice[uint64])
+ }
+}
+
+func TestGreaterEqual(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.GreaterEqual, greaterEqualSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.GreaterEqual, greaterEqualSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.GreaterEqual, greaterEqualSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.GreaterEqual, greaterEqualSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.GreaterEqual, greaterEqualSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.GreaterEqual, greaterEqualSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.GreaterEqual, greaterEqualSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.GreaterEqual, greaterEqualSlice[int32])
+ testInt64x2Compare(t, archsimd.Int64x2.GreaterEqual, greaterEqualSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.GreaterEqual, greaterEqualSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.GreaterEqual, greaterEqualSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.GreaterEqual, greaterEqualSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.GreaterEqual, greaterEqualSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.GreaterEqual, greaterEqualSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.GreaterEqual, greaterEqualSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.GreaterEqual, greaterEqualSlice[uint32])
+ testUint64x2Compare(t, archsimd.Uint64x2.GreaterEqual, greaterEqualSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.GreaterEqual, greaterEqualSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.GreaterEqual, greaterEqualSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.GreaterEqual, greaterEqualSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t, archsimd.Float32x16.GreaterEqual, greaterEqualSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.GreaterEqual, greaterEqualSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.GreaterEqual, greaterEqualSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.GreaterEqual, greaterEqualSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.GreaterEqual, greaterEqualSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.GreaterEqual, greaterEqualSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.GreaterEqual, greaterEqualSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.GreaterEqual, greaterEqualSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.GreaterEqual, greaterEqualSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.GreaterEqual, greaterEqualSlice[uint64])
+ }
+}
+
+func TestEqual(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.Equal, equalSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.Equal, equalSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.Equal, equalSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.Equal, equalSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.Equal, equalSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.Equal, equalSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.Equal, equalSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.Equal, equalSlice[int32])
+ testInt64x2Compare(t, archsimd.Int64x2.Equal, equalSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.Equal, equalSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.Equal, equalSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.Equal, equalSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.Equal, equalSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.Equal, equalSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.Equal, equalSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.Equal, equalSlice[uint32])
+ testUint64x2Compare(t, archsimd.Uint64x2.Equal, equalSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.Equal, equalSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.Equal, equalSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.Equal, equalSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t, archsimd.Float32x16.Equal, equalSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.Equal, equalSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.Equal, equalSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.Equal, equalSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.Equal, equalSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.Equal, equalSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.Equal, equalSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.Equal, equalSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.Equal, equalSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.Equal, equalSlice[uint64])
+ }
+}
+
+func TestNotEqual(t *testing.T) {
+ testFloat32x4Compare(t, archsimd.Float32x4.NotEqual, notEqualSlice[float32])
+ testFloat32x8Compare(t, archsimd.Float32x8.NotEqual, notEqualSlice[float32])
+ testFloat64x2Compare(t, archsimd.Float64x2.NotEqual, notEqualSlice[float64])
+ testFloat64x4Compare(t, archsimd.Float64x4.NotEqual, notEqualSlice[float64])
+
+ testInt16x16Compare(t, archsimd.Int16x16.NotEqual, notEqualSlice[int16])
+ testInt16x8Compare(t, archsimd.Int16x8.NotEqual, notEqualSlice[int16])
+ testInt32x4Compare(t, archsimd.Int32x4.NotEqual, notEqualSlice[int32])
+ testInt32x8Compare(t, archsimd.Int32x8.NotEqual, notEqualSlice[int32])
+ testInt64x2Compare(t, archsimd.Int64x2.NotEqual, notEqualSlice[int64])
+ testInt64x4Compare(t, archsimd.Int64x4.NotEqual, notEqualSlice[int64])
+ testInt8x16Compare(t, archsimd.Int8x16.NotEqual, notEqualSlice[int8])
+ testInt8x32Compare(t, archsimd.Int8x32.NotEqual, notEqualSlice[int8])
+
+ testUint16x16Compare(t, archsimd.Uint16x16.NotEqual, notEqualSlice[uint16])
+ testUint16x8Compare(t, archsimd.Uint16x8.NotEqual, notEqualSlice[uint16])
+ testUint32x4Compare(t, archsimd.Uint32x4.NotEqual, notEqualSlice[uint32])
+ testUint32x8Compare(t, archsimd.Uint32x8.NotEqual, notEqualSlice[uint32])
+ testUint64x2Compare(t, archsimd.Uint64x2.NotEqual, notEqualSlice[uint64])
+ testUint64x4Compare(t, archsimd.Uint64x4.NotEqual, notEqualSlice[uint64])
+ testUint8x16Compare(t, archsimd.Uint8x16.NotEqual, notEqualSlice[uint8])
+ testUint8x32Compare(t, archsimd.Uint8x32.NotEqual, notEqualSlice[uint8])
+
+ if archsimd.X86.AVX512() {
+ testFloat32x16Compare(t, archsimd.Float32x16.NotEqual, notEqualSlice[float32])
+ testFloat64x8Compare(t, archsimd.Float64x8.NotEqual, notEqualSlice[float64])
+ testInt8x64Compare(t, archsimd.Int8x64.NotEqual, notEqualSlice[int8])
+ testInt16x32Compare(t, archsimd.Int16x32.NotEqual, notEqualSlice[int16])
+ testInt32x16Compare(t, archsimd.Int32x16.NotEqual, notEqualSlice[int32])
+ testInt64x8Compare(t, archsimd.Int64x8.NotEqual, notEqualSlice[int64])
+ testUint8x64Compare(t, archsimd.Uint8x64.NotEqual, notEqualSlice[uint8])
+ testUint16x32Compare(t, archsimd.Uint16x32.NotEqual, notEqualSlice[uint16])
+ testUint32x16Compare(t, archsimd.Uint32x16.NotEqual, notEqualSlice[uint32])
+ testUint64x8Compare(t, archsimd.Uint64x8.NotEqual, notEqualSlice[uint64])
+ }
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing simd methods that compare two operands under a mask.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// testInt8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt8x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int8x16, m archsimd.Mask8x16) archsimd.Mask8x16,
+ want func(_, _ []int8) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ b := archsimd.LoadInt8x16Slice(y)
+ k := archsimd.LoadInt8x16Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
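+
+// A usage sketch for these helpers (illustrative only; the real callers are in
+// the generated masked-compare tests, and "LessMasked" is an assumed method name
+// with the signature these helpers expect):
+//
+//	testInt8x16CompareMasked(t, archsimd.Int8x16.LessMasked,
+//		mapCompare(func(x, y int8) bool { return x < y }))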
+
+// testInt16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt16x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int16x8, m archsimd.Mask16x8) archsimd.Mask16x8,
+ want func(_, _ []int16) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ b := archsimd.LoadInt16x8Slice(y)
+ k := archsimd.LoadInt16x8Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt32x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int32x4, m archsimd.Mask32x4) archsimd.Mask32x4,
+ want func(_, _ []int32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ b := archsimd.LoadInt32x4Slice(y)
+ k := archsimd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt64x2CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int64x2, m archsimd.Mask64x2) archsimd.Mask64x2,
+ want func(_, _ []int64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x2Slice(x)
+ b := archsimd.LoadInt64x2Slice(y)
+ k := archsimd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint8x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint8x16, m archsimd.Mask8x16) archsimd.Mask8x16,
+ want func(_, _ []uint8) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ b := archsimd.LoadUint8x16Slice(y)
+ k := archsimd.LoadInt8x16Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint16x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint16x8, m archsimd.Mask16x8) archsimd.Mask16x8,
+ want func(_, _ []uint16) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ b := archsimd.LoadUint16x8Slice(y)
+ k := archsimd.LoadInt16x8Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint32x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint32x4, m archsimd.Mask32x4) archsimd.Mask32x4,
+ want func(_, _ []uint32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ b := archsimd.LoadUint32x4Slice(y)
+ k := archsimd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint64x2CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint64x2, m archsimd.Mask64x2) archsimd.Mask64x2,
+ want func(_, _ []uint64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x2Slice(x)
+ b := archsimd.LoadUint64x2Slice(y)
+ k := archsimd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat32x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float32x4, m archsimd.Mask32x4) archsimd.Mask32x4,
+ want func(_, _ []float32) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ b := archsimd.LoadFloat32x4Slice(y)
+ k := archsimd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat64x2CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float64x2, m archsimd.Mask64x2) archsimd.Mask64x2,
+ want func(_, _ []float64) []int64) {
+ n := 2
+ t.Helper()
+ forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ b := archsimd.LoadFloat64x2Slice(y)
+ k := archsimd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x2().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt8x32CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int8x32, m archsimd.Mask8x32) archsimd.Mask8x32,
+ want func(_, _ []int8) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ b := archsimd.LoadInt8x32Slice(y)
+ k := archsimd.LoadInt8x32Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x32().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt16x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int16x16, m archsimd.Mask16x16) archsimd.Mask16x16,
+ want func(_, _ []int16) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ b := archsimd.LoadInt16x16Slice(y)
+ k := archsimd.LoadInt16x16Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt32x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int32x8, m archsimd.Mask32x8) archsimd.Mask32x8,
+ want func(_, _ []int32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ b := archsimd.LoadInt32x8Slice(y)
+ k := archsimd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt64x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int64x4, m archsimd.Mask64x4) archsimd.Mask64x4,
+ want func(_, _ []int64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ b := archsimd.LoadInt64x4Slice(y)
+ k := archsimd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint8x32CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint8x32, m archsimd.Mask8x32) archsimd.Mask8x32,
+ want func(_, _ []uint8) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ b := archsimd.LoadUint8x32Slice(y)
+ k := archsimd.LoadInt8x32Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x32().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint16x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint16x16, m archsimd.Mask16x16) archsimd.Mask16x16,
+ want func(_, _ []uint16) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ b := archsimd.LoadUint16x16Slice(y)
+ k := archsimd.LoadInt16x16Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint32x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint32x8, m archsimd.Mask32x8) archsimd.Mask32x8,
+ want func(_, _ []uint32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ b := archsimd.LoadUint32x8Slice(y)
+ k := archsimd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint64x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint64x4, m archsimd.Mask64x4) archsimd.Mask64x4,
+ want func(_, _ []uint64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ b := archsimd.LoadUint64x4Slice(y)
+ k := archsimd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat32x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float32x8, m archsimd.Mask32x8) archsimd.Mask32x8,
+ want func(_, _ []float32) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ b := archsimd.LoadFloat32x8Slice(y)
+ k := archsimd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat64x4CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float64x4, m archsimd.Mask64x4) archsimd.Mask64x4,
+ want func(_, _ []float64) []int64) {
+ n := 4
+ t.Helper()
+ forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ b := archsimd.LoadFloat64x4Slice(y)
+ k := archsimd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x4().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt8x64CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int8x64, m archsimd.Mask8x64) archsimd.Mask8x64,
+ want func(_, _ []int8) []int64) {
+ n := 64
+ t.Helper()
+ forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x64Slice(x)
+ b := archsimd.LoadInt8x64Slice(y)
+ k := archsimd.LoadInt8x64Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x64().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt16x32CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int16x32, m archsimd.Mask16x32) archsimd.Mask16x32,
+ want func(_, _ []int16) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ b := archsimd.LoadInt16x32Slice(y)
+ k := archsimd.LoadInt16x32Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x32().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt32x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int32x16, m archsimd.Mask32x16) archsimd.Mask32x16,
+ want func(_, _ []int32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ b := archsimd.LoadInt32x16Slice(y)
+ k := archsimd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testInt64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testInt64x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Int64x8, m archsimd.Mask64x8) archsimd.Mask64x8,
+ want func(_, _ []int64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ b := archsimd.LoadInt64x8Slice(y)
+ k := archsimd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint8x64CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint8x64, m archsimd.Mask8x64) archsimd.Mask8x64,
+ want func(_, _ []uint8) []int64) {
+ n := 64
+ t.Helper()
+ forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x64Slice(x)
+ b := archsimd.LoadUint8x64Slice(y)
+ k := archsimd.LoadInt8x64Slice(toVect[int8](m)).ToMask()
+ g := make([]int8, n)
+ f(a, b, k).AsInt8x64().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint16x32CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint16x32, m archsimd.Mask16x32) archsimd.Mask16x32,
+ want func(_, _ []uint16) []int64) {
+ n := 32
+ t.Helper()
+ forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ b := archsimd.LoadUint16x32Slice(y)
+ k := archsimd.LoadInt16x32Slice(toVect[int16](m)).ToMask()
+ g := make([]int16, n)
+ f(a, b, k).AsInt16x32().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint32x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint32x16, m archsimd.Mask32x16) archsimd.Mask32x16,
+ want func(_, _ []uint32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ b := archsimd.LoadUint32x16Slice(y)
+ k := archsimd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testUint64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testUint64x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Uint64x8, m archsimd.Mask64x8) archsimd.Mask64x8,
+ want func(_, _ []uint64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ b := archsimd.LoadUint64x8Slice(y)
+ k := archsimd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat32x16CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float32x16, m archsimd.Mask32x16) archsimd.Mask32x16,
+ want func(_, _ []float32) []int64) {
+ n := 16
+ t.Helper()
+ forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ b := archsimd.LoadFloat32x16Slice(y)
+ k := archsimd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
+ g := make([]int32, n)
+ f(a, b, k).AsInt32x16().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
+
+// testFloat64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
+// The mask is applied to the output of want; anything not in the mask is zeroed.
+func testFloat64x8CompareMasked(t *testing.T,
+ f func(_, _ archsimd.Float64x8, m archsimd.Mask64x8) archsimd.Mask64x8,
+ want func(_, _ []float64) []int64) {
+ n := 8
+ t.Helper()
+ forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ b := archsimd.LoadFloat64x8Slice(y)
+ k := archsimd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
+ g := make([]int64, n)
+ f(a, b, k).AsInt64x8().StoreSlice(g)
+ w := want(x, y)
+ for i := range m {
+ if !m[i] {
+ w[i] = 0
+ }
+ }
+ return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
+ })
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package simd
+
+// Invoke code generators.
+
+//go:generate go run -C ../.. genfiles.go
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "math"
+ "simd/archsimd/internal/test_helpers"
+ "testing"
+)
+
+type signed interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64
+}
+
+type integer interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
+}
+
+type float interface {
+ ~float32 | ~float64
+}
+
+type number interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
+}
+
+func checkSlices[T number](t *testing.T, got, want []T) bool {
+ t.Helper()
+ return test_helpers.CheckSlicesLogInput[T](t, got, want, 0.0, nil)
+}
+
+func checkSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
+ t.Helper()
+ return test_helpers.CheckSlicesLogInput[T](t, got, want, flakiness, logInput)
+}
+
+// sliceOf returns a slice of n T's, with each
+// element of the slice initialized to its
+// index + 1.
+func sliceOf[T number](n int) []T {
+ s := make([]T, n)
+ for i := 0; i < n; i++ {
+ s[i] = T(i + 1)
+ }
+ return s
+}
+
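+// toVect expands a bool slice into vector lanes of T: true becomes -1 (all bits
+// set, the way SIMD comparisons report true) and false becomes 0.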
+func toVect[T signed](b []bool) []T {
+ s := make([]T, len(b))
+ for i := range b {
+ if b[i] {
+ s[i] = -1
+ }
+ }
+ return s
+}
+
+// s64 converts a slice of some numeric type into a slice of int64.
+func s64[T number](s []T) []int64 {
+ var is any = s
+ if r, ok := is.([]int64); ok {
+ return r
+ }
+ r := make([]int64, len(s))
+ for i := range s {
+ r[i] = int64(s[i])
+ }
+ return r
+}
+
+// Do implements slice part testing. It repeatedly calls
+// body on smaller and smaller slices and an output slice
+// for the result, then compares the result to its own
+// calculation of what the result should be.
+func Do[T number](t *testing.T, n int, body func(a, c []T)) {
+ a := sliceOf[T](n)
+ b := sliceOf[T](n)
+
+ for i := n; i >= 0; i-- {
+ c := make([]T, n, n)
+ body(a[:i], c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = T(0)
+ }
+ }
+}
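+
+// A minimal sketch of how Do is meant to be driven (the body here is hypothetical,
+// not a caller in this file): body copies a shrinking prefix of a into c, and Do
+// checks that the untouched tail of c stays zero.
+//
+//	Do[int32](t, 8, func(a, c []int32) {
+//		copy(c, a) // stands in for a partial vector load/store round trip
+//	})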
+
+// map3 returns a function that returns the slice of the results of applying
+// input parameter elem to the respective elements of its 3 slice inputs.
+func map3[T, U any](elem func(x, y, z T) U) func(x, y, z []T) []U {
+ return func(x, y, z []T) []U {
+ s := make([]U, len(x))
+ for i := range s {
+ s[i] = elem(x[i], y[i], z[i])
+ }
+ return s
+ }
+}
+
+// map2 returns a function that returns the slice of the results of applying
+// input parameter elem to the respective elements of its 2 slice inputs.
+func map2[T, U any](elem func(x, y T) U) func(x, y []T) []U {
+ return func(x, y []T) []U {
+ s := make([]U, len(x))
+ for i := range s {
+ s[i] = elem(x[i], y[i])
+ }
+ return s
+ }
+}
+
+// map1 returns a function that returns the slice of the results of applying
+// input parameter elem to the respective elements of its single slice input.
+func map1[T, U any](elem func(x T) U) func(x []T) []U {
+ return func(x []T) []U {
+ s := make([]U, len(x))
+ for i := range s {
+ s[i] = elem(x[i])
+ }
+ return s
+ }
+}
+
+// mapCompare returns a function that returns the slice of the results of applying
+// comparison function elem to the respective elements of its two slice inputs.
+func mapCompare[T number](elem func(x, y T) bool) func(x, y []T) []int64 {
+ return func(x, y []T) []int64 {
+ s := make([]int64, len(x))
+ for i := range s {
+ if elem(x[i], y[i]) {
+ s[i] = -1
+ }
+ }
+ return s
+ }
+}
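+
+// A sketch of the shape of the "want" functions the generated compare tests pass
+// in (greaterSlice, lessEqualSlice, and so on are defined elsewhere; this only
+// illustrates how mapCompare produces that shape):
+//
+//	var lessThanInt32 = mapCompare(func(x, y int32) bool { return x < y })
+//	// lessThanInt32([]int32{1, 5}, []int32{2, 3}) == []int64{-1, 0}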
+
+// nOf returns a slice of at least n elements taken from the input slice s,
+// cycling through s as needed; if s already has n or more elements, s itself is returned.
+func nOf[T any](n int, s []T) []T {
+ if len(s) >= n {
+ return s
+ }
+ r := make([]T, n)
+ for i := range r {
+ r[i] = s[i%len(s)]
+ }
+ return r
+}
+
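+// The PN* constants are exact negative powers of two (PN22 = 2^-22, PN24 = 2^-24,
+// PN53 = 2^-53); F0, F1, Aeasy, and Ahard are float32 values built from them to
+// probe rounding edge cases (see the comment on Ahard).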
+const (
+ PN22 = 1.0 / 1024 / 1024 / 4
+ PN24 = 1.0 / 1024 / 1024 / 16
+ PN53 = PN24 * PN24 / 32
+ F0 = float32(1.0 + 513*PN22/2)
+ F1 = float32(1.0 + 511*PN22*8)
+ Aeasy = float32(2046 * PN53)
+ Ahard = float32(2047 * PN53) // 2047 provokes a 2-rounding in 64-bit FMA rounded to 32-bit
+)
+
+var zero = 0.0
+var nzero = -zero
+var inf = 1 / zero
+var ninf = -1 / zero
+var nan = math.NaN()
+
+// N controls how large the test vectors are
+const N = 144
+
+var float32s = nOf(N, []float32{float32(inf), float32(ninf), 1, float32(nan), float32(zero), 2, float32(nan), float32(zero), 3, float32(-zero), float32(1.0 / zero), float32(-1.0 / zero), 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 1000, 1.0 / 1000000, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat32, 1 / math.MaxFloat32, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -160, -3200, -64, -4, -8, -16, -32, -64})
+var float64s = nOf(N, []float64{inf, ninf, nan, zero, -zero, 1 / zero, -1 / zero, 0.0001, 0.0000001, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat64, 1.0 / math.MaxFloat64, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -16, -32, -64})
+
+var int32s = nOf(N, []int32{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64})
+var uint32s = nOf(N, []uint32{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint32(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint32(0x55555), ^uint32(0x77777), ^uint32(0xccccc)})
+
+var int64s = nOf(N, []int64{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64})
+var uint64s = nOf(N, []uint64{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint64(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint64(0x55555), ^uint64(0x77777), ^uint64(0xccccc)})
+
+var int16s = nOf(N, []int16{1, -1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, -32767, -32768, -11111, -4, -8, -16, -32, -64})
+var uint16s = nOf(N, []uint16{1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, 32768, 65535, 45678, 56789})
+
+var int8s = nOf(N, []int8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, -1, -2, -3, -5, -7, -11, -77, -121, -127, -128, 4, 8, 16, 32, 64, -4, -8, -16, -32, -64})
+var uint8s = nOf(N, []uint8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, 128, 255, 233, 211, 177, 144, 4, 8, 16, 32, 64})
+
+var bools = nOf(N, []bool{
+ true, false, true, true, false, false, true, true, true, false, false, false, true, true, true, true, false, false, false, false})
+
+func forSlice[T number](t *testing.T, s []T, n int, f func(a []T) bool) {
+ t.Helper()
+ for i := 0; i < len(s)-n; i++ {
+ if !f(s[i : i+n]) {
+ return
+ }
+ }
+}
+
+func forSlicePair[T number](t *testing.T, s []T, n int, f func(a, b []T) bool) {
+ t.Helper()
+ for i := 0; i < len(s)-n; i++ {
+ for j := 0; j < len(s)-n; j++ {
+ if !f(s[i:i+n], s[j:j+n]) {
+ return
+ }
+ }
+ }
+}
+
+func forSliceTriple[T number](t *testing.T, s []T, n int, f func(a, b, c []T) bool) {
+ t.Helper()
+ for i := 0; i < len(s)-n; i += 3 {
+ for j := 0; j < len(s)-n; j += 3 {
+ for k := 0; k < len(s)-n; k += 3 {
+ if !f(s[i:i+n], s[j:j+n], s[k:k+n]) {
+ return
+ }
+ }
+ }
+ }
+}
+
+func forSlicePairMasked[T number](t *testing.T, s []T, n int, f func(a, b []T, m []bool) bool) {
+ t.Helper()
+ m := bools
+	// Step the slice pairs and the mask forward more quickly; otherwise this test is very slow.
+ for i := 0; i < len(s)-n; i += 3 {
+ for j := 0; j < len(s)-n; j += 3 {
+ for k := 0; k < len(m)-n; k += 3 {
+ if !f(s[i:i+n], s[j:j+n], m[k:k+n]) {
+ return
+ }
+ }
+ }
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "reflect"
+ "simd/archsimd"
+ "slices"
+ "testing"
+)
+
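+// sink is a package-level destination for values the tests want to keep live;
+// assigning to it prevents the compiler from optimizing those values away.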
+var sink any
+
+func TestType(t *testing.T) {
+ // Testing:
+ // - Defined as another struct's field is ok
+ // - Pointer is ok
+	// - Type definition is ok
+ // - Type alias is ok
+ // - Type conversion is ok
+ // - Conversion to interface is ok
+ type alias = archsimd.Int32x4
+ type maskT archsimd.Mask32x4
+ type myStruct struct {
+ x alias
+ y *archsimd.Int32x4
+ z maskT
+ }
+ vals := [4]int32{1, 2, 3, 4}
+ v := myStruct{x: archsimd.LoadInt32x4(&vals)}
+	// The mask 0b0011 keeps the first two elements; the rest of the result is zeroed.
+ want := []int32{2, 4, 0, 0}
+ y := archsimd.LoadInt32x4(&vals)
+ v.y = &y
+ sink = y
+
+ if !archsimd.X86.AVX512GFNI() {
+		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
+ return
+ }
+ v.z = maskT(archsimd.Mask32x4FromBits(0b0011))
+ *v.y = v.y.Add(v.x).Masked(archsimd.Mask32x4(v.z))
+
+ got := [4]int32{}
+ v.y.Store(&got)
+ checkSlices(t, got[:], want)
+}
+
+func TestUncomparable(t *testing.T) {
+ // Test that simd vectors are not comparable
+ var x, y any = archsimd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), archsimd.LoadUint32x4(&[4]uint32{5, 6, 7, 8})
+ shouldPanic := func(fn func()) {
+ defer func() {
+ if recover() == nil {
+ panic("did not panic")
+ }
+ }()
+ fn()
+ }
+ shouldPanic(func() { _ = x == y })
+}
+
+func TestFuncValue(t *testing.T) {
+	// Test that a simd intrinsic can be used as a function value.
+ xv := [4]int32{1, 2, 3, 4}
+ yv := [4]int32{5, 6, 7, 8}
+ want := []int32{6, 8, 10, 12}
+ x := archsimd.LoadInt32x4(&xv)
+ y := archsimd.LoadInt32x4(&yv)
+ fn := archsimd.Int32x4.Add
+ sink = fn
+ x = fn(x, y)
+ got := [4]int32{}
+ x.Store(&got)
+ checkSlices(t, got[:], want)
+}
+
+func TestReflectMethod(t *testing.T) {
+	// Test that a simd intrinsic can be accessed via reflection.
+ // NOTE: we don't yet support reflect method.Call.
+ xv := [4]int32{1, 2, 3, 4}
+ yv := [4]int32{5, 6, 7, 8}
+ want := []int32{6, 8, 10, 12}
+ x := archsimd.LoadInt32x4(&xv)
+ y := archsimd.LoadInt32x4(&yv)
+ m, ok := reflect.TypeOf(x).MethodByName("Add")
+ if !ok {
+ t.Fatal("Add method not found")
+ }
+ fn := m.Func.Interface().(func(x, y archsimd.Int32x4) archsimd.Int32x4)
+ x = fn(x, y)
+ got := [4]int32{}
+ x.Store(&got)
+ checkSlices(t, got[:], want)
+}
+
+func TestVectorConversion(t *testing.T) {
+ if !archsimd.X86.AVX512GFNI() {
+		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
+ return
+ }
+ xv := [4]int32{1, 2, 3, 4}
+ x := archsimd.LoadInt32x4(&xv)
+ xPromoted := x.AsInt64x2()
+ xPromotedDemoted := xPromoted.AsInt32x4()
+ got := [4]int32{}
+ xPromotedDemoted.Store(&got)
+ for i := range 4 {
+ if xv[i] != got[i] {
+ t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i])
+ }
+ }
+}
+
+func TestMaskConversion(t *testing.T) {
+ if !archsimd.X86.AVX512GFNI() {
+		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
+ return
+ }
+ x := archsimd.LoadInt32x4Slice([]int32{5, 0, 7, 0})
+ mask := archsimd.Int32x4{}.Sub(x).ToMask()
+ y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask)
+ want := [4]int32{6, 0, 10, 0}
+ got := make([]int32, 4)
+ y.StoreSlice(got)
+ checkSlices(t, got[:], want[:])
+}
+
+func TestPermute(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
+ indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
+ want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
+ got := make([]int64, 8)
+ archsimd.LoadInt64x8Slice(x).Permute(archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteOrZero(t *testing.T) {
+ x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
+ want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
+ got := make([]uint8, len(x))
+ archsimd.LoadUint8x16Slice(x).PermuteOrZero(archsimd.LoadInt8x16Slice(indices)).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestConcatPermute(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
+ y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
+ indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
+ want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
+ got := make([]int64, 8)
+ archsimd.LoadInt64x8Slice(x).ConcatPermute(archsimd.LoadInt64x8Slice(y), archsimd.LoadUint64x8Slice(indices)).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestCompress(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ v1234 := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
+ v2400 := v1234.Compress(archsimd.Mask32x4FromBits(0b1010))
+ got := make([]int32, 4)
+ v2400.StoreSlice(got)
+ want := []int32{2, 4, 0, 0}
+ if !slices.Equal(got, want) {
+ t.Errorf("want and got differ, want=%v, got=%v", want, got)
+ }
+}
+
+func TestExpand(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ v3400 := archsimd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
+ v2400 := v3400.Expand(archsimd.Mask32x4FromBits(0b1010))
+ got := make([]int32, 4)
+ v2400.StoreSlice(got)
+ want := []int32{0, 3, 0, 4}
+ if !slices.Equal(got, want) {
+ t.Errorf("want and got differ, want=%v, got=%v", want, got)
+ }
+}
+
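+// testShiftAllVal is a package-level variable so that TestShiftAll also exercises
+// ShiftAllLeft with a shift count that is not a compile-time constant.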
+var testShiftAllVal uint64 = 3
+
+func TestShiftAll(t *testing.T) {
+ got := make([]int32, 4)
+ archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got)
+ for _, v := range got {
+ if v != 0b1100 {
+ t.Errorf("expect 0b1100, got %b", v)
+ }
+ }
+ archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got)
+ for _, v := range got {
+ if v != 0b11000 {
+ t.Errorf("expect 0b11000, got %b", v)
+ }
+ }
+}
+
+func TestSlicesInt8(t *testing.T) {
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ v := archsimd.LoadInt8x32Slice(a)
+ b := make([]int8, 32, 32)
+ v.StoreSlice(b)
+ checkSlices(t, a, b)
+}
+
+func TestSlicesInt8SetElem(t *testing.T) {
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ v := archsimd.LoadInt8x16Slice(a)
+
+ v = v.SetElem(3, 13)
+ a[3] = 13
+
+ b := make([]int8, 16, 16)
+ v.StoreSlice(b)
+ checkSlices(t, a, b)
+}
+
+func TestSlicesInt8GetElem(t *testing.T) {
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ v := archsimd.LoadInt8x16Slice(a)
+ e := v.GetElem(2)
+ if e != a[2] {
+ t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2])
+ }
+}
+
+func TestSlicesInt8TooShortLoad(t *testing.T) {
+ defer func() {
+ if r := recover(); r != nil {
+ t.Logf("Saw EXPECTED panic %v", r)
+ } else {
+ t.Errorf("Did not see expected panic")
+ }
+ }()
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
+ v := archsimd.LoadInt8x32Slice(a)
+ b := make([]int8, 32, 32)
+ v.StoreSlice(b)
+ checkSlices(t, a, b)
+}
+
+func TestSlicesInt8TooShortStore(t *testing.T) {
+ defer func() {
+ if r := recover(); r != nil {
+ t.Logf("Saw EXPECTED panic %v", r)
+ } else {
+ t.Errorf("Did not see expected panic")
+ }
+ }()
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ v := archsimd.LoadInt8x32Slice(a)
+ b := make([]int8, 31) // TOO SHORT, should panic
+ v.StoreSlice(b)
+ checkSlices(t, a, b)
+}
+
+func TestSlicesFloat64(t *testing.T) {
+ a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
+ v := archsimd.LoadFloat64x4Slice(a)
+ b := make([]float64, 4, 4)
+ v.StoreSlice(b)
+ for i := range b {
+ if a[i] != b[i] {
+ t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
+ }
+ }
+}
+
+// TODO: try to reduce this test to be smaller.
+func TestMergeLocals(t *testing.T) {
+ testMergeLocalswrapper(t, archsimd.Int64x4.Add)
+}
+
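+// forceSpill is deliberately not inlined; calling it between the vector loads and
+// the operation forces live SIMD values in the caller to be spilled and reloaded.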
+//go:noinline
+func forceSpill() {}
+
+func testMergeLocalswrapper(t *testing.T, op func(archsimd.Int64x4, archsimd.Int64x4) archsimd.Int64x4) {
+ t.Helper()
+ s0 := []int64{0, 1, 2, 3}
+ s1 := []int64{-1, 0, -1, 0}
+ want := []int64{-1, 1, 1, 3}
+ v := archsimd.LoadInt64x4Slice(s0)
+ m := archsimd.LoadInt64x4Slice(s1)
+ forceSpill()
+ got := make([]int64, 4)
+ gotv := op(v, m)
+ gotv.StoreSlice(got)
+ for i := range len(want) {
+ if !(got[i] == want[i]) {
+ t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
+ }
+ }
+}
+
+func TestBitMaskFromBits(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ results := [2]int64{}
+ want := [2]int64{0, 6}
+ m := archsimd.Mask64x2FromBits(0b10)
+ archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
+ for i := range 2 {
+ if results[i] != want[i] {
+ t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+ }
+ }
+}
+
+var maskForTestBitMaskFromBitsLoad = uint8(0b10)
+
+func TestBitMaskFromBitsLoad(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ results := [2]int64{}
+ want := [2]int64{0, 6}
+ m := archsimd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
+ archsimd.LoadInt64x2Slice([]int64{1, 2}).Add(archsimd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
+ for i := range 2 {
+ if results[i] != want[i] {
+ t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
+ }
+ }
+}
+
+func TestBitMaskToBits(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ if v := archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
+ t.Errorf("Want 0b101, got %b", v)
+ }
+}
+
+var maskForTestBitMaskFromBitsStore uint8
+
+func TestBitMaskToBitsStore(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ maskForTestBitMaskFromBitsStore = archsimd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
+ if maskForTestBitMaskFromBitsStore != 0b101 {
+ t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
+ }
+}
+
+func TestMergeFloat(t *testing.T) {
+ k := make([]int64, 4, 4)
+ s := make([]float64, 4, 4)
+
+ a := archsimd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
+ b := archsimd.LoadFloat64x4Slice([]float64{4, 2, 3, 1})
+ g := a.Greater(b)
+ g.AsInt64x4().StoreSlice(k)
+ c := a.Merge(b, g)
+
+ c.StoreSlice(s)
+
+ checkSlices[int64](t, k, []int64{0, 0, 0, -1})
+ checkSlices[float64](t, s, []float64{4, 2, 3, 4})
+}
+
+func TestMergeFloat512(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+
+ k := make([]int64, 8, 8)
+ s := make([]float64, 8, 8)
+
+ a := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
+ b := archsimd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1})
+ g := a.Greater(b)
+ g.AsInt64x8().StoreSlice(k)
+ c := a.Merge(b, g)
+ d := a.Masked(g)
+
+ checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
+
+ c.StoreSlice(s)
+ checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
+
+ d.StoreSlice(s)
+ checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
+}
+
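+// ro is a package-level variable so that RotateAllLeft is exercised with a rotate
+// count that is not a compile-time constant.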
+var ro uint8 = 2
+
+func TestRotateAllVariable(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ got := make([]int32, 4)
+ archsimd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got)
+ for _, v := range got {
+ if v != 0b1100 {
+ t.Errorf("Want 0b1100, got %b", v)
+ }
+ }
+}
+
+func TestBroadcastUint32x4(t *testing.T) {
+ s := make([]uint32, 4, 4)
+ archsimd.BroadcastUint32x4(123456789).StoreSlice(s)
+ checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
+}
+
+func TestBroadcastFloat32x8(t *testing.T) {
+ s := make([]float32, 8, 8)
+ archsimd.BroadcastFloat32x8(123456789).StoreSlice(s)
+ checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
+}
+
+func TestBroadcastFloat64x2(t *testing.T) {
+ s := make([]float64, 2, 2)
+ archsimd.BroadcastFloat64x2(123456789).StoreSlice(s)
+ checkSlices(t, s, []float64{123456789, 123456789})
+}
+
+func TestBroadcastUint64x2(t *testing.T) {
+ s := make([]uint64, 2, 2)
+ archsimd.BroadcastUint64x2(123456789).StoreSlice(s)
+ checkSlices(t, s, []uint64{123456789, 123456789})
+}
+
+func TestBroadcastUint16x8(t *testing.T) {
+ s := make([]uint16, 8, 8)
+ archsimd.BroadcastUint16x8(12345).StoreSlice(s)
+	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
+}
+
+func TestBroadcastInt8x32(t *testing.T) {
+ s := make([]int8, 32, 32)
+ archsimd.BroadcastInt8x32(-123).StoreSlice(s)
+ checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ -123, -123, -123, -123, -123, -123, -123, -123,
+ })
+}
+
+func TestMaskOpt512(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+
+ k := make([]int64, 8, 8)
+ s := make([]float64, 8, 8)
+
+ a := archsimd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
+ b := archsimd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
+ c := archsimd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
+ d := archsimd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
+ g := a.Greater(b)
+ e := c.Add(d).Masked(g)
+ e.StoreSlice(s)
+ g.AsInt64x8().StoreSlice(k)
+ checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
+ checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
+}
+
+// flattenedTranspose transposes x and y, regarded as a pair of 2x2
+// matrices, but then flattens the rows in order, i.e.,
+// x: ABCD ==> a: A1B2
+// y: 1234 b: C3D4
+func flattenedTranspose(x, y archsimd.Int32x4) (a, b archsimd.Int32x4) {
+ return x.InterleaveLo(y), x.InterleaveHi(y)
+}
+
+func TestFlattenedTranspose(t *testing.T) {
+ r := make([]int32, 4, 4)
+ s := make([]int32, 4, 4)
+
+ x := archsimd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
+ y := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
+ a, b := flattenedTranspose(x, y)
+
+ a.StoreSlice(r)
+ b.StoreSlice(s)
+
+ checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
+ checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
+}
+
+func TestClearAVXUpperBits(t *testing.T) {
+ // Test that ClearAVXUpperBits is safe even if there are SIMD values
+ // alive (although usually one should not do this).
+ if !archsimd.X86.AVX2() {
+ t.Skip("Test requires X86.AVX2, not available on this hardware")
+ return
+ }
+
+ r := make([]int64, 4)
+ s := make([]int64, 4)
+
+ x := archsimd.LoadInt64x4Slice([]int64{10, 20, 30, 40})
+ y := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
+
+ x.Add(y).StoreSlice(r)
+ archsimd.ClearAVXUpperBits()
+ x.Sub(y).StoreSlice(s)
+
+ checkSlices[int64](t, r, []int64{11, 22, 33, 44})
+ checkSlices[int64](t, s, []int64{9, 18, 27, 36})
+}
+
+func TestLeadingZeros(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+
+ src := []uint64{0b1111, 0}
+ want := []uint64{60, 64}
+ got := make([]uint64, 2)
+ archsimd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
+ for i := range 2 {
+ if want[i] != got[i] {
+ t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
+ }
+ }
+}
+
+func TestIsZero(t *testing.T) {
+ v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
+ v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
+ if v1.IsZero() {
+ t.Errorf("Result incorrect, want false, got true")
+ }
+ if !v2.IsZero() {
+ t.Errorf("Result incorrect, want true, got false")
+ }
+ if !v1.And(v2).IsZero() {
+ t.Errorf("Result incorrect, want true, got false")
+ }
+ if v1.AndNot(v2).IsZero() {
+ t.Errorf("Result incorrect, want false, got true")
+ }
+ if !v2.And(v1).IsZero() {
+ t.Errorf("Result incorrect, want true, got false")
+ }
+ if !v2.AndNot(v1).IsZero() {
+ t.Errorf("Result incorrect, want true, got false")
+ }
+}
+
+func TestSelect4FromPairConst(t *testing.T) {
+ x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
+ y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
+
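+	// In the result names, l means the lane is taken from x (indices 0-3)
+	// and h means it is taken from y (indices 4-7).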
+ llll := x.SelectFromPair(0, 1, 2, 3, y)
+ hhhh := x.SelectFromPair(4, 5, 6, 7, y)
+ llhh := x.SelectFromPair(0, 1, 6, 7, y)
+ hhll := x.SelectFromPair(6, 7, 0, 1, y)
+
+ lllh := x.SelectFromPair(0, 1, 2, 7, y)
+ llhl := x.SelectFromPair(0, 1, 7, 2, y)
+ lhll := x.SelectFromPair(0, 7, 1, 2, y)
+ hlll := x.SelectFromPair(7, 0, 1, 2, y)
+
+ hhhl := x.SelectFromPair(4, 5, 6, 0, y)
+ hhlh := x.SelectFromPair(4, 5, 0, 6, y)
+ hlhh := x.SelectFromPair(4, 0, 5, 6, y)
+ lhhh := x.SelectFromPair(0, 4, 5, 6, y)
+
+ lhlh := x.SelectFromPair(0, 4, 1, 5, y)
+ hlhl := x.SelectFromPair(4, 0, 5, 1, y)
+ lhhl := x.SelectFromPair(0, 4, 5, 1, y)
+ hllh := x.SelectFromPair(4, 0, 1, 5, y)
+
+ r := make([]int32, 4, 4)
+
+ foo := func(v archsimd.Int32x4, a, b, c, d int32) {
+ v.StoreSlice(r)
+ checkSlices[int32](t, r, []int32{a, b, c, d})
+ }
+
+ foo(llll, 0, 1, 2, 3)
+ foo(hhhh, 4, 5, 6, 7)
+ foo(llhh, 0, 1, 6, 7)
+ foo(hhll, 6, 7, 0, 1)
+
+ foo(lllh, 0, 1, 2, 7)
+ foo(llhl, 0, 1, 7, 2)
+ foo(lhll, 0, 7, 1, 2)
+ foo(hlll, 7, 0, 1, 2)
+
+ foo(hhhl, 4, 5, 6, 0)
+ foo(hhlh, 4, 5, 0, 6)
+ foo(hlhh, 4, 0, 5, 6)
+ foo(lhhh, 0, 4, 5, 6)
+
+ foo(lhlh, 0, 4, 1, 5)
+ foo(hlhl, 4, 0, 5, 1)
+ foo(lhhl, 0, 4, 5, 1)
+ foo(hllh, 4, 0, 1, 5)
+}
+
+//go:noinline
+func selectFromPairInt32x4(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
+ return x.SelectFromPair(a, b, c, d, y)
+}
+
+func TestSelect4FromPairVar(t *testing.T) {
+ x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
+ y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
+
+ llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
+ hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
+ llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
+ hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
+
+ lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
+ llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
+ lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
+ hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
+
+ hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
+ hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
+ hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
+ lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
+
+ lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
+ hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
+ lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
+ hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
+
+ r := make([]int32, 4, 4)
+
+ foo := func(v archsimd.Int32x4, a, b, c, d int32) {
+ v.StoreSlice(r)
+ checkSlices[int32](t, r, []int32{a, b, c, d})
+ }
+
+ foo(llll, 0, 1, 2, 3)
+ foo(hhhh, 4, 5, 6, 7)
+ foo(llhh, 0, 1, 6, 7)
+ foo(hhll, 6, 7, 0, 1)
+
+ foo(lllh, 0, 1, 2, 7)
+ foo(llhl, 0, 1, 7, 2)
+ foo(lhll, 0, 7, 1, 2)
+ foo(hlll, 7, 0, 1, 2)
+
+ foo(hhhl, 4, 5, 6, 0)
+ foo(hhlh, 4, 5, 0, 6)
+ foo(hlhh, 4, 0, 5, 6)
+ foo(lhhh, 0, 4, 5, 6)
+
+ foo(lhlh, 0, 4, 1, 5)
+ foo(hlhl, 4, 0, 5, 1)
+ foo(lhhl, 0, 4, 5, 1)
+ foo(hllh, 4, 0, 1, 5)
+}
+
+func TestSelect4FromPairConstGrouped(t *testing.T) {
+ x := archsimd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
+ y := archsimd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
+
+ llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
+ hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
+ llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
+ hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+
+ lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
+ llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
+ lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
+ hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+
+ hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
+ hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
+ hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
+ lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+
+ lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
+ hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
+ lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
+ hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+
+ r := make([]float32, 8, 8)
+
+ foo := func(v archsimd.Float32x8, a, b, c, d float32) {
+ v.StoreSlice(r)
+ checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
+ }
+
+ foo(llll, 0, 1, 2, 3)
+ foo(hhhh, 4, 5, 6, 7)
+ foo(llhh, 0, 1, 6, 7)
+ foo(hhll, 6, 7, 0, 1)
+
+ foo(lllh, 0, 1, 2, 7)
+ foo(llhl, 0, 1, 7, 2)
+ foo(lhll, 0, 7, 1, 2)
+ foo(hlll, 7, 0, 1, 2)
+
+ foo(hhhl, 4, 5, 6, 0)
+ foo(hhlh, 4, 5, 0, 6)
+ foo(hlhh, 4, 0, 5, 6)
+ foo(lhhh, 0, 4, 5, 6)
+
+ foo(lhlh, 0, 4, 1, 5)
+ foo(hlhl, 4, 0, 5, 1)
+ foo(lhhl, 0, 4, 5, 1)
+ foo(hllh, 4, 0, 1, 5)
+}
+
+func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+ x := archsimd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
+ y := archsimd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
+
+ llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
+ hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
+ llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
+ hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
+
+ lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
+ llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
+ lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
+ hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
+
+ hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
+ hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
+ hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
+ lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
+
+ lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
+ hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
+ lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
+ hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
+
+ r := make([]uint32, 16, 16)
+
+ foo := func(v archsimd.Uint32x16, a, b, c, d uint32) {
+ v.StoreSlice(r)
+ checkSlices[uint32](t, r, []uint32{a, b, c, d,
+ 10 + a, 10 + b, 10 + c, 10 + d,
+ 20 + a, 20 + b, 20 + c, 20 + d,
+ 30 + a, 30 + b, 30 + c, 30 + d,
+ })
+ }
+
+ foo(llll, 0, 1, 2, 3)
+ foo(hhhh, 4, 5, 6, 7)
+ foo(llhh, 0, 1, 6, 7)
+ foo(hhll, 6, 7, 0, 1)
+
+ foo(lllh, 0, 1, 2, 7)
+ foo(llhl, 0, 1, 7, 2)
+ foo(lhll, 0, 7, 1, 2)
+ foo(hlll, 7, 0, 1, 2)
+
+ foo(hhhl, 4, 5, 6, 0)
+ foo(hhlh, 4, 5, 0, 6)
+ foo(hlhh, 4, 0, 5, 6)
+ foo(lhhh, 0, 4, 5, 6)
+
+ foo(lhlh, 0, 4, 1, 5)
+ foo(hlhl, 4, 0, 5, 1)
+ foo(lhhl, 0, 4, 5, 1)
+ foo(hllh, 4, 0, 1, 5)
+}
+
+func TestSelect128FromPair(t *testing.T) {
+ x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+ y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
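+	// Selector 0 or 1 picks the low or high 128-bit half of x; 2 or 3 picks
+	// the low or high half of y.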
+ aa := x.Select128FromPair(0, 0, y)
+ ab := x.Select128FromPair(0, 1, y)
+ bc := x.Select128FromPair(1, 2, y)
+ cd := x.Select128FromPair(2, 3, y)
+ da := x.Select128FromPair(3, 0, y)
+ dc := x.Select128FromPair(3, 2, y)
+
+ r := make([]uint64, 4, 4)
+
+ foo := func(v archsimd.Uint64x4, a, b uint64) {
+ a, b = 2*a, 2*b
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
+ }
+
+ foo(aa, 0, 0)
+ foo(ab, 0, 1)
+ foo(bc, 1, 2)
+ foo(cd, 2, 3)
+ foo(da, 3, 0)
+ foo(dc, 3, 2)
+}
+
+func TestSelect128FromPairError(t *testing.T) {
+ x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+ y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
+ defer func() {
+ if r := recover(); r != nil {
+ t.Logf("Saw expected panic %v", r)
+ }
+ }()
+ _ = x.Select128FromPair(0, 4, y)
+
+ t.Errorf("Should have panicked")
+}
+
+//go:noinline
+func select128FromPair(x archsimd.Uint64x4, lo, hi uint8, y archsimd.Uint64x4) archsimd.Uint64x4 {
+ return x.Select128FromPair(lo, hi, y)
+}
+
+func TestSelect128FromPairVar(t *testing.T) {
+ x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
+ y := archsimd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
+
+ aa := select128FromPair(x, 0, 0, y)
+ ab := select128FromPair(x, 0, 1, y)
+ bc := select128FromPair(x, 1, 2, y)
+ cd := select128FromPair(x, 2, 3, y)
+ da := select128FromPair(x, 3, 0, y)
+ dc := select128FromPair(x, 3, 2, y)
+
+ r := make([]uint64, 4, 4)
+
+ foo := func(v archsimd.Uint64x4, a, b uint64) {
+ a, b = 2*a, 2*b
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
+ }
+
+ foo(aa, 0, 0)
+ foo(ab, 0, 1)
+ foo(bc, 1, 2)
+ foo(cd, 2, 3)
+ foo(da, 3, 0)
+ foo(dc, 3, 2)
+}
+
+func TestSelect2FromPairConst(t *testing.T) {
+ x := archsimd.LoadUint64x2Slice([]uint64{0, 1})
+ y := archsimd.LoadUint64x2Slice([]uint64{2, 3})
+
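+	// Indices 0-1 select from x, indices 2-3 select from y.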
+ ll := x.SelectFromPair(0, 1, y)
+ hh := x.SelectFromPair(3, 2, y)
+ lh := x.SelectFromPair(0, 3, y)
+ hl := x.SelectFromPair(2, 1, y)
+
+ r := make([]uint64, 2, 2)
+
+ foo := func(v archsimd.Uint64x2, a, b uint64) {
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, []uint64{a, b})
+ }
+
+ foo(ll, 0, 1)
+ foo(hh, 3, 2)
+ foo(lh, 0, 3)
+ foo(hl, 2, 1)
+}
+
+func TestSelect2FromPairConstGroupedUint(t *testing.T) {
+ x := archsimd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
+ y := archsimd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})
+
+ ll := x.SelectFromPairGrouped(0, 1, y)
+ hh := x.SelectFromPairGrouped(3, 2, y)
+ lh := x.SelectFromPairGrouped(0, 3, y)
+ hl := x.SelectFromPairGrouped(2, 1, y)
+
+ r := make([]uint64, 4, 4)
+
+ foo := func(v archsimd.Uint64x4, a, b uint64) {
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
+ }
+
+ foo(ll, 0, 1)
+ foo(hh, 3, 2)
+ foo(lh, 0, 3)
+ foo(hl, 2, 1)
+}
+
+func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
+ x := archsimd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
+ y := archsimd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})
+
+ ll := x.SelectFromPairGrouped(0, 1, y)
+ hh := x.SelectFromPairGrouped(3, 2, y)
+ lh := x.SelectFromPairGrouped(0, 3, y)
+ hl := x.SelectFromPairGrouped(2, 1, y)
+
+ r := make([]float64, 4, 4)
+
+ foo := func(v archsimd.Float64x4, a, b float64) {
+ v.StoreSlice(r)
+ checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
+ }
+
+ foo(ll, 0, 1)
+ foo(hh, 3, 2)
+ foo(lh, 0, 3)
+ foo(hl, 2, 1)
+}
+
+func TestSelect2FromPairConstGroupedInt(t *testing.T) {
+ x := archsimd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
+ y := archsimd.LoadInt64x4Slice([]int64{2, 3, 12, 13})
+
+ ll := x.SelectFromPairGrouped(0, 1, y)
+ hh := x.SelectFromPairGrouped(3, 2, y)
+ lh := x.SelectFromPairGrouped(0, 3, y)
+ hl := x.SelectFromPairGrouped(2, 1, y)
+
+ r := make([]int64, 4, 4)
+
+ foo := func(v archsimd.Int64x4, a, b int64) {
+ v.StoreSlice(r)
+ checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
+ }
+
+ foo(ll, 0, 1)
+ foo(hh, 3, 2)
+ foo(lh, 0, 3)
+ foo(hl, 2, 1)
+}
+
+func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+
+ x := archsimd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
+ y := archsimd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
+
+ ll := x.SelectFromPairGrouped(0, 1, y)
+ hh := x.SelectFromPairGrouped(3, 2, y)
+ lh := x.SelectFromPairGrouped(0, 3, y)
+ hl := x.SelectFromPairGrouped(2, 1, y)
+
+ r := make([]int64, 8, 8)
+
+ foo := func(v archsimd.Int64x8, a, b int64) {
+ v.StoreSlice(r)
+ checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
+ }
+
+ foo(ll, 0, 1)
+ foo(hh, 3, 2)
+ foo(lh, 0, 3)
+ foo(hl, 2, 1)
+}
+
+func TestString(t *testing.T) {
+ x := archsimd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
+ y := archsimd.LoadInt64x4Slice([]int64{-4, -5, -6, -7})
+ z := archsimd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
+ w := archsimd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9})
+
+ sx := "{0,1,2,3}"
+ sy := "{-4,-5,-6,-7}"
+ sz := "{0.5,1.5,-2.5,3.5e+09}"
+ sw := sz
+
+ if x.String() != sx {
+ t.Errorf("x=%s wanted %s", x, sx)
+ }
+ if y.String() != sy {
+ t.Errorf("y=%s wanted %s", y, sy)
+ }
+ if z.String() != sz {
+ t.Errorf("z=%s wanted %s", z, sz)
+ }
+ if w.String() != sw {
+ t.Errorf("w=%s wanted %s", w, sw)
+ }
+ t.Logf("w=%s", w)
+ t.Logf("x=%s", x)
+ t.Logf("y=%s", y)
+ t.Logf("z=%s", z)
+}
+
+// a returns a zeroed slice of 16 int32.
+func a() []int32 {
+ return make([]int32, 16, 16)
+}
+
+// applyTo3 returns a 16-element slice of the results of
+// applying f to the respective elements of vectors x, y, and z.
+func applyTo3(x, y, z archsimd.Int32x16, f func(x, y, z int32) int32) []int32 {
+ ax, ay, az := a(), a(), a()
+ x.StoreSlice(ax)
+ y.StoreSlice(ay)
+ z.StoreSlice(az)
+
+ r := a()
+ for i := range r {
+ r[i] = f(ax[i], ay[i], az[i])
+ }
+ return r
+}
+
+// applyTo4 returns a 16-element slice of the results of
+// applying f to the respective elements of vectors x, y, z, and w.
+func applyTo4(x, y, z, w archsimd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
+ ax, ay, az, aw := a(), a(), a(), a()
+ x.StoreSlice(ax)
+ y.StoreSlice(ay)
+ z.StoreSlice(az)
+ w.StoreSlice(aw)
+
+ r := make([]int32, len(ax), len(ax))
+ for i := range r {
+ r[i] = f(ax[i], ay[i], az[i], aw[i])
+ }
+ return r
+}
+
+func TestSelectTernOptInt32x16(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
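+	// Three-input boolean expressions like the ones below are candidates for
+	// the compiler's ternary-logic fusion; the results are checked against a
+	// scalar model via applyTo3/applyTo4.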
+ ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
+ ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
+ az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
+ aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
+ am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+
+ x := archsimd.LoadInt32x16Slice(ax)
+ y := archsimd.LoadInt32x16Slice(ay)
+ z := archsimd.LoadInt32x16Slice(az)
+ w := archsimd.LoadInt32x16Slice(aw)
+ m := archsimd.LoadInt32x16Slice(am)
+
+ foo := func(v archsimd.Int32x16, s []int32) {
+ r := make([]int32, 16, 16)
+ v.StoreSlice(r)
+ checkSlices[int32](t, r, s)
+ }
+
+ t0 := w.Xor(y).Xor(z)
+ ft0 := func(w, y, z int32) int32 {
+ return w ^ y ^ z
+ }
+ foo(t0, applyTo3(w, y, z, ft0))
+
+ t1 := m.And(w.Xor(y).Xor(z.Not()))
+ ft1 := func(m, w, y, z int32) int32 {
+ return m & (w ^ y ^ ^z)
+ }
+ foo(t1, applyTo4(m, w, y, z, ft1))
+
+ t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
+ ft2 := func(x, y, z int32) int32 {
+ return (x ^ y ^ z) & (x ^ y ^ ^z)
+ }
+ foo(t2, applyTo3(x, y, z, ft2))
+}
+
+func TestMaskedMerge(t *testing.T) {
+ x := archsimd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
+ y := archsimd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
+ z := archsimd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
+ res := make([]int64, 4)
+ expected := []int64{6, 8, -3, -4}
+ mask := x.Less(y)
+	x.Add(y).Merge(z, mask).StoreSlice(res)
+ for i := range 4 {
+ if res[i] != expected[i] {
+ t.Errorf("got %d wanted %d", res[i], expected[i])
+ }
+ }
+}
+
+func TestDotProductQuadruple(t *testing.T) {
+ if !archsimd.X86.AVXVNNI() {
+ t.Skip("Test requires X86.AVXVNNI, not available on this hardware")
+ return
+ }
+ xd := make([]int8, 16)
+ yd := make([]uint8, 16)
+ zd := make([]int32, 4)
+ wanted1 := make([]int32, 4)
+ wanted2 := make([]int32, 4)
+ res1 := make([]int32, 4)
+ res2 := make([]int32, 4)
+ for i := range 4 {
+ xd[i] = 5
+ yd[i] = 6
+ zd[i] = 3
+ wanted1[i] = 30
+		wanted2[i] = 33
+ }
+ x := archsimd.LoadInt8x16Slice(xd)
+ y := archsimd.LoadUint8x16Slice(yd)
+ z := archsimd.LoadInt32x4Slice(zd)
+ x.DotProductQuadruple(y).StoreSlice(res1)
+	x.DotProductQuadruple(y).Add(z).StoreSlice(res2)
+ for i := range 4 {
+ if res1[i] != wanted1[i] {
+ t.Errorf("got %d wanted %d", res1[i], wanted1[i])
+ }
+ if res2[i] != wanted2[i] {
+ t.Errorf("got %d wanted %d", res2[i], wanted2[i])
+ }
+ }
+}
+
+func TestPermuteScalars(t *testing.T) {
+ x := []int32{11, 12, 13, 14}
+ want := []int32{12, 13, 14, 11}
+ got := make([]int32, 4)
+ archsimd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteScalarsGrouped(t *testing.T) {
+ x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
+ want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
+ got := make([]int32, 8)
+ archsimd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteScalarsHi(t *testing.T) {
+ x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
+ want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
+ got := make([]int16, len(x))
+ archsimd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteScalarsLo(t *testing.T) {
+ x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
+ want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
+ got := make([]int16, len(x))
+ archsimd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteScalarsHiGrouped(t *testing.T) {
+ x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
+ want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
+ got := make([]int16, len(x))
+ archsimd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestPermuteScalarsLoGrouped(t *testing.T) {
+ x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
+ want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
+ got := make([]int16, len(x))
+ archsimd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
+ checkSlices(t, got, want)
+}
+
+func TestClMul(t *testing.T) {
+ var x = archsimd.LoadUint64x2Slice([]uint64{1, 5})
+ var y = archsimd.LoadUint64x2Slice([]uint64{3, 9})
+
+ foo := func(v archsimd.Uint64x2, s []uint64) {
+ r := make([]uint64, 2, 2)
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, s)
+ }
+
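+	// Carryless (XOR-based) products of the selected 64-bit lanes:
+	// 1⊗3 = 3, 1⊗9 = 9, 5⊗3 = 0b101⊗0b011 = 0b1111 = 15,
+	// 5⊗9 = 0b101⊗0b1001 = 0b101101 = 45, and 3⊗3 = 0b101 = 5.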
+ foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
+ foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
+ foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
+ foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
+ foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
+
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "math"
+)
+
+func less[T number](x, y T) bool {
+ return x < y
+}
+func lessEqual[T number](x, y T) bool {
+ return x <= y
+}
+func greater[T number](x, y T) bool {
+ return x > y
+}
+func greaterEqual[T number](x, y T) bool {
+ return x >= y
+}
+func equal[T number](x, y T) bool {
+ return x == y
+}
+func notEqual[T number](x, y T) bool {
+ return x != y
+}
+
+func abs[T number](x T) T {
+ // TODO this will need a non-standard FP-equality test.
+ if x == 0 { // true if x is -0.
+ return 0 // this is not a negative zero
+ }
+ if x < 0 {
+ return -x
+ }
+ return x
+}
+
+func ceil[T float](x T) T {
+ return T(math.Ceil(float64(x)))
+}
+func floor[T float](x T) T {
+ return T(math.Floor(float64(x)))
+}
+func not[T integer](x T) T {
+ return ^x
+}
+func round[T float](x T) T {
+ return T(math.RoundToEven(float64(x)))
+}
+func sqrt[T float](x T) T {
+ return T(math.Sqrt(float64(x)))
+}
+func trunc[T float](x T) T {
+ return T(math.Trunc(float64(x)))
+}
+
+func add[T number](x, y T) T {
+ return x + y
+}
+
+func sub[T number](x, y T) T {
+ return x - y
+}
+
+func max_[T number](x, y T) T { // "max" lands in infinite recursion
+ return max(x, y)
+}
+
+func min_[T number](x, y T) T { // "min" lands in infinite recursion
+ return min(x, y)
+}
+
+// Also mulLow for integers
+func mul[T number](x, y T) T {
+ return x * y
+}
+
+func div[T number](x, y T) T {
+ return x / y
+}
+
+func and[T integer](x, y T) T {
+ return x & y
+}
+
+func andNotI[T integer](x, y T) T {
+ return x & ^y // order corrected to match expectations
+}
+
+func orI[T integer](x, y T) T {
+ return x | y
+}
+
+func xorI[T integer](x, y T) T {
+ return x ^ y
+}
+
+func ima[T integer](x, y, z T) T {
+ return x*y + z
+}
+
+func fma[T float](x, y, z T) T {
+ return T(math.FMA(float64(x), float64(y), float64(z)))
+}
+
+func toUint8[T number](x T) uint8 {
+ return uint8(x)
+}
+
+func toUint16[T number](x T) uint16 {
+ return uint16(x)
+}
+
+func toUint64[T number](x T) uint64 {
+ return uint64(x)
+}
+
+func toUint32[T number](x T) uint32 {
+ switch y := (any(x)).(type) {
+ case float32:
+ if y < 0 || y > float32(math.MaxUint32) || y != y {
+ return math.MaxUint32
+ }
+ case float64:
+ if y < 0 || y > float64(math.MaxUint32) || y != y {
+ return math.MaxUint32
+ }
+ }
+ return uint32(x)
+}
+
+func toInt8[T number](x T) int8 {
+ return int8(x)
+}
+
+func toInt16[T number](x T) int16 {
+ return int16(x)
+}
+
+func toInt32[T number](x T) int32 {
+ return int32(x)
+}
+
+func toInt64[T number](x T) int64 {
+ return int64(x)
+}
+
+func toFloat32[T number](x T) float32 {
+ return float32(x)
+}
+
+func toFloat64[T number](x T) float64 {
+ return float64(x)
+}
+
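+// ceilResidueForPrecision returns a function that computes x minus x rounded
+// up to a multiple of 2^-i (a non-positive residue), or 0 if scaling x by 2^i
+// overflows to infinity.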
+func ceilResidueForPrecision[T float](i int) func(T) T {
+ f := 1.0
+ for i > 0 {
+ f *= 2
+ i--
+ }
+ return func(x T) T {
+ y := float64(x)
+ if math.IsInf(float64(x*T(f)), 0) {
+ return 0
+ }
+ // TODO sort out the rounding issues when T === float32
+ return T(y - math.Ceil(y*f)/f)
+ }
+}
+
+// Slice versions of all these elementwise operations
+
+func addSlice[T number](x, y []T) []T {
+ return map2[T](add)(x, y)
+}
+
+func subSlice[T number](x, y []T) []T {
+ return map2[T](sub)(x, y)
+}
+
+func maxSlice[T number](x, y []T) []T {
+ return map2[T](max_)(x, y)
+}
+
+func minSlice[T number](x, y []T) []T {
+ return map2[T](min_)(x, y)
+}
+
+// mulLow for integers
+func mulSlice[T number](x, y []T) []T {
+ return map2[T](mul)(x, y)
+}
+
+func divSlice[T number](x, y []T) []T {
+ return map2[T](div)(x, y)
+}
+
+func andSlice[T integer](x, y []T) []T {
+ return map2[T](and)(x, y)
+}
+
+func andNotSlice[T integer](x, y []T) []T {
+ return map2[T](andNotI)(x, y)
+}
+
+func orSlice[T integer](x, y []T) []T {
+ return map2[T](orI)(x, y)
+}
+
+func xorSlice[T integer](x, y []T) []T {
+ return map2[T](xorI)(x, y)
+}
+
+func lessSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](less)(x, y)
+}
+
+func lessEqualSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](lessEqual)(x, y)
+}
+
+func greaterSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](greater)(x, y)
+}
+
+func greaterEqualSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](greaterEqual)(x, y)
+}
+
+func equalSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](equal)(x, y)
+}
+
+func notEqualSlice[T number](x, y []T) []int64 {
+ return mapCompare[T](notEqual)(x, y)
+}
+
+func ceilSlice[T float](x []T) []T {
+ return map1[T](ceil)(x)
+}
+
+func floorSlice[T float](x []T) []T {
+ return map1[T](floor)(x)
+}
+
+func notSlice[T integer](x []T) []T {
+ return map1[T](not)(x)
+}
+
+func roundSlice[T float](x []T) []T {
+ return map1[T](round)(x)
+}
+
+func sqrtSlice[T float](x []T) []T {
+ return map1[T](sqrt)(x)
+}
+
+func truncSlice[T float](x []T) []T {
+ return map1[T](trunc)(x)
+}
+
+func imaSlice[T integer](x, y, z []T) []T {
+ return map3[T](ima)(x, y, z)
+}
+
+func fmaSlice[T float](x, y, z []T) []T {
+ return map3[T](fma)(x, y, z)
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+func TestSlicePartInt8x16(t *testing.T) {
+ Do(t, 16, func(a, c []int8) {
+ u := archsimd.LoadInt8x16SlicePart(a)
+ u.StoreSlice(c)
+ })
+}
+
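+// TestSlicePartInt8x32 and the similar tests below load successively shorter
+// prefixes a[:i] and compare against b, zeroing b[i-1] after each step:
+// lanes beyond the loaded prefix must come back as zero.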
+func TestSlicePartInt8x32(t *testing.T) {
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ for i := 32; i >= 0; i-- {
+ u := archsimd.LoadInt8x32SlicePart(a[:i])
+ c := make([]int8, 32, 32)
+ u.StoreSlice(c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicePartUint8x16(t *testing.T) {
+ a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ u := archsimd.LoadUint8x16SlicePart(a[:i])
+ c := make([]uint8, 32, 32)
+ u.StoreSlice(c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicePartUint8x32(t *testing.T) {
+ a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ for i := 32; i >= 0; i-- {
+ u := archsimd.LoadUint8x32SlicePart(a[:i])
+ c := make([]uint8, 32, 32)
+ u.StoreSlice(c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicePartInt16x8(t *testing.T) {
+ a := []int16{1, 2, 3, 4, 5, 6, 7, 8}
+ b := []int16{1, 2, 3, 4, 5, 6, 7, 8}
+ for i := 8; i >= 0; i-- {
+ u := archsimd.LoadInt16x8SlicePart(a[:i])
+ c := make([]int16, 16, 16)
+ u.StoreSlice(c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicePartInt16x16(t *testing.T) {
+ a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ u := archsimd.LoadInt16x16SlicePart(a[:i])
+ c := make([]int16, 16, 16)
+ u.StoreSlice(c)
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreInt8x16(t *testing.T) {
+ a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ v := archsimd.LoadInt8x16Slice(a)
+ c := make([]int8, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreInt16x8(t *testing.T) {
+ a := []int16{1, 2, 3, 4, 5, 6, 7, 8}
+ b := []int16{1, 2, 3, 4, 5, 6, 7, 8}
+ for i := 8; i >= 0; i-- {
+ v := archsimd.LoadInt16x8Slice(a)
+ c := make([]int16, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreInt16x16(t *testing.T) {
+ a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ v := archsimd.LoadInt16x16Slice(a)
+ c := make([]int16, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreUint8x16(t *testing.T) {
+ a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ v := archsimd.LoadUint8x16Slice(a)
+ c := make([]uint8, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreUint16x16(t *testing.T) {
+ a := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ b := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+ for i := 16; i >= 0; i-- {
+ v := archsimd.LoadUint16x16Slice(a)
+ c := make([]uint16, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicesPartStoreUint8x32(t *testing.T) {
+ a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+ for i := 32; i >= 0; i-- {
+ v := archsimd.LoadUint8x32Slice(a)
+ c := make([]uint8, 32, 32)
+ v.StoreSlicePart(c[:i])
+ checkSlices(t, c, b)
+ if i > 0 {
+ b[i-1] = 0
+ }
+ }
+}
+
+func TestSlicePartInt32(t *testing.T) {
+ // 32x4
+ L := 4
+ c := []int32{1, 2, 3, 4, 5, -1, -1, -1, -1}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := archsimd.LoadInt32x4SlicePart(e)
+		// d contains what the loaded vector (and hence b) ought to contain
+ d := make([]int32, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]int32, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlices(t, d, b)
+
+ // Test the store
+ f := make([]int32, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
+ }
+ }
+ }
+}
+
+func TestSlicePartUint64(t *testing.T) {
+ // 64x4
+ L := 4
+ c := []uint64{1, 2, 3, 4, 5, 86, 86, 86, 86}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := archsimd.LoadUint64x4SlicePart(e)
+		// d contains what the loaded vector (and hence b) ought to contain
+ d := make([]uint64, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]uint64, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlices(t, d, b)
+
+ // Test the store
+ f := make([]uint64, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
+ }
+ }
+ }
+}
+
+func TestSlicePartFloat64(t *testing.T) {
+ // 64x2
+ L := 2
+ c := []float64{1, 2, 3, 86, 86, 86, 86}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := archsimd.LoadFloat64x2SlicePart(e)
+		// d contains what the loaded vector (and hence b) ought to contain
+ d := make([]float64, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]float64, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlices(t, d, b)
+
+ // Test the store
+ f := make([]float64, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
+ }
+ }
+ }
+}
+
+func TestSlicePartFloat32(t *testing.T) {
+ // 32x8
+ L := 8
+ c := []float32{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := archsimd.LoadFloat32x8SlicePart(e)
+		// d contains what the loaded vector (and hence b) ought to contain
+ d := make([]float32, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]float32, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlices(t, d, b)
+
+ // Test the store
+ f := make([]float32, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
+ }
+ }
+ }
+}
+
+// 512-bit load
+
+func TestSlicePartInt64(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Test requires X86.AVX512, not available on this hardware")
+ return
+ }
+
+ L := 8
+ c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
+ a := c[:L+1]
+ for i := range a {
+ // Test the load first
+ // e is a partial slice.
+ e := a[i:]
+ v := archsimd.LoadInt64x8SlicePart(e)
+		// d contains what the loaded vector (and hence b) ought to contain
+ d := make([]int64, L)
+ for j := 0; j < len(e) && j < len(d); j++ {
+ d[j] = e[j]
+ }
+
+ b := make([]int64, L)
+ v.StoreSlice(b)
+ // test the load
+ checkSlicesLogInput(t, b, d, 0.0, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
+
+ // Test the store
+ f := make([]int64, L+1)
+ for i := range f {
+ f[i] = 99
+ }
+
+ v.StoreSlicePart(f[:len(e)])
+ if len(e) < len(b) {
+ checkSlices(t, f, b[:len(e)])
+ } else {
+ checkSlices(t, f, b)
+ }
+ for i := len(e); i < len(f); i++ {
+ if f[i] != 99 {
+ t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
+ }
+ }
+ }
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing ternary simd methods.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// testInt8x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt8x16Ternary(t *testing.T, f func(_, _, _ archsimd.Int8x16) archsimd.Int8x16, want func(_, _, _ []int8) []int8) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ b := archsimd.LoadInt8x16Slice(y)
+ c := archsimd.LoadInt8x16Slice(z)
+ g := make([]int8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt16x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt16x8Ternary(t *testing.T, f func(_, _, _ archsimd.Int16x8) archsimd.Int16x8, want func(_, _, _ []int16) []int16) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ b := archsimd.LoadInt16x8Slice(y)
+ c := archsimd.LoadInt16x8Slice(z)
+ g := make([]int16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt32x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt32x4Ternary(t *testing.T, f func(_, _, _ archsimd.Int32x4) archsimd.Int32x4, want func(_, _, _ []int32) []int32) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ b := archsimd.LoadInt32x4Slice(y)
+ c := archsimd.LoadInt32x4Slice(z)
+ g := make([]int32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt64x2Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt64x2Ternary(t *testing.T, f func(_, _, _ archsimd.Int64x2) archsimd.Int64x2, want func(_, _, _ []int64) []int64) {
+ n := 2
+ t.Helper()
+ forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x2Slice(x)
+ b := archsimd.LoadInt64x2Slice(y)
+ c := archsimd.LoadInt64x2Slice(z)
+ g := make([]int64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint8x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint8x16Ternary(t *testing.T, f func(_, _, _ archsimd.Uint8x16) archsimd.Uint8x16, want func(_, _, _ []uint8) []uint8) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ b := archsimd.LoadUint8x16Slice(y)
+ c := archsimd.LoadUint8x16Slice(z)
+ g := make([]uint8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint16x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint16x8Ternary(t *testing.T, f func(_, _, _ archsimd.Uint16x8) archsimd.Uint16x8, want func(_, _, _ []uint16) []uint16) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ b := archsimd.LoadUint16x8Slice(y)
+ c := archsimd.LoadUint16x8Slice(z)
+ g := make([]uint16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint32x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint32x4Ternary(t *testing.T, f func(_, _, _ archsimd.Uint32x4) archsimd.Uint32x4, want func(_, _, _ []uint32) []uint32) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ b := archsimd.LoadUint32x4Slice(y)
+ c := archsimd.LoadUint32x4Slice(z)
+ g := make([]uint32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint64x2Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint64x2Ternary(t *testing.T, f func(_, _, _ archsimd.Uint64x2) archsimd.Uint64x2, want func(_, _, _ []uint64) []uint64) {
+ n := 2
+ t.Helper()
+ forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x2Slice(x)
+ b := archsimd.LoadUint64x2Slice(y)
+ c := archsimd.LoadUint64x2Slice(z)
+ g := make([]uint64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat32x4Ternary(t *testing.T, f func(_, _, _ archsimd.Float32x4) archsimd.Float32x4, want func(_, _, _ []float32) []float32) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ b := archsimd.LoadFloat32x4Slice(y)
+ c := archsimd.LoadFloat32x4Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat64x2Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat64x2Ternary(t *testing.T, f func(_, _, _ archsimd.Float64x2) archsimd.Float64x2, want func(_, _, _ []float64) []float64) {
+ n := 2
+ t.Helper()
+ forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ b := archsimd.LoadFloat64x2Slice(y)
+ c := archsimd.LoadFloat64x2Slice(z)
+ g := make([]float64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt8x32Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt8x32Ternary(t *testing.T, f func(_, _, _ archsimd.Int8x32) archsimd.Int8x32, want func(_, _, _ []int8) []int8) {
+ n := 32
+ t.Helper()
+ forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ b := archsimd.LoadInt8x32Slice(y)
+ c := archsimd.LoadInt8x32Slice(z)
+ g := make([]int8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt16x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt16x16Ternary(t *testing.T, f func(_, _, _ archsimd.Int16x16) archsimd.Int16x16, want func(_, _, _ []int16) []int16) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ b := archsimd.LoadInt16x16Slice(y)
+ c := archsimd.LoadInt16x16Slice(z)
+ g := make([]int16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt32x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt32x8Ternary(t *testing.T, f func(_, _, _ archsimd.Int32x8) archsimd.Int32x8, want func(_, _, _ []int32) []int32) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ b := archsimd.LoadInt32x8Slice(y)
+ c := archsimd.LoadInt32x8Slice(z)
+ g := make([]int32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt64x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt64x4Ternary(t *testing.T, f func(_, _, _ archsimd.Int64x4) archsimd.Int64x4, want func(_, _, _ []int64) []int64) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ b := archsimd.LoadInt64x4Slice(y)
+ c := archsimd.LoadInt64x4Slice(z)
+ g := make([]int64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint8x32Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint8x32Ternary(t *testing.T, f func(_, _, _ archsimd.Uint8x32) archsimd.Uint8x32, want func(_, _, _ []uint8) []uint8) {
+ n := 32
+ t.Helper()
+ forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ b := archsimd.LoadUint8x32Slice(y)
+ c := archsimd.LoadUint8x32Slice(z)
+ g := make([]uint8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint16x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint16x16Ternary(t *testing.T, f func(_, _, _ archsimd.Uint16x16) archsimd.Uint16x16, want func(_, _, _ []uint16) []uint16) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ b := archsimd.LoadUint16x16Slice(y)
+ c := archsimd.LoadUint16x16Slice(z)
+ g := make([]uint16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint32x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint32x8Ternary(t *testing.T, f func(_, _, _ archsimd.Uint32x8) archsimd.Uint32x8, want func(_, _, _ []uint32) []uint32) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ b := archsimd.LoadUint32x8Slice(y)
+ c := archsimd.LoadUint32x8Slice(z)
+ g := make([]uint32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint64x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint64x4Ternary(t *testing.T, f func(_, _, _ archsimd.Uint64x4) archsimd.Uint64x4, want func(_, _, _ []uint64) []uint64) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ b := archsimd.LoadUint64x4Slice(y)
+ c := archsimd.LoadUint64x4Slice(z)
+ g := make([]uint64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat32x8Ternary(t *testing.T, f func(_, _, _ archsimd.Float32x8) archsimd.Float32x8, want func(_, _, _ []float32) []float32) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ b := archsimd.LoadFloat32x8Slice(y)
+ c := archsimd.LoadFloat32x8Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat64x4Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat64x4Ternary(t *testing.T, f func(_, _, _ archsimd.Float64x4) archsimd.Float64x4, want func(_, _, _ []float64) []float64) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ b := archsimd.LoadFloat64x4Slice(y)
+ c := archsimd.LoadFloat64x4Slice(z)
+ g := make([]float64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt8x64Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt8x64Ternary(t *testing.T, f func(_, _, _ archsimd.Int8x64) archsimd.Int8x64, want func(_, _, _ []int8) []int8) {
+ n := 64
+ t.Helper()
+ forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x64Slice(x)
+ b := archsimd.LoadInt8x64Slice(y)
+ c := archsimd.LoadInt8x64Slice(z)
+ g := make([]int8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt16x32Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt16x32Ternary(t *testing.T, f func(_, _, _ archsimd.Int16x32) archsimd.Int16x32, want func(_, _, _ []int16) []int16) {
+ n := 32
+ t.Helper()
+ forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ b := archsimd.LoadInt16x32Slice(y)
+ c := archsimd.LoadInt16x32Slice(z)
+ g := make([]int16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt32x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt32x16Ternary(t *testing.T, f func(_, _, _ archsimd.Int32x16) archsimd.Int32x16, want func(_, _, _ []int32) []int32) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ b := archsimd.LoadInt32x16Slice(y)
+ c := archsimd.LoadInt32x16Slice(z)
+ g := make([]int32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testInt64x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testInt64x8Ternary(t *testing.T, f func(_, _, _ archsimd.Int64x8) archsimd.Int64x8, want func(_, _, _ []int64) []int64) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ b := archsimd.LoadInt64x8Slice(y)
+ c := archsimd.LoadInt64x8Slice(z)
+ g := make([]int64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint8x64Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint8x64Ternary(t *testing.T, f func(_, _, _ archsimd.Uint8x64) archsimd.Uint8x64, want func(_, _, _ []uint8) []uint8) {
+ n := 64
+ t.Helper()
+ forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x64Slice(x)
+ b := archsimd.LoadUint8x64Slice(y)
+ c := archsimd.LoadUint8x64Slice(z)
+ g := make([]uint8, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint16x32Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint16x32Ternary(t *testing.T, f func(_, _, _ archsimd.Uint16x32) archsimd.Uint16x32, want func(_, _, _ []uint16) []uint16) {
+ n := 32
+ t.Helper()
+ forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ b := archsimd.LoadUint16x32Slice(y)
+ c := archsimd.LoadUint16x32Slice(z)
+ g := make([]uint16, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint32x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint32x16Ternary(t *testing.T, f func(_, _, _ archsimd.Uint32x16) archsimd.Uint32x16, want func(_, _, _ []uint32) []uint32) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ b := archsimd.LoadUint32x16Slice(y)
+ c := archsimd.LoadUint32x16Slice(z)
+ g := make([]uint32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testUint64x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testUint64x8Ternary(t *testing.T, f func(_, _, _ archsimd.Uint64x8) archsimd.Uint64x8, want func(_, _, _ []uint64) []uint64) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ b := archsimd.LoadUint64x8Slice(y)
+ c := archsimd.LoadUint64x8Slice(z)
+ g := make([]uint64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x16Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat32x16Ternary(t *testing.T, f func(_, _, _ archsimd.Float32x16) archsimd.Float32x16, want func(_, _, _ []float32) []float32) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ b := archsimd.LoadFloat32x16Slice(y)
+ c := archsimd.LoadFloat32x16Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat64x8Ternary tests the simd ternary method f against the expected behavior generated by want
+func testFloat64x8Ternary(t *testing.T, f func(_, _, _ archsimd.Float64x8) archsimd.Float64x8, want func(_, _, _ []float64) []float64) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ b := archsimd.LoadFloat64x8Slice(y)
+ c := archsimd.LoadFloat64x8Slice(z)
+ g := make([]float64, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x4TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
+// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
+func testFloat32x4TernaryFlaky(t *testing.T, f func(x, y, z archsimd.Float32x4) archsimd.Float32x4, want func(x, y, z []float32) []float32, flakiness float64) {
+ n := 4
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ b := archsimd.LoadFloat32x4Slice(y)
+ c := archsimd.LoadFloat32x4Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x8TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
+// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
+func testFloat32x8TernaryFlaky(t *testing.T, f func(x, y, z archsimd.Float32x8) archsimd.Float32x8, want func(x, y, z []float32) []float32, flakiness float64) {
+ n := 8
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ b := archsimd.LoadFloat32x8Slice(y)
+ c := archsimd.LoadFloat32x8Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
+
+// testFloat32x16TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
+// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
+func testFloat32x16TernaryFlaky(t *testing.T, f func(x, y, z archsimd.Float32x16) archsimd.Float32x16, want func(x, y, z []float32) []float32, flakiness float64) {
+ n := 16
+ t.Helper()
+ forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ b := archsimd.LoadFloat32x16Slice(y)
+ c := archsimd.LoadFloat32x16Slice(z)
+ g := make([]float32, n)
+ f(a, b, c).StoreSlice(g)
+ w := want(x, y, z)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
+ })
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+func TestFMA(t *testing.T) {
+ if archsimd.X86.AVX512() {
+ testFloat32x4TernaryFlaky(t, archsimd.Float32x4.MulAdd, fmaSlice[float32], 0.001)
+ testFloat32x8TernaryFlaky(t, archsimd.Float32x8.MulAdd, fmaSlice[float32], 0.001)
+ testFloat32x16TernaryFlaky(t, archsimd.Float32x16.MulAdd, fmaSlice[float32], 0.001)
+ testFloat64x2Ternary(t, archsimd.Float64x2.MulAdd, fmaSlice[float64])
+ testFloat64x4Ternary(t, archsimd.Float64x4.MulAdd, fmaSlice[float64])
+ testFloat64x8Ternary(t, archsimd.Float64x8.MulAdd, fmaSlice[float64])
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "fmt"
+ "simd/archsimd"
+ "testing"
+)
+
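+// Transpose4 returns the transpose of the 4x4 int32 matrix whose rows are
+// a0, a1, a2, and a3.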
+func Transpose4(a0, a1, a2, a3 archsimd.Int32x4) (b0, b1, b2, b3 archsimd.Int32x4) {
+ t0, t1 := a0.InterleaveLo(a1), a0.InterleaveHi(a1)
+ t2, t3 := a2.InterleaveLo(a3), a2.InterleaveHi(a3)
+
+ // a0: ABCD ==> t0: A1B2
+ // a1: 1234 t1: C3D4
+ // a2: EFGH t2: E5F6
+ // a3: 5678 t3: G7H8
+
+ // need
+ // A1E5
+ // B2F6
+ // C3G7
+ // D4H8
+
+ b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
+ b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
+ b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
+ b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
+ return
+}
+
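+// Transpose8 returns the transpose of the 8x8 int32 matrix whose rows are
+// a0 through a7, using grouped shuffles followed by 128-bit half swaps.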
+func Transpose8(a0, a1, a2, a3, a4, a5, a6, a7 archsimd.Int32x8) (b0, b1, b2, b3, b4, b5, b6, b7 archsimd.Int32x8) {
+ t0, t1 := a0.InterleaveLoGrouped(a1), a0.InterleaveHiGrouped(a1)
+ t2, t3 := a2.InterleaveLoGrouped(a3), a2.InterleaveHiGrouped(a3)
+ t4, t5 := a4.InterleaveLoGrouped(a5), a4.InterleaveHiGrouped(a5)
+ t6, t7 := a6.InterleaveLoGrouped(a7), a6.InterleaveHiGrouped(a7)
+
+	// The Grouped operations work independently on each 128-bit half of the
+	// 8-wide vectors, so within each group the interleaves behave just as in
+	// Transpose4:
+	//
+	// a0: ABCD ==> t0: A1B2
+	// a1: 1234     t1: C3D4
+	// a2: EFGH     t2: E5F6
+	// a3: 5678     t3: G7H8
+
+	// and within each group we need
+	// A1E5
+	// B2F6
+	// C3G7
+	// D4H8
+
+ a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
+ a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
+ a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
+ a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
+
+ a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
+ a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
+ a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
+ a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
+
+ // next need to swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7
+
+ b0 = a0.Select128FromPair(0, 2, a4)
+ b4 = a0.Select128FromPair(1, 3, a4)
+
+ b1 = a1.Select128FromPair(0, 2, a5)
+ b5 = a1.Select128FromPair(1, 3, a5)
+
+ b2 = a2.Select128FromPair(0, 2, a6)
+ b6 = a2.Select128FromPair(1, 3, a6)
+
+ b3 = a3.Select128FromPair(0, 2, a7)
+ b7 = a3.Select128FromPair(1, 3, a7)
+
+ return
+}
+
+func TestTranspose4(t *testing.T) {
+ r := make([]int32, 16, 16)
+
+ w := archsimd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
+ x := archsimd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
+ y := archsimd.LoadInt32x4Slice([]int32{0xE, 0xF, 0x10, 0x11})
+ z := archsimd.LoadInt32x4Slice([]int32{5, 6, 7, 8})
+ a, b, c, d := Transpose4(w, x, y, z)
+
+ a.StoreSlice(r[0:])
+ b.StoreSlice(r[4:])
+ c.StoreSlice(r[8:])
+ d.StoreSlice(r[12:])
+
+ checkSlices[int32](t, r, []int32{
+ 0xA, 1, 0xE, 5,
+ 0xB, 2, 0xF, 6,
+ 0xC, 3, 0x10, 7,
+ 0xD, 4, 0x11, 8,
+ })
+
+}
+
+func TestTranspose8(t *testing.T) {
+ m := make([]int32, 8)
+
+ a := []int32{}
+ for i := int32(1); i <= 64; i++ {
+ a = append(a, i)
+ }
+
+ p := archsimd.LoadInt32x8Slice(a[0:])
+ q := archsimd.LoadInt32x8Slice(a[8:])
+ r := archsimd.LoadInt32x8Slice(a[16:])
+ s := archsimd.LoadInt32x8Slice(a[24:])
+
+ w := archsimd.LoadInt32x8Slice(a[32:])
+ x := archsimd.LoadInt32x8Slice(a[40:])
+ y := archsimd.LoadInt32x8Slice(a[48:])
+ z := archsimd.LoadInt32x8Slice(a[56:])
+
+ p, q, r, s, w, x, y, z = Transpose8(p, q, r, s, w, x, y, z)
+
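+	// foo checks that vector a holds column z-1 of the original 8x8 matrix,
+	// i.e. the values z, z+8, ..., z+56.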
+ foo := func(a archsimd.Int32x8, z int32) {
+ a.StoreSlice(m)
+ var o []int32
+ for i := int32(0); i < 8; i++ {
+ o = append(o, z+i*8)
+ }
+ checkSlices[int32](t, m, o)
+ }
+
+ foo(p, 1)
+ foo(q, 2)
+ foo(r, 3)
+ foo(s, 4)
+ foo(w, 5)
+ foo(x, 6)
+ foo(y, 7)
+ foo(z, 8)
+
+}
+
+const BIG = 20000
+
+var bigMatrix [][]int32
+
+// nineMatrix is 9x9, the smallest size with diagonal and off-diagonal 4x4 tiles plus a fringe.
+var nineMatrix [][]int32
+
+var thirtyMatrix [][]int32
+
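+// fill allocates the rows of the square matrix m and sets element (i, j)
+// to -i<<16 + j, encoding each element's coordinates so that a transpose
+// is easy to verify.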
+func fill(m [][]int32) {
+ for i := range m {
+ m[i] = make([]int32, len(m))
+ for j := range m[i] {
+ m[i][j] = int32(-i<<16 + j)
+ }
+ }
+}
+
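+// isTransposed reports whether a matrix initialized by fill has been
+// transposed, i.e. element (i, j) now carries the coordinates (j, i).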
+func isTransposed(m [][]int32) bool {
+ for i, mi := range m {
+ for j, a := range mi {
+ if a != int32(-j<<16+i) {
+ return false
+ }
+ }
+ }
+ return true
+}
+
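+// dupe returns a deep copy of the square matrix m.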
+func dupe(m [][]int32) [][]int32 {
+ n := len(m)
+ p := make([][]int32, n, n)
+ for i := range p {
+ t := make([]int32, n)
+ for j, a := range m[i] {
+ t[j] = a
+ }
+ p[i] = t
+ }
+ return p
+}
+
+func init() {
+ bigMatrix = make([][]int32, BIG, BIG)
+ fill(bigMatrix)
+ nineMatrix = make([][]int32, 9, 9)
+ fill(nineMatrix)
+ thirtyMatrix = make([][]int32, 30, 30)
+ fill(thirtyMatrix)
+}
+
+func BenchmarkPlainTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transposePlain(d)
+ }
+}
+
+func BenchmarkTiled4Transpose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transposeTiled4(d)
+ }
+}
+
+func BenchmarkTiled8Transpose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transposeTiled8(d)
+ }
+}
+
+func Benchmark2BlockedTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transpose2Blocked(d)
+ }
+}
+func Benchmark3BlockedTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transpose3Blocked(d)
+ }
+}
+func Benchmark4BlockedTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transpose4Blocked(d)
+ }
+}
+func Benchmark5aBlockedTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transpose5aBlocked(d)
+ }
+}
+
+func Benchmark5bBlockedTranspose(b *testing.B) {
+ d := dupe(bigMatrix)
+ for b.Loop() {
+ transpose5bBlocked(d)
+ }
+}
+
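+// transposePlain transposes the square matrix m in place with the obvious
+// doubly nested element-swapping loop.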
+func transposePlain(m [][]int32) {
+ for i := range m {
+ for j := 0; j < i; j++ {
+ t := m[i][j]
+ m[i][j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
+func TestTransposePlain(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transposePlain(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ } else {
+ t.Logf("Transposed plain matrix = %s", formatMatrix(d))
+ }
+}
+
+func TestTranspose2Blocked(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transpose2Blocked(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ }
+}
+
+func TestTranspose3Blocked(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transpose3Blocked(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ }
+}
+
+func TestTranspose4Blocked(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transpose4Blocked(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ }
+}
+
+func TestTranspose5aBlocked(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transpose5aBlocked(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ }
+}
+
+func TestTranspose5bBlocked(t *testing.T) {
+ d := dupe(nineMatrix)
+ t.Logf("Input matrix is %s", formatMatrix(d))
+ transpose5bBlocked(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %s", formatMatrix(d))
+ }
+}
+
+func TestTransposeTiled4(t *testing.T) {
+ d := dupe(nineMatrix)
+ transposeTiled4(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %v", d)
+ }
+}
+
+func TestTransposeTiled8(t *testing.T) {
+ d := dupe(thirtyMatrix)
+ transposeTiled8(d)
+ if !isTransposed(d) {
+ t.Errorf("d is not transposed, d = %v", d)
+ }
+}
+
+func formatMatrix(m [][]int32) string {
+ s := ""
+ for _, mi := range m {
+ s += "\n["
+ for _, t := range mi {
+ h := t >> 16
+ l := t & 0xffff
+ s += fmt.Sprintf(" (%d %d)", h, l)
+ }
+ s += " ]"
+ }
+ return s
+}
+
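+// transpose2Blocked transposes the square matrix m in place in 2x2 blocks:
+// each diagonal block is transposed in place, each off-diagonal block is
+// swapped with the transpose of its mirror image, and leftover fringe rows
+// and columns are handled with plain element swaps.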
+func transpose2Blocked(m [][]int32) {
+ const B = 2
+ N := len(m)
+ i := 0
+ for ; i <= len(m)-B; i += B {
+ r0, r1 := m[i], m[i+1]
+ if len(r0) < N || len(r1) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose around diagonal
+ d01, d10 := r0[i+1], r1[i]
+ r0[i+1], r1[i] = d10, d01
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a0, a1 := m[j], m[j+1]
+
+ b00, b01 := a0[i], a0[i+1]
+ b10, b11 := a1[i], a1[i+1]
+
+ a0[i], a0[i+1] = r0[j], r1[j]
+ a1[i], a1[i+1] = r0[j+1], r1[j+1]
+
+ r0[j], r0[j+1] = b00, b10
+ r1[j], r1[j+1] = b01, b11
+ }
+ }
+
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
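+// transpose3Blocked is like transpose2Blocked, but with 3x3 blocks.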
+func transpose3Blocked(m [][]int32) {
+ const B = 3
+ N := len(m)
+ i := 0
+ for ; i <= len(m)-B; i += B {
+ r0, r1, r2 := m[i], m[i+1], m[i+2]
+		if len(r0) < N || len(r1) < N || len(r2) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose around diagonal
+ d01, d10 := r0[i+1], r1[i]
+ d02, d20 := r0[i+2], r2[i]
+ d12, d21 := r1[i+2], r2[i+1]
+
+ r0[i+1], r1[i] = d10, d01
+ r0[i+2], r2[i] = d20, d02
+ r1[i+2], r2[i+1] = d21, d12
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a0, a1, a2 := m[j], m[j+1], m[j+2]
+
+ b00, b01, b02 := a0[i], a0[i+1], a0[i+2]
+ b10, b11, b12 := a1[i], a1[i+1], a1[i+2]
+ b20, b21, b22 := a2[i], a2[i+1], a2[i+2]
+
+ a0[i], a0[i+1], a0[i+2] = r0[j], r1[j], r2[j]
+ a1[i], a1[i+1], a1[i+2] = r0[j+1], r1[j+1], r2[j+1]
+ a2[i], a2[i+1], a2[i+2] = r0[j+2], r1[j+2], r2[j+2]
+
+ r0[j], r0[j+1], r0[j+2] = b00, b10, b20
+ r1[j], r1[j+1], r1[j+2] = b01, b11, b21
+ r2[j], r2[j+1], r2[j+2] = b02, b12, b22
+ }
+ }
+
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
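+// transpose4Blocked is like transpose2Blocked, but with 4x4 blocks.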
+func transpose4Blocked(m [][]int32) {
+ const B = 4
+ N := len(m)
+ i := 0
+ for ; i <= len(m)-B; i += B {
+ r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
+ if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose around diagonal
+ d01, d10 := r0[i+1], r1[i]
+ d02, d20 := r0[i+2], r2[i]
+ d03, d30 := r0[i+3], r3[i]
+ d12, d21 := r1[i+2], r2[i+1]
+ d13, d31 := r1[i+3], r3[i+1]
+ d23, d32 := r2[i+3], r3[i+2]
+
+ r0[i+1], r1[i] = d10, d01
+ r0[i+2], r2[i] = d20, d02
+ r0[i+3], r3[i] = d30, d03
+ r1[i+2], r2[i+1] = d21, d12
+ r1[i+3], r3[i+1] = d31, d13
+ r2[i+3], r3[i+2] = d32, d23
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
+
+ b00, b01, b02, b03 := a0[i], a0[i+1], a0[i+2], a0[i+3]
+ b10, b11, b12, b13 := a1[i], a1[i+1], a1[i+2], a1[i+3]
+ b20, b21, b22, b23 := a2[i], a2[i+1], a2[i+2], a2[i+3]
+ b30, b31, b32, b33 := a3[i], a3[i+1], a3[i+2], a3[i+3]
+
+ a0[i], a0[i+1], a0[i+2], a0[i+3] = r0[j], r1[j], r2[j], r3[j]
+ a1[i], a1[i+1], a1[i+2], a1[i+3] = r0[j+1], r1[j+1], r2[j+1], r3[j+1]
+ a2[i], a2[i+1], a2[i+2], a2[i+3] = r0[j+2], r1[j+2], r2[j+2], r3[j+2]
+ a3[i], a3[i+1], a3[i+2], a3[i+3] = r0[j+3], r1[j+3], r2[j+3], r3[j+3]
+
+ r0[j], r0[j+1], r0[j+2], r0[j+3] = b00, b10, b20, b30
+ r1[j], r1[j+1], r1[j+2], r1[j+3] = b01, b11, b21, b31
+ r2[j], r2[j+1], r2[j+2], r2[j+3] = b02, b12, b22, b32
+ r3[j], r3[j+1], r3[j+2], r3[j+3] = b03, b13, b23, b33
+ }
+ }
+
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
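+// transpose5aBlocked is like transpose2Blocked, but with 5x5 blocks.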
+func transpose5aBlocked(m [][]int32) {
+ const B = 5
+ N := len(m)
+ i := 0
+ for ; i <= len(m)-B; i += B {
+ r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
+ if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose around diagonal
+ d01, d10 := r0[i+1], r1[i]
+ d02, d20 := r0[i+2], r2[i]
+ d03, d30 := r0[i+3], r3[i]
+ d04, d40 := r0[i+4], r4[i]
+
+ d12, d21 := r1[i+2], r2[i+1]
+ d13, d31 := r1[i+3], r3[i+1]
+ d14, d41 := r1[i+4], r4[i+1]
+
+ d23, d32 := r2[i+3], r3[i+2]
+ d24, d42 := r2[i+4], r4[i+2]
+
+ d34, d43 := r3[i+4], r4[i+3]
+
+ r0[i+1], r1[i] = d10, d01
+ r0[i+2], r2[i] = d20, d02
+ r0[i+3], r3[i] = d30, d03
+ r0[i+4], r4[i] = d40, d04
+
+ r1[i+2], r2[i+1] = d21, d12
+ r1[i+3], r3[i+1] = d31, d13
+ r1[i+4], r4[i+1] = d41, d14
+
+ r2[i+3], r3[i+2] = d32, d23
+ r2[i+4], r4[i+2] = d42, d24
+
+ r3[i+4], r4[i+3] = d43, d34
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a0, a1, a2, a3, a4 := m[j], m[j+1], m[j+2], m[j+3], m[j+4]
+
+ b00, b01, b02, b03, b04 := a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4]
+ b10, b11, b12, b13, b14 := a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4]
+ b20, b21, b22, b23, b24 := a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4]
+ b30, b31, b32, b33, b34 := a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4]
+ b40, b41, b42, b43, b44 := a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4]
+
+ a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4] = r0[j], r1[j], r2[j], r3[j], r4[j]
+ a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4] = r0[j+1], r1[j+1], r2[j+1], r3[j+1], r4[j+1]
+ a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4] = r0[j+2], r1[j+2], r2[j+2], r3[j+2], r4[j+2]
+ a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4] = r0[j+3], r1[j+3], r2[j+3], r3[j+3], r4[j+3]
+ a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4] = r0[j+4], r1[j+4], r2[j+4], r3[j+4], r4[j+4]
+
+ r0[j], r0[j+1], r0[j+2], r0[j+3], r0[j+4] = b00, b10, b20, b30, b40
+ r1[j], r1[j+1], r1[j+2], r1[j+3], r1[j+4] = b01, b11, b21, b31, b41
+ r2[j], r2[j+1], r2[j+2], r2[j+3], r2[j+4] = b02, b12, b22, b32, b42
+ r3[j], r3[j+1], r3[j+2], r3[j+3], r3[j+4] = b03, b13, b23, b33, b43
+ r4[j], r4[j+1], r4[j+2], r4[j+3], r4[j+4] = b04, b14, b24, b34, b44
+ }
+ }
+
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
+// transpose5bBlocked is just like transpose5aBlocked,
+// but rewritten to process the off-diagonal block one
+// column at a time, which reduces register pressure in
+// the inner loop.
+func transpose5bBlocked(m [][]int32) {
+ const B = 5
+ N := len(m)
+ i := 0
+ for ; i <= len(m)-B; i += B {
+ r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
+ if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose around diagonal
+ d01, d10 := r0[i+1], r1[i]
+ d02, d20 := r0[i+2], r2[i]
+ d03, d30 := r0[i+3], r3[i]
+ d04, d40 := r0[i+4], r4[i]
+ r0[i+1], r1[i] = d10, d01
+ r0[i+2], r2[i] = d20, d02
+ r0[i+3], r3[i] = d30, d03
+ r0[i+4], r4[i] = d40, d04
+
+ d12, d21 := r1[i+2], r2[i+1]
+ d13, d31 := r1[i+3], r3[i+1]
+ d14, d41 := r1[i+4], r4[i+1]
+ r1[i+2], r2[i+1] = d21, d12
+ r1[i+3], r3[i+1] = d31, d13
+ r1[i+4], r4[i+1] = d41, d14
+
+ d23, d32 := r2[i+3], r3[i+2]
+ d24, d42 := r2[i+4], r4[i+2]
+ r2[i+3], r3[i+2] = d32, d23
+ r2[i+4], r4[i+2] = d42, d24
+
+ d34, d43 := r3[i+4], r4[i+3]
+ r3[i+4], r4[i+3] = d43, d34
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a4, a0, a1, a2, a3 := m[j+4], m[j], m[j+1], m[j+2], m[j+3]
+
+ // Process column i+4
+ temp0 := a0[i+4]
+ temp1 := a1[i+4]
+ temp2 := a2[i+4]
+ temp3 := a3[i+4]
+ temp4 := a4[i+4]
+
+ a4[i+4] = r4[j+4]
+ a0[i+4] = r4[j]
+ a1[i+4] = r4[j+1]
+ a2[i+4] = r4[j+2]
+ a3[i+4] = r4[j+3]
+
+ r0[j+4] = temp0
+ r1[j+4] = temp1
+ r2[j+4] = temp2
+ r3[j+4] = temp3
+ r4[j+4] = temp4
+
+ // Process column i
+ temp0 = a0[i]
+ temp1 = a1[i]
+ temp2 = a2[i]
+ temp3 = a3[i]
+ temp4 = a4[i]
+
+ a4[i] = r0[j+4]
+ a0[i] = r0[j]
+ a1[i] = r0[j+1]
+ a2[i] = r0[j+2]
+ a3[i] = r0[j+3]
+
+ r0[j] = temp0
+ r1[j] = temp1
+ r2[j] = temp2
+ r3[j] = temp3
+ r4[j] = temp4
+
+ // Process column i+1
+ temp0 = a0[i+1]
+ temp1 = a1[i+1]
+ temp2 = a2[i+1]
+ temp3 = a3[i+1]
+ temp4 = a4[i+1]
+
+ a4[i+1] = r1[j+4]
+ a0[i+1] = r1[j]
+ a1[i+1] = r1[j+1]
+ a2[i+1] = r1[j+2]
+ a3[i+1] = r1[j+3]
+
+ r0[j+1] = temp0
+ r1[j+1] = temp1
+ r2[j+1] = temp2
+ r3[j+1] = temp3
+ r4[j+1] = temp4
+
+ // Process column i+2
+ temp0 = a0[i+2]
+ temp1 = a1[i+2]
+ temp2 = a2[i+2]
+ temp3 = a3[i+2]
+ temp4 = a4[i+2]
+
+ a4[i+2] = r2[j+4]
+ a0[i+2] = r2[j]
+ a1[i+2] = r2[j+1]
+ a2[i+2] = r2[j+2]
+ a3[i+2] = r2[j+3]
+
+ r0[j+2] = temp0
+ r1[j+2] = temp1
+ r2[j+2] = temp2
+ r3[j+2] = temp3
+ r4[j+2] = temp4
+
+ // Process column i+3
+ temp0 = a0[i+3]
+ temp1 = a1[i+3]
+ temp2 = a2[i+3]
+ temp3 = a3[i+3]
+ temp4 = a4[i+3]
+
+ a4[i+3] = r3[j+4]
+ a0[i+3] = r3[j]
+ a1[i+3] = r3[j+1]
+ a2[i+3] = r3[j+2]
+ a3[i+3] = r3[j+3]
+
+ r0[j+3] = temp0
+ r1[j+3] = temp1
+ r2[j+3] = temp2
+ r3[j+3] = temp3
+ r4[j+3] = temp4
+ }
+ }
+
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
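+// transposeTiled4 transposes the square matrix m in place like the blocked
+// versions above, but uses Transpose4 on Int32x4 vectors to transpose each
+// 4x4 tile.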
+func transposeTiled4(m [][]int32) {
+ const B = 4
+ N := len(m)
+ i := 0
+ for ; i < len(m)-(B-1); i += B {
+ r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
+ if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose diagonal
+ d0, d1, d2, d3 :=
+ archsimd.LoadInt32x4Slice(r0[i:]),
+ archsimd.LoadInt32x4Slice(r1[i:]),
+ archsimd.LoadInt32x4Slice(r2[i:]),
+ archsimd.LoadInt32x4Slice(r3[i:])
+
+ d0, d1, d2, d3 = Transpose4(d0, d1, d2, d3)
+
+ d0.StoreSlice(r0[i:])
+ d1.StoreSlice(r1[i:])
+ d2.StoreSlice(r2[i:])
+ d3.StoreSlice(r3[i:])
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
+ u0, u1, u2, u3 :=
+ archsimd.LoadInt32x4Slice(a0[i:]),
+ archsimd.LoadInt32x4Slice(a1[i:]),
+ archsimd.LoadInt32x4Slice(a2[i:]),
+ archsimd.LoadInt32x4Slice(a3[i:])
+
+ u0, u1, u2, u3 = Transpose4(u0, u1, u2, u3)
+
+ l0 := archsimd.LoadInt32x4Slice(r0[j:])
+ u0.StoreSlice(r0[j:])
+ l1 := archsimd.LoadInt32x4Slice(r1[j:])
+ u1.StoreSlice(r1[j:])
+ l2 := archsimd.LoadInt32x4Slice(r2[j:])
+ u2.StoreSlice(r2[j:])
+ l3 := archsimd.LoadInt32x4Slice(r3[j:])
+ u3.StoreSlice(r3[j:])
+
+ u0, u1, u2, u3 = Transpose4(l0, l1, l2, l3)
+
+ u0.StoreSlice(a0[i:])
+ u1.StoreSlice(a1[i:])
+ u2.StoreSlice(a2[i:])
+ u3.StoreSlice(a3[i:])
+ }
+ }
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
+
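+// transposeTiled8 transposes the square matrix m in place like the blocked
+// versions above, but uses Transpose8 on Int32x8 vectors to transpose each
+// 8x8 tile.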
+func transposeTiled8(m [][]int32) {
+ const B = 8
+ N := len(m)
+ i := 0
+ for ; i < len(m)-(B-1); i += B {
+ r0, r1, r2, r3, r4, r5, r6, r7 := m[i], m[i+1], m[i+2], m[i+3], m[i+4], m[i+5], m[i+6], m[i+7]
+ if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N || len(r5) < N || len(r6) < N || len(r7) < N {
+ panic("Early bounds check failure")
+ }
+ // transpose diagonal
+ d0, d1, d2, d3, d4, d5, d6, d7 :=
+ archsimd.LoadInt32x8Slice(r0[i:]),
+ archsimd.LoadInt32x8Slice(r1[i:]),
+ archsimd.LoadInt32x8Slice(r2[i:]),
+ archsimd.LoadInt32x8Slice(r3[i:]),
+ archsimd.LoadInt32x8Slice(r4[i:]),
+ archsimd.LoadInt32x8Slice(r5[i:]),
+ archsimd.LoadInt32x8Slice(r6[i:]),
+ archsimd.LoadInt32x8Slice(r7[i:])
+
+ d0, d1, d2, d3, d4, d5, d6, d7 = Transpose8(d0, d1, d2, d3, d4, d5, d6, d7)
+
+ d0.StoreSlice(r0[i:])
+ d1.StoreSlice(r1[i:])
+ d2.StoreSlice(r2[i:])
+ d3.StoreSlice(r3[i:])
+ d4.StoreSlice(r4[i:])
+ d5.StoreSlice(r5[i:])
+ d6.StoreSlice(r6[i:])
+ d7.StoreSlice(r7[i:])
+
+ // transpose across diagonal
+ j := 0
+ for ; j < i; j += B {
+ a7, a0, a1, a2, a3, a4, a5, a6 := m[j+7], m[j], m[j+1], m[j+2], m[j+3], m[j+4], m[j+5], m[j+6]
+ u0, u1, u2, u3, u4, u5, u6, u7 :=
+ archsimd.LoadInt32x8Slice(a0[i:]),
+ archsimd.LoadInt32x8Slice(a1[i:]),
+ archsimd.LoadInt32x8Slice(a2[i:]),
+ archsimd.LoadInt32x8Slice(a3[i:]),
+ archsimd.LoadInt32x8Slice(a4[i:]),
+ archsimd.LoadInt32x8Slice(a5[i:]),
+ archsimd.LoadInt32x8Slice(a6[i:]),
+ archsimd.LoadInt32x8Slice(a7[i:])
+
+ u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(u0, u1, u2, u3, u4, u5, u6, u7)
+
+ l0 := archsimd.LoadInt32x8Slice(r0[j:])
+ u0.StoreSlice(r0[j:])
+ l1 := archsimd.LoadInt32x8Slice(r1[j:])
+ u1.StoreSlice(r1[j:])
+ l2 := archsimd.LoadInt32x8Slice(r2[j:])
+ u2.StoreSlice(r2[j:])
+ l3 := archsimd.LoadInt32x8Slice(r3[j:])
+ u3.StoreSlice(r3[j:])
+ l4 := archsimd.LoadInt32x8Slice(r4[j:])
+ u4.StoreSlice(r4[j:])
+ l5 := archsimd.LoadInt32x8Slice(r5[j:])
+ u5.StoreSlice(r5[j:])
+ l6 := archsimd.LoadInt32x8Slice(r6[j:])
+ u6.StoreSlice(r6[j:])
+ l7 := archsimd.LoadInt32x8Slice(r7[j:])
+ u7.StoreSlice(r7[j:])
+
+ u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(l0, l1, l2, l3, l4, l5, l6, l7)
+
+ u0.StoreSlice(a0[i:])
+ u1.StoreSlice(a1[i:])
+ u2.StoreSlice(a2[i:])
+ u3.StoreSlice(a3[i:])
+ u4.StoreSlice(a4[i:])
+ u5.StoreSlice(a5[i:])
+ u6.StoreSlice(a6[i:])
+ u7.StoreSlice(a7[i:])
+ }
+ }
+ // Do the fringe
+ for ; i < len(m); i++ {
+ j := 0
+ r := m[i]
+ for ; j < i; j++ {
+ t := r[j]
+ r[j] = m[j][i]
+ m[j][i] = t
+ }
+ }
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+// This file contains functions testing unary simd methods.
+// Each function in this file is specialized for a
+// particular simd type <BaseType><Width>x<Count>.
+
+package simd_test
+
+import (
+ "simd/archsimd"
+ "testing"
+)
+
+// testInt8x16Unary tests the simd unary method f against the expected behavior generated by want
+func testInt8x16Unary(t *testing.T, f func(_ archsimd.Int8x16) archsimd.Int8x16, want func(_ []int8) []int8) {
+ n := 16
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ g := make([]int8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x8Unary tests the simd unary method f against the expected behavior generated by want
+func testInt16x8Unary(t *testing.T, f func(_ archsimd.Int16x8) archsimd.Int16x8, want func(_ []int16) []int16) {
+ n := 8
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ g := make([]int16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x4Unary tests the simd unary method f against the expected behavior generated by want
+func testInt32x4Unary(t *testing.T, f func(_ archsimd.Int32x4) archsimd.Int32x4, want func(_ []int32) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x2Unary tests the simd unary method f against the expected behavior generated by want
+func testInt64x2Unary(t *testing.T, f func(_ archsimd.Int64x2) archsimd.Int64x2, want func(_ []int64) []int64) {
+ n := 2
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x2Slice(x)
+ g := make([]int64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x16Unary tests the simd unary method f against the expected behavior generated by want
+func testUint8x16Unary(t *testing.T, f func(_ archsimd.Uint8x16) archsimd.Uint8x16, want func(_ []uint8) []uint8) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ g := make([]uint8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x8Unary tests the simd unary method f against the expected behavior generated by want
+func testUint16x8Unary(t *testing.T, f func(_ archsimd.Uint16x8) archsimd.Uint16x8, want func(_ []uint16) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x4Unary tests the simd unary method f against the expected behavior generated by want
+func testUint32x4Unary(t *testing.T, f func(_ archsimd.Uint32x4) archsimd.Uint32x4, want func(_ []uint32) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x2Unary tests the simd unary method f against the expected behavior generated by want
+func testUint64x2Unary(t *testing.T, f func(_ archsimd.Uint64x2) archsimd.Uint64x2, want func(_ []uint64) []uint64) {
+ n := 2
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x2Slice(x)
+ g := make([]uint64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x4Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat32x4Unary(t *testing.T, f func(_ archsimd.Float32x4) archsimd.Float32x4, want func(_ []float32) []float32) {
+ n := 4
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x2Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat64x2Unary(t *testing.T, f func(_ archsimd.Float64x2) archsimd.Float64x2, want func(_ []float64) []float64) {
+ n := 2
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x32Unary tests the simd unary method f against the expected behavior generated by want
+func testInt8x32Unary(t *testing.T, f func(_ archsimd.Int8x32) archsimd.Int8x32, want func(_ []int8) []int8) {
+ n := 32
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ g := make([]int8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x16Unary tests the simd unary method f against the expected behavior generated by want
+func testInt16x16Unary(t *testing.T, f func(_ archsimd.Int16x16) archsimd.Int16x16, want func(_ []int16) []int16) {
+ n := 16
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ g := make([]int16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x8Unary tests the simd unary method f against the expected behavior generated by want
+func testInt32x8Unary(t *testing.T, f func(_ archsimd.Int32x8) archsimd.Int32x8, want func(_ []int32) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x4Unary tests the simd unary method f against the expected behavior generated by want
+func testInt64x4Unary(t *testing.T, f func(_ archsimd.Int64x4) archsimd.Int64x4, want func(_ []int64) []int64) {
+ n := 4
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ g := make([]int64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x32Unary tests the simd unary method f against the expected behavior generated by want
+func testUint8x32Unary(t *testing.T, f func(_ archsimd.Uint8x32) archsimd.Uint8x32, want func(_ []uint8) []uint8) {
+ n := 32
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ g := make([]uint8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x16Unary tests the simd unary method f against the expected behavior generated by want
+func testUint16x16Unary(t *testing.T, f func(_ archsimd.Uint16x16) archsimd.Uint16x16, want func(_ []uint16) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x8Unary tests the simd unary method f against the expected behavior generated by want
+func testUint32x8Unary(t *testing.T, f func(_ archsimd.Uint32x8) archsimd.Uint32x8, want func(_ []uint32) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x4Unary tests the simd unary method f against the expected behavior generated by want
+func testUint64x4Unary(t *testing.T, f func(_ archsimd.Uint64x4) archsimd.Uint64x4, want func(_ []uint64) []uint64) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ g := make([]uint64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x8Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat32x8Unary(t *testing.T, f func(_ archsimd.Float32x8) archsimd.Float32x8, want func(_ []float32) []float32) {
+ n := 8
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x4Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat64x4Unary(t *testing.T, f func(_ archsimd.Float64x4) archsimd.Float64x4, want func(_ []float64) []float64) {
+ n := 4
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x64Unary tests the simd unary method f against the expected behavior generated by want
+func testInt8x64Unary(t *testing.T, f func(_ archsimd.Int8x64) archsimd.Int8x64, want func(_ []int8) []int8) {
+ n := 64
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x64Slice(x)
+ g := make([]int8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x32Unary tests the simd unary method f against the expected behavior generated by want
+func testInt16x32Unary(t *testing.T, f func(_ archsimd.Int16x32) archsimd.Int16x32, want func(_ []int16) []int16) {
+ n := 32
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ g := make([]int16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x16Unary tests the simd unary method f against the expected behavior generated by want
+func testInt32x16Unary(t *testing.T, f func(_ archsimd.Int32x16) archsimd.Int32x16, want func(_ []int32) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x8Unary tests the simd unary method f against the expected behavior generated by want
+func testInt64x8Unary(t *testing.T, f func(_ archsimd.Int64x8) archsimd.Int64x8, want func(_ []int64) []int64) {
+ n := 8
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ g := make([]int64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x64Unary tests the simd unary method f against the expected behavior generated by want
+func testUint8x64Unary(t *testing.T, f func(_ archsimd.Uint8x64) archsimd.Uint8x64, want func(_ []uint8) []uint8) {
+ n := 64
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x64Slice(x)
+ g := make([]uint8, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x32Unary tests the simd unary method f against the expected behavior generated by want
+func testUint16x32Unary(t *testing.T, f func(_ archsimd.Uint16x32) archsimd.Uint16x32, want func(_ []uint16) []uint16) {
+ n := 32
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x16Unary tests the simd unary method f against the expected behavior generated by want
+func testUint32x16Unary(t *testing.T, f func(_ archsimd.Uint32x16) archsimd.Uint32x16, want func(_ []uint32) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x8Unary tests the simd unary method f against the expected behavior generated by want
+func testUint64x8Unary(t *testing.T, f func(_ archsimd.Uint64x8) archsimd.Uint64x8, want func(_ []uint64) []uint64) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ g := make([]uint64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x16Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat32x16Unary(t *testing.T, f func(_ archsimd.Float32x16) archsimd.Float32x16, want func(_ []float32) []float32) {
+ n := 16
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x8Unary tests the simd unary method f against the expected behavior generated by want
+func testFloat64x8Unary(t *testing.T, f func(_ archsimd.Float64x8) archsimd.Float64x8, want func(_ []float64) []float64) {
+ n := 8
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt8x16ConvertToInt32(t *testing.T, f func(x archsimd.Int8x16) archsimd.Int32x16, want func(x []int8) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x8ConvertToInt32(t *testing.T, f func(x archsimd.Int16x8) archsimd.Int32x8, want func(x []int16) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x4ConvertToInt32(t *testing.T, f func(x archsimd.Int32x4) archsimd.Int32x4, want func(x []int32) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint8x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Int32x16, want func(x []uint8) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Int32x8, want func(x []uint16) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x4ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Int32x4, want func(x []uint32) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x4ConvertToInt32(t *testing.T, f func(x archsimd.Float32x4) archsimd.Int32x4, want func(x []float32) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x16ConvertToInt32(t *testing.T, f func(x archsimd.Int16x16) archsimd.Int32x16, want func(x []int16) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x8ConvertToInt32(t *testing.T, f func(x archsimd.Int32x8) archsimd.Int32x8, want func(x []int32) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt64x4ConvertToInt32(t *testing.T, f func(x archsimd.Int64x4) archsimd.Int32x4, want func(x []int64) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Int32x16, want func(x []uint16) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Int32x8, want func(x []uint32) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint64x4ConvertToInt32(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Int32x4, want func(x []uint64) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x8ConvertToInt32(t *testing.T, f func(x archsimd.Float32x8) archsimd.Int32x8, want func(x []float32) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat64x4ConvertToInt32(t *testing.T, f func(x archsimd.Float64x4) archsimd.Int32x4, want func(x []float64) []int32) {
+ n := 4
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x16ConvertToInt32(t *testing.T, f func(x archsimd.Int32x16) archsimd.Int32x16, want func(x []int32) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt64x8ConvertToInt32(t *testing.T, f func(x archsimd.Int64x8) archsimd.Int32x8, want func(x []int64) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x16ConvertToInt32(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Int32x16, want func(x []uint32) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint64x8ConvertToInt32(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Int32x8, want func(x []uint64) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x16ConvertToInt32(t *testing.T, f func(x archsimd.Float32x16) archsimd.Int32x16, want func(x []float32) []int32) {
+ n := 16
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat64x8ConvertToInt32(t *testing.T, f func(x archsimd.Float64x8) archsimd.Int32x8, want func(x []float64) []int32) {
+ n := 8
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ g := make([]int32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt8x16ConvertToUint32(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint32x16, want func(x []int8) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x8ConvertToUint32(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint32x8, want func(x []int16) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x4ConvertToUint32(t *testing.T, f func(x archsimd.Int32x4) archsimd.Uint32x4, want func(x []int32) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint8x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint32x16, want func(x []uint8) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint32x8, want func(x []uint16) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x4ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x4) archsimd.Uint32x4, want func(x []uint32) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x4ConvertToUint32(t *testing.T, f func(x archsimd.Float32x4) archsimd.Uint32x4, want func(x []float32) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x16ConvertToUint32(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint32x16, want func(x []int16) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x8ConvertToUint32(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint32x8, want func(x []int32) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt64x4ConvertToUint32(t *testing.T, f func(x archsimd.Int64x4) archsimd.Uint32x4, want func(x []int64) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint32x16, want func(x []uint16) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint32x8, want func(x []uint32) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint64x4ConvertToUint32(t *testing.T, f func(x archsimd.Uint64x4) archsimd.Uint32x4, want func(x []uint64) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x8ConvertToUint32(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint32x8, want func(x []float32) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat64x4ConvertToUint32(t *testing.T, f func(x archsimd.Float64x4) archsimd.Uint32x4, want func(x []float64) []uint32) {
+ n := 4
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x16ConvertToUint32(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint32x16, want func(x []int32) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt64x8ConvertToUint32(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint32x8, want func(x []int64) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x16ConvertToUint32(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint32x16, want func(x []uint32) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint64x8ConvertToUint32(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint32x8, want func(x []uint64) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x16ConvertToUint32(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint32x16, want func(x []float32) []uint32) {
+ n := 16
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat64x8ConvertToUint32(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint32x8, want func(x []float64) []uint32) {
+ n := 8
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ g := make([]uint32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt8x16ConvertToUint16(t *testing.T, f func(x archsimd.Int8x16) archsimd.Uint16x16, want func(x []int8) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x8ConvertToUint16(t *testing.T, f func(x archsimd.Int16x8) archsimd.Uint16x8, want func(x []int16) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint8x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x16) archsimd.Uint16x16, want func(x []uint8) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x8) archsimd.Uint16x8, want func(x []uint16) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt8x32ConvertToUint16(t *testing.T, f func(x archsimd.Int8x32) archsimd.Uint16x32, want func(x []int8) []uint16) {
+ n := 32
+ t.Helper()
+ forSlice(t, int8s, n, func(x []int8) bool {
+ t.Helper()
+ a := archsimd.LoadInt8x32Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x16ConvertToUint16(t *testing.T, f func(x archsimd.Int16x16) archsimd.Uint16x16, want func(x []int16) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x8ConvertToUint16(t *testing.T, f func(x archsimd.Int32x8) archsimd.Uint16x8, want func(x []int32) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint8x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint8x32) archsimd.Uint16x32, want func(x []uint8) []uint16) {
+ n := 32
+ t.Helper()
+ forSlice(t, uint8s, n, func(x []uint8) bool {
+ t.Helper()
+ a := archsimd.LoadUint8x32Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x16) archsimd.Uint16x16, want func(x []uint16) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x8) archsimd.Uint16x8, want func(x []uint32) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x8ConvertToUint16(t *testing.T, f func(x archsimd.Float32x8) archsimd.Uint16x8, want func(x []float32) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt16x32ConvertToUint16(t *testing.T, f func(x archsimd.Int16x32) archsimd.Uint16x32, want func(x []int16) []uint16) {
+ n := 32
+ t.Helper()
+ forSlice(t, int16s, n, func(x []int16) bool {
+ t.Helper()
+ a := archsimd.LoadInt16x32Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt32x16ConvertToUint16(t *testing.T, f func(x archsimd.Int32x16) archsimd.Uint16x16, want func(x []int32) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, int32s, n, func(x []int32) bool {
+ t.Helper()
+ a := archsimd.LoadInt32x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testInt64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testInt64x8ConvertToUint16(t *testing.T, f func(x archsimd.Int64x8) archsimd.Uint16x8, want func(x []int64) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, int64s, n, func(x []int64) bool {
+ t.Helper()
+ a := archsimd.LoadInt64x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint16x32ConvertToUint16(t *testing.T, f func(x archsimd.Uint16x32) archsimd.Uint16x32, want func(x []uint16) []uint16) {
+ n := 32
+ t.Helper()
+ forSlice(t, uint16s, n, func(x []uint16) bool {
+ t.Helper()
+ a := archsimd.LoadUint16x32Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint32x16ConvertToUint16(t *testing.T, f func(x archsimd.Uint32x16) archsimd.Uint16x16, want func(x []uint32) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, uint32s, n, func(x []uint32) bool {
+ t.Helper()
+ a := archsimd.LoadUint32x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testUint64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testUint64x8ConvertToUint16(t *testing.T, f func(x archsimd.Uint64x8) archsimd.Uint16x8, want func(x []uint64) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, uint64s, n, func(x []uint64) bool {
+ t.Helper()
+ a := archsimd.LoadUint64x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat32x16ConvertToUint16(t *testing.T, f func(x archsimd.Float32x16) archsimd.Uint16x16, want func(x []float32) []uint16) {
+ n := 16
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
+// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
+func testFloat64x8ConvertToUint16(t *testing.T, f func(x archsimd.Float64x8) archsimd.Uint16x8, want func(x []float64) []uint16) {
+ n := 8
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ g := make([]uint16, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat32x4UnaryFlaky(t *testing.T, f func(x archsimd.Float32x4) archsimd.Float32x4, want func(x []float32) []float32, flakiness float64) {
+ n := 4
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x4Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x2UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat64x2UnaryFlaky(t *testing.T, f func(x archsimd.Float64x2) archsimd.Float64x2, want func(x []float64) []float64, flakiness float64) {
+ n := 2
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x2Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat32x8UnaryFlaky(t *testing.T, f func(x archsimd.Float32x8) archsimd.Float32x8, want func(x []float32) []float32, flakiness float64) {
+ n := 8
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x8Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat64x4UnaryFlaky(t *testing.T, f func(x archsimd.Float64x4) archsimd.Float64x4, want func(x []float64) []float64, flakiness float64) {
+ n := 4
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x4Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat32x16UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat32x16UnaryFlaky(t *testing.T, f func(x archsimd.Float32x16) archsimd.Float32x16, want func(x []float32) []float32, flakiness float64) {
+ n := 16
+ t.Helper()
+ forSlice(t, float32s, n, func(x []float32) bool {
+ t.Helper()
+ a := archsimd.LoadFloat32x16Slice(x)
+ g := make([]float32, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
+
+// testFloat64x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
+// but using a flakiness parameter that tolerates a small relative error, since the exact simd floating point result may differ slightly from the scalar reference.
+func testFloat64x8UnaryFlaky(t *testing.T, f func(x archsimd.Float64x8) archsimd.Float64x8, want func(x []float64) []float64, flakiness float64) {
+ n := 8
+ t.Helper()
+ forSlice(t, float64s, n, func(x []float64) bool {
+ t.Helper()
+ a := archsimd.LoadFloat64x8Slice(x)
+ g := make([]float64, n)
+ f(a).StoreSlice(g)
+ w := want(x)
+ return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
+ })
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package simd_test
+
+import (
+ "math"
+ "simd/archsimd"
+ "testing"
+)
+
+func TestCeil(t *testing.T) {
+ testFloat32x4Unary(t, archsimd.Float32x4.Ceil, ceilSlice[float32])
+ testFloat32x8Unary(t, archsimd.Float32x8.Ceil, ceilSlice[float32])
+ testFloat64x2Unary(t, archsimd.Float64x2.Ceil, ceilSlice[float64])
+ testFloat64x4Unary(t, archsimd.Float64x4.Ceil, ceilSlice[float64])
+ if archsimd.X86.AVX512() {
+ // testFloat32x16Unary(t, archsimd.Float32x16.Ceil, ceilSlice[float32]) // missing
+ // testFloat64x8Unary(t, archsimd.Float64x8.Ceil, ceilSlice[float64]) // missing
+ }
+}
+
+func TestFloor(t *testing.T) {
+ testFloat32x4Unary(t, archsimd.Float32x4.Floor, floorSlice[float32])
+ testFloat32x8Unary(t, archsimd.Float32x8.Floor, floorSlice[float32])
+ testFloat64x2Unary(t, archsimd.Float64x2.Floor, floorSlice[float64])
+ testFloat64x4Unary(t, archsimd.Float64x4.Floor, floorSlice[float64])
+ if archsimd.X86.AVX512() {
+ // testFloat32x16Unary(t, archsimd.Float32x16.Floor, floorSlice[float32]) // missing
+ // testFloat64x8Unary(t, archsimd.Float64x8.Floor, floorSlice[float64]) // missing
+ }
+}
+
+func TestTrunc(t *testing.T) {
+ testFloat32x4Unary(t, archsimd.Float32x4.Trunc, truncSlice[float32])
+ testFloat32x8Unary(t, archsimd.Float32x8.Trunc, truncSlice[float32])
+ testFloat64x2Unary(t, archsimd.Float64x2.Trunc, truncSlice[float64])
+ testFloat64x4Unary(t, archsimd.Float64x4.Trunc, truncSlice[float64])
+ if archsimd.X86.AVX512() {
+ // testFloat32x16Unary(t, archsimd.Float32x16.Trunc, truncSlice[float32]) // missing
+ // testFloat64x8Unary(t, archsimd.Float64x8.Trunc, truncSlice[float64]) // missing
+ }
+}
+
+func TestRound(t *testing.T) {
+ testFloat32x4Unary(t, archsimd.Float32x4.RoundToEven, roundSlice[float32])
+ testFloat32x8Unary(t, archsimd.Float32x8.RoundToEven, roundSlice[float32])
+ testFloat64x2Unary(t, archsimd.Float64x2.RoundToEven, roundSlice[float64])
+ testFloat64x4Unary(t, archsimd.Float64x4.RoundToEven, roundSlice[float64])
+ if archsimd.X86.AVX512() {
+		// testFloat32x16Unary(t, archsimd.Float32x16.RoundToEven, roundSlice[float32]) // missing
+		// testFloat64x8Unary(t, archsimd.Float64x8.RoundToEven, roundSlice[float64]) // missing
+ }
+}
+
+func TestSqrt(t *testing.T) {
+ testFloat32x4Unary(t, archsimd.Float32x4.Sqrt, sqrtSlice[float32])
+ testFloat32x8Unary(t, archsimd.Float32x8.Sqrt, sqrtSlice[float32])
+ testFloat64x2Unary(t, archsimd.Float64x2.Sqrt, sqrtSlice[float64])
+ testFloat64x4Unary(t, archsimd.Float64x4.Sqrt, sqrtSlice[float64])
+ if archsimd.X86.AVX512() {
+ testFloat32x16Unary(t, archsimd.Float32x16.Sqrt, sqrtSlice[float32])
+ testFloat64x8Unary(t, archsimd.Float64x8.Sqrt, sqrtSlice[float64])
+ }
+}
+
+func TestNot(t *testing.T) {
+ testInt8x16Unary(t, archsimd.Int8x16.Not, map1[int8](not))
+ testInt8x32Unary(t, archsimd.Int8x32.Not, map1[int8](not))
+ testInt16x8Unary(t, archsimd.Int16x8.Not, map1[int16](not))
+ testInt16x16Unary(t, archsimd.Int16x16.Not, map1[int16](not))
+ testInt32x4Unary(t, archsimd.Int32x4.Not, map1[int32](not))
+ testInt32x8Unary(t, archsimd.Int32x8.Not, map1[int32](not))
+}
+
+func TestAbsolute(t *testing.T) {
+ testInt8x16Unary(t, archsimd.Int8x16.Abs, map1[int8](abs))
+ testInt8x32Unary(t, archsimd.Int8x32.Abs, map1[int8](abs))
+ testInt16x8Unary(t, archsimd.Int16x8.Abs, map1[int16](abs))
+ testInt16x16Unary(t, archsimd.Int16x16.Abs, map1[int16](abs))
+ testInt32x4Unary(t, archsimd.Int32x4.Abs, map1[int32](abs))
+ testInt32x8Unary(t, archsimd.Int32x8.Abs, map1[int32](abs))
+ if archsimd.X86.AVX512() {
+ testInt8x64Unary(t, archsimd.Int8x64.Abs, map1[int8](abs))
+ testInt16x32Unary(t, archsimd.Int16x32.Abs, map1[int16](abs))
+ testInt32x16Unary(t, archsimd.Int32x16.Abs, map1[int32](abs))
+ testInt64x2Unary(t, archsimd.Int64x2.Abs, map1[int64](abs))
+ testInt64x4Unary(t, archsimd.Int64x4.Abs, map1[int64](abs))
+ testInt64x8Unary(t, archsimd.Int64x8.Abs, map1[int64](abs))
+ }
+}
+
+func TestCeilScaledResidue(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Needs AVX512")
+ }
+ testFloat64x8UnaryFlaky(t,
+ func(x archsimd.Float64x8) archsimd.Float64x8 { return x.CeilScaledResidue(0) },
+ map1(ceilResidueForPrecision[float64](0)),
+ 0.001)
+ testFloat64x8UnaryFlaky(t,
+ func(x archsimd.Float64x8) archsimd.Float64x8 { return x.CeilScaledResidue(1) },
+ map1(ceilResidueForPrecision[float64](1)),
+ 0.001)
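+	// x.Sub(x.CeilScaled(0)) should match x - math.Ceil(x) exactly, so the check below uses the non-flaky helper.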
+ testFloat64x8Unary(t,
+ func(x archsimd.Float64x8) archsimd.Float64x8 { return x.Sub(x.CeilScaled(0)) },
+ map1[float64](func(x float64) float64 { return x - math.Ceil(x) }))
+}
+
+func TestToUint32(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Needs AVX512")
+ }
+ testFloat32x4ConvertToUint32(t, archsimd.Float32x4.ConvertToUint32, map1[float32](toUint32))
+ testFloat32x8ConvertToUint32(t, archsimd.Float32x8.ConvertToUint32, map1[float32](toUint32))
+ testFloat32x16ConvertToUint32(t, archsimd.Float32x16.ConvertToUint32, map1[float32](toUint32))
+}
+
+func TestToInt32(t *testing.T) {
+ testFloat32x4ConvertToInt32(t, archsimd.Float32x4.ConvertToInt32, map1[float32](toInt32))
+ testFloat32x8ConvertToInt32(t, archsimd.Float32x8.ConvertToInt32, map1[float32](toInt32))
+}
+
+func TestConverts(t *testing.T) {
+ testUint8x16ConvertToUint16(t, archsimd.Uint8x16.ExtendToUint16, map1[uint8](toUint16))
+ testUint16x8ConvertToUint32(t, archsimd.Uint16x8.ExtendToUint32, map1[uint16](toUint32))
+}
+
+func TestConvertsAVX512(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("Needs AVX512")
+ }
+ testUint8x32ConvertToUint16(t, archsimd.Uint8x32.ExtendToUint16, map1[uint8](toUint16))
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package test_helpers
+
+import (
+ "math"
+ "testing"
+)
+
+type signed interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64
+}
+
+type integer interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
+}
+
+type float interface {
+ ~float32 | ~float64
+}
+
+type number interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
+}
+
+// CheckSlices compares got and want for exact equality (NaNs compare equal,
+// and +0 and -0 are distinguished), reporting a test error on any mismatch.
+func CheckSlices[T number](t *testing.T, got, want []T) bool {
+ t.Helper()
+ return CheckSlicesLogInput[T](t, got, want, 0.0, nil)
+}
+
+// CheckSlicesLogInput compares got and want for equality,
+// reporting a test error if there is a problem,
+// and also consumes the two slices so that a
+// test/benchmark won't be dead-code eliminated.
+// A nonzero flakiness tolerates a small relative error (or absolute error when
+// the expected value is zero) in floating point elements, and logInput, if
+// non-nil, is called to log the test inputs when a check fails.
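+//
+// A minimal usage sketch with a 0.1% tolerance (compute and reference are
+// hypothetical helpers, not part of this package):
+//
+//	got := compute(x)    // SIMD result under test
+//	want := reference(x) // scalar reference
+//	ok := CheckSlicesLogInput(t, got, want, 0.001, func() { t.Logf("x=%v", x) })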
+func CheckSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
+ t.Helper()
+ var z T
+ for i := range want {
+ if got[i] != want[i] {
+ var ia any = got[i]
+ var ib any = want[i]
+ switch x := ia.(type) {
+ case float32:
+ y := ib.(float32)
+ if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) {
+ continue
+ }
+ if flakiness > 0 {
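+					// A zero reference value would make the relative error undefined,
+					// so fall back to an absolute tolerance when want is zero.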
+ if y == 0 {
+ if math.Abs(float64(x)) < flakiness {
+ continue
+ }
+ } else {
+ if math.Abs(float64((x-y)/y)) < flakiness {
+ continue
+ }
+ }
+ }
+ case float64:
+ y := ib.(float64)
+ if math.IsNaN(x) && math.IsNaN(y) {
+ continue
+ }
+ if flakiness > 0 {
+ if y == 0 {
+ if math.Abs(x) < flakiness {
+ continue
+ }
+ } else if math.Abs((x-y)/y) < flakiness {
+ continue
+ }
+ }
+
+ default:
+ }
+
+ t.Logf("For %T vector elements:", z)
+ t.Logf("got =%v", got)
+ t.Logf("want=%v", want)
+ if logInput != nil {
+ logInput()
+ }
+ t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i])
+ return false
+ } else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference
+ var ia any = got[i]
+ var ib any = want[i]
+ switch x := ia.(type) {
+ case float32:
+ y := ib.(float32)
+ if math.Float32bits(x) != math.Float32bits(y) {
+ t.Logf("For %T vector elements:", z)
+ t.Logf("got =%v", got)
+ t.Logf("want=%v", want)
+ if logInput != nil {
+ logInput()
+ }
+ t.Errorf("at index %d, different signs of zero", i)
+ return false
+ }
+ case float64:
+ y := ib.(float64)
+ if math.Float64bits(x) != math.Float64bits(y) {
+ t.Logf("For %T vector elements:", z)
+ t.Logf("got =%v", got)
+ t.Logf("want=%v", want)
+ if logInput != nil {
+ logInput()
+ }
+ t.Errorf("at index %d, different signs of zero", i)
+ return false
+ }
+ default:
+ }
+
+ }
+ }
+ return true
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x16) Masked(mask Mask8x16) Int8x16 {
+ im := mask.AsInt8x16()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x16) Merge(y Int8x16, mask Mask8x16) Int8x16 {
+ im := mask.AsInt8x16()
+ return y.blend(x, im)
+}
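+
+// As an illustration of Masked and Merge (hypothetical values):
+//
+//	x    = [1, 2, 3, 4, ...]
+//	y    = [9, 9, 9, 9, ...]
+//	mask = [T, F, T, F, ...]
+//	x.Masked(mask)   // [1, 0, 3, 0, ...]
+//	x.Merge(y, mask) // [1, 9, 3, 9, ...]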
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x8) Masked(mask Mask16x8) Int16x8 {
+ im := mask.AsInt16x8()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x8) Merge(y Int16x8, mask Mask16x8) Int16x8 {
+ im := mask.AsInt16x8().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt16x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x4) Masked(mask Mask32x4) Int32x4 {
+ im := mask.AsInt32x4()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x4) Merge(y Int32x4, mask Mask32x4) Int32x4 {
+ im := mask.AsInt32x4().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt32x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x2) Masked(mask Mask64x2) Int64x2 {
+ im := mask.AsInt64x2()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x2) Merge(y Int64x2, mask Mask64x2) Int64x2 {
+ im := mask.AsInt64x2().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsInt64x2()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x16) Masked(mask Mask8x16) Uint8x16 {
+ im := mask.AsInt8x16()
+ return x.AsInt8x16().And(im).AsUint8x16()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x16) Merge(y Uint8x16, mask Mask8x16) Uint8x16 {
+ im := mask.AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint8x16()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x8) Masked(mask Mask16x8) Uint16x8 {
+ im := mask.AsInt16x8()
+ return x.AsInt16x8().And(im).AsUint16x8()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x8) Merge(y Uint16x8, mask Mask16x8) Uint16x8 {
+ im := mask.AsInt16x8().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint16x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x4) Masked(mask Mask32x4) Uint32x4 {
+ im := mask.AsInt32x4()
+ return x.AsInt32x4().And(im).AsUint32x4()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x4) Merge(y Uint32x4, mask Mask32x4) Uint32x4 {
+ im := mask.AsInt32x4().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint32x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x2) Masked(mask Mask64x2) Uint64x2 {
+ im := mask.AsInt64x2()
+ return x.AsInt64x2().And(im).AsUint64x2()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x2) Merge(y Uint64x2, mask Mask64x2) Uint64x2 {
+ im := mask.AsInt64x2().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsUint64x2()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x4) Masked(mask Mask32x4) Float32x4 {
+ im := mask.AsInt32x4()
+ return x.AsInt32x4().And(im).AsFloat32x4()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x4) Merge(y Float32x4, mask Mask32x4) Float32x4 {
+ im := mask.AsInt32x4().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsFloat32x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x2) Masked(mask Mask64x2) Float64x2 {
+ im := mask.AsInt64x2()
+ return x.AsInt64x2().And(im).AsFloat64x2()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x2) Merge(y Float64x2, mask Mask64x2) Float64x2 {
+ im := mask.AsInt64x2().AsInt8x16()
+ ix := x.AsInt8x16()
+ iy := y.AsInt8x16()
+ return iy.blend(ix, im).AsFloat64x2()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x32) Masked(mask Mask8x32) Int8x32 {
+ im := mask.AsInt8x32()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x32) Merge(y Int8x32, mask Mask8x32) Int8x32 {
+ im := mask.AsInt8x32()
+ return y.blend(x, im)
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x16) Masked(mask Mask16x16) Int16x16 {
+ im := mask.AsInt16x16()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x16) Merge(y Int16x16, mask Mask16x16) Int16x16 {
+ im := mask.AsInt16x16().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt16x16()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x8) Masked(mask Mask32x8) Int32x8 {
+ im := mask.AsInt32x8()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x8) Merge(y Int32x8, mask Mask32x8) Int32x8 {
+ im := mask.AsInt32x8().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt32x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x4) Masked(mask Mask64x4) Int64x4 {
+ im := mask.AsInt64x4()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x4) Merge(y Int64x4, mask Mask64x4) Int64x4 {
+ im := mask.AsInt64x4().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsInt64x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x32) Masked(mask Mask8x32) Uint8x32 {
+ im := mask.AsInt8x32()
+ return x.AsInt8x32().And(im).AsUint8x32()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x32) Merge(y Uint8x32, mask Mask8x32) Uint8x32 {
+ im := mask.AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint8x32()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x16) Masked(mask Mask16x16) Uint16x16 {
+ im := mask.AsInt16x16()
+ return x.AsInt16x16().And(im).AsUint16x16()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x16) Merge(y Uint16x16, mask Mask16x16) Uint16x16 {
+ im := mask.AsInt16x16().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint16x16()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x8) Masked(mask Mask32x8) Uint32x8 {
+ im := mask.AsInt32x8()
+ return x.AsInt32x8().And(im).AsUint32x8()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x8) Merge(y Uint32x8, mask Mask32x8) Uint32x8 {
+ im := mask.AsInt32x8().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint32x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x4) Masked(mask Mask64x4) Uint64x4 {
+ im := mask.AsInt64x4()
+ return x.AsInt64x4().And(im).AsUint64x4()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x4) Merge(y Uint64x4, mask Mask64x4) Uint64x4 {
+ im := mask.AsInt64x4().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsUint64x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x8) Masked(mask Mask32x8) Float32x8 {
+ im := mask.AsInt32x8()
+ return x.AsInt32x8().And(im).AsFloat32x8()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x8) Merge(y Float32x8, mask Mask32x8) Float32x8 {
+ im := mask.AsInt32x8().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsFloat32x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x4) Masked(mask Mask64x4) Float64x4 {
+ im := mask.AsInt64x4()
+ return x.AsInt64x4().And(im).AsFloat64x4()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x4) Merge(y Float64x4, mask Mask64x4) Float64x4 {
+ im := mask.AsInt64x4().AsInt8x32()
+ ix := x.AsInt8x32()
+ iy := y.AsInt8x32()
+ return iy.blend(ix, im).AsFloat64x4()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int8x64) Masked(mask Mask8x64) Int8x64 {
+ im := mask.AsInt8x64()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int8x64) Merge(y Int8x64, mask Mask8x64) Int8x64 {
+ return y.blendMasked(x, mask)
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int16x32) Masked(mask Mask16x32) Int16x32 {
+ im := mask.AsInt16x32()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int16x32) Merge(y Int16x32, mask Mask16x32) Int16x32 {
+ return y.blendMasked(x, mask)
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int32x16) Masked(mask Mask32x16) Int32x16 {
+ im := mask.AsInt32x16()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int32x16) Merge(y Int32x16, mask Mask32x16) Int32x16 {
+ return y.blendMasked(x, mask)
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Int64x8) Masked(mask Mask64x8) Int64x8 {
+ im := mask.AsInt64x8()
+ return im.And(x)
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Int64x8) Merge(y Int64x8, mask Mask64x8) Int64x8 {
+ return y.blendMasked(x, mask)
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint8x64) Masked(mask Mask8x64) Uint8x64 {
+ im := mask.AsInt8x64()
+ return x.AsInt8x64().And(im).AsUint8x64()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 {
+ ix := x.AsInt8x64()
+ iy := y.AsInt8x64()
+ return iy.blendMasked(ix, mask).AsUint8x64()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint16x32) Masked(mask Mask16x32) Uint16x32 {
+ im := mask.AsInt16x32()
+ return x.AsInt16x32().And(im).AsUint16x32()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 {
+ ix := x.AsInt16x32()
+ iy := y.AsInt16x32()
+ return iy.blendMasked(ix, mask).AsUint16x32()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint32x16) Masked(mask Mask32x16) Uint32x16 {
+ im := mask.AsInt32x16()
+ return x.AsInt32x16().And(im).AsUint32x16()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 {
+ ix := x.AsInt32x16()
+ iy := y.AsInt32x16()
+ return iy.blendMasked(ix, mask).AsUint32x16()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Uint64x8) Masked(mask Mask64x8) Uint64x8 {
+ im := mask.AsInt64x8()
+ return x.AsInt64x8().And(im).AsUint64x8()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 {
+ ix := x.AsInt64x8()
+ iy := y.AsInt64x8()
+ return iy.blendMasked(ix, mask).AsUint64x8()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float32x16) Masked(mask Mask32x16) Float32x16 {
+ im := mask.AsInt32x16()
+ return x.AsInt32x16().And(im).AsFloat32x16()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 {
+ ix := x.AsInt32x16()
+ iy := y.AsInt32x16()
+ return iy.blendMasked(ix, mask).AsFloat32x16()
+}
+
+// Masked returns x but with elements zeroed where mask is false.
+func (x Float64x8) Masked(mask Mask64x8) Float64x8 {
+ im := mask.AsInt64x8()
+ return x.AsInt64x8().And(im).AsFloat64x8()
+}
+
+// Merge returns x but with elements set to y where mask is false.
+func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 {
+ ix := x.AsInt64x8()
+ iy := y.AsInt64x8()
+ return iy.blendMasked(ix, mask).AsFloat64x8()
+}
--- /dev/null
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+/* AESDecryptLastRound */
+
+// AESDecryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
+//
+// Asm: VAESDECLAST, CPU Feature: AVX, AES
+func (x Uint8x16) AESDecryptLastRound(y Uint32x4) Uint8x16
+
+// AESDecryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
+//
+// Asm: VAESDECLAST, CPU Feature: AVX512VAES
+func (x Uint8x32) AESDecryptLastRound(y Uint32x8) Uint8x32
+
+// AESDecryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
+//
+// Asm: VAESDECLAST, CPU Feature: AVX512VAES
+func (x Uint8x64) AESDecryptLastRound(y Uint32x16) Uint8x64
+
+/* AESDecryptOneRound */
+
+// AESDecryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
+//
+// Asm: VAESDEC, CPU Feature: AVX, AES
+func (x Uint8x16) AESDecryptOneRound(y Uint32x4) Uint8x16
+
+// AESDecryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
+//
+// Asm: VAESDEC, CPU Feature: AVX512VAES
+func (x Uint8x32) AESDecryptOneRound(y Uint32x8) Uint8x32
+
+// AESDecryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the dw array in use.
+// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
+//
+// Asm: VAESDEC, CPU Feature: AVX512VAES
+func (x Uint8x64) AESDecryptOneRound(y Uint32x16) Uint8x64
+
+/* AESEncryptLastRound */
+
+// AESEncryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENCLAST, CPU Feature: AVX, AES
+func (x Uint8x16) AESEncryptLastRound(y Uint32x4) Uint8x16
+
+// AESEncryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENCLAST, CPU Feature: AVX512VAES
+func (x Uint8x32) AESEncryptLastRound(y Uint32x8) Uint8x32
+
+// AESEncryptLastRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENCLAST, CPU Feature: AVX512VAES
+func (x Uint8x64) AESEncryptLastRound(y Uint32x16) Uint8x64
+
+/* AESEncryptOneRound */
+
+// AESEncryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENC, CPU Feature: AVX, AES
+func (x Uint8x16) AESEncryptOneRound(y Uint32x4) Uint8x16
+
+// AESEncryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENC, CPU Feature: AVX512VAES
+func (x Uint8x32) AESEncryptOneRound(y Uint32x8) Uint8x32
+
+// AESEncryptOneRound performs a series of operations in the AES cipher algorithm defined in FIPS 197.
+// x is the state array; from low index to high, its elements are s00, s10, s20, s30, s01, ..., s33.
+// y is the chunk of the w array in use.
+// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
+//
+// Asm: VAESENC, CPU Feature: AVX512VAES
+func (x Uint8x64) AESEncryptOneRound(y Uint32x16) Uint8x64
+
+/* AESInvMixColumns */
+
+// AESInvMixColumns performs the InvMixColumns operation in the AES cipher algorithm defined in FIPS 197.
+// x is the chunk of the w array in use.
+// result = InvMixColumns(x)
+//
+// Asm: VAESIMC, CPU Feature: AVX, AES
+func (x Uint32x4) AESInvMixColumns() Uint32x4
+
+/* AESRoundKeyGenAssist */
+
+// AESRoundKeyGenAssist performs some components of the KeyExpansion step in the AES cipher algorithm defined in FIPS 197.
+// x is an array of AES words, but only x[0] and x[2] are used.
+// r is a value from the Rcon constant array.
+// result[0] = XOR(SubWord(RotWord(x[0])), r)
+// result[1] = SubWord(x[1])
+// result[2] = XOR(SubWord(RotWord(x[2])), r)
+// result[3] = SubWord(x[3])
+//
+// rconVal results in better performance when it is a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VAESKEYGENASSIST, CPU Feature: AVX, AES
+func (x Uint32x4) AESRoundKeyGenAssist(rconVal uint8) Uint32x4
+
+/* Abs */
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSB, CPU Feature: AVX
+func (x Int8x16) Abs() Int8x16
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSB, CPU Feature: AVX2
+func (x Int8x32) Abs() Int8x32
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSB, CPU Feature: AVX512
+func (x Int8x64) Abs() Int8x64
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSW, CPU Feature: AVX
+func (x Int16x8) Abs() Int16x8
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSW, CPU Feature: AVX2
+func (x Int16x16) Abs() Int16x16
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSW, CPU Feature: AVX512
+func (x Int16x32) Abs() Int16x32
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSD, CPU Feature: AVX
+func (x Int32x4) Abs() Int32x4
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSD, CPU Feature: AVX2
+func (x Int32x8) Abs() Int32x8
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSD, CPU Feature: AVX512
+func (x Int32x16) Abs() Int32x16
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSQ, CPU Feature: AVX512
+func (x Int64x2) Abs() Int64x2
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSQ, CPU Feature: AVX512
+func (x Int64x4) Abs() Int64x4
+
+// Abs computes the absolute value of each element.
+//
+// Asm: VPABSQ, CPU Feature: AVX512
+func (x Int64x8) Abs() Int64x8
+
+/* Add */
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPS, CPU Feature: AVX
+func (x Float32x4) Add(y Float32x4) Float32x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPS, CPU Feature: AVX
+func (x Float32x8) Add(y Float32x8) Float32x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPS, CPU Feature: AVX512
+func (x Float32x16) Add(y Float32x16) Float32x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPD, CPU Feature: AVX
+func (x Float64x2) Add(y Float64x2) Float64x2
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPD, CPU Feature: AVX
+func (x Float64x4) Add(y Float64x4) Float64x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VADDPD, CPU Feature: AVX512
+func (x Float64x8) Add(y Float64x8) Float64x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX
+func (x Int8x16) Add(y Int8x16) Int8x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX2
+func (x Int8x32) Add(y Int8x32) Int8x32
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX512
+func (x Int8x64) Add(y Int8x64) Int8x64
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX
+func (x Int16x8) Add(y Int16x8) Int16x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX2
+func (x Int16x16) Add(y Int16x16) Int16x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX512
+func (x Int16x32) Add(y Int16x32) Int16x32
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX
+func (x Int32x4) Add(y Int32x4) Int32x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX2
+func (x Int32x8) Add(y Int32x8) Int32x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX512
+func (x Int32x16) Add(y Int32x16) Int32x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX
+func (x Int64x2) Add(y Int64x2) Int64x2
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX2
+func (x Int64x4) Add(y Int64x4) Int64x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX512
+func (x Int64x8) Add(y Int64x8) Int64x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX
+func (x Uint8x16) Add(y Uint8x16) Uint8x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX2
+func (x Uint8x32) Add(y Uint8x32) Uint8x32
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDB, CPU Feature: AVX512
+func (x Uint8x64) Add(y Uint8x64) Uint8x64
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX
+func (x Uint16x8) Add(y Uint16x8) Uint16x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX2
+func (x Uint16x16) Add(y Uint16x16) Uint16x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDW, CPU Feature: AVX512
+func (x Uint16x32) Add(y Uint16x32) Uint16x32
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX
+func (x Uint32x4) Add(y Uint32x4) Uint32x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX2
+func (x Uint32x8) Add(y Uint32x8) Uint32x8
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDD, CPU Feature: AVX512
+func (x Uint32x16) Add(y Uint32x16) Uint32x16
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX
+func (x Uint64x2) Add(y Uint64x2) Uint64x2
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX2
+func (x Uint64x4) Add(y Uint64x4) Uint64x4
+
+// Add adds corresponding elements of two vectors.
+//
+// Asm: VPADDQ, CPU Feature: AVX512
+func (x Uint64x8) Add(y Uint64x8) Uint64x8
+
+/* AddPairs */
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x4) AddPairs(y Float32x4) Float32x4
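+
+// Illustrative example (hypothetical values), following the formula documented above:
+// for x = [1, 2, 3, 4] and y = [10, 20, 30, 40],
+// x.AddPairs(y) = [10+20, 30+40, 1+2, 3+4] = [30, 70, 3, 7].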
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x8) AddPairs(y Float32x8) Float32x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x2) AddPairs(y Float64x2) Float64x2
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x4) AddPairs(y Float64x4) Float64x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Int16x8) AddPairs(y Int16x8) Int16x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Int16x16) AddPairs(y Int16x16) Int16x16
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Int32x4) AddPairs(y Int32x4) Int32x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Int32x8) AddPairs(y Int32x8) Int32x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
+
+// AddPairs horizontally adds adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8
+
+/* AddPairsSaturated */
+
+// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX
+func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
+
+// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+//
+// Asm: VPHADDSW, CPU Feature: AVX2
+func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16
+
+/* AddSaturated */
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX
+func (x Int8x16) AddSaturated(y Int8x16) Int8x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX2
+func (x Int8x32) AddSaturated(y Int8x32) Int8x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSB, CPU Feature: AVX512
+func (x Int8x64) AddSaturated(y Int8x64) Int8x64
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX
+func (x Int16x8) AddSaturated(y Int16x8) Int16x8
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX2
+func (x Int16x16) AddSaturated(y Int16x16) Int16x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDSW, CPU Feature: AVX512
+func (x Int16x32) AddSaturated(y Int16x32) Int16x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSB, CPU Feature: AVX
+func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSB, CPU Feature: AVX2
+func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSB, CPU Feature: AVX512
+func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSW, CPU Feature: AVX
+func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSW, CPU Feature: AVX2
+func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16
+
+// AddSaturated adds corresponding elements of two vectors with saturation.
+//
+// Asm: VPADDUSW, CPU Feature: AVX512
+func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32
+
+/* AddSub */
+
+// AddSub subtracts even elements and adds odd elements of two vectors.
+//
+// Asm: VADDSUBPS, CPU Feature: AVX
+func (x Float32x4) AddSub(y Float32x4) Float32x4
+
+// AddSub subtracts even elements and adds odd elements of two vectors.
+//
+// Asm: VADDSUBPS, CPU Feature: AVX
+func (x Float32x8) AddSub(y Float32x8) Float32x8
+
+// AddSub subtracts even elements and adds odd elements of two vectors.
+//
+// Asm: VADDSUBPD, CPU Feature: AVX
+func (x Float64x2) AddSub(y Float64x2) Float64x2
+
+// AddSub subtracts even elements and adds odd elements of two vectors.
+//
+// Asm: VADDSUBPD, CPU Feature: AVX
+func (x Float64x4) AddSub(y Float64x4) Float64x4
+
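+// A short sketch of AddSub's even/odd behavior (illustrative only; the slice
+// loader name is an assumption):
+//
+//	x := LoadFloat64x2Slice([]float64{5, 5})
+//	y := LoadFloat64x2Slice([]float64{2, 3})
+//	// Even lanes subtract, odd lanes add: [5-2, 5+3] = [3, 8].
+//	_ = x.AddSub(y)
+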
+/* And */
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Int8x16) And(y Int8x16) Int8x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Int8x32) And(y Int8x32) Int8x32
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Int8x64) And(y Int8x64) Int8x64
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Int16x8) And(y Int16x8) Int16x8
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Int16x16) And(y Int16x16) Int16x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Int16x32) And(y Int16x32) Int16x32
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Int32x4) And(y Int32x4) Int32x4
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Int32x8) And(y Int32x8) Int32x8
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Int32x16) And(y Int32x16) Int32x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Int64x2) And(y Int64x2) Int64x2
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Int64x4) And(y Int64x4) Int64x4
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDQ, CPU Feature: AVX512
+func (x Int64x8) And(y Int64x8) Int64x8
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Uint8x16) And(y Uint8x16) Uint8x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Uint8x32) And(y Uint8x32) Uint8x32
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Uint8x64) And(y Uint8x64) Uint8x64
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Uint16x8) And(y Uint16x8) Uint16x8
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Uint16x16) And(y Uint16x16) Uint16x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Uint16x32) And(y Uint16x32) Uint16x32
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Uint32x4) And(y Uint32x4) Uint32x4
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Uint32x8) And(y Uint32x8) Uint32x8
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDD, CPU Feature: AVX512
+func (x Uint32x16) And(y Uint32x16) Uint32x16
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX
+func (x Uint64x2) And(y Uint64x2) Uint64x2
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPAND, CPU Feature: AVX2
+func (x Uint64x4) And(y Uint64x4) Uint64x4
+
+// And performs a bitwise AND operation between two vectors.
+//
+// Asm: VPANDQ, CPU Feature: AVX512
+func (x Uint64x8) And(y Uint64x8) Uint64x8
+
+/* AndNot */
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Int8x16) AndNot(y Int8x16) Int8x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Int8x32) AndNot(y Int8x32) Int8x32
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Int8x64) AndNot(y Int8x64) Int8x64
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Int16x8) AndNot(y Int16x8) Int16x8
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Int16x16) AndNot(y Int16x16) Int16x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Int16x32) AndNot(y Int16x32) Int16x32
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Int32x4) AndNot(y Int32x4) Int32x4
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Int32x8) AndNot(y Int32x8) Int32x8
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Int32x16) AndNot(y Int32x16) Int32x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Int64x2) AndNot(y Int64x2) Int64x2
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Int64x4) AndNot(y Int64x4) Int64x4
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDNQ, CPU Feature: AVX512
+func (x Int64x8) AndNot(y Int64x8) Int64x8
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Uint8x16) AndNot(y Uint8x16) Uint8x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Uint8x32) AndNot(y Uint8x32) Uint8x32
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Uint8x64) AndNot(y Uint8x64) Uint8x64
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Uint16x8) AndNot(y Uint16x8) Uint16x8
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Uint16x16) AndNot(y Uint16x16) Uint16x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Uint16x32) AndNot(y Uint16x32) Uint16x32
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Uint32x4) AndNot(y Uint32x4) Uint32x4
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Uint32x8) AndNot(y Uint32x8) Uint32x8
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDND, CPU Feature: AVX512
+func (x Uint32x16) AndNot(y Uint32x16) Uint32x16
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX
+func (x Uint64x2) AndNot(y Uint64x2) Uint64x2
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDN, CPU Feature: AVX2
+func (x Uint64x4) AndNot(y Uint64x4) Uint64x4
+
+// AndNot performs a bitwise x &^ y.
+//
+// Asm: VPANDNQ, CPU Feature: AVX512
+func (x Uint64x8) AndNot(y Uint64x8) Uint64x8
+
+/* Average */
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGB, CPU Feature: AVX
+func (x Uint8x16) Average(y Uint8x16) Uint8x16
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGB, CPU Feature: AVX2
+func (x Uint8x32) Average(y Uint8x32) Uint8x32
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGB, CPU Feature: AVX512
+func (x Uint8x64) Average(y Uint8x64) Uint8x64
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGW, CPU Feature: AVX
+func (x Uint16x8) Average(y Uint16x8) Uint16x8
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGW, CPU Feature: AVX2
+func (x Uint16x16) Average(y Uint16x16) Uint16x16
+
+// Average computes the rounded average of corresponding elements.
+//
+// Asm: VPAVGW, CPU Feature: AVX512
+func (x Uint16x32) Average(y Uint16x32) Uint16x32
+
+/* Broadcast128 */
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast128() Float32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Float64x2) Broadcast128() Float64x2
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast128() Int8x16
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast128() Int16x8
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast128() Int32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast128() Int64x2
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast128() Uint8x16
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast128() Uint16x8
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) Broadcast128() Uint32x4
+
+// Broadcast128 copies element zero of its (128-bit) input to all elements of
+// the 128-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast128() Uint64x2
+
+/* Broadcast256 */
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX2
+func (x Float32x4) Broadcast256() Float32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX2
+func (x Float64x2) Broadcast256() Float64x4
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Int8x16) Broadcast256() Int8x32
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Int16x8) Broadcast256() Int16x16
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Int32x4) Broadcast256() Int32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Int64x2) Broadcast256() Int64x4
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX2
+func (x Uint8x16) Broadcast256() Uint8x32
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX2
+func (x Uint16x8) Broadcast256() Uint16x16
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX2
+func (x Uint32x4) Broadcast256() Uint32x8
+
+// Broadcast256 copies element zero of its (128-bit) input to all elements of
+// the 256-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX2
+func (x Uint64x2) Broadcast256() Uint64x4
+
+/* Broadcast512 */
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VBROADCASTSS, CPU Feature: AVX512
+func (x Float32x4) Broadcast512() Float32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VBROADCASTSD, CPU Feature: AVX512
+func (x Float64x2) Broadcast512() Float64x8
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Int8x16) Broadcast512() Int8x64
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Int16x8) Broadcast512() Int16x32
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Int32x4) Broadcast512() Int32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Int64x2) Broadcast512() Int64x8
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTB, CPU Feature: AVX512
+func (x Uint8x16) Broadcast512() Uint8x64
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTW, CPU Feature: AVX512
+func (x Uint16x8) Broadcast512() Uint16x32
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTD, CPU Feature: AVX512
+func (x Uint32x4) Broadcast512() Uint32x16
+
+// Broadcast512 copies element zero of its (128-bit) input to all elements of
+// the 512-bit output vector.
+//
+// Asm: VPBROADCASTQ, CPU Feature: AVX512
+func (x Uint64x2) Broadcast512() Uint64x8
+
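+// A sketch of the Broadcast family (illustrative only; the loader name is an
+// assumption). Element zero of a 128-bit vector is replicated into every lane
+// of a 128-, 256-, or 512-bit result:
+//
+//	x := LoadInt32x4Slice([]int32{7, 1, 2, 3})
+//	_ = x.Broadcast128() // [7, 7, 7, 7]
+//	_ = x.Broadcast256() // [7, 7, 7, 7, 7, 7, 7, 7]
+//	_ = x.Broadcast512() // sixteen copies of 7
+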
+/* Ceil */
+
+// Ceil rounds elements up to the nearest integer.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Ceil() Float32x4
+
+// Ceil rounds elements up to the nearest integer.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Ceil() Float32x8
+
+// Ceil rounds elements up to the nearest integer.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Ceil() Float64x2
+
+// Ceil rounds elements up to the nearest integer.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Ceil() Float64x4
+
+/* CeilScaled */
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x4) CeilScaled(prec uint8) Float32x4
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x8) CeilScaled(prec uint8) Float32x8
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x16) CeilScaled(prec uint8) Float32x16
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x2) CeilScaled(prec uint8) Float64x2
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x4) CeilScaled(prec uint8) Float64x4
+
+// CeilScaled rounds elements up with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x8) CeilScaled(prec uint8) Float64x8
+
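+// A sketch of CeilScaled (illustrative only; the loader name is an assumption,
+// and the reading of prec as "number of fraction bits kept", as in VRNDSCALEPS,
+// is an interpretation of the documentation above). Prefer a constant prec to
+// avoid the jump table:
+//
+//	x := LoadFloat32x4Slice([]float32{1.30, 1.26, -0.7, 2.0})
+//	_ = x.CeilScaled(2) // round up to multiples of 0.25: [1.5, 1.5, -0.5, 2.0]
+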
+/* CeilScaledResidue */
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x4) CeilScaledResidue(prec uint8) Float32x4
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x8) CeilScaledResidue(prec uint8) Float32x8
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x16) CeilScaledResidue(prec uint8) Float32x16
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x2) CeilScaledResidue(prec uint8) Float64x2
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x4) CeilScaledResidue(prec uint8) Float64x4
+
+// CeilScaledResidue computes the difference after ceiling with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x8) CeilScaledResidue(prec uint8) Float64x8
+
+/* Compress */
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPS, CPU Feature: AVX512
+func (x Float32x4) Compress(mask Mask32x4) Float32x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPS, CPU Feature: AVX512
+func (x Float32x8) Compress(mask Mask32x8) Float32x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPS, CPU Feature: AVX512
+func (x Float32x16) Compress(mask Mask32x16) Float32x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPD, CPU Feature: AVX512
+func (x Float64x2) Compress(mask Mask64x2) Float64x2
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPD, CPU Feature: AVX512
+func (x Float64x4) Compress(mask Mask64x4) Float64x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VCOMPRESSPD, CPU Feature: AVX512
+func (x Float64x8) Compress(mask Mask64x8) Float64x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Int8x16) Compress(mask Mask8x16) Int8x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Int8x32) Compress(mask Mask8x32) Int8x32
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Int8x64) Compress(mask Mask8x64) Int8x64
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Int16x8) Compress(mask Mask16x8) Int16x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Int16x16) Compress(mask Mask16x16) Int16x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Int16x32) Compress(mask Mask16x32) Int16x32
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Int32x4) Compress(mask Mask32x4) Int32x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Int32x8) Compress(mask Mask32x8) Int32x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Int32x16) Compress(mask Mask32x16) Int32x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Int64x2) Compress(mask Mask64x2) Int64x2
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Int64x4) Compress(mask Mask64x4) Int64x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Int64x8) Compress(mask Mask64x8) Int64x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Uint8x16) Compress(mask Mask8x16) Uint8x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Uint8x32) Compress(mask Mask8x32) Uint8x32
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
+func (x Uint8x64) Compress(mask Mask8x64) Uint8x64
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) Compress(mask Mask16x8) Uint16x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) Compress(mask Mask16x16) Uint16x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) Compress(mask Mask16x32) Uint16x32
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Uint32x4) Compress(mask Mask32x4) Uint32x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Uint32x8) Compress(mask Mask32x8) Uint32x8
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSD, CPU Feature: AVX512
+func (x Uint32x16) Compress(mask Mask32x16) Uint32x16
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Uint64x2) Compress(mask Mask64x2) Uint64x2
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Uint64x4) Compress(mask Mask64x4) Uint64x4
+
+// Compress performs a compression on vector x using mask by
+// selecting elements as indicated by mask and packing them into lower-indexed elements.
+//
+// Asm: VPCOMPRESSQ, CPU Feature: AVX512
+func (x Uint64x8) Compress(mask Mask64x8) Uint64x8
+
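+// A sketch of Compress (illustrative only; the loader and the Greater
+// comparison used to build the mask are assumptions about the surrounding API):
+//
+//	x := LoadInt32x4Slice([]int32{-3, 7, -1, 9})
+//	m := x.Greater(LoadInt32x4Slice([]int32{0, 0, 0, 0})) // selects 7 and 9
+//	// Selected elements are packed toward index 0: the result begins [7, 9, ...].
+//	_ = x.Compress(m)
+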
+/* ConcatPermute */
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2B, CPU Feature: AVX512VBMI
+func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2W, CPU Feature: AVX512
+func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PS, CPU Feature: AVX512
+func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2D, CPU Feature: AVX512
+func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2PD, CPU Feature: AVX512
+func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8
+
+// ConcatPermute performs a full permutation of vector x, y using indices:
+// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
+// where xy is the concatenation of x (lower half) and y (upper half).
+// Only the needed bits to represent xy's index are used in indices' elements.
+//
+// Asm: VPERMI2Q, CPU Feature: AVX512
+func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8
+
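+// A sketch of ConcatPermute (illustrative only; the loader name is an
+// assumption). indices address the concatenation xy, with x as the lower half
+// and y as the upper half:
+//
+//	x := LoadUint64x2Slice([]uint64{10, 11})   // xy[0], xy[1]
+//	y := LoadUint64x2Slice([]uint64{20, 21})   // xy[2], xy[3]
+//	idx := LoadUint64x2Slice([]uint64{3, 0})
+//	_ = x.ConcatPermute(y, idx) // [xy[3], xy[0]] = [21, 10]
+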
+/* ConcatShiftBytesRight */
+
+// ConcatShiftBytesRight concatenates x and y and shifts the concatenation right by constant bytes.
+// The result vector will be the lower half of the concatenated vector.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX
+func (x Uint8x16) ConcatShiftBytesRight(constant uint8, y Uint8x16) Uint8x16
+
+/* ConcatShiftBytesRightGrouped */
+
+// ConcatShiftBytesRightGrouped concatenates x and y and shifts the concatenation right by constant bytes.
+// The result vector will be the lower half of the concatenated vector.
+// This operation is performed independently within each 16-byte group.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX2
+func (x Uint8x32) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x32) Uint8x32
+
+// ConcatShiftBytesRightGrouped concatenates x and y and shifts the concatenation right by constant bytes.
+// The result vector will be the lower half of the concatenated vector.
+// This operation is performed independently within each 16-byte group.
+//
+// constant results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPALIGNR, CPU Feature: AVX512
+func (x Uint8x64) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x64) Uint8x64
+
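+// A sketch of ConcatShiftBytesRight (illustrative only; the loader name is an
+// assumption). Pass a compile-time constant shift to get the single-instruction
+// form rather than a jump table:
+//
+//	// a and b are existing []byte buffers of at least 16 bytes each.
+//	x := LoadUint8x16Slice(a[:16])
+//	y := LoadUint8x16Slice(b[:16])
+//	// Drop the 4 lowest bytes of the 32-byte concatenation and keep the next 16.
+//	_ = x.ConcatShiftBytesRight(4, y)
+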
+/* ConvertToFloat32 */
+
+// ConvertToFloat32 converts element values to float32.
+// The result vector's elements are rounded to the nearest value.
+//
+// Asm: VCVTPD2PSX, CPU Feature: AVX
+func (x Float64x2) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+// The result vector's elements are rounded to the nearest value.
+//
+// Asm: VCVTPD2PSY, CPU Feature: AVX
+func (x Float64x4) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+// The result vector's elements are rounded to the nearest value.
+//
+// Asm: VCVTPD2PS, CPU Feature: AVX512
+func (x Float64x8) ConvertToFloat32() Float32x8
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTDQ2PS, CPU Feature: AVX
+func (x Int32x4) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTDQ2PS, CPU Feature: AVX
+func (x Int32x8) ConvertToFloat32() Float32x8
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTDQ2PS, CPU Feature: AVX512
+func (x Int32x16) ConvertToFloat32() Float32x16
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTQQ2PSX, CPU Feature: AVX512
+func (x Int64x2) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTQQ2PSY, CPU Feature: AVX512
+func (x Int64x4) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTQQ2PS, CPU Feature: AVX512
+func (x Int64x8) ConvertToFloat32() Float32x8
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUDQ2PS, CPU Feature: AVX512
+func (x Uint32x4) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUDQ2PS, CPU Feature: AVX512
+func (x Uint32x8) ConvertToFloat32() Float32x8
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUDQ2PS, CPU Feature: AVX512
+func (x Uint32x16) ConvertToFloat32() Float32x16
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUQQ2PSX, CPU Feature: AVX512
+func (x Uint64x2) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUQQ2PSY, CPU Feature: AVX512
+func (x Uint64x4) ConvertToFloat32() Float32x4
+
+// ConvertToFloat32 converts element values to float32.
+//
+// Asm: VCVTUQQ2PS, CPU Feature: AVX512
+func (x Uint64x8) ConvertToFloat32() Float32x8
+
+/* ConvertToFloat64 */
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTPS2PD, CPU Feature: AVX
+func (x Float32x4) ConvertToFloat64() Float64x4
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTPS2PD, CPU Feature: AVX512
+func (x Float32x8) ConvertToFloat64() Float64x8
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTDQ2PD, CPU Feature: AVX
+func (x Int32x4) ConvertToFloat64() Float64x4
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTDQ2PD, CPU Feature: AVX512
+func (x Int32x8) ConvertToFloat64() Float64x8
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTQQ2PD, CPU Feature: AVX512
+func (x Int64x2) ConvertToFloat64() Float64x2
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTQQ2PD, CPU Feature: AVX512
+func (x Int64x4) ConvertToFloat64() Float64x4
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTQQ2PD, CPU Feature: AVX512
+func (x Int64x8) ConvertToFloat64() Float64x8
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTUDQ2PD, CPU Feature: AVX512
+func (x Uint32x4) ConvertToFloat64() Float64x4
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTUDQ2PD, CPU Feature: AVX512
+func (x Uint32x8) ConvertToFloat64() Float64x8
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTUQQ2PD, CPU Feature: AVX512
+func (x Uint64x2) ConvertToFloat64() Float64x2
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTUQQ2PD, CPU Feature: AVX512
+func (x Uint64x4) ConvertToFloat64() Float64x4
+
+// ConvertToFloat64 converts element values to float64.
+//
+// Asm: VCVTUQQ2PD, CPU Feature: AVX512
+func (x Uint64x8) ConvertToFloat64() Float64x8
+
+/* ConvertToInt32 */
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2DQ, CPU Feature: AVX
+func (x Float32x4) ConvertToInt32() Int32x4
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2DQ, CPU Feature: AVX
+func (x Float32x8) ConvertToInt32() Int32x8
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2DQ, CPU Feature: AVX512
+func (x Float32x16) ConvertToInt32() Int32x16
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2DQX, CPU Feature: AVX
+func (x Float64x2) ConvertToInt32() Int32x4
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2DQY, CPU Feature: AVX
+func (x Float64x4) ConvertToInt32() Int32x4
+
+// ConvertToInt32 converts element values to int32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2DQ, CPU Feature: AVX512
+func (x Float64x8) ConvertToInt32() Int32x8
+
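+// A sketch of the truncating conversion (illustrative only; the loader name is
+// an assumption):
+//
+//	x := LoadFloat32x4Slice([]float32{1.9, -1.9, 2.5, -0.1})
+//	_ = x.ConvertToInt32() // truncates toward zero: [1, -1, 2, 0]
+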
+/* ConvertToInt64 */
+
+// ConvertToInt64 converts element values to int64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2QQ, CPU Feature: AVX512
+func (x Float32x4) ConvertToInt64() Int64x4
+
+// ConvertToInt64 converts element values to int64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2QQ, CPU Feature: AVX512
+func (x Float32x8) ConvertToInt64() Int64x8
+
+// ConvertToInt64 converts element values to int64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2QQ, CPU Feature: AVX512
+func (x Float64x2) ConvertToInt64() Int64x2
+
+// ConvertToInt64 converts element values to int64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2QQ, CPU Feature: AVX512
+func (x Float64x4) ConvertToInt64() Int64x4
+
+// ConvertToInt64 converts element values to int64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in int64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2QQ, CPU Feature: AVX512
+func (x Float64x8) ConvertToInt64() Int64x8
+
+/* ConvertToUint32 */
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
+func (x Float32x4) ConvertToUint32() Uint32x4
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
+func (x Float32x8) ConvertToUint32() Uint32x8
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
+func (x Float32x16) ConvertToUint32() Uint32x16
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UDQX, CPU Feature: AVX512
+func (x Float64x2) ConvertToUint32() Uint32x4
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UDQY, CPU Feature: AVX512
+func (x Float64x4) ConvertToUint32() Uint32x4
+
+// ConvertToUint32 converts element values to uint32.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint32, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UDQ, CPU Feature: AVX512
+func (x Float64x8) ConvertToUint32() Uint32x8
+
+/* ConvertToUint64 */
+
+// ConvertToUint64 converts element values to uint64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2UQQ, CPU Feature: AVX512
+func (x Float32x4) ConvertToUint64() Uint64x4
+
+// ConvertToUint64 converts element values to uint64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPS2UQQ, CPU Feature: AVX512
+func (x Float32x8) ConvertToUint64() Uint64x8
+
+// ConvertToUint64 converts element values to uint64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
+func (x Float64x2) ConvertToUint64() Uint64x2
+
+// ConvertToUint64 converts element values to uint64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
+func (x Float64x4) ConvertToUint64() Uint64x4
+
+// ConvertToUint64 converts element values to uint64.
+// When a conversion is inexact, a truncated (round toward zero) value is returned.
+// If a converted result cannot be represented in uint64, an implementation-defined
+// architecture-specific value is returned.
+//
+// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
+func (x Float64x8) ConvertToUint64() Uint64x8
+
+/* CopySign */
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGNB, CPU Feature: AVX
+func (x Int8x16) CopySign(y Int8x16) Int8x16
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGNB, CPU Feature: AVX2
+func (x Int8x32) CopySign(y Int8x32) Int8x32
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGNW, CPU Feature: AVX
+func (x Int16x8) CopySign(y Int16x8) Int16x8
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGNW, CPU Feature: AVX2
+func (x Int16x16) CopySign(y Int16x16) Int16x16
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGND, CPU Feature: AVX
+func (x Int32x4) CopySign(y Int32x4) Int32x4
+
+// CopySign returns the product of the first operand with -1, 0, or 1,
+// whichever constant is nearest to the value of the second operand.
+//
+// Asm: VPSIGND, CPU Feature: AVX2
+func (x Int32x8) CopySign(y Int32x8) Int32x8
+
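+// A sketch of CopySign (illustrative only; the loader name is an assumption).
+// Each lane of x is negated, zeroed, or kept according to the sign of the
+// corresponding lane of y:
+//
+//	x := LoadInt16x8Slice([]int16{5, 5, 5, 5, 5, 5, 5, 5})
+//	y := LoadInt16x8Slice([]int16{-2, 0, 3, -1, 7, 0, -9, 1})
+//	_ = x.CopySign(y) // [-5, 0, 5, -5, 5, 0, -5, 5]
+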
+/* Div */
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPS, CPU Feature: AVX
+func (x Float32x4) Div(y Float32x4) Float32x4
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPS, CPU Feature: AVX
+func (x Float32x8) Div(y Float32x8) Float32x8
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPS, CPU Feature: AVX512
+func (x Float32x16) Div(y Float32x16) Float32x16
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPD, CPU Feature: AVX
+func (x Float64x2) Div(y Float64x2) Float64x2
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPD, CPU Feature: AVX
+func (x Float64x4) Div(y Float64x4) Float64x4
+
+// Div divides elements of two vectors.
+//
+// Asm: VDIVPD, CPU Feature: AVX512
+func (x Float64x8) Div(y Float64x8) Float64x8
+
+/* DotProductPairs */
+
+// DotProductPairs multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDWD, CPU Feature: AVX
+func (x Int16x8) DotProductPairs(y Int16x8) Int32x4
+
+// DotProductPairs multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDWD, CPU Feature: AVX2
+func (x Int16x16) DotProductPairs(y Int16x16) Int32x8
+
+// DotProductPairs multiplies the elements and adds the pairs together,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDWD, CPU Feature: AVX512
+func (x Int16x32) DotProductPairs(y Int16x32) Int32x16
+
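+// accumulateDot is an illustrative sketch, not part of the generated API above:
+// each call multiplies the sixteen int16 lane pairs, sums adjacent products into
+// eight int32 lanes, and adds them to a running accumulator. Add is assumed to be
+// the elementwise int32 addition referenced by the DotProductQuadruple doc
+// comments later in this file.
+func accumulateDot(acc Int32x8, x, y Int16x16) Int32x8 {
+	return acc.Add(x.DotProductPairs(y))
+}
+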
+/* DotProductPairsSaturated */
+
+// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDUBSW, CPU Feature: AVX
+func (x Uint8x16) DotProductPairsSaturated(y Int8x16) Int16x8
+
+// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDUBSW, CPU Feature: AVX2
+func (x Uint8x32) DotProductPairsSaturated(y Int8x32) Int16x16
+
+// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
+// yielding a vector of half as many elements with twice the input element size.
+//
+// Asm: VPMADDUBSW, CPU Feature: AVX512
+func (x Uint8x64) DotProductPairsSaturated(y Int8x64) Int16x32
+
+/* DotProductQuadruple */
+
+// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
+// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSD, CPU Feature: AVXVNNI
+func (x Int8x16) DotProductQuadruple(y Uint8x16) Int32x4
+
+// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
+// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSD, CPU Feature: AVXVNNI
+func (x Int8x32) DotProductQuadruple(y Uint8x32) Int32x8
+
+// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
+// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSD, CPU Feature: AVX512VNNI
+func (x Int8x64) DotProductQuadruple(y Uint8x64) Int32x16
+
+/* DotProductQuadrupleSaturated */
+
+// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
+// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
+func (x Int8x16) DotProductQuadrupleSaturated(y Uint8x16) Int32x4
+
+// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
+// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
+func (x Int8x32) DotProductQuadrupleSaturated(y Uint8x32) Int32x8
+
+// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
+// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
+//
+// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
+func (x Int8x64) DotProductQuadrupleSaturated(y Uint8x64) Int32x16
+
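+// vnniAccumulate is an illustrative sketch, not part of the generated API above:
+// it spells out the DotProductQuadruple(x, y).Add(z) pattern that the doc comments
+// say is fused into the full form of the underlying instruction; Add is assumed to
+// be the elementwise int32 addition those comments refer to.
+func vnniAccumulate(z Int32x8, x Int8x32, y Uint8x32) Int32x8 {
+	return x.DotProductQuadruple(y).Add(z)
+}
+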
+/* Equal */
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX
+func (x Int8x16) Equal(y Int8x16) Mask8x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX2
+func (x Int8x32) Equal(y Int8x32) Mask8x32
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX512
+func (x Int8x64) Equal(y Int8x64) Mask8x64
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX
+func (x Int16x8) Equal(y Int16x8) Mask16x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX2
+func (x Int16x16) Equal(y Int16x16) Mask16x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX512
+func (x Int16x32) Equal(y Int16x32) Mask16x32
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX
+func (x Int32x4) Equal(y Int32x4) Mask32x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX2
+func (x Int32x8) Equal(y Int32x8) Mask32x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX512
+func (x Int32x16) Equal(y Int32x16) Mask32x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX
+func (x Int64x2) Equal(y Int64x2) Mask64x2
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX2
+func (x Int64x4) Equal(y Int64x4) Mask64x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX512
+func (x Int64x8) Equal(y Int64x8) Mask64x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX
+func (x Uint8x16) Equal(y Uint8x16) Mask8x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX2
+func (x Uint8x32) Equal(y Uint8x32) Mask8x32
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQB, CPU Feature: AVX512
+func (x Uint8x64) Equal(y Uint8x64) Mask8x64
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX
+func (x Uint16x8) Equal(y Uint16x8) Mask16x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX2
+func (x Uint16x16) Equal(y Uint16x16) Mask16x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQW, CPU Feature: AVX512
+func (x Uint16x32) Equal(y Uint16x32) Mask16x32
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX
+func (x Uint32x4) Equal(y Uint32x4) Mask32x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX2
+func (x Uint32x8) Equal(y Uint32x8) Mask32x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQD, CPU Feature: AVX512
+func (x Uint32x16) Equal(y Uint32x16) Mask32x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX
+func (x Uint64x2) Equal(y Uint64x2) Mask64x2
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX2
+func (x Uint64x4) Equal(y Uint64x4) Mask64x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VPCMPEQQ, CPU Feature: AVX512
+func (x Uint64x8) Equal(y Uint64x8) Mask64x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) Equal(y Float32x4) Mask32x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) Equal(y Float32x8) Mask32x8
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) Equal(y Float32x16) Mask32x16
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) Equal(y Float64x2) Mask64x2
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) Equal(y Float64x4) Mask64x4
+
+// Equal returns x equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) Equal(y Float64x8) Mask64x8
+
+/* Expand */
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512
+func (x Float32x4) Expand(mask Mask32x4) Float32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512
+func (x Float32x8) Expand(mask Mask32x8) Float32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPS, CPU Feature: AVX512
+func (x Float32x16) Expand(mask Mask32x16) Float32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512
+func (x Float64x2) Expand(mask Mask64x2) Float64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512
+func (x Float64x4) Expand(mask Mask64x4) Float64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VEXPANDPD, CPU Feature: AVX512
+func (x Float64x8) Expand(mask Mask64x8) Float64x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x16) Expand(mask Mask8x16) Int8x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x32) Expand(mask Mask8x32) Int8x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Int8x64) Expand(mask Mask8x64) Int8x64
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x8) Expand(mask Mask16x8) Int16x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x16) Expand(mask Mask16x16) Int16x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Int16x32) Expand(mask Mask16x32) Int16x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Int32x4) Expand(mask Mask32x4) Int32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Int32x8) Expand(mask Mask32x8) Int32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Int32x16) Expand(mask Mask32x16) Int32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Int64x2) Expand(mask Mask64x2) Int64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Int64x4) Expand(mask Mask64x4) Int64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Int64x8) Expand(mask Mask64x8) Int64x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x16) Expand(mask Mask8x16) Uint8x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x32) Expand(mask Mask8x32) Uint8x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
+func (x Uint8x64) Expand(mask Mask8x64) Uint8x64
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) Expand(mask Mask16x8) Uint16x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) Expand(mask Mask16x16) Uint16x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) Expand(mask Mask16x32) Uint16x32
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Uint32x4) Expand(mask Mask32x4) Uint32x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Uint32x8) Expand(mask Mask32x8) Uint32x8
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDD, CPU Feature: AVX512
+func (x Uint32x16) Expand(mask Mask32x16) Uint32x16
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Uint64x2) Expand(mask Mask64x2) Uint64x2
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Uint64x4) Expand(mask Mask64x4) Uint64x4
+
+// Expand performs an expansion on a vector x whose elements are packed to lower parts.
+// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
+//
+// Asm: VPEXPANDQ, CPU Feature: AVX512
+func (x Uint64x8) Expand(mask Mask64x8) Uint64x8
+
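+// expandWherePositive is an illustrative sketch, not part of the generated API
+// above: it takes values that were packed into the low lanes of packed and
+// redistributes them, in order, to the lanes selected by ref.Greater(zero)
+// (Greater is declared later in this file; zero is expected to be an all-zero
+// vector).
+func expandWherePositive(packed, ref, zero Int32x8) Int32x8 {
+	return packed.Expand(ref.Greater(zero))
+}
+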
+/* ExtendLo2ToInt64x2 */
+
+// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBQ, CPU Feature: AVX
+func (x Int8x16) ExtendLo2ToInt64x2() Int64x2
+
+// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWQ, CPU Feature: AVX
+func (x Int16x8) ExtendLo2ToInt64x2() Int64x2
+
+// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXDQ, CPU Feature: AVX
+func (x Int32x4) ExtendLo2ToInt64x2() Int64x2
+
+/* ExtendLo2ToUint64x2 */
+
+// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBQ, CPU Feature: AVX
+func (x Uint8x16) ExtendLo2ToUint64x2() Uint64x2
+
+// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWQ, CPU Feature: AVX
+func (x Uint16x8) ExtendLo2ToUint64x2() Uint64x2
+
+// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXDQ, CPU Feature: AVX
+func (x Uint32x4) ExtendLo2ToUint64x2() Uint64x2
+
+/* ExtendLo4ToInt32x4 */
+
+// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBD, CPU Feature: AVX
+func (x Int8x16) ExtendLo4ToInt32x4() Int32x4
+
+// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWD, CPU Feature: AVX
+func (x Int16x8) ExtendLo4ToInt32x4() Int32x4
+
+/* ExtendLo4ToInt64x4 */
+
+// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBQ, CPU Feature: AVX2
+func (x Int8x16) ExtendLo4ToInt64x4() Int64x4
+
+// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWQ, CPU Feature: AVX2
+func (x Int16x8) ExtendLo4ToInt64x4() Int64x4
+
+/* ExtendLo4ToUint32x4 */
+
+// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBD, CPU Feature: AVX
+func (x Uint8x16) ExtendLo4ToUint32x4() Uint32x4
+
+// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWD, CPU Feature: AVX
+func (x Uint16x8) ExtendLo4ToUint32x4() Uint32x4
+
+/* ExtendLo4ToUint64x4 */
+
+// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBQ, CPU Feature: AVX2
+func (x Uint8x16) ExtendLo4ToUint64x4() Uint64x4
+
+// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWQ, CPU Feature: AVX2
+func (x Uint16x8) ExtendLo4ToUint64x4() Uint64x4
+
+/* ExtendLo8ToInt16x8 */
+
+// ExtendLo8ToInt16x8 converts 8 lowest vector element values to int16.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBW, CPU Feature: AVX
+func (x Int8x16) ExtendLo8ToInt16x8() Int16x8
+
+/* ExtendLo8ToInt32x8 */
+
+// ExtendLo8ToInt32x8 converts 8 lowest vector element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBD, CPU Feature: AVX2
+func (x Int8x16) ExtendLo8ToInt32x8() Int32x8
+
+/* ExtendLo8ToInt64x8 */
+
+// ExtendLo8ToInt64x8 converts 8 lowest vector element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBQ, CPU Feature: AVX512
+func (x Int8x16) ExtendLo8ToInt64x8() Int64x8
+
+/* ExtendLo8ToUint16x8 */
+
+// ExtendLo8ToUint16x8 converts 8 lowest vector element values to uint16.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBW, CPU Feature: AVX
+func (x Uint8x16) ExtendLo8ToUint16x8() Uint16x8
+
+/* ExtendLo8ToUint32x8 */
+
+// ExtendLo8ToUint32x8 converts 8 lowest vector element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBD, CPU Feature: AVX2
+func (x Uint8x16) ExtendLo8ToUint32x8() Uint32x8
+
+/* ExtendLo8ToUint64x8 */
+
+// ExtendLo8ToUint64x8 converts 8 lowest vector element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBQ, CPU Feature: AVX512
+func (x Uint8x16) ExtendLo8ToUint64x8() Uint64x8
+
+/* ExtendToInt16 */
+
+// ExtendToInt16 converts element values to int16.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBW, CPU Feature: AVX2
+func (x Int8x16) ExtendToInt16() Int16x16
+
+// ExtendToInt16 converts element values to int16.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBW, CPU Feature: AVX512
+func (x Int8x32) ExtendToInt16() Int16x32
+
+/* ExtendToInt32 */
+
+// ExtendToInt32 converts element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXBD, CPU Feature: AVX512
+func (x Int8x16) ExtendToInt32() Int32x16
+
+// ExtendToInt32 converts element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWD, CPU Feature: AVX2
+func (x Int16x8) ExtendToInt32() Int32x8
+
+// ExtendToInt32 converts element values to int32.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWD, CPU Feature: AVX512
+func (x Int16x16) ExtendToInt32() Int32x16
+
+/* ExtendToInt64 */
+
+// ExtendToInt64 converts element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXWQ, CPU Feature: AVX512
+func (x Int16x8) ExtendToInt64() Int64x8
+
+// ExtendToInt64 converts element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXDQ, CPU Feature: AVX2
+func (x Int32x4) ExtendToInt64() Int64x4
+
+// ExtendToInt64 converts element values to int64.
+// The result vector's elements are sign-extended.
+//
+// Asm: VPMOVSXDQ, CPU Feature: AVX512
+func (x Int32x8) ExtendToInt64() Int64x8
+
+/* ExtendToUint16 */
+
+// ExtendToUint16 converts element values to uint16.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBW, CPU Feature: AVX2
+func (x Uint8x16) ExtendToUint16() Uint16x16
+
+// ExtendToUint16 converts element values to uint16.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBW, CPU Feature: AVX512
+func (x Uint8x32) ExtendToUint16() Uint16x32
+
+/* ExtendToUint32 */
+
+// ExtendToUint32 converts element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXBD, CPU Feature: AVX512
+func (x Uint8x16) ExtendToUint32() Uint32x16
+
+// ExtendToUint32 converts element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWD, CPU Feature: AVX2
+func (x Uint16x8) ExtendToUint32() Uint32x8
+
+// ExtendToUint32 converts element values to uint32.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWD, CPU Feature: AVX512
+func (x Uint16x16) ExtendToUint32() Uint32x16
+
+/* ExtendToUint64 */
+
+// ExtendToUint64 converts element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXWQ, CPU Feature: AVX512
+func (x Uint16x8) ExtendToUint64() Uint64x8
+
+// ExtendToUint64 converts element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXDQ, CPU Feature: AVX2
+func (x Uint32x4) ExtendToUint64() Uint64x4
+
+// ExtendToUint64 converts element values to uint64.
+// The result vector's elements are zero-extended.
+//
+// Asm: VPMOVZXDQ, CPU Feature: AVX512
+func (x Uint32x8) ExtendToUint64() Uint64x8
+
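+// widenBytes is an illustrative sketch, not part of the generated API above: it
+// zero-extends all sixteen uint8 lanes of x to uint16 lanes, doubling the vector
+// width from 128 to 256 bits; the ExtendLoN variants above instead convert only
+// the N lowest lanes.
+func widenBytes(x Uint8x16) Uint16x16 {
+	return x.ExtendToUint16()
+}
+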
+/* Floor */
+
+// Floor rounds elements down to the nearest integer.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Floor() Float32x4
+
+// Floor rounds elements down to the nearest integer.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Floor() Float32x8
+
+// Floor rounds elements down to the nearest integer.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Floor() Float64x2
+
+// Floor rounds elements down to the nearest integer.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Floor() Float64x4
+
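+// floorDiv is an illustrative sketch, not part of the generated API above: it
+// divides the lanes of x by the lanes of y and rounds each quotient down to the
+// nearest integer, combining Div and Floor.
+func floorDiv(x, y Float32x8) Float32x8 {
+	return x.Div(y).Floor()
+}
+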
+/* FloorScaled */
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x4) FloorScaled(prec uint8) Float32x4
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x8) FloorScaled(prec uint8) Float32x8
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x16) FloorScaled(prec uint8) Float32x16
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x2) FloorScaled(prec uint8) Float64x2
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x4) FloorScaled(prec uint8) Float64x4
+
+// FloorScaled rounds elements down with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x8) FloorScaled(prec uint8) Float64x8
+
+/* FloorScaledResidue */
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x4) FloorScaledResidue(prec uint8) Float32x4
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x8) FloorScaledResidue(prec uint8) Float32x8
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x16) FloorScaledResidue(prec uint8) Float32x16
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x2) FloorScaledResidue(prec uint8) Float64x2
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x4) FloorScaledResidue(prec uint8) Float64x4
+
+// FloorScaledResidue computes the difference after flooring with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x8) FloorScaledResidue(prec uint8) Float64x8
+
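+// splitQuarters is an illustrative sketch, not part of the generated API above,
+// assuming prec counts preserved binary fraction bits as in VRNDSCALEPS/VREDUCEPS:
+// with prec = 2 it rounds each lane down to a multiple of 0.25 and also returns
+// the remaining residue, so whole and frac sum back to x lane by lane.
+func splitQuarters(x Float32x8) (whole, frac Float32x8) {
+	return x.FloorScaled(2), x.FloorScaledResidue(2)
+}
+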
+/* GaloisFieldAffineTransform */
+
+// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
+func (x Uint8x16) GaloisFieldAffineTransform(y Uint64x2, b uint8) Uint8x16
+
+// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
+func (x Uint8x32) GaloisFieldAffineTransform(y Uint64x4, b uint8) Uint8x32
+
+// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
+func (x Uint8x64) GaloisFieldAffineTransform(y Uint64x8, b uint8) Uint8x64
+
+/* GaloisFieldAffineTransformInverse */
+
+// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
+// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
+func (x Uint8x16) GaloisFieldAffineTransformInverse(y Uint64x2, b uint8) Uint8x16
+
+// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
+// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
+func (x Uint8x32) GaloisFieldAffineTransformInverse(y Uint64x4, b uint8) Uint8x32
+
+// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
+// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
+// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
+// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
+// corresponding to a group of 8 elements in x.
+//
+// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
+func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x64
+
+/* GaloisFieldMul */
+
+// GaloisFieldMul computes element-wise GF(2^8) multiplication with
+// reduction polynomial x^8 + x^4 + x^3 + x + 1.
+//
+// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
+func (x Uint8x16) GaloisFieldMul(y Uint8x16) Uint8x16
+
+// GaloisFieldMul computes element-wise GF(2^8) multiplication with
+// reduction polynomial x^8 + x^4 + x^3 + x + 1.
+//
+// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
+func (x Uint8x32) GaloisFieldMul(y Uint8x32) Uint8x32
+
+// GaloisFieldMul computes element-wise GF(2^8) multiplication with
+// reduction polynomial x^8 + x^4 + x^3 + x + 1.
+//
+// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
+func (x Uint8x64) GaloisFieldMul(y Uint8x64) Uint8x64
+
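+// gfScale is an illustrative sketch, not part of the generated API above: it
+// multiplies sixteen bytes elementwise in GF(2^8) with the reduction polynomial
+// x^8 + x^4 + x^3 + x + 1 (the AES field), for example to scale a data block by a
+// vector of coefficients.
+func gfScale(data, coeffs Uint8x16) Uint8x16 {
+	return data.GaloisFieldMul(coeffs)
+}
+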
+/* GetElem */
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRD, CPU Feature: AVX
+func (x Float32x4) GetElem(index uint8) float32
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRQ, CPU Feature: AVX
+func (x Float64x2) GetElem(index uint8) float64
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRB, CPU Feature: AVX512
+func (x Int8x16) GetElem(index uint8) int8
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRW, CPU Feature: AVX512
+func (x Int16x8) GetElem(index uint8) int16
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRD, CPU Feature: AVX
+func (x Int32x4) GetElem(index uint8) int32
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRQ, CPU Feature: AVX
+func (x Int64x2) GetElem(index uint8) int64
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRB, CPU Feature: AVX512
+func (x Uint8x16) GetElem(index uint8) uint8
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRW, CPU Feature: AVX512
+func (x Uint16x8) GetElem(index uint8) uint16
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRD, CPU Feature: AVX
+func (x Uint32x4) GetElem(index uint8) uint32
+
+// GetElem retrieves a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPEXTRQ, CPU Feature: AVX
+func (x Uint64x2) GetElem(index uint8) uint64
+
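+// firstAndLast is an illustrative sketch, not part of the generated API above: it
+// extracts lanes 0 and 3 of x. Because both indices are constants, each extraction
+// can use the VPEXTRD form directly rather than the jump table used for variable
+// indices.
+func firstAndLast(x Int32x4) (first, last int32) {
+	return x.GetElem(0), x.GetElem(3)
+}
+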
+/* GetHi */
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float32x8) GetHi() Float32x4
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTF64X4, CPU Feature: AVX512
+func (x Float32x16) GetHi() Float32x8
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float64x4) GetHi() Float64x2
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTF64X4, CPU Feature: AVX512
+func (x Float64x8) GetHi() Float64x4
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int8x32) GetHi() Int8x16
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int8x64) GetHi() Int8x32
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int16x16) GetHi() Int16x8
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int16x32) GetHi() Int16x16
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int32x8) GetHi() Int32x4
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int32x16) GetHi() Int32x8
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int64x4) GetHi() Int64x2
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int64x8) GetHi() Int64x4
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint8x32) GetHi() Uint8x16
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint8x64) GetHi() Uint8x32
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint16x16) GetHi() Uint16x8
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint16x32) GetHi() Uint16x16
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint32x8) GetHi() Uint32x4
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint32x16) GetHi() Uint32x8
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint64x4) GetHi() Uint64x2
+
+// GetHi returns the upper half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint64x8) GetHi() Uint64x4
+
+/* GetLo */
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float32x8) GetLo() Float32x4
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTF64X4, CPU Feature: AVX512
+func (x Float32x16) GetLo() Float32x8
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float64x4) GetLo() Float64x2
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTF64X4, CPU Feature: AVX512
+func (x Float64x8) GetLo() Float64x4
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int8x32) GetLo() Int8x16
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int8x64) GetLo() Int8x32
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int16x16) GetLo() Int16x8
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int16x32) GetLo() Int16x16
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int32x8) GetLo() Int32x4
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int32x16) GetLo() Int32x8
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int64x4) GetLo() Int64x2
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Int64x8) GetLo() Int64x4
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint8x32) GetLo() Uint8x16
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint8x64) GetLo() Uint8x32
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint16x16) GetLo() Uint16x8
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint16x32) GetLo() Uint16x16
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint32x8) GetLo() Uint32x4
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint32x16) GetLo() Uint32x8
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint64x4) GetLo() Uint64x2
+
+// GetLo returns the lower half of x.
+//
+// Asm: VEXTRACTI64X4, CPU Feature: AVX512
+func (x Uint64x8) GetLo() Uint64x4
+
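+// foldHalves is an illustrative sketch, not part of the generated API above: it
+// reduces a 256-bit vector to 128 bits by adding its upper and lower halves, a
+// common first step of a horizontal sum. Add is assumed to be the elementwise
+// int32 addition referenced by the DotProductQuadruple doc comments earlier in
+// this file.
+func foldHalves(x Int32x8) Int32x4 {
+	return x.GetHi().Add(x.GetLo())
+}
+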
+/* Greater */
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTB, CPU Feature: AVX
+func (x Int8x16) Greater(y Int8x16) Mask8x16
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTB, CPU Feature: AVX2
+func (x Int8x32) Greater(y Int8x32) Mask8x32
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTB, CPU Feature: AVX512
+func (x Int8x64) Greater(y Int8x64) Mask8x64
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTW, CPU Feature: AVX
+func (x Int16x8) Greater(y Int16x8) Mask16x8
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTW, CPU Feature: AVX2
+func (x Int16x16) Greater(y Int16x16) Mask16x16
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTW, CPU Feature: AVX512
+func (x Int16x32) Greater(y Int16x32) Mask16x32
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTD, CPU Feature: AVX
+func (x Int32x4) Greater(y Int32x4) Mask32x4
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTD, CPU Feature: AVX2
+func (x Int32x8) Greater(y Int32x8) Mask32x8
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTD, CPU Feature: AVX512
+func (x Int32x16) Greater(y Int32x16) Mask32x16
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTQ, CPU Feature: AVX
+func (x Int64x2) Greater(y Int64x2) Mask64x2
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTQ, CPU Feature: AVX2
+func (x Int64x4) Greater(y Int64x4) Mask64x4
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPGTQ, CPU Feature: AVX512
+func (x Int64x8) Greater(y Int64x8) Mask64x8
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) Greater(y Float32x4) Mask32x4
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) Greater(y Float32x8) Mask32x8
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) Greater(y Float32x16) Mask32x16
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) Greater(y Float64x2) Mask64x2
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) Greater(y Float64x4) Mask64x4
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) Greater(y Float64x8) Mask64x8
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPUB, CPU Feature: AVX512
+func (x Uint8x64) Greater(y Uint8x64) Mask8x64
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPUW, CPU Feature: AVX512
+func (x Uint16x32) Greater(y Uint16x32) Mask16x32
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPUD, CPU Feature: AVX512
+func (x Uint32x16) Greater(y Uint32x16) Mask32x16
+
+// Greater returns x greater-than y, elementwise.
+//
+// Asm: VPCMPUQ, CPU Feature: AVX512
+func (x Uint64x8) Greater(y Uint64x8) Mask64x8
+
+/* GreaterEqual */
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) GreaterEqual(y Float32x4) Mask32x4
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) GreaterEqual(y Float32x8) Mask32x8
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) GreaterEqual(y Float32x16) Mask32x16
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) GreaterEqual(y Float64x2) Mask64x2
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) GreaterEqual(y Float64x4) Mask64x4
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) GreaterEqual(y Float64x8) Mask64x8
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPB, CPU Feature: AVX512
+func (x Int8x64) GreaterEqual(y Int8x64) Mask8x64
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPW, CPU Feature: AVX512
+func (x Int16x32) GreaterEqual(y Int16x32) Mask16x32
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPD, CPU Feature: AVX512
+func (x Int32x16) GreaterEqual(y Int32x16) Mask32x16
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPQ, CPU Feature: AVX512
+func (x Int64x8) GreaterEqual(y Int64x8) Mask64x8
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUB, CPU Feature: AVX512
+func (x Uint8x64) GreaterEqual(y Uint8x64) Mask8x64
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUW, CPU Feature: AVX512
+func (x Uint16x32) GreaterEqual(y Uint16x32) Mask16x32
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUD, CPU Feature: AVX512
+func (x Uint32x16) GreaterEqual(y Uint32x16) Mask32x16
+
+// GreaterEqual returns x greater-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUQ, CPU Feature: AVX512
+func (x Uint64x8) GreaterEqual(y Uint64x8) Mask64x8
+
+/* InterleaveHi */
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX
+func (x Int16x8) InterleaveHi(y Int16x8) Int16x8
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX
+func (x Int32x4) InterleaveHi(y Int32x4) Int32x4
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX
+func (x Int64x2) InterleaveHi(y Int64x2) Int64x2
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX
+func (x Uint16x8) InterleaveHi(y Uint16x8) Uint16x8
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX
+func (x Uint32x4) InterleaveHi(y Uint32x4) Uint32x4
+
+// InterleaveHi interleaves the elements of the high halves of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX
+func (x Uint64x2) InterleaveHi(y Uint64x2) Uint64x2
+
+/* InterleaveHiGrouped */
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX2
+func (x Int16x16) InterleaveHiGrouped(y Int16x16) Int16x16
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX512
+func (x Int16x32) InterleaveHiGrouped(y Int16x32) Int16x32
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX2
+func (x Int32x8) InterleaveHiGrouped(y Int32x8) Int32x8
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX512
+func (x Int32x16) InterleaveHiGrouped(y Int32x16) Int32x16
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
+func (x Int64x4) InterleaveHiGrouped(y Int64x4) Int64x4
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
+func (x Int64x8) InterleaveHiGrouped(y Int64x8) Int64x8
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX2
+func (x Uint16x16) InterleaveHiGrouped(y Uint16x16) Uint16x16
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHWD, CPU Feature: AVX512
+func (x Uint16x32) InterleaveHiGrouped(y Uint16x32) Uint16x32
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX2
+func (x Uint32x8) InterleaveHiGrouped(y Uint32x8) Uint32x8
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHDQ, CPU Feature: AVX512
+func (x Uint32x16) InterleaveHiGrouped(y Uint32x16) Uint32x16
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
+func (x Uint64x4) InterleaveHiGrouped(y Uint64x4) Uint64x4
+
+// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
+func (x Uint64x8) InterleaveHiGrouped(y Uint64x8) Uint64x8
+
+/* InterleaveLo */
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX
+func (x Int16x8) InterleaveLo(y Int16x8) Int16x8
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX
+func (x Int32x4) InterleaveLo(y Int32x4) Int32x4
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX
+func (x Int64x2) InterleaveLo(y Int64x2) Int64x2
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX
+func (x Uint16x8) InterleaveLo(y Uint16x8) Uint16x8
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX
+func (x Uint32x4) InterleaveLo(y Uint32x4) Uint32x4
+
+// InterleaveLo interleaves the elements of the low halves of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX
+func (x Uint64x2) InterleaveLo(y Uint64x2) Uint64x2
+
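+// zipInt32x4 is an illustrative sketch, not part of the generated API above: it
+// pairs up the lanes of a and b by interleaving their low and high halves, the
+// usual building block for small matrix transposes. The exact lane order follows
+// VPUNPCKLDQ/VPUNPCKHDQ; which operand supplies the even result lanes is an
+// assumption here.
+func zipInt32x4(a, b Int32x4) (lo, hi Int32x4) {
+	return a.InterleaveLo(b), a.InterleaveHi(b)
+}
+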
+/* InterleaveLoGrouped */
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX2
+func (x Int16x16) InterleaveLoGrouped(y Int16x16) Int16x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX512
+func (x Int16x32) InterleaveLoGrouped(y Int16x32) Int16x32
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX2
+func (x Int32x8) InterleaveLoGrouped(y Int32x8) Int32x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX512
+func (x Int32x16) InterleaveLoGrouped(y Int32x16) Int32x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
+func (x Int64x4) InterleaveLoGrouped(y Int64x4) Int64x4
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
+func (x Int64x8) InterleaveLoGrouped(y Int64x8) Int64x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX2
+func (x Uint16x16) InterleaveLoGrouped(y Uint16x16) Uint16x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLWD, CPU Feature: AVX512
+func (x Uint16x32) InterleaveLoGrouped(y Uint16x32) Uint16x32
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX2
+func (x Uint32x8) InterleaveLoGrouped(y Uint32x8) Uint32x8
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLDQ, CPU Feature: AVX512
+func (x Uint32x16) InterleaveLoGrouped(y Uint32x16) Uint32x16
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
+func (x Uint64x4) InterleaveLoGrouped(y Uint64x4) Uint64x4
+
+// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
+//
+// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
+func (x Uint64x8) InterleaveLoGrouped(y Uint64x8) Uint64x8
+
+/* IsNan */
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) IsNan(y Float32x4) Mask32x4
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) IsNan(y Float32x8) Mask32x8
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) IsNan(y Float32x16) Mask32x16
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) IsNan(y Float64x2) Mask64x2
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) IsNan(y Float64x4) Mask64x4
+
+// IsNan checks if elements are NaN. Use as x.IsNan(x).
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) IsNan(y Float64x8) Mask64x8
+
+/* LeadingZeros */
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x4) LeadingZeros() Int32x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x8) LeadingZeros() Int32x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Int32x16) LeadingZeros() Int32x16
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x2) LeadingZeros() Int64x2
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x4) LeadingZeros() Int64x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Int64x8) LeadingZeros() Int64x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x4) LeadingZeros() Uint32x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x8) LeadingZeros() Uint32x8
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTD, CPU Feature: AVX512
+func (x Uint32x16) LeadingZeros() Uint32x16
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x2) LeadingZeros() Uint64x2
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x4) LeadingZeros() Uint64x4
+
+// LeadingZeros counts the leading zeros of each element in x.
+//
+// Asm: VPLZCNTQ, CPU Feature: AVX512
+func (x Uint64x8) LeadingZeros() Uint64x8
+
+/* Less */
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) Less(y Float32x4) Mask32x4
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) Less(y Float32x8) Mask32x8
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) Less(y Float32x16) Mask32x16
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) Less(y Float64x2) Mask64x2
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) Less(y Float64x4) Mask64x4
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) Less(y Float64x8) Mask64x8
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPB, CPU Feature: AVX512
+func (x Int8x64) Less(y Int8x64) Mask8x64
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPW, CPU Feature: AVX512
+func (x Int16x32) Less(y Int16x32) Mask16x32
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPD, CPU Feature: AVX512
+func (x Int32x16) Less(y Int32x16) Mask32x16
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPQ, CPU Feature: AVX512
+func (x Int64x8) Less(y Int64x8) Mask64x8
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPUB, CPU Feature: AVX512
+func (x Uint8x64) Less(y Uint8x64) Mask8x64
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPUW, CPU Feature: AVX512
+func (x Uint16x32) Less(y Uint16x32) Mask16x32
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPUD, CPU Feature: AVX512
+func (x Uint32x16) Less(y Uint32x16) Mask32x16
+
+// Less returns x less-than y, elementwise.
+//
+// Asm: VPCMPUQ, CPU Feature: AVX512
+func (x Uint64x8) Less(y Uint64x8) Mask64x8
+
+/* LessEqual */
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) LessEqual(y Float32x4) Mask32x4
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) LessEqual(y Float32x8) Mask32x8
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) LessEqual(y Float32x16) Mask32x16
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) LessEqual(y Float64x2) Mask64x2
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) LessEqual(y Float64x4) Mask64x4
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) LessEqual(y Float64x8) Mask64x8
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPB, CPU Feature: AVX512
+func (x Int8x64) LessEqual(y Int8x64) Mask8x64
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPW, CPU Feature: AVX512
+func (x Int16x32) LessEqual(y Int16x32) Mask16x32
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPD, CPU Feature: AVX512
+func (x Int32x16) LessEqual(y Int32x16) Mask32x16
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPQ, CPU Feature: AVX512
+func (x Int64x8) LessEqual(y Int64x8) Mask64x8
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUB, CPU Feature: AVX512
+func (x Uint8x64) LessEqual(y Uint8x64) Mask8x64
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUW, CPU Feature: AVX512
+func (x Uint16x32) LessEqual(y Uint16x32) Mask16x32
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUD, CPU Feature: AVX512
+func (x Uint32x16) LessEqual(y Uint32x16) Mask32x16
+
+// LessEqual returns x less-than-or-equals y, elementwise.
+//
+// Asm: VPCMPUQ, CPU Feature: AVX512
+func (x Uint64x8) LessEqual(y Uint64x8) Mask64x8
+
+/* Max */
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPS, CPU Feature: AVX
+func (x Float32x4) Max(y Float32x4) Float32x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPS, CPU Feature: AVX
+func (x Float32x8) Max(y Float32x8) Float32x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPS, CPU Feature: AVX512
+func (x Float32x16) Max(y Float32x16) Float32x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPD, CPU Feature: AVX
+func (x Float64x2) Max(y Float64x2) Float64x2
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPD, CPU Feature: AVX
+func (x Float64x4) Max(y Float64x4) Float64x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VMAXPD, CPU Feature: AVX512
+func (x Float64x8) Max(y Float64x8) Float64x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSB, CPU Feature: AVX
+func (x Int8x16) Max(y Int8x16) Int8x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSB, CPU Feature: AVX2
+func (x Int8x32) Max(y Int8x32) Int8x32
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSB, CPU Feature: AVX512
+func (x Int8x64) Max(y Int8x64) Int8x64
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSW, CPU Feature: AVX
+func (x Int16x8) Max(y Int16x8) Int16x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSW, CPU Feature: AVX2
+func (x Int16x16) Max(y Int16x16) Int16x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSW, CPU Feature: AVX512
+func (x Int16x32) Max(y Int16x32) Int16x32
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSD, CPU Feature: AVX
+func (x Int32x4) Max(y Int32x4) Int32x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSD, CPU Feature: AVX2
+func (x Int32x8) Max(y Int32x8) Int32x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSD, CPU Feature: AVX512
+func (x Int32x16) Max(y Int32x16) Int32x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSQ, CPU Feature: AVX512
+func (x Int64x2) Max(y Int64x2) Int64x2
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSQ, CPU Feature: AVX512
+func (x Int64x4) Max(y Int64x4) Int64x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXSQ, CPU Feature: AVX512
+func (x Int64x8) Max(y Int64x8) Int64x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUB, CPU Feature: AVX
+func (x Uint8x16) Max(y Uint8x16) Uint8x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUB, CPU Feature: AVX2
+func (x Uint8x32) Max(y Uint8x32) Uint8x32
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUB, CPU Feature: AVX512
+func (x Uint8x64) Max(y Uint8x64) Uint8x64
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUW, CPU Feature: AVX
+func (x Uint16x8) Max(y Uint16x8) Uint16x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUW, CPU Feature: AVX2
+func (x Uint16x16) Max(y Uint16x16) Uint16x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUW, CPU Feature: AVX512
+func (x Uint16x32) Max(y Uint16x32) Uint16x32
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUD, CPU Feature: AVX
+func (x Uint32x4) Max(y Uint32x4) Uint32x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUD, CPU Feature: AVX2
+func (x Uint32x8) Max(y Uint32x8) Uint32x8
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUD, CPU Feature: AVX512
+func (x Uint32x16) Max(y Uint32x16) Uint32x16
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUQ, CPU Feature: AVX512
+func (x Uint64x2) Max(y Uint64x2) Uint64x2
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUQ, CPU Feature: AVX512
+func (x Uint64x4) Max(y Uint64x4) Uint64x4
+
+// Max computes the maximum of corresponding elements.
+//
+// Asm: VPMAXUQ, CPU Feature: AVX512
+func (x Uint64x8) Max(y Uint64x8) Uint64x8
+
+/* Min */
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPS, CPU Feature: AVX
+func (x Float32x4) Min(y Float32x4) Float32x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPS, CPU Feature: AVX
+func (x Float32x8) Min(y Float32x8) Float32x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPS, CPU Feature: AVX512
+func (x Float32x16) Min(y Float32x16) Float32x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPD, CPU Feature: AVX
+func (x Float64x2) Min(y Float64x2) Float64x2
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPD, CPU Feature: AVX
+func (x Float64x4) Min(y Float64x4) Float64x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VMINPD, CPU Feature: AVX512
+func (x Float64x8) Min(y Float64x8) Float64x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSB, CPU Feature: AVX
+func (x Int8x16) Min(y Int8x16) Int8x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSB, CPU Feature: AVX2
+func (x Int8x32) Min(y Int8x32) Int8x32
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSB, CPU Feature: AVX512
+func (x Int8x64) Min(y Int8x64) Int8x64
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSW, CPU Feature: AVX
+func (x Int16x8) Min(y Int16x8) Int16x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSW, CPU Feature: AVX2
+func (x Int16x16) Min(y Int16x16) Int16x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSW, CPU Feature: AVX512
+func (x Int16x32) Min(y Int16x32) Int16x32
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSD, CPU Feature: AVX
+func (x Int32x4) Min(y Int32x4) Int32x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSD, CPU Feature: AVX2
+func (x Int32x8) Min(y Int32x8) Int32x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSD, CPU Feature: AVX512
+func (x Int32x16) Min(y Int32x16) Int32x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSQ, CPU Feature: AVX512
+func (x Int64x2) Min(y Int64x2) Int64x2
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSQ, CPU Feature: AVX512
+func (x Int64x4) Min(y Int64x4) Int64x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINSQ, CPU Feature: AVX512
+func (x Int64x8) Min(y Int64x8) Int64x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUB, CPU Feature: AVX
+func (x Uint8x16) Min(y Uint8x16) Uint8x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUB, CPU Feature: AVX2
+func (x Uint8x32) Min(y Uint8x32) Uint8x32
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUB, CPU Feature: AVX512
+func (x Uint8x64) Min(y Uint8x64) Uint8x64
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUW, CPU Feature: AVX
+func (x Uint16x8) Min(y Uint16x8) Uint16x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUW, CPU Feature: AVX2
+func (x Uint16x16) Min(y Uint16x16) Uint16x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUW, CPU Feature: AVX512
+func (x Uint16x32) Min(y Uint16x32) Uint16x32
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUD, CPU Feature: AVX
+func (x Uint32x4) Min(y Uint32x4) Uint32x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUD, CPU Feature: AVX2
+func (x Uint32x8) Min(y Uint32x8) Uint32x8
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUD, CPU Feature: AVX512
+func (x Uint32x16) Min(y Uint32x16) Uint32x16
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUQ, CPU Feature: AVX512
+func (x Uint64x2) Min(y Uint64x2) Uint64x2
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUQ, CPU Feature: AVX512
+func (x Uint64x4) Min(y Uint64x4) Uint64x4
+
+// Min computes the minimum of corresponding elements.
+//
+// Asm: VPMINUQ, CPU Feature: AVX512
+func (x Uint64x8) Min(y Uint64x8) Uint64x8
+
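+// Editor's note: a minimal usage sketch, not part of the generated API above. Min and
+// Max compose directly into an elementwise clamp; only methods declared in this file
+// are used.
+//
+//	// clamp limits every element of v to the range [lo, hi]: Max raises anything
+//	// below lo, then Min caps anything above hi.
+//	func clamp(v, lo, hi Float32x8) Float32x8 {
+//		return v.Max(lo).Min(hi)
+//	}
+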
+/* Mul */
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPS, CPU Feature: AVX
+func (x Float32x4) Mul(y Float32x4) Float32x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPS, CPU Feature: AVX
+func (x Float32x8) Mul(y Float32x8) Float32x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPS, CPU Feature: AVX512
+func (x Float32x16) Mul(y Float32x16) Float32x16
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPD, CPU Feature: AVX
+func (x Float64x2) Mul(y Float64x2) Float64x2
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPD, CPU Feature: AVX
+func (x Float64x4) Mul(y Float64x4) Float64x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VMULPD, CPU Feature: AVX512
+func (x Float64x8) Mul(y Float64x8) Float64x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX
+func (x Int16x8) Mul(y Int16x8) Int16x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX2
+func (x Int16x16) Mul(y Int16x16) Int16x16
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX512
+func (x Int16x32) Mul(y Int16x32) Int16x32
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX
+func (x Int32x4) Mul(y Int32x4) Int32x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX2
+func (x Int32x8) Mul(y Int32x8) Int32x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX512
+func (x Int32x16) Mul(y Int32x16) Int32x16
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Int64x2) Mul(y Int64x2) Int64x2
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Int64x4) Mul(y Int64x4) Int64x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Int64x8) Mul(y Int64x8) Int64x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX
+func (x Uint16x8) Mul(y Uint16x8) Uint16x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX2
+func (x Uint16x16) Mul(y Uint16x16) Uint16x16
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLW, CPU Feature: AVX512
+func (x Uint16x32) Mul(y Uint16x32) Uint16x32
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX
+func (x Uint32x4) Mul(y Uint32x4) Uint32x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX2
+func (x Uint32x8) Mul(y Uint32x8) Uint32x8
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLD, CPU Feature: AVX512
+func (x Uint32x16) Mul(y Uint32x16) Uint32x16
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Uint64x2) Mul(y Uint64x2) Uint64x2
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Uint64x4) Mul(y Uint64x4) Uint64x4
+
+// Mul multiplies corresponding elements of two vectors.
+//
+// Asm: VPMULLQ, CPU Feature: AVX512
+func (x Uint64x8) Mul(y Uint64x8) Uint64x8
+
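+// Editor's note: an illustrative sketch, not part of the generated API above, of the
+// usual load-multiply-store loop around Mul. LoadFloat32x8Slice and StoreSlice are
+// assumed helpers for moving data between slices and vectors; they are not declared
+// in this excerpt.
+//
+//	// mulTo sets dst[i] = a[i] * b[i] for full 8-element chunks; a scalar tail loop
+//	// (omitted) would handle any remainder.
+//	func mulTo(dst, a, b []float32) {
+//		n := min(len(dst), len(a), len(b))
+//		for i := 0; i+8 <= n; i += 8 {
+//			va := LoadFloat32x8Slice(a[i:])
+//			vb := LoadFloat32x8Slice(b[i:])
+//			va.Mul(vb).StoreSlice(dst[i:])
+//		}
+//	}
+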
+/* MulAdd */
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PS, CPU Feature: AVX512
+func (x Float32x4) MulAdd(y Float32x4, z Float32x4) Float32x4
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PS, CPU Feature: AVX512
+func (x Float32x8) MulAdd(y Float32x8, z Float32x8) Float32x8
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PS, CPU Feature: AVX512
+func (x Float32x16) MulAdd(y Float32x16, z Float32x16) Float32x16
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PD, CPU Feature: AVX512
+func (x Float64x2) MulAdd(y Float64x2, z Float64x2) Float64x2
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PD, CPU Feature: AVX512
+func (x Float64x4) MulAdd(y Float64x4, z Float64x4) Float64x4
+
+// MulAdd performs a fused (x * y) + z.
+//
+// Asm: VFMADD213PD, CPU Feature: AVX512
+func (x Float64x8) MulAdd(y Float64x8, z Float64x8) Float64x8
+
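+// Editor's note: an illustrative sketch, not part of the generated API above. MulAdd
+// chains naturally into Horner evaluation of a polynomial, one fused multiply-add per
+// coefficient; only methods declared in this file are used.
+//
+//	// horner evaluates c2*x^2 + c1*x + c0 elementwise as (c2*x + c1)*x + c0,
+//	// using acc = acc.MulAdd(x, nextCoefficient) at every step.
+//	func horner(x, c0, c1, c2 Float64x4) Float64x4 {
+//		acc := c2.MulAdd(x, c1)  // c2*x + c1
+//		return acc.MulAdd(x, c0) // (c2*x + c1)*x + c0
+//	}
+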
+/* MulAddSub */
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PS, CPU Feature: AVX512
+func (x Float32x4) MulAddSub(y Float32x4, z Float32x4) Float32x4
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PS, CPU Feature: AVX512
+func (x Float32x8) MulAddSub(y Float32x8, z Float32x8) Float32x8
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PS, CPU Feature: AVX512
+func (x Float32x16) MulAddSub(y Float32x16, z Float32x16) Float32x16
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PD, CPU Feature: AVX512
+func (x Float64x2) MulAddSub(y Float64x2, z Float64x2) Float64x2
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PD, CPU Feature: AVX512
+func (x Float64x4) MulAddSub(y Float64x4, z Float64x4) Float64x4
+
+// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
+//
+// Asm: VFMADDSUB213PD, CPU Feature: AVX512
+func (x Float64x8) MulAddSub(y Float64x8, z Float64x8) Float64x8
+
+/* MulEvenWiden */
+
+// MulEvenWiden multiplies even-indexed elements, widening the result.
+// Result[i] = x.Even[i] * y.Even[i].
+//
+// Asm: VPMULDQ, CPU Feature: AVX
+func (x Int32x4) MulEvenWiden(y Int32x4) Int64x2
+
+// MulEvenWiden multiplies even-indexed elements, widening the result.
+// Result[i] = x.Even[i] * y.Even[i].
+//
+// Asm: VPMULDQ, CPU Feature: AVX2
+func (x Int32x8) MulEvenWiden(y Int32x8) Int64x4
+
+// MulEvenWiden multiplies even-indexed elements, widening the result.
+// Result[i] = x.Even[i] * y.Even[i].
+//
+// Asm: VPMULUDQ, CPU Feature: AVX
+func (x Uint32x4) MulEvenWiden(y Uint32x4) Uint64x2
+
+// MulEvenWiden multiplies even-indexed elements, widening the result.
+// Result[i] = x.Even[i] * y.Even[i].
+//
+// Asm: VPMULUDQ, CPU Feature: AVX2
+func (x Uint32x8) MulEvenWiden(y Uint32x8) Uint64x4
+
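+// Editor's note: an illustrative sketch, not part of the generated API above.
+// MulEvenWiden returns full 64-bit products of the even-indexed lanes, so it avoids
+// the truncation of a 32-bit Mul; covering the odd lanes as well would take a second
+// call after shifting those lanes into even positions.
+//
+//	// fullProductsEven returns the exact 64-bit products of lanes 0 and 2 of a and b;
+//	// the odd-indexed lanes of the inputs are ignored.
+//	func fullProductsEven(a, b Uint32x4) Uint64x2 {
+//		return a.MulEvenWiden(b)
+//	}
+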
+/* MulHigh */
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHW, CPU Feature: AVX
+func (x Int16x8) MulHigh(y Int16x8) Int16x8
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHW, CPU Feature: AVX2
+func (x Int16x16) MulHigh(y Int16x16) Int16x16
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHW, CPU Feature: AVX512
+func (x Int16x32) MulHigh(y Int16x32) Int16x32
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHUW, CPU Feature: AVX
+func (x Uint16x8) MulHigh(y Uint16x8) Uint16x8
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHUW, CPU Feature: AVX2
+func (x Uint16x16) MulHigh(y Uint16x16) Uint16x16
+
+// MulHigh multiplies corresponding elements and returns the high half of each product.
+//
+// Asm: VPMULHUW, CPU Feature: AVX512
+func (x Uint16x32) MulHigh(y Uint16x32) Uint16x32
+
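+// Editor's note: an illustrative sketch, not part of the generated API above. For
+// unsigned 16-bit lanes, MulHigh(x, q) equals floor(x*q/65536), which is a Q0.16
+// fixed-point scale. BroadcastUint16x8 is an assumed helper (not declared in this
+// excerpt) that fills every lane with the same value.
+//
+//	// scaleByFraction multiplies every lane of x by f (0 <= f < 1) in fixed point:
+//	// q holds f scaled to Q0.16, and MulHigh keeps the high 16 bits of x*q.
+//	func scaleByFraction(x Uint16x8, f float64) Uint16x8 {
+//		q := BroadcastUint16x8(uint16(f * 65536))
+//		return x.MulHigh(q)
+//	}
+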
+/* MulSubAdd */
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PS, CPU Feature: AVX512
+func (x Float32x4) MulSubAdd(y Float32x4, z Float32x4) Float32x4
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PS, CPU Feature: AVX512
+func (x Float32x8) MulSubAdd(y Float32x8, z Float32x8) Float32x8
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PS, CPU Feature: AVX512
+func (x Float32x16) MulSubAdd(y Float32x16, z Float32x16) Float32x16
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PD, CPU Feature: AVX512
+func (x Float64x2) MulSubAdd(y Float64x2, z Float64x2) Float64x2
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PD, CPU Feature: AVX512
+func (x Float64x4) MulSubAdd(y Float64x4, z Float64x4) Float64x4
+
+// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
+//
+// Asm: VFMSUBADD213PD, CPU Feature: AVX512
+func (x Float64x8) MulSubAdd(y Float64x8, z Float64x8) Float64x8
+
+/* NotEqual */
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x4) NotEqual(y Float32x4) Mask32x4
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX
+func (x Float32x8) NotEqual(y Float32x8) Mask32x8
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPS, CPU Feature: AVX512
+func (x Float32x16) NotEqual(y Float32x16) Mask32x16
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x2) NotEqual(y Float64x2) Mask64x2
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX
+func (x Float64x4) NotEqual(y Float64x4) Mask64x4
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VCMPPD, CPU Feature: AVX512
+func (x Float64x8) NotEqual(y Float64x8) Mask64x8
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPB, CPU Feature: AVX512
+func (x Int8x64) NotEqual(y Int8x64) Mask8x64
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPW, CPU Feature: AVX512
+func (x Int16x32) NotEqual(y Int16x32) Mask16x32
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPD, CPU Feature: AVX512
+func (x Int32x16) NotEqual(y Int32x16) Mask32x16
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPQ, CPU Feature: AVX512
+func (x Int64x8) NotEqual(y Int64x8) Mask64x8
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPUB, CPU Feature: AVX512
+func (x Uint8x64) NotEqual(y Uint8x64) Mask8x64
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPUW, CPU Feature: AVX512
+func (x Uint16x32) NotEqual(y Uint16x32) Mask16x32
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPUD, CPU Feature: AVX512
+func (x Uint32x16) NotEqual(y Uint32x16) Mask32x16
+
+// NotEqual returns x not-equals y, elementwise.
+//
+// Asm: VPCMPUQ, CPU Feature: AVX512
+func (x Uint64x8) NotEqual(y Uint64x8) Mask64x8
+
+/* OnesCount */
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Int8x16) OnesCount() Int8x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Int8x32) OnesCount() Int8x32
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Int8x64) OnesCount() Int8x64
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Int16x8) OnesCount() Int16x8
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Int16x16) OnesCount() Int16x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Int16x32) OnesCount() Int16x32
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Int32x4) OnesCount() Int32x4
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Int32x8) OnesCount() Int32x8
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Int32x16) OnesCount() Int32x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Int64x2) OnesCount() Int64x2
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Int64x4) OnesCount() Int64x4
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Int64x8) OnesCount() Int64x8
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Uint8x16) OnesCount() Uint8x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Uint8x32) OnesCount() Uint8x32
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
+func (x Uint8x64) OnesCount() Uint8x64
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Uint16x8) OnesCount() Uint16x8
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Uint16x16) OnesCount() Uint16x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
+func (x Uint16x32) OnesCount() Uint16x32
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint32x4) OnesCount() Uint32x4
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint32x8) OnesCount() Uint32x8
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint32x16) OnesCount() Uint32x16
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint64x2) OnesCount() Uint64x2
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint64x4) OnesCount() Uint64x4
+
+// OnesCount counts the number of set bits in each element.
+//
+// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
+func (x Uint64x8) OnesCount() Uint64x8
+
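+// Editor's note: an illustrative sketch, not part of the generated API above.
+// OnesCount gives per-lane popcounts; summing them yields the popcount of a whole
+// bitmap. LoadUint64x4Slice and StoreSlice are assumed helpers, not declared in this
+// excerpt.
+//
+//	// popcount returns the number of set bits in full 4-word chunks of bits;
+//	// a scalar tail loop (omitted) would handle any remainder.
+//	func popcount(bits []uint64) uint64 {
+//		var total uint64
+//		var counts [4]uint64
+//		for i := 0; i+4 <= len(bits); i += 4 {
+//			LoadUint64x4Slice(bits[i:]).OnesCount().StoreSlice(counts[:])
+//			total += counts[0] + counts[1] + counts[2] + counts[3]
+//		}
+//		return total
+//	}
+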
+/* Or */
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Int8x16) Or(y Int8x16) Int8x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Int8x32) Or(y Int8x32) Int8x32
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Int8x64) Or(y Int8x64) Int8x64
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Int16x8) Or(y Int16x8) Int16x8
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Int16x16) Or(y Int16x16) Int16x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Int16x32) Or(y Int16x32) Int16x32
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Int32x4) Or(y Int32x4) Int32x4
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Int32x8) Or(y Int32x8) Int32x8
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Int32x16) Or(y Int32x16) Int32x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Int64x2) Or(y Int64x2) Int64x2
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Int64x4) Or(y Int64x4) Int64x4
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORQ, CPU Feature: AVX512
+func (x Int64x8) Or(y Int64x8) Int64x8
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Uint8x16) Or(y Uint8x16) Uint8x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Uint8x32) Or(y Uint8x32) Uint8x32
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Uint8x64) Or(y Uint8x64) Uint8x64
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Uint16x8) Or(y Uint16x8) Uint16x8
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Uint16x16) Or(y Uint16x16) Uint16x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Uint16x32) Or(y Uint16x32) Uint16x32
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Uint32x4) Or(y Uint32x4) Uint32x4
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Uint32x8) Or(y Uint32x8) Uint32x8
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORD, CPU Feature: AVX512
+func (x Uint32x16) Or(y Uint32x16) Uint32x16
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX
+func (x Uint64x2) Or(y Uint64x2) Uint64x2
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPOR, CPU Feature: AVX2
+func (x Uint64x4) Or(y Uint64x4) Uint64x4
+
+// Or performs a bitwise OR operation between two vectors.
+//
+// Asm: VPORQ, CPU Feature: AVX512
+func (x Uint64x8) Or(y Uint64x8) Uint64x8
+
+/* Permute */
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x16) Permute(indices Uint8x16) Int8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 5 bits (values 0-31) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x32) Permute(indices Uint8x32) Int8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 5 bits (values 0-31) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 6 bits (values 0-63) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Int8x64) Permute(indices Uint8x64) Int8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 6 bits (values 0-63) of each element of indices are used.
+//
+// Asm: VPERMB, CPU Feature: AVX512VBMI
+func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Int16x8) Permute(indices Uint16x8) Int16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Int16x16) Permute(indices Uint16x16) Int16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 5 bits (values 0-31) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Int16x32) Permute(indices Uint16x32) Int16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 5 bits (values 0-31) of each element of indices are used.
+//
+// Asm: VPERMW, CPU Feature: AVX512
+func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMPS, CPU Feature: AVX2
+func (x Float32x8) Permute(indices Uint32x8) Float32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Int32x8) Permute(indices Uint32x8) Int32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMD, CPU Feature: AVX2
+func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMPS, CPU Feature: AVX512
+func (x Float32x16) Permute(indices Uint32x16) Float32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMD, CPU Feature: AVX512
+func (x Int32x16) Permute(indices Uint32x16) Int32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 4 bits (values 0-15) of each element of indices are used.
+//
+// Asm: VPERMD, CPU Feature: AVX512
+func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 2 bits (values 0-3) of each element of indices are used.
+//
+// Asm: VPERMPD, CPU Feature: AVX512
+func (x Float64x4) Permute(indices Uint64x4) Float64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 2 bits (values 0-3) of each element of indices are used.
+//
+// Asm: VPERMQ, CPU Feature: AVX512
+func (x Int64x4) Permute(indices Uint64x4) Int64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 2 bits (values 0-3) of each element of indices are used.
+//
+// Asm: VPERMQ, CPU Feature: AVX512
+func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMPD, CPU Feature: AVX512
+func (x Float64x8) Permute(indices Uint64x8) Float64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMQ, CPU Feature: AVX512
+func (x Int64x8) Permute(indices Uint64x8) Int64x8
+
+// Permute performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The low 3 bits (values 0-7) of each element of indices are used.
+//
+// Asm: VPERMQ, CPU Feature: AVX512
+func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
+
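+// Editor's note: an illustrative sketch, not part of the generated API above. A
+// constant index vector turns Permute into an arbitrary lane shuffle; here it reverses
+// the lanes. LoadUint32x8Slice is an assumed helper, not declared in this excerpt.
+//
+//	// reverse returns x with its eight 32-bit lanes in the opposite order:
+//	// result lane i is x[indices[i]] = x[7-i].
+//	func reverse(x Int32x8) Int32x8 {
+//		indices := LoadUint32x8Slice([]uint32{7, 6, 5, 4, 3, 2, 1, 0})
+//		return x.Permute(indices)
+//	}
+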
+/* PermuteOrZero */
+
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
+//
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16
+
+// PermuteOrZero performs a full permutation of vector x using indices:
+// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
+// The lower four bits of each byte-sized index in indices select an element from x,
+// unless the index's sign bit is set, in which case zero is used instead.
+//
+// Asm: VPSHUFB, CPU Feature: AVX
+func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16
+
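+// Editor's note: an illustrative sketch, not part of the generated API above. With a
+// constant 16-byte table as the receiver, PermuteOrZero acts as a parallel table
+// lookup (the classic VPSHUFB trick). LoadUint8x16Slice is an assumed helper, not
+// declared in this excerpt.
+//
+//	// hexDigits maps each lane of nibbles (values 0..15) to its ASCII hex digit:
+//	// result[i] = table[nibbles[i] & 15].
+//	func hexDigits(nibbles Int8x16) Uint8x16 {
+//		table := LoadUint8x16Slice([]byte("0123456789abcdef"))
+//		return table.PermuteOrZero(nibbles)
+//	}
+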
+/* PermuteOrZeroGrouped */
+
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
+// Each group is 128 bits wide.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32
+
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
+// Each group is 128 bits wide.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64
+
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
+// Each group is 128 bits wide.
+//
+// Asm: VPSHUFB, CPU Feature: AVX2
+func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32
+
+// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
+// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
+// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
+// unless the index's sign bit is set, in which case zero is used instead.
+// Each group is 128 bits wide.
+//
+// Asm: VPSHUFB, CPU Feature: AVX512
+func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64
+
+/* Reciprocal */
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCPPS, CPU Feature: AVX
+func (x Float32x4) Reciprocal() Float32x4
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCPPS, CPU Feature: AVX
+func (x Float32x8) Reciprocal() Float32x8
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCP14PS, CPU Feature: AVX512
+func (x Float32x16) Reciprocal() Float32x16
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCP14PD, CPU Feature: AVX512
+func (x Float64x2) Reciprocal() Float64x2
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCP14PD, CPU Feature: AVX512
+func (x Float64x4) Reciprocal() Float64x4
+
+// Reciprocal computes an approximate reciprocal of each element.
+//
+// Asm: VRCP14PD, CPU Feature: AVX512
+func (x Float64x8) Reciprocal() Float64x8
+
+/* ReciprocalSqrt */
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRTPS, CPU Feature: AVX
+func (x Float32x4) ReciprocalSqrt() Float32x4
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRTPS, CPU Feature: AVX
+func (x Float32x8) ReciprocalSqrt() Float32x8
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRT14PS, CPU Feature: AVX512
+func (x Float32x16) ReciprocalSqrt() Float32x16
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRT14PD, CPU Feature: AVX512
+func (x Float64x2) ReciprocalSqrt() Float64x2
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRT14PD, CPU Feature: AVX512
+func (x Float64x4) ReciprocalSqrt() Float64x4
+
+// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
+//
+// Asm: VRSQRT14PD, CPU Feature: AVX512
+func (x Float64x8) ReciprocalSqrt() Float64x8
+
+/* RotateAllLeft */
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Int32x4) RotateAllLeft(shift uint8) Int32x4
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Int32x8) RotateAllLeft(shift uint8) Int32x8
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Int32x16) RotateAllLeft(shift uint8) Int32x16
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Int64x2) RotateAllLeft(shift uint8) Int64x2
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Int64x4) RotateAllLeft(shift uint8) Int64x4
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Int64x8) RotateAllLeft(shift uint8) Int64x8
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Uint32x4) RotateAllLeft(shift uint8) Uint32x4
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Uint32x8) RotateAllLeft(shift uint8) Uint32x8
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLD, CPU Feature: AVX512
+func (x Uint32x16) RotateAllLeft(shift uint8) Uint32x16
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Uint64x2) RotateAllLeft(shift uint8) Uint64x2
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Uint64x4) RotateAllLeft(shift uint8) Uint64x4
+
+// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPROLQ, CPU Feature: AVX512
+func (x Uint64x8) RotateAllLeft(shift uint8) Uint64x8
+
+/* RotateAllRight */
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Int32x4) RotateAllRight(shift uint8) Int32x4
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Int32x8) RotateAllRight(shift uint8) Int32x8
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Int32x16) RotateAllRight(shift uint8) Int32x16
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Int64x2) RotateAllRight(shift uint8) Int64x2
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Int64x4) RotateAllRight(shift uint8) Int64x4
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Int64x8) RotateAllRight(shift uint8) Int64x8
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Uint32x4) RotateAllRight(shift uint8) Uint32x4
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Uint32x8) RotateAllRight(shift uint8) Uint32x8
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORD, CPU Feature: AVX512
+func (x Uint32x16) RotateAllRight(shift uint8) Uint32x16
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Uint64x2) RotateAllRight(shift uint8) Uint64x2
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Uint64x4) RotateAllRight(shift uint8) Uint64x4
+
+// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPRORQ, CPU Feature: AVX512
+func (x Uint64x8) RotateAllRight(shift uint8) Uint64x8
+
+/* RotateLeft */
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Int32x4) RotateLeft(y Int32x4) Int32x4
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Int32x8) RotateLeft(y Int32x8) Int32x8
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Int32x16) RotateLeft(y Int32x16) Int32x16
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Int64x2) RotateLeft(y Int64x2) Int64x2
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Int64x4) RotateLeft(y Int64x4) Int64x4
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Int64x8) RotateLeft(y Int64x8) Int64x8
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Uint32x4) RotateLeft(y Uint32x4) Uint32x4
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Uint32x8) RotateLeft(y Uint32x8) Uint32x8
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVD, CPU Feature: AVX512
+func (x Uint32x16) RotateLeft(y Uint32x16) Uint32x16
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Uint64x2) RotateLeft(y Uint64x2) Uint64x2
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Uint64x4) RotateLeft(y Uint64x4) Uint64x4
+
+// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPROLVQ, CPU Feature: AVX512
+func (x Uint64x8) RotateLeft(y Uint64x8) Uint64x8
+
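+// Editor's note: an illustrative sketch, not part of the generated API above,
+// contrasting the two rotate flavors: RotateAllLeft takes one (preferably constant)
+// count for every lane, while RotateLeft takes a per-lane count vector. Only methods
+// declared in this file are used.
+//
+//	// rotl3 rotates every lane of x left by 3; the constant count lets the compiler
+//	// use the immediate form. rotlVar rotates each lane by its own count.
+//	func rotl3(x Uint32x8) Uint32x8 { return x.RotateAllLeft(3) }
+//
+//	func rotlVar(x, counts Uint32x8) Uint32x8 { return x.RotateLeft(counts) }
+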
+/* RotateRight */
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Int32x4) RotateRight(y Int32x4) Int32x4
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Int32x8) RotateRight(y Int32x8) Int32x8
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Int32x16) RotateRight(y Int32x16) Int32x16
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Int64x2) RotateRight(y Int64x2) Int64x2
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Int64x4) RotateRight(y Int64x4) Int64x4
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Int64x8) RotateRight(y Int64x8) Int64x8
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Uint32x4) RotateRight(y Uint32x4) Uint32x4
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Uint32x8) RotateRight(y Uint32x8) Uint32x8
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVD, CPU Feature: AVX512
+func (x Uint32x16) RotateRight(y Uint32x16) Uint32x16
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Uint64x2) RotateRight(y Uint64x2) Uint64x2
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Uint64x4) RotateRight(y Uint64x4) Uint64x4
+
+// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
+//
+// Asm: VPRORVQ, CPU Feature: AVX512
+func (x Uint64x8) RotateRight(y Uint64x8) Uint64x8
+
+/* RoundToEven */
+
+// RoundToEven rounds elements to the nearest integer, rounding ties to even.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) RoundToEven() Float32x4
+
+// RoundToEven rounds elements to the nearest integer, rounding ties to even.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) RoundToEven() Float32x8
+
+// RoundToEven rounds elements to the nearest integer, rounding ties to even.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) RoundToEven() Float64x2
+
+// RoundToEven rounds elements to the nearest integer, rounding ties to even.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) RoundToEven() Float64x4
+
+/* RoundToEvenScaled */
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x4) RoundToEvenScaled(prec uint8) Float32x4
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x8) RoundToEvenScaled(prec uint8) Float32x8
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x16) RoundToEvenScaled(prec uint8) Float32x16
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x2) RoundToEvenScaled(prec uint8) Float64x2
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x4) RoundToEvenScaled(prec uint8) Float64x4
+
+// RoundToEvenScaled rounds elements with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x8) RoundToEvenScaled(prec uint8) Float64x8
+
+/* RoundToEvenScaledResidue */
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x4) RoundToEvenScaledResidue(prec uint8) Float32x4
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x8) RoundToEvenScaledResidue(prec uint8) Float32x8
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x16) RoundToEvenScaledResidue(prec uint8) Float32x16
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x2) RoundToEvenScaledResidue(prec uint8) Float64x2
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
+
+// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
+//
+// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8
+
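+// RoundToEvenScaled and RoundToEvenScaledResidue are complementary: per lane,
+// the residue is exactly what the scaled rounding removed, so for finite
+// inputs the two parts add back to the original value. A sketch (Add is
+// assumed to be the package's element-wise addition):
+//
+//	r := x.RoundToEvenScaled(prec)        // rounded at the given precision
+//	f := x.RoundToEvenScaledResidue(prec) // what the rounding removed
+//	y := r.Add(f)                         // equals x for finite lanes
+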
+/* SHA1FourRounds */
+
+// SHA1FourRounds performs 4 rounds of the main loop of the SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds, ..., 3 for the last 20 rounds.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Uint32x4) SHA1FourRounds(constant uint8, y Uint32x4) Uint32x4
+
+/* SHA1Message1 */
+
+// SHA1Message1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA1Message1(y Uint32x4) Uint32x4
+
+/* SHA1Message2 */
+
+// SHA1Message2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Uint32x4) SHA1Message2(y Uint32x4) Uint32x4
+
+/* SHA1NextE */
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
+// for y to get e' already added to H4. (Note that the value of e' is computed only from x; the value of y does not affect
+// the computation of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
+
+/* SHA256Message1 */
+
+// SHA256Message1 does the sigma and addition of 1 in the SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Message1(y Uint32x4) Uint32x4
+
+/* SHA256Message2 */
+
+// SHA256Message2 does the sigma and addition of 3 in the SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG2, CPU Feature: SHA
+func (x Uint32x4) SHA256Message2(y Uint32x4) Uint32x4
+
+/* SHA256TwoRounds */
+
+// SHA256TwoRounds does 2 rounds of the main loop to calculate the updated state variables in the SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined on page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Uint32x4) SHA256TwoRounds(y Uint32x4, z Uint32x4) Uint32x4
+
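+// A schematic sketch of how the three SHA256 helpers above chain together to
+// extend the message schedule by four words (W16..W19). The variable names are
+// hypothetical, Add is assumed to be the package's element-wise addition, and
+// the element ordering follows the comments above:
+//
+//	t := w0to3.SHA256Message1(w4to7)     // W[t-16] + σ0(W[t-15]) terms
+//	t = t.Add(w9to12)                    // the "result of 2": add the W[t-7] terms
+//	w16to19 := t.SHA256Message2(w12to15) // adds the σ1(W[t-2]) terms; only W14 and W15 of the argument are read
+//
+// The four new words, with their K constants added, then feed SHA256TwoRounds
+// two rounds at a time, as described in its comment.
+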
+/* SaturateToInt8 */
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSWB, CPU Feature: AVX512
+func (x Int16x8) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSWB, CPU Feature: AVX512
+func (x Int16x16) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSWB, CPU Feature: AVX512
+func (x Int16x32) SaturateToInt8() Int8x32
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x4) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x8) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x16) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x2) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x4) SaturateToInt8() Int8x16
+
+// SaturateToInt8 converts element values to int8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x8) SaturateToInt8() Int8x16
+
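+// A minimal narrowing sketch for the conversions above: eight int32 lanes
+// saturate into the low eight bytes of an Int8x16, and the remaining bytes are
+// zero-cleared. LoadInt32x8Slice is an assumed helper, not necessarily the
+// package's real loader.
+//
+//	x := LoadInt32x8Slice(src) // 8 int32 lanes
+//	b := x.SaturateToInt8()    // Int8x16: 8 saturated bytes, then 8 zero bytes
+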
+/* SaturateToInt16 */
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSDW, CPU Feature: AVX512
+func (x Int32x4) SaturateToInt16() Int16x8
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSDW, CPU Feature: AVX512
+func (x Int32x8) SaturateToInt16() Int16x8
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSDW, CPU Feature: AVX512
+func (x Int32x16) SaturateToInt16() Int16x16
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQW, CPU Feature: AVX512
+func (x Int64x2) SaturateToInt16() Int16x8
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQW, CPU Feature: AVX512
+func (x Int64x4) SaturateToInt16() Int16x8
+
+// SaturateToInt16 converts element values to int16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQW, CPU Feature: AVX512
+func (x Int64x8) SaturateToInt16() Int16x8
+
+/* SaturateToInt16Concat */
+
+// SaturateToInt16Concat converts element values to int16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX
+func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8
+
+// SaturateToInt16Concat converts element values to int16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX2
+func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16
+
+// SaturateToInt16Concat converts element values to int16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKSSDW, CPU Feature: AVX512
+func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32
+
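+// Because the packing above works 128 bits at a time, the wider forms
+// interleave their sources per 128-bit group rather than concatenating them
+// whole. A sketch of the lane order for the 256-bit form, where x0..x7 and
+// y0..y7 are the saturated source elements:
+//
+//	r := x.SaturateToInt16Concat(y)
+//	// r = {x0..x3, y0..y3, x4..x7, y4..y7}
+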
+/* SaturateToInt32 */
+
+// SaturateToInt32 converts element values to int32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQD, CPU Feature: AVX512
+func (x Int64x2) SaturateToInt32() Int32x4
+
+// SaturateToInt32 converts element values to int32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQD, CPU Feature: AVX512
+func (x Int64x4) SaturateToInt32() Int32x4
+
+// SaturateToInt32 converts element values to int32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVSQD, CPU Feature: AVX512
+func (x Int64x8) SaturateToInt32() Int32x8
+
+/* SaturateToUint8 */
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSWB, CPU Feature: AVX512
+func (x Int16x8) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSWB, CPU Feature: AVX512
+func (x Int16x16) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x4) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x8) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSDB, CPU Feature: AVX512
+func (x Int32x16) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x2) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x4) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
+//
+// Asm: VPMOVSQB, CPU Feature: AVX512
+func (x Int64x8) SaturateToUint8() Int8x16
+
+// SaturateToUint8 converts element values to uint8.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSWB, CPU Feature: AVX512
+func (x Uint16x32) SaturateToUint8() Uint8x32
+
+/* SaturateToUint16 */
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSDW, CPU Feature: AVX512
+func (x Uint32x4) SaturateToUint16() Uint16x8
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSDW, CPU Feature: AVX512
+func (x Uint32x8) SaturateToUint16() Uint16x8
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSDW, CPU Feature: AVX512
+func (x Uint32x16) SaturateToUint16() Uint16x16
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQW, CPU Feature: AVX512
+func (x Uint64x2) SaturateToUint16() Uint16x8
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQW, CPU Feature: AVX512
+func (x Uint64x4) SaturateToUint16() Uint16x8
+
+// SaturateToUint16 converts element values to uint16.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQW, CPU Feature: AVX512
+func (x Uint64x8) SaturateToUint16() Uint16x8
+
+/* SaturateToUint16Concat */
+
+// SaturateToUint16Concat converts element values to uint16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX
+func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8
+
+// SaturateToUint16Concat converts element values to uint16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX2
+func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16
+
+// SaturateToUint16Concat converts element values to uint16.
+// With each 128 bits as a group:
+// The converted group from the first input vector will be packed to the lower part of the result vector,
+// the converted group from the second input vector will be packed to the upper part of the result vector.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPACKUSDW, CPU Feature: AVX512
+func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32
+
+/* SaturateToUint32 */
+
+// SaturateToUint32 converts element values to uint32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQD, CPU Feature: AVX512
+func (x Uint64x2) SaturateToUint32() Uint32x4
+
+// SaturateToUint32 converts element values to uint32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQD, CPU Feature: AVX512
+func (x Uint64x4) SaturateToUint32() Uint32x4
+
+// SaturateToUint32 converts element values to uint32.
+// Conversion is done with saturation on the vector elements.
+//
+// Asm: VPMOVUSQD, CPU Feature: AVX512
+func (x Uint64x8) SaturateToUint32() Uint32x8
+
+/* Scale */
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512
+func (x Float32x4) Scale(y Float32x4) Float32x4
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512
+func (x Float32x8) Scale(y Float32x8) Float32x8
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPS, CPU Feature: AVX512
+func (x Float32x16) Scale(y Float32x16) Float32x16
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512
+func (x Float64x2) Scale(y Float64x2) Float64x2
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512
+func (x Float64x4) Scale(y Float64x4) Float64x4
+
+// Scale multiplies each element of x by 2 raised to the floor of the corresponding element of y.
+//
+// Asm: VSCALEFPD, CPU Feature: AVX512
+func (x Float64x8) Scale(y Float64x8) Float64x8
+
+/* Select128FromPair */
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2F128, CPU Feature: AVX
+func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
+// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
+// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
+// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
+//
+// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
+// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
+//
+// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
+//
+// returns {70, 71, 72, 73, 40, 41, 42, 43}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
+
+// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
+// 128-bit elements, and returns a 256-bit result formed by
+// concatenating the two elements specified by lo and hi.
+// For example,
+//
+// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
+//
+// returns {70, 71, 40, 41}.
+//
+// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
+// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
+//
+// Asm: VPERM2I128, CPU Feature: AVX2
+func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
+
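+// Two common uses of Select128FromPair, following the element numbering in the
+// worked examples above (the low half of x is element 0, the high half of y is
+// element 3):
+//
+//	s := x.Select128FromPair(1, 0, x) // swap the two halves of x
+//	m := x.Select128FromPair(2, 1, y) // low half of y, then high half of x
+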
+/* SetElem */
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRD, CPU Feature: AVX
+func (x Float32x4) SetElem(index uint8, y float32) Float32x4
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRQ, CPU Feature: AVX
+func (x Float64x2) SetElem(index uint8, y float64) Float64x2
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRB, CPU Feature: AVX
+func (x Int8x16) SetElem(index uint8, y int8) Int8x16
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRW, CPU Feature: AVX
+func (x Int16x8) SetElem(index uint8, y int16) Int16x8
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRD, CPU Feature: AVX
+func (x Int32x4) SetElem(index uint8, y int32) Int32x4
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRQ, CPU Feature: AVX
+func (x Int64x2) SetElem(index uint8, y int64) Int64x2
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRB, CPU Feature: AVX
+func (x Uint8x16) SetElem(index uint8, y uint8) Uint8x16
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRW, CPU Feature: AVX
+func (x Uint16x8) SetElem(index uint8, y uint16) Uint16x8
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRD, CPU Feature: AVX
+func (x Uint32x4) SetElem(index uint8, y uint32) Uint32x4
+
+// SetElem sets a single constant-indexed element's value.
+//
+// index results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPINSRQ, CPU Feature: AVX
+func (x Uint64x2) SetElem(index uint8, y uint64) Uint64x2
+
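+// A small sketch of SetElem with a constant index, which stays on the single
+// VPINSR* instruction noted above (LoadInt32x4Slice is an assumed helper):
+//
+//	v := LoadInt32x4Slice(src)
+//	v = v.SetElem(2, -1) // overwrite lane 2; the other lanes are unchanged
+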
+/* SetHi */
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float32x8) SetHi(y Float32x4) Float32x8
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTF64X4, CPU Feature: AVX512
+func (x Float32x16) SetHi(y Float32x8) Float32x16
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float64x4) SetHi(y Float64x2) Float64x4
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTF64X4, CPU Feature: AVX512
+func (x Float64x8) SetHi(y Float64x4) Float64x8
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int8x32) SetHi(y Int8x16) Int8x32
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int8x64) SetHi(y Int8x32) Int8x64
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int16x16) SetHi(y Int16x8) Int16x16
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int16x32) SetHi(y Int16x16) Int16x32
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int32x8) SetHi(y Int32x4) Int32x8
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int32x16) SetHi(y Int32x8) Int32x16
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int64x4) SetHi(y Int64x2) Int64x4
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int64x8) SetHi(y Int64x4) Int64x8
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint8x32) SetHi(y Uint8x16) Uint8x32
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint8x64) SetHi(y Uint8x32) Uint8x64
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint16x16) SetHi(y Uint16x8) Uint16x16
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint16x32) SetHi(y Uint16x16) Uint16x32
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint32x8) SetHi(y Uint32x4) Uint32x8
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint32x16) SetHi(y Uint32x8) Uint32x16
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint64x4) SetHi(y Uint64x2) Uint64x4
+
+// SetHi returns x with its upper half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint64x8) SetHi(y Uint64x4) Uint64x8
+
+/* SetLo */
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float32x8) SetLo(y Float32x4) Float32x8
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTF64X4, CPU Feature: AVX512
+func (x Float32x16) SetLo(y Float32x8) Float32x16
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float64x4) SetLo(y Float64x2) Float64x4
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTF64X4, CPU Feature: AVX512
+func (x Float64x8) SetLo(y Float64x4) Float64x8
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int8x32) SetLo(y Int8x16) Int8x32
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int8x64) SetLo(y Int8x32) Int8x64
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int16x16) SetLo(y Int16x8) Int16x16
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int16x32) SetLo(y Int16x16) Int16x32
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int32x8) SetLo(y Int32x4) Int32x8
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int32x16) SetLo(y Int32x8) Int32x16
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int64x4) SetLo(y Int64x2) Int64x4
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Int64x8) SetLo(y Int64x4) Int64x8
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint8x32) SetLo(y Uint8x16) Uint8x32
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint8x64) SetLo(y Uint8x32) Uint8x64
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint16x16) SetLo(y Uint16x8) Uint16x16
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint16x32) SetLo(y Uint16x16) Uint16x32
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint32x8) SetLo(y Uint32x4) Uint32x8
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint32x16) SetLo(y Uint32x8) Uint32x16
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint64x4) SetLo(y Uint64x2) Uint64x4
+
+// SetLo returns x with its lower half set to y.
+//
+// Asm: VINSERTI64X4, CPU Feature: AVX512
+func (x Uint64x8) SetLo(y Uint64x4) Uint64x8
+
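+// SetLo and SetHi compose naturally to assemble a wide vector from two narrow
+// halves; a sketch starting from the zero value (lo16 and hi16 are
+// hypothetical Int8x16 values):
+//
+//	var v Int8x32
+//	v = v.SetLo(lo16).SetHi(hi16)
+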
+/* ShiftAllLeft */
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX
+func (x Int16x8) ShiftAllLeft(y uint64) Int16x8
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX2
+func (x Int16x16) ShiftAllLeft(y uint64) Int16x16
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX512
+func (x Int16x32) ShiftAllLeft(y uint64) Int16x32
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX
+func (x Int32x4) ShiftAllLeft(y uint64) Int32x4
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX2
+func (x Int32x8) ShiftAllLeft(y uint64) Int32x8
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX512
+func (x Int32x16) ShiftAllLeft(y uint64) Int32x16
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX
+func (x Int64x2) ShiftAllLeft(y uint64) Int64x2
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX2
+func (x Int64x4) ShiftAllLeft(y uint64) Int64x4
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX512
+func (x Int64x8) ShiftAllLeft(y uint64) Int64x8
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX
+func (x Uint16x8) ShiftAllLeft(y uint64) Uint16x8
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX2
+func (x Uint16x16) ShiftAllLeft(y uint64) Uint16x16
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLW, CPU Feature: AVX512
+func (x Uint16x32) ShiftAllLeft(y uint64) Uint16x32
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX
+func (x Uint32x4) ShiftAllLeft(y uint64) Uint32x4
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX2
+func (x Uint32x8) ShiftAllLeft(y uint64) Uint32x8
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLD, CPU Feature: AVX512
+func (x Uint32x16) ShiftAllLeft(y uint64) Uint32x16
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX
+func (x Uint64x2) ShiftAllLeft(y uint64) Uint64x2
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX2
+func (x Uint64x4) ShiftAllLeft(y uint64) Uint64x4
+
+// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLQ, CPU Feature: AVX512
+func (x Uint64x8) ShiftAllLeft(y uint64) Uint64x8
+
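+// ShiftAllLeft applies a single count to every lane, which makes it a cheap
+// multiply of the whole vector by a power of two; the per-lane-count variant
+// is ShiftLeft, declared further below. A sketch:
+//
+//	y := x.ShiftAllLeft(3) // every lane multiplied by 8, low bits filled with zeros
+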
+/* ShiftAllLeftConcat */
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Int16x8) ShiftAllLeftConcat(shift uint8, y Int16x8) Int16x8
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Int16x16) ShiftAllLeftConcat(shift uint8, y Int16x16) Int16x16
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Int16x32) ShiftAllLeftConcat(shift uint8, y Int16x32) Int16x32
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Int32x4) ShiftAllLeftConcat(shift uint8, y Int32x4) Int32x4
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Int32x8) ShiftAllLeftConcat(shift uint8, y Int32x8) Int32x8
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Int32x16) ShiftAllLeftConcat(shift uint8, y Int32x16) Int32x16
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Int64x2) ShiftAllLeftConcat(shift uint8, y Int64x2) Int64x2
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Int64x4) ShiftAllLeftConcat(shift uint8, y Int64x4) Int64x4
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Int64x8) ShiftAllLeftConcat(shift uint8, y Int64x8) Int64x8
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftAllLeftConcat(shift uint8, y Uint16x8) Uint16x8
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftAllLeftConcat(shift uint8, y Uint16x16) Uint16x16
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftAllLeftConcat(shift uint8, y Uint16x32) Uint16x32
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftAllLeftConcat(shift uint8, y Uint32x4) Uint32x4
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftAllLeftConcat(shift uint8, y Uint32x8) Uint32x8
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftAllLeftConcat(shift uint8, y Uint32x16) Uint32x16
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftAllLeftConcat(shift uint8, y Uint64x2) Uint64x2
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftAllLeftConcat(shift uint8, y Uint64x4) Uint64x4
+
+// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftAllLeftConcat(shift uint8, y Uint64x8) Uint64x8
+
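+// Passing the same vector as both operands turns ShiftAllLeftConcat into a
+// uniform rotate: the bits shifted out at the top re-enter at the bottom. A
+// sketch for 32-bit lanes, with n between 0 and 31:
+//
+//	r := x.ShiftAllLeftConcat(n, x) // each lane of x rotated left by n bits
+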
+/* ShiftAllRight */
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAW, CPU Feature: AVX
+func (x Int16x8) ShiftAllRight(y uint64) Int16x8
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAW, CPU Feature: AVX2
+func (x Int16x16) ShiftAllRight(y uint64) Int16x16
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAW, CPU Feature: AVX512
+func (x Int16x32) ShiftAllRight(y uint64) Int16x32
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAD, CPU Feature: AVX
+func (x Int32x4) ShiftAllRight(y uint64) Int32x4
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAD, CPU Feature: AVX2
+func (x Int32x8) ShiftAllRight(y uint64) Int32x8
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAD, CPU Feature: AVX512
+func (x Int32x16) ShiftAllRight(y uint64) Int32x16
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAQ, CPU Feature: AVX512
+func (x Int64x2) ShiftAllRight(y uint64) Int64x2
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAQ, CPU Feature: AVX512
+func (x Int64x4) ShiftAllRight(y uint64) Int64x4
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAQ, CPU Feature: AVX512
+func (x Int64x8) ShiftAllRight(y uint64) Int64x8
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLW, CPU Feature: AVX
+func (x Uint16x8) ShiftAllRight(y uint64) Uint16x8
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLW, CPU Feature: AVX2
+func (x Uint16x16) ShiftAllRight(y uint64) Uint16x16
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLW, CPU Feature: AVX512
+func (x Uint16x32) ShiftAllRight(y uint64) Uint16x32
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLD, CPU Feature: AVX
+func (x Uint32x4) ShiftAllRight(y uint64) Uint32x4
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLD, CPU Feature: AVX2
+func (x Uint32x8) ShiftAllRight(y uint64) Uint32x8
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLD, CPU Feature: AVX512
+func (x Uint32x16) ShiftAllRight(y uint64) Uint32x16
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLQ, CPU Feature: AVX
+func (x Uint64x2) ShiftAllRight(y uint64) Uint64x2
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLQ, CPU Feature: AVX2
+func (x Uint64x4) ShiftAllRight(y uint64) Uint64x4
+
+// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLQ, CPU Feature: AVX512
+func (x Uint64x8) ShiftAllRight(y uint64) Uint64x8
+
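+// The signed and unsigned forms above differ only in what fills the vacated
+// upper bits: the signed forms replicate the sign bit (arithmetic shift), the
+// unsigned forms insert zeros (logical shift). For example (si and ui stand
+// for any of the signed and unsigned vector types above):
+//
+//	a := si.ShiftAllRight(1) // an int32 lane holding -8 becomes -4
+//	b := ui.ShiftAllRight(1) // a uint32 lane holding 0xFFFFFFF8 becomes 0x7FFFFFFC
+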
+/* ShiftAllRightConcat */
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Int16x8) ShiftAllRightConcat(shift uint8, y Int16x8) Int16x8
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Int16x16) ShiftAllRightConcat(shift uint8, y Int16x16) Int16x16
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Int16x32) ShiftAllRightConcat(shift uint8, y Int16x32) Int16x32
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Int32x4) ShiftAllRightConcat(shift uint8, y Int32x4) Int32x4
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Int32x8) ShiftAllRightConcat(shift uint8, y Int32x8) Int32x8
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Int32x16) ShiftAllRightConcat(shift uint8, y Int32x16) Int32x16
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Int64x2) ShiftAllRightConcat(shift uint8, y Int64x2) Int64x2
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Int64x4) ShiftAllRightConcat(shift uint8, y Int64x4) Int64x4
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Int64x8) ShiftAllRightConcat(shift uint8, y Int64x8) Int64x8
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftAllRightConcat(shift uint8, y Uint16x8) Uint16x8
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftAllRightConcat(shift uint8, y Uint16x16) Uint16x16
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftAllRightConcat(shift uint8, y Uint16x32) Uint16x32
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftAllRightConcat(shift uint8, y Uint32x4) Uint32x4
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftAllRightConcat(shift uint8, y Uint32x8) Uint32x8
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftAllRightConcat(shift uint8, y Uint32x16) Uint32x16
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftAllRightConcat(shift uint8, y Uint64x2) Uint64x2
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftAllRightConcat(shift uint8, y Uint64x4) Uint64x4
+
+// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
+// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
+//
+// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftAllRightConcat(shift uint8, y Uint64x8) Uint64x8
+
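+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// a scalar model of the 32-bit ShiftAllRightConcat documented above, with the shift count
+// masked to the element width as described.
+func scalarShiftAllRightConcat32(x, y uint32, shift uint8) uint32 {
+	s := uint32(shift) & 31 // only the low bits of the immediate are used
+	if s == 0 {
+		return x // nothing is shifted in from y
+	}
+	return x>>s | y<<(32-s) // low bits of y fill the emptied upper bits of x
+}
+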
+/* ShiftLeft */
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Int16x8) ShiftLeft(y Int16x8) Int16x8
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Int16x16) ShiftLeft(y Int16x16) Int16x16
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Int16x32) ShiftLeft(y Int16x32) Int16x32
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX2
+func (x Int32x4) ShiftLeft(y Int32x4) Int32x4
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX2
+func (x Int32x8) ShiftLeft(y Int32x8) Int32x8
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX512
+func (x Int32x16) ShiftLeft(y Int32x16) Int32x16
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX2
+func (x Int64x2) ShiftLeft(y Int64x2) Int64x2
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX2
+func (x Int64x4) ShiftLeft(y Int64x4) Int64x4
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX512
+func (x Int64x8) ShiftLeft(y Int64x8) Int64x8
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Uint16x8) ShiftLeft(y Uint16x8) Uint16x8
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Uint16x16) ShiftLeft(y Uint16x16) Uint16x16
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVW, CPU Feature: AVX512
+func (x Uint16x32) ShiftLeft(y Uint16x32) Uint16x32
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX2
+func (x Uint32x4) ShiftLeft(y Uint32x4) Uint32x4
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX2
+func (x Uint32x8) ShiftLeft(y Uint32x8) Uint32x8
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVD, CPU Feature: AVX512
+func (x Uint32x16) ShiftLeft(y Uint32x16) Uint32x16
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX2
+func (x Uint64x2) ShiftLeft(y Uint64x2) Uint64x2
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX2
+func (x Uint64x4) ShiftLeft(y Uint64x4) Uint64x4
+
+// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
+//
+// Asm: VPSLLVQ, CPU Feature: AVX512
+func (x Uint64x8) ShiftLeft(y Uint64x8) Uint64x8
+
+/* ShiftLeftConcat */
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Int16x8) ShiftLeftConcat(y Int16x8, z Int16x8) Int16x8
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Int16x16) ShiftLeftConcat(y Int16x16, z Int16x16) Int16x16
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Int16x32) ShiftLeftConcat(y Int16x32, z Int16x32) Int16x32
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Int32x4) ShiftLeftConcat(y Int32x4, z Int32x4) Int32x4
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Int32x8) ShiftLeftConcat(y Int32x8, z Int32x8) Int32x8
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Int32x16) ShiftLeftConcat(y Int32x16, z Int32x16) Int32x16
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x2) ShiftLeftConcat(y Int64x2, z Int64x2) Int64x2
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x4) ShiftLeftConcat(y Int64x4, z Int64x4) Int64x4
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x8) ShiftLeftConcat(y Int64x8, z Int64x8) Int64x8
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftLeftConcat(y Uint16x8, z Uint16x8) Uint16x8
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftLeftConcat(y Uint16x16, z Uint16x16) Uint16x16
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftLeftConcat(y Uint16x32, z Uint16x32) Uint16x32
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftLeftConcat(y Uint32x4, z Uint32x4) Uint32x4
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftLeftConcat(y Uint32x8, z Uint32x8) Uint32x8
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftLeftConcat(y Uint32x16, z Uint32x16) Uint32x16
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftLeftConcat(y Uint64x2, z Uint64x2) Uint64x2
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftLeftConcat(y Uint64x4, z Uint64x4) Uint64x4
+
+// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
+//
+// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftLeftConcat(y Uint64x8, z Uint64x8) Uint64x8
+
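+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// a per-element scalar model of the 32-bit ShiftLeftConcat documented above, where the count
+// comes from the corresponding element of y and only its low bits are used.
+func scalarShiftLeftConcat32(x, y, z uint32) uint32 {
+	s := y & 31
+	if s == 0 {
+		return x // nothing is shifted in from z
+	}
+	return x<<s | z>>(32-s) // upper bits of z fill the emptied lower bits of x
+}
+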
+/* ShiftRight */
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVW, CPU Feature: AVX512
+func (x Int16x8) ShiftRight(y Int16x8) Int16x8
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVW, CPU Feature: AVX512
+func (x Int16x16) ShiftRight(y Int16x16) Int16x16
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVW, CPU Feature: AVX512
+func (x Int16x32) ShiftRight(y Int16x32) Int16x32
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVD, CPU Feature: AVX2
+func (x Int32x4) ShiftRight(y Int32x4) Int32x4
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVD, CPU Feature: AVX2
+func (x Int32x8) ShiftRight(y Int32x8) Int32x8
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVD, CPU Feature: AVX512
+func (x Int32x16) ShiftRight(y Int32x16) Int32x16
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVQ, CPU Feature: AVX512
+func (x Int64x2) ShiftRight(y Int64x2) Int64x2
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVQ, CPU Feature: AVX512
+func (x Int64x4) ShiftRight(y Int64x4) Int64x4
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
+//
+// Asm: VPSRAVQ, CPU Feature: AVX512
+func (x Int64x8) ShiftRight(y Int64x8) Int64x8
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVW, CPU Feature: AVX512
+func (x Uint16x8) ShiftRight(y Uint16x8) Uint16x8
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVW, CPU Feature: AVX512
+func (x Uint16x16) ShiftRight(y Uint16x16) Uint16x16
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVW, CPU Feature: AVX512
+func (x Uint16x32) ShiftRight(y Uint16x32) Uint16x32
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVD, CPU Feature: AVX2
+func (x Uint32x4) ShiftRight(y Uint32x4) Uint32x4
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVD, CPU Feature: AVX2
+func (x Uint32x8) ShiftRight(y Uint32x8) Uint32x8
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVD, CPU Feature: AVX512
+func (x Uint32x16) ShiftRight(y Uint32x16) Uint32x16
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVQ, CPU Feature: AVX2
+func (x Uint64x2) ShiftRight(y Uint64x2) Uint64x2
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVQ, CPU Feature: AVX2
+func (x Uint64x4) ShiftRight(y Uint64x4) Uint64x4
+
+// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
+//
+// Asm: VPSRLVQ, CPU Feature: AVX512
+func (x Uint64x8) ShiftRight(y Uint64x8) Uint64x8
+
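+// The helpers below are illustrative, hand-written sketches and not part of the generated API:
+// per-element scalar models of the variable ShiftLeft/ShiftRight families above for 32-bit
+// lanes, shown only for shift counts smaller than the element width.
+func scalarShiftLeft32(x, count uint32) uint32 {
+	return x << count // emptied lower bits are zeroed
+}
+
+func scalarShiftRightSigned32(x int32, count uint32) int32 {
+	return x >> count // emptied upper bits are filled with the sign bit
+}
+
+func scalarShiftRightUnsigned32(x, count uint32) uint32 {
+	return x >> count // emptied upper bits are zeroed
+}
+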
+/* ShiftRightConcat */
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Int16x8) ShiftRightConcat(y Int16x8, z Int16x8) Int16x8
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Int16x16) ShiftRightConcat(y Int16x16, z Int16x16) Int16x16
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Int16x32) ShiftRightConcat(y Int16x32, z Int16x32) Int16x32
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Int32x4) ShiftRightConcat(y Int32x4, z Int32x4) Int32x4
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Int32x8) ShiftRightConcat(y Int32x8, z Int32x8) Int32x8
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Int32x16) ShiftRightConcat(y Int32x16, z Int32x16) Int32x16
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x2) ShiftRightConcat(y Int64x2, z Int64x2) Int64x2
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x4) ShiftRightConcat(y Int64x4, z Int64x4) Int64x4
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Int64x8) ShiftRightConcat(y Int64x8, z Int64x8) Int64x8
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x8) ShiftRightConcat(y Uint16x8, z Uint16x8) Uint16x8
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x16) ShiftRightConcat(y Uint16x16, z Uint16x16) Uint16x16
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
+func (x Uint16x32) ShiftRightConcat(y Uint16x32, z Uint16x32) Uint16x32
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x4) ShiftRightConcat(y Uint32x4, z Uint32x4) Uint32x4
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x8) ShiftRightConcat(y Uint32x8, z Uint32x8) Uint32x8
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
+func (x Uint32x16) ShiftRightConcat(y Uint32x16, z Uint32x16) Uint32x16
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x2) ShiftRightConcat(y Uint64x2, z Uint64x2) Uint64x2
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x4) ShiftRightConcat(y Uint64x4, z Uint64x4) Uint64x4
+
+// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
+// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
+//
+// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
+func (x Uint64x8) ShiftRightConcat(y Uint64x8, z Uint64x8) Uint64x8
+
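+// The helper below is an illustrative usage sketch and not part of the generated API: given
+// the semantics documented above, passing x itself as the concatenated operand turns
+// ShiftRightConcat into a per-element rotate right, because the bits shifted out of x are
+// refilled from x.
+func rotateRightPerLane(x, counts Uint32x4) Uint32x4 {
+	return x.ShiftRightConcat(counts, x)
+}
+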
+/* Sqrt */
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPS, CPU Feature: AVX
+func (x Float32x4) Sqrt() Float32x4
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPS, CPU Feature: AVX
+func (x Float32x8) Sqrt() Float32x8
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPS, CPU Feature: AVX512
+func (x Float32x16) Sqrt() Float32x16
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPD, CPU Feature: AVX
+func (x Float64x2) Sqrt() Float64x2
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPD, CPU Feature: AVX
+func (x Float64x4) Sqrt() Float64x4
+
+// Sqrt computes the square root of each element.
+//
+// Asm: VSQRTPD, CPU Feature: AVX512
+func (x Float64x8) Sqrt() Float64x8
+
+/* Sub */
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPS, CPU Feature: AVX
+func (x Float32x4) Sub(y Float32x4) Float32x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPS, CPU Feature: AVX
+func (x Float32x8) Sub(y Float32x8) Float32x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPS, CPU Feature: AVX512
+func (x Float32x16) Sub(y Float32x16) Float32x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPD, CPU Feature: AVX
+func (x Float64x2) Sub(y Float64x2) Float64x2
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPD, CPU Feature: AVX
+func (x Float64x4) Sub(y Float64x4) Float64x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VSUBPD, CPU Feature: AVX512
+func (x Float64x8) Sub(y Float64x8) Float64x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX
+func (x Int8x16) Sub(y Int8x16) Int8x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX2
+func (x Int8x32) Sub(y Int8x32) Int8x32
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX512
+func (x Int8x64) Sub(y Int8x64) Int8x64
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX
+func (x Int16x8) Sub(y Int16x8) Int16x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX2
+func (x Int16x16) Sub(y Int16x16) Int16x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX512
+func (x Int16x32) Sub(y Int16x32) Int16x32
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX
+func (x Int32x4) Sub(y Int32x4) Int32x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX2
+func (x Int32x8) Sub(y Int32x8) Int32x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX512
+func (x Int32x16) Sub(y Int32x16) Int32x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX
+func (x Int64x2) Sub(y Int64x2) Int64x2
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX2
+func (x Int64x4) Sub(y Int64x4) Int64x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX512
+func (x Int64x8) Sub(y Int64x8) Int64x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX
+func (x Uint8x16) Sub(y Uint8x16) Uint8x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX2
+func (x Uint8x32) Sub(y Uint8x32) Uint8x32
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBB, CPU Feature: AVX512
+func (x Uint8x64) Sub(y Uint8x64) Uint8x64
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX
+func (x Uint16x8) Sub(y Uint16x8) Uint16x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX2
+func (x Uint16x16) Sub(y Uint16x16) Uint16x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBW, CPU Feature: AVX512
+func (x Uint16x32) Sub(y Uint16x32) Uint16x32
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX
+func (x Uint32x4) Sub(y Uint32x4) Uint32x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX2
+func (x Uint32x8) Sub(y Uint32x8) Uint32x8
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBD, CPU Feature: AVX512
+func (x Uint32x16) Sub(y Uint32x16) Uint32x16
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX
+func (x Uint64x2) Sub(y Uint64x2) Uint64x2
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX2
+func (x Uint64x4) Sub(y Uint64x4) Uint64x4
+
+// Sub subtracts corresponding elements of two vectors.
+//
+// Asm: VPSUBQ, CPU Feature: AVX512
+func (x Uint64x8) Sub(y Uint64x8) Uint64x8
+
+/* SubPairs */
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x4) SubPairs(y Float32x4) Float32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) SubPairs(y Float32x8) Float32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x2) SubPairs(y Float64x2) Float64x2
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) SubPairs(y Float64x4) Float64x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) SubPairs(y Int16x8) Int16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Int16x16) SubPairs(y Int16x16) Int16x16
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Int32x4) SubPairs(y Int32x4) Int32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Int32x8) SubPairs(y Int32x8) Int32x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
+
+// SubPairs horizontally subtracts adjacent pairs of elements.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8
+
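+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// the lane ordering documented above, written out for the four-lane case on plain arrays.
+func scalarSubPairs4(x, y [4]float32) [4]float32 {
+	return [4]float32{
+		y[0] - y[1], // pairs of y land in the low half of the result
+		y[2] - y[3],
+		x[0] - x[1], // pairs of x land in the high half
+		x[2] - x[3],
+	}
+}
+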
+/* SubPairsSaturated */
+
+// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX
+func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
+
+// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+//
+// Asm: VPHSUBSW, CPU Feature: AVX2
+func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16
+
+/* SubSaturated */
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX
+func (x Int8x16) SubSaturated(y Int8x16) Int8x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX2
+func (x Int8x32) SubSaturated(y Int8x32) Int8x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSB, CPU Feature: AVX512
+func (x Int8x64) SubSaturated(y Int8x64) Int8x64
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX
+func (x Int16x8) SubSaturated(y Int16x8) Int16x8
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX2
+func (x Int16x16) SubSaturated(y Int16x16) Int16x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBSW, CPU Feature: AVX512
+func (x Int16x32) SubSaturated(y Int16x32) Int16x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSB, CPU Feature: AVX
+func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSB, CPU Feature: AVX2
+func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSB, CPU Feature: AVX512
+func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSW, CPU Feature: AVX
+func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSW, CPU Feature: AVX2
+func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16
+
+// SubSaturated subtracts corresponding elements of two vectors with saturation.
+//
+// Asm: VPSUBUSW, CPU Feature: AVX512
+func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32
+
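+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// a scalar model of the saturating subtraction documented above for one signed 16-bit element.
+func scalarSubSaturatedInt16(x, y int16) int16 {
+	d := int32(x) - int32(y)
+	if d > 32767 {
+		return 32767 // clamp to the largest int16 instead of wrapping
+	}
+	if d < -32768 {
+		return -32768 // clamp to the smallest int16
+	}
+	return int16(d)
+}
+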
+/* SumAbsDiff */
+
+// SumAbsDiff computes sums of absolute differences between the two input vectors, treating each group of 8 adjacent bytes as a unit. The output
+// is a vector of word-sized elements in which element 4*n contains the sum for the n-th input group; the other elements of the result are zeroed.
+// Each sum can be seen as the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX
+func (x Uint8x16) SumAbsDiff(y Uint8x16) Uint16x8
+
+// SumAbsDiff computes sums of absolute differences between the two input vectors, treating each group of 8 adjacent bytes as a unit. The output
+// is a vector of word-sized elements in which element 4*n contains the sum for the n-th input group; the other elements of the result are zeroed.
+// Each sum can be seen as the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX2
+func (x Uint8x32) SumAbsDiff(y Uint8x32) Uint16x16
+
+// SumAbsDiff computes sums of absolute differences between the two input vectors, treating each group of 8 adjacent bytes as a unit. The output
+// is a vector of word-sized elements in which element 4*n contains the sum for the n-th input group; the other elements of the result are zeroed.
+// Each sum can be seen as the L1 distance between the corresponding 8-byte groups of the two input vectors.
+//
+// Asm: VPSADBW, CPU Feature: AVX512
+func (x Uint8x64) SumAbsDiff(y Uint8x64) Uint16x32
+
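+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// a scalar model of the 16-byte SumAbsDiff documented above. Each group of 8 adjacent bytes
+// produces one sum, stored in result element 4*n; the remaining word elements stay zero.
+func scalarSumAbsDiff16(x, y [16]uint8) [8]uint16 {
+	var r [8]uint16
+	for group := 0; group < 2; group++ {
+		var sum uint16
+		for i := 0; i < 8; i++ {
+			a, b := x[group*8+i], y[group*8+i]
+			if a >= b {
+				sum += uint16(a - b)
+			} else {
+				sum += uint16(b - a)
+			}
+		}
+		r[group*4] = sum
+	}
+	return r
+}
+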
+/* Trunc */
+
+// Trunc truncates elements towards zero.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x4) Trunc() Float32x4
+
+// Trunc truncates elements towards zero.
+//
+// Asm: VROUNDPS, CPU Feature: AVX
+func (x Float32x8) Trunc() Float32x8
+
+// Trunc truncates elements towards zero.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x2) Trunc() Float64x2
+
+// Trunc truncates elements towards zero.
+//
+// Asm: VROUNDPD, CPU Feature: AVX
+func (x Float64x4) Trunc() Float64x4
+
+/* TruncScaled */
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x4) TruncScaled(prec uint8) Float32x4
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x8) TruncScaled(prec uint8) Float32x8
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPS, CPU Feature: AVX512
+func (x Float32x16) TruncScaled(prec uint8) Float32x16
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x2) TruncScaled(prec uint8) Float64x2
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x4) TruncScaled(prec uint8) Float64x4
+
+// TruncScaled truncates elements with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VRNDSCALEPD, CPU Feature: AVX512
+func (x Float64x8) TruncScaled(prec uint8) Float64x8
+
+/* TruncScaledResidue */
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x4) TruncScaledResidue(prec uint8) Float32x4
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x8) TruncScaledResidue(prec uint8) Float32x8
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPS, CPU Feature: AVX512
+func (x Float32x16) TruncScaledResidue(prec uint8) Float32x16
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x2) TruncScaledResidue(prec uint8) Float64x2
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x4) TruncScaledResidue(prec uint8) Float64x4
+
+// TruncScaledResidue computes the difference after truncating with specified precision.
+//
+// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VREDUCEPD, CPU Feature: AVX512
+func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8
+
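+// The helpers below are an illustrative, hand-written sketch and not part of the generated
+// API. They model TruncScaled/TruncScaledResidue under the assumption that prec selects a
+// precision of 2^-prec (the usual VRNDSCALE/VREDUCE interpretation); the int64 conversion
+// used for truncation is only valid for values of moderate magnitude.
+func scalarTruncScaled(x float64, prec uint8) float64 {
+	scale := float64(uint64(1) << prec) // 2^prec
+	return float64(int64(x*scale)) / scale // truncate toward zero at 2^-prec precision
+}
+
+func scalarTruncScaledResidue(x float64, prec uint8) float64 {
+	return x - scalarTruncScaled(x, prec) // the fraction discarded by TruncScaled
+}
+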
+/* TruncateToInt8 */
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Int16x8) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Int16x16) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Int16x32) TruncateToInt8() Int8x32
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Int32x4) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Int32x8) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Int32x16) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Int64x2) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Int64x4) TruncateToInt8() Int8x16
+
+// TruncateToInt8 converts element values to int8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Int64x8) TruncateToInt8() Int8x16
+
+/* TruncateToInt16 */
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Int32x4) TruncateToInt16() Int16x8
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Int32x8) TruncateToInt16() Int16x8
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Int32x16) TruncateToInt16() Int16x16
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Int64x2) TruncateToInt16() Int16x8
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Int64x4) TruncateToInt16() Int16x8
+
+// TruncateToInt16 converts element values to int16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Int64x8) TruncateToInt16() Int16x8
+
+/* TruncateToInt32 */
+
+// TruncateToInt32 converts element values to int32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Int64x2) TruncateToInt32() Int32x4
+
+// TruncateToInt32 converts element values to int32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Int64x4) TruncateToInt32() Int32x4
+
+// TruncateToInt32 converts element values to int32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Int64x8) TruncateToInt32() Int32x8
+
+/* TruncateToUint8 */
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Uint16x8) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Uint16x16) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVWB, CPU Feature: AVX512
+func (x Uint16x32) TruncateToUint8() Uint8x32
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Uint32x4) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Uint32x8) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVDB, CPU Feature: AVX512
+func (x Uint32x16) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Uint64x2) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Uint64x4) TruncateToUint8() Uint8x16
+
+// TruncateToUint8 converts element values to uint8.
+// Conversion is done with truncation on the vector elements.
+// Results are packed into the low elements of the returned vector; its upper elements are zero-cleared.
+//
+// Asm: VPMOVQB, CPU Feature: AVX512
+func (x Uint64x8) TruncateToUint8() Uint8x16
+
+/* TruncateToUint16 */
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Uint32x4) TruncateToUint16() Uint16x8
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Uint32x8) TruncateToUint16() Uint16x8
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVDW, CPU Feature: AVX512
+func (x Uint32x16) TruncateToUint16() Uint16x16
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Uint64x2) TruncateToUint16() Uint16x8
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Uint64x4) TruncateToUint16() Uint16x8
+
+// TruncateToUint16 converts element values to uint16.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQW, CPU Feature: AVX512
+func (x Uint64x8) TruncateToUint16() Uint16x8
+
+/* TruncateToUint32 */
+
+// TruncateToUint32 converts element values to uint32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Uint64x2) TruncateToUint32() Uint32x4
+
+// TruncateToUint32 converts element values to uint32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Uint64x4) TruncateToUint32() Uint32x4
+
+// TruncateToUint32 converts element values to uint32.
+// Conversion is done with truncation on the vector elements.
+//
+// Asm: VPMOVQD, CPU Feature: AVX512
+func (x Uint64x8) TruncateToUint32() Uint32x8
+
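+// The helper below is an illustrative, hand-written sketch and not part of the generated API:
+// per the documentation above, the TruncateTo* methods perform a plain integer truncation on
+// each element, keeping only the low bits of the source value (no saturation).
+func scalarTruncateInt32ToInt8(x int32) int8 {
+	return int8(x) // keep the low 8 bits and discard the rest
+}
+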
+/* Xor */
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Int8x16) Xor(y Int8x16) Int8x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Int8x32) Xor(y Int8x32) Int8x32
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Int8x64) Xor(y Int8x64) Int8x64
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Int16x8) Xor(y Int16x8) Int16x8
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Int16x16) Xor(y Int16x16) Int16x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Int16x32) Xor(y Int16x32) Int16x32
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Int32x4) Xor(y Int32x4) Int32x4
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Int32x8) Xor(y Int32x8) Int32x8
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Int32x16) Xor(y Int32x16) Int32x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Int64x2) Xor(y Int64x2) Int64x2
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Int64x4) Xor(y Int64x4) Int64x4
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORQ, CPU Feature: AVX512
+func (x Int64x8) Xor(y Int64x8) Int64x8
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Uint8x16) Xor(y Uint8x16) Uint8x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Uint8x32) Xor(y Uint8x32) Uint8x32
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Uint8x64) Xor(y Uint8x64) Uint8x64
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Uint16x8) Xor(y Uint16x8) Uint16x8
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Uint16x16) Xor(y Uint16x16) Uint16x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Uint16x32) Xor(y Uint16x32) Uint16x32
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Uint32x4) Xor(y Uint32x4) Uint32x4
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Uint32x8) Xor(y Uint32x8) Uint32x8
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORD, CPU Feature: AVX512
+func (x Uint32x16) Xor(y Uint32x16) Uint32x16
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX
+func (x Uint64x2) Xor(y Uint64x2) Uint64x2
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXOR, CPU Feature: AVX2
+func (x Uint64x4) Xor(y Uint64x4) Uint64x4
+
+// Xor performs a bitwise XOR operation between two vectors.
+//
+// Asm: VPXORQ, CPU Feature: AVX512
+func (x Uint64x8) Xor(y Uint64x8) Uint64x8
+
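+// The helper below is an illustrative usage sketch and not part of the generated API: XORing
+// a vector with itself yields the all-zero vector, a common way to materialize zero without a load.
+func zeroUint64x4(x Uint64x4) Uint64x4 {
+	return x.Xor(x)
+}
+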
+// Float64x2 converts from Float32x4 to Float64x2
+func (from Float32x4) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Float32x4 to Int8x16
+func (from Float32x4) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Float32x4 to Int16x8
+func (from Float32x4) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Float32x4 to Int32x4
+func (from Float32x4) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Float32x4 to Int64x2
+func (from Float32x4) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Float32x4 to Uint8x16
+func (from Float32x4) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Float32x4 to Uint16x8
+func (from Float32x4) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Float32x4 to Uint32x4
+func (from Float32x4) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Float32x4 to Uint64x2
+func (from Float32x4) AsUint64x2() (to Uint64x2)
+
+// Float64x4 converts from Float32x8 to Float64x4
+func (from Float32x8) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Float32x8 to Int8x32
+func (from Float32x8) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Float32x8 to Int16x16
+func (from Float32x8) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Float32x8 to Int32x8
+func (from Float32x8) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Float32x8 to Int64x4
+func (from Float32x8) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Float32x8 to Uint8x32
+func (from Float32x8) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Float32x8 to Uint16x16
+func (from Float32x8) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Float32x8 to Uint32x8
+func (from Float32x8) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Float32x8 to Uint64x4
+func (from Float32x8) AsUint64x4() (to Uint64x4)
+
+// Float64x8 converts from Float32x16 to Float64x8
+func (from Float32x16) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Float32x16 to Int8x64
+func (from Float32x16) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Float32x16 to Int16x32
+func (from Float32x16) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Float32x16 to Int32x16
+func (from Float32x16) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Float32x16 to Int64x8
+func (from Float32x16) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Float32x16 to Uint8x64
+func (from Float32x16) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Float32x16 to Uint16x32
+func (from Float32x16) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Float32x16 to Uint32x16
+func (from Float32x16) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Float32x16 to Uint64x8
+func (from Float32x16) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Float64x2 to Float32x4
+func (from Float64x2) AsFloat32x4() (to Float32x4)
+
+// Int8x16 converts from Float64x2 to Int8x16
+func (from Float64x2) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Float64x2 to Int16x8
+func (from Float64x2) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Float64x2 to Int32x4
+func (from Float64x2) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Float64x2 to Int64x2
+func (from Float64x2) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Float64x2 to Uint8x16
+func (from Float64x2) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Float64x2 to Uint16x8
+func (from Float64x2) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Float64x2 to Uint32x4
+func (from Float64x2) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Float64x2 to Uint64x2
+func (from Float64x2) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Float64x4 to Float32x8
+func (from Float64x4) AsFloat32x8() (to Float32x8)
+
+// Int8x32 converts from Float64x4 to Int8x32
+func (from Float64x4) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Float64x4 to Int16x16
+func (from Float64x4) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Float64x4 to Int32x8
+func (from Float64x4) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Float64x4 to Int64x4
+func (from Float64x4) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Float64x4 to Uint8x32
+func (from Float64x4) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Float64x4 to Uint16x16
+func (from Float64x4) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Float64x4 to Uint32x8
+func (from Float64x4) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Float64x4 to Uint64x4
+func (from Float64x4) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Float64x8 to Float32x16
+func (from Float64x8) AsFloat32x16() (to Float32x16)
+
+// Int8x64 converts from Float64x8 to Int8x64
+func (from Float64x8) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Float64x8 to Int16x32
+func (from Float64x8) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Float64x8 to Int32x16
+func (from Float64x8) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Float64x8 to Int64x8
+func (from Float64x8) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Float64x8 to Uint8x64
+func (from Float64x8) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Float64x8 to Uint16x32
+func (from Float64x8) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Float64x8 to Uint32x16
+func (from Float64x8) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Float64x8 to Uint64x8
+func (from Float64x8) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Int8x16 to Float32x4
+func (from Int8x16) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int8x16 to Float64x2
+func (from Int8x16) AsFloat64x2() (to Float64x2)
+
+// Int16x8 converts from Int8x16 to Int16x8
+func (from Int8x16) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Int8x16 to Int32x4
+func (from Int8x16) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Int8x16 to Int64x2
+func (from Int8x16) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Int8x16 to Uint8x16
+func (from Int8x16) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Int8x16 to Uint16x8
+func (from Int8x16) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Int8x16 to Uint32x4
+func (from Int8x16) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Int8x16 to Uint64x2
+func (from Int8x16) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Int8x32 to Float32x8
+func (from Int8x32) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Int8x32 to Float64x4
+func (from Int8x32) AsFloat64x4() (to Float64x4)
+
+// Int16x16 converts from Int8x32 to Int16x16
+func (from Int8x32) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Int8x32 to Int32x8
+func (from Int8x32) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Int8x32 to Int64x4
+func (from Int8x32) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Int8x32 to Uint8x32
+func (from Int8x32) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Int8x32 to Uint16x16
+func (from Int8x32) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Int8x32 to Uint32x8
+func (from Int8x32) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Int8x32 to Uint64x4
+func (from Int8x32) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Int8x64 to Float32x16
+func (from Int8x64) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Int8x64 to Float64x8
+func (from Int8x64) AsFloat64x8() (to Float64x8)
+
+// Int16x32 converts from Int8x64 to Int16x32
+func (from Int8x64) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Int8x64 to Int32x16
+func (from Int8x64) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Int8x64 to Int64x8
+func (from Int8x64) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Int8x64 to Uint8x64
+func (from Int8x64) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Int8x64 to Uint16x32
+func (from Int8x64) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Int8x64 to Uint32x16
+func (from Int8x64) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Int8x64 to Uint64x8
+func (from Int8x64) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Int16x8 to Float32x4
+func (from Int16x8) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int16x8 to Float64x2
+func (from Int16x8) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Int16x8 to Int8x16
+func (from Int16x8) AsInt8x16() (to Int8x16)
+
+// Int32x4 converts from Int16x8 to Int32x4
+func (from Int16x8) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Int16x8 to Int64x2
+func (from Int16x8) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Int16x8 to Uint8x16
+func (from Int16x8) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Int16x8 to Uint16x8
+func (from Int16x8) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Int16x8 to Uint32x4
+func (from Int16x8) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Int16x8 to Uint64x2
+func (from Int16x8) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Int16x16 to Float32x8
+func (from Int16x16) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Int16x16 to Float64x4
+func (from Int16x16) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Int16x16 to Int8x32
+func (from Int16x16) AsInt8x32() (to Int8x32)
+
+// Int32x8 converts from Int16x16 to Int32x8
+func (from Int16x16) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Int16x16 to Int64x4
+func (from Int16x16) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Int16x16 to Uint8x32
+func (from Int16x16) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Int16x16 to Uint16x16
+func (from Int16x16) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Int16x16 to Uint32x8
+func (from Int16x16) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Int16x16 to Uint64x4
+func (from Int16x16) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Int16x32 to Float32x16
+func (from Int16x32) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Int16x32 to Float64x8
+func (from Int16x32) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Int16x32 to Int8x64
+func (from Int16x32) AsInt8x64() (to Int8x64)
+
+// Int32x16 converts from Int16x32 to Int32x16
+func (from Int16x32) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Int16x32 to Int64x8
+func (from Int16x32) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Int16x32 to Uint8x64
+func (from Int16x32) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Int16x32 to Uint16x32
+func (from Int16x32) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Int16x32 to Uint32x16
+func (from Int16x32) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Int16x32 to Uint64x8
+func (from Int16x32) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Int32x4 to Float32x4
+func (from Int32x4) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int32x4 to Float64x2
+func (from Int32x4) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Int32x4 to Int8x16
+func (from Int32x4) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Int32x4 to Int16x8
+func (from Int32x4) AsInt16x8() (to Int16x8)
+
+// Int64x2 converts from Int32x4 to Int64x2
+func (from Int32x4) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Int32x4 to Uint8x16
+func (from Int32x4) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Int32x4 to Uint16x8
+func (from Int32x4) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Int32x4 to Uint32x4
+func (from Int32x4) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Int32x4 to Uint64x2
+func (from Int32x4) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Int32x8 to Float32x8
+func (from Int32x8) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Int32x8 to Float64x4
+func (from Int32x8) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Int32x8 to Int8x32
+func (from Int32x8) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Int32x8 to Int16x16
+func (from Int32x8) AsInt16x16() (to Int16x16)
+
+// Int64x4 converts from Int32x8 to Int64x4
+func (from Int32x8) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Int32x8 to Uint8x32
+func (from Int32x8) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Int32x8 to Uint16x16
+func (from Int32x8) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Int32x8 to Uint32x8
+func (from Int32x8) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Int32x8 to Uint64x4
+func (from Int32x8) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Int32x16 to Float32x16
+func (from Int32x16) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Int32x16 to Float64x8
+func (from Int32x16) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Int32x16 to Int8x64
+func (from Int32x16) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Int32x16 to Int16x32
+func (from Int32x16) AsInt16x32() (to Int16x32)
+
+// Int64x8 converts from Int32x16 to Int64x8
+func (from Int32x16) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Int32x16 to Uint8x64
+func (from Int32x16) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Int32x16 to Uint16x32
+func (from Int32x16) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Int32x16 to Uint32x16
+func (from Int32x16) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Int32x16 to Uint64x8
+func (from Int32x16) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Int64x2 to Float32x4
+func (from Int64x2) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Int64x2 to Float64x2
+func (from Int64x2) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Int64x2 to Int8x16
+func (from Int64x2) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Int64x2 to Int16x8
+func (from Int64x2) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Int64x2 to Int32x4
+func (from Int64x2) AsInt32x4() (to Int32x4)
+
+// Uint8x16 converts from Int64x2 to Uint8x16
+func (from Int64x2) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Int64x2 to Uint16x8
+func (from Int64x2) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Int64x2 to Uint32x4
+func (from Int64x2) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Int64x2 to Uint64x2
+func (from Int64x2) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Int64x4 to Float32x8
+func (from Int64x4) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Int64x4 to Float64x4
+func (from Int64x4) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Int64x4 to Int8x32
+func (from Int64x4) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Int64x4 to Int16x16
+func (from Int64x4) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Int64x4 to Int32x8
+func (from Int64x4) AsInt32x8() (to Int32x8)
+
+// Uint8x32 converts from Int64x4 to Uint8x32
+func (from Int64x4) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Int64x4 to Uint16x16
+func (from Int64x4) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Int64x4 to Uint32x8
+func (from Int64x4) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Int64x4 to Uint64x4
+func (from Int64x4) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Int64x8 to Float32x16
+func (from Int64x8) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Int64x8 to Float64x8
+func (from Int64x8) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Int64x8 to Int8x64
+func (from Int64x8) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Int64x8 to Int16x32
+func (from Int64x8) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Int64x8 to Int32x16
+func (from Int64x8) AsInt32x16() (to Int32x16)
+
+// Uint8x64 converts from Int64x8 to Uint8x64
+func (from Int64x8) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Int64x8 to Uint16x32
+func (from Int64x8) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Int64x8 to Uint32x16
+func (from Int64x8) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Int64x8 to Uint64x8
+func (from Int64x8) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Uint8x16 to Float32x4
+func (from Uint8x16) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Uint8x16 to Float64x2
+func (from Uint8x16) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Uint8x16 to Int8x16
+func (from Uint8x16) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Uint8x16 to Int16x8
+func (from Uint8x16) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Uint8x16 to Int32x4
+func (from Uint8x16) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Uint8x16 to Int64x2
+func (from Uint8x16) AsInt64x2() (to Int64x2)
+
+// Uint16x8 converts from Uint8x16 to Uint16x8
+func (from Uint8x16) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Uint8x16 to Uint32x4
+func (from Uint8x16) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Uint8x16 to Uint64x2
+func (from Uint8x16) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Uint8x32 to Float32x8
+func (from Uint8x32) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Uint8x32 to Float64x4
+func (from Uint8x32) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Uint8x32 to Int8x32
+func (from Uint8x32) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Uint8x32 to Int16x16
+func (from Uint8x32) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Uint8x32 to Int32x8
+func (from Uint8x32) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Uint8x32 to Int64x4
+func (from Uint8x32) AsInt64x4() (to Int64x4)
+
+// Uint16x16 converts from Uint8x32 to Uint16x16
+func (from Uint8x32) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Uint8x32 to Uint32x8
+func (from Uint8x32) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Uint8x32 to Uint64x4
+func (from Uint8x32) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Uint8x64 to Float32x16
+func (from Uint8x64) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Uint8x64 to Float64x8
+func (from Uint8x64) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Uint8x64 to Int8x64
+func (from Uint8x64) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Uint8x64 to Int16x32
+func (from Uint8x64) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Uint8x64 to Int32x16
+func (from Uint8x64) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Uint8x64 to Int64x8
+func (from Uint8x64) AsInt64x8() (to Int64x8)
+
+// Uint16x32 converts from Uint8x64 to Uint16x32
+func (from Uint8x64) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Uint8x64 to Uint32x16
+func (from Uint8x64) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Uint8x64 to Uint64x8
+func (from Uint8x64) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Uint16x8 to Float32x4
+func (from Uint16x8) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Uint16x8 to Float64x2
+func (from Uint16x8) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Uint16x8 to Int8x16
+func (from Uint16x8) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Uint16x8 to Int16x8
+func (from Uint16x8) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Uint16x8 to Int32x4
+func (from Uint16x8) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Uint16x8 to Int64x2
+func (from Uint16x8) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Uint16x8 to Uint8x16
+func (from Uint16x8) AsUint8x16() (to Uint8x16)
+
+// Uint32x4 converts from Uint16x8 to Uint32x4
+func (from Uint16x8) AsUint32x4() (to Uint32x4)
+
+// Uint64x2 converts from Uint16x8 to Uint64x2
+func (from Uint16x8) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Uint16x16 to Float32x8
+func (from Uint16x16) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Uint16x16 to Float64x4
+func (from Uint16x16) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Uint16x16 to Int8x32
+func (from Uint16x16) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Uint16x16 to Int16x16
+func (from Uint16x16) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Uint16x16 to Int32x8
+func (from Uint16x16) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Uint16x16 to Int64x4
+func (from Uint16x16) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Uint16x16 to Uint8x32
+func (from Uint16x16) AsUint8x32() (to Uint8x32)
+
+// Uint32x8 converts from Uint16x16 to Uint32x8
+func (from Uint16x16) AsUint32x8() (to Uint32x8)
+
+// Uint64x4 converts from Uint16x16 to Uint64x4
+func (from Uint16x16) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Uint16x32 to Float32x16
+func (from Uint16x32) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Uint16x32 to Float64x8
+func (from Uint16x32) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Uint16x32 to Int8x64
+func (from Uint16x32) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Uint16x32 to Int16x32
+func (from Uint16x32) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Uint16x32 to Int32x16
+func (from Uint16x32) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Uint16x32 to Int64x8
+func (from Uint16x32) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Uint16x32 to Uint8x64
+func (from Uint16x32) AsUint8x64() (to Uint8x64)
+
+// Uint32x16 converts from Uint16x32 to Uint32x16
+func (from Uint16x32) AsUint32x16() (to Uint32x16)
+
+// Uint64x8 converts from Uint16x32 to Uint64x8
+func (from Uint16x32) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Uint32x4 to Float32x4
+func (from Uint32x4) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Uint32x4 to Float64x2
+func (from Uint32x4) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Uint32x4 to Int8x16
+func (from Uint32x4) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Uint32x4 to Int16x8
+func (from Uint32x4) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Uint32x4 to Int32x4
+func (from Uint32x4) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Uint32x4 to Int64x2
+func (from Uint32x4) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Uint32x4 to Uint8x16
+func (from Uint32x4) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Uint32x4 to Uint16x8
+func (from Uint32x4) AsUint16x8() (to Uint16x8)
+
+// Uint64x2 converts from Uint32x4 to Uint64x2
+func (from Uint32x4) AsUint64x2() (to Uint64x2)
+
+// Float32x8 converts from Uint32x8 to Float32x8
+func (from Uint32x8) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Uint32x8 to Float64x4
+func (from Uint32x8) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Uint32x8 to Int8x32
+func (from Uint32x8) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Uint32x8 to Int16x16
+func (from Uint32x8) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Uint32x8 to Int32x8
+func (from Uint32x8) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Uint32x8 to Int64x4
+func (from Uint32x8) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Uint32x8 to Uint8x32
+func (from Uint32x8) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Uint32x8 to Uint16x16
+func (from Uint32x8) AsUint16x16() (to Uint16x16)
+
+// Uint64x4 converts from Uint32x8 to Uint64x4
+func (from Uint32x8) AsUint64x4() (to Uint64x4)
+
+// Float32x16 converts from Uint32x16 to Float32x16
+func (from Uint32x16) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Uint32x16 to Float64x8
+func (from Uint32x16) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Uint32x16 to Int8x64
+func (from Uint32x16) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Uint32x16 to Int16x32
+func (from Uint32x16) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Uint32x16 to Int32x16
+func (from Uint32x16) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Uint32x16 to Int64x8
+func (from Uint32x16) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Uint32x16 to Uint8x64
+func (from Uint32x16) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Uint32x16 to Uint16x32
+func (from Uint32x16) AsUint16x32() (to Uint16x32)
+
+// Uint64x8 converts from Uint32x16 to Uint64x8
+func (from Uint32x16) AsUint64x8() (to Uint64x8)
+
+// Float32x4 converts from Uint64x2 to Float32x4
+func (from Uint64x2) AsFloat32x4() (to Float32x4)
+
+// Float64x2 converts from Uint64x2 to Float64x2
+func (from Uint64x2) AsFloat64x2() (to Float64x2)
+
+// Int8x16 converts from Uint64x2 to Int8x16
+func (from Uint64x2) AsInt8x16() (to Int8x16)
+
+// Int16x8 converts from Uint64x2 to Int16x8
+func (from Uint64x2) AsInt16x8() (to Int16x8)
+
+// Int32x4 converts from Uint64x2 to Int32x4
+func (from Uint64x2) AsInt32x4() (to Int32x4)
+
+// Int64x2 converts from Uint64x2 to Int64x2
+func (from Uint64x2) AsInt64x2() (to Int64x2)
+
+// Uint8x16 converts from Uint64x2 to Uint8x16
+func (from Uint64x2) AsUint8x16() (to Uint8x16)
+
+// Uint16x8 converts from Uint64x2 to Uint16x8
+func (from Uint64x2) AsUint16x8() (to Uint16x8)
+
+// Uint32x4 converts from Uint64x2 to Uint32x4
+func (from Uint64x2) AsUint32x4() (to Uint32x4)
+
+// Float32x8 converts from Uint64x4 to Float32x8
+func (from Uint64x4) AsFloat32x8() (to Float32x8)
+
+// Float64x4 converts from Uint64x4 to Float64x4
+func (from Uint64x4) AsFloat64x4() (to Float64x4)
+
+// Int8x32 converts from Uint64x4 to Int8x32
+func (from Uint64x4) AsInt8x32() (to Int8x32)
+
+// Int16x16 converts from Uint64x4 to Int16x16
+func (from Uint64x4) AsInt16x16() (to Int16x16)
+
+// Int32x8 converts from Uint64x4 to Int32x8
+func (from Uint64x4) AsInt32x8() (to Int32x8)
+
+// Int64x4 converts from Uint64x4 to Int64x4
+func (from Uint64x4) AsInt64x4() (to Int64x4)
+
+// Uint8x32 converts from Uint64x4 to Uint8x32
+func (from Uint64x4) AsUint8x32() (to Uint8x32)
+
+// Uint16x16 converts from Uint64x4 to Uint16x16
+func (from Uint64x4) AsUint16x16() (to Uint16x16)
+
+// Uint32x8 converts from Uint64x4 to Uint32x8
+func (from Uint64x4) AsUint32x8() (to Uint32x8)
+
+// Float32x16 converts from Uint64x8 to Float32x16
+func (from Uint64x8) AsFloat32x16() (to Float32x16)
+
+// Float64x8 converts from Uint64x8 to Float64x8
+func (from Uint64x8) AsFloat64x8() (to Float64x8)
+
+// Int8x64 converts from Uint64x8 to Int8x64
+func (from Uint64x8) AsInt8x64() (to Int8x64)
+
+// Int16x32 converts from Uint64x8 to Int16x32
+func (from Uint64x8) AsInt16x32() (to Int16x32)
+
+// Int32x16 converts from Uint64x8 to Int32x16
+func (from Uint64x8) AsInt32x16() (to Int32x16)
+
+// Int64x8 converts from Uint64x8 to Int64x8
+func (from Uint64x8) AsInt64x8() (to Int64x8)
+
+// Uint8x64 converts from Uint64x8 to Uint8x64
+func (from Uint64x8) AsUint8x64() (to Uint8x64)
+
+// Uint16x32 converts from Uint64x8 to Uint16x32
+func (from Uint64x8) AsUint16x32() (to Uint16x32)
+
+// Uint32x16 converts from Uint64x8 to Uint32x16
+func (from Uint64x8) AsUint32x16() (to Uint32x16)
+
+// AsInt8x16 converts from Mask8x16 to Int8x16
+func (from Mask8x16) AsInt8x16() (to Int8x16)
+
+// asMask converts from Int8x16 to Mask8x16
+func (from Int8x16) asMask() (to Mask8x16)
+
+func (x Mask8x16) And(y Mask8x16) Mask8x16
+
+func (x Mask8x16) Or(y Mask8x16) Mask8x16
+
+// AsInt8x32 converts from Mask8x32 to Int8x32
+func (from Mask8x32) AsInt8x32() (to Int8x32)
+
+// asMask converts from Int8x32 to Mask8x32
+func (from Int8x32) asMask() (to Mask8x32)
+
+func (x Mask8x32) And(y Mask8x32) Mask8x32
+
+func (x Mask8x32) Or(y Mask8x32) Mask8x32
+
+// AsInt8x64 converts from Mask8x64 to Int8x64
+func (from Mask8x64) AsInt8x64() (to Int8x64)
+
+// asMask converts from Int8x64 to Mask8x64
+func (from Int8x64) asMask() (to Mask8x64)
+
+func (x Mask8x64) And(y Mask8x64) Mask8x64
+
+func (x Mask8x64) Or(y Mask8x64) Mask8x64
+
+// AsInt16x8 converts from Mask16x8 to Int16x8
+func (from Mask16x8) AsInt16x8() (to Int16x8)
+
+// asMask converts from Int16x8 to Mask16x8
+func (from Int16x8) asMask() (to Mask16x8)
+
+func (x Mask16x8) And(y Mask16x8) Mask16x8
+
+func (x Mask16x8) Or(y Mask16x8) Mask16x8
+
+// AsInt16x16 converts from Mask16x16 to Int16x16
+func (from Mask16x16) AsInt16x16() (to Int16x16)
+
+// asMask converts from Int16x16 to Mask16x16
+func (from Int16x16) asMask() (to Mask16x16)
+
+func (x Mask16x16) And(y Mask16x16) Mask16x16
+
+func (x Mask16x16) Or(y Mask16x16) Mask16x16
+
+// AsInt16x32 converts from Mask16x32 to Int16x32
+func (from Mask16x32) AsInt16x32() (to Int16x32)
+
+// asMask converts from Int16x32 to Mask16x32
+func (from Int16x32) asMask() (to Mask16x32)
+
+func (x Mask16x32) And(y Mask16x32) Mask16x32
+
+func (x Mask16x32) Or(y Mask16x32) Mask16x32
+
+// AsInt32x4 converts from Mask32x4 to Int32x4
+func (from Mask32x4) AsInt32x4() (to Int32x4)
+
+// asMask converts from Int32x4 to Mask32x4
+func (from Int32x4) asMask() (to Mask32x4)
+
+func (x Mask32x4) And(y Mask32x4) Mask32x4
+
+func (x Mask32x4) Or(y Mask32x4) Mask32x4
+
+// AsInt32x8 converts from Mask32x8 to Int32x8
+func (from Mask32x8) AsInt32x8() (to Int32x8)
+
+// asMask converts from Int32x8 to Mask32x8
+func (from Int32x8) asMask() (to Mask32x8)
+
+func (x Mask32x8) And(y Mask32x8) Mask32x8
+
+func (x Mask32x8) Or(y Mask32x8) Mask32x8
+
+// AsInt32x16 converts from Mask32x16 to Int32x16
+func (from Mask32x16) AsInt32x16() (to Int32x16)
+
+// asMask converts from Int32x16 to Mask32x16
+func (from Int32x16) asMask() (to Mask32x16)
+
+func (x Mask32x16) And(y Mask32x16) Mask32x16
+
+func (x Mask32x16) Or(y Mask32x16) Mask32x16
+
+// AsInt64x2 converts from Mask64x2 to Int64x2
+func (from Mask64x2) AsInt64x2() (to Int64x2)
+
+// asMask converts from Int64x2 to Mask64x2
+func (from Int64x2) asMask() (to Mask64x2)
+
+func (x Mask64x2) And(y Mask64x2) Mask64x2
+
+func (x Mask64x2) Or(y Mask64x2) Mask64x2
+
+// AsInt64x4 converts from Mask64x4 to Int64x4
+func (from Mask64x4) AsInt64x4() (to Int64x4)
+
+// asMask converts from Int64x4 to Mask64x4
+func (from Int64x4) asMask() (to Mask64x4)
+
+func (x Mask64x4) And(y Mask64x4) Mask64x4
+
+func (x Mask64x4) Or(y Mask64x4) Mask64x4
+
+// AsInt64x8 converts from Mask64x8 to Int64x8
+func (from Mask64x8) AsInt64x8() (to Int64x8)
+
+// asMask converts from Int64x8 to Mask64x8
+func (from Int64x8) asMask() (to Mask64x8)
+
+func (x Mask64x8) And(y Mask64x8) Mask64x8
+
+func (x Mask64x8) Or(y Mask64x8) Mask64x8
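+
+// Illustrative sketch (not part of the generated API): the As* conversions
+// above appear to reinterpret the same underlying bits as a different lane
+// shape rather than converting lane values, so a round trip should be a
+// no-op; the mask And/Or methods combine masks lane-wise. The helper name
+// below is ours, purely for example.
+func reinterpretRoundTrip(v Int32x8) Int32x8 {
+	u := v.AsUint32x8() // same 256 bits, viewed as eight unsigned lanes
+	return u.AsInt32x8()
+}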
--- /dev/null
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+/* blend */
+
+// blend blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// Asm: VPBLENDVB, CPU Feature: AVX
+func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16
+
+// blend blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// Asm: VPBLENDVB, CPU Feature: AVX2
+func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32
+
+/* blendMasked */
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMB, CPU Feature: AVX512
+func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMW, CPU Feature: AVX512
+func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMD, CPU Feature: AVX512
+func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
+
+// blendMasked blends two vectors based on mask values, choosing either
+// the first or the second based on whether the third is false or true
+//
+// This operation is applied selectively under a write mask.
+//
+// Asm: VPBLENDMQ, CPU Feature: AVX512
+func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
+
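+// Illustrative sketch (not part of the generated API): per-lane reference
+// semantics for blend and blendMasked, written over plain slices. Lane i of
+// the result is x[i] where the mask lane is false and y[i] where it is true;
+// the two forms differ only in how the mask is represented. The helper below
+// is ours, purely for example, and assumes len(x) == len(y) == len(mask).
+func blendRef(x, y []int8, mask []bool) []int8 {
+	out := make([]int8, len(x))
+	for i := range out {
+		if mask[i] {
+			out[i] = y[i] // mask lane true: take the second vector
+		} else {
+			out[i] = x[i] // mask lane false: take the first vector
+		}
+	}
+	return out
+}
+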
+/* carrylessMultiply */
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of x and y,
+// depending on the value of xyHiLo, returning the 128-bit
+// product in the concatenated two elements of the result.
+// Bit 0 selects the low (0) or high (1) element of x and
+// bit 4 selects the low (0x00) or high (0x10) element of y.
+//
+// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the two 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the four 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
+
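+// Illustrative sketch (not part of the generated API): what one 128-bit
+// carry-less (GF(2) polynomial) product means, computed a bit at a time.
+// Each selected pair of 64-bit halves in the methods above yields one such
+// hi:lo pair; the hardware computes it in a single VPCLMULQDQ. The helper
+// below is ours, purely for example.
+func clmul64Ref(a, b uint64) (hi, lo uint64) {
+	for i := 0; i < 64; i++ {
+		if b&(1<<i) != 0 {
+			// XOR in a shifted copy of a; no carries propagate in GF(2).
+			lo ^= a << i
+			if i > 0 {
+				hi ^= a >> (64 - i)
+			}
+		}
+	}
+	return hi, lo
+}
+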
+/* concatSelectedConstant */
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
+// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
+
+// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
+// halves of the output. The selection is chosen by the constant parameter hilo
+// where hi and lo are each one bit specifying which 64-bit element to select
+// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
+// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
+// selecting from y, is 1, and selects 7.
+//
+// hilo results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
+
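+// Illustrative sketch (not part of the generated API): reference semantics
+// for the 32-bit, 4-lane form above. The four 2-bit fields of h1h0l1l0, read
+// from the low bits up, pick two lanes of x for the low half of the result
+// and two lanes of y for the high half. For example,
+// concatSelectedConstantRef([4]float32{0, 1, 2, 3}, [4]float32{4, 5, 6, 7}, 0b_11_01_00_10)
+// returns [4]float32{2, 0, 5, 7}. The helper name is ours, purely for example.
+func concatSelectedConstantRef(x, y [4]float32, h1h0l1l0 uint8) [4]float32 {
+	l0 := h1h0l1l0 & 3
+	l1 := (h1h0l1l0 >> 2) & 3
+	h0 := (h1h0l1l0 >> 4) & 3
+	h1 := (h1h0l1l0 >> 6) & 3
+	return [4]float32{x[l0], x[l1], y[h0], y[h1]}
+}
+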
+/* concatSelectedConstantGrouped */
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+//
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+//
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
+// returns {2,0,5,7,10,8,13,15}
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selection is chosen by the constant parameter h1h0l1l0
+// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
+// For example,
+//
+// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
+// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
+//
+// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
+//
+// (don't forget that the binary constant is written big-endian).
+//
+// h1h0l1l0 results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
+// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
+// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11).
+// This differs from the same method applied to a 32x8 vector, where
+// the 8-bit constant performs the same selection on both subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
+
+// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
+// into the lower and upper halves of corresponding subvectors of the output.
+// The selections are specified by the constant parameter hilos where each
+// hi and lo pair select 64-bit elements from the corresponding 128-bit
+// subvectors of x and y.
+//
+// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
+// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
+// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
+// then 1, selecting element 1 from x's next 128 bits (9), then 1,
+// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
+// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
+// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
+// This differs from the same method applied to a 32x8 or 32x16 vector, where
+// the 8-bit constant performs the same selection on all the subvectors.
+//
+// hilos results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
+
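+// Illustrative sketch (not part of the generated API): reference semantics
+// for the 32-bit Grouped forms above. The same 8-bit selection is applied
+// independently to every 128-bit group of four lanes; the 64-bit "hilos"
+// forms instead spend a fresh pair of bits on each group. The helper below
+// is ours, purely for example, and assumes len(x) == len(y) is a multiple of 4.
+func concatSelectedConstantGroupedRef(x, y []float32, h1h0l1l0 uint8) []float32 {
+	out := make([]float32, len(x))
+	for g := 0; g < len(x); g += 4 { // one 128-bit group = four float32 lanes
+		out[g+0] = x[g+int(h1h0l1l0&3)]
+		out[g+1] = x[g+int((h1h0l1l0>>2)&3)]
+		out[g+2] = y[g+int((h1h0l1l0>>4)&3)]
+		out[g+3] = y[g+int((h1h0l1l0>>6)&3)]
+	}
+	return out
+}
+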
+/* permuteScalars */
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) permuteScalars(indices uint8) Int32x4
+
+// permuteScalars performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
+
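+// Illustrative sketch (not part of the generated API): reference semantics
+// for permuteScalars on a 4-lane vector. Lane i of the result is the lane of
+// x named by the i-th 2-bit field of indices; for example, indices
+// 0b_00_01_10_11 reverses the four lanes. The helper name is ours, purely
+// for example.
+func permuteScalarsRef(x [4]uint32, indices uint8) [4]uint32 {
+	var out [4]uint32
+	for i := range out {
+		out[i] = x[(indices>>(2*i))&3]
+	}
+	return out
+}
+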
+/* permuteScalarsGrouped */
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
+
+// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
+// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
+
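+// Illustrative sketch (not part of the generated API): the Grouped form
+// repeats the same 4-lane permutation in every 128-bit group, reusing the
+// one 8-bit immediate for each group of four lanes. The helper below is
+// ours, purely for example, and assumes len(x) is a multiple of 4.
+func permuteScalarsGroupedRef(x []uint32, indices uint8) []uint32 {
+	out := make([]uint32, len(x))
+	for g := 0; g < len(x); g += 4 { // one 128-bit group = four uint32 lanes
+		for i := 0; i < 4; i++ {
+			out[g+i] = x[g+int((indices>>(2*i))&3)]
+		}
+	}
+	return out
+}
+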
+/* permuteScalarsHi */
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
+
+// permuteScalarsHi performs a permutation of vector x using constant indices:
+// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
+
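+// Illustrative sketch (not part of the generated API): reference semantics
+// for permuteScalarsHi on an 8-lane vector. The low four lanes pass through
+// unchanged and the high four are permuted among themselves by the 2-bit
+// fields of indices. The helper name is ours, purely for example.
+func permuteScalarsHiRef(x [8]uint16, indices uint8) [8]uint16 {
+	out := x // lanes 0-3 are copied as-is
+	for i := 0; i < 4; i++ {
+		out[4+i] = x[4+int((indices>>(2*i))&3)]
+	}
+	return out
+}
+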
+/* permuteScalarsHiGrouped */
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
+
+// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
+// result =
+//
+// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
+// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is 128 bits wide.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
+
+/* permuteScalarsLo */
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
+
+// permuteScalarsLo performs a permutation of vector x using constant indices:
+// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+//
+// indices results in better performance when it's a constant; a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
+
+/* permuteScalarsLoGrouped */
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
+
+// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
+//
+//	result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
+// x_group1[indices[0:2]], ...}
+//
+// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
+// Each group is of size 128-bit.
+//
+// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
+
+/* tern */
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4
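+
+// For example, the table 0b1110_1000 computes the bitwise majority of x, y,
+// and z: bit (x<<2 + y<<1 + z) of that table is 1 exactly when at least two
+// of the corresponding input bits are 1.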
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGD, CPU Feature: AVX512
+func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4
+
+// tern performs a logical operation on three vectors based on the 8-bit truth table.
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
+//
+// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPTERNLOGQ, CPU Feature: AVX512
+func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// BroadcastInt8x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt8x16(x int8) Int8x16 {
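+	// Emulation: write x into element 0 of a zero vector, then replicate
+	// that element across all lanes with Broadcast128.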
+ var z Int8x16
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt16x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt16x8(x int16) Int16x8 {
+ var z Int16x8
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt32x4(x int32) Int32x4 {
+ var z Int32x4
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt64x2(x int64) Int64x2 {
+ var z Int64x2
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint8x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint8x16(x uint8) Uint8x16 {
+ var z Uint8x16
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint16x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint16x8(x uint16) Uint16x8 {
+ var z Uint16x8
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint32x4(x uint32) Uint32x4 {
+ var z Uint32x4
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastUint64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint64x2(x uint64) Uint64x2 {
+ var z Uint64x2
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastFloat32x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat32x4(x float32) Float32x4 {
+ var z Float32x4
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastFloat64x2 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat64x2(x float64) Float64x2 {
+ var z Float64x2
+ return z.SetElem(0, x).Broadcast128()
+}
+
+// BroadcastInt8x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt8x32(x int8) Int8x32 {
+ var z Int8x16
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt16x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt16x16(x int16) Int16x16 {
+ var z Int16x8
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt32x8(x int32) Int32x8 {
+ var z Int32x4
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastInt64x4(x int64) Int64x4 {
+ var z Int64x2
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint8x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint8x32(x uint8) Uint8x32 {
+ var z Uint8x16
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint16x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint16x16(x uint16) Uint16x16 {
+ var z Uint16x8
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint32x8(x uint32) Uint32x8 {
+ var z Uint32x4
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastUint64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastUint64x4(x uint64) Uint64x4 {
+ var z Uint64x2
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastFloat32x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat32x8(x float32) Float32x8 {
+ var z Float32x4
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastFloat64x4 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX2
+func BroadcastFloat64x4(x float64) Float64x4 {
+ var z Float64x2
+ return z.SetElem(0, x).Broadcast256()
+}
+
+// BroadcastInt8x64 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastInt8x64(x int8) Int8x64 {
+ var z Int8x16
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt16x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastInt16x32(x int16) Int16x32 {
+ var z Int16x8
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastInt32x16(x int32) Int32x16 {
+ var z Int32x4
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastInt64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastInt64x8(x int64) Int64x8 {
+ var z Int64x2
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint8x64 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastUint8x64(x uint8) Uint8x64 {
+ var z Uint8x16
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint16x32 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512BW
+func BroadcastUint16x32(x uint16) Uint16x32 {
+ var z Uint16x8
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastUint32x16(x uint32) Uint32x16 {
+ var z Uint32x4
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastUint64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastUint64x8(x uint64) Uint64x8 {
+ var z Uint64x2
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastFloat32x16 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastFloat32x16(x float32) Float32x16 {
+ var z Float32x4
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// BroadcastFloat64x8 returns a vector with the input
+// x assigned to all elements of the output.
+//
+// Emulated, CPU Feature AVX512F
+func BroadcastFloat64x8(x float64) Float64x8 {
+ var z Float64x2
+ return z.SetElem(0, x).Broadcast512()
+}
+
+// ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int8x16) ToMask() (to Mask8x16) {
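+	// NotEqual against the zero vector is true exactly in the non-zero lanes.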
+ return from.NotEqual(Int8x16{})
+}
+
+// ToMask converts from Int16x8 to Mask16x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int16x8) ToMask() (to Mask16x8) {
+ return from.NotEqual(Int16x8{})
+}
+
+// ToMask converts from Int32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int32x4) ToMask() (to Mask32x4) {
+ return from.NotEqual(Int32x4{})
+}
+
+// ToMask converts from Int64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int64x2) ToMask() (to Mask64x2) {
+ return from.NotEqual(Int64x2{})
+}
+
+// ToMask converts from Uint8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint8x16) ToMask() (to Mask8x16) {
+ return from.NotEqual(Uint8x16{})
+}
+
+// ToMask converts from Uint16x8 to Mask16x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint16x8) ToMask() (to Mask16x8) {
+ return from.NotEqual(Uint16x8{})
+}
+
+// ToMask converts from Uint32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint32x4) ToMask() (to Mask32x4) {
+ return from.NotEqual(Uint32x4{})
+}
+
+// ToMask converts from Uint64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint64x2) ToMask() (to Mask64x2) {
+ return from.NotEqual(Uint64x2{})
+}
+
+// ToMask converts from Float32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float32x4) ToMask() (to Mask32x4) {
+ return from.NotEqual(Float32x4{})
+}
+
+// ToMask converts from Float64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float64x2) ToMask() (to Mask64x2) {
+ return from.NotEqual(Float64x2{})
+}
+
+// ToMask converts from Int8x32 to Mask8x32, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int8x32) ToMask() (to Mask8x32) {
+ return from.NotEqual(Int8x32{})
+}
+
+// ToMask converts from Int16x16 to Mask16x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int16x16) ToMask() (to Mask16x16) {
+ return from.NotEqual(Int16x16{})
+}
+
+// ToMask converts from Int32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int32x8) ToMask() (to Mask32x8) {
+ return from.NotEqual(Int32x8{})
+}
+
+// ToMask converts from Int64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int64x4) ToMask() (to Mask64x4) {
+ return from.NotEqual(Int64x4{})
+}
+
+// ToMask converts from Uint8x32 to Mask8x32, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint8x32) ToMask() (to Mask8x32) {
+ return from.NotEqual(Uint8x32{})
+}
+
+// ToMask converts from Uint16x16 to Mask16x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint16x16) ToMask() (to Mask16x16) {
+ return from.NotEqual(Uint16x16{})
+}
+
+// ToMask converts from Uint32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint32x8) ToMask() (to Mask32x8) {
+ return from.NotEqual(Uint32x8{})
+}
+
+// ToMask converts from Uint64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint64x4) ToMask() (to Mask64x4) {
+ return from.NotEqual(Uint64x4{})
+}
+
+// ToMask converts from Float32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float32x8) ToMask() (to Mask32x8) {
+ return from.NotEqual(Float32x8{})
+}
+
+// ToMask converts from Float64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float64x4) ToMask() (to Mask64x4) {
+ return from.NotEqual(Float64x4{})
+}
+
+// ToMask converts from Int8x64 to Mask8x64, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int8x64) ToMask() (to Mask8x64) {
+ return from.NotEqual(Int8x64{})
+}
+
+// ToMask converts from Int16x32 to Mask16x32, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int16x32) ToMask() (to Mask16x32) {
+ return from.NotEqual(Int16x32{})
+}
+
+// ToMask converts from Int32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int32x16) ToMask() (to Mask32x16) {
+ return from.NotEqual(Int32x16{})
+}
+
+// ToMask converts from Int64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Int64x8) ToMask() (to Mask64x8) {
+ return from.NotEqual(Int64x8{})
+}
+
+// ToMask converts from Uint8x64 to Mask8x64, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint8x64) ToMask() (to Mask8x64) {
+ return from.NotEqual(Uint8x64{})
+}
+
+// ToMask converts from Uint16x32 to Mask16x32, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint16x32) ToMask() (to Mask16x32) {
+ return from.NotEqual(Uint16x32{})
+}
+
+// ToMask converts from Uint32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint32x16) ToMask() (to Mask32x16) {
+ return from.NotEqual(Uint32x16{})
+}
+
+// ToMask converts from Uint64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Uint64x8) ToMask() (to Mask64x8) {
+ return from.NotEqual(Uint64x8{})
+}
+
+// ToMask converts from Float32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float32x16) ToMask() (to Mask32x16) {
+ return from.NotEqual(Float32x16{})
+}
+
+// ToMask converts from Float64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
+func (from Float64x8) ToMask() (to Mask64x8) {
+ return from.NotEqual(Float64x8{})
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Int8x16) Not() Int8x16 {
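+	// x.Equal(x) is an all-true mask; viewed as an integer vector it is all
+	// ones, and XOR with all ones complements every bit of x.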
+ return x.Xor(x.Equal(x).AsInt8x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Int16x8) Not() Int16x8 {
+ return x.Xor(x.Equal(x).AsInt16x8())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Int32x4) Not() Int32x4 {
+ return x.Xor(x.Equal(x).AsInt32x4())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Int64x2) Not() Int64x2 {
+ return x.Xor(x.Equal(x).AsInt64x2())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Int8x32) Not() Int8x32 {
+ return x.Xor(x.Equal(x).AsInt8x32())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Int16x16) Not() Int16x16 {
+ return x.Xor(x.Equal(x).AsInt16x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Int32x8) Not() Int32x8 {
+ return x.Xor(x.Equal(x).AsInt32x8())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Int64x4) Not() Int64x4 {
+ return x.Xor(x.Equal(x).AsInt64x4())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Int8x64) Not() Int8x64 {
+ return x.Xor(x.Equal(x).AsInt8x64())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Int16x32) Not() Int16x32 {
+ return x.Xor(x.Equal(x).AsInt16x32())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Int32x16) Not() Int32x16 {
+ return x.Xor(x.Equal(x).AsInt32x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Int64x8) Not() Int64x8 {
+ return x.Xor(x.Equal(x).AsInt64x8())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Uint8x16) Not() Uint8x16 {
+ return x.Xor(x.Equal(x).AsInt8x16().AsUint8x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Uint16x8) Not() Uint16x8 {
+ return x.Xor(x.Equal(x).AsInt16x8().AsUint16x8())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Uint32x4) Not() Uint32x4 {
+ return x.Xor(x.Equal(x).AsInt32x4().AsUint32x4())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX
+func (x Uint64x2) Not() Uint64x2 {
+ return x.Xor(x.Equal(x).AsInt64x2().AsUint64x2())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Uint8x32) Not() Uint8x32 {
+ return x.Xor(x.Equal(x).AsInt8x32().AsUint8x32())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Uint16x16) Not() Uint16x16 {
+ return x.Xor(x.Equal(x).AsInt16x16().AsUint16x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Uint32x8) Not() Uint32x8 {
+ return x.Xor(x.Equal(x).AsInt32x8().AsUint32x8())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX2
+func (x Uint64x4) Not() Uint64x4 {
+ return x.Xor(x.Equal(x).AsInt64x4().AsUint64x4())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Uint8x64) Not() Uint8x64 {
+ return x.Xor(x.Equal(x).AsInt8x64().AsUint8x64())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Uint16x32) Not() Uint16x32 {
+ return x.Xor(x.Equal(x).AsInt16x32().AsUint16x32())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Uint32x16) Not() Uint32x16 {
+ return x.Xor(x.Equal(x).AsInt32x16().AsUint32x16())
+}
+
+// Not returns the bitwise complement of x
+//
+// Emulated, CPU Feature AVX512
+func (x Uint64x8) Not() Uint64x8 {
+ return x.Xor(x.Equal(x).AsInt64x8().AsUint64x8())
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int8x16) String() string {
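+	// Store the vector into a fixed-size array, then format it with the
+	// package's slice formatter.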
+ var s [16]int8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int16x8) String() string {
+ var s [8]int16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int32x4) String() string {
+ var s [4]int32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int64x2) String() string {
+ var s [2]int64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint8x16) String() string {
+ var s [16]uint8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint16x8) String() string {
+ var s [8]uint16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint32x4) String() string {
+ var s [4]uint32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint64x2) String() string {
+ var s [2]uint64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float32x4) String() string {
+ var s [4]float32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float64x2) String() string {
+ var s [2]float64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int8x32) String() string {
+ var s [32]int8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int16x16) String() string {
+ var s [16]int16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int32x8) String() string {
+ var s [8]int32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int64x4) String() string {
+ var s [4]int64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint8x32) String() string {
+ var s [32]uint8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint16x16) String() string {
+ var s [16]uint16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint32x8) String() string {
+ var s [8]uint32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint64x4) String() string {
+ var s [4]uint64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float32x8) String() string {
+ var s [8]float32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float64x4) String() string {
+ var s [4]float64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int8x64) String() string {
+ var s [64]int8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int16x32) String() string {
+ var s [32]int16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int32x16) String() string {
+ var s [16]int32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Int64x8) String() string {
+ var s [8]int64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint8x64) String() string {
+ var s [64]uint8
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint16x32) String() string {
+ var s [32]uint16
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint32x16) String() string {
+ var s [16]uint32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Uint64x8) String() string {
+ var s [8]uint64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float32x16) String() string {
+ var s [16]float32
+ x.Store(&s)
+ return sliceToString(s[:])
+}
+
+// String returns a string representation of SIMD vector x
+func (x Float64x8) String() string {
+ var s [8]float64
+ x.Store(&s)
+ return sliceToString(s[:])
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd_test
+
+import (
+ "simd/archsimd"
+ "simd/archsimd/internal/test_helpers"
+ "testing"
+)
+
+func TestConcatSelectedConstant64(t *testing.T) {
+ a := make([]int64, 2)
+ x := archsimd.LoadInt64x2Slice([]int64{4, 5})
+ y := archsimd.LoadInt64x2Slice([]int64{6, 7})
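+	// With immediate 0b10, the low lane should come from x[0] (= 4) and the
+	// high lane from y[1] (= 7).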
+ z := x.ExportTestConcatSelectedConstant(0b10, y)
+ z.StoreSlice(a)
+ test_helpers.CheckSlices[int64](t, a, []int64{4, 7})
+}
+
+func TestConcatSelectedConstantGrouped64(t *testing.T) {
+ a := make([]float64, 4)
+ x := archsimd.LoadFloat64x4Slice([]float64{4, 5, 8, 9})
+ y := archsimd.LoadFloat64x4Slice([]float64{6, 7, 10, 11})
+ z := x.ExportTestConcatSelectedConstantGrouped(0b_11_10, y)
+ z.StoreSlice(a)
+ test_helpers.CheckSlices[float64](t, a, []float64{4, 7, 9, 11})
+}
+
+func TestConcatSelectedConstant32(t *testing.T) {
+ a := make([]float32, 4)
+ x := archsimd.LoadFloat32x4Slice([]float32{4, 5, 8, 9})
+ y := archsimd.LoadFloat32x4Slice([]float32{6, 7, 10, 11})
+ z := x.ExportTestConcatSelectedConstant(0b_11_01_10_00, y)
+ z.StoreSlice(a)
+ test_helpers.CheckSlices[float32](t, a, []float32{4, 8, 7, 11})
+}
+
+func TestConcatSelectedConstantGrouped32(t *testing.T) {
+ a := make([]uint32, 8)
+ x := archsimd.LoadUint32x8Slice([]uint32{0, 1, 2, 3, 8, 9, 10, 11})
+ y := archsimd.LoadUint32x8Slice([]uint32{4, 5, 6, 7, 12, 13, 14, 15})
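+	// Within each 128-bit group, the expected result takes x[2], x[0] from the
+	// first operand and y[1], y[3] from the second.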
+ z := x.ExportTestConcatSelectedConstantGrouped(0b_11_01_00_10, y)
+ z.StoreSlice(a)
+ test_helpers.CheckSlices[uint32](t, a, []uint32{2, 0, 5, 7, 10, 8, 13, 15})
+}
+
+func TestTern(t *testing.T) {
+ if !archsimd.X86.AVX512() {
+ t.Skip("This test needs AVX512")
+ }
+ x := archsimd.LoadInt32x8Slice([]int32{0, 0, 0, 0, 1, 1, 1, 1})
+ y := archsimd.LoadInt32x8Slice([]int32{0, 0, 1, 1, 0, 0, 1, 1})
+ z := archsimd.LoadInt32x8Slice([]int32{0, 1, 0, 1, 0, 1, 0, 1})
+
+ foo := func(w archsimd.Int32x8, k uint8) {
+ a := make([]int32, 8)
+ w.StoreSlice(a)
+ t.Logf("For k=%0b, w=%v", k, a)
+ for i, b := range a {
+ if (int32(k)>>i)&1 != b {
+ t.Errorf("Element %d of stored slice (=%d) did not match corresponding bit in 0b%b",
+ i, b, k)
+ }
+ }
+ }
+
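+	// These truth tables are projections: 0b1111_0000 yields x, 0b1100_1100
+	// yields y, and 0b1010_1010 yields z, so element i of the result should
+	// equal bit i of the table given how x, y, and z were loaded.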
+ foo(x.ExportTestTern(0b1111_0000, y, z), 0b1111_0000)
+ foo(x.ExportTestTern(0b1100_1100, y, z), 0b1100_1100)
+ foo(x.ExportTestTern(0b1010_1010, y, z), 0b1010_1010)
+}
+
+func TestSelect2x4x32(t *testing.T) {
+ for a := range uint8(8) {
+ for b := range uint8(8) {
+ for c := range uint8(8) {
+ for d := range uint8(8) {
+ x := archsimd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
+ y := archsimd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
+ z := select2x4x32(x, a, b, c, d, y)
+ w := make([]int32, 4, 4)
+ z.StoreSlice(w)
+ if w[0] != int32(a) || w[1] != int32(b) ||
+ w[2] != int32(c) || w[3] != int32(d) {
+ t.Errorf("Expected [%d %d %d %d] got %v", a, b, c, d, w)
+ }
+ }
+ }
+ }
+ }
+}
+
+func TestSelect2x8x32Grouped(t *testing.T) {
+ for a := range uint8(8) {
+ for b := range uint8(8) {
+ for c := range uint8(8) {
+ for d := range uint8(8) {
+ x := archsimd.LoadInt32x8Slice([]int32{0, 1, 2, 3, 10, 11, 12, 13})
+ y := archsimd.LoadInt32x8Slice([]int32{4, 5, 6, 7, 14, 15, 16, 17})
+ z := select2x8x32Grouped(x, a, b, c, d, y)
+ w := make([]int32, 8, 8)
+ z.StoreSlice(w)
+ if w[0] != int32(a) || w[1] != int32(b) ||
+ w[2] != int32(c) || w[3] != int32(d) ||
+ w[4] != int32(10+a) || w[5] != int32(10+b) ||
+ w[6] != int32(10+c) || w[7] != int32(10+d) {
+ t.Errorf("Expected [%d %d %d %d %d %d %d %d] got %v", a, b, c, d, 10+a, 10+b, 10+c, 10+d, w)
+ }
+ }
+ }
+ }
+ }
+}
+
+// select2x4x32 returns a selection of 4 elements in x and y, numbered
+// 0-7, where 0-3 are the four elements of x and 4-7 are the four elements
+// of y.
+func select2x4x32(x archsimd.Int32x4, a, b, c, d uint8, y archsimd.Int32x4) archsimd.Int32x4 {
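+	// pattern packs the "from y?" bit (bit 2) of each selector into a 4-bit
+	// value: a in bit 0, b in bit 1, c in bit 2, d in bit 3.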
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case archsimd.LLLL:
+ return x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, c, d), x)
+ case archsimd.HHHH:
+ return y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, c, d), y)
+ case archsimd.LLHH:
+ return x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, c, d), y)
+ case archsimd.HHLL:
+ return y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, c, d), x)
+
+ case archsimd.HLLL:
+ z := y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, a, b, b), x)
+ return z.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(0, 2, c, d), x)
+ case archsimd.LHLL:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, a, b, b), y)
+ return z.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(0, 2, c, d), x)
+
+ case archsimd.HLHH:
+ z := y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, a, b, b), x)
+ return z.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(0, 2, c, d), y)
+ case archsimd.LHHH:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, a, b, b), y)
+ return z.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(0, 2, c, d), y)
+
+ case archsimd.LLLH:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(c, c, d, d), y)
+ return x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.LLHL:
+ z := y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(c, c, d, d), x)
+ return x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.HHLH:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(c, c, d, d), y)
+ return y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.HHHL:
+ z := y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(c, c, d, d), x)
+ return y.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+
+ case archsimd.LHLH:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, c, b, d), y)
+ return z.ExportTestConcatSelectedConstant(0b11_01_10_00 /* =archsimd.ExportTestCscImm4(0, 2, 1, 3) */, z)
+ case archsimd.HLHL:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(b, d, a, c), y)
+ return z.ExportTestConcatSelectedConstant(0b01_11_00_10 /* =archsimd.ExportTestCscImm4(2, 0, 3, 1) */, z)
+ case archsimd.HLLH:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(b, c, a, d), y)
+ return z.ExportTestConcatSelectedConstant(0b11_01_00_10 /* =archsimd.ExportTestCscImm4(2, 0, 1, 3) */, z)
+ case archsimd.LHHL:
+ z := x.ExportTestConcatSelectedConstant(archsimd.ExportTestCscImm4(a, d, b, c), y)
+ return z.ExportTestConcatSelectedConstant(0b01_11_10_00 /* =archsimd.ExportTestCscImm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// select2x8x32Grouped returns a pair of selections of 4 elements in x and y,
+// numbered 0-7, where 0-3 are the four elements of each of x's two groups
+// (lower and upper 128 bits) and 4-7 are the four elements of y's two groups.
+func select2x8x32Grouped(x archsimd.Int32x8, a, b, c, d uint8, y archsimd.Int32x8) archsimd.Int32x8 {
+	// The cases below classify selections as being expressible in the
+	// ExportTestConcatSelectedConstant pattern, or not. Classification is by
+	// H and L, where H is a selection from 4-7 and L is a selection from 0-3.
+ // archsimd.LLHH -> CSC(x,y, a, b, c&3, d&3)
+ // archsimd.HHLL -> CSC(y,x, a&3, b&3, c, d)
+ // archsimd.LLLL -> CSC(x,x, a, b, c, d)
+ // archsimd.HHHH -> CSC(y,y, a&3, b&3, c&3, d&3)
+
+ // archsimd.LLLH -> z = CSC(x, y, c, c, d&3, d&3); CSC(x, z, a, b, 0, 2)
+ // archsimd.LLHL -> z = CSC(x, y, c&3, c&3, d, d); CSC(x, z, a, b, 0, 2)
+ // archsimd.HHLH -> z = CSC(x, y, c, c, d&3, d&3); CSC(y, z, a&3, b&3, 0, 2)
+ // archsimd.HHHL -> z = CSC(x, y, c&3, c&3, d, d); CSC(y, z, a&3, b&3, 0, 2)
+
+ // archsimd.LHLL -> z = CSC(x, y, a, a, b&3, b&3); CSC(z, x, 0, 2, c, d)
+ // etc
+
+ // archsimd.LHLH -> z = CSC(x, y, a, c, b&3, d&3); CSC(z, z, 0, 2, 1, 3)
+ // archsimd.HLHL -> z = CSC(x, y, b, d, a&3, c&3); CSC(z, z, 2, 0, 3, 1)
+
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case archsimd.LLLL:
+ return x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, c, d), x)
+ case archsimd.HHHH:
+ return y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, c, d), y)
+ case archsimd.LLHH:
+ return x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, c, d), y)
+ case archsimd.HHLL:
+ return y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, c, d), x)
+
+ case archsimd.HLLL:
+ z := y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, a, b, b), x)
+ return z.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(0, 2, c, d), x)
+ case archsimd.LHLL:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, a, b, b), y)
+ return z.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(0, 2, c, d), x)
+
+ case archsimd.HLHH:
+ z := y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, a, b, b), x)
+ return z.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(0, 2, c, d), y)
+ case archsimd.LHHH:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, a, b, b), y)
+ return z.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(0, 2, c, d), y)
+
+ case archsimd.LLLH:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(c, c, d, d), y)
+ return x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.LLHL:
+ z := y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(c, c, d, d), x)
+ return x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.HHLH:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(c, c, d, d), y)
+ return y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+ case archsimd.HHHL:
+ z := y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(c, c, d, d), x)
+ return y.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, b, 0, 2), z)
+
+ case archsimd.LHLH:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, c, b, d), y)
+ return z.ExportTestConcatSelectedConstantGrouped(0b11_01_10_00 /* =archsimd.ExportTestCscImm4(0, 2, 1, 3) */, z)
+ case archsimd.HLHL:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(b, d, a, c), y)
+ return z.ExportTestConcatSelectedConstantGrouped(0b01_11_00_10 /* =archsimd.ExportTestCscImm4(2, 0, 3, 1) */, z)
+ case archsimd.HLLH:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(b, c, a, d), y)
+ return z.ExportTestConcatSelectedConstantGrouped(0b11_01_00_10 /* =archsimd.ExportTestCscImm4(2, 0, 1, 3) */, z)
+ case archsimd.LHHL:
+ z := x.ExportTestConcatSelectedConstantGrouped(archsimd.ExportTestCscImm4(a, d, b, c), y)
+ return z.ExportTestConcatSelectedConstantGrouped(0b01_11_10_00 /* =archsimd.ExportTestCscImm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd
+
+// These constants represent the source pattern for the four parameters
+// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
+// L means the element comes from the 'x' vector (Low), and
+// H means it comes from the 'y' vector (High).
+// The order of the letters corresponds to elements a, b, c, d.
+// The underlying integer value is a bitmask where:
+// Bit 0: Source of element 'a' (0 for x, 1 for y)
+// Bit 1: Source of element 'b' (0 for x, 1 for y)
+// Bit 2: Source of element 'c' (0 for x, 1 for y)
+// Bit 3: Source of element 'd' (0 for x, 1 for y)
+// Note that the least-significant bit is on the LEFT in this encoding.
+const (
+ _LLLL = iota // a:x, b:x, c:x, d:x
+ _HLLL // a:y, b:x, c:x, d:x
+ _LHLL // a:x, b:y, c:x, d:x
+ _HHLL // a:y, b:y, c:x, d:x
+ _LLHL // a:x, b:x, c:y, d:x
+ _HLHL // a:y, b:x, c:y, d:x
+ _LHHL // a:x, b:y, c:y, d:x
+ _HHHL // a:y, b:y, c:y, d:x
+ _LLLH // a:x, b:x, c:x, d:y
+ _HLLH // a:y, b:x, c:x, d:y
+ _LHLH // a:x, b:y, c:x, d:y
+ _HHLH // a:y, b:y, c:x, d:y
+ _LLHH // a:x, b:x, c:y, d:y
+ _HLHH // a:y, b:x, c:y, d:y
+ _LHHH // a:x, b:y, c:y, d:y
+ _HHHH // a:y, b:y, c:y, d:y
+)
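+
+// For example, selectors (a, b, c, d) = (2, 3, 5, 7) take a and b from x and
+// c and d from y, which is the pattern _LLHH (value 0b1100, with a in bit 0).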
+
+// These constants represent the source pattern for the four parameters
+// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
+// two-element vectors.
+const (
+ _LL = iota
+ _HL
+ _LH
+ _HH
+)
+
+// SelectFromPair returns the selection of four elements from the two
+// vectors x and y, where selector values in the range 0-3 specify
+// elements from x and values in the range 4-7 specify elements 0-3
+// of y. When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise it
+// requires two. a is the source index of the least element in the
+// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
+// elements in the output. For example,
+// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
+ // pattern gets the concatenation of "x or y?" bits
+ // (0 == x, 1 == y)
+ // This will determine operand choice/order and whether a second
+ // instruction is needed.
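+	// For example, (a, b, c, d) = (2, 3, 5, 7) gives 0 + 0 + 4 + 8 = _LLHH.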
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ // a-d are masked down to their offsets within x or y
+ // this is not necessary for x, but this is easier on the
+ // eyes and reduces the risk of an error now or later.
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPair returns the selection of four elements from the two
+// vectors x and y, where selector values in the range 0-3 specify
+// elements from x and values in the range 4-7 specify elements 0-3
+// of y. When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two. a is the source index of the least element in the
+// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
+// elements in the output. For example,
+// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPair returns the selection of four elements from the two
+// vectors x and y, where selector values in the range 0-3 specify
+// elements from x and values in the range 4-7 specify elements 0-3
+// of y. When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two. a is the source index of the least element in the
+// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
+// elements in the output. For example,
+// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two. a is the source index of the least element in the
+// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
+// elements in the output. For example,
+// {1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
+//
+// returns {4,8,25,81,64,128,169,289}
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two. a is the source index of the least element in the
+// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
+// elements in the output. For example,
+// {1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
+//
+// returns {4,8,25,81,64,128,169,289}
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
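+
+// selectFromPairGroupedSketch is a minimal usage sketch of the example in the
+// doc comment above; it assumes the slice helpers defined elsewhere in this
+// package (LoadUint32x8Slice, StoreSlice). With constant selectors such as
+// 2,3,5,7 the call should take the intrinsic, single-instruction path.
+func selectFromPairGroupedSketch() []uint32 {
+	x := LoadUint32x8Slice([]uint32{1, 2, 4, 8, 16, 32, 64, 128})
+	y := LoadUint32x8Slice([]uint32{9, 25, 49, 81, 121, 169, 225, 289})
+	dst := make([]uint32, 8)
+	x.SelectFromPairGrouped(2, 3, 5, 7, y).StoreSlice(dst)
+	return dst // {4, 8, 25, 81, 64, 128, 169, 289}
+}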
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two. a is the source index of the first (lowest)
+// element of the output, and b, c, and d are the indices of the 2nd,
+// 3rd, and 4th elements of the output. For example,
+//
+//	{1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
+//
+// returns {4,8,25,81,64,128,169,289}.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX
+func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of four elements from x and y,
+// where selector values in the range 0-3 specify elements from x and
+// values in the range 4-7 specify elements 0-3 of y.
+// When the selectors are constants and the selection can be
+// implemented in a single instruction, it will be; otherwise
+// it requires two.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPS, CPU Feature: AVX512
+func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
+ pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
+
+ a, b, c, d = a&3, b&3, c&3, d&3
+
+ switch pattern {
+ case _LLLL:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+ case _HHHH:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _LLHH:
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
+ case _HHLL:
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
+
+ case _HLLL:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+ case _LHLL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
+
+ case _HLHH:
+ z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+ case _LHHH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
+ return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
+
+ case _LLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _LLHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+ case _HHHL:
+ z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
+ return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
+
+ case _LHLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
+ case _HLHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
+ case _HLLH:
+ z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
+ return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
+ case _LHHL:
+ z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
+ return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// cscimm4 converts the 4 vector element indices into a single
+// uint8 for use as an immediate.
+func cscimm4(a, b, c, d uint8) uint8 {
+ return uint8(a + b<<2 + c<<4 + d<<6)
+}
+
+// cscimm2 converts the 2 vector element indices into a single
+// uint8 for use as an immediate.
+func cscimm2(a, b uint8) uint8 {
+ return uint8(a + b<<1)
+}
+
+// cscimm2g2 converts the 2 vector element indices into a single
+// uint8 for use as an immediate, but duplicated for VSHUFPD
+// to emulate the grouped behavior of VSHUFPS.
+func cscimm2g2(a, b uint8) uint8 {
+ g := cscimm2(a, b)
+ return g + g<<2
+}
+
+// cscimm2g4 converts the 2 vector element indices into a single
+// uint8 for use as an immediate, but with four copies for VSHUFPD
+// to emulate the grouped behavior of VSHUFPS.
+func cscimm2g4(a, b uint8) uint8 {
+ g := cscimm2g2(a, b)
+ return g + g<<4
+}
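+
+// A worked encoding example, following the definitions above:
+//
+//	cscimm4(2, 3, 1, 0) == 0b00_01_11_10 // two bits per index: d, c, b, a from high to low
+//	cscimm2(1, 0)       == 0b01
+//	cscimm2g2(1, 0)     == 0b01_01       // duplicated once, covering two 128-bit groups
+//	cscimm2g4(1, 0)     == 0b01_01_01_01 // duplicated again, covering four 128-bit groups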
+
+// SelectFromPair returns the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstant(cscimm2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstant(cscimm2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstant(cscimm2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstant(cscimm2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
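+
+// selectFromPairUint64x2Sketch is a minimal usage sketch, assuming the slice
+// helpers defined elsewhere in this package (LoadUint64x2Slice, StoreSlice).
+func selectFromPairUint64x2Sketch() []uint64 {
+	x := LoadUint64x2Slice([]uint64{1, 2})
+	y := LoadUint64x2Slice([]uint64{3, 4})
+	dst := make([]uint64, 2)
+	x.SelectFromPair(1, 2, y).StoreSlice(dst) // selector 1 picks x[1], selector 2 picks y[0]
+	return dst                                // {2, 3}
+}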
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPair returns the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstant(cscimm2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstant(cscimm2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstant(cscimm2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstant(cscimm2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPair returns the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstant(cscimm2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstant(cscimm2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstant(cscimm2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstant(cscimm2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the two 128-bit halves of
+// the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX
+func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
+// of the vectors x and y, the selection of two elements from the two
+// vectors x and y, where selector values in the range 0-1 specify
+// elements from x and values in the range 2-3 specify elements 0-1
+// of y. When the selectors are constants the selection can be
+// implemented in a single instruction.
+//
+// If the selectors are not constant this will translate to a function
+// call.
+//
+// Asm: VSHUFPD, CPU Feature: AVX512
+func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
+ pattern := (a&2)>>1 + (b & 2)
+
+ a, b = a&1, b&1
+
+ switch pattern {
+ case _LL:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ case _HH:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _LH:
+ return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
+ case _HL:
+ return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
+ }
+ panic("missing case, switch should be exhaustive")
+}
+
+/* PermuteScalars */
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
+ return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
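+
+// permuteScalarsSketch is a minimal usage sketch, assuming the slice helpers
+// defined elsewhere in this package (LoadInt32x4Slice, StoreSlice); with
+// constant indices this should compile to a single VPSHUFD.
+func permuteScalarsSketch() []int32 {
+	x := LoadInt32x4Slice([]int32{10, 20, 30, 40})
+	dst := make([]int32, 4)
+	x.PermuteScalars(3, 2, 1, 0).StoreSlice(dst)
+	return dst // {40, 30, 20, 10}
+}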
+
+// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX
+func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
+ return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsGrouped */
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table may be generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX2
+func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFD, CPU Feature: AVX512
+func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
+ return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsHi */
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
+ return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
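+
+// For example, assuming the slice helpers defined elsewhere in this package:
+//
+//	x := LoadInt16x8Slice([]int16{0, 1, 2, 3, 4, 5, 6, 7})
+//	x.PermuteScalarsHi(3, 2, 1, 0) // {0, 1, 2, 3, 7, 6, 5, 4}: low half kept, high half reversed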
+
+// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
+//
+// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
+ return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsHiGrouped */
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
+// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
+//
+// Each group is 128 bits in size.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
+// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
+// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
+// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFHW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
+ return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsLo */
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
+ return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
+ return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+/* PermuteScalarsLoGrouped */
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
+// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX2
+func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
+//
+// result =
+// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
+// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
+// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
+// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
+//
+// Each group is 128 bits in size.
+//
+// Parameters a,b,c,d should have values between 0 and 3.
+// If a through d are constants, then an instruction will be inlined, otherwise
+// a jump table is generated.
+//
+// Asm: VPSHUFLW, CPU Feature: AVX512
+func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
+ return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
+}
+
+// CarrylessMultiply computes one of four possible carryless
+// multiplications of selected high and low halves of x and y,
+// depending on the values of a and b, returning the 128-bit
+// product in the two concatenated elements of the result.
+// a selects the low (0) or high (1) element of x and
+// b selects the low (0) or high (1) element of y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance;
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
+ return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}
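+
+// carrylessMultiplySketch is a minimal sketch of the worked example above
+// (0b11 * 0b11 == 0b101 in carryless arithmetic), assuming the slice helpers
+// defined elsewhere in this package (LoadUint64x2Slice, StoreSlice).
+func carrylessMultiplySketch() []uint64 {
+	x := LoadUint64x2Slice([]uint64{0b11, 0})
+	y := LoadUint64x2Slice([]uint64{0b11, 0})
+	dst := make([]uint64, 2)
+	x.CarrylessMultiply(0, 0, y).StoreSlice(dst)
+	return dst // {0b101, 0}: the 128-bit product of the two low halves
+}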
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the two 128-bit products in the result's lanes.
+// a selects the low (0) or high (1) elements of x's lanes and
+// b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance;
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+ return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the four 128-bit products in the result's lanes.
+// a selects the low (0) or high (1) elements of x's lanes and
+// b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// Constant values of a and b will result in better performance;
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+ return x.carrylessMultiply(a&1+((b&1)<<4), y)
+}
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "unsafe"
+
+// LoadInt8x16Slice loads an Int8x16 from a slice of at least 16 int8s
+func LoadInt8x16Slice(s []int8) Int8x16 {
+ return LoadInt8x16((*[16]int8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 int8s
+func (x Int8x16) StoreSlice(s []int8) {
+ x.Store((*[16]int8)(s))
+}
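+
+// int8x16RoundTripSketch is a minimal round-trip sketch: load a vector from a
+// slice (which must have at least 16 elements) and store it into a fresh
+// slice of the same width.
+func int8x16RoundTripSketch(src []int8) []int8 {
+	dst := make([]int8, 16)
+	LoadInt8x16Slice(src).StoreSlice(dst)
+	return dst
+}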
+
+// LoadInt16x8Slice loads an Int16x8 from a slice of at least 8 int16s
+func LoadInt16x8Slice(s []int16) Int16x8 {
+ return LoadInt16x8((*[8]int16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 int16s
+func (x Int16x8) StoreSlice(s []int16) {
+ x.Store((*[8]int16)(s))
+}
+
+// LoadInt32x4Slice loads an Int32x4 from a slice of at least 4 int32s
+func LoadInt32x4Slice(s []int32) Int32x4 {
+ return LoadInt32x4((*[4]int32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 int32s
+func (x Int32x4) StoreSlice(s []int32) {
+ x.Store((*[4]int32)(s))
+}
+
+// LoadInt64x2Slice loads an Int64x2 from a slice of at least 2 int64s
+func LoadInt64x2Slice(s []int64) Int64x2 {
+ return LoadInt64x2((*[2]int64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 2 int64s
+func (x Int64x2) StoreSlice(s []int64) {
+ x.Store((*[2]int64)(s))
+}
+
+// LoadUint8x16Slice loads a Uint8x16 from a slice of at least 16 uint8s
+func LoadUint8x16Slice(s []uint8) Uint8x16 {
+ return LoadUint8x16((*[16]uint8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 uint8s
+func (x Uint8x16) StoreSlice(s []uint8) {
+ x.Store((*[16]uint8)(s))
+}
+
+// LoadUint16x8Slice loads a Uint16x8 from a slice of at least 8 uint16s
+func LoadUint16x8Slice(s []uint16) Uint16x8 {
+ return LoadUint16x8((*[8]uint16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 uint16s
+func (x Uint16x8) StoreSlice(s []uint16) {
+ x.Store((*[8]uint16)(s))
+}
+
+// LoadUint32x4Slice loads a Uint32x4 from a slice of at least 4 uint32s
+func LoadUint32x4Slice(s []uint32) Uint32x4 {
+ return LoadUint32x4((*[4]uint32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 uint32s
+func (x Uint32x4) StoreSlice(s []uint32) {
+ x.Store((*[4]uint32)(s))
+}
+
+// LoadUint64x2Slice loads a Uint64x2 from a slice of at least 2 uint64s
+func LoadUint64x2Slice(s []uint64) Uint64x2 {
+ return LoadUint64x2((*[2]uint64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 2 uint64s
+func (x Uint64x2) StoreSlice(s []uint64) {
+ x.Store((*[2]uint64)(s))
+}
+
+// LoadFloat32x4Slice loads a Float32x4 from a slice of at least 4 float32s
+func LoadFloat32x4Slice(s []float32) Float32x4 {
+ return LoadFloat32x4((*[4]float32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 float32s
+func (x Float32x4) StoreSlice(s []float32) {
+ x.Store((*[4]float32)(s))
+}
+
+// LoadFloat64x2Slice loads a Float64x2 from a slice of at least 2 float64s
+func LoadFloat64x2Slice(s []float64) Float64x2 {
+ return LoadFloat64x2((*[2]float64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 2 float64s
+func (x Float64x2) StoreSlice(s []float64) {
+ x.Store((*[2]float64)(s))
+}
+
+// LoadInt8x32Slice loads an Int8x32 from a slice of at least 32 int8s
+func LoadInt8x32Slice(s []int8) Int8x32 {
+ return LoadInt8x32((*[32]int8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 32 int8s
+func (x Int8x32) StoreSlice(s []int8) {
+ x.Store((*[32]int8)(s))
+}
+
+// LoadInt16x16Slice loads an Int16x16 from a slice of at least 16 int16s
+func LoadInt16x16Slice(s []int16) Int16x16 {
+ return LoadInt16x16((*[16]int16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 int16s
+func (x Int16x16) StoreSlice(s []int16) {
+ x.Store((*[16]int16)(s))
+}
+
+// LoadInt32x8Slice loads an Int32x8 from a slice of at least 8 int32s
+func LoadInt32x8Slice(s []int32) Int32x8 {
+ return LoadInt32x8((*[8]int32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 int32s
+func (x Int32x8) StoreSlice(s []int32) {
+ x.Store((*[8]int32)(s))
+}
+
+// LoadInt64x4Slice loads an Int64x4 from a slice of at least 4 int64s
+func LoadInt64x4Slice(s []int64) Int64x4 {
+ return LoadInt64x4((*[4]int64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 int64s
+func (x Int64x4) StoreSlice(s []int64) {
+ x.Store((*[4]int64)(s))
+}
+
+// LoadUint8x32Slice loads a Uint8x32 from a slice of at least 32 uint8s
+func LoadUint8x32Slice(s []uint8) Uint8x32 {
+ return LoadUint8x32((*[32]uint8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 32 uint8s
+func (x Uint8x32) StoreSlice(s []uint8) {
+ x.Store((*[32]uint8)(s))
+}
+
+// LoadUint16x16Slice loads a Uint16x16 from a slice of at least 16 uint16s
+func LoadUint16x16Slice(s []uint16) Uint16x16 {
+ return LoadUint16x16((*[16]uint16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 uint16s
+func (x Uint16x16) StoreSlice(s []uint16) {
+ x.Store((*[16]uint16)(s))
+}
+
+// LoadUint32x8Slice loads a Uint32x8 from a slice of at least 8 uint32s
+func LoadUint32x8Slice(s []uint32) Uint32x8 {
+ return LoadUint32x8((*[8]uint32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 uint32s
+func (x Uint32x8) StoreSlice(s []uint32) {
+ x.Store((*[8]uint32)(s))
+}
+
+// LoadUint64x4Slice loads a Uint64x4 from a slice of at least 4 uint64s
+func LoadUint64x4Slice(s []uint64) Uint64x4 {
+ return LoadUint64x4((*[4]uint64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 uint64s
+func (x Uint64x4) StoreSlice(s []uint64) {
+ x.Store((*[4]uint64)(s))
+}
+
+// LoadFloat32x8Slice loads a Float32x8 from a slice of at least 8 float32s
+func LoadFloat32x8Slice(s []float32) Float32x8 {
+ return LoadFloat32x8((*[8]float32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 float32s
+func (x Float32x8) StoreSlice(s []float32) {
+ x.Store((*[8]float32)(s))
+}
+
+// LoadFloat64x4Slice loads a Float64x4 from a slice of at least 4 float64s
+func LoadFloat64x4Slice(s []float64) Float64x4 {
+ return LoadFloat64x4((*[4]float64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 4 float64s
+func (x Float64x4) StoreSlice(s []float64) {
+ x.Store((*[4]float64)(s))
+}
+
+// LoadInt8x64Slice loads an Int8x64 from a slice of at least 64 int8s
+func LoadInt8x64Slice(s []int8) Int8x64 {
+ return LoadInt8x64((*[64]int8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 64 int8s
+func (x Int8x64) StoreSlice(s []int8) {
+ x.Store((*[64]int8)(s))
+}
+
+// LoadInt16x32Slice loads an Int16x32 from a slice of at least 32 int16s
+func LoadInt16x32Slice(s []int16) Int16x32 {
+ return LoadInt16x32((*[32]int16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 32 int16s
+func (x Int16x32) StoreSlice(s []int16) {
+ x.Store((*[32]int16)(s))
+}
+
+// LoadInt32x16Slice loads an Int32x16 from a slice of at least 16 int32s
+func LoadInt32x16Slice(s []int32) Int32x16 {
+ return LoadInt32x16((*[16]int32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 int32s
+func (x Int32x16) StoreSlice(s []int32) {
+ x.Store((*[16]int32)(s))
+}
+
+// LoadInt64x8Slice loads an Int64x8 from a slice of at least 8 int64s
+func LoadInt64x8Slice(s []int64) Int64x8 {
+ return LoadInt64x8((*[8]int64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 int64s
+func (x Int64x8) StoreSlice(s []int64) {
+ x.Store((*[8]int64)(s))
+}
+
+// LoadUint8x64Slice loads a Uint8x64 from a slice of at least 64 uint8s
+func LoadUint8x64Slice(s []uint8) Uint8x64 {
+ return LoadUint8x64((*[64]uint8)(s))
+}
+
+// StoreSlice stores x into a slice of at least 64 uint8s
+func (x Uint8x64) StoreSlice(s []uint8) {
+ x.Store((*[64]uint8)(s))
+}
+
+// LoadUint16x32Slice loads a Uint16x32 from a slice of at least 32 uint16s
+func LoadUint16x32Slice(s []uint16) Uint16x32 {
+ return LoadUint16x32((*[32]uint16)(s))
+}
+
+// StoreSlice stores x into a slice of at least 32 uint16s
+func (x Uint16x32) StoreSlice(s []uint16) {
+ x.Store((*[32]uint16)(s))
+}
+
+// LoadUint32x16Slice loads a Uint32x16 from a slice of at least 16 uint32s
+func LoadUint32x16Slice(s []uint32) Uint32x16 {
+ return LoadUint32x16((*[16]uint32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 uint32s
+func (x Uint32x16) StoreSlice(s []uint32) {
+ x.Store((*[16]uint32)(s))
+}
+
+// LoadUint64x8Slice loads a Uint64x8 from a slice of at least 8 uint64s
+func LoadUint64x8Slice(s []uint64) Uint64x8 {
+ return LoadUint64x8((*[8]uint64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 uint64s
+func (x Uint64x8) StoreSlice(s []uint64) {
+ x.Store((*[8]uint64)(s))
+}
+
+// LoadFloat32x16Slice loads a Float32x16 from a slice of at least 16 float32s
+func LoadFloat32x16Slice(s []float32) Float32x16 {
+ return LoadFloat32x16((*[16]float32)(s))
+}
+
+// StoreSlice stores x into a slice of at least 16 float32s
+func (x Float32x16) StoreSlice(s []float32) {
+ x.Store((*[16]float32)(s))
+}
+
+// LoadFloat64x8Slice loads a Float64x8 from a slice of at least 8 float64s
+func LoadFloat64x8Slice(s []float64) Float64x8 {
+ return LoadFloat64x8((*[8]float64)(s))
+}
+
+// StoreSlice stores x into a slice of at least 8 float64s
+func (x Float64x8) StoreSlice(s []float64) {
+ x.Store((*[8]float64)(s))
+}
+
+// LoadInt8x64SlicePart loads an Int8x64 from the slice s.
+// If s has fewer than 64 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 64 or more elements, the function is equivalent to LoadInt8x64Slice.
+func LoadInt8x64SlicePart(s []int8) Int8x64 {
+ l := len(s)
+ if l >= 64 {
+ return LoadInt8x64Slice(s)
+ }
+ if l == 0 {
+ var x Int8x64
+ return x
+ }
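+	// With 0 < l < 64, shifting the all-ones constant right by 64-l leaves
+	// exactly the low l bits set (e.g. l == 3 gives 0b111), so the masked
+	// load touches only the first l lanes and the rest stay zero.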
+ mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
+ return LoadMaskedInt8x64(paInt8x64(s), mask)
+}
+
+// StoreSlicePart stores the 64 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 64 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int8x64) StoreSlicePart(s []int8) {
+ l := len(s)
+ if l >= 64 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
+ x.StoreMasked(paInt8x64(s), mask)
+}
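+
+// forEachInt8x64Sketch is a minimal sketch of the intended tail-handling
+// pattern: process full 64-element chunks with the Slice variants, then let
+// the SlicePart variants handle the remainder. f is a hypothetical per-chunk
+// transform supplied by the caller.
+func forEachInt8x64Sketch(s []int8, f func(Int8x64) Int8x64) {
+	for ; len(s) >= 64; s = s[64:] {
+		f(LoadInt8x64Slice(s)).StoreSlice(s)
+	}
+	if len(s) > 0 {
+		f(LoadInt8x64SlicePart(s)).StoreSlicePart(s)
+	}
+}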
+
+// LoadInt16x32SlicePart loads an Int16x32 from the slice s.
+// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 32 or more elements, the function is equivalent to LoadInt16x32Slice.
+func LoadInt16x32SlicePart(s []int16) Int16x32 {
+ l := len(s)
+ if l >= 32 {
+ return LoadInt16x32Slice(s)
+ }
+ if l == 0 {
+ var x Int16x32
+ return x
+ }
+ mask := Mask16x32FromBits(0xffffffff >> (32 - l))
+ return LoadMaskedInt16x32(paInt16x32(s), mask)
+}
+
+// StoreSlicePart stores the 32 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int16x32) StoreSlicePart(s []int16) {
+ l := len(s)
+ if l >= 32 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask16x32FromBits(0xffffffff >> (32 - l))
+ x.StoreMasked(paInt16x32(s), mask)
+}
+
+// LoadInt32x16SlicePart loads an Int32x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadInt32x16Slice.
+func LoadInt32x16SlicePart(s []int32) Int32x16 {
+ l := len(s)
+ if l >= 16 {
+ return LoadInt32x16Slice(s)
+ }
+ if l == 0 {
+ var x Int32x16
+ return x
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ return LoadMaskedInt32x16(paInt32x16(s), mask)
+}
+
+// StoreSlicePart stores the 16 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int32x16) StoreSlicePart(s []int32) {
+ l := len(s)
+ if l >= 16 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ x.StoreMasked(paInt32x16(s), mask)
+}
+
+// LoadInt64x8SlicePart loads an Int64x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadInt64x8Slice.
+func LoadInt64x8SlicePart(s []int64) Int64x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadInt64x8Slice(s)
+ }
+ if l == 0 {
+ var x Int64x8
+ return x
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ return LoadMaskedInt64x8(paInt64x8(s), mask)
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int64x8) StoreSlicePart(s []int64) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ x.StoreMasked(paInt64x8(s), mask)
+}
+
+// LoadUint8x64SlicePart loads a Uint8x64 from the slice s.
+// If s has fewer than 64 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 64 or more elements, the function is equivalent to LoadUint8x64Slice.
+func LoadUint8x64SlicePart(s []uint8) Uint8x64 {
+ l := len(s)
+ if l >= 64 {
+ return LoadUint8x64Slice(s)
+ }
+ if l == 0 {
+ var x Uint8x64
+ return x
+ }
+ mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
+ return LoadMaskedUint8x64(paUint8x64(s), mask)
+}
+
+// StoreSlicePart stores the 64 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 64 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint8x64) StoreSlicePart(s []uint8) {
+ l := len(s)
+ if l >= 64 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
+ x.StoreMasked(paUint8x64(s), mask)
+}
+
+// LoadUint16x32SlicePart loads a Uint16x32 from the slice s.
+// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 32 or more elements, the function is equivalent to LoadUint16x32Slice.
+func LoadUint16x32SlicePart(s []uint16) Uint16x32 {
+ l := len(s)
+ if l >= 32 {
+ return LoadUint16x32Slice(s)
+ }
+ if l == 0 {
+ var x Uint16x32
+ return x
+ }
+ mask := Mask16x32FromBits(0xffffffff >> (32 - l))
+ return LoadMaskedUint16x32(paUint16x32(s), mask)
+}
+
+// StoreSlicePart stores the 32 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint16x32) StoreSlicePart(s []uint16) {
+ l := len(s)
+ if l >= 32 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask16x32FromBits(0xffffffff >> (32 - l))
+ x.StoreMasked(paUint16x32(s), mask)
+}
+
+// LoadUint32x16SlicePart loads a Uint32x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadUint32x16Slice.
+func LoadUint32x16SlicePart(s []uint32) Uint32x16 {
+ l := len(s)
+ if l >= 16 {
+ return LoadUint32x16Slice(s)
+ }
+ if l == 0 {
+ var x Uint32x16
+ return x
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ return LoadMaskedUint32x16(paUint32x16(s), mask)
+}
+
+// StoreSlicePart stores the 16 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint32x16) StoreSlicePart(s []uint32) {
+ l := len(s)
+ if l >= 16 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ x.StoreMasked(paUint32x16(s), mask)
+}
+
+// LoadUint64x8SlicePart loads a Uint64x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadUint64x8Slice.
+func LoadUint64x8SlicePart(s []uint64) Uint64x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadUint64x8Slice(s)
+ }
+ if l == 0 {
+ var x Uint64x8
+ return x
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ return LoadMaskedUint64x8(paUint64x8(s), mask)
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint64x8) StoreSlicePart(s []uint64) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ x.StoreMasked(paUint64x8(s), mask)
+}
+
+// LoadFloat32x16SlicePart loads a Float32x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadFloat32x16Slice.
+func LoadFloat32x16SlicePart(s []float32) Float32x16 {
+ l := len(s)
+ if l >= 16 {
+ return LoadFloat32x16Slice(s)
+ }
+ if l == 0 {
+ var x Float32x16
+ return x
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ return LoadMaskedFloat32x16(paFloat32x16(s), mask)
+}
+
+// StoreSlicePart stores the 16 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float32x16) StoreSlicePart(s []float32) {
+ l := len(s)
+ if l >= 16 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask32x16FromBits(0xffff >> (16 - l))
+ x.StoreMasked(paFloat32x16(s), mask)
+}
+
+// LoadFloat64x8SlicePart loads a Float64x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadFloat64x8Slice.
+func LoadFloat64x8SlicePart(s []float64) Float64x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadFloat64x8Slice(s)
+ }
+ if l == 0 {
+ var x Float64x8
+ return x
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ return LoadMaskedFloat64x8(paFloat64x8(s), mask)
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float64x8) StoreSlicePart(s []float64) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := Mask64x8FromBits(0xff >> (8 - l))
+ x.StoreMasked(paFloat64x8(s), mask)
+}
+
+// LoadInt32x4SlicePart loads an Int32x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadInt32x4Slice.
+func LoadInt32x4SlicePart(s []int32) Int32x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadInt32x4Slice(s)
+ }
+ if l == 0 {
+ var x Int32x4
+ return x
+ }
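+	// Unlike the 512-bit forms above, these use a vector-valued mask:
+	// presumably vecMask32 holds all-ones elements in its first half and
+	// zeros in its second half, so slicing at len/2-l yields l all-ones
+	// lanes followed by zeros, which asMask turns into a mask covering
+	// exactly the first l lanes.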
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedInt32x4(paInt32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int32x4) StoreSlicePart(s []int32) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paInt32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// LoadInt64x2SlicePart loads an Int64x2 from the slice s.
+// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 2 or more elements, the function is equivalent to LoadInt64x2Slice.
+func LoadInt64x2SlicePart(s []int64) Int64x2 {
+ l := len(s)
+ if l >= 2 {
+ return LoadInt64x2Slice(s)
+ }
+ if l == 0 {
+ var x Int64x2
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedInt64x2(paInt64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 2 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int64x2) StoreSlicePart(s []int64) {
+ l := len(s)
+ if l >= 2 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paInt64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// LoadUint32x4SlicePart loads a Uint32x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadUint32x4Slice.
+func LoadUint32x4SlicePart(s []uint32) Uint32x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadUint32x4Slice(s)
+ }
+ if l == 0 {
+ var x Uint32x4
+ return x
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedUint32x4(paUint32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint32x4) StoreSlicePart(s []uint32) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paUint32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// LoadUint64x2SlicePart loads a Uint64x2 from the slice s.
+// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 2 or more elements, the function is equivalent to LoadUint64x2Slice.
+func LoadUint64x2SlicePart(s []uint64) Uint64x2 {
+ l := len(s)
+ if l >= 2 {
+ return LoadUint64x2Slice(s)
+ }
+ if l == 0 {
+ var x Uint64x2
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedUint64x2(paUint64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 2 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint64x2) StoreSlicePart(s []uint64) {
+ l := len(s)
+ if l >= 2 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paUint64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// LoadFloat32x4SlicePart loads a Float32x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadFloat32x4Slice.
+func LoadFloat32x4SlicePart(s []float32) Float32x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadFloat32x4Slice(s)
+ }
+ if l == 0 {
+ var x Float32x4
+ return x
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedFloat32x4(paFloat32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float32x4) StoreSlicePart(s []float32) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paFloat32x4(s), LoadInt32x4Slice(mask).asMask())
+}
+
+// LoadFloat64x2SlicePart loads a Float64x2 from the slice s.
+// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 2 or more elements, the function is equivalent to LoadFloat64x2Slice.
+func LoadFloat64x2SlicePart(s []float64) Float64x2 {
+ l := len(s)
+ if l >= 2 {
+ return LoadFloat64x2Slice(s)
+ }
+ if l == 0 {
+ var x Float64x2
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedFloat64x2(paFloat64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 2 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float64x2) StoreSlicePart(s []float64) {
+ l := len(s)
+ if l >= 2 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paFloat64x2(s), LoadInt64x2Slice(mask).asMask())
+}
+
+// LoadInt32x8SlicePart loads a Int32x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadInt32x8Slice.
+func LoadInt32x8SlicePart(s []int32) Int32x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadInt32x8Slice(s)
+ }
+ if l == 0 {
+ var x Int32x8
+ return x
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedInt32x8(paInt32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int32x8) StoreSlicePart(s []int32) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paInt32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// LoadInt64x4SlicePart loads a Int64x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadInt64x4Slice.
+func LoadInt64x4SlicePart(s []int64) Int64x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadInt64x4Slice(s)
+ }
+ if l == 0 {
+ var x Int64x4
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedInt64x4(paInt64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int64x4) StoreSlicePart(s []int64) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paInt64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// LoadUint32x8SlicePart loads a Uint32x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadUint32x8Slice.
+func LoadUint32x8SlicePart(s []uint32) Uint32x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadUint32x8Slice(s)
+ }
+ if l == 0 {
+ var x Uint32x8
+ return x
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedUint32x8(paUint32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint32x8) StoreSlicePart(s []uint32) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paUint32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// LoadUint64x4SlicePart loads a Uint64x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadUint64x4Slice.
+func LoadUint64x4SlicePart(s []uint64) Uint64x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadUint64x4Slice(s)
+ }
+ if l == 0 {
+ var x Uint64x4
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedUint64x4(paUint64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint64x4) StoreSlicePart(s []uint64) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paUint64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// LoadFloat32x8SlicePart loads a Float32x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadFloat32x8Slice.
+func LoadFloat32x8SlicePart(s []float32) Float32x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadFloat32x8Slice(s)
+ }
+ if l == 0 {
+ var x Float32x8
+ return x
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ return LoadMaskedFloat32x8(paFloat32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float32x8) StoreSlicePart(s []float32) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask32[len(vecMask32)/2-l:]
+ x.StoreMasked(paFloat32x8(s), LoadInt32x8Slice(mask).asMask())
+}
+
+// LoadFloat64x4SlicePart loads a Float64x4 from the slice s.
+// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 4 or more elements, the function is equivalent to LoadFloat64x4Slice.
+func LoadFloat64x4SlicePart(s []float64) Float64x4 {
+ l := len(s)
+ if l >= 4 {
+ return LoadFloat64x4Slice(s)
+ }
+ if l == 0 {
+ var x Float64x4
+ return x
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ return LoadMaskedFloat64x4(paFloat64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// StoreSlicePart stores the 4 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
+func (x Float64x4) StoreSlicePart(s []float64) {
+ l := len(s)
+ if l >= 4 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ mask := vecMask64[len(vecMask64)/2-l:]
+ x.StoreMasked(paFloat64x4(s), LoadInt64x4Slice(mask).asMask())
+}
+
+// LoadUint8x16SlicePart loads a Uint8x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadUint8x16Slice.
+func LoadUint8x16SlicePart(s []uint8) Uint8x16 {
+ if len(s) == 0 {
+ var zero Uint8x16
+ return zero
+ }
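+	// Reinterpret the []uint8 as []int8 and delegate to the signed
+	// implementation; the two element types share the same bit patterns.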
+ t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
+ return LoadInt8x16SlicePart(t).AsUint8x16()
+}
+
+// StoreSlicePart stores the 16 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint8x16) StoreSlicePart(s []uint8) {
+ if len(s) == 0 {
+ return
+ }
+ t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
+ x.AsInt8x16().StoreSlicePart(t)
+}
+
+// LoadUint16x8SlicePart loads a Uint16x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadUint16x8Slice.
+func LoadUint16x8SlicePart(s []uint16) Uint16x8 {
+ if len(s) == 0 {
+ var zero Uint16x8
+ return zero
+ }
+ t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
+ return LoadInt16x8SlicePart(t).AsUint16x8()
+}
+
+// StoreSlicePart stores the 8 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint16x8) StoreSlicePart(s []uint16) {
+ if len(s) == 0 {
+ return
+ }
+ t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
+ x.AsInt16x8().StoreSlicePart(t)
+}
+
+// LoadUint8x32SlicePart loads a Uint8x32 from the slice s.
+// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 32 or more elements, the function is equivalent to LoadUint8x32Slice.
+func LoadUint8x32SlicePart(s []uint8) Uint8x32 {
+ if len(s) == 0 {
+ var zero Uint8x32
+ return zero
+ }
+ t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
+ return LoadInt8x32SlicePart(t).AsUint8x32()
+}
+
+// StoreSlicePart stores the 32 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint8x32) StoreSlicePart(s []uint8) {
+ if len(s) == 0 {
+ return
+ }
+ t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
+ x.AsInt8x32().StoreSlicePart(t)
+}
+
+// LoadUint16x16SlicePart loads a Uint16x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadUint16x16Slice.
+func LoadUint16x16SlicePart(s []uint16) Uint16x16 {
+ if len(s) == 0 {
+ var zero Uint16x16
+ return zero
+ }
+ t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
+ return LoadInt16x16SlicePart(t).AsUint16x16()
+}
+
+// StoreSlicePart stores the 16 elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Uint16x16) StoreSlicePart(s []uint16) {
+ if len(s) == 0 {
+ return
+ }
+ t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
+ x.AsInt16x16().StoreSlicePart(t)
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "unsafe"
+
+// Implementation of all the {Int,Uint}{8,16} load and store slice part
+// functions and methods for 128-bit and 256-bit vectors.
+
+/* pointer-punning functions for chunked slice part loads. */
+
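+// int16atP8 and the helpers below reinterpret a pointer to an element as a
+// pointer of a different width, so that chunks of adjacent elements can be
+// moved with a single scalar load or store.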
+func int16atP8(p *int8) *int16 {
+ return (*int16)(unsafe.Pointer(p))
+}
+
+func int32atP8(p *int8) *int32 {
+ return (*int32)(unsafe.Pointer(p))
+}
+
+func int64atP8(p *int8) *int64 {
+ return (*int64)(unsafe.Pointer(p))
+}
+
+func int32atP16(p *int16) *int32 {
+ return (*int32)(unsafe.Pointer(p))
+}
+
+func int64atP16(p *int16) *int64 {
+ return (*int64)(unsafe.Pointer(p))
+}
+
+func int64atP32(p *int32) *int64 {
+ return (*int64)(unsafe.Pointer(p))
+}
+
+func int32atP64(p *int64) *int32 {
+ return (*int32)(unsafe.Pointer(p))
+}
+
+/* These two masks are used by generated code */
+
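+// Slicing vecMask64[len(vecMask64)/2-l:] yields a window whose first l
+// entries are -1 (all bits set) and whose remaining entries are 0; loading
+// that window as a vector and converting it with asMask produces the element
+// mask for a partial load or store of l elements. vecMask32 works the same
+// way for 32-bit elements.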
+var vecMask64 = [16]int64{
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+}
+
+var vecMask32 = [32]int32{
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ -1, -1, -1, -1,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+}
+
+/* 256-bit int vector loads and stores made from 128-bit parts */
+
+// LoadInt8x32SlicePart loads an Int8x32 from the slice s.
+// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 32 or more elements, the function is equivalent to LoadInt8x32Slice.
+func LoadInt8x32SlicePart(s []int8) Int8x32 {
+ l := len(s)
+ if l >= 32 {
+ return LoadInt8x32Slice(s)
+ }
+ var x Int8x32
+ if l == 0 {
+ return x
+ }
+ if l > 16 {
+ return x.SetLo(LoadInt8x16Slice(s)).SetHi(LoadInt8x16SlicePart(s[16:]))
+ } else {
+ return x.SetLo(LoadInt8x16SlicePart(s))
+ }
+}
+
+// LoadInt16x16SlicePart loads an Int16x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadInt16x16Slice.
+func LoadInt16x16SlicePart(s []int16) Int16x16 {
+ l := len(s)
+ if l >= 16 {
+ return LoadInt16x16Slice(s)
+ }
+ var x Int16x16
+ if l == 0 {
+ return x
+ }
+ if l > 8 {
+ return x.SetLo(LoadInt16x8Slice(s)).SetHi(LoadInt16x8SlicePart(s[8:]))
+ } else {
+ return x.SetLo(LoadInt16x8SlicePart(s))
+ }
+}
+
+// StoreSlicePart stores the elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int8x32) StoreSlicePart(s []int8) {
+ l := len(s)
+ if l >= 32 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ if l > 16 {
+ x.GetLo().StoreSlice(s)
+ x.GetHi().StoreSlicePart(s[16:])
+ } else { // fits in one
+ x.GetLo().StoreSlicePart(s)
+ }
+}
+
+// StoreSlicePart stores the elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int16x16) StoreSlicePart(s []int16) {
+ l := len(s)
+ if l >= 16 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ if l > 8 {
+ x.GetLo().StoreSlice(s)
+ x.GetHi().StoreSlicePart(s[8:])
+ } else { // fits in one
+ x.GetLo().StoreSlicePart(s)
+ }
+}
+
+/* 128-bit vector load and store slice parts for 8 and 16-bit int elements */
+
+// LoadInt8x16SlicePart loads an Int8x16 from the slice s.
+// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 16 or more elements, the function is equivalent to LoadInt8x16Slice.
+func LoadInt8x16SlicePart(s []int8) Int8x16 {
+ l := len(s)
+ if l >= 16 {
+ return LoadInt8x16Slice(s)
+ }
+ var x Int8x16
+ if l == 0 {
+ return x
+ }
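+	// Write the largest power-of-two prefix with a single wide SetElem,
+	// then fill in the remainder with progressively narrower writes.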
+ if l >= 8 { // 8-15
+ x = x.AsInt64x2().SetElem(0, *int64atP8(&s[0])).AsInt8x16()
+ if l >= 12 { // 12, 13, 14, 15
+ x = x.AsInt32x4().SetElem(8/4, *int32atP8(&s[8])).AsInt8x16()
+ if l >= 14 {
+ x = x.AsInt16x8().SetElem(12/2, *int16atP8(&s[12])).AsInt8x16()
+ if l == 15 {
+ x = x.SetElem(14, s[14])
+ }
+ } else if l == 13 {
+ x = x.SetElem(12, s[12])
+ }
+ } else if l >= 10 { // 10, 11
+ x = x.AsInt16x8().SetElem(8/2, *int16atP8(&s[8])).AsInt8x16()
+ if l == 11 {
+ x = x.SetElem(10, s[10])
+ }
+ } else if l == 9 {
+ x = x.SetElem(8, s[8])
+ }
+ } else if l >= 4 { // 4-7
+ x = x.AsInt32x4().SetElem(0, *int32atP8(&s[0])).AsInt8x16()
+ if l >= 6 {
+ x = x.AsInt16x8().SetElem(4/2, *int16atP8(&s[4])).AsInt8x16()
+ if l == 7 {
+ x = x.SetElem(6, s[6])
+ }
+ } else if l == 5 {
+ x = x.SetElem(4, s[4])
+ }
+ } else if l >= 2 { // 2,3
+ x = x.AsInt16x8().SetElem(0, *int16atP8(&s[0])).AsInt8x16()
+ if l == 3 {
+ x = x.SetElem(2, s[2])
+ }
+ } else { // l == 1
+ x = x.SetElem(0, s[0])
+ }
+ return x
+}
+
+// StoreSlicePart stores the elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int8x16) StoreSlicePart(s []int8) {
+ l := len(s)
+ if l >= 16 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
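+	// Mirror of LoadInt8x16SlicePart: store the largest power-of-two prefix
+	// with a single wide GetElem, then progressively narrower stores for
+	// the remainder.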
+ if l >= 8 { // 8-15
+ *int64atP8(&s[0]) = x.AsInt64x2().GetElem(0)
+ if l >= 12 { // 12, 13, 14, 15
+ *int32atP8(&s[8]) = x.AsInt32x4().GetElem(8 / 4)
+ if l >= 14 {
+ *int16atP8(&s[12]) = x.AsInt16x8().GetElem(12 / 2)
+ if l == 15 {
+ s[14] = x.GetElem(14)
+ }
+ } else if l == 13 {
+ s[12] = x.GetElem(12)
+ }
+ } else if l >= 10 { // 10, 11
+ *int16atP8(&s[8]) = x.AsInt16x8().GetElem(8 / 2)
+ if l == 11 {
+ s[10] = x.GetElem(10)
+ }
+ } else if l == 9 {
+ s[8] = x.GetElem(8)
+ }
+ } else if l >= 4 { // 4-7
+ *int32atP8(&s[0]) = x.AsInt32x4().GetElem(0)
+ if l >= 6 {
+ *int16atP8(&s[4]) = x.AsInt16x8().GetElem(4 / 2)
+ if l == 7 {
+ s[6] = x.GetElem(6)
+ }
+ } else if l == 5 {
+ s[4] = x.GetElem(4)
+ }
+ } else if l >= 2 { // 2,3
+ *int16atP8(&s[0]) = x.AsInt16x8().GetElem(0)
+ if l == 3 {
+ s[2] = x.GetElem(2)
+ }
+ } else { // l == 1
+ s[0] = x.GetElem(0)
+ }
+}
+
+// LoadInt16x8SlicePart loads an Int16x8 from the slice s.
+// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
+// If s has 8 or more elements, the function is equivalent to LoadInt16x8Slice.
+func LoadInt16x8SlicePart(s []int16) Int16x8 {
+ l := len(s)
+ if l >= 8 {
+ return LoadInt16x8Slice(s)
+ }
+ var x Int16x8
+ if l == 0 {
+ return x
+ }
+ if l >= 4 { // 4-7
+ x = x.AsInt64x2().SetElem(0, *int64atP16(&s[0])).AsInt16x8()
+ if l >= 6 {
+ x = x.AsInt32x4().SetElem(4/2, *int32atP16(&s[4])).AsInt16x8()
+ if l == 7 {
+ x = x.SetElem(6, s[6])
+ }
+ } else if l == 5 {
+ x = x.SetElem(4, s[4])
+ }
+ } else if l >= 2 { // 2,3
+ x = x.AsInt32x4().SetElem(0, *int32atP16(&s[0])).AsInt16x8()
+ if l == 3 {
+ x = x.SetElem(2, s[2])
+ }
+ } else { // l == 1
+ x = x.SetElem(0, s[0])
+ }
+ return x
+}
+
+// StoreSlicePart stores the elements of x into the slice s.
+// It stores as many elements as will fit in s.
+// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
+func (x Int16x8) StoreSlicePart(s []int16) {
+ l := len(s)
+ if l >= 8 {
+ x.StoreSlice(s)
+ return
+ }
+ if l == 0 {
+ return
+ }
+ if l >= 4 { // 4-7
+ *int64atP16(&s[0]) = x.AsInt64x2().GetElem(0)
+ if l >= 6 {
+ *int32atP16(&s[4]) = x.AsInt32x4().GetElem(4 / 2)
+ if l == 7 {
+ s[6] = x.GetElem(6)
+ }
+ } else if l == 5 {
+ s[4] = x.GetElem(4)
+ }
+ } else if l >= 2 { // 2,3
+ *int32atP16(&s[0]) = x.AsInt32x4().GetElem(0)
+ if l == 3 {
+ s[2] = x.GetElem(2)
+ }
+ } else { // l == 1
+ s[0] = x.GetElem(0)
+ }
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.simd && amd64
+
+package archsimd
+
+import (
+ "internal/strconv"
+)
+
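+// number is a constraint satisfied by the built-in integer and
+// floating-point types.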
+type number interface {
+ ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
+}
+
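+// sliceToString formats x as a brace-enclosed, comma-separated list,
+// for example "{1,2,3}".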
+func sliceToString[T number](x []T) string {
+ s := ""
+ pfx := "{"
+ for _, y := range x {
+ s += pfx
+ pfx = ","
+ switch e := any(y).(type) {
+ case int8:
+ s += strconv.Itoa(int(e))
+ case int16:
+ s += strconv.Itoa(int(e))
+ case int32:
+ s += strconv.Itoa(int(e))
+ case int64:
+ s += strconv.Itoa(int(e))
+ case uint8:
+ s += strconv.FormatUint(uint64(e), 10)
+ case uint16:
+ s += strconv.FormatUint(uint64(e), 10)
+ case uint32:
+ s += strconv.FormatUint(uint64(e), 10)
+ case uint64:
+ s += strconv.FormatUint(uint64(e), 10)
+ case float32:
+ s += strconv.FormatFloat(float64(e), 'g', -1, 32)
+ case float64:
+ s += strconv.FormatFloat(e, 'g', -1, 64)
+ }
+ }
+ s += "}"
+ return s
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "simd/archsimd"
+ "unsafe"
+)
+
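+// load builds a Float64x4 from the first four elements of s;
+// it panics if s has fewer than four elements.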
+func load(s []float64) archsimd.Float64x4 {
+ return archsimd.LoadFloat64x4((*[4]float64)(s[:4]))
+}
+
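+// S1, S2, and S3 exercise three ways of wrapping a SIMD type: a type alias,
+// a defined type with forwarding methods, and a struct with an embedded field.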
+type S1 = archsimd.Float64x4
+
+type S2 archsimd.Float64x4
+
+func (s S2) Len() int {
+ return archsimd.Float64x4(s).Len()
+}
+
+func (s S2) Load(a []float64) S2 {
+ return S2(load(a))
+}
+
+func (s S2) Store(a *[4]float64) {
+ archsimd.Float64x4(s).Store(a)
+}
+
+func (s S2) Add(a S2) S2 {
+ return S2(archsimd.Float64x4(s).Add(archsimd.Float64x4(a)))
+}
+
+func (s S2) Mul(a S2) S2 {
+ return S2(archsimd.Float64x4(s).Mul(archsimd.Float64x4(a)))
+}
+
+type S3 struct {
+ archsimd.Float64x4
+}
+
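+// ip64_0 computes the float64 inner product of a and b with plain scalar
+// code; it serves as the reference result.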
+func ip64_0(a, b []float64) float64 {
+ s := 0.0
+ for i := range a {
+ s += a[i] * b[i]
+ }
+ return s
+}
+
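+// ip64_1 computes the inner product four lanes at a time using the S1 alias;
+// any tail shorter than one full vector is ignored.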
+func ip64_1(a, b []float64) float64 {
+ var z S1
+ sum := z
+ var i int
+ stride := z.Len()
+ for ; i <= len(a)-stride; i += stride {
+ va := load(a[i:])
+ vb := load(b[i:])
+ sum = sum.Add(va.Mul(vb))
+ }
+ var tmp [4]float64
+ sum.Store(&tmp)
+ return tmp[0] + tmp[1] + tmp[2] + tmp[3]
+}
+
+func ip64_1a(a, b []float64) float64 {
+ var z S1
+ sum := z
+ var i int
+ stride := z.Len()
+ for ; i <= len(a)-stride; i += stride {
+ va := load(a[i:])
+ vb := load(b[i:])
+ sum = FMA(sum, va, vb)
+ }
+ var tmp [4]float64
+ sum.Store(&tmp)
+ return tmp[0] + tmp[1] + tmp[2] + tmp[3]
+}
+
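+// FMA returns a + b*c. The noinline directive keeps it an actual call, so
+// ip64_1a exercises passing SIMD values across a function-call boundary.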
+//go:noinline
+func FMA(a, b, c archsimd.Float64x4) archsimd.Float64x4 {
+ return a.Add(b.Mul(c))
+}
+
+func ip64_2(a, b []float64) float64 {
+ var z S2
+ sum := z
+ var i int
+ stride := z.Len()
+ for ; i <= len(a)-stride; i += stride {
+ va := z.Load(a[i:])
+ vb := z.Load(b[i:])
+ sum = sum.Add(va.Mul(vb))
+ }
+ var tmp [4]float64
+ sum.Store(&tmp)
+ return tmp[0] + tmp[1] + tmp[2] + tmp[3]
+}
+
+func ip64_3(a, b []float64) float64 {
+ var z S3
+ sum := z
+ var i int
+ stride := z.Len()
+ for ; i <= len(a)-stride; i += stride {
+ va := load(a[i:])
+ vb := load(b[i:])
+ sum = S3{sum.Add(va.Mul(vb))}
+ }
+ var tmp [4]float64
+ sum.Store(&tmp)
+ return tmp[0] + tmp[1] + tmp[2] + tmp[3]
+}
+
+func main() {
+ a := []float64{1, 2, 3, 4, 5, 6, 7, 8}
+ ip0 := ip64_0(a, a)
+ ip1 := ip64_1(a, a)
+ ip1a := ip64_1a(a, a)
+ ip2 := ip64_2(a, a)
+ ip3 := ip64_3(a, a)
+ fmt.Printf("Test IP = %f\n", ip0)
+ fmt.Printf("SIMD IP 1 = %f\n", ip1)
+ fmt.Printf("SIMD IP 1a = %f\n", ip1a)
+ fmt.Printf("SIMD IP 2 = %f\n", ip2)
+ fmt.Printf("SIMD IP 3 = %f\n", ip3)
+ var z1 S1
+ var z2 S2
+	var z3 S3
+
+ s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3)
+
+ fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3)
+
+ fail := false
+
+ if s1 != 32 || s2 != 32 || s3 != 32 {
+ fmt.Println("Failed a sizeof check, should all be 32")
+ fail = true
+ }
+
+ if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 {
+ fmt.Println("Failed an inner product check, should all be", ip0)
+ fail = true
+ }
+
+ if fail {
+ os.Exit(1)
+ }
+}
--- /dev/null
+// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+// v128 is a tag type that tells the compiler that this is really 128-bit SIMD
+type v128 struct {
+ _128 [0]func() // uncomparable
+}
+
+// Float32x4 is a 128-bit SIMD vector of 4 float32
+type Float32x4 struct {
+ float32x4 v128
+ vals [4]float32
+}
+
+// Len returns the number of elements in a Float32x4
+func (x Float32x4) Len() int { return 4 }
+
+// LoadFloat32x4 loads a Float32x4 from an array
+//
+//go:noescape
+func LoadFloat32x4(y *[4]float32) Float32x4
+
+// Store stores a Float32x4 to an array
+//
+//go:noescape
+func (x Float32x4) Store(y *[4]float32)
+
+// LoadMaskedFloat32x4 loads a Float32x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
+
+// StoreMasked stores a Float32x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
+
+// Float64x2 is a 128-bit SIMD vector of 2 float64
+type Float64x2 struct {
+ float64x2 v128
+ vals [2]float64
+}
+
+// Len returns the number of elements in a Float64x2
+func (x Float64x2) Len() int { return 2 }
+
+// LoadFloat64x2 loads a Float64x2 from an array
+//
+//go:noescape
+func LoadFloat64x2(y *[2]float64) Float64x2
+
+// Store stores a Float64x2 to an array
+//
+//go:noescape
+func (x Float64x2) Store(y *[2]float64)
+
+// LoadMaskedFloat64x2 loads a Float64x2 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
+
+// StoreMasked stores a Float64x2 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
+
+// Int8x16 is a 128-bit SIMD vector of 16 int8
+type Int8x16 struct {
+ int8x16 v128
+ vals [16]int8
+}
+
+// Len returns the number of elements in a Int8x16
+func (x Int8x16) Len() int { return 16 }
+
+// LoadInt8x16 loads a Int8x16 from an array
+//
+//go:noescape
+func LoadInt8x16(y *[16]int8) Int8x16
+
+// Store stores a Int8x16 to an array
+//
+//go:noescape
+func (x Int8x16) Store(y *[16]int8)
+
+// Int16x8 is a 128-bit SIMD vector of 8 int16
+type Int16x8 struct {
+ int16x8 v128
+ vals [8]int16
+}
+
+// Len returns the number of elements in a Int16x8
+func (x Int16x8) Len() int { return 8 }
+
+// LoadInt16x8 loads a Int16x8 from an array
+//
+//go:noescape
+func LoadInt16x8(y *[8]int16) Int16x8
+
+// Store stores a Int16x8 to an array
+//
+//go:noescape
+func (x Int16x8) Store(y *[8]int16)
+
+// Int32x4 is a 128-bit SIMD vector of 4 int32
+type Int32x4 struct {
+ int32x4 v128
+ vals [4]int32
+}
+
+// Len returns the number of elements in a Int32x4
+func (x Int32x4) Len() int { return 4 }
+
+// LoadInt32x4 loads a Int32x4 from an array
+//
+//go:noescape
+func LoadInt32x4(y *[4]int32) Int32x4
+
+// Store stores a Int32x4 to an array
+//
+//go:noescape
+func (x Int32x4) Store(y *[4]int32)
+
+// LoadMaskedInt32x4 loads a Int32x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
+
+// StoreMasked stores a Int32x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
+
+// Int64x2 is a 128-bit SIMD vector of 2 int64
+type Int64x2 struct {
+ int64x2 v128
+ vals [2]int64
+}
+
+// Len returns the number of elements in a Int64x2
+func (x Int64x2) Len() int { return 2 }
+
+// LoadInt64x2 loads a Int64x2 from an array
+//
+//go:noescape
+func LoadInt64x2(y *[2]int64) Int64x2
+
+// Store stores a Int64x2 to an array
+//
+//go:noescape
+func (x Int64x2) Store(y *[2]int64)
+
+// LoadMaskedInt64x2 loads a Int64x2 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
+
+// StoreMasked stores a Int64x2 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
+
+// Uint8x16 is a 128-bit SIMD vector of 16 uint8
+type Uint8x16 struct {
+ uint8x16 v128
+ vals [16]uint8
+}
+
+// Len returns the number of elements in a Uint8x16
+func (x Uint8x16) Len() int { return 16 }
+
+// LoadUint8x16 loads a Uint8x16 from an array
+//
+//go:noescape
+func LoadUint8x16(y *[16]uint8) Uint8x16
+
+// Store stores a Uint8x16 to an array
+//
+//go:noescape
+func (x Uint8x16) Store(y *[16]uint8)
+
+// Uint16x8 is a 128-bit SIMD vector of 8 uint16
+type Uint16x8 struct {
+ uint16x8 v128
+ vals [8]uint16
+}
+
+// Len returns the number of elements in a Uint16x8
+func (x Uint16x8) Len() int { return 8 }
+
+// LoadUint16x8 loads a Uint16x8 from an array
+//
+//go:noescape
+func LoadUint16x8(y *[8]uint16) Uint16x8
+
+// Store stores a Uint16x8 to an array
+//
+//go:noescape
+func (x Uint16x8) Store(y *[8]uint16)
+
+// Uint32x4 is a 128-bit SIMD vector of 4 uint32
+type Uint32x4 struct {
+ uint32x4 v128
+ vals [4]uint32
+}
+
+// Len returns the number of elements in a Uint32x4
+func (x Uint32x4) Len() int { return 4 }
+
+// LoadUint32x4 loads a Uint32x4 from an array
+//
+//go:noescape
+func LoadUint32x4(y *[4]uint32) Uint32x4
+
+// Store stores a Uint32x4 to an array
+//
+//go:noescape
+func (x Uint32x4) Store(y *[4]uint32)
+
+// LoadMaskedUint32x4 loads a Uint32x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
+
+// StoreMasked stores a Uint32x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
+
+// Uint64x2 is a 128-bit SIMD vector of 2 uint64
+type Uint64x2 struct {
+ uint64x2 v128
+ vals [2]uint64
+}
+
+// Len returns the number of elements in a Uint64x2
+func (x Uint64x2) Len() int { return 2 }
+
+// LoadUint64x2 loads a Uint64x2 from an array
+//
+//go:noescape
+func LoadUint64x2(y *[2]uint64) Uint64x2
+
+// Store stores a Uint64x2 to an array
+//
+//go:noescape
+func (x Uint64x2) Store(y *[2]uint64)
+
+// LoadMaskedUint64x2 loads a Uint64x2 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
+
+// StoreMasked stores a Uint64x2 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
+
+// Mask8x16 is a 128-bit SIMD vector of 16 int8
+type Mask8x16 struct {
+ int8x16 v128
+ vals [16]int8
+}
+
+// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Feature: AVX512
+func Mask8x16FromBits(y uint16) Mask8x16
+
+// ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Features: AVX512
+func (x Mask8x16) ToBits() uint16
+
+// Mask16x8 is a 128-bit SIMD vector of 8 int16
+type Mask16x8 struct {
+ int16x8 v128
+ vals [8]int16
+}
+
+// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Feature: AVX512
+func Mask16x8FromBits(y uint8) Mask16x8
+
+// ToBits constructs a bitmap from a Mask16x8, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Features: AVX512
+func (x Mask16x8) ToBits() uint8
+
+// Mask32x4 is a 128-bit SIMD vector of 4 int32
+type Mask32x4 struct {
+ int32x4 v128
+ vals [4]int32
+}
+
+// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+// Asm: KMOVD, CPU Feature: AVX512
+func Mask32x4FromBits(y uint8) Mask32x4
+
+// ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of the result are used.
+//
+// Asm: KMOVD, CPU Features: AVX512
+func (x Mask32x4) ToBits() uint8
+
+// Mask64x2 is a 128-bit SIMD vector of 2 int64
+type Mask64x2 struct {
+ int64x2 v128
+ vals [2]int64
+}
+
+// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 2 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
+func Mask64x2FromBits(y uint8) Mask64x2
+
+// ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 2 bits of the result are used.
+//
+// Asm: KMOVQ, CPU Features: AVX512
+func (x Mask64x2) ToBits() uint8
+
+// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
+type v256 struct {
+ _256 [0]func() // uncomparable
+}
+
+// Float32x8 is a 256-bit SIMD vector of 8 float32
+type Float32x8 struct {
+ float32x8 v256
+ vals [8]float32
+}
+
+// Len returns the number of elements in a Float32x8
+func (x Float32x8) Len() int { return 8 }
+
+// LoadFloat32x8 loads a Float32x8 from an array
+//
+//go:noescape
+func LoadFloat32x8(y *[8]float32) Float32x8
+
+// Store stores a Float32x8 to an array
+//
+//go:noescape
+func (x Float32x8) Store(y *[8]float32)
+
+// LoadMaskedFloat32x8 loads a Float32x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
+
+// StoreMasked stores a Float32x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
+
+// Float64x4 is a 256-bit SIMD vector of 4 float64
+type Float64x4 struct {
+ float64x4 v256
+ vals [4]float64
+}
+
+// Len returns the number of elements in a Float64x4
+func (x Float64x4) Len() int { return 4 }
+
+// LoadFloat64x4 loads a Float64x4 from an array
+//
+//go:noescape
+func LoadFloat64x4(y *[4]float64) Float64x4
+
+// Store stores a Float64x4 to an array
+//
+//go:noescape
+func (x Float64x4) Store(y *[4]float64)
+
+// LoadMaskedFloat64x4 loads a Float64x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
+
+// StoreMasked stores a Float64x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
+
+// Int8x32 is a 256-bit SIMD vector of 32 int8
+type Int8x32 struct {
+ int8x32 v256
+ vals [32]int8
+}
+
+// Len returns the number of elements in a Int8x32
+func (x Int8x32) Len() int { return 32 }
+
+// LoadInt8x32 loads a Int8x32 from an array
+//
+//go:noescape
+func LoadInt8x32(y *[32]int8) Int8x32
+
+// Store stores a Int8x32 to an array
+//
+//go:noescape
+func (x Int8x32) Store(y *[32]int8)
+
+// Int16x16 is a 256-bit SIMD vector of 16 int16
+type Int16x16 struct {
+ int16x16 v256
+ vals [16]int16
+}
+
+// Len returns the number of elements in a Int16x16
+func (x Int16x16) Len() int { return 16 }
+
+// LoadInt16x16 loads a Int16x16 from an array
+//
+//go:noescape
+func LoadInt16x16(y *[16]int16) Int16x16
+
+// Store stores a Int16x16 to an array
+//
+//go:noescape
+func (x Int16x16) Store(y *[16]int16)
+
+// Int32x8 is a 256-bit SIMD vector of 8 int32
+type Int32x8 struct {
+ int32x8 v256
+ vals [8]int32
+}
+
+// Len returns the number of elements in a Int32x8
+func (x Int32x8) Len() int { return 8 }
+
+// LoadInt32x8 loads a Int32x8 from an array
+//
+//go:noescape
+func LoadInt32x8(y *[8]int32) Int32x8
+
+// Store stores a Int32x8 to an array
+//
+//go:noescape
+func (x Int32x8) Store(y *[8]int32)
+
+// LoadMaskedInt32x8 loads a Int32x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
+
+// StoreMasked stores a Int32x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
+
+// Int64x4 is a 256-bit SIMD vector of 4 int64
+type Int64x4 struct {
+ int64x4 v256
+ vals [4]int64
+}
+
+// Len returns the number of elements in a Int64x4
+func (x Int64x4) Len() int { return 4 }
+
+// LoadInt64x4 loads a Int64x4 from an array
+//
+//go:noescape
+func LoadInt64x4(y *[4]int64) Int64x4
+
+// Store stores a Int64x4 to an array
+//
+//go:noescape
+func (x Int64x4) Store(y *[4]int64)
+
+// LoadMaskedInt64x4 loads a Int64x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
+
+// StoreMasked stores a Int64x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
+
+// Uint8x32 is a 256-bit SIMD vector of 32 uint8
+type Uint8x32 struct {
+ uint8x32 v256
+ vals [32]uint8
+}
+
+// Len returns the number of elements in a Uint8x32
+func (x Uint8x32) Len() int { return 32 }
+
+// LoadUint8x32 loads a Uint8x32 from an array
+//
+//go:noescape
+func LoadUint8x32(y *[32]uint8) Uint8x32
+
+// Store stores a Uint8x32 to an array
+//
+//go:noescape
+func (x Uint8x32) Store(y *[32]uint8)
+
+// Uint16x16 is a 256-bit SIMD vector of 16 uint16
+type Uint16x16 struct {
+ uint16x16 v256
+ vals [16]uint16
+}
+
+// Len returns the number of elements in a Uint16x16
+func (x Uint16x16) Len() int { return 16 }
+
+// LoadUint16x16 loads a Uint16x16 from an array
+//
+//go:noescape
+func LoadUint16x16(y *[16]uint16) Uint16x16
+
+// Store stores a Uint16x16 to an array
+//
+//go:noescape
+func (x Uint16x16) Store(y *[16]uint16)
+
+// Uint32x8 is a 256-bit SIMD vector of 8 uint32
+type Uint32x8 struct {
+ uint32x8 v256
+ vals [8]uint32
+}
+
+// Len returns the number of elements in a Uint32x8
+func (x Uint32x8) Len() int { return 8 }
+
+// LoadUint32x8 loads a Uint32x8 from an array
+//
+//go:noescape
+func LoadUint32x8(y *[8]uint32) Uint32x8
+
+// Store stores a Uint32x8 to an array
+//
+//go:noescape
+func (x Uint32x8) Store(y *[8]uint32)
+
+// LoadMaskedUint32x8 loads a Uint32x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
+
+// StoreMasked stores a Uint32x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVD, CPU Feature: AVX2
+//
+//go:noescape
+func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
+
+// Uint64x4 is a 256-bit SIMD vector of 4 uint64
+type Uint64x4 struct {
+ uint64x4 v256
+ vals [4]uint64
+}
+
+// Len returns the number of elements in a Uint64x4
+func (x Uint64x4) Len() int { return 4 }
+
+// LoadUint64x4 loads a Uint64x4 from an array
+//
+//go:noescape
+func LoadUint64x4(y *[4]uint64) Uint64x4
+
+// Store stores a Uint64x4 to an array
+//
+//go:noescape
+func (x Uint64x4) Store(y *[4]uint64)
+
+// LoadMaskedUint64x4 loads a Uint64x4 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
+
+// StoreMasked stores a Uint64x4 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMASKMOVQ, CPU Feature: AVX2
+//
+//go:noescape
+func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
+
+// Mask8x32 is a 256-bit SIMD vector of 32 int8
+type Mask8x32 struct {
+ int8x32 v256
+ vals [32]int8
+}
+
+// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Feature: AVX512
+func Mask8x32FromBits(y uint32) Mask8x32
+
+// ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Features: AVX512
+func (x Mask8x32) ToBits() uint32
+
+// Mask16x16 is a 256-bit SIMD vector of 16 int16
+type Mask16x16 struct {
+ int16x16 v256
+ vals [16]int16
+}
+
+// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Feature: AVX512
+func Mask16x16FromBits(y uint16) Mask16x16
+
+// ToBits constructs a bitmap from a Mask16x16, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Features: AVX512
+func (x Mask16x16) ToBits() uint16
+
+// Mask32x8 is a 256-bit SIMD vector of 8 int32
+type Mask32x8 struct {
+ int32x8 v256
+ vals [8]int32
+}
+
+// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVD, CPU Feature: AVX512
+func Mask32x8FromBits(y uint8) Mask32x8
+
+// ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVD, CPU Features: AVX512
+func (x Mask32x8) ToBits() uint8
+
+// Mask64x4 is a 256-bit SIMD vector of 4 int64
+type Mask64x4 struct {
+ int64x4 v256
+ vals [4]int64
+}
+
+// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of y are used.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
+func Mask64x4FromBits(y uint8) Mask64x4
+
+// ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset.
+// Only the lower 4 bits of the result are used.
+//
+// Asm: KMOVQ, CPU Features: AVX512
+func (x Mask64x4) ToBits() uint8
+
+// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
+type v512 struct {
+ _512 [0]func() // uncomparable
+}
+
+// Float32x16 is a 512-bit SIMD vector of 16 float32
+type Float32x16 struct {
+ float32x16 v512
+ vals [16]float32
+}
+
+// Len returns the number of elements in a Float32x16
+func (x Float32x16) Len() int { return 16 }
+
+// LoadFloat32x16 loads a Float32x16 from an array
+//
+//go:noescape
+func LoadFloat32x16(y *[16]float32) Float32x16
+
+// Store stores a Float32x16 to an array
+//
+//go:noescape
+func (x Float32x16) Store(y *[16]float32)
+
+// LoadMaskedFloat32x16 loads a Float32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
+
+// StoreMasked stores a Float32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
+
+// Float64x8 is a 512-bit SIMD vector of 8 float64
+type Float64x8 struct {
+ float64x8 v512
+ vals [8]float64
+}
+
+// Len returns the number of elements in a Float64x8
+func (x Float64x8) Len() int { return 8 }
+
+// LoadFloat64x8 loads a Float64x8 from an array
+//
+//go:noescape
+func LoadFloat64x8(y *[8]float64) Float64x8
+
+// Store stores a Float64x8 to an array
+//
+//go:noescape
+func (x Float64x8) Store(y *[8]float64)
+
+// LoadMaskedFloat64x8 loads a Float64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
+
+// StoreMasked stores a Float64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
+
+// Int8x64 is a 512-bit SIMD vector of 64 int8
+type Int8x64 struct {
+ int8x64 v512
+ vals [64]int8
+}
+
+// Len returns the number of elements in a Int8x64
+func (x Int8x64) Len() int { return 64 }
+
+// LoadInt8x64 loads a Int8x64 from an array
+//
+//go:noescape
+func LoadInt8x64(y *[64]int8) Int8x64
+
+// Store stores a Int8x64 to an array
+//
+//go:noescape
+func (x Int8x64) Store(y *[64]int8)
+
+// LoadMaskedInt8x64 loads a Int8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
+
+// StoreMasked stores a Int8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
+
+// Int16x32 is a 512-bit SIMD vector of 32 int16
+type Int16x32 struct {
+ int16x32 v512
+ vals [32]int16
+}
+
+// Len returns the number of elements in a Int16x32
+func (x Int16x32) Len() int { return 32 }
+
+// LoadInt16x32 loads a Int16x32 from an array
+//
+//go:noescape
+func LoadInt16x32(y *[32]int16) Int16x32
+
+// Store stores a Int16x32 to an array
+//
+//go:noescape
+func (x Int16x32) Store(y *[32]int16)
+
+// LoadMaskedInt16x32 loads a Int16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
+
+// StoreMasked stores a Int16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
+
+// Int32x16 is a 512-bit SIMD vector of 16 int32
+type Int32x16 struct {
+ int32x16 v512
+ vals [16]int32
+}
+
+// Len returns the number of elements in a Int32x16
+func (x Int32x16) Len() int { return 16 }
+
+// LoadInt32x16 loads a Int32x16 from an array
+//
+//go:noescape
+func LoadInt32x16(y *[16]int32) Int32x16
+
+// Store stores a Int32x16 to an array
+//
+//go:noescape
+func (x Int32x16) Store(y *[16]int32)
+
+// LoadMaskedInt32x16 loads a Int32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
+
+// StoreMasked stores a Int32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
+
+// Int64x8 is a 512-bit SIMD vector of 8 int64
+type Int64x8 struct {
+ int64x8 v512
+ vals [8]int64
+}
+
+// Len returns the number of elements in a Int64x8
+func (x Int64x8) Len() int { return 8 }
+
+// LoadInt64x8 loads a Int64x8 from an array
+//
+//go:noescape
+func LoadInt64x8(y *[8]int64) Int64x8
+
+// Store stores a Int64x8 to an array
+//
+//go:noescape
+func (x Int64x8) Store(y *[8]int64)
+
+// LoadMaskedInt64x8 loads a Int64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
+
+// StoreMasked stores a Int64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
+
+// Uint8x64 is a 512-bit SIMD vector of 64 uint8
+type Uint8x64 struct {
+ uint8x64 v512
+ vals [64]uint8
+}
+
+// Len returns the number of elements in a Uint8x64
+func (x Uint8x64) Len() int { return 64 }
+
+// LoadUint8x64 loads a Uint8x64 from an array
+//
+//go:noescape
+func LoadUint8x64(y *[64]uint8) Uint8x64
+
+// Store stores a Uint8x64 to an array
+//
+//go:noescape
+func (x Uint8x64) Store(y *[64]uint8)
+
+// LoadMaskedUint8x64 loads a Uint8x64 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
+
+// StoreMasked stores a Uint8x64 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU8, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
+
+// Uint16x32 is a 512-bit SIMD vector of 32 uint16
+type Uint16x32 struct {
+ uint16x32 v512
+ vals [32]uint16
+}
+
+// Len returns the number of elements in a Uint16x32
+func (x Uint16x32) Len() int { return 32 }
+
+// LoadUint16x32 loads a Uint16x32 from an array
+//
+//go:noescape
+func LoadUint16x32(y *[32]uint16) Uint16x32
+
+// Store stores a Uint16x32 to an array
+//
+//go:noescape
+func (x Uint16x32) Store(y *[32]uint16)
+
+// LoadMaskedUint16x32 loads a Uint16x32 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
+
+// StoreMasked stores a Uint16x32 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU16, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
+
+// Uint32x16 is a 512-bit SIMD vector of 16 uint32
+type Uint32x16 struct {
+ uint32x16 v512
+ vals [16]uint32
+}
+
+// Len returns the number of elements in a Uint32x16
+func (x Uint32x16) Len() int { return 16 }
+
+// LoadUint32x16 loads a Uint32x16 from an array
+//
+//go:noescape
+func LoadUint32x16(y *[16]uint32) Uint32x16
+
+// Store stores a Uint32x16 to an array
+//
+//go:noescape
+func (x Uint32x16) Store(y *[16]uint32)
+
+// LoadMaskedUint32x16 loads a Uint32x16 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
+
+// StoreMasked stores a Uint32x16 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU32, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
+
+// Uint64x8 is a 512-bit SIMD vector of 8 uint64
+type Uint64x8 struct {
+ uint64x8 v512
+ vals [8]uint64
+}
+
+// Len returns the number of elements in a Uint64x8
+func (x Uint64x8) Len() int { return 8 }
+
+// LoadUint64x8 loads a Uint64x8 from an array
+//
+//go:noescape
+func LoadUint64x8(y *[8]uint64) Uint64x8
+
+// Store stores a Uint64x8 to an array
+//
+//go:noescape
+func (x Uint64x8) Store(y *[8]uint64)
+
+// LoadMaskedUint64x8 loads a Uint64x8 from an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64.Z, CPU Feature: AVX512
+//
+//go:noescape
+func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
+
+// StoreMasked stores a Uint64x8 to an array,
+// at those elements enabled by mask
+//
+// Asm: VMOVDQU64, CPU Feature: AVX512
+//
+//go:noescape
+func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
+
+// Mask8x64 is a 512-bit SIMD vector of 64 int8
+type Mask8x64 struct {
+ int8x64 v512
+ vals [64]int8
+}
+
+// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Feature: AVX512
+func Mask8x64FromBits(y uint64) Mask8x64
+
+// ToBits constructs a bitmap from a Mask8x64, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVB, CPU Features: AVX512
+func (x Mask8x64) ToBits() uint64
+
+// Mask16x32 is a 512-bit SIMD vector of 32 int16
+type Mask16x32 struct {
+ int16x32 v512
+ vals [32]int16
+}
+
+// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Feature: AVX512
+func Mask16x32FromBits(y uint32) Mask16x32
+
+// ToBits constructs a bitmap from a Mask16x32, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVW, CPU Features: AVX512
+func (x Mask16x32) ToBits() uint32
+
+// Mask32x16 is a 512-bit SIMD vector of 16 int32
+type Mask32x16 struct {
+ int32x16 v512
+ vals [16]int32
+}
+
+// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVD, CPU Feature: AVX512
+func Mask32x16FromBits(y uint16) Mask32x16
+
+// ToBits constructs a bitmap from a Mask32x16, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVD, CPU Features: AVX512
+func (x Mask32x16) ToBits() uint16
+
+// Mask64x8 is a 512-bit SIMD vector of 8 int64
+type Mask64x8 struct {
+ int64x8 v512
+ vals [8]int64
+}
+
+// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVQ, CPU Feature: AVX512
+func Mask64x8FromBits(y uint8) Mask64x8
+
+// ToBits constructs a bitmap from a Mask64x8, where 1 means set for the indexed element, 0 means unset.
+//
+// Asm: KMOVQ, CPU Features: AVX512
+func (x Mask64x8) ToBits() uint8
--- /dev/null
+// Code generated by 'go run genfiles.go'; DO NOT EDIT.
+
+//go:build goexperiment.simd
+
+package archsimd
+
+import "unsafe"
+
+// paInt8x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt8x16(s []int8) *[16]int8 {
+ return (*[16]int8)(unsafe.Pointer(&s[0]))
+}
+
+// paInt16x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt16x8(s []int16) *[8]int16 {
+ return (*[8]int16)(unsafe.Pointer(&s[0]))
+}
+
+// paInt32x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt32x4(s []int32) *[4]int32 {
+ return (*[4]int32)(unsafe.Pointer(&s[0]))
+}
+
+// paInt64x2 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt64x2(s []int64) *[2]int64 {
+ return (*[2]int64)(unsafe.Pointer(&s[0]))
+}
+
+// paUint8x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint8x16(s []uint8) *[16]uint8 {
+ return (*[16]uint8)(unsafe.Pointer(&s[0]))
+}
+
+// paUint16x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint16x8(s []uint16) *[8]uint16 {
+ return (*[8]uint16)(unsafe.Pointer(&s[0]))
+}
+
+// paUint32x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint32x4(s []uint32) *[4]uint32 {
+ return (*[4]uint32)(unsafe.Pointer(&s[0]))
+}
+
+// paUint64x2 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint64x2(s []uint64) *[2]uint64 {
+ return (*[2]uint64)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat32x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat32x4(s []float32) *[4]float32 {
+ return (*[4]float32)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat64x2 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat64x2(s []float64) *[2]float64 {
+ return (*[2]float64)(unsafe.Pointer(&s[0]))
+}
+
+// paInt8x32 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt8x32(s []int8) *[32]int8 {
+ return (*[32]int8)(unsafe.Pointer(&s[0]))
+}
+
+// paInt16x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt16x16(s []int16) *[16]int16 {
+ return (*[16]int16)(unsafe.Pointer(&s[0]))
+}
+
+// paInt32x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt32x8(s []int32) *[8]int32 {
+ return (*[8]int32)(unsafe.Pointer(&s[0]))
+}
+
+// paInt64x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt64x4(s []int64) *[4]int64 {
+ return (*[4]int64)(unsafe.Pointer(&s[0]))
+}
+
+// paUint8x32 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint8x32(s []uint8) *[32]uint8 {
+ return (*[32]uint8)(unsafe.Pointer(&s[0]))
+}
+
+// paUint16x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint16x16(s []uint16) *[16]uint16 {
+ return (*[16]uint16)(unsafe.Pointer(&s[0]))
+}
+
+// paUint32x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint32x8(s []uint32) *[8]uint32 {
+ return (*[8]uint32)(unsafe.Pointer(&s[0]))
+}
+
+// paUint64x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint64x4(s []uint64) *[4]uint64 {
+ return (*[4]uint64)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat32x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat32x8(s []float32) *[8]float32 {
+ return (*[8]float32)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat64x4 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat64x4(s []float64) *[4]float64 {
+ return (*[4]float64)(unsafe.Pointer(&s[0]))
+}
+
+// paInt8x64 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt8x64(s []int8) *[64]int8 {
+ return (*[64]int8)(unsafe.Pointer(&s[0]))
+}
+
+// paInt16x32 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt16x32(s []int16) *[32]int16 {
+ return (*[32]int16)(unsafe.Pointer(&s[0]))
+}
+
+// paInt32x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt32x16(s []int32) *[16]int32 {
+ return (*[16]int32)(unsafe.Pointer(&s[0]))
+}
+
+// paInt64x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paInt64x8(s []int64) *[8]int64 {
+ return (*[8]int64)(unsafe.Pointer(&s[0]))
+}
+
+// paUint8x64 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint8x64(s []uint8) *[64]uint8 {
+ return (*[64]uint8)(unsafe.Pointer(&s[0]))
+}
+
+// paUint16x32 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint16x32(s []uint16) *[32]uint16 {
+ return (*[32]uint16)(unsafe.Pointer(&s[0]))
+}
+
+// paUint32x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint32x16(s []uint32) *[16]uint32 {
+ return (*[16]uint32)(unsafe.Pointer(&s[0]))
+}
+
+// paUint64x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paUint64x8(s []uint64) *[8]uint64 {
+ return (*[8]uint64)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat32x16 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat32x16(s []float32) *[16]float32 {
+ return (*[16]float32)(unsafe.Pointer(&s[0]))
+}
+
+// paFloat64x8 returns a type-unsafe pointer to array that can
+// only be used with partial load/store operations that only
+// access the known-safe portions of the array.
+func paFloat64x8(s []float64) *[8]float64 {
+ return (*[8]float64)(unsafe.Pointer(&s[0]))
+}
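These helpers only reinterpret &s[0] as a wider array pointer; they never check len(s), which is why the comments restrict them to partial load/store operations that stay within the slice's real bounds (and why the slice must be non-empty). A minimal sketch of the aliasing, using a slice that genuinely covers the whole array type:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// paInt32x4 has the same shape as the generated helper above.
	func paInt32x4(s []int32) *[4]int32 {
		return (*[4]int32)(unsafe.Pointer(&s[0]))
	}

	func main() {
		s := []int32{1, 2, 3, 4, 5}
		p := paInt32x4(s) // p aliases s[0:4]
		p[2] = 42
		fmt.Println(s[2]) // 42: the array pointer and the slice share storage
	}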
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Int8x16) Less(y Int8x16) Mask8x16 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Int8x16) GreaterEqual(y Int8x16) Mask8x16 {
- ones := x.Equal(x).AsInt8x16()
- return y.Greater(x).AsInt8x16().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Int8x16) LessEqual(y Int8x16) Mask8x16 {
- ones := x.Equal(x).AsInt8x16()
- return x.Greater(y).AsInt8x16().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Int8x16) NotEqual(y Int8x16) Mask8x16 {
- ones := x.Equal(x).AsInt8x16()
- return x.Equal(y).AsInt8x16().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Int16x8) Less(y Int16x8) Mask16x8 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Int16x8) GreaterEqual(y Int16x8) Mask16x8 {
- ones := x.Equal(x).AsInt16x8()
- return y.Greater(x).AsInt16x8().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Int16x8) LessEqual(y Int16x8) Mask16x8 {
- ones := x.Equal(x).AsInt16x8()
- return x.Greater(y).AsInt16x8().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Int16x8) NotEqual(y Int16x8) Mask16x8 {
- ones := x.Equal(x).AsInt16x8()
- return x.Equal(y).AsInt16x8().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Int32x4) Less(y Int32x4) Mask32x4 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Int32x4) GreaterEqual(y Int32x4) Mask32x4 {
- ones := x.Equal(x).AsInt32x4()
- return y.Greater(x).AsInt32x4().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Int32x4) LessEqual(y Int32x4) Mask32x4 {
- ones := x.Equal(x).AsInt32x4()
- return x.Greater(y).AsInt32x4().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Int32x4) NotEqual(y Int32x4) Mask32x4 {
- ones := x.Equal(x).AsInt32x4()
- return x.Equal(y).AsInt32x4().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Int64x2) Less(y Int64x2) Mask64x2 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Int64x2) GreaterEqual(y Int64x2) Mask64x2 {
- ones := x.Equal(x).AsInt64x2()
- return y.Greater(x).AsInt64x2().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Int64x2) LessEqual(y Int64x2) Mask64x2 {
- ones := x.Equal(x).AsInt64x2()
- return x.Greater(y).AsInt64x2().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Int64x2) NotEqual(y Int64x2) Mask64x2 {
- ones := x.Equal(x).AsInt64x2()
- return x.Equal(y).AsInt64x2().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Int8x32) Less(y Int8x32) Mask8x32 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int8x32) GreaterEqual(y Int8x32) Mask8x32 {
- ones := x.Equal(x).AsInt8x32()
- return y.Greater(x).AsInt8x32().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int8x32) LessEqual(y Int8x32) Mask8x32 {
- ones := x.Equal(x).AsInt8x32()
- return x.Greater(y).AsInt8x32().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Int8x32) NotEqual(y Int8x32) Mask8x32 {
- ones := x.Equal(x).AsInt8x32()
- return x.Equal(y).AsInt8x32().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Int16x16) Less(y Int16x16) Mask16x16 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int16x16) GreaterEqual(y Int16x16) Mask16x16 {
- ones := x.Equal(x).AsInt16x16()
- return y.Greater(x).AsInt16x16().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int16x16) LessEqual(y Int16x16) Mask16x16 {
- ones := x.Equal(x).AsInt16x16()
- return x.Greater(y).AsInt16x16().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Int16x16) NotEqual(y Int16x16) Mask16x16 {
- ones := x.Equal(x).AsInt16x16()
- return x.Equal(y).AsInt16x16().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Int32x8) Less(y Int32x8) Mask32x8 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int32x8) GreaterEqual(y Int32x8) Mask32x8 {
- ones := x.Equal(x).AsInt32x8()
- return y.Greater(x).AsInt32x8().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int32x8) LessEqual(y Int32x8) Mask32x8 {
- ones := x.Equal(x).AsInt32x8()
- return x.Greater(y).AsInt32x8().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Int32x8) NotEqual(y Int32x8) Mask32x8 {
- ones := x.Equal(x).AsInt32x8()
- return x.Equal(y).AsInt32x8().Xor(ones).asMask()
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Int64x4) Less(y Int64x4) Mask64x4 {
- return y.Greater(x)
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int64x4) GreaterEqual(y Int64x4) Mask64x4 {
- ones := x.Equal(x).AsInt64x4()
- return y.Greater(x).AsInt64x4().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Int64x4) LessEqual(y Int64x4) Mask64x4 {
- ones := x.Equal(x).AsInt64x4()
- return x.Greater(y).AsInt64x4().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Int64x4) NotEqual(y Int64x4) Mask64x4 {
- ones := x.Equal(x).AsInt64x4()
- return x.Equal(y).AsInt64x4().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x16) Greater(y Uint8x16) Mask8x16 {
- a, b := x.AsInt8x16(), y.AsInt8x16()
- signs := BroadcastInt8x16(-1 << (8 - 1))
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x16) Less(y Uint8x16) Mask8x16 {
- a, b := x.AsInt8x16(), y.AsInt8x16()
- signs := BroadcastInt8x16(-1 << (8 - 1))
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x16) GreaterEqual(y Uint8x16) Mask8x16 {
- a, b := x.AsInt8x16(), y.AsInt8x16()
- ones := x.Equal(x).AsInt8x16()
- signs := BroadcastInt8x16(-1 << (8 - 1))
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt8x16().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x16) LessEqual(y Uint8x16) Mask8x16 {
- a, b := x.AsInt8x16(), y.AsInt8x16()
- ones := x.Equal(x).AsInt8x16()
- signs := BroadcastInt8x16(-1 << (8 - 1))
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt8x16().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Uint8x16) NotEqual(y Uint8x16) Mask8x16 {
- a, b := x.AsInt8x16(), y.AsInt8x16()
- ones := x.Equal(x).AsInt8x16()
- return a.Equal(b).AsInt8x16().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) Greater(y Uint16x8) Mask16x8 {
- a, b := x.AsInt16x8(), y.AsInt16x8()
- ones := x.Equal(x).AsInt16x8()
- signs := ones.ShiftAllLeft(16 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) Less(y Uint16x8) Mask16x8 {
- a, b := x.AsInt16x8(), y.AsInt16x8()
- ones := x.Equal(x).AsInt16x8()
- signs := ones.ShiftAllLeft(16 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) GreaterEqual(y Uint16x8) Mask16x8 {
- a, b := x.AsInt16x8(), y.AsInt16x8()
- ones := x.Equal(x).AsInt16x8()
- signs := ones.ShiftAllLeft(16 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt16x8().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) LessEqual(y Uint16x8) Mask16x8 {
- a, b := x.AsInt16x8(), y.AsInt16x8()
- ones := x.Equal(x).AsInt16x8()
- signs := ones.ShiftAllLeft(16 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt16x8().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) NotEqual(y Uint16x8) Mask16x8 {
- a, b := x.AsInt16x8(), y.AsInt16x8()
- ones := x.Equal(x).AsInt16x8()
- return a.Equal(b).AsInt16x8().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) Greater(y Uint32x4) Mask32x4 {
- a, b := x.AsInt32x4(), y.AsInt32x4()
- ones := x.Equal(x).AsInt32x4()
- signs := ones.ShiftAllLeft(32 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) Less(y Uint32x4) Mask32x4 {
- a, b := x.AsInt32x4(), y.AsInt32x4()
- ones := x.Equal(x).AsInt32x4()
- signs := ones.ShiftAllLeft(32 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) GreaterEqual(y Uint32x4) Mask32x4 {
- a, b := x.AsInt32x4(), y.AsInt32x4()
- ones := x.Equal(x).AsInt32x4()
- signs := ones.ShiftAllLeft(32 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt32x4().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) LessEqual(y Uint32x4) Mask32x4 {
- a, b := x.AsInt32x4(), y.AsInt32x4()
- ones := x.Equal(x).AsInt32x4()
- signs := ones.ShiftAllLeft(32 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt32x4().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) NotEqual(y Uint32x4) Mask32x4 {
- a, b := x.AsInt32x4(), y.AsInt32x4()
- ones := x.Equal(x).AsInt32x4()
- return a.Equal(b).AsInt32x4().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) Greater(y Uint64x2) Mask64x2 {
- a, b := x.AsInt64x2(), y.AsInt64x2()
- ones := x.Equal(x).AsInt64x2()
- signs := ones.ShiftAllLeft(64 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) Less(y Uint64x2) Mask64x2 {
- a, b := x.AsInt64x2(), y.AsInt64x2()
- ones := x.Equal(x).AsInt64x2()
- signs := ones.ShiftAllLeft(64 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) GreaterEqual(y Uint64x2) Mask64x2 {
- a, b := x.AsInt64x2(), y.AsInt64x2()
- ones := x.Equal(x).AsInt64x2()
- signs := ones.ShiftAllLeft(64 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt64x2().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) LessEqual(y Uint64x2) Mask64x2 {
- a, b := x.AsInt64x2(), y.AsInt64x2()
- ones := x.Equal(x).AsInt64x2()
- signs := ones.ShiftAllLeft(64 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt64x2().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) NotEqual(y Uint64x2) Mask64x2 {
- a, b := x.AsInt64x2(), y.AsInt64x2()
- ones := x.Equal(x).AsInt64x2()
- return a.Equal(b).AsInt64x2().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) Greater(y Uint8x32) Mask8x32 {
- a, b := x.AsInt8x32(), y.AsInt8x32()
- signs := BroadcastInt8x32(-1 << (8 - 1))
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) Less(y Uint8x32) Mask8x32 {
- a, b := x.AsInt8x32(), y.AsInt8x32()
- signs := BroadcastInt8x32(-1 << (8 - 1))
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) GreaterEqual(y Uint8x32) Mask8x32 {
- a, b := x.AsInt8x32(), y.AsInt8x32()
- ones := x.Equal(x).AsInt8x32()
- signs := BroadcastInt8x32(-1 << (8 - 1))
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt8x32().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) LessEqual(y Uint8x32) Mask8x32 {
- a, b := x.AsInt8x32(), y.AsInt8x32()
- ones := x.Equal(x).AsInt8x32()
- signs := BroadcastInt8x32(-1 << (8 - 1))
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt8x32().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) NotEqual(y Uint8x32) Mask8x32 {
- a, b := x.AsInt8x32(), y.AsInt8x32()
- ones := x.Equal(x).AsInt8x32()
- return a.Equal(b).AsInt8x32().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) Greater(y Uint16x16) Mask16x16 {
- a, b := x.AsInt16x16(), y.AsInt16x16()
- ones := x.Equal(x).AsInt16x16()
- signs := ones.ShiftAllLeft(16 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) Less(y Uint16x16) Mask16x16 {
- a, b := x.AsInt16x16(), y.AsInt16x16()
- ones := x.Equal(x).AsInt16x16()
- signs := ones.ShiftAllLeft(16 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) GreaterEqual(y Uint16x16) Mask16x16 {
- a, b := x.AsInt16x16(), y.AsInt16x16()
- ones := x.Equal(x).AsInt16x16()
- signs := ones.ShiftAllLeft(16 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt16x16().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) LessEqual(y Uint16x16) Mask16x16 {
- a, b := x.AsInt16x16(), y.AsInt16x16()
- ones := x.Equal(x).AsInt16x16()
- signs := ones.ShiftAllLeft(16 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt16x16().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) NotEqual(y Uint16x16) Mask16x16 {
- a, b := x.AsInt16x16(), y.AsInt16x16()
- ones := x.Equal(x).AsInt16x16()
- return a.Equal(b).AsInt16x16().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) Greater(y Uint32x8) Mask32x8 {
- a, b := x.AsInt32x8(), y.AsInt32x8()
- ones := x.Equal(x).AsInt32x8()
- signs := ones.ShiftAllLeft(32 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) Less(y Uint32x8) Mask32x8 {
- a, b := x.AsInt32x8(), y.AsInt32x8()
- ones := x.Equal(x).AsInt32x8()
- signs := ones.ShiftAllLeft(32 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) GreaterEqual(y Uint32x8) Mask32x8 {
- a, b := x.AsInt32x8(), y.AsInt32x8()
- ones := x.Equal(x).AsInt32x8()
- signs := ones.ShiftAllLeft(32 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt32x8().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) LessEqual(y Uint32x8) Mask32x8 {
- a, b := x.AsInt32x8(), y.AsInt32x8()
- ones := x.Equal(x).AsInt32x8()
- signs := ones.ShiftAllLeft(32 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt32x8().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) NotEqual(y Uint32x8) Mask32x8 {
- a, b := x.AsInt32x8(), y.AsInt32x8()
- ones := x.Equal(x).AsInt32x8()
- return a.Equal(b).AsInt32x8().Xor(ones).asMask()
-}
-
-// Greater returns a mask whose elements indicate whether x > y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) Greater(y Uint64x4) Mask64x4 {
- a, b := x.AsInt64x4(), y.AsInt64x4()
- ones := x.Equal(x).AsInt64x4()
- signs := ones.ShiftAllLeft(64 - 1)
- return a.Xor(signs).Greater(b.Xor(signs))
-}
-
-// Less returns a mask whose elements indicate whether x < y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) Less(y Uint64x4) Mask64x4 {
- a, b := x.AsInt64x4(), y.AsInt64x4()
- ones := x.Equal(x).AsInt64x4()
- signs := ones.ShiftAllLeft(64 - 1)
- return b.Xor(signs).Greater(a.Xor(signs))
-}
-
-// GreaterEqual returns a mask whose elements indicate whether x >= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) GreaterEqual(y Uint64x4) Mask64x4 {
- a, b := x.AsInt64x4(), y.AsInt64x4()
- ones := x.Equal(x).AsInt64x4()
- signs := ones.ShiftAllLeft(64 - 1)
- return b.Xor(signs).Greater(a.Xor(signs)).AsInt64x4().Xor(ones).asMask()
-}
-
-// LessEqual returns a mask whose elements indicate whether x <= y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) LessEqual(y Uint64x4) Mask64x4 {
- a, b := x.AsInt64x4(), y.AsInt64x4()
- ones := x.Equal(x).AsInt64x4()
- signs := ones.ShiftAllLeft(64 - 1)
- return a.Xor(signs).Greater(b.Xor(signs)).AsInt64x4().Xor(ones).asMask()
-}
-
-// NotEqual returns a mask whose elements indicate whether x != y
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) NotEqual(y Uint64x4) Mask64x4 {
- a, b := x.AsInt64x4(), y.AsInt64x4()
- ones := x.Equal(x).AsInt64x4()
- return a.Equal(b).AsInt64x4().Xor(ones).asMask()
-}
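The emulations deleted above lean on two element-wise identities: a comparison mask is negated by XORing it with an all-ones vector (x.Equal(x) is all-ones), and an unsigned compare is turned into a signed one by flipping the sign bit of both operands. The second trick is easiest to see on scalars; a minimal sketch:

	package main

	import "fmt"

	func main() {
		a, b := uint8(200), uint8(100)
		// Flip the sign bit (0x80) and compare as signed; the result matches
		// the unsigned comparison for every pair of uint8 values.
		sa, sb := int8(a^0x80), int8(b^0x80)
		fmt.Println(a > b, sa > sb) // true true
	}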
+++ /dev/null
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-import "internal/cpu"
-
-type X86Features struct{}
-
-var X86 X86Features
-
-// AES returns whether the CPU supports the AES feature.
-//
-// AES is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AES() bool {
- return cpu.X86.HasAES
-}
-
-// AVX returns whether the CPU supports the AVX feature.
-//
-// AVX is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX() bool {
- return cpu.X86.HasAVX
-}
-
-// AVX2 returns whether the CPU supports the AVX2 feature.
-//
-// AVX2 is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX2() bool {
- return cpu.X86.HasAVX2
-}
-
-// AVX512 returns whether the CPU supports the AVX512F+CD+BW+DQ+VL features.
-//
-// These five CPU features are bundled together, and no use of AVX-512
-// is allowed unless all of these features are supported together.
-// Nearly every CPU that has shipped with any support for AVX-512 has
-// supported all five of these features.
-//
-// AVX512 is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512() bool {
- return cpu.X86.HasAVX512
-}
-
-// AVX512BITALG returns whether the CPU supports the AVX512BITALG feature.
-//
-// AVX512BITALG is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512BITALG() bool {
- return cpu.X86.HasAVX512BITALG
-}
-
-// AVX512GFNI returns whether the CPU supports the AVX512GFNI feature.
-//
-// AVX512GFNI is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512GFNI() bool {
- return cpu.X86.HasAVX512GFNI
-}
-
-// AVX512VAES returns whether the CPU supports the AVX512VAES feature.
-//
-// AVX512VAES is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VAES() bool {
- return cpu.X86.HasAVX512VAES
-}
-
-// AVX512VBMI returns whether the CPU supports the AVX512VBMI feature.
-//
-// AVX512VBMI is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VBMI() bool {
- return cpu.X86.HasAVX512VBMI
-}
-
-// AVX512VBMI2 returns whether the CPU supports the AVX512VBMI2 feature.
-//
-// AVX512VBMI2 is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VBMI2() bool {
- return cpu.X86.HasAVX512VBMI2
-}
-
-// AVX512VNNI returns whether the CPU supports the AVX512VNNI feature.
-//
-// AVX512VNNI is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VNNI() bool {
- return cpu.X86.HasAVX512VNNI
-}
-
-// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature.
-//
-// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VPCLMULQDQ() bool {
- return cpu.X86.HasAVX512VPCLMULQDQ
-}
-
-// AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
-//
-// AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVX512VPOPCNTDQ() bool {
- return cpu.X86.HasAVX512VPOPCNTDQ
-}
-
-// AVXVNNI returns whether the CPU supports the AVXVNNI feature.
-//
-// AVXVNNI is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) AVXVNNI() bool {
- return cpu.X86.HasAVXVNNI
-}
-
-// SHA returns whether the CPU supports the SHA feature.
-//
-// SHA is defined on all GOARCHes, but will only return true on
-// GOARCH amd64.
-func (X86Features) SHA() bool {
- return cpu.X86.HasSHA
-}
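These accessors exist so callers can gate wider kernels at run time, which is exactly how the tests later in this patch use simd.X86.AVX512(). A minimal sketch, assuming a GOEXPERIMENT=simd toolchain and the pre-move "simd" import path this file still uses:

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		if simd.X86.AVX512() {
			fmt.Println("512-bit path available")
		} else {
			fmt.Println("falling back to 128/256-bit kernels")
		}
	}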
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build amd64
-
-// Empty file to allow bodyless functions.
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-// This exposes some internal interfaces to simd_test.
-
-package simd
-
-func (x Int64x2) ExportTestConcatSelectedConstant(indices uint8, y Int64x2) Int64x2 {
- return x.concatSelectedConstant(indices, y)
-}
-
-func (x Float64x4) ExportTestConcatSelectedConstantGrouped(indices uint8, y Float64x4) Float64x4 {
- return x.concatSelectedConstantGrouped(indices, y)
-}
-
-func (x Float32x4) ExportTestConcatSelectedConstant(indices uint8, y Float32x4) Float32x4 {
- return x.concatSelectedConstant(indices, y)
-}
-
-func (x Int32x4) ExportTestConcatSelectedConstant(indices uint8, y Int32x4) Int32x4 {
- return x.concatSelectedConstant(indices, y)
-}
-
-func (x Uint32x8) ExportTestConcatSelectedConstantGrouped(indices uint8, y Uint32x8) Uint32x8 {
- return x.concatSelectedConstantGrouped(indices, y)
-}
-
-func (x Int32x8) ExportTestConcatSelectedConstantGrouped(indices uint8, y Int32x8) Int32x8 {
- return x.concatSelectedConstantGrouped(indices, y)
-}
-
-func (x Int32x8) ExportTestTern(table uint8, y Int32x8, z Int32x8) Int32x8 {
- return x.tern(table, y, z)
-}
-
-func (x Int32x4) ExportTestTern(table uint8, y Int32x4, z Int32x4) Int32x4 {
- return x.tern(table, y, z)
-}
-
-func ExportTestCscImm4(a, b, c, d uint8) uint8 {
- return cscimm4(a, b, c, d)
-}
-
-const (
- LLLL = _LLLL
- HLLL = _HLLL
- LHLL = _LHLL
- HHLL = _HHLL
- LLHL = _LLHL
- HLHL = _HLHL
- LHHL = _LHHL
- HHHL = _HHHL
- LLLH = _LLLH
- HLLH = _HLLH
- LHLH = _LHLH
- HHLH = _HHLH
- LLHH = _LLHH
- HLHH = _HLHH
- LHHH = _LHHH
- HHHH = _HHHH
-)
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd
-
-// ClearAVXUpperBits clears the high bits of Y0-Y15 and Z0-Z15 registers.
-// It is intended for transitioning from AVX to SSE, eliminating the
-// performance penalties caused by false dependencies.
-//
-// Note: in the future the compiler may automatically generate the
-// instruction, making this function unnecessary.
-//
-// Asm: VZEROUPPER, CPU Feature: AVX
-func ClearAVXUpperBits()
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int8x16) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int8x32) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int16x8) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int16x16) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int32x4) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int32x8) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int64x2) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Int64x4) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint8x16) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint8x32) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint16x8) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint16x16) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint32x4) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint32x8) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint64x2) IsZero() bool
-
-// IsZero returns true if all elements of x are zeros.
-//
-// This method compiles to VPTEST x, x.
-// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y
-//
-// Asm: VPTEST, CPU Feature: AVX
-func (x Uint64x4) IsZero() bool
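The And/IsZero pairing called out in the comments above, which the compiler folds into a single VPTEST x, y, is the idiomatic way to check that two vectors have no set bits in common. A minimal sketch, assuming the bitwise And method that the IsZero documentation references and the pre-move "simd" import path:

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		x := simd.LoadUint64x2Slice([]uint64{0b0011, 0})
		y := simd.LoadUint64x2Slice([]uint64{0b1100, 0})
		// Per the comment above, x.And(y).IsZero() compiles to VPTEST x, y.
		fmt.Println(x.And(y).IsZero()) // true: no common set bits
	}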
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd
-
-package simd
-
-// Invoke code generators.
-
-//go:generate go run -C _gen . -tmplgen -simdgen
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing binary simd methods.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// testInt8x16Binary tests the simd binary method f against the expected behavior generated by want
-func testInt8x16Binary(t *testing.T, f func(_, _ simd.Int8x16) simd.Int8x16, want func(_, _ []int8) []int8) {
- n := 16
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- b := simd.LoadInt8x16Slice(y)
- g := make([]int8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt16x8Binary tests the simd binary method f against the expected behavior generated by want
-func testInt16x8Binary(t *testing.T, f func(_, _ simd.Int16x8) simd.Int16x8, want func(_, _ []int16) []int16) {
- n := 8
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- b := simd.LoadInt16x8Slice(y)
- g := make([]int16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x4Binary tests the simd binary method f against the expected behavior generated by want
-func testInt32x4Binary(t *testing.T, f func(_, _ simd.Int32x4) simd.Int32x4, want func(_, _ []int32) []int32) {
- n := 4
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- b := simd.LoadInt32x4Slice(y)
- g := make([]int32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x2Binary tests the simd binary method f against the expected behavior generated by want
-func testInt64x2Binary(t *testing.T, f func(_, _ simd.Int64x2) simd.Int64x2, want func(_, _ []int64) []int64) {
- n := 2
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x2Slice(x)
- b := simd.LoadInt64x2Slice(y)
- g := make([]int64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x16Binary tests the simd binary method f against the expected behavior generated by want
-func testUint8x16Binary(t *testing.T, f func(_, _ simd.Uint8x16) simd.Uint8x16, want func(_, _ []uint8) []uint8) {
- n := 16
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- b := simd.LoadUint8x16Slice(y)
- g := make([]uint8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x8Binary tests the simd binary method f against the expected behavior generated by want
-func testUint16x8Binary(t *testing.T, f func(_, _ simd.Uint16x8) simd.Uint16x8, want func(_, _ []uint16) []uint16) {
- n := 8
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- b := simd.LoadUint16x8Slice(y)
- g := make([]uint16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x4Binary tests the simd binary method f against the expected behavior generated by want
-func testUint32x4Binary(t *testing.T, f func(_, _ simd.Uint32x4) simd.Uint32x4, want func(_, _ []uint32) []uint32) {
- n := 4
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- b := simd.LoadUint32x4Slice(y)
- g := make([]uint32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x2Binary tests the simd binary method f against the expected behavior generated by want
-func testUint64x2Binary(t *testing.T, f func(_, _ simd.Uint64x2) simd.Uint64x2, want func(_, _ []uint64) []uint64) {
- n := 2
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x2Slice(x)
- b := simd.LoadUint64x2Slice(y)
- g := make([]uint64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x4Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat32x4Binary(t *testing.T, f func(_, _ simd.Float32x4) simd.Float32x4, want func(_, _ []float32) []float32) {
- n := 4
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- b := simd.LoadFloat32x4Slice(y)
- g := make([]float32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x2Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat64x2Binary(t *testing.T, f func(_, _ simd.Float64x2) simd.Float64x2, want func(_, _ []float64) []float64) {
- n := 2
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- b := simd.LoadFloat64x2Slice(y)
- g := make([]float64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt8x32Binary tests the simd binary method f against the expected behavior generated by want
-func testInt8x32Binary(t *testing.T, f func(_, _ simd.Int8x32) simd.Int8x32, want func(_, _ []int8) []int8) {
- n := 32
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- b := simd.LoadInt8x32Slice(y)
- g := make([]int8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt16x16Binary tests the simd binary method f against the expected behavior generated by want
-func testInt16x16Binary(t *testing.T, f func(_, _ simd.Int16x16) simd.Int16x16, want func(_, _ []int16) []int16) {
- n := 16
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- b := simd.LoadInt16x16Slice(y)
- g := make([]int16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x8Binary tests the simd binary method f against the expected behavior generated by want
-func testInt32x8Binary(t *testing.T, f func(_, _ simd.Int32x8) simd.Int32x8, want func(_, _ []int32) []int32) {
- n := 8
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- b := simd.LoadInt32x8Slice(y)
- g := make([]int32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x4Binary tests the simd binary method f against the expected behavior generated by want
-func testInt64x4Binary(t *testing.T, f func(_, _ simd.Int64x4) simd.Int64x4, want func(_, _ []int64) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- b := simd.LoadInt64x4Slice(y)
- g := make([]int64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x32Binary tests the simd binary method f against the expected behavior generated by want
-func testUint8x32Binary(t *testing.T, f func(_, _ simd.Uint8x32) simd.Uint8x32, want func(_, _ []uint8) []uint8) {
- n := 32
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- b := simd.LoadUint8x32Slice(y)
- g := make([]uint8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x16Binary tests the simd binary method f against the expected behavior generated by want
-func testUint16x16Binary(t *testing.T, f func(_, _ simd.Uint16x16) simd.Uint16x16, want func(_, _ []uint16) []uint16) {
- n := 16
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- b := simd.LoadUint16x16Slice(y)
- g := make([]uint16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x8Binary tests the simd binary method f against the expected behavior generated by want
-func testUint32x8Binary(t *testing.T, f func(_, _ simd.Uint32x8) simd.Uint32x8, want func(_, _ []uint32) []uint32) {
- n := 8
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- b := simd.LoadUint32x8Slice(y)
- g := make([]uint32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x4Binary tests the simd binary method f against the expected behavior generated by want
-func testUint64x4Binary(t *testing.T, f func(_, _ simd.Uint64x4) simd.Uint64x4, want func(_, _ []uint64) []uint64) {
- n := 4
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- b := simd.LoadUint64x4Slice(y)
- g := make([]uint64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x8Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat32x8Binary(t *testing.T, f func(_, _ simd.Float32x8) simd.Float32x8, want func(_, _ []float32) []float32) {
- n := 8
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- b := simd.LoadFloat32x8Slice(y)
- g := make([]float32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x4Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat64x4Binary(t *testing.T, f func(_, _ simd.Float64x4) simd.Float64x4, want func(_, _ []float64) []float64) {
- n := 4
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- b := simd.LoadFloat64x4Slice(y)
- g := make([]float64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt8x64Binary tests the simd binary method f against the expected behavior generated by want
-func testInt8x64Binary(t *testing.T, f func(_, _ simd.Int8x64) simd.Int8x64, want func(_, _ []int8) []int8) {
- n := 64
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x64Slice(x)
- b := simd.LoadInt8x64Slice(y)
- g := make([]int8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt16x32Binary tests the simd binary method f against the expected behavior generated by want
-func testInt16x32Binary(t *testing.T, f func(_, _ simd.Int16x32) simd.Int16x32, want func(_, _ []int16) []int16) {
- n := 32
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- b := simd.LoadInt16x32Slice(y)
- g := make([]int16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x16Binary tests the simd binary method f against the expected behavior generated by want
-func testInt32x16Binary(t *testing.T, f func(_, _ simd.Int32x16) simd.Int32x16, want func(_, _ []int32) []int32) {
- n := 16
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- b := simd.LoadInt32x16Slice(y)
- g := make([]int32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x8Binary tests the simd binary method f against the expected behavior generated by want
-func testInt64x8Binary(t *testing.T, f func(_, _ simd.Int64x8) simd.Int64x8, want func(_, _ []int64) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- b := simd.LoadInt64x8Slice(y)
- g := make([]int64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x64Binary tests the simd binary method f against the expected behavior generated by want
-func testUint8x64Binary(t *testing.T, f func(_, _ simd.Uint8x64) simd.Uint8x64, want func(_, _ []uint8) []uint8) {
- n := 64
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x64Slice(x)
- b := simd.LoadUint8x64Slice(y)
- g := make([]uint8, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x32Binary tests the simd binary method f against the expected behavior generated by want
-func testUint16x32Binary(t *testing.T, f func(_, _ simd.Uint16x32) simd.Uint16x32, want func(_, _ []uint16) []uint16) {
- n := 32
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- b := simd.LoadUint16x32Slice(y)
- g := make([]uint16, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x16Binary tests the simd binary method f against the expected behavior generated by want
-func testUint32x16Binary(t *testing.T, f func(_, _ simd.Uint32x16) simd.Uint32x16, want func(_, _ []uint32) []uint32) {
- n := 16
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- b := simd.LoadUint32x16Slice(y)
- g := make([]uint32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x8Binary tests the simd binary method f against the expected behavior generated by want
-func testUint64x8Binary(t *testing.T, f func(_, _ simd.Uint64x8) simd.Uint64x8, want func(_, _ []uint64) []uint64) {
- n := 8
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- b := simd.LoadUint64x8Slice(y)
- g := make([]uint64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x16Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat32x16Binary(t *testing.T, f func(_, _ simd.Float32x16) simd.Float32x16, want func(_, _ []float32) []float32) {
- n := 16
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- b := simd.LoadFloat32x16Slice(y)
- g := make([]float32, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x8Binary tests the simd binary method f against the expected behavior generated by want
-func testFloat64x8Binary(t *testing.T, f func(_, _ simd.Float64x8) simd.Float64x8, want func(_, _ []float64) []float64) {
- n := 8
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- b := simd.LoadFloat64x8Slice(y)
- g := make([]float64, n)
- f(a, b).StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
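The want functions passed to these helpers (addSlice, subSlice, maxSlice, minSlice in the tests that follow) are defined elsewhere in the test package and are not part of this file. A plausible shape for the element-wise reference implementation, shown here only as a hypothetical sketch:

	package simd_test

	// addSlice is a hypothetical sketch of the element-wise reference used by
	// TestAdd below; the real helper lives elsewhere in this test package.
	func addSlice[T int8 | int16 | int32 | int64 |
		uint8 | uint16 | uint32 | uint64 | float32 | float64](x, y []T) []T {
		z := make([]T, len(x))
		for i := range x {
			z[i] = x[i] + y[i]
		}
		return z
	}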
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-func TestAdd(t *testing.T) {
- testFloat32x4Binary(t, simd.Float32x4.Add, addSlice[float32])
- testFloat32x8Binary(t, simd.Float32x8.Add, addSlice[float32])
- testFloat64x2Binary(t, simd.Float64x2.Add, addSlice[float64])
- testFloat64x4Binary(t, simd.Float64x4.Add, addSlice[float64])
-
- testInt16x16Binary(t, simd.Int16x16.Add, addSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Add, addSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Add, addSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Add, addSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.Add, addSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Add, addSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.Add, addSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Add, addSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Add, addSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Add, addSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Add, addSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Add, addSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.Add, addSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Add, addSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.Add, addSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Add, addSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Binary(t, simd.Float32x16.Add, addSlice[float32])
- testFloat64x8Binary(t, simd.Float64x8.Add, addSlice[float64])
- testInt8x64Binary(t, simd.Int8x64.Add, addSlice[int8])
- testInt16x32Binary(t, simd.Int16x32.Add, addSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.Add, addSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Add, addSlice[int64])
- testUint8x64Binary(t, simd.Uint8x64.Add, addSlice[uint8])
- testUint16x32Binary(t, simd.Uint16x32.Add, addSlice[uint16])
- testUint32x16Binary(t, simd.Uint32x16.Add, addSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Add, addSlice[uint64])
- }
-}
-
-func TestSub(t *testing.T) {
- testFloat32x4Binary(t, simd.Float32x4.Sub, subSlice[float32])
- testFloat32x8Binary(t, simd.Float32x8.Sub, subSlice[float32])
- testFloat64x2Binary(t, simd.Float64x2.Sub, subSlice[float64])
- testFloat64x4Binary(t, simd.Float64x4.Sub, subSlice[float64])
-
- testInt16x16Binary(t, simd.Int16x16.Sub, subSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Sub, subSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Sub, subSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Sub, subSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.Sub, subSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Sub, subSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.Sub, subSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Sub, subSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Sub, subSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Sub, subSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Sub, subSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Sub, subSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.Sub, subSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Sub, subSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.Sub, subSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Sub, subSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Binary(t, simd.Float32x16.Sub, subSlice[float32])
- testFloat64x8Binary(t, simd.Float64x8.Sub, subSlice[float64])
- testInt8x64Binary(t, simd.Int8x64.Sub, subSlice[int8])
- testInt16x32Binary(t, simd.Int16x32.Sub, subSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.Sub, subSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Sub, subSlice[int64])
- testUint8x64Binary(t, simd.Uint8x64.Sub, subSlice[uint8])
- testUint16x32Binary(t, simd.Uint16x32.Sub, subSlice[uint16])
- testUint32x16Binary(t, simd.Uint32x16.Sub, subSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Sub, subSlice[uint64])
- }
-}
-
-func TestMax(t *testing.T) {
-	// testFloat32x4Binary(t, simd.Float32x4.Max, maxSlice[float32]) // NaN handling differs from maxSlice
-	// testFloat32x8Binary(t, simd.Float32x8.Max, maxSlice[float32]) // NaN handling differs from maxSlice
-	// testFloat64x2Binary(t, simd.Float64x2.Max, maxSlice[float64]) // NaN handling differs from maxSlice
-	// testFloat64x4Binary(t, simd.Float64x4.Max, maxSlice[float64]) // NaN handling differs from maxSlice
-
- testInt16x16Binary(t, simd.Int16x16.Max, maxSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Max, maxSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Max, maxSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Max, maxSlice[int32])
-
- if simd.X86.AVX512() {
- testInt64x2Binary(t, simd.Int64x2.Max, maxSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Max, maxSlice[int64])
- }
-
- testInt8x16Binary(t, simd.Int8x16.Max, maxSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Max, maxSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Max, maxSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Max, maxSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Max, maxSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Max, maxSlice[uint32])
-
- if simd.X86.AVX512() {
- testUint64x2Binary(t, simd.Uint64x2.Max, maxSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Max, maxSlice[uint64])
- }
-
- testUint8x16Binary(t, simd.Uint8x16.Max, maxSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Max, maxSlice[uint8])
-
- if simd.X86.AVX512() {
-		// testFloat32x16Binary(t, simd.Float32x16.Max, maxSlice[float32]) // NaN handling differs from maxSlice
-		// testFloat64x8Binary(t, simd.Float64x8.Max, maxSlice[float64]) // NaN handling differs from maxSlice
- testInt8x64Binary(t, simd.Int8x64.Max, maxSlice[int8])
- testInt16x32Binary(t, simd.Int16x32.Max, maxSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.Max, maxSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Max, maxSlice[int64])
- testUint8x64Binary(t, simd.Uint8x64.Max, maxSlice[uint8])
- testUint16x32Binary(t, simd.Uint16x32.Max, maxSlice[uint16])
- testUint32x16Binary(t, simd.Uint32x16.Max, maxSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Max, maxSlice[uint64])
- }
-}
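
The "NaN handling" comments above are why the floating-point Max/Min cases stay disabled: x86 packed max/min resolves NaN (and +0/-0) ties by returning the second operand, which does not match a NaN-propagating slice reference. A minimal scalar sketch of that behavior, for illustration only (assumed VMAXPS-style semantics, not the simd package's implementation):

```go
package main

import (
	"fmt"
	"math"
)

// amd64Max mimics the VMAXPS-style rule "return the second operand unless the
// first compares strictly greater", which is what makes NaN results differ
// from a Go reference that propagates NaN.
func amd64Max(a, b float32) float32 {
	if a > b { // false whenever either operand is NaN
		return a
	}
	return b
}

func main() {
	nan := float32(math.NaN())
	fmt.Println(amd64Max(nan, 1)) // 1: the NaN input is silently dropped
	fmt.Println(amd64Max(1, nan)) // NaN: the second operand wins
}
```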
-
-func TestMin(t *testing.T) {
-	// testFloat32x4Binary(t, simd.Float32x4.Min, minSlice[float32]) // NaN handling differs from minSlice
-	// testFloat32x8Binary(t, simd.Float32x8.Min, minSlice[float32]) // NaN handling differs from minSlice
-	// testFloat64x2Binary(t, simd.Float64x2.Min, minSlice[float64]) // NaN handling differs from minSlice
-	// testFloat64x4Binary(t, simd.Float64x4.Min, minSlice[float64]) // NaN handling differs from minSlice
-
- testInt16x16Binary(t, simd.Int16x16.Min, minSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Min, minSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Min, minSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Min, minSlice[int32])
-
- if simd.X86.AVX512() {
- testInt64x2Binary(t, simd.Int64x2.Min, minSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Min, minSlice[int64])
- }
-
- testInt8x16Binary(t, simd.Int8x16.Min, minSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Min, minSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Min, minSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Min, minSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Min, minSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Min, minSlice[uint32])
-
- if simd.X86.AVX512() {
- testUint64x2Binary(t, simd.Uint64x2.Min, minSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Min, minSlice[uint64])
- }
-
- testUint8x16Binary(t, simd.Uint8x16.Min, minSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Min, minSlice[uint8])
-
- if simd.X86.AVX512() {
-		// testFloat32x16Binary(t, simd.Float32x16.Min, minSlice[float32]) // NaN handling differs from minSlice
-		// testFloat64x8Binary(t, simd.Float64x8.Min, minSlice[float64]) // NaN handling differs from minSlice
- testInt8x64Binary(t, simd.Int8x64.Min, minSlice[int8])
- testInt16x32Binary(t, simd.Int16x32.Min, minSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.Min, minSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Min, minSlice[int64])
- testUint8x64Binary(t, simd.Uint8x64.Min, minSlice[uint8])
- testUint16x32Binary(t, simd.Uint16x32.Min, minSlice[uint16])
- testUint32x16Binary(t, simd.Uint32x16.Min, minSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Min, minSlice[uint64])
- }
-}
-
-func TestAnd(t *testing.T) {
- testInt16x16Binary(t, simd.Int16x16.And, andSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.And, andSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.And, andSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.And, andSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.And, andSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.And, andSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.And, andSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.And, andSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.And, andSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.And, andSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.And, andSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.And, andSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.And, andSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.And, andSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.And, andSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.And, andSlice[uint8])
-
- if simd.X86.AVX512() {
-		// testInt8x64Binary(t, simd.Int8x64.And, andSlice[int8]) // missing
-		// testInt16x32Binary(t, simd.Int16x32.And, andSlice[int16]) // missing
- testInt32x16Binary(t, simd.Int32x16.And, andSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.And, andSlice[int64])
-		// testUint8x64Binary(t, simd.Uint8x64.And, andSlice[uint8]) // missing
-		// testUint16x32Binary(t, simd.Uint16x32.And, andSlice[uint16]) // missing
- testUint32x16Binary(t, simd.Uint32x16.And, andSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.And, andSlice[uint64])
- }
-}
-
-func TestAndNot(t *testing.T) {
- testInt16x16Binary(t, simd.Int16x16.AndNot, andNotSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.AndNot, andNotSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.AndNot, andNotSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.AndNot, andNotSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.AndNot, andNotSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.AndNot, andNotSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.AndNot, andNotSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.AndNot, andNotSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.AndNot, andNotSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.AndNot, andNotSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.AndNot, andNotSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.AndNot, andNotSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.AndNot, andNotSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.AndNot, andNotSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.AndNot, andNotSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.AndNot, andNotSlice[uint8])
-
- if simd.X86.AVX512() {
- testInt8x64Binary(t, simd.Int8x64.AndNot, andNotSlice[int8])
- testInt16x32Binary(t, simd.Int16x32.AndNot, andNotSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.AndNot, andNotSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.AndNot, andNotSlice[int64])
- testUint8x64Binary(t, simd.Uint8x64.AndNot, andNotSlice[uint8])
- testUint16x32Binary(t, simd.Uint16x32.AndNot, andNotSlice[uint16])
- testUint32x16Binary(t, simd.Uint32x16.AndNot, andNotSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.AndNot, andNotSlice[uint64])
- }
-}
-
-func TestXor(t *testing.T) {
- testInt16x16Binary(t, simd.Int16x16.Xor, xorSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Xor, xorSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Xor, xorSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Xor, xorSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.Xor, xorSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Xor, xorSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.Xor, xorSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Xor, xorSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Xor, xorSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Xor, xorSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Xor, xorSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Xor, xorSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.Xor, xorSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Xor, xorSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.Xor, xorSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Xor, xorSlice[uint8])
-
- if simd.X86.AVX512() {
-		// testInt8x64Binary(t, simd.Int8x64.Xor, xorSlice[int8]) // missing
-		// testInt16x32Binary(t, simd.Int16x32.Xor, xorSlice[int16]) // missing
- testInt32x16Binary(t, simd.Int32x16.Xor, xorSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Xor, xorSlice[int64])
-		// testUint8x64Binary(t, simd.Uint8x64.Xor, xorSlice[uint8]) // missing
-		// testUint16x32Binary(t, simd.Uint16x32.Xor, xorSlice[uint16]) // missing
- testUint32x16Binary(t, simd.Uint32x16.Xor, xorSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Xor, xorSlice[uint64])
- }
-}
-
-func TestOr(t *testing.T) {
- testInt16x16Binary(t, simd.Int16x16.Or, orSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Or, orSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Or, orSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Or, orSlice[int32])
- testInt64x2Binary(t, simd.Int64x2.Or, orSlice[int64])
- testInt64x4Binary(t, simd.Int64x4.Or, orSlice[int64])
- testInt8x16Binary(t, simd.Int8x16.Or, orSlice[int8])
- testInt8x32Binary(t, simd.Int8x32.Or, orSlice[int8])
-
- testUint16x16Binary(t, simd.Uint16x16.Or, orSlice[uint16])
- testUint16x8Binary(t, simd.Uint16x8.Or, orSlice[uint16])
- testUint32x4Binary(t, simd.Uint32x4.Or, orSlice[uint32])
- testUint32x8Binary(t, simd.Uint32x8.Or, orSlice[uint32])
- testUint64x2Binary(t, simd.Uint64x2.Or, orSlice[uint64])
- testUint64x4Binary(t, simd.Uint64x4.Or, orSlice[uint64])
- testUint8x16Binary(t, simd.Uint8x16.Or, orSlice[uint8])
- testUint8x32Binary(t, simd.Uint8x32.Or, orSlice[uint8])
-
- if simd.X86.AVX512() {
-		// testInt8x64Binary(t, simd.Int8x64.Or, orSlice[int8]) // missing
-		// testInt16x32Binary(t, simd.Int16x32.Or, orSlice[int16]) // missing
- testInt32x16Binary(t, simd.Int32x16.Or, orSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Or, orSlice[int64])
-		// testUint8x64Binary(t, simd.Uint8x64.Or, orSlice[uint8]) // missing
-		// testUint16x32Binary(t, simd.Uint16x32.Or, orSlice[uint16]) // missing
- testUint32x16Binary(t, simd.Uint32x16.Or, orSlice[uint32])
- testUint64x8Binary(t, simd.Uint64x8.Or, orSlice[uint64])
- }
-}
-
-func TestMul(t *testing.T) {
- testFloat32x4Binary(t, simd.Float32x4.Mul, mulSlice[float32])
- testFloat32x8Binary(t, simd.Float32x8.Mul, mulSlice[float32])
- testFloat64x2Binary(t, simd.Float64x2.Mul, mulSlice[float64])
- testFloat64x4Binary(t, simd.Float64x4.Mul, mulSlice[float64])
-
- testInt16x16Binary(t, simd.Int16x16.Mul, mulSlice[int16])
- testInt16x8Binary(t, simd.Int16x8.Mul, mulSlice[int16])
- testInt32x4Binary(t, simd.Int32x4.Mul, mulSlice[int32])
- testInt32x8Binary(t, simd.Int32x8.Mul, mulSlice[int32])
-
-	// testInt8x16Binary(t, simd.Int8x16.Mul, mulSlice[int8]) // no 8-bit vector multiply
- // testInt8x32Binary(t, simd.Int8x32.Mul, mulSlice[int8])
-
-	// TODO we should be able to do these; there's no difference between signed and unsigned Mul.
- // testUint16x16Binary(t, simd.Uint16x16.Mul, mulSlice[uint16])
- // testUint16x8Binary(t, simd.Uint16x8.Mul, mulSlice[uint16])
- // testUint32x4Binary(t, simd.Uint32x4.Mul, mulSlice[uint32])
- // testUint32x8Binary(t, simd.Uint32x8.Mul, mulSlice[uint32])
- // testUint64x2Binary(t, simd.Uint64x2.Mul, mulSlice[uint64])
- // testUint64x4Binary(t, simd.Uint64x4.Mul, mulSlice[uint64])
-
-	// testUint8x16Binary(t, simd.Uint8x16.Mul, mulSlice[uint8]) // no 8-bit vector multiply
- // testUint8x32Binary(t, simd.Uint8x32.Mul, mulSlice[uint8])
-
- if simd.X86.AVX512() {
- testInt64x2Binary(t, simd.Int64x2.Mul, mulSlice[int64]) // avx512 only
- testInt64x4Binary(t, simd.Int64x4.Mul, mulSlice[int64])
-
- testFloat32x16Binary(t, simd.Float32x16.Mul, mulSlice[float32])
- testFloat64x8Binary(t, simd.Float64x8.Mul, mulSlice[float64])
-
-		// testInt8x64Binary(t, simd.Int8x64.Mul, mulSlice[int8]) // no 8-bit vector multiply
- testInt16x32Binary(t, simd.Int16x32.Mul, mulSlice[int16])
- testInt32x16Binary(t, simd.Int32x16.Mul, mulSlice[int32])
- testInt64x8Binary(t, simd.Int64x8.Mul, mulSlice[int64])
-		// testUint8x64Binary(t, simd.Uint8x64.Mul, mulSlice[uint8]) // no 8-bit vector multiply
-
-		// TODO the signed versions should do the job.
- // testUint16x32Binary(t, simd.Uint16x32.Mul, mulSlice[uint16])
- // testUint32x16Binary(t, simd.Uint32x16.Mul, mulSlice[uint32])
- // testUint64x8Binary(t, simd.Uint64x8.Mul, mulSlice[uint64])
- }
-}
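
On the TODO above: for the wrap-around (truncating) multiplication these methods perform, the low-order bits of a signed and an unsigned product are identical in two's complement, so the signed reference could in principle validate the unsigned Mul methods as well. A standalone illustration (hypothetical check, not part of the suite):

```go
package main

import "fmt"

func main() {
	a, b := uint16(0xFFF3), uint16(0x00B5)
	s := uint16(int16(a) * int16(b)) // multiply as signed, reinterpret the bits
	u := a * b                       // multiply as unsigned
	fmt.Println(s == u)              // true: the low 16 bits are identical
}
```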
-
-func TestDiv(t *testing.T) {
- testFloat32x4Binary(t, simd.Float32x4.Div, divSlice[float32])
- testFloat32x8Binary(t, simd.Float32x8.Div, divSlice[float32])
- testFloat64x2Binary(t, simd.Float64x2.Div, divSlice[float64])
- testFloat64x4Binary(t, simd.Float64x4.Div, divSlice[float64])
-
- if simd.X86.AVX512() {
- testFloat32x16Binary(t, simd.Float32x16.Div, divSlice[float32])
- testFloat64x8Binary(t, simd.Float64x8.Div, divSlice[float64])
- }
-}
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing simd methods that compare two operands.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// testInt8x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt8x16Compare(t *testing.T, f func(_, _ simd.Int8x16) simd.Mask8x16, want func(_, _ []int8) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- b := simd.LoadInt8x16Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
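
Each helper reinterprets the returned mask as signed integers and compares it (via s64) against the []int64 produced by want, so a matching reference is expected to yield -1 for lanes where the predicate holds and 0 elsewhere. The actual generators such as lessSlice are defined elsewhere in the suite; a sketch of the expected shape, under that assumption:

```go
// lessSliceSketch shows the assumed shape of a comparison reference like
// lessSlice: -1 (all bits set) where the predicate holds, 0 otherwise, so it
// lines up with the mask lanes reinterpreted as integers.
func lessSliceSketch[T int8 | int16 | int32 | int64 |
	uint8 | uint16 | uint32 | uint64 | float32 | float64](x, y []T) []int64 {
	w := make([]int64, len(x))
	for i := range x {
		if x[i] < y[i] {
			w[i] = -1
		}
	}
	return w
}
```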
-
-// testInt16x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt16x8Compare(t *testing.T, f func(_, _ simd.Int16x8) simd.Mask16x8, want func(_, _ []int16) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- b := simd.LoadInt16x8Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt32x4Compare(t *testing.T, f func(_, _ simd.Int32x4) simd.Mask32x4, want func(_, _ []int32) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- b := simd.LoadInt32x4Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x2Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt64x2Compare(t *testing.T, f func(_, _ simd.Int64x2) simd.Mask64x2, want func(_, _ []int64) []int64) {
- n := 2
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x2Slice(x)
- b := simd.LoadInt64x2Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint8x16Compare(t *testing.T, f func(_, _ simd.Uint8x16) simd.Mask8x16, want func(_, _ []uint8) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- b := simd.LoadUint8x16Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint16x8Compare(t *testing.T, f func(_, _ simd.Uint16x8) simd.Mask16x8, want func(_, _ []uint16) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- b := simd.LoadUint16x8Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint32x4Compare(t *testing.T, f func(_, _ simd.Uint32x4) simd.Mask32x4, want func(_, _ []uint32) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- b := simd.LoadUint32x4Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x2Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint64x2Compare(t *testing.T, f func(_, _ simd.Uint64x2) simd.Mask64x2, want func(_, _ []uint64) []int64) {
- n := 2
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x2Slice(x)
- b := simd.LoadUint64x2Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat32x4Compare(t *testing.T, f func(_, _ simd.Float32x4) simd.Mask32x4, want func(_, _ []float32) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- b := simd.LoadFloat32x4Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x2Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat64x2Compare(t *testing.T, f func(_, _ simd.Float64x2) simd.Mask64x2, want func(_, _ []float64) []int64) {
- n := 2
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- b := simd.LoadFloat64x2Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt8x32Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt8x32Compare(t *testing.T, f func(_, _ simd.Int8x32) simd.Mask8x32, want func(_, _ []int8) []int64) {
- n := 32
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- b := simd.LoadInt8x32Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x32().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt16x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt16x16Compare(t *testing.T, f func(_, _ simd.Int16x16) simd.Mask16x16, want func(_, _ []int16) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- b := simd.LoadInt16x16Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt32x8Compare(t *testing.T, f func(_, _ simd.Int32x8) simd.Mask32x8, want func(_, _ []int32) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- b := simd.LoadInt32x8Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt64x4Compare(t *testing.T, f func(_, _ simd.Int64x4) simd.Mask64x4, want func(_, _ []int64) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- b := simd.LoadInt64x4Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x32Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint8x32Compare(t *testing.T, f func(_, _ simd.Uint8x32) simd.Mask8x32, want func(_, _ []uint8) []int64) {
- n := 32
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- b := simd.LoadUint8x32Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x32().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint16x16Compare(t *testing.T, f func(_, _ simd.Uint16x16) simd.Mask16x16, want func(_, _ []uint16) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- b := simd.LoadUint16x16Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint32x8Compare(t *testing.T, f func(_, _ simd.Uint32x8) simd.Mask32x8, want func(_, _ []uint32) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- b := simd.LoadUint32x8Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint64x4Compare(t *testing.T, f func(_, _ simd.Uint64x4) simd.Mask64x4, want func(_, _ []uint64) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- b := simd.LoadUint64x4Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat32x8Compare(t *testing.T, f func(_, _ simd.Float32x8) simd.Mask32x8, want func(_, _ []float32) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- b := simd.LoadFloat32x8Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x4Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat64x4Compare(t *testing.T, f func(_, _ simd.Float64x4) simd.Mask64x4, want func(_, _ []float64) []int64) {
- n := 4
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- b := simd.LoadFloat64x4Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt8x64Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt8x64Compare(t *testing.T, f func(_, _ simd.Int8x64) simd.Mask8x64, want func(_, _ []int8) []int64) {
- n := 64
- t.Helper()
- forSlicePair(t, int8s, n, func(x, y []int8) bool {
- t.Helper()
- a := simd.LoadInt8x64Slice(x)
- b := simd.LoadInt8x64Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x64().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt16x32Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt16x32Compare(t *testing.T, f func(_, _ simd.Int16x32) simd.Mask16x32, want func(_, _ []int16) []int64) {
- n := 32
- t.Helper()
- forSlicePair(t, int16s, n, func(x, y []int16) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- b := simd.LoadInt16x32Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x32().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt32x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt32x16Compare(t *testing.T, f func(_, _ simd.Int32x16) simd.Mask32x16, want func(_, _ []int32) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, int32s, n, func(x, y []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- b := simd.LoadInt32x16Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testInt64x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testInt64x8Compare(t *testing.T, f func(_, _ simd.Int64x8) simd.Mask64x8, want func(_, _ []int64) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, int64s, n, func(x, y []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- b := simd.LoadInt64x8Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint8x64Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint8x64Compare(t *testing.T, f func(_, _ simd.Uint8x64) simd.Mask8x64, want func(_, _ []uint8) []int64) {
- n := 64
- t.Helper()
- forSlicePair(t, uint8s, n, func(x, y []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x64Slice(x)
- b := simd.LoadUint8x64Slice(y)
- g := make([]int8, n)
- f(a, b).AsInt8x64().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint16x32Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint16x32Compare(t *testing.T, f func(_, _ simd.Uint16x32) simd.Mask16x32, want func(_, _ []uint16) []int64) {
- n := 32
- t.Helper()
- forSlicePair(t, uint16s, n, func(x, y []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- b := simd.LoadUint16x32Slice(y)
- g := make([]int16, n)
- f(a, b).AsInt16x32().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint32x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint32x16Compare(t *testing.T, f func(_, _ simd.Uint32x16) simd.Mask32x16, want func(_, _ []uint32) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, uint32s, n, func(x, y []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- b := simd.LoadUint32x16Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testUint64x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testUint64x8Compare(t *testing.T, f func(_, _ simd.Uint64x8) simd.Mask64x8, want func(_, _ []uint64) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, uint64s, n, func(x, y []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- b := simd.LoadUint64x8Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat32x16Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat32x16Compare(t *testing.T, f func(_, _ simd.Float32x16) simd.Mask32x16, want func(_, _ []float32) []int64) {
- n := 16
- t.Helper()
- forSlicePair(t, float32s, n, func(x, y []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- b := simd.LoadFloat32x16Slice(y)
- g := make([]int32, n)
- f(a, b).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
-
-// testFloat64x8Compare tests the simd comparison method f against the expected behavior generated by want
-func testFloat64x8Compare(t *testing.T, f func(_, _ simd.Float64x8) simd.Mask64x8, want func(_, _ []float64) []int64) {
- n := 8
- t.Helper()
- forSlicePair(t, float64s, n, func(x, y []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- b := simd.LoadFloat64x8Slice(y)
- g := make([]int64, n)
- f(a, b).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y) })
- })
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// AVX2 lacks most comparisons, but they can be synthesized
-// from > and ==.
-var comparisonFixed bool = simd.X86.AVX512()
-
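
The synthesis mentioned in the comment above needs only a handful of per-lane identities: the missing predicates reduce to > and ==, and unsigned order can reuse the signed comparison by flipping the sign bit. A scalar illustration of the assumed identities (not the package's actual lowering):

```go
// With only ">" and "==" available per lane:
//   a <  b  is  b > a
//   a <= b  is  !(a > b)
//   a != b  is  !(a == b)
// and an unsigned ">" can be built from the signed one by flipping sign bits.
func unsignedGreater8(a, b uint8) bool {
	return int8(a^0x80) > int8(b^0x80)
}
```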
-func TestLess(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.Less, lessSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.Less, lessSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.Less, lessSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.Less, lessSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.Less, lessSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.Less, lessSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.Less, lessSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.Less, lessSlice[int32])
- testInt64x2Compare(t, simd.Int64x2.Less, lessSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.Less, lessSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.Less, lessSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.Less, lessSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.Less, lessSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.Less, lessSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.Less, lessSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.Less, lessSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.Less, lessSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.Less, lessSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.Less, lessSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.Less, lessSlice[uint8])
-
- if simd.X86.AVX512() {
- testUint16x16Compare(t, simd.Uint16x16.Less, lessSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.Less, lessSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.Less, lessSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.Less, lessSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.Less, lessSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.Less, lessSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.Less, lessSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.Less, lessSlice[uint8])
-
- testFloat32x16Compare(t, simd.Float32x16.Less, lessSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.Less, lessSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.Less, lessSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.Less, lessSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.Less, lessSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.Less, lessSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.Less, lessSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.Less, lessSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.Less, lessSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.Less, lessSlice[uint64])
- }
-}
-
-func TestLessEqual(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.LessEqual, lessEqualSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.LessEqual, lessEqualSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.LessEqual, lessEqualSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.LessEqual, lessEqualSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.LessEqual, lessEqualSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.LessEqual, lessEqualSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.LessEqual, lessEqualSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.LessEqual, lessEqualSlice[int32])
- testInt64x2Compare(t, simd.Int64x2.LessEqual, lessEqualSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.LessEqual, lessEqualSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.LessEqual, lessEqualSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.LessEqual, lessEqualSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.LessEqual, lessEqualSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.LessEqual, lessEqualSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.LessEqual, lessEqualSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.LessEqual, lessEqualSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.LessEqual, lessEqualSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.LessEqual, lessEqualSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.LessEqual, lessEqualSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.LessEqual, lessEqualSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Compare(t, simd.Float32x16.LessEqual, lessEqualSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.LessEqual, lessEqualSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.LessEqual, lessEqualSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.LessEqual, lessEqualSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.LessEqual, lessEqualSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.LessEqual, lessEqualSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.LessEqual, lessEqualSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.LessEqual, lessEqualSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.LessEqual, lessEqualSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.LessEqual, lessEqualSlice[uint64])
- }
-}
-
-func TestGreater(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.Greater, greaterSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.Greater, greaterSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.Greater, greaterSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.Greater, greaterSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.Greater, greaterSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.Greater, greaterSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.Greater, greaterSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.Greater, greaterSlice[int32])
-
- testInt64x2Compare(t, simd.Int64x2.Greater, greaterSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.Greater, greaterSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.Greater, greaterSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.Greater, greaterSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.Greater, greaterSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.Greater, greaterSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.Greater, greaterSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.Greater, greaterSlice[uint32])
-
- testUint64x2Compare(t, simd.Uint64x2.Greater, greaterSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.Greater, greaterSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.Greater, greaterSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.Greater, greaterSlice[uint8])
-
- if simd.X86.AVX512() {
-
- testFloat32x16Compare(t, simd.Float32x16.Greater, greaterSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.Greater, greaterSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.Greater, greaterSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.Greater, greaterSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.Greater, greaterSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.Greater, greaterSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.Greater, greaterSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.Greater, greaterSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.Greater, greaterSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.Greater, greaterSlice[uint64])
- }
-}
-
-func TestGreaterEqual(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.GreaterEqual, greaterEqualSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.GreaterEqual, greaterEqualSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.GreaterEqual, greaterEqualSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.GreaterEqual, greaterEqualSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.GreaterEqual, greaterEqualSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.GreaterEqual, greaterEqualSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.GreaterEqual, greaterEqualSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.GreaterEqual, greaterEqualSlice[int32])
- testInt64x2Compare(t, simd.Int64x2.GreaterEqual, greaterEqualSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.GreaterEqual, greaterEqualSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.GreaterEqual, greaterEqualSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.GreaterEqual, greaterEqualSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.GreaterEqual, greaterEqualSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.GreaterEqual, greaterEqualSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.GreaterEqual, greaterEqualSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.GreaterEqual, greaterEqualSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.GreaterEqual, greaterEqualSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.GreaterEqual, greaterEqualSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.GreaterEqual, greaterEqualSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.GreaterEqual, greaterEqualSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Compare(t, simd.Float32x16.GreaterEqual, greaterEqualSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.GreaterEqual, greaterEqualSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.GreaterEqual, greaterEqualSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.GreaterEqual, greaterEqualSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.GreaterEqual, greaterEqualSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.GreaterEqual, greaterEqualSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.GreaterEqual, greaterEqualSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.GreaterEqual, greaterEqualSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.GreaterEqual, greaterEqualSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.GreaterEqual, greaterEqualSlice[uint64])
- }
-}
-
-func TestEqual(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.Equal, equalSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.Equal, equalSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.Equal, equalSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.Equal, equalSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.Equal, equalSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.Equal, equalSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.Equal, equalSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.Equal, equalSlice[int32])
- testInt64x2Compare(t, simd.Int64x2.Equal, equalSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.Equal, equalSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.Equal, equalSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.Equal, equalSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.Equal, equalSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.Equal, equalSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.Equal, equalSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.Equal, equalSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.Equal, equalSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.Equal, equalSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.Equal, equalSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.Equal, equalSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Compare(t, simd.Float32x16.Equal, equalSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.Equal, equalSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.Equal, equalSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.Equal, equalSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.Equal, equalSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.Equal, equalSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.Equal, equalSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.Equal, equalSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.Equal, equalSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.Equal, equalSlice[uint64])
- }
-}
-
-func TestNotEqual(t *testing.T) {
- testFloat32x4Compare(t, simd.Float32x4.NotEqual, notEqualSlice[float32])
- testFloat32x8Compare(t, simd.Float32x8.NotEqual, notEqualSlice[float32])
- testFloat64x2Compare(t, simd.Float64x2.NotEqual, notEqualSlice[float64])
- testFloat64x4Compare(t, simd.Float64x4.NotEqual, notEqualSlice[float64])
-
- testInt16x16Compare(t, simd.Int16x16.NotEqual, notEqualSlice[int16])
- testInt16x8Compare(t, simd.Int16x8.NotEqual, notEqualSlice[int16])
- testInt32x4Compare(t, simd.Int32x4.NotEqual, notEqualSlice[int32])
- testInt32x8Compare(t, simd.Int32x8.NotEqual, notEqualSlice[int32])
- testInt64x2Compare(t, simd.Int64x2.NotEqual, notEqualSlice[int64])
- testInt64x4Compare(t, simd.Int64x4.NotEqual, notEqualSlice[int64])
- testInt8x16Compare(t, simd.Int8x16.NotEqual, notEqualSlice[int8])
- testInt8x32Compare(t, simd.Int8x32.NotEqual, notEqualSlice[int8])
-
- testUint16x16Compare(t, simd.Uint16x16.NotEqual, notEqualSlice[uint16])
- testUint16x8Compare(t, simd.Uint16x8.NotEqual, notEqualSlice[uint16])
- testUint32x4Compare(t, simd.Uint32x4.NotEqual, notEqualSlice[uint32])
- testUint32x8Compare(t, simd.Uint32x8.NotEqual, notEqualSlice[uint32])
- testUint64x2Compare(t, simd.Uint64x2.NotEqual, notEqualSlice[uint64])
- testUint64x4Compare(t, simd.Uint64x4.NotEqual, notEqualSlice[uint64])
- testUint8x16Compare(t, simd.Uint8x16.NotEqual, notEqualSlice[uint8])
- testUint8x32Compare(t, simd.Uint8x32.NotEqual, notEqualSlice[uint8])
-
- if simd.X86.AVX512() {
- testFloat32x16Compare(t, simd.Float32x16.NotEqual, notEqualSlice[float32])
- testFloat64x8Compare(t, simd.Float64x8.NotEqual, notEqualSlice[float64])
- testInt8x64Compare(t, simd.Int8x64.NotEqual, notEqualSlice[int8])
- testInt16x32Compare(t, simd.Int16x32.NotEqual, notEqualSlice[int16])
- testInt32x16Compare(t, simd.Int32x16.NotEqual, notEqualSlice[int32])
- testInt64x8Compare(t, simd.Int64x8.NotEqual, notEqualSlice[int64])
- testUint8x64Compare(t, simd.Uint8x64.NotEqual, notEqualSlice[uint8])
- testUint16x32Compare(t, simd.Uint16x32.NotEqual, notEqualSlice[uint16])
- testUint32x16Compare(t, simd.Uint32x16.NotEqual, notEqualSlice[uint32])
- testUint64x8Compare(t, simd.Uint64x8.NotEqual, notEqualSlice[uint64])
- }
-}
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing simd methods that compare two operands under a mask.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// testInt8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt8x16CompareMasked(t *testing.T,
- f func(_, _ simd.Int8x16, m simd.Mask8x16) simd.Mask8x16,
- want func(_, _ []int8) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- b := simd.LoadInt8x16Slice(y)
- k := simd.LoadInt8x16Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
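
These masked helpers derive the mask by converting a []bool into a lane vector with toVect and loading it through ToMask; toVect itself is defined elsewhere in the suite. A sketch of the shape it presumably has, with true lanes mapped to -1 so every bit of the lane is set:

```go
// toVectSketch shows the assumed shape of the toVect helper used above:
// true lanes become -1 (all bits set) so the loaded vector converts cleanly
// to a mask; false lanes stay zero.
func toVectSketch[T int8 | int16 | int32 | int64](m []bool) []T {
	v := make([]T, len(m))
	for i, set := range m {
		if set {
			v[i] = -1
		}
	}
	return v
}
```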
-
-// testInt16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt16x8CompareMasked(t *testing.T,
- f func(_, _ simd.Int16x8, m simd.Mask16x8) simd.Mask16x8,
- want func(_, _ []int16) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- b := simd.LoadInt16x8Slice(y)
- k := simd.LoadInt16x8Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt32x4CompareMasked(t *testing.T,
- f func(_, _ simd.Int32x4, m simd.Mask32x4) simd.Mask32x4,
- want func(_, _ []int32) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- b := simd.LoadInt32x4Slice(y)
- k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt64x2CompareMasked(t *testing.T,
- f func(_, _ simd.Int64x2, m simd.Mask64x2) simd.Mask64x2,
- want func(_, _ []int64) []int64) {
- n := 2
- t.Helper()
- forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
- t.Helper()
- a := simd.LoadInt64x2Slice(x)
- b := simd.LoadInt64x2Slice(y)
- k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint8x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint8x16CompareMasked(t *testing.T,
- f func(_, _ simd.Uint8x16, m simd.Mask8x16) simd.Mask8x16,
- want func(_, _ []uint8) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- b := simd.LoadUint8x16Slice(y)
- k := simd.LoadInt8x16Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint16x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint16x8CompareMasked(t *testing.T,
- f func(_, _ simd.Uint16x8, m simd.Mask16x8) simd.Mask16x8,
- want func(_, _ []uint16) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- b := simd.LoadUint16x8Slice(y)
- k := simd.LoadInt16x8Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint32x4CompareMasked(t *testing.T,
- f func(_, _ simd.Uint32x4, m simd.Mask32x4) simd.Mask32x4,
- want func(_, _ []uint32) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- b := simd.LoadUint32x4Slice(y)
- k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint64x2CompareMasked(t *testing.T,
- f func(_, _ simd.Uint64x2, m simd.Mask64x2) simd.Mask64x2,
- want func(_, _ []uint64) []int64) {
- n := 2
- t.Helper()
- forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
- t.Helper()
- a := simd.LoadUint64x2Slice(x)
- b := simd.LoadUint64x2Slice(y)
- k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat32x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat32x4CompareMasked(t *testing.T,
- f func(_, _ simd.Float32x4, m simd.Mask32x4) simd.Mask32x4,
- want func(_, _ []float32) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- b := simd.LoadFloat32x4Slice(y)
- k := simd.LoadInt32x4Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat64x2CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat64x2CompareMasked(t *testing.T,
- f func(_, _ simd.Float64x2, m simd.Mask64x2) simd.Mask64x2,
- want func(_, _ []float64) []int64) {
- n := 2
- t.Helper()
- forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- b := simd.LoadFloat64x2Slice(y)
- k := simd.LoadInt64x2Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x2().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt8x32CompareMasked(t *testing.T,
- f func(_, _ simd.Int8x32, m simd.Mask8x32) simd.Mask8x32,
- want func(_, _ []int8) []int64) {
- n := 32
- t.Helper()
- forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- b := simd.LoadInt8x32Slice(y)
- k := simd.LoadInt8x32Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x32().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt16x16CompareMasked(t *testing.T,
- f func(_, _ simd.Int16x16, m simd.Mask16x16) simd.Mask16x16,
- want func(_, _ []int16) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- b := simd.LoadInt16x16Slice(y)
- k := simd.LoadInt16x16Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt32x8CompareMasked(t *testing.T,
- f func(_, _ simd.Int32x8, m simd.Mask32x8) simd.Mask32x8,
- want func(_, _ []int32) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- b := simd.LoadInt32x8Slice(y)
- k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt64x4CompareMasked(t *testing.T,
- f func(_, _ simd.Int64x4, m simd.Mask64x4) simd.Mask64x4,
- want func(_, _ []int64) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- b := simd.LoadInt64x4Slice(y)
- k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint8x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint8x32CompareMasked(t *testing.T,
- f func(_, _ simd.Uint8x32, m simd.Mask8x32) simd.Mask8x32,
- want func(_, _ []uint8) []int64) {
- n := 32
- t.Helper()
- forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- b := simd.LoadUint8x32Slice(y)
- k := simd.LoadInt8x32Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x32().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint16x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint16x16CompareMasked(t *testing.T,
- f func(_, _ simd.Uint16x16, m simd.Mask16x16) simd.Mask16x16,
- want func(_, _ []uint16) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- b := simd.LoadUint16x16Slice(y)
- k := simd.LoadInt16x16Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint32x8CompareMasked(t *testing.T,
- f func(_, _ simd.Uint32x8, m simd.Mask32x8) simd.Mask32x8,
- want func(_, _ []uint32) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- b := simd.LoadUint32x8Slice(y)
- k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint64x4CompareMasked(t *testing.T,
- f func(_, _ simd.Uint64x4, m simd.Mask64x4) simd.Mask64x4,
- want func(_, _ []uint64) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- b := simd.LoadUint64x4Slice(y)
- k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat32x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat32x8CompareMasked(t *testing.T,
- f func(_, _ simd.Float32x8, m simd.Mask32x8) simd.Mask32x8,
- want func(_, _ []float32) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- b := simd.LoadFloat32x8Slice(y)
- k := simd.LoadInt32x8Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat64x4CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat64x4CompareMasked(t *testing.T,
- f func(_, _ simd.Float64x4, m simd.Mask64x4) simd.Mask64x4,
- want func(_, _ []float64) []int64) {
- n := 4
- t.Helper()
- forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- b := simd.LoadFloat64x4Slice(y)
- k := simd.LoadInt64x4Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x4().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt8x64CompareMasked(t *testing.T,
- f func(_, _ simd.Int8x64, m simd.Mask8x64) simd.Mask8x64,
- want func(_, _ []int8) []int64) {
- n := 64
- t.Helper()
- forSlicePairMasked(t, int8s, n, func(x, y []int8, m []bool) bool {
- t.Helper()
- a := simd.LoadInt8x64Slice(x)
- b := simd.LoadInt8x64Slice(y)
- k := simd.LoadInt8x64Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x64().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt16x32CompareMasked(t *testing.T,
- f func(_, _ simd.Int16x32, m simd.Mask16x32) simd.Mask16x32,
- want func(_, _ []int16) []int64) {
- n := 32
- t.Helper()
- forSlicePairMasked(t, int16s, n, func(x, y []int16, m []bool) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- b := simd.LoadInt16x32Slice(y)
- k := simd.LoadInt16x32Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x32().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt32x16CompareMasked(t *testing.T,
- f func(_, _ simd.Int32x16, m simd.Mask32x16) simd.Mask32x16,
- want func(_, _ []int32) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, int32s, n, func(x, y []int32, m []bool) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- b := simd.LoadInt32x16Slice(y)
- k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testInt64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testInt64x8CompareMasked(t *testing.T,
- f func(_, _ simd.Int64x8, m simd.Mask64x8) simd.Mask64x8,
- want func(_, _ []int64) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, int64s, n, func(x, y []int64, m []bool) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- b := simd.LoadInt64x8Slice(y)
- k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint8x64CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint8x64CompareMasked(t *testing.T,
- f func(_, _ simd.Uint8x64, m simd.Mask8x64) simd.Mask8x64,
- want func(_, _ []uint8) []int64) {
- n := 64
- t.Helper()
- forSlicePairMasked(t, uint8s, n, func(x, y []uint8, m []bool) bool {
- t.Helper()
- a := simd.LoadUint8x64Slice(x)
- b := simd.LoadUint8x64Slice(y)
- k := simd.LoadInt8x64Slice(toVect[int8](m)).ToMask()
- g := make([]int8, n)
- f(a, b, k).AsInt8x64().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint16x32CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint16x32CompareMasked(t *testing.T,
- f func(_, _ simd.Uint16x32, m simd.Mask16x32) simd.Mask16x32,
- want func(_, _ []uint16) []int64) {
- n := 32
- t.Helper()
- forSlicePairMasked(t, uint16s, n, func(x, y []uint16, m []bool) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- b := simd.LoadUint16x32Slice(y)
- k := simd.LoadInt16x32Slice(toVect[int16](m)).ToMask()
- g := make([]int16, n)
- f(a, b, k).AsInt16x32().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint32x16CompareMasked(t *testing.T,
- f func(_, _ simd.Uint32x16, m simd.Mask32x16) simd.Mask32x16,
- want func(_, _ []uint32) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, uint32s, n, func(x, y []uint32, m []bool) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- b := simd.LoadUint32x16Slice(y)
- k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testUint64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testUint64x8CompareMasked(t *testing.T,
- f func(_, _ simd.Uint64x8, m simd.Mask64x8) simd.Mask64x8,
- want func(_, _ []uint64) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, uint64s, n, func(x, y []uint64, m []bool) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- b := simd.LoadUint64x8Slice(y)
- k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat32x16CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat32x16CompareMasked(t *testing.T,
- f func(_, _ simd.Float32x16, m simd.Mask32x16) simd.Mask32x16,
- want func(_, _ []float32) []int64) {
- n := 16
- t.Helper()
- forSlicePairMasked(t, float32s, n, func(x, y []float32, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- b := simd.LoadFloat32x16Slice(y)
- k := simd.LoadInt32x16Slice(toVect[int32](m)).ToMask()
- g := make([]int32, n)
- f(a, b, k).AsInt32x16().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
-
-// testFloat64x8CompareMasked tests the simd masked comparison method f against the expected behavior generated by want.
-// The mask is applied to the output of want; anything not in the mask is zeroed.
-func testFloat64x8CompareMasked(t *testing.T,
- f func(_, _ simd.Float64x8, m simd.Mask64x8) simd.Mask64x8,
- want func(_, _ []float64) []int64) {
- n := 8
- t.Helper()
- forSlicePairMasked(t, float64s, n, func(x, y []float64, m []bool) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- b := simd.LoadFloat64x8Slice(y)
- k := simd.LoadInt64x8Slice(toVect[int64](m)).ToMask()
- g := make([]int64, n)
- f(a, b, k).AsInt64x8().StoreSlice(g)
- w := want(x, y)
- for i := range m {
- if !m[i] {
- w[i] = 0
- }
- }
- return checkSlicesLogInput(t, s64(g), w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("m=%v", m) })
- })
-}
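
For orientation, a call site for one of the generated helpers above might look like the sketch below. The comparison method name GreaterMasked is an assumption about the API shape these helpers expect, and mapCompare/greater are the scalar-model utilities defined later in this change; this is an illustrative sketch, not a test from the package.

func TestInt32x8GreaterMaskedSketch(t *testing.T) {
	// GreaterMasked is a hypothetical method with the signature the helper expects:
	// func (simd.Int32x8) GreaterMasked(simd.Int32x8, simd.Mask32x8) simd.Mask32x8
	testInt32x8CompareMasked(t, simd.Int32x8.GreaterMasked, mapCompare(greater[int32]))
}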
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd
-
-package simd
-
-// Invoke code generators.
-
-//go:generate go run -C ../.. genfiles.go
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "math"
- "simd/internal/test_helpers"
- "testing"
-)
-
-type signed interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64
-}
-
-type integer interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
-}
-
-type float interface {
- ~float32 | ~float64
-}
-
-type number interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
-}
-
-func checkSlices[T number](t *testing.T, got, want []T) bool {
- t.Helper()
- return test_helpers.CheckSlicesLogInput[T](t, got, want, 0.0, nil)
-}
-
-func checkSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
- t.Helper()
- return test_helpers.CheckSlicesLogInput[T](t, got, want, flakiness, logInput)
-}
-
-// sliceOf returns a slice of n T's, with each
-// element of the slice initialized to its
-// index + 1.
-func sliceOf[T number](n int) []T {
- s := make([]T, n)
- for i := 0; i < n; i++ {
- s[i] = T(i + 1)
- }
- return s
-}
-
-func toVect[T signed](b []bool) []T {
- s := make([]T, len(b))
- for i := range b {
- if b[i] {
- s[i] = -1
- }
- }
- return s
-}
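
toVect encodes a bool mask the way the vector lanes do: true becomes an all-ones lane (-1) and false stays zero, so the result can be loaded as a vector and converted with ToMask, as the helpers above do. For example:

var exampleMaskLanes = toVect[int32]([]bool{true, false, false, true}) // []int32{-1, 0, 0, -1}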
-
-// s64 converts a slice of some integer type into a slice of int64
-func s64[T number](s []T) []int64 {
- var is any = s
- if r, ok := is.([]int64); ok {
- return r
- }
- r := make([]int64, len(s))
- for i := range s {
- r[i] = int64(s[i])
- }
- return r
-}
-
-// Do implements partial-slice testing. It repeatedly calls
-// body with a progressively shorter input slice and an output slice
-// for the result, then compares the result to its own
-// calculation of what the result should be.
-func Do[T number](t *testing.T, n int, body func(a, c []T)) {
- a := sliceOf[T](n)
- b := sliceOf[T](n)
-
- for i := n; i >= 0; i-- {
- c := make([]T, n, n)
- body(a[:i], c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = T(0)
- }
- }
-}
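
Do expects body to write one result per input element into the output slice and to leave the tail untouched; the helper then checks that every prefix length still reproduces the index+1 pattern with a zeroed tail. A minimal stand-in body is a plain copy (real callers presumably route this through the partial SIMD load/store operations under test):

func TestDoSketch(t *testing.T) {
	Do[int32](t, 8, func(a, c []int32) {
		copy(c, a) // stand-in for a load-part/store round trip of a[:len(a)]
	})
}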
-
-// map3 returns a function that returns the slice of the results of applying
-// input parameter elem to the respective elements of its 3 slice inputs.
-func map3[T, U any](elem func(x, y, z T) U) func(x, y, z []T) []U {
- return func(x, y, z []T) []U {
- s := make([]U, len(x))
- for i := range s {
- s[i] = elem(x[i], y[i], z[i])
- }
- return s
- }
-}
-
-// map2 returns a function that returns the slice of the results of applying
-// input parameter elem to the respective elements of its 2 slice inputs.
-func map2[T, U any](elem func(x, y T) U) func(x, y []T) []U {
- return func(x, y []T) []U {
- s := make([]U, len(x))
- for i := range s {
- s[i] = elem(x[i], y[i])
- }
- return s
- }
-}
-
-// map1 returns a function that returns the slice of the results of applying
-// input parameter elem to the respective elements of its single slice input.
-func map1[T, U any](elem func(x T) U) func(x []T) []U {
- return func(x []T) []U {
- s := make([]U, len(x))
- for i := range s {
- s[i] = elem(x[i])
- }
- return s
- }
-}
-
-// mapCompare returns a function that returns the slice of the results of applying
-// comparison function elem to the respective elements of its two slice inputs;
-// true becomes -1 and false becomes 0.
-func mapCompare[T number](elem func(x, y T) bool) func(x, y []T) []int64 {
- return func(x, y []T) []int64 {
- s := make([]int64, len(x))
- for i := range s {
- if elem(x[i], y[i]) {
- s[i] = -1
- }
- }
- return s
- }
-}
-
-// nOf returns a slice of length at least n whose elements are taken,
-// cycling as needed, from input slice s; if s is already at least n
-// long, s itself is returned.
-func nOf[T any](n int, s []T) []T {
- if len(s) >= n {
- return s
- }
- r := make([]T, n)
- for i := range r {
- r[i] = s[i%len(s)]
- }
- return r
-}
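
nOf cycles a short seed slice up to the requested length, which is how the test vectors below are built; for example:

var exampleCycled = nOf(6, []int{1, 2, 3}) // []int{1, 2, 3, 1, 2, 3}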
-
-const (
- PN22 = 1.0 / 1024 / 1024 / 4
- PN24 = 1.0 / 1024 / 1024 / 16
- PN53 = PN24 * PN24 / 32
- F0 = float32(1.0 + 513*PN22/2)
- F1 = float32(1.0 + 511*PN22*8)
- Aeasy = float32(2046 * PN53)
-	Ahard = float32(2047 * PN53) // 2047 provokes double rounding when the 64-bit FMA result is rounded to 32 bits
-)
-
-var zero = 0.0
-var nzero = -zero
-var inf = 1 / zero
-var ninf = -1 / zero
-var nan = math.NaN()
-
-// N controls how large the test vectors are
-const N = 144
-
-var float32s = nOf(N, []float32{float32(inf), float32(ninf), 1, float32(nan), float32(zero), 2, float32(nan), float32(zero), 3, float32(-zero), float32(1.0 / zero), float32(-1.0 / zero), 1.0 / 2, 1.0 / 4, 1.0 / 8, 1.0 / 1000, 1.0 / 1000000, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat32, 1 / math.MaxFloat32, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -160, -3200, -64, -4, -8, -16, -32, -64})
-var float64s = nOf(N, []float64{inf, ninf, nan, zero, -zero, 1 / zero, -1 / zero, 0.0001, 0.0000001, 1, -1, 0, 2, -2, 3, -3, math.MaxFloat64, 1.0 / math.MaxFloat64, 10, -10, 100, 20, -20, 300, -300, -4000, -80, -16, -32, -64})
-
-var int32s = nOf(N, []int32{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64})
-var uint32s = nOf(N, []uint32{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint32(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint32(0x55555), ^uint32(0x77777), ^uint32(0xccccc)})
-
-var int64s = nOf(N, []int64{1, -1, 0, 2, 4, 8, 1024, 0xffffff, -0xffffff, 0x55555, 0x77777, 0xccccc, -0x55555, -0x77777, -0xccccc, -4, -8, -16, -32, -64})
-var uint64s = nOf(N, []uint64{1, 0, 2, 4, 8, 1024, 0xffffff, ^uint64(0xffffff), 0x55555, 0x77777, 0xccccc, ^uint64(0x55555), ^uint64(0x77777), ^uint64(0xccccc)})
-
-var int16s = nOf(N, []int16{1, -1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, -32767, -32768, -11111, -4, -8, -16, -32, -64})
-var uint16s = nOf(N, []uint16{1, 0, 2, 4, 8, 1024, 3, 5, 7, 11, 13, 3000, 5555, 7777, 11111, 32767, 32766, 32768, 65535, 45678, 56789})
-
-var int8s = nOf(N, []int8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, -1, -2, -3, -5, -7, -11, -77, -121, -127, -128, 4, 8, 16, 32, 64, -4, -8, -16, -32, -64})
-var uint8s = nOf(N, []uint8{0, 1, 2, 3, 5, 7, 11, 22, 33, 55, 77, 121, 127, 128, 255, 233, 211, 177, 144, 4, 8, 16, 32, 64})
-
-var bools = nOf(N, []bool{
- true, false, true, true, false, false, true, true, true, false, false, false, true, true, true, true, false, false, false, false})
-
-func forSlice[T number](t *testing.T, s []T, n int, f func(a []T) bool) {
- t.Helper()
- for i := 0; i < len(s)-n; i++ {
- if !f(s[i : i+n]) {
- return
- }
- }
-}
-
-func forSlicePair[T number](t *testing.T, s []T, n int, f func(a, b []T) bool) {
- t.Helper()
- for i := 0; i < len(s)-n; i++ {
- for j := 0; j < len(s)-n; j++ {
- if !f(s[i:i+n], s[j:j+n]) {
- return
- }
- }
- }
-}
-
-func forSliceTriple[T number](t *testing.T, s []T, n int, f func(a, b, c []T) bool) {
- t.Helper()
- for i := 0; i < len(s)-n; i += 3 {
- for j := 0; j < len(s)-n; j += 3 {
- for k := 0; k < len(s)-n; k += 3 {
- if !f(s[i:i+n], s[j:j+n], s[k:k+n]) {
- return
- }
- }
- }
- }
-}
-
-func forSlicePairMasked[T number](t *testing.T, s []T, n int, f func(a, b []T, m []bool) bool) {
- t.Helper()
- m := bools
-	// Step the masked slice-pair iteration forward much more quickly; otherwise it is slooooow
- for i := 0; i < len(s)-n; i += 3 {
- for j := 0; j < len(s)-n; j += 3 {
- for k := 0; k < len(m)-n; k += 3 {
- if !f(s[i:i+n], s[j:j+n], m[k:k+n]) {
- return
- }
- }
- }
- }
-}
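
A hand-written example of how these iterators compose with the scalar models defined later in this change (a sketch, not one of the generated tests; it assumes Int32x4.Add as used elsewhere in this file set, and a file that imports "simd"):

func TestInt32x4AddSketch(t *testing.T) {
	forSlicePair(t, int32s, 4, func(x, y []int32) bool {
		got := make([]int32, 4)
		simd.LoadInt32x4Slice(x).Add(simd.LoadInt32x4Slice(y)).StoreSlice(got)
		return checkSlices(t, got, addSlice(x, y))
	})
}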
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "reflect"
- "simd"
- "slices"
- "testing"
-)
-
-var sink any
-
-func TestType(t *testing.T) {
- // Testing:
- // - Defined as another struct's field is ok
- // - Pointer is ok
-	// - Type definition is ok
- // - Type alias is ok
- // - Type conversion is ok
- // - Conversion to interface is ok
- type alias = simd.Int32x4
- type maskT simd.Mask32x4
- type myStruct struct {
- x alias
- y *simd.Int32x4
- z maskT
- }
- vals := [4]int32{1, 2, 3, 4}
- v := myStruct{x: simd.LoadInt32x4(&vals)}
-	// the mask (0b0011, set below) keeps elements 0 and 1 and zeroes the rest.
- want := []int32{2, 4, 0, 0}
- y := simd.LoadInt32x4(&vals)
- v.y = &y
- sink = y
-
- if !simd.X86.AVX512GFNI() {
-		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
- return
- }
- v.z = maskT(simd.Mask32x4FromBits(0b0011))
- *v.y = v.y.Add(v.x).Masked(simd.Mask32x4(v.z))
-
- got := [4]int32{}
- v.y.Store(&got)
- checkSlices(t, got[:], want)
-}
-
-func TestUncomparable(t *testing.T) {
- // Test that simd vectors are not comparable
- var x, y any = simd.LoadUint32x4(&[4]uint32{1, 2, 3, 4}), simd.LoadUint32x4(&[4]uint32{5, 6, 7, 8})
- shouldPanic := func(fn func()) {
- defer func() {
- if recover() == nil {
- panic("did not panic")
- }
- }()
- fn()
- }
- shouldPanic(func() { _ = x == y })
-}
-
-func TestFuncValue(t *testing.T) {
- // Test that simd intrinsic can be used as a function value.
- xv := [4]int32{1, 2, 3, 4}
- yv := [4]int32{5, 6, 7, 8}
- want := []int32{6, 8, 10, 12}
- x := simd.LoadInt32x4(&xv)
- y := simd.LoadInt32x4(&yv)
- fn := simd.Int32x4.Add
- sink = fn
- x = fn(x, y)
- got := [4]int32{}
- x.Store(&got)
- checkSlices(t, got[:], want)
-}
-
-func TestReflectMethod(t *testing.T) {
- // Test that simd intrinsic can be accessed via reflection.
- // NOTE: we don't yet support reflect method.Call.
- xv := [4]int32{1, 2, 3, 4}
- yv := [4]int32{5, 6, 7, 8}
- want := []int32{6, 8, 10, 12}
- x := simd.LoadInt32x4(&xv)
- y := simd.LoadInt32x4(&yv)
- m, ok := reflect.TypeOf(x).MethodByName("Add")
- if !ok {
- t.Fatal("Add method not found")
- }
- fn := m.Func.Interface().(func(x, y simd.Int32x4) simd.Int32x4)
- x = fn(x, y)
- got := [4]int32{}
- x.Store(&got)
- checkSlices(t, got[:], want)
-}
-
-func TestVectorConversion(t *testing.T) {
- if !simd.X86.AVX512GFNI() {
-		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
- return
- }
- xv := [4]int32{1, 2, 3, 4}
- x := simd.LoadInt32x4(&xv)
- xPromoted := x.AsInt64x2()
- xPromotedDemoted := xPromoted.AsInt32x4()
- got := [4]int32{}
- xPromotedDemoted.Store(&got)
- for i := range 4 {
- if xv[i] != got[i] {
- t.Errorf("Result at %d incorrect: want %d, got %d", i, xv[i], got[i])
- }
- }
-}
-
-func TestMaskConversion(t *testing.T) {
- if !simd.X86.AVX512GFNI() {
-		t.Skip("Test requires X86.AVX512GFNI, not available on this hardware")
- return
- }
- x := simd.LoadInt32x4Slice([]int32{5, 0, 7, 0})
- mask := simd.Int32x4{}.Sub(x).ToMask()
- y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4}).Add(x).Masked(mask)
- want := [4]int32{6, 0, 10, 0}
- got := make([]int32, 4)
- y.StoreSlice(got)
- checkSlices(t, got[:], want[:])
-}
-
-func TestPermute(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
- indices := []uint64{7, 6, 5, 4, 3, 2, 1, 0}
- want := []int64{8, 7, 6, 5, 4, 3, 2, 1}
- got := make([]int64, 8)
- simd.LoadInt64x8Slice(x).Permute(simd.LoadUint64x8Slice(indices)).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteOrZero(t *testing.T) {
- x := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- indices := []int8{7, 6, 5, 4, 3, 2, 1, 0, -1, 8, -1, 9, -1, 10, -1, 11}
- want := []uint8{8, 7, 6, 5, 4, 3, 2, 1, 0, 9, 0, 10, 0, 11, 0, 12}
- got := make([]uint8, len(x))
- simd.LoadUint8x16Slice(x).PermuteOrZero(simd.LoadInt8x16Slice(indices)).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestConcatPermute(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- x := []int64{1, 2, 3, 4, 5, 6, 7, 8}
- y := []int64{-1, -2, -3, -4, -5, -6, -7, -8}
- indices := []uint64{7 + 8, 6, 5 + 8, 4, 3 + 8, 2, 1 + 8, 0}
- want := []int64{-8, 7, -6, 5, -4, 3, -2, 1}
- got := make([]int64, 8)
- simd.LoadInt64x8Slice(x).ConcatPermute(simd.LoadInt64x8Slice(y), simd.LoadUint64x8Slice(indices)).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestCompress(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- v1234 := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
- v2400 := v1234.Compress(simd.Mask32x4FromBits(0b1010))
- got := make([]int32, 4)
- v2400.StoreSlice(got)
- want := []int32{2, 4, 0, 0}
- if !slices.Equal(got, want) {
- t.Errorf("want and got differ, want=%v, got=%v", want, got)
- }
-}
-
-func TestExpand(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- v3400 := simd.LoadInt32x4Slice([]int32{3, 4, 0, 0})
- v2400 := v3400.Expand(simd.Mask32x4FromBits(0b1010))
- got := make([]int32, 4)
- v2400.StoreSlice(got)
- want := []int32{0, 3, 0, 4}
- if !slices.Equal(got, want) {
- t.Errorf("want and got differ, want=%v, got=%v", want, got)
- }
-}
-
-var testShiftAllVal uint64 = 3
-
-func TestShiftAll(t *testing.T) {
- got := make([]int32, 4)
- simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(2).StoreSlice(got)
- for _, v := range got {
- if v != 0b1100 {
- t.Errorf("expect 0b1100, got %b", v)
- }
- }
- simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).ShiftAllLeft(testShiftAllVal).StoreSlice(got)
- for _, v := range got {
- if v != 0b11000 {
- t.Errorf("expect 0b11000, got %b", v)
- }
- }
-}
-
-func TestSlicesInt8(t *testing.T) {
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- v := simd.LoadInt8x32Slice(a)
- b := make([]int8, 32, 32)
- v.StoreSlice(b)
- checkSlices(t, a, b)
-}
-
-func TestSlicesInt8SetElem(t *testing.T) {
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- v := simd.LoadInt8x16Slice(a)
-
- v = v.SetElem(3, 13)
- a[3] = 13
-
- b := make([]int8, 16, 16)
- v.StoreSlice(b)
- checkSlices(t, a, b)
-}
-
-func TestSlicesInt8GetElem(t *testing.T) {
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- v := simd.LoadInt8x16Slice(a)
- e := v.GetElem(2)
- if e != a[2] {
- t.Errorf("GetElem(2) = %d != a[2] = %d", e, a[2])
- }
-
-}
-
-func TestSlicesInt8TooShortLoad(t *testing.T) {
- defer func() {
- if r := recover(); r != nil {
- t.Logf("Saw EXPECTED panic %v", r)
- } else {
- t.Errorf("Did not see expected panic")
- }
- }()
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31} // TOO SHORT, should panic
- v := simd.LoadInt8x32Slice(a)
- b := make([]int8, 32, 32)
- v.StoreSlice(b)
- checkSlices(t, a, b)
-}
-
-func TestSlicesInt8TooShortStore(t *testing.T) {
- defer func() {
- if r := recover(); r != nil {
- t.Logf("Saw EXPECTED panic %v", r)
- } else {
- t.Errorf("Did not see expected panic")
- }
- }()
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- v := simd.LoadInt8x32Slice(a)
- b := make([]int8, 31) // TOO SHORT, should panic
- v.StoreSlice(b)
- checkSlices(t, a, b)
-}
-
-func TestSlicesFloat64(t *testing.T) {
- a := []float64{1, 2, 3, 4, 5, 6, 7, 8} // too long, should be fine
- v := simd.LoadFloat64x4Slice(a)
- b := make([]float64, 4, 4)
- v.StoreSlice(b)
- for i := range b {
- if a[i] != b[i] {
- t.Errorf("a and b differ at index %d, a=%f, b=%f", i, a[i], b[i])
- }
- }
-}
-
-// TODO: try to reduce this test to something smaller.
-func TestMergeLocals(t *testing.T) {
- testMergeLocalswrapper(t, simd.Int64x4.Add)
-}
-
-//go:noinline
-func forceSpill() {}
-
-func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) simd.Int64x4) {
- t.Helper()
- s0 := []int64{0, 1, 2, 3}
- s1 := []int64{-1, 0, -1, 0}
- want := []int64{-1, 1, 1, 3}
- v := simd.LoadInt64x4Slice(s0)
- m := simd.LoadInt64x4Slice(s1)
- forceSpill()
- got := make([]int64, 4)
- gotv := op(v, m)
- gotv.StoreSlice(got)
- for i := range len(want) {
- if !(got[i] == want[i]) {
- t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], got[i])
- }
- }
-}
-
-func TestBitMaskFromBits(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- results := [2]int64{}
- want := [2]int64{0, 6}
- m := simd.Mask64x2FromBits(0b10)
- simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
- for i := range 2 {
- if results[i] != want[i] {
- t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
- }
- }
-}
-
-var maskForTestBitMaskFromBitsLoad = uint8(0b10)
-
-func TestBitMaskFromBitsLoad(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- results := [2]int64{}
- want := [2]int64{0, 6}
- m := simd.Mask64x2FromBits(maskForTestBitMaskFromBitsLoad)
- simd.LoadInt64x2Slice([]int64{1, 2}).Add(simd.LoadInt64x2Slice([]int64{3, 4})).Masked(m).Store(&results)
- for i := range 2 {
- if results[i] != want[i] {
- t.Errorf("Result at %d incorrect: want %v, got %v", i, want[i], results[i])
- }
- }
-}
-
-func TestBitMaskToBits(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- if v := simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits(); v != 0b101 {
- t.Errorf("Want 0b101, got %b", v)
- }
-}
-
-var maskForTestBitMaskFromBitsStore uint8
-
-func TestBitMaskToBitsStore(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- maskForTestBitMaskFromBitsStore = simd.LoadInt16x8Slice([]int16{1, 0, 1, 0, 0, 0, 0, 0}).ToMask().ToBits()
- if maskForTestBitMaskFromBitsStore != 0b101 {
- t.Errorf("Want 0b101, got %b", maskForTestBitMaskFromBitsStore)
- }
-}
-
-func TestMergeFloat(t *testing.T) {
- k := make([]int64, 4, 4)
- s := make([]float64, 4, 4)
-
- a := simd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
- b := simd.LoadFloat64x4Slice([]float64{4, 2, 3, 1})
- g := a.Greater(b)
- g.AsInt64x4().StoreSlice(k)
- c := a.Merge(b, g)
-
- c.StoreSlice(s)
-
- checkSlices[int64](t, k, []int64{0, 0, 0, -1})
- checkSlices[float64](t, s, []float64{4, 2, 3, 4})
-}
-
-func TestMergeFloat512(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
-
- k := make([]int64, 8, 8)
- s := make([]float64, 8, 8)
-
- a := simd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
- b := simd.LoadFloat64x8Slice([]float64{8, 7, 6, 5, 4, 2, 3, 1})
- g := a.Greater(b)
- g.AsInt64x8().StoreSlice(k)
- c := a.Merge(b, g)
- d := a.Masked(g)
-
- checkSlices[int64](t, k, []int64{0, 0, 0, 0, -1, -1, -1, -1})
-
- c.StoreSlice(s)
- checkSlices[float64](t, s, []float64{8, 7, 6, 5, 5, 6, 7, 8})
-
- d.StoreSlice(s)
- checkSlices[float64](t, s, []float64{0, 0, 0, 0, 5, 6, 7, 8})
-}
-
-var ro uint8 = 2
-
-func TestRotateAllVariable(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- got := make([]int32, 4)
- simd.LoadInt32x4Slice([]int32{0b11, 0b11, 0b11, 0b11}).RotateAllLeft(ro).StoreSlice(got)
- for _, v := range got {
- if v != 0b1100 {
- t.Errorf("Want 0b1100, got %b", v)
- }
- }
-}
-
-func TestBroadcastUint32x4(t *testing.T) {
- s := make([]uint32, 4, 4)
- simd.BroadcastUint32x4(123456789).StoreSlice(s)
- checkSlices(t, s, []uint32{123456789, 123456789, 123456789, 123456789})
-}
-
-func TestBroadcastFloat32x8(t *testing.T) {
- s := make([]float32, 8, 8)
- simd.BroadcastFloat32x8(123456789).StoreSlice(s)
- checkSlices(t, s, []float32{123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789, 123456789})
-}
-
-func TestBroadcastFloat64x2(t *testing.T) {
- s := make([]float64, 2, 2)
- simd.BroadcastFloat64x2(123456789).StoreSlice(s)
- checkSlices(t, s, []float64{123456789, 123456789})
-}
-
-func TestBroadcastUint64x2(t *testing.T) {
- s := make([]uint64, 2, 2)
- simd.BroadcastUint64x2(123456789).StoreSlice(s)
- checkSlices(t, s, []uint64{123456789, 123456789})
-}
-
-func TestBroadcastUint16x8(t *testing.T) {
- s := make([]uint16, 8, 8)
- simd.BroadcastUint16x8(12345).StoreSlice(s)
-	checkSlices(t, s, []uint16{12345, 12345, 12345, 12345, 12345, 12345, 12345, 12345})
-}
-
-func TestBroadcastInt8x32(t *testing.T) {
- s := make([]int8, 32, 32)
- simd.BroadcastInt8x32(-123).StoreSlice(s)
- checkSlices(t, s, []int8{-123, -123, -123, -123, -123, -123, -123, -123,
- -123, -123, -123, -123, -123, -123, -123, -123,
- -123, -123, -123, -123, -123, -123, -123, -123,
- -123, -123, -123, -123, -123, -123, -123, -123,
- })
-}
-
-func TestMaskOpt512(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
-
- k := make([]int64, 8, 8)
- s := make([]float64, 8, 8)
-
- a := simd.LoadFloat64x8Slice([]float64{2, 0, 2, 0, 2, 0, 2, 0})
- b := simd.LoadFloat64x8Slice([]float64{1, 1, 1, 1, 1, 1, 1, 1})
- c := simd.LoadFloat64x8Slice([]float64{1, 2, 3, 4, 5, 6, 7, 8})
- d := simd.LoadFloat64x8Slice([]float64{2, 4, 6, 8, 10, 12, 14, 16})
- g := a.Greater(b)
- e := c.Add(d).Masked(g)
- e.StoreSlice(s)
- g.AsInt64x8().StoreSlice(k)
- checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
- checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
-}
-
-// flattenedTranspose transposes x and y, regarded as a pair of 2x2
-// matrices, but then flattens the rows in order, i.e.
-// x: ABCD ==> a: A1B2
-// y: 1234     b: C3D4
-func flattenedTranspose(x, y simd.Int32x4) (a, b simd.Int32x4) {
- return x.InterleaveLo(y), x.InterleaveHi(y)
-}
-
-func TestFlattenedTranspose(t *testing.T) {
- r := make([]int32, 4, 4)
- s := make([]int32, 4, 4)
-
- x := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
- y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
- a, b := flattenedTranspose(x, y)
-
- a.StoreSlice(r)
- b.StoreSlice(s)
-
- checkSlices[int32](t, r, []int32{0xA, 1, 0xB, 2})
- checkSlices[int32](t, s, []int32{0xC, 3, 0xD, 4})
-
-}
-
-func TestClearAVXUpperBits(t *testing.T) {
- // Test that ClearAVXUpperBits is safe even if there are SIMD values
- // alive (although usually one should not do this).
- if !simd.X86.AVX2() {
- t.Skip("Test requires X86.AVX2, not available on this hardware")
- return
- }
-
- r := make([]int64, 4)
- s := make([]int64, 4)
-
- x := simd.LoadInt64x4Slice([]int64{10, 20, 30, 40})
- y := simd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
-
- x.Add(y).StoreSlice(r)
- simd.ClearAVXUpperBits()
- x.Sub(y).StoreSlice(s)
-
- checkSlices[int64](t, r, []int64{11, 22, 33, 44})
- checkSlices[int64](t, s, []int64{9, 18, 27, 36})
-}
-
-func TestLeadingZeros(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
-
- src := []uint64{0b1111, 0}
- want := []uint64{60, 64}
- got := make([]uint64, 2)
- simd.LoadUint64x2Slice(src).LeadingZeros().StoreSlice(got)
- for i := range 2 {
- if want[i] != got[i] {
- t.Errorf("Result incorrect at %d: want %d, got %d", i, want[i], got[i])
- }
- }
-}
-
-func TestIsZero(t *testing.T) {
- v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
- v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
- if v1.IsZero() {
- t.Errorf("Result incorrect, want false, got true")
- }
- if !v2.IsZero() {
- t.Errorf("Result incorrect, want true, got false")
- }
- if !v1.And(v2).IsZero() {
- t.Errorf("Result incorrect, want true, got false")
- }
- if v1.AndNot(v2).IsZero() {
- t.Errorf("Result incorrect, want false, got true")
- }
- if !v2.And(v1).IsZero() {
- t.Errorf("Result incorrect, want true, got false")
- }
- if !v2.AndNot(v1).IsZero() {
- t.Errorf("Result incorrect, want true, got false")
- }
-}
-
-func TestSelect4FromPairConst(t *testing.T) {
- x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
- y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
-
- llll := x.SelectFromPair(0, 1, 2, 3, y)
- hhhh := x.SelectFromPair(4, 5, 6, 7, y)
- llhh := x.SelectFromPair(0, 1, 6, 7, y)
- hhll := x.SelectFromPair(6, 7, 0, 1, y)
-
- lllh := x.SelectFromPair(0, 1, 2, 7, y)
- llhl := x.SelectFromPair(0, 1, 7, 2, y)
- lhll := x.SelectFromPair(0, 7, 1, 2, y)
- hlll := x.SelectFromPair(7, 0, 1, 2, y)
-
- hhhl := x.SelectFromPair(4, 5, 6, 0, y)
- hhlh := x.SelectFromPair(4, 5, 0, 6, y)
- hlhh := x.SelectFromPair(4, 0, 5, 6, y)
- lhhh := x.SelectFromPair(0, 4, 5, 6, y)
-
- lhlh := x.SelectFromPair(0, 4, 1, 5, y)
- hlhl := x.SelectFromPair(4, 0, 5, 1, y)
- lhhl := x.SelectFromPair(0, 4, 5, 1, y)
- hllh := x.SelectFromPair(4, 0, 1, 5, y)
-
- r := make([]int32, 4, 4)
-
- foo := func(v simd.Int32x4, a, b, c, d int32) {
- v.StoreSlice(r)
- checkSlices[int32](t, r, []int32{a, b, c, d})
- }
-
- foo(llll, 0, 1, 2, 3)
- foo(hhhh, 4, 5, 6, 7)
- foo(llhh, 0, 1, 6, 7)
- foo(hhll, 6, 7, 0, 1)
-
- foo(lllh, 0, 1, 2, 7)
- foo(llhl, 0, 1, 7, 2)
- foo(lhll, 0, 7, 1, 2)
- foo(hlll, 7, 0, 1, 2)
-
- foo(hhhl, 4, 5, 6, 0)
- foo(hhlh, 4, 5, 0, 6)
- foo(hlhh, 4, 0, 5, 6)
- foo(lhhh, 0, 4, 5, 6)
-
- foo(lhlh, 0, 4, 1, 5)
- foo(hlhl, 4, 0, 5, 1)
- foo(lhhl, 0, 4, 5, 1)
- foo(hllh, 4, 0, 1, 5)
-}
-
-//go:noinline
-func selectFromPairInt32x4(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) simd.Int32x4 {
- return x.SelectFromPair(a, b, c, d, y)
-}
-
-func TestSelect4FromPairVar(t *testing.T) {
- x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
- y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
-
- llll := selectFromPairInt32x4(x, 0, 1, 2, 3, y)
- hhhh := selectFromPairInt32x4(x, 4, 5, 6, 7, y)
- llhh := selectFromPairInt32x4(x, 0, 1, 6, 7, y)
- hhll := selectFromPairInt32x4(x, 6, 7, 0, 1, y)
-
- lllh := selectFromPairInt32x4(x, 0, 1, 2, 7, y)
- llhl := selectFromPairInt32x4(x, 0, 1, 7, 2, y)
- lhll := selectFromPairInt32x4(x, 0, 7, 1, 2, y)
- hlll := selectFromPairInt32x4(x, 7, 0, 1, 2, y)
-
- hhhl := selectFromPairInt32x4(x, 4, 5, 6, 0, y)
- hhlh := selectFromPairInt32x4(x, 4, 5, 0, 6, y)
- hlhh := selectFromPairInt32x4(x, 4, 0, 5, 6, y)
- lhhh := selectFromPairInt32x4(x, 0, 4, 5, 6, y)
-
- lhlh := selectFromPairInt32x4(x, 0, 4, 1, 5, y)
- hlhl := selectFromPairInt32x4(x, 4, 0, 5, 1, y)
- lhhl := selectFromPairInt32x4(x, 0, 4, 5, 1, y)
- hllh := selectFromPairInt32x4(x, 4, 0, 1, 5, y)
-
- r := make([]int32, 4, 4)
-
- foo := func(v simd.Int32x4, a, b, c, d int32) {
- v.StoreSlice(r)
- checkSlices[int32](t, r, []int32{a, b, c, d})
- }
-
- foo(llll, 0, 1, 2, 3)
- foo(hhhh, 4, 5, 6, 7)
- foo(llhh, 0, 1, 6, 7)
- foo(hhll, 6, 7, 0, 1)
-
- foo(lllh, 0, 1, 2, 7)
- foo(llhl, 0, 1, 7, 2)
- foo(lhll, 0, 7, 1, 2)
- foo(hlll, 7, 0, 1, 2)
-
- foo(hhhl, 4, 5, 6, 0)
- foo(hhlh, 4, 5, 0, 6)
- foo(hlhh, 4, 0, 5, 6)
- foo(lhhh, 0, 4, 5, 6)
-
- foo(lhlh, 0, 4, 1, 5)
- foo(hlhl, 4, 0, 5, 1)
- foo(lhhl, 0, 4, 5, 1)
- foo(hllh, 4, 0, 1, 5)
-}
-
-func TestSelect4FromPairConstGrouped(t *testing.T) {
- x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
- y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
-
- llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
- hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
- llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
- hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
-
- lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
- llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
- lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
- hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
-
- hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
- hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
- hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
- lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
-
- lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
- hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
- lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
- hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
-
- r := make([]float32, 8, 8)
-
- foo := func(v simd.Float32x8, a, b, c, d float32) {
- v.StoreSlice(r)
- checkSlices[float32](t, r, []float32{a, b, c, d, 10 + a, 10 + b, 10 + c, 10 + d})
- }
-
- foo(llll, 0, 1, 2, 3)
- foo(hhhh, 4, 5, 6, 7)
- foo(llhh, 0, 1, 6, 7)
- foo(hhll, 6, 7, 0, 1)
-
- foo(lllh, 0, 1, 2, 7)
- foo(llhl, 0, 1, 7, 2)
- foo(lhll, 0, 7, 1, 2)
- foo(hlll, 7, 0, 1, 2)
-
- foo(hhhl, 4, 5, 6, 0)
- foo(hhlh, 4, 5, 0, 6)
- foo(hlhh, 4, 0, 5, 6)
- foo(lhhh, 0, 4, 5, 6)
-
- foo(lhlh, 0, 4, 1, 5)
- foo(hlhl, 4, 0, 5, 1)
- foo(lhhl, 0, 4, 5, 1)
- foo(hllh, 4, 0, 1, 5)
-}
-
-func TestSelectFromPairConstGroupedUint32x16(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- x := simd.LoadUint32x16Slice([]uint32{0, 1, 2, 3, 10, 11, 12, 13, 20, 21, 22, 23, 30, 31, 32, 33})
- y := simd.LoadUint32x16Slice([]uint32{4, 5, 6, 7, 14, 15, 16, 17, 24, 25, 26, 27, 34, 35, 36, 37})
-
- llll := x.SelectFromPairGrouped(0, 1, 2, 3, y)
- hhhh := x.SelectFromPairGrouped(4, 5, 6, 7, y)
- llhh := x.SelectFromPairGrouped(0, 1, 6, 7, y)
- hhll := x.SelectFromPairGrouped(6, 7, 0, 1, y)
-
- lllh := x.SelectFromPairGrouped(0, 1, 2, 7, y)
- llhl := x.SelectFromPairGrouped(0, 1, 7, 2, y)
- lhll := x.SelectFromPairGrouped(0, 7, 1, 2, y)
- hlll := x.SelectFromPairGrouped(7, 0, 1, 2, y)
-
- hhhl := x.SelectFromPairGrouped(4, 5, 6, 0, y)
- hhlh := x.SelectFromPairGrouped(4, 5, 0, 6, y)
- hlhh := x.SelectFromPairGrouped(4, 0, 5, 6, y)
- lhhh := x.SelectFromPairGrouped(0, 4, 5, 6, y)
-
- lhlh := x.SelectFromPairGrouped(0, 4, 1, 5, y)
- hlhl := x.SelectFromPairGrouped(4, 0, 5, 1, y)
- lhhl := x.SelectFromPairGrouped(0, 4, 5, 1, y)
- hllh := x.SelectFromPairGrouped(4, 0, 1, 5, y)
-
- r := make([]uint32, 16, 16)
-
- foo := func(v simd.Uint32x16, a, b, c, d uint32) {
- v.StoreSlice(r)
- checkSlices[uint32](t, r, []uint32{a, b, c, d,
- 10 + a, 10 + b, 10 + c, 10 + d,
- 20 + a, 20 + b, 20 + c, 20 + d,
- 30 + a, 30 + b, 30 + c, 30 + d,
- })
- }
-
- foo(llll, 0, 1, 2, 3)
- foo(hhhh, 4, 5, 6, 7)
- foo(llhh, 0, 1, 6, 7)
- foo(hhll, 6, 7, 0, 1)
-
- foo(lllh, 0, 1, 2, 7)
- foo(llhl, 0, 1, 7, 2)
- foo(lhll, 0, 7, 1, 2)
- foo(hlll, 7, 0, 1, 2)
-
- foo(hhhl, 4, 5, 6, 0)
- foo(hhlh, 4, 5, 0, 6)
- foo(hlhh, 4, 0, 5, 6)
- foo(lhhh, 0, 4, 5, 6)
-
- foo(lhlh, 0, 4, 1, 5)
- foo(hlhl, 4, 0, 5, 1)
- foo(lhhl, 0, 4, 5, 1)
- foo(hllh, 4, 0, 1, 5)
-}
-
-func TestSelect128FromPair(t *testing.T) {
- x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
- y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
-
- aa := x.Select128FromPair(0, 0, y)
- ab := x.Select128FromPair(0, 1, y)
- bc := x.Select128FromPair(1, 2, y)
- cd := x.Select128FromPair(2, 3, y)
- da := x.Select128FromPair(3, 0, y)
- dc := x.Select128FromPair(3, 2, y)
-
- r := make([]uint64, 4, 4)
-
- foo := func(v simd.Uint64x4, a, b uint64) {
- a, b = 2*a, 2*b
- v.StoreSlice(r)
- checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
- }
-
- foo(aa, 0, 0)
- foo(ab, 0, 1)
- foo(bc, 1, 2)
- foo(cd, 2, 3)
- foo(da, 3, 0)
- foo(dc, 3, 2)
-}
-
-func TestSelect128FromPairError(t *testing.T) {
- x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
- y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
-
- defer func() {
- if r := recover(); r != nil {
- t.Logf("Saw expected panic %v", r)
- }
- }()
- _ = x.Select128FromPair(0, 4, y)
-
- t.Errorf("Should have panicked")
-}
-
-//go:noinline
-func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 {
- return x.Select128FromPair(lo, hi, y)
-}
-
-func TestSelect128FromPairVar(t *testing.T) {
- x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3})
- y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7})
-
- aa := select128FromPair(x, 0, 0, y)
- ab := select128FromPair(x, 0, 1, y)
- bc := select128FromPair(x, 1, 2, y)
- cd := select128FromPair(x, 2, 3, y)
- da := select128FromPair(x, 3, 0, y)
- dc := select128FromPair(x, 3, 2, y)
-
- r := make([]uint64, 4, 4)
-
- foo := func(v simd.Uint64x4, a, b uint64) {
- a, b = 2*a, 2*b
- v.StoreSlice(r)
- checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1})
- }
-
- foo(aa, 0, 0)
- foo(ab, 0, 1)
- foo(bc, 1, 2)
- foo(cd, 2, 3)
- foo(da, 3, 0)
- foo(dc, 3, 2)
-}
-
-func TestSelect2FromPairConst(t *testing.T) {
- x := simd.LoadUint64x2Slice([]uint64{0, 1})
- y := simd.LoadUint64x2Slice([]uint64{2, 3})
-
- ll := x.SelectFromPair(0, 1, y)
- hh := x.SelectFromPair(3, 2, y)
- lh := x.SelectFromPair(0, 3, y)
- hl := x.SelectFromPair(2, 1, y)
-
- r := make([]uint64, 2, 2)
-
- foo := func(v simd.Uint64x2, a, b uint64) {
- v.StoreSlice(r)
- checkSlices[uint64](t, r, []uint64{a, b})
- }
-
- foo(ll, 0, 1)
- foo(hh, 3, 2)
- foo(lh, 0, 3)
- foo(hl, 2, 1)
-}
-
-func TestSelect2FromPairConstGroupedUint(t *testing.T) {
- x := simd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
- y := simd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})
-
- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
-
- r := make([]uint64, 4, 4)
-
- foo := func(v simd.Uint64x4, a, b uint64) {
- v.StoreSlice(r)
- checkSlices[uint64](t, r, []uint64{a, b, a + 10, b + 10})
- }
-
- foo(ll, 0, 1)
- foo(hh, 3, 2)
- foo(lh, 0, 3)
- foo(hl, 2, 1)
-}
-
-func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
- x := simd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
- y := simd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})
-
- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
-
- r := make([]float64, 4, 4)
-
- foo := func(v simd.Float64x4, a, b float64) {
- v.StoreSlice(r)
- checkSlices[float64](t, r, []float64{a, b, a + 10, b + 10})
- }
-
- foo(ll, 0, 1)
- foo(hh, 3, 2)
- foo(lh, 0, 3)
- foo(hl, 2, 1)
-}
-
-func TestSelect2FromPairConstGroupedInt(t *testing.T) {
- x := simd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
- y := simd.LoadInt64x4Slice([]int64{2, 3, 12, 13})
-
- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
-
- r := make([]int64, 4, 4)
-
- foo := func(v simd.Int64x4, a, b int64) {
- v.StoreSlice(r)
- checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10})
- }
-
- foo(ll, 0, 1)
- foo(hh, 3, 2)
- foo(lh, 0, 3)
- foo(hl, 2, 1)
-}
-
-func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
-
- x := simd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
- y := simd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
-
- ll := x.SelectFromPairGrouped(0, 1, y)
- hh := x.SelectFromPairGrouped(3, 2, y)
- lh := x.SelectFromPairGrouped(0, 3, y)
- hl := x.SelectFromPairGrouped(2, 1, y)
-
- r := make([]int64, 8, 8)
-
- foo := func(v simd.Int64x8, a, b int64) {
- v.StoreSlice(r)
- checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
- }
-
- foo(ll, 0, 1)
- foo(hh, 3, 2)
- foo(lh, 0, 3)
- foo(hl, 2, 1)
-}
-
-func TestString(t *testing.T) {
- x := simd.LoadUint32x4Slice([]uint32{0, 1, 2, 3})
- y := simd.LoadInt64x4Slice([]int64{-4, -5, -6, -7})
- z := simd.LoadFloat32x4Slice([]float32{0.5, 1.5, -2.5, 3.5e9})
- w := simd.LoadFloat64x4Slice([]float64{0.5, 1.5, -2.5, 3.5e9})
-
- sx := "{0,1,2,3}"
- sy := "{-4,-5,-6,-7}"
- sz := "{0.5,1.5,-2.5,3.5e+09}"
- sw := sz
-
- if x.String() != sx {
- t.Errorf("x=%s wanted %s", x, sx)
- }
- if y.String() != sy {
- t.Errorf("y=%s wanted %s", y, sy)
- }
- if z.String() != sz {
- t.Errorf("z=%s wanted %s", z, sz)
- }
- if w.String() != sw {
- t.Errorf("w=%s wanted %s", w, sw)
- }
- t.Logf("w=%s", w)
- t.Logf("x=%s", x)
- t.Logf("y=%s", y)
- t.Logf("z=%s", z)
-}
-
-// a returns a slice of 16 int32s.
-func a() []int32 {
- return make([]int32, 16, 16)
-}
-
-// applyTo3 returns a 16-element slice of the results of
-// applying f to the respective elements of vectors x, y, and z.
-func applyTo3(x, y, z simd.Int32x16, f func(x, y, z int32) int32) []int32 {
- ax, ay, az := a(), a(), a()
- x.StoreSlice(ax)
- y.StoreSlice(ay)
- z.StoreSlice(az)
-
- r := a()
- for i := range r {
- r[i] = f(ax[i], ay[i], az[i])
- }
- return r
-}
-
-// applyTo4 returns a 16-element slice of the results of
-// applying f to the respective elements of vectors x, y, z, and w.
-func applyTo4(x, y, z, w simd.Int32x16, f func(x, y, z, w int32) int32) []int32 {
- ax, ay, az, aw := a(), a(), a(), a()
- x.StoreSlice(ax)
- y.StoreSlice(ay)
- z.StoreSlice(az)
- w.StoreSlice(aw)
-
- r := make([]int32, len(ax), len(ax))
- for i := range r {
- r[i] = f(ax[i], ay[i], az[i], aw[i])
- }
- return r
-}
-
-func TestSelectTernOptInt32x16(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
- ax := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
- ay := []int32{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}
- az := []int32{0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}
- aw := []int32{0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}
- am := []int32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
-
- x := simd.LoadInt32x16Slice(ax)
- y := simd.LoadInt32x16Slice(ay)
- z := simd.LoadInt32x16Slice(az)
- w := simd.LoadInt32x16Slice(aw)
- m := simd.LoadInt32x16Slice(am)
-
- foo := func(v simd.Int32x16, s []int32) {
- r := make([]int32, 16, 16)
- v.StoreSlice(r)
- checkSlices[int32](t, r, s)
- }
-
- t0 := w.Xor(y).Xor(z)
- ft0 := func(w, y, z int32) int32 {
- return w ^ y ^ z
- }
- foo(t0, applyTo3(w, y, z, ft0))
-
- t1 := m.And(w.Xor(y).Xor(z.Not()))
- ft1 := func(m, w, y, z int32) int32 {
- return m & (w ^ y ^ ^z)
- }
- foo(t1, applyTo4(m, w, y, z, ft1))
-
- t2 := x.Xor(y).Xor(z).And(x.Xor(y).Xor(z.Not()))
- ft2 := func(x, y, z int32) int32 {
- return (x ^ y ^ z) & (x ^ y ^ ^z)
- }
- foo(t2, applyTo3(x, y, z, ft2))
-}
-
-func TestMaskedMerge(t *testing.T) {
- x := simd.LoadInt64x4Slice([]int64{1, 2, 3, 4})
- y := simd.LoadInt64x4Slice([]int64{5, 6, 1, 1})
- z := simd.LoadInt64x4Slice([]int64{-1, -2, -3, -4})
- res := make([]int64, 4)
- expected := []int64{6, 8, -3, -4}
- mask := x.Less(y)
- if simd.X86.AVX512() {
- x.Add(y).Merge(z, mask).StoreSlice(res)
- } else {
- x.Add(y).Merge(z, mask).StoreSlice(res)
- }
- for i := range 4 {
- if res[i] != expected[i] {
- t.Errorf("got %d wanted %d", res[i], expected[i])
- }
- }
-}
-
-func TestDotProductQuadruple(t *testing.T) {
- if !simd.X86.AVXVNNI() {
- t.Skip("Test requires X86.AVXVNNI, not available on this hardware")
- return
- }
- xd := make([]int8, 16)
- yd := make([]uint8, 16)
- zd := make([]int32, 4)
- wanted1 := make([]int32, 4)
- wanted2 := make([]int32, 4)
- res1 := make([]int32, 4)
- res2 := make([]int32, 4)
- for i := range 4 {
- xd[i] = 5
- yd[i] = 6
- zd[i] = 3
- wanted1[i] = 30
- wanted2[i] = 30
- }
- x := simd.LoadInt8x16Slice(xd)
- y := simd.LoadUint8x16Slice(yd)
- z := simd.LoadInt32x4Slice(zd)
- x.DotProductQuadruple(y).StoreSlice(res1)
- x.DotProductQuadruple(y).Add(z).StoreSlice(res1)
- for i := range 4 {
- if res1[i] != wanted1[i] {
- t.Errorf("got %d wanted %d", res1[i], wanted1[i])
- }
- if res2[i] != wanted2[i] {
- t.Errorf("got %d wanted %d", res2[i], wanted2[i])
- }
- }
-}
-
-func TestPermuteScalars(t *testing.T) {
- x := []int32{11, 12, 13, 14}
- want := []int32{12, 13, 14, 11}
- got := make([]int32, 4)
- simd.LoadInt32x4Slice(x).PermuteScalars(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteScalarsGrouped(t *testing.T) {
- x := []int32{11, 12, 13, 14, 21, 22, 23, 24}
- want := []int32{12, 13, 14, 11, 22, 23, 24, 21}
- got := make([]int32, 8)
- simd.LoadInt32x8Slice(x).PermuteScalarsGrouped(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteScalarsHi(t *testing.T) {
- x := []int16{-1, -2, -3, -4, 11, 12, 13, 14}
- want := []int16{-1, -2, -3, -4, 12, 13, 14, 11}
- got := make([]int16, len(x))
- simd.LoadInt16x8Slice(x).PermuteScalarsHi(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteScalarsLo(t *testing.T) {
- x := []int16{11, 12, 13, 14, 4, 5, 6, 7}
- want := []int16{12, 13, 14, 11, 4, 5, 6, 7}
- got := make([]int16, len(x))
- simd.LoadInt16x8Slice(x).PermuteScalarsLo(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteScalarsHiGrouped(t *testing.T) {
- x := []int16{-1, -2, -3, -4, 11, 12, 13, 14, -11, -12, -13, -14, 111, 112, 113, 114}
- want := []int16{-1, -2, -3, -4, 12, 13, 14, 11, -11, -12, -13, -14, 112, 113, 114, 111}
- got := make([]int16, len(x))
- simd.LoadInt16x16Slice(x).PermuteScalarsHiGrouped(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestPermuteScalarsLoGrouped(t *testing.T) {
- x := []int16{11, 12, 13, 14, 4, 5, 6, 7, 111, 112, 113, 114, 14, 15, 16, 17}
- want := []int16{12, 13, 14, 11, 4, 5, 6, 7, 112, 113, 114, 111, 14, 15, 16, 17}
- got := make([]int16, len(x))
- simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
- checkSlices(t, got, want)
-}
-
-func TestClMul(t *testing.T) {
- var x = simd.LoadUint64x2Slice([]uint64{1, 5})
- var y = simd.LoadUint64x2Slice([]uint64{3, 9})
-
- foo := func(v simd.Uint64x2, s []uint64) {
- r := make([]uint64, 2, 2)
- v.StoreSlice(r)
- checkSlices[uint64](t, r, s)
- }
-
- foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
- foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
- foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
- foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
- foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
-
-}
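
As a cross-check on the expected values above: carryless multiplication is polynomial multiplication over GF(2), so partial products are combined with XOR instead of addition; for instance 5 clmul 9 = (0b101<<3) ^ 0b101 = 0b101101 = 45, and 3 clmul 3 = 0b110 ^ 0b011 = 5. A minimal scalar sketch (not part of the package) reproducing the constants used in TestClMul:

func clmulRef(x, y uint64) uint64 {
	var r uint64
	for i := 0; i < 64; i++ {
		if y>>i&1 == 1 {
			r ^= x << i // XOR in the shifted partial product; no carries propagate
		}
	}
	return r // low 64 bits of the carryless product
}

// clmulRef(1, 3) == 3, clmulRef(5, 3) == 15, clmulRef(5, 9) == 45, clmulRef(3, 3) == 5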
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "math"
-)
-
-func less[T number](x, y T) bool {
- return x < y
-}
-func lessEqual[T number](x, y T) bool {
- return x <= y
-}
-func greater[T number](x, y T) bool {
- return x > y
-}
-func greaterEqual[T number](x, y T) bool {
- return x >= y
-}
-func equal[T number](x, y T) bool {
- return x == y
-}
-func notEqual[T number](x, y T) bool {
- return x != y
-}
-
-func abs[T number](x T) T {
- // TODO this will need a non-standard FP-equality test.
- if x == 0 { // true if x is -0.
- return 0 // this is not a negative zero
- }
- if x < 0 {
- return -x
- }
- return x
-}
-
-func ceil[T float](x T) T {
- return T(math.Ceil(float64(x)))
-}
-func floor[T float](x T) T {
- return T(math.Floor(float64(x)))
-}
-func not[T integer](x T) T {
- return ^x
-}
-func round[T float](x T) T {
- return T(math.RoundToEven(float64(x)))
-}
-func sqrt[T float](x T) T {
- return T(math.Sqrt(float64(x)))
-}
-func trunc[T float](x T) T {
- return T(math.Trunc(float64(x)))
-}
-
-func add[T number](x, y T) T {
- return x + y
-}
-
-func sub[T number](x, y T) T {
- return x - y
-}
-
-func max_[T number](x, y T) T { // named max_ because a package-level "max" would shadow the builtin and recurse
- return max(x, y)
-}
-
-func min_[T number](x, y T) T { // named min_ because a package-level "min" would shadow the builtin and recurse
- return min(x, y)
-}
-
-// mul is also the reference for integer mulLow (the low half of the product).
-func mul[T number](x, y T) T {
- return x * y
-}
-
-func div[T number](x, y T) T {
- return x / y
-}
-
-func and[T integer](x, y T) T {
- return x & y
-}
-
-func andNotI[T integer](x, y T) T {
- return x & ^y // order corrected to match expectations
-}
-
-func orI[T integer](x, y T) T {
- return x | y
-}
-
-func xorI[T integer](x, y T) T {
- return x ^ y
-}
-
-func ima[T integer](x, y, z T) T {
- return x*y + z
-}
-
-func fma[T float](x, y, z T) T {
- return T(math.FMA(float64(x), float64(y), float64(z)))
-}
-
-func toUint8[T number](x T) uint8 {
- return uint8(x)
-}
-
-func toUint16[T number](x T) uint16 {
- return uint16(x)
-}
-
-func toUint64[T number](x T) uint64 {
- return uint64(x)
-}
-
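-// toUint32 clamps NaN and out-of-range inputs to MaxUint32 before converting, presumably to match
-// the saturating behavior of the hardware float-to-uint32 conversions these tests check against.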
-func toUint32[T number](x T) uint32 {
- switch y := (any(x)).(type) {
- case float32:
- if y < 0 || y > float32(math.MaxUint32) || y != y {
- return math.MaxUint32
- }
- case float64:
- if y < 0 || y > float64(math.MaxUint32) || y != y {
- return math.MaxUint32
- }
- }
- return uint32(x)
-}
-
-func toInt8[T number](x T) int8 {
- return int8(x)
-}
-
-func toInt16[T number](x T) int16 {
- return int16(x)
-}
-
-func toInt32[T number](x T) int32 {
- return int32(x)
-}
-
-func toInt64[T number](x T) int64 {
- return int64(x)
-}
-
-func toFloat32[T number](x T) float32 {
- return float32(x)
-}
-
-func toFloat64[T number](x T) float64 {
- return float64(x)
-}
-
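-// ceilResidueForPrecision returns a function that computes x minus x rounded up to a multiple of
-// 2^-i, i.e. the (non-positive) residue left by a ceiling at i fractional bits.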
-func ceilResidueForPrecision[T float](i int) func(T) T {
- f := 1.0
- for i > 0 {
- f *= 2
- i--
- }
- return func(x T) T {
- y := float64(x)
- if math.IsInf(float64(x*T(f)), 0) {
- return 0
- }
-		// TODO sort out the rounding issues when T is float32
- return T(y - math.Ceil(y*f)/f)
- }
-}
-
-// Slice versions of all these elementwise operations
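-// map1, map2, map3, and mapCompare are helpers defined elsewhere in this test package that apply
-// an elementwise function across whole slices.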
-
-func addSlice[T number](x, y []T) []T {
- return map2[T](add)(x, y)
-}
-
-func subSlice[T number](x, y []T) []T {
- return map2[T](sub)(x, y)
-}
-
-func maxSlice[T number](x, y []T) []T {
- return map2[T](max_)(x, y)
-}
-
-func minSlice[T number](x, y []T) []T {
- return map2[T](min_)(x, y)
-}
-
-// mulLow for integers
-func mulSlice[T number](x, y []T) []T {
- return map2[T](mul)(x, y)
-}
-
-func divSlice[T number](x, y []T) []T {
- return map2[T](div)(x, y)
-}
-
-func andSlice[T integer](x, y []T) []T {
- return map2[T](and)(x, y)
-}
-
-func andNotSlice[T integer](x, y []T) []T {
- return map2[T](andNotI)(x, y)
-}
-
-func orSlice[T integer](x, y []T) []T {
- return map2[T](orI)(x, y)
-}
-
-func xorSlice[T integer](x, y []T) []T {
- return map2[T](xorI)(x, y)
-}
-
-func lessSlice[T number](x, y []T) []int64 {
- return mapCompare[T](less)(x, y)
-}
-
-func lessEqualSlice[T number](x, y []T) []int64 {
- return mapCompare[T](lessEqual)(x, y)
-}
-
-func greaterSlice[T number](x, y []T) []int64 {
- return mapCompare[T](greater)(x, y)
-}
-
-func greaterEqualSlice[T number](x, y []T) []int64 {
- return mapCompare[T](greaterEqual)(x, y)
-}
-
-func equalSlice[T number](x, y []T) []int64 {
- return mapCompare[T](equal)(x, y)
-}
-
-func notEqualSlice[T number](x, y []T) []int64 {
- return mapCompare[T](notEqual)(x, y)
-}
-
-func ceilSlice[T float](x []T) []T {
- return map1[T](ceil)(x)
-}
-
-func floorSlice[T float](x []T) []T {
- return map1[T](floor)(x)
-}
-
-func notSlice[T integer](x []T) []T {
- return map1[T](not)(x)
-}
-
-func roundSlice[T float](x []T) []T {
- return map1[T](round)(x)
-}
-
-func sqrtSlice[T float](x []T) []T {
- return map1[T](sqrt)(x)
-}
-
-func truncSlice[T float](x []T) []T {
- return map1[T](trunc)(x)
-}
-
-func imaSlice[T integer](x, y, z []T) []T {
- return map3[T](ima)(x, y, z)
-}
-
-func fmaSlice[T float](x, y, z []T) []T {
- return map3[T](fma)(x, y, z)
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-func TestSlicePartInt8x16(t *testing.T) {
- Do(t, 16, func(a, c []int8) {
- u := simd.LoadInt8x16SlicePart(a)
- u.StoreSlice(c)
- })
-}
-
-func TestSlicePartInt8x32(t *testing.T) {
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
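-	// Load successively shorter prefixes a[:i]; elements beyond the prefix must come back as zero.
-	// Zeroing b[i-1] after each check keeps b equal to the expected zero-padded result.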
- for i := 32; i >= 0; i-- {
- u := simd.LoadInt8x32SlicePart(a[:i])
- c := make([]int8, 32, 32)
- u.StoreSlice(c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicePartUint8x16(t *testing.T) {
- a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- u := simd.LoadUint8x16SlicePart(a[:i])
- c := make([]uint8, 32, 32)
- u.StoreSlice(c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicePartUint8x32(t *testing.T) {
- a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- for i := 32; i >= 0; i-- {
- u := simd.LoadUint8x32SlicePart(a[:i])
- c := make([]uint8, 32, 32)
- u.StoreSlice(c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicePartInt16x8(t *testing.T) {
- a := []int16{1, 2, 3, 4, 5, 6, 7, 8}
- b := []int16{1, 2, 3, 4, 5, 6, 7, 8}
- for i := 8; i >= 0; i-- {
- u := simd.LoadInt16x8SlicePart(a[:i])
- c := make([]int16, 16, 16)
- u.StoreSlice(c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicePartInt16x16(t *testing.T) {
- a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- u := simd.LoadInt16x16SlicePart(a[:i])
- c := make([]int16, 16, 16)
- u.StoreSlice(c)
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreInt8x16(t *testing.T) {
- a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- v := simd.LoadInt8x16Slice(a)
- c := make([]int8, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreInt16x8(t *testing.T) {
- a := []int16{1, 2, 3, 4, 5, 6, 7, 8}
- b := []int16{1, 2, 3, 4, 5, 6, 7, 8}
- for i := 8; i >= 0; i-- {
- v := simd.LoadInt16x8Slice(a)
- c := make([]int16, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreInt16x16(t *testing.T) {
- a := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []int16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- v := simd.LoadInt16x16Slice(a)
- c := make([]int16, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreUint8x16(t *testing.T) {
- a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- v := simd.LoadUint8x16Slice(a)
- c := make([]uint8, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreUint16x16(t *testing.T) {
- a := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- b := []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
- for i := 16; i >= 0; i-- {
- v := simd.LoadUint16x16Slice(a)
- c := make([]uint16, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicesPartStoreUint8x32(t *testing.T) {
- a := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- b := []uint8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
- for i := 32; i >= 0; i-- {
- v := simd.LoadUint8x32Slice(a)
- c := make([]uint8, 32, 32)
- v.StoreSlicePart(c[:i])
- checkSlices(t, c, b)
- if i > 0 {
- b[i-1] = 0
- }
- }
-}
-
-func TestSlicePartInt32(t *testing.T) {
- // 32x4
- L := 4
- c := []int32{1, 2, 3, 4, 5, -1, -1, -1, -1}
- a := c[:L+1]
- for i := range a {
- // Test the load first
- // e is a partial slice.
- e := a[i:]
- v := simd.LoadInt32x4SlicePart(e)
-		// d contains what the loaded vector ought to contain
- d := make([]int32, L)
- for j := 0; j < len(e) && j < len(d); j++ {
- d[j] = e[j]
- }
-
- b := make([]int32, L)
- v.StoreSlice(b)
- // test the load
- checkSlices(t, d, b)
-
- // Test the store
- f := make([]int32, L+1)
- for i := range f {
- f[i] = 99
- }
-
- v.StoreSlicePart(f[:len(e)])
- if len(e) < len(b) {
- checkSlices(t, f, b[:len(e)])
- } else {
- checkSlices(t, f, b)
- }
- for i := len(e); i < len(f); i++ {
- if f[i] != 99 {
- t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
- }
- }
- }
-}
-
-func TestSlicePartUint64(t *testing.T) {
- // 64x4
- L := 4
- c := []uint64{1, 2, 3, 4, 5, 86, 86, 86, 86}
- a := c[:L+1]
- for i := range a {
- // Test the load first
- // e is a partial slice.
- e := a[i:]
- v := simd.LoadUint64x4SlicePart(e)
-		// d contains what the loaded vector ought to contain
- d := make([]uint64, L)
- for j := 0; j < len(e) && j < len(d); j++ {
- d[j] = e[j]
- }
-
- b := make([]uint64, L)
- v.StoreSlice(b)
- // test the load
- checkSlices(t, d, b)
-
- // Test the store
- f := make([]uint64, L+1)
- for i := range f {
- f[i] = 99
- }
-
- v.StoreSlicePart(f[:len(e)])
- if len(e) < len(b) {
- checkSlices(t, f, b[:len(e)])
- } else {
- checkSlices(t, f, b)
- }
- for i := len(e); i < len(f); i++ {
- if f[i] != 99 {
- t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
- }
- }
- }
-}
-
-func TestSlicePartFloat64(t *testing.T) {
- // 64x2
- L := 2
- c := []float64{1, 2, 3, 86, 86, 86, 86}
- a := c[:L+1]
- for i := range a {
- // Test the load first
- // e is a partial slice.
- e := a[i:]
- v := simd.LoadFloat64x2SlicePart(e)
-		// d contains what the loaded vector ought to contain
- d := make([]float64, L)
- for j := 0; j < len(e) && j < len(d); j++ {
- d[j] = e[j]
- }
-
- b := make([]float64, L)
- v.StoreSlice(b)
- // test the load
- checkSlices(t, d, b)
-
- // Test the store
- f := make([]float64, L+1)
- for i := range f {
- f[i] = 99
- }
-
- v.StoreSlicePart(f[:len(e)])
- if len(e) < len(b) {
- checkSlices(t, f, b[:len(e)])
- } else {
- checkSlices(t, f, b)
- }
- for i := len(e); i < len(f); i++ {
- if f[i] != 99 {
- t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
- }
- }
- }
-}
-
-func TestSlicePartFloat32(t *testing.T) {
- // 32x8
- L := 8
- c := []float32{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
- a := c[:L+1]
- for i := range a {
- // Test the load first
- // e is a partial slice.
- e := a[i:]
- v := simd.LoadFloat32x8SlicePart(e)
-		// d contains what the loaded vector ought to contain
- d := make([]float32, L)
- for j := 0; j < len(e) && j < len(d); j++ {
- d[j] = e[j]
- }
-
- b := make([]float32, L)
- v.StoreSlice(b)
- // test the load
- checkSlices(t, d, b)
-
- // Test the store
- f := make([]float32, L+1)
- for i := range f {
- f[i] = 99
- }
-
- v.StoreSlicePart(f[:len(e)])
- if len(e) < len(b) {
- checkSlices(t, f, b[:len(e)])
- } else {
- checkSlices(t, f, b)
- }
- for i := len(e); i < len(f); i++ {
- if f[i] != 99 {
- t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
- }
- }
- }
-}
-
-// 512-bit load
-
-func TestSlicePartInt64(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Test requires X86.AVX512, not available on this hardware")
- return
- }
-
- L := 8
- c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
- a := c[:L+1]
- for i := range a {
- // Test the load first
- // e is a partial slice.
- e := a[i:]
- v := simd.LoadInt64x8SlicePart(e)
-		// d contains what the loaded vector ought to contain
- d := make([]int64, L)
- for j := 0; j < len(e) && j < len(d); j++ {
- d[j] = e[j]
- }
-
- b := make([]int64, L)
- v.StoreSlice(b)
- // test the load
- checkSlicesLogInput(t, b, d, 0.0, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
-
- // Test the store
- f := make([]int64, L+1)
- for i := range f {
- f[i] = 99
- }
-
- v.StoreSlicePart(f[:len(e)])
- if len(e) < len(b) {
- checkSlices(t, f, b[:len(e)])
- } else {
- checkSlices(t, f, b)
- }
- for i := len(e); i < len(f); i++ {
- if f[i] != 99 {
- t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", i, f[i])
- }
- }
- }
-}
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing ternary simd methods.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// testInt8x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt8x16Ternary(t *testing.T, f func(_, _, _ simd.Int8x16) simd.Int8x16, want func(_, _, _ []int8) []int8) {
- n := 16
- t.Helper()
- forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- b := simd.LoadInt8x16Slice(y)
- c := simd.LoadInt8x16Slice(z)
- g := make([]int8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt16x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt16x8Ternary(t *testing.T, f func(_, _, _ simd.Int16x8) simd.Int16x8, want func(_, _, _ []int16) []int16) {
- n := 8
- t.Helper()
- forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- b := simd.LoadInt16x8Slice(y)
- c := simd.LoadInt16x8Slice(z)
- g := make([]int16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt32x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt32x4Ternary(t *testing.T, f func(_, _, _ simd.Int32x4) simd.Int32x4, want func(_, _, _ []int32) []int32) {
- n := 4
- t.Helper()
- forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- b := simd.LoadInt32x4Slice(y)
- c := simd.LoadInt32x4Slice(z)
- g := make([]int32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt64x2Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt64x2Ternary(t *testing.T, f func(_, _, _ simd.Int64x2) simd.Int64x2, want func(_, _, _ []int64) []int64) {
- n := 2
- t.Helper()
- forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
- t.Helper()
- a := simd.LoadInt64x2Slice(x)
- b := simd.LoadInt64x2Slice(y)
- c := simd.LoadInt64x2Slice(z)
- g := make([]int64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint8x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint8x16Ternary(t *testing.T, f func(_, _, _ simd.Uint8x16) simd.Uint8x16, want func(_, _, _ []uint8) []uint8) {
- n := 16
- t.Helper()
- forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- b := simd.LoadUint8x16Slice(y)
- c := simd.LoadUint8x16Slice(z)
- g := make([]uint8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint16x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint16x8Ternary(t *testing.T, f func(_, _, _ simd.Uint16x8) simd.Uint16x8, want func(_, _, _ []uint16) []uint16) {
- n := 8
- t.Helper()
- forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- b := simd.LoadUint16x8Slice(y)
- c := simd.LoadUint16x8Slice(z)
- g := make([]uint16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint32x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint32x4Ternary(t *testing.T, f func(_, _, _ simd.Uint32x4) simd.Uint32x4, want func(_, _, _ []uint32) []uint32) {
- n := 4
- t.Helper()
- forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- b := simd.LoadUint32x4Slice(y)
- c := simd.LoadUint32x4Slice(z)
- g := make([]uint32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint64x2Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint64x2Ternary(t *testing.T, f func(_, _, _ simd.Uint64x2) simd.Uint64x2, want func(_, _, _ []uint64) []uint64) {
- n := 2
- t.Helper()
- forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x2Slice(x)
- b := simd.LoadUint64x2Slice(y)
- c := simd.LoadUint64x2Slice(z)
- g := make([]uint64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat32x4Ternary(t *testing.T, f func(_, _, _ simd.Float32x4) simd.Float32x4, want func(_, _, _ []float32) []float32) {
- n := 4
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- b := simd.LoadFloat32x4Slice(y)
- c := simd.LoadFloat32x4Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat64x2Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat64x2Ternary(t *testing.T, f func(_, _, _ simd.Float64x2) simd.Float64x2, want func(_, _, _ []float64) []float64) {
- n := 2
- t.Helper()
- forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- b := simd.LoadFloat64x2Slice(y)
- c := simd.LoadFloat64x2Slice(z)
- g := make([]float64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt8x32Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt8x32Ternary(t *testing.T, f func(_, _, _ simd.Int8x32) simd.Int8x32, want func(_, _, _ []int8) []int8) {
- n := 32
- t.Helper()
- forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- b := simd.LoadInt8x32Slice(y)
- c := simd.LoadInt8x32Slice(z)
- g := make([]int8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt16x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt16x16Ternary(t *testing.T, f func(_, _, _ simd.Int16x16) simd.Int16x16, want func(_, _, _ []int16) []int16) {
- n := 16
- t.Helper()
- forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- b := simd.LoadInt16x16Slice(y)
- c := simd.LoadInt16x16Slice(z)
- g := make([]int16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt32x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt32x8Ternary(t *testing.T, f func(_, _, _ simd.Int32x8) simd.Int32x8, want func(_, _, _ []int32) []int32) {
- n := 8
- t.Helper()
- forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- b := simd.LoadInt32x8Slice(y)
- c := simd.LoadInt32x8Slice(z)
- g := make([]int32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt64x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt64x4Ternary(t *testing.T, f func(_, _, _ simd.Int64x4) simd.Int64x4, want func(_, _, _ []int64) []int64) {
- n := 4
- t.Helper()
- forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- b := simd.LoadInt64x4Slice(y)
- c := simd.LoadInt64x4Slice(z)
- g := make([]int64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint8x32Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint8x32Ternary(t *testing.T, f func(_, _, _ simd.Uint8x32) simd.Uint8x32, want func(_, _, _ []uint8) []uint8) {
- n := 32
- t.Helper()
- forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- b := simd.LoadUint8x32Slice(y)
- c := simd.LoadUint8x32Slice(z)
- g := make([]uint8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint16x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint16x16Ternary(t *testing.T, f func(_, _, _ simd.Uint16x16) simd.Uint16x16, want func(_, _, _ []uint16) []uint16) {
- n := 16
- t.Helper()
- forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- b := simd.LoadUint16x16Slice(y)
- c := simd.LoadUint16x16Slice(z)
- g := make([]uint16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint32x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint32x8Ternary(t *testing.T, f func(_, _, _ simd.Uint32x8) simd.Uint32x8, want func(_, _, _ []uint32) []uint32) {
- n := 8
- t.Helper()
- forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- b := simd.LoadUint32x8Slice(y)
- c := simd.LoadUint32x8Slice(z)
- g := make([]uint32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint64x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint64x4Ternary(t *testing.T, f func(_, _, _ simd.Uint64x4) simd.Uint64x4, want func(_, _, _ []uint64) []uint64) {
- n := 4
- t.Helper()
- forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- b := simd.LoadUint64x4Slice(y)
- c := simd.LoadUint64x4Slice(z)
- g := make([]uint64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat32x8Ternary(t *testing.T, f func(_, _, _ simd.Float32x8) simd.Float32x8, want func(_, _, _ []float32) []float32) {
- n := 8
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- b := simd.LoadFloat32x8Slice(y)
- c := simd.LoadFloat32x8Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat64x4Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat64x4Ternary(t *testing.T, f func(_, _, _ simd.Float64x4) simd.Float64x4, want func(_, _, _ []float64) []float64) {
- n := 4
- t.Helper()
- forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- b := simd.LoadFloat64x4Slice(y)
- c := simd.LoadFloat64x4Slice(z)
- g := make([]float64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt8x64Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt8x64Ternary(t *testing.T, f func(_, _, _ simd.Int8x64) simd.Int8x64, want func(_, _, _ []int8) []int8) {
- n := 64
- t.Helper()
- forSliceTriple(t, int8s, n, func(x, y, z []int8) bool {
- t.Helper()
- a := simd.LoadInt8x64Slice(x)
- b := simd.LoadInt8x64Slice(y)
- c := simd.LoadInt8x64Slice(z)
- g := make([]int8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt16x32Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt16x32Ternary(t *testing.T, f func(_, _, _ simd.Int16x32) simd.Int16x32, want func(_, _, _ []int16) []int16) {
- n := 32
- t.Helper()
- forSliceTriple(t, int16s, n, func(x, y, z []int16) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- b := simd.LoadInt16x32Slice(y)
- c := simd.LoadInt16x32Slice(z)
- g := make([]int16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt32x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt32x16Ternary(t *testing.T, f func(_, _, _ simd.Int32x16) simd.Int32x16, want func(_, _, _ []int32) []int32) {
- n := 16
- t.Helper()
- forSliceTriple(t, int32s, n, func(x, y, z []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- b := simd.LoadInt32x16Slice(y)
- c := simd.LoadInt32x16Slice(z)
- g := make([]int32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testInt64x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testInt64x8Ternary(t *testing.T, f func(_, _, _ simd.Int64x8) simd.Int64x8, want func(_, _, _ []int64) []int64) {
- n := 8
- t.Helper()
- forSliceTriple(t, int64s, n, func(x, y, z []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- b := simd.LoadInt64x8Slice(y)
- c := simd.LoadInt64x8Slice(z)
- g := make([]int64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint8x64Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint8x64Ternary(t *testing.T, f func(_, _, _ simd.Uint8x64) simd.Uint8x64, want func(_, _, _ []uint8) []uint8) {
- n := 64
- t.Helper()
- forSliceTriple(t, uint8s, n, func(x, y, z []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x64Slice(x)
- b := simd.LoadUint8x64Slice(y)
- c := simd.LoadUint8x64Slice(z)
- g := make([]uint8, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint16x32Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint16x32Ternary(t *testing.T, f func(_, _, _ simd.Uint16x32) simd.Uint16x32, want func(_, _, _ []uint16) []uint16) {
- n := 32
- t.Helper()
- forSliceTriple(t, uint16s, n, func(x, y, z []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- b := simd.LoadUint16x32Slice(y)
- c := simd.LoadUint16x32Slice(z)
- g := make([]uint16, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint32x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint32x16Ternary(t *testing.T, f func(_, _, _ simd.Uint32x16) simd.Uint32x16, want func(_, _, _ []uint32) []uint32) {
- n := 16
- t.Helper()
- forSliceTriple(t, uint32s, n, func(x, y, z []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- b := simd.LoadUint32x16Slice(y)
- c := simd.LoadUint32x16Slice(z)
- g := make([]uint32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testUint64x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testUint64x8Ternary(t *testing.T, f func(_, _, _ simd.Uint64x8) simd.Uint64x8, want func(_, _, _ []uint64) []uint64) {
- n := 8
- t.Helper()
- forSliceTriple(t, uint64s, n, func(x, y, z []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- b := simd.LoadUint64x8Slice(y)
- c := simd.LoadUint64x8Slice(z)
- g := make([]uint64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x16Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat32x16Ternary(t *testing.T, f func(_, _, _ simd.Float32x16) simd.Float32x16, want func(_, _, _ []float32) []float32) {
- n := 16
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- b := simd.LoadFloat32x16Slice(y)
- c := simd.LoadFloat32x16Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat64x8Ternary tests the simd ternary method f against the expected behavior generated by want
-func testFloat64x8Ternary(t *testing.T, f func(_, _, _ simd.Float64x8) simd.Float64x8, want func(_, _, _ []float64) []float64) {
- n := 8
- t.Helper()
- forSliceTriple(t, float64s, n, func(x, y, z []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- b := simd.LoadFloat64x8Slice(y)
- c := simd.LoadFloat64x8Slice(z)
- g := make([]float64, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x4TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
-// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
-func testFloat32x4TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x4) simd.Float32x4, want func(x, y, z []float32) []float32, flakiness float64) {
- n := 4
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- b := simd.LoadFloat32x4Slice(y)
- c := simd.LoadFloat32x4Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x8TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
-// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
-func testFloat32x8TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x8) simd.Float32x8, want func(x, y, z []float32) []float32, flakiness float64) {
- n := 8
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- b := simd.LoadFloat32x8Slice(y)
- c := simd.LoadFloat32x8Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
-
-// testFloat32x16TernaryFlaky tests the simd ternary method f against the expected behavior generated by want,
-// but using a flakiness parameter because we haven't exactly figured out how simd floating point works
-func testFloat32x16TernaryFlaky(t *testing.T, f func(x, y, z simd.Float32x16) simd.Float32x16, want func(x, y, z []float32) []float32, flakiness float64) {
- n := 16
- t.Helper()
- forSliceTriple(t, float32s, n, func(x, y, z []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- b := simd.LoadFloat32x16Slice(y)
- c := simd.LoadFloat32x16Slice(z)
- g := make([]float32, n)
- f(a, b, c).StoreSlice(g)
- w := want(x, y, z)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x); t.Logf("y=%v", y); t.Logf("z=%v", z) })
- })
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-func TestFMA(t *testing.T) {
- if simd.X86.AVX512() {
- testFloat32x4TernaryFlaky(t, simd.Float32x4.MulAdd, fmaSlice[float32], 0.001)
- testFloat32x8TernaryFlaky(t, simd.Float32x8.MulAdd, fmaSlice[float32], 0.001)
- testFloat32x16TernaryFlaky(t, simd.Float32x16.MulAdd, fmaSlice[float32], 0.001)
- testFloat64x2Ternary(t, simd.Float64x2.MulAdd, fmaSlice[float64])
- testFloat64x4Ternary(t, simd.Float64x4.MulAdd, fmaSlice[float64])
- testFloat64x8Ternary(t, simd.Float64x8.MulAdd, fmaSlice[float64])
- }
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "fmt"
- "simd"
- "testing"
-)
-
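-// Transpose4 transposes a 4x4 int32 matrix held in four Int32x4 values: an interleave pass pairs
-// elements of adjacent rows, then SelectFromPair gathers the transposed rows.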
-func Transpose4(a0, a1, a2, a3 simd.Int32x4) (b0, b1, b2, b3 simd.Int32x4) {
- t0, t1 := a0.InterleaveLo(a1), a0.InterleaveHi(a1)
- t2, t3 := a2.InterleaveLo(a3), a2.InterleaveHi(a3)
-
- // a0: ABCD ==> t0: A1B2
- // a1: 1234 t1: C3D4
- // a2: EFGH t2: E5F6
- // a3: 5678 t3: G7H8
-
- // need
- // A1E5
- // B2F6
- // C3G7
- // D4H8
-
- b0 = t0.SelectFromPair(0, 1, 4, 5, t2) // lower elements from each
- b1 = t0.SelectFromPair(2, 3, 6, 7, t2) // upper elements from each
- b2 = t1.SelectFromPair(0, 1, 4, 5, t3) // lowers
- b3 = t1.SelectFromPair(2, 3, 6, 7, t3) // uppers
- return
-}
-
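-// Transpose8 transposes an 8x8 int32 matrix held in eight Int32x8 values. The grouped interleave
-// and select operations work within each 128-bit half, so a final Select128FromPair pass exchanges
-// the halves to complete the transpose.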
-func Transpose8(a0, a1, a2, a3, a4, a5, a6, a7 simd.Int32x8) (b0, b1, b2, b3, b4, b5, b6, b7 simd.Int32x8) {
- t0, t1 := a0.InterleaveLoGrouped(a1), a0.InterleaveHiGrouped(a1)
- t2, t3 := a2.InterleaveLoGrouped(a3), a2.InterleaveHiGrouped(a3)
- t4, t5 := a4.InterleaveLoGrouped(a5), a4.InterleaveHiGrouped(a5)
- t6, t7 := a6.InterleaveLoGrouped(a7), a6.InterleaveHiGrouped(a7)
-
-	// Within each 128-bit group (one group shown):
-	// a0: ABCD ==> t0: A1B2
- // a1: 1234 t1: C3D4
- // a2: EFGH t2: E5F6
- // a3: 5678 t3: G7H8
-
- // need
- // A1E5
- // B2F6
- // C3G7
- // D4H8
-
- a0 = t0.SelectFromPairGrouped(0, 1, 4, 5, t2) // lower elements from each
- a1 = t0.SelectFromPairGrouped(2, 3, 6, 7, t2) // upper elements from each
- a2 = t1.SelectFromPairGrouped(0, 1, 4, 5, t3) // lowers
- a3 = t1.SelectFromPairGrouped(2, 3, 6, 7, t3) // uppers
-
- a4 = t4.SelectFromPairGrouped(0, 1, 4, 5, t6) // lower elements from each
- a5 = t4.SelectFromPairGrouped(2, 3, 6, 7, t6) // upper elements from each
- a6 = t5.SelectFromPairGrouped(0, 1, 4, 5, t7) // lowers
- a7 = t5.SelectFromPairGrouped(2, 3, 6, 7, t7) // uppers
-
-	// Next, swap the upper 128 bits of a0-a3 with the lower 128 bits of a4-a7.
-
- b0 = a0.Select128FromPair(0, 2, a4)
- b4 = a0.Select128FromPair(1, 3, a4)
-
- b1 = a1.Select128FromPair(0, 2, a5)
- b5 = a1.Select128FromPair(1, 3, a5)
-
- b2 = a2.Select128FromPair(0, 2, a6)
- b6 = a2.Select128FromPair(1, 3, a6)
-
- b3 = a3.Select128FromPair(0, 2, a7)
- b7 = a3.Select128FromPair(1, 3, a7)
-
- return
-}
-
-func TestTranspose4(t *testing.T) {
- r := make([]int32, 16, 16)
-
- w := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
- x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
- y := simd.LoadInt32x4Slice([]int32{0xE, 0xF, 0x10, 0x11})
- z := simd.LoadInt32x4Slice([]int32{5, 6, 7, 8})
- a, b, c, d := Transpose4(w, x, y, z)
-
- a.StoreSlice(r[0:])
- b.StoreSlice(r[4:])
- c.StoreSlice(r[8:])
- d.StoreSlice(r[12:])
-
- checkSlices[int32](t, r, []int32{
- 0xA, 1, 0xE, 5,
- 0xB, 2, 0xF, 6,
- 0xC, 3, 0x10, 7,
- 0xD, 4, 0x11, 8,
- })
-
-}
-
-func TestTranspose8(t *testing.T) {
- m := make([]int32, 8)
-
- a := []int32{}
- for i := int32(1); i <= 64; i++ {
- a = append(a, i)
- }
-
- p := simd.LoadInt32x8Slice(a[0:])
- q := simd.LoadInt32x8Slice(a[8:])
- r := simd.LoadInt32x8Slice(a[16:])
- s := simd.LoadInt32x8Slice(a[24:])
-
- w := simd.LoadInt32x8Slice(a[32:])
- x := simd.LoadInt32x8Slice(a[40:])
- y := simd.LoadInt32x8Slice(a[48:])
- z := simd.LoadInt32x8Slice(a[56:])
-
- p, q, r, s, w, x, y, z = Transpose8(p, q, r, s, w, x, y, z)
-
- foo := func(a simd.Int32x8, z int32) {
- a.StoreSlice(m)
- var o []int32
- for i := int32(0); i < 8; i++ {
- o = append(o, z+i*8)
- }
- checkSlices[int32](t, m, o)
- }
-
- foo(p, 1)
- foo(q, 2)
- foo(r, 3)
- foo(s, 4)
- foo(w, 5)
- foo(x, 6)
- foo(y, 7)
- foo(z, 8)
-
-}
-
-const BIG = 20000
-
-var bigMatrix [][]int32
-
-// 9x9 is the smallest matrix with both diagonal and off-diagonal tiles, plus a fringe.
-var nineMatrix [][]int32
-
-var thirtyMatrix [][]int32
-
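-// fill writes -i<<16 + j into m[i][j], encoding each element's coordinates so that isTransposed
-// can verify that every element moved to its transposed position.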
-func fill(m [][]int32) {
- for i := range m {
- m[i] = make([]int32, len(m))
- for j := range m[i] {
- m[i][j] = int32(-i<<16 + j)
- }
- }
-}
-
-func isTransposed(m [][]int32) bool {
- for i, mi := range m {
- for j, a := range mi {
- if a != int32(-j<<16+i) {
- return false
- }
- }
- }
- return true
-}
-
-func dupe(m [][]int32) [][]int32 {
- n := len(m)
- p := make([][]int32, n, n)
- for i := range p {
- t := make([]int32, n)
- for j, a := range m[i] {
- t[j] = a
- }
- p[i] = t
- }
- return p
-}
-
-func init() {
- bigMatrix = make([][]int32, BIG, BIG)
- fill(bigMatrix)
- nineMatrix = make([][]int32, 9, 9)
- fill(nineMatrix)
- thirtyMatrix = make([][]int32, 30, 30)
- fill(thirtyMatrix)
-}
-
-func BenchmarkPlainTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transposePlain(d)
- }
-}
-
-func BenchmarkTiled4Transpose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transposeTiled4(d)
- }
-}
-
-func BenchmarkTiled8Transpose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transposeTiled8(d)
- }
-}
-
-func Benchmark2BlockedTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transpose2Blocked(d)
- }
-}
-func Benchmark3BlockedTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transpose3Blocked(d)
- }
-}
-func Benchmark4BlockedTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transpose4Blocked(d)
- }
-}
-func Benchmark5aBlockedTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transpose5aBlocked(d)
- }
-}
-
-func Benchmark5bBlockedTranspose(b *testing.B) {
- d := dupe(bigMatrix)
- for b.Loop() {
- transpose5bBlocked(d)
- }
-}
-
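-// transposePlain is the scalar baseline: swap m[i][j] with m[j][i] for all j < i.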
-func transposePlain(m [][]int32) {
- for i := range m {
- for j := 0; j < i; j++ {
- t := m[i][j]
- m[i][j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
-func TestTransposePlain(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transposePlain(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- } else {
- t.Logf("Transposed plain matrix = %s", formatMatrix(d))
- }
-}
-
-func TestTranspose2Blocked(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transpose2Blocked(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- }
-}
-
-func TestTranspose3Blocked(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transpose3Blocked(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- }
-}
-
-func TestTranspose4Blocked(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transpose4Blocked(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- }
-}
-
-func TestTranspose5aBlocked(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transpose5aBlocked(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- }
-}
-
-func TestTranspose5bBlocked(t *testing.T) {
- d := dupe(nineMatrix)
- t.Logf("Input matrix is %s", formatMatrix(d))
- transpose5bBlocked(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %s", formatMatrix(d))
- }
-}
-
-func TestTransposeTiled4(t *testing.T) {
- d := dupe(nineMatrix)
- transposeTiled4(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %v", d)
- }
-}
-
-func TestTransposeTiled8(t *testing.T) {
- d := dupe(thirtyMatrix)
- transposeTiled8(d)
- if !isTransposed(d) {
- t.Errorf("d is not transposed, d = %v", d)
- }
-}
-
-func formatMatrix(m [][]int32) string {
- s := ""
- for _, mi := range m {
- s += "\n["
- for _, t := range mi {
- h := t >> 16
- l := t & 0xffff
- s += fmt.Sprintf(" (%d %d)", h, l)
- }
- s += " ]"
- }
- return s
-}
-
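-// transpose2Blocked and the 3/4/5-blocked variants below transpose m in place using BxB scalar
-// blocks: each diagonal block is transposed in place, each off-diagonal pair of blocks is swapped
-// and transposed, and leftover fringe rows fall back to the plain element swap.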
-func transpose2Blocked(m [][]int32) {
- const B = 2
- N := len(m)
- i := 0
- for ; i <= len(m)-B; i += B {
- r0, r1 := m[i], m[i+1]
- if len(r0) < N || len(r1) < N {
- panic("Early bounds check failure")
- }
- // transpose around diagonal
- d01, d10 := r0[i+1], r1[i]
- r0[i+1], r1[i] = d10, d01
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a0, a1 := m[j], m[j+1]
-
- b00, b01 := a0[i], a0[i+1]
- b10, b11 := a1[i], a1[i+1]
-
- a0[i], a0[i+1] = r0[j], r1[j]
- a1[i], a1[i+1] = r0[j+1], r1[j+1]
-
- r0[j], r0[j+1] = b00, b10
- r1[j], r1[j+1] = b01, b11
- }
- }
-
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
-func transpose3Blocked(m [][]int32) {
- const B = 3
- N := len(m)
- i := 0
- for ; i <= len(m)-B; i += B {
- r0, r1, r2 := m[i], m[i+1], m[i+2]
-		if len(r0) < N || len(r1) < N || len(r2) < N {
- panic("Early bounds check failure")
- }
- // transpose around diagonal
- d01, d10 := r0[i+1], r1[i]
- d02, d20 := r0[i+2], r2[i]
- d12, d21 := r1[i+2], r2[i+1]
-
- r0[i+1], r1[i] = d10, d01
- r0[i+2], r2[i] = d20, d02
- r1[i+2], r2[i+1] = d21, d12
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a0, a1, a2 := m[j], m[j+1], m[j+2]
-
- b00, b01, b02 := a0[i], a0[i+1], a0[i+2]
- b10, b11, b12 := a1[i], a1[i+1], a1[i+2]
- b20, b21, b22 := a2[i], a2[i+1], a2[i+2]
-
- a0[i], a0[i+1], a0[i+2] = r0[j], r1[j], r2[j]
- a1[i], a1[i+1], a1[i+2] = r0[j+1], r1[j+1], r2[j+1]
- a2[i], a2[i+1], a2[i+2] = r0[j+2], r1[j+2], r2[j+2]
-
- r0[j], r0[j+1], r0[j+2] = b00, b10, b20
- r1[j], r1[j+1], r1[j+2] = b01, b11, b21
- r2[j], r2[j+1], r2[j+2] = b02, b12, b22
- }
- }
-
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
-func transpose4Blocked(m [][]int32) {
- const B = 4
- N := len(m)
- i := 0
- for ; i <= len(m)-B; i += B {
- r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
- if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
- panic("Early bounds check failure")
- }
- // transpose around diagonal
- d01, d10 := r0[i+1], r1[i]
- d02, d20 := r0[i+2], r2[i]
- d03, d30 := r0[i+3], r3[i]
- d12, d21 := r1[i+2], r2[i+1]
- d13, d31 := r1[i+3], r3[i+1]
- d23, d32 := r2[i+3], r3[i+2]
-
- r0[i+1], r1[i] = d10, d01
- r0[i+2], r2[i] = d20, d02
- r0[i+3], r3[i] = d30, d03
- r1[i+2], r2[i+1] = d21, d12
- r1[i+3], r3[i+1] = d31, d13
- r2[i+3], r3[i+2] = d32, d23
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
-
- b00, b01, b02, b03 := a0[i], a0[i+1], a0[i+2], a0[i+3]
- b10, b11, b12, b13 := a1[i], a1[i+1], a1[i+2], a1[i+3]
- b20, b21, b22, b23 := a2[i], a2[i+1], a2[i+2], a2[i+3]
- b30, b31, b32, b33 := a3[i], a3[i+1], a3[i+2], a3[i+3]
-
- a0[i], a0[i+1], a0[i+2], a0[i+3] = r0[j], r1[j], r2[j], r3[j]
- a1[i], a1[i+1], a1[i+2], a1[i+3] = r0[j+1], r1[j+1], r2[j+1], r3[j+1]
- a2[i], a2[i+1], a2[i+2], a2[i+3] = r0[j+2], r1[j+2], r2[j+2], r3[j+2]
- a3[i], a3[i+1], a3[i+2], a3[i+3] = r0[j+3], r1[j+3], r2[j+3], r3[j+3]
-
- r0[j], r0[j+1], r0[j+2], r0[j+3] = b00, b10, b20, b30
- r1[j], r1[j+1], r1[j+2], r1[j+3] = b01, b11, b21, b31
- r2[j], r2[j+1], r2[j+2], r2[j+3] = b02, b12, b22, b32
- r3[j], r3[j+1], r3[j+2], r3[j+3] = b03, b13, b23, b33
- }
- }
-
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
-func transpose5aBlocked(m [][]int32) {
- const B = 5
- N := len(m)
- i := 0
- for ; i <= len(m)-B; i += B {
- r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
- if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
- panic("Early bounds check failure")
- }
- // transpose around diagonal
- d01, d10 := r0[i+1], r1[i]
- d02, d20 := r0[i+2], r2[i]
- d03, d30 := r0[i+3], r3[i]
- d04, d40 := r0[i+4], r4[i]
-
- d12, d21 := r1[i+2], r2[i+1]
- d13, d31 := r1[i+3], r3[i+1]
- d14, d41 := r1[i+4], r4[i+1]
-
- d23, d32 := r2[i+3], r3[i+2]
- d24, d42 := r2[i+4], r4[i+2]
-
- d34, d43 := r3[i+4], r4[i+3]
-
- r0[i+1], r1[i] = d10, d01
- r0[i+2], r2[i] = d20, d02
- r0[i+3], r3[i] = d30, d03
- r0[i+4], r4[i] = d40, d04
-
- r1[i+2], r2[i+1] = d21, d12
- r1[i+3], r3[i+1] = d31, d13
- r1[i+4], r4[i+1] = d41, d14
-
- r2[i+3], r3[i+2] = d32, d23
- r2[i+4], r4[i+2] = d42, d24
-
- r3[i+4], r4[i+3] = d43, d34
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a0, a1, a2, a3, a4 := m[j], m[j+1], m[j+2], m[j+3], m[j+4]
-
- b00, b01, b02, b03, b04 := a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4]
- b10, b11, b12, b13, b14 := a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4]
- b20, b21, b22, b23, b24 := a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4]
- b30, b31, b32, b33, b34 := a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4]
- b40, b41, b42, b43, b44 := a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4]
-
- a0[i], a0[i+1], a0[i+2], a0[i+3], a0[i+4] = r0[j], r1[j], r2[j], r3[j], r4[j]
- a1[i], a1[i+1], a1[i+2], a1[i+3], a1[i+4] = r0[j+1], r1[j+1], r2[j+1], r3[j+1], r4[j+1]
- a2[i], a2[i+1], a2[i+2], a2[i+3], a2[i+4] = r0[j+2], r1[j+2], r2[j+2], r3[j+2], r4[j+2]
- a3[i], a3[i+1], a3[i+2], a3[i+3], a3[i+4] = r0[j+3], r1[j+3], r2[j+3], r3[j+3], r4[j+3]
- a4[i], a4[i+1], a4[i+2], a4[i+3], a4[i+4] = r0[j+4], r1[j+4], r2[j+4], r3[j+4], r4[j+4]
-
- r0[j], r0[j+1], r0[j+2], r0[j+3], r0[j+4] = b00, b10, b20, b30, b40
- r1[j], r1[j+1], r1[j+2], r1[j+3], r1[j+4] = b01, b11, b21, b31, b41
- r2[j], r2[j+1], r2[j+2], r2[j+3], r2[j+4] = b02, b12, b22, b32, b42
- r3[j], r3[j+1], r3[j+2], r3[j+3], r3[j+4] = b03, b13, b23, b33, b43
- r4[j], r4[j+1], r4[j+2], r4[j+3], r4[j+4] = b04, b14, b24, b34, b44
- }
- }
-
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
-// transpose5bBlocked is just like transpose5aBlocked
-// but rewritten to reduce register pressure in the
-// inner loop.
-func transpose5bBlocked(m [][]int32) {
- const B = 5
- N := len(m)
- i := 0
- for ; i <= len(m)-B; i += B {
- r0, r1, r2, r3, r4 := m[i], m[i+1], m[i+2], m[i+3], m[i+4]
- if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N {
- panic("Early bounds check failure")
- }
- // transpose around diagonal
- d01, d10 := r0[i+1], r1[i]
- d02, d20 := r0[i+2], r2[i]
- d03, d30 := r0[i+3], r3[i]
- d04, d40 := r0[i+4], r4[i]
- r0[i+1], r1[i] = d10, d01
- r0[i+2], r2[i] = d20, d02
- r0[i+3], r3[i] = d30, d03
- r0[i+4], r4[i] = d40, d04
-
- d12, d21 := r1[i+2], r2[i+1]
- d13, d31 := r1[i+3], r3[i+1]
- d14, d41 := r1[i+4], r4[i+1]
- r1[i+2], r2[i+1] = d21, d12
- r1[i+3], r3[i+1] = d31, d13
- r1[i+4], r4[i+1] = d41, d14
-
- d23, d32 := r2[i+3], r3[i+2]
- d24, d42 := r2[i+4], r4[i+2]
- r2[i+3], r3[i+2] = d32, d23
- r2[i+4], r4[i+2] = d42, d24
-
- d34, d43 := r3[i+4], r4[i+3]
- r3[i+4], r4[i+3] = d43, d34
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a4, a0, a1, a2, a3 := m[j+4], m[j], m[j+1], m[j+2], m[j+3]
-
- // Process column i+4
- temp0 := a0[i+4]
- temp1 := a1[i+4]
- temp2 := a2[i+4]
- temp3 := a3[i+4]
- temp4 := a4[i+4]
-
- a4[i+4] = r4[j+4]
- a0[i+4] = r4[j]
- a1[i+4] = r4[j+1]
- a2[i+4] = r4[j+2]
- a3[i+4] = r4[j+3]
-
-			r4[j] = temp0
-			r4[j+1] = temp1
-			r4[j+2] = temp2
-			r4[j+3] = temp3
-			r4[j+4] = temp4
-
- // Process column i
- temp0 = a0[i]
- temp1 = a1[i]
- temp2 = a2[i]
- temp3 = a3[i]
- temp4 = a4[i]
-
- a4[i] = r0[j+4]
- a0[i] = r0[j]
- a1[i] = r0[j+1]
- a2[i] = r0[j+2]
- a3[i] = r0[j+3]
-
-			r0[j] = temp0
-			r0[j+1] = temp1
-			r0[j+2] = temp2
-			r0[j+3] = temp3
-			r0[j+4] = temp4
-
- // Process column i+1
- temp0 = a0[i+1]
- temp1 = a1[i+1]
- temp2 = a2[i+1]
- temp3 = a3[i+1]
- temp4 = a4[i+1]
-
- a4[i+1] = r1[j+4]
- a0[i+1] = r1[j]
- a1[i+1] = r1[j+1]
- a2[i+1] = r1[j+2]
- a3[i+1] = r1[j+3]
-
-			r1[j] = temp0
-			r1[j+1] = temp1
-			r1[j+2] = temp2
-			r1[j+3] = temp3
-			r1[j+4] = temp4
-
- // Process column i+2
- temp0 = a0[i+2]
- temp1 = a1[i+2]
- temp2 = a2[i+2]
- temp3 = a3[i+2]
- temp4 = a4[i+2]
-
- a4[i+2] = r2[j+4]
- a0[i+2] = r2[j]
- a1[i+2] = r2[j+1]
- a2[i+2] = r2[j+2]
- a3[i+2] = r2[j+3]
-
-			r2[j] = temp0
-			r2[j+1] = temp1
-			r2[j+2] = temp2
-			r2[j+3] = temp3
-			r2[j+4] = temp4
-
- // Process column i+3
- temp0 = a0[i+3]
- temp1 = a1[i+3]
- temp2 = a2[i+3]
- temp3 = a3[i+3]
- temp4 = a4[i+3]
-
- a4[i+3] = r3[j+4]
- a0[i+3] = r3[j]
- a1[i+3] = r3[j+1]
- a2[i+3] = r3[j+2]
- a3[i+3] = r3[j+3]
-
-			r3[j] = temp0
-			r3[j+1] = temp1
-			r3[j+2] = temp2
-			r3[j+3] = temp3
-			r3[j+4] = temp4
- }
- }
-
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
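-// transposeTiled4 transposes m in place using SIMD 4x4 tiles: diagonal tiles go through Transpose4
-// directly, off-diagonal tile pairs are transposed and exchanged, and fringe rows that do not fill
-// a tile use the scalar swap.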
-func transposeTiled4(m [][]int32) {
- const B = 4
- N := len(m)
- i := 0
- for ; i < len(m)-(B-1); i += B {
- r0, r1, r2, r3 := m[i], m[i+1], m[i+2], m[i+3]
- if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N {
- panic("Early bounds check failure")
- }
- // transpose diagonal
- d0, d1, d2, d3 :=
- simd.LoadInt32x4Slice(r0[i:]),
- simd.LoadInt32x4Slice(r1[i:]),
- simd.LoadInt32x4Slice(r2[i:]),
- simd.LoadInt32x4Slice(r3[i:])
-
- d0, d1, d2, d3 = Transpose4(d0, d1, d2, d3)
-
- d0.StoreSlice(r0[i:])
- d1.StoreSlice(r1[i:])
- d2.StoreSlice(r2[i:])
- d3.StoreSlice(r3[i:])
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a0, a1, a2, a3 := m[j], m[j+1], m[j+2], m[j+3]
- u0, u1, u2, u3 :=
- simd.LoadInt32x4Slice(a0[i:]),
- simd.LoadInt32x4Slice(a1[i:]),
- simd.LoadInt32x4Slice(a2[i:]),
- simd.LoadInt32x4Slice(a3[i:])
-
- u0, u1, u2, u3 = Transpose4(u0, u1, u2, u3)
-
- l0 := simd.LoadInt32x4Slice(r0[j:])
- u0.StoreSlice(r0[j:])
- l1 := simd.LoadInt32x4Slice(r1[j:])
- u1.StoreSlice(r1[j:])
- l2 := simd.LoadInt32x4Slice(r2[j:])
- u2.StoreSlice(r2[j:])
- l3 := simd.LoadInt32x4Slice(r3[j:])
- u3.StoreSlice(r3[j:])
-
- u0, u1, u2, u3 = Transpose4(l0, l1, l2, l3)
-
- u0.StoreSlice(a0[i:])
- u1.StoreSlice(a1[i:])
- u2.StoreSlice(a2[i:])
- u3.StoreSlice(a3[i:])
- }
- }
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
-
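-// transposeTiled8 is the 8x8-tile version of transposeTiled4, built on Transpose8 and Int32x8
-// loads and stores.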
-func transposeTiled8(m [][]int32) {
- const B = 8
- N := len(m)
- i := 0
- for ; i < len(m)-(B-1); i += B {
- r0, r1, r2, r3, r4, r5, r6, r7 := m[i], m[i+1], m[i+2], m[i+3], m[i+4], m[i+5], m[i+6], m[i+7]
- if len(r0) < N || len(r1) < N || len(r2) < N || len(r3) < N || len(r4) < N || len(r5) < N || len(r6) < N || len(r7) < N {
- panic("Early bounds check failure")
- }
- // transpose diagonal
- d0, d1, d2, d3, d4, d5, d6, d7 :=
- simd.LoadInt32x8Slice(r0[i:]),
- simd.LoadInt32x8Slice(r1[i:]),
- simd.LoadInt32x8Slice(r2[i:]),
- simd.LoadInt32x8Slice(r3[i:]),
- simd.LoadInt32x8Slice(r4[i:]),
- simd.LoadInt32x8Slice(r5[i:]),
- simd.LoadInt32x8Slice(r6[i:]),
- simd.LoadInt32x8Slice(r7[i:])
-
- d0, d1, d2, d3, d4, d5, d6, d7 = Transpose8(d0, d1, d2, d3, d4, d5, d6, d7)
-
- d0.StoreSlice(r0[i:])
- d1.StoreSlice(r1[i:])
- d2.StoreSlice(r2[i:])
- d3.StoreSlice(r3[i:])
- d4.StoreSlice(r4[i:])
- d5.StoreSlice(r5[i:])
- d6.StoreSlice(r6[i:])
- d7.StoreSlice(r7[i:])
-
- // transpose across diagonal
- j := 0
- for ; j < i; j += B {
- a7, a0, a1, a2, a3, a4, a5, a6 := m[j+7], m[j], m[j+1], m[j+2], m[j+3], m[j+4], m[j+5], m[j+6]
- u0, u1, u2, u3, u4, u5, u6, u7 :=
- simd.LoadInt32x8Slice(a0[i:]),
- simd.LoadInt32x8Slice(a1[i:]),
- simd.LoadInt32x8Slice(a2[i:]),
- simd.LoadInt32x8Slice(a3[i:]),
- simd.LoadInt32x8Slice(a4[i:]),
- simd.LoadInt32x8Slice(a5[i:]),
- simd.LoadInt32x8Slice(a6[i:]),
- simd.LoadInt32x8Slice(a7[i:])
-
- u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(u0, u1, u2, u3, u4, u5, u6, u7)
-
- l0 := simd.LoadInt32x8Slice(r0[j:])
- u0.StoreSlice(r0[j:])
- l1 := simd.LoadInt32x8Slice(r1[j:])
- u1.StoreSlice(r1[j:])
- l2 := simd.LoadInt32x8Slice(r2[j:])
- u2.StoreSlice(r2[j:])
- l3 := simd.LoadInt32x8Slice(r3[j:])
- u3.StoreSlice(r3[j:])
- l4 := simd.LoadInt32x8Slice(r4[j:])
- u4.StoreSlice(r4[j:])
- l5 := simd.LoadInt32x8Slice(r5[j:])
- u5.StoreSlice(r5[j:])
- l6 := simd.LoadInt32x8Slice(r6[j:])
- u6.StoreSlice(r6[j:])
- l7 := simd.LoadInt32x8Slice(r7[j:])
- u7.StoreSlice(r7[j:])
-
- u0, u1, u2, u3, u4, u5, u6, u7 = Transpose8(l0, l1, l2, l3, l4, l5, l6, l7)
-
- u0.StoreSlice(a0[i:])
- u1.StoreSlice(a1[i:])
- u2.StoreSlice(a2[i:])
- u3.StoreSlice(a3[i:])
- u4.StoreSlice(a4[i:])
- u5.StoreSlice(a5[i:])
- u6.StoreSlice(a6[i:])
- u7.StoreSlice(a7[i:])
- }
- }
- // Do the fringe
- for ; i < len(m); i++ {
- j := 0
- r := m[i]
- for ; j < i; j++ {
- t := r[j]
- r[j] = m[j][i]
- m[j][i] = t
- }
- }
-}
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-// This file contains functions testing unary simd methods.
-// Each function in this file is specialized for a
-// particular simd type <BaseType><Width>x<Count>.
-
-package simd_test
-
-import (
- "simd"
- "testing"
-)
-
-// testInt8x16Unary tests the simd unary method f against the expected behavior generated by want
-func testInt8x16Unary(t *testing.T, f func(_ simd.Int8x16) simd.Int8x16, want func(_ []int8) []int8) {
- n := 16
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- g := make([]int8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x8Unary tests the simd unary method f against the expected behavior generated by want
-func testInt16x8Unary(t *testing.T, f func(_ simd.Int16x8) simd.Int16x8, want func(_ []int16) []int16) {
- n := 8
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- g := make([]int16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x4Unary tests the simd unary method f against the expected behavior generated by want
-func testInt32x4Unary(t *testing.T, f func(_ simd.Int32x4) simd.Int32x4, want func(_ []int32) []int32) {
- n := 4
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x2Unary tests the simd unary method f against the expected behavior generated by want
-func testInt64x2Unary(t *testing.T, f func(_ simd.Int64x2) simd.Int64x2, want func(_ []int64) []int64) {
- n := 2
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x2Slice(x)
- g := make([]int64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x16Unary tests the simd unary method f against the expected behavior generated by want
-func testUint8x16Unary(t *testing.T, f func(_ simd.Uint8x16) simd.Uint8x16, want func(_ []uint8) []uint8) {
- n := 16
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- g := make([]uint8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x8Unary tests the simd unary method f against the expected behavior generated by want
-func testUint16x8Unary(t *testing.T, f func(_ simd.Uint16x8) simd.Uint16x8, want func(_ []uint16) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x4Unary tests the simd unary method f against the expected behavior generated by want
-func testUint32x4Unary(t *testing.T, f func(_ simd.Uint32x4) simd.Uint32x4, want func(_ []uint32) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x2Unary tests the simd unary method f against the expected behavior generated by want
-func testUint64x2Unary(t *testing.T, f func(_ simd.Uint64x2) simd.Uint64x2, want func(_ []uint64) []uint64) {
- n := 2
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x2Slice(x)
- g := make([]uint64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x4Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat32x4Unary(t *testing.T, f func(_ simd.Float32x4) simd.Float32x4, want func(_ []float32) []float32) {
- n := 4
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x2Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat64x2Unary(t *testing.T, f func(_ simd.Float64x2) simd.Float64x2, want func(_ []float64) []float64) {
- n := 2
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x32Unary tests the simd unary method f against the expected behavior generated by want
-func testInt8x32Unary(t *testing.T, f func(_ simd.Int8x32) simd.Int8x32, want func(_ []int8) []int8) {
- n := 32
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- g := make([]int8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x16Unary tests the simd unary method f against the expected behavior generated by want
-func testInt16x16Unary(t *testing.T, f func(_ simd.Int16x16) simd.Int16x16, want func(_ []int16) []int16) {
- n := 16
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- g := make([]int16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x8Unary tests the simd unary method f against the expected behavior generated by want
-func testInt32x8Unary(t *testing.T, f func(_ simd.Int32x8) simd.Int32x8, want func(_ []int32) []int32) {
- n := 8
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x4Unary tests the simd unary method f against the expected behavior generated by want
-func testInt64x4Unary(t *testing.T, f func(_ simd.Int64x4) simd.Int64x4, want func(_ []int64) []int64) {
- n := 4
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- g := make([]int64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x32Unary tests the simd unary method f against the expected behavior generated by want
-func testUint8x32Unary(t *testing.T, f func(_ simd.Uint8x32) simd.Uint8x32, want func(_ []uint8) []uint8) {
- n := 32
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- g := make([]uint8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x16Unary tests the simd unary method f against the expected behavior generated by want
-func testUint16x16Unary(t *testing.T, f func(_ simd.Uint16x16) simd.Uint16x16, want func(_ []uint16) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x8Unary tests the simd unary method f against the expected behavior generated by want
-func testUint32x8Unary(t *testing.T, f func(_ simd.Uint32x8) simd.Uint32x8, want func(_ []uint32) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x4Unary tests the simd unary method f against the expected behavior generated by want
-func testUint64x4Unary(t *testing.T, f func(_ simd.Uint64x4) simd.Uint64x4, want func(_ []uint64) []uint64) {
- n := 4
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- g := make([]uint64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x8Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat32x8Unary(t *testing.T, f func(_ simd.Float32x8) simd.Float32x8, want func(_ []float32) []float32) {
- n := 8
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x4Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat64x4Unary(t *testing.T, f func(_ simd.Float64x4) simd.Float64x4, want func(_ []float64) []float64) {
- n := 4
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x64Unary tests the simd unary method f against the expected behavior generated by want
-func testInt8x64Unary(t *testing.T, f func(_ simd.Int8x64) simd.Int8x64, want func(_ []int8) []int8) {
- n := 64
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x64Slice(x)
- g := make([]int8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x32Unary tests the simd unary method f against the expected behavior generated by want
-func testInt16x32Unary(t *testing.T, f func(_ simd.Int16x32) simd.Int16x32, want func(_ []int16) []int16) {
- n := 32
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- g := make([]int16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x16Unary tests the simd unary method f against the expected behavior generated by want
-func testInt32x16Unary(t *testing.T, f func(_ simd.Int32x16) simd.Int32x16, want func(_ []int32) []int32) {
- n := 16
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x8Unary tests the simd unary method f against the expected behavior generated by want
-func testInt64x8Unary(t *testing.T, f func(_ simd.Int64x8) simd.Int64x8, want func(_ []int64) []int64) {
- n := 8
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- g := make([]int64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x64Unary tests the simd unary method f against the expected behavior generated by want
-func testUint8x64Unary(t *testing.T, f func(_ simd.Uint8x64) simd.Uint8x64, want func(_ []uint8) []uint8) {
- n := 64
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x64Slice(x)
- g := make([]uint8, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x32Unary tests the simd unary method f against the expected behavior generated by want
-func testUint16x32Unary(t *testing.T, f func(_ simd.Uint16x32) simd.Uint16x32, want func(_ []uint16) []uint16) {
- n := 32
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x16Unary tests the simd unary method f against the expected behavior generated by want
-func testUint32x16Unary(t *testing.T, f func(_ simd.Uint32x16) simd.Uint32x16, want func(_ []uint32) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x8Unary tests the simd unary method f against the expected behavior generated by want
-func testUint64x8Unary(t *testing.T, f func(_ simd.Uint64x8) simd.Uint64x8, want func(_ []uint64) []uint64) {
- n := 8
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- g := make([]uint64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x16Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat32x16Unary(t *testing.T, f func(_ simd.Float32x16) simd.Float32x16, want func(_ []float32) []float32) {
- n := 16
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x8Unary tests the simd unary method f against the expected behavior generated by want
-func testFloat64x8Unary(t *testing.T, f func(_ simd.Float64x8) simd.Float64x8, want func(_ []float64) []float64) {
- n := 8
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt8x16ConvertToInt32(t *testing.T, f func(x simd.Int8x16) simd.Int32x16, want func(x []int8) []int32) {
- n := 16
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x8ConvertToInt32(t *testing.T, f func(x simd.Int16x8) simd.Int32x8, want func(x []int16) []int32) {
- n := 8
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x4ConvertToInt32(t *testing.T, f func(x simd.Int32x4) simd.Int32x4, want func(x []int32) []int32) {
- n := 4
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint8x16ConvertToInt32(t *testing.T, f func(x simd.Uint8x16) simd.Int32x16, want func(x []uint8) []int32) {
- n := 16
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x8ConvertToInt32(t *testing.T, f func(x simd.Uint16x8) simd.Int32x8, want func(x []uint16) []int32) {
- n := 8
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x4ConvertToInt32(t *testing.T, f func(x simd.Uint32x4) simd.Int32x4, want func(x []uint32) []int32) {
- n := 4
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x4ConvertToInt32(t *testing.T, f func(x simd.Float32x4) simd.Int32x4, want func(x []float32) []int32) {
- n := 4
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x16ConvertToInt32(t *testing.T, f func(x simd.Int16x16) simd.Int32x16, want func(x []int16) []int32) {
- n := 16
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x8ConvertToInt32(t *testing.T, f func(x simd.Int32x8) simd.Int32x8, want func(x []int32) []int32) {
- n := 8
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt64x4ConvertToInt32(t *testing.T, f func(x simd.Int64x4) simd.Int32x4, want func(x []int64) []int32) {
- n := 4
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x16ConvertToInt32(t *testing.T, f func(x simd.Uint16x16) simd.Int32x16, want func(x []uint16) []int32) {
- n := 16
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x8ConvertToInt32(t *testing.T, f func(x simd.Uint32x8) simd.Int32x8, want func(x []uint32) []int32) {
- n := 8
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint64x4ConvertToInt32(t *testing.T, f func(x simd.Uint64x4) simd.Int32x4, want func(x []uint64) []int32) {
- n := 4
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x8ConvertToInt32(t *testing.T, f func(x simd.Float32x8) simd.Int32x8, want func(x []float32) []int32) {
- n := 8
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x4ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat64x4ConvertToInt32(t *testing.T, f func(x simd.Float64x4) simd.Int32x4, want func(x []float64) []int32) {
- n := 4
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x16ConvertToInt32(t *testing.T, f func(x simd.Int32x16) simd.Int32x16, want func(x []int32) []int32) {
- n := 16
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt64x8ConvertToInt32(t *testing.T, f func(x simd.Int64x8) simd.Int32x8, want func(x []int64) []int32) {
- n := 8
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x16ConvertToInt32(t *testing.T, f func(x simd.Uint32x16) simd.Int32x16, want func(x []uint32) []int32) {
- n := 16
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint64x8ConvertToInt32(t *testing.T, f func(x simd.Uint64x8) simd.Int32x8, want func(x []uint64) []int32) {
- n := 8
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x16ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x16ConvertToInt32(t *testing.T, f func(x simd.Float32x16) simd.Int32x16, want func(x []float32) []int32) {
- n := 16
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x8ConvertToInt32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat64x8ConvertToInt32(t *testing.T, f func(x simd.Float64x8) simd.Int32x8, want func(x []float64) []int32) {
- n := 8
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- g := make([]int32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt8x16ConvertToUint32(t *testing.T, f func(x simd.Int8x16) simd.Uint32x16, want func(x []int8) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x8ConvertToUint32(t *testing.T, f func(x simd.Int16x8) simd.Uint32x8, want func(x []int16) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x4ConvertToUint32(t *testing.T, f func(x simd.Int32x4) simd.Uint32x4, want func(x []int32) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint8x16ConvertToUint32(t *testing.T, f func(x simd.Uint8x16) simd.Uint32x16, want func(x []uint8) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x8ConvertToUint32(t *testing.T, f func(x simd.Uint16x8) simd.Uint32x8, want func(x []uint16) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x4ConvertToUint32(t *testing.T, f func(x simd.Uint32x4) simd.Uint32x4, want func(x []uint32) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x4ConvertToUint32(t *testing.T, f func(x simd.Float32x4) simd.Uint32x4, want func(x []float32) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x16ConvertToUint32(t *testing.T, f func(x simd.Int16x16) simd.Uint32x16, want func(x []int16) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x8ConvertToUint32(t *testing.T, f func(x simd.Int32x8) simd.Uint32x8, want func(x []int32) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt64x4ConvertToUint32(t *testing.T, f func(x simd.Int64x4) simd.Uint32x4, want func(x []int64) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x16ConvertToUint32(t *testing.T, f func(x simd.Uint16x16) simd.Uint32x16, want func(x []uint16) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x8ConvertToUint32(t *testing.T, f func(x simd.Uint32x8) simd.Uint32x8, want func(x []uint32) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint64x4ConvertToUint32(t *testing.T, f func(x simd.Uint64x4) simd.Uint32x4, want func(x []uint64) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x8ConvertToUint32(t *testing.T, f func(x simd.Float32x8) simd.Uint32x8, want func(x []float32) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x4ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat64x4ConvertToUint32(t *testing.T, f func(x simd.Float64x4) simd.Uint32x4, want func(x []float64) []uint32) {
- n := 4
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x16ConvertToUint32(t *testing.T, f func(x simd.Int32x16) simd.Uint32x16, want func(x []int32) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt64x8ConvertToUint32(t *testing.T, f func(x simd.Int64x8) simd.Uint32x8, want func(x []int64) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x16ConvertToUint32(t *testing.T, f func(x simd.Uint32x16) simd.Uint32x16, want func(x []uint32) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint64x8ConvertToUint32(t *testing.T, f func(x simd.Uint64x8) simd.Uint32x8, want func(x []uint64) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x16ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x16ConvertToUint32(t *testing.T, f func(x simd.Float32x16) simd.Uint32x16, want func(x []float32) []uint32) {
- n := 16
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x8ConvertToUint32 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat64x8ConvertToUint32(t *testing.T, f func(x simd.Float64x8) simd.Uint32x8, want func(x []float64) []uint32) {
- n := 8
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- g := make([]uint32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt8x16ConvertToUint16(t *testing.T, f func(x simd.Int8x16) simd.Uint16x16, want func(x []int8) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x8ConvertToUint16(t *testing.T, f func(x simd.Int16x8) simd.Uint16x8, want func(x []int16) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint8x16ConvertToUint16(t *testing.T, f func(x simd.Uint8x16) simd.Uint16x16, want func(x []uint8) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x8ConvertToUint16(t *testing.T, f func(x simd.Uint16x8) simd.Uint16x8, want func(x []uint16) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt8x32ConvertToUint16(t *testing.T, f func(x simd.Int8x32) simd.Uint16x32, want func(x []int8) []uint16) {
- n := 32
- t.Helper()
- forSlice(t, int8s, n, func(x []int8) bool {
- t.Helper()
- a := simd.LoadInt8x32Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x16ConvertToUint16(t *testing.T, f func(x simd.Int16x16) simd.Uint16x16, want func(x []int16) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x8ConvertToUint16(t *testing.T, f func(x simd.Int32x8) simd.Uint16x8, want func(x []int32) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint8x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint8x32ConvertToUint16(t *testing.T, f func(x simd.Uint8x32) simd.Uint16x32, want func(x []uint8) []uint16) {
- n := 32
- t.Helper()
- forSlice(t, uint8s, n, func(x []uint8) bool {
- t.Helper()
- a := simd.LoadUint8x32Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x16ConvertToUint16(t *testing.T, f func(x simd.Uint16x16) simd.Uint16x16, want func(x []uint16) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x8ConvertToUint16(t *testing.T, f func(x simd.Uint32x8) simd.Uint16x8, want func(x []uint32) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x8ConvertToUint16(t *testing.T, f func(x simd.Float32x8) simd.Uint16x8, want func(x []float32) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt16x32ConvertToUint16(t *testing.T, f func(x simd.Int16x32) simd.Uint16x32, want func(x []int16) []uint16) {
- n := 32
- t.Helper()
- forSlice(t, int16s, n, func(x []int16) bool {
- t.Helper()
- a := simd.LoadInt16x32Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt32x16ConvertToUint16(t *testing.T, f func(x simd.Int32x16) simd.Uint16x16, want func(x []int32) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, int32s, n, func(x []int32) bool {
- t.Helper()
- a := simd.LoadInt32x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testInt64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testInt64x8ConvertToUint16(t *testing.T, f func(x simd.Int64x8) simd.Uint16x8, want func(x []int64) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, int64s, n, func(x []int64) bool {
- t.Helper()
- a := simd.LoadInt64x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint16x32ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint16x32ConvertToUint16(t *testing.T, f func(x simd.Uint16x32) simd.Uint16x32, want func(x []uint16) []uint16) {
- n := 32
- t.Helper()
- forSlice(t, uint16s, n, func(x []uint16) bool {
- t.Helper()
- a := simd.LoadUint16x32Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint32x16ConvertToUint16(t *testing.T, f func(x simd.Uint32x16) simd.Uint16x16, want func(x []uint32) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, uint32s, n, func(x []uint32) bool {
- t.Helper()
- a := simd.LoadUint32x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testUint64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testUint64x8ConvertToUint16(t *testing.T, f func(x simd.Uint64x8) simd.Uint16x8, want func(x []uint64) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, uint64s, n, func(x []uint64) bool {
- t.Helper()
- a := simd.LoadUint64x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x16ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat32x16ConvertToUint16(t *testing.T, f func(x simd.Float32x16) simd.Uint16x16, want func(x []float32) []uint16) {
- n := 16
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x8ConvertToUint16 tests the simd conversion method f against the expected behavior generated by want
-// This is for count-preserving conversions, so if there is a change in size, then there is a change in vector width.
-func testFloat64x8ConvertToUint16(t *testing.T, f func(x simd.Float64x8) simd.Uint16x8, want func(x []float64) []uint16) {
- n := 8
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- g := make([]uint16, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, 0.0, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat32x4UnaryFlaky(t *testing.T, f func(x simd.Float32x4) simd.Float32x4, want func(x []float32) []float32, flakiness float64) {
- n := 4
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x4Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x2UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat64x2UnaryFlaky(t *testing.T, f func(x simd.Float64x2) simd.Float64x2, want func(x []float64) []float64, flakiness float64) {
- n := 2
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x2Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat32x8UnaryFlaky(t *testing.T, f func(x simd.Float32x8) simd.Float32x8, want func(x []float32) []float32, flakiness float64) {
- n := 8
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x8Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x4UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat64x4UnaryFlaky(t *testing.T, f func(x simd.Float64x4) simd.Float64x4, want func(x []float64) []float64, flakiness float64) {
- n := 4
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x4Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat32x16UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat32x16UnaryFlaky(t *testing.T, f func(x simd.Float32x16) simd.Float32x16, want func(x []float32) []float32, flakiness float64) {
- n := 16
- t.Helper()
- forSlice(t, float32s, n, func(x []float32) bool {
- t.Helper()
- a := simd.LoadFloat32x16Slice(x)
- g := make([]float32, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
-
-// testFloat64x8UnaryFlaky tests the simd unary method f against the expected behavior generated by want,
-// but allowing a flakiness tolerance, because SIMD floating-point results may not exactly match the reference produced by want.
-func testFloat64x8UnaryFlaky(t *testing.T, f func(x simd.Float64x8) simd.Float64x8, want func(x []float64) []float64, flakiness float64) {
- n := 8
- t.Helper()
- forSlice(t, float64s, n, func(x []float64) bool {
- t.Helper()
- a := simd.LoadFloat64x8Slice(x)
- g := make([]float64, n)
- f(a).StoreSlice(g)
- w := want(x)
- return checkSlicesLogInput(t, g, w, flakiness, func() { t.Helper(); t.Logf("x=%v", x) })
- })
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "math"
- "simd"
- "testing"
-)
-
-func TestCeil(t *testing.T) {
- testFloat32x4Unary(t, simd.Float32x4.Ceil, ceilSlice[float32])
- testFloat32x8Unary(t, simd.Float32x8.Ceil, ceilSlice[float32])
- testFloat64x2Unary(t, simd.Float64x2.Ceil, ceilSlice[float64])
- testFloat64x4Unary(t, simd.Float64x4.Ceil, ceilSlice[float64])
- if simd.X86.AVX512() {
- // testFloat32x16Unary(t, simd.Float32x16.Ceil, ceilSlice[float32]) // missing
- // testFloat64x8Unary(t, simd.Float64x8.Ceil, ceilSlice[float64]) // missing
- }
-}
-
-func TestFloor(t *testing.T) {
- testFloat32x4Unary(t, simd.Float32x4.Floor, floorSlice[float32])
- testFloat32x8Unary(t, simd.Float32x8.Floor, floorSlice[float32])
- testFloat64x2Unary(t, simd.Float64x2.Floor, floorSlice[float64])
- testFloat64x4Unary(t, simd.Float64x4.Floor, floorSlice[float64])
- if simd.X86.AVX512() {
- // testFloat32x16Unary(t, simd.Float32x16.Floor, floorSlice[float32]) // missing
- // testFloat64x8Unary(t, simd.Float64x8.Floor, floorSlice[float64]) // missing
- }
-}
-
-func TestTrunc(t *testing.T) {
- testFloat32x4Unary(t, simd.Float32x4.Trunc, truncSlice[float32])
- testFloat32x8Unary(t, simd.Float32x8.Trunc, truncSlice[float32])
- testFloat64x2Unary(t, simd.Float64x2.Trunc, truncSlice[float64])
- testFloat64x4Unary(t, simd.Float64x4.Trunc, truncSlice[float64])
- if simd.X86.AVX512() {
- // testFloat32x16Unary(t, simd.Float32x16.Trunc, truncSlice[float32]) // missing
- // testFloat64x8Unary(t, simd.Float64x8.Trunc, truncSlice[float64]) // missing
- }
-}
-
-func TestRound(t *testing.T) {
- testFloat32x4Unary(t, simd.Float32x4.RoundToEven, roundSlice[float32])
- testFloat32x8Unary(t, simd.Float32x8.RoundToEven, roundSlice[float32])
- testFloat64x2Unary(t, simd.Float64x2.RoundToEven, roundSlice[float64])
- testFloat64x4Unary(t, simd.Float64x4.RoundToEven, roundSlice[float64])
- if simd.X86.AVX512() {
- // testFloat32x16Unary(t, simd.Float32x16.Round, roundSlice[float32]) // missing
- // testFloat64x8Unary(t, simd.Float64x8.Round, roundSlice[float64]) // missing
- }
-}
-
-func TestSqrt(t *testing.T) {
- testFloat32x4Unary(t, simd.Float32x4.Sqrt, sqrtSlice[float32])
- testFloat32x8Unary(t, simd.Float32x8.Sqrt, sqrtSlice[float32])
- testFloat64x2Unary(t, simd.Float64x2.Sqrt, sqrtSlice[float64])
- testFloat64x4Unary(t, simd.Float64x4.Sqrt, sqrtSlice[float64])
- if simd.X86.AVX512() {
- testFloat32x16Unary(t, simd.Float32x16.Sqrt, sqrtSlice[float32])
- testFloat64x8Unary(t, simd.Float64x8.Sqrt, sqrtSlice[float64])
- }
-}
-
-func TestNot(t *testing.T) {
- testInt8x16Unary(t, simd.Int8x16.Not, map1[int8](not))
- testInt8x32Unary(t, simd.Int8x32.Not, map1[int8](not))
- testInt16x8Unary(t, simd.Int16x8.Not, map1[int16](not))
- testInt16x16Unary(t, simd.Int16x16.Not, map1[int16](not))
- testInt32x4Unary(t, simd.Int32x4.Not, map1[int32](not))
- testInt32x8Unary(t, simd.Int32x8.Not, map1[int32](not))
-}
-
-func TestAbsolute(t *testing.T) {
- testInt8x16Unary(t, simd.Int8x16.Abs, map1[int8](abs))
- testInt8x32Unary(t, simd.Int8x32.Abs, map1[int8](abs))
- testInt16x8Unary(t, simd.Int16x8.Abs, map1[int16](abs))
- testInt16x16Unary(t, simd.Int16x16.Abs, map1[int16](abs))
- testInt32x4Unary(t, simd.Int32x4.Abs, map1[int32](abs))
- testInt32x8Unary(t, simd.Int32x8.Abs, map1[int32](abs))
- if simd.X86.AVX512() {
- testInt8x64Unary(t, simd.Int8x64.Abs, map1[int8](abs))
- testInt16x32Unary(t, simd.Int16x32.Abs, map1[int16](abs))
- testInt32x16Unary(t, simd.Int32x16.Abs, map1[int32](abs))
- testInt64x2Unary(t, simd.Int64x2.Abs, map1[int64](abs))
- testInt64x4Unary(t, simd.Int64x4.Abs, map1[int64](abs))
- testInt64x8Unary(t, simd.Int64x8.Abs, map1[int64](abs))
- }
-}
-
-func TestCeilScaledResidue(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Needs AVX512")
- }
- testFloat64x8UnaryFlaky(t,
- func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(0) },
- map1(ceilResidueForPrecision[float64](0)),
- 0.001)
- testFloat64x8UnaryFlaky(t,
- func(x simd.Float64x8) simd.Float64x8 { return x.CeilScaledResidue(1) },
- map1(ceilResidueForPrecision[float64](1)),
- 0.001)
- testFloat64x8Unary(t,
- func(x simd.Float64x8) simd.Float64x8 { return x.Sub(x.CeilScaled(0)) },
- map1[float64](func(x float64) float64 { return x - math.Ceil(x) }))
-}
-
-func TestToUint32(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Needs AVX512")
- }
- testFloat32x4ConvertToUint32(t, simd.Float32x4.ConvertToUint32, map1[float32](toUint32))
- testFloat32x8ConvertToUint32(t, simd.Float32x8.ConvertToUint32, map1[float32](toUint32))
- testFloat32x16ConvertToUint32(t, simd.Float32x16.ConvertToUint32, map1[float32](toUint32))
-}
-
-func TestToInt32(t *testing.T) {
- testFloat32x4ConvertToInt32(t, simd.Float32x4.ConvertToInt32, map1[float32](toInt32))
- testFloat32x8ConvertToInt32(t, simd.Float32x8.ConvertToInt32, map1[float32](toInt32))
-}
-
-func TestConverts(t *testing.T) {
- testUint8x16ConvertToUint16(t, simd.Uint8x16.ExtendToUint16, map1[uint8](toUint16))
- testUint16x8ConvertToUint32(t, simd.Uint16x8.ExtendToUint32, map1[uint16](toUint32))
-}
-
-func TestConvertsAVX512(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("Needs AVX512")
- }
- testUint8x32ConvertToUint16(t, simd.Uint8x32.ExtendToUint16, map1[uint8](toUint16))
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package test_helpers
-
-import (
- "math"
- "testing"
-)
-
-type signed interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64
-}
-
-type integer interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr
-}
-
-type float interface {
- ~float32 | ~float64
-}
-
-type number interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
-}
-
-// CheckSlices compares two slices for equality,
-// reporting a test error if they differ.
-func CheckSlices[T number](t *testing.T, got, want []T) bool {
- t.Helper()
- return CheckSlicesLogInput[T](t, got, want, 0.0, nil)
-}
-
-// CheckSlicesLogInput compares two slices for equality,
-// reporting a test error if there is a problem,
-// and also consumes the two slices so that a
-// test/benchmark won't be dead-code eliminated.
-// A nonzero flakiness tolerates small relative differences between floats,
-// and logInput, if non-nil, is called to log the inputs on failure.
-func CheckSlicesLogInput[T number](t *testing.T, got, want []T, flakiness float64, logInput func()) bool {
- t.Helper()
- var z T
- for i := range want {
- if got[i] != want[i] {
- var ia any = got[i]
- var ib any = want[i]
- switch x := ia.(type) {
- case float32:
- y := ib.(float32)
- if math.IsNaN(float64(x)) && math.IsNaN(float64(y)) {
- continue
- }
- if flakiness > 0 {
- if y == 0 {
- if math.Abs(float64(x)) < flakiness {
- continue
- }
- } else {
- if math.Abs(float64((x-y)/y)) < flakiness {
- continue
- }
- }
- }
- case float64:
- y := ib.(float64)
- if math.IsNaN(x) && math.IsNaN(y) {
- continue
- }
- if flakiness > 0 {
- if y == 0 {
- if math.Abs(x) < flakiness {
- continue
- }
- } else if math.Abs((x-y)/y) < flakiness {
- continue
- }
- }
-
- default:
- }
-
- t.Logf("For %T vector elements:", z)
- t.Logf("got =%v", got)
- t.Logf("want=%v", want)
- if logInput != nil {
- logInput()
- }
- t.Errorf("at index %d, got=%v, want=%v", i, got[i], want[i])
- return false
- } else if got[i] == 0 { // for floating point, 0.0 == -0.0 but a bitwise check can see the difference
- var ia any = got[i]
- var ib any = want[i]
- switch x := ia.(type) {
- case float32:
- y := ib.(float32)
- if math.Float32bits(x) != math.Float32bits(y) {
- t.Logf("For %T vector elements:", z)
- t.Logf("got =%v", got)
- t.Logf("want=%v", want)
- if logInput != nil {
- logInput()
- }
- t.Errorf("at index %d, different signs of zero", i)
- return false
- }
- case float64:
- y := ib.(float64)
- if math.Float64bits(x) != math.Float64bits(y) {
- t.Logf("For %T vector elements:", z)
- t.Logf("got =%v", got)
- t.Logf("want=%v", want)
- if logInput != nil {
- logInput()
- }
- t.Errorf("at index %d, different signs of zero", i)
- return false
- }
- default:
- }
-
- }
- }
- return true
-}
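For reference, the tolerance rule above treats NaN as matching NaN and, when flakiness is nonzero, accepts a float mismatch whose relative error is below the threshold (absolute error when the expected value is zero). A minimal single-value sketch of that rule, assuming the math import already present in this file; approxEqual is an illustrative helper, not part of the package, and it omits the extra 0.0 vs -0.0 bit-pattern check:

// approxEqual mirrors the flakiness rule of CheckSlicesLogInput for a
// single float64 pair: exact match, NaN==NaN, or within tolerance.
func approxEqual(got, want, flakiness float64) bool {
	if got == want || (math.IsNaN(got) && math.IsNaN(want)) {
		return true
	}
	if flakiness <= 0 {
		return false
	}
	if want == 0 {
		return math.Abs(got) < flakiness // absolute tolerance near zero
	}
	return math.Abs((got-want)/want) < flakiness // relative tolerance otherwise
}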
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int8x16) Masked(mask Mask8x16) Int8x16 {
- im := mask.AsInt8x16()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int8x16) Merge(y Int8x16, mask Mask8x16) Int8x16 {
- im := mask.AsInt8x16()
- return y.blend(x, im)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int16x8) Masked(mask Mask16x8) Int16x8 {
- im := mask.AsInt16x8()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int16x8) Merge(y Int16x8, mask Mask16x8) Int16x8 {
- im := mask.AsInt16x8().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsInt16x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int32x4) Masked(mask Mask32x4) Int32x4 {
- im := mask.AsInt32x4()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int32x4) Merge(y Int32x4, mask Mask32x4) Int32x4 {
- im := mask.AsInt32x4().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsInt32x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int64x2) Masked(mask Mask64x2) Int64x2 {
- im := mask.AsInt64x2()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int64x2) Merge(y Int64x2, mask Mask64x2) Int64x2 {
- im := mask.AsInt64x2().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsInt64x2()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint8x16) Masked(mask Mask8x16) Uint8x16 {
- im := mask.AsInt8x16()
- return x.AsInt8x16().And(im).AsUint8x16()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint8x16) Merge(y Uint8x16, mask Mask8x16) Uint8x16 {
- im := mask.AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsUint8x16()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint16x8) Masked(mask Mask16x8) Uint16x8 {
- im := mask.AsInt16x8()
- return x.AsInt16x8().And(im).AsUint16x8()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint16x8) Merge(y Uint16x8, mask Mask16x8) Uint16x8 {
- im := mask.AsInt16x8().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsUint16x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint32x4) Masked(mask Mask32x4) Uint32x4 {
- im := mask.AsInt32x4()
- return x.AsInt32x4().And(im).AsUint32x4()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint32x4) Merge(y Uint32x4, mask Mask32x4) Uint32x4 {
- im := mask.AsInt32x4().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsUint32x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint64x2) Masked(mask Mask64x2) Uint64x2 {
- im := mask.AsInt64x2()
- return x.AsInt64x2().And(im).AsUint64x2()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint64x2) Merge(y Uint64x2, mask Mask64x2) Uint64x2 {
- im := mask.AsInt64x2().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsUint64x2()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float32x4) Masked(mask Mask32x4) Float32x4 {
- im := mask.AsInt32x4()
- return x.AsInt32x4().And(im).AsFloat32x4()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float32x4) Merge(y Float32x4, mask Mask32x4) Float32x4 {
- im := mask.AsInt32x4().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsFloat32x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float64x2) Masked(mask Mask64x2) Float64x2 {
- im := mask.AsInt64x2()
- return x.AsInt64x2().And(im).AsFloat64x2()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float64x2) Merge(y Float64x2, mask Mask64x2) Float64x2 {
- im := mask.AsInt64x2().AsInt8x16()
- ix := x.AsInt8x16()
- iy := y.AsInt8x16()
- return iy.blend(ix, im).AsFloat64x2()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int8x32) Masked(mask Mask8x32) Int8x32 {
- im := mask.AsInt8x32()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int8x32) Merge(y Int8x32, mask Mask8x32) Int8x32 {
- im := mask.AsInt8x32()
- return y.blend(x, im)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int16x16) Masked(mask Mask16x16) Int16x16 {
- im := mask.AsInt16x16()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int16x16) Merge(y Int16x16, mask Mask16x16) Int16x16 {
- im := mask.AsInt16x16().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsInt16x16()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int32x8) Masked(mask Mask32x8) Int32x8 {
- im := mask.AsInt32x8()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int32x8) Merge(y Int32x8, mask Mask32x8) Int32x8 {
- im := mask.AsInt32x8().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsInt32x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int64x4) Masked(mask Mask64x4) Int64x4 {
- im := mask.AsInt64x4()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int64x4) Merge(y Int64x4, mask Mask64x4) Int64x4 {
- im := mask.AsInt64x4().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsInt64x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint8x32) Masked(mask Mask8x32) Uint8x32 {
- im := mask.AsInt8x32()
- return x.AsInt8x32().And(im).AsUint8x32()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint8x32) Merge(y Uint8x32, mask Mask8x32) Uint8x32 {
- im := mask.AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsUint8x32()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint16x16) Masked(mask Mask16x16) Uint16x16 {
- im := mask.AsInt16x16()
- return x.AsInt16x16().And(im).AsUint16x16()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint16x16) Merge(y Uint16x16, mask Mask16x16) Uint16x16 {
- im := mask.AsInt16x16().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsUint16x16()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint32x8) Masked(mask Mask32x8) Uint32x8 {
- im := mask.AsInt32x8()
- return x.AsInt32x8().And(im).AsUint32x8()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint32x8) Merge(y Uint32x8, mask Mask32x8) Uint32x8 {
- im := mask.AsInt32x8().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsUint32x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint64x4) Masked(mask Mask64x4) Uint64x4 {
- im := mask.AsInt64x4()
- return x.AsInt64x4().And(im).AsUint64x4()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint64x4) Merge(y Uint64x4, mask Mask64x4) Uint64x4 {
- im := mask.AsInt64x4().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsUint64x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float32x8) Masked(mask Mask32x8) Float32x8 {
- im := mask.AsInt32x8()
- return x.AsInt32x8().And(im).AsFloat32x8()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float32x8) Merge(y Float32x8, mask Mask32x8) Float32x8 {
- im := mask.AsInt32x8().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsFloat32x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float64x4) Masked(mask Mask64x4) Float64x4 {
- im := mask.AsInt64x4()
- return x.AsInt64x4().And(im).AsFloat64x4()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float64x4) Merge(y Float64x4, mask Mask64x4) Float64x4 {
- im := mask.AsInt64x4().AsInt8x32()
- ix := x.AsInt8x32()
- iy := y.AsInt8x32()
- return iy.blend(ix, im).AsFloat64x4()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int8x64) Masked(mask Mask8x64) Int8x64 {
- im := mask.AsInt8x64()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int8x64) Merge(y Int8x64, mask Mask8x64) Int8x64 {
- return y.blendMasked(x, mask)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int16x32) Masked(mask Mask16x32) Int16x32 {
- im := mask.AsInt16x32()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int16x32) Merge(y Int16x32, mask Mask16x32) Int16x32 {
- return y.blendMasked(x, mask)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int32x16) Masked(mask Mask32x16) Int32x16 {
- im := mask.AsInt32x16()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int32x16) Merge(y Int32x16, mask Mask32x16) Int32x16 {
- return y.blendMasked(x, mask)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Int64x8) Masked(mask Mask64x8) Int64x8 {
- im := mask.AsInt64x8()
- return im.And(x)
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Int64x8) Merge(y Int64x8, mask Mask64x8) Int64x8 {
- return y.blendMasked(x, mask)
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint8x64) Masked(mask Mask8x64) Uint8x64 {
- im := mask.AsInt8x64()
- return x.AsInt8x64().And(im).AsUint8x64()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint8x64) Merge(y Uint8x64, mask Mask8x64) Uint8x64 {
- ix := x.AsInt8x64()
- iy := y.AsInt8x64()
- return iy.blendMasked(ix, mask).AsUint8x64()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint16x32) Masked(mask Mask16x32) Uint16x32 {
- im := mask.AsInt16x32()
- return x.AsInt16x32().And(im).AsUint16x32()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint16x32) Merge(y Uint16x32, mask Mask16x32) Uint16x32 {
- ix := x.AsInt16x32()
- iy := y.AsInt16x32()
- return iy.blendMasked(ix, mask).AsUint16x32()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint32x16) Masked(mask Mask32x16) Uint32x16 {
- im := mask.AsInt32x16()
- return x.AsInt32x16().And(im).AsUint32x16()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint32x16) Merge(y Uint32x16, mask Mask32x16) Uint32x16 {
- ix := x.AsInt32x16()
- iy := y.AsInt32x16()
- return iy.blendMasked(ix, mask).AsUint32x16()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Uint64x8) Masked(mask Mask64x8) Uint64x8 {
- im := mask.AsInt64x8()
- return x.AsInt64x8().And(im).AsUint64x8()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Uint64x8) Merge(y Uint64x8, mask Mask64x8) Uint64x8 {
- ix := x.AsInt64x8()
- iy := y.AsInt64x8()
- return iy.blendMasked(ix, mask).AsUint64x8()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float32x16) Masked(mask Mask32x16) Float32x16 {
- im := mask.AsInt32x16()
- return x.AsInt32x16().And(im).AsFloat32x16()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float32x16) Merge(y Float32x16, mask Mask32x16) Float32x16 {
- ix := x.AsInt32x16()
- iy := y.AsInt32x16()
- return iy.blendMasked(ix, mask).AsFloat32x16()
-}
-
-// Masked returns x but with elements zeroed where mask is false.
-func (x Float64x8) Masked(mask Mask64x8) Float64x8 {
- im := mask.AsInt64x8()
- return x.AsInt64x8().And(im).AsFloat64x8()
-}
-
-// Merge returns x but with elements set to y where mask is false.
-func (x Float64x8) Merge(y Float64x8, mask Mask64x8) Float64x8 {
- ix := x.AsInt64x8()
- iy := y.AsInt64x8()
- return iy.blendMasked(ix, mask).AsFloat64x8()
-}
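All of the generated Masked/Merge methods above implement the same element-wise rule, just at different lane widths and via either a blend or a blendMasked. A scalar sketch of that rule over plain slices (maskedRef and mergeRef are illustrative helpers, not part of the package):

// maskedRef keeps x[i] where mask[i] is true and yields zero elsewhere,
// the rule documented for Masked above.
func maskedRef(x []int32, mask []bool) []int32 {
	out := make([]int32, len(x))
	for i := range x {
		if mask[i] {
			out[i] = x[i]
		}
	}
	return out
}

// mergeRef keeps x[i] where mask[i] is true and takes y[i] elsewhere,
// the rule documented for Merge above.
func mergeRef(x, y []int32, mask []bool) []int32 {
	out := make([]int32, len(x))
	for i := range x {
		if mask[i] {
			out[i] = x[i]
		} else {
			out[i] = y[i]
		}
	}
	return out
}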
+++ /dev/null
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-/* AESDecryptLastRound */
-
-// AESDecryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
-//
-// Asm: VAESDECLAST, CPU Feature: AVX, AES
-func (x Uint8x16) AESDecryptLastRound(y Uint32x4) Uint8x16
-
-// AESDecryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
-//
-// Asm: VAESDECLAST, CPU Feature: AVX512VAES
-func (x Uint8x32) AESDecryptLastRound(y Uint32x8) Uint8x32
-
-// AESDecryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvShiftRows(InvSubBytes(x)), y)
-//
-// Asm: VAESDECLAST, CPU Feature: AVX512VAES
-func (x Uint8x64) AESDecryptLastRound(y Uint32x16) Uint8x64
-
-/* AESDecryptOneRound */
-
-// AESDecryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
-//
-// Asm: VAESDEC, CPU Feature: AVX, AES
-func (x Uint8x16) AESDecryptOneRound(y Uint32x4) Uint8x16
-
-// AESDecryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
-//
-// Asm: VAESDEC, CPU Feature: AVX512VAES
-func (x Uint8x32) AESDecryptOneRound(y Uint32x8) Uint8x32
-
-// AESDecryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of dw array in use.
-// result = AddRoundKey(InvMixColumns(InvShiftRows(InvSubBytes(x))), y)
-//
-// Asm: VAESDEC, CPU Feature: AVX512VAES
-func (x Uint8x64) AESDecryptOneRound(y Uint32x16) Uint8x64
-
-/* AESEncryptLastRound */
-
-// AESEncryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENCLAST, CPU Feature: AVX, AES
-func (x Uint8x16) AESEncryptLastRound(y Uint32x4) Uint8x16
-
-// AESEncryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENCLAST, CPU Feature: AVX512VAES
-func (x Uint8x32) AESEncryptLastRound(y Uint32x8) Uint8x32
-
-// AESEncryptLastRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey((ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENCLAST, CPU Feature: AVX512VAES
-func (x Uint8x64) AESEncryptLastRound(y Uint32x16) Uint8x64
-
-/* AESEncryptOneRound */
-
-// AESEncryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENC, CPU Feature: AVX, AES
-func (x Uint8x16) AESEncryptOneRound(y Uint32x4) Uint8x16
-
-// AESEncryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENC, CPU Feature: AVX512VAES
-func (x Uint8x32) AESEncryptOneRound(y Uint32x8) Uint8x32
-
-// AESEncryptOneRound performs a series of operations in AES cipher algorithm defined in FIPS 197.
-// x is the state array, starting from low index to high are s00, s10, s20, s30, s01, ..., s33.
-// y is the chunk of w array in use.
-// result = AddRoundKey(MixColumns(ShiftRows(SubBytes(x))), y)
-//
-// Asm: VAESENC, CPU Feature: AVX512VAES
-func (x Uint8x64) AESEncryptOneRound(y Uint32x16) Uint8x64
-
-/* AESInvMixColumns */
-
-// AESInvMixColumns performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
-// x is the chunk of w array in use.
-// result = InvMixColumns(x)
-//
-// Asm: VAESIMC, CPU Feature: AVX, AES
-func (x Uint32x4) AESInvMixColumns() Uint32x4
-
-/* AESRoundKeyGenAssist */
-
-// AESRoundKeyGenAssist performs some components of KeyExpansion in AES cipher algorithm defined in FIPS 197.
-// x is an array of AES words; all four words contribute to the result as shown below.
-// r is a value from the Rcon constant array.
-// result[0] = XOR(SubWord(RotWord(x[0])), r)
-// result[1] = SubWord(x[1])
-// result[2] = XOR(SubWord(RotWord(x[2])), r)
-// result[3] = SubWord(x[3])
-//
-// rconVal results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VAESKEYGENASSIST, CPU Feature: AVX, AES
-func (x Uint32x4) AESRoundKeyGenAssist(rconVal uint8) Uint32x4
-
-/* Abs */
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSB, CPU Feature: AVX
-func (x Int8x16) Abs() Int8x16
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSB, CPU Feature: AVX2
-func (x Int8x32) Abs() Int8x32
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSB, CPU Feature: AVX512
-func (x Int8x64) Abs() Int8x64
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSW, CPU Feature: AVX
-func (x Int16x8) Abs() Int16x8
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSW, CPU Feature: AVX2
-func (x Int16x16) Abs() Int16x16
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSW, CPU Feature: AVX512
-func (x Int16x32) Abs() Int16x32
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSD, CPU Feature: AVX
-func (x Int32x4) Abs() Int32x4
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSD, CPU Feature: AVX2
-func (x Int32x8) Abs() Int32x8
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSD, CPU Feature: AVX512
-func (x Int32x16) Abs() Int32x16
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSQ, CPU Feature: AVX512
-func (x Int64x2) Abs() Int64x2
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSQ, CPU Feature: AVX512
-func (x Int64x4) Abs() Int64x4
-
-// Abs computes the absolute value of each element.
-//
-// Asm: VPABSQ, CPU Feature: AVX512
-func (x Int64x8) Abs() Int64x8
-
-/* Add */
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPS, CPU Feature: AVX
-func (x Float32x4) Add(y Float32x4) Float32x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPS, CPU Feature: AVX
-func (x Float32x8) Add(y Float32x8) Float32x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPS, CPU Feature: AVX512
-func (x Float32x16) Add(y Float32x16) Float32x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPD, CPU Feature: AVX
-func (x Float64x2) Add(y Float64x2) Float64x2
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPD, CPU Feature: AVX
-func (x Float64x4) Add(y Float64x4) Float64x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VADDPD, CPU Feature: AVX512
-func (x Float64x8) Add(y Float64x8) Float64x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX
-func (x Int8x16) Add(y Int8x16) Int8x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX2
-func (x Int8x32) Add(y Int8x32) Int8x32
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX512
-func (x Int8x64) Add(y Int8x64) Int8x64
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX
-func (x Int16x8) Add(y Int16x8) Int16x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX2
-func (x Int16x16) Add(y Int16x16) Int16x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX512
-func (x Int16x32) Add(y Int16x32) Int16x32
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX
-func (x Int32x4) Add(y Int32x4) Int32x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX2
-func (x Int32x8) Add(y Int32x8) Int32x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX512
-func (x Int32x16) Add(y Int32x16) Int32x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX
-func (x Int64x2) Add(y Int64x2) Int64x2
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX2
-func (x Int64x4) Add(y Int64x4) Int64x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX512
-func (x Int64x8) Add(y Int64x8) Int64x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX
-func (x Uint8x16) Add(y Uint8x16) Uint8x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX2
-func (x Uint8x32) Add(y Uint8x32) Uint8x32
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDB, CPU Feature: AVX512
-func (x Uint8x64) Add(y Uint8x64) Uint8x64
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX
-func (x Uint16x8) Add(y Uint16x8) Uint16x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX2
-func (x Uint16x16) Add(y Uint16x16) Uint16x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDW, CPU Feature: AVX512
-func (x Uint16x32) Add(y Uint16x32) Uint16x32
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX
-func (x Uint32x4) Add(y Uint32x4) Uint32x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX2
-func (x Uint32x8) Add(y Uint32x8) Uint32x8
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDD, CPU Feature: AVX512
-func (x Uint32x16) Add(y Uint32x16) Uint32x16
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX
-func (x Uint64x2) Add(y Uint64x2) Uint64x2
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX2
-func (x Uint64x4) Add(y Uint64x4) Uint64x4
-
-// Add adds corresponding elements of two vectors.
-//
-// Asm: VPADDQ, CPU Feature: AVX512
-func (x Uint64x8) Add(y Uint64x8) Uint64x8
-
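As a usage sketch, the Add methods combine a load, the operation, and a store, in the same style as the tests earlier in this diff (goexperiment.simd build tag, amd64, AVX); the slices and values here are illustrative:

// Element-wise add of two 256-bit float64 vectors.
a := simd.LoadFloat64x4Slice([]float64{1, 2, 3, 4})
b := simd.LoadFloat64x4Slice([]float64{10, 20, 30, 40})
sum := make([]float64, 4)
a.Add(b).StoreSlice(sum) // sum is now [11, 22, 33, 44]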
-/* AddPairs */
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x4) AddPairs(y Float32x4) Float32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) AddPairs(y Float32x8) Float32x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x2) AddPairs(y Float64x2) Float64x2
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x4) AddPairs(y Float64x4) Float64x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Int16x8) AddPairs(y Int16x8) Int16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Int16x16) AddPairs(y Int16x16) Int16x16
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Int32x4) AddPairs(y Int32x4) Int32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Int32x8) AddPairs(y Int32x8) Int32x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8
-
-/* AddPairsSaturated */
-
-// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX
-func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
-
-// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VPHADDSW, CPU Feature: AVX2
-func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16
-
-/* AddSaturated */
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX
-func (x Int8x16) AddSaturated(y Int8x16) Int8x16
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX2
-func (x Int8x32) AddSaturated(y Int8x32) Int8x32
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSB, CPU Feature: AVX512
-func (x Int8x64) AddSaturated(y Int8x64) Int8x64
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX
-func (x Int16x8) AddSaturated(y Int16x8) Int16x8
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX2
-func (x Int16x16) AddSaturated(y Int16x16) Int16x16
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDSW, CPU Feature: AVX512
-func (x Int16x32) AddSaturated(y Int16x32) Int16x32
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSB, CPU Feature: AVX
-func (x Uint8x16) AddSaturated(y Uint8x16) Uint8x16
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSB, CPU Feature: AVX2
-func (x Uint8x32) AddSaturated(y Uint8x32) Uint8x32
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSB, CPU Feature: AVX512
-func (x Uint8x64) AddSaturated(y Uint8x64) Uint8x64
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSW, CPU Feature: AVX
-func (x Uint16x8) AddSaturated(y Uint16x8) Uint16x8
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSW, CPU Feature: AVX2
-func (x Uint16x16) AddSaturated(y Uint16x16) Uint16x16
-
-// AddSaturated adds corresponding elements of two vectors with saturation.
-//
-// Asm: VPADDUSW, CPU Feature: AVX512
-func (x Uint16x32) AddSaturated(y Uint16x32) Uint16x32
-
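In the AddSaturated methods above, "with saturation" means the mathematical sum is clamped to the element type's range instead of wrapping; the unsigned variants clamp to [0, 2^n-1] analogously. A scalar sketch for int8 elements (addSaturatedInt8 is an illustrative helper, not part of the package):

// addSaturatedInt8 clamps the true sum to [-128, 127] instead of letting
// it wrap, matching the "with saturation" wording above.
func addSaturatedInt8(a, b int8) int8 {
	s := int16(a) + int16(b) // widen so the true sum cannot overflow
	if s > 127 {
		return 127
	}
	if s < -128 {
		return -128
	}
	return int8(s)
}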
-/* AddSub */
-
-// AddSub subtracts even elements and adds odd elements of two vectors.
-//
-// Asm: VADDSUBPS, CPU Feature: AVX
-func (x Float32x4) AddSub(y Float32x4) Float32x4
-
-// AddSub subtracts even elements and adds odd elements of two vectors.
-//
-// Asm: VADDSUBPS, CPU Feature: AVX
-func (x Float32x8) AddSub(y Float32x8) Float32x8
-
-// AddSub subtracts even elements and adds odd elements of two vectors.
-//
-// Asm: VADDSUBPD, CPU Feature: AVX
-func (x Float64x2) AddSub(y Float64x2) Float64x2
-
-// AddSub subtracts even elements and adds odd elements of two vectors.
-//
-// Asm: VADDSUBPD, CPU Feature: AVX
-func (x Float64x4) AddSub(y Float64x4) Float64x4
-
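The AddSub methods alternate the two operations by element position: even-indexed results are x[i]-y[i] and odd-indexed results are x[i]+y[i]. A scalar sketch over slices (addSubRef is an illustrative helper, not part of the package):

// addSubRef subtracts at even indices and adds at odd indices,
// the per-element rule described for AddSub above.
func addSubRef(x, y []float64) []float64 {
	out := make([]float64, len(x))
	for i := range x {
		if i%2 == 0 {
			out[i] = x[i] - y[i]
		} else {
			out[i] = x[i] + y[i]
		}
	}
	return out
}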
-/* And */
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Int8x16) And(y Int8x16) Int8x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Int8x32) And(y Int8x32) Int8x32
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Int8x64) And(y Int8x64) Int8x64
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Int16x8) And(y Int16x8) Int16x8
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Int16x16) And(y Int16x16) Int16x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Int16x32) And(y Int16x32) Int16x32
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Int32x4) And(y Int32x4) Int32x4
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Int32x8) And(y Int32x8) Int32x8
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Int32x16) And(y Int32x16) Int32x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Int64x2) And(y Int64x2) Int64x2
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Int64x4) And(y Int64x4) Int64x4
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDQ, CPU Feature: AVX512
-func (x Int64x8) And(y Int64x8) Int64x8
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Uint8x16) And(y Uint8x16) Uint8x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Uint8x32) And(y Uint8x32) Uint8x32
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Uint8x64) And(y Uint8x64) Uint8x64
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Uint16x8) And(y Uint16x8) Uint16x8
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Uint16x16) And(y Uint16x16) Uint16x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Uint16x32) And(y Uint16x32) Uint16x32
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Uint32x4) And(y Uint32x4) Uint32x4
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Uint32x8) And(y Uint32x8) Uint32x8
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDD, CPU Feature: AVX512
-func (x Uint32x16) And(y Uint32x16) Uint32x16
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX
-func (x Uint64x2) And(y Uint64x2) Uint64x2
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPAND, CPU Feature: AVX2
-func (x Uint64x4) And(y Uint64x4) Uint64x4
-
-// And performs a bitwise AND operation between two vectors.
-//
-// Asm: VPANDQ, CPU Feature: AVX512
-func (x Uint64x8) And(y Uint64x8) Uint64x8
-
-/* AndNot */
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Int8x16) AndNot(y Int8x16) Int8x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Int8x32) AndNot(y Int8x32) Int8x32
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Int8x64) AndNot(y Int8x64) Int8x64
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Int16x8) AndNot(y Int16x8) Int16x8
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Int16x16) AndNot(y Int16x16) Int16x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Int16x32) AndNot(y Int16x32) Int16x32
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Int32x4) AndNot(y Int32x4) Int32x4
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Int32x8) AndNot(y Int32x8) Int32x8
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Int32x16) AndNot(y Int32x16) Int32x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Int64x2) AndNot(y Int64x2) Int64x2
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Int64x4) AndNot(y Int64x4) Int64x4
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDNQ, CPU Feature: AVX512
-func (x Int64x8) AndNot(y Int64x8) Int64x8
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Uint8x16) AndNot(y Uint8x16) Uint8x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Uint8x32) AndNot(y Uint8x32) Uint8x32
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Uint8x64) AndNot(y Uint8x64) Uint8x64
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Uint16x8) AndNot(y Uint16x8) Uint16x8
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Uint16x16) AndNot(y Uint16x16) Uint16x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Uint16x32) AndNot(y Uint16x32) Uint16x32
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Uint32x4) AndNot(y Uint32x4) Uint32x4
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Uint32x8) AndNot(y Uint32x8) Uint32x8
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDND, CPU Feature: AVX512
-func (x Uint32x16) AndNot(y Uint32x16) Uint32x16
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX
-func (x Uint64x2) AndNot(y Uint64x2) Uint64x2
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDN, CPU Feature: AVX2
-func (x Uint64x4) AndNot(y Uint64x4) Uint64x4
-
-// AndNot performs a bitwise x &^ y.
-//
-// Asm: VPANDNQ, CPU Feature: AVX512
-func (x Uint64x8) AndNot(y Uint64x8) Uint64x8
-
-/* Average */
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGB, CPU Feature: AVX
-func (x Uint8x16) Average(y Uint8x16) Uint8x16
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGB, CPU Feature: AVX2
-func (x Uint8x32) Average(y Uint8x32) Uint8x32
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGB, CPU Feature: AVX512
-func (x Uint8x64) Average(y Uint8x64) Uint8x64
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGW, CPU Feature: AVX
-func (x Uint16x8) Average(y Uint16x8) Uint16x8
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGW, CPU Feature: AVX2
-func (x Uint16x16) Average(y Uint16x16) Uint16x16
-
-// Average computes the rounded average of corresponding elements.
-//
-// Asm: VPAVGW, CPU Feature: AVX512
-func (x Uint16x32) Average(y Uint16x32) Uint16x32
-
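The "rounded average" in the Average methods above is (x+y+1)/2 computed without intermediate overflow, so exact halves round up, which is the usual PAVG behavior. A scalar sketch for uint8 elements (averageUint8 is an illustrative helper, not part of the package):

// averageUint8 computes the rounded average (x+y+1)/2 in a wider type so
// the intermediate sum cannot overflow.
func averageUint8(x, y uint8) uint8 {
	return uint8((uint16(x) + uint16(y) + 1) >> 1)
}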
-/* Broadcast128 */
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast128() Float32x4
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Float64x2) Broadcast128() Float64x2
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast128() Int8x16
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast128() Int16x8
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast128() Int32x4
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast128() Int64x2
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast128() Uint8x16
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast128() Uint16x8
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast128() Uint32x4
-
-// Broadcast128 copies element zero of its (128-bit) input to all elements of
-// the 128-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast128() Uint64x2
-
-/* Broadcast256 */
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX2
-func (x Float32x4) Broadcast256() Float32x8
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX2
-func (x Float64x2) Broadcast256() Float64x4
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Int8x16) Broadcast256() Int8x32
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Int16x8) Broadcast256() Int16x16
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Int32x4) Broadcast256() Int32x8
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Int64x2) Broadcast256() Int64x4
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX2
-func (x Uint8x16) Broadcast256() Uint8x32
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX2
-func (x Uint16x8) Broadcast256() Uint16x16
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX2
-func (x Uint32x4) Broadcast256() Uint32x8
-
-// Broadcast256 copies element zero of its (128-bit) input to all elements of
-// the 256-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX2
-func (x Uint64x2) Broadcast256() Uint64x4
-
-/* Broadcast512 */
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VBROADCASTSS, CPU Feature: AVX512
-func (x Float32x4) Broadcast512() Float32x16
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VBROADCASTSD, CPU Feature: AVX512
-func (x Float64x2) Broadcast512() Float64x8
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Int8x16) Broadcast512() Int8x64
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Int16x8) Broadcast512() Int16x32
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Int32x4) Broadcast512() Int32x16
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Int64x2) Broadcast512() Int64x8
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTB, CPU Feature: AVX512
-func (x Uint8x16) Broadcast512() Uint8x64
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTW, CPU Feature: AVX512
-func (x Uint16x8) Broadcast512() Uint16x32
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTD, CPU Feature: AVX512
-func (x Uint32x4) Broadcast512() Uint32x16
-
-// Broadcast512 copies element zero of its (128-bit) input to all elements of
-// the 512-bit output vector.
-//
-// Asm: VPBROADCASTQ, CPU Feature: AVX512
-func (x Uint64x2) Broadcast512() Uint64x8
-
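As a usage sketch, the Broadcast methods replicate element zero of a 128-bit vector into a wider result. For example, broadcasting a float64 into a 256-bit vector, reusing the loaders and StoreSlice seen in the tests earlier in this diff (values are illustrative; requires goexperiment.simd, amd64, and AVX2):

src := simd.LoadFloat64x2Slice([]float64{3.5, -1}) // element 0 is 3.5
dst := make([]float64, 4)
src.Broadcast256().StoreSlice(dst) // dst is now [3.5, 3.5, 3.5, 3.5]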
-/* Ceil */
-
-// Ceil rounds elements up to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x4) Ceil() Float32x4
-
-// Ceil rounds elements up to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x8) Ceil() Float32x8
-
-// Ceil rounds elements up to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x2) Ceil() Float64x2
-
-// Ceil rounds elements up to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x4) Ceil() Float64x4
-
-/* CeilScaled */
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x4) CeilScaled(prec uint8) Float32x4
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x8) CeilScaled(prec uint8) Float32x8
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x16) CeilScaled(prec uint8) Float32x16
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x2) CeilScaled(prec uint8) Float64x2
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x4) CeilScaled(prec uint8) Float64x4
-
-// CeilScaled rounds elements up with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x8) CeilScaled(prec uint8) Float64x8
-
-/* CeilScaledResidue */
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x4) CeilScaledResidue(prec uint8) Float32x4
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x8) CeilScaledResidue(prec uint8) Float32x8
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x16) CeilScaledResidue(prec uint8) Float32x16
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x2) CeilScaledResidue(prec uint8) Float64x2
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x4) CeilScaledResidue(prec uint8) Float64x4
-
-// CeilScaledResidue computes the difference after ceiling with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x8) CeilScaledResidue(prec uint8) Float64x8
-
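Read together, CeilScaled rounds up at a power-of-two granularity and CeilScaledResidue returns what was rounded away; with prec == 0 that reduces to plain Ceil and x - Ceil(x), which is what TestCeilScaledResidue earlier in this diff checks. A scalar sketch under the assumption that prec counts binary fraction bits (the VRNDSCALE/VREDUCE scale), so for example ceilScaledRef(1.3, 1) == 1.5 and the residue is -0.2; these helpers use the math package and are illustrative only:

// ceilScaledRef rounds x up to a multiple of 2^-prec.
func ceilScaledRef(x float64, prec uint8) float64 {
	scale := math.Ldexp(1, int(prec)) // 2^prec
	return math.Ceil(x*scale) / scale
}

// ceilScaledResidueRef is the difference left over after that ceiling step.
func ceilScaledResidueRef(x float64, prec uint8) float64 {
	return x - ceilScaledRef(x, prec)
}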
-/* Compress */
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and pack them to lower indexed elements.
-//
-// Asm: VCOMPRESSPS, CPU Feature: AVX512
-func (x Float32x4) Compress(mask Mask32x4) Float32x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VCOMPRESSPS, CPU Feature: AVX512
-func (x Float32x8) Compress(mask Mask32x8) Float32x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VCOMPRESSPS, CPU Feature: AVX512
-func (x Float32x16) Compress(mask Mask32x16) Float32x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VCOMPRESSPD, CPU Feature: AVX512
-func (x Float64x2) Compress(mask Mask64x2) Float64x2
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VCOMPRESSPD, CPU Feature: AVX512
-func (x Float64x4) Compress(mask Mask64x4) Float64x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VCOMPRESSPD, CPU Feature: AVX512
-func (x Float64x8) Compress(mask Mask64x8) Float64x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Int8x16) Compress(mask Mask8x16) Int8x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Int8x32) Compress(mask Mask8x32) Int8x32
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Int8x64) Compress(mask Mask8x64) Int8x64
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Int16x8) Compress(mask Mask16x8) Int16x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Int16x16) Compress(mask Mask16x16) Int16x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Int16x32) Compress(mask Mask16x32) Int16x32
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Int32x4) Compress(mask Mask32x4) Int32x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Int32x8) Compress(mask Mask32x8) Int32x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Int32x16) Compress(mask Mask32x16) Int32x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Int64x2) Compress(mask Mask64x2) Int64x2
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Int64x4) Compress(mask Mask64x4) Int64x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Int64x8) Compress(mask Mask64x8) Int64x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Uint8x16) Compress(mask Mask8x16) Uint8x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Uint8x32) Compress(mask Mask8x32) Uint8x32
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSB, CPU Feature: AVX512VBMI2
-func (x Uint8x64) Compress(mask Mask8x64) Uint8x64
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) Compress(mask Mask16x8) Uint16x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) Compress(mask Mask16x16) Uint16x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) Compress(mask Mask16x32) Uint16x32
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Uint32x4) Compress(mask Mask32x4) Uint32x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Uint32x8) Compress(mask Mask32x8) Uint32x8
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSD, CPU Feature: AVX512
-func (x Uint32x16) Compress(mask Mask32x16) Uint32x16
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Uint64x2) Compress(mask Mask64x2) Uint64x2
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Uint64x4) Compress(mask Mask64x4) Uint64x4
-
-// Compress performs a compression on vector x using mask by
-// selecting elements as indicated by mask, and packing them into lower-indexed elements.
-//
-// Asm: VPCOMPRESSQ, CPU Feature: AVX512
-func (x Uint64x8) Compress(mask Mask64x8) Uint64x8
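A plain-Go scalar sketch of the Compress semantics above, working on int32 lanes held in a slice. The doc comments only specify the packing, so leaving the trailing lanes zeroed is an assumption of this illustrative model.

package sketch

// compress models Compress: lanes of x whose mask bit is set are packed,
// in order, into the low-indexed lanes of the result. The remaining
// lanes are shown zeroed here (an assumption).
func compress(x []int32, mask []bool) []int32 {
	out := make([]int32, len(x))
	j := 0
	for i, keep := range mask {
		if keep {
			out[j] = x[i]
			j++
		}
	}
	return out
}

// compress([]int32{1, 2, 3, 4}, []bool{false, true, false, true})
// yields []int32{2, 4, 0, 0}.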
-
-/* ConcatPermute */
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x16) ConcatPermute(y Int8x16, indices Uint8x16) Int8x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x16) ConcatPermute(y Uint8x16, indices Uint8x16) Uint8x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x32) ConcatPermute(y Int8x32, indices Uint8x32) Int8x32
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x32) ConcatPermute(y Uint8x32, indices Uint8x32) Uint8x32
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Int8x64) ConcatPermute(y Int8x64, indices Uint8x64) Int8x64
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2B, CPU Feature: AVX512VBMI
-func (x Uint8x64) ConcatPermute(y Uint8x64, indices Uint8x64) Uint8x64
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x8) ConcatPermute(y Int16x8, indices Uint16x8) Int16x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x8) ConcatPermute(y Uint16x8, indices Uint16x8) Uint16x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x16) ConcatPermute(y Int16x16, indices Uint16x16) Int16x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x16) ConcatPermute(y Uint16x16, indices Uint16x16) Uint16x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Int16x32) ConcatPermute(y Int16x32, indices Uint16x32) Int16x32
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2W, CPU Feature: AVX512
-func (x Uint16x32) ConcatPermute(y Uint16x32, indices Uint16x32) Uint16x32
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x4) ConcatPermute(y Float32x4, indices Uint32x4) Float32x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x4) ConcatPermute(y Int32x4, indices Uint32x4) Int32x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x4) ConcatPermute(y Uint32x4, indices Uint32x4) Uint32x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x8) ConcatPermute(y Float32x8, indices Uint32x8) Float32x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x8) ConcatPermute(y Int32x8, indices Uint32x8) Int32x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x8) ConcatPermute(y Uint32x8, indices Uint32x8) Uint32x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PS, CPU Feature: AVX512
-func (x Float32x16) ConcatPermute(y Float32x16, indices Uint32x16) Float32x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Int32x16) ConcatPermute(y Int32x16, indices Uint32x16) Int32x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2D, CPU Feature: AVX512
-func (x Uint32x16) ConcatPermute(y Uint32x16, indices Uint32x16) Uint32x16
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x2) ConcatPermute(y Float64x2, indices Uint64x2) Float64x2
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x2) ConcatPermute(y Int64x2, indices Uint64x2) Int64x2
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x2) ConcatPermute(y Uint64x2, indices Uint64x2) Uint64x2
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x4) ConcatPermute(y Float64x4, indices Uint64x4) Float64x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x4) ConcatPermute(y Int64x4, indices Uint64x4) Int64x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x4) ConcatPermute(y Uint64x4, indices Uint64x4) Uint64x4
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2PD, CPU Feature: AVX512
-func (x Float64x8) ConcatPermute(y Float64x8, indices Uint64x8) Float64x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Int64x8) ConcatPermute(y Int64x8, indices Uint64x8) Int64x8
-
-// ConcatPermute performs a full permutation of vector x, y using indices:
-// result := {xy[indices[0]], xy[indices[1]], ..., xy[indices[n]]}
-// where xy is the concatenation of x (lower half) and y (upper half).
-// Only the needed bits to represent xy's index are used in indices' elements.
-//
-// Asm: VPERMI2Q, CPU Feature: AVX512
-func (x Uint64x8) ConcatPermute(y Uint64x8, indices Uint64x8) Uint64x8
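A scalar sketch of ConcatPermute matching the description above: x forms the lower half of the virtual vector xy, y the upper half, and only the bits needed to address xy are taken from each index (modulo works as the bit mask here because lane counts are powers of two). The helper is illustrative, not the intrinsic's implementation.

package sketch

// concatPermute models ConcatPermute for int32 lanes:
// out[i] = xy[indices[i]], where xy is x (lower half) followed by
// y (upper half) and only the low bits of each index are used.
func concatPermute(x, y []int32, indices []uint32) []int32 {
	xy := append(append([]int32{}, x...), y...)
	out := make([]int32, len(indices))
	for i, idx := range indices {
		out[i] = xy[int(idx)%len(xy)] // len(xy) is a power of two, so % acts as a bit mask
	}
	return out
}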
-
-/* ConcatShiftBytesRight */
-
-// ConcatShiftBytesRight concatenates x and y and shifts the result right by constant bytes.
-// The result vector will be the lower half of the concatenated vector.
-//
-// constant results in better performance when it's a compile-time constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPALIGNR, CPU Feature: AVX
-func (x Uint8x16) ConcatShiftBytesRight(constant uint8, y Uint8x16) Uint8x16
-
-/* ConcatShiftBytesRightGrouped */
-
-// ConcatShiftBytesRightGrouped concatenates x and y and shifts the result right by constant bytes.
-// The result vector will be the lower half of the concatenated vector.
-// This operation is performed in groups of 16 bytes.
-//
-// constant results in better performance when it's a compile-time constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPALIGNR, CPU Feature: AVX2
-func (x Uint8x32) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x32) Uint8x32
-
-// ConcatShiftBytesRightGrouped concatenates x and y and shifts the result right by constant bytes.
-// The result vector will be the lower half of the concatenated vector.
-// This operation is performed in groups of 16 bytes.
-//
-// constant results in better performance when it's a compile-time constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPALIGNR, CPU Feature: AVX512
-func (x Uint8x64) ConcatShiftBytesRightGrouped(constant uint8, y Uint8x64) Uint8x64
-
-/* ConvertToFloat32 */
-
-// ConvertToFloat32 converts element values to float32.
-// The result vector's elements are rounded to the nearest value.
-//
-// Asm: VCVTPD2PSX, CPU Feature: AVX
-func (x Float64x2) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-// The result vector's elements are rounded to the nearest value.
-//
-// Asm: VCVTPD2PSY, CPU Feature: AVX
-func (x Float64x4) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-// The result vector's elements are rounded to the nearest value.
-//
-// Asm: VCVTPD2PS, CPU Feature: AVX512
-func (x Float64x8) ConvertToFloat32() Float32x8
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTDQ2PS, CPU Feature: AVX
-func (x Int32x4) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTDQ2PS, CPU Feature: AVX
-func (x Int32x8) ConvertToFloat32() Float32x8
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTDQ2PS, CPU Feature: AVX512
-func (x Int32x16) ConvertToFloat32() Float32x16
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTQQ2PSX, CPU Feature: AVX512
-func (x Int64x2) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTQQ2PSY, CPU Feature: AVX512
-func (x Int64x4) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTQQ2PS, CPU Feature: AVX512
-func (x Int64x8) ConvertToFloat32() Float32x8
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUDQ2PS, CPU Feature: AVX512
-func (x Uint32x4) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUDQ2PS, CPU Feature: AVX512
-func (x Uint32x8) ConvertToFloat32() Float32x8
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUDQ2PS, CPU Feature: AVX512
-func (x Uint32x16) ConvertToFloat32() Float32x16
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUQQ2PSX, CPU Feature: AVX512
-func (x Uint64x2) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUQQ2PSY, CPU Feature: AVX512
-func (x Uint64x4) ConvertToFloat32() Float32x4
-
-// ConvertToFloat32 converts element values to float32.
-//
-// Asm: VCVTUQQ2PS, CPU Feature: AVX512
-func (x Uint64x8) ConvertToFloat32() Float32x8
-
-/* ConvertToFloat64 */
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTPS2PD, CPU Feature: AVX
-func (x Float32x4) ConvertToFloat64() Float64x4
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTPS2PD, CPU Feature: AVX512
-func (x Float32x8) ConvertToFloat64() Float64x8
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTDQ2PD, CPU Feature: AVX
-func (x Int32x4) ConvertToFloat64() Float64x4
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTDQ2PD, CPU Feature: AVX512
-func (x Int32x8) ConvertToFloat64() Float64x8
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTQQ2PD, CPU Feature: AVX512
-func (x Int64x2) ConvertToFloat64() Float64x2
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTQQ2PD, CPU Feature: AVX512
-func (x Int64x4) ConvertToFloat64() Float64x4
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTQQ2PD, CPU Feature: AVX512
-func (x Int64x8) ConvertToFloat64() Float64x8
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTUDQ2PD, CPU Feature: AVX512
-func (x Uint32x4) ConvertToFloat64() Float64x4
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTUDQ2PD, CPU Feature: AVX512
-func (x Uint32x8) ConvertToFloat64() Float64x8
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTUQQ2PD, CPU Feature: AVX512
-func (x Uint64x2) ConvertToFloat64() Float64x2
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTUQQ2PD, CPU Feature: AVX512
-func (x Uint64x4) ConvertToFloat64() Float64x4
-
-// ConvertToFloat64 converts element values to float64.
-//
-// Asm: VCVTUQQ2PD, CPU Feature: AVX512
-func (x Uint64x8) ConvertToFloat64() Float64x8
-
-/* ConvertToInt32 */
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2DQ, CPU Feature: AVX
-func (x Float32x4) ConvertToInt32() Int32x4
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2DQ, CPU Feature: AVX
-func (x Float32x8) ConvertToInt32() Int32x8
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2DQ, CPU Feature: AVX512
-func (x Float32x16) ConvertToInt32() Int32x16
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2DQX, CPU Feature: AVX
-func (x Float64x2) ConvertToInt32() Int32x4
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2DQY, CPU Feature: AVX
-func (x Float64x4) ConvertToInt32() Int32x4
-
-// ConvertToInt32 converts element values to int32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2DQ, CPU Feature: AVX512
-func (x Float64x8) ConvertToInt32() Int32x8
-
-/* ConvertToInt64 */
-
-// ConvertToInt64 converts element values to int64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2QQ, CPU Feature: AVX512
-func (x Float32x4) ConvertToInt64() Int64x4
-
-// ConvertToInt64 converts element values to int64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2QQ, CPU Feature: AVX512
-func (x Float32x8) ConvertToInt64() Int64x8
-
-// ConvertToInt64 converts element values to int64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2QQ, CPU Feature: AVX512
-func (x Float64x2) ConvertToInt64() Int64x2
-
-// ConvertToInt64 converts element values to int64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2QQ, CPU Feature: AVX512
-func (x Float64x4) ConvertToInt64() Int64x4
-
-// ConvertToInt64 converts element values to int64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in int64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2QQ, CPU Feature: AVX512
-func (x Float64x8) ConvertToInt64() Int64x8
-
-/* ConvertToUint32 */
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
-func (x Float32x4) ConvertToUint32() Uint32x4
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
-func (x Float32x8) ConvertToUint32() Uint32x8
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2UDQ, CPU Feature: AVX512
-func (x Float32x16) ConvertToUint32() Uint32x16
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UDQX, CPU Feature: AVX512
-func (x Float64x2) ConvertToUint32() Uint32x4
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UDQY, CPU Feature: AVX512
-func (x Float64x4) ConvertToUint32() Uint32x4
-
-// ConvertToUint32 converts element values to uint32.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint32, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UDQ, CPU Feature: AVX512
-func (x Float64x8) ConvertToUint32() Uint32x8
-
-/* ConvertToUint64 */
-
-// ConvertToUint64 converts element values to uint64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2UQQ, CPU Feature: AVX512
-func (x Float32x4) ConvertToUint64() Uint64x4
-
-// ConvertToUint64 converts element values to uint64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPS2UQQ, CPU Feature: AVX512
-func (x Float32x8) ConvertToUint64() Uint64x8
-
-// ConvertToUint64 converts element values to uint64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
-func (x Float64x2) ConvertToUint64() Uint64x2
-
-// ConvertToUint64 converts element values to uint64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
-func (x Float64x4) ConvertToUint64() Uint64x4
-
-// ConvertToUint64 converts element values to uint64.
-// When a conversion is inexact, a truncated (round toward zero) value is returned.
-// If a converted result cannot be represented in uint64, an implementation-defined
-// architecture-specific value is returned.
-//
-// Asm: VCVTTPD2UQQ, CPU Feature: AVX512
-func (x Float64x8) ConvertToUint64() Uint64x8
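The ConvertToInt32, ConvertToInt64, ConvertToUint32, and ConvertToUint64 families above all truncate toward zero. A one-lane sketch of the in-range behaviour follows; out-of-range and NaN inputs yield an architecture-specific value and are deliberately not modelled.

package sketch

import "math"

// convertToInt32 models the in-range behaviour of the truncating
// conversions above: round toward zero, then narrow.
func convertToInt32(x float64) int32 {
	return int32(math.Trunc(x)) // 2.9 -> 2, -2.9 -> -2
}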
-
-/* CopySign */
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGNB, CPU Feature: AVX
-func (x Int8x16) CopySign(y Int8x16) Int8x16
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGNB, CPU Feature: AVX2
-func (x Int8x32) CopySign(y Int8x32) Int8x32
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGNW, CPU Feature: AVX
-func (x Int16x8) CopySign(y Int16x8) Int16x8
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGNW, CPU Feature: AVX2
-func (x Int16x16) CopySign(y Int16x16) Int16x16
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGND, CPU Feature: AVX
-func (x Int32x4) CopySign(y Int32x4) Int32x4
-
-// CopySign returns the product of the first operand with -1, 0, or 1,
-// whichever constant is nearest to the value of the second operand.
-//
-// Asm: VPSIGND, CPU Feature: AVX2
-func (x Int32x8) CopySign(y Int32x8) Int32x8
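A one-lane sketch of the VPSIGN-style CopySign documented above: for integer lanes, the multiplier -1, 0, or +1 is simply the sign of the second operand. Illustrative model only.

package sketch

// copySign models CopySign: x is negated, zeroed, or passed through
// depending on the sign of y.
func copySign(x, y int32) int32 {
	switch {
	case y > 0:
		return x
	case y < 0:
		return -x
	default:
		return 0
	}
}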
-
-/* Div */
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX
-func (x Float32x4) Div(y Float32x4) Float32x4
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX
-func (x Float32x8) Div(y Float32x8) Float32x8
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPS, CPU Feature: AVX512
-func (x Float32x16) Div(y Float32x16) Float32x16
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPD, CPU Feature: AVX
-func (x Float64x2) Div(y Float64x2) Float64x2
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPD, CPU Feature: AVX
-func (x Float64x4) Div(y Float64x4) Float64x4
-
-// Div divides elements of two vectors.
-//
-// Asm: VDIVPD, CPU Feature: AVX512
-func (x Float64x8) Div(y Float64x8) Float64x8
-
-/* DotProductPairs */
-
-// DotProductPairs multiplies the elements and adds the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX
-func (x Int16x8) DotProductPairs(y Int16x8) Int32x4
-
-// DotProductPairs multiplies the elements and adds the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX2
-func (x Int16x16) DotProductPairs(y Int16x16) Int32x8
-
-// DotProductPairs multiplies the elements and adds the pairs together,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDWD, CPU Feature: AVX512
-func (x Int16x32) DotProductPairs(y Int16x32) Int32x16
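A scalar sketch of DotProductPairs: adjacent lane pairs are multiplied in a wider type and summed, so the output has half as many lanes at twice the width. The Saturated variant below follows the same pattern but multiplies unsigned by signed bytes and saturates the int16 sums.

package sketch

// dotProductPairs models DotProductPairs for int16 lanes:
// out[i] = x[2i]*y[2i] + x[2i+1]*y[2i+1], computed in int32.
func dotProductPairs(x, y []int16) []int32 {
	out := make([]int32, len(x)/2)
	for i := range out {
		out[i] = int32(x[2*i])*int32(y[2*i]) + int32(x[2*i+1])*int32(y[2*i+1])
	}
	return out
}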
-
-/* DotProductPairsSaturated */
-
-// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDUBSW, CPU Feature: AVX
-func (x Uint8x16) DotProductPairsSaturated(y Int8x16) Int16x8
-
-// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDUBSW, CPU Feature: AVX2
-func (x Uint8x32) DotProductPairsSaturated(y Int8x32) Int16x16
-
-// DotProductPairsSaturated multiplies the elements and adds the pairs together with saturation,
-// yielding a vector of half as many elements with twice the input element size.
-//
-// Asm: VPMADDUBSW, CPU Feature: AVX512
-func (x Uint8x64) DotProductPairsSaturated(y Int8x64) Int16x32
-
-/* DotProductQuadruple */
-
-// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
-// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSD, CPU Feature: AVXVNNI
-func (x Int8x16) DotProductQuadruple(y Uint8x16) Int32x4
-
-// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
-// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSD, CPU Feature: AVXVNNI
-func (x Int8x32) DotProductQuadruple(y Uint8x32) Int32x8
-
-// DotProductQuadruple performs dot products on groups of 4 elements of x and y.
-// DotProductQuadruple(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSD, CPU Feature: AVX512VNNI
-func (x Int8x64) DotProductQuadruple(y Uint8x64) Int32x16
-
-/* DotProductQuadrupleSaturated */
-
-// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
-// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
-func (x Int8x16) DotProductQuadrupleSaturated(y Uint8x16) Int32x4
-
-// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
-// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSDS, CPU Feature: AVXVNNI
-func (x Int8x32) DotProductQuadrupleSaturated(y Uint8x32) Int32x8
-
-// DotProductQuadrupleSaturated performs dot products on groups of 4 elements of x and y, with saturation.
-// DotProductQuadrupleSaturated(x, y).Add(z) will be optimized to the full form of the underlying instruction.
-//
-// Asm: VPDPBUSDS, CPU Feature: AVX512VNNI
-func (x Int8x64) DotProductQuadrupleSaturated(y Uint8x64) Int32x16
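A scalar sketch of the quadruple dot products above: each int32 output lane is the sum of four adjacent byte products. Based on the method signatures, x is treated as signed and y as unsigned, and the accumulator starts at zero unless the .Add(z) form is fused; both points are assumptions about the underlying VPDPBUSD(S) semantics, and the helper is illustrative only.

package sketch

// dotProductQuadruple models DotProductQuadruple: out[i] is the sum of
// the four products x[4i+j]*y[4i+j] (j = 0..3), widened to int32.
// The signedness of each operand is assumed from the signatures above.
func dotProductQuadruple(x []int8, y []uint8) []int32 {
	out := make([]int32, len(x)/4)
	for i := range out {
		var sum int32
		for j := 0; j < 4; j++ {
			sum += int32(x[4*i+j]) * int32(y[4*i+j])
		}
		out[i] = sum
	}
	return out
}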
-
-/* Equal */
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX
-func (x Int8x16) Equal(y Int8x16) Mask8x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX2
-func (x Int8x32) Equal(y Int8x32) Mask8x32
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX512
-func (x Int8x64) Equal(y Int8x64) Mask8x64
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX
-func (x Int16x8) Equal(y Int16x8) Mask16x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX2
-func (x Int16x16) Equal(y Int16x16) Mask16x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX512
-func (x Int16x32) Equal(y Int16x32) Mask16x32
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX
-func (x Int32x4) Equal(y Int32x4) Mask32x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX2
-func (x Int32x8) Equal(y Int32x8) Mask32x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX512
-func (x Int32x16) Equal(y Int32x16) Mask32x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX
-func (x Int64x2) Equal(y Int64x2) Mask64x2
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX2
-func (x Int64x4) Equal(y Int64x4) Mask64x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX512
-func (x Int64x8) Equal(y Int64x8) Mask64x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX
-func (x Uint8x16) Equal(y Uint8x16) Mask8x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX2
-func (x Uint8x32) Equal(y Uint8x32) Mask8x32
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQB, CPU Feature: AVX512
-func (x Uint8x64) Equal(y Uint8x64) Mask8x64
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX
-func (x Uint16x8) Equal(y Uint16x8) Mask16x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX2
-func (x Uint16x16) Equal(y Uint16x16) Mask16x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQW, CPU Feature: AVX512
-func (x Uint16x32) Equal(y Uint16x32) Mask16x32
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX
-func (x Uint32x4) Equal(y Uint32x4) Mask32x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX2
-func (x Uint32x8) Equal(y Uint32x8) Mask32x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQD, CPU Feature: AVX512
-func (x Uint32x16) Equal(y Uint32x16) Mask32x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX
-func (x Uint64x2) Equal(y Uint64x2) Mask64x2
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX2
-func (x Uint64x4) Equal(y Uint64x4) Mask64x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VPCMPEQQ, CPU Feature: AVX512
-func (x Uint64x8) Equal(y Uint64x8) Mask64x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) Equal(y Float32x4) Mask32x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) Equal(y Float32x8) Mask32x8
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) Equal(y Float32x16) Mask32x16
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) Equal(y Float64x2) Mask64x2
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) Equal(y Float64x4) Mask64x4
-
-// Equal returns x equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) Equal(y Float64x8) Mask64x8
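All of the Equal methods above are plain lane-wise comparisons that produce a mask; a scalar sketch:

package sketch

// equal models Equal: one boolean mask lane per pair of input lanes.
func equal(x, y []int32) []bool {
	m := make([]bool, len(x))
	for i := range x {
		m[i] = x[i] == y[i]
	}
	return m
}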
-
-/* Expand */
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPS, CPU Feature: AVX512
-func (x Float32x4) Expand(mask Mask32x4) Float32x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPS, CPU Feature: AVX512
-func (x Float32x8) Expand(mask Mask32x8) Float32x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPS, CPU Feature: AVX512
-func (x Float32x16) Expand(mask Mask32x16) Float32x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPD, CPU Feature: AVX512
-func (x Float64x2) Expand(mask Mask64x2) Float64x2
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPD, CPU Feature: AVX512
-func (x Float64x4) Expand(mask Mask64x4) Float64x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VEXPANDPD, CPU Feature: AVX512
-func (x Float64x8) Expand(mask Mask64x8) Float64x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Int8x16) Expand(mask Mask8x16) Int8x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Int8x32) Expand(mask Mask8x32) Int8x32
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Int8x64) Expand(mask Mask8x64) Int8x64
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) Expand(mask Mask16x8) Int16x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) Expand(mask Mask16x16) Int16x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) Expand(mask Mask16x32) Int16x32
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Int32x4) Expand(mask Mask32x4) Int32x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Int32x8) Expand(mask Mask32x8) Int32x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Int32x16) Expand(mask Mask32x16) Int32x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Int64x2) Expand(mask Mask64x2) Int64x2
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Int64x4) Expand(mask Mask64x4) Int64x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Int64x8) Expand(mask Mask64x8) Int64x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Uint8x16) Expand(mask Mask8x16) Uint8x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Uint8x32) Expand(mask Mask8x32) Uint8x32
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDB, CPU Feature: AVX512VBMI2
-func (x Uint8x64) Expand(mask Mask8x64) Uint8x64
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) Expand(mask Mask16x8) Uint16x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) Expand(mask Mask16x16) Uint16x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) Expand(mask Mask16x32) Uint16x32
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Uint32x4) Expand(mask Mask32x4) Uint32x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Uint32x8) Expand(mask Mask32x8) Uint32x8
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDD, CPU Feature: AVX512
-func (x Uint32x16) Expand(mask Mask32x16) Uint32x16
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Uint64x2) Expand(mask Mask64x2) Uint64x2
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Uint64x4) Expand(mask Mask64x4) Uint64x4
-
-// Expand performs an expansion on a vector x whose elements are packed to lower parts.
-// The expansion is to distribute elements as indexed by mask, from lower mask elements to upper in order.
-//
-// Asm: VPEXPANDQ, CPU Feature: AVX512
-func (x Uint64x8) Expand(mask Mask64x8) Uint64x8
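Expand is the inverse of Compress: a scalar sketch of the semantics above, again assuming (as an illustrative choice) that unselected lanes come out as zero.

package sketch

// expand models Expand: the low-indexed lanes of x are distributed, in
// order, to the lanes whose mask bit is set; other lanes are shown
// zeroed here (an assumption).
func expand(x []int32, mask []bool) []int32 {
	out := make([]int32, len(x))
	j := 0
	for i, set := range mask {
		if set {
			out[i] = x[j]
			j++
		}
	}
	return out
}

// expand([]int32{2, 4, 0, 0}, []bool{false, true, false, true})
// yields []int32{0, 2, 0, 4}.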
-
-/* ExtendLo2ToInt64x2 */
-
-// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBQ, CPU Feature: AVX
-func (x Int8x16) ExtendLo2ToInt64x2() Int64x2
-
-// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWQ, CPU Feature: AVX
-func (x Int16x8) ExtendLo2ToInt64x2() Int64x2
-
-// ExtendLo2ToInt64x2 converts 2 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXDQ, CPU Feature: AVX
-func (x Int32x4) ExtendLo2ToInt64x2() Int64x2
-
-/* ExtendLo2ToUint64x2 */
-
-// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBQ, CPU Feature: AVX
-func (x Uint8x16) ExtendLo2ToUint64x2() Uint64x2
-
-// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWQ, CPU Feature: AVX
-func (x Uint16x8) ExtendLo2ToUint64x2() Uint64x2
-
-// ExtendLo2ToUint64x2 converts 2 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXDQ, CPU Feature: AVX
-func (x Uint32x4) ExtendLo2ToUint64x2() Uint64x2
-
-/* ExtendLo4ToInt32x4 */
-
-// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBD, CPU Feature: AVX
-func (x Int8x16) ExtendLo4ToInt32x4() Int32x4
-
-// ExtendLo4ToInt32x4 converts 4 lowest vector element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWD, CPU Feature: AVX
-func (x Int16x8) ExtendLo4ToInt32x4() Int32x4
-
-/* ExtendLo4ToInt64x4 */
-
-// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBQ, CPU Feature: AVX2
-func (x Int8x16) ExtendLo4ToInt64x4() Int64x4
-
-// ExtendLo4ToInt64x4 converts 4 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWQ, CPU Feature: AVX2
-func (x Int16x8) ExtendLo4ToInt64x4() Int64x4
-
-/* ExtendLo4ToUint32x4 */
-
-// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBD, CPU Feature: AVX
-func (x Uint8x16) ExtendLo4ToUint32x4() Uint32x4
-
-// ExtendLo4ToUint32x4 converts 4 lowest vector element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWD, CPU Feature: AVX
-func (x Uint16x8) ExtendLo4ToUint32x4() Uint32x4
-
-/* ExtendLo4ToUint64x4 */
-
-// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBQ, CPU Feature: AVX2
-func (x Uint8x16) ExtendLo4ToUint64x4() Uint64x4
-
-// ExtendLo4ToUint64x4 converts 4 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWQ, CPU Feature: AVX2
-func (x Uint16x8) ExtendLo4ToUint64x4() Uint64x4
-
-/* ExtendLo8ToInt16x8 */
-
-// ExtendLo8ToInt16x8 converts 8 lowest vector element values to int16.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBW, CPU Feature: AVX
-func (x Int8x16) ExtendLo8ToInt16x8() Int16x8
-
-/* ExtendLo8ToInt32x8 */
-
-// ExtendLo8ToInt32x8 converts 8 lowest vector element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBD, CPU Feature: AVX2
-func (x Int8x16) ExtendLo8ToInt32x8() Int32x8
-
-/* ExtendLo8ToInt64x8 */
-
-// ExtendLo8ToInt64x8 converts 8 lowest vector element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBQ, CPU Feature: AVX512
-func (x Int8x16) ExtendLo8ToInt64x8() Int64x8
-
-/* ExtendLo8ToUint16x8 */
-
-// ExtendLo8ToUint16x8 converts 8 lowest vector element values to uint16.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBW, CPU Feature: AVX
-func (x Uint8x16) ExtendLo8ToUint16x8() Uint16x8
-
-/* ExtendLo8ToUint32x8 */
-
-// ExtendLo8ToUint32x8 converts 8 lowest vector element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBD, CPU Feature: AVX2
-func (x Uint8x16) ExtendLo8ToUint32x8() Uint32x8
-
-/* ExtendLo8ToUint64x8 */
-
-// ExtendLo8ToUint64x8 converts 8 lowest vector element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBQ, CPU Feature: AVX512
-func (x Uint8x16) ExtendLo8ToUint64x8() Uint64x8
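
// Example (editorial sketch, not part of the original file): widening the
// lowest lanes before arithmetic that would overflow in uint8. Assumes an
// elementwise Add method on Uint32x4, which this excerpt does not show.
func widenAndAdd(x, y Uint8x16) Uint32x4 {
	// Zero-extend the 4 lowest bytes of each operand to uint32 lanes
	// (VPMOVZXBD), then add the widened values.
	return x.ExtendLo4ToUint32x4().Add(y.ExtendLo4ToUint32x4())
}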
-
-/* ExtendToInt16 */
-
-// ExtendToInt16 converts element values to int16.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBW, CPU Feature: AVX2
-func (x Int8x16) ExtendToInt16() Int16x16
-
-// ExtendToInt16 converts element values to int16.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBW, CPU Feature: AVX512
-func (x Int8x32) ExtendToInt16() Int16x32
-
-/* ExtendToInt32 */
-
-// ExtendToInt32 converts element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXBD, CPU Feature: AVX512
-func (x Int8x16) ExtendToInt32() Int32x16
-
-// ExtendToInt32 converts element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWD, CPU Feature: AVX2
-func (x Int16x8) ExtendToInt32() Int32x8
-
-// ExtendToInt32 converts element values to int32.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWD, CPU Feature: AVX512
-func (x Int16x16) ExtendToInt32() Int32x16
-
-/* ExtendToInt64 */
-
-// ExtendToInt64 converts element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXWQ, CPU Feature: AVX512
-func (x Int16x8) ExtendToInt64() Int64x8
-
-// ExtendToInt64 converts element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXDQ, CPU Feature: AVX2
-func (x Int32x4) ExtendToInt64() Int64x4
-
-// ExtendToInt64 converts element values to int64.
-// The result vector's elements are sign-extended.
-//
-// Asm: VPMOVSXDQ, CPU Feature: AVX512
-func (x Int32x8) ExtendToInt64() Int64x8
-
-/* ExtendToUint16 */
-
-// ExtendToUint16 converts element values to uint16.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBW, CPU Feature: AVX2
-func (x Uint8x16) ExtendToUint16() Uint16x16
-
-// ExtendToUint16 converts element values to uint16.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBW, CPU Feature: AVX512
-func (x Uint8x32) ExtendToUint16() Uint16x32
-
-/* ExtendToUint32 */
-
-// ExtendToUint32 converts element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXBD, CPU Feature: AVX512
-func (x Uint8x16) ExtendToUint32() Uint32x16
-
-// ExtendToUint32 converts element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWD, CPU Feature: AVX2
-func (x Uint16x8) ExtendToUint32() Uint32x8
-
-// ExtendToUint32 converts element values to uint32.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWD, CPU Feature: AVX512
-func (x Uint16x16) ExtendToUint32() Uint32x16
-
-/* ExtendToUint64 */
-
-// ExtendToUint64 converts element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXWQ, CPU Feature: AVX512
-func (x Uint16x8) ExtendToUint64() Uint64x8
-
-// ExtendToUint64 converts element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXDQ, CPU Feature: AVX2
-func (x Uint32x4) ExtendToUint64() Uint64x4
-
-// ExtendToUint64 converts element values to uint64.
-// The result vector's elements are zero-extended.
-//
-// Asm: VPMOVZXDQ, CPU Feature: AVX512
-func (x Uint32x8) ExtendToUint64() Uint64x8
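
// Example (editorial sketch, not part of the original file): sign-extending
// before multiplication so products of large int16 values do not wrap. Uses
// only methods declared in this file (ExtendToInt32 above, Int32x8.Mul below).
func mulWidened(x, y Int16x8) Int32x8 {
	return x.ExtendToInt32().Mul(y.ExtendToInt32())
}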
-
-/* Floor */
-
-// Floor rounds elements down to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x4) Floor() Float32x4
-
-// Floor rounds elements down to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x8) Floor() Float32x8
-
-// Floor rounds elements down to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x2) Floor() Float64x2
-
-// Floor rounds elements down to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x4) Floor() Float64x4
-
-/* FloorScaled */
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x4) FloorScaled(prec uint8) Float32x4
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x8) FloorScaled(prec uint8) Float32x8
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x16) FloorScaled(prec uint8) Float32x16
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x2) FloorScaled(prec uint8) Float64x2
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x4) FloorScaled(prec uint8) Float64x4
-
-// FloorScaled rounds elements down with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x8) FloorScaled(prec uint8) Float64x8
-
-/* FloorScaledResidue */
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x4) FloorScaledResidue(prec uint8) Float32x4
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x8) FloorScaledResidue(prec uint8) Float32x8
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x16) FloorScaledResidue(prec uint8) Float32x16
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x2) FloorScaledResidue(prec uint8) Float64x2
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x4) FloorScaledResidue(prec uint8) Float64x4
-
-// FloorScaledResidue computes the difference after flooring with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x8) FloorScaledResidue(prec uint8) Float64x8
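
// Example (editorial sketch, not part of the original file): splitting each
// lane into its floor and the remaining fraction. Passing the constant 0
// keeps both calls on the single-instruction path described above rather
// than the jump table used for non-constant prec.
func splitIntFrac(x Float32x8) (whole, frac Float32x8) {
	return x.FloorScaled(0), x.FloorScaledResidue(0)
}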
-
-/* GaloisFieldAffineTransform */
-
-// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
-func (x Uint8x16) GaloisFieldAffineTransform(y Uint64x2, b uint8) Uint8x16
-
-// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
-func (x Uint8x32) GaloisFieldAffineTransform(y Uint64x4, b uint8) Uint8x32
-
-// GaloisFieldAffineTransform computes an affine transformation in GF(2^8):
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
-func (x Uint8x64) GaloisFieldAffineTransform(y Uint64x8, b uint8) Uint8x64
-
-/* GaloisFieldAffineTransformInverse */
-
-// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
-// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
-func (x Uint8x16) GaloisFieldAffineTransformInverse(y Uint64x2, b uint8) Uint8x16
-
-// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
-// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
-func (x Uint8x32) GaloisFieldAffineTransformInverse(y Uint64x4, b uint8) Uint8x32
-
-// GaloisFieldAffineTransformInverse computes an affine transformation in GF(2^8),
-// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
-// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
-// b is an 8-bit vector. The affine transformation is y * x + b, with each element of y
-// corresponding to a group of 8 elements in x.
-//
-// b results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
-func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x64
-
-/* GaloisFieldMul */
-
-// GaloisFieldMul computes element-wise GF(2^8) multiplication with
-// reduction polynomial x^8 + x^4 + x^3 + x + 1.
-//
-// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
-func (x Uint8x16) GaloisFieldMul(y Uint8x16) Uint8x16
-
-// GaloisFieldMul computes element-wise GF(2^8) multiplication with
-// reduction polynomial x^8 + x^4 + x^3 + x + 1.
-//
-// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
-func (x Uint8x32) GaloisFieldMul(y Uint8x32) Uint8x32
-
-// GaloisFieldMul computes element-wise GF(2^8) multiplication with
-// reduction polynomial x^8 + x^4 + x^3 + x + 1.
-//
-// Asm: VGF2P8MULB, CPU Feature: AVX512GFNI
-func (x Uint8x64) GaloisFieldMul(y Uint8x64) Uint8x64
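
// Example (editorial sketch, not part of the original file): one step of a
// Reed-Solomon style accumulation. coef is assumed to have been broadcast by
// the caller, and Xor is assumed to exist elsewhere in this package.
func gfMulAccumulate(acc, data, coef Uint8x64) Uint8x64 {
	return acc.Xor(data.GaloisFieldMul(coef))
}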
-
-/* GetElem */
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRD, CPU Feature: AVX
-func (x Float32x4) GetElem(index uint8) float32
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRQ, CPU Feature: AVX
-func (x Float64x2) GetElem(index uint8) float64
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRB, CPU Feature: AVX512
-func (x Int8x16) GetElem(index uint8) int8
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRW, CPU Feature: AVX512
-func (x Int16x8) GetElem(index uint8) int16
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRD, CPU Feature: AVX
-func (x Int32x4) GetElem(index uint8) int32
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRQ, CPU Feature: AVX
-func (x Int64x2) GetElem(index uint8) int64
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRB, CPU Feature: AVX512
-func (x Uint8x16) GetElem(index uint8) uint8
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRW, CPU Feature: AVX512
-func (x Uint16x8) GetElem(index uint8) uint16
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRD, CPU Feature: AVX
-func (x Uint32x4) GetElem(index uint8) uint32
-
-// GetElem retrieves a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPEXTRQ, CPU Feature: AVX
-func (x Uint64x2) GetElem(index uint8) uint64
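
// Example (editorial sketch, not part of the original file): extracting lane 0
// with a constant index, which stays a single VPEXTRD instead of the jump
// table mentioned above.
func firstLane(x Int32x4) int32 {
	return x.GetElem(0)
}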
-
-/* GetHi */
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTF128, CPU Feature: AVX
-func (x Float32x8) GetHi() Float32x4
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTF64X4, CPU Feature: AVX512
-func (x Float32x16) GetHi() Float32x8
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTF128, CPU Feature: AVX
-func (x Float64x4) GetHi() Float64x2
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTF64X4, CPU Feature: AVX512
-func (x Float64x8) GetHi() Float64x4
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int8x32) GetHi() Int8x16
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int8x64) GetHi() Int8x32
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int16x16) GetHi() Int16x8
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int16x32) GetHi() Int16x16
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int32x8) GetHi() Int32x4
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int32x16) GetHi() Int32x8
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int64x4) GetHi() Int64x2
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int64x8) GetHi() Int64x4
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint8x32) GetHi() Uint8x16
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint8x64) GetHi() Uint8x32
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint16x16) GetHi() Uint16x8
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint16x32) GetHi() Uint16x16
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint32x8) GetHi() Uint32x4
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint32x16) GetHi() Uint32x8
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint64x4) GetHi() Uint64x2
-
-// GetHi returns the upper half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint64x8) GetHi() Uint64x4
-
-/* GetLo */
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTF128, CPU Feature: AVX
-func (x Float32x8) GetLo() Float32x4
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTF64X4, CPU Feature: AVX512
-func (x Float32x16) GetLo() Float32x8
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTF128, CPU Feature: AVX
-func (x Float64x4) GetLo() Float64x2
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTF64X4, CPU Feature: AVX512
-func (x Float64x8) GetLo() Float64x4
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int8x32) GetLo() Int8x16
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int8x64) GetLo() Int8x32
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int16x16) GetLo() Int16x8
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int16x32) GetLo() Int16x16
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int32x8) GetLo() Int32x4
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int32x16) GetLo() Int32x8
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Int64x4) GetLo() Int64x2
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Int64x8) GetLo() Int64x4
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint8x32) GetLo() Uint8x16
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint8x64) GetLo() Uint8x32
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint16x16) GetLo() Uint16x8
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint16x32) GetLo() Uint16x16
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint32x8) GetLo() Uint32x4
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint32x16) GetLo() Uint32x8
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI128, CPU Feature: AVX2
-func (x Uint64x4) GetLo() Uint64x2
-
-// GetLo returns the lower half of x.
-//
-// Asm: VEXTRACTI64X4, CPU Feature: AVX512
-func (x Uint64x8) GetLo() Uint64x4
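
// Example (editorial sketch, not part of the original file): the first step of
// a horizontal reduction, folding a 256-bit vector onto its lower half.
// Assumes an elementwise Add method on Int32x4, which this excerpt does not show.
func foldOnce(x Int32x8) Int32x4 {
	return x.GetLo().Add(x.GetHi())
}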
-
-/* Greater */
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTB, CPU Feature: AVX
-func (x Int8x16) Greater(y Int8x16) Mask8x16
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTB, CPU Feature: AVX2
-func (x Int8x32) Greater(y Int8x32) Mask8x32
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTB, CPU Feature: AVX512
-func (x Int8x64) Greater(y Int8x64) Mask8x64
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTW, CPU Feature: AVX
-func (x Int16x8) Greater(y Int16x8) Mask16x8
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTW, CPU Feature: AVX2
-func (x Int16x16) Greater(y Int16x16) Mask16x16
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTW, CPU Feature: AVX512
-func (x Int16x32) Greater(y Int16x32) Mask16x32
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTD, CPU Feature: AVX
-func (x Int32x4) Greater(y Int32x4) Mask32x4
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTD, CPU Feature: AVX2
-func (x Int32x8) Greater(y Int32x8) Mask32x8
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTD, CPU Feature: AVX512
-func (x Int32x16) Greater(y Int32x16) Mask32x16
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTQ, CPU Feature: AVX
-func (x Int64x2) Greater(y Int64x2) Mask64x2
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTQ, CPU Feature: AVX2
-func (x Int64x4) Greater(y Int64x4) Mask64x4
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPGTQ, CPU Feature: AVX512
-func (x Int64x8) Greater(y Int64x8) Mask64x8
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) Greater(y Float32x4) Mask32x4
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) Greater(y Float32x8) Mask32x8
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) Greater(y Float32x16) Mask32x16
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) Greater(y Float64x2) Mask64x2
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) Greater(y Float64x4) Mask64x4
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) Greater(y Float64x8) Mask64x8
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPUB, CPU Feature: AVX512
-func (x Uint8x64) Greater(y Uint8x64) Mask8x64
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPUW, CPU Feature: AVX512
-func (x Uint16x32) Greater(y Uint16x32) Mask16x32
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPUD, CPU Feature: AVX512
-func (x Uint32x16) Greater(y Uint32x16) Mask32x16
-
-// Greater returns x greater-than y, elementwise.
-//
-// Asm: VPCMPUQ, CPU Feature: AVX512
-func (x Uint64x8) Greater(y Uint64x8) Mask64x8
-
-/* GreaterEqual */
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) GreaterEqual(y Float32x4) Mask32x4
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) GreaterEqual(y Float32x8) Mask32x8
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) GreaterEqual(y Float32x16) Mask32x16
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) GreaterEqual(y Float64x2) Mask64x2
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) GreaterEqual(y Float64x4) Mask64x4
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) GreaterEqual(y Float64x8) Mask64x8
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPB, CPU Feature: AVX512
-func (x Int8x64) GreaterEqual(y Int8x64) Mask8x64
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPW, CPU Feature: AVX512
-func (x Int16x32) GreaterEqual(y Int16x32) Mask16x32
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPD, CPU Feature: AVX512
-func (x Int32x16) GreaterEqual(y Int32x16) Mask32x16
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPQ, CPU Feature: AVX512
-func (x Int64x8) GreaterEqual(y Int64x8) Mask64x8
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUB, CPU Feature: AVX512
-func (x Uint8x64) GreaterEqual(y Uint8x64) Mask8x64
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUW, CPU Feature: AVX512
-func (x Uint16x32) GreaterEqual(y Uint16x32) Mask16x32
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUD, CPU Feature: AVX512
-func (x Uint32x16) GreaterEqual(y Uint32x16) Mask32x16
-
-// GreaterEqual returns x greater-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUQ, CPU Feature: AVX512
-func (x Uint64x8) GreaterEqual(y Uint64x8) Mask64x8
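
// Example (editorial sketch, not part of the original file): flagging the
// lanes of x that are at least zero. The zero vector is assumed to come from
// a broadcast-style constructor elsewhere in the package; here the caller
// supplies it.
func nonNegative(x, zero Int32x16) Mask32x16 {
	return x.GreaterEqual(zero)
}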
-
-/* InterleaveHi */
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX
-func (x Int16x8) InterleaveHi(y Int16x8) Int16x8
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX
-func (x Int32x4) InterleaveHi(y Int32x4) Int32x4
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX
-func (x Int64x2) InterleaveHi(y Int64x2) Int64x2
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX
-func (x Uint16x8) InterleaveHi(y Uint16x8) Uint16x8
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX
-func (x Uint32x4) InterleaveHi(y Uint32x4) Uint32x4
-
-// InterleaveHi interleaves the elements of the high halves of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX
-func (x Uint64x2) InterleaveHi(y Uint64x2) Uint64x2
-
-/* InterleaveHiGrouped */
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX2
-func (x Int16x16) InterleaveHiGrouped(y Int16x16) Int16x16
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX512
-func (x Int16x32) InterleaveHiGrouped(y Int16x32) Int16x32
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX2
-func (x Int32x8) InterleaveHiGrouped(y Int32x8) Int32x8
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX512
-func (x Int32x16) InterleaveHiGrouped(y Int32x16) Int32x16
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
-func (x Int64x4) InterleaveHiGrouped(y Int64x4) Int64x4
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
-func (x Int64x8) InterleaveHiGrouped(y Int64x8) Int64x8
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX2
-func (x Uint16x16) InterleaveHiGrouped(y Uint16x16) Uint16x16
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHWD, CPU Feature: AVX512
-func (x Uint16x32) InterleaveHiGrouped(y Uint16x32) Uint16x32
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX2
-func (x Uint32x8) InterleaveHiGrouped(y Uint32x8) Uint32x8
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHDQ, CPU Feature: AVX512
-func (x Uint32x16) InterleaveHiGrouped(y Uint32x16) Uint32x16
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
-func (x Uint64x4) InterleaveHiGrouped(y Uint64x4) Uint64x4
-
-// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
-func (x Uint64x8) InterleaveHiGrouped(y Uint64x8) Uint64x8
-
-/* InterleaveLo */
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX
-func (x Int16x8) InterleaveLo(y Int16x8) Int16x8
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX
-func (x Int32x4) InterleaveLo(y Int32x4) Int32x4
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX
-func (x Int64x2) InterleaveLo(y Int64x2) Int64x2
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX
-func (x Uint16x8) InterleaveLo(y Uint16x8) Uint16x8
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX
-func (x Uint32x4) InterleaveLo(y Uint32x4) Uint32x4
-
-// InterleaveLo interleaves the elements of the low halves of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX
-func (x Uint64x2) InterleaveLo(y Uint64x2) Uint64x2
-
-/* InterleaveLoGrouped */
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX2
-func (x Int16x16) InterleaveLoGrouped(y Int16x16) Int16x16
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX512
-func (x Int16x32) InterleaveLoGrouped(y Int16x32) Int16x32
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX2
-func (x Int32x8) InterleaveLoGrouped(y Int32x8) Int32x8
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX512
-func (x Int32x16) InterleaveLoGrouped(y Int32x16) Int32x16
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
-func (x Int64x4) InterleaveLoGrouped(y Int64x4) Int64x4
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
-func (x Int64x8) InterleaveLoGrouped(y Int64x8) Int64x8
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX2
-func (x Uint16x16) InterleaveLoGrouped(y Uint16x16) Uint16x16
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLWD, CPU Feature: AVX512
-func (x Uint16x32) InterleaveLoGrouped(y Uint16x32) Uint16x32
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX2
-func (x Uint32x8) InterleaveLoGrouped(y Uint32x8) Uint32x8
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLDQ, CPU Feature: AVX512
-func (x Uint32x16) InterleaveLoGrouped(y Uint32x16) Uint32x16
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
-func (x Uint64x4) InterleaveLoGrouped(y Uint64x4) Uint64x4
-
-// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
-//
-// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
-func (x Uint64x8) InterleaveLoGrouped(y Uint64x8) Uint64x8
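
// Example (editorial sketch, not part of the original file): zipping two
// planar streams into interleaved pairs, x0, y0, x1, y1 (assuming the
// receiver supplies the first element of each pair, as VPUNPCKLDQ does).
func zipLow(x, y Uint32x4) Uint32x4 {
	return x.InterleaveLo(y)
}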
-
-/* IsNan */
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) IsNan(y Float32x4) Mask32x4
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) IsNan(y Float32x8) Mask32x8
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) IsNan(y Float32x16) Mask32x16
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) IsNan(y Float64x2) Mask64x2
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) IsNan(y Float64x4) Mask64x4
-
-// IsNan checks if elements are NaN. Use as x.IsNan(x).
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) IsNan(y Float64x8) Mask64x8
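
// Example (editorial sketch, not part of the original file): the documented
// calling convention x.IsNan(x) compares x with itself, marking exactly the
// NaN lanes in the returned mask.
func nanLanes(x Float64x4) Mask64x4 {
	return x.IsNan(x)
}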
-
-/* LeadingZeros */
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Int32x4) LeadingZeros() Int32x4
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Int32x8) LeadingZeros() Int32x8
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Int32x16) LeadingZeros() Int32x16
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Int64x2) LeadingZeros() Int64x2
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Int64x4) LeadingZeros() Int64x4
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Int64x8) LeadingZeros() Int64x8
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Uint32x4) LeadingZeros() Uint32x4
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Uint32x8) LeadingZeros() Uint32x8
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTD, CPU Feature: AVX512
-func (x Uint32x16) LeadingZeros() Uint32x16
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Uint64x2) LeadingZeros() Uint64x2
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Uint64x4) LeadingZeros() Uint64x4
-
-// LeadingZeros counts the leading zeros of each element in x.
-//
-// Asm: VPLZCNTQ, CPU Feature: AVX512
-func (x Uint64x8) LeadingZeros() Uint64x8
-
-/* Less */
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) Less(y Float32x4) Mask32x4
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) Less(y Float32x8) Mask32x8
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) Less(y Float32x16) Mask32x16
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) Less(y Float64x2) Mask64x2
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) Less(y Float64x4) Mask64x4
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) Less(y Float64x8) Mask64x8
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPB, CPU Feature: AVX512
-func (x Int8x64) Less(y Int8x64) Mask8x64
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPW, CPU Feature: AVX512
-func (x Int16x32) Less(y Int16x32) Mask16x32
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPD, CPU Feature: AVX512
-func (x Int32x16) Less(y Int32x16) Mask32x16
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPQ, CPU Feature: AVX512
-func (x Int64x8) Less(y Int64x8) Mask64x8
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPUB, CPU Feature: AVX512
-func (x Uint8x64) Less(y Uint8x64) Mask8x64
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPUW, CPU Feature: AVX512
-func (x Uint16x32) Less(y Uint16x32) Mask16x32
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPUD, CPU Feature: AVX512
-func (x Uint32x16) Less(y Uint32x16) Mask32x16
-
-// Less returns x less-than y, elementwise.
-//
-// Asm: VPCMPUQ, CPU Feature: AVX512
-func (x Uint64x8) Less(y Uint64x8) Mask64x8
-
-/* LessEqual */
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) LessEqual(y Float32x4) Mask32x4
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) LessEqual(y Float32x8) Mask32x8
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) LessEqual(y Float32x16) Mask32x16
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) LessEqual(y Float64x2) Mask64x2
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) LessEqual(y Float64x4) Mask64x4
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) LessEqual(y Float64x8) Mask64x8
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPB, CPU Feature: AVX512
-func (x Int8x64) LessEqual(y Int8x64) Mask8x64
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPW, CPU Feature: AVX512
-func (x Int16x32) LessEqual(y Int16x32) Mask16x32
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPD, CPU Feature: AVX512
-func (x Int32x16) LessEqual(y Int32x16) Mask32x16
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPQ, CPU Feature: AVX512
-func (x Int64x8) LessEqual(y Int64x8) Mask64x8
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUB, CPU Feature: AVX512
-func (x Uint8x64) LessEqual(y Uint8x64) Mask8x64
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUW, CPU Feature: AVX512
-func (x Uint16x32) LessEqual(y Uint16x32) Mask16x32
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUD, CPU Feature: AVX512
-func (x Uint32x16) LessEqual(y Uint32x16) Mask32x16
-
-// LessEqual returns x less-than-or-equals y, elementwise.
-//
-// Asm: VPCMPUQ, CPU Feature: AVX512
-func (x Uint64x8) LessEqual(y Uint64x8) Mask64x8
-
-/* Max */
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPS, CPU Feature: AVX
-func (x Float32x4) Max(y Float32x4) Float32x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPS, CPU Feature: AVX
-func (x Float32x8) Max(y Float32x8) Float32x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPS, CPU Feature: AVX512
-func (x Float32x16) Max(y Float32x16) Float32x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPD, CPU Feature: AVX
-func (x Float64x2) Max(y Float64x2) Float64x2
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPD, CPU Feature: AVX
-func (x Float64x4) Max(y Float64x4) Float64x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VMAXPD, CPU Feature: AVX512
-func (x Float64x8) Max(y Float64x8) Float64x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSB, CPU Feature: AVX
-func (x Int8x16) Max(y Int8x16) Int8x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSB, CPU Feature: AVX2
-func (x Int8x32) Max(y Int8x32) Int8x32
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSB, CPU Feature: AVX512
-func (x Int8x64) Max(y Int8x64) Int8x64
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSW, CPU Feature: AVX
-func (x Int16x8) Max(y Int16x8) Int16x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSW, CPU Feature: AVX2
-func (x Int16x16) Max(y Int16x16) Int16x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSW, CPU Feature: AVX512
-func (x Int16x32) Max(y Int16x32) Int16x32
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSD, CPU Feature: AVX
-func (x Int32x4) Max(y Int32x4) Int32x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSD, CPU Feature: AVX2
-func (x Int32x8) Max(y Int32x8) Int32x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSD, CPU Feature: AVX512
-func (x Int32x16) Max(y Int32x16) Int32x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSQ, CPU Feature: AVX512
-func (x Int64x2) Max(y Int64x2) Int64x2
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSQ, CPU Feature: AVX512
-func (x Int64x4) Max(y Int64x4) Int64x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXSQ, CPU Feature: AVX512
-func (x Int64x8) Max(y Int64x8) Int64x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUB, CPU Feature: AVX
-func (x Uint8x16) Max(y Uint8x16) Uint8x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUB, CPU Feature: AVX2
-func (x Uint8x32) Max(y Uint8x32) Uint8x32
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUB, CPU Feature: AVX512
-func (x Uint8x64) Max(y Uint8x64) Uint8x64
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUW, CPU Feature: AVX
-func (x Uint16x8) Max(y Uint16x8) Uint16x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUW, CPU Feature: AVX2
-func (x Uint16x16) Max(y Uint16x16) Uint16x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUW, CPU Feature: AVX512
-func (x Uint16x32) Max(y Uint16x32) Uint16x32
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUD, CPU Feature: AVX
-func (x Uint32x4) Max(y Uint32x4) Uint32x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUD, CPU Feature: AVX2
-func (x Uint32x8) Max(y Uint32x8) Uint32x8
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUD, CPU Feature: AVX512
-func (x Uint32x16) Max(y Uint32x16) Uint32x16
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUQ, CPU Feature: AVX512
-func (x Uint64x2) Max(y Uint64x2) Uint64x2
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUQ, CPU Feature: AVX512
-func (x Uint64x4) Max(y Uint64x4) Uint64x4
-
-// Max computes the maximum of corresponding elements.
-//
-// Asm: VPMAXUQ, CPU Feature: AVX512
-func (x Uint64x8) Max(y Uint64x8) Uint64x8
-
-/* Min */
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPS, CPU Feature: AVX
-func (x Float32x4) Min(y Float32x4) Float32x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPS, CPU Feature: AVX
-func (x Float32x8) Min(y Float32x8) Float32x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPS, CPU Feature: AVX512
-func (x Float32x16) Min(y Float32x16) Float32x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPD, CPU Feature: AVX
-func (x Float64x2) Min(y Float64x2) Float64x2
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPD, CPU Feature: AVX
-func (x Float64x4) Min(y Float64x4) Float64x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VMINPD, CPU Feature: AVX512
-func (x Float64x8) Min(y Float64x8) Float64x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSB, CPU Feature: AVX
-func (x Int8x16) Min(y Int8x16) Int8x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSB, CPU Feature: AVX2
-func (x Int8x32) Min(y Int8x32) Int8x32
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSB, CPU Feature: AVX512
-func (x Int8x64) Min(y Int8x64) Int8x64
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSW, CPU Feature: AVX
-func (x Int16x8) Min(y Int16x8) Int16x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSW, CPU Feature: AVX2
-func (x Int16x16) Min(y Int16x16) Int16x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSW, CPU Feature: AVX512
-func (x Int16x32) Min(y Int16x32) Int16x32
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSD, CPU Feature: AVX
-func (x Int32x4) Min(y Int32x4) Int32x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSD, CPU Feature: AVX2
-func (x Int32x8) Min(y Int32x8) Int32x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSD, CPU Feature: AVX512
-func (x Int32x16) Min(y Int32x16) Int32x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSQ, CPU Feature: AVX512
-func (x Int64x2) Min(y Int64x2) Int64x2
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSQ, CPU Feature: AVX512
-func (x Int64x4) Min(y Int64x4) Int64x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINSQ, CPU Feature: AVX512
-func (x Int64x8) Min(y Int64x8) Int64x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUB, CPU Feature: AVX
-func (x Uint8x16) Min(y Uint8x16) Uint8x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUB, CPU Feature: AVX2
-func (x Uint8x32) Min(y Uint8x32) Uint8x32
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUB, CPU Feature: AVX512
-func (x Uint8x64) Min(y Uint8x64) Uint8x64
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUW, CPU Feature: AVX
-func (x Uint16x8) Min(y Uint16x8) Uint16x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUW, CPU Feature: AVX2
-func (x Uint16x16) Min(y Uint16x16) Uint16x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUW, CPU Feature: AVX512
-func (x Uint16x32) Min(y Uint16x32) Uint16x32
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUD, CPU Feature: AVX
-func (x Uint32x4) Min(y Uint32x4) Uint32x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUD, CPU Feature: AVX2
-func (x Uint32x8) Min(y Uint32x8) Uint32x8
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUD, CPU Feature: AVX512
-func (x Uint32x16) Min(y Uint32x16) Uint32x16
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUQ, CPU Feature: AVX512
-func (x Uint64x2) Min(y Uint64x2) Uint64x2
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUQ, CPU Feature: AVX512
-func (x Uint64x4) Min(y Uint64x4) Uint64x4
-
-// Min computes the minimum of corresponding elements.
-//
-// Asm: VPMINUQ, CPU Feature: AVX512
-func (x Uint64x8) Min(y Uint64x8) Uint64x8
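
// Example (editorial sketch, not part of the original file): a branch-free
// clamp of every lane to [lo, hi], built from the Max and Min methods
// declared above.
func clamp(x, lo, hi Float32x8) Float32x8 {
	return x.Max(lo).Min(hi)
}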
-
-/* Mul */
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPS, CPU Feature: AVX
-func (x Float32x4) Mul(y Float32x4) Float32x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPS, CPU Feature: AVX
-func (x Float32x8) Mul(y Float32x8) Float32x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPS, CPU Feature: AVX512
-func (x Float32x16) Mul(y Float32x16) Float32x16
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPD, CPU Feature: AVX
-func (x Float64x2) Mul(y Float64x2) Float64x2
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPD, CPU Feature: AVX
-func (x Float64x4) Mul(y Float64x4) Float64x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VMULPD, CPU Feature: AVX512
-func (x Float64x8) Mul(y Float64x8) Float64x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX
-func (x Int16x8) Mul(y Int16x8) Int16x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX2
-func (x Int16x16) Mul(y Int16x16) Int16x16
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX512
-func (x Int16x32) Mul(y Int16x32) Int16x32
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX
-func (x Int32x4) Mul(y Int32x4) Int32x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX2
-func (x Int32x8) Mul(y Int32x8) Int32x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX512
-func (x Int32x16) Mul(y Int32x16) Int32x16
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Int64x2) Mul(y Int64x2) Int64x2
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Int64x4) Mul(y Int64x4) Int64x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Int64x8) Mul(y Int64x8) Int64x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX
-func (x Uint16x8) Mul(y Uint16x8) Uint16x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX2
-func (x Uint16x16) Mul(y Uint16x16) Uint16x16
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLW, CPU Feature: AVX512
-func (x Uint16x32) Mul(y Uint16x32) Uint16x32
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX
-func (x Uint32x4) Mul(y Uint32x4) Uint32x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX2
-func (x Uint32x8) Mul(y Uint32x8) Uint32x8
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLD, CPU Feature: AVX512
-func (x Uint32x16) Mul(y Uint32x16) Uint32x16
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Uint64x2) Mul(y Uint64x2) Uint64x2
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Uint64x4) Mul(y Uint64x4) Uint64x4
-
-// Mul multiplies corresponding elements of two vectors.
-//
-// Asm: VPMULLQ, CPU Feature: AVX512
-func (x Uint64x8) Mul(y Uint64x8) Uint64x8
-
-/* MulAdd */
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PS, CPU Feature: AVX512
-func (x Float32x4) MulAdd(y Float32x4, z Float32x4) Float32x4
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PS, CPU Feature: AVX512
-func (x Float32x8) MulAdd(y Float32x8, z Float32x8) Float32x8
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PS, CPU Feature: AVX512
-func (x Float32x16) MulAdd(y Float32x16, z Float32x16) Float32x16
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PD, CPU Feature: AVX512
-func (x Float64x2) MulAdd(y Float64x2, z Float64x2) Float64x2
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PD, CPU Feature: AVX512
-func (x Float64x4) MulAdd(y Float64x4, z Float64x4) Float64x4
-
-// MulAdd performs a fused (x * y) + z.
-//
-// Asm: VFMADD213PD, CPU Feature: AVX512
-func (x Float64x8) MulAdd(y Float64x8, z Float64x8) Float64x8
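
// Example (editorial sketch, not part of the original file): Horner evaluation
// of c2*x*x + c1*x + c0 per lane, with a single rounding per fused step.
func horner(x, c0, c1, c2 Float64x4) Float64x4 {
	return c2.MulAdd(x, c1).MulAdd(x, c0)
}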
-
-/* MulAddSub */
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PS, CPU Feature: AVX512
-func (x Float32x4) MulAddSub(y Float32x4, z Float32x4) Float32x4
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PS, CPU Feature: AVX512
-func (x Float32x8) MulAddSub(y Float32x8, z Float32x8) Float32x8
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PS, CPU Feature: AVX512
-func (x Float32x16) MulAddSub(y Float32x16, z Float32x16) Float32x16
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PD, CPU Feature: AVX512
-func (x Float64x2) MulAddSub(y Float64x2, z Float64x2) Float64x2
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PD, CPU Feature: AVX512
-func (x Float64x4) MulAddSub(y Float64x4, z Float64x4) Float64x4
-
-// MulAddSub performs a fused (x * y) - z for odd-indexed elements, and (x * y) + z for even-indexed elements.
-//
-// Asm: VFMADDSUB213PD, CPU Feature: AVX512
-func (x Float64x8) MulAddSub(y Float64x8, z Float64x8) Float64x8
-
-/* MulEvenWiden */
-
-// MulEvenWiden multiplies even-indexed elements, widening the result.
-// Result[i] = x.Even[i] * y.Even[i].
-//
-// Asm: VPMULDQ, CPU Feature: AVX
-func (x Int32x4) MulEvenWiden(y Int32x4) Int64x2
-
-// MulEvenWiden multiplies even-indexed elements, widening the result.
-// Result[i] = x.Even[i] * y.Even[i].
-//
-// Asm: VPMULDQ, CPU Feature: AVX2
-func (x Int32x8) MulEvenWiden(y Int32x8) Int64x4
-
-// MulEvenWiden multiplies even-indexed elements, widening the result.
-// Result[i] = x.Even[i] * y.Even[i].
-//
-// Asm: VPMULUDQ, CPU Feature: AVX
-func (x Uint32x4) MulEvenWiden(y Uint32x4) Uint64x2
-
-// MulEvenWiden multiplies even-indexed elements, widening the result.
-// Result[i] = x.Even[i] * y.Even[i].
-//
-// Asm: VPMULUDQ, CPU Feature: AVX2
-func (x Uint32x8) MulEvenWiden(y Uint32x8) Uint64x4
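
// Example (editorial sketch, not part of the original file): accumulating
// full 64-bit products of the even uint32 lanes, a common overflow-free dot
// product step. Assumes an elementwise Add method on Uint64x4, which this
// excerpt does not show.
func dotStep(acc Uint64x4, x, y Uint32x8) Uint64x4 {
	return acc.Add(x.MulEvenWiden(y))
}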
-
-/* MulHigh */
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHW, CPU Feature: AVX
-func (x Int16x8) MulHigh(y Int16x8) Int16x8
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHW, CPU Feature: AVX2
-func (x Int16x16) MulHigh(y Int16x16) Int16x16
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHW, CPU Feature: AVX512
-func (x Int16x32) MulHigh(y Int16x32) Int16x32
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHUW, CPU Feature: AVX
-func (x Uint16x8) MulHigh(y Uint16x8) Uint16x8
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHUW, CPU Feature: AVX2
-func (x Uint16x16) MulHigh(y Uint16x16) Uint16x16
-
-// MulHigh multiplies elements and stores the high part of the result.
-//
-// Asm: VPMULHUW, CPU Feature: AVX512
-func (x Uint16x32) MulHigh(y Uint16x32) Uint16x32
-
-/* MulSubAdd */
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PS, CPU Feature: AVX512
-func (x Float32x4) MulSubAdd(y Float32x4, z Float32x4) Float32x4
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PS, CPU Feature: AVX512
-func (x Float32x8) MulSubAdd(y Float32x8, z Float32x8) Float32x8
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PS, CPU Feature: AVX512
-func (x Float32x16) MulSubAdd(y Float32x16, z Float32x16) Float32x16
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PD, CPU Feature: AVX512
-func (x Float64x2) MulSubAdd(y Float64x2, z Float64x2) Float64x2
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PD, CPU Feature: AVX512
-func (x Float64x4) MulSubAdd(y Float64x4, z Float64x4) Float64x4
-
-// MulSubAdd performs a fused (x * y) + z for odd-indexed elements, and (x * y) - z for even-indexed elements.
-//
-// Asm: VFMSUBADD213PD, CPU Feature: AVX512
-func (x Float64x8) MulSubAdd(y Float64x8, z Float64x8) Float64x8
-
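// A scalar sketch of the alternating pattern MulSubAdd describes above, taking the
// even/odd convention directly from those comments (add z at odd indices, subtract z
// at even indices); MulAddSub is the mirror image. Unlike the hardware instruction,
// this sketch does not fuse the multiply and add into a single rounding.
// mulSubAddFloat64 is a hypothetical helper, not part of the package.
func mulSubAddFloat64(x, y, z []float64) []float64 {
	r := make([]float64, len(x))
	for i := range r {
		if i%2 == 1 {
			r[i] = x[i]*y[i] + z[i] // odd index: (x*y) + z
		} else {
			r[i] = x[i]*y[i] - z[i] // even index: (x*y) - z
		}
	}
	return r
}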
-/* NotEqual */
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x4) NotEqual(y Float32x4) Mask32x4
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX
-func (x Float32x8) NotEqual(y Float32x8) Mask32x8
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPS, CPU Feature: AVX512
-func (x Float32x16) NotEqual(y Float32x16) Mask32x16
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x2) NotEqual(y Float64x2) Mask64x2
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX
-func (x Float64x4) NotEqual(y Float64x4) Mask64x4
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VCMPPD, CPU Feature: AVX512
-func (x Float64x8) NotEqual(y Float64x8) Mask64x8
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPB, CPU Feature: AVX512
-func (x Int8x64) NotEqual(y Int8x64) Mask8x64
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPW, CPU Feature: AVX512
-func (x Int16x32) NotEqual(y Int16x32) Mask16x32
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPD, CPU Feature: AVX512
-func (x Int32x16) NotEqual(y Int32x16) Mask32x16
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPQ, CPU Feature: AVX512
-func (x Int64x8) NotEqual(y Int64x8) Mask64x8
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPUB, CPU Feature: AVX512
-func (x Uint8x64) NotEqual(y Uint8x64) Mask8x64
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPUW, CPU Feature: AVX512
-func (x Uint16x32) NotEqual(y Uint16x32) Mask16x32
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPUD, CPU Feature: AVX512
-func (x Uint32x16) NotEqual(y Uint32x16) Mask32x16
-
-// NotEqual returns x not-equals y, elementwise.
-//
-// Asm: VPCMPUQ, CPU Feature: AVX512
-func (x Uint64x8) NotEqual(y Uint64x8) Mask64x8
-
-/* OnesCount */
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Int8x16) OnesCount() Int8x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Int8x32) OnesCount() Int8x32
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Int8x64) OnesCount() Int8x64
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Int16x8) OnesCount() Int16x8
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Int16x16) OnesCount() Int16x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Int16x32) OnesCount() Int16x32
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Int32x4) OnesCount() Int32x4
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Int32x8) OnesCount() Int32x8
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Int32x16) OnesCount() Int32x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Int64x2) OnesCount() Int64x2
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Int64x4) OnesCount() Int64x4
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Int64x8) OnesCount() Int64x8
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Uint8x16) OnesCount() Uint8x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Uint8x32) OnesCount() Uint8x32
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTB, CPU Feature: AVX512BITALG
-func (x Uint8x64) OnesCount() Uint8x64
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Uint16x8) OnesCount() Uint16x8
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Uint16x16) OnesCount() Uint16x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTW, CPU Feature: AVX512BITALG
-func (x Uint16x32) OnesCount() Uint16x32
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint32x4) OnesCount() Uint32x4
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint32x8) OnesCount() Uint32x8
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTD, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint32x16) OnesCount() Uint32x16
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint64x2) OnesCount() Uint64x2
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint64x4) OnesCount() Uint64x4
-
-// OnesCount counts the number of set bits in each element.
-//
-// Asm: VPOPCNTQ, CPU Feature: AVX512VPOPCNTDQ
-func (x Uint64x8) OnesCount() Uint64x8
-
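// A scalar sketch of OnesCount using the standard library: each element is replaced
// by its population count. onesCountUint8 is a hypothetical helper, not part of the
// package. Requires: import "math/bits".
func onesCountUint8(x []uint8) []uint8 {
	r := make([]uint8, len(x))
	for i, v := range x {
		r[i] = uint8(bits.OnesCount8(v)) // number of set bits in v
	}
	return r
}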
-/* Or */
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Int8x16) Or(y Int8x16) Int8x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Int8x32) Or(y Int8x32) Int8x32
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Int8x64) Or(y Int8x64) Int8x64
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Int16x8) Or(y Int16x8) Int16x8
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Int16x16) Or(y Int16x16) Int16x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Int16x32) Or(y Int16x32) Int16x32
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Int32x4) Or(y Int32x4) Int32x4
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Int32x8) Or(y Int32x8) Int32x8
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Int32x16) Or(y Int32x16) Int32x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Int64x2) Or(y Int64x2) Int64x2
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Int64x4) Or(y Int64x4) Int64x4
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORQ, CPU Feature: AVX512
-func (x Int64x8) Or(y Int64x8) Int64x8
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Uint8x16) Or(y Uint8x16) Uint8x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Uint8x32) Or(y Uint8x32) Uint8x32
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Uint8x64) Or(y Uint8x64) Uint8x64
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Uint16x8) Or(y Uint16x8) Uint16x8
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Uint16x16) Or(y Uint16x16) Uint16x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Uint16x32) Or(y Uint16x32) Uint16x32
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Uint32x4) Or(y Uint32x4) Uint32x4
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Uint32x8) Or(y Uint32x8) Uint32x8
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORD, CPU Feature: AVX512
-func (x Uint32x16) Or(y Uint32x16) Uint32x16
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX
-func (x Uint64x2) Or(y Uint64x2) Uint64x2
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPOR, CPU Feature: AVX2
-func (x Uint64x4) Or(y Uint64x4) Uint64x4
-
-// Or performs a bitwise OR operation between two vectors.
-//
-// Asm: VPORQ, CPU Feature: AVX512
-func (x Uint64x8) Or(y Uint64x8) Uint64x8
-
-/* Permute */
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Int8x16) Permute(indices Uint8x16) Int8x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 5 bits (values 0-31) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Int8x32) Permute(indices Uint8x32) Int8x32
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 5 bits (values 0-31) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 6 bits (values 0-63) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Int8x64) Permute(indices Uint8x64) Int8x64
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 6 bits (values 0-63) of each element of indices are used
-//
-// Asm: VPERMB, CPU Feature: AVX512VBMI
-func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Int16x8) Permute(indices Uint16x8) Int16x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Int16x16) Permute(indices Uint16x16) Int16x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 5 bits (values 0-31) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Int16x32) Permute(indices Uint16x32) Int16x32
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 5 bits (values 0-31) of each element of indices are used
-//
-// Asm: VPERMW, CPU Feature: AVX512
-func (x Uint16x32) Permute(indices Uint16x32) Uint16x32
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMPS, CPU Feature: AVX2
-func (x Float32x8) Permute(indices Uint32x8) Float32x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMD, CPU Feature: AVX2
-func (x Int32x8) Permute(indices Uint32x8) Int32x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMD, CPU Feature: AVX2
-func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMPS, CPU Feature: AVX512
-func (x Float32x16) Permute(indices Uint32x16) Float32x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMD, CPU Feature: AVX512
-func (x Int32x16) Permute(indices Uint32x16) Int32x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 4 bits (values 0-15) of each element of indices are used
-//
-// Asm: VPERMD, CPU Feature: AVX512
-func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 2 bits (values 0-3) of each element of indices are used
-//
-// Asm: VPERMPD, CPU Feature: AVX512
-func (x Float64x4) Permute(indices Uint64x4) Float64x4
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 2 bits (values 0-3) of each element of indices are used
-//
-// Asm: VPERMQ, CPU Feature: AVX512
-func (x Int64x4) Permute(indices Uint64x4) Int64x4
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 2 bits (values 0-3) of each element of indices are used
-//
-// Asm: VPERMQ, CPU Feature: AVX512
-func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMPD, CPU Feature: AVX512
-func (x Float64x8) Permute(indices Uint64x8) Float64x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMQ, CPU Feature: AVX512
-func (x Int64x8) Permute(indices Uint64x8) Int64x8
-
-// Permute performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The low 3 bits (values 0-7) of each element of indices are used
-//
-// Asm: VPERMQ, CPU Feature: AVX512
-func (x Uint64x8) Permute(indices Uint64x8) Uint64x8
-
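// A scalar sketch of the byte-sized Permute case described above: each output
// element is selected from x by the corresponding index, of which only the low
// bits that can address an element are used. permuteInt8x16 is a hypothetical
// helper, not part of the package.
func permuteInt8x16(x [16]int8, indices [16]uint8) [16]int8 {
	var r [16]int8
	for i, idx := range indices {
		r[i] = x[idx&0x0F] // low 4 bits select one of 16 elements
	}
	return r
}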
-/* PermuteOrZero */
-
-// PermuteOrZero performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The lower four bits of each byte-sized index in indices select an element from x,
-// unless the index's sign bit is set in which case zero is used instead.
-//
-// Asm: VPSHUFB, CPU Feature: AVX
-func (x Int8x16) PermuteOrZero(indices Int8x16) Int8x16
-
-// PermuteOrZero performs a full permutation of vector x using indices:
-// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
-// The lower four bits of each byte-sized index in indices select an element from x,
-// unless the index's sign bit is set in which case zero is used instead.
-//
-// Asm: VPSHUFB, CPU Feature: AVX
-func (x Uint8x16) PermuteOrZero(indices Int8x16) Uint8x16
-
-/* PermuteOrZeroGrouped */
-
-// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
-// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
-// unless the index's sign bit is set in which case zero is used instead.
-// Each group is of size 128-bit.
-//
-// Asm: VPSHUFB, CPU Feature: AVX2
-func (x Int8x32) PermuteOrZeroGrouped(indices Int8x32) Int8x32
-
-// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
-// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
-// unless the index's sign bit is set in which case zero is used instead.
-// Each group is of size 128-bit.
-//
-// Asm: VPSHUFB, CPU Feature: AVX512
-func (x Int8x64) PermuteOrZeroGrouped(indices Int8x64) Int8x64
-
-// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
-// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
-// unless the index's sign bit is set in which case zero is used instead.
-// Each group is of size 128-bit.
-//
-// Asm: VPSHUFB, CPU Feature: AVX2
-func (x Uint8x32) PermuteOrZeroGrouped(indices Int8x32) Uint8x32
-
-// PermuteOrZeroGrouped performs a grouped permutation of vector x using indices:
-// result = {x_group0[indices[0]], x_group0[indices[1]], ..., x_group1[indices[16]], x_group1[indices[17]], ...}
-// The lower four bits of each byte-sized index in indices select an element from its corresponding group in x,
-// unless the index's sign bit is set in which case zero is used instead.
-// Each group is of size 128-bit.
-//
-// Asm: VPSHUFB, CPU Feature: AVX512
-func (x Uint8x64) PermuteOrZeroGrouped(indices Int8x64) Uint8x64
-
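// A scalar sketch of PermuteOrZero (and of a single 128-bit group of
// PermuteOrZeroGrouped) as described above: an index with its sign bit set yields
// zero, otherwise its low four bits select an element of x. permuteOrZeroInt8x16
// is a hypothetical helper, not part of the package.
func permuteOrZeroInt8x16(x, indices [16]int8) [16]int8 {
	var r [16]int8
	for i, idx := range indices {
		if idx < 0 { // sign bit set: zero this lane
			r[i] = 0
			continue
		}
		r[i] = x[idx&0x0F]
	}
	return r
}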
-/* Reciprocal */
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCPPS, CPU Feature: AVX
-func (x Float32x4) Reciprocal() Float32x4
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCPPS, CPU Feature: AVX
-func (x Float32x8) Reciprocal() Float32x8
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCP14PS, CPU Feature: AVX512
-func (x Float32x16) Reciprocal() Float32x16
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCP14PD, CPU Feature: AVX512
-func (x Float64x2) Reciprocal() Float64x2
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCP14PD, CPU Feature: AVX512
-func (x Float64x4) Reciprocal() Float64x4
-
-// Reciprocal computes an approximate reciprocal of each element.
-//
-// Asm: VRCP14PD, CPU Feature: AVX512
-func (x Float64x8) Reciprocal() Float64x8
-
-/* ReciprocalSqrt */
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRTPS, CPU Feature: AVX
-func (x Float32x4) ReciprocalSqrt() Float32x4
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRTPS, CPU Feature: AVX
-func (x Float32x8) ReciprocalSqrt() Float32x8
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRT14PS, CPU Feature: AVX512
-func (x Float32x16) ReciprocalSqrt() Float32x16
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRT14PD, CPU Feature: AVX512
-func (x Float64x2) ReciprocalSqrt() Float64x2
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRT14PD, CPU Feature: AVX512
-func (x Float64x4) ReciprocalSqrt() Float64x4
-
-// ReciprocalSqrt computes an approximate reciprocal of the square root of each element.
-//
-// Asm: VRSQRT14PD, CPU Feature: AVX512
-func (x Float64x8) ReciprocalSqrt() Float64x8
-
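// The reciprocal and reciprocal-square-root instructions above are low-precision
// approximations (roughly 12 bits for the AVX forms and roughly 14 bits for the
// AVX512 "14" forms, per Intel's documentation); callers who need more accuracy
// typically follow them with a Newton-Raphson step. A scalar sketch of that
// refinement, with hypothetical helpers that are not part of the package:
func refineRecip(x, r0 float32) float32 {
	return r0 * (2 - x*r0) // one Newton step: roughly doubles the correct bits
}

func refineRecipSqrt(x, r0 float32) float32 {
	return r0 * (1.5 - 0.5*x*r0*r0) // one Newton step for 1/sqrt(x)
}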
-/* RotateAllLeft */
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Int32x4) RotateAllLeft(shift uint8) Int32x4
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Int32x8) RotateAllLeft(shift uint8) Int32x8
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Int32x16) RotateAllLeft(shift uint8) Int32x16
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Int64x2) RotateAllLeft(shift uint8) Int64x2
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Int64x4) RotateAllLeft(shift uint8) Int64x4
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Int64x8) RotateAllLeft(shift uint8) Int64x8
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Uint32x4) RotateAllLeft(shift uint8) Uint32x4
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Uint32x8) RotateAllLeft(shift uint8) Uint32x8
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLD, CPU Feature: AVX512
-func (x Uint32x16) RotateAllLeft(shift uint8) Uint32x16
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Uint64x2) RotateAllLeft(shift uint8) Uint64x2
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Uint64x4) RotateAllLeft(shift uint8) Uint64x4
-
-// RotateAllLeft rotates each element to the left by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPROLQ, CPU Feature: AVX512
-func (x Uint64x8) RotateAllLeft(shift uint8) Uint64x8
-
-/* RotateAllRight */
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Int32x4) RotateAllRight(shift uint8) Int32x4
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Int32x8) RotateAllRight(shift uint8) Int32x8
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Int32x16) RotateAllRight(shift uint8) Int32x16
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Int64x2) RotateAllRight(shift uint8) Int64x2
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Int64x4) RotateAllRight(shift uint8) Int64x4
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Int64x8) RotateAllRight(shift uint8) Int64x8
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Uint32x4) RotateAllRight(shift uint8) Uint32x4
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Uint32x8) RotateAllRight(shift uint8) Uint32x8
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORD, CPU Feature: AVX512
-func (x Uint32x16) RotateAllRight(shift uint8) Uint32x16
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Uint64x2) RotateAllRight(shift uint8) Uint64x2
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Uint64x4) RotateAllRight(shift uint8) Uint64x4
-
-// RotateAllRight rotates each element to the right by the number of bits specified by the immediate.
-//
-// shift results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPRORQ, CPU Feature: AVX512
-func (x Uint64x8) RotateAllRight(shift uint8) Uint64x8
-
-/* RotateLeft */
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Int32x4) RotateLeft(y Int32x4) Int32x4
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Int32x8) RotateLeft(y Int32x8) Int32x8
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Int32x16) RotateLeft(y Int32x16) Int32x16
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Int64x2) RotateLeft(y Int64x2) Int64x2
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Int64x4) RotateLeft(y Int64x4) Int64x4
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Int64x8) RotateLeft(y Int64x8) Int64x8
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Uint32x4) RotateLeft(y Uint32x4) Uint32x4
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Uint32x8) RotateLeft(y Uint32x8) Uint32x8
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVD, CPU Feature: AVX512
-func (x Uint32x16) RotateLeft(y Uint32x16) Uint32x16
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Uint64x2) RotateLeft(y Uint64x2) Uint64x2
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Uint64x4) RotateLeft(y Uint64x4) Uint64x4
-
-// RotateLeft rotates each element in x to the left by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPROLVQ, CPU Feature: AVX512
-func (x Uint64x8) RotateLeft(y Uint64x8) Uint64x8
-
-/* RotateRight */
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Int32x4) RotateRight(y Int32x4) Int32x4
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Int32x8) RotateRight(y Int32x8) Int32x8
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Int32x16) RotateRight(y Int32x16) Int32x16
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Int64x2) RotateRight(y Int64x2) Int64x2
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Int64x4) RotateRight(y Int64x4) Int64x4
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Int64x8) RotateRight(y Int64x8) Int64x8
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Uint32x4) RotateRight(y Uint32x4) Uint32x4
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Uint32x8) RotateRight(y Uint32x8) Uint32x8
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVD, CPU Feature: AVX512
-func (x Uint32x16) RotateRight(y Uint32x16) Uint32x16
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Uint64x2) RotateRight(y Uint64x2) Uint64x2
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Uint64x4) RotateRight(y Uint64x4) Uint64x4
-
-// RotateRight rotates each element in x to the right by the number of bits specified by y's corresponding elements.
-//
-// Asm: VPRORVQ, CPU Feature: AVX512
-func (x Uint64x8) RotateRight(y Uint64x8) Uint64x8
-
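// A scalar sketch of the rotate operations above using math/bits: RotateAllLeft
// applies one constant count to every element, while RotateLeft/RotateRight take
// the count per element from y. The helpers are hypothetical, not part of the
// package. Requires: import "math/bits".
func rotateAllLeft32(x [4]uint32, shift uint8) [4]uint32 {
	var r [4]uint32
	for i, v := range x {
		r[i] = bits.RotateLeft32(v, int(shift))
	}
	return r
}

func rotateRight32(x, y [4]uint32) [4]uint32 {
	var r [4]uint32
	for i, v := range x {
		r[i] = bits.RotateLeft32(v, -int(y[i])) // a negative count rotates right
	}
	return r
}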
-/* RoundToEven */
-
-// RoundToEven rounds elements to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x4) RoundToEven() Float32x4
-
-// RoundToEven rounds elements to the nearest integer.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x8) RoundToEven() Float32x8
-
-// RoundToEven rounds elements to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x2) RoundToEven() Float64x2
-
-// RoundToEven rounds elements to the nearest integer.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x4) RoundToEven() Float64x4
-
-/* RoundToEvenScaled */
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x4) RoundToEvenScaled(prec uint8) Float32x4
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x8) RoundToEvenScaled(prec uint8) Float32x8
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x16) RoundToEvenScaled(prec uint8) Float32x16
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x2) RoundToEvenScaled(prec uint8) Float64x2
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x4) RoundToEvenScaled(prec uint8) Float64x4
-
-// RoundToEvenScaled rounds elements with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x8) RoundToEvenScaled(prec uint8) Float64x8
-
-/* RoundToEvenScaledResidue */
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x4) RoundToEvenScaledResidue(prec uint8) Float32x4
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x8) RoundToEvenScaledResidue(prec uint8) Float32x8
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x16) RoundToEvenScaledResidue(prec uint8) Float32x16
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x2) RoundToEvenScaledResidue(prec uint8) Float64x2
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
-
-// RoundToEvenScaledResidue computes the difference after rounding with specified precision.
-//
-// prec results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8
-
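// A scalar sketch of RoundToEvenScaled and RoundToEvenScaledResidue as described
// above, assuming prec is the number of binary fraction digits kept (the usual
// reading of the underlying VRNDSCALE/VREDUCE immediates) and ignoring overflow
// and NaN/Inf handling. The helpers are hypothetical, not part of the package.
// Requires: import "math".
func roundToEvenScaled(x float64, prec uint8) float64 {
	s := math.Ldexp(1, int(prec)) // 2^prec
	return math.RoundToEven(x*s) / s
}

func roundToEvenScaledResidue(x float64, prec uint8) float64 {
	return x - roundToEvenScaled(x, prec) // the part that rounding discarded
}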
-/* SHA1FourRounds */
-
-// SHA1FourRounds performs 4 rounds of the B loop in the SHA1 algorithm defined in FIPS 180-4.
-// x contains the state variables a, b, c and d from upper to lower order.
-// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
-// result = the state variables a', b', c', d' updated after 4 rounds.
-// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
-//
-// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: SHA1RNDS4, CPU Feature: SHA
-func (x Uint32x4) SHA1FourRounds(constant uint8, y Uint32x4) Uint32x4
-
-/* SHA1Message1 */
-
-// SHA1Message1 does the XORing of 1 in the SHA1 algorithm defined in FIPS 180-4.
-// x = {W3, W2, W1, W0}
-// y = {0, 0, W5, W4}
-// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
-//
-// Asm: SHA1MSG1, CPU Feature: SHA
-func (x Uint32x4) SHA1Message1(y Uint32x4) Uint32x4
-
-/* SHA1Message2 */
-
-// SHA1Message2 does the calculation of 3 and 4 in the SHA1 algorithm defined in FIPS 180-4.
-// x = result of 2.
-// y = {W15, W14, W13}
-// result = {W19, W18, W17, W16}
-//
-// Asm: SHA1MSG2, CPU Feature: SHA
-func (x Uint32x4) SHA1Message2(y Uint32x4) Uint32x4
-
-/* SHA1NextE */
-
-// SHA1NextE calculates the state variable e' updated after 4 rounds in the SHA1 algorithm defined in FIPS 180-4.
-// x contains the state variable a (before the 4 rounds), placed in the upper element.
-// y is the elements of W array for next 4 rounds from upper to lower order.
-// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
-// from upper to lower order.
-// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or, better, specify H4:0:0:0
-// for y to get e' added to H4. (Note that e' is computed only from x; the value of y does not affect the
-// computation of e'.)
-//
-// Asm: SHA1NEXTE, CPU Feature: SHA
-func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
-
-/* SHA256Message1 */
-
-// SHA256Message1 does the sigma and addition of 1 in the SHA256 algorithm defined in FIPS 180-4.
-// x = {W0, W1, W2, W3}
-// y = {W4, 0, 0, 0}
-// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
-//
-// Asm: SHA256MSG1, CPU Feature: SHA
-func (x Uint32x4) SHA256Message1(y Uint32x4) Uint32x4
-
-/* SHA256Message2 */
-
-// SHA256Message2 does the sigma and addition of 3 in the SHA256 algorithm defined in FIPS 180-4.
-// x = result of 2
-// y = {0, 0, W14, W15}
-// result = {W16, W17, W18, W19}
-//
-// Asm: SHA256MSG2, CPU Feature: SHA
-func (x Uint32x4) SHA256Message2(y Uint32x4) Uint32x4
-
-/* SHA256TwoRounds */
-
-// SHA256TwoRounds does 2 rounds of the B loop to calculate updated state variables in the SHA256 algorithm defined in FIPS 180-4.
-// x = {h, g, d, c}
-// y = {f, e, b, a}
-// z = {W0+K0, W1+K1}
-// result = {f', e', b', a'}
-// The K array is a 64-DWORD constant array defined on page 11 of FIPS 180-4. Each element of the K array is to be added to
-// the corresponding element of the W array to make the input data z.
-// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
-// y (the state variables a, b, e, f before the 2 rounds).
-//
-// Asm: SHA256RNDS2, CPU Feature: SHA
-func (x Uint32x4) SHA256TwoRounds(y Uint32x4, z Uint32x4) Uint32x4
-
-/* SaturateToInt8 */
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSWB, CPU Feature: AVX512
-func (x Int16x8) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSWB, CPU Feature: AVX512
-func (x Int16x16) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSWB, CPU Feature: AVX512
-func (x Int16x32) SaturateToInt8() Int8x32
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x4) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x8) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x16) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x2) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x4) SaturateToInt8() Int8x16
-
-// SaturateToInt8 converts element values to int8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x8) SaturateToInt8() Int8x16
-
-/* SaturateToInt16 */
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSDW, CPU Feature: AVX512
-func (x Int32x4) SaturateToInt16() Int16x8
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSDW, CPU Feature: AVX512
-func (x Int32x8) SaturateToInt16() Int16x8
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSDW, CPU Feature: AVX512
-func (x Int32x16) SaturateToInt16() Int16x16
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQW, CPU Feature: AVX512
-func (x Int64x2) SaturateToInt16() Int16x8
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQW, CPU Feature: AVX512
-func (x Int64x4) SaturateToInt16() Int16x8
-
-// SaturateToInt16 converts element values to int16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQW, CPU Feature: AVX512
-func (x Int64x8) SaturateToInt16() Int16x8
-
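// "Saturation" in the conversions above means out-of-range values clamp to the
// destination type's limits instead of wrapping. A scalar sketch for the
// int32-to-int16 case; saturateInt32ToInt16 is a hypothetical helper, not part
// of the package. Requires: import "math".
func saturateInt32ToInt16(v int32) int16 {
	if v > math.MaxInt16 {
		return math.MaxInt16
	}
	if v < math.MinInt16 {
		return math.MinInt16
	}
	return int16(v)
}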
-/* SaturateToInt16Concat */
-
-// SaturateToInt16Concat converts element values to int16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKSSDW, CPU Feature: AVX
-func (x Int32x4) SaturateToInt16Concat(y Int32x4) Int16x8
-
-// SaturateToInt16Concat converts element values to int16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKSSDW, CPU Feature: AVX2
-func (x Int32x8) SaturateToInt16Concat(y Int32x8) Int16x16
-
-// SaturateToInt16Concat converts element values to int16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKSSDW, CPU Feature: AVX512
-func (x Int32x16) SaturateToInt16Concat(y Int32x16) Int16x32
-
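// A scalar sketch of the 128-bit SaturateToInt16Concat case described above: the
// low half of the result is the saturated x, the high half is the saturated y.
// Per the comments above, the wider forms repeat this pattern within each 128-bit
// group. packSatInt32x4ToInt16x8 is a hypothetical helper, not part of the package.
func packSatInt32x4ToInt16x8(x, y [4]int32) [8]int16 {
	clamp := func(v int32) int16 {
		if v > 32767 {
			return 32767
		}
		if v < -32768 {
			return -32768
		}
		return int16(v)
	}
	var r [8]int16
	for i := range x {
		r[i] = clamp(x[i])   // elements 0..3 from x
		r[4+i] = clamp(y[i]) // elements 4..7 from y
	}
	return r
}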
-/* SaturateToInt32 */
-
-// SaturateToInt32 converts element values to int32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQD, CPU Feature: AVX512
-func (x Int64x2) SaturateToInt32() Int32x4
-
-// SaturateToInt32 converts element values to int32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQD, CPU Feature: AVX512
-func (x Int64x4) SaturateToInt32() Int32x4
-
-// SaturateToInt32 converts element values to int32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVSQD, CPU Feature: AVX512
-func (x Int64x8) SaturateToInt32() Int32x8
-
-/* SaturateToUint8 */
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSWB, CPU Feature: AVX512
-func (x Int16x8) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSWB, CPU Feature: AVX512
-func (x Int16x16) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x4) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x8) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSDB, CPU Feature: AVX512
-func (x Int32x16) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x2) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x4) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-// Results are packed to low elements in the returned vector, its upper elements are zero-cleared.
-//
-// Asm: VPMOVSQB, CPU Feature: AVX512
-func (x Int64x8) SaturateToUint8() Int8x16
-
-// SaturateToUint8 converts element values to uint8.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSWB, CPU Feature: AVX512
-func (x Uint16x32) SaturateToUint8() Uint8x32
-
-/* SaturateToUint16 */
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSDW, CPU Feature: AVX512
-func (x Uint32x4) SaturateToUint16() Uint16x8
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSDW, CPU Feature: AVX512
-func (x Uint32x8) SaturateToUint16() Uint16x8
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSDW, CPU Feature: AVX512
-func (x Uint32x16) SaturateToUint16() Uint16x16
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQW, CPU Feature: AVX512
-func (x Uint64x2) SaturateToUint16() Uint16x8
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQW, CPU Feature: AVX512
-func (x Uint64x4) SaturateToUint16() Uint16x8
-
-// SaturateToUint16 converts element values to uint16.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQW, CPU Feature: AVX512
-func (x Uint64x8) SaturateToUint16() Uint16x8
-
-/* SaturateToUint16Concat */
-
-// SaturateToUint16Concat converts element values to uint16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKUSDW, CPU Feature: AVX
-func (x Uint32x4) SaturateToUint16Concat(y Uint32x4) Uint16x8
-
-// SaturateToUint16Concat converts element values to uint16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKUSDW, CPU Feature: AVX2
-func (x Uint32x8) SaturateToUint16Concat(y Uint32x8) Uint16x16
-
-// SaturateToUint16Concat converts element values to uint16.
-// With each 128-bit as a group:
-// The converted group from the first input vector will be packed to the lower part of the result vector,
-// the converted group from the second input vector will be packed to the upper part of the result vector.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPACKUSDW, CPU Feature: AVX512
-func (x Uint32x16) SaturateToUint16Concat(y Uint32x16) Uint16x32
-
-/* SaturateToUint32 */
-
-// SaturateToUint32 converts element values to uint32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQD, CPU Feature: AVX512
-func (x Uint64x2) SaturateToUint32() Uint32x4
-
-// SaturateToUint32 converts element values to uint32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQD, CPU Feature: AVX512
-func (x Uint64x4) SaturateToUint32() Uint32x4
-
-// SaturateToUint32 converts element values to uint32.
-// Conversion is done with saturation on the vector elements.
-//
-// Asm: VPMOVUSQD, CPU Feature: AVX512
-func (x Uint64x8) SaturateToUint32() Uint32x8
-
-/* Scale */
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512
-func (x Float32x4) Scale(y Float32x4) Float32x4
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512
-func (x Float32x8) Scale(y Float32x8) Float32x8
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPS, CPU Feature: AVX512
-func (x Float32x16) Scale(y Float32x16) Float32x16
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPD, CPU Feature: AVX512
-func (x Float64x2) Scale(y Float64x2) Float64x2
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPD, CPU Feature: AVX512
-func (x Float64x4) Scale(y Float64x4) Float64x4
-
-// Scale multiplies elements by a power of 2.
-//
-// Asm: VSCALEFPD, CPU Feature: AVX512
-func (x Float64x8) Scale(y Float64x8) Float64x8
-
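// A scalar sketch of Scale as described above: each element of x is multiplied by
// 2 raised to the floor of the corresponding element of y (the usual reading of
// VSCALEFPS/VSCALEFPD), ignoring the special handling of NaN, infinities, and
// denormals. scaleFloat64 is a hypothetical helper, not part of the package.
// Requires: import "math".
func scaleFloat64(x, y float64) float64 {
	return x * math.Exp2(math.Floor(y))
}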
-/* Select128FromPair */
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2F128, CPU Feature: AVX
-func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0,
-// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f})
-//
-// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0,
-// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77})
-//
-// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73})
-//
-// returns {70, 71, 72, 73, 40, 41, 42, 43}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8
-
-// Select128FromPair treats the 256-bit vectors x and y as a single vector of four
-// 128-bit elements, and returns a 256-bit result formed by
-// concatenating the two elements specified by lo and hi.
-// For example,
-//
-// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71})
-//
-// returns {70, 71, 40, 41}.
-//
-// lo, hi result in better performance when they are constants; non-constant values will be translated into a jump table.
-// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic.
-//
-// Asm: VPERM2I128, CPU Feature: AVX2
-func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4
-
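// Editorial sketch (not part of the diff): a hypothetical helper built on the
// Select128FromPair documentation above. Indices 0..3 address the four 128-bit
// quarters of the pair (x's low half, x's high half, y's low half, y's high half);
// lo selects the result's low 128 bits and hi its high 128 bits. With lo=3, hi=0
// this reproduces the documented example: the result is y's high half followed by
// x's low half, and constant indices take the fast, single-instruction path.
func pickHiLo(x, y Int32x8) Int32x8 {
	return x.Select128FromPair(3, 0, y)
}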
-/* SetElem */
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRD, CPU Feature: AVX
-func (x Float32x4) SetElem(index uint8, y float32) Float32x4
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRQ, CPU Feature: AVX
-func (x Float64x2) SetElem(index uint8, y float64) Float64x2
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRB, CPU Feature: AVX
-func (x Int8x16) SetElem(index uint8, y int8) Int8x16
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRW, CPU Feature: AVX
-func (x Int16x8) SetElem(index uint8, y int16) Int16x8
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRD, CPU Feature: AVX
-func (x Int32x4) SetElem(index uint8, y int32) Int32x4
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRQ, CPU Feature: AVX
-func (x Int64x2) SetElem(index uint8, y int64) Int64x2
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRB, CPU Feature: AVX
-func (x Uint8x16) SetElem(index uint8, y uint8) Uint8x16
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRW, CPU Feature: AVX
-func (x Uint16x8) SetElem(index uint8, y uint16) Uint16x8
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRD, CPU Feature: AVX
-func (x Uint32x4) SetElem(index uint8, y uint32) Uint32x4
-
-// SetElem sets a single constant-indexed element's value.
-//
-// index results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPINSRQ, CPU Feature: AVX
-func (x Uint64x2) SetElem(index uint8, y uint64) Uint64x2
-
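// Editorial sketch (not part of the diff): a hypothetical helper using SetElem as
// documented above. Lane 2 of x is replaced by v and the other lanes are unchanged;
// a constant index takes the fast path (VPINSRD per the doc) rather than a jump table.
func setLane2(x Int32x4, v int32) Int32x4 {
	return x.SetElem(2, v)
}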
-/* SetHi */
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTF128, CPU Feature: AVX
-func (x Float32x8) SetHi(y Float32x4) Float32x8
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTF64X4, CPU Feature: AVX512
-func (x Float32x16) SetHi(y Float32x8) Float32x16
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTF128, CPU Feature: AVX
-func (x Float64x4) SetHi(y Float64x2) Float64x4
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTF64X4, CPU Feature: AVX512
-func (x Float64x8) SetHi(y Float64x4) Float64x8
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int8x32) SetHi(y Int8x16) Int8x32
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int8x64) SetHi(y Int8x32) Int8x64
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int16x16) SetHi(y Int16x8) Int16x16
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int16x32) SetHi(y Int16x16) Int16x32
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int32x8) SetHi(y Int32x4) Int32x8
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int32x16) SetHi(y Int32x8) Int32x16
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int64x4) SetHi(y Int64x2) Int64x4
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int64x8) SetHi(y Int64x4) Int64x8
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint8x32) SetHi(y Uint8x16) Uint8x32
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint8x64) SetHi(y Uint8x32) Uint8x64
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint16x16) SetHi(y Uint16x8) Uint16x16
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint16x32) SetHi(y Uint16x16) Uint16x32
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint32x8) SetHi(y Uint32x4) Uint32x8
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint32x16) SetHi(y Uint32x8) Uint32x16
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint64x4) SetHi(y Uint64x2) Uint64x4
-
-// SetHi returns x with its upper half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint64x8) SetHi(y Uint64x4) Uint64x8
-
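// Editorial sketch (not part of the diff): a hypothetical helper for the SetHi
// methods above; the receiver's lower 128 bits are kept and its upper 128 bits are
// replaced by hi.
func withUpperHalf(x Uint8x32, hi Uint8x16) Uint8x32 {
	return x.SetHi(hi)
}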
-/* SetLo */
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTF128, CPU Feature: AVX
-func (x Float32x8) SetLo(y Float32x4) Float32x8
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTF64X4, CPU Feature: AVX512
-func (x Float32x16) SetLo(y Float32x8) Float32x16
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTF128, CPU Feature: AVX
-func (x Float64x4) SetLo(y Float64x2) Float64x4
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTF64X4, CPU Feature: AVX512
-func (x Float64x8) SetLo(y Float64x4) Float64x8
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int8x32) SetLo(y Int8x16) Int8x32
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int8x64) SetLo(y Int8x32) Int8x64
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int16x16) SetLo(y Int16x8) Int16x16
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int16x32) SetLo(y Int16x16) Int16x32
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int32x8) SetLo(y Int32x4) Int32x8
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int32x16) SetLo(y Int32x8) Int32x16
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Int64x4) SetLo(y Int64x2) Int64x4
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Int64x8) SetLo(y Int64x4) Int64x8
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint8x32) SetLo(y Uint8x16) Uint8x32
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint8x64) SetLo(y Uint8x32) Uint8x64
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint16x16) SetLo(y Uint16x8) Uint16x16
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint16x32) SetLo(y Uint16x16) Uint16x32
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint32x8) SetLo(y Uint32x4) Uint32x8
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint32x16) SetLo(y Uint32x8) Uint32x16
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI128, CPU Feature: AVX2
-func (x Uint64x4) SetLo(y Uint64x2) Uint64x4
-
-// SetLo returns x with its lower half set to y.
-//
-// Asm: VINSERTI64X4, CPU Feature: AVX512
-func (x Uint64x8) SetLo(y Uint64x4) Uint64x8
-
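// Editorial sketch (not part of the diff): a hypothetical helper that rebuilds a
// 256-bit vector from two 128-bit halves by chaining the SetLo and SetHi methods
// documented above.
func concat128(base Float64x4, lo, hi Float64x2) Float64x4 {
	return base.SetLo(lo).SetHi(hi)
}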
-/* ShiftAllLeft */
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX
-func (x Int16x8) ShiftAllLeft(y uint64) Int16x8
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX2
-func (x Int16x16) ShiftAllLeft(y uint64) Int16x16
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX512
-func (x Int16x32) ShiftAllLeft(y uint64) Int16x32
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX
-func (x Int32x4) ShiftAllLeft(y uint64) Int32x4
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX2
-func (x Int32x8) ShiftAllLeft(y uint64) Int32x8
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX512
-func (x Int32x16) ShiftAllLeft(y uint64) Int32x16
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX
-func (x Int64x2) ShiftAllLeft(y uint64) Int64x2
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX2
-func (x Int64x4) ShiftAllLeft(y uint64) Int64x4
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX512
-func (x Int64x8) ShiftAllLeft(y uint64) Int64x8
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX
-func (x Uint16x8) ShiftAllLeft(y uint64) Uint16x8
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX2
-func (x Uint16x16) ShiftAllLeft(y uint64) Uint16x16
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLW, CPU Feature: AVX512
-func (x Uint16x32) ShiftAllLeft(y uint64) Uint16x32
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX
-func (x Uint32x4) ShiftAllLeft(y uint64) Uint32x4
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX2
-func (x Uint32x8) ShiftAllLeft(y uint64) Uint32x8
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLD, CPU Feature: AVX512
-func (x Uint32x16) ShiftAllLeft(y uint64) Uint32x16
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX
-func (x Uint64x2) ShiftAllLeft(y uint64) Uint64x2
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX2
-func (x Uint64x4) ShiftAllLeft(y uint64) Uint64x4
-
-// ShiftAllLeft shifts each element to the left by the specified number of bits. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLQ, CPU Feature: AVX512
-func (x Uint64x8) ShiftAllLeft(y uint64) Uint64x8
-
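// Editorial sketch (not part of the diff): a hypothetical helper for ShiftAllLeft.
// A single scalar count is applied to every lane; vacated low bits are zeroed, so
// shifting each uint32 lane left by 3 multiplies it by 8 (modulo 2^32).
func mulBy8(x Uint32x8) Uint32x8 {
	return x.ShiftAllLeft(3)
}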
-/* ShiftAllLeftConcat */
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllLeftConcat(shift uint8, y Int16x8) Int16x8
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllLeftConcat(shift uint8, y Int16x16) Int16x16
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllLeftConcat(shift uint8, y Int16x32) Int16x32
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllLeftConcat(shift uint8, y Int32x4) Int32x4
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllLeftConcat(shift uint8, y Int32x8) Int32x8
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllLeftConcat(shift uint8, y Int32x16) Int32x16
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllLeftConcat(shift uint8, y Int64x2) Int64x2
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllLeftConcat(shift uint8, y Int64x4) Int64x4
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllLeftConcat(shift uint8, y Int64x8) Int64x8
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllLeftConcat(shift uint8, y Uint16x8) Uint16x8
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllLeftConcat(shift uint8, y Uint16x16) Uint16x16
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllLeftConcat(shift uint8, y Uint16x32) Uint16x32
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllLeftConcat(shift uint8, y Uint32x4) Uint32x4
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllLeftConcat(shift uint8, y Uint32x8) Uint32x8
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllLeftConcat(shift uint8, y Uint32x16) Uint32x16
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllLeftConcat(shift uint8, y Uint64x2) Uint64x2
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllLeftConcat(shift uint8, y Uint64x4) Uint64x4
-
-// ShiftAllLeftConcat shifts each element of x to the left by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the upper bits of y to the emptied lower bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHLDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllLeftConcat(shift uint8, y Uint64x8) Uint64x8
-
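// Editorial sketch (not part of the diff): a hypothetical 64-bit funnel shift built
// on ShiftAllLeftConcat as documented above; x is shifted left by a constant and the
// vacated low bits are filled from the upper bits of y (VPSHLDQ per the doc when the
// shift count is constant).
func funnelLeft8(x, y Uint64x4) Uint64x4 {
	return x.ShiftAllLeftConcat(8, y)
}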
-/* ShiftAllRight */
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAW, CPU Feature: AVX
-func (x Int16x8) ShiftAllRight(y uint64) Int16x8
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAW, CPU Feature: AVX2
-func (x Int16x16) ShiftAllRight(y uint64) Int16x16
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAW, CPU Feature: AVX512
-func (x Int16x32) ShiftAllRight(y uint64) Int16x32
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAD, CPU Feature: AVX
-func (x Int32x4) ShiftAllRight(y uint64) Int32x4
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAD, CPU Feature: AVX2
-func (x Int32x8) ShiftAllRight(y uint64) Int32x8
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAD, CPU Feature: AVX512
-func (x Int32x16) ShiftAllRight(y uint64) Int32x16
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAQ, CPU Feature: AVX512
-func (x Int64x2) ShiftAllRight(y uint64) Int64x2
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAQ, CPU Feature: AVX512
-func (x Int64x4) ShiftAllRight(y uint64) Int64x4
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAQ, CPU Feature: AVX512
-func (x Int64x8) ShiftAllRight(y uint64) Int64x8
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLW, CPU Feature: AVX
-func (x Uint16x8) ShiftAllRight(y uint64) Uint16x8
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLW, CPU Feature: AVX2
-func (x Uint16x16) ShiftAllRight(y uint64) Uint16x16
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLW, CPU Feature: AVX512
-func (x Uint16x32) ShiftAllRight(y uint64) Uint16x32
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLD, CPU Feature: AVX
-func (x Uint32x4) ShiftAllRight(y uint64) Uint32x4
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLD, CPU Feature: AVX2
-func (x Uint32x8) ShiftAllRight(y uint64) Uint32x8
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLD, CPU Feature: AVX512
-func (x Uint32x16) ShiftAllRight(y uint64) Uint32x16
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLQ, CPU Feature: AVX
-func (x Uint64x2) ShiftAllRight(y uint64) Uint64x2
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLQ, CPU Feature: AVX2
-func (x Uint64x4) ShiftAllRight(y uint64) Uint64x4
-
-// ShiftAllRight shifts each element to the right by the specified number of bits. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLQ, CPU Feature: AVX512
-func (x Uint64x8) ShiftAllRight(y uint64) Uint64x8
-
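// Editorial sketch (not part of the diff): hypothetical helpers contrasting the two
// ShiftAllRight variants above: the signed form is an arithmetic shift (vacated bits
// filled with the sign bit), the unsigned form is a logical shift (vacated bits zeroed).
func halveSigned(x Int32x8) Int32x8 { return x.ShiftAllRight(1) }
func halveUnsigned(x Uint32x8) Uint32x8 { return x.ShiftAllRight(1) }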
-/* ShiftAllRightConcat */
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftAllRightConcat(shift uint8, y Int16x8) Int16x8
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftAllRightConcat(shift uint8, y Int16x16) Int16x16
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftAllRightConcat(shift uint8, y Int16x32) Int16x32
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftAllRightConcat(shift uint8, y Int32x4) Int32x4
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftAllRightConcat(shift uint8, y Int32x8) Int32x8
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftAllRightConcat(shift uint8, y Int32x16) Int32x16
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftAllRightConcat(shift uint8, y Int64x2) Int64x2
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftAllRightConcat(shift uint8, y Int64x4) Int64x4
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftAllRightConcat(shift uint8, y Int64x8) Int64x8
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftAllRightConcat(shift uint8, y Uint16x8) Uint16x8
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftAllRightConcat(shift uint8, y Uint16x16) Uint16x16
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftAllRightConcat(shift uint8, y Uint16x32) Uint16x32
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftAllRightConcat(shift uint8, y Uint32x4) Uint32x4
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftAllRightConcat(shift uint8, y Uint32x8) Uint32x8
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftAllRightConcat(shift uint8, y Uint32x16) Uint32x16
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftAllRightConcat(shift uint8, y Uint64x2) Uint64x2
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftAllRightConcat(shift uint8, y Uint64x4) Uint64x4
-
-// ShiftAllRightConcat shifts each element of x to the right by the number of bits specified by the
-// immediate (only the lower 5 bits are used), and then copies the lower bits of y to the emptied upper bits of the shifted x.
-//
-// shift results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHRDQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftAllRightConcat(shift uint8, y Uint64x8) Uint64x8
-
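// Editorial sketch (not part of the diff): the right-going counterpart of the funnel
// shift sketch above, using ShiftAllRightConcat; x is shifted right by a constant and
// its vacated upper bits are filled from the low bits of y (VPSHRDQ per the doc).
func funnelRight8(x, y Uint64x4) Uint64x4 {
	return x.ShiftAllRightConcat(8, y)
}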
-/* ShiftLeft */
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Int16x8) ShiftLeft(y Int16x8) Int16x8
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Int16x16) ShiftLeft(y Int16x16) Int16x16
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Int16x32) ShiftLeft(y Int16x32) Int16x32
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX2
-func (x Int32x4) ShiftLeft(y Int32x4) Int32x4
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX2
-func (x Int32x8) ShiftLeft(y Int32x8) Int32x8
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX512
-func (x Int32x16) ShiftLeft(y Int32x16) Int32x16
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX2
-func (x Int64x2) ShiftLeft(y Int64x2) Int64x2
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX2
-func (x Int64x4) ShiftLeft(y Int64x4) Int64x4
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX512
-func (x Int64x8) ShiftLeft(y Int64x8) Int64x8
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Uint16x8) ShiftLeft(y Uint16x8) Uint16x8
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Uint16x16) ShiftLeft(y Uint16x16) Uint16x16
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVW, CPU Feature: AVX512
-func (x Uint16x32) ShiftLeft(y Uint16x32) Uint16x32
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX2
-func (x Uint32x4) ShiftLeft(y Uint32x4) Uint32x4
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX2
-func (x Uint32x8) ShiftLeft(y Uint32x8) Uint32x8
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVD, CPU Feature: AVX512
-func (x Uint32x16) ShiftLeft(y Uint32x16) Uint32x16
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX2
-func (x Uint64x2) ShiftLeft(y Uint64x2) Uint64x2
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX2
-func (x Uint64x4) ShiftLeft(y Uint64x4) Uint64x4
-
-// ShiftLeft shifts each element in x to the left by the number of bits specified in y's corresponding elements. Emptied lower bits are zeroed.
-//
-// Asm: VPSLLVQ, CPU Feature: AVX512
-func (x Uint64x8) ShiftLeft(y Uint64x8) Uint64x8
-
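// Editorial sketch (not part of the diff): a hypothetical helper for the per-lane
// ShiftLeft methods above; unlike ShiftAllLeft's single scalar count, each lane of x
// is shifted by the count held in the matching lane of counts, with zero fill.
func shiftEachLane(x, counts Uint32x8) Uint32x8 {
	return x.ShiftLeft(counts)
}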
-/* ShiftLeftConcat */
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftLeftConcat(y Int16x8, z Int16x8) Int16x8
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftLeftConcat(y Int16x16, z Int16x16) Int16x16
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftLeftConcat(y Int16x32, z Int16x32) Int16x32
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftLeftConcat(y Int32x4, z Int32x4) Int32x4
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftLeftConcat(y Int32x8, z Int32x8) Int32x8
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftLeftConcat(y Int32x16, z Int32x16) Int32x16
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftLeftConcat(y Int64x2, z Int64x2) Int64x2
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftLeftConcat(y Int64x4, z Int64x4) Int64x4
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftLeftConcat(y Int64x8, z Int64x8) Int64x8
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftLeftConcat(y Uint16x8, z Uint16x8) Uint16x8
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftLeftConcat(y Uint16x16, z Uint16x16) Uint16x16
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftLeftConcat(y Uint16x32, z Uint16x32) Uint16x32
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftLeftConcat(y Uint32x4, z Uint32x4) Uint32x4
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftLeftConcat(y Uint32x8, z Uint32x8) Uint32x8
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftLeftConcat(y Uint32x16, z Uint32x16) Uint32x16
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftLeftConcat(y Uint64x2, z Uint64x2) Uint64x2
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftLeftConcat(y Uint64x4, z Uint64x4) Uint64x4
-
-// ShiftLeftConcat shifts each element of x to the left by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the upper bits of z to the emptied lower bits of the shifted x.
-//
-// Asm: VPSHLDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftLeftConcat(y Uint64x8, z Uint64x8) Uint64x8
-
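// Editorial sketch (not part of the diff): a hypothetical per-lane funnel shift using
// ShiftLeftConcat as documented above; lane i of x is shifted left by the count in
// lane i of y, and the vacated low bits come from the upper bits of lane i of z.
func funnelLeftPerLane(x, y, z Uint32x16) Uint32x16 {
	return x.ShiftLeftConcat(y, z)
}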
-/* ShiftRight */
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVW, CPU Feature: AVX512
-func (x Int16x8) ShiftRight(y Int16x8) Int16x8
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVW, CPU Feature: AVX512
-func (x Int16x16) ShiftRight(y Int16x16) Int16x16
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVW, CPU Feature: AVX512
-func (x Int16x32) ShiftRight(y Int16x32) Int16x32
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVD, CPU Feature: AVX2
-func (x Int32x4) ShiftRight(y Int32x4) Int32x4
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVD, CPU Feature: AVX2
-func (x Int32x8) ShiftRight(y Int32x8) Int32x8
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVD, CPU Feature: AVX512
-func (x Int32x16) ShiftRight(y Int32x16) Int32x16
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVQ, CPU Feature: AVX512
-func (x Int64x2) ShiftRight(y Int64x2) Int64x2
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVQ, CPU Feature: AVX512
-func (x Int64x4) ShiftRight(y Int64x4) Int64x4
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are filled with the sign bit.
-//
-// Asm: VPSRAVQ, CPU Feature: AVX512
-func (x Int64x8) ShiftRight(y Int64x8) Int64x8
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVW, CPU Feature: AVX512
-func (x Uint16x8) ShiftRight(y Uint16x8) Uint16x8
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVW, CPU Feature: AVX512
-func (x Uint16x16) ShiftRight(y Uint16x16) Uint16x16
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVW, CPU Feature: AVX512
-func (x Uint16x32) ShiftRight(y Uint16x32) Uint16x32
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVD, CPU Feature: AVX2
-func (x Uint32x4) ShiftRight(y Uint32x4) Uint32x4
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVD, CPU Feature: AVX2
-func (x Uint32x8) ShiftRight(y Uint32x8) Uint32x8
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVD, CPU Feature: AVX512
-func (x Uint32x16) ShiftRight(y Uint32x16) Uint32x16
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVQ, CPU Feature: AVX2
-func (x Uint64x2) ShiftRight(y Uint64x2) Uint64x2
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVQ, CPU Feature: AVX2
-func (x Uint64x4) ShiftRight(y Uint64x4) Uint64x4
-
-// ShiftRight shifts each element in x to the right by the number of bits specified in y's corresponding elements. Emptied upper bits are zeroed.
-//
-// Asm: VPSRLVQ, CPU Feature: AVX512
-func (x Uint64x8) ShiftRight(y Uint64x8) Uint64x8
-
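// Editorial sketch (not part of the diff): a hypothetical helper for the per-lane
// ShiftRight methods above; signed element types shift arithmetically (sign-filled),
// unsigned element types shift logically (zero-filled).
func shiftRightPerLane(x, counts Int64x4) Int64x4 {
	return x.ShiftRight(counts)
}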
-/* ShiftRightConcat */
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x8) ShiftRightConcat(y Int16x8, z Int16x8) Int16x8
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x16) ShiftRightConcat(y Int16x16, z Int16x16) Int16x16
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Int16x32) ShiftRightConcat(y Int16x32, z Int16x32) Int16x32
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x4) ShiftRightConcat(y Int32x4, z Int32x4) Int32x4
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x8) ShiftRightConcat(y Int32x8, z Int32x8) Int32x8
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Int32x16) ShiftRightConcat(y Int32x16, z Int32x16) Int32x16
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x2) ShiftRightConcat(y Int64x2, z Int64x2) Int64x2
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x4) ShiftRightConcat(y Int64x4, z Int64x4) Int64x4
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y(only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Int64x8) ShiftRightConcat(y Int64x8, z Int64x8) Int64x8
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 4 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x8) ShiftRightConcat(y Uint16x8, z Uint16x8) Uint16x8
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 4 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x16) ShiftRightConcat(y Uint16x16, z Uint16x16) Uint16x16
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 4 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVW, CPU Feature: AVX512VBMI2
-func (x Uint16x32) ShiftRightConcat(y Uint16x32, z Uint16x32) Uint16x32
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x4) ShiftRightConcat(y Uint32x4, z Uint32x4) Uint32x4
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x8) ShiftRightConcat(y Uint32x8, z Uint32x8) Uint32x8
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 5 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVD, CPU Feature: AVX512VBMI2
-func (x Uint32x16) ShiftRightConcat(y Uint32x16, z Uint32x16) Uint32x16
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 6 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x2) ShiftRightConcat(y Uint64x2, z Uint64x2) Uint64x2
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 6 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x4) ShiftRightConcat(y Uint64x4, z Uint64x4) Uint64x4
-
-// ShiftRightConcat shifts each element of x to the right by the number of bits specified by the
-// corresponding elements in y (only the lower 6 bits are used), and then copies the lower bits of z to the emptied upper bits of the shifted x.
-//
-// Asm: VPSHRDVQ, CPU Feature: AVX512VBMI2
-func (x Uint64x8) ShiftRightConcat(y Uint64x8, z Uint64x8) Uint64x8
-
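The ShiftRightConcat family is a per-lane funnel shift: conceptually z is concatenated above x, the double-width lane is shifted right by the count from y, and the low half is kept. Below is a sketch of the 32-bit form, plus an assumed scalar model of one lane for reference (same example package and import as the ShiftRight sketch above).

// shiftRightConcat32 keeps x's bits shifted right and fills the vacated high
// bits from z, one lane at a time (VPSHRDVD under AVX512VBMI2).
func shiftRightConcat32(x, y, z archsimd.Uint32x4) archsimd.Uint32x4 {
	return x.ShiftRightConcat(y, z)
}

// scalarLane32 is an assumed scalar model of a single 32-bit lane: only the
// low 5 bits of the count are used, matching the doc comments above.
func scalarLane32(x, count, z uint32) uint32 {
	return uint32((uint64(z)<<32 | uint64(x)) >> (count & 31))
}
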
-/* Sqrt */
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPS, CPU Feature: AVX
-func (x Float32x4) Sqrt() Float32x4
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPS, CPU Feature: AVX
-func (x Float32x8) Sqrt() Float32x8
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPS, CPU Feature: AVX512
-func (x Float32x16) Sqrt() Float32x16
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPD, CPU Feature: AVX
-func (x Float64x2) Sqrt() Float64x2
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPD, CPU Feature: AVX
-func (x Float64x4) Sqrt() Float64x4
-
-// Sqrt computes the square root of each element.
-//
-// Asm: VSQRTPD, CPU Feature: AVX512
-func (x Float64x8) Sqrt() Float64x8
-
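Sqrt is a straightforward lane-wise operation; the point of the sketch is that all lanes are handled by one instruction rather than a loop (same assumed example package and import as above).

// rootLanes computes the square root of every lane of v with one VSQRTPD.
func rootLanes(v archsimd.Float64x4) archsimd.Float64x4 {
	return v.Sqrt()
}
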
-/* Sub */
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPS, CPU Feature: AVX
-func (x Float32x4) Sub(y Float32x4) Float32x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPS, CPU Feature: AVX
-func (x Float32x8) Sub(y Float32x8) Float32x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPS, CPU Feature: AVX512
-func (x Float32x16) Sub(y Float32x16) Float32x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPD, CPU Feature: AVX
-func (x Float64x2) Sub(y Float64x2) Float64x2
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPD, CPU Feature: AVX
-func (x Float64x4) Sub(y Float64x4) Float64x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VSUBPD, CPU Feature: AVX512
-func (x Float64x8) Sub(y Float64x8) Float64x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX
-func (x Int8x16) Sub(y Int8x16) Int8x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX2
-func (x Int8x32) Sub(y Int8x32) Int8x32
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX512
-func (x Int8x64) Sub(y Int8x64) Int8x64
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX
-func (x Int16x8) Sub(y Int16x8) Int16x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX2
-func (x Int16x16) Sub(y Int16x16) Int16x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX512
-func (x Int16x32) Sub(y Int16x32) Int16x32
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX
-func (x Int32x4) Sub(y Int32x4) Int32x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX2
-func (x Int32x8) Sub(y Int32x8) Int32x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX512
-func (x Int32x16) Sub(y Int32x16) Int32x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX
-func (x Int64x2) Sub(y Int64x2) Int64x2
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX2
-func (x Int64x4) Sub(y Int64x4) Int64x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX512
-func (x Int64x8) Sub(y Int64x8) Int64x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX
-func (x Uint8x16) Sub(y Uint8x16) Uint8x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX2
-func (x Uint8x32) Sub(y Uint8x32) Uint8x32
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBB, CPU Feature: AVX512
-func (x Uint8x64) Sub(y Uint8x64) Uint8x64
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX
-func (x Uint16x8) Sub(y Uint16x8) Uint16x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX2
-func (x Uint16x16) Sub(y Uint16x16) Uint16x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBW, CPU Feature: AVX512
-func (x Uint16x32) Sub(y Uint16x32) Uint16x32
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX
-func (x Uint32x4) Sub(y Uint32x4) Uint32x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX2
-func (x Uint32x8) Sub(y Uint32x8) Uint32x8
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBD, CPU Feature: AVX512
-func (x Uint32x16) Sub(y Uint32x16) Uint32x16
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX
-func (x Uint64x2) Sub(y Uint64x2) Uint64x2
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX2
-func (x Uint64x4) Sub(y Uint64x4) Uint64x4
-
-// Sub subtracts corresponding elements of two vectors.
-//
-// Asm: VPSUBQ, CPU Feature: AVX512
-func (x Uint64x8) Sub(y Uint64x8) Uint64x8
-
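Sub is the plain lane-wise difference; integer lanes wrap on overflow, which is what distinguishes it from the SubSaturated variants further below (same assumed example package and import as above).

// delta subtracts eight pairs of 32-bit lanes with one VPSUBD.
// Integer lanes wrap modulo 2^32 on overflow; see SubSaturated for clamping.
func delta(a, b archsimd.Int32x8) archsimd.Int32x8 {
	return a.Sub(b)
}
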
-/* SubPairs */
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x4) SubPairs(y Float32x4) Float32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x8) SubPairs(y Float32x8) Float32x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x2) SubPairs(y Float64x2) Float64x2
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x4) SubPairs(y Float64x4) Float64x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Int16x8) SubPairs(y Int16x8) Int16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Int16x16) SubPairs(y Int16x16) Int16x16
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Int32x4) SubPairs(y Int32x4) Int32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Int32x8) SubPairs(y Int32x8) Int32x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8
-
-/* SubPairsSaturated */
-
-// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX
-func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
-
-// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VPHSUBSW, CPU Feature: AVX2
-func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16
-
-/* SubSaturated */
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSB, CPU Feature: AVX
-func (x Int8x16) SubSaturated(y Int8x16) Int8x16
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSB, CPU Feature: AVX2
-func (x Int8x32) SubSaturated(y Int8x32) Int8x32
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSB, CPU Feature: AVX512
-func (x Int8x64) SubSaturated(y Int8x64) Int8x64
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSW, CPU Feature: AVX
-func (x Int16x8) SubSaturated(y Int16x8) Int16x8
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSW, CPU Feature: AVX2
-func (x Int16x16) SubSaturated(y Int16x16) Int16x16
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBSW, CPU Feature: AVX512
-func (x Int16x32) SubSaturated(y Int16x32) Int16x32
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSB, CPU Feature: AVX
-func (x Uint8x16) SubSaturated(y Uint8x16) Uint8x16
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSB, CPU Feature: AVX2
-func (x Uint8x32) SubSaturated(y Uint8x32) Uint8x32
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSB, CPU Feature: AVX512
-func (x Uint8x64) SubSaturated(y Uint8x64) Uint8x64
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSW, CPU Feature: AVX
-func (x Uint16x8) SubSaturated(y Uint16x8) Uint16x8
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSW, CPU Feature: AVX2
-func (x Uint16x16) SubSaturated(y Uint16x16) Uint16x16
-
-// SubSaturated subtracts corresponding elements of two vectors with saturation.
-//
-// Asm: VPSUBUSW, CPU Feature: AVX512
-func (x Uint16x32) SubSaturated(y Uint16x32) Uint16x32
-
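The difference between Sub and SubSaturated is easiest to see on unsigned bytes: Sub wraps modulo 256, while SubSaturated clamps at the edge of the range (0 for unsigned lanes, the int bounds for signed lanes). A sketch, same assumed example package and import as above:

// wrapVsClamp contrasts wrapping and saturating byte subtraction.
// For a lane where a=10 and b=200: wrapped gets 66 (10-200 mod 256),
// clamped gets 0 (VPSUBUSB floors unsigned lanes at zero).
func wrapVsClamp(a, b archsimd.Uint8x16) (wrapped, clamped archsimd.Uint8x16) {
	wrapped = a.Sub(b)
	clamped = a.SubSaturated(b)
	return
}
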
-/* SumAbsDiff */
-
-// SumAbsDiff sums the absolute differences of the two input vectors, treating each adjacent 8 bytes as a group. The output
-// is a vector of word-sized elements in which each 4*n-th element contains the sum for the n-th input group; the other elements in the result vector are zeroed.
-// This method can be seen as computing the L1 distance of each adjacent 8-byte group of the two input vectors.
-//
-// Asm: VPSADBW, CPU Feature: AVX
-func (x Uint8x16) SumAbsDiff(y Uint8x16) Uint16x8
-
-// SumAbsDiff sums the absolute differences of the two input vectors, treating each adjacent 8 bytes as a group. The output
-// is a vector of word-sized elements in which each 4*n-th element contains the sum for the n-th input group; the other elements in the result vector are zeroed.
-// This method can be seen as computing the L1 distance of each adjacent 8-byte group of the two input vectors.
-//
-// Asm: VPSADBW, CPU Feature: AVX2
-func (x Uint8x32) SumAbsDiff(y Uint8x32) Uint16x16
-
-// SumAbsDiff sums the absolute differences of the two input vectors, treating each adjacent 8 bytes as a group. The output
-// is a vector of word-sized elements in which each 4*n-th element contains the sum for the n-th input group; the other elements in the result vector are zeroed.
-// This method can be seen as computing the L1 distance of each adjacent 8-byte group of the two input vectors.
-//
-// Asm: VPSADBW, CPU Feature: AVX512
-func (x Uint8x64) SumAbsDiff(y Uint8x64) Uint16x32
-
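SumAbsDiff reduces each group of eight byte lanes to a single 16-bit sum of absolute differences, and the doc comments above pin down where those sums land: word lane 0 for the first group, word lane 4 for the second, and so on. A sketch (same assumed example package and import as above):

// sad computes the sum of absolute differences of two 16-byte blocks.
// |a0-b0|+...+|a7-b7| lands in word lane 0, |a8-b8|+...+|a15-b15| in word
// lane 4; all other word lanes of the result are zero (VPSADBW).
func sad(a, b archsimd.Uint8x16) archsimd.Uint16x8 {
	return a.SumAbsDiff(b)
}
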
-/* Trunc */
-
-// Trunc truncates elements towards zero.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x4) Trunc() Float32x4
-
-// Trunc truncates elements towards zero.
-//
-// Asm: VROUNDPS, CPU Feature: AVX
-func (x Float32x8) Trunc() Float32x8
-
-// Trunc truncates elements towards zero.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x2) Trunc() Float64x2
-
-// Trunc truncates elements towards zero.
-//
-// Asm: VROUNDPD, CPU Feature: AVX
-func (x Float64x4) Trunc() Float64x4
-
-/* TruncScaled */
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x4) TruncScaled(prec uint8) Float32x4
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x8) TruncScaled(prec uint8) Float32x8
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPS, CPU Feature: AVX512
-func (x Float32x16) TruncScaled(prec uint8) Float32x16
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x2) TruncScaled(prec uint8) Float64x2
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x4) TruncScaled(prec uint8) Float64x4
-
-// TruncScaled truncates elements with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VRNDSCALEPD, CPU Feature: AVX512
-func (x Float64x8) TruncScaled(prec uint8) Float64x8
-
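Reading prec as the number of fractional bits to keep (the VRNDSCALE scale-by-2^prec encoding, which is an assumption here since the doc comments above do not spell it out), TruncScaled(prec) truncates each lane toward zero to a multiple of 2^-prec, so prec=0 behaves like Trunc. A sketch, same assumed example package and import as above:

// truncQuarter truncates each lane toward zero to a multiple of 0.25,
// assuming prec counts fractional bits kept (e.g. 1.9 -> 1.75, -1.9 -> -1.75).
func truncQuarter(v archsimd.Float32x8) archsimd.Float32x8 {
	const prec = 2 // a constant prec avoids the jump-table fallback noted above
	return v.TruncScaled(prec)
}
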
-/* TruncScaledResidue */
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x4) TruncScaledResidue(prec uint8) Float32x4
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x8) TruncScaledResidue(prec uint8) Float32x8
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPS, CPU Feature: AVX512
-func (x Float32x16) TruncScaledResidue(prec uint8) Float32x16
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x2) TruncScaledResidue(prec uint8) Float64x2
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x4) TruncScaledResidue(prec uint8) Float64x4
-
-// TruncScaledResidue computes the difference after truncating with specified precision.
-//
-// prec results in better performance when it's a constant; a non-constant value will be translated into a jump table.
-//
-// Asm: VREDUCEPD, CPU Feature: AVX512
-func (x Float64x8) TruncScaledResidue(prec uint8) Float64x8
-
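TruncScaledResidue returns what TruncScaled removed, so the two calls together split a lane into a coarse part and a remainder that add back to the original value (an identity implied by the two doc comments above, NaN and infinity lanes aside). Sketch, same assumed example package and import as above:

// splitCoarseFine separates each lane into a truncated part and the residue.
// For finite lanes, coarse + fine reconstructs the original value of v.
func splitCoarseFine(v archsimd.Float32x8) (coarse, fine archsimd.Float32x8) {
	const prec = 4
	coarse = v.TruncScaled(prec)      // VRNDSCALEPS
	fine = v.TruncScaledResidue(prec) // VREDUCEPS: v minus the truncated part
	return
}
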
-/* TruncateToInt8 */
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Int16x8) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Int16x16) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Int16x32) TruncateToInt8() Int8x32
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Int32x4) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Int32x8) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Int32x16) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Int64x2) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Int64x4) TruncateToInt8() Int8x16
-
-// TruncateToInt8 converts element values to int8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Int64x8) TruncateToInt8() Int8x16
-
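The TruncateTo* conversions narrow each lane by keeping its low bits (no saturation) and pack the narrowed lanes into the low end of a 128-bit result, zeroing the rest where the doc comments above say so. A sketch, same assumed example package and import as above:

// narrow converts eight int32 lanes to int8 by truncation (low byte kept,
// no saturation, VPMOVDB); the eight results occupy the low 8 bytes of the
// returned Int8x16 and the upper 8 bytes are zero.
func narrow(v archsimd.Int32x8) archsimd.Int8x16 {
	return v.TruncateToInt8()
}
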
-/* TruncateToInt16 */
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Int32x4) TruncateToInt16() Int16x8
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Int32x8) TruncateToInt16() Int16x8
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Int32x16) TruncateToInt16() Int16x16
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Int64x2) TruncateToInt16() Int16x8
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Int64x4) TruncateToInt16() Int16x8
-
-// TruncateToInt16 converts element values to int16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Int64x8) TruncateToInt16() Int16x8
-
-/* TruncateToInt32 */
-
-// TruncateToInt32 converts element values to int32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Int64x2) TruncateToInt32() Int32x4
-
-// TruncateToInt32 converts element values to int32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Int64x4) TruncateToInt32() Int32x4
-
-// TruncateToInt32 converts element values to int32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Int64x8) TruncateToInt32() Int32x8
-
-/* TruncateToUint8 */
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Uint16x8) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Uint16x16) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVWB, CPU Feature: AVX512
-func (x Uint16x32) TruncateToUint8() Uint8x32
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Uint32x4) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Uint32x8) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVDB, CPU Feature: AVX512
-func (x Uint32x16) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Uint64x2) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Uint64x4) TruncateToUint8() Uint8x16
-
-// TruncateToUint8 converts element values to uint8.
-// Conversion is done with truncation on the vector elements.
-// Results are packed into the low elements of the returned vector; the upper elements are zero-cleared.
-//
-// Asm: VPMOVQB, CPU Feature: AVX512
-func (x Uint64x8) TruncateToUint8() Uint8x16
-
-/* TruncateToUint16 */
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Uint32x4) TruncateToUint16() Uint16x8
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Uint32x8) TruncateToUint16() Uint16x8
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVDW, CPU Feature: AVX512
-func (x Uint32x16) TruncateToUint16() Uint16x16
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Uint64x2) TruncateToUint16() Uint16x8
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Uint64x4) TruncateToUint16() Uint16x8
-
-// TruncateToUint16 converts element values to uint16.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQW, CPU Feature: AVX512
-func (x Uint64x8) TruncateToUint16() Uint16x8
-
-/* TruncateToUint32 */
-
-// TruncateToUint32 converts element values to uint32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Uint64x2) TruncateToUint32() Uint32x4
-
-// TruncateToUint32 converts element values to uint32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Uint64x4) TruncateToUint32() Uint32x4
-
-// TruncateToUint32 converts element values to uint32.
-// Conversion is done with truncation on the vector elements.
-//
-// Asm: VPMOVQD, CPU Feature: AVX512
-func (x Uint64x8) TruncateToUint32() Uint32x8
-
-/* Xor */
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Int8x16) Xor(y Int8x16) Int8x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Int8x32) Xor(y Int8x32) Int8x32
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Int8x64) Xor(y Int8x64) Int8x64
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Int16x8) Xor(y Int16x8) Int16x8
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Int16x16) Xor(y Int16x16) Int16x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Int16x32) Xor(y Int16x32) Int16x32
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Int32x4) Xor(y Int32x4) Int32x4
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Int32x8) Xor(y Int32x8) Int32x8
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Int32x16) Xor(y Int32x16) Int32x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Int64x2) Xor(y Int64x2) Int64x2
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Int64x4) Xor(y Int64x4) Int64x4
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORQ, CPU Feature: AVX512
-func (x Int64x8) Xor(y Int64x8) Int64x8
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Uint8x16) Xor(y Uint8x16) Uint8x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Uint8x32) Xor(y Uint8x32) Uint8x32
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Uint8x64) Xor(y Uint8x64) Uint8x64
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Uint16x8) Xor(y Uint16x8) Uint16x8
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Uint16x16) Xor(y Uint16x16) Uint16x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Uint16x32) Xor(y Uint16x32) Uint16x32
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Uint32x4) Xor(y Uint32x4) Uint32x4
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Uint32x8) Xor(y Uint32x8) Uint32x8
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORD, CPU Feature: AVX512
-func (x Uint32x16) Xor(y Uint32x16) Uint32x16
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX
-func (x Uint64x2) Xor(y Uint64x2) Uint64x2
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXOR, CPU Feature: AVX2
-func (x Uint64x4) Xor(y Uint64x4) Uint64x4
-
-// Xor performs a bitwise XOR operation between two vectors.
-//
-// Asm: VPXORQ, CPU Feature: AVX512
-func (x Uint64x8) Xor(y Uint64x8) Uint64x8
-
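Two common uses of Xor: XORing a vector with itself yields an all-zero vector without a load, and XORing with a mask flips exactly the bits set in the mask. Sketch, same assumed example package and import as above:

// zeroAndToggle shows the two idiomatic Xor patterns on 64-bit lanes (VPXOR).
func zeroAndToggle(v, mask archsimd.Uint64x4) (zero, toggled archsimd.Uint64x4) {
	zero = v.Xor(v)       // any lane XORed with itself is 0
	toggled = v.Xor(mask) // flips the bits that are set in mask
	return
}
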
-// Float64x2 converts from Float32x4 to Float64x2
-func (from Float32x4) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Float32x4 to Int8x16
-func (from Float32x4) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Float32x4 to Int16x8
-func (from Float32x4) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Float32x4 to Int32x4
-func (from Float32x4) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Float32x4 to Int64x2
-func (from Float32x4) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Float32x4 to Uint8x16
-func (from Float32x4) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Float32x4 to Uint16x8
-func (from Float32x4) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Float32x4 to Uint32x4
-func (from Float32x4) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Float32x4 to Uint64x2
-func (from Float32x4) AsUint64x2() (to Uint64x2)
-
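These As* conversion methods change only the static type, not the bits: the same 128-, 256-, or 512-bit value is reinterpreted with a different lane layout, which is why the element count can change in a way a value conversion could not. That bit-reinterpretation reading is an assumption based on the signatures here; value-converting operations are named differently in this API. Sketch, same assumed example package and import as above:

// bitsOf reinterprets four float32 lanes as their int32 bit patterns; a lane
// holding float32(1.0) reads back as 0x3f800000. Assumed to compile to no
// instruction at all, since only the type changes.
func bitsOf(v archsimd.Float32x4) archsimd.Int32x4 {
	return v.AsInt32x4()
}
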
-// Float64x4 converts from Float32x8 to Float64x4
-func (from Float32x8) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Float32x8 to Int8x32
-func (from Float32x8) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Float32x8 to Int16x16
-func (from Float32x8) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Float32x8 to Int32x8
-func (from Float32x8) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Float32x8 to Int64x4
-func (from Float32x8) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Float32x8 to Uint8x32
-func (from Float32x8) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Float32x8 to Uint16x16
-func (from Float32x8) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Float32x8 to Uint32x8
-func (from Float32x8) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Float32x8 to Uint64x4
-func (from Float32x8) AsUint64x4() (to Uint64x4)
-
-// Float64x8 converts from Float32x16 to Float64x8
-func (from Float32x16) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Float32x16 to Int8x64
-func (from Float32x16) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Float32x16 to Int16x32
-func (from Float32x16) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Float32x16 to Int32x16
-func (from Float32x16) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Float32x16 to Int64x8
-func (from Float32x16) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Float32x16 to Uint8x64
-func (from Float32x16) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Float32x16 to Uint16x32
-func (from Float32x16) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Float32x16 to Uint32x16
-func (from Float32x16) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Float32x16 to Uint64x8
-func (from Float32x16) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Float64x2 to Float32x4
-func (from Float64x2) AsFloat32x4() (to Float32x4)
-
-// Int8x16 converts from Float64x2 to Int8x16
-func (from Float64x2) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Float64x2 to Int16x8
-func (from Float64x2) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Float64x2 to Int32x4
-func (from Float64x2) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Float64x2 to Int64x2
-func (from Float64x2) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Float64x2 to Uint8x16
-func (from Float64x2) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Float64x2 to Uint16x8
-func (from Float64x2) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Float64x2 to Uint32x4
-func (from Float64x2) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Float64x2 to Uint64x2
-func (from Float64x2) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Float64x4 to Float32x8
-func (from Float64x4) AsFloat32x8() (to Float32x8)
-
-// Int8x32 converts from Float64x4 to Int8x32
-func (from Float64x4) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Float64x4 to Int16x16
-func (from Float64x4) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Float64x4 to Int32x8
-func (from Float64x4) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Float64x4 to Int64x4
-func (from Float64x4) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Float64x4 to Uint8x32
-func (from Float64x4) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Float64x4 to Uint16x16
-func (from Float64x4) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Float64x4 to Uint32x8
-func (from Float64x4) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Float64x4 to Uint64x4
-func (from Float64x4) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Float64x8 to Float32x16
-func (from Float64x8) AsFloat32x16() (to Float32x16)
-
-// Int8x64 converts from Float64x8 to Int8x64
-func (from Float64x8) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Float64x8 to Int16x32
-func (from Float64x8) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Float64x8 to Int32x16
-func (from Float64x8) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Float64x8 to Int64x8
-func (from Float64x8) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Float64x8 to Uint8x64
-func (from Float64x8) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Float64x8 to Uint16x32
-func (from Float64x8) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Float64x8 to Uint32x16
-func (from Float64x8) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Float64x8 to Uint64x8
-func (from Float64x8) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Int8x16 to Float32x4
-func (from Int8x16) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Int8x16 to Float64x2
-func (from Int8x16) AsFloat64x2() (to Float64x2)
-
-// Int16x8 converts from Int8x16 to Int16x8
-func (from Int8x16) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Int8x16 to Int32x4
-func (from Int8x16) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Int8x16 to Int64x2
-func (from Int8x16) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Int8x16 to Uint8x16
-func (from Int8x16) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Int8x16 to Uint16x8
-func (from Int8x16) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Int8x16 to Uint32x4
-func (from Int8x16) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Int8x16 to Uint64x2
-func (from Int8x16) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Int8x32 to Float32x8
-func (from Int8x32) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Int8x32 to Float64x4
-func (from Int8x32) AsFloat64x4() (to Float64x4)
-
-// Int16x16 converts from Int8x32 to Int16x16
-func (from Int8x32) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Int8x32 to Int32x8
-func (from Int8x32) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Int8x32 to Int64x4
-func (from Int8x32) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Int8x32 to Uint8x32
-func (from Int8x32) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Int8x32 to Uint16x16
-func (from Int8x32) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Int8x32 to Uint32x8
-func (from Int8x32) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Int8x32 to Uint64x4
-func (from Int8x32) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Int8x64 to Float32x16
-func (from Int8x64) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Int8x64 to Float64x8
-func (from Int8x64) AsFloat64x8() (to Float64x8)
-
-// Int16x32 converts from Int8x64 to Int16x32
-func (from Int8x64) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Int8x64 to Int32x16
-func (from Int8x64) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Int8x64 to Int64x8
-func (from Int8x64) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Int8x64 to Uint8x64
-func (from Int8x64) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Int8x64 to Uint16x32
-func (from Int8x64) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Int8x64 to Uint32x16
-func (from Int8x64) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Int8x64 to Uint64x8
-func (from Int8x64) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Int16x8 to Float32x4
-func (from Int16x8) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Int16x8 to Float64x2
-func (from Int16x8) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Int16x8 to Int8x16
-func (from Int16x8) AsInt8x16() (to Int8x16)
-
-// Int32x4 converts from Int16x8 to Int32x4
-func (from Int16x8) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Int16x8 to Int64x2
-func (from Int16x8) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Int16x8 to Uint8x16
-func (from Int16x8) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Int16x8 to Uint16x8
-func (from Int16x8) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Int16x8 to Uint32x4
-func (from Int16x8) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Int16x8 to Uint64x2
-func (from Int16x8) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Int16x16 to Float32x8
-func (from Int16x16) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Int16x16 to Float64x4
-func (from Int16x16) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Int16x16 to Int8x32
-func (from Int16x16) AsInt8x32() (to Int8x32)
-
-// Int32x8 converts from Int16x16 to Int32x8
-func (from Int16x16) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Int16x16 to Int64x4
-func (from Int16x16) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Int16x16 to Uint8x32
-func (from Int16x16) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Int16x16 to Uint16x16
-func (from Int16x16) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Int16x16 to Uint32x8
-func (from Int16x16) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Int16x16 to Uint64x4
-func (from Int16x16) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Int16x32 to Float32x16
-func (from Int16x32) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Int16x32 to Float64x8
-func (from Int16x32) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Int16x32 to Int8x64
-func (from Int16x32) AsInt8x64() (to Int8x64)
-
-// Int32x16 converts from Int16x32 to Int32x16
-func (from Int16x32) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Int16x32 to Int64x8
-func (from Int16x32) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Int16x32 to Uint8x64
-func (from Int16x32) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Int16x32 to Uint16x32
-func (from Int16x32) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Int16x32 to Uint32x16
-func (from Int16x32) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Int16x32 to Uint64x8
-func (from Int16x32) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Int32x4 to Float32x4
-func (from Int32x4) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Int32x4 to Float64x2
-func (from Int32x4) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Int32x4 to Int8x16
-func (from Int32x4) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Int32x4 to Int16x8
-func (from Int32x4) AsInt16x8() (to Int16x8)
-
-// Int64x2 converts from Int32x4 to Int64x2
-func (from Int32x4) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Int32x4 to Uint8x16
-func (from Int32x4) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Int32x4 to Uint16x8
-func (from Int32x4) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Int32x4 to Uint32x4
-func (from Int32x4) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Int32x4 to Uint64x2
-func (from Int32x4) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Int32x8 to Float32x8
-func (from Int32x8) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Int32x8 to Float64x4
-func (from Int32x8) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Int32x8 to Int8x32
-func (from Int32x8) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Int32x8 to Int16x16
-func (from Int32x8) AsInt16x16() (to Int16x16)
-
-// Int64x4 converts from Int32x8 to Int64x4
-func (from Int32x8) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Int32x8 to Uint8x32
-func (from Int32x8) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Int32x8 to Uint16x16
-func (from Int32x8) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Int32x8 to Uint32x8
-func (from Int32x8) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Int32x8 to Uint64x4
-func (from Int32x8) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Int32x16 to Float32x16
-func (from Int32x16) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Int32x16 to Float64x8
-func (from Int32x16) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Int32x16 to Int8x64
-func (from Int32x16) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Int32x16 to Int16x32
-func (from Int32x16) AsInt16x32() (to Int16x32)
-
-// Int64x8 converts from Int32x16 to Int64x8
-func (from Int32x16) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Int32x16 to Uint8x64
-func (from Int32x16) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Int32x16 to Uint16x32
-func (from Int32x16) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Int32x16 to Uint32x16
-func (from Int32x16) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Int32x16 to Uint64x8
-func (from Int32x16) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Int64x2 to Float32x4
-func (from Int64x2) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Int64x2 to Float64x2
-func (from Int64x2) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Int64x2 to Int8x16
-func (from Int64x2) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Int64x2 to Int16x8
-func (from Int64x2) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Int64x2 to Int32x4
-func (from Int64x2) AsInt32x4() (to Int32x4)
-
-// Uint8x16 converts from Int64x2 to Uint8x16
-func (from Int64x2) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Int64x2 to Uint16x8
-func (from Int64x2) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Int64x2 to Uint32x4
-func (from Int64x2) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Int64x2 to Uint64x2
-func (from Int64x2) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Int64x4 to Float32x8
-func (from Int64x4) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Int64x4 to Float64x4
-func (from Int64x4) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Int64x4 to Int8x32
-func (from Int64x4) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Int64x4 to Int16x16
-func (from Int64x4) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Int64x4 to Int32x8
-func (from Int64x4) AsInt32x8() (to Int32x8)
-
-// Uint8x32 converts from Int64x4 to Uint8x32
-func (from Int64x4) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Int64x4 to Uint16x16
-func (from Int64x4) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Int64x4 to Uint32x8
-func (from Int64x4) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Int64x4 to Uint64x4
-func (from Int64x4) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Int64x8 to Float32x16
-func (from Int64x8) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Int64x8 to Float64x8
-func (from Int64x8) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Int64x8 to Int8x64
-func (from Int64x8) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Int64x8 to Int16x32
-func (from Int64x8) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Int64x8 to Int32x16
-func (from Int64x8) AsInt32x16() (to Int32x16)
-
-// Uint8x64 converts from Int64x8 to Uint8x64
-func (from Int64x8) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Int64x8 to Uint16x32
-func (from Int64x8) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Int64x8 to Uint32x16
-func (from Int64x8) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Int64x8 to Uint64x8
-func (from Int64x8) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Uint8x16 to Float32x4
-func (from Uint8x16) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Uint8x16 to Float64x2
-func (from Uint8x16) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Uint8x16 to Int8x16
-func (from Uint8x16) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Uint8x16 to Int16x8
-func (from Uint8x16) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Uint8x16 to Int32x4
-func (from Uint8x16) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Uint8x16 to Int64x2
-func (from Uint8x16) AsInt64x2() (to Int64x2)
-
-// Uint16x8 converts from Uint8x16 to Uint16x8
-func (from Uint8x16) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Uint8x16 to Uint32x4
-func (from Uint8x16) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Uint8x16 to Uint64x2
-func (from Uint8x16) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Uint8x32 to Float32x8
-func (from Uint8x32) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Uint8x32 to Float64x4
-func (from Uint8x32) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Uint8x32 to Int8x32
-func (from Uint8x32) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Uint8x32 to Int16x16
-func (from Uint8x32) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Uint8x32 to Int32x8
-func (from Uint8x32) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Uint8x32 to Int64x4
-func (from Uint8x32) AsInt64x4() (to Int64x4)
-
-// Uint16x16 converts from Uint8x32 to Uint16x16
-func (from Uint8x32) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Uint8x32 to Uint32x8
-func (from Uint8x32) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Uint8x32 to Uint64x4
-func (from Uint8x32) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Uint8x64 to Float32x16
-func (from Uint8x64) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Uint8x64 to Float64x8
-func (from Uint8x64) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Uint8x64 to Int8x64
-func (from Uint8x64) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Uint8x64 to Int16x32
-func (from Uint8x64) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Uint8x64 to Int32x16
-func (from Uint8x64) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Uint8x64 to Int64x8
-func (from Uint8x64) AsInt64x8() (to Int64x8)
-
-// Uint16x32 converts from Uint8x64 to Uint16x32
-func (from Uint8x64) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Uint8x64 to Uint32x16
-func (from Uint8x64) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Uint8x64 to Uint64x8
-func (from Uint8x64) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Uint16x8 to Float32x4
-func (from Uint16x8) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Uint16x8 to Float64x2
-func (from Uint16x8) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Uint16x8 to Int8x16
-func (from Uint16x8) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Uint16x8 to Int16x8
-func (from Uint16x8) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Uint16x8 to Int32x4
-func (from Uint16x8) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Uint16x8 to Int64x2
-func (from Uint16x8) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Uint16x8 to Uint8x16
-func (from Uint16x8) AsUint8x16() (to Uint8x16)
-
-// Uint32x4 converts from Uint16x8 to Uint32x4
-func (from Uint16x8) AsUint32x4() (to Uint32x4)
-
-// Uint64x2 converts from Uint16x8 to Uint64x2
-func (from Uint16x8) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Uint16x16 to Float32x8
-func (from Uint16x16) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Uint16x16 to Float64x4
-func (from Uint16x16) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Uint16x16 to Int8x32
-func (from Uint16x16) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Uint16x16 to Int16x16
-func (from Uint16x16) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Uint16x16 to Int32x8
-func (from Uint16x16) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Uint16x16 to Int64x4
-func (from Uint16x16) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Uint16x16 to Uint8x32
-func (from Uint16x16) AsUint8x32() (to Uint8x32)
-
-// Uint32x8 converts from Uint16x16 to Uint32x8
-func (from Uint16x16) AsUint32x8() (to Uint32x8)
-
-// Uint64x4 converts from Uint16x16 to Uint64x4
-func (from Uint16x16) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Uint16x32 to Float32x16
-func (from Uint16x32) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Uint16x32 to Float64x8
-func (from Uint16x32) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Uint16x32 to Int8x64
-func (from Uint16x32) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Uint16x32 to Int16x32
-func (from Uint16x32) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Uint16x32 to Int32x16
-func (from Uint16x32) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Uint16x32 to Int64x8
-func (from Uint16x32) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Uint16x32 to Uint8x64
-func (from Uint16x32) AsUint8x64() (to Uint8x64)
-
-// Uint32x16 converts from Uint16x32 to Uint32x16
-func (from Uint16x32) AsUint32x16() (to Uint32x16)
-
-// Uint64x8 converts from Uint16x32 to Uint64x8
-func (from Uint16x32) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Uint32x4 to Float32x4
-func (from Uint32x4) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Uint32x4 to Float64x2
-func (from Uint32x4) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Uint32x4 to Int8x16
-func (from Uint32x4) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Uint32x4 to Int16x8
-func (from Uint32x4) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Uint32x4 to Int32x4
-func (from Uint32x4) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Uint32x4 to Int64x2
-func (from Uint32x4) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Uint32x4 to Uint8x16
-func (from Uint32x4) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Uint32x4 to Uint16x8
-func (from Uint32x4) AsUint16x8() (to Uint16x8)
-
-// Uint64x2 converts from Uint32x4 to Uint64x2
-func (from Uint32x4) AsUint64x2() (to Uint64x2)
-
-// Float32x8 converts from Uint32x8 to Float32x8
-func (from Uint32x8) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Uint32x8 to Float64x4
-func (from Uint32x8) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Uint32x8 to Int8x32
-func (from Uint32x8) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Uint32x8 to Int16x16
-func (from Uint32x8) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Uint32x8 to Int32x8
-func (from Uint32x8) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Uint32x8 to Int64x4
-func (from Uint32x8) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Uint32x8 to Uint8x32
-func (from Uint32x8) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Uint32x8 to Uint16x16
-func (from Uint32x8) AsUint16x16() (to Uint16x16)
-
-// Uint64x4 converts from Uint32x8 to Uint64x4
-func (from Uint32x8) AsUint64x4() (to Uint64x4)
-
-// Float32x16 converts from Uint32x16 to Float32x16
-func (from Uint32x16) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Uint32x16 to Float64x8
-func (from Uint32x16) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Uint32x16 to Int8x64
-func (from Uint32x16) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Uint32x16 to Int16x32
-func (from Uint32x16) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Uint32x16 to Int32x16
-func (from Uint32x16) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Uint32x16 to Int64x8
-func (from Uint32x16) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Uint32x16 to Uint8x64
-func (from Uint32x16) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Uint32x16 to Uint16x32
-func (from Uint32x16) AsUint16x32() (to Uint16x32)
-
-// Uint64x8 converts from Uint32x16 to Uint64x8
-func (from Uint32x16) AsUint64x8() (to Uint64x8)
-
-// Float32x4 converts from Uint64x2 to Float32x4
-func (from Uint64x2) AsFloat32x4() (to Float32x4)
-
-// Float64x2 converts from Uint64x2 to Float64x2
-func (from Uint64x2) AsFloat64x2() (to Float64x2)
-
-// Int8x16 converts from Uint64x2 to Int8x16
-func (from Uint64x2) AsInt8x16() (to Int8x16)
-
-// Int16x8 converts from Uint64x2 to Int16x8
-func (from Uint64x2) AsInt16x8() (to Int16x8)
-
-// Int32x4 converts from Uint64x2 to Int32x4
-func (from Uint64x2) AsInt32x4() (to Int32x4)
-
-// Int64x2 converts from Uint64x2 to Int64x2
-func (from Uint64x2) AsInt64x2() (to Int64x2)
-
-// Uint8x16 converts from Uint64x2 to Uint8x16
-func (from Uint64x2) AsUint8x16() (to Uint8x16)
-
-// Uint16x8 converts from Uint64x2 to Uint16x8
-func (from Uint64x2) AsUint16x8() (to Uint16x8)
-
-// Uint32x4 converts from Uint64x2 to Uint32x4
-func (from Uint64x2) AsUint32x4() (to Uint32x4)
-
-// Float32x8 converts from Uint64x4 to Float32x8
-func (from Uint64x4) AsFloat32x8() (to Float32x8)
-
-// Float64x4 converts from Uint64x4 to Float64x4
-func (from Uint64x4) AsFloat64x4() (to Float64x4)
-
-// Int8x32 converts from Uint64x4 to Int8x32
-func (from Uint64x4) AsInt8x32() (to Int8x32)
-
-// Int16x16 converts from Uint64x4 to Int16x16
-func (from Uint64x4) AsInt16x16() (to Int16x16)
-
-// Int32x8 converts from Uint64x4 to Int32x8
-func (from Uint64x4) AsInt32x8() (to Int32x8)
-
-// Int64x4 converts from Uint64x4 to Int64x4
-func (from Uint64x4) AsInt64x4() (to Int64x4)
-
-// Uint8x32 converts from Uint64x4 to Uint8x32
-func (from Uint64x4) AsUint8x32() (to Uint8x32)
-
-// Uint16x16 converts from Uint64x4 to Uint16x16
-func (from Uint64x4) AsUint16x16() (to Uint16x16)
-
-// Uint32x8 converts from Uint64x4 to Uint32x8
-func (from Uint64x4) AsUint32x8() (to Uint32x8)
-
-// Float32x16 converts from Uint64x8 to Float32x16
-func (from Uint64x8) AsFloat32x16() (to Float32x16)
-
-// Float64x8 converts from Uint64x8 to Float64x8
-func (from Uint64x8) AsFloat64x8() (to Float64x8)
-
-// Int8x64 converts from Uint64x8 to Int8x64
-func (from Uint64x8) AsInt8x64() (to Int8x64)
-
-// Int16x32 converts from Uint64x8 to Int16x32
-func (from Uint64x8) AsInt16x32() (to Int16x32)
-
-// Int32x16 converts from Uint64x8 to Int32x16
-func (from Uint64x8) AsInt32x16() (to Int32x16)
-
-// Int64x8 converts from Uint64x8 to Int64x8
-func (from Uint64x8) AsInt64x8() (to Int64x8)
-
-// Uint8x64 converts from Uint64x8 to Uint8x64
-func (from Uint64x8) AsUint8x64() (to Uint8x64)
-
-// Uint16x32 converts from Uint64x8 to Uint16x32
-func (from Uint64x8) AsUint16x32() (to Uint16x32)
-
-// Uint32x16 converts from Uint64x8 to Uint32x16
-func (from Uint64x8) AsUint32x16() (to Uint32x16)
-
-// AsInt8x16 converts from Mask8x16 to Int8x16
-func (from Mask8x16) AsInt8x16() (to Int8x16)
-
-// asMask converts from Int8x16 to Mask8x16
-func (from Int8x16) asMask() (to Mask8x16)
-
-func (x Mask8x16) And(y Mask8x16) Mask8x16
-
-func (x Mask8x16) Or(y Mask8x16) Mask8x16
-
-// AsInt8x32 converts from Mask8x32 to Int8x32
-func (from Mask8x32) AsInt8x32() (to Int8x32)
-
-// asMask converts from Int8x32 to Mask8x32
-func (from Int8x32) asMask() (to Mask8x32)
-
-func (x Mask8x32) And(y Mask8x32) Mask8x32
-
-func (x Mask8x32) Or(y Mask8x32) Mask8x32
-
-// AsInt8x64 converts from Mask8x64 to Int8x64
-func (from Mask8x64) AsInt8x64() (to Int8x64)
-
-// asMask converts from Int8x64 to Mask8x64
-func (from Int8x64) asMask() (to Mask8x64)
-
-func (x Mask8x64) And(y Mask8x64) Mask8x64
-
-func (x Mask8x64) Or(y Mask8x64) Mask8x64
-
-// AsInt16x8 converts from Mask16x8 to Int16x8
-func (from Mask16x8) AsInt16x8() (to Int16x8)
-
-// asMask converts from Int16x8 to Mask16x8
-func (from Int16x8) asMask() (to Mask16x8)
-
-func (x Mask16x8) And(y Mask16x8) Mask16x8
-
-func (x Mask16x8) Or(y Mask16x8) Mask16x8
-
-// AsInt16x16 converts from Mask16x16 to Int16x16
-func (from Mask16x16) AsInt16x16() (to Int16x16)
-
-// asMask converts from Int16x16 to Mask16x16
-func (from Int16x16) asMask() (to Mask16x16)
-
-func (x Mask16x16) And(y Mask16x16) Mask16x16
-
-func (x Mask16x16) Or(y Mask16x16) Mask16x16
-
-// AsInt16x32 converts from Mask16x32 to Int16x32
-func (from Mask16x32) AsInt16x32() (to Int16x32)
-
-// asMask converts from Int16x32 to Mask16x32
-func (from Int16x32) asMask() (to Mask16x32)
-
-func (x Mask16x32) And(y Mask16x32) Mask16x32
-
-func (x Mask16x32) Or(y Mask16x32) Mask16x32
-
-// AsInt32x4 converts from Mask32x4 to Int32x4
-func (from Mask32x4) AsInt32x4() (to Int32x4)
-
-// asMask converts from Int32x4 to Mask32x4
-func (from Int32x4) asMask() (to Mask32x4)
-
-func (x Mask32x4) And(y Mask32x4) Mask32x4
-
-func (x Mask32x4) Or(y Mask32x4) Mask32x4
-
-// AsInt32x8 converts from Mask32x8 to Int32x8
-func (from Mask32x8) AsInt32x8() (to Int32x8)
-
-// asMask converts from Int32x8 to Mask32x8
-func (from Int32x8) asMask() (to Mask32x8)
-
-func (x Mask32x8) And(y Mask32x8) Mask32x8
-
-func (x Mask32x8) Or(y Mask32x8) Mask32x8
-
-// AsInt32x16 converts from Mask32x16 to Int32x16
-func (from Mask32x16) AsInt32x16() (to Int32x16)
-
-// asMask converts from Int32x16 to Mask32x16
-func (from Int32x16) asMask() (to Mask32x16)
-
-func (x Mask32x16) And(y Mask32x16) Mask32x16
-
-func (x Mask32x16) Or(y Mask32x16) Mask32x16
-
-// AsInt64x2 converts from Mask64x2 to Int64x2
-func (from Mask64x2) AsInt64x2() (to Int64x2)
-
-// asMask converts from Int64x2 to Mask64x2
-func (from Int64x2) asMask() (to Mask64x2)
-
-func (x Mask64x2) And(y Mask64x2) Mask64x2
-
-func (x Mask64x2) Or(y Mask64x2) Mask64x2
-
-// AsInt64x4 converts from Mask64x4 to Int64x4
-func (from Mask64x4) AsInt64x4() (to Int64x4)
-
-// asMask converts from Int64x4 to Mask64x4
-func (from Int64x4) asMask() (to Mask64x4)
-
-func (x Mask64x4) And(y Mask64x4) Mask64x4
-
-func (x Mask64x4) Or(y Mask64x4) Mask64x4
-
-// AsInt64x8 converts from Mask64x8 to Int64x8
-func (from Mask64x8) AsInt64x8() (to Int64x8)
-
-// asMask converts from Int64x8 to Mask64x8
-func (from Int64x8) asMask() (to Mask64x8)
-
-func (x Mask64x8) And(y Mask64x8) Mask64x8
-
-func (x Mask64x8) Or(y Mask64x8) Mask64x8
+++ /dev/null
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-/* blend */
-
-// blend blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// Asm: VPBLENDVB, CPU Feature: AVX
-func (x Int8x16) blend(y Int8x16, mask Int8x16) Int8x16
-
-// blend blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// Asm: VPBLENDVB, CPU Feature: AVX2
-func (x Int8x32) blend(y Int8x32, mask Int8x32) Int8x32
-
-/* blendMasked */
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMB, CPU Feature: AVX512
-func (x Int8x64) blendMasked(y Int8x64, mask Mask8x64) Int8x64
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMW, CPU Feature: AVX512
-func (x Int16x32) blendMasked(y Int16x32, mask Mask16x32) Int16x32
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMD, CPU Feature: AVX512
-func (x Int32x16) blendMasked(y Int32x16, mask Mask32x16) Int32x16
-
-// blendMasked blends two vectors based on mask values, choosing either
-// the first or the second based on whether the third is false or true
-//
-// This operation is applied selectively under a write mask.
-//
-// Asm: VPBLENDMQ, CPU Feature: AVX512
-func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
-
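As a scalar reference for the selection rule documented in this section, here is a plain-Go sketch (illustrative only, not part of the generated API; treating a mask byte as "true" when its sign bit is set is an assumption taken from how VPBLENDVB reads its mask operand):

// blendRef models blend on int8 lanes: each output lane comes from y when the
// corresponding mask lane is "true", otherwise from x. blendMasked is the same
// idea with a dedicated mask register as the selector.
func blendRef(x, y, mask []int8) []int8 {
	out := make([]int8, len(x))
	for i := range x {
		if mask[i] < 0 { // sign bit set: take the second operand
			out[i] = y[i]
		} else {
			out[i] = x[i] // otherwise keep the first operand
		}
	}
	return out
}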
-/* carrylessMultiply */
-
-// carrylessMultiply computes one of four possible Galois polynomial
-// products of selected high and low halves of x and y,
-// depending on the value of xyHiLo, returning the 128-bit
-// product in the concatenated two elements of the result.
-// Bit 0 selects the low (0) or high (1) element of x and
-// bit 4 selects the low (0x00) or high (0x10) element of y.
-//
-// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX
-func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
-
-// carrylessMultiply computes one of four possible Galois polynomial
-// products of selected high and low halves of each of the two
-// 128-bit lanes of x and y, depending on the value of xyHiLo,
-// and returns the two 128-bit products in the result's lanes.
-// Bit 0 selects the low (0) or high (1) elements of x's lanes and
-// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
-//
-// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
-func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
-
-// carrylessMultiply computes one of four possible Galois polynomial
-// products of selected high and low halves of each of the four
-// 128-bit lanes of x and y, depending on the value of xyHiLo,
-// and returns the four 128-bit products in the result's lanes.
-// Bit 0 selects the low (0) or high (1) elements of x's lanes and
-// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
-//
-// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
-func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
-
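The "Galois polynomial product" above is a carryless multiplication of two 64-bit operands into a 128-bit result. A plain-Go model of that scalar core, purely illustrative (the half selection by xyHiLo is omitted):

// clmul64 multiplies a and b as GF(2) polynomials: partial products are XORed
// together instead of added, so there are no carries. The 128-bit result is
// returned as a high and a low 64-bit word.
func clmul64(a, b uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if b&(1<<i) != 0 {
			lo ^= a << i
			hi ^= a >> (64 - i) // a>>64 is 0 in Go, so the i==0 term contributes nothing
		}
	}
	return hi, lo
}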
-/* concatSelectedConstant */
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x4) concatSelectedConstant(h1h0l1l0 uint8, y Float32x4) Float32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x2) concatSelectedConstant(hilo uint8, y Float64x2) Float64x2
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x4) concatSelectedConstant(h1h0l1l0 uint8, y Int32x4) Int32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x2) concatSelectedConstant(hilo uint8, y Int64x2) Int64x2
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns
-// {2, 0, 5, 7} (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x4) concatSelectedConstant(h1h0l1l0 uint8, y Uint32x4) Uint32x4
-
-// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
-// halves of the output. The selection is chosen by the constant parameter hilo
-// where hi and lo are each one bit specifying which 64-bit element to select
-// from y and x. For example {4,5}.concatSelectedConstant(0b10, {6,7})
-// returns {4,7}; bit 0, selecting from x, is zero, and selects 4, and bit 1,
-// selecting from y, is 1, and selects 7.
-//
-// hilo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x2) concatSelectedConstant(hilo uint8, y Uint64x2) Uint64x2
-
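To make the immediate encoding above concrete, a scalar sketch for the 4-lane case in plain Go (illustrative only): the two low results are drawn from x and the two high results from y, each chosen by one 2-bit field of the immediate, lowest bits first.

// concatSelect4Ref picks output lanes {x[l0], x[l1], y[h0], y[h1]} where
// l0 = imm bits 1:0, l1 = bits 3:2, h0 = bits 5:4, h1 = bits 7:6.
func concatSelect4Ref(x, y [4]float32, imm uint8) [4]float32 {
	return [4]float32{
		x[imm&3],
		x[(imm>>2)&3],
		y[(imm>>4)&3],
		y[(imm>>6)&3],
	}
}

With the documented example, concatSelect4Ref([4]float32{0, 1, 2, 3}, [4]float32{4, 5, 6, 7}, 0b_11_01_00_10) yields {2, 0, 5, 7}.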
-/* concatSelectedConstantGrouped */
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Float32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-//
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-//
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Float32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x16) Float32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x4) concatSelectedConstantGrouped(hilos uint8, y Float64x4) Float64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Float64x8) concatSelectedConstantGrouped(hilos uint8, y Float64x8) Float64x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-//
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-//
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Int32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x16) Int32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x4) concatSelectedConstantGrouped(hilos uint8, y Int64x4) Int64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Int64x8) concatSelectedConstantGrouped(hilos uint8, y Int64x8) Int64x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-// {0,1,2,3,8,9,10,11}.concatSelectedConstantGrouped(0b_11_01_00_10, {4,5,6,7,12,13,14,15})
-// returns {2,0,5,7,10,8,13,15}
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint32x8
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selection is chosen by the constant parameter h1h0l1l0
-// where each {h,l}{1,0} is two bits specifying which element from y or x to select.
-// For example,
-//
-// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped(
-// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215})
-//
-// returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215}
-//
-// (don't forget that the binary constant is written big-endian).
-//
-// h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Uint32x16) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x16) Uint32x16
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9}.concatSelectedConstantGrouped(0b_11_10, {6,7,10,11})
-// returns {4,7,9,11}; bit 0 is zero, selecting element 0 from x's least
-// 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's upper 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11).
-// This differs from the same method applied to a 32x8 vector, where
-// the 8-bit constant performs the same selection on both subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x4) concatSelectedConstantGrouped(hilos uint8, y Uint64x4) Uint64x4
-
-// concatSelectedConstantGrouped concatenates selected elements from 128-bit subvectors of x and y
-// into the lower and upper halves of corresponding subvectors of the output.
-// The selections are specified by the constant parameter hilos where each
-// hi and lo pair select 64-bit elements from the corresponding 128-bit
-// subvectors of x and y.
-//
-// For example {4,5,8,9,12,13,16,17}.concatSelectedConstantGrouped(0b11_00_11_10, {6,7,10,11,14,15,18,19})
-// returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's
-// least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7),
-// then 1, selecting element 1 from x's next 128 bits (9), then 1,
-// selecting element 1 from y's upper 128 bits (11). The next two 0 bits select
-// the lower elements from x and y's 3rd 128 bit groups (12, 14), the last two
-// 1 bits select the upper elements from x and y's last 128 bits (17, 19).
-// This differs from the same method applied to a 32x8 or 32x16 vector, where
-// the 8-bit constant performs the same selection on all the subvectors.
-//
-// hilos results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Uint64x8) concatSelectedConstantGrouped(hilos uint8, y Uint64x8) Uint64x8
-
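The grouped 32-bit variants apply that same 4-lane selection independently to every 128-bit group; a plain-Go sketch (illustrative only):

// concatSelectGroupedRef treats x and y as a sequence of 4-lane groups and
// applies the same immediate to each group, matching the grouped examples above.
func concatSelectGroupedRef(x, y []float32, imm uint8) []float32 {
	out := make([]float32, len(x))
	for g := 0; g+4 <= len(x); g += 4 {
		out[g+0] = x[g+int(imm&3)]
		out[g+1] = x[g+int((imm>>2)&3)]
		out[g+2] = y[g+int((imm>>4)&3)]
		out[g+3] = y[g+int((imm>>6)&3)]
	}
	return out
}

The 64-bit grouped variants differ, as their comments note: each 128-bit group consumes its own hi/lo bit pair of the immediate instead of reusing the same byte for every group.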
-/* permuteScalars */
-
-// permuteScalars performs a permutation of vector x using constant indices:
-// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Int32x4) permuteScalars(indices uint8) Int32x4
-
-// permuteScalars performs a permutation of vector x using constant indices:
-// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Uint32x4) permuteScalars(indices uint8) Uint32x4
-
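A scalar sketch of the index packing described above, in plain Go and purely illustrative:

// permuteScalarsRef reads four 2-bit indices out of the byte, lowest bits
// first, and gathers the corresponding lanes of x.
func permuteScalarsRef(x [4]uint32, indices uint8) [4]uint32 {
	var out [4]uint32
	for i := uint(0); i < 4; i++ {
		out[i] = x[(indices>>(2*i))&3]
	}
	return out
}

For example, permuteScalarsRef([4]uint32{10, 11, 12, 13}, 0b_00_01_10_11) reverses the lanes, giving {13, 12, 11, 10}.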
-/* permuteScalarsGrouped */
-
-// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Int32x8) permuteScalarsGrouped(indices uint8) Int32x8
-
-// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Int32x16) permuteScalarsGrouped(indices uint8) Int32x16
-
-// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Uint32x8) permuteScalarsGrouped(indices uint8) Uint32x8
-
-// permuteScalarsGrouped performs a grouped permutation of vector x using constant indices:
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group1[indices[0:2]], ...}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Uint32x16) permuteScalarsGrouped(indices uint8) Uint32x16
-
-/* permuteScalarsHi */
-
-// permuteScalarsHi performs a permutation of vector x using constant indices:
-// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) permuteScalarsHi(indices uint8) Int16x8
-
-// permuteScalarsHi performs a permutation of vector x using constant indices:
-// result = {x[0], x[1], x[2], x[3], x[indices[0:2]+4], x[indices[2:4]+4], x[indices[4:6]+4], x[indices[6:8]+4]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) permuteScalarsHi(indices uint8) Uint16x8
-
-/* permuteScalarsHiGrouped */
-
-// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
-// result =
-//
-// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
-// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) permuteScalarsHiGrouped(indices uint8) Int16x16
-
-// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
-// result =
-//
-// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
-// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) permuteScalarsHiGrouped(indices uint8) Int16x32
-
-// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
-// result =
-//
-// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
-// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) permuteScalarsHiGrouped(indices uint8) Uint16x16
-
-// permuteScalarsHiGrouped performs a grouped permutation of vector x using constant indices:
-// result =
-//
-// {x_group0[0], x_group0[1], x_group0[2], x_group0[3], x_group0[indices[0:2]+4], x_group0[indices[2:4]+4], x_group0[indices[4:6]+4], x_group0[indices[6:8]+4],
-// x_group1[0], x_group1[1], x_group1[2], x_group1[3], x_group1[indices[0:2]+4], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) permuteScalarsHiGrouped(indices uint8) Uint16x32
-
-/* permuteScalarsLo */
-
-// permuteScalarsLo performs a permutation of vector x using constant indices:
-// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Int16x8) permuteScalarsLo(indices uint8) Int16x8
-
-// permuteScalarsLo performs a permutation of vector x using constant indices:
-// result = {x[indices[0:2]], x[indices[2:4]], x[indices[4:6]], x[indices[6:8]], x[4], x[5], x[6], x[7]}
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Uint16x8) permuteScalarsLo(indices uint8) Uint16x8
-
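A plain-Go sketch of the Lo variant on an 8-lane vector (illustrative only); the Hi variant is the mirror image, permuting lanes 4..7 and passing lanes 0..3 through:

// permuteScalarsLoRef permutes the low four lanes by the packed 2-bit indices
// and copies the high four lanes unchanged.
func permuteScalarsLoRef(x [8]uint16, indices uint8) [8]uint16 {
	out := x // lanes 4..7 pass through untouched
	for i := uint(0); i < 4; i++ {
		out[i] = x[(indices>>(2*i))&3]
	}
	return out
}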
-/* permuteScalarsLoGrouped */
-
-// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
-//
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
-// x_group1[indices[0:2]], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX2
-func (x Int16x16) permuteScalarsLoGrouped(indices uint8) Int16x16
-
-// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
-//
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
-// x_group1[indices[0:2]], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Int16x32) permuteScalarsLoGrouped(indices uint8) Int16x32
-
-// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
-//
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
-// x_group1[indices[0:2]], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX2
-func (x Uint16x16) permuteScalarsLoGrouped(indices uint8) Uint16x16
-
-// permuteScalarsLoGrouped performs a grouped permutation of vector x using constant indices:
-//
-// result = {x_group0[indices[0:2]], x_group0[indices[2:4]], x_group0[indices[4:6]], x_group0[indices[6:8]], x_group0[4], x_group0[5], x_group0[6], x_group0[7],
-// x_group1[indices[0:2]], ...}
-//
-// Indices is four 2-bit values packed into a byte, thus indices[0:2] is the first index.
-// Each group is of size 128-bit.
-//
-// indices results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Uint16x32) permuteScalarsLoGrouped(indices uint8) Uint16x32
-
-/* tern */
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGD, CPU Feature: AVX512
-func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4
-
-// tern performs a logical operation on three vectors based on the 8-bit truth table.
-// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z))
-//
-// table results in better performance when it's a constant, a non-constant value will be translated into a jump table.
-//
-// Asm: VPTERNLOGQ, CPU Feature: AVX512
-func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8
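A bitwise model of the truth-table rule quoted above, for one 32-bit element, in plain Go (illustrative only):

// ternRef computes each result bit i as bit (x_i<<2 | y_i<<1 | z_i) of the
// 8-bit table, which is the "1 & (table >> (x<<2 + y<<1 + z))" rule above.
func ternRef(table uint8, x, y, z uint32) uint32 {
	var r uint32
	for i := uint(0); i < 32; i++ {
		idx := (x>>i&1)<<2 | (y>>i&1)<<1 | (z >> i & 1)
		r |= uint32(table>>idx&1) << i
	}
	return r
}

For instance, table 0x96 encodes three-way XOR, so ternRef(0x96, x, y, z) equals x^y^z.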
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-// BroadcastInt8x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt8x16(x int8) Int8x16 {
- var z Int8x16
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastInt16x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt16x8(x int16) Int16x8 {
- var z Int16x8
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastInt32x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt32x4(x int32) Int32x4 {
- var z Int32x4
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastInt64x2 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt64x2(x int64) Int64x2 {
- var z Int64x2
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastUint8x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint8x16(x uint8) Uint8x16 {
- var z Uint8x16
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastUint16x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint16x8(x uint16) Uint16x8 {
- var z Uint16x8
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastUint32x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint32x4(x uint32) Uint32x4 {
- var z Uint32x4
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastUint64x2 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint64x2(x uint64) Uint64x2 {
- var z Uint64x2
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastFloat32x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastFloat32x4(x float32) Float32x4 {
- var z Float32x4
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastFloat64x2 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastFloat64x2(x float64) Float64x2 {
- var z Float64x2
- return z.SetElem(0, x).Broadcast128()
-}
-
-// BroadcastInt8x32 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt8x32(x int8) Int8x32 {
- var z Int8x16
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastInt16x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt16x16(x int16) Int16x16 {
- var z Int16x8
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastInt32x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt32x8(x int32) Int32x8 {
- var z Int32x4
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastInt64x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastInt64x4(x int64) Int64x4 {
- var z Int64x2
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastUint8x32 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint8x32(x uint8) Uint8x32 {
- var z Uint8x16
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastUint16x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint16x16(x uint16) Uint16x16 {
- var z Uint16x8
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastUint32x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint32x8(x uint32) Uint32x8 {
- var z Uint32x4
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastUint64x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastUint64x4(x uint64) Uint64x4 {
- var z Uint64x2
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastFloat32x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastFloat32x8(x float32) Float32x8 {
- var z Float32x4
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastFloat64x4 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX2
-func BroadcastFloat64x4(x float64) Float64x4 {
- var z Float64x2
- return z.SetElem(0, x).Broadcast256()
-}
-
-// BroadcastInt8x64 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512BW
-func BroadcastInt8x64(x int8) Int8x64 {
- var z Int8x16
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastInt16x32 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512BW
-func BroadcastInt16x32(x int16) Int16x32 {
- var z Int16x8
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastInt32x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastInt32x16(x int32) Int32x16 {
- var z Int32x4
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastInt64x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastInt64x8(x int64) Int64x8 {
- var z Int64x2
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastUint8x64 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512BW
-func BroadcastUint8x64(x uint8) Uint8x64 {
- var z Uint8x16
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastUint16x32 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512BW
-func BroadcastUint16x32(x uint16) Uint16x32 {
- var z Uint16x8
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastUint32x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastUint32x16(x uint32) Uint32x16 {
- var z Uint32x4
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastUint64x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastUint64x8(x uint64) Uint64x8 {
- var z Uint64x2
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastFloat32x16 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastFloat32x16(x float32) Float32x16 {
- var z Float32x4
- return z.SetElem(0, x).Broadcast512()
-}
-
-// BroadcastFloat64x8 returns a vector with the input
-// x assigned to all elements of the output.
-//
-// Emulated, CPU Feature AVX512F
-func BroadcastFloat64x8(x float64) Float64x8 {
- var z Float64x2
- return z.SetElem(0, x).Broadcast512()
-}
-
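A scalar picture of the emulation the helpers above share (illustrative only): write the scalar into lane 0 of a zeroed vector with SetElem, then copy lane 0 into every lane with the appropriate Broadcast.

// broadcastRef mirrors the SetElem(0, x) + BroadcastN pattern on a plain slice.
func broadcastRef(x int32, lanes int) []int32 {
	v := make([]int32, lanes) // zeroed "vector"
	if lanes == 0 {
		return v
	}
	v[0] = x // SetElem(0, x)
	for i := 1; i < lanes; i++ {
		v[i] = v[0] // broadcast lane 0 to the rest
	}
	return v
}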
-// ToMask converts from Int8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int8x16) ToMask() (to Mask8x16) {
- return from.NotEqual(Int8x16{})
-}
-
-// ToMask converts from Int16x8 to Mask16x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int16x8) ToMask() (to Mask16x8) {
- return from.NotEqual(Int16x8{})
-}
-
-// ToMask converts from Int32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int32x4) ToMask() (to Mask32x4) {
- return from.NotEqual(Int32x4{})
-}
-
-// ToMask converts from Int64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int64x2) ToMask() (to Mask64x2) {
- return from.NotEqual(Int64x2{})
-}
-
-// ToMask converts from Uint8x16 to Mask8x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint8x16) ToMask() (to Mask8x16) {
- return from.NotEqual(Uint8x16{})
-}
-
-// ToMask converts from Uint16x8 to Mask16x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint16x8) ToMask() (to Mask16x8) {
- return from.NotEqual(Uint16x8{})
-}
-
-// ToMask converts from Uint32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint32x4) ToMask() (to Mask32x4) {
- return from.NotEqual(Uint32x4{})
-}
-
-// ToMask converts from Uint64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint64x2) ToMask() (to Mask64x2) {
- return from.NotEqual(Uint64x2{})
-}
-
-// ToMask converts from Float32x4 to Mask32x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float32x4) ToMask() (to Mask32x4) {
- return from.NotEqual(Float32x4{})
-}
-
-// ToMask converts from Float64x2 to Mask64x2, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float64x2) ToMask() (to Mask64x2) {
- return from.NotEqual(Float64x2{})
-}
-
-// ToMask converts from Int8x32 to Mask8x32, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int8x32) ToMask() (to Mask8x32) {
- return from.NotEqual(Int8x32{})
-}
-
-// ToMask converts from Int16x16 to Mask16x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int16x16) ToMask() (to Mask16x16) {
- return from.NotEqual(Int16x16{})
-}
-
-// ToMask converts from Int32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int32x8) ToMask() (to Mask32x8) {
- return from.NotEqual(Int32x8{})
-}
-
-// ToMask converts from Int64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int64x4) ToMask() (to Mask64x4) {
- return from.NotEqual(Int64x4{})
-}
-
-// ToMask converts from Uint8x32 to Mask8x32, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint8x32) ToMask() (to Mask8x32) {
- return from.NotEqual(Uint8x32{})
-}
-
-// ToMask converts from Uint16x16 to Mask16x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint16x16) ToMask() (to Mask16x16) {
- return from.NotEqual(Uint16x16{})
-}
-
-// ToMask converts from Uint32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint32x8) ToMask() (to Mask32x8) {
- return from.NotEqual(Uint32x8{})
-}
-
-// ToMask converts from Uint64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint64x4) ToMask() (to Mask64x4) {
- return from.NotEqual(Uint64x4{})
-}
-
-// ToMask converts from Float32x8 to Mask32x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float32x8) ToMask() (to Mask32x8) {
- return from.NotEqual(Float32x8{})
-}
-
-// ToMask converts from Float64x4 to Mask64x4, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float64x4) ToMask() (to Mask64x4) {
- return from.NotEqual(Float64x4{})
-}
-
-// ToMask converts from Int8x64 to Mask8x64, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int8x64) ToMask() (to Mask8x64) {
- return from.NotEqual(Int8x64{})
-}
-
-// ToMask converts from Int16x32 to Mask16x32, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int16x32) ToMask() (to Mask16x32) {
- return from.NotEqual(Int16x32{})
-}
-
-// ToMask converts from Int32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int32x16) ToMask() (to Mask32x16) {
- return from.NotEqual(Int32x16{})
-}
-
-// ToMask converts from Int64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Int64x8) ToMask() (to Mask64x8) {
- return from.NotEqual(Int64x8{})
-}
-
-// ToMask converts from Uint8x64 to Mask8x64, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint8x64) ToMask() (to Mask8x64) {
- return from.NotEqual(Uint8x64{})
-}
-
-// ToMask converts from Uint16x32 to Mask16x32, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint16x32) ToMask() (to Mask16x32) {
- return from.NotEqual(Uint16x32{})
-}
-
-// ToMask converts from Uint32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint32x16) ToMask() (to Mask32x16) {
- return from.NotEqual(Uint32x16{})
-}
-
-// ToMask converts from Uint64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Uint64x8) ToMask() (to Mask64x8) {
- return from.NotEqual(Uint64x8{})
-}
-
-// ToMask converts from Float32x16 to Mask32x16, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float32x16) ToMask() (to Mask32x16) {
- return from.NotEqual(Float32x16{})
-}
-
-// ToMask converts from Float64x8 to Mask64x8, mask element is set to true when the corresponding vector element is non-zero.
-func (from Float64x8) ToMask() (to Mask64x8) {
- return from.NotEqual(Float64x8{})
-}
-
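In scalar terms the conversions above are just a lane-wise non-zero test; a plain-Go sketch for one element type (illustrative only):

// toMaskRef sets the mask lane to true exactly when the vector lane is
// non-zero, i.e. NotEqual against the zero vector.
func toMaskRef(v []int32) []bool {
	m := make([]bool, len(v))
	for i, e := range v {
		m[i] = e != 0
	}
	return m
}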
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Int8x16) Not() Int8x16 {
- return x.Xor(x.Equal(x).AsInt8x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Int16x8) Not() Int16x8 {
- return x.Xor(x.Equal(x).AsInt16x8())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Int32x4) Not() Int32x4 {
- return x.Xor(x.Equal(x).AsInt32x4())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Int64x2) Not() Int64x2 {
- return x.Xor(x.Equal(x).AsInt64x2())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Int8x32) Not() Int8x32 {
- return x.Xor(x.Equal(x).AsInt8x32())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Int16x16) Not() Int16x16 {
- return x.Xor(x.Equal(x).AsInt16x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Int32x8) Not() Int32x8 {
- return x.Xor(x.Equal(x).AsInt32x8())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Int64x4) Not() Int64x4 {
- return x.Xor(x.Equal(x).AsInt64x4())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Int8x64) Not() Int8x64 {
- return x.Xor(x.Equal(x).AsInt8x64())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Int16x32) Not() Int16x32 {
- return x.Xor(x.Equal(x).AsInt16x32())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Int32x16) Not() Int32x16 {
- return x.Xor(x.Equal(x).AsInt32x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Int64x8) Not() Int64x8 {
- return x.Xor(x.Equal(x).AsInt64x8())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Uint8x16) Not() Uint8x16 {
- return x.Xor(x.Equal(x).AsInt8x16().AsUint8x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Uint16x8) Not() Uint16x8 {
- return x.Xor(x.Equal(x).AsInt16x8().AsUint16x8())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Uint32x4) Not() Uint32x4 {
- return x.Xor(x.Equal(x).AsInt32x4().AsUint32x4())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX
-func (x Uint64x2) Not() Uint64x2 {
- return x.Xor(x.Equal(x).AsInt64x2().AsUint64x2())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Uint8x32) Not() Uint8x32 {
- return x.Xor(x.Equal(x).AsInt8x32().AsUint8x32())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Uint16x16) Not() Uint16x16 {
- return x.Xor(x.Equal(x).AsInt16x16().AsUint16x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Uint32x8) Not() Uint32x8 {
- return x.Xor(x.Equal(x).AsInt32x8().AsUint32x8())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX2
-func (x Uint64x4) Not() Uint64x4 {
- return x.Xor(x.Equal(x).AsInt64x4().AsUint64x4())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Uint8x64) Not() Uint8x64 {
- return x.Xor(x.Equal(x).AsInt8x64().AsUint8x64())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Uint16x32) Not() Uint16x32 {
- return x.Xor(x.Equal(x).AsInt16x32().AsUint16x32())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Uint32x16) Not() Uint32x16 {
- return x.Xor(x.Equal(x).AsInt32x16().AsUint32x16())
-}
-
-// Not returns the bitwise complement of x
-//
-// Emulated, CPU Feature AVX512
-func (x Uint64x8) Not() Uint64x8 {
- return x.Xor(x.Equal(x).AsInt64x8().AsUint64x8())
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int8x16) String() string {
- var s [16]int8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int16x8) String() string {
- var s [8]int16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int32x4) String() string {
- var s [4]int32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int64x2) String() string {
- var s [2]int64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint8x16) String() string {
- var s [16]uint8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint16x8) String() string {
- var s [8]uint16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint32x4) String() string {
- var s [4]uint32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint64x2) String() string {
- var s [2]uint64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float32x4) String() string {
- var s [4]float32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float64x2) String() string {
- var s [2]float64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int8x32) String() string {
- var s [32]int8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int16x16) String() string {
- var s [16]int16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int32x8) String() string {
- var s [8]int32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int64x4) String() string {
- var s [4]int64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint8x32) String() string {
- var s [32]uint8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint16x16) String() string {
- var s [16]uint16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint32x8) String() string {
- var s [8]uint32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint64x4) String() string {
- var s [4]uint64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float32x8) String() string {
- var s [8]float32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float64x4) String() string {
- var s [4]float64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int8x64) String() string {
- var s [64]int8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int16x32) String() string {
- var s [32]int16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int32x16) String() string {
- var s [16]int32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Int64x8) String() string {
- var s [8]int64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint8x64) String() string {
- var s [64]uint8
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint16x32) String() string {
- var s [32]uint16
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint32x16) String() string {
- var s [16]uint32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Uint64x8) String() string {
- var s [8]uint64
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float32x16) String() string {
- var s [16]float32
- x.Store(&s)
- return sliceToString(s[:])
-}
-
-// String returns a string representation of SIMD vector x
-func (x Float64x8) String() string {
- var s [8]float64
- x.Store(&s)
- return sliceToString(s[:])
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd_test
-
-import (
- "simd"
- "simd/internal/test_helpers"
- "testing"
-)
-
-func TestConcatSelectedConstant64(t *testing.T) {
- a := make([]int64, 2)
- x := simd.LoadInt64x2Slice([]int64{4, 5})
- y := simd.LoadInt64x2Slice([]int64{6, 7})
- z := x.ExportTestConcatSelectedConstant(0b10, y)
- z.StoreSlice(a)
- test_helpers.CheckSlices[int64](t, a, []int64{4, 7})
-}
-
-func TestConcatSelectedConstantGrouped64(t *testing.T) {
- a := make([]float64, 4)
- x := simd.LoadFloat64x4Slice([]float64{4, 5, 8, 9})
- y := simd.LoadFloat64x4Slice([]float64{6, 7, 10, 11})
- z := x.ExportTestConcatSelectedConstantGrouped(0b_11_10, y)
- z.StoreSlice(a)
- test_helpers.CheckSlices[float64](t, a, []float64{4, 7, 9, 11})
-}
-
-func TestConcatSelectedConstant32(t *testing.T) {
- a := make([]float32, 4)
- x := simd.LoadFloat32x4Slice([]float32{4, 5, 8, 9})
- y := simd.LoadFloat32x4Slice([]float32{6, 7, 10, 11})
- z := x.ExportTestConcatSelectedConstant(0b_11_01_10_00, y)
- z.StoreSlice(a)
- test_helpers.CheckSlices[float32](t, a, []float32{4, 8, 7, 11})
-}
-
-func TestConcatSelectedConstantGrouped32(t *testing.T) {
- a := make([]uint32, 8)
- x := simd.LoadUint32x8Slice([]uint32{0, 1, 2, 3, 8, 9, 10, 11})
- y := simd.LoadUint32x8Slice([]uint32{4, 5, 6, 7, 12, 13, 14, 15})
- z := x.ExportTestConcatSelectedConstantGrouped(0b_11_01_00_10, y)
- z.StoreSlice(a)
- test_helpers.CheckSlices[uint32](t, a, []uint32{2, 0, 5, 7, 10, 8, 13, 15})
-}
-
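-// TestTern exercises the ternary-logic immediate. The checks assume the
-// usual VPTERNLOG convention: for each lane and bit position, the three
-// source bits from x, y, and z form an index i = x<<2 | y<<1 | z, and the
-// result bit is bit i of the 8-bit immediate k. With the lane values
-// chosen below, lane i of the result equals bit i of k, which is what foo
-// verifies; for instance, k = 0b1111_0000 simply copies x.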
-func TestTern(t *testing.T) {
- if !simd.X86.AVX512() {
- t.Skip("This test needs AVX512")
- }
- x := simd.LoadInt32x8Slice([]int32{0, 0, 0, 0, 1, 1, 1, 1})
- y := simd.LoadInt32x8Slice([]int32{0, 0, 1, 1, 0, 0, 1, 1})
- z := simd.LoadInt32x8Slice([]int32{0, 1, 0, 1, 0, 1, 0, 1})
-
- foo := func(w simd.Int32x8, k uint8) {
- a := make([]int32, 8)
- w.StoreSlice(a)
- t.Logf("For k=%0b, w=%v", k, a)
- for i, b := range a {
- if (int32(k)>>i)&1 != b {
- t.Errorf("Element %d of stored slice (=%d) did not match corresponding bit in 0b%b",
- i, b, k)
- }
- }
- }
-
- foo(x.ExportTestTern(0b1111_0000, y, z), 0b1111_0000)
- foo(x.ExportTestTern(0b1100_1100, y, z), 0b1100_1100)
- foo(x.ExportTestTern(0b1010_1010, y, z), 0b1010_1010)
-}
-
-func TestSelect2x4x32(t *testing.T) {
- for a := range uint8(8) {
- for b := range uint8(8) {
- for c := range uint8(8) {
- for d := range uint8(8) {
- x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
- y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
- z := select2x4x32(x, a, b, c, d, y)
-					w := make([]int32, 4)
- z.StoreSlice(w)
- if w[0] != int32(a) || w[1] != int32(b) ||
- w[2] != int32(c) || w[3] != int32(d) {
- t.Errorf("Expected [%d %d %d %d] got %v", a, b, c, d, w)
- }
- }
- }
- }
- }
-}
-
-func TestSelect2x8x32Grouped(t *testing.T) {
- for a := range uint8(8) {
- for b := range uint8(8) {
- for c := range uint8(8) {
- for d := range uint8(8) {
- x := simd.LoadInt32x8Slice([]int32{0, 1, 2, 3, 10, 11, 12, 13})
- y := simd.LoadInt32x8Slice([]int32{4, 5, 6, 7, 14, 15, 16, 17})
- z := select2x8x32Grouped(x, a, b, c, d, y)
-					w := make([]int32, 8)
- z.StoreSlice(w)
- if w[0] != int32(a) || w[1] != int32(b) ||
- w[2] != int32(c) || w[3] != int32(d) ||
- w[4] != int32(10+a) || w[5] != int32(10+b) ||
- w[6] != int32(10+c) || w[7] != int32(10+d) {
- t.Errorf("Expected [%d %d %d %d %d %d %d %d] got %v", a, b, c, d, 10+a, 10+b, 10+c, 10+d, w)
- }
- }
- }
- }
- }
-}
-
-// select2x4x32 returns a selection of 4 elements in x and y, numbered
-// 0-7, where 0-3 are the four elements of x and 4-7 are the four elements
-// of y.
-func select2x4x32(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) simd.Int32x4 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case simd.LLLL:
- return x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, c, d), x)
- case simd.HHHH:
- return y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, c, d), y)
- case simd.LLHH:
- return x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, c, d), y)
- case simd.HHLL:
- return y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, c, d), x)
-
- case simd.HLLL:
- z := y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, a, b, b), x)
- return z.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(0, 2, c, d), x)
- case simd.LHLL:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, a, b, b), y)
- return z.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(0, 2, c, d), x)
-
- case simd.HLHH:
- z := y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, a, b, b), x)
- return z.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(0, 2, c, d), y)
- case simd.LHHH:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, a, b, b), y)
- return z.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(0, 2, c, d), y)
-
- case simd.LLLH:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(c, c, d, d), y)
- return x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.LLHL:
- z := y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(c, c, d, d), x)
- return x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.HHLH:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(c, c, d, d), y)
- return y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.HHHL:
- z := y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(c, c, d, d), x)
- return y.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, b, 0, 2), z)
-
- case simd.LHLH:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, c, b, d), y)
- return z.ExportTestConcatSelectedConstant(0b11_01_10_00 /* =simd.ExportTestCscImm4(0, 2, 1, 3) */, z)
- case simd.HLHL:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(b, d, a, c), y)
- return z.ExportTestConcatSelectedConstant(0b01_11_00_10 /* =simd.ExportTestCscImm4(2, 0, 3, 1) */, z)
- case simd.HLLH:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(b, c, a, d), y)
- return z.ExportTestConcatSelectedConstant(0b11_01_00_10 /* =simd.ExportTestCscImm4(2, 0, 1, 3) */, z)
- case simd.LHHL:
- z := x.ExportTestConcatSelectedConstant(simd.ExportTestCscImm4(a, d, b, c), y)
- return z.ExportTestConcatSelectedConstant(0b01_11_10_00 /* =simd.ExportTestCscImm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// select2x8x32Grouped returns a pair of selections of 4 elements in x and y,
-// numbered 0-7, where 0-3 are the four elements of x's two groups (lower and
-// upper 128 bits) and 4-7 are the four elements of y's two groups.
-func select2x8x32Grouped(x simd.Int32x8, a, b, c, d uint8, y simd.Int32x8) simd.Int32x8 {
-	// The cases below classify selections as being expressible in the
-	// ExportTestConcatSelectedConstant pattern, or not. Classification is
-	// by H and L, where H is a selection from 4-7 and L is a selection
-	// from 0-3.
- // simd.LLHH -> CSC(x,y, a, b, c&3, d&3)
- // simd.HHLL -> CSC(y,x, a&3, b&3, c, d)
- // simd.LLLL -> CSC(x,x, a, b, c, d)
- // simd.HHHH -> CSC(y,y, a&3, b&3, c&3, d&3)
-
- // simd.LLLH -> z = CSC(x, y, c, c, d&3, d&3); CSC(x, z, a, b, 0, 2)
- // simd.LLHL -> z = CSC(x, y, c&3, c&3, d, d); CSC(x, z, a, b, 0, 2)
- // simd.HHLH -> z = CSC(x, y, c, c, d&3, d&3); CSC(y, z, a&3, b&3, 0, 2)
- // simd.HHHL -> z = CSC(x, y, c&3, c&3, d, d); CSC(y, z, a&3, b&3, 0, 2)
-
- // simd.LHLL -> z = CSC(x, y, a, a, b&3, b&3); CSC(z, x, 0, 2, c, d)
- // etc
-
- // simd.LHLH -> z = CSC(x, y, a, c, b&3, d&3); CSC(z, z, 0, 2, 1, 3)
- // simd.HLHL -> z = CSC(x, y, b, d, a&3, c&3); CSC(z, z, 2, 0, 3, 1)
-
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case simd.LLLL:
- return x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, c, d), x)
- case simd.HHHH:
- return y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, c, d), y)
- case simd.LLHH:
- return x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, c, d), y)
- case simd.HHLL:
- return y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, c, d), x)
-
- case simd.HLLL:
- z := y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, a, b, b), x)
- return z.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(0, 2, c, d), x)
- case simd.LHLL:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, a, b, b), y)
- return z.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(0, 2, c, d), x)
-
- case simd.HLHH:
- z := y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, a, b, b), x)
- return z.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(0, 2, c, d), y)
- case simd.LHHH:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, a, b, b), y)
- return z.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(0, 2, c, d), y)
-
- case simd.LLLH:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(c, c, d, d), y)
- return x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.LLHL:
- z := y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(c, c, d, d), x)
- return x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.HHLH:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(c, c, d, d), y)
- return y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, 0, 2), z)
- case simd.HHHL:
- z := y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(c, c, d, d), x)
- return y.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, b, 0, 2), z)
-
- case simd.LHLH:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, c, b, d), y)
- return z.ExportTestConcatSelectedConstantGrouped(0b11_01_10_00 /* =simd.ExportTestCscImm4(0, 2, 1, 3) */, z)
- case simd.HLHL:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(b, d, a, c), y)
- return z.ExportTestConcatSelectedConstantGrouped(0b01_11_00_10 /* =simd.ExportTestCscImm4(2, 0, 3, 1) */, z)
- case simd.HLLH:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(b, c, a, d), y)
- return z.ExportTestConcatSelectedConstantGrouped(0b11_01_00_10 /* =simd.ExportTestCscImm4(2, 0, 1, 3) */, z)
- case simd.LHHL:
- z := x.ExportTestConcatSelectedConstantGrouped(simd.ExportTestCscImm4(a, d, b, c), y)
- return z.ExportTestConcatSelectedConstantGrouped(0b01_11_10_00 /* =simd.ExportTestCscImm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd
-
-// These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped.
-// L means the element comes from the 'x' vector (Low), and
-// H means it comes from the 'y' vector (High).
-// The order of the letters corresponds to elements a, b, c, d.
-// The underlying integer value is a bitmask where:
-// Bit 0: Source of element 'a' (0 for x, 1 for y)
-// Bit 1: Source of element 'b' (0 for x, 1 for y)
-// Bit 2: Source of element 'c' (0 for x, 1 for y)
-// Bit 3: Source of element 'd' (0 for x, 1 for y)
-// Note that the least-significant bit is on the LEFT in this encoding.
-const (
- _LLLL = iota // a:x, b:x, c:x, d:x
- _HLLL // a:y, b:x, c:x, d:x
- _LHLL // a:x, b:y, c:x, d:x
- _HHLL // a:y, b:y, c:x, d:x
- _LLHL // a:x, b:x, c:y, d:x
- _HLHL // a:y, b:x, c:y, d:x
- _LHHL // a:x, b:y, c:y, d:x
- _HHHL // a:y, b:y, c:y, d:x
- _LLLH // a:x, b:x, c:x, d:y
- _HLLH // a:y, b:x, c:x, d:y
- _LHLH // a:x, b:y, c:x, d:y
- _HHLH // a:y, b:y, c:x, d:y
- _LLHH // a:x, b:x, c:y, d:y
- _HLHH // a:y, b:x, c:y, d:y
- _LHHH // a:x, b:y, c:y, d:y
- _HHHH // a:y, b:y, c:y, d:y
-)
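-
-// For example, _HLLH (0b1001 = 9) selects element a from y, elements b and
-// c from x, and element d from y.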
-
-// These constants represent the source pattern for the four parameters
-// (a, b, c, d) passed to SelectFromPair and SelectFromPairGrouped for
-// two-element vectors.
-const (
- _LL = iota
- _HL
- _LH
- _HH
-)
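-
-// For example, _HL means element a comes from y and element b comes from x.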
-
-// SelectFromPair returns the selection of four elements from the two
-// vectors x and y, where selector values in the range 0-3 specify
-// elements from x and values in the range 4-7 specify the 0-3 elements
-// of y. When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise it
-// requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x4) SelectFromPair(a, b, c, d uint8, y Int32x4) Int32x4 {
- // pattern gets the concatenation of "x or y?" bits
- // (0 == x, 1 == y)
- // This will determine operand choice/order and whether a second
- // instruction is needed.
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
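-	// For example, (a,b,c,d) = (5,1,2,6) has "from y" bits 1,0,0,1, so
-	// pattern = 1 + 0 + 0 + 8 = 9 = _HLLH.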
-
- // a-d are masked down to their offsets within x or y
- // this is not necessary for x, but this is easier on the
- // eyes and reduces the risk of an error now or later.
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
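-
-// A minimal usage sketch (LoadInt32x4Slice and StoreSlice are the helpers
-// used elsewhere in this package's tests):
-//
-//	x := LoadInt32x4Slice([]int32{1, 2, 4, 8})
-//	y := LoadInt32x4Slice([]int32{9, 25, 49, 81})
-//	z := x.SelectFromPair(2, 3, 5, 7, y) // z is {4, 8, 25, 81}
-//	out := make([]int32, 4)
-//	z.StoreSlice(out)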
-
-// SelectFromPair returns the selection of four elements from the two
-// vectors x and y, where selector values in the range 0-3 specify
-// elements from x and values in the range 4-7 specify the 0-3 elements
-// of y. When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x4) SelectFromPair(a, b, c, d uint8, y Uint32x4) Uint32x4 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPair returns the selection of four elements from the two
-// vectors x and y, where selector values in the range 0-3 specify
-// elements from x and values in the range 4-7 specify the 0-3 elements
-// of y. When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8}.SelectFromPair(2,3,5,7,{9,25,49,81}) returns {4,8,25,81}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x4) SelectFromPair(a, b, c, d uint8, y Float32x4) Float32x4 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
-//
-// returns {4,8,25,81,64,128,169,289}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Int32x8) SelectFromPairGrouped(a, b, c, d uint8, y Int32x8) Int32x8 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
-//
-// returns {4,8,25,81,64,128,169,289}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Uint32x8) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x8) Uint32x8 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two. a is the source index of the least element in the
-// output, and b, c, and d are the indices of the 2nd, 3rd, and 4th
-// elements in the output. For example,
-// {1,2,4,8,16,32,64,128}.SelectFromPairGrouped(2,3,5,7,{9,25,49,81,121,169,225,289})
-//
-// returns {4,8,25,81,64,128,169,289}
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX
-func (x Float32x8) SelectFromPairGrouped(a, b, c, d uint8, y Float32x8) Float32x8 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Int32x16) SelectFromPairGrouped(a, b, c, d uint8, y Int32x16) Int32x16 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Uint32x16) SelectFromPairGrouped(a, b, c, d uint8, y Uint32x16) Uint32x16 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of four elements from x and y,
-// where selector values in the range 0-3 specify elements from x and
-// values in the range 4-7 specify the 0-3 elements of y.
-// When the selectors are constants and the selection can be
-// implemented in a single instruction, it will be, otherwise
-// it requires two.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPS, CPU Feature: AVX512
-func (x Float32x16) SelectFromPairGrouped(a, b, c, d uint8, y Float32x16) Float32x16 {
- pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
-
- a, b, c, d = a&3, b&3, c&3, d&3
-
- switch pattern {
- case _LLLL:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
- case _HHHH:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _LLHH:
- return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
- case _HHLL:
- return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
-
- case _HLLL:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
- case _LHLL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
-
- case _HLHH:
- z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
- case _LHHH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
- return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
-
- case _LLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _LLHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
- case _HHHL:
- z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
- return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
-
- case _LHLH:
- z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
- case _HLHL:
- z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
- case _HLLH:
- z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
- return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
- case _LHHL:
- z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
- return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// cscimm4 converts the 4 vector element indices into a single
-// uint8 for use as an immediate.
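-// For example, cscimm4(1, 0, 3, 2) == 0b10_11_00_01: the low two bits
-// select element 1 for the first output slot, the next two bits select
-// element 0, and so on.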
-func cscimm4(a, b, c, d uint8) uint8 {
- return uint8(a + b<<2 + c<<4 + d<<6)
-}
-
-// cscimm2 converts the 2 vector element indices into a single
-// uint8 for use as an immediate.
-func cscimm2(a, b uint8) uint8 {
- return uint8(a + b<<1)
-}
-
-// cscimm2g2 converts the 2 vector element indices into a single
-// uint8 for use as an immediate, but duplicated for VSHUFPD
-// to emulate grouped behavior of VSHUFPS
-func cscimm2g2(a, b uint8) uint8 {
- g := cscimm2(a, b)
- return g + g<<2
-}
-
-// cscimm2g4 converts the 2 vector element indices into a single
-// uint8 for use as an immediate, but with four copies for VSHUFPD
-// to emulate grouped behavior of VSHUFPS
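-// For example, cscimm2g2(1, 0) == 0b01_01 and cscimm2g4(1, 0) == 0b01_01_01_01,
-// repeating the two selector bits once per 128-bit group.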
-func cscimm2g4(a, b uint8) uint8 {
- g := cscimm2g2(a, b)
- return g + g<<4
-}
-
-// SelectFromPair returns the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
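-// For example, {3, 5}.SelectFromPair(1, 2, {7, 11}) returns {5, 7}.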
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x2) SelectFromPair(a, b uint8, y Uint64x2) Uint64x2 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstant(cscimm2(a, b), x)
- case _HH:
- return y.concatSelectedConstant(cscimm2(a, b), y)
- case _LH:
- return x.concatSelectedConstant(cscimm2(a, b), y)
- case _HL:
- return y.concatSelectedConstant(cscimm2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Uint64x4) SelectFromPairGrouped(a, b uint8, y Uint64x4) Uint64x4 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Uint64x8) SelectFromPairGrouped(a, b uint8, y Uint64x8) Uint64x8 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPair returns the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x2) SelectFromPair(a, b uint8, y Float64x2) Float64x2 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstant(cscimm2(a, b), x)
- case _HH:
- return y.concatSelectedConstant(cscimm2(a, b), y)
- case _LH:
- return x.concatSelectedConstant(cscimm2(a, b), y)
- case _HL:
- return y.concatSelectedConstant(cscimm2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Float64x4) SelectFromPairGrouped(a, b uint8, y Float64x4) Float64x4 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Float64x8) SelectFromPairGrouped(a, b uint8, y Float64x8) Float64x8 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPair returns the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x2) SelectFromPair(a, b uint8, y Int64x2) Int64x2 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstant(cscimm2(a, b), x)
- case _HH:
- return y.concatSelectedConstant(cscimm2(a, b), y)
- case _LH:
- return x.concatSelectedConstant(cscimm2(a, b), y)
- case _HL:
- return y.concatSelectedConstant(cscimm2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the two 128-bit halves of
-// the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify the 0-1 elements
-// of y. When the selectors are constants the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX
-func (x Int64x4) SelectFromPairGrouped(a, b uint8, y Int64x4) Int64x4 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g2(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g2(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
-// SelectFromPairGrouped returns, for each of the four 128-bit subvectors
-// of the vectors x and y, the selection of two elements from the two
-// vectors x and y, where selector values in the range 0-1 specify
-// elements from x and values in the range 2-3 specify elements 0-1
-// of y. When the selectors are constants, the selection can be
-// implemented in a single instruction.
-//
-// If the selectors are not constant this will translate to a function
-// call.
-//
-// Asm: VSHUFPD, CPU Feature: AVX512
-func (x Int64x8) SelectFromPairGrouped(a, b uint8, y Int64x8) Int64x8 {
- pattern := (a&2)>>1 + (b & 2)
-
- a, b = a&1, b&1
-
- switch pattern {
- case _LL:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- case _HH:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _LH:
- return x.concatSelectedConstantGrouped(cscimm2g4(a, b), y)
- case _HL:
- return y.concatSelectedConstantGrouped(cscimm2g4(a, b), x)
- }
- panic("missing case, switch should be exhaustive")
-}
-
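// Editor's sketch (not part of the original diff): minimal use of the
// constant-selector fast path documented above, assuming GOEXPERIMENT=simd
// and `import "simd"` as laid out in this tree. With literal selectors the
// call lowers to a single VSHUFPD; with variable selectors it takes the Go
// fallback shown above.
func selectFromPairDemo(x, y simd.Int64x2) (simd.Int64x2, simd.Int64x2) {
	lo := x.SelectFromPair(1, 2, y) // {x[1], y[0]}
	hi := x.SelectFromPair(0, 3, y) // {x[0], y[1]}
	return lo, hi
}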
-/* PermuteScalars */
-
-// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table may be generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Int32x4) PermuteScalars(a, b, c, d uint8) Int32x4 {
- return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalars performs a permutation of vector x's elements using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table may be generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX
-func (x Uint32x4) PermuteScalars(a, b, c, d uint8) Uint32x4 {
- return x.permuteScalars(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-/* PermuteScalarsGrouped */
-
-// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table may be generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Int32x8) PermuteScalarsGrouped(a, b, c, d uint8) Int32x8 {
- return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table may be generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Int32x16) PermuteScalarsGrouped(a, b, c, d uint8) Int32x16 {
- return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX2
-func (x Uint32x8) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x8 {
- return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// { x[a], x[b], x[c], x[d], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[a+12], x[b+12], x[c+12], x[d+12]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFD, CPU Feature: AVX512
-func (x Uint32x16) PermuteScalarsGrouped(a, b, c, d uint8) Uint32x16 {
- return x.permuteScalarsGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
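// Editor's sketch (not part of the original diff): reversing the lanes of an
// Int32x4 with the PermuteScalars API documented above, assuming
// GOEXPERIMENT=simd and `import "simd"`. Because the indices are constants,
// this lowers to a single VPSHUFD.
func reverseInt32x4(x simd.Int32x4) simd.Int32x4 {
	// result = {x[3], x[2], x[1], x[0]}
	return x.PermuteScalars(3, 2, 1, 0)
}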
-/* PermuteScalarsHi */
-
-// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
-//
-// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x8) PermuteScalarsHi(a, b, c, d uint8) Int16x8 {
- return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsHi performs a permutation of vector x using the supplied indices:
-//
-// result = {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x8) PermuteScalarsHi(a, b, c, d uint8) Uint16x8 {
- return x.permuteScalarsHi(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-/* PermuteScalarsHiGrouped */
-
-// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Int16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x16 {
- return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
-// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
-// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Int16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Int16x32 {
- return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12]}
-//
-// Each group is 128 bits in size.
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX2
-func (x Uint16x16) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x16 {
- return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsHiGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// { x[0], x[1], x[2], x[3], x[a+4], x[b+4], x[c+4], x[d+4],
-// x[8], x[9], x[10], x[11], x[a+12], x[b+12], x[c+12], x[d+12],
-// x[16], x[17], x[18], x[19], x[a+20], x[b+20], x[c+20], x[d+20],
-// x[24], x[25], x[26], x[27], x[a+28], x[b+28], x[c+28], x[d+28]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFHW, CPU Feature: AVX512
-func (x Uint16x32) PermuteScalarsHiGrouped(a, b, c, d uint8) Uint16x32 {
- return x.permuteScalarsHiGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-/* PermuteScalarsLo */
-
-// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Int16x8) PermuteScalarsLo(a, b, c, d uint8) Int16x8 {
- return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsLo performs a permutation of vector x using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Uint16x8) PermuteScalarsLo(a, b, c, d uint8) Uint16x8 {
- return x.permuteScalarsLo(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
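// Editor's sketch (not part of the original diff): PermuteScalarsLo and
// PermuteScalarsHi each leave the other 4-lane half untouched, so chaining
// them reverses both halves of an Int16x8 independently. Assumes
// GOEXPERIMENT=simd and `import "simd"`.
func reverseHalvesInt16x8(x simd.Int16x8) simd.Int16x8 {
	// After Lo: {x[3], x[2], x[1], x[0], x[4], x[5], x[6], x[7]}
	// After Hi: {x[3], x[2], x[1], x[0], x[7], x[6], x[5], x[4]}
	return x.PermuteScalarsLo(3, 2, 1, 0).PermuteScalarsHi(3, 2, 1, 0)
}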
-/* PermuteScalarsLoGrouped */
-
-// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX2
-func (x Int16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x16 {
- return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
-// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
-// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Int16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Int16x32 {
- return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result = {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15]}
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX2
-func (x Uint16x16) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x16 {
- return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// PermuteScalarsLoGrouped performs a grouped permutation of vector x using the supplied indices:
-//
-// result =
-// {x[a], x[b], x[c], x[d], x[4], x[5], x[6], x[7],
-// x[a+8], x[b+8], x[c+8], x[d+8], x[12], x[13], x[14], x[15],
-// x[a+16], x[b+16], x[c+16], x[d+16], x[20], x[21], x[22], x[23],
-// x[a+24], x[b+24], x[c+24], x[d+24], x[28], x[29], x[30], x[31]}
-//
-// Each group is 128 bits in size.
-//
-// Parameters a,b,c,d should have values between 0 and 3.
-// If a through d are constants, then an instruction will be inlined, otherwise
-// a jump table is generated.
-//
-// Asm: VPSHUFLW, CPU Feature: AVX512
-func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
- return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6)
-}
-
-// CarrylessMultiply computes one of four possible carryless
-// multiplications of selected high and low halves of x and y,
-// depending on the values of a and b, returning the 128-bit
-// product as the two concatenated elements of the result.
-// a selects the low (0) or high (1) element of x and
-// b selects the low (0) or high (1) element of y.
-//
-// A carryless multiplication uses bitwise XOR instead of
-// add-with-carry, for example (in base two):
-// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
-//
-// This also models multiplication of polynomials with coefficients
-// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
-// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
-// polynomial terms, but coefficients "add" with XOR.)
-//
-// Constant values of a and b result in better performance;
-// otherwise the intrinsic may translate into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX
-func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
- return x.carrylessMultiply(a&1+((b&1)<<4), y)
-}
-
-// CarrylessMultiplyGrouped computes one of four possible carryless
-// multiplications of selected high and low halves of each of the two
-// 128-bit lanes of x and y, depending on the values of a and b,
-// and returns the two 128-bit products in the result's lanes.
-// a selects the low (0) or high (1) elements of x's lanes and
-// b selects the low (0) or high (1) elements of y's lanes.
-//
-// A carryless multiplication uses bitwise XOR instead of
-// add-with-carry, for example (in base two):
-// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
-//
-// This also models multiplication of polynomials with coefficients
-// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
-// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
-// polynomial terms, but coefficients "add" with XOR.)
-//
-// Constant values of a and b result in better performance;
-// otherwise the intrinsic may translate into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
-func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
- return x.carrylessMultiply(a&1+((b&1)<<4), y)
-}
-
-// CarrylessMultiplyGrouped computes one of four possible carryless
-// multiplications of selected high and low halves of each of the four
-// 128-bit lanes of x and y, depending on the values of a and b,
-// and returns the four 128-bit products in the result's lanes.
-// a selects the low (0) or high (1) elements of x's lanes and
-// b selects the low (0) or high (1) elements of y's lanes.
-//
-// A carryless multiplication uses bitwise XOR instead of
-// add-with-carry, for example (in base two):
-// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
-//
-// This also models multiplication of polynomials with coefficients
-// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
-// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
-// polynomial terms, but coefficients "add" with XOR.)
-//
-// Constant values of a and b result in better performance;
-// otherwise the intrinsic may translate into a jump table.
-//
-// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
-func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
- return x.carrylessMultiply(a&1+((b&1)<<4), y)
-}
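// Editor's sketch (not part of the original diff): a scalar, plain-Go
// reference for the carryless (XOR) multiplication described above, handy
// for cross-checking the VPCLMULQDQ-backed methods. hi and lo are the high
// and low 64 bits of the 128-bit product; clmul64(3, 3) yields lo == 5,
// matching the base-two example 11 * 11 = 101 in the comments.
func clmul64(x, y uint64) (hi, lo uint64) {
	for i := uint(0); i < 64; i++ {
		if y&(1<<i) != 0 {
			lo ^= x << i
			if i != 0 {
				hi ^= x >> (64 - i)
			}
		}
	}
	return hi, lo
}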
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-import "unsafe"
-
-// LoadInt8x16Slice loads an Int8x16 from a slice of at least 16 int8s
-func LoadInt8x16Slice(s []int8) Int8x16 {
- return LoadInt8x16((*[16]int8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 int8s
-func (x Int8x16) StoreSlice(s []int8) {
- x.Store((*[16]int8)(s))
-}
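// Editor's sketch (not part of the original diff): the slice forms are thin
// wrappers over the array-pointer Load/Store, so a typical round trip looks
// like the function below. It assumes GOEXPERIMENT=simd, `import "simd"`, an
// element-wise Add method as provided elsewhere in this package, and slices
// of at least 16 elements (shorter slices panic in the slice-to-array-pointer
// conversion; the SlicePart variants later in this file handle ragged tails).
func addInt8Slices(dst, a, b []int8) {
	va := simd.LoadInt8x16Slice(a)
	vb := simd.LoadInt8x16Slice(b)
	va.Add(vb).StoreSlice(dst)
}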
-
-// LoadInt16x8Slice loads an Int16x8 from a slice of at least 8 int16s
-func LoadInt16x8Slice(s []int16) Int16x8 {
- return LoadInt16x8((*[8]int16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 int16s
-func (x Int16x8) StoreSlice(s []int16) {
- x.Store((*[8]int16)(s))
-}
-
-// LoadInt32x4Slice loads an Int32x4 from a slice of at least 4 int32s
-func LoadInt32x4Slice(s []int32) Int32x4 {
- return LoadInt32x4((*[4]int32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 int32s
-func (x Int32x4) StoreSlice(s []int32) {
- x.Store((*[4]int32)(s))
-}
-
-// LoadInt64x2Slice loads an Int64x2 from a slice of at least 2 int64s
-func LoadInt64x2Slice(s []int64) Int64x2 {
- return LoadInt64x2((*[2]int64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 2 int64s
-func (x Int64x2) StoreSlice(s []int64) {
- x.Store((*[2]int64)(s))
-}
-
-// LoadUint8x16Slice loads an Uint8x16 from a slice of at least 16 uint8s
-func LoadUint8x16Slice(s []uint8) Uint8x16 {
- return LoadUint8x16((*[16]uint8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 uint8s
-func (x Uint8x16) StoreSlice(s []uint8) {
- x.Store((*[16]uint8)(s))
-}
-
-// LoadUint16x8Slice loads an Uint16x8 from a slice of at least 8 uint16s
-func LoadUint16x8Slice(s []uint16) Uint16x8 {
- return LoadUint16x8((*[8]uint16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 uint16s
-func (x Uint16x8) StoreSlice(s []uint16) {
- x.Store((*[8]uint16)(s))
-}
-
-// LoadUint32x4Slice loads an Uint32x4 from a slice of at least 4 uint32s
-func LoadUint32x4Slice(s []uint32) Uint32x4 {
- return LoadUint32x4((*[4]uint32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 uint32s
-func (x Uint32x4) StoreSlice(s []uint32) {
- x.Store((*[4]uint32)(s))
-}
-
-// LoadUint64x2Slice loads an Uint64x2 from a slice of at least 2 uint64s
-func LoadUint64x2Slice(s []uint64) Uint64x2 {
- return LoadUint64x2((*[2]uint64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 2 uint64s
-func (x Uint64x2) StoreSlice(s []uint64) {
- x.Store((*[2]uint64)(s))
-}
-
-// LoadFloat32x4Slice loads a Float32x4 from a slice of at least 4 float32s
-func LoadFloat32x4Slice(s []float32) Float32x4 {
- return LoadFloat32x4((*[4]float32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 float32s
-func (x Float32x4) StoreSlice(s []float32) {
- x.Store((*[4]float32)(s))
-}
-
-// LoadFloat64x2Slice loads a Float64x2 from a slice of at least 2 float64s
-func LoadFloat64x2Slice(s []float64) Float64x2 {
- return LoadFloat64x2((*[2]float64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 2 float64s
-func (x Float64x2) StoreSlice(s []float64) {
- x.Store((*[2]float64)(s))
-}
-
-// LoadInt8x32Slice loads an Int8x32 from a slice of at least 32 int8s
-func LoadInt8x32Slice(s []int8) Int8x32 {
- return LoadInt8x32((*[32]int8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 32 int8s
-func (x Int8x32) StoreSlice(s []int8) {
- x.Store((*[32]int8)(s))
-}
-
-// LoadInt16x16Slice loads an Int16x16 from a slice of at least 16 int16s
-func LoadInt16x16Slice(s []int16) Int16x16 {
- return LoadInt16x16((*[16]int16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 int16s
-func (x Int16x16) StoreSlice(s []int16) {
- x.Store((*[16]int16)(s))
-}
-
-// LoadInt32x8Slice loads an Int32x8 from a slice of at least 8 int32s
-func LoadInt32x8Slice(s []int32) Int32x8 {
- return LoadInt32x8((*[8]int32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 int32s
-func (x Int32x8) StoreSlice(s []int32) {
- x.Store((*[8]int32)(s))
-}
-
-// LoadInt64x4Slice loads an Int64x4 from a slice of at least 4 int64s
-func LoadInt64x4Slice(s []int64) Int64x4 {
- return LoadInt64x4((*[4]int64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 int64s
-func (x Int64x4) StoreSlice(s []int64) {
- x.Store((*[4]int64)(s))
-}
-
-// LoadUint8x32Slice loads an Uint8x32 from a slice of at least 32 uint8s
-func LoadUint8x32Slice(s []uint8) Uint8x32 {
- return LoadUint8x32((*[32]uint8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 32 uint8s
-func (x Uint8x32) StoreSlice(s []uint8) {
- x.Store((*[32]uint8)(s))
-}
-
-// LoadUint16x16Slice loads an Uint16x16 from a slice of at least 16 uint16s
-func LoadUint16x16Slice(s []uint16) Uint16x16 {
- return LoadUint16x16((*[16]uint16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 uint16s
-func (x Uint16x16) StoreSlice(s []uint16) {
- x.Store((*[16]uint16)(s))
-}
-
-// LoadUint32x8Slice loads an Uint32x8 from a slice of at least 8 uint32s
-func LoadUint32x8Slice(s []uint32) Uint32x8 {
- return LoadUint32x8((*[8]uint32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 uint32s
-func (x Uint32x8) StoreSlice(s []uint32) {
- x.Store((*[8]uint32)(s))
-}
-
-// LoadUint64x4Slice loads an Uint64x4 from a slice of at least 4 uint64s
-func LoadUint64x4Slice(s []uint64) Uint64x4 {
- return LoadUint64x4((*[4]uint64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 uint64s
-func (x Uint64x4) StoreSlice(s []uint64) {
- x.Store((*[4]uint64)(s))
-}
-
-// LoadFloat32x8Slice loads a Float32x8 from a slice of at least 8 float32s
-func LoadFloat32x8Slice(s []float32) Float32x8 {
- return LoadFloat32x8((*[8]float32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 float32s
-func (x Float32x8) StoreSlice(s []float32) {
- x.Store((*[8]float32)(s))
-}
-
-// LoadFloat64x4Slice loads a Float64x4 from a slice of at least 4 float64s
-func LoadFloat64x4Slice(s []float64) Float64x4 {
- return LoadFloat64x4((*[4]float64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 4 float64s
-func (x Float64x4) StoreSlice(s []float64) {
- x.Store((*[4]float64)(s))
-}
-
-// LoadInt8x64Slice loads an Int8x64 from a slice of at least 64 int8s
-func LoadInt8x64Slice(s []int8) Int8x64 {
- return LoadInt8x64((*[64]int8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 64 int8s
-func (x Int8x64) StoreSlice(s []int8) {
- x.Store((*[64]int8)(s))
-}
-
-// LoadInt16x32Slice loads an Int16x32 from a slice of at least 32 int16s
-func LoadInt16x32Slice(s []int16) Int16x32 {
- return LoadInt16x32((*[32]int16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 32 int16s
-func (x Int16x32) StoreSlice(s []int16) {
- x.Store((*[32]int16)(s))
-}
-
-// LoadInt32x16Slice loads an Int32x16 from a slice of at least 16 int32s
-func LoadInt32x16Slice(s []int32) Int32x16 {
- return LoadInt32x16((*[16]int32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 int32s
-func (x Int32x16) StoreSlice(s []int32) {
- x.Store((*[16]int32)(s))
-}
-
-// LoadInt64x8Slice loads an Int64x8 from a slice of at least 8 int64s
-func LoadInt64x8Slice(s []int64) Int64x8 {
- return LoadInt64x8((*[8]int64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 int64s
-func (x Int64x8) StoreSlice(s []int64) {
- x.Store((*[8]int64)(s))
-}
-
-// LoadUint8x64Slice loads an Uint8x64 from a slice of at least 64 uint8s
-func LoadUint8x64Slice(s []uint8) Uint8x64 {
- return LoadUint8x64((*[64]uint8)(s))
-}
-
-// StoreSlice stores x into a slice of at least 64 uint8s
-func (x Uint8x64) StoreSlice(s []uint8) {
- x.Store((*[64]uint8)(s))
-}
-
-// LoadUint16x32Slice loads an Uint16x32 from a slice of at least 32 uint16s
-func LoadUint16x32Slice(s []uint16) Uint16x32 {
- return LoadUint16x32((*[32]uint16)(s))
-}
-
-// StoreSlice stores x into a slice of at least 32 uint16s
-func (x Uint16x32) StoreSlice(s []uint16) {
- x.Store((*[32]uint16)(s))
-}
-
-// LoadUint32x16Slice loads an Uint32x16 from a slice of at least 16 uint32s
-func LoadUint32x16Slice(s []uint32) Uint32x16 {
- return LoadUint32x16((*[16]uint32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 uint32s
-func (x Uint32x16) StoreSlice(s []uint32) {
- x.Store((*[16]uint32)(s))
-}
-
-// LoadUint64x8Slice loads an Uint64x8 from a slice of at least 8 uint64s
-func LoadUint64x8Slice(s []uint64) Uint64x8 {
- return LoadUint64x8((*[8]uint64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 uint64s
-func (x Uint64x8) StoreSlice(s []uint64) {
- x.Store((*[8]uint64)(s))
-}
-
-// LoadFloat32x16Slice loads a Float32x16 from a slice of at least 16 float32s
-func LoadFloat32x16Slice(s []float32) Float32x16 {
- return LoadFloat32x16((*[16]float32)(s))
-}
-
-// StoreSlice stores x into a slice of at least 16 float32s
-func (x Float32x16) StoreSlice(s []float32) {
- x.Store((*[16]float32)(s))
-}
-
-// LoadFloat64x8Slice loads a Float64x8 from a slice of at least 8 float64s
-func LoadFloat64x8Slice(s []float64) Float64x8 {
- return LoadFloat64x8((*[8]float64)(s))
-}
-
-// StoreSlice stores x into a slice of at least 8 float64s
-func (x Float64x8) StoreSlice(s []float64) {
- x.Store((*[8]float64)(s))
-}
-
-// LoadInt8x64SlicePart loads an Int8x64 from the slice s.
-// If s has fewer than 64 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 64 or more elements, the function is equivalent to LoadInt8x64Slice.
-func LoadInt8x64SlicePart(s []int8) Int8x64 {
- l := len(s)
- if l >= 64 {
- return LoadInt8x64Slice(s)
- }
- if l == 0 {
- var x Int8x64
- return x
- }
- mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
- return LoadMaskedInt8x64(paInt8x64(s), mask)
-}
-
-// StoreSlicePart stores the 64 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 64 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int8x64) StoreSlicePart(s []int8) {
- l := len(s)
- if l >= 64 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
- x.StoreMasked(paInt8x64(s), mask)
-}
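// Editor's note (not part of the original diff): the mask expression used
// above sets exactly the first l bits, so the masked load fills lanes 0..l-1
// and leaves the rest zero, and the masked store writes only those lanes.
// A plain-Go statement of that identity:
func maskForFirst(l uint) uint64 { // valid for 1 <= l <= 64
	return 0xffffffffffffffff >> (64 - l)
}
// maskForFirst(3) == 0b111; maskForFirst(64) == ^uint64(0).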
-
-// LoadInt16x32SlicePart loads an Int16x32 from the slice s.
-// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 32 or more elements, the function is equivalent to LoadInt16x32Slice.
-func LoadInt16x32SlicePart(s []int16) Int16x32 {
- l := len(s)
- if l >= 32 {
- return LoadInt16x32Slice(s)
- }
- if l == 0 {
- var x Int16x32
- return x
- }
- mask := Mask16x32FromBits(0xffffffff >> (32 - l))
- return LoadMaskedInt16x32(paInt16x32(s), mask)
-}
-
-// StoreSlicePart stores the 32 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int16x32) StoreSlicePart(s []int16) {
- l := len(s)
- if l >= 32 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask16x32FromBits(0xffffffff >> (32 - l))
- x.StoreMasked(paInt16x32(s), mask)
-}
-
-// LoadInt32x16SlicePart loads an Int32x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadInt32x16Slice.
-func LoadInt32x16SlicePart(s []int32) Int32x16 {
- l := len(s)
- if l >= 16 {
- return LoadInt32x16Slice(s)
- }
- if l == 0 {
- var x Int32x16
- return x
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- return LoadMaskedInt32x16(paInt32x16(s), mask)
-}
-
-// StoreSlicePart stores the 16 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int32x16) StoreSlicePart(s []int32) {
- l := len(s)
- if l >= 16 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- x.StoreMasked(paInt32x16(s), mask)
-}
-
-// LoadInt64x8SlicePart loads an Int64x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadInt64x8Slice.
-func LoadInt64x8SlicePart(s []int64) Int64x8 {
- l := len(s)
- if l >= 8 {
- return LoadInt64x8Slice(s)
- }
- if l == 0 {
- var x Int64x8
- return x
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- return LoadMaskedInt64x8(paInt64x8(s), mask)
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int64x8) StoreSlicePart(s []int64) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- x.StoreMasked(paInt64x8(s), mask)
-}
-
-// LoadUint8x64SlicePart loads a Uint8x64 from the slice s.
-// If s has fewer than 64 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 64 or more elements, the function is equivalent to LoadUint8x64Slice.
-func LoadUint8x64SlicePart(s []uint8) Uint8x64 {
- l := len(s)
- if l >= 64 {
- return LoadUint8x64Slice(s)
- }
- if l == 0 {
- var x Uint8x64
- return x
- }
- mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
- return LoadMaskedUint8x64(paUint8x64(s), mask)
-}
-
-// StoreSlicePart stores the 64 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 64 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint8x64) StoreSlicePart(s []uint8) {
- l := len(s)
- if l >= 64 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask8x64FromBits(0xffffffffffffffff >> (64 - l))
- x.StoreMasked(paUint8x64(s), mask)
-}
-
-// LoadUint16x32SlicePart loads a Uint16x32 from the slice s.
-// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 32 or more elements, the function is equivalent to LoadUint16x32Slice.
-func LoadUint16x32SlicePart(s []uint16) Uint16x32 {
- l := len(s)
- if l >= 32 {
- return LoadUint16x32Slice(s)
- }
- if l == 0 {
- var x Uint16x32
- return x
- }
- mask := Mask16x32FromBits(0xffffffff >> (32 - l))
- return LoadMaskedUint16x32(paUint16x32(s), mask)
-}
-
-// StoreSlicePart stores the 32 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint16x32) StoreSlicePart(s []uint16) {
- l := len(s)
- if l >= 32 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask16x32FromBits(0xffffffff >> (32 - l))
- x.StoreMasked(paUint16x32(s), mask)
-}
-
-// LoadUint32x16SlicePart loads a Uint32x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadUint32x16Slice.
-func LoadUint32x16SlicePart(s []uint32) Uint32x16 {
- l := len(s)
- if l >= 16 {
- return LoadUint32x16Slice(s)
- }
- if l == 0 {
- var x Uint32x16
- return x
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- return LoadMaskedUint32x16(paUint32x16(s), mask)
-}
-
-// StoreSlicePart stores the 16 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint32x16) StoreSlicePart(s []uint32) {
- l := len(s)
- if l >= 16 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- x.StoreMasked(paUint32x16(s), mask)
-}
-
-// LoadUint64x8SlicePart loads a Uint64x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadUint64x8Slice.
-func LoadUint64x8SlicePart(s []uint64) Uint64x8 {
- l := len(s)
- if l >= 8 {
- return LoadUint64x8Slice(s)
- }
- if l == 0 {
- var x Uint64x8
- return x
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- return LoadMaskedUint64x8(paUint64x8(s), mask)
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint64x8) StoreSlicePart(s []uint64) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- x.StoreMasked(paUint64x8(s), mask)
-}
-
-// LoadFloat32x16SlicePart loads a Float32x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadFloat32x16Slice.
-func LoadFloat32x16SlicePart(s []float32) Float32x16 {
- l := len(s)
- if l >= 16 {
- return LoadFloat32x16Slice(s)
- }
- if l == 0 {
- var x Float32x16
- return x
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- return LoadMaskedFloat32x16(paFloat32x16(s), mask)
-}
-
-// StoreSlicePart stores the 16 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float32x16) StoreSlicePart(s []float32) {
- l := len(s)
- if l >= 16 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask32x16FromBits(0xffff >> (16 - l))
- x.StoreMasked(paFloat32x16(s), mask)
-}
-
-// LoadFloat64x8SlicePart loads a Float64x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadFloat64x8Slice.
-func LoadFloat64x8SlicePart(s []float64) Float64x8 {
- l := len(s)
- if l >= 8 {
- return LoadFloat64x8Slice(s)
- }
- if l == 0 {
- var x Float64x8
- return x
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- return LoadMaskedFloat64x8(paFloat64x8(s), mask)
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float64x8) StoreSlicePart(s []float64) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := Mask64x8FromBits(0xff >> (8 - l))
- x.StoreMasked(paFloat64x8(s), mask)
-}
-
-// LoadInt32x4SlicePart loads an Int32x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadInt32x4Slice.
-func LoadInt32x4SlicePart(s []int32) Int32x4 {
- l := len(s)
- if l >= 4 {
- return LoadInt32x4Slice(s)
- }
- if l == 0 {
- var x Int32x4
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedInt32x4(paInt32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int32x4) StoreSlicePart(s []int32) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paInt32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// LoadInt64x2SlicePart loads an Int64x2 from the slice s.
-// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 2 or more elements, the function is equivalent to LoadInt64x2Slice.
-func LoadInt64x2SlicePart(s []int64) Int64x2 {
- l := len(s)
- if l >= 2 {
- return LoadInt64x2Slice(s)
- }
- if l == 0 {
- var x Int64x2
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedInt64x2(paInt64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 2 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int64x2) StoreSlicePart(s []int64) {
- l := len(s)
- if l >= 2 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paInt64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// LoadUint32x4SlicePart loads a Uint32x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadUint32x4Slice.
-func LoadUint32x4SlicePart(s []uint32) Uint32x4 {
- l := len(s)
- if l >= 4 {
- return LoadUint32x4Slice(s)
- }
- if l == 0 {
- var x Uint32x4
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedUint32x4(paUint32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint32x4) StoreSlicePart(s []uint32) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paUint32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// LoadUint64x2SlicePart loads a Uint64x2 from the slice s.
-// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 2 or more elements, the function is equivalent to LoadUint64x2Slice.
-func LoadUint64x2SlicePart(s []uint64) Uint64x2 {
- l := len(s)
- if l >= 2 {
- return LoadUint64x2Slice(s)
- }
- if l == 0 {
- var x Uint64x2
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedUint64x2(paUint64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 2 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint64x2) StoreSlicePart(s []uint64) {
- l := len(s)
- if l >= 2 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paUint64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// LoadFloat32x4SlicePart loads a Float32x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadFloat32x4Slice.
-func LoadFloat32x4SlicePart(s []float32) Float32x4 {
- l := len(s)
- if l >= 4 {
- return LoadFloat32x4Slice(s)
- }
- if l == 0 {
- var x Float32x4
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedFloat32x4(paFloat32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float32x4) StoreSlicePart(s []float32) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paFloat32x4(s), LoadInt32x4Slice(mask).asMask())
-}
-
-// LoadFloat64x2SlicePart loads a Float64x2 from the slice s.
-// If s has fewer than 2 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 2 or more elements, the function is equivalent to LoadFloat64x2Slice.
-func LoadFloat64x2SlicePart(s []float64) Float64x2 {
- l := len(s)
- if l >= 2 {
- return LoadFloat64x2Slice(s)
- }
- if l == 0 {
- var x Float64x2
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedFloat64x2(paFloat64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 2 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 2 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float64x2) StoreSlicePart(s []float64) {
- l := len(s)
- if l >= 2 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paFloat64x2(s), LoadInt64x2Slice(mask).asMask())
-}
-
-// LoadInt32x8SlicePart loads an Int32x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadInt32x8Slice.
-func LoadInt32x8SlicePart(s []int32) Int32x8 {
- l := len(s)
- if l >= 8 {
- return LoadInt32x8Slice(s)
- }
- if l == 0 {
- var x Int32x8
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedInt32x8(paInt32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int32x8) StoreSlicePart(s []int32) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paInt32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// LoadInt64x4SlicePart loads an Int64x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadInt64x4Slice.
-func LoadInt64x4SlicePart(s []int64) Int64x4 {
- l := len(s)
- if l >= 4 {
- return LoadInt64x4Slice(s)
- }
- if l == 0 {
- var x Int64x4
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedInt64x4(paInt64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int64x4) StoreSlicePart(s []int64) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paInt64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// LoadUint32x8SlicePart loads a Uint32x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadUint32x8Slice.
-func LoadUint32x8SlicePart(s []uint32) Uint32x8 {
- l := len(s)
- if l >= 8 {
- return LoadUint32x8Slice(s)
- }
- if l == 0 {
- var x Uint32x8
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedUint32x8(paUint32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint32x8) StoreSlicePart(s []uint32) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paUint32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// LoadUint64x4SlicePart loads a Uint64x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadUint64x4Slice.
-func LoadUint64x4SlicePart(s []uint64) Uint64x4 {
- l := len(s)
- if l >= 4 {
- return LoadUint64x4Slice(s)
- }
- if l == 0 {
- var x Uint64x4
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedUint64x4(paUint64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint64x4) StoreSlicePart(s []uint64) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paUint64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// LoadFloat32x8SlicePart loads a Float32x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadFloat32x8Slice.
-func LoadFloat32x8SlicePart(s []float32) Float32x8 {
- l := len(s)
- if l >= 8 {
- return LoadFloat32x8Slice(s)
- }
- if l == 0 {
- var x Float32x8
- return x
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- return LoadMaskedFloat32x8(paFloat32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float32x8) StoreSlicePart(s []float32) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask32[len(vecMask32)/2-l:]
- x.StoreMasked(paFloat32x8(s), LoadInt32x8Slice(mask).asMask())
-}
-
-// LoadFloat64x4SlicePart loads a Float64x4 from the slice s.
-// If s has fewer than 4 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 4 or more elements, the function is equivalent to LoadFloat64x4Slice.
-func LoadFloat64x4SlicePart(s []float64) Float64x4 {
- l := len(s)
- if l >= 4 {
- return LoadFloat64x4Slice(s)
- }
- if l == 0 {
- var x Float64x4
- return x
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- return LoadMaskedFloat64x4(paFloat64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// StoreSlicePart stores the 4 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 4 or more elements, the method is equivalent to x.StoreSlice.
-func (x Float64x4) StoreSlicePart(s []float64) {
- l := len(s)
- if l >= 4 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- mask := vecMask64[len(vecMask64)/2-l:]
- x.StoreMasked(paFloat64x4(s), LoadInt64x4Slice(mask).asMask())
-}
-
-// LoadUint8x16SlicePart loads a Uint8x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadUint8x16Slice.
-func LoadUint8x16SlicePart(s []uint8) Uint8x16 {
- if len(s) == 0 {
- var zero Uint8x16
- return zero
- }
- t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
- return LoadInt8x16SlicePart(t).AsUint8x16()
-}
-
-// StoreSlicePart stores the 16 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint8x16) StoreSlicePart(s []uint8) {
- if len(s) == 0 {
- return
- }
- t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
- x.AsInt8x16().StoreSlicePart(t)
-}
-
-// LoadUint16x8SlicePart loads a Uint16x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadUint16x8Slice.
-func LoadUint16x8SlicePart(s []uint16) Uint16x8 {
- if len(s) == 0 {
- var zero Uint16x8
- return zero
- }
- t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
- return LoadInt16x8SlicePart(t).AsUint16x8()
-}
-
-// StoreSlicePart stores the 8 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint16x8) StoreSlicePart(s []uint16) {
- if len(s) == 0 {
- return
- }
- t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
- x.AsInt16x8().StoreSlicePart(t)
-}
-
-// LoadUint8x32SlicePart loads a Uint8x32 from the slice s.
-// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 32 or more elements, the function is equivalent to LoadUint8x32Slice.
-func LoadUint8x32SlicePart(s []uint8) Uint8x32 {
- if len(s) == 0 {
- var zero Uint8x32
- return zero
- }
- t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
- return LoadInt8x32SlicePart(t).AsUint8x32()
-}
-
-// StoreSlicePart stores the 32 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint8x32) StoreSlicePart(s []uint8) {
- if len(s) == 0 {
- return
- }
- t := unsafe.Slice((*int8)(unsafe.Pointer(&s[0])), len(s))
- x.AsInt8x32().StoreSlicePart(t)
-}
-
-// LoadUint16x16SlicePart loads a Uint16x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadUint16x16Slice.
-func LoadUint16x16SlicePart(s []uint16) Uint16x16 {
- if len(s) == 0 {
- var zero Uint16x16
- return zero
- }
- t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
- return LoadInt16x16SlicePart(t).AsUint16x16()
-}
-
-// StoreSlicePart stores the 16 elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Uint16x16) StoreSlicePart(s []uint16) {
- if len(s) == 0 {
- return
- }
- t := unsafe.Slice((*int16)(unsafe.Pointer(&s[0])), len(s))
- x.AsInt16x16().StoreSlicePart(t)
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd
-
-package simd
-
-import "unsafe"
-
-// Implementation of all the {Int,Uint}{8,16} load and store slice part
-// functions and methods for 128-bit and 256-bit vectors.
-
-/* pointer-punning functions for chunked slice part loads. */
-
-func int16atP8(p *int8) *int16 {
- return (*int16)(unsafe.Pointer(p))
-}
-
-func int32atP8(p *int8) *int32 {
- return (*int32)(unsafe.Pointer(p))
-}
-
-func int64atP8(p *int8) *int64 {
- return (*int64)(unsafe.Pointer(p))
-}
-
-func int32atP16(p *int16) *int32 {
- return (*int32)(unsafe.Pointer(p))
-}
-
-func int64atP16(p *int16) *int64 {
- return (*int64)(unsafe.Pointer(p))
-}
-
-func int64atP32(p *int32) *int64 {
- return (*int64)(unsafe.Pointer(p))
-}
-
-func int32atP64(p *int64) *int32 {
- return (*int32)(unsafe.Pointer(p))
-}
-
-/* These two masks are used by generated code */
-
-var vecMask64 = [16]int64{
- -1, -1, -1, -1,
- -1, -1, -1, -1,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
-}
-
-var vecMask32 = [32]int32{
- -1, -1, -1, -1,
- -1, -1, -1, -1,
- -1, -1, -1, -1,
- -1, -1, -1, -1,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0, 0,
-}
-
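// Editor's note (not part of the original diff): the SlicePart functions for
// the pre-AVX-512 vector sizes build their masks by slicing into these
// half -1 / half 0 arrays. For a partial load of l elements,
// vecMask32[len(vecMask32)/2-l:] begins l entries before the -1/0 boundary,
// so its first l values are -1 (all bits set) and the rest are 0; loading
// that prefix and converting it with asMask selects exactly lanes 0..l-1.
// With l = 3, for example, the window starts {-1, -1, -1, 0, ...}.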
-/* 256-bit int vector loads and stores made from 128-bit parts */
-
-// LoadInt8x32SlicePart loads an Int8x32 from the slice s.
-// If s has fewer than 32 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 32 or more elements, the function is equivalent to LoadInt8x32Slice.
-func LoadInt8x32SlicePart(s []int8) Int8x32 {
- l := len(s)
- if l >= 32 {
- return LoadInt8x32Slice(s)
- }
- var x Int8x32
- if l == 0 {
- return x
- }
- if l > 16 {
- return x.SetLo(LoadInt8x16Slice(s)).SetHi(LoadInt8x16SlicePart(s[16:]))
- } else {
- return x.SetLo(LoadInt8x16SlicePart(s))
- }
-}
-
-// LoadInt16x16SlicePart loads an Int16x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadInt16x16Slice.
-func LoadInt16x16SlicePart(s []int16) Int16x16 {
- l := len(s)
- if l >= 16 {
- return LoadInt16x16Slice(s)
- }
- var x Int16x16
- if l == 0 {
- return x
- }
- if l > 8 {
- return x.SetLo(LoadInt16x8Slice(s)).SetHi(LoadInt16x8SlicePart(s[8:]))
- } else {
- return x.SetLo(LoadInt16x8SlicePart(s))
- }
-}
-
-// StoreSlicePart stores the elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 32 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int8x32) StoreSlicePart(s []int8) {
- l := len(s)
- if l >= 32 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- if l > 16 {
- x.GetLo().StoreSlice(s)
- x.GetHi().StoreSlicePart(s[16:])
- } else { // fits in one
- x.GetLo().StoreSlicePart(s)
- }
-}
-
-// StoreSlicePart stores the elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int16x16) StoreSlicePart(s []int16) {
- l := len(s)
- if l >= 16 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- if l > 8 {
- x.GetLo().StoreSlice(s)
- x.GetHi().StoreSlicePart(s[8:])
- } else { // fits in one
- x.GetLo().StoreSlicePart(s)
- }
-}
-
-/* 128-bit vector load and store slice parts for 8 and 16-bit int elements */
-
-// LoadInt8x16SlicePart loads an Int8x16 from the slice s.
-// If s has fewer than 16 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 16 or more elements, the function is equivalent to LoadInt8x16Slice.
-func LoadInt8x16SlicePart(s []int8) Int8x16 {
- l := len(s)
- if l >= 16 {
- return LoadInt8x16Slice(s)
- }
- var x Int8x16
- if l == 0 {
- return x
- }
- if l >= 8 { // 8-15
- x = x.AsInt64x2().SetElem(0, *int64atP8(&s[0])).AsInt8x16()
- if l >= 12 { // 12, 13, 14, 15
- x = x.AsInt32x4().SetElem(8/4, *int32atP8(&s[8])).AsInt8x16()
- if l >= 14 {
- x = x.AsInt16x8().SetElem(12/2, *int16atP8(&s[12])).AsInt8x16()
- if l == 15 {
- x = x.SetElem(14, s[14])
- }
- } else if l == 13 {
- x = x.SetElem(12, s[12])
- }
- } else if l >= 10 { // 10, 11
- x = x.AsInt16x8().SetElem(8/2, *int16atP8(&s[8])).AsInt8x16()
- if l == 11 {
- x = x.SetElem(10, s[10])
- }
- } else if l == 9 {
- x = x.SetElem(8, s[8])
- }
- } else if l >= 4 { // 4-7
- x = x.AsInt32x4().SetElem(0, *int32atP8(&s[0])).AsInt8x16()
- if l >= 6 {
- x = x.AsInt16x8().SetElem(4/2, *int16atP8(&s[4])).AsInt8x16()
- if l == 7 {
- x = x.SetElem(6, s[6])
- }
- } else if l == 5 {
- x = x.SetElem(4, s[4])
- }
- } else if l >= 2 { // 2,3
- x = x.AsInt16x8().SetElem(0, *int16atP8(&s[0])).AsInt8x16()
- if l == 3 {
- x = x.SetElem(2, s[2])
- }
- } else { // l == 1
- x = x.SetElem(0, s[0])
- }
- return x
-}
-
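// Editor's note (not part of the original diff): LoadInt8x16SlicePart above
// assembles the vector from the largest power-of-two chunks that fit, using
// the pointer-punning helpers declared earlier in this file. A slice of
// length 11, for example, is read as one 8-byte chunk (lanes 0-7 via
// int64atP8), one 2-byte chunk (lanes 8-9 via int16atP8), and one final byte
// (lane 10), so nothing past s[10] is ever touched.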
-// StoreSlicePart stores the elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 16 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int8x16) StoreSlicePart(s []int8) {
- l := len(s)
- if l >= 16 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- if l >= 8 { // 8-15
- *int64atP8(&s[0]) = x.AsInt64x2().GetElem(0)
- if l >= 12 { // 12, 13, 14, 15
- *int32atP8(&s[8]) = x.AsInt32x4().GetElem(8 / 4)
- if l >= 14 {
- *int16atP8(&s[12]) = x.AsInt16x8().GetElem(12 / 2)
- if l == 15 {
- s[14] = x.GetElem(14)
- }
- } else if l == 13 {
- s[12] = x.GetElem(12)
- }
- } else if l >= 10 { // 10, 11
- *int16atP8(&s[8]) = x.AsInt16x8().GetElem(8 / 2)
- if l == 11 {
- s[10] = x.GetElem(10)
- }
- } else if l == 9 {
- s[8] = x.GetElem(8)
- }
- } else if l >= 4 { // 4-7
- *int32atP8(&s[0]) = x.AsInt32x4().GetElem(0)
- if l >= 6 {
- *int16atP8(&s[4]) = x.AsInt16x8().GetElem(4 / 2)
- if l == 7 {
- s[6] = x.GetElem(6)
- }
- } else if l == 5 {
- s[4] = x.GetElem(4)
- }
- } else if l >= 2 { // 2,3
- *int16atP8(&s[0]) = x.AsInt16x8().GetElem(0)
- if l == 3 {
- s[2] = x.GetElem(2)
- }
- } else { // l == 1
- s[0] = x.GetElem(0)
- }
-}
-
-// LoadInt16x8SlicePart loads an Int16x8 from the slice s.
-// If s has fewer than 8 elements, the remaining elements of the vector are filled with zeroes.
-// If s has 8 or more elements, the function is equivalent to LoadInt16x8Slice.
-func LoadInt16x8SlicePart(s []int16) Int16x8 {
- l := len(s)
- if l >= 8 {
- return LoadInt16x8Slice(s)
- }
- var x Int16x8
- if l == 0 {
- return x
- }
- if l >= 4 { // 4-7
- x = x.AsInt64x2().SetElem(0, *int64atP16(&s[0])).AsInt16x8()
- if l >= 6 {
- x = x.AsInt32x4().SetElem(4/2, *int32atP16(&s[4])).AsInt16x8()
- if l == 7 {
- x = x.SetElem(6, s[6])
- }
- } else if l == 5 {
- x = x.SetElem(4, s[4])
- }
- } else if l >= 2 { // 2,3
- x = x.AsInt32x4().SetElem(0, *int32atP16(&s[0])).AsInt16x8()
- if l == 3 {
- x = x.SetElem(2, s[2])
- }
- } else { // l == 1
- x = x.SetElem(0, s[0])
- }
- return x
-}
-
-// StoreSlicePart stores the elements of x into the slice s.
-// It stores as many elements as will fit in s.
-// If s has 8 or more elements, the method is equivalent to x.StoreSlice.
-func (x Int16x8) StoreSlicePart(s []int16) {
- l := len(s)
- if l >= 8 {
- x.StoreSlice(s)
- return
- }
- if l == 0 {
- return
- }
- if l >= 4 { // 4-7
- *int64atP16(&s[0]) = x.AsInt64x2().GetElem(0)
- if l >= 6 {
- *int32atP16(&s[4]) = x.AsInt32x4().GetElem(4 / 2)
- if l == 7 {
- s[6] = x.GetElem(6)
- }
- } else if l == 5 {
- s[4] = x.GetElem(4)
- }
- } else if l >= 2 { // 2,3
- *int32atP16(&s[0]) = x.AsInt32x4().GetElem(0)
- if l == 3 {
- s[2] = x.GetElem(2)
- }
- } else { // l == 1
- s[0] = x.GetElem(0)
- }
- return
-}
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build goexperiment.simd && amd64
-
-package simd
-
-import (
- "internal/strconv"
-)
-
-type number interface {
- ~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr | ~float32 | ~float64
-}
-
-func sliceToString[T number](x []T) string {
- s := ""
- pfx := "{"
- for _, y := range x {
- s += pfx
- pfx = ","
- switch e := any(y).(type) {
- case int8:
- s += strconv.Itoa(int(e))
- case int16:
- s += strconv.Itoa(int(e))
- case int32:
- s += strconv.Itoa(int(e))
- case int64:
- s += strconv.Itoa(int(e))
- case uint8:
- s += strconv.FormatUint(uint64(e), 10)
- case uint16:
- s += strconv.FormatUint(uint64(e), 10)
- case uint32:
- s += strconv.FormatUint(uint64(e), 10)
- case uint64:
- s += strconv.FormatUint(uint64(e), 10)
- case float32:
- s += strconv.FormatFloat(float64(e), 'g', -1, 32)
- case float64:
- s += strconv.FormatFloat(e, 'g', -1, 64)
- }
- }
- s += "}"
- return s
-}
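As a quick illustration of the removed helper's output format, derived only from the code above (inside the package, so unqualified):

	// sliceToString([]int32{1, 2, 3})   // "{1,2,3}"
	// sliceToString([]float32{0.5, 2})  // "{0.5,2}" via strconv.FormatFloat's 'g' format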
+++ /dev/null
-// Copyright 2025 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package main
-
-import (
- "fmt"
- "os"
- "simd"
- "unsafe"
-)
-
-func load(s []float64) simd.Float64x4 {
- return simd.LoadFloat64x4((*[4]float64)(s[:4]))
-}
-
-type S1 = simd.Float64x4
-
-type S2 simd.Float64x4
-
-func (s S2) Len() int {
- return simd.Float64x4(s).Len()
-}
-
-func (s S2) Load(a []float64) S2 {
- return S2(load(a))
-}
-
-func (s S2) Store(a *[4]float64) {
- simd.Float64x4(s).Store(a)
-}
-
-func (s S2) Add(a S2) S2 {
- return S2(simd.Float64x4(s).Add(simd.Float64x4(a)))
-}
-
-func (s S2) Mul(a S2) S2 {
- return S2(simd.Float64x4(s).Mul(simd.Float64x4(a)))
-}
-
-type S3 struct {
- simd.Float64x4
-}
-
-func ip64_0(a, b []float64) float64 {
- s := 0.0
- for i := range a {
- s += a[i] * b[i]
- }
- return s
-}
-
-func ip64_1(a, b []float64) float64 {
- var z S1
- sum := z
- var i int
- stride := z.Len()
- for ; i <= len(a)-stride; i += stride {
- va := load(a[i:])
- vb := load(b[i:])
- sum = sum.Add(va.Mul(vb))
- }
- var tmp [4]float64
- sum.Store(&tmp)
- return tmp[0] + tmp[1] + tmp[2] + tmp[3]
-}
-
-func ip64_1a(a, b []float64) float64 {
- var z S1
- sum := z
- var i int
- stride := z.Len()
- for ; i <= len(a)-stride; i += stride {
- va := load(a[i:])
- vb := load(b[i:])
- sum = FMA(sum, va, vb)
- }
- var tmp [4]float64
- sum.Store(&tmp)
- return tmp[0] + tmp[1] + tmp[2] + tmp[3]
-}
-
-//go:noinline
-func FMA(a, b, c simd.Float64x4) simd.Float64x4 {
- return a.Add(b.Mul(c))
-}
-
-func ip64_2(a, b []float64) float64 {
- var z S2
- sum := z
- var i int
- stride := z.Len()
- for ; i <= len(a)-stride; i += stride {
- va := z.Load(a[i:])
- vb := z.Load(b[i:])
- sum = sum.Add(va.Mul(vb))
- }
- var tmp [4]float64
- sum.Store(&tmp)
- return tmp[0] + tmp[1] + tmp[2] + tmp[3]
-}
-
-func ip64_3(a, b []float64) float64 {
- var z S3
- sum := z
- var i int
- stride := z.Len()
- for ; i <= len(a)-stride; i += stride {
- va := load(a[i:])
- vb := load(b[i:])
- sum = S3{sum.Add(va.Mul(vb))}
- }
- var tmp [4]float64
- sum.Store(&tmp)
- return tmp[0] + tmp[1] + tmp[2] + tmp[3]
-}
-
-func main() {
- a := []float64{1, 2, 3, 4, 5, 6, 7, 8}
- ip0 := ip64_0(a, a)
- ip1 := ip64_1(a, a)
- ip1a := ip64_1a(a, a)
- ip2 := ip64_2(a, a)
- ip3 := ip64_3(a, a)
- fmt.Printf("Test IP = %f\n", ip0)
- fmt.Printf("SIMD IP 1 = %f\n", ip1)
- fmt.Printf("SIMD IP 1a = %f\n", ip1a)
- fmt.Printf("SIMD IP 2 = %f\n", ip2)
- fmt.Printf("SIMD IP 3 = %f\n", ip3)
- var z1 S1
- var z2 S2
- var z3 S2
-
- s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3)
-
- fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3)
-
- fail := false
-
- if s1 != 32 || s2 != 32 || s3 != 32 {
- fmt.Println("Failed a sizeof check, should all be 32")
- fail = true
- }
-
- if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 {
- fmt.Println("Failed an inner product check, should all be", ip0)
- fail = true
- }
-
- if fail {
- os.Exit(1)
- }
-}
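The strided loops above stop at the last full vector, which is exact for the 8-element test input; a caller with arbitrary-length slices would typically finish with a scalar tail. A sketch under that assumption (ip64_tail is illustrative and not part of the removed file):

	func ip64_tail(a, b []float64) float64 {
		var sum S1 // S1 = simd.Float64x4; the zero value is all zeroes
		i := 0
		for ; i <= len(a)-sum.Len(); i += sum.Len() {
			sum = sum.Add(load(a[i:]).Mul(load(b[i:])))
		}
		var tmp [4]float64
		sum.Store(&tmp)
		s := tmp[0] + tmp[1] + tmp[2] + tmp[3]
		for ; i < len(a); i++ { // scalar tail for the leftover elements
			s += a[i] * b[i]
		}
		return s
	}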
+++ /dev/null
-// Code generated by x/arch/internal/simdgen using 'go run . -xedPath $XED_PATH -o godefs -goroot $GOROOT go.yaml types.yaml categories.yaml'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-// v128 is a tag type that tells the compiler that this is really 128-bit SIMD
-type v128 struct {
- _128 [0]func() // uncomparable
-}
-
-// Float32x4 is a 128-bit SIMD vector of 4 float32
-type Float32x4 struct {
- float32x4 v128
- vals [4]float32
-}
-
-// Len returns the number of elements in a Float32x4
-func (x Float32x4) Len() int { return 4 }
-
-// LoadFloat32x4 loads a Float32x4 from an array
-//
-//go:noescape
-func LoadFloat32x4(y *[4]float32) Float32x4
-
-// Store stores a Float32x4 to an array
-//
-//go:noescape
-func (x Float32x4) Store(y *[4]float32)
-
-// LoadMaskedFloat32x4 loads a Float32x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
-
-// StoreMasked stores a Float32x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
-
-// Float64x2 is a 128-bit SIMD vector of 2 float64
-type Float64x2 struct {
- float64x2 v128
- vals [2]float64
-}
-
-// Len returns the number of elements in a Float64x2
-func (x Float64x2) Len() int { return 2 }
-
-// LoadFloat64x2 loads a Float64x2 from an array
-//
-//go:noescape
-func LoadFloat64x2(y *[2]float64) Float64x2
-
-// Store stores a Float64x2 to an array
-//
-//go:noescape
-func (x Float64x2) Store(y *[2]float64)
-
-// LoadMaskedFloat64x2 loads a Float64x2 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
-
-// StoreMasked stores a Float64x2 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
-
-// Int8x16 is a 128-bit SIMD vector of 16 int8
-type Int8x16 struct {
- int8x16 v128
- vals [16]int8
-}
-
-// Len returns the number of elements in a Int8x16
-func (x Int8x16) Len() int { return 16 }
-
-// LoadInt8x16 loads a Int8x16 from an array
-//
-//go:noescape
-func LoadInt8x16(y *[16]int8) Int8x16
-
-// Store stores a Int8x16 to an array
-//
-//go:noescape
-func (x Int8x16) Store(y *[16]int8)
-
-// Int16x8 is a 128-bit SIMD vector of 8 int16
-type Int16x8 struct {
- int16x8 v128
- vals [8]int16
-}
-
-// Len returns the number of elements in a Int16x8
-func (x Int16x8) Len() int { return 8 }
-
-// LoadInt16x8 loads a Int16x8 from an array
-//
-//go:noescape
-func LoadInt16x8(y *[8]int16) Int16x8
-
-// Store stores a Int16x8 to an array
-//
-//go:noescape
-func (x Int16x8) Store(y *[8]int16)
-
-// Int32x4 is a 128-bit SIMD vector of 4 int32
-type Int32x4 struct {
- int32x4 v128
- vals [4]int32
-}
-
-// Len returns the number of elements in a Int32x4
-func (x Int32x4) Len() int { return 4 }
-
-// LoadInt32x4 loads a Int32x4 from an array
-//
-//go:noescape
-func LoadInt32x4(y *[4]int32) Int32x4
-
-// Store stores a Int32x4 to an array
-//
-//go:noescape
-func (x Int32x4) Store(y *[4]int32)
-
-// LoadMaskedInt32x4 loads a Int32x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
-
-// StoreMasked stores a Int32x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
-
-// Int64x2 is a 128-bit SIMD vector of 2 int64
-type Int64x2 struct {
- int64x2 v128
- vals [2]int64
-}
-
-// Len returns the number of elements in a Int64x2
-func (x Int64x2) Len() int { return 2 }
-
-// LoadInt64x2 loads a Int64x2 from an array
-//
-//go:noescape
-func LoadInt64x2(y *[2]int64) Int64x2
-
-// Store stores a Int64x2 to an array
-//
-//go:noescape
-func (x Int64x2) Store(y *[2]int64)
-
-// LoadMaskedInt64x2 loads a Int64x2 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
-
-// StoreMasked stores a Int64x2 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
-
-// Uint8x16 is a 128-bit SIMD vector of 16 uint8
-type Uint8x16 struct {
- uint8x16 v128
- vals [16]uint8
-}
-
-// Len returns the number of elements in a Uint8x16
-func (x Uint8x16) Len() int { return 16 }
-
-// LoadUint8x16 loads a Uint8x16 from an array
-//
-//go:noescape
-func LoadUint8x16(y *[16]uint8) Uint8x16
-
-// Store stores a Uint8x16 to an array
-//
-//go:noescape
-func (x Uint8x16) Store(y *[16]uint8)
-
-// Uint16x8 is a 128-bit SIMD vector of 8 uint16
-type Uint16x8 struct {
- uint16x8 v128
- vals [8]uint16
-}
-
-// Len returns the number of elements in a Uint16x8
-func (x Uint16x8) Len() int { return 8 }
-
-// LoadUint16x8 loads a Uint16x8 from an array
-//
-//go:noescape
-func LoadUint16x8(y *[8]uint16) Uint16x8
-
-// Store stores a Uint16x8 to an array
-//
-//go:noescape
-func (x Uint16x8) Store(y *[8]uint16)
-
-// Uint32x4 is a 128-bit SIMD vector of 4 uint32
-type Uint32x4 struct {
- uint32x4 v128
- vals [4]uint32
-}
-
-// Len returns the number of elements in a Uint32x4
-func (x Uint32x4) Len() int { return 4 }
-
-// LoadUint32x4 loads a Uint32x4 from an array
-//
-//go:noescape
-func LoadUint32x4(y *[4]uint32) Uint32x4
-
-// Store stores a Uint32x4 to an array
-//
-//go:noescape
-func (x Uint32x4) Store(y *[4]uint32)
-
-// LoadMaskedUint32x4 loads a Uint32x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
-
-// StoreMasked stores a Uint32x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
-
-// Uint64x2 is a 128-bit SIMD vector of 2 uint64
-type Uint64x2 struct {
- uint64x2 v128
- vals [2]uint64
-}
-
-// Len returns the number of elements in a Uint64x2
-func (x Uint64x2) Len() int { return 2 }
-
-// LoadUint64x2 loads a Uint64x2 from an array
-//
-//go:noescape
-func LoadUint64x2(y *[2]uint64) Uint64x2
-
-// Store stores a Uint64x2 to an array
-//
-//go:noescape
-func (x Uint64x2) Store(y *[2]uint64)
-
-// LoadMaskedUint64x2 loads a Uint64x2 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
-
-// StoreMasked stores a Uint64x2 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
-
-// Mask8x16 is a 128-bit SIMD vector of 16 int8
-type Mask8x16 struct {
- int8x16 v128
- vals [16]int8
-}
-
-// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Feature: AVX512
-func Mask8x16FromBits(y uint16) Mask8x16
-
-// ToBits constructs a bitmap from a Mask8x16, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Features: AVX512
-func (x Mask8x16) ToBits() uint16
-
-// Mask16x8 is a 128-bit SIMD vector of 8 int16
-type Mask16x8 struct {
- int16x8 v128
- vals [8]int16
-}
-
-// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Feature: AVX512
-func Mask16x8FromBits(y uint8) Mask16x8
-
-// ToBits constructs a bitmap from a Mask16x8, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Features: AVX512
-func (x Mask16x8) ToBits() uint8
-
-// Mask32x4 is a 128-bit SIMD vector of 4 int32
-type Mask32x4 struct {
- int32x4 v128
- vals [4]int32
-}
-
-// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 4 bits of y are used.
-//
-// Asm: KMOVD, CPU Feature: AVX512
-func Mask32x4FromBits(y uint8) Mask32x4
-
-// ToBits constructs a bitmap from a Mask32x4, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 4 bits of y are used.
-//
-// Asm: KMOVD, CPU Features: AVX512
-func (x Mask32x4) ToBits() uint8
-
-// Mask64x2 is a 128-bit SIMD vector of 2 int64
-type Mask64x2 struct {
- int64x2 v128
- vals [2]int64
-}
-
-// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 2 bits of y are used.
-//
-// Asm: KMOVQ, CPU Feature: AVX512
-func Mask64x2FromBits(y uint8) Mask64x2
-
-// ToBits constructs a bitmap from a Mask64x2, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 2 bits of y are used.
-//
-// Asm: KMOVQ, CPU Features: AVX512
-func (x Mask64x2) ToBits() uint8
-
-// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
-type v256 struct {
- _256 [0]func() // uncomparable
-}
-
-// Float32x8 is a 256-bit SIMD vector of 8 float32
-type Float32x8 struct {
- float32x8 v256
- vals [8]float32
-}
-
-// Len returns the number of elements in a Float32x8
-func (x Float32x8) Len() int { return 8 }
-
-// LoadFloat32x8 loads a Float32x8 from an array
-//
-//go:noescape
-func LoadFloat32x8(y *[8]float32) Float32x8
-
-// Store stores a Float32x8 to an array
-//
-//go:noescape
-func (x Float32x8) Store(y *[8]float32)
-
-// LoadMaskedFloat32x8 loads a Float32x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
-
-// StoreMasked stores a Float32x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
-
-// Float64x4 is a 256-bit SIMD vector of 4 float64
-type Float64x4 struct {
- float64x4 v256
- vals [4]float64
-}
-
-// Len returns the number of elements in a Float64x4
-func (x Float64x4) Len() int { return 4 }
-
-// LoadFloat64x4 loads a Float64x4 from an array
-//
-//go:noescape
-func LoadFloat64x4(y *[4]float64) Float64x4
-
-// Store stores a Float64x4 to an array
-//
-//go:noescape
-func (x Float64x4) Store(y *[4]float64)
-
-// LoadMaskedFloat64x4 loads a Float64x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
-
-// StoreMasked stores a Float64x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
-
-// Int8x32 is a 256-bit SIMD vector of 32 int8
-type Int8x32 struct {
- int8x32 v256
- vals [32]int8
-}
-
-// Len returns the number of elements in a Int8x32
-func (x Int8x32) Len() int { return 32 }
-
-// LoadInt8x32 loads a Int8x32 from an array
-//
-//go:noescape
-func LoadInt8x32(y *[32]int8) Int8x32
-
-// Store stores a Int8x32 to an array
-//
-//go:noescape
-func (x Int8x32) Store(y *[32]int8)
-
-// Int16x16 is a 256-bit SIMD vector of 16 int16
-type Int16x16 struct {
- int16x16 v256
- vals [16]int16
-}
-
-// Len returns the number of elements in a Int16x16
-func (x Int16x16) Len() int { return 16 }
-
-// LoadInt16x16 loads a Int16x16 from an array
-//
-//go:noescape
-func LoadInt16x16(y *[16]int16) Int16x16
-
-// Store stores a Int16x16 to an array
-//
-//go:noescape
-func (x Int16x16) Store(y *[16]int16)
-
-// Int32x8 is a 256-bit SIMD vector of 8 int32
-type Int32x8 struct {
- int32x8 v256
- vals [8]int32
-}
-
-// Len returns the number of elements in a Int32x8
-func (x Int32x8) Len() int { return 8 }
-
-// LoadInt32x8 loads a Int32x8 from an array
-//
-//go:noescape
-func LoadInt32x8(y *[8]int32) Int32x8
-
-// Store stores a Int32x8 to an array
-//
-//go:noescape
-func (x Int32x8) Store(y *[8]int32)
-
-// LoadMaskedInt32x8 loads a Int32x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
-
-// StoreMasked stores a Int32x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
-
-// Int64x4 is a 256-bit SIMD vector of 4 int64
-type Int64x4 struct {
- int64x4 v256
- vals [4]int64
-}
-
-// Len returns the number of elements in a Int64x4
-func (x Int64x4) Len() int { return 4 }
-
-// LoadInt64x4 loads a Int64x4 from an array
-//
-//go:noescape
-func LoadInt64x4(y *[4]int64) Int64x4
-
-// Store stores a Int64x4 to an array
-//
-//go:noescape
-func (x Int64x4) Store(y *[4]int64)
-
-// LoadMaskedInt64x4 loads a Int64x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
-
-// StoreMasked stores a Int64x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
-
-// Uint8x32 is a 256-bit SIMD vector of 32 uint8
-type Uint8x32 struct {
- uint8x32 v256
- vals [32]uint8
-}
-
-// Len returns the number of elements in a Uint8x32
-func (x Uint8x32) Len() int { return 32 }
-
-// LoadUint8x32 loads a Uint8x32 from an array
-//
-//go:noescape
-func LoadUint8x32(y *[32]uint8) Uint8x32
-
-// Store stores a Uint8x32 to an array
-//
-//go:noescape
-func (x Uint8x32) Store(y *[32]uint8)
-
-// Uint16x16 is a 256-bit SIMD vector of 16 uint16
-type Uint16x16 struct {
- uint16x16 v256
- vals [16]uint16
-}
-
-// Len returns the number of elements in a Uint16x16
-func (x Uint16x16) Len() int { return 16 }
-
-// LoadUint16x16 loads a Uint16x16 from an array
-//
-//go:noescape
-func LoadUint16x16(y *[16]uint16) Uint16x16
-
-// Store stores a Uint16x16 to an array
-//
-//go:noescape
-func (x Uint16x16) Store(y *[16]uint16)
-
-// Uint32x8 is a 256-bit SIMD vector of 8 uint32
-type Uint32x8 struct {
- uint32x8 v256
- vals [8]uint32
-}
-
-// Len returns the number of elements in a Uint32x8
-func (x Uint32x8) Len() int { return 8 }
-
-// LoadUint32x8 loads a Uint32x8 from an array
-//
-//go:noescape
-func LoadUint32x8(y *[8]uint32) Uint32x8
-
-// Store stores a Uint32x8 to an array
-//
-//go:noescape
-func (x Uint32x8) Store(y *[8]uint32)
-
-// LoadMaskedUint32x8 loads a Uint32x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
-
-// StoreMasked stores a Uint32x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVD, CPU Feature: AVX2
-//
-//go:noescape
-func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
-
-// Uint64x4 is a 256-bit SIMD vector of 4 uint64
-type Uint64x4 struct {
- uint64x4 v256
- vals [4]uint64
-}
-
-// Len returns the number of elements in a Uint64x4
-func (x Uint64x4) Len() int { return 4 }
-
-// LoadUint64x4 loads a Uint64x4 from an array
-//
-//go:noescape
-func LoadUint64x4(y *[4]uint64) Uint64x4
-
-// Store stores a Uint64x4 to an array
-//
-//go:noescape
-func (x Uint64x4) Store(y *[4]uint64)
-
-// LoadMaskedUint64x4 loads a Uint64x4 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
-
-// StoreMasked stores a Uint64x4 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMASKMOVQ, CPU Feature: AVX2
-//
-//go:noescape
-func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
-
-// Mask8x32 is a 256-bit SIMD vector of 32 int8
-type Mask8x32 struct {
- int8x32 v256
- vals [32]int8
-}
-
-// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Feature: AVX512
-func Mask8x32FromBits(y uint32) Mask8x32
-
-// ToBits constructs a bitmap from a Mask8x32, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Features: AVX512
-func (x Mask8x32) ToBits() uint32
-
-// Mask16x16 is a 256-bit SIMD vector of 16 int16
-type Mask16x16 struct {
- int16x16 v256
- vals [16]int16
-}
-
-// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Feature: AVX512
-func Mask16x16FromBits(y uint16) Mask16x16
-
-// ToBits constructs a bitmap from a Mask16x16, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Features: AVX512
-func (x Mask16x16) ToBits() uint16
-
-// Mask32x8 is a 256-bit SIMD vector of 8 int32
-type Mask32x8 struct {
- int32x8 v256
- vals [8]int32
-}
-
-// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVD, CPU Feature: AVX512
-func Mask32x8FromBits(y uint8) Mask32x8
-
-// ToBits constructs a bitmap from a Mask32x8, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVD, CPU Features: AVX512
-func (x Mask32x8) ToBits() uint8
-
-// Mask64x4 is a 256-bit SIMD vector of 4 int64
-type Mask64x4 struct {
- int64x4 v256
- vals [4]int64
-}
-
-// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 4 bits of y are used.
-//
-// Asm: KMOVQ, CPU Feature: AVX512
-func Mask64x4FromBits(y uint8) Mask64x4
-
-// ToBits constructs a bitmap from a Mask64x4, where 1 means set for the indexed element, 0 means unset.
-// Only the lower 4 bits of y are used.
-//
-// Asm: KMOVQ, CPU Features: AVX512
-func (x Mask64x4) ToBits() uint8
-
-// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
-type v512 struct {
- _512 [0]func() // uncomparable
-}
-
-// Float32x16 is a 512-bit SIMD vector of 16 float32
-type Float32x16 struct {
- float32x16 v512
- vals [16]float32
-}
-
-// Len returns the number of elements in a Float32x16
-func (x Float32x16) Len() int { return 16 }
-
-// LoadFloat32x16 loads a Float32x16 from an array
-//
-//go:noescape
-func LoadFloat32x16(y *[16]float32) Float32x16
-
-// Store stores a Float32x16 to an array
-//
-//go:noescape
-func (x Float32x16) Store(y *[16]float32)
-
-// LoadMaskedFloat32x16 loads a Float32x16 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
-
-// StoreMasked stores a Float32x16 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32, CPU Feature: AVX512
-//
-//go:noescape
-func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
-
-// Float64x8 is a 512-bit SIMD vector of 8 float64
-type Float64x8 struct {
- float64x8 v512
- vals [8]float64
-}
-
-// Len returns the number of elements in a Float64x8
-func (x Float64x8) Len() int { return 8 }
-
-// LoadFloat64x8 loads a Float64x8 from an array
-//
-//go:noescape
-func LoadFloat64x8(y *[8]float64) Float64x8
-
-// Store stores a Float64x8 to an array
-//
-//go:noescape
-func (x Float64x8) Store(y *[8]float64)
-
-// LoadMaskedFloat64x8 loads a Float64x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
-
-// StoreMasked stores a Float64x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64, CPU Feature: AVX512
-//
-//go:noescape
-func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
-
-// Int8x64 is a 512-bit SIMD vector of 64 int8
-type Int8x64 struct {
- int8x64 v512
- vals [64]int8
-}
-
-// Len returns the number of elements in a Int8x64
-func (x Int8x64) Len() int { return 64 }
-
-// LoadInt8x64 loads a Int8x64 from an array
-//
-//go:noescape
-func LoadInt8x64(y *[64]int8) Int8x64
-
-// Store stores a Int8x64 to an array
-//
-//go:noescape
-func (x Int8x64) Store(y *[64]int8)
-
-// LoadMaskedInt8x64 loads a Int8x64 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU8.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
-
-// StoreMasked stores a Int8x64 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU8, CPU Feature: AVX512
-//
-//go:noescape
-func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
-
-// Int16x32 is a 512-bit SIMD vector of 32 int16
-type Int16x32 struct {
- int16x32 v512
- vals [32]int16
-}
-
-// Len returns the number of elements in a Int16x32
-func (x Int16x32) Len() int { return 32 }
-
-// LoadInt16x32 loads a Int16x32 from an array
-//
-//go:noescape
-func LoadInt16x32(y *[32]int16) Int16x32
-
-// Store stores a Int16x32 to an array
-//
-//go:noescape
-func (x Int16x32) Store(y *[32]int16)
-
-// LoadMaskedInt16x32 loads a Int16x32 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU16.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
-
-// StoreMasked stores a Int16x32 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU16, CPU Feature: AVX512
-//
-//go:noescape
-func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
-
-// Int32x16 is a 512-bit SIMD vector of 16 int32
-type Int32x16 struct {
- int32x16 v512
- vals [16]int32
-}
-
-// Len returns the number of elements in a Int32x16
-func (x Int32x16) Len() int { return 16 }
-
-// LoadInt32x16 loads a Int32x16 from an array
-//
-//go:noescape
-func LoadInt32x16(y *[16]int32) Int32x16
-
-// Store stores a Int32x16 to an array
-//
-//go:noescape
-func (x Int32x16) Store(y *[16]int32)
-
-// LoadMaskedInt32x16 loads a Int32x16 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
-
-// StoreMasked stores a Int32x16 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32, CPU Feature: AVX512
-//
-//go:noescape
-func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
-
-// Int64x8 is a 512-bit SIMD vector of 8 int64
-type Int64x8 struct {
- int64x8 v512
- vals [8]int64
-}
-
-// Len returns the number of elements in a Int64x8
-func (x Int64x8) Len() int { return 8 }
-
-// LoadInt64x8 loads a Int64x8 from an array
-//
-//go:noescape
-func LoadInt64x8(y *[8]int64) Int64x8
-
-// Store stores a Int64x8 to an array
-//
-//go:noescape
-func (x Int64x8) Store(y *[8]int64)
-
-// LoadMaskedInt64x8 loads a Int64x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
-
-// StoreMasked stores a Int64x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64, CPU Feature: AVX512
-//
-//go:noescape
-func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
-
-// Uint8x64 is a 512-bit SIMD vector of 64 uint8
-type Uint8x64 struct {
- uint8x64 v512
- vals [64]uint8
-}
-
-// Len returns the number of elements in a Uint8x64
-func (x Uint8x64) Len() int { return 64 }
-
-// LoadUint8x64 loads a Uint8x64 from an array
-//
-//go:noescape
-func LoadUint8x64(y *[64]uint8) Uint8x64
-
-// Store stores a Uint8x64 to an array
-//
-//go:noescape
-func (x Uint8x64) Store(y *[64]uint8)
-
-// LoadMaskedUint8x64 loads a Uint8x64 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU8.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
-
-// StoreMasked stores a Uint8x64 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU8, CPU Feature: AVX512
-//
-//go:noescape
-func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
-
-// Uint16x32 is a 512-bit SIMD vector of 32 uint16
-type Uint16x32 struct {
- uint16x32 v512
- vals [32]uint16
-}
-
-// Len returns the number of elements in a Uint16x32
-func (x Uint16x32) Len() int { return 32 }
-
-// LoadUint16x32 loads a Uint16x32 from an array
-//
-//go:noescape
-func LoadUint16x32(y *[32]uint16) Uint16x32
-
-// Store stores a Uint16x32 to an array
-//
-//go:noescape
-func (x Uint16x32) Store(y *[32]uint16)
-
-// LoadMaskedUint16x32 loads a Uint16x32 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU16.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
-
-// StoreMasked stores a Uint16x32 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU16, CPU Feature: AVX512
-//
-//go:noescape
-func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
-
-// Uint32x16 is a 512-bit SIMD vector of 16 uint32
-type Uint32x16 struct {
- uint32x16 v512
- vals [16]uint32
-}
-
-// Len returns the number of elements in a Uint32x16
-func (x Uint32x16) Len() int { return 16 }
-
-// LoadUint32x16 loads a Uint32x16 from an array
-//
-//go:noescape
-func LoadUint32x16(y *[16]uint32) Uint32x16
-
-// Store stores a Uint32x16 to an array
-//
-//go:noescape
-func (x Uint32x16) Store(y *[16]uint32)
-
-// LoadMaskedUint32x16 loads a Uint32x16 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
-
-// StoreMasked stores a Uint32x16 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU32, CPU Feature: AVX512
-//
-//go:noescape
-func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
-
-// Uint64x8 is a 512-bit SIMD vector of 8 uint64
-type Uint64x8 struct {
- uint64x8 v512
- vals [8]uint64
-}
-
-// Len returns the number of elements in a Uint64x8
-func (x Uint64x8) Len() int { return 8 }
-
-// LoadUint64x8 loads a Uint64x8 from an array
-//
-//go:noescape
-func LoadUint64x8(y *[8]uint64) Uint64x8
-
-// Store stores a Uint64x8 to an array
-//
-//go:noescape
-func (x Uint64x8) Store(y *[8]uint64)
-
-// LoadMaskedUint64x8 loads a Uint64x8 from an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64.Z, CPU Feature: AVX512
-//
-//go:noescape
-func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
-
-// StoreMasked stores a Uint64x8 to an array,
-// at those elements enabled by mask
-//
-// Asm: VMOVDQU64, CPU Feature: AVX512
-//
-//go:noescape
-func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
-
-// Mask8x64 is a 512-bit SIMD vector of 64 int8
-type Mask8x64 struct {
- int8x64 v512
- vals [64]int8
-}
-
-// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Feature: AVX512
-func Mask8x64FromBits(y uint64) Mask8x64
-
-// ToBits constructs a bitmap from a Mask8x64, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVB, CPU Features: AVX512
-func (x Mask8x64) ToBits() uint64
-
-// Mask16x32 is a 512-bit SIMD vector of 32 int16
-type Mask16x32 struct {
- int16x32 v512
- vals [32]int16
-}
-
-// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Feature: AVX512
-func Mask16x32FromBits(y uint32) Mask16x32
-
-// ToBits constructs a bitmap from a Mask16x32, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVW, CPU Features: AVX512
-func (x Mask16x32) ToBits() uint32
-
-// Mask32x16 is a 512-bit SIMD vector of 16 int32
-type Mask32x16 struct {
- int32x16 v512
- vals [16]int32
-}
-
-// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVD, CPU Feature: AVX512
-func Mask32x16FromBits(y uint16) Mask32x16
-
-// ToBits constructs a bitmap from a Mask32x16, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVD, CPU Features: AVX512
-func (x Mask32x16) ToBits() uint16
-
-// Mask64x8 is a 512-bit SIMD vector of 8 int64
-type Mask64x8 struct {
- int64x8 v512
- vals [8]int64
-}
-
-// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVQ, CPU Feature: AVX512
-func Mask64x8FromBits(y uint8) Mask64x8
-
-// ToBits constructs a bitmap from a Mask64x8, where 1 means set for the indexed element, 0 means unset.
-//
-// Asm: KMOVQ, CPU Features: AVX512
-func (x Mask64x8) ToBits() uint8
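For orientation, a hedged caller-side sketch of how the masked load/store declarations above combine with the FromBits constructors (pre-rename package name; Mask32x8FromBits is documented as AVX-512, so the sketch guards on that even though the AVX2 VMASKMOVD forms alone would not require it):

	var src, dst [8]int32
	if simd.X86.AVX512() {
		m := simd.Mask32x8FromBits(0b0000_1111) // enable the low four lanes
		v := simd.LoadMaskedInt32x8(&src, m)    // disabled lanes come back zero
		v.StoreMasked(&dst, m)                  // disabled lanes of dst are left untouched
	}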
+++ /dev/null
-// Code generated by 'go run genfiles.go'; DO NOT EDIT.
-
-//go:build goexperiment.simd
-
-package simd
-
-import "unsafe"
-
-// paInt8x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt8x16(s []int8) *[16]int8 {
- return (*[16]int8)(unsafe.Pointer(&s[0]))
-}
-
-// paInt16x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt16x8(s []int16) *[8]int16 {
- return (*[8]int16)(unsafe.Pointer(&s[0]))
-}
-
-// paInt32x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt32x4(s []int32) *[4]int32 {
- return (*[4]int32)(unsafe.Pointer(&s[0]))
-}
-
-// paInt64x2 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt64x2(s []int64) *[2]int64 {
- return (*[2]int64)(unsafe.Pointer(&s[0]))
-}
-
-// paUint8x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint8x16(s []uint8) *[16]uint8 {
- return (*[16]uint8)(unsafe.Pointer(&s[0]))
-}
-
-// paUint16x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint16x8(s []uint16) *[8]uint16 {
- return (*[8]uint16)(unsafe.Pointer(&s[0]))
-}
-
-// paUint32x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint32x4(s []uint32) *[4]uint32 {
- return (*[4]uint32)(unsafe.Pointer(&s[0]))
-}
-
-// paUint64x2 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint64x2(s []uint64) *[2]uint64 {
- return (*[2]uint64)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat32x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat32x4(s []float32) *[4]float32 {
- return (*[4]float32)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat64x2 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat64x2(s []float64) *[2]float64 {
- return (*[2]float64)(unsafe.Pointer(&s[0]))
-}
-
-// paInt8x32 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt8x32(s []int8) *[32]int8 {
- return (*[32]int8)(unsafe.Pointer(&s[0]))
-}
-
-// paInt16x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt16x16(s []int16) *[16]int16 {
- return (*[16]int16)(unsafe.Pointer(&s[0]))
-}
-
-// paInt32x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt32x8(s []int32) *[8]int32 {
- return (*[8]int32)(unsafe.Pointer(&s[0]))
-}
-
-// paInt64x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt64x4(s []int64) *[4]int64 {
- return (*[4]int64)(unsafe.Pointer(&s[0]))
-}
-
-// paUint8x32 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint8x32(s []uint8) *[32]uint8 {
- return (*[32]uint8)(unsafe.Pointer(&s[0]))
-}
-
-// paUint16x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint16x16(s []uint16) *[16]uint16 {
- return (*[16]uint16)(unsafe.Pointer(&s[0]))
-}
-
-// paUint32x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint32x8(s []uint32) *[8]uint32 {
- return (*[8]uint32)(unsafe.Pointer(&s[0]))
-}
-
-// paUint64x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint64x4(s []uint64) *[4]uint64 {
- return (*[4]uint64)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat32x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat32x8(s []float32) *[8]float32 {
- return (*[8]float32)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat64x4 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat64x4(s []float64) *[4]float64 {
- return (*[4]float64)(unsafe.Pointer(&s[0]))
-}
-
-// paInt8x64 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt8x64(s []int8) *[64]int8 {
- return (*[64]int8)(unsafe.Pointer(&s[0]))
-}
-
-// paInt16x32 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt16x32(s []int16) *[32]int16 {
- return (*[32]int16)(unsafe.Pointer(&s[0]))
-}
-
-// paInt32x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt32x16(s []int32) *[16]int32 {
- return (*[16]int32)(unsafe.Pointer(&s[0]))
-}
-
-// paInt64x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paInt64x8(s []int64) *[8]int64 {
- return (*[8]int64)(unsafe.Pointer(&s[0]))
-}
-
-// paUint8x64 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint8x64(s []uint8) *[64]uint8 {
- return (*[64]uint8)(unsafe.Pointer(&s[0]))
-}
-
-// paUint16x32 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint16x32(s []uint16) *[32]uint16 {
- return (*[32]uint16)(unsafe.Pointer(&s[0]))
-}
-
-// paUint32x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint32x16(s []uint32) *[16]uint32 {
- return (*[16]uint32)(unsafe.Pointer(&s[0]))
-}
-
-// paUint64x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paUint64x8(s []uint64) *[8]uint64 {
- return (*[8]uint64)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat32x16 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat32x16(s []float32) *[16]float32 {
- return (*[16]float32)(unsafe.Pointer(&s[0]))
-}
-
-// paFloat64x8 returns a type-unsafe pointer to array that can
-// only be used with partial load/store operations that only
-// access the known-safe portions of the array.
-func paFloat64x8(s []float64) *[8]float64 {
- return (*[8]float64)(unsafe.Pointer(&s[0]))
-}
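A hedged sketch of how these helpers are meant to be paired with masked loads, written as it might appear inside the package; loadUint32x8Part is illustrative only (the real slice-part code lives in the generated files deleted earlier), and it omits the CPU-feature guard that Mask32x8FromBits would need in practice:

	func loadUint32x8Part(s []uint32) Uint32x8 {
		var zero Uint32x8
		if len(s) == 0 {
			return zero
		}
		if len(s) >= 8 {
			return LoadUint32x8(paUint32x8(s)) // a full load stays in bounds
		}
		// paUint32x8 over-extends the slice to *[8]uint32; the mask keeps the
		// disabled lanes from being touched even when they lie past the end of s.
		m := Mask32x8FromBits(uint8(1<<len(s) - 1)) // enable only the first len(s) lanes
		return LoadMaskedUint32x8(paUint32x8(s), m)
	}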
package codegen
-import "simd"
+import "simd/archsimd"
func vptest1() bool {
- v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
- v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
+ v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
+ v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
// amd64:`VPTEST\s(.*)(.*)$`
// amd64:`SETCS\s(.*)$`
return v1.AndNot(v2).IsZero()
}
func vptest2() bool {
- v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
- v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
+ v1 := archsimd.LoadUint64x2Slice([]uint64{0, 1})
+ v2 := archsimd.LoadUint64x2Slice([]uint64{0, 0})
// amd64:`VPTEST\s(.*)(.*)$`
// amd64:`SETEQ\s(.*)$`
return v1.And(v2).IsZero()
}
type Args2 struct {
- V0 simd.Uint8x32
- V1 simd.Uint8x32
+ V0 archsimd.Uint8x32
+ V1 archsimd.Uint8x32
x string
}
//go:noinline
-func simdStructNoSpill(a Args2) simd.Uint8x32 {
+func simdStructNoSpill(a Args2) archsimd.Uint8x32 {
// amd64:-`VMOVDQU\s.*$`
return a.V0.Xor(a.V1)
}
-func simdStructWrapperNoSpill(a Args2) simd.Uint8x32 {
+func simdStructWrapperNoSpill(a Args2) archsimd.Uint8x32 {
// amd64:-`VMOVDQU\s.*$`
a.x = "test"
return simdStructNoSpill(a)
}
//go:noinline
-func simdArrayNoSpill(a [1]Args2) simd.Uint8x32 {
+func simdArrayNoSpill(a [1]Args2) archsimd.Uint8x32 {
// amd64:-`VMOVDQU\s.*$`
return a[0].V0.Xor(a[0].V1)
}
-func simdArrayWrapperNoSpill(a [1]Args2) simd.Uint8x32 {
+func simdArrayWrapperNoSpill(a [1]Args2) archsimd.Uint8x32 {
// amd64:-`VMOVDQU\s.*$`
a[0].x = "test"
return simdArrayNoSpill(a)
}
-func simdFeatureGuardedMaskOpt() simd.Int16x16 {
- var x, y simd.Int16x16
- if simd.X86.AVX512() {
- mask := simd.Mask16x16FromBits(5)
+func simdFeatureGuardedMaskOpt() archsimd.Int16x16 {
+ var x, y archsimd.Int16x16
+ if archsimd.X86.AVX512() {
+ mask := archsimd.Mask16x16FromBits(5)
return x.Add(y).Masked(mask) // amd64:`VPADDW.Z\s.*$`
}
- mask := simd.Mask16x16FromBits(5)
+ mask := archsimd.Mask16x16FromBits(5)
return x.Add(y).Masked(mask) // amd64:`VPAND\s.*$`
}
-func simdMaskedMerge() simd.Int16x16 {
- var x, y simd.Int16x16
- if simd.X86.AVX512() {
- mask := simd.Mask16x16FromBits(5)
+func simdMaskedMerge() archsimd.Int16x16 {
+ var x, y archsimd.Int16x16
+ if archsimd.X86.AVX512() {
+ mask := archsimd.Mask16x16FromBits(5)
return x.Add(y).Merge(x, mask) // amd64:-`VPBLENDVB\s.*$`
}
- mask := simd.Mask16x16FromBits(5)
+ mask := archsimd.Mask16x16FromBits(5)
return x.Add(y).Merge(x, mask) // amd64:`VPBLENDVB\s.*$`
}
package foo
-import "simd"
+import "simd/archsimd"
-func f1(x simd.Int8x16) {
+func f1(x archsimd.Int8x16) {
return // ERROR "has features avx"
}
-func g1() simd.Int8x16 {
- var x simd.Int8x16
+func g1() archsimd.Int8x16 {
+ var x archsimd.Int8x16
return x // ERROR "has features avx$"
}
-type T1 simd.Int8x16
+type T1 archsimd.Int8x16
func (x T1) h() {
return // ERROR "has features avx$"
}
-func f2(x simd.Int8x64) {
+func f2(x archsimd.Int8x64) {
return // ERROR "has features avx[+]avx2[+]avx512$"
}
-func g2() simd.Int8x64 {
- var x simd.Int8x64
+func g2() archsimd.Int8x64 {
+ var x archsimd.Int8x64
return x // ERROR "has features avx[+]avx2[+]avx512$"
}
-type T2 simd.Int8x64
+type T2 archsimd.Int8x64
func (x T2) h() {
return // ERROR "has features avx[+]avx2[+]avx512$"
func f() {
if a == 0 {
- if !simd.X86.AVX512() {
+ if !archsimd.X86.AVX512() {
return
}
println("has avx512") // ERROR "has features avx[+]avx2[+]avx512$"
} else {
- if !simd.X86.AVX2() {
+ if !archsimd.X86.AVX2() {
return
}
println("has avx2") // ERROR "has features avx[+]avx2$"
} // ERROR "has features avx[+]avx2$"
func g() {
- if simd.X86.AVX2() { // ERROR "has features avx[+]avx2$"
+ if archsimd.X86.AVX2() { // ERROR "has features avx[+]avx2$"
for range 5 { // ERROR "has features avx[+]avx2$"
if a < 0 { // ERROR "has features avx[+]avx2$"
a++ // ERROR "has features avx[+]avx2$"
}
func hasIrreducibleLoop() {
- if simd.X86.AVX2() {
+ if archsimd.X86.AVX2() {
goto a // ERROR "has features avx[+]avx2$"
} else {
goto b
println("c")
}
-func ternRewrite(m, w, x, y, z simd.Int32x16) (t0, t1, t2 simd.Int32x16) {
- if !simd.X86.AVX512() { // ERROR "has features avx[+]avx2[+]avx512$"
+func ternRewrite(m, w, x, y, z archsimd.Int32x16) (t0, t1, t2 archsimd.Int32x16) {
+ if !archsimd.X86.AVX512() { // ERROR "has features avx[+]avx2[+]avx512$"
return // ERROR "has features avx[+]avx2[+]avx512$" // all blocks have it because of the vector size
}
t0 = w.Xor(y).Xor(z) // ERROR "Rewriting.*ternInt"
return // ERROR "has features avx[+]avx2[+]avx512$"
}
-func ternTricky1(x, y, z simd.Int32x8) simd.Int32x8 {
+func ternTricky1(x, y, z archsimd.Int32x8) archsimd.Int32x8 {
// Int32x8 is a 256-bit vector and does not guarantee AVX-512
// a is a 3-variable logical expression occurring outside AVX-512 feature check
a := x.Xor(y).Xor(z)
- var w simd.Int32x8
- if !simd.X86.AVX512() { // ERROR "has features avx$"
+ var w archsimd.Int32x8
+ if !archsimd.X86.AVX512() { // ERROR "has features avx$"
// do nothing
} else {
w = y.AndNot(a) // ERROR "has features avx[+]avx2[+]avx512" "Rewriting.*ternInt"
return a.Or(w) // ERROR "has features avx$"
}
-func ternTricky2(x, y, z simd.Int32x8) simd.Int32x8 {
+func ternTricky2(x, y, z archsimd.Int32x8) archsimd.Int32x8 {
// Int32x8 is a 256-bit vector and does not guarantee AVX-512
- var a, w simd.Int32x8
- if !simd.X86.AVX512() { // ERROR "has features avx$"
+ var a, w archsimd.Int32x8
+ if !archsimd.X86.AVX512() { // ERROR "has features avx$"
// do nothing
} else {
a = x.Xor(y).Xor(z)
return a.Or(w) // ERROR "has features avx$"
}
-func ternTricky3(x, y, z simd.Int32x8) simd.Int32x8 {
+func ternTricky3(x, y, z archsimd.Int32x8) archsimd.Int32x8 {
// Int32x8 is a 256-bit vector and does not guarantee AVX-512
a := x.Xor(y).Xor(z)
w := y.AndNot(a)
- if !simd.X86.AVX512() { // ERROR "has features avx$"
+ if !archsimd.X86.AVX512() { // ERROR "has features avx$"
return a // ERROR "has features avx$"
}
// a is a common subexpression
package p
import (
- "simd"
+ "simd/archsimd"
"unsafe"
)
tos *[2][4][4]float32,
blend int,
) {
- tiny := simd.BroadcastFloat32x8(0)
+ tiny := archsimd.BroadcastFloat32x8(0)
for {
- dstCol12 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[0][0:]))))
- dstCol34 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[0][2:]))))
- dstCol56 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[1][0:]))))
- dstCol78 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[1][2:]))))
+ dstCol12 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[0][0:]))))
+ dstCol34 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[0][2:]))))
+ dstCol56 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[1][0:]))))
+ dstCol78 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(dst[1][2:]))))
- tosCol12 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[0][0:]))))
- tosCol34 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[0][2:]))))
- tosCol56 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[1][0:]))))
- tosCol78 := simd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[1][2:]))))
+ tosCol12 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[0][0:]))))
+ tosCol34 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[0][2:]))))
+ tosCol56 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[1][0:]))))
+ tosCol78 := archsimd.LoadFloat32x8((*[8]float32)(unsafe.Pointer((*[2][4]float32)(tos[1][2:]))))
- var Cr0, Cr1, Cr2 simd.Float32x8
+ var Cr0, Cr1, Cr2 archsimd.Float32x8
if blend != 0 {
invas := tosCol78.Max(tiny)
invad := dstCol78.Max(tiny)
Cs0 := tosCol12.Mul(invas)
Cs1 := tosCol34.Mul(invas)
Cs2 := tosCol56.Mul(invas)
- var Cm0, Cm1, Cm2 simd.Float32x8
+ var Cm0, Cm1, Cm2 archsimd.Float32x8
switch blend {
case 4:
case 10:
Cr1 = dstCol78.Mul(Cs1).Mul(Cm1)
Cr2 = dstCol78.Mul(Cs2).Mul(Cm2)
}
- var resR, resG, resB, resA simd.Float32x8
+ var resR, resG, resB, resA archsimd.Float32x8
if blend == 0 {
resR = tosCol12
resG = tosCol34
package p
import (
- "simd"
+ "simd/archsimd"
)
func PackComplex(b bool) {
for {
if b {
var indices [4]uint32
- simd.Uint32x4{}.ShiftAllRight(20).Store(&indices)
+ archsimd.Uint32x4{}.ShiftAllRight(20).Store(&indices)
_ = indices[indices[0]]
}
}
px := &src[y]
if b {
var indices [4]uint32
- fu := simd.LoadFloat32x4(px).AsUint32x4()
+ fu := archsimd.LoadFloat32x4(px).AsUint32x4()
fu.ShiftAllRight(0).Store(nil)
- entry := simd.LoadUint32x4(&[4]uint32{
+ entry := archsimd.LoadUint32x4(&[4]uint32{
toSrgbTable[indices[0]],
})
var res [4]uint32
package foo
-import "simd"
+import "simd/archsimd"
-func hasClosure(a, b, c, d simd.Int64x4) (w, x, y, z simd.Int64x4) {
+func hasClosure(a, b, c, d archsimd.Int64x4) (w, x, y, z archsimd.Int64x4) {
shuf := func() { // ERROR "can inline hasClosure.func1"
w = z.RotateAllLeft(1).Xor(a)
x = w.RotateAllLeft(3).Xor(b)