ssa.OpAMD64VPADDQ256,
ssa.OpAMD64VPADDQ512,
ssa.OpAMD64VHADDPS128,
- ssa.OpAMD64VHADDPS256,
ssa.OpAMD64VHADDPD128,
- ssa.OpAMD64VHADDPD256,
ssa.OpAMD64VPHADDW128,
- ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD128,
+ ssa.OpAMD64VHADDPS256,
+ ssa.OpAMD64VHADDPD256,
+ ssa.OpAMD64VPHADDW256,
ssa.OpAMD64VPHADDD256,
ssa.OpAMD64VPHADDSW128,
ssa.OpAMD64VPHADDSW256,
ssa.OpAMD64VPSUBQ256,
ssa.OpAMD64VPSUBQ512,
ssa.OpAMD64VHSUBPS128,
- ssa.OpAMD64VHSUBPS256,
ssa.OpAMD64VHSUBPD128,
- ssa.OpAMD64VHSUBPD256,
ssa.OpAMD64VPHSUBW128,
- ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD128,
+ ssa.OpAMD64VHSUBPS256,
+ ssa.OpAMD64VHSUBPD256,
+ ssa.OpAMD64VPHSUBW256,
ssa.OpAMD64VPHSUBD256,
ssa.OpAMD64VPHSUBSW128,
ssa.OpAMD64VPHSUBSW256,
(AddUint64x4 ...) => (VPADDQ256 ...)
(AddUint64x8 ...) => (VPADDQ512 ...)
(AddPairsFloat32x4 ...) => (VHADDPS128 ...)
-(AddPairsFloat32x8 ...) => (VHADDPS256 ...)
(AddPairsFloat64x2 ...) => (VHADDPD128 ...)
-(AddPairsFloat64x4 ...) => (VHADDPD256 ...)
(AddPairsInt16x8 ...) => (VPHADDW128 ...)
-(AddPairsInt16x16 ...) => (VPHADDW256 ...)
(AddPairsInt32x4 ...) => (VPHADDD128 ...)
-(AddPairsInt32x8 ...) => (VPHADDD256 ...)
(AddPairsUint16x8 ...) => (VPHADDW128 ...)
-(AddPairsUint16x16 ...) => (VPHADDW256 ...)
(AddPairsUint32x4 ...) => (VPHADDD128 ...)
-(AddPairsUint32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedFloat32x8 ...) => (VHADDPS256 ...)
+(AddPairsGroupedFloat64x4 ...) => (VHADDPD256 ...)
+(AddPairsGroupedInt16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedInt32x8 ...) => (VPHADDD256 ...)
+(AddPairsGroupedUint16x16 ...) => (VPHADDW256 ...)
+(AddPairsGroupedUint32x8 ...) => (VPHADDD256 ...)
(AddPairsSaturatedInt16x8 ...) => (VPHADDSW128 ...)
-(AddPairsSaturatedInt16x16 ...) => (VPHADDSW256 ...)
+(AddPairsSaturatedGroupedInt16x16 ...) => (VPHADDSW256 ...)
(AddSaturatedInt8x16 ...) => (VPADDSB128 ...)
(AddSaturatedInt8x32 ...) => (VPADDSB256 ...)
(AddSaturatedInt8x64 ...) => (VPADDSB512 ...)
(SubUint64x4 ...) => (VPSUBQ256 ...)
(SubUint64x8 ...) => (VPSUBQ512 ...)
(SubPairsFloat32x4 ...) => (VHSUBPS128 ...)
-(SubPairsFloat32x8 ...) => (VHSUBPS256 ...)
(SubPairsFloat64x2 ...) => (VHSUBPD128 ...)
-(SubPairsFloat64x4 ...) => (VHSUBPD256 ...)
(SubPairsInt16x8 ...) => (VPHSUBW128 ...)
-(SubPairsInt16x16 ...) => (VPHSUBW256 ...)
(SubPairsInt32x4 ...) => (VPHSUBD128 ...)
-(SubPairsInt32x8 ...) => (VPHSUBD256 ...)
(SubPairsUint16x8 ...) => (VPHSUBW128 ...)
-(SubPairsUint16x16 ...) => (VPHSUBW256 ...)
(SubPairsUint32x4 ...) => (VPHSUBD128 ...)
-(SubPairsUint32x8 ...) => (VPHSUBD256 ...)
+(SubPairsGroupedFloat32x8 ...) => (VHSUBPS256 ...)
+(SubPairsGroupedFloat64x4 ...) => (VHSUBPD256 ...)
+(SubPairsGroupedInt16x16 ...) => (VPHSUBW256 ...)
+(SubPairsGroupedInt32x8 ...) => (VPHSUBD256 ...)
+(SubPairsGroupedUint16x16 ...) => (VPHSUBW256 ...)
+(SubPairsGroupedUint32x8 ...) => (VPHSUBD256 ...)
(SubPairsSaturatedInt16x8 ...) => (VPHSUBSW128 ...)
-(SubPairsSaturatedInt16x16 ...) => (VPHSUBSW256 ...)
+(SubPairsSaturatedGroupedInt16x16 ...) => (VPHSUBSW256 ...)
(SubSaturatedInt8x16 ...) => (VPSUBSB128 ...)
(SubSaturatedInt8x32 ...) => (VPSUBSB256 ...)
(SubSaturatedInt8x64 ...) => (VPSUBSB512 ...)
{name: "AddInt64x4", argLength: 2, commutative: true},
{name: "AddInt64x8", argLength: 2, commutative: true},
{name: "AddPairsFloat32x4", argLength: 2, commutative: false},
- {name: "AddPairsFloat32x8", argLength: 2, commutative: false},
{name: "AddPairsFloat64x2", argLength: 2, commutative: false},
- {name: "AddPairsFloat64x4", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedFloat32x8", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedFloat64x4", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedInt16x16", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedInt32x8", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedUint16x16", argLength: 2, commutative: false},
+ {name: "AddPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "AddPairsInt16x8", argLength: 2, commutative: false},
- {name: "AddPairsInt16x16", argLength: 2, commutative: false},
{name: "AddPairsInt32x4", argLength: 2, commutative: false},
- {name: "AddPairsInt32x8", argLength: 2, commutative: false},
+ {name: "AddPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsSaturatedInt16x8", argLength: 2, commutative: false},
- {name: "AddPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "AddPairsUint16x8", argLength: 2, commutative: false},
- {name: "AddPairsUint16x16", argLength: 2, commutative: false},
{name: "AddPairsUint32x4", argLength: 2, commutative: false},
- {name: "AddPairsUint32x8", argLength: 2, commutative: false},
{name: "AddSaturatedInt8x16", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x32", argLength: 2, commutative: true},
{name: "AddSaturatedInt8x64", argLength: 2, commutative: true},
{name: "SubInt64x4", argLength: 2, commutative: false},
{name: "SubInt64x8", argLength: 2, commutative: false},
{name: "SubPairsFloat32x4", argLength: 2, commutative: false},
- {name: "SubPairsFloat32x8", argLength: 2, commutative: false},
{name: "SubPairsFloat64x2", argLength: 2, commutative: false},
- {name: "SubPairsFloat64x4", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedFloat32x8", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedFloat64x4", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedInt16x16", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedInt32x8", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedUint16x16", argLength: 2, commutative: false},
+ {name: "SubPairsGroupedUint32x8", argLength: 2, commutative: false},
{name: "SubPairsInt16x8", argLength: 2, commutative: false},
- {name: "SubPairsInt16x16", argLength: 2, commutative: false},
{name: "SubPairsInt32x4", argLength: 2, commutative: false},
- {name: "SubPairsInt32x8", argLength: 2, commutative: false},
+ {name: "SubPairsSaturatedGroupedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsSaturatedInt16x8", argLength: 2, commutative: false},
- {name: "SubPairsSaturatedInt16x16", argLength: 2, commutative: false},
{name: "SubPairsUint16x8", argLength: 2, commutative: false},
- {name: "SubPairsUint16x16", argLength: 2, commutative: false},
{name: "SubPairsUint32x4", argLength: 2, commutative: false},
- {name: "SubPairsUint32x8", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x16", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x32", argLength: 2, commutative: false},
{name: "SubSaturatedInt8x64", argLength: 2, commutative: false},
OpAddInt64x4
OpAddInt64x8
OpAddPairsFloat32x4
- OpAddPairsFloat32x8
OpAddPairsFloat64x2
- OpAddPairsFloat64x4
+ OpAddPairsGroupedFloat32x8
+ OpAddPairsGroupedFloat64x4
+ OpAddPairsGroupedInt16x16
+ OpAddPairsGroupedInt32x8
+ OpAddPairsGroupedUint16x16
+ OpAddPairsGroupedUint32x8
OpAddPairsInt16x8
- OpAddPairsInt16x16
OpAddPairsInt32x4
- OpAddPairsInt32x8
+ OpAddPairsSaturatedGroupedInt16x16
OpAddPairsSaturatedInt16x8
- OpAddPairsSaturatedInt16x16
OpAddPairsUint16x8
- OpAddPairsUint16x16
OpAddPairsUint32x4
- OpAddPairsUint32x8
OpAddSaturatedInt8x16
OpAddSaturatedInt8x32
OpAddSaturatedInt8x64
OpSubInt64x4
OpSubInt64x8
OpSubPairsFloat32x4
- OpSubPairsFloat32x8
OpSubPairsFloat64x2
- OpSubPairsFloat64x4
+ OpSubPairsGroupedFloat32x8
+ OpSubPairsGroupedFloat64x4
+ OpSubPairsGroupedInt16x16
+ OpSubPairsGroupedInt32x8
+ OpSubPairsGroupedUint16x16
+ OpSubPairsGroupedUint32x8
OpSubPairsInt16x8
- OpSubPairsInt16x16
OpSubPairsInt32x4
- OpSubPairsInt32x8
+ OpSubPairsSaturatedGroupedInt16x16
OpSubPairsSaturatedInt16x8
- OpSubPairsSaturatedInt16x16
OpSubPairsUint16x8
- OpSubPairsUint16x16
OpSubPairsUint32x4
- OpSubPairsUint32x8
OpSubSaturatedInt8x16
OpSubSaturatedInt8x32
OpSubSaturatedInt8x64
generic: true,
},
{
- name: "AddPairsFloat32x8",
+ name: "AddPairsFloat64x2",
argLen: 2,
generic: true,
},
{
- name: "AddPairsFloat64x2",
+ name: "AddPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsFloat64x4",
+ name: "AddPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
- name: "AddPairsInt16x8",
+ name: "AddPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
- name: "AddPairsInt16x16",
+ name: "AddPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsInt32x4",
+ name: "AddPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
- name: "AddPairsInt32x8",
+ name: "AddPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsSaturatedInt16x8",
+ name: "AddPairsInt16x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsSaturatedInt16x16",
+ name: "AddPairsInt32x4",
argLen: 2,
generic: true,
},
{
- name: "AddPairsUint16x8",
+ name: "AddPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
{
- name: "AddPairsUint16x16",
+ name: "AddPairsSaturatedInt16x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsUint32x4",
+ name: "AddPairsUint16x8",
argLen: 2,
generic: true,
},
{
- name: "AddPairsUint32x8",
+ name: "AddPairsUint32x4",
argLen: 2,
generic: true,
},
generic: true,
},
{
- name: "SubPairsFloat32x8",
+ name: "SubPairsFloat64x2",
argLen: 2,
generic: true,
},
{
- name: "SubPairsFloat64x2",
+ name: "SubPairsGroupedFloat32x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsFloat64x4",
+ name: "SubPairsGroupedFloat64x4",
argLen: 2,
generic: true,
},
{
- name: "SubPairsInt16x8",
+ name: "SubPairsGroupedInt16x16",
argLen: 2,
generic: true,
},
{
- name: "SubPairsInt16x16",
+ name: "SubPairsGroupedInt32x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsInt32x4",
+ name: "SubPairsGroupedUint16x16",
argLen: 2,
generic: true,
},
{
- name: "SubPairsInt32x8",
+ name: "SubPairsGroupedUint32x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsSaturatedInt16x8",
+ name: "SubPairsInt16x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsSaturatedInt16x16",
+ name: "SubPairsInt32x4",
argLen: 2,
generic: true,
},
{
- name: "SubPairsUint16x8",
+ name: "SubPairsSaturatedGroupedInt16x16",
argLen: 2,
generic: true,
},
{
- name: "SubPairsUint16x16",
+ name: "SubPairsSaturatedInt16x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsUint32x4",
+ name: "SubPairsUint16x8",
argLen: 2,
generic: true,
},
{
- name: "SubPairsUint32x8",
+ name: "SubPairsUint32x4",
argLen: 2,
generic: true,
},
case OpAddPairsFloat32x4:
v.Op = OpAMD64VHADDPS128
return true
- case OpAddPairsFloat32x8:
- v.Op = OpAMD64VHADDPS256
- return true
case OpAddPairsFloat64x2:
v.Op = OpAMD64VHADDPD128
return true
- case OpAddPairsFloat64x4:
+ case OpAddPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHADDPS256
+ return true
+ case OpAddPairsGroupedFloat64x4:
v.Op = OpAMD64VHADDPD256
return true
- case OpAddPairsInt16x16:
+ case OpAddPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHADDW256
+ return true
+ case OpAddPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
+ case OpAddPairsGroupedUint16x16:
v.Op = OpAMD64VPHADDW256
return true
+ case OpAddPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHADDD256
+ return true
case OpAddPairsInt16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsInt32x4:
v.Op = OpAMD64VPHADDD128
return true
- case OpAddPairsInt32x8:
- v.Op = OpAMD64VPHADDD256
- return true
- case OpAddPairsSaturatedInt16x16:
+ case OpAddPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHADDSW256
return true
case OpAddPairsSaturatedInt16x8:
v.Op = OpAMD64VPHADDSW128
return true
- case OpAddPairsUint16x16:
- v.Op = OpAMD64VPHADDW256
- return true
case OpAddPairsUint16x8:
v.Op = OpAMD64VPHADDW128
return true
case OpAddPairsUint32x4:
v.Op = OpAMD64VPHADDD128
return true
- case OpAddPairsUint32x8:
- v.Op = OpAMD64VPHADDD256
- return true
case OpAddPtr:
v.Op = OpAMD64ADDQ
return true
case OpSubPairsFloat32x4:
v.Op = OpAMD64VHSUBPS128
return true
- case OpSubPairsFloat32x8:
- v.Op = OpAMD64VHSUBPS256
- return true
case OpSubPairsFloat64x2:
v.Op = OpAMD64VHSUBPD128
return true
- case OpSubPairsFloat64x4:
+ case OpSubPairsGroupedFloat32x8:
+ v.Op = OpAMD64VHSUBPS256
+ return true
+ case OpSubPairsGroupedFloat64x4:
v.Op = OpAMD64VHSUBPD256
return true
- case OpSubPairsInt16x16:
+ case OpSubPairsGroupedInt16x16:
+ v.Op = OpAMD64VPHSUBW256
+ return true
+ case OpSubPairsGroupedInt32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
+ case OpSubPairsGroupedUint16x16:
v.Op = OpAMD64VPHSUBW256
return true
+ case OpSubPairsGroupedUint32x8:
+ v.Op = OpAMD64VPHSUBD256
+ return true
case OpSubPairsInt16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsInt32x4:
v.Op = OpAMD64VPHSUBD128
return true
- case OpSubPairsInt32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
- case OpSubPairsSaturatedInt16x16:
+ case OpSubPairsSaturatedGroupedInt16x16:
v.Op = OpAMD64VPHSUBSW256
return true
case OpSubPairsSaturatedInt16x8:
v.Op = OpAMD64VPHSUBSW128
return true
- case OpSubPairsUint16x16:
- v.Op = OpAMD64VPHSUBW256
- return true
case OpSubPairsUint16x8:
v.Op = OpAMD64VPHSUBW128
return true
case OpSubPairsUint32x4:
v.Op = OpAMD64VPHSUBD128
return true
- case OpSubPairsUint32x8:
- v.Op = OpAMD64VPHSUBD256
- return true
case OpSubPtr:
v.Op = OpAMD64SUBQ
return true
addF(simdPackage, "Uint64x4.Add", opLen2(ssa.OpAddUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Add", opLen2(ssa.OpAddUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.AddPairs", opLen2(ssa.OpAddPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.AddPairs", opLen2(ssa.OpAddPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.AddPairs", opLen2(ssa.OpAddPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x4.AddPairs", opLen2(ssa.OpAddPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairs", opLen2(ssa.OpAddPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairs", opLen2(ssa.OpAddPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.AddPairs", opLen2(ssa.OpAddPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x8.AddPairs", opLen2(ssa.OpAddPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.AddPairs", opLen2(ssa.OpAddPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x16.AddPairs", opLen2(ssa.OpAddPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.AddPairs", opLen2(ssa.OpAddPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x8.AddPairs", opLen2(ssa.OpAddPairsUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.AddPairsGrouped", opLen2(ssa.OpAddPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.AddPairsSaturated", opLen2(ssa.OpAddPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.AddPairsSaturatedGrouped", opLen2(ssa.OpAddPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.AddSaturated", opLen2(ssa.OpAddSaturatedInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x4.Sub", opLen2(ssa.OpSubUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Sub", opLen2(ssa.OpSubUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.SubPairs", opLen2(ssa.OpSubPairsFloat32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float32x8.SubPairs", opLen2(ssa.OpSubPairsFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x2.SubPairs", opLen2(ssa.OpSubPairsFloat64x2, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Float64x4.SubPairs", opLen2(ssa.OpSubPairsFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairs", opLen2(ssa.OpSubPairsInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairs", opLen2(ssa.OpSubPairsInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x4.SubPairs", opLen2(ssa.OpSubPairsInt32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int32x8.SubPairs", opLen2(ssa.OpSubPairsInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x8.SubPairs", opLen2(ssa.OpSubPairsUint16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint16x16.SubPairs", opLen2(ssa.OpSubPairsUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x4.SubPairs", opLen2(ssa.OpSubPairsUint32x4, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Uint32x8.SubPairs", opLen2(ssa.OpSubPairsUint32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Float64x4.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedFloat64x4, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedInt32x8, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint16x16.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Uint32x8.SubPairsGrouped", opLen2(ssa.OpSubPairsGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x8.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x8, types.TypeVec128), sys.AMD64)
- addF(simdPackage, "Int16x16.SubPairsSaturated", opLen2(ssa.OpSubPairsSaturatedInt16x16, types.TypeVec256), sys.AMD64)
+ addF(simdPackage, "Int16x16.SubPairsSaturatedGrouped", opLen2(ssa.OpSubPairsSaturatedGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x16.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.SubSaturated", opLen2(ssa.OpSubSaturatedInt8x64, types.TypeVec512), sys.AMD64)
// NAME subtracts corresponding elements of two vectors with saturation.
- go: AddPairs
commutative: false
+ out:
+ - elemBits: 16|32
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: AddPairs
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements.
+ // For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
- go: SubPairs
commutative: false
+ out:
+ - elemBits: 16|32
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: SubPairs
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements.
+ // For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
- go: AddPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally adds adjacent pairs of elements with saturation.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
- go: SubPairsSaturated
commutative: false
documentation: !string |-
// NAME horizontally subtracts adjacent pairs of elements with saturation.
- // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+ // For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: AddPairsGrouped
+ commutative: false
+ out:
+ - elemBits: 16|32
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements.
+ // With each 128-bit as a group:
+ // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: AddPairsGrouped
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements.
+ // With each 128-bit as a group:
+ // for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
+- go: SubPairsGrouped
+ commutative: false
+ out:
+ - elemBits: 16|32
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements.
+ // With each 128-bit as a group:
+ // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+- go: SubPairsGrouped
+ commutative: false
+ out:
+ - elemBits: 64
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements.
+ // With each 128-bit as a group:
+ // for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
+- go: AddPairsSaturatedGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally adds adjacent pairs of elements with saturation.
+ // With each 128-bit as a group:
+ // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+- go: SubPairsSaturatedGrouped
+ commutative: false
+ documentation: !string |-
+ // NAME horizontally subtracts adjacent pairs of elements with saturation.
+ // With each 128-bit as a group:
+ // for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
- *uint
- go: AddPairs
asm: "VPHADD[DW]"
- in: *2any
- out: *1any
+ in: &2any128
+ - &any128
+ go: $t
+ bits: 128
+ - *any128
+ out: &1any128
+ - *any128
- go: SubPairs
asm: "VPHSUB[DW]"
- in: *2any
- out: *1any
+ in: *2any128
+ out: *1any128
- go: AddPairs
asm: "VHADDP[SD]" # floats
- in: *2any
- out: *1any
+ in: *2any128
+ out: *1any128
- go: SubPairs
asm: "VHSUBP[SD]" # floats
- in: *2any
- out: *1any
+ in: *2any128
+ out: *1any128
- go: AddPairsSaturated
asm: "VPHADDS[DW]"
- in: *2int
- out: *1int
+ in: &2int128
+ - &int128
+ go: $t
+ base: int
+ bits: 128
+ - *int128
+ out: &1int128
+ - *int128
- go: SubPairsSaturated
asm: "VPHSUBS[DW]"
- in: *2int
- out: *1int
+ in: *2int128
+ out: *1int128
+- go: AddPairsGrouped
+ asm: "VPHADD[DW]"
+ in: &2any256
+ - &any256
+ go: $t
+ bits: 256
+ - *any256
+ out: &1any256
+ - *any256
+- go: SubPairsGrouped
+ asm: "VPHSUB[DW]"
+ in: *2any256
+ out: *1any256
+- go: AddPairsGrouped
+ asm: "VHADDP[SD]" # floats
+ in: *2any256
+ out: *1any256
+- go: SubPairsGrouped
+ asm: "VHSUBP[SD]" # floats
+ in: *2any256
+ out: *1any256
+- go: AddPairsSaturatedGrouped
+ asm: "VPHADDS[DW]"
+ in: &2int256
+ - &int256
+ go: $t
+ base: int
+ bits: 256
+ - *int256
+ out: &1int256
+ - *int256
+- go: SubPairsSaturatedGrouped
+ asm: "VPHSUBS[DW]"
+ in: *2int256
+ out: *1int256
"simd/archsimd"
"slices"
"testing"
+ "unsafe"
)
func TestMain(m *testing.M) {
foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
}
+
+// addPairsSlice is the scalar reference model for AddPairs: the first half of
+// the result holds sums of adjacent pairs of a, the second half holds sums of
+// adjacent pairs of b, i.e. [a0+a1, a2+a3, ..., b0+b1, b2+b3, ...].
+func addPairsSlice[T number](a, b []T) []T {
+	r := make([]T, len(a))
+	for i := range len(a) / 2 {
+		r[i] = a[2*i] + a[2*i+1]
+		r[i+len(a)/2] = b[2*i] + b[2*i+1]
+	}
+	return r
+}
+
+// subPairsSlice is the scalar reference model for SubPairs: the first half of
+// the result holds differences of adjacent pairs of a, the second half holds
+// differences of adjacent pairs of b, i.e. [a0-a1, a2-a3, ..., b0-b1, b2-b3, ...].
+func subPairsSlice[T number](a, b []T) []T {
+	r := make([]T, len(a))
+	for i := range len(a) / 2 {
+		r[i] = a[2*i] - a[2*i+1]
+		r[i+len(a)/2] = b[2*i] - b[2*i+1]
+	}
+	return r
+}
+
+// addPairsGroupedSlice is the scalar reference model for AddPairsGrouped: it
+// applies the pairwise add independently within each 128-bit group of the
+// input vectors, matching the per-lane behavior of the 256-bit VPHADD/VHADDP
+// instructions.
+func addPairsGroupedSlice[T number](a, b []T) []T {
+	// A group is 128 bits; unsafe.Sizeof reports bytes, so convert bits to
+	// bytes before dividing. (128 / Sizeof alone overcounts elements per
+	// group by 8x, which would make the loop below run zero times for every
+	// 256-bit vector under test and return an empty slice.)
+	group := int(128 / 8 / unsafe.Sizeof(a[0]))
+	r := make([]T, 0, len(a))
+	for i := range len(a) / group {
+		r = append(r, addPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
+	}
+	return r
+}
+
+// subPairsGroupedSlice is the scalar reference model for SubPairsGrouped: it
+// applies the pairwise subtract independently within each 128-bit group of
+// the input vectors, matching the per-lane behavior of the 256-bit
+// VPHSUB/VHSUBP instructions.
+func subPairsGroupedSlice[T number](a, b []T) []T {
+	// A group is 128 bits; unsafe.Sizeof reports bytes, so convert bits to
+	// bytes before dividing. (128 / Sizeof alone overcounts elements per
+	// group by 8x, which would make the loop below run zero times for every
+	// 256-bit vector under test and return an empty slice.)
+	group := int(128 / 8 / unsafe.Sizeof(a[0]))
+	r := make([]T, 0, len(a))
+	for i := range len(a) / group {
+		r = append(r, subPairsSlice(a[i*group:(i+1)*group], b[i*group:(i+1)*group])...)
+	}
+	return r
+}
+
+// TestAddSubPairs checks the 128-bit AddPairs/SubPairs methods and the
+// 256-bit AddPairsGrouped/SubPairsGrouped methods against their scalar
+// reference implementations above.
+func TestAddSubPairs(t *testing.T) {
+	testInt16x8Binary(t, archsimd.Int16x8.AddPairs, addPairsSlice[int16])
+	testInt16x8Binary(t, archsimd.Int16x8.SubPairs, subPairsSlice[int16])
+	testUint16x8Binary(t, archsimd.Uint16x8.AddPairs, addPairsSlice[uint16])
+	testUint16x8Binary(t, archsimd.Uint16x8.SubPairs, subPairsSlice[uint16])
+	testInt32x4Binary(t, archsimd.Int32x4.AddPairs, addPairsSlice[int32])
+	testInt32x4Binary(t, archsimd.Int32x4.SubPairs, subPairsSlice[int32])
+	testUint32x4Binary(t, archsimd.Uint32x4.AddPairs, addPairsSlice[uint32])
+	testUint32x4Binary(t, archsimd.Uint32x4.SubPairs, subPairsSlice[uint32])
+	testFloat32x4Binary(t, archsimd.Float32x4.AddPairs, addPairsSlice[float32])
+	testFloat32x4Binary(t, archsimd.Float32x4.SubPairs, subPairsSlice[float32])
+	testFloat64x2Binary(t, archsimd.Float64x2.AddPairs, addPairsSlice[float64])
+	testFloat64x2Binary(t, archsimd.Float64x2.SubPairs, subPairsSlice[float64])
+
+	// Grouped versions
+	// NOTE(review): the integer variants need AVX2, but the float 256-bit
+	// variants (VHADDPS/VHADDPD) only need AVX; gating all of them behind
+	// AVX2 is stricter than necessary — confirm whether that is intentional.
+	if archsimd.X86.AVX2() {
+		testInt16x16Binary(t, archsimd.Int16x16.AddPairsGrouped, addPairsGroupedSlice[int16])
+		testInt16x16Binary(t, archsimd.Int16x16.SubPairsGrouped, subPairsGroupedSlice[int16])
+		testUint16x16Binary(t, archsimd.Uint16x16.AddPairsGrouped, addPairsGroupedSlice[uint16])
+		testUint16x16Binary(t, archsimd.Uint16x16.SubPairsGrouped, subPairsGroupedSlice[uint16])
+		testInt32x8Binary(t, archsimd.Int32x8.AddPairsGrouped, addPairsGroupedSlice[int32])
+		testInt32x8Binary(t, archsimd.Int32x8.SubPairsGrouped, subPairsGroupedSlice[int32])
+		testUint32x8Binary(t, archsimd.Uint32x8.AddPairsGrouped, addPairsGroupedSlice[uint32])
+		testUint32x8Binary(t, archsimd.Uint32x8.SubPairsGrouped, subPairsGroupedSlice[uint32])
+		testFloat32x8Binary(t, archsimd.Float32x8.AddPairsGrouped, addPairsGroupedSlice[float32])
+		testFloat32x8Binary(t, archsimd.Float32x8.SubPairsGrouped, subPairsGroupedSlice[float32])
+		testFloat64x4Binary(t, archsimd.Float64x4.AddPairsGrouped, addPairsGroupedSlice[float64])
+		testFloat64x4Binary(t, archsimd.Float64x4.SubPairsGrouped, subPairsGroupedSlice[float64])
+	}
+}
/* AddPairs */
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VHADDPS, CPU Feature: AVX
func (x Float32x4) AddPairs(y Float32x4) Float32x4
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
-//
-// Asm: VHADDPS, CPU Feature: AVX
-func (x Float32x8) AddPairs(y Float32x8) Float32x8
-
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
// Asm: VHADDPD, CPU Feature: AVX
func (x Float64x2) AddPairs(y Float64x2) Float64x2
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
-// Asm: VHADDPD, CPU Feature: AVX
-func (x Float64x4) AddPairs(y Float64x4) Float64x4
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Int16x8) AddPairs(y Int16x8) Int16x8
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Int16x8) AddPairs(y Int16x8) Int16x8
+// Asm: VPHADDD, CPU Feature: AVX
+func (x Int32x4) AddPairs(y Int32x4) Int32x4
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
-// Asm: VPHADDW, CPU Feature: AVX2
-func (x Int16x16) AddPairs(y Int16x16) Int16x16
+// Asm: VPHADDW, CPU Feature: AVX
+func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX
-func (x Int32x4) AddPairs(y Int32x4) Int32x4
+func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+/* AddPairsGrouped */
+
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
-// Asm: VPHADDD, CPU Feature: AVX2
-func (x Int32x8) AddPairs(y Int32x8) Int32x8
+// Asm: VHADDPS, CPU Feature: AVX
+func (x Float32x8) AddPairsGrouped(y Float32x8) Float32x8
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0+x1, y0+y1].
//
-// Asm: VPHADDW, CPU Feature: AVX
-func (x Uint16x8) AddPairs(y Uint16x8) Uint16x8
+// Asm: VHADDPD, CPU Feature: AVX
+func (x Float64x4) AddPairsGrouped(y Float64x4) Float64x4
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDW, CPU Feature: AVX2
-func (x Uint16x16) AddPairs(y Uint16x16) Uint16x16
+func (x Int16x16) AddPairsGrouped(y Int16x16) Int16x16
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
-// Asm: VPHADDD, CPU Feature: AVX
-func (x Uint32x4) AddPairs(y Uint32x4) Uint32x4
+// Asm: VPHADDD, CPU Feature: AVX2
+func (x Int32x8) AddPairsGrouped(y Int32x8) Int32x8
-// AddPairs horizontally adds adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
+//
+// Asm: VPHADDW, CPU Feature: AVX2
+func (x Uint16x16) AddPairsGrouped(y Uint16x16) Uint16x16
+
+// AddPairsGrouped horizontally adds adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDD, CPU Feature: AVX2
-func (x Uint32x8) AddPairs(y Uint32x8) Uint32x8
+func (x Uint32x8) AddPairsGrouped(y Uint32x8) Uint32x8
/* AddPairsSaturated */
// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX
func (x Int16x8) AddPairsSaturated(y Int16x8) Int16x8
-// AddPairsSaturated horizontally adds adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0+y1, y2+y3, ..., x0+x1, x2+x3, ...].
+/* AddPairsSaturatedGrouped */
+
+// AddPairsSaturatedGrouped horizontally adds adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0+x1, x2+x3, ..., y0+y1, y2+y3, ...].
//
// Asm: VPHADDSW, CPU Feature: AVX2
-func (x Int16x16) AddPairsSaturated(y Int16x16) Int16x16
+func (x Int16x16) AddPairsSaturatedGrouped(y Int16x16) Int16x16
/* AddSaturated */
/* SubPairs */
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VHSUBPS, CPU Feature: AVX
func (x Float32x4) SubPairs(y Float32x4) Float32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
-//
-// Asm: VHSUBPS, CPU Feature: AVX
-func (x Float32x8) SubPairs(y Float32x8) Float32x8
-
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
// Asm: VHSUBPD, CPU Feature: AVX
func (x Float64x2) SubPairs(y Float64x2) Float64x2
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
-// Asm: VHSUBPD, CPU Feature: AVX
-func (x Float64x4) SubPairs(y Float64x4) Float64x4
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Int16x8) SubPairs(y Int16x8) Int16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Int16x8) SubPairs(y Int16x8) Int16x8
+// Asm: VPHSUBD, CPU Feature: AVX
+func (x Int32x4) SubPairs(y Int32x4) Int32x4
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
-// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Int16x16) SubPairs(y Int16x16) Int16x16
+// Asm: VPHSUBW, CPU Feature: AVX
+func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX
-func (x Int32x4) SubPairs(y Int32x4) Int32x4
+func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+/* SubPairsGrouped */
+
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
-// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Int32x8) SubPairs(y Int32x8) Int32x8
+// Asm: VHSUBPS, CPU Feature: AVX
+func (x Float32x8) SubPairsGrouped(y Float32x8) Float32x8
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1] and y = [y0, y1], the result is [x0-x1, y0-y1].
//
-// Asm: VPHSUBW, CPU Feature: AVX
-func (x Uint16x8) SubPairs(y Uint16x8) Uint16x8
+// Asm: VHSUBPD, CPU Feature: AVX
+func (x Float64x4) SubPairsGrouped(y Float64x4) Float64x4
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBW, CPU Feature: AVX2
-func (x Uint16x16) SubPairs(y Uint16x16) Uint16x16
+func (x Int16x16) SubPairsGrouped(y Int16x16) Int16x16
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
-// Asm: VPHSUBD, CPU Feature: AVX
-func (x Uint32x4) SubPairs(y Uint32x4) Uint32x4
+// Asm: VPHSUBD, CPU Feature: AVX2
+func (x Int32x8) SubPairsGrouped(y Int32x8) Int32x8
-// SubPairs horizontally subtracts adjacent pairs of elements.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
+//
+// Asm: VPHSUBW, CPU Feature: AVX2
+func (x Uint16x16) SubPairsGrouped(y Uint16x16) Uint16x16
+
+// SubPairsGrouped horizontally subtracts adjacent pairs of elements.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBD, CPU Feature: AVX2
-func (x Uint32x8) SubPairs(y Uint32x8) Uint32x8
+func (x Uint32x8) SubPairsGrouped(y Uint32x8) Uint32x8
/* SubPairsSaturated */
// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX
func (x Int16x8) SubPairsSaturated(y Int16x8) Int16x8
-// SubPairsSaturated horizontally subtracts adjacent pairs of elements with saturation.
-// For x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [y0-y1, y2-y3, ..., x0-x1, x2-x3, ...].
+/* SubPairsSaturatedGrouped */
+
+// SubPairsSaturatedGrouped horizontally subtracts adjacent pairs of elements with saturation.
+// With each 128-bit as a group:
+// for x = [x0, x1, x2, x3, ...] and y = [y0, y1, y2, y3, ...], the result is [x0-x1, x2-x3, ..., y0-y1, y2-y3, ...].
//
// Asm: VPHSUBSW, CPU Feature: AVX2
-func (x Int16x16) SubPairsSaturated(y Int16x16) Int16x16
+func (x Int16x16) SubPairsSaturatedGrouped(y Int16x16) Int16x16
/* SubSaturated */