Now with comments, and also a test.
The choice of data types, method names, etc., is all up for comment.
The operation is NOT commutative, because of the immediate operand
(unless we swap the bits of the immediate).
Change-Id: I730a6938c6803d0b93544445db65eadc51783e42
Reviewed-on: https://go-review.googlesource.com/c/go/+/726963
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
ssa.OpAMD64VPSHRDQ128,
ssa.OpAMD64VPSHRDQ256,
ssa.OpAMD64VPSHRDQ512,
+ ssa.OpAMD64VPCLMULQDQ128,
+ ssa.OpAMD64VPCLMULQDQ256,
+ ssa.OpAMD64VPCLMULQDQ512,
ssa.OpAMD64VSHUFPS128,
ssa.OpAMD64VSHUFPD128,
ssa.OpAMD64VSHUFPS256,
(blendMaskedInt16x32 x y mask) => (VPBLENDMWMasked512 x y (VPMOVVec16x32ToM <types.TypeMask> mask))
(blendMaskedInt32x16 x y mask) => (VPBLENDMDMasked512 x y (VPMOVVec32x16ToM <types.TypeMask> mask))
(blendMaskedInt64x8 x y mask) => (VPBLENDMQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
+(carrylessMultiplyUint64x2 ...) => (VPCLMULQDQ128 ...)
+(carrylessMultiplyUint64x4 ...) => (VPCLMULQDQ256 ...)
+(carrylessMultiplyUint64x8 ...) => (VPCLMULQDQ512 ...)
(concatSelectedConstantFloat32x4 ...) => (VSHUFPS128 ...)
(concatSelectedConstantFloat64x2 ...) => (VSHUFPD128 ...)
(concatSelectedConstantInt32x4 ...) => (VSHUFPS128 ...)
{name: "VPALIGNRMasked128", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPALIGNRMasked256", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPALIGNRMasked512", argLength: 3, reg: w2kw, asm: "VPALIGNR", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+ {name: "VPCLMULQDQ128", argLength: 2, reg: v21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
+ {name: "VPCLMULQDQ256", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
+ {name: "VPCLMULQDQ512", argLength: 2, reg: w21, asm: "VPCLMULQDQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked256", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "UInt8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "TruncScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "TruncScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "TruncScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+ {name: "carrylessMultiplyUint64x2", argLength: 2, commutative: false, aux: "UInt8"},
+ {name: "carrylessMultiplyUint64x4", argLength: 2, commutative: false, aux: "UInt8"},
+ {name: "carrylessMultiplyUint64x8", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantFloat32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantFloat64x2", argLength: 2, commutative: false, aux: "UInt8"},
{name: "concatSelectedConstantGroupedFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
OpAMD64VPALIGNRMasked128
OpAMD64VPALIGNRMasked256
OpAMD64VPALIGNRMasked512
+ OpAMD64VPCLMULQDQ128
+ OpAMD64VPCLMULQDQ256
+ OpAMD64VPCLMULQDQ512
OpAMD64VPCMPB512
OpAMD64VPCMPBMasked128
OpAMD64VPCMPBMasked256
OpTruncScaledResidueFloat64x2
OpTruncScaledResidueFloat64x4
OpTruncScaledResidueFloat64x8
+ OpcarrylessMultiplyUint64x2
+ OpcarrylessMultiplyUint64x4
+ OpcarrylessMultiplyUint64x8
OpconcatSelectedConstantFloat32x4
OpconcatSelectedConstantFloat64x2
OpconcatSelectedConstantGroupedFloat32x8
},
},
},
+ {
+ name: "VPCLMULQDQ128",
+ auxType: auxUInt8, // 8-bit immediate carried as the op's aux value
+ argLen: 2,
+ asm: x86.AVPCLMULQDQ,
+ reg: regInfo{
+ // Both vector inputs accept the same register set, matching
+ // input 1 and the symmetric input masks of the 256- and 512-bit
+ // variants; only the output mask leaves out X15.
+ inputs: []inputInfo{
+ {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+ },
+ outputs: []outputInfo{
+ {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+ },
+ },
+ },
+ {
+ name: "VPCLMULQDQ256",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPCLMULQDQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
+ {
+ name: "VPCLMULQDQ512",
+ auxType: auxUInt8,
+ argLen: 2,
+ asm: x86.AVPCLMULQDQ,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ outputs: []outputInfo{
+ {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
+ },
+ },
+ },
{
name: "VPCMPB512",
auxType: auxUInt8,
argLen: 1,
generic: true,
},
+ {
+ name: "carrylessMultiplyUint64x2",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "carrylessMultiplyUint64x4",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
+ {
+ name: "carrylessMultiplyUint64x8",
+ auxType: auxUInt8,
+ argLen: 2,
+ generic: true,
+ },
{
name: "concatSelectedConstantFloat32x4",
auxType: auxUInt8,
return rewriteValueAMD64_OpblendMaskedInt64x8(v)
case OpblendMaskedInt8x64:
return rewriteValueAMD64_OpblendMaskedInt8x64(v)
+ case OpcarrylessMultiplyUint64x2:
+ v.Op = OpAMD64VPCLMULQDQ128
+ return true
+ case OpcarrylessMultiplyUint64x4:
+ v.Op = OpAMD64VPCLMULQDQ256
+ return true
+ case OpcarrylessMultiplyUint64x8:
+ v.Op = OpAMD64VPCLMULQDQ512
+ return true
case OpconcatSelectedConstantFloat32x4:
v.Op = OpAMD64VSHUFPS128
return true
addF(simdPackage, "Int16x32.blendMasked", opLen3(ssa.OpblendMaskedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.blendMasked", opLen3(ssa.OpblendMaskedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.blendMasked", opLen3(ssa.OpblendMaskedInt64x8, types.TypeVec512), sys.AMD64)
+ addF(simdPackage, "Uint64x2.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x2, types.TypeVec128, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x4.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x4, types.TypeVec256, 0), sys.AMD64)
+ addF(simdPackage, "Uint64x8.carrylessMultiply", opLen2Imm8(ssa.OpcarrylessMultiplyUint64x8, types.TypeVec512, 0), sys.AMD64)
addF(simdPackage, "Float32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Float64x2.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.concatSelectedConstant", opLen2Imm8(ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128, 0), sys.AMD64)
documentation: !string |-
// NAME computes element-wise GF(2^8) multiplication with
// reduction polynomial x^8 + x^4 + x^3 + x + 1.
+- go: carrylessMultiply
+ commutative: false
- *uint8
out:
- *uint8
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of x and y,
+ // depending on the value of xyHiLo, returning the 128-bit
+ // product in the concatenated two elements of the result.
+ // Bit 0 selects the low (0) or high (1) element of x and
+ // bit 4 selects the low (0x00) or high (0x10) element of y.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x2
+ - go: Uint64x2
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x2
+ overwriteElementBits: 64
+ hideMaskMethods: true
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of each of the two
+ // 128-bit lanes of x and y, depending on the value of xyHiLo,
+ // and returns the two 128-bit products in the result's lanes.
+ // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+ // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x4
+ - go: Uint64x4
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x4
+ overwriteElementBits: 64
+ hideMaskMethods: true
+
+- go: carrylessMultiply
+ documentation: !string |-
+ // NAME computes one of four possible Galois polynomial
+ // products of selected high and low halves of each of the four
+ // 128-bit lanes of x and y, depending on the value of xyHiLo,
+ // and returns the four 128-bit products in the result's lanes.
+ // Bit 0 selects the low (0) or high (1) elements of x's lanes and
+ // bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+ asm: V?PCLMULQDQ
+ in:
+ - go: Uint64x8
+ - go: Uint64x8
+ - class: immediate
+ immOffset: 0
+ name: xyHiLo
+ out:
+ - go: Uint64x8
+ overwriteElementBits: 64
+ hideMaskMethods: true
- {class: vreg, go: Int64x4, base: "int", elemBits: 128, bits: 256, lanes: 4}
- {class: vreg, go: Uint64x4, base: "uint", elemBits: 128, bits: 256, lanes: 4}
+# Special for carryless multiply
+ - {class: vreg, go: Uint64x8, base: "uint", elemBits: 128, bits: 512, lanes: 8}
+
# Special shapes just to make VAES(ENC|DEC)(LAST)?512 work.
# The elemBits fields of these shapes are wrong; they are overwritten by overwriteElementBits.
- {class: vreg, go: Int8x32, base: "int", elemBits: 128, bits: 512, lanes: 32}
// the vector length suffix.
// AVX-512 extension features
- {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
- {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
- {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
- {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
- {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
- {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
- {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
+ {"AVX512EVEX", "AVX512_BITALG"}: "AVX512BITALG",
+ {"AVX512EVEX", "AVX512_GFNI"}: "AVX512GFNI",
+ {"AVX512EVEX", "AVX512_VBMI2"}: "AVX512VBMI2",
+ {"AVX512EVEX", "AVX512_VBMI"}: "AVX512VBMI",
+ {"AVX512EVEX", "AVX512_VNNI"}: "AVX512VNNI",
+ {"AVX512EVEX", "AVX512_VPOPCNTDQ"}: "AVX512VPOPCNTDQ",
+ {"AVX512EVEX", "AVX512_VAES"}: "AVX512VAES",
+ {"AVX512EVEX", "AVX512_VPCLMULQDQ"}: "AVX512VPCLMULQDQ",
// AVX 10.2 (not yet supported)
{"AVX512EVEX", "AVX10_2_RC"}: "ignore",
return cpu.X86.HasAVX512VNNI
}
+// AVX512VPCLMULQDQ returns whether the CPU supports the AVX512VPCLMULQDQ feature.
+//
+// AVX512VPCLMULQDQ is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func (X86Features) AVX512VPCLMULQDQ() bool {
+ return cpu.X86.HasAVX512VPCLMULQDQ // forwards the cpu package's X86 feature flag unchanged
+}
+
// AVX512VPOPCNTDQ returns whether the CPU supports the AVX512VPOPCNTDQ feature.
//
// AVX512VPOPCNTDQ is defined on all GOARCHes, but will only return true on
simd.LoadInt16x16Slice(x).PermuteScalarsLoGrouped(1, 2, 3, 0).StoreSlice(got)
checkSlices(t, got, want)
}
+
+func TestClMul(t *testing.T) {
+ var x = simd.LoadUint64x2Slice([]uint64{1, 5})
+ var y = simd.LoadUint64x2Slice([]uint64{3, 9})
+
+ foo := func(v simd.Uint64x2, s []uint64) {
+ r := make([]uint64, 2, 2)
+ v.StoreSlice(r)
+ checkSlices[uint64](t, r, s)
+ }
+
+ foo(x.CarrylessMultiply(0, 0, y), []uint64{3, 0})
+ foo(x.CarrylessMultiply(0, 1, y), []uint64{9, 0})
+ foo(x.CarrylessMultiply(1, 0, y), []uint64{15, 0})
+ foo(x.CarrylessMultiply(1, 1, y), []uint64{45, 0})
+ foo(y.CarrylessMultiply(0, 0, y), []uint64{5, 0})
+
+}
// Asm: VPBLENDMQ, CPU Feature: AVX512
func (x Int64x8) blendMasked(y Int64x8, mask Mask64x8) Int64x8
+/* carrylessMultiply */
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of x and y,
+// depending on the value of xyHiLo, returning the 128-bit
+// product in the concatenated two elements of the result.
+// Bit 0 selects the low (0) or high (1) element of x and
+// bit 4 selects the low (0x00) or high (0x10) element of y.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) carrylessMultiply(xyHiLo uint8, y Uint64x2) Uint64x2
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the two 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) carrylessMultiply(xyHiLo uint8, y Uint64x4) Uint64x4
+
+// carrylessMultiply computes one of four possible Galois polynomial
+// products of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the value of xyHiLo,
+// and returns the four 128-bit products in the result's lanes.
+// Bit 0 selects the low (0) or high (1) elements of x's lanes and
+// bit 4 selects the low (0x00) or high (0x10) elements of y's lanes.
+//
+// xyHiLo results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) carrylessMultiply(xyHiLo uint8, y Uint64x8) Uint64x8
+
/* concatSelectedConstant */
// concatSelectedConstant concatenates selected elements from x and y into the lower and upper
func (x Uint16x32) PermuteScalarsLoGrouped(a, b, c, d uint8) Uint16x32 {
return x.permuteScalarsLoGrouped(a&3 | (b&3)<<2 | (c&3)<<4 | d<<6) // packs four 2-bit selectors into one byte; d is not masked because the uint8 shift by 6 already discards its upper bits
}
+
+// CarrylessMultiply computes one of four possible carryless
+// multiplications of selected high and low halves of x and y,
+// depending on the values of a and b, returning the 128-bit
+// product in the concatenated two elements of the result.
+// a selects the low (0) or high (1) element of x and
+// b selects the low (0) or high (1) element of y.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX
+func (x Uint64x2) CarrylessMultiply(a, b uint8, y Uint64x2) Uint64x2 {
+ // Immediate layout: bit 0 picks the element of x, bit 4 the element of y.
+ // The two fields never overlap, so they are combined with a bit-or.
+ return x.carrylessMultiply(a&1|(b&1)<<4, y)
+}
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the two
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the two 128-bit products in the result's lanes.
+// a selects the low (0) or high (1) elements of x's lanes and
+// b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x4) CarrylessMultiplyGrouped(a, b uint8, y Uint64x4) Uint64x4 {
+ // Immediate layout: bit 0 picks the elements of x's lanes, bit 4 those of y's.
+ // The two fields never overlap, so they are combined with a bit-or.
+ return x.carrylessMultiply(a&1|(b&1)<<4, y)
+}
+
+// CarrylessMultiplyGrouped computes one of four possible carryless
+// multiplications of selected high and low halves of each of the four
+// 128-bit lanes of x and y, depending on the values of a and b,
+// and returns the four 128-bit products in the result's lanes.
+// a selects the low (0) or high (1) elements of x's lanes and
+// b selects the low (0) or high (1) elements of y's lanes.
+//
+// A carryless multiplication uses bitwise XOR instead of
+// add-with-carry, for example (in base two):
+// 11 * 11 = 11 * (10 ^ 1) = (11 * 10) ^ (11 * 1) = 110 ^ 11 = 101
+//
+// This also models multiplication of polynomials with coefficients
+// from GF(2) -- 11 * 11 models (x+1)*(x+1) = x**2 + (1^1)x + 1 =
+// x**2 + 0x + 1 = x**2 + 1 modeled by 101. (Note that "+" adds
+// polynomial terms, but coefficients "add" with XOR.)
+//
+// constant values of a and b will result in better performance,
+// otherwise the intrinsic may translate into a jump table.
+//
+// Asm: VPCLMULQDQ, CPU Feature: AVX512VPCLMULQDQ
+func (x Uint64x8) CarrylessMultiplyGrouped(a, b uint8, y Uint64x8) Uint64x8 {
+ // Immediate layout: bit 0 picks the elements of x's lanes, bit 4 those of y's.
+ // The two fields never overlap, so they are combined with a bit-or.
+ return x.carrylessMultiply(a&1|(b&1)<<4, y)
+}