Skip to content

Commit f62984c

Browse files
Add AVX512 BMM API (#124804)
This PR implements the [AVX512 BMM API](#123898). ```csharp namespace System.Runtime.Intrinsics.X86 { public abstract class Avx512Bmm : Avx512F { public static new bool IsSupported { get; } public static Vector128<byte> ReverseBits(Vector128<byte> values); public static Vector256<byte> ReverseBits(Vector256<byte> values); public static Vector512<byte> ReverseBits(Vector512<byte> values); public static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector256<ushort> addend, Vector256<ushort> left, Vector256<ushort> right); public static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction(Vector512<ushort> addend, Vector512<ushort> left, Vector512<ushort> right); public static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector256<ushort> addend, Vector256<ushort> left, Vector256<ushort> right); public static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction(Vector512<ushort> addend, Vector512<ushort> left, Vector512<ushort> right); } } ``` ## Disasm Samples <details> <summary>BitMultiplyMatrix16x16WithOrReduction</summary> ```csharp private static Vector256<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z) { return Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups ymm0, ymmword ptr [rdx] vmovups ymm1, ymmword ptr [r8] vbmacor16x16x16 ymm0, ymm1, ymmword ptr [r9] vmovups ymmword ptr [rcx], ymm0 mov rax, rcx 
G_M000_IG03: ;; offset=0x0016 vzeroupper ret ; Total bytes of code 26 ``` ```csharp private static Vector512<ushort> BitMultiplyMatrix16x16WithOrReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z) { return Avx512Bmm.BitMultiplyMatrix16x16WithOrReduction(x, y, z); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithOrReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups zmm0, zmmword ptr [rdx] vmovups zmm1, zmmword ptr [r8] vbmacor16x16x16 zmm0, zmm1, zmmword ptr [r9] vmovups zmmword ptr [rcx], zmm0 mov rax, rcx G_M000_IG03: ;; offset=0x001B vzeroupper ret ; Total bytes of code 31 ``` </details> <details> <summary>BitMultiplyMatrix16x16WithXorReduction</summary> ```csharp private static Vector256<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector256(Vector256<ushort> x, Vector256<ushort> y, Vector256<ushort> z) { return Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector256(System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort],System.Runtime.Intrinsics.Vector256`1[ushort]):System.Runtime.Intrinsics.Vector256`1[ushort] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups ymm0, ymmword ptr [rdx] vmovups ymm1, ymmword ptr [r8] vbmacxor16x16x16 
ymm0, ymm1, ymmword ptr [r9] vmovups ymmword ptr [rcx], ymm0 mov rax, rcx G_M000_IG03: ;; offset=0x0016 vzeroupper ret ; Total bytes of code 26 ``` ```csharp private static Vector512<ushort> BitMultiplyMatrix16x16WithXorReduction_Vector512(Vector512<ushort> x, Vector512<ushort> y, Vector512<ushort> z) { return Avx512Bmm.BitMultiplyMatrix16x16WithXorReduction(x, y, z); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:BitMultiplyMatrix16x16WithXorReduction_Vector512(System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort],System.Runtime.Intrinsics.Vector512`1[ushort]):System.Runtime.Intrinsics.Vector512`1[ushort] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups zmm0, zmmword ptr [rdx] vmovups zmm1, zmmword ptr [r8] vbmacxor16x16x16 zmm0, zmm1, zmmword ptr [r9] vmovups zmmword ptr [rcx], zmm0 mov rax, rcx G_M000_IG03: ;; offset=0x001B vzeroupper ret ; Total bytes of code 31 ``` </details> <details> <summary>ReverseBits</summary> ```csharp private static Vector128<byte> ReverseBits_Vector128(Vector128<byte> values) { return Avx512Bmm.ReverseBits(values); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector128(System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vbitrev xmm0, xmmword ptr [rdx] vmovups xmmword ptr [rcx], xmm0 mov rax, rcx G_M000_IG03: ;; offset=0x000D ret ; Total bytes of code 14 ``` ```csharp private static Vector256<byte> ReverseBits_Vector256(Vector256<byte> values) { return 
Avx512Bmm.ReverseBits(values); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector256(System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vbitrev ymm0, ymmword ptr [rdx] vmovups ymmword ptr [rcx], ymm0 mov rax, rcx G_M000_IG03: ;; offset=0x000D vzeroupper ret ; Total bytes of code 17 ``` ```csharp private static Vector512<byte> ReverseBits_Vector512(Vector512<byte> values) { return Avx512Bmm.ReverseBits(values); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Vector512(System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vbitrev zmm0, zmmword ptr [rdx] vmovups zmmword ptr [rcx], zmm0 mov rax, rcx G_M000_IG03: ;; offset=0x000F vzeroupper ret ; Total bytes of code 19 ``` </details> <details> <summary>ReverseBits (Merge Masking)</summary> ```csharp private static Vector128<byte> ReverseBits_Mask_Vector128(Vector128<byte> values, Vector128<byte> mask) { return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; 
offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups xmm0, xmmword ptr [rdx] vmovups xmm1, xmmword ptr [r8] vpmovb2m k1, xmm1 vbitrev xmm0 {k1}, xmm0 vmovups xmmword ptr [rcx], xmm0 mov rax, rcx G_M000_IG03: ;; offset=0x001C ret ; Total bytes of code 29 ``` ```csharp private static Vector256<byte> ReverseBits_Mask_Vector256(Vector256<byte> values, Vector256<byte> mask) { return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups ymm0, ymmword ptr [rdx] vmovups ymm1, ymmword ptr [r8] vpmovb2m k1, ymm1 vbitrev ymm0 {k1}, ymm0 vmovups ymmword ptr [rcx], ymm0 mov rax, rcx G_M000_IG03: ;; offset=0x001C vzeroupper ret ; Total bytes of code 32 ``` ```csharp private static Vector512<byte> ReverseBits_Mask_Vector512(Vector512<byte> values, Vector512<byte> mask) { return Avx512BW.BlendVariable(values, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Mask_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups zmm0, zmmword ptr [rdx] vmovups zmm1, zmmword ptr [r8] vpmovb2m k1, zmm1 vbitrev zmm0 {k1}, zmm0 vmovups zmmword ptr [rcx], zmm0 mov rax, rcx G_M000_IG03: ;; offset=0x0021 vzeroupper 
ret ; Total bytes of code 37 ``` </details> <details> <summary>ReverseBits (Zero Masking)</summary> ```csharp private static Vector128<byte> ReverseBits_Maskz_Vector128(Vector128<byte> values, Vector128<byte> mask) { return Avx512BW.BlendVariable(Vector128<byte>.Zero, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector128(System.Runtime.Intrinsics.Vector128`1[byte],System.Runtime.Intrinsics.Vector128`1[byte]):System.Runtime.Intrinsics.Vector128`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups xmm0, xmmword ptr [r8] vpmovb2m k1, xmm0 vbitrev xmm0 {k1}{z}, xmmword ptr [rdx] vmovups xmmword ptr [rcx], xmm0 mov rax, rcx G_M000_IG03: ;; offset=0x0018 ret ; Total bytes of code 25 ``` ```csharp private static Vector256<byte> ReverseBits_Maskz_Vector256(Vector256<byte> values, Vector256<byte> mask) { return Avx512BW.BlendVariable(Vector256<byte>.Zero, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector256(System.Runtime.Intrinsics.Vector256`1[byte],System.Runtime.Intrinsics.Vector256`1[byte]):System.Runtime.Intrinsics.Vector256`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups ymm0, ymmword ptr [r8] vpmovb2m k1, ymm0 vbitrev ymm0 {k1}{z}, ymmword ptr [rdx] vmovups ymmword ptr [rcx], ymm0 mov rax, rcx G_M000_IG03: ;; offset=0x0018 vzeroupper ret ; Total bytes of code 28 ``` ```csharp private static Vector512<byte> ReverseBits_Maskz_Vector512(Vector512<byte> values, Vector512<byte> mask) { return 
Avx512BW.BlendVariable(Vector512<byte>.Zero, Avx512Bmm.ReverseBits(values), mask); } ``` ``` ; Assembly listing for method JIT.HardwareIntrinsics.X86._Avx512Bmm.Program:ReverseBits_Maskz_Vector512(System.Runtime.Intrinsics.Vector512`1[byte],System.Runtime.Intrinsics.Vector512`1[byte]):System.Runtime.Intrinsics.Vector512`1[byte] (FullOpts) ; Emitting BLENDED_CODE for generic X64 + VEX + EVEX on Windows ; FullOpts code ; optimized code ; rsp based frame ; partially interruptible ; No PGO data G_M000_IG01: ;; offset=0x0000 G_M000_IG02: ;; offset=0x0000 vmovups zmm0, zmmword ptr [r8] vpmovb2m k1, zmm0 vbitrev zmm0 {k1}{z}, zmmword ptr [rdx] vmovups zmmword ptr [rcx], zmm0 mov rax, rcx G_M000_IG03: ;; offset=0x001B vzeroupper ret ; Total bytes of code 31 ``` </details> --------- Co-authored-by: Alex Covington (Advanced Micro Devices Inc) <b-alexco@microsoft.com>
1 parent 4172d0e commit f62984c

29 files changed

Lines changed: 764 additions & 128 deletions

src/coreclr/inc/clrconfigvalues.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableHWIntrinsic, W("EnableHWIntri
678678
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX, W("EnableAVX"), 1, "Allows AVX and dependent hardware intrinsics to be disabled")
679679
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX2, W("EnableAVX2"), 1, "Allows AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE and dependent hardware intrinsics to be disabled")
680680
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512, W("EnableAVX512"), 1, "Allows AVX512 F+BW+CD+DQ+VL and dependent hardware intrinsics to be disabled")
681+
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512BMM, W("EnableAVX512BMM"), 1, "Allows AVX512BMM and dependent hardware intrinsics to be disabled")
681682

682683
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v2, W("EnableAVX512v2"), 1, "Allows AVX512 IFMA+VBMI and dependent hardware intrinsics to be disabled")
683684
RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableAVX512v3, W("EnableAVX512v3"), 1, "Allows AVX512 BITALG+VBMI2+VNNI+VPOPCNTDQ and dependent hardware intrinsics to be disabled")

src/coreclr/inc/corinfoinstructionset.h

Lines changed: 68 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -67,36 +67,37 @@ enum CORINFO_InstructionSet
6767
InstructionSet_AVX512VP2INTERSECT=13,
6868
InstructionSet_AVXIFMA=14,
6969
InstructionSet_AVXVNNI=15,
70-
InstructionSet_GFNI=16,
71-
InstructionSet_GFNI_V256=17,
72-
InstructionSet_GFNI_V512=18,
73-
InstructionSet_SHA=19,
74-
InstructionSet_WAITPKG=20,
75-
InstructionSet_X86Serialize=21,
76-
InstructionSet_Vector128=22,
77-
InstructionSet_Vector256=23,
78-
InstructionSet_Vector512=24,
79-
InstructionSet_VectorT128=25,
80-
InstructionSet_VectorT256=26,
81-
InstructionSet_VectorT512=27,
82-
InstructionSet_AVXVNNIINT=28,
83-
InstructionSet_AVXVNNIINT_V512=29,
84-
InstructionSet_X86Base_X64=30,
85-
InstructionSet_AVX_X64=31,
86-
InstructionSet_AVX2_X64=32,
87-
InstructionSet_AVX512_X64=33,
88-
InstructionSet_AVX512v2_X64=34,
89-
InstructionSet_AVX512v3_X64=35,
90-
InstructionSet_AVX10v1_X64=36,
91-
InstructionSet_AVX10v2_X64=37,
92-
InstructionSet_AES_X64=38,
93-
InstructionSet_AVX512VP2INTERSECT_X64=39,
94-
InstructionSet_AVXIFMA_X64=40,
95-
InstructionSet_AVXVNNI_X64=41,
96-
InstructionSet_GFNI_X64=42,
97-
InstructionSet_SHA_X64=43,
98-
InstructionSet_WAITPKG_X64=44,
99-
InstructionSet_X86Serialize_X64=45,
70+
InstructionSet_AVX512BMM=16,
71+
InstructionSet_GFNI=17,
72+
InstructionSet_GFNI_V256=18,
73+
InstructionSet_GFNI_V512=19,
74+
InstructionSet_SHA=20,
75+
InstructionSet_WAITPKG=21,
76+
InstructionSet_X86Serialize=22,
77+
InstructionSet_Vector128=23,
78+
InstructionSet_Vector256=24,
79+
InstructionSet_Vector512=25,
80+
InstructionSet_VectorT128=26,
81+
InstructionSet_VectorT256=27,
82+
InstructionSet_VectorT512=28,
83+
InstructionSet_AVXVNNIINT=29,
84+
InstructionSet_AVXVNNIINT_V512=30,
85+
InstructionSet_X86Base_X64=31,
86+
InstructionSet_AVX_X64=32,
87+
InstructionSet_AVX2_X64=33,
88+
InstructionSet_AVX512_X64=34,
89+
InstructionSet_AVX512v2_X64=35,
90+
InstructionSet_AVX512v3_X64=36,
91+
InstructionSet_AVX10v1_X64=37,
92+
InstructionSet_AVX10v2_X64=38,
93+
InstructionSet_AES_X64=39,
94+
InstructionSet_AVX512VP2INTERSECT_X64=40,
95+
InstructionSet_AVXIFMA_X64=41,
96+
InstructionSet_AVXVNNI_X64=42,
97+
InstructionSet_GFNI_X64=43,
98+
InstructionSet_SHA_X64=44,
99+
InstructionSet_WAITPKG_X64=45,
100+
InstructionSet_X86Serialize_X64=46,
100101
#endif // TARGET_AMD64
101102
#ifdef TARGET_X86
102103
InstructionSet_X86Base=1,
@@ -114,36 +115,37 @@ enum CORINFO_InstructionSet
114115
InstructionSet_AVX512VP2INTERSECT=13,
115116
InstructionSet_AVXIFMA=14,
116117
InstructionSet_AVXVNNI=15,
117-
InstructionSet_GFNI=16,
118-
InstructionSet_GFNI_V256=17,
119-
InstructionSet_GFNI_V512=18,
120-
InstructionSet_SHA=19,
121-
InstructionSet_WAITPKG=20,
122-
InstructionSet_X86Serialize=21,
123-
InstructionSet_Vector128=22,
124-
InstructionSet_Vector256=23,
125-
InstructionSet_Vector512=24,
126-
InstructionSet_VectorT128=25,
127-
InstructionSet_VectorT256=26,
128-
InstructionSet_VectorT512=27,
129-
InstructionSet_AVXVNNIINT=28,
130-
InstructionSet_AVXVNNIINT_V512=29,
131-
InstructionSet_X86Base_X64=30,
132-
InstructionSet_AVX_X64=31,
133-
InstructionSet_AVX2_X64=32,
134-
InstructionSet_AVX512_X64=33,
135-
InstructionSet_AVX512v2_X64=34,
136-
InstructionSet_AVX512v3_X64=35,
137-
InstructionSet_AVX10v1_X64=36,
138-
InstructionSet_AVX10v2_X64=37,
139-
InstructionSet_AES_X64=38,
140-
InstructionSet_AVX512VP2INTERSECT_X64=39,
141-
InstructionSet_AVXIFMA_X64=40,
142-
InstructionSet_AVXVNNI_X64=41,
143-
InstructionSet_GFNI_X64=42,
144-
InstructionSet_SHA_X64=43,
145-
InstructionSet_WAITPKG_X64=44,
146-
InstructionSet_X86Serialize_X64=45,
118+
InstructionSet_AVX512BMM=16,
119+
InstructionSet_GFNI=17,
120+
InstructionSet_GFNI_V256=18,
121+
InstructionSet_GFNI_V512=19,
122+
InstructionSet_SHA=20,
123+
InstructionSet_WAITPKG=21,
124+
InstructionSet_X86Serialize=22,
125+
InstructionSet_Vector128=23,
126+
InstructionSet_Vector256=24,
127+
InstructionSet_Vector512=25,
128+
InstructionSet_VectorT128=26,
129+
InstructionSet_VectorT256=27,
130+
InstructionSet_VectorT512=28,
131+
InstructionSet_AVXVNNIINT=29,
132+
InstructionSet_AVXVNNIINT_V512=30,
133+
InstructionSet_X86Base_X64=31,
134+
InstructionSet_AVX_X64=32,
135+
InstructionSet_AVX2_X64=33,
136+
InstructionSet_AVX512_X64=34,
137+
InstructionSet_AVX512v2_X64=35,
138+
InstructionSet_AVX512v3_X64=36,
139+
InstructionSet_AVX10v1_X64=37,
140+
InstructionSet_AVX10v2_X64=38,
141+
InstructionSet_AES_X64=39,
142+
InstructionSet_AVX512VP2INTERSECT_X64=40,
143+
InstructionSet_AVXIFMA_X64=41,
144+
InstructionSet_AVXVNNI_X64=42,
145+
InstructionSet_GFNI_X64=43,
146+
InstructionSet_SHA_X64=44,
147+
InstructionSet_WAITPKG_X64=45,
148+
InstructionSet_X86Serialize_X64=46,
147149
#endif // TARGET_X86
148150

149151
};
@@ -719,6 +721,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
719721
return "AVXVNNI";
720722
case InstructionSet_AVXVNNI_X64 :
721723
return "AVXVNNI_X64";
724+
case InstructionSet_AVX512BMM :
725+
return "AVX512BMM";
722726
case InstructionSet_GFNI :
723727
return "GFNI";
724728
case InstructionSet_GFNI_X64 :
@@ -787,6 +791,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
787791
return "AVXIFMA";
788792
case InstructionSet_AVXVNNI :
789793
return "AVXVNNI";
794+
case InstructionSet_AVX512BMM :
795+
return "AVX512BMM";
790796
case InstructionSet_GFNI :
791797
return "GFNI";
792798
case InstructionSet_GFNI_V256 :
@@ -911,6 +917,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
911917
case READYTORUN_INSTRUCTION_Avx512Vp2intersect_VL: return InstructionSet_AVX512VP2INTERSECT;
912918
case READYTORUN_INSTRUCTION_AvxIfma: return InstructionSet_AVXIFMA;
913919
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
920+
case READYTORUN_INSTRUCTION_Avx512Bmm: return InstructionSet_AVX512BMM;
914921
case READYTORUN_INSTRUCTION_Gfni: return InstructionSet_GFNI;
915922
case READYTORUN_INSTRUCTION_Gfni_V256: return InstructionSet_GFNI_V256;
916923
case READYTORUN_INSTRUCTION_Gfni_V512: return InstructionSet_GFNI_V512;
@@ -980,6 +987,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst
980987
case READYTORUN_INSTRUCTION_Avx512Vp2intersect_VL: return InstructionSet_AVX512VP2INTERSECT;
981988
case READYTORUN_INSTRUCTION_AvxIfma: return InstructionSet_AVXIFMA;
982989
case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI;
990+
case READYTORUN_INSTRUCTION_Avx512Bmm: return InstructionSet_AVX512BMM;
983991
case READYTORUN_INSTRUCTION_Gfni: return InstructionSet_GFNI;
984992
case READYTORUN_INSTRUCTION_Gfni_V256: return InstructionSet_GFNI_V256;
985993
case READYTORUN_INSTRUCTION_Gfni_V512: return InstructionSet_GFNI_V512;

src/coreclr/inc/jiteeversionguid.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@
3737

3838
#include <minipal/guid.h>
3939

40-
constexpr GUID JITEEVersionIdentifier = { /* 22511e72-5ac8-4fc8-83ef-0b61688c68bb */
41-
0x22511e72,
42-
0x5ac8,
43-
0x4fc8,
44-
{0x83, 0xef, 0x0b, 0x61, 0x68, 0x8c, 0x68, 0xbb}
40+
constexpr GUID JITEEVersionIdentifier = { /* 89e70385-3f0d-4fbd-9270-0425c0db321b */
41+
0x89e70385,
42+
0x3f0d,
43+
0x4fbd,
44+
{0x92, 0x70, 0x04, 0x25, 0xc0, 0xdb, 0x32, 0x1b}
4545
};
4646

4747
#endif // JIT_EE_VERSIONING_GUID_H

src/coreclr/inc/readytoruninstructionset.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ enum ReadyToRunInstructionSet
9292
READYTORUN_INSTRUCTION_Avx512Vpopcntdq=82,
9393
READYTORUN_INSTRUCTION_Avx512Vpopcntdq_VL=83,
9494
READYTORUN_INSTRUCTION_Zbs=84,
95+
READYTORUN_INSTRUCTION_Avx512Bmm=85,
9596

9697
};
9798

src/coreclr/jit/emitxarch.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ bool emitter::Is3OpRmwInstruction(instruction ins)
128128
{
129129
return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) ||
130130
(IsAVXVNNIFamilyInstruction(ins)) ||
131+
((ins >= FIRST_AVX512BMM_INSTRUCTION) && (ins <= LAST_AVX512BMM_INSTRUCTION)) ||
131132
((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION));
132133
}
133134
}
@@ -3104,8 +3105,9 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
31043105
// 0x0000RM11.
31053106
leadingBytes = (code >> 16) & 0xFF;
31063107
assert(leadingBytes == 0x0F ||
3107-
(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) && leadingBytes >= 0x00 &&
3108-
leadingBytes <= 0x07) ||
3108+
((m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) ||
3109+
m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM)) &&
3110+
leadingBytes >= 0x00 && leadingBytes <= 0x07) ||
31093111
(IsApxExtendedEvexInstruction(ins) && leadingBytes == 0));
31103112
code &= 0xFFFF;
31113113
}
@@ -3164,10 +3166,16 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co
31643166
break;
31653167
}
31663168

3169+
case 0x06:
3170+
{
3171+
assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX512BMM));
3172+
evexPrefix |= (0x6 << 16);
3173+
break;
3174+
}
3175+
31673176
case 0x01:
31683177
case 0x02:
31693178
case 0x03:
3170-
case 0x06:
31713179
case 0x07:
31723180
default:
31733181
{
@@ -21377,6 +21385,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
2137721385
break;
2137821386
}
2137921387

21388+
case INS_vbmacor16x16x16:
21389+
case INS_vbmacxor16x16x16:
21390+
case INS_vbitrev:
21391+
{
21392+
result.insLatency = PERFSCORE_LATENCY_1C;
21393+
result.insThroughput = PERFSCORE_THROUGHPUT_1C;
21394+
break;
21395+
}
21396+
2138021397
default:
2138121398
{
2138221399
assert((unsigned)ins < ArrLen(insThroughputInfos));

src/coreclr/jit/hwintrinsic.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -956,6 +956,7 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = {
956956
{ NI_Illegal, NI_Illegal }, // AVX512VP2INTERSECT
957957
{ NI_Illegal, NI_Illegal }, // AVXIFMA
958958
{ FIRST_NI_AVXVNNI, LAST_NI_AVXVNNI }, // AVXVNNI
959+
{ FIRST_NI_AVX512BMM, LAST_NI_AVX512BMM }, // AVX512BMM
959960
{ FIRST_NI_GFNI, LAST_NI_GFNI }, // GFNI
960961
{ FIRST_NI_GFNI_V256, LAST_NI_GFNI_V256 }, // GFNI_V256
961962
{ FIRST_NI_GFNI_V512, LAST_NI_GFNI_V512 }, // GFNI_V512

src/coreclr/jit/hwintrinsiccodegenxarch.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
902902

903903
case NI_AVXVNNI_MultiplyWideningAndAdd:
904904
case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
905+
case NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction:
906+
case NI_AVX512BMM_BitMultiplyMatrix16x16WithXorReduction:
905907
{
906908
assert(targetReg != REG_NA);
907909
assert(op1Reg != REG_NA);

src/coreclr/jit/hwintrinsiclistxarch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,12 @@ HARDWARE_INTRINSIC(AVX10v2, MultipleSumAbsoluteDifferences,
10921092
HARDWARE_INTRINSIC(AVX10v2, StoreScalar, 16, 2, {INS_invalid, INS_invalid, INS_vmovw_simd, INS_vmovw_simd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
10931093
#define LAST_NI_AVX10v2 NI_AVX10v2_StoreScalar
10941094

1095+
#define FIRST_NI_AVX512BMM NI_AVX512BMM_BitMultiplyMatrix16x16WithOrReduction
1096+
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithOrReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacor16x16x16, INS_vbmacor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_RmwIntrinsic)
1097+
HARDWARE_INTRINSIC(AVX512BMM, BitMultiplyMatrix16x16WithXorReduction, -1, -1, {INS_invalid, INS_invalid, INS_vbmacxor16x16x16, INS_vbmacxor16x16x16, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_RmwIntrinsic)
1098+
HARDWARE_INTRINSIC(AVX512BMM, ReverseBits, -1, -1, {INS_invalid, INS_vbitrev, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
1099+
#define LAST_NI_AVX512BMM NI_AVX512BMM_ReverseBits
1100+
10951101
// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
10961102
// ISA Function name SIMD size NumArg Instructions Category Flags
10971103
// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}

src/coreclr/jit/hwintrinsicxarch.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,10 @@ CORINFO_InstructionSet Compiler::lookupInstructionSet(const char* className)
215215
{
216216
return InstructionSet_AVX512;
217217
}
218+
else if (strcmp(className + 7, "mm") == 0)
219+
{
220+
return InstructionSet_AVX512BMM;
221+
}
218222
}
219223
else if ((strcmp(className + 6, "CD") == 0) || (strcmp(className + 6, "DQ") == 0))
220224
{

src/coreclr/jit/instrsxarch.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,14 @@ INST3(vpdpbuud, "vpdpbuud", IUM_WR, BAD_CODE, BAD_
651651
INST3(vpdpbuuds, "vpdpbuuds", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0x00, 0x51), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Multiply individual bytes of first source operand with individual bytes of second source operand and add the results
652652
#define LAST_AVXVNNIINT16_INSTRUCTION INS_vpdpbuuds
653653

654+
// Instructions for AVX512-BMM
655+
#define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16
656+
INST3(vbmacor16x16x16, "vbmacor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with OR reduction.
657+
INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with XOR reduction.
658+
INST3(vbitrev, "vbitrev", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x81), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX ) // Bit reversal within a byte boundary.
659+
#define LAST_AVX512BMM_INSTRUCTION INS_vbitrev
660+
661+
654662
#define FIRST_AVXIFMA_INSTRUCTION INS_vpmadd52huq
655663
// Instructions for AVXIFMA
656664
INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators
@@ -1131,6 +1139,7 @@ INST3(vucomxsd, "vucomxsd", IUM_RD, BAD_CODE, BAD_
11311139
INST3(vucomxss, "vucomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2E), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Perform an unordered compare of single precision floating point values and set flags
11321140
#define LAST_AVX512_INSTRUCTION INS_vucomxss
11331141

1142+
11341143
// id nm um mr mi rm lat tp tt flags
11351144
#define FIRST_APX_INSTRUCTION INS_ccmpo
11361145
#define FIRST_CCMP_INSTRUCTION INS_ccmpo

0 commit comments

Comments
 (0)