|
/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
| 2 | + * Copyright Wang Yang (wangyang.guo@intel.com) |
| 3 | + * For conditions of distribution and use, see copyright notice in zlib.h |
| 4 | + */ |
| 5 | + |
| 6 | +#ifdef X86_VPCLMULQDQ_CRC |
| 7 | +#include "../../zutil.h" |
| 8 | + |
| 9 | +#include <immintrin.h> |
| 10 | + |
| 11 | +size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, |
| 12 | + __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) { |
| 13 | + size_t len_tmp = len; |
| 14 | + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; |
| 15 | + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; |
| 16 | + __m512i z0, z1, z2, z3; |
| 17 | + z_const __m512i zmm_fold4 = _mm512_set4_epi32( |
| 18 | + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); |
| 19 | + z_const __m512i zmm_fold16 = _mm512_set4_epi32( |
| 20 | + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); |
| 21 | + |
| 22 | + // zmm register init |
| 23 | + zmm_crc0 = _mm512_setzero_si512(); |
| 24 | + zmm_t0 = _mm512_loadu_si512((__m512i *)src); |
| 25 | + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); |
| 26 | + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); |
| 27 | + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); |
| 28 | + |
| 29 | + /* already have intermediate CRC in xmm registers |
| 30 | + * fold4 with 4 xmm_crc to get zmm_crc0 |
| 31 | + */ |
| 32 | + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); |
| 33 | + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); |
| 34 | + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); |
| 35 | + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); |
| 36 | + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
| 37 | + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
| 38 | + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); |
| 39 | + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); |
| 40 | + |
| 41 | + _mm512_storeu_si512((__m512i *)dst, zmm_t0); |
| 42 | + _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); |
| 43 | + _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); |
| 44 | + _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); |
| 45 | + len -= 256; |
| 46 | + src += 256; |
| 47 | + dst += 256; |
| 48 | + |
| 49 | + // fold-16 loops |
| 50 | + while (len >= 256) { |
| 51 | + zmm_t0 = _mm512_loadu_si512((__m512i *)src); |
| 52 | + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); |
| 53 | + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); |
| 54 | + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); |
| 55 | + |
| 56 | + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); |
| 57 | + z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); |
| 58 | + z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); |
| 59 | + z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); |
| 60 | + |
| 61 | + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); |
| 62 | + zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); |
| 63 | + zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); |
| 64 | + zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); |
| 65 | + |
| 66 | + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); |
| 67 | + zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); |
| 68 | + zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); |
| 69 | + zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); |
| 70 | + |
| 71 | + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); |
| 72 | + zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); |
| 73 | + zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); |
| 74 | + zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); |
| 75 | + |
| 76 | + _mm512_storeu_si512((__m512i *)dst, zmm_t0); |
| 77 | + _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); |
| 78 | + _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); |
| 79 | + _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); |
| 80 | + len -= 256; |
| 81 | + src += 256; |
| 82 | + dst += 256; |
| 83 | + } |
| 84 | + // zmm_crc[0,1,2,3] -> zmm_crc0 |
| 85 | + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
| 86 | + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
| 87 | + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); |
| 88 | + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); |
| 89 | + |
| 90 | + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
| 91 | + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
| 92 | + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); |
| 93 | + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); |
| 94 | + |
| 95 | + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); |
| 96 | + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); |
| 97 | + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); |
| 98 | + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); |
| 99 | + |
| 100 | + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] |
| 101 | + *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); |
| 102 | + *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); |
| 103 | + *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); |
| 104 | + *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); |
| 105 | + |
| 106 | + return (len_tmp - len); // return n bytes processed |
| 107 | +} |
| 108 | +#endif |
0 commit comments