Commit b988402

Make crc32 100x faster on x86-64
This change makes checkpoints load significantly faster by optimizing pkzip's cyclic redundancy check. The code was developed by Intel, Google, and Mozilla; see Chromium's zlib codebase for further details.
1 parent 1d2af5c commit b988402
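
Since this swaps a vectorized drop-in replacement for miniz's CRC-32, the result is easy to sanity-check against a bitwise reference implementation. The harness below is an illustrative sketch, not part of the commit; it declares the miniz types it needs locally and assumes mz_crc32 is linked in with the signature shown in the diff:

// check_crc32.c (hypothetical): compares mz_crc32 with a bitwise reference.
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef unsigned long mz_ulong;  // matches miniz's mz_ulong
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len);

// Bitwise CRC-32 over the reflected polynomial 0xEDB88320.
static uint32_t crc32_ref(uint32_t init, const uint8_t *buf, size_t len) {
  uint32_t crc = ~init;
  for (size_t i = 0; i < len; ++i) {
    crc ^= buf[i];
    for (int k = 0; k < 8; ++k)
      crc = crc >> 1 ^ (crc & 1 ? 0xEDB88320 : 0);
  }
  return ~crc;
}

int main(void) {
  uint8_t buf[1024];
  for (size_t i = 0; i < sizeof(buf); ++i) buf[i] = (uint8_t)(i * 31);
  // every length from 0 to 1024 exercises the tail, SSE, and AVX-512 paths
  for (size_t n = 0; n <= sizeof(buf); ++n)
    assert(mz_crc32(0, buf, n) == crc32_ref(0, buf, n));
  return 0;
}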

File tree

2 files changed: 234 additions, 0 deletions

thirdparty/.clang-format (+2)

@@ -0,0 +1,2 @@
DisableFormat: true
SortIncludes: Never

thirdparty/zip.c (+232)

@@ -36,6 +36,7 @@
#include <unistd.h>
#endif

#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"

@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,

  return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}

#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif

// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
  uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
  if (len >= 256) {
    _Alignas(__m512) static const uint64_t k1k2[] = {
        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
    };
    _Alignas(__m512) static const uint64_t k3k4[] = {
        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
    };
    _Alignas(__m512) static const uint64_t k5k6[] = {
        0x01751997d0,
        0x00ccaa009e,
    };
    _Alignas(__m512) static const uint64_t k7k8[] = {
        0x0163cd6124,
        0x0000000000,
    };
    _Alignas(__m512) static const uint64_t poly[] = {
        0x01db710641,
        0x01f7011641,
    };
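    // Each k constant is a precomputed x^N mod P(x) folding multiplier for
    // the shift distances used below; poly[] holds P(x) and its Barrett
    // constant (see the Intel paper cited above for the derivation).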
    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
    __m128i a0, a1, a2, a3;
    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
    x0 = _mm512_load_si512((__m512i *)k1k2);
    buf += 256;
    len -= 256;
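    // Main loop: fold the four 512-bit accumulators forward 256 bytes per
    // iteration, multiplying the low/high 64-bit halves of each 128-bit
    // lane by k1/k2 and xoring in the next block of input.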
    while (len >= 256) {
      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
      x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
      x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
      x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
      x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
      x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
      x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
      y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
      y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
      y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
      y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
      x1 = _mm512_xor_si512(x1, x5);
      x2 = _mm512_xor_si512(x2, x6);
      x3 = _mm512_xor_si512(x3, x7);
      x4 = _mm512_xor_si512(x4, x8);
      x1 = _mm512_xor_si512(x1, y5);
      x2 = _mm512_xor_si512(x2, y6);
      x3 = _mm512_xor_si512(x3, y7);
      x4 = _mm512_xor_si512(x4, y8);
      buf += 256;
      len -= 256;
    }
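    // Combine the four accumulators into one using the k3/k4 fold constants,
    // then keep folding 64 bytes at a time while input remains.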
    x0 = _mm512_load_si512((__m512i *)k3k4);
    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x2);
    x1 = _mm512_xor_si512(x1, x5);
    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x3);
    x1 = _mm512_xor_si512(x1, x5);
    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
    x1 = _mm512_xor_si512(x1, x4);
    x1 = _mm512_xor_si512(x1, x5);
    while (len >= 64) {
      x2 = _mm512_loadu_si512((__m512i *)buf);
      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
      x1 = _mm512_xor_si512(x1, x2);
      x1 = _mm512_xor_si512(x1, x5);
      buf += 64;
      len -= 64;
    }
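    // Fold the 512-bit accumulator to 128 bits lane by lane with k5/k6,
    // shrink it to 64 bits with k7, then Barrett-reduce modulo P(x) to
    // recover the 32-bit CRC.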
    a0 = _mm_load_si128((__m128i *)k5k6);
    a1 = _mm512_extracti32x4_epi32(x1, 0);
    a2 = _mm512_extracti32x4_epi32(x1, 1);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);
    a2 = _mm512_extracti32x4_epi32(x1, 2);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);
    a2 = _mm512_extracti32x4_epi32(x1, 3);
    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
    a1 = _mm_xor_si128(a1, a3);
    a1 = _mm_xor_si128(a1, a2);
    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
    a1 = _mm_srli_si128(a1, 8);
    a1 = _mm_xor_si128(a1, a2);
    a0 = _mm_loadl_epi64((__m128i *)k7k8);
    a2 = _mm_srli_si128(a1, 4);
    a1 = _mm_and_si128(a1, a3);
    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);
    a0 = _mm_load_si128((__m128i *)poly);
    a2 = _mm_and_si128(a1, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
    a2 = _mm_and_si128(a2, a3);
    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
    a1 = _mm_xor_si128(a1, a2);
    crc = _mm_extract_epi32(a1, 1);
  }
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
  if (len >= 64) {
    _Alignas(__m128) static const uint64_t k1k2[] = {
        0x0154442bd4,
        0x01c6e41596,
    };
    _Alignas(__m128) static const uint64_t k3k4[] = {
        0x01751997d0,
        0x00ccaa009e,
    };
    _Alignas(__m128) static const uint64_t k5k0[] = {
        0x0163cd6124,
        0x0000000000,
    };
    _Alignas(__m128) static const uint64_t poly[] = {
        0x01db710641,
        0x01f7011641,
    };
    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
    x0 = _mm_load_si128((__m128i *)k1k2);
    buf += 64;
    len -= 64;
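    // Same folding scheme as the AVX-512 path, but in 128-bit registers:
    // four xmm accumulators folded forward 64 bytes per iteration.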
    while (len >= 64) {
      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
      x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
      x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
      x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
      x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
      x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
      x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
      y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
      y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
      y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
      y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
      x1 = _mm_xor_si128(x1, x5);
      x2 = _mm_xor_si128(x2, x6);
      x3 = _mm_xor_si128(x3, x7);
      x4 = _mm_xor_si128(x4, x8);
      x1 = _mm_xor_si128(x1, y5);
      x2 = _mm_xor_si128(x2, y6);
      x3 = _mm_xor_si128(x3, y7);
      x4 = _mm_xor_si128(x4, y8);
      buf += 64;
      len -= 64;
    }
    x0 = _mm_load_si128((__m128i *)k3k4);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);
    while (len >= 16) {
      x2 = _mm_loadu_si128((__m128i *)buf);
      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
      x1 = _mm_xor_si128(x1, x2);
      x1 = _mm_xor_si128(x1, x5);
      buf += 16;
      len -= 16;
    }
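    // Fold the remaining 128 bits down to 64 (k4, then k5), then
    // Barrett-reduce modulo P(x) to get the 32-bit CRC.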
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);
    x0 = _mm_loadl_epi64((__m128i *)k5k0);
    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);
    x0 = _mm_load_si128((__m128i *)poly);
    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);
    crc = _mm_extract_epi32(x1, 1);
  }
#endif
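  // Portable fallback: a classic byte-at-a-time table CRC handles whatever
  // the vector paths above did not consume (the remaining tail bytes, or the
  // whole buffer on builds without PCLMUL). The table is built lazily on
  // first call.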
  static uint32_t tab[256];
  if (!tab[255]) {
    // generates table for byte-wise crc calculation on the polynomial
    // x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
    uint32_t polynomial = 0xedb88320; // bits are reversed
    for (int d = 0; d < 256; ++d) {
      uint32_t r = d;
      for (int i = 0; i < 8; ++i)
        r = r >> 1 ^ (r & 1 ? polynomial : 0);
      tab[d] = r;
    }
  }
  for (size_t i = 0; i < len; ++i)
    crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
  return ~crc & 0xffffffff;
}
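
For context on the throughput figure in the code comment, a rough harness along the following lines can measure mz_crc32 on a given machine. This is an illustrative sketch, not part of the commit: the file name, buffer size, and rep count are arbitrary, and results depend on the CPU and on compiling with the matching ISA flags (e.g. -msse4.2 -mpclmul, or additionally -mavx512f -mvpclmulqdq).

// bench_crc32.c (hypothetical): measures mz_crc32 throughput.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef unsigned long mz_ulong;  // matches miniz's mz_ulong
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len);

int main(void) {
  size_t n = 64 * 1024 * 1024;  // 64 MiB of pseudo-data
  uint8_t *buf = malloc(n);
  if (!buf) return 1;
  for (size_t i = 0; i < n; ++i) buf[i] = (uint8_t)(i * 31);
  struct timespec t0, t1;
  clock_gettime(CLOCK_MONOTONIC, &t0);
  mz_ulong crc = 0;
  int reps = 16;
  for (int r = 0; r < reps; ++r) crc = mz_crc32(crc, buf, n);
  clock_gettime(CLOCK_MONOTONIC, &t1);
  double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
  printf("crc=%08lx  %.2f GiB/s\n", crc, reps * n / secs / (1 << 30));
  free(buf);
  return 0;
}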
