|
36 | 36 | #include <unistd.h>
37 | 37 | #endif
38 | 38 |
| 39 | +#define USE_EXTERNAL_MZCRC |
39 | 40 | #include "miniz.h"
40 | 41 | #include "zip.h"
41 | 42 |
@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
1834 | 1835 |
1835 | 1836 | return zip_archive_extract(&zip_archive, dir, on_extract, arg);
1836 | 1837 | }
| 1838 | + |
| 1839 | +#if defined(__SSE4_2__) || defined(__AVX512F__) |
| 1840 | +#include <immintrin.h> |
| 1841 | +#endif |
| 1842 | + |
| 1843 | +// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles |
| 1844 | +// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX |
| 1845 | +// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" |
| 1846 | +// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 |
| 1847 | +mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) { |
| 1848 | + uint32_t crc = ~init; |
| 1849 | +#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__) |
| 1850 | + if (len >= 256) { |
| 1851 | + _Alignas(__m512) static const uint64_t k1k2[] = { |
| 1852 | + 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, |
| 1853 | + 0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430, |
| 1854 | + }; |
| 1855 | + _Alignas(__m512) static const uint64_t k3k4[] = { |
| 1856 | + 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, |
| 1857 | + 0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596, |
| 1858 | + }; |
| 1859 | + _Alignas(__m512) static const uint64_t k5k6[] = { |
| 1860 | + 0x01751997d0, |
| 1861 | + 0x00ccaa009e, |
| 1862 | + }; |
| 1863 | + _Alignas(__m512) static const uint64_t k7k8[] = { |
| 1864 | + 0x0163cd6124, |
| 1865 | + 0x0000000000, |
| 1866 | + }; |
| 1867 | + _Alignas(__m512) static const uint64_t poly[] = { |
| 1868 | + 0x01db710641, |
| 1869 | + 0x01f7011641, |
| 1870 | + }; |
| 1871 | + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| 1872 | + __m128i a0, a1, a2, a3; |
| 1873 | + x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| 1874 | + x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| 1875 | + x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| 1876 | + x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| 1877 | + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); |
| 1878 | + x0 = _mm512_load_si512((__m512i *)k1k2); |
| 1879 | + buf += 256; |
| 1880 | + len -= 256; |
| 1881 | + while (len >= 256) { |
| 1882 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 1883 | + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); |
| 1884 | + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); |
| 1885 | + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); |
| 1886 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 1887 | + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); |
| 1888 | + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); |
| 1889 | + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); |
| 1890 | + y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00)); |
| 1891 | + y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40)); |
| 1892 | + y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80)); |
| 1893 | + y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0)); |
| 1894 | + x1 = _mm512_xor_si512(x1, x5); |
| 1895 | + x2 = _mm512_xor_si512(x2, x6); |
| 1896 | + x3 = _mm512_xor_si512(x3, x7); |
| 1897 | + x4 = _mm512_xor_si512(x4, x8); |
| 1898 | + x1 = _mm512_xor_si512(x1, y5); |
| 1899 | + x2 = _mm512_xor_si512(x2, y6); |
| 1900 | + x3 = _mm512_xor_si512(x3, y7); |
| 1901 | + x4 = _mm512_xor_si512(x4, y8); |
| 1902 | + buf += 256; |
| 1903 | + len -= 256; |
| 1904 | + } |
| 1905 | + x0 = _mm512_load_si512((__m512i *)k3k4); |
| 1906 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 1907 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 1908 | + x1 = _mm512_xor_si512(x1, x2); |
| 1909 | + x1 = _mm512_xor_si512(x1, x5); |
| 1910 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 1911 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 1912 | + x1 = _mm512_xor_si512(x1, x3); |
| 1913 | + x1 = _mm512_xor_si512(x1, x5); |
| 1914 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 1915 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 1916 | + x1 = _mm512_xor_si512(x1, x4); |
| 1917 | + x1 = _mm512_xor_si512(x1, x5); |
| 1918 | + while (len >= 64) { |
| 1919 | + x2 = _mm512_loadu_si512((__m512i *)buf); |
| 1920 | + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); |
| 1921 | + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); |
| 1922 | + x1 = _mm512_xor_si512(x1, x2); |
| 1923 | + x1 = _mm512_xor_si512(x1, x5); |
| 1924 | + buf += 64; |
| 1925 | + len -= 64; |
| 1926 | + } |
| 1927 | + a0 = _mm_load_si128((__m128i *)k5k6); |
| 1928 | + a1 = _mm512_extracti32x4_epi32(x1, 0); |
| 1929 | + a2 = _mm512_extracti32x4_epi32(x1, 1); |
| 1930 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 1931 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 1932 | + a1 = _mm_xor_si128(a1, a3); |
| 1933 | + a1 = _mm_xor_si128(a1, a2); |
| 1934 | + a2 = _mm512_extracti32x4_epi32(x1, 2); |
| 1935 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 1936 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 1937 | + a1 = _mm_xor_si128(a1, a3); |
| 1938 | + a1 = _mm_xor_si128(a1, a2); |
| 1939 | + a2 = _mm512_extracti32x4_epi32(x1, 3); |
| 1940 | + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 1941 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); |
| 1942 | + a1 = _mm_xor_si128(a1, a3); |
| 1943 | + a1 = _mm_xor_si128(a1, a2); |
| 1944 | + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); |
| 1945 | + a3 = _mm_setr_epi32(~0, 0, ~0, 0); |
| 1946 | + a1 = _mm_srli_si128(a1, 8); |
| 1947 | + a1 = _mm_xor_si128(a1, a2); |
| 1948 | + a0 = _mm_loadl_epi64((__m128i *)k7k8); |
| 1949 | + a2 = _mm_srli_si128(a1, 4); |
| 1950 | + a1 = _mm_and_si128(a1, a3); |
| 1951 | + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); |
| 1952 | + a1 = _mm_xor_si128(a1, a2); |
| 1953 | + a0 = _mm_load_si128((__m128i *)poly); |
| 1954 | + a2 = _mm_and_si128(a1, a3); |
| 1955 | + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); |
| 1956 | + a2 = _mm_and_si128(a2, a3); |
| 1957 | + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); |
| 1958 | + a1 = _mm_xor_si128(a1, a2); |
| 1959 | + crc = _mm_extract_epi32(a1, 1); |
| 1960 | + } |
| 1961 | +#endif |
| 1962 | +#if defined(__SSE4_2__) && defined(__PCLMUL__) |
| 1963 | + if (len >= 64) { |
| 1964 | + _Alignas(__m128) static const uint64_t k1k2[] = { |
| 1965 | + 0x0154442bd4, |
| 1966 | + 0x01c6e41596, |
| 1967 | + }; |
| 1968 | + _Alignas(__m128) static const uint64_t k3k4[] = { |
| 1969 | + 0x01751997d0, |
| 1970 | + 0x00ccaa009e, |
| 1971 | + }; |
| 1972 | + _Alignas(__m128) static const uint64_t k5k0[] = { |
| 1973 | + 0x0163cd6124, |
| 1974 | + 0x0000000000, |
| 1975 | + }; |
| 1976 | + _Alignas(__m128) static const uint64_t poly[] = { |
| 1977 | + 0x01db710641, |
| 1978 | + 0x01f7011641, |
| 1979 | + }; |
| 1980 | + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; |
| 1981 | + x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 1982 | + x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 1983 | + x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 1984 | + x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 1985 | + x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); |
| 1986 | + x0 = _mm_load_si128((__m128i *)k1k2); |
| 1987 | + buf += 64; |
| 1988 | + len -= 64; |
| 1989 | + while (len >= 64) { |
| 1990 | + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 1991 | + x6 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 1992 | + x7 = _mm_clmulepi64_si128(x3, x0, 0x00); |
| 1993 | + x8 = _mm_clmulepi64_si128(x4, x0, 0x00); |
| 1994 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 1995 | + x2 = _mm_clmulepi64_si128(x2, x0, 0x11); |
| 1996 | + x3 = _mm_clmulepi64_si128(x3, x0, 0x11); |
| 1997 | + x4 = _mm_clmulepi64_si128(x4, x0, 0x11); |
| 1998 | + y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); |
| 1999 | + y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); |
| 2000 | + y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); |
| 2001 | + y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); |
| 2002 | + x1 = _mm_xor_si128(x1, x5); |
| 2003 | + x2 = _mm_xor_si128(x2, x6); |
| 2004 | + x3 = _mm_xor_si128(x3, x7); |
| 2005 | + x4 = _mm_xor_si128(x4, x8); |
| 2006 | + x1 = _mm_xor_si128(x1, y5); |
| 2007 | + x2 = _mm_xor_si128(x2, y6); |
| 2008 | + x3 = _mm_xor_si128(x3, y7); |
| 2009 | + x4 = _mm_xor_si128(x4, y8); |
| 2010 | + buf += 64; |
| 2011 | + len -= 64; |
| 2012 | + } |
| 2013 | + x0 = _mm_load_si128((__m128i *)k3k4); |
| 2014 | + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 2015 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 2016 | + x1 = _mm_xor_si128(x1, x2); |
| 2017 | + x1 = _mm_xor_si128(x1, x5); |
| 2018 | + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 2019 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 2020 | + x1 = _mm_xor_si128(x1, x3); |
| 2021 | + x1 = _mm_xor_si128(x1, x5); |
| 2022 | + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 2023 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 2024 | + x1 = _mm_xor_si128(x1, x4); |
| 2025 | + x1 = _mm_xor_si128(x1, x5); |
| 2026 | + while (len >= 16) { |
| 2027 | + x2 = _mm_loadu_si128((__m128i *)buf); |
| 2028 | + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 2029 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); |
| 2030 | + x1 = _mm_xor_si128(x1, x2); |
| 2031 | + x1 = _mm_xor_si128(x1, x5); |
| 2032 | + buf += 16; |
| 2033 | + len -= 16; |
| 2034 | + } |
| 2035 | + x2 = _mm_clmulepi64_si128(x1, x0, 0x10); |
| 2036 | + x3 = _mm_setr_epi32(~0, 0, ~0, 0); |
| 2037 | + x1 = _mm_srli_si128(x1, 8); |
| 2038 | + x1 = _mm_xor_si128(x1, x2); |
| 2039 | + x0 = _mm_loadl_epi64((__m128i *)k5k0); |
| 2040 | + x2 = _mm_srli_si128(x1, 4); |
| 2041 | + x1 = _mm_and_si128(x1, x3); |
| 2042 | + x1 = _mm_clmulepi64_si128(x1, x0, 0x00); |
| 2043 | + x1 = _mm_xor_si128(x1, x2); |
| 2044 | + x0 = _mm_load_si128((__m128i *)poly); |
| 2045 | + x2 = _mm_and_si128(x1, x3); |
| 2046 | + x2 = _mm_clmulepi64_si128(x2, x0, 0x10); |
| 2047 | + x2 = _mm_and_si128(x2, x3); |
| 2048 | + x2 = _mm_clmulepi64_si128(x2, x0, 0x00); |
| 2049 | + x1 = _mm_xor_si128(x1, x2); |
| 2050 | + crc = _mm_extract_epi32(x1, 1); |
| 2051 | + } |
| 2052 | +#endif |
| 2053 | + static uint32_t tab[256]; |
| 2054 | + if (!tab[255]) { |
| 2055 | + // generates table for byte-wise crc calculation on the polynomial |
| 2056 | + // x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1 |
| 2057 | + uint32_t polynomial = 0xedb88320; // bits are reversed |
| 2058 | + for (int d = 0; d < 256; ++d) { |
| 2059 | + uint32_t r = d; |
| 2060 | + for (int i = 0; i < 8; ++i) |
| 2061 | + r = r >> 1 ^ (r & 1 ? polynomial : 0); |
| 2062 | + tab[d] = r; |
| 2063 | + } |
| 2064 | + } |
| 2065 | + for (size_t i = 0; i < len; ++i) |
| 2066 | + crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]]; |
| 2067 | + return ~crc & 0xffffffff; |
| 2068 | +} |
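
For reference only (this is not part of the patch): a quick way to exercise the new routine is to check it against the standard CRC-32 check value and to cross-check the SIMD paths against the table-driven tail path, since buffers shorter than 64 bytes take only the table path while the SSE and AVX-512 paths need at least 64 and 256 bytes respectively. The sketch below assumes the stock miniz API, where MZ_CRC32_INIT is 0 and a previous result can be passed back as init to continue a CRC; the file name is illustrative.

// test_crc32.c -- illustrative sanity test, not part of this change.
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include "miniz.h" // declares mz_crc32() and (in stock miniz) MZ_CRC32_INIT

int main(void) {
  // Well-known CRC-32 check value: crc32("123456789") == 0xCBF43926.
  static const unsigned char msg[] = "123456789";
  mz_ulong whole = mz_crc32(MZ_CRC32_INIT, msg, sizeof(msg) - 1);
  assert(whole == 0xCBF43926);

  // Chaining: pass the previous result back in as the init argument.
  mz_ulong part = mz_crc32(MZ_CRC32_INIT, msg, 4);
  part = mz_crc32(part, msg + 4, sizeof(msg) - 1 - 4);
  assert(part == whole);

  // Cross-check: one call over a large buffer (takes the SIMD paths when
  // compiled in) must match byte-at-a-time chained calls (table path only).
  unsigned char big[1024];
  for (size_t i = 0; i < sizeof(big); ++i)
    big[i] = (unsigned char)(i * 131 + 7);
  mz_ulong fast = mz_crc32(MZ_CRC32_INIT, big, sizeof(big));
  mz_ulong slow = MZ_CRC32_INIT;
  for (size_t i = 0; i < sizeof(big); ++i)
    slow = mz_crc32(slow, big + i, 1);
  assert(fast == slow);

  printf("crc32 ok: 0x%08lX\n", (unsigned long)whole);
  return 0;
}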