xxh3.h
1 /*
2  * xxHash - Extremely Fast Hash algorithm
3  * Development source file for `xxh3`
4  * Copyright (C) 2019-present, Yann Collet
5  *
6  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are
10  * met:
11  *
12  * * Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  * * Redistributions in binary form must reproduce the above
15  * copyright notice, this list of conditions and the following disclaimer
16  * in the documentation and/or other materials provided with the
17  * distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  *
31  * You can contact the author at:
32  * - xxHash homepage: http://www.xxhash.com
33  * - xxHash source repository: https://github.com/Cyan4973/xxHash
34  */
35 
36 /*
37  * Note: This file is separated for development purposes.
38  * It will be integrated into `xxhash.h` when development stage is completed.
39  *
40  * Credit: most of the work on vectorial and asm variants comes from @easyaspi314
41  */
42 
43 #ifndef XXH3_H_1397135465
44 #define XXH3_H_1397135465
45 
46 /* === Dependencies === */
47 #ifndef XXHASH_H_5627135585666179
48 /* special: when including `xxh3.h` directly, turn on XXH_INLINE_ALL */
49 # undef XXH_INLINE_ALL /* avoid redefinition */
50 # define XXH_INLINE_ALL
51 #endif
52 #include "xxhash.h"
53 
54 
55 /* === Compiler specifics === */
56 
57 #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
58 # define XXH_RESTRICT restrict
59 #else
60 /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
61 # define XXH_RESTRICT /* disable */
62 #endif
63 
64 #if (defined(__GNUC__) && (__GNUC__ >= 3)) \
65  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
66  || defined(__clang__)
67 # define XXH_likely(x) __builtin_expect(x, 1)
68 # define XXH_unlikely(x) __builtin_expect(x, 0)
69 #else
70 # define XXH_likely(x) (x)
71 # define XXH_unlikely(x) (x)
72 #endif
73 
74 #if defined(__GNUC__)
75 # if defined(__AVX2__)
76 # include <immintrin.h>
77 # elif defined(__SSE2__)
78 # include <emmintrin.h>
79 # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
80 # define inline __inline__ /* clang bug */
81 # include <arm_neon.h>
82 # undef inline
83 # endif
84 #elif defined(_MSC_VER)
85 # include <intrin.h>
86 #endif
87 
88 /*
89  * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
90  * remaining a true 64-bit/128-bit hash function.
91  *
92  * This is done by prioritizing a subset of 64-bit operations that can be
93  * emulated without too many steps on the average 32-bit machine.
94  *
95  * For example, these two lines seem similar, and run equally fast on 64-bit:
96  *
97  * xxh_u64 x;
98  * x ^= (x >> 47); // good
99  * x ^= (x >> 13); // bad
100  *
101  * However, to a 32-bit machine, there is a major difference.
102  *
103  * x ^= (x >> 47) looks like this:
104  *
105  * x.lo ^= (x.hi >> (47 - 32));
106  *
107  * while x ^= (x >> 13) looks like this:
108  *
109  * // note: funnel shifts are not usually cheap.
110  * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
111  * x.hi ^= (x.hi >> 13);
112  *
113  * The first one is significantly faster than the second, simply because the
114  * shift is larger than 32. This means:
115  * - All the bits we need are in the upper 32 bits, so we can ignore the lower
116  * 32 bits in the shift.
117  * - The shift result will always fit in the lower 32 bits, and therefore,
118  * we can ignore the upper 32 bits in the xor.
119  *
120  * Thanks to this optimization, XXH3 only requires these features to be efficient:
121  *
122  * - Usable unaligned access
123  * - A 32-bit or 64-bit ALU
124  * - If 32-bit, a decent ADC instruction
125  * - A 32 or 64-bit multiply with a 64-bit result
126  * - For the 128-bit variant, a decent byteswap helps short inputs.
127  *
128  * The first two are already required by XXH32, and almost all 32-bit and 64-bit
129  * platforms which can run XXH32 can run XXH3 efficiently.
130  *
131  * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
132  * notable exception.
133  *
134  * First of all, Thumb-1 lacks support for the UMULL instruction which
135  * performs the important long multiply. This means numerous __aeabi_lmul
136  * calls.
137  *
138  * Second of all, the 8 functional registers are just not enough.
139  * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
140  * Lo registers, and this shuffling results in thousands more MOVs than A32.
141  *
142  * A32 and T32 don't have this limitation. They can access all 14 registers,
143  * do a 32->64 multiply with UMULL, and the flexible operand allowing free
144  * shifts is helpful, too.
145  *
146  * Therefore, we do a quick sanity check.
147  *
148  * If compiling Thumb-1 for a target which supports ARM instructions, we will
149  * emit a warning, as it is not a "sane" platform to compile for.
150  *
151  * Usually, if this happens, it is because of an accident and you probably need
152  * to specify -march, as you likely meant to compile for a newer architecture.
153  */
154 #if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
155 # warning "XXH3 is highly inefficient without ARM or Thumb-2."
156 #endif
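/*
 * For example, if the warning above fires, it is usually enough to switch to
 * ARM mode or to a Thumb-2 capable -march (illustrative flags only; the exact
 * invocation depends on the toolchain and target):
 *
 *   cc -O3 -marm          -c xxhash.c
 *   cc -O3 -march=armv7-a -c xxhash.c
 */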
157 
158 /* ==========================================
159  * Vectorization detection
160  * ========================================== */
161 #define XXH_SCALAR 0 /* Portable scalar version */
162 #define XXH_SSE2 1 /* SSE2 for Pentium 4 and all x86_64 */
163 #define XXH_AVX2 2 /* AVX2 for Haswell and Bulldozer */
164 #define XXH_NEON 3 /* NEON for most ARMv7-A and all AArch64 */
165 #define XXH_VSX 4 /* VSX and ZVector for POWER8/z13 */
166 
167 #ifndef XXH_VECTOR /* can be defined on command line */
168 # if defined(__AVX2__)
169 # define XXH_VECTOR XXH_AVX2
170 # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
171 # define XXH_VECTOR XXH_SSE2
172 # elif defined(__GNUC__) /* msvc support maybe later */ \
173  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
174  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
175  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
176 # define XXH_VECTOR XXH_NEON
177 # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
178  || (defined(__s390x__) && defined(__VEC__)) \
179  && defined(__GNUC__) /* TODO: IBM XL */
180 # define XXH_VECTOR XXH_VSX
181 # else
182 # define XXH_VECTOR XXH_SCALAR
183 # endif
184 #endif
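/*
 * As noted above, XXH_VECTOR can be set on the command line to override the
 * automatic detection. An illustrative sketch (GCC/Clang-style driver assumed):
 *
 *   cc -O3 -DXXH_VECTOR=XXH_SCALAR -c xxhash.c        # force the portable path
 *   cc -O3 -mavx2 -DXXH_VECTOR=XXH_AVX2 -c xxhash.c   # explicitly request AVX2
 *
 * The raw numeric values (0..4, as defined above) work as well,
 * e.g. -DXXH_VECTOR=0.
 */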
185 
186 /*
187  * Controls the alignment of the accumulator.
188  * This is for compatibility with aligned vector loads, which are usually faster.
189  */
190 #ifndef XXH_ACC_ALIGN
191 # if XXH_VECTOR == XXH_SCALAR /* scalar */
192 # define XXH_ACC_ALIGN 8
193 # elif XXH_VECTOR == XXH_SSE2 /* sse2 */
194 # define XXH_ACC_ALIGN 16
195 # elif XXH_VECTOR == XXH_AVX2 /* avx2 */
196 # define XXH_ACC_ALIGN 32
197 # elif XXH_VECTOR == XXH_NEON /* neon */
198 # define XXH_ACC_ALIGN 16
199 # elif XXH_VECTOR == XXH_VSX /* vsx */
200 # define XXH_ACC_ALIGN 16
201 # endif
202 #endif
203 
204 /*
205  * UGLY HACK:
206  * GCC usually generates the best code with -O3 for xxHash.
207  *
208  * However, when targeting AVX2, it is overzealous in its unrolling resulting
209  * in code roughly 3/4 the speed of Clang.
210  *
211  * There are other issues, such as GCC splitting _mm256_loadu_si256 into
212  * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
213  * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
214  *
215  * That is why when compiling the AVX2 version, it is recommended to use either
216  * -O2 -mavx2 -march=haswell
217  * or
218  * -O2 -mavx2 -mno-avx256-split-unaligned-load
219  * for decent performance, or to use Clang instead.
220  *
221  * Fortunately, we can control the first one with a pragma that forces GCC into
222  * -O2, but the other one we can't control without "failed to inline always
223  * inline function due to target mismatch" warnings.
224  */
225 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
226  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
227  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
228 # pragma GCC push_options
229 # pragma GCC optimize("-O2")
230 #endif
231 
232 
233 #if XXH_VECTOR == XXH_NEON
234 /*
235  * NEON's setup for vmlal_u32 is a little more complicated than it is on
236  * SSE2, AVX2, and VSX.
237  *
238  * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
239  *
240  * To do the same operation, the 128-bit 'Q' register needs to be split into
241  * two 64-bit 'D' registers, performing this operation:
242  *
243  *   [                a                 |                 b               ]
244  *            |              '---------. .--------'              |
245  *            |                         x                        |
246  *            |              .---------' '--------.              |
247  *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32   ]
248  *
249  * Due to significant changes in aarch64, the fastest method for aarch64 is
250  * completely different than the fastest method for ARMv7-A.
251  *
252  * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
253  * D11 will modify the high half of Q5. This is similar to how modifying AH
254  * will only affect bits 8-15 of AX on x86.
255  *
256  * VZIP takes two registers, and puts even lanes in one register and odd lanes
257  * in the other.
258  *
259  * On ARMv7-A, this strangely modifies both parameters in place instead of
260  * taking the usual 3-operand form.
261  *
262  * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
263  * lower and upper halves of the Q register to end up with the high and low
264  * halves where we want - all in one instruction.
265  *
266  * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
267  *
268  * Unfortunately we need inline assembly for this: instructions that modify two
269  * registers at once cannot be expressed in GCC or Clang's IR, so without it the
270  * compilers have to create a copy.
271  *
272  * aarch64 requires a different approach.
273  *
274  * In order to make it easier to write a decent compiler for aarch64, many
275  * quirks were removed, such as conditional execution.
276  *
277  * NEON was also affected by this.
278  *
279  * aarch64 cannot access the high bits of a Q-form register, and writes to a
280  * D-form register zero the high bits, similar to how writes to W-form scalar
281  * registers (or DWORD registers on x86_64) work.
282  *
283  * The formerly free vget_high intrinsics now require a vext (with a few
284  * exceptions)
285  *
286  * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
287  * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
288  * operand.
289  *
290  * The equivalent of the VZIP.32 on the lower and upper halves would be this
291  * mess:
292  *
293  * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
294  * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
295  * zip2 v0.2s, v0.2s, v2.2s // v0 = { v0[1], v2[1] }
296  *
297  * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
298  *
299  * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
300  * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
301  *
302  * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
303  */
304 
305 /*
306  * Function-like macro:
307  * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
308  * {
309  * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
310  * outHi = (uint32x2_t)(in >> 32);
311  * in = UNDEFINED;
312  * }
313  */
314 # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
315  && defined(__GNUC__) \
316  && !defined(__aarch64__) && !defined(__arm64__)
317 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
318  do { \
319  /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
320  /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
321  /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
322  __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
323  (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
324  (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
325  } while (0)
326 # else
327 # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
328  do { \
329  (outLo) = vmovn_u64 (in); \
330  (outHi) = vshrn_n_u64 ((in), 32); \
331  } while (0)
332 # endif
333 #endif /* XXH_VECTOR == XXH_NEON */
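/*
 * A worked example of the split performed by XXH_SPLIT_IN_PLACE (values are
 * illustrative): given
 *
 *   in    = { 0x0000000100000002, 0x0000000300000004 }   // uint64x2_t
 *
 * the macro leaves
 *
 *   outLo = { 0x00000002, 0x00000004 }   // low 32 bits of each lane
 *   outHi = { 0x00000001, 0x00000003 }   // high 32 bits of each lane
 *
 * and `in` must be treated as undefined afterwards.
 */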
334 
335 /*
336  * VSX and Z Vector helpers.
337  *
338  * This is very messy, and any pull requests to clean this up are welcome.
339  *
340  * There are a lot of problems with supporting VSX and s390x, due to
341  * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
342  */
343 #if XXH_VECTOR == XXH_VSX
344 # if defined(__s390x__)
345 # include <s390intrin.h>
346 # else
347 # include <altivec.h>
348 # endif
349 
350 # undef vector /* Undo the pollution */
351 
352 typedef __vector unsigned long long xxh_u64x2;
353 typedef __vector unsigned char xxh_u8x16;
354 typedef __vector unsigned xxh_u32x4;
355 
356 # ifndef XXH_VSX_BE
357 # if defined(__BIG_ENDIAN__) \
358  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
359 # define XXH_VSX_BE 1
360 # elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
361 # warning "-maltivec=be is not recommended. Please use native endianness."
362 # define XXH_VSX_BE 1
363 # else
364 # define XXH_VSX_BE 0
365 # endif
366 # endif /* !defined(XXH_VSX_BE) */
367 
368 # if XXH_VSX_BE
369 /* A wrapper for POWER9's vec_revb. */
370 # if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
371 # define XXH_vec_revb vec_revb
372 # else
373 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
374 {
375  xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
376  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
377  return vec_perm(val, val, vByteSwap);
378 }
379 # endif
380 # endif /* XXH_VSX_BE */
381 
382 /*
383  * Performs an unaligned load and byte swaps it on big endian.
384  */
385 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
386 {
387  xxh_u64x2 ret;
388  memcpy(&ret, ptr, sizeof(xxh_u64x2));
389 # if XXH_VSX_BE
390  ret = XXH_vec_revb(ret);
391 # endif
392  return ret;
393 }
394 
395 /*
396  * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
397  *
398  * These intrinsics weren't added until GCC 8, despite existing for a while,
399  * and they are endian dependent. Also, their meanings swap depending on the version.
400  */
401 # if defined(__s390x__)
402  /* s390x is always big endian, no issue on this platform */
403 # define XXH_vec_mulo vec_mulo
404 # define XXH_vec_mule vec_mule
405 # elif defined(__clang__) && __has_builtin(__builtin_altivec_vmuleuw)
406 /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
407 # define XXH_vec_mulo __builtin_altivec_vmulouw
408 # define XXH_vec_mule __builtin_altivec_vmuleuw
409 # else
410 /* gcc needs inline assembly */
411 /* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
412 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
413 {
414  xxh_u64x2 result;
415  __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
416  return result;
417 }
418 XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
419 {
420  xxh_u64x2 result;
421  __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
422  return result;
423 }
424 # endif /* XXH_vec_mulo, XXH_vec_mule */
425 #endif /* XXH_VECTOR == XXH_VSX */
426 
427 
428 /* prefetch
429  * can be disabled by defining the XXH_NO_PREFETCH build macro */
430 #if defined(XXH_NO_PREFETCH)
431 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
432 #else
433 # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */
434 # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
435 # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
436 # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
437 # define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
438 # else
439 # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
440 # endif
441 #endif /* XXH_NO_PREFETCH */
442 
443 
444 /* ==========================================
445  * XXH3 default settings
446  * ========================================== */
447 
448 #define XXH_SECRET_DEFAULT_SIZE 192 /* must be at least XXH3_SECRET_SIZE_MIN */
449 
450 #if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
451 # error "default keyset is not large enough"
452 #endif
453 
454 /* Pseudorandom secret taken directly from FARSH */
455 XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
456  0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
457  0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
458  0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
459  0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
460  0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
461  0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
462  0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
463  0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
464 
465  0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
466  0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
467  0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
468  0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
469 };
470 
471 /*
472  * Does a 32-bit to 64-bit long multiply.
473  *
474  * Wraps __emulu on MSVC x86 because it tends to call __allmul when it doesn't
475  * need to (but it shouldn't need to anyways, it is about 7 instructions to do
476  * a 64x64 multiply...). Since we know that this will _always_ emit MULL, we
477  * use that instead of the normal method.
478  *
479  * If you are compiling for platforms like Thumb-1 and don't have a better option,
480  * you may also want to write your own long multiply routine here.
481  *
482  * XXH_FORCE_INLINE xxh_u64 XXH_mult32to64(xxh_u64 x, xxh_u64 y)
483  * {
484  * return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
485  * }
486  */
487 #if defined(_MSC_VER) && defined(_M_IX86)
488 # include <intrin.h>
489 # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
490 #else
491 /*
492  * Downcast + upcast is usually better than masking on older compilers like
493  * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
494  *
495  * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
496  * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
497  */
498 # define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
499 #endif
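/*
 * A quick sanity example for XXH_mult32to64 (illustrative only):
 *
 *   XXH_mult32to64(0xFFFFFFFF, 0xFFFFFFFF) == 0xFFFFFFFE00000001ULL
 *
 * i.e. (2^32 - 1)^2 = 2^64 - 2^33 + 1, which needs the full 64-bit result;
 * a plain 32-bit multiply would truncate it to 0x00000001.
 */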
500 
501 /*
502  * Calculates a 64->128-bit long multiply.
503  *
504  * Uses __uint128_t and _umul128 if available, otherwise uses a scalar version.
505  */
506 static XXH128_hash_t
507 XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
508 {
509  /*
510  * GCC/Clang __uint128_t method.
511  *
512  * On most 64-bit targets, GCC and Clang define a __uint128_t type.
513  * This is usually the best way as it usually uses a native long 64-bit
514  * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
515  *
516  * Usually.
517  *
518  * However, on 32-bit targets such as wasm, Clang (and Emscripten) define this
519  * type despite not having the native arithmetic for it. This results in a laggy
520  * compiler builtin call which calculates a full 128-bit multiply.
521  * In that case it is best to use the portable one.
522  * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
523  */
524 #if defined(__GNUC__) && !defined(__wasm__) \
525  && defined(__SIZEOF_INT128__) \
526  || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
527 
528  __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
529  XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
530  return r128;
531 
532  /*
533  * MSVC for x64's _umul128 method.
534  *
535  * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
536  *
537  * This compiles to single operand MUL on x64.
538  */
539 #elif defined(_M_X64) || defined(_M_IA64)
540 
541 #ifndef _MSC_VER
542 # pragma intrinsic(_umul128)
543 #endif
544  xxh_u64 product_high;
545  xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
546  XXH128_hash_t const r128 = { product_low, product_high };
547  return r128;
548 
549 #else
550  /*
551  * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
552  *
553  * This is a fast and simple grade school multiply, which is shown below
554  * with base 10 arithmetic instead of base 0x100000000.
555  *
556  *            9 3 // D2 lhs = 93
557  *          x 7 5 // D2 rhs = 75
558  *     ----------
559  *            1 5 // D2 lo_lo  = (93 % 10) * (75 % 10) = 15
560  *          4 5 | // D2 hi_lo  = (93 / 10) * (75 % 10) = 45
561  *          2 1 | // D2 lo_hi  = (93 % 10) * (75 / 10) = 21
562  *  +     6 3 | | // D2 hi_hi  = (93 / 10) * (75 / 10) = 63
563  *     ---------
564  *          2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21 = 27
565  *  +     6 7 | | // D2 upper  = (45 / 10) + 63 = 67 (the code also folds cross / 10 into `upper`)
566  *     ---------
567  *        6 9 7 5 // D4 res    = (27 * 10) + (15 % 10) + (67 * 100) = 6975
568  *
569  * The reasons for adding the products like this are:
570  * 1. It avoids manual carry tracking. Just like how
571  * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
572  * This avoids a lot of complexity.
573  *
574  * 2. It hints for, and on Clang, compiles to, the powerful UMAAL
575  * instruction available in ARM's Digital Signal Processing extension
576  * in 32-bit ARMv6 and later, which is shown below:
577  *
578  * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
579  * {
580  * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
581  * *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
582  * *RdHi = (xxh_u32)(product >> 32);
583  * }
584  *
585  * This instruction was designed for efficient long multiplication, and
586  * allows this to be calculated in only 4 instructions at speeds
587  * comparable to some 64-bit ALUs.
588  *
589  * 3. It isn't terrible on other platforms. Usually this will be a couple
590  * of 32-bit ADD/ADCs.
591  */
592 
593  /* First calculate all of the cross products. */
594  xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
595  xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
596  xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
597  xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32);
598 
599  /* Now add the products together. These will never overflow. */
600  xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
601  xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
602  xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
603 
604  XXH128_hash_t r128 = { lower, upper };
605  return r128;
606 #endif
607 }
608 
609 /*
610  * Does a 64-bit to 128-bit multiply, then XOR folds it.
611  *
612  * The reason for the separate function is to prevent passing too many structs
613  * around by value. This will hopefully inline the multiply, but we don't force it.
614  */
615 static xxh_u64
616 XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
617 {
618  XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
619  return product.low64 ^ product.high64;
620 }
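/*
 * A tiny worked example of the fold (illustrative): with lhs = rhs = 1ULL << 32,
 * the full product is 2^64, i.e. { .low64 = 0, .high64 = 1 }, so
 * XXH3_mul128_fold64(1ULL << 32, 1ULL << 32) == (0 ^ 1) == 1.
 * The XOR keeps information from both halves while returning a single xxh_u64.
 */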
621 
622 /* Seems to produce slightly better code on GCC for some reason. */
623 XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
624 {
625  XXH_ASSERT(0 <= shift && shift < 64);
626  return v64 ^ (v64 >> shift);
627 }
628 
629 /*
630  * We don't need to (or want to) mix as much as XXH64.
631  *
632  * Short hashes are more evenly distributed, so it isn't necessary.
633  */
634 static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
635 {
636  h64 = XXH_xorshift64(h64, 37);
637  h64 *= 0x165667919E3779F9ULL;
638  h64 = XXH_xorshift64(h64, 32);
639  return h64;
640 }
641 
642 
643 /* ==========================================
644  * Short keys
645  * ==========================================
646  * One of the shortcomings of XXH32 and XXH64 was that their performance was
647  * sub-optimal on short lengths. They used an iterative algorithm which strongly
648  * favored lengths that were a multiple of 4 or 8.
649  *
650  * Instead of iterating over individual inputs, we use a set of single shot
651  * functions which piece together a range of lengths and operate in constant time.
652  *
653  * Additionally, the number of multiplies has been significantly reduced. This
654  * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
655  *
656  * Depending on the platform, this may or may not be faster than XXH32, but it
657  * is almost guaranteed to be faster than XXH64.
658  */
659 
660 /*
661  * At very short lengths, there isn't enough input to fully hide secrets, or use
662  * the entire secret.
663  *
664  * There is also only a limited amount of mixing we can do before significantly
665  * impacting performance.
666  *
667  * Therefore, we use different sections of the secret and always mix two secret
668  * samples with an XOR. This should have no effect on performance on the
669  * seedless or withSeed variants because everything _should_ be constant folded
670  * by modern compilers.
671  *
672  * The XOR mixing hides individual parts of the secret and increases entropy.
673  *
674  * This adds an extra layer of strength for custom secrets.
675  */
676 XXH_FORCE_INLINE XXH64_hash_t
677 XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
678 {
679  XXH_ASSERT(input != NULL);
680  XXH_ASSERT(1 <= len && len <= 3);
681  XXH_ASSERT(secret != NULL);
682  /*
683  * len = 1: combined = { input[0], 0x01, input[0], input[0] }
684  * len = 2: combined = { input[1], 0x02, input[0], input[1] }
685  * len = 3: combined = { input[2], 0x03, input[0], input[1] }
686  */
687  { xxh_u8 const c1 = input[0];
688  xxh_u8 const c2 = input[len >> 1];
689  xxh_u8 const c3 = input[len - 1];
690  xxh_u32 const combined = ((xxh_u32)c1<<16) | (((xxh_u32)c2) << 24) | (((xxh_u32)c3) << 0) | (((xxh_u32)len) << 8);
691  xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
692  xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
693  xxh_u64 const mixed = keyed * PRIME64_1;
694  return XXH3_avalanche(mixed);
695  }
696 }
697 
698 XXH_FORCE_INLINE XXH64_hash_t
699 XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
700 {
701  XXH_ASSERT(input != NULL);
702  XXH_ASSERT(secret != NULL);
703  XXH_ASSERT(4 <= len && len <= 8);
704  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
705  { xxh_u32 const input1 = XXH_readLE32(input);
706  xxh_u32 const input2 = XXH_readLE32(input + len - 4);
707  xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
708  xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
709  xxh_u64 x = input64 ^ bitflip;
710  /* this mix is inspired by Pelle Evensen's rrmxmx */
711  x ^= XXH_rotl64(x, 49) ^ XXH_rotl64(x, 24);
712  x *= 0x9FB21C651E98DF25ULL;
713  x ^= (x >> 35) + len ;
714  x *= 0x9FB21C651E98DF25ULL;
715  return XXH_xorshift64(x, 28);
716  }
717 }
718 
719 XXH_FORCE_INLINE XXH64_hash_t
720 XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
721 {
722  XXH_ASSERT(input != NULL);
723  XXH_ASSERT(secret != NULL);
724  XXH_ASSERT(8 <= len && len <= 16);
725  { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
726  xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
727  xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1;
728  xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
729  xxh_u64 const acc = len
730  + XXH_swap64(input_lo) + input_hi
731  + XXH3_mul128_fold64(input_lo, input_hi);
732  return XXH3_avalanche(acc);
733  }
734 }
735 
736 XXH_FORCE_INLINE XXH64_hash_t
737 XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
738 {
739  XXH_ASSERT(len <= 16);
740  { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed);
741  if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
742  if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
743  return XXH3_avalanche((PRIME64_1 + seed) ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
744  }
745 }
746 
747 /*
748  * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
749  * multiplication by zero, affecting hashes of lengths 17 to 240.
750  *
751  * However, they are very unlikely.
752  *
753  * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
754  * unseeded non-cryptographic hashes, it does not attempt to defend itself
755  * against specially crafted inputs, only random inputs.
756  *
757  * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
758  * cancelling out the secret is taken an arbitrary number of times (addressed
759  * in XXH3_accumulate_512), this collision is very unlikely with random inputs
760  * and/or proper seeding:
761  *
762  * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
763  * function that is only called up to 16 times per hash with up to 240 bytes of
764  * input.
765  *
766  * This is not too bad for a non-cryptographic hash function, especially with
767  * only 64 bit outputs.
768  *
769  * The 128-bit variant (which trades some speed for strength) is NOT affected
770  * by this, although it is always a good idea to use a proper seed if you care
771  * about strength.
772  */
773 XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
774  const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
775 {
776 #if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
777  && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \
778  && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */
779  /*
780  * UGLY HACK:
781  * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
782  * slower code.
783  *
784  * By forcing seed64 into a register, we disrupt the cost model and
785  * cause it to scalarize. See `XXH32_round()`
786  *
787  * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
788  * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
789  * GCC 9.2, despite both emitting scalar code.
790  *
791  * GCC generates much better scalar code than Clang for the rest of XXH3,
792  * which is why finding a more optimal codepath is an interest.
793  */
794  __asm__ ("" : "+r" (seed64));
795 #endif
796  { xxh_u64 const input_lo = XXH_readLE64(input);
797  xxh_u64 const input_hi = XXH_readLE64(input+8);
798  return XXH3_mul128_fold64(
799  input_lo ^ (XXH_readLE64(secret) + seed64),
800  input_hi ^ (XXH_readLE64(secret+8) - seed64)
801  );
802  }
803 }
804 
805 /* For mid range keys, XXH3 uses a Mum-hash variant. */
806 XXH_FORCE_INLINE XXH64_hash_t
807 XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
808  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
809  XXH64_hash_t seed)
810 {
811  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
812  XXH_ASSERT(16 < len && len <= 128);
813 
814  { xxh_u64 acc = len * PRIME64_1;
815  if (len > 32) {
816  if (len > 64) {
817  if (len > 96) {
818  acc += XXH3_mix16B(input+48, secret+96, seed);
819  acc += XXH3_mix16B(input+len-64, secret+112, seed);
820  }
821  acc += XXH3_mix16B(input+32, secret+64, seed);
822  acc += XXH3_mix16B(input+len-48, secret+80, seed);
823  }
824  acc += XXH3_mix16B(input+16, secret+32, seed);
825  acc += XXH3_mix16B(input+len-32, secret+48, seed);
826  }
827  acc += XXH3_mix16B(input+0, secret+0, seed);
828  acc += XXH3_mix16B(input+len-16, secret+16, seed);
829 
830  return XXH3_avalanche(acc);
831  }
832 }
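/*
 * A worked example of the symmetric mixing above, for len = 100: all three
 * branches are taken (100 > 32, > 64, > 96), so eight XXH3_mix16B calls run,
 * reading input at offsets 0, 16, 32, 48 from the front and at len-16, len-32,
 * len-48, len-64 (84, 68, 52, 36) from the back, against the first 128 bytes of
 * the secret in 16-byte steps. Together they cover every input byte at least
 * once, with the middle bytes overlapping.
 */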
833 
834 #define XXH3_MIDSIZE_MAX 240
835 
836 XXH_NO_INLINE XXH64_hash_t
837 XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
838  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
839  XXH64_hash_t seed)
840 {
841  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
842  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
843 
844  #define XXH3_MIDSIZE_STARTOFFSET 3
845  #define XXH3_MIDSIZE_LASTOFFSET 17
846 
847  { xxh_u64 acc = len * PRIME64_1;
848  int const nbRounds = (int)len / 16;
849  int i;
850  for (i=0; i<8; i++) {
851  acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
852  }
853  acc = XXH3_avalanche(acc);
854  XXH_ASSERT(nbRounds >= 8);
855 #if defined(__clang__) /* Clang */ \
856  && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
857  && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
858  /*
859  * UGLY HACK:
860  * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
861  * Everywhere else, it uses scalar code.
862  *
863  * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
864  * would still be slower than UMAAL (see XXH_mult64to128).
865  *
866  * Unfortunately, Clang doesn't handle the long multiplies properly and
867  * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
868  * scalarized into an ugly mess of VMOV.32 instructions.
869  *
870  * This mess is difficult to avoid without turning autovectorization
871  * off completely, but they are usually relatively minor and/or not
872  * worth it to fix.
873  *
874  * This loop is the easiest to fix, as unlike XXH32, this pragma
875  * _actually works_ because it is a loop vectorization instead of an
876  * SLP vectorization.
877  */
878  #pragma clang loop vectorize(disable)
879 #endif
880  for (i=8 ; i < nbRounds; i++) {
881  acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
882  }
883  /* last bytes */
884  acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
885  return XXH3_avalanche(acc);
886  }
887 }
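/*
 * A worked example of the pass structure above, for len = 240 (the maximum):
 * nbRounds = 240/16 = 15. The first loop (i = 0..7) mixes input[0..127] against
 * secret[0..127]; the second loop (i = 8..14) mixes input[128..239] against
 * secret offsets 3, 19, ..., 99 (i.e. 16*(i-8) + XXH3_MIDSIZE_STARTOFFSET); the
 * final XXH3_mix16B mixes input[224..239] against
 * secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET. The odd start
 * offsets keep these later reads from landing on the same 16-byte secret
 * windows as the first loop.
 */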
888 
889 
890 /* === Long Keys === */
891 
892 #define STRIPE_LEN 64
893 #define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */
894 #define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
895 
896 typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
897 
898 /*
899  * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
900  *
901  * It is a hardened version of UMAC, based off of FARSH's implementation.
902  *
903  * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
904  * implementations, and it is ridiculously fast.
905  *
906  * We harden it by mixing the original input into the accumulators as well as the product.
907  *
908  * This means that in the (relatively likely) case of a multiply by zero, the
909  * original input is preserved.
910  *
911  * For the 128-bit variant, we swap 64-bit pairs when we add the input to
912  * improve cross-pollination, as otherwise the upper and lower halves would
913  * be essentially independent.
914  *
915  * This doesn't matter on 64-bit hashes since they all get merged together in
916  * the end, so we skip the extra step.
917  *
918  * Both XXH3_64bits and XXH3_128bits use this subroutine.
919  */
920 XXH_FORCE_INLINE void
921 XXH3_accumulate_512( void* XXH_RESTRICT acc,
922  const void* XXH_RESTRICT input,
923  const void* XXH_RESTRICT secret,
924  XXH3_accWidth_e accWidth)
925 {
926 #if (XXH_VECTOR == XXH_AVX2)
927 
928  XXH_ASSERT((((size_t)acc) & 31) == 0);
929  { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
930  /* Unaligned. This is mainly for pointer arithmetic, and because
931  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
932  const __m256i* const xinput = (const __m256i *) input;
933  /* Unaligned. This is mainly for pointer arithmetic, and because
934  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
935  const __m256i* const xsecret = (const __m256i *) secret;
936 
937  size_t i;
938  for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
939  /* data_vec = xinput[i]; */
940  __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
941  /* key_vec = xsecret[i]; */
942  __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
943  /* data_key = data_vec ^ key_vec; */
944  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
945  /* data_key_lo = data_key >> 32; */
946  __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
947  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
948  __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
949  if (accWidth == XXH3_acc_128bits) {
950  /* xacc[i] += swap(data_vec); */
951  __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
952  __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
953  /* xacc[i] += product; */
954  xacc[i] = _mm256_add_epi64(product, sum);
955  } else { /* XXH3_acc_64bits */
956  /* xacc[i] += data_vec; */
957  __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
958  /* xacc[i] += product; */
959  xacc[i] = _mm256_add_epi64(product, sum);
960  }
961  } }
962 
963 #elif (XXH_VECTOR == XXH_SSE2)
964 
965  /* SSE2 is just a half-scale version of the AVX2 version. */
966  XXH_ASSERT((((size_t)acc) & 15) == 0);
967  { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
968  /* Unaligned. This is mainly for pointer arithmetic, and because
969  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
970  const __m128i* const xinput = (const __m128i *) input;
971  /* Unaligned. This is mainly for pointer arithmetic, and because
972  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
973  const __m128i* const xsecret = (const __m128i *) secret;
974 
975  size_t i;
976  for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
977  /* data_vec = xinput[i]; */
978  __m128i const data_vec = _mm_loadu_si128 (xinput+i);
979  /* key_vec = xsecret[i]; */
980  __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
981  /* data_key = data_vec ^ key_vec; */
982  __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
983  /* data_key_lo = data_key >> 32; */
984  __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
985  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
986  __m128i const product = _mm_mul_epu32 (data_key, data_key_lo);
987  if (accWidth == XXH3_acc_128bits) {
988  /* xacc[i] += swap(data_vec); */
989  __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
990  __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
991  /* xacc[i] += product; */
992  xacc[i] = _mm_add_epi64(product, sum);
993  } else { /* XXH3_acc_64bits */
994  /* xacc[i] += data_vec; */
995  __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
996  /* xacc[i] += product; */
997  xacc[i] = _mm_add_epi64(product, sum);
998  }
999  } }
1000 
1001 #elif (XXH_VECTOR == XXH_NEON)
1002 
1003  XXH_ASSERT((((size_t)acc) & 15) == 0);
1004  {
1005  XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
1006  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
1007  uint8_t const* const xinput = (const uint8_t *) input;
1008  uint8_t const* const xsecret = (const uint8_t *) secret;
1009 
1010  size_t i;
1011  for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
1012  /* data_vec = xinput[i]; */
1013  uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
1014  /* key_vec = xsecret[i]; */
1015  uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
1016  /* data_key = data_vec ^ key_vec; */
1017  uint64x2_t data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
1018  uint32x2_t data_key_lo, data_key_hi;
1019  if (accWidth == XXH3_acc_64bits) {
1020  /* xacc[i] += data_vec; */
1021  xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
1022  } else { /* XXH3_acc_128bits */
1023  /* xacc[i] += swap(data_vec); */
1024  uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
1025  uint64x2_t const swapped = vextq_u64(data64, data64, 1);
1026  xacc[i] = vaddq_u64 (xacc[i], swapped);
1027  }
1028  /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
1029  * data_key_hi = (uint32x2_t) (data_key >> 32);
1030  * data_key = UNDEFINED; */
1031  XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
1032  /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
1033  xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
1034 
1035  }
1036  }
1037 
1038 #elif (XXH_VECTOR == XXH_VSX)
1039  xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
1040  xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
1041  xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
1042  xxh_u64x2 const v32 = { 32, 32 };
1043  size_t i;
1044  for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
1045  /* data_vec = xinput[i]; */
1046  xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
1047  /* key_vec = xsecret[i]; */
1048  xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
1049  xxh_u64x2 const data_key = data_vec ^ key_vec;
1050  /* shuffled = (data_key << 32) | (data_key >> 32); */
1051  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
1052  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
1053  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
1054  xacc[i] += product;
1055 
1056  if (accWidth == XXH3_acc_64bits) {
1057  xacc[i] += data_vec;
1058  } else { /* XXH3_acc_128bits */
1059  /* swap high and low halves */
1060 #ifdef __s390x__
1061  xxh_u64x2 const data_swapped = vec_permi(data_vec, data_vec, 2);
1062 #else
1063  xxh_u64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
1064 #endif
1065  xacc[i] += data_swapped;
1066  }
1067  }
1068 
1069 #else /* scalar variant of Accumulator - universal */
1070 
1071  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
1072  const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
1073  const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
1074  size_t i;
1075  XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
1076  for (i=0; i < ACC_NB; i++) {
1077  xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
1078  xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
1079 
1080  if (accWidth == XXH3_acc_64bits) {
1081  xacc[i] += data_val;
1082  } else {
1083  xacc[i ^ 1] += data_val; /* swap adjacent lanes */
1084  }
1085  xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
1086  }
1087 #endif
1088 }
1089 
1090 /*
1091  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
1092  *
1093  * Multiplication isn't perfect, as explained by Google in HighwayHash:
1094  *
1095  * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
1096  * // varying degrees. In descending order of goodness, bytes
1097  * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
1098  * // As expected, the upper and lower bytes are much worse.
1099  *
1100  * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
1101  *
1102  * Since our algorithm uses a pseudorandom secret to add some variance into the
1103  * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
1104  *
1105  * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
1106  * extraction.
1107  *
1108  * Both XXH3_64bits and XXH3_128bits use this subroutine.
1109  */
1110 XXH_FORCE_INLINE void
1111 XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
1112 {
1113 #if (XXH_VECTOR == XXH_AVX2)
1114 
1115  XXH_ASSERT((((size_t)acc) & 31) == 0);
1116  { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
1117  /* Unaligned. This is mainly for pointer arithmetic, and because
1118  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
1119  const __m256i* const xsecret = (const __m256i *) secret;
1120  const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
1121 
1122  size_t i;
1123  for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
1124  /* xacc[i] ^= (xacc[i] >> 47) */
1125  __m256i const acc_vec = xacc[i];
1126  __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47);
1127  __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted);
1128  /* xacc[i] ^= xsecret; */
1129  __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
1130  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
1131 
1132  /* xacc[i] *= PRIME32_1; */
1133  __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
1134  __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
1135  __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
1136  xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
1137  }
1138  }
1139 
1140 #elif (XXH_VECTOR == XXH_SSE2)
1141 
1142  XXH_ASSERT((((size_t)acc) & 15) == 0);
1143  { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
1144  /* Unaligned. This is mainly for pointer arithmetic, and because
1145  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
1146  const __m128i* const xsecret = (const __m128i *) secret;
1147  const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
1148 
1149  size_t i;
1150  for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
1151  /* xacc[i] ^= (xacc[i] >> 47) */
1152  __m128i const acc_vec = xacc[i];
1153  __m128i const shifted = _mm_srli_epi64 (acc_vec, 47);
1154  __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted);
1155  /* xacc[i] ^= xsecret[i]; */
1156  __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
1157  __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);
1158 
1159  /* xacc[i] *= PRIME32_1; */
1160  __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
1161  __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32);
1162  __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32);
1163  xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
1164  }
1165  }
1166 
1167 #elif (XXH_VECTOR == XXH_NEON)
1168 
1169  XXH_ASSERT((((size_t)acc) & 15) == 0);
1170 
1171  { uint64x2_t* xacc = (uint64x2_t*) acc;
1172  uint8_t const* xsecret = (uint8_t const*) secret;
1173  uint32x2_t prime = vdup_n_u32 (PRIME32_1);
1174 
1175  size_t i;
1176  for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
1177  /* xacc[i] ^= (xacc[i] >> 47); */
1178  uint64x2_t acc_vec = xacc[i];
1179  uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
1180  uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
1181 
1182  /* xacc[i] ^= xsecret[i]; */
1183  uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
1184  uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
1185 
1186  /* xacc[i] *= PRIME32_1 */
1187  uint32x2_t data_key_lo, data_key_hi;
1188  /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
1189  * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
1190  * xacc[i] = UNDEFINED; */
1191  XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
1192  { /*
1193  * prod_hi = (data_key >> 32) * PRIME32_1;
1194  *
1195  * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
1196  * incorrectly "optimize" this:
1197  * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
1198  * shifted = vshll_n_u32(tmp, 32);
1199  * to this:
1200  * tmp = "vmulq_u64"(a, b); // no such thing!
1201  * shifted = vshlq_n_u64(tmp, 32);
1202  *
1203  * However, unlike SSE, Clang lacks a 64-bit multiply routine
1204  * for NEON, and it scalarizes two 64-bit multiplies instead.
1205  *
1206  * vmull_u32 has the same timing as vmul_u32, and it avoids
1207  * this bug completely.
1208  * See https://bugs.llvm.org/show_bug.cgi?id=39967
1209  */
1210  uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
1211  /* xacc[i] = prod_hi << 32; */
1212  xacc[i] = vshlq_n_u64(prod_hi, 32);
1213  /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */
1214  xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
1215  }
1216  } }
1217 
1218 #elif (XXH_VECTOR == XXH_VSX)
1219 
1220  XXH_ASSERT((((size_t)acc) & 15) == 0);
1221 
1222  { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
1223  const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
1224  /* constants */
1225  xxh_u64x2 const v32 = { 32, 32 };
1226  xxh_u64x2 const v47 = { 47, 47 };
1227  xxh_u32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
1228  size_t i;
1229  for (i = 0; i < STRIPE_LEN / sizeof(xxh_u64x2); i++) {
1230  /* xacc[i] ^= (xacc[i] >> 47); */
1231  xxh_u64x2 const acc_vec = xacc[i];
1232  xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
1233 
1234  /* xacc[i] ^= xsecret[i]; */
1235  xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
1236  xxh_u64x2 const data_key = data_vec ^ key_vec;
1237 
1238  /* xacc[i] *= PRIME32_1 */
1239  /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */
1240  xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime);
1241  /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */
1242  xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime);
1243  xacc[i] = prod_odd + (prod_even << v32);
1244  } }
1245 
1246 #else /* scalar variant of Scrambler - universal */
1247 
1248  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
1249  const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
1250  size_t i;
1251  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
1252  for (i=0; i < ACC_NB; i++) {
1253  xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
1254  xxh_u64 acc64 = xacc[i];
1255  acc64 = XXH_xorshift64(acc64, 47);
1256  acc64 ^= key64;
1257  acc64 *= PRIME32_1;
1258  xacc[i] = acc64;
1259  }
1260 
1261 #endif
1262 }
1263 
1264 #define XXH_PREFETCH_DIST 384
1265 
1266 /*
1267  * XXH3_accumulate()
1268  * Loops over XXH3_accumulate_512().
1269  * Assumption: nbStripes will not overflow the secret size
1270  */
1271 XXH_FORCE_INLINE void
1272 XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
1273  const xxh_u8* XXH_RESTRICT input,
1274  const xxh_u8* XXH_RESTRICT secret,
1275  size_t nbStripes,
1276  XXH3_accWidth_e accWidth)
1277 {
1278  size_t n;
1279  for (n = 0; n < nbStripes; n++ ) {
1280  const xxh_u8* const in = input + n*STRIPE_LEN;
1281  XXH_PREFETCH(in + XXH_PREFETCH_DIST);
1282  XXH3_accumulate_512(acc,
1283  in,
1284  secret + n*XXH_SECRET_CONSUME_RATE,
1285  accWidth);
1286  }
1287 }
1288 
1289 XXH_FORCE_INLINE void
1290 XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
1291  const xxh_u8* XXH_RESTRICT input, size_t len,
1292  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1293  XXH3_accWidth_e accWidth)
1294 {
1295  size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
1296  size_t const block_len = STRIPE_LEN * nb_rounds;
1297  size_t const nb_blocks = len / block_len;
1298 
1299  size_t n;
1300 
1301  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
1302 
1303  for (n = 0; n < nb_blocks; n++) {
1304  XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
1305  XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);
1306  }
1307 
1308  /* last partial block */
1309  XXH_ASSERT(len > STRIPE_LEN);
1310  { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
1311  XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
1312  XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
1313 
1314  /* last stripe */
1315  if (len & (STRIPE_LEN - 1)) {
1316  const xxh_u8* const p = input + len - STRIPE_LEN;
1317  /* Do not align on 8, so that the secret is different from the scrambler */
1318 #define XXH_SECRET_LASTACC_START 7
1319  XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
1320  } }
1321 }
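/*
 * A worked example of the block geometry above, with the default secret
 * (secretSize = XXH_SECRET_DEFAULT_SIZE = 192):
 *   nb_rounds = (192 - 64) / 8 = 16 stripes per block
 *   block_len = 64 * 16       = 1024 bytes per block
 * So a 10000-byte input runs 9 full blocks (9216 bytes), scrambling after each,
 * then 12 more full stripes, and finally one last (overlapping) stripe covering
 * the final 64 input bytes, read against a secret offset shifted by
 * XXH_SECRET_LASTACC_START.
 */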
1322 
1323 XXH_FORCE_INLINE xxh_u64
1324 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
1325 {
1326  return XXH3_mul128_fold64(
1327  acc[0] ^ XXH_readLE64(secret),
1328  acc[1] ^ XXH_readLE64(secret+8) );
1329 }
1330 
1331 static XXH64_hash_t
1332 XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
1333 {
1334  xxh_u64 result64 = start;
1335 
1336  result64 += XXH3_mix2Accs(acc+0, secret + 0);
1337  result64 += XXH3_mix2Accs(acc+2, secret + 16);
1338  result64 += XXH3_mix2Accs(acc+4, secret + 32);
1339  result64 += XXH3_mix2Accs(acc+6, secret + 48);
1340 
1341  return XXH3_avalanche(result64);
1342 }
1343 
1344 #define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
1345  PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 };
1346 
1347 XXH_FORCE_INLINE XXH64_hash_t
1348 XXH3_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
1349  const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
1350 {
1351  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
1352 
1353  XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits);
1354 
1355  /* converge into final hash */
1356  XXH_STATIC_ASSERT(sizeof(acc) == 64);
1357  /* do not align on 8, so that the secret is different from the accumulator */
1358 #define XXH_SECRET_MERGEACCS_START 11
1359  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
1360  return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
1361 }
1362 
1363 /*
1364  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1365  * why (uop cache maybe?), but the difference is large and easily measurable.
1366  */
1367 XXH_NO_INLINE XXH64_hash_t
1368 XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
1369 {
1370  return XXH3_hashLong_internal(input, len, kSecret, sizeof(kSecret));
1371 }
1372 
1373 /*
1374  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1375  * why (uop cache maybe?), but the difference is large and easily measurable.
1376  */
1377 XXH_NO_INLINE XXH64_hash_t
1378 XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
1379  const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
1380 {
1381  return XXH3_hashLong_internal(input, len, secret, secretSize);
1382 }
1383 
1384 
1385 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
1386 {
1387  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
1388  memcpy(dst, &v64, sizeof(v64));
1389 }
1390 
1391 /* XXH3_initCustomSecret() :
1392  * destination `customSecret` is presumed allocated and same size as `kSecret`.
1393  */
1394 XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
1395 {
1396  int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
1397  int i;
1398 
1399  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
1400 
1401  for (i=0; i < nbRounds; i++) {
1402  XXH_writeLE64(customSecret + 16*i, XXH_readLE64(kSecret + 16*i) + seed64);
1403  XXH_writeLE64(customSecret + 16*i + 8, XXH_readLE64(kSecret + 16*i + 8) - seed64);
1404  }
1405 }
1406 
1407 
1408 /*
1409  * XXH3_hashLong_64b_withSeed():
1410  * Generate a custom key based on alteration of default kSecret with the seed,
1411  * and then use this key for long mode hashing.
1412  *
1413  * This operation is decently fast but nonetheless costs a little bit of time.
1414  * Try to avoid it whenever possible (typically when seed==0).
1415  *
1416  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1417  * why (uop cache maybe?), but the difference is large and easily measurable.
1418  */
1419 XXH_NO_INLINE XXH64_hash_t
1420 XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
1421 {
1422  XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
1423  if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len);
1424  XXH3_initCustomSecret(secret, seed);
1425  return XXH3_hashLong_internal(input, len, secret, sizeof(secret));
1426 }
1427 
1428 /* === Public entry point === */
1429 
1430 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
1431 {
1432  if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
1433  if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1434  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1435  return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len);
1436 }
1437 
1438 XXH_PUBLIC_API XXH64_hash_t
1439 XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1440 {
1441  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
1442  /*
1443  * If an action is to be taken if `secret` conditions are not respected,
1444  * it should be done here.
1445  * For now, it's a contract pre-condition.
1446  * Adding a check and a branch here would cost performance at every hash.
1447  */
1448  if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1449  if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1450  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1451  return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1452 }
1453 
1454 XXH_PUBLIC_API XXH64_hash_t
1455 XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1456 {
1457  if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
1458  if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1459  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1460  return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
1461 }
1462 
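/*
 * Usage sketch (not part of xxh3.h itself): one-shot hashing with the public
 * 64-bit entry points above. `data`, `size` and the seed value 42 are
 * placeholders for illustration.
 */
static XXH64_hash_t XXH3_example_oneshot_64b(const void* data, size_t size)
{
    XXH64_hash_t const plain  = XXH3_64bits(data, size);               /* seed==0 fast path */
    XXH64_hash_t const seeded = XXH3_64bits_withSeed(data, size, 42);  /* seeded variant */
    return plain ^ seeded;   /* combined only so both results are used */
}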
1463 /* === XXH3 streaming === */
1464 
1465 XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
1466 {
1467  return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t));
1468 }
1469 
1470 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
1471 {
1472  XXH_free(statePtr);
1473  return XXH_OK;
1474 }
1475 
1476 XXH_PUBLIC_API void
1477 XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
1478 {
1479  memcpy(dst_state, src_state, sizeof(*dst_state));
1480 }
1481 
1482 static void
1483 XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
1484  XXH64_hash_t seed,
1485  const xxh_u8* secret, size_t secretSize)
1486 {
1487  XXH_ASSERT(statePtr != NULL);
1488  memset(statePtr, 0, sizeof(*statePtr));
1489  statePtr->acc[0] = PRIME32_3;
1490  statePtr->acc[1] = PRIME64_1;
1491  statePtr->acc[2] = PRIME64_2;
1492  statePtr->acc[3] = PRIME64_3;
1493  statePtr->acc[4] = PRIME64_4;
1494  statePtr->acc[5] = PRIME32_2;
1495  statePtr->acc[6] = PRIME64_5;
1496  statePtr->acc[7] = PRIME32_1;
1497  statePtr->seed = seed;
1498  XXH_ASSERT(secret != NULL);
1499  statePtr->secret = secret;
1500  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
1501  statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);
1502  statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
1503 }
1504 
1505 XXH_PUBLIC_API XXH_errorcode
1506 XXH3_64bits_reset(XXH3_state_t* statePtr)
1507 {
1508  if (statePtr == NULL) return XXH_ERROR;
1509  XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
1510  return XXH_OK;
1511 }
1512 
1513 XXH_PUBLIC_API XXH_errorcode
1514 XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
1515 {
1516  if (statePtr == NULL) return XXH_ERROR;
1517  XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
1518  if (secret == NULL) return XXH_ERROR;
1519  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
1520  return XXH_OK;
1521 }
1522 
1523 XXH_PUBLIC_API XXH_errorcode
1524 XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
1525 {
1526  if (statePtr == NULL) return XXH_ERROR;
1527  XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
1528  XXH3_initCustomSecret(statePtr->customSecret, seed);
1529  statePtr->secret = statePtr->customSecret;
1530  return XXH_OK;
1531 }
1532 
1533 XXH_FORCE_INLINE void
1534 XXH3_consumeStripes( xxh_u64* acc,
1535  XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
1536  const xxh_u8* input, size_t totalStripes,
1537  const xxh_u8* secret, size_t secretLimit,
1538  XXH3_accWidth_e accWidth)
1539 {
1540  XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
1541  if (nbStripesPerBlock - *nbStripesSoFarPtr <= totalStripes) {
1542  /* need a scrambling operation */
1543  size_t const nbStripes = nbStripesPerBlock - *nbStripesSoFarPtr;
1544  XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, accWidth);
1545  XXH3_scrambleAcc(acc, secret + secretLimit);
1546  XXH3_accumulate(acc, input + nbStripes * STRIPE_LEN, secret, totalStripes - nbStripes, accWidth);
1547  *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - nbStripes);
1548  } else {
1549  XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, totalStripes, accWidth);
1550  *nbStripesSoFarPtr += (XXH32_hash_t)totalStripes;
1551  }
1552 }
1553 
1554 /*
1555  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
1556  */
1557 XXH_FORCE_INLINE XXH_errorcode
1558 XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth)
1559 {
1560  if (input==NULL)
1561 #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1562  return XXH_OK;
1563 #else
1564  return XXH_ERROR;
1565 #endif
1566 
1567  { const xxh_u8* const bEnd = input + len;
1568 
1569  state->totalLen += len;
1570 
1571  if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
1572  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
1573  state->bufferedSize += (XXH32_hash_t)len;
1574  return XXH_OK;
1575  }
1576  /* input is now > XXH3_INTERNALBUFFER_SIZE */
1577 
1578  #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
1579  XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0); /* clean multiple */
1580 
1581  /*
1582  * There is some input left inside the internal buffer.
1583  * Fill it, then consume it.
1584  */
1585  if (state->bufferedSize) {
1586  size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
1587  XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
1588  input += loadSize;
1589  XXH3_consumeStripes(state->acc,
1590  &state->nbStripesSoFar, state->nbStripesPerBlock,
1591  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
1592  state->secret, state->secretLimit,
1593  accWidth);
1594  state->bufferedSize = 0;
1595  }
1596 
1597  /* Consume input by full buffer quantities */
1598  if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
1599  const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
1600  do {
1601  XXH3_consumeStripes(state->acc,
1602  &state->nbStripesSoFar, state->nbStripesPerBlock,
1603  input, XXH3_INTERNALBUFFER_STRIPES,
1604  state->secret, state->secretLimit,
1605  accWidth);
1606  input += XXH3_INTERNALBUFFER_SIZE;
1607  } while (input<=limit);
1608  }
1609 
1610  if (input < bEnd) { /* Some remaining input: buffer it */
1611  XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
1612  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
1613  }
1614  }
1615 
1616  return XXH_OK;
1617 }
1618 
1619 XXH_PUBLIC_API XXH_errorcode
1620 XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
1621 {
1622  return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits);
1623 }
1624 
1625 
1626 XXH_FORCE_INLINE void
1627 XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
1628 {
1629  /*
1630  * Digest on a local copy. This way, the state remains unaltered, and it can
1631  * continue ingesting more input afterwards.
1632  */
1633  memcpy(acc, state->acc, sizeof(state->acc));
1634  if (state->bufferedSize >= STRIPE_LEN) {
1635  size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
1636  XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;
1637  XXH3_consumeStripes(acc,
1638  &nbStripesSoFar, state->nbStripesPerBlock,
1639  state->buffer, totalNbStripes,
1640  state->secret, state->secretLimit,
1641  accWidth);
1642  if (state->bufferedSize % STRIPE_LEN) { /* one last partial stripe */
1643  XXH3_accumulate_512(acc,
1644  state->buffer + state->bufferedSize - STRIPE_LEN,
1645  state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
1646  accWidth);
1647  }
1648  } else { /* bufferedSize < STRIPE_LEN */
1649  if (state->bufferedSize) { /* one last stripe */
1650  xxh_u8 lastStripe[STRIPE_LEN];
1651  size_t const catchupSize = STRIPE_LEN - state->bufferedSize;
1652  memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
1653  memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
1654  XXH3_accumulate_512(acc,
1655  lastStripe,
1656  state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
1657  accWidth);
1658  } }
1659 }
1660 
1661 XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
1662 {
1663  if (state->totalLen > XXH3_MIDSIZE_MAX) {
1664  XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
1665  XXH3_digest_long(acc, state, XXH3_acc_64bits);
1666  return XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
1667  }
1668  /* len <= XXH3_MIDSIZE_MAX : short code */
1669  if (state->seed)
1670  return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
1671  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
1672 }
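/*
 * Usage sketch (not part of xxh3.h itself): the streaming API above, feeding
 * the input in two chunks. `chunk1`/`chunk2` and their lengths are
 * placeholders; error handling is kept minimal.
 */
static XXH64_hash_t XXH3_example_streaming_64b(const void* chunk1, size_t len1,
                                               const void* chunk2, size_t len2)
{
    XXH64_hash_t hash = 0;
    XXH3_state_t* const state = XXH3_createState();
    if (state == NULL) return 0;
    if ( XXH3_64bits_reset(state) == XXH_OK
      && XXH3_64bits_update(state, chunk1, len1) == XXH_OK
      && XXH3_64bits_update(state, chunk2, len2) == XXH_OK ) {
        hash = XXH3_64bits_digest(state);   /* the state stays valid for further updates */
    }
    XXH3_freeState(state);
    return hash;
}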
1673 
1674 /* ==========================================
1675  * XXH3 128 bits (=> XXH128)
1676  * ========================================== */
1677 
1678 XXH_FORCE_INLINE XXH128_hash_t
1679 XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1680 {
1681  XXH_ASSERT(input != NULL);
1682  XXH_ASSERT(1 <= len && len <= 3);
1683  XXH_ASSERT(secret != NULL);
1684  /*
1685  * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
1686  * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
1687  * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
1688  */
1689  { xxh_u8 const c1 = input[0];
1690  xxh_u8 const c2 = input[len >> 1];
1691  xxh_u8 const c3 = input[len - 1];
1692  xxh_u32 const combinedl = ((xxh_u32)c1<<16) | (((xxh_u32)c2) << 24) | (((xxh_u32)c3) << 0) | (((xxh_u32)len) << 8);
1693  xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
1694  xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
1695  xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
1696  xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
1697  xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
1698  xxh_u64 const mixedl = keyed_lo * PRIME64_1;
1699  xxh_u64 const mixedh = keyed_hi * PRIME64_5;
1700  XXH128_hash_t const h128 = { XXH3_avalanche(mixedl) /*low64*/, XXH3_avalanche(mixedh) /*high64*/ };
1701  return h128;
1702  }
1703 }
1704 
1705 XXH_FORCE_INLINE XXH128_hash_t
1706 XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1707 {
1708  XXH_ASSERT(input != NULL);
1709  XXH_ASSERT(secret != NULL);
1710  XXH_ASSERT(4 <= len && len <= 8);
1711  seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
1712  { xxh_u32 const input_lo = XXH_readLE32(input);
1713  xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
1714  xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
1715  xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
1716  xxh_u64 const keyed = input_64 ^ bitflip;
1717 
1718  /* Shift len to the left to ensure it is even; this avoids even multiplies. */
1719  XXH128_hash_t m128 = XXH_mult64to128(keyed, PRIME64_1 + (len << 2));
1720 
1721  m128.high64 += (m128.low64 << 1);
1722  m128.low64 ^= (m128.high64 >> 3);
1723 
1724  m128.low64 = XXH_xorshift64(m128.low64, 35);
1725  m128.low64 *= 0x9FB21C651E98DF25ULL;
1726  m128.low64 = XXH_xorshift64(m128.low64, 28);
1727  m128.high64 = XXH3_avalanche(m128.high64);
1728  return m128;
1729  }
1730 }
1731 
1732 XXH_FORCE_INLINE XXH128_hash_t
1733 XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1734 {
1735  XXH_ASSERT(input != NULL);
1736  XXH_ASSERT(secret != NULL);
1737  XXH_ASSERT(9 <= len && len <= 16);
1738  { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
1739  xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
1740  xxh_u64 const input_lo = XXH_readLE64(input);
1741  xxh_u64 input_hi = XXH_readLE64(input + len - 8);
1742  XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, PRIME64_1);
1743  /*
1744  * Put len in the middle of m128 to ensure that the length gets mixed to
1745  * both the low and high bits in the 128x64 multiply below.
1746  */
1747  m128.low64 += (xxh_u64)(len - 1) << 54;
1748  input_hi ^= bitfliph;
1749  /*
1750  * Add the high 32 bits of input_hi to the high 32 bits of m128, then
1751  * add the long product of the low 32 bits of input_hi and PRIME32_2 to
1752  * the high 64 bits of m128.
1753  *
1754  * The best approach to this operation is different on 32-bit and 64-bit.
1755  */
1756  if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
1757  /*
1758  * 32-bit optimized version, which is more readable.
1759  *
1760  * On 32-bit, it removes an ADC and delays a dependency between the two
1761  * halves of m128.high64, but it generates an extra mask on 64-bit.
1762  */
1763  m128.high64 += (input_hi & 0xFFFFFFFF00000000) + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2);
1764  } else {
1765  /*
1766  * 64-bit optimized (albeit more confusing) version.
1767  *
1768  * Uses some properties of addition and multiplication to remove the mask:
1769  *
1770  * Let:
1771  * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
1772  * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
1773  * c = PRIME32_2
1774  *
1775  * a + (b * c)
1776  * Inverse Property: x + y - x == y
1777  * a + (b * (1 + c - 1))
1778  * Distributive Property: x * (y + z) == (x * y) + (x * z)
1779  * a + (b * 1) + (b * (c - 1))
1780  * Identity Property: x * 1 == x
1781  * a + b + (b * (c - 1))
1782  *
1783  * Substitute a, b, and c:
1784  * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
1785  *
1786  * Since input_hi.hi + input_hi.lo == input_hi, we get this:
1787  * input_hi + ((xxh_u64)input_hi.lo * (PRIME32_2 - 1))
1788  */
1789  m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, PRIME32_2 - 1);
1790  }
1791  /* m128 ^= XXH_swap64(m128 >> 64); */
1792  m128.low64 ^= XXH_swap64(m128.high64);
1793 
1794  { /* 128x64 multiply: h128 = m128 * PRIME64_2; */
1795  XXH128_hash_t h128 = XXH_mult64to128(m128.low64, PRIME64_2);
1796  h128.high64 += m128.high64 * PRIME64_2;
1797 
1798  h128.low64 = XXH3_avalanche(h128.low64);
1799  h128.high64 = XXH3_avalanche(h128.high64);
1800  return h128;
1801  } }
1802 }
1803 
1804 /* Assumption : `secret` size is >= 16
1805  * Note : it should be >= XXH3_SECRET_SIZE_MIN anyway */
1806 XXH_FORCE_INLINE XXH128_hash_t
1807 XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
1808 {
1809  XXH_ASSERT(len <= 16);
1810  { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
1811  if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
1812  if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
1813  { XXH128_hash_t h128;
1814  xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
1815  xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
1816  h128.low64 = XXH3_avalanche((PRIME64_1 + seed) ^ bitflipl);
1817  h128.high64 = XXH3_avalanche((PRIME64_2 - seed) ^ bitfliph);
1818  return h128;
1819  } }
1820 }
1821 
1822 /*
1823  * A bit slower than XXH3_mix16B, but handles multiply by zero better.
1824  */
1825 XXH_FORCE_INLINE XXH128_hash_t
1826 XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
1827 {
1828  acc.low64 += XXH3_mix16B (input_1, secret+0, seed);
1829  acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
1830  acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
1831  acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
1832  return acc;
1833 }
1834 
1835 
1836 XXH_FORCE_INLINE XXH128_hash_t
1837 XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1838  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1839  XXH64_hash_t seed)
1840 {
1841  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
1842  XXH_ASSERT(16 < len && len <= 128);
1843 
1844  { XXH128_hash_t acc;
1845  acc.low64 = len * PRIME64_1;
1846  acc.high64 = 0;
1847  if (len > 32) {
1848  if (len > 64) {
1849  if (len > 96) {
1850  acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
1851  }
1852  acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
1853  }
1854  acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
1855  }
1856  acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
1857  { xxh_u64 const low64 = acc.low64 + acc.high64;
1858  xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1859  XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
1860  return h128;
1861  }
1862  }
1863 }
1864 
1865 XXH_NO_INLINE XXH128_hash_t
1866 XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
1867  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
1868  XXH64_hash_t seed)
1869 {
1870  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
1871  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
1872 
1873  { XXH128_hash_t acc;
1874  int const nbRounds = (int)len / 32;
1875  int i;
1876  acc.low64 = len * PRIME64_1;
1877  acc.high64 = 0;
1878  for (i=0; i<4; i++) {
1879  acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
1880  }
1881  acc.low64 = XXH3_avalanche(acc.low64);
1882  acc.high64 = XXH3_avalanche(acc.high64);
1883  XXH_ASSERT(nbRounds >= 4);
1884  for (i=4 ; i < nbRounds; i++) {
1885  acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
1886  }
1887  /* last bytes */
1888  acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
1889 
1890  { xxh_u64 const low64 = acc.low64 + acc.high64;
1891  xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
1892  XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };
1893  return h128;
1894  }
1895  }
1896 }
1897 
1898 XXH_FORCE_INLINE XXH128_hash_t
1899 XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
1900  const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
1901 {
1902  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
1903 
1904  XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits);
1905 
1906  /* converge into final hash */
1907  XXH_STATIC_ASSERT(sizeof(acc) == 64);
1908  XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
1909  { xxh_u64 const low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
1910  xxh_u64 const high64 = XXH3_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));
1911  XXH128_hash_t const h128 = { low64, high64 };
1912  return h128;
1913  }
1914 }
1915 
1916 /*
1917  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1918  * why (uop cache maybe?), but the difference is large and easily measurable.
1919  */
1920 XXH_NO_INLINE XXH128_hash_t
1921 XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
1922 {
1923  return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
1924 }
1925 
1926 /*
1927  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1928  * why (uop cache maybe?), but the difference is large and easily measurable.
1929  */
1930 XXH_NO_INLINE XXH128_hash_t
1931 XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
1932  const xxh_u8* secret, size_t secretSize)
1933 {
1934  return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
1935 }
1936 
1937 /*
1938  * It's important for performance that XXH3_hashLong is not inlined. Not sure
1939  * why (uop cache maybe?), but the difference is large and easily measurable.
1940  */
1941 XXH_NO_INLINE XXH128_hash_t
1942 XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
1943 {
1944  XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
1945  if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);
1946  XXH3_initCustomSecret(secret, seed);
1947  return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
1948 }
1949 
1950 
1951 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
1952 {
1953  if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);
1954  if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1955  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
1956  return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);
1957 }
1958 
1959 XXH_PUBLIC_API XXH128_hash_t
1960 XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
1961 {
1962  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
1963  /*
1964  * If an action is to be taken if `secret` conditions are not respected,
1965  * it should be done here.
1966  * For now, it's a contract pre-condition.
1967  * Adding a check and a branch here would cost performance at every hash.
1968  */
1969  if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
1970  if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1971  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
1972  return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
1973 }
1974 
1975 XXH_PUBLIC_API XXH128_hash_t
1976 XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
1977 {
1978  if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);
1979  if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1980  if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
1981  return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);
1982 }
1983 
1984 XXH_PUBLIC_API XXH128_hash_t
1985 XXH128(const void* input, size_t len, XXH64_hash_t seed)
1986 {
1987  return XXH3_128bits_withSeed(input, len, seed);
1988 }
1989 
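/*
 * Usage sketch (not part of xxh3.h itself): 128-bit one-shot hashing.
 * XXH128() is just the convenience wrapper over XXH3_128bits_withSeed();
 * `a`, `b`, their lengths and the seed are placeholders.
 */
static int XXH3_example_same_128b(const void* a, size_t lenA,
                                  const void* b, size_t lenB,
                                  XXH64_hash_t seed)
{
    XXH128_hash_t const ha = XXH128(a, lenA, seed);
    XXH128_hash_t const hb = XXH3_128bits_withSeed(b, lenB, seed);
    return XXH128_isEqual(ha, hb);   /* 1 when both inputs hash identically */
}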
1990 
1991 /* === XXH3 128-bit streaming === */
1992 
1993 /* All the functions are actually the same as for the 64-bit streaming variant;
1994  just the reset one is different (different initial acc values for 0,5,6,7),
1995  and near the end of the digest function. */
1996 
1997 static void
1998 XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
1999  XXH64_hash_t seed,
2000  const xxh_u8* secret, size_t secretSize)
2001 {
2002  XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
2003 }
2004 
2005 XXH_PUBLIC_API XXH_errorcode
2006 XXH3_128bits_reset(XXH3_state_t* statePtr)
2007 {
2008  if (statePtr == NULL) return XXH_ERROR;
2009  XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
2010  return XXH_OK;
2011 }
2012 
2013 XXH_PUBLIC_API XXH_errorcode
2014 XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
2015 {
2016  if (statePtr == NULL) return XXH_ERROR;
2017  XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
2018  if (secret == NULL) return XXH_ERROR;
2019  if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
2020  return XXH_OK;
2021 }
2022 
2023 XXH_PUBLIC_API XXH_errorcode
2024 XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
2025 {
2026  if (statePtr == NULL) return XXH_ERROR;
2027  XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);
2028  XXH3_initCustomSecret(statePtr->customSecret, seed);
2029  statePtr->secret = statePtr->customSecret;
2030  return XXH_OK;
2031 }
2032 
2033 XXH_PUBLIC_API XXH_errorcode
2034 XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
2035 {
2036  return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
2037 }
2038 
2039 XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
2040 {
2041  if (state->totalLen > XXH3_MIDSIZE_MAX) {
2042  XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
2043  XXH3_digest_long(acc, state, XXH3_acc_128bits);
2044  XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
2045  { xxh_u64 const low64 = XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
2046  xxh_u64 const high64 = XXH3_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2));
2047  XXH128_hash_t const h128 = { low64, high64 };
2048  return h128;
2049  }
2050  }
2051  /* len <= XXH3_MIDSIZE_MAX : short code */
2052  if (state->seed)
2053  return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
2054  return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
2055 }
2056 
2057 /* 128-bit utility functions */
2058 
2059 #include <string.h> /* memcmp, memcpy */
2060 
2061 /* return : 1 if equal, 0 if different */
2062 XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
2063 {
2064  /* note : XXH128_hash_t is compact, it has no padding byte */
2065  return !(memcmp(&h1, &h2, sizeof(h1)));
2066 }
2067 
2068 /* This prototype is compatible with stdlib's qsort().
2069  * return : >0 if *h128_1 > *h128_2
2070  * <0 if *h128_1 < *h128_2
2071  * =0 if *h128_1 == *h128_2 */
2072 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
2073 {
2074  XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
2075  XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
2076  int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
2077  /* note : bets that, in most cases, hash values are different */
2078  if (hcmp) return hcmp;
2079  return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
2080 }
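/*
 * Usage sketch (not part of xxh3.h itself): since XXH128_cmp() follows the
 * qsort() comparator convention, an array of hashes can be sorted directly.
 * `hashes` and `count` are placeholders.
 */
#include <stdlib.h>   /* qsort */

static void XXH3_example_sort_hashes(XXH128_hash_t* hashes, size_t count)
{
    qsort(hashes, count, sizeof(hashes[0]), XXH128_cmp);
}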
2081 
2082 
2083 /*====== Canonical representation ======*/
2084 XXH_PUBLIC_API void
2085 XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
2086 {
2087  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
2088  if (XXH_CPU_LITTLE_ENDIAN) {
2089  hash.high64 = XXH_swap64(hash.high64);
2090  hash.low64 = XXH_swap64(hash.low64);
2091  }
2092  memcpy(dst, &hash.high64, sizeof(hash.high64));
2093  memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
2094 }
2095 
2096 XXH_PUBLIC_API XXH128_hash_t
2097 XXH128_hashFromCanonical(const XXH128_canonical_t* src)
2098 {
2099  XXH128_hash_t h;
2100  h.high64 = XXH_readBE64(src);
2101  h.low64 = XXH_readBE64(src->digest + 8);
2102  return h;
2103 }
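/*
 * Usage sketch (not part of xxh3.h itself): the canonical form is a fixed
 * big-endian byte layout, suitable for storage or transmission; reading it
 * back returns the same hash value on any platform.
 */
static XXH128_hash_t XXH3_example_canonical_roundtrip(XXH128_hash_t hash)
{
    XXH128_canonical_t canonical;
    XXH128_canonicalFromHash(&canonical, hash);    /* serialize to 16 big-endian bytes */
    return XXH128_hashFromCanonical(&canonical);   /* deserialize: equals `hash` */
}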
2104 
2105 /* Pop our optimization override from above */
2106 #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
2107  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2108  && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
2109 # pragma GCC pop_options
2110 #endif
2111 
2112 #endif /* XXH3_H_1397135465 */