Branch data Line data Source code
1 : : /* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>
2 : :
3 : : Permission is hereby granted, free of charge, to any person
4 : : obtaining a copy of this software and associated documentation
5 : : files (the "Software"), to deal in the Software without
6 : : restriction, including without limitation the rights to use, copy,
7 : : modify, merge, publish, distribute, sublicense, and/or sell copies
8 : : of the Software, and to permit persons to whom the Software is
9 : : furnished to do so, subject to the following conditions:
10 : :
11 : : The above copyright notice and this permission notice shall be
12 : : included in all copies or substantial portions of the Software.
13 : :
14 : : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 : : EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 : : MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 : : NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 : : BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 : : ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 : : CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 : : SOFTWARE.
22 : : */
23 : :
24 : : /* This file implements MUM (MUltiply and Mix) hashing. We randomize
25 : : input data by 64x64-bit multiplication and mixing hi- and low-parts
26 : : of the multiplication result by using an addition and then mix it
27 : : into the current state. We use prime numbers randomly generated
28 : : with the equal probability of their bit values for the
29 : : multiplication. When all primes are used once, the state is
30 : : randomized and the same prime numbers are used again for data
31 : : randomization.
32 : :
33 : : The MUM hashing passes all SMHasher tests. Pseudo Random Number
34 : : Generator based on MUM also passes NIST Statistical Test Suite for
35 : : Random and Pseudorandom Number Generators for Cryptographic
36 : : Applications (version 2.2.1) with 1000 bitstreams each containing
37 : : 1M bits. MUM hashing is also faster Spooky64 and City64 on small
38 : : strings (at least up to 512-bit) on Haswell and Power7. The MUM bulk
39 : : speed (speed on very long data) is bigger than Spooky and City on
40 : : Power7. On Haswell the bulk speed is bigger than Spooky one and
41 : : close to City speed. */
42 : :
43 : : #ifndef __MUM_HASH__
44 : : #define __MUM_HASH__
45 : :
46 : : #include <stddef.h>
47 : : #include <stdlib.h>
48 : : #include <string.h>
49 : : #include <limits.h>
50 : :
51 : : #ifdef _MSC_VER
52 : : typedef unsigned __int16 uint16_t;
53 : : typedef unsigned __int32 uint32_t;
54 : : typedef unsigned __int64 uint64_t;
55 : : #else
56 : : #include <stdint.h>
57 : : #endif
58 : :
59 : : /* Macro saying to use 128-bit integers implemented by GCC for some
60 : : targets. */
61 : : #ifndef _MUM_USE_INT128
62 : : /* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
63 : : HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
64 : : otherwise int. */
65 : : #if defined(__GNUC__) && UINT_MAX != ULONG_MAX
66 : : #define _MUM_USE_INT128 1
67 : : #else
68 : : #define _MUM_USE_INT128 0
69 : : #endif
70 : : #endif
71 : :
72 : : #if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
73 : : #define _MUM_FRESH_GCC
74 : : #endif
75 : :
76 : : #if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
77 : : #define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
78 : : #define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
79 : : #define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
80 : : #else
81 : : #define _MUM_ATTRIBUTE_UNUSED
82 : : #define _MUM_OPTIMIZE(opts)
83 : : #define _MUM_TARGET(opts)
84 : : #endif
85 : :
86 : :
87 : : /* Here are different primes randomly generated with the equal
88 : : probability of their bit values. They are used to randomize input
89 : : values. */
90 : : static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
91 : : static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
92 : : static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
93 : : static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
94 : : static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
95 : : static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
96 : : static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
97 : :
98 : : static uint64_t _mum_primes [] = {
99 : : 0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
100 : : 0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
101 : : 0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
102 : : 0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
103 : : };
104 : :
105 : : /* Multiply 64-bit V and P and return sum of high and low parts of the
106 : : result. */
107 : : static inline uint64_t
108 : 11227221 : _mum (uint64_t v, uint64_t p) {
109 : : uint64_t hi, lo;
110 : : #if _MUM_USE_INT128
111 : : #if defined(__aarch64__)
112 : : /* AARCH64 needs 2 insns to calculate 128-bit result of the
113 : : multiplication. If we use a generic code we actually call a
114 : : function doing 128x128->128 bit multiplication. The function is
115 : : very slow. */
116 : : lo = v * p, hi;
117 : : asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
118 : : #else
119 : 11227221 : __uint128_t r = (__uint128_t) v * (__uint128_t) p;
120 : 11227221 : hi = (uint64_t) (r >> 64);
121 : 11227221 : lo = (uint64_t) r;
122 : : #endif
123 : : #else
124 : : /* Implementation of 64x64->128-bit multiplication by four 32x32->64
125 : : bit multiplication. */
126 : : uint64_t hv = v >> 32, hp = p >> 32;
127 : : uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
128 : : uint64_t rh = hv * hp;
129 : : uint64_t rm_0 = hv * lp;
130 : : uint64_t rm_1 = hp * lv;
131 : : uint64_t rl = lv * lp;
132 : : uint64_t t, carry = 0;
133 : :
134 : : /* We could ignore a carry bit here if we did not care about the
135 : : same hash for 32-bit and 64-bit targets. */
136 : : t = rl + (rm_0 << 32);
137 : : #ifdef MUM_TARGET_INDEPENDENT_HASH
138 : : carry = t < rl;
139 : : #endif
140 : : lo = t + (rm_1 << 32);
141 : : #ifdef MUM_TARGET_INDEPENDENT_HASH
142 : : carry += lo < t;
143 : : #endif
144 : : hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
145 : : #endif
146 : : /* We could use XOR here too but, for some reasons, on Haswell and
147 : : Power7 using an addition improves hashing performance by 10% for
148 : : small strings. */
149 : 11227221 : return hi + lo;
150 : : }
151 : :
152 : : #if defined(_MSC_VER)
153 : : #define _mum_bswap_32(x) _byteswap_uint32_t (x)
154 : : #define _mum_bswap_64(x) _byteswap_uint64_t (x)
155 : : #elif defined(__APPLE__)
156 : : #include <libkern/OSByteOrder.h>
157 : : #define _mum_bswap_32(x) OSSwapInt32 (x)
158 : : #define _mum_bswap_64(x) OSSwapInt64 (x)
159 : : #elif defined(__GNUC__)
160 : : #define _mum_bswap32(x) __builtin_bswap32 (x)
161 : : #define _mum_bswap64(x) __builtin_bswap64 (x)
162 : : #else
163 : : #include <byteswap.h>
164 : : #define _mum_bswap32(x) bswap32 (x)
165 : : #define _mum_bswap64(x) bswap64 (x)
166 : : #endif
167 : :
168 : : static inline uint64_t
169 : 1775856 : _mum_le (uint64_t v) {
170 : : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
171 : 1775856 : return v;
172 : : #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
173 : : return _mum_bswap64 (v);
174 : : #else
175 : : #error "Unknown endianness"
176 : : #endif
177 : : }
178 : :
179 : : static inline uint32_t
180 : 1251150 : _mum_le32 (uint32_t v) {
181 : : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
182 : 1251150 : return v;
183 : : #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
184 : : return _mum_bswap32 (v);
185 : : #else
186 : : #error "Unknown endianness"
187 : : #endif
188 : : }
189 : :
190 : : /* Macro defining how many times the most nested loop in
191 : : _mum_hash_aligned will be unrolled by the compiler (although it can
192 : : make an own decision:). Use only a constant here to help a
193 : : compiler to unroll a major loop.
194 : :
195 : : The macro value affects the result hash for strings > 128 bit. The
196 : : unroll factor greatly affects the hashing speed. We prefer the
197 : : speed. */
198 : : #ifndef _MUM_UNROLL_FACTOR_POWER
199 : : #if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
200 : : #define _MUM_UNROLL_FACTOR_POWER 3
201 : : #elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
202 : : #define _MUM_UNROLL_FACTOR_POWER 4
203 : : #else
204 : : #define _MUM_UNROLL_FACTOR_POWER 2
205 : : #endif
206 : : #endif
207 : :
208 : : #if _MUM_UNROLL_FACTOR_POWER < 1
209 : : #error "too small unroll factor"
210 : : #elif _MUM_UNROLL_FACTOR_POWER > 4
211 : : #error "We have not enough primes for such unroll factor"
212 : : #endif
213 : :
214 : : #define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
215 : :
216 : : static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
217 : 2343823 : _mum_hash_aligned (uint64_t start, const void *key, size_t len) {
218 : 2343823 : uint64_t result = start;
219 : 2343823 : const unsigned char *str = (const unsigned char *) key;
220 : : uint64_t u64;
221 : : int i;
222 : : size_t n;
223 : :
224 : 2343823 : result = _mum (result, _mum_block_start_prime);
225 [ + + ]: 2345819 : while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
226 : : /* This loop could be vectorized when we have vector insns for
227 : : 64x64->128-bit multiplication. AVX2 currently only have a
228 : : vector insn for 4 32x32->64-bit multiplication. */
229 [ + + ]: 9980 : for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
230 : 7984 : result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
231 : 1996 : len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
232 : 1996 : str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
233 : : /* We will use the same prime numbers on the next iterations --
234 : : randomize the state. */
235 : 1996 : result = _mum (result, _mum_unroll_prime);
236 : : }
237 : 2343823 : n = len / sizeof (uint64_t);
238 [ + + ]: 4111695 : for (i = 0; i < (int)n; i++)
239 : 1767872 : result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
240 : 2343823 : len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
241 [ + + + + : 2343823 : switch (len) {
+ + + + ]
242 : : case 7:
243 : 365228 : u64 = _mum_le32 (*(uint32_t *) str);
244 : 365228 : u64 |= (uint64_t) str[4] << 32;
245 : 365228 : u64 |= (uint64_t) str[5] << 40;
246 : 365228 : u64 |= (uint64_t) str[6] << 48;
247 : 365228 : return result ^ _mum (u64, _mum_tail_prime);
248 : : case 6:
249 : 224973 : u64 = _mum_le32 (*(uint32_t *) str);
250 : 224973 : u64 |= (uint64_t) str[4] << 32;
251 : 224973 : u64 |= (uint64_t) str[5] << 40;
252 : 224973 : return result ^ _mum (u64, _mum_tail_prime);
253 : : case 5:
254 : 249004 : u64 = _mum_le32 (*(uint32_t *) str);
255 : 249004 : u64 |= (uint64_t) str[4] << 32;
256 : 249004 : return result ^ _mum (u64, _mum_tail_prime);
257 : : case 4:
258 : 411945 : u64 = _mum_le32 (*(uint32_t *) str);
259 : 411945 : return result ^ _mum (u64, _mum_tail_prime);
260 : : case 3:
261 : 285367 : u64 = str[0];
262 : 285367 : u64 |= (uint64_t) str[1] << 8;
263 : 285367 : u64 |= (uint64_t) str[2] << 16;
264 : 285367 : return result ^ _mum (u64, _mum_tail_prime);
265 : : case 2:
266 : 248184 : u64 = str[0];
267 : 248184 : u64 |= (uint64_t) str[1] << 8;
268 : 248184 : return result ^ _mum (u64, _mum_tail_prime);
269 : : case 1:
270 : 269603 : u64 = str[0];
271 : 269603 : return result ^ _mum (u64, _mum_tail_prime);
272 : : }
273 : 289519 : return result;
274 : 2343823 : }
275 : :
276 : : /* Final randomization of H. */
277 : : static inline uint64_t
278 : 2409709 : _mum_final (uint64_t h) {
279 : 2409709 : h ^= _mum (h, _mum_finish_prime1);
280 : 2409709 : h ^= _mum (h, _mum_finish_prime2);
281 : 2409709 : return h;
282 : : }
283 : :
284 : : #if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
285 : :
286 : : /* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
287 : : it is possible. Although on modern Intel processors MULQ takes
288 : : 3-cycles vs. 4 for MULX, MULX permits more freedom in insn
289 : : scheduling as it uses less fixed registers. */
290 : : static inline uint64_t _MUM_TARGET("arch=haswell")
291 : : _mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
292 : : return _mum_final (_mum_hash_aligned (seed + len, key, len));
293 : : }
294 : : #endif
295 : :
296 : : #ifndef _MUM_UNALIGNED_ACCESS
297 : : #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
298 : : || defined(__s390__) || defined(__m32c__) || defined(cris) \
299 : : || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
300 : : || defined(__aarch64__)
301 : : #define _MUM_UNALIGNED_ACCESS 1
302 : : #else
303 : : #define _MUM_UNALIGNED_ACCESS 0
304 : : #endif
305 : : #endif
306 : :
307 : : /* When we need an aligned access to data being hashed we move part of
308 : : the unaligned data to an aligned block of given size and then
309 : : process it, repeating processing the data by the block. */
310 : : #ifndef _MUM_BLOCK_LEN
311 : : #define _MUM_BLOCK_LEN 1024
312 : : #endif
313 : :
314 : : #if _MUM_BLOCK_LEN < 8
315 : : #error "too small block length"
316 : : #endif
317 : :
318 : : static inline uint64_t
319 : : #if defined(__x86_64__)
320 : : _MUM_TARGET("inline-all-stringops")
321 : : #endif
322 : 2343823 : _mum_hash_default (const void *key, size_t len, uint64_t seed) {
323 : : uint64_t result;
324 : 2343823 : const unsigned char *str = (const unsigned char *) key;
325 : : size_t block_len;
326 : : uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
327 : :
328 : 2343823 : result = seed + len;
329 : : if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
330 : 2343823 : result = _mum_hash_aligned (result, key, len);
331 : : else {
332 : : while (len != 0) {
333 : : block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
334 : : memmove (buf, str, block_len);
335 : : result = _mum_hash_aligned (result, buf, block_len);
336 : : len -= block_len;
337 : : str += block_len;
338 : : }
339 : : }
340 : 2343823 : return _mum_final (result);
341 : : }
342 : :
343 : : static inline uint64_t
344 : : _mum_next_factor (void) {
345 : : uint64_t start = 0;
346 : : int i;
347 : :
348 : : for (i = 0; i < 8; i++)
349 : : start = (start << 8) | rand() % 256;
350 : : return start;
351 : : }
352 : :
353 : : /* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++ */
354 : :
355 : : /* Set random multiplicators depending on SEED. */
356 : : static inline void
357 : : mum_hash_randomize (uint64_t seed) {
358 : : int i;
359 : :
360 : : srand (seed);
361 : : _mum_hash_step_prime = _mum_next_factor ();
362 : : _mum_key_step_prime = _mum_next_factor ();
363 : : _mum_finish_prime1 = _mum_next_factor ();
364 : : _mum_finish_prime2 = _mum_next_factor ();
365 : : _mum_block_start_prime = _mum_next_factor ();
366 : : _mum_unroll_prime = _mum_next_factor ();
367 : : _mum_tail_prime = _mum_next_factor ();
368 : : for (i = 0; i < (int)(sizeof (_mum_primes) / sizeof (uint64_t)); i++)
369 : : _mum_primes[i] = _mum_next_factor ();
370 : : }
371 : :
372 : : /* Start hashing data with SEED. Return the state. */
373 : : static inline uint64_t
374 : : mum_hash_init (uint64_t seed) {
375 : : return seed;
376 : : }
377 : :
378 : : /* Process data KEY with the state H and return the updated state. */
379 : : static inline uint64_t
380 : 115912 : mum_hash_step (uint64_t h, uint64_t key)
381 : : {
382 : 115912 : return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
383 : : }
384 : :
385 : : /* Return the result of hashing using the current state H. */
386 : : static inline uint64_t
387 : 65886 : mum_hash_finish (uint64_t h) {
388 : 65886 : return _mum_final (h);
389 : : }
390 : :
391 : : /* Fast hashing of KEY with SEED. The hash is always the same for the
392 : : same key on any target. */
393 : : static inline size_t
394 : : mum_hash64 (uint64_t key, uint64_t seed) {
395 : : return mum_hash_finish (mum_hash_step (mum_hash_init (seed), key));
396 : : }
397 : :
398 : : /* Hash data KEY of length LEN and SEED. The hash depends on the
399 : : target endianness and the unroll factor. */
400 : : static inline uint64_t
401 : 2343823 : mum_hash (const void *key, size_t len, uint64_t seed) {
402 : : #if defined(__x86_64__) && defined(_MUM_FRESH_GCC) && _MUM_UNALIGNED_ACCESS != 0
403 : : static int avx2_support = 0;
404 : :
405 : : if (avx2_support > 0)
406 : : return _mum_hash_avx2 (key, len, seed);
407 : : else if (! avx2_support) {
408 : : __builtin_cpu_init ();
409 : : avx2_support = __builtin_cpu_supports ("avx2") ? 1 : -1;
410 : : if (avx2_support > 0)
411 : : return _mum_hash_avx2 (key, len, seed);
412 : : }
413 : : #endif
414 : 2343823 : return _mum_hash_default (key, len, seed);
415 : : }
416 : :
417 : : #endif
|