Branch data Line data Source code
1 : : /* Copyright (c) 2016, 2017, 2018
2 : : Vladimir Makarov <vmakarov@gcc.gnu.org>
3 : :
4 : : Permission is hereby granted, free of charge, to any person
5 : : obtaining a copy of this software and associated documentation
6 : : files (the "Software"), to deal in the Software without
7 : : restriction, including without limitation the rights to use, copy,
8 : : modify, merge, publish, distribute, sublicense, and/or sell copies
9 : : of the Software, and to permit persons to whom the Software is
10 : : furnished to do so, subject to the following conditions:
11 : :
12 : : The above copyright notice and this permission notice shall be
13 : : included in all copies or substantial portions of the Software.
14 : :
15 : : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 : : EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 : : MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 : : NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 : : BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 : : ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 : : CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 : : SOFTWARE.
23 : : */
24 : :
25 : : /* This file implements MUM (MUltiply and Mix) hashing. We randomize
26 : : input data by 64x64-bit multiplication and mixing hi- and low-parts
27 : : of the multiplication result by using an addition and then mix it
28 : : into the current state. We use prime numbers randomly generated
29 : : with the equal probability of their bit values for the
30 : : multiplication. When all primes are used once, the state is
31 : : randomized and the same prime numbers are used again for data
32 : : randomization.
33 : :
34 : : The MUM hashing passes all SMHasher tests. Pseudo Random Number
35 : : Generator based on MUM also passes NIST Statistical Test Suite for
36 : : Random and Pseudorandom Number Generators for Cryptographic
37 : : Applications (version 2.2.1) with 1000 bitstreams each containing
38 : : 1M bits. MUM hashing is also faster Spooky64 and City64 on small
39 : : strings (at least upto 512-bit) on Haswell and Power7. The MUM bulk
40 : : speed (speed on very long data) is bigger than Spooky and City on
41 : : Power7. On Haswell the bulk speed is bigger than Spooky one and
42 : : close to City speed. */
43 : :
44 : : #ifndef __MUM_HASH__
45 : : #define __MUM_HASH__
46 : :
47 : : #include <stddef.h>
48 : : #include <stdlib.h>
49 : : #include <string.h>
50 : : #include <limits.h>
51 : :
52 : : #ifdef _MSC_VER
53 : : typedef unsigned __int16 uint16_t;
54 : : typedef unsigned __int32 uint32_t;
55 : : typedef unsigned __int64 uint64_t;
56 : : #else
57 : : #include <stdint.h>
58 : : #endif
59 : :
60 : : #ifdef __GNUC__
61 : : #define _MUM_ATTRIBUTE_UNUSED __attribute__((unused))
62 : : #define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
63 : : #define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
64 : : #else
65 : : #define _MUM_ATTRIBUTE_UNUSED
66 : : #define _MUM_OPTIMIZE(opts)
67 : : #define _MUM_TARGET(opts)
68 : : #endif
69 : :
70 : : /* Macro saying to use 128-bit integers implemented by GCC for some
71 : : targets. */
72 : : #ifndef _MUM_USE_INT128
73 : : /* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
74 : : HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
75 : : otherwise int. */
76 : : #if defined(__GNUC__) && UINT_MAX != ULONG_MAX
77 : : #define _MUM_USE_INT128 1
78 : : #else
79 : : #define _MUM_USE_INT128 0
80 : : #endif
81 : : #endif
82 : :
83 : : /* Here are different primes randomly generated with the equal
84 : : probability of their bit values. They are used to randomize input
85 : : values. */
86 : : static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
87 : : static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
88 : : static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
89 : : static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
90 : : static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
91 : : static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
92 : : static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;
93 : :
94 : : static uint64_t _mum_primes [] = {
95 : : 0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
96 : : 0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
97 : : 0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
98 : : 0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
99 : : };
100 : :
101 : : /* Multiply 64-bit V and P and return sum of high and low parts of the
102 : : result. */
103 : : static inline uint64_t
104 : 95898465 : _mum (uint64_t v, uint64_t p) {
105 : : uint64_t hi, lo;
106 : : #if _MUM_USE_INT128
107 : : #if defined(__aarch64__)
108 : : /* AARCH64 needs 2 insns to calculate 128-bit result of the
109 : : multiplication. If we use a generic code we actually call a
110 : : function doing 128x128->128 bit multiplication. The function is
111 : : very slow. */
112 : : lo = v * p;
113 : : asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
114 : : #else
115 : 95898465 : __uint128_t r = (__uint128_t) v * (__uint128_t) p;
116 : 95898465 : hi = (uint64_t) (r >> 64);
117 : 95898465 : lo = (uint64_t) r;
118 : : #endif
119 : : #else
120 : : /* Implementation of 64x64->128-bit multiplication by four 32x32->64
121 : : bit multiplication. */
122 : : uint64_t hv = v >> 32, hp = p >> 32;
123 : : uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
124 : : uint64_t rh = hv * hp;
125 : : uint64_t rm_0 = hv * lp;
126 : : uint64_t rm_1 = hp * lv;
127 : : uint64_t rl = lv * lp;
128 : : uint64_t t, carry = 0;
129 : :
130 : : /* We could ignore a carry bit here if we did not care about the
131 : : same hash for 32-bit and 64-bit targets. */
132 : : t = rl + (rm_0 << 32);
133 : : #ifdef MUM_TARGET_INDEPENDENT_HASH
134 : : carry = t < rl;
135 : : #endif
136 : : lo = t + (rm_1 << 32);
137 : : #ifdef MUM_TARGET_INDEPENDENT_HASH
138 : : carry += lo < t;
139 : : #endif
140 : : hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
141 : : #endif
142 : : /* We could use XOR here too but, for some reasons, on Haswell and
143 : : Power7 using an addition improves hashing performance by 10% for
144 : : small strings. */
145 : 95898465 : return hi + lo;
146 : : }
147 : :
148 : : #if defined(_MSC_VER)
149 : : #define _mum_bswap_32(x) _byteswap_uint32_t (x)
150 : : #define _mum_bswap_64(x) _byteswap_uint64_t (x)
151 : : #elif defined(__APPLE__)
152 : : #include <libkern/OSByteOrder.h>
153 : : #define _mum_bswap_32(x) OSSwapInt32 (x)
154 : : #define _mum_bswap_64(x) OSSwapInt64 (x)
155 : : #elif defined(__GNUC__)
156 : : #define _mum_bswap32(x) __builtin_bswap32 (x)
157 : : #define _mum_bswap64(x) __builtin_bswap64 (x)
158 : : #else
159 : : #include <byteswap.h>
160 : : #define _mum_bswap32(x) bswap32 (x)
161 : : #define _mum_bswap64(x) bswap64 (x)
162 : : #endif
163 : :
164 : : static inline uint64_t
165 : 35236577 : _mum_le (uint64_t v) {
166 : : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
167 : 35236577 : return v;
168 : : #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
169 : : return _mum_bswap64 (v);
170 : : #else
171 : : #error "Unknown endianess"
172 : : #endif
173 : : }
174 : :
175 : : static inline uint32_t
176 : 10471525 : _mum_le32 (uint32_t v) {
177 : : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
178 : 10471525 : return v;
179 : : #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
180 : : return _mum_bswap32 (v);
181 : : #else
182 : : #error "Unknown endianess"
183 : : #endif
184 : : }
185 : :
186 : : static inline uint64_t
187 : 10367608 : _mum_le16 (uint16_t v) {
188 : : #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
189 : 10367608 : return v;
190 : : #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
191 : : return (v >> 8) | ((v & 0xff) << 8);
192 : : #else
193 : : #error "Unknown endianess"
194 : : #endif
195 : : }
196 : :
197 : : /* Macro defining how many times the most nested loop in
198 : : _mum_hash_aligned will be unrolled by the compiler (although it can
199 : : make an own decision:). Use only a constant here to help a
200 : : compiler to unroll a major loop.
201 : :
202 : : The macro value affects the result hash for strings > 128 bit. The
203 : : unroll factor greatly affects the hashing speed. We prefer the
204 : : speed. */
205 : : #ifndef _MUM_UNROLL_FACTOR_POWER
206 : : #if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
207 : : #define _MUM_UNROLL_FACTOR_POWER 3
208 : : #elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
209 : : #define _MUM_UNROLL_FACTOR_POWER 4
210 : : #elif defined (MUM_V1) || defined (MUM_V2)
211 : : #define _MUM_UNROLL_FACTOR_POWER 2
212 : : #else
213 : : #define _MUM_UNROLL_FACTOR_POWER 3
214 : : #endif
215 : : #endif
216 : :
217 : : #if _MUM_UNROLL_FACTOR_POWER < 1
218 : : #error "too small unroll factor"
219 : : #elif _MUM_UNROLL_FACTOR_POWER > 4
220 : : #error "We have not enough primes for such unroll factor"
221 : : #endif
222 : :
223 : : #define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
224 : :
225 : : /* Rotate V left by SH. */
226 : : static inline uint64_t _mum_rotl (uint64_t v, int sh) {
227 : : return v << sh | v >> (64 - sh);
228 : : }
229 : :
230 : : static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
231 : 20987527 : _mum_hash_aligned (uint64_t start, const void *key, size_t len) {
232 : 20987527 : uint64_t result = start;
233 : 20987527 : const unsigned char *str = (const unsigned char *) key;
234 : : uint64_t u64;
235 : : size_t i;
236 : : size_t n;
237 : :
238 : : #ifndef MUM_V2
239 : 20987527 : result = _mum (result, _mum_block_start_prime);
240 : : #endif
241 [ + + ]: 20990128 : while (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
242 : : /* This loop could be vectorized when we have vector insns for
243 : : 64x64->128-bit multiplication. AVX2 currently only have vector
244 : : insns for 4 32x32->64-bit multiplication and for 1
245 : : 64x64->128-bit multiplication (pclmulqdq). */
246 : : #if defined (MUM_V1) || defined (MUM_V2)
247 : : for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
248 : : result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
249 : : #else
250 [ + + ]: 13005 : for (i = 0; i < _MUM_UNROLL_FACTOR; i += 2)
251 : 20808 : result ^= _mum (_mum_le (((uint64_t *) str)[i]) ^ _mum_primes[i],
252 : 10404 : _mum_le (((uint64_t *) str)[i + 1]) ^ _mum_primes[i + 1]);
253 : : #endif
254 : 2601 : len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
255 : 2601 : str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
256 : : /* We will use the same prime numbers on the next iterations --
257 : : randomize the state. */
258 : 2601 : result = _mum (result, _mum_unroll_prime);
259 : : }
260 : 20987527 : n = len / sizeof (uint64_t);
261 [ + + ]: 56203296 : for (i = 0; i < n; i++)
262 : 35215769 : result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
263 : 20987527 : len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
264 [ + + + + : 20987527 : switch (len) {
+ + + + ]
265 : : case 7:
266 : 2308373 : u64 = _mum_le32 (*(uint32_t *) str);
267 : 2308373 : u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
268 : 2308373 : u64 |= (uint64_t) str[6] << 48;
269 : 2308373 : return result ^ _mum (u64, _mum_tail_prime);
270 : : case 6:
271 : 2486379 : u64 = _mum_le32 (*(uint32_t *) str);
272 : 2486379 : u64 |= _mum_le16 (*(uint16_t *) (str + 4)) << 32;
273 : 2486379 : return result ^ _mum (u64, _mum_tail_prime);
274 : : case 5:
275 : 2802243 : u64 = _mum_le32 (*(uint32_t *) str);
276 : 2802243 : u64 |= (uint64_t) str[4] << 32;
277 : 2802243 : return result ^ _mum (u64, _mum_tail_prime);
278 : : case 4:
279 : 2874530 : u64 = _mum_le32 (*(uint32_t *) str);
280 : 2874530 : return result ^ _mum (u64, _mum_tail_prime);
281 : : case 3:
282 : 2865998 : u64 = _mum_le16 (*(uint16_t *) str);
283 : 2865998 : u64 |= (uint64_t) str[2] << 16;
284 : 2865998 : return result ^ _mum (u64, _mum_tail_prime);
285 : : case 2:
286 : 2706858 : u64 = _mum_le16 (*(uint16_t *) str);
287 : 2706858 : return result ^ _mum (u64, _mum_tail_prime);
288 : : case 1:
289 : 2650256 : u64 = str[0];
290 : 2650256 : return result ^ _mum (u64, _mum_tail_prime);
291 : : }
292 : 2292890 : return result;
293 : 20987527 : }
294 : :
295 : : /* Final randomization of H. */
296 : : static inline uint64_t
297 : 20987527 : _mum_final (uint64_t h) {
298 : : #if defined (MUM_V1)
299 : : h ^= _mum (h, _mum_finish_prime1);
300 : : h ^= _mum (h, _mum_finish_prime2);
301 : : #elif defined (MUM_V2)
302 : : h ^= _mum_rotl (h, 33);
303 : : h ^= _mum (h, _mum_finish_prime1);
304 : : #else
305 : 20987527 : h = _mum (h, h);
306 : : #endif
307 : 20987527 : return h;
308 : : }
309 : :
310 : : #ifndef _MUM_UNALIGNED_ACCESS
311 : : #if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
312 : : || defined(__s390__) || defined(__m32c__) || defined(cris) \
313 : : || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
314 : : || defined(__aarch64__) || defined(_M_AMD64) || defined(_M_IX86)
315 : : #define _MUM_UNALIGNED_ACCESS 1
316 : : #else
317 : : #define _MUM_UNALIGNED_ACCESS 0
318 : : #endif
319 : : #endif
320 : :
321 : : /* When we need an aligned access to data being hashed we move part of
322 : : the unaligned data to an aligned block of given size and then
323 : : process it, repeating processing the data by the block. */
324 : : #ifndef _MUM_BLOCK_LEN
325 : : #define _MUM_BLOCK_LEN 1024
326 : : #endif
327 : :
328 : : #if _MUM_BLOCK_LEN < 8
329 : : #error "too small block length"
330 : : #endif
331 : :
332 : : static inline uint64_t
333 : : #if defined(__x86_64__)
334 : : _MUM_TARGET("inline-all-stringops")
335 : : #endif
336 : : _mum_hash_default (const void *key, size_t len, uint64_t seed) {
337 : : uint64_t result;
338 : : const unsigned char *str = (const unsigned char *) key;
339 : : size_t block_len;
340 : : uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
341 : :
342 : : result = seed + len;
343 : : if (((size_t) str & 0x7) == 0)
344 : : result = _mum_hash_aligned (result, key, len);
345 : : else {
346 : : while (len != 0) {
347 : : block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
348 : : memmove (buf, str, block_len);
349 : : result = _mum_hash_aligned (result, buf, block_len);
350 : : len -= block_len;
351 : : str += block_len;
352 : : }
353 : : }
354 : : return _mum_final (result);
355 : : }
356 : :
357 : : static inline uint64_t
358 : : _mum_next_factor (void) {
359 : : uint64_t start = 0;
360 : : int i;
361 : :
362 : : for (i = 0; i < 8; i++)
363 : : start = (start << 8) | rand() % 256;
364 : : return start;
365 : : }
366 : :
367 : : /* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++ */
368 : :
369 : : /* Set random multiplicators depending on SEED. */
370 : : static inline void
371 : : mum_hash_randomize (uint64_t seed) {
372 : : size_t i;
373 : :
374 : : srand (seed);
375 : : _mum_hash_step_prime = _mum_next_factor ();
376 : : _mum_key_step_prime = _mum_next_factor ();
377 : : _mum_finish_prime1 = _mum_next_factor ();
378 : : _mum_finish_prime2 = _mum_next_factor ();
379 : : _mum_block_start_prime = _mum_next_factor ();
380 : : _mum_unroll_prime = _mum_next_factor ();
381 : : _mum_tail_prime = _mum_next_factor ();
382 : : for (i = 0; i < sizeof (_mum_primes) / sizeof (uint64_t); i++)
383 : : _mum_primes[i] = _mum_next_factor ();
384 : : }
385 : :
386 : : /* Start hashing data with SEED. Return the state. */
387 : : static inline uint64_t
388 : : mum_hash_init (uint64_t seed) {
389 : : return seed;
390 : : }
391 : :
392 : : /* Process data KEY with the state H and return the updated state. */
393 : : static inline uint64_t
394 : : mum_hash_step (uint64_t h, uint64_t key) {
395 : : return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
396 : : }
397 : :
398 : : /* Return the result of hashing using the current state H. */
399 : : static inline uint64_t
400 : : mum_hash_finish (uint64_t h) {
401 : : return _mum_final (h);
402 : : }
403 : :
404 : : /* Fast hashing of KEY with SEED. The hash is always the same for the
405 : : same key on any target. */
406 : : static inline size_t
407 : : mum_hash64 (uint64_t key, uint64_t seed) {
408 : : return mum_hash_finish (mum_hash_step (mum_hash_init (seed), key));
409 : : }
410 : :
411 : : /* Hash data KEY of length LEN and SEED. The hash depends on the
412 : : target endianess and the unroll factor. */
413 : : static inline uint64_t
414 : 20987527 : mum_hash (const void *key, size_t len, uint64_t seed) {
415 : : #if _MUM_UNALIGNED_ACCESS
416 : 20987527 : return _mum_final (_mum_hash_aligned (seed + len, key, len));
417 : : #else
418 : : return _mum_hash_default (key, len, seed);
419 : : #endif
420 : : }
421 : :
422 : : #endif
|