Annotation of embedaddon/rsync/hashtable.c, revision 1.1.1.4
1.1 misho 1: /*
2: * Routines to provide a memory-efficient hashtable.
3: *
1.1.1.4 ! misho 4: * Copyright (C) 2007-2020 Wayne Davison
1.1 misho 5: *
6: * This program is free software; you can redistribute it and/or modify
7: * it under the terms of the GNU General Public License as published by
8: * the Free Software Foundation; either version 3 of the License, or
9: * (at your option) any later version.
10: *
11: * This program is distributed in the hope that it will be useful,
12: * but WITHOUT ANY WARRANTY; without even the implied warranty of
13: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: * GNU General Public License for more details.
15: *
16: * You should have received a copy of the GNU General Public License along
17: * with this program; if not, visit the http://fsf.org website.
18: */
19:
20: #include "rsync.h"
21:
22: #define HASH_LOAD_LIMIT(size) ((size)*3/4)
23:
24: struct hashtable *hashtable_create(int size, int key64)
25: {
1.1.1.2 misho 26: int req = size;
1.1 misho 27: struct hashtable *tbl;
28: int node_size = key64 ? sizeof (struct ht_int64_node)
29: : sizeof (struct ht_int32_node);
30:
31: /* Pick a power of 2 that can hold the requested size. */
32: if (size & (size-1) || size < 16) {
33: size = 16;
34: while (size < req)
35: size *= 2;
36: }
37:
1.1.1.4 ! misho 38: tbl = new(struct hashtable);
! 39: tbl->nodes = new_array0(char, size * node_size);
1.1 misho 40: tbl->size = size;
41: tbl->entries = 0;
42: tbl->node_size = node_size;
43: tbl->key64 = key64 ? 1 : 0;
44:
1.1.1.2 misho 45: if (DEBUG_GTE(HASH, 1)) {
46: char buf[32];
47: if (req != size)
48: snprintf(buf, sizeof buf, "req: %d, ", req);
49: else
50: *buf = '\0';
51: rprintf(FINFO, "[%s] created hashtable %lx (%ssize: %d, keys: %d-bit)\n",
52: who_am_i(), (long)tbl, buf, size, key64 ? 64 : 32);
53: }
54:
1.1 misho 55: return tbl;
56: }
57:
58: void hashtable_destroy(struct hashtable *tbl)
59: {
1.1.1.2 misho 60: if (DEBUG_GTE(HASH, 1)) {
61: rprintf(FINFO, "[%s] destroyed hashtable %lx (size: %d, keys: %d-bit)\n",
62: who_am_i(), (long)tbl, tbl->size, tbl->key64 ? 64 : 32);
63: }
1.1 misho 64: free(tbl->nodes);
65: free(tbl);
66: }
67:
1.1.1.4 ! misho 68: /* Returns the node that holds the indicated key if it exists. When it does not
! 69: * exist, it returns either NULL (when data_when_new is NULL), or it returns a
! 70: * new node with its node->data set to the indicated value.
! 71: *
! 72: * If your code doesn't know the data value for a new node in advance (usually
! 73: * because it doesn't know if a node is new or not) you should pass in a unique
! 74: * (non-0) value that you can use to check if the returned node is new. You can
! 75: * then overwrite the data with any value you want (even 0) since it only needs
! 76: * to be different than whatever data_when_new value you use later on.
! 77: *
! 78: * This return is a void* just because it might be pointing at a ht_int32_node
! 79: * or a ht_int64_node, and that makes the caller's assignment a little easier. */
! 80: void *hashtable_find(struct hashtable *tbl, int64 key, void *data_when_new)
1.1 misho 81: {
82: int key64 = tbl->key64;
83: struct ht_int32_node *node;
84: uint32 ndx;
85:
86: if (key64 ? key == 0 : (int32)key == 0) {
87: rprintf(FERROR, "Internal hashtable error: illegal key supplied!\n");
88: exit_cleanup(RERR_MESSAGEIO);
89: }
90:
1.1.1.4 ! misho 91: if (data_when_new && tbl->entries > HASH_LOAD_LIMIT(tbl->size)) {
1.1 misho 92: void *old_nodes = tbl->nodes;
93: int size = tbl->size * 2;
94: int i;
95:
1.1.1.4 ! misho 96: tbl->nodes = new_array0(char, size * tbl->node_size);
1.1 misho 97: tbl->size = size;
98: tbl->entries = 0;
99:
1.1.1.2 misho 100: if (DEBUG_GTE(HASH, 1)) {
101: rprintf(FINFO, "[%s] growing hashtable %lx (size: %d, keys: %d-bit)\n",
102: who_am_i(), (long)tbl, size, key64 ? 64 : 32);
103: }
104:
1.1 misho 105: for (i = size / 2; i-- > 0; ) {
106: struct ht_int32_node *move_node = HT_NODE(tbl, old_nodes, i);
107: int64 move_key = HT_KEY(move_node, key64);
108: if (move_key == 0)
109: continue;
1.1.1.4 ! misho 110: if (move_node->data)
! 111: hashtable_find(tbl, move_key, move_node->data);
! 112: else {
! 113: node = hashtable_find(tbl, move_key, "");
! 114: node->data = 0;
! 115: }
1.1 misho 116: }
117:
118: free(old_nodes);
119: }
120:
121: if (!key64) {
122: /* Based on Jenkins One-at-a-time hash. */
123: uchar buf[4], *keyp = buf;
124: int i;
125:
126: SIVALu(buf, 0, key);
127: for (ndx = 0, i = 0; i < 4; i++) {
128: ndx += keyp[i];
129: ndx += (ndx << 10);
130: ndx ^= (ndx >> 6);
131: }
132: ndx += (ndx << 3);
133: ndx ^= (ndx >> 11);
134: ndx += (ndx << 15);
135: } else {
136: /* Based on Jenkins hashword() from lookup3.c. */
137: uint32 a, b, c;
138:
139: /* Set up the internal state */
140: a = b = c = 0xdeadbeef + (8 << 2);
141:
142: #define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k))))
143: #if SIZEOF_INT64 >= 8
144: b += (uint32)(key >> 32);
145: #endif
146: a += (uint32)key;
147: c ^= b; c -= rot(b, 14);
148: a ^= c; a -= rot(c, 11);
149: b ^= a; b -= rot(a, 25);
150: c ^= b; c -= rot(b, 16);
151: a ^= c; a -= rot(c, 4);
152: b ^= a; b -= rot(a, 14);
153: c ^= b; c -= rot(b, 24);
154: #undef rot
155: ndx = c;
156: }
157:
158: /* If it already exists, return the node. If we're not
159: * allocating, return NULL if the key is not found. */
160: while (1) {
161: int64 nkey;
162:
163: ndx &= tbl->size - 1;
164: node = HT_NODE(tbl, tbl->nodes, ndx);
165: nkey = HT_KEY(node, key64);
166:
167: if (nkey == key)
168: return node;
169: if (nkey == 0) {
1.1.1.4 ! misho 170: if (!data_when_new)
1.1 misho 171: return NULL;
172: break;
173: }
174: ndx++;
175: }
176:
177: /* Take over this empty spot and then return the node. */
178: if (key64)
179: ((struct ht_int64_node*)node)->key = key;
180: else
181: node->key = (int32)key;
1.1.1.4 ! misho 182: node->data = data_when_new;
1.1 misho 183: tbl->entries++;
184: return node;
185: }
1.1.1.4 ! misho 186:
! 187: #ifndef WORDS_BIGENDIAN
! 188: # define HASH_LITTLE_ENDIAN 1
! 189: # define HASH_BIG_ENDIAN 0
! 190: #else
! 191: # define HASH_LITTLE_ENDIAN 0
! 192: # define HASH_BIG_ENDIAN 1
! 193: #endif
! 194:
! 195: /*
! 196: -------------------------------------------------------------------------------
! 197: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
! 198:
! 199: These are functions for producing 32-bit hashes for hash table lookup.
! 200: hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
! 201: are externally useful functions. Routines to test the hash are included
! 202: if SELF_TEST is defined. You can use this free for any purpose. It's in
! 203: the public domain. It has no warranty.
! 204:
! 205: You probably want to use hashlittle(). hashlittle() and hashbig()
! 206: hash byte arrays. hashlittle() is is faster than hashbig() on
! 207: little-endian machines. Intel and AMD are little-endian machines.
! 208: On second thought, you probably want hashlittle2(), which is identical to
! 209: hashlittle() except it returns two 32-bit hashes for the price of one.
! 210: You could implement hashbig2() if you wanted but I haven't bothered here.
! 211:
! 212: If you want to find a hash of, say, exactly 7 integers, do
! 213: a = i1; b = i2; c = i3;
! 214: mix(a,b,c);
! 215: a += i4; b += i5; c += i6;
! 216: mix(a,b,c);
! 217: a += i7;
! 218: final(a,b,c);
! 219: then use c as the hash value. If you have a variable length array of
! 220: 4-byte integers to hash, use hash_word(). If you have a byte array (like
! 221: a character string), use hashlittle(). If you have several byte arrays, or
! 222: a mix of things, see the comments above hashlittle().
! 223:
! 224: Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
! 225: then mix those integers. This is fast (you can do a lot more thorough
! 226: mixing with 12*3 instructions on 3 integers than you can with 3 instructions
! 227: on 1 byte), but shoehorning those bytes into integers efficiently is messy.
! 228: */
! 229:
! 230: #define hashsize(n) ((uint32_t)1<<(n))
! 231: #define hashmask(n) (hashsize(n)-1)
! 232: #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
! 233:
! 234: /*
! 235: -------------------------------------------------------------------------------
! 236: mix -- mix 3 32-bit values reversibly.
! 237:
! 238: This is reversible, so any information in (a,b,c) before mix() is
! 239: still in (a,b,c) after mix().
! 240:
! 241: If four pairs of (a,b,c) inputs are run through mix(), or through
! 242: mix() in reverse, there are at least 32 bits of the output that
! 243: are sometimes the same for one pair and different for another pair.
! 244: This was tested for:
! 245: * pairs that differed by one bit, by two bits, in any combination
! 246: of top bits of (a,b,c), or in any combination of bottom bits of
! 247: (a,b,c).
! 248: * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
! 249: the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
! 250: is commonly produced by subtraction) look like a single 1-bit
! 251: difference.
! 252: * the base values were pseudorandom, all zero but one bit set, or
! 253: all zero plus a counter that starts at zero.
! 254:
! 255: Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
! 256: satisfy this are
! 257: 4 6 8 16 19 4
! 258: 9 15 3 18 27 15
! 259: 14 9 3 7 17 3
! 260: Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
! 261: for "differ" defined as + with a one-bit base and a two-bit delta. I
! 262: used http://burtleburtle.net/bob/hash/avalanche.html to choose
! 263: the operations, constants, and arrangements of the variables.
! 264:
! 265: This does not achieve avalanche. There are input bits of (a,b,c)
! 266: that fail to affect some output bits of (a,b,c), especially of a. The
! 267: most thoroughly mixed value is c, but it doesn't really even achieve
! 268: avalanche in c.
! 269:
! 270: This allows some parallelism. Read-after-writes are good at doubling
! 271: the number of bits affected, so the goal of mixing pulls in the opposite
! 272: direction as the goal of parallelism. I did what I could. Rotates
! 273: seem to cost as much as shifts on every machine I could lay my hands
! 274: on, and rotates are much kinder to the top and bottom bits, so I used
! 275: rotates.
! 276: -------------------------------------------------------------------------------
! 277: */
! 278: #define mix(a,b,c) \
! 279: { \
! 280: a -= c; a ^= rot(c, 4); c += b; \
! 281: b -= a; b ^= rot(a, 6); a += c; \
! 282: c -= b; c ^= rot(b, 8); b += a; \
! 283: a -= c; a ^= rot(c,16); c += b; \
! 284: b -= a; b ^= rot(a,19); a += c; \
! 285: c -= b; c ^= rot(b, 4); b += a; \
! 286: }
! 287:
! 288: /*
! 289: -------------------------------------------------------------------------------
! 290: final -- final mixing of 3 32-bit values (a,b,c) into c
! 291:
! 292: Pairs of (a,b,c) values differing in only a few bits will usually
! 293: produce values of c that look totally different. This was tested for
! 294: * pairs that differed by one bit, by two bits, in any combination
! 295: of top bits of (a,b,c), or in any combination of bottom bits of
! 296: (a,b,c).
! 297: * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
! 298: the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
! 299: is commonly produced by subtraction) look like a single 1-bit
! 300: difference.
! 301: * the base values were pseudorandom, all zero but one bit set, or
! 302: all zero plus a counter that starts at zero.
! 303:
! 304: These constants passed:
! 305: 14 11 25 16 4 14 24
! 306: 12 14 25 16 4 14 24
! 307: and these came close:
! 308: 4 8 15 26 3 22 24
! 309: 10 8 15 26 3 22 24
! 310: 11 8 15 26 3 22 24
! 311: -------------------------------------------------------------------------------
! 312: */
! 313: #define final(a,b,c) \
! 314: { \
! 315: c ^= b; c -= rot(b,14); \
! 316: a ^= c; a -= rot(c,11); \
! 317: b ^= a; b -= rot(a,25); \
! 318: c ^= b; c -= rot(b,16); \
! 319: a ^= c; a -= rot(c,4); \
! 320: b ^= a; b -= rot(a,14); \
! 321: c ^= b; c -= rot(b,24); \
! 322: }
! 323:
! 324:
! 325: /*
! 326: -------------------------------------------------------------------------------
! 327: hashlittle() -- hash a variable-length key into a 32-bit value
! 328: k : the key (the unaligned variable-length array of bytes)
! 329: length : the length of the key, counting by bytes
! 330: val2 : IN: can be any 4-byte value OUT: second 32 bit hash.
! 331: Returns a 32-bit value. Every bit of the key affects every bit of
! 332: the return value. Two keys differing by one or two bits will have
! 333: totally different hash values. Note that the return value is better
! 334: mixed than val2, so use that first.
! 335:
! 336: The best hash table sizes are powers of 2. There is no need to do
! 337: mod a prime (mod is sooo slow!). If you need less than 32 bits,
! 338: use a bitmask. For example, if you need only 10 bits, do
! 339: h = (h & hashmask(10));
! 340: In which case, the hash table should have hashsize(10) elements.
! 341:
! 342: If you are hashing n strings (uint8_t **)k, do it like this:
! 343: for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
! 344:
! 345: By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
! 346: code any way you wish, private, educational, or commercial. It's free.
! 347:
! 348: Use for hash table lookup, or anything where one collision in 2^^32 is
! 349: acceptable. Do NOT use for cryptographic purposes.
! 350: -------------------------------------------------------------------------------
! 351: */
! 352:
! 353: uint32_t hashlittle(const void *key, size_t length)
! 354: {
! 355: uint32_t a,b,c; /* internal state */
! 356: union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */
! 357:
! 358: /* Set up the internal state */
! 359: a = b = c = 0xdeadbeef + ((uint32_t)length);
! 360:
! 361: u.ptr = key;
! 362: if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
! 363: const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
! 364: const uint8_t *k8;
! 365:
! 366: /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
! 367: while (length > 12)
! 368: {
! 369: a += k[0];
! 370: b += k[1];
! 371: c += k[2];
! 372: mix(a,b,c);
! 373: length -= 12;
! 374: k += 3;
! 375: }
! 376:
! 377: /*----------------------------- handle the last (probably partial) block */
! 378: k8 = (const uint8_t *)k;
! 379: switch(length)
! 380: {
! 381: case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
! 382: case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
! 383: case 10: c+=((uint32_t)k8[9])<<8; /* fall through */
! 384: case 9 : c+=k8[8]; /* fall through */
! 385: case 8 : b+=k[1]; a+=k[0]; break;
! 386: case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
! 387: case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */
! 388: case 5 : b+=k8[4]; /* fall through */
! 389: case 4 : a+=k[0]; break;
! 390: case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
! 391: case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */
! 392: case 1 : a+=k8[0]; break;
! 393: case 0 : return c;
! 394: }
! 395: } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
! 396: const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */
! 397: const uint8_t *k8;
! 398:
! 399: /*--------------- all but last block: aligned reads and different mixing */
! 400: while (length > 12)
! 401: {
! 402: a += k[0] + (((uint32_t)k[1])<<16);
! 403: b += k[2] + (((uint32_t)k[3])<<16);
! 404: c += k[4] + (((uint32_t)k[5])<<16);
! 405: mix(a,b,c);
! 406: length -= 12;
! 407: k += 6;
! 408: }
! 409:
! 410: /*----------------------------- handle the last (probably partial) block */
! 411: k8 = (const uint8_t *)k;
! 412: switch(length)
! 413: {
! 414: case 12: c+=k[4]+(((uint32_t)k[5])<<16);
! 415: b+=k[2]+(((uint32_t)k[3])<<16);
! 416: a+=k[0]+(((uint32_t)k[1])<<16);
! 417: break;
! 418: case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
! 419: case 10: c+=k[4];
! 420: b+=k[2]+(((uint32_t)k[3])<<16);
! 421: a+=k[0]+(((uint32_t)k[1])<<16);
! 422: break;
! 423: case 9 : c+=k8[8]; /* fall through */
! 424: case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
! 425: a+=k[0]+(((uint32_t)k[1])<<16);
! 426: break;
! 427: case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
! 428: case 6 : b+=k[2];
! 429: a+=k[0]+(((uint32_t)k[1])<<16);
! 430: break;
! 431: case 5 : b+=k8[4]; /* fall through */
! 432: case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
! 433: break;
! 434: case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
! 435: case 2 : a+=k[0];
! 436: break;
! 437: case 1 : a+=k8[0];
! 438: break;
! 439: case 0 : return c; /* zero length requires no mixing */
! 440: }
! 441:
! 442: } else { /* need to read the key one byte at a time */
! 443: const uint8_t *k = (const uint8_t *)key;
! 444:
! 445: /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
! 446: while (length > 12)
! 447: {
! 448: a += k[0];
! 449: a += ((uint32_t)k[1])<<8;
! 450: a += ((uint32_t)k[2])<<16;
! 451: a += ((uint32_t)k[3])<<24;
! 452: b += k[4];
! 453: b += ((uint32_t)k[5])<<8;
! 454: b += ((uint32_t)k[6])<<16;
! 455: b += ((uint32_t)k[7])<<24;
! 456: c += k[8];
! 457: c += ((uint32_t)k[9])<<8;
! 458: c += ((uint32_t)k[10])<<16;
! 459: c += ((uint32_t)k[11])<<24;
! 460: mix(a,b,c);
! 461: length -= 12;
! 462: k += 12;
! 463: }
! 464:
! 465: /*-------------------------------- last block: affect all 32 bits of (c) */
! 466: switch(length) /* all the case statements fall through */
! 467: {
! 468: case 12: c+=((uint32_t)k[11])<<24;
! 469: /* FALLTHROUGH */
! 470: case 11: c+=((uint32_t)k[10])<<16;
! 471: /* FALLTHROUGH */
! 472: case 10: c+=((uint32_t)k[9])<<8;
! 473: /* FALLTHROUGH */
! 474: case 9 : c+=k[8];
! 475: /* FALLTHROUGH */
! 476: case 8 : b+=((uint32_t)k[7])<<24;
! 477: /* FALLTHROUGH */
! 478: case 7 : b+=((uint32_t)k[6])<<16;
! 479: /* FALLTHROUGH */
! 480: case 6 : b+=((uint32_t)k[5])<<8;
! 481: /* FALLTHROUGH */
! 482: case 5 : b+=k[4];
! 483: /* FALLTHROUGH */
! 484: case 4 : a+=((uint32_t)k[3])<<24;
! 485: /* FALLTHROUGH */
! 486: case 3 : a+=((uint32_t)k[2])<<16;
! 487: /* FALLTHROUGH */
! 488: case 2 : a+=((uint32_t)k[1])<<8;
! 489: /* FALLTHROUGH */
! 490: case 1 : a+=k[0];
! 491: break;
! 492: case 0 : return c;
! 493: }
! 494: }
! 495:
! 496: final(a,b,c);
! 497: return c;
! 498: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>