File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / hashtable.c
Revision 1.1.1.4 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (3 years, 3 months ago) by misho
Branches: rsync, MAIN
CVS tags: v3_2_3, HEAD
rsync 3.2.3

    1: /*
    2:  * Routines to provide a memory-efficient hashtable.
    3:  *
    4:  * Copyright (C) 2007-2020 Wayne Davison
    5:  *
    6:  * This program is free software; you can redistribute it and/or modify
    7:  * it under the terms of the GNU General Public License as published by
    8:  * the Free Software Foundation; either version 3 of the License, or
    9:  * (at your option) any later version.
   10:  *
   11:  * This program is distributed in the hope that it will be useful,
   12:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   13:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14:  * GNU General Public License for more details.
   15:  *
   16:  * You should have received a copy of the GNU General Public License along
   17:  * with this program; if not, visit the http://fsf.org website.
   18:  */
   19: 
   20: #include "rsync.h"
   21: 
   22: #define HASH_LOAD_LIMIT(size) ((size)*3/4)
   23: 
   24: struct hashtable *hashtable_create(int size, int key64)
   25: {
   26: 	int req = size;
   27: 	struct hashtable *tbl;
   28: 	int node_size = key64 ? sizeof (struct ht_int64_node)
   29: 			      : sizeof (struct ht_int32_node);
   30: 
   31: 	/* Pick a power of 2 that can hold the requested size. */
   32: 	if (size & (size-1) || size < 16) {
   33: 		size = 16;
   34: 		while (size < req)
   35: 			size *= 2;
   36: 	}
   37: 
   38: 	tbl = new(struct hashtable);
   39: 	tbl->nodes = new_array0(char, size * node_size);
   40: 	tbl->size = size;
   41: 	tbl->entries = 0;
   42: 	tbl->node_size = node_size;
   43: 	tbl->key64 = key64 ? 1 : 0;
   44: 
   45: 	if (DEBUG_GTE(HASH, 1)) {
   46: 		char buf[32];
   47: 		if (req != size)
   48: 			snprintf(buf, sizeof buf, "req: %d, ", req);
   49: 		else
   50: 			*buf = '\0';
   51: 		rprintf(FINFO, "[%s] created hashtable %lx (%ssize: %d, keys: %d-bit)\n",
   52: 			who_am_i(), (long)tbl, buf, size, key64 ? 64 : 32);
   53: 	}
   54: 
   55: 	return tbl;
   56: }
   57: 
   58: void hashtable_destroy(struct hashtable *tbl)
   59: {
   60: 	if (DEBUG_GTE(HASH, 1)) {
   61: 		rprintf(FINFO, "[%s] destroyed hashtable %lx (size: %d, keys: %d-bit)\n",
   62: 			who_am_i(), (long)tbl, tbl->size, tbl->key64 ? 64 : 32);
   63: 	}
   64: 	free(tbl->nodes);
   65: 	free(tbl);
   66: }
   67: 
   68: /* Returns the node that holds the indicated key if it exists. When it does not
   69:  * exist, it returns either NULL (when data_when_new is NULL), or it returns a
   70:  * new node with its node->data set to the indicated value.
   71:  *
   72:  * If your code doesn't know the data value for a new node in advance (usually
   73:  * because it doesn't know if a node is new or not) you should pass in a unique
   74:  * (non-0) value that you can use to check if the returned node is new. You can
   75:  * then overwrite the data with any value you want (even 0) since it only needs
   76:  * to be different than whatever data_when_new value you use later on.
   77:  *
   78:  * This return is a void* just because it might be pointing at a ht_int32_node
   79:  * or a ht_int64_node, and that makes the caller's assignment a little easier. */
   80: void *hashtable_find(struct hashtable *tbl, int64 key, void *data_when_new)
   81: {
   82: 	int key64 = tbl->key64;
   83: 	struct ht_int32_node *node;
   84: 	uint32 ndx;
   85: 
   86: 	if (key64 ? key == 0 : (int32)key == 0) {
   87: 		rprintf(FERROR, "Internal hashtable error: illegal key supplied!\n");
   88: 		exit_cleanup(RERR_MESSAGEIO);
   89: 	}
   90: 
   91: 	if (data_when_new && tbl->entries > HASH_LOAD_LIMIT(tbl->size)) {
   92: 		void *old_nodes = tbl->nodes;
   93: 		int size = tbl->size * 2;
   94: 		int i;
   95: 
   96: 		tbl->nodes = new_array0(char, size * tbl->node_size);
   97: 		tbl->size = size;
   98: 		tbl->entries = 0;
   99: 
  100: 		if (DEBUG_GTE(HASH, 1)) {
  101: 			rprintf(FINFO, "[%s] growing hashtable %lx (size: %d, keys: %d-bit)\n",
  102: 				who_am_i(), (long)tbl, size, key64 ? 64 : 32);
  103: 		}
  104: 
  105: 		for (i = size / 2; i-- > 0; ) {
  106: 			struct ht_int32_node *move_node = HT_NODE(tbl, old_nodes, i);
  107: 			int64 move_key = HT_KEY(move_node, key64);
  108: 			if (move_key == 0)
  109: 				continue;
  110: 			if (move_node->data)
  111: 				hashtable_find(tbl, move_key, move_node->data);
  112: 			else {
  113: 				node = hashtable_find(tbl, move_key, "");
  114: 				node->data = 0;
  115: 			}
  116: 		}
  117: 
  118: 		free(old_nodes);
  119: 	}
  120: 
  121: 	if (!key64) {
  122: 		/* Based on Jenkins One-at-a-time hash. */
  123: 		uchar buf[4], *keyp = buf;
  124: 		int i;
  125: 
  126: 		SIVALu(buf, 0, key);
  127: 		for (ndx = 0, i = 0; i < 4; i++) {
  128: 			ndx += keyp[i];
  129: 			ndx += (ndx << 10);
  130: 			ndx ^= (ndx >> 6);
  131: 		}
  132: 		ndx += (ndx << 3);
  133: 		ndx ^= (ndx >> 11);
  134: 		ndx += (ndx << 15);
  135: 	} else {
  136: 		/* Based on Jenkins hashword() from lookup3.c. */
  137: 		uint32 a, b, c;
  138: 
  139: 		/* Set up the internal state */
  140: 		a = b = c = 0xdeadbeef + (8 << 2);
  141: 
  142: #define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k))))
  143: #if SIZEOF_INT64 >= 8
  144: 		b += (uint32)(key >> 32);
  145: #endif
  146: 		a += (uint32)key;
  147: 		c ^= b; c -= rot(b, 14);
  148: 		a ^= c; a -= rot(c, 11);
  149: 		b ^= a; b -= rot(a, 25);
  150: 		c ^= b; c -= rot(b, 16);
  151: 		a ^= c; a -= rot(c, 4);
  152: 		b ^= a; b -= rot(a, 14);
  153: 		c ^= b; c -= rot(b, 24);
  154: #undef rot
  155: 		ndx = c;
  156: 	}
  157: 
  158: 	/* If it already exists, return the node.  If we're not
  159: 	 * allocating, return NULL if the key is not found. */
  160: 	while (1) {
  161: 		int64 nkey;
  162: 
  163: 		ndx &= tbl->size - 1;
  164: 		node = HT_NODE(tbl, tbl->nodes, ndx);
  165: 		nkey = HT_KEY(node, key64);
  166: 
  167: 		if (nkey == key)
  168: 			return node;
  169: 		if (nkey == 0) {
  170: 			if (!data_when_new)
  171: 				return NULL;
  172: 			break;
  173: 		}
  174: 		ndx++;
  175: 	}
  176: 
  177: 	/* Take over this empty spot and then return the node. */
  178: 	if (key64)
  179: 		((struct ht_int64_node*)node)->key = key;
  180: 	else
  181: 		node->key = (int32)key;
  182: 	node->data = data_when_new;
  183: 	tbl->entries++;
  184: 	return node;
  185: }
  186: 
  187: #ifndef WORDS_BIGENDIAN
  188: # define HASH_LITTLE_ENDIAN 1
  189: # define HASH_BIG_ENDIAN 0
  190: #else
  191: # define HASH_LITTLE_ENDIAN 0
  192: # define HASH_BIG_ENDIAN 1
  193: #endif
  194: 
  195: /*
  196:  -------------------------------------------------------------------------------
  197:  lookup3.c, by Bob Jenkins, May 2006, Public Domain.
  198: 
  199:  These are functions for producing 32-bit hashes for hash table lookup.
  200:  hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
  201:  are externally useful functions.  Routines to test the hash are included
  202:  if SELF_TEST is defined.  You can use this free for any purpose.  It's in
  203:  the public domain.  It has no warranty.
  204: 
  205:  You probably want to use hashlittle().  hashlittle() and hashbig()
  206:  hash byte arrays.  hashlittle() is is faster than hashbig() on
  207:  little-endian machines.  Intel and AMD are little-endian machines.
  208:  On second thought, you probably want hashlittle2(), which is identical to
  209:  hashlittle() except it returns two 32-bit hashes for the price of one.
  210:  You could implement hashbig2() if you wanted but I haven't bothered here.
  211: 
  212:  If you want to find a hash of, say, exactly 7 integers, do
  213:    a = i1;  b = i2;  c = i3;
  214:    mix(a,b,c);
  215:    a += i4; b += i5; c += i6;
  216:    mix(a,b,c);
  217:    a += i7;
  218:    final(a,b,c);
  219:  then use c as the hash value.  If you have a variable length array of
  220:  4-byte integers to hash, use hash_word().  If you have a byte array (like
  221:  a character string), use hashlittle().  If you have several byte arrays, or
  222:  a mix of things, see the comments above hashlittle().
  223: 
  224:  Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
  225:  then mix those integers.  This is fast (you can do a lot more thorough
  226:  mixing with 12*3 instructions on 3 integers than you can with 3 instructions
  227:  on 1 byte), but shoehorning those bytes into integers efficiently is messy.
  228: */
  229: 
  230: #define hashsize(n) ((uint32_t)1<<(n))
  231: #define hashmask(n) (hashsize(n)-1)
  232: #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
  233: 
  234: /*
  235:  -------------------------------------------------------------------------------
  236:  mix -- mix 3 32-bit values reversibly.
  237: 
  238:  This is reversible, so any information in (a,b,c) before mix() is
  239:  still in (a,b,c) after mix().
  240: 
  241:  If four pairs of (a,b,c) inputs are run through mix(), or through
  242:  mix() in reverse, there are at least 32 bits of the output that
  243:  are sometimes the same for one pair and different for another pair.
  244:  This was tested for:
  245:  * pairs that differed by one bit, by two bits, in any combination
  246:    of top bits of (a,b,c), or in any combination of bottom bits of
  247:    (a,b,c).
  248:  * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
  249:    the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
  250:    is commonly produced by subtraction) look like a single 1-bit
  251:    difference.
  252:  * the base values were pseudorandom, all zero but one bit set, or
  253:    all zero plus a counter that starts at zero.
  254: 
  255:  Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
  256:  satisfy this are
  257:      4  6  8 16 19  4
  258:      9 15  3 18 27 15
  259:     14  9  3  7 17  3
  260:  Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
  261:  for "differ" defined as + with a one-bit base and a two-bit delta.  I
  262:  used http://burtleburtle.net/bob/hash/avalanche.html to choose
  263:  the operations, constants, and arrangements of the variables.
  264: 
  265:  This does not achieve avalanche.  There are input bits of (a,b,c)
  266:  that fail to affect some output bits of (a,b,c), especially of a.  The
  267:  most thoroughly mixed value is c, but it doesn't really even achieve
  268:  avalanche in c.
  269: 
  270:  This allows some parallelism.  Read-after-writes are good at doubling
  271:  the number of bits affected, so the goal of mixing pulls in the opposite
  272:  direction as the goal of parallelism.  I did what I could.  Rotates
  273:  seem to cost as much as shifts on every machine I could lay my hands
  274:  on, and rotates are much kinder to the top and bottom bits, so I used
  275:  rotates.
  276:  -------------------------------------------------------------------------------
  277: */
  278: #define mix(a,b,c) \
  279: { \
  280:   a -= c;  a ^= rot(c, 4);  c += b; \
  281:   b -= a;  b ^= rot(a, 6);  a += c; \
  282:   c -= b;  c ^= rot(b, 8);  b += a; \
  283:   a -= c;  a ^= rot(c,16);  c += b; \
  284:   b -= a;  b ^= rot(a,19);  a += c; \
  285:   c -= b;  c ^= rot(b, 4);  b += a; \
  286: }
  287: 
  288: /*
  289:  -------------------------------------------------------------------------------
  290:  final -- final mixing of 3 32-bit values (a,b,c) into c
  291: 
  292:  Pairs of (a,b,c) values differing in only a few bits will usually
  293:  produce values of c that look totally different.  This was tested for
  294:  * pairs that differed by one bit, by two bits, in any combination
  295:    of top bits of (a,b,c), or in any combination of bottom bits of
  296:    (a,b,c).
  297:  * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
  298:    the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
  299:    is commonly produced by subtraction) look like a single 1-bit
  300:    difference.
  301:  * the base values were pseudorandom, all zero but one bit set, or
  302:    all zero plus a counter that starts at zero.
  303: 
  304:  These constants passed:
  305:   14 11 25 16 4 14 24
  306:   12 14 25 16 4 14 24
  307:  and these came close:
  308:    4  8 15 26 3 22 24
  309:   10  8 15 26 3 22 24
  310:   11  8 15 26 3 22 24
  311:  -------------------------------------------------------------------------------
  312: */
  313: #define final(a,b,c) \
  314: { \
  315:   c ^= b; c -= rot(b,14); \
  316:   a ^= c; a -= rot(c,11); \
  317:   b ^= a; b -= rot(a,25); \
  318:   c ^= b; c -= rot(b,16); \
  319:   a ^= c; a -= rot(c,4);  \
  320:   b ^= a; b -= rot(a,14); \
  321:   c ^= b; c -= rot(b,24); \
  322: }
  323: 
  324: 
  325: /*
  326:  -------------------------------------------------------------------------------
  327:  hashlittle() -- hash a variable-length key into a 32-bit value
  328:    k       : the key (the unaligned variable-length array of bytes)
  329:    length  : the length of the key, counting by bytes
  330:    val2    : IN: can be any 4-byte value OUT: second 32 bit hash.
  331:  Returns a 32-bit value.  Every bit of the key affects every bit of
  332:  the return value.  Two keys differing by one or two bits will have
  333:  totally different hash values.  Note that the return value is better
  334:  mixed than val2, so use that first.
  335: 
  336:  The best hash table sizes are powers of 2.  There is no need to do
  337:  mod a prime (mod is sooo slow!).  If you need less than 32 bits,
  338:  use a bitmask.  For example, if you need only 10 bits, do
  339:    h = (h & hashmask(10));
  340:  In which case, the hash table should have hashsize(10) elements.
  341: 
  342:  If you are hashing n strings (uint8_t **)k, do it like this:
  343:    for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
  344: 
  345:  By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
  346:  code any way you wish, private, educational, or commercial.  It's free.
  347: 
  348:  Use for hash table lookup, or anything where one collision in 2^^32 is
  349:  acceptable.  Do NOT use for cryptographic purposes.
  350:  -------------------------------------------------------------------------------
  351: */
  352: 
  353: uint32_t hashlittle(const void *key, size_t length)
  354: {
  355:   uint32_t a,b,c;                                          /* internal state */
  356:   union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
  357: 
  358:   /* Set up the internal state */
  359:   a = b = c = 0xdeadbeef + ((uint32_t)length);
  360: 
  361:   u.ptr = key;
  362:   if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
  363:     const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
  364:     const uint8_t  *k8;
  365: 
  366:     /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
  367:     while (length > 12)
  368:     {
  369:       a += k[0];
  370:       b += k[1];
  371:       c += k[2];
  372:       mix(a,b,c);
  373:       length -= 12;
  374:       k += 3;
  375:     }
  376: 
  377:     /*----------------------------- handle the last (probably partial) block */
  378:     k8 = (const uint8_t *)k;
  379:     switch(length)
  380:     {
  381:     case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
  382:     case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
  383:     case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
  384:     case 9 : c+=k8[8];                   /* fall through */
  385:     case 8 : b+=k[1]; a+=k[0]; break;
  386:     case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
  387:     case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
  388:     case 5 : b+=k8[4];                   /* fall through */
  389:     case 4 : a+=k[0]; break;
  390:     case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
  391:     case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
  392:     case 1 : a+=k8[0]; break;
  393:     case 0 : return c;
  394:     }
  395:   } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
  396:     const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
  397:     const uint8_t  *k8;
  398: 
  399:     /*--------------- all but last block: aligned reads and different mixing */
  400:     while (length > 12)
  401:     {
  402:       a += k[0] + (((uint32_t)k[1])<<16);
  403:       b += k[2] + (((uint32_t)k[3])<<16);
  404:       c += k[4] + (((uint32_t)k[5])<<16);
  405:       mix(a,b,c);
  406:       length -= 12;
  407:       k += 6;
  408:     }
  409: 
  410:     /*----------------------------- handle the last (probably partial) block */
  411:     k8 = (const uint8_t *)k;
  412:     switch(length)
  413:     {
  414:     case 12: c+=k[4]+(((uint32_t)k[5])<<16);
  415:              b+=k[2]+(((uint32_t)k[3])<<16);
  416:              a+=k[0]+(((uint32_t)k[1])<<16);
  417:              break;
  418:     case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
  419:     case 10: c+=k[4];
  420:              b+=k[2]+(((uint32_t)k[3])<<16);
  421:              a+=k[0]+(((uint32_t)k[1])<<16);
  422:              break;
  423:     case 9 : c+=k8[8];                      /* fall through */
  424:     case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
  425:              a+=k[0]+(((uint32_t)k[1])<<16);
  426:              break;
  427:     case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
  428:     case 6 : b+=k[2];
  429:              a+=k[0]+(((uint32_t)k[1])<<16);
  430:              break;
  431:     case 5 : b+=k8[4];                      /* fall through */
  432:     case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
  433:              break;
  434:     case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
  435:     case 2 : a+=k[0];
  436:              break;
  437:     case 1 : a+=k8[0];
  438:              break;
  439:     case 0 : return c;                     /* zero length requires no mixing */
  440:     }
  441: 
  442:   } else {                        /* need to read the key one byte at a time */
  443:     const uint8_t *k = (const uint8_t *)key;
  444: 
  445:     /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
  446:     while (length > 12)
  447:     {
  448:       a += k[0];
  449:       a += ((uint32_t)k[1])<<8;
  450:       a += ((uint32_t)k[2])<<16;
  451:       a += ((uint32_t)k[3])<<24;
  452:       b += k[4];
  453:       b += ((uint32_t)k[5])<<8;
  454:       b += ((uint32_t)k[6])<<16;
  455:       b += ((uint32_t)k[7])<<24;
  456:       c += k[8];
  457:       c += ((uint32_t)k[9])<<8;
  458:       c += ((uint32_t)k[10])<<16;
  459:       c += ((uint32_t)k[11])<<24;
  460:       mix(a,b,c);
  461:       length -= 12;
  462:       k += 12;
  463:     }
  464: 
  465:     /*-------------------------------- last block: affect all 32 bits of (c) */
  466:     switch(length)                   /* all the case statements fall through */
  467:     {
  468:     case 12: c+=((uint32_t)k[11])<<24;
  469: 	     /* FALLTHROUGH */
  470:     case 11: c+=((uint32_t)k[10])<<16;
  471: 	     /* FALLTHROUGH */
  472:     case 10: c+=((uint32_t)k[9])<<8;
  473: 	     /* FALLTHROUGH */
  474:     case 9 : c+=k[8];
  475: 	     /* FALLTHROUGH */
  476:     case 8 : b+=((uint32_t)k[7])<<24;
  477: 	     /* FALLTHROUGH */
  478:     case 7 : b+=((uint32_t)k[6])<<16;
  479: 	     /* FALLTHROUGH */
  480:     case 6 : b+=((uint32_t)k[5])<<8;
  481: 	     /* FALLTHROUGH */
  482:     case 5 : b+=k[4];
  483: 	     /* FALLTHROUGH */
  484:     case 4 : a+=((uint32_t)k[3])<<24;
  485: 	     /* FALLTHROUGH */
  486:     case 3 : a+=((uint32_t)k[2])<<16;
  487: 	     /* FALLTHROUGH */
  488:     case 2 : a+=((uint32_t)k[1])<<8;
  489: 	     /* FALLTHROUGH */
  490:     case 1 : a+=k[0];
  491:              break;
  492:     case 0 : return c;
  493:     }
  494:   }
  495: 
  496:   final(a,b,c);
  497:   return c;
  498: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>