File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / patches / md5p8.diff
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (3 years, 3 months ago) by misho
CVS tags: MAIN, HEAD
Initial revision

    1: From: Jorrit Jongma <git@jongma.org>
    2: 
    3: - MD5 optimization in block matching phase:
    4: 
    5: MD5 hashes computed during rsync's block matching phase are independent
    6: and thus possible to process in parallel. This code processes 4 blocks
    7: in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
    8: of performance (or decrease of CPU usage) of up to 6x has been measured.
    9: 
   10: A prefetching algorithm is used to predict and load upcoming blocks, as
   11: this prevents the need for extensive modifications to other parts of
   12: the rsync sources to get this working.
   13: 
   14: This remains compatible with existing rsync builds using MD5 checksums.
   15: 
   16: - MD5P8 whole-file checksum:
   17: 
   18: Splits the input up into 8 independent streams (64-byte interleave), and
   19: produces a final checksum based on the end state of those 8 streams. If
   20: parallelization of MD5 hashing is available, the performance gain (or
   21: CPU usage decrease) is 2x to 6x compared to traditional MD5.
   22: 
   23: The rsync versions on both ends of the connection need MD5P8 support
   24: built-in for it to be used.
   25: 
   26: xxHash is still preferred (and faster), but this provides a reasonably
   27: fast fallback for the case where xxHash libraries are not available at
   28: build time.
   29: 
   30: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
   31: diff --git a/Makefile.in b/Makefile.in
   32: --- a/Makefile.in
   33: +++ b/Makefile.in
   34: @@ -29,14 +29,14 @@ SHELL=/bin/sh
   35:  .SUFFIXES:
   36:  .SUFFIXES: .c .o
   37:  
   38: -SIMD_x86_64=simd-checksum-x86_64.o
   39: +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
   40:  ASM_x86_64=lib/md5-asm-x86_64.o
   41:  
   42:  GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
   43:  	 rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
   44:  HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
   45:  	lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
   46: -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
   47: +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
   48:  	lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
   49:  zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
   50:  	zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
   51: @@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
   52:  simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
   53:  	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
   54:  
   55: +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
   56: +	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
   57: +
   58:  lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
   59:  	@$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
   60:  
   61: diff --git a/checksum.c b/checksum.c
   62: --- a/checksum.c
   63: +++ b/checksum.c
   64: @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
   65:  		{ CSUM_XXH64, "xxh64", NULL },
   66:  		{ CSUM_XXH64, "xxhash", NULL },
   67:  #endif
   68: +		{ CSUM_MD5P8, "md5p8", NULL },
   69:  		{ CSUM_MD5, "md5", NULL },
   70:  		{ CSUM_MD4, "md4", NULL },
   71:  		{ CSUM_NONE, "none", NULL },
   72: @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
   73:  	  case CSUM_MD4_OLD:
   74:  	  case CSUM_MD4_BUSTED:
   75:  		return MD4_DIGEST_LEN;
   76: +	  case CSUM_MD5P8:
   77:  	  case CSUM_MD5:
   78:  		return MD5_DIGEST_LEN;
   79:  	  case CSUM_XXH64:
   80: @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
   81:  	  case CSUM_MD4_BUSTED:
   82:  		break;
   83:  	  case CSUM_MD4:
   84: +	  case CSUM_MD5P8:
   85:  	  case CSUM_MD5:
   86:  		return -1;
   87:  	  case CSUM_XXH64:
   88: @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
   89:  	return 0;
   90:  }
   91:  
   92: -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
   93: +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
   94: +#define get_checksum2 get_checksum2_nosimd
   95: +#else
   96:  /*
   97:    a simple 32 bit checksum that can be updated from either end
   98:    (inspired by Mark Adler's Adler-32 checksum)
   99: @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
  100:  	}
  101:  	return (s1 & 0xffff) + (s2 << 16);
  102:  }
  103: +
  104: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
  105: +{
  106: +}
  107: +
  108: +void checksum2_disable_prefetch()
  109: +{
  110: +}
  111:  #endif
  112:  
  113: -void get_checksum2(char *buf, int32 len, char *sum)
  114: +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
  115: +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
  116:  {
  117:  	switch (xfersum_type) {
  118:  #ifdef SUPPORT_XXHASH
  119: @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
  120:  		break;
  121:  	  }
  122:  #endif
  123: +	  case CSUM_MD5P8:  /* == CSUM_MD5 for checksum2 */
  124:  	  case CSUM_MD5: {
  125:  		MD5_CTX m5;
  126:  		uchar seedbuf[4];
  127: @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
  128:  		break;
  129:  	  }
  130:  #endif
  131: +	  case CSUM_MD5P8: {
  132: +		MD5P8_CTX m5p8;
  133: +
  134: +		MD5P8_Init(&m5p8);
  135: +
  136: +		for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
  137: +			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
  138: +
  139: +		remainder = (int32)(len - i);
  140: +		if (remainder > 0)
  141: +			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
  142: +
  143: +		MD5P8_Final((uchar *)sum, &m5p8);
  144: +		break;
  145: +	  }
  146:  	  case CSUM_MD5: {
  147:  		MD5_CTX m5;
  148:  
  149: @@ -445,6 +475,7 @@ static union {
  150:  #endif
  151:  	MD5_CTX m5;
  152:  } ctx;
  153: +static MD5P8_CTX m5p8;
  154:  #ifdef SUPPORT_XXHASH
  155:  static XXH64_state_t* xxh64_state;
  156:  #endif
  157: @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
  158:  		XXH3_128bits_reset(xxh3_state);
  159:  		break;
  160:  #endif
  161: +	  case CSUM_MD5P8:
  162: +		MD5P8_Init(&m5p8);
  163: +		break;
  164:  	  case CSUM_MD5:
  165:  		MD5_Init(&ctx.m5);
  166:  		break;
  167: @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
  168:  		XXH3_128bits_update(xxh3_state, p, len);
  169:  		break;
  170:  #endif
  171: +	  case CSUM_MD5P8:
  172: +		MD5P8_Update(&m5p8, (uchar *)p, len);
  173: +		break;
  174:  	  case CSUM_MD5:
  175:  		MD5_Update(&ctx.m5, (uchar *)p, len);
  176:  		break;
  177: @@ -596,6 +633,9 @@ int sum_end(char *sum)
  178:  		break;
  179:  	  }
  180:  #endif
  181: +	  case CSUM_MD5P8:
  182: +		MD5P8_Final((uchar *)sum, &m5p8);
  183: +		break;
  184:  	  case CSUM_MD5:
  185:  		MD5_Final((uchar *)sum, &ctx.m5);
  186:  		break;
  187: diff --git a/generator.c b/generator.c
  188: --- a/generator.c
  189: +++ b/generator.c
  190: @@ -729,10 +729,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  191:  	if (append_mode > 0 && f_copy < 0)
  192:  		return 0;
  193:  
  194: -	if (len > 0)
  195: +	if (len > 0) {
  196:  		mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
  197: -	else
  198: +		checksum2_enable_prefetch(mapbuf, len, sum.blength);
  199: +	} else {
  200:  		mapbuf = NULL;
  201: +	}
  202:  
  203:  	for (i = 0; i < sum.count; i++) {
  204:  		int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
  205: @@ -750,7 +752,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  206:  		}
  207:  
  208:  		sum1 = get_checksum1(map, n1);
  209: -		get_checksum2(map, n1, sum2);
  210: +		get_checksum2(map, n1, sum2, offset - n1);
  211:  
  212:  		if (DEBUG_GTE(DELTASUM, 3)) {
  213:  			rprintf(FINFO,
  214: @@ -762,8 +764,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  215:  		write_buf(f_out, sum2, sum.s2length);
  216:  	}
  217:  
  218: -	if (mapbuf)
  219: +	if (mapbuf) {
  220:  		unmap_file(mapbuf);
  221: +		checksum2_disable_prefetch();
  222: +	}
  223:  
  224:  	return 0;
  225:  }
  226: diff --git a/lib/md-defines.h b/lib/md-defines.h
  227: --- a/lib/md-defines.h
  228: +++ b/lib/md-defines.h
  229: @@ -15,3 +15,4 @@
  230:  #define CSUM_XXH64 6
  231:  #define CSUM_XXH3_64 7
  232:  #define CSUM_XXH3_128 8
  233: +#define CSUM_MD5P8 9
  234: diff --git a/lib/md5p8.c b/lib/md5p8.c
  235: new file mode 100644
  236: --- /dev/null
  237: +++ b/lib/md5p8.c
  238: @@ -0,0 +1,128 @@
  239: +/*
  240: + * MD5-based hash friendly to parallel processing, reference implementation
  241: + *
  242: + * Author: Jorrit Jongma, 2020
  243: + *
  244: + * Released in the public domain falling back to the MIT license
  245: + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
  246: + * apply in your country.
  247: + */
  248: +/*
  249: + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
  250: + * stream is divided into 8 independent streams. For each 512 bytes of input,
   251: + * the first 64 bytes are sent to the first stream, the second 64 bytes to
  252: + * the second stream, etc. The input stream is padded with zeros to the next
  253: + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
  254: + * containing the A, B, C, and D states of the 8 individual streams, followed
  255: + * by the (unpadded) length of the input.
  256: + *
  257: + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
  258: + * than normal MD5 (particularly on files smaller than 10 kB), but with
  259: + * SIMD-based parallel processing it can be two to six times as fast. Even in
  260: + * the best-case scenario, xxHash is still at least twice as fast and should
  261: + * be preferred when available.
  262: + */
  263: +
  264: +#include "rsync.h"
  265: +
  266: +#ifdef HAVE_SIMD
  267: +#define MD5P8_Init MD5P8_Init_c
  268: +#define MD5P8_Update MD5P8_Update_c
  269: +#define MD5P8_Final MD5P8_Final_c
  270: +#endif
  271: +
  272: +/* each MD5_CTX needs to be 8-byte aligned */
  273: +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
  274: +
  275: +void MD5P8_Init(MD5P8_CTX *ctx)
  276: +{
  277: +    int i;
  278: +    for (i = 0; i < 8; i++) {
  279: +        MD5_Init(MD5P8_Contexts_c(ctx, i));
  280: +    }
  281: +    ctx->used = 0;
  282: +    ctx->next = 0;
  283: +}
  284: +
  285: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
  286: +{
  287: +    uint32 pos = 0;
  288: +
  289: +    if ((ctx->used) || (length < 64)) {
  290: +        int cpy = MIN(length, 64 - ctx->used);
  291: +        memmove(&ctx->buffer[ctx->used], input, cpy);
  292: +        ctx->used += cpy;
  293: +        length -= cpy;
  294: +        pos += cpy;
  295: +
  296: +        if (ctx->used == 64) {
  297: +            MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
  298: +            ctx->used = 0;
  299: +            ctx->next = (ctx->next + 1) % 8;
  300: +        }
  301: +    }
  302: +
  303: +    while (length >= 64) {
  304: +        MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
  305: +        ctx->next = (ctx->next + 1) % 8;
  306: +        pos += 64;
  307: +        length -= 64;
  308: +    }
  309: +
  310: +    if (length) {
  311: +        memcpy(ctx->buffer, &input[pos], length);
  312: +        ctx->used = length;
  313: +    }
  314: +}
  315: +
  316: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
  317: +{
  318: +    int i;
  319: +    uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
  320: +    if (ctx->used) {
  321: +        uchar tmp[64];
  322: +        memset(tmp, 0, 64);
  323: +        MD5P8_Update(ctx, tmp, 64 - ctx->used);
  324: +    }
  325: +    memset(ctx->buffer, 0, 64);
  326: +    while (ctx->next != 0) {
  327: +        MD5P8_Update(ctx, ctx->buffer, 64);
  328: +        sub += 64;
  329: +    }
  330: +
  331: +    uchar state[34*4] = {0};
  332: +
  333: +    for (i = 0; i < 8; i++) {
  334: +        MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
  335: +#ifdef USE_OPENSSL
  336: +        if (low + md->Nl < low) high++;
  337: +        low += md->Nl;
  338: +        high += md->Nh;
  339: +#else
  340: +        if (low + md->totalN < low) high++;
  341: +        low += md->totalN;
  342: +        high += md->totalN2;
  343: +#endif
  344: +        SIVALu(state, i*16, md->A);
  345: +        SIVALu(state, i*16 + 4, md->B);
  346: +        SIVALu(state, i*16 + 8, md->C);
  347: +        SIVALu(state, i*16 + 12, md->D);
  348: +    }
  349: +
  350: +#ifndef USE_OPENSSL
  351: +	high = (low >> 29) | (high << 3);
  352: +	low = (low << 3);
  353: +#endif
  354: +
  355: +    sub <<= 3;
  356: +    if (low - sub > low) high--;
  357: +    low -= sub;
  358: +
  359: +    SIVALu(state, 32*4, low);
  360: +    SIVALu(state, 33*4, high);
  361: +
  362: +    MD5_CTX md;
  363: +    MD5_Init(&md);
  364: +    MD5_Update(&md, state, 34*4);
  365: +    MD5_Final(digest, &md);
  366: +}
  367: diff --git a/lib/mdigest.h b/lib/mdigest.h
  368: --- a/lib/mdigest.h
  369: +++ b/lib/mdigest.h
  370: @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
  371:  void md5_update(md_context *ctx, const uchar *input, uint32 length);
  372:  void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
  373:  #endif
  374: +
  375: +typedef struct {
  376: +    uchar context_storage[1024];
  377: +    uchar buffer[512];
  378: +    unsigned int used;
  379: +    unsigned int next;
  380: +} MD5P8_CTX;
  381: +
  382: +void MD5P8_Init(MD5P8_CTX *ctx);
  383: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
  384: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
  385: diff --git a/match.c b/match.c
  386: --- a/match.c
  387: +++ b/match.c
  388: @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
  389:  	if (DEBUG_GTE(DELTASUM, 3))
  390:  		rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
  391:  
  392: +	checksum2_enable_prefetch(buf, len, s->blength);
  393: +
  394:  	offset = aligned_offset = aligned_i = 0;
  395:  
  396:  	end = len + 1 - s->sums[s->count-1].len;
  397: @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
  398:  
  399:  			if (!done_csum2) {
  400:  				map = (schar *)map_ptr(buf,offset,l);
  401: -				get_checksum2((char *)map,l,sum2);
  402: +				get_checksum2((char *)map, l, sum2, offset);
  403:  				done_csum2 = 1;
  404:  			}
  405:  
  406: @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
  407:  						sum = get_checksum1((char *)map, l);
  408:  						if (sum != s->sums[i].sum1)
  409:  							goto check_want_i;
  410: -						get_checksum2((char *)map, l, sum2);
  411: +						get_checksum2((char *)map, l, sum2, aligned_offset);
  412:  						if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
  413:  							goto check_want_i;
  414:  						/* OK, we have a re-alignment match.  Bump the offset
  415: @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
  416:  			matched(f, s, buf, offset - s->blength, -2);
  417:  	} while (++offset < end);
  418:  
  419: +	checksum2_disable_prefetch();
  420: +
  421:  	matched(f, s, buf, len, -1);
  422:  	map_ptr(buf, len-1, 1);
  423:  }
  424: diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
  425: --- a/simd-checksum-x86_64.cpp
  426: +++ b/simd-checksum-x86_64.cpp
  427: @@ -49,13 +49,33 @@
  428:   * use of the target attribute, selecting the fastest code path based on
  429:   * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  430:   * GCC 4.x are not supported to ease configure.ac logic.
  431: + *
  432: + * ----
  433: + *
  434: + * get_checksum2() is optimized for the case where the selected transfer
  435: + * checksum is MD5. MD5 can't be made significantly faster with SIMD
  436: + * instructions than the assembly version already included but SIMD
  437: + * instructions can be used to hash multiple streams in parallel (see
  438: + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
  439: + * block-matching algorithm hashes the blocks independently (in contrast to
  440: + * the whole-file checksum) this method can be employed here.
  441: + *
  442: + * To prevent needing to modify the core rsync sources significantly, a
  443: + * prefetching strategy is used. When a checksum2 is requested, the code
  444: + * reads ahead several blocks, creates the MD5 hashes for each block in
  445: + * parallel, returns the hash for the first block, and caches the results
  446: + * for the other blocks to return in future calls to get_checksum2().
  447:   */
  448:  
  449:  #ifdef __x86_64__
  450:  #ifdef __cplusplus
  451:  
  452: +extern "C" {
  453: +
  454:  #include "rsync.h"
  455:  
  456: +}
  457: +
  458:  #ifdef HAVE_SIMD
  459:  
  460:  #include <immintrin.h>
  461: @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
  462:      return get_checksum1_cpp(buf1, len);
  463:  }
  464:  
  465: -} // extern "C"
  466: +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
  467:  
  468: -#ifdef BENCHMARK_SIMD_CHECKSUM1
  469: +// see simd-md5-parallel-x86_64.cpp
  470: +extern int md5_parallel_slots();
  471: +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
  472: +
  473: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
  474: +
  475: +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
  476: +
  477: +#define PREFETCH_ENABLE 1 // debugging
  478: +
  479: +#if 0 // debugging
  480: +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
  481: +#else
  482: +#define PREFETCH_PRINTF(f_, ...) (void)0;
  483: +#endif
  484: +
  485: +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
  486: +#define PREFETCH_MAX_BLOCKS 8
  487: +
  488: +typedef struct {
  489: +    int in_use;
  490: +    OFF_T offset;
  491: +    int32 len;
  492: +    char sum[SUM_LENGTH];
  493: +} prefetch_sum_t;
  494: +
  495: +typedef struct {
  496: +    struct map_struct *map;
  497: +    OFF_T len;
  498: +    OFF_T last;
  499: +    int32 blocklen;
  500: +    int blocks;
  501: +    prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
  502: +} prefetch_t;
  503: +
  504: +prefetch_t *prefetch;
  505: +
  506: +extern int xfersum_type;
  507: +extern int checksum_seed;
  508: +extern int proper_seed_order;
  509: +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
  510: +
  511: +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
  512: +
  513: +void checksum2_disable_prefetch()
  514: +{
  515: +    if (prefetch) {
  516: +        PREFETCH_PRINTF("checksum2_disable_prefetch\n");
  517: +        free(prefetch);
  518: +        prefetch = NULL;
  519: +    }
  520: +}
  521: +
  522: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
  523: +{
  524: +#ifdef PREFETCH_ENABLE
  525: +    checksum2_disable_prefetch();
  526: +    int slots = md5_parallel_slots();
  527: +    if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
  528: +        prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
  529: +        memset(prefetch, 0, sizeof(prefetch_t));
  530: +        prefetch->map = map;
  531: +        prefetch->len = len;
  532: +        prefetch->last = 0;
  533: +        prefetch->blocklen = blocklen;
  534: +        prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
  535: +        PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
  536: +    }
  537: +#endif
  538: +}
  539: +
  540: +static inline void checksum2_reset_prefetch()
  541: +{
  542: +    for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
  543: +        prefetch->sums[i].in_use = 0;
  544: +    }
  545: +}
  546: +
  547: +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
  548: +{
  549: +    if (prefetch->sums[0].in_use) {
  550: +        if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
  551: +            memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
  552: +            for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
  553: +                prefetch->sums[i] = prefetch->sums[i + 1];
  554: +            }
  555: +            prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
  556: +            PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
  557: +            return 1;
  558: +        } else {
  559: +            // unexpected access, reset cache
  560: +            PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
  561: +            checksum2_reset_prefetch();
  562: +        }
  563: +    }
  564: +    return 0;
  565: +}
  566: +
  567: +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
  568: +{
  569: +    int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
  570: +    if (blocks < 2) return 0; // fall through to non-simd, probably faster
  571: +
  572: +    int32 total = 0;
  573: +    int i;
  574: +    for (i = 0; i < blocks; i++) {
  575: +        prefetch->sums[i].offset = prefetch_offset + total;
  576: +        prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
  577: +        prefetch->sums[i].in_use = 0;
  578: +        total += prefetch->sums[i].len;
  579: +    }
  580: +    for (; i < PREFETCH_MAX_BLOCKS; i++) {
  581: +        prefetch->sums[i].in_use = 0;
  582: +    }
  583: +
  584: +    uchar seedbuf[4];
  585: +    SIVALu(seedbuf, 0, checksum_seed);
  586: +
  587: +    PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
  588: +    char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
  589: +    char* bufs[PREFETCH_MAX_BLOCKS] = {0};
  590: +    int lens[PREFETCH_MAX_BLOCKS] = {0};
  591: +    char* sums[PREFETCH_MAX_BLOCKS] = {0};
  592: +    for (i = 0; i < blocks; i++) {
  593: +        bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
  594: +        lens[i] = prefetch->sums[i].len;
  595: +        sums[i] = prefetch->sums[i].sum;
  596: +    }
  597: +    if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
  598: +        for (i = 0; i < blocks; i++) {
  599: +            prefetch->sums[i].in_use = 1;
  600: +        }
  601: +        return 1;
  602: +    } else {
  603: +        // this should never be, abort
  604: +        PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
  605: +        checksum2_disable_prefetch();
  606: +    }
  607: +    return 0;
  608: +}
  609: +
  610: +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
  611: +{
  612: +    if (prefetch) {
  613: +        PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
  614: +        OFF_T last = prefetch->last;
  615: +        prefetch->last = prefetch_offset;
  616: +        if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
  617: +            // we're looking around trying to align blocks, prefetching will slow things down
  618: +            PREFETCH_PRINTF("get_checksum2 SEEK\n");
  619: +            checksum2_reset_prefetch();
  620: +        } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  621: +            // hit
  622: +            return;
  623: +        } else if (checksum2_perform_prefetch(prefetch_offset)) {
  624: +            if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  625: +                // hit; should always be as we just fetched this data
  626: +                return;
  627: +            } else {
  628: +                // this should never be, abort
  629: +                PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
  630: +                checksum2_disable_prefetch();
  631: +            }
  632: +        }
  633: +    }
  634: +    get_checksum2_nosimd(buf, len, sum, prefetch_offset);
  635: +}
  636: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
  637: +
  638: +} // "C"
  639: +
  640: +/* Benchmark compilation
  641: +
  642: +  The get_checksum1() benchmark runs through all available code paths in a
  643: +  single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
  644: +  recompiled for each code path (it always uses the fastest path available
  645: +  on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
  646: +  be used when applicable regardless of rsync being built with OpenSSL.
  647: +
  648: +  Something like the following should compile and run the benchmarks:
  649: +
  650: +  # if gcc
  651: +  export CC=gcc
  652: +  export CXX=g++
  653: +  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
  654: +
  655: +  # else if clang
  656: +  export CC=clang
  657: +  export CXX=clang++
  658: +  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
  659: +
  660: +  # /if
  661: +
  662: +  export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
  663: +  export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
  664: +  export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
  665: +  export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
  666: +
  667: +  rm bench_csum*
  668: +
  669: +  ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  670: +
  671: +  $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
  672: +
  673: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  674: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
  675: +
  676: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
  677: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
  678: +
  679: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
  680: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
  681: +
  682: +  ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  683: +
  684: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  685: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
  686: +
  687: +  ./bench_csum1.all
  688: +  ./bench_csum2.asm
  689: +  ./bench_csum2.openssl
  690: +  ./bench_csum2.sse2
  691: +  ./bench_csum2.avx2
  692: +
  693: + */
  694: +
  695: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  696:  #pragma clang optimize off
  697:  #pragma GCC push_options
  698:  #pragma GCC optimize ("O0")
  699: @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
  700:  #ifndef CLOCK_MONOTONIC_RAW
  701:  #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
  702:  #endif
  703: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  704:  
  705: +#ifdef BENCHMARK_SIMD_CHECKSUM1
  706:  static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
  707:      struct timespec start, end;
  708:      uint64_t us;
  709: @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
  710:      clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  711:      us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  712:      cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
  713: -    printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
  714: +    printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
  715:  }
  716:  
  717:  static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
  718: @@ -533,10 +781,108 @@ int main() {
  719:      free(buf);
  720:      return 0;
  721:  }
  722: +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
  723:  
  724: +#ifdef BENCHMARK_SIMD_CHECKSUM2
  725: +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
  726: +    struct timespec start, end;
  727: +    uint64_t us;
  728: +    unsigned char cs1[16];
  729: +    unsigned char cs2[16];
  730: +    int i;
  731: +
  732: +    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  733: +    for (i = 0; i < ROUNDS; i++) {
  734: +        func(buf, len, (char*)cs1);
  735: +    }
  736: +    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  737: +    us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  738: +
  739: +    func2(buf, len, (char*)cs2);
  740: +
  741: +    float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
  742: +    printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
  743: +    for (i = 0; i < 16; i++) {
  744: +        printf("%02x", cs1[i] & 0xFF);
  745: +    }
  746: +    printf(" :: ");
  747: +    for (i = 0; i < 16; i++) {
  748: +        printf("%02x", cs2[i] & 0xFF);
  749: +    }
  750: +    printf("\n");
  751: +}
  752: +
  753: +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
  754: +    // This should produce the same output for different optimizations
  755: +    // levels, not the same as sanity_check()
  756: +
  757: +    char* bufs[8] = {0};
  758: +    int lens[8] = {0};
  759: +    char* sums[8] = {0};
  760: +
  761: +    bufs[0] = buf;
  762: +    lens[0] = len;
  763: +    sums[0] = sum_out;
  764: +    md5_parallel(1, bufs, lens, sums, NULL, NULL);
  765: +}
  766: +
  767: +extern "C" {
  768: +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
  769: +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
  770: +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
  771: +}
  772: +
  773: +static void sanity_check(char* buf, int32 len, char* sum_out) {
  774: +    // This should produce the same output for different optimizations
  775: +    // levels, not the same as benchmark_inner()
  776: +    if (md5_parallel_slots() <= 1) {
  777: +        MD5P8_CTX m5p8;
  778: +        MD5P8_Init_c(&m5p8);
  779: +        MD5P8_Update_c(&m5p8, (uchar *)buf, len);
  780: +        MD5P8_Final_c((uchar *)sum_out, &m5p8);
  781: +    } else {
  782: +        MD5P8_CTX m5p8;
  783: +        MD5P8_Init(&m5p8);
  784: +        MD5P8_Update(&m5p8, (uchar *)buf, len);
  785: +        MD5P8_Final((uchar *)sum_out, &m5p8);
  786: +    }
  787: +}
  788: +
  789: +int main() {
  790: +    // This benchmarks the parallel MD5 checksum rather than get_checksum2()
  791: +    // as the latter would require compiling in a lot of rsync's code, but
  792: +    // it touches all the same internals so the performance should be nearly
  793: +    // identical.
  794: +
  795: +    int i;
  796: +    char* buf = (char*)malloc(BLOCK_LEN);
  797: +    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
  798: +
  799: +    const char* method = "?";
  800: +    switch (md5_parallel_slots()) {
  801: +        case 8: method = "AVX2"; break;
  802: +        case 4: method = "SSE2"; break;
  803: +#ifdef USE_OPENSSL
  804: +        case 1: method = "OpenSSL"; break;
  805: +#elif (CSUM_CHUNK == 64)
  806: +        case 1: method = "ASM"; break;
  807: +#else
  808: +        // this won't happen unless you modified code somewhere
  809: +        case 1: method = "Raw-C"; break;
  810: +#endif
  811: +    }
  812: +
  813: +    benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
  814: +
  815: +    free(buf);
  816: +    return 0;
  817: +}
  818: +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
  819: +
  820: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  821:  #pragma GCC pop_options
  822:  #pragma clang optimize on
  823: -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
  824: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  825:  
  826:  #endif /* HAVE_SIMD */
  827:  #endif /* __cplusplus */
  828: diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
  829: new file mode 100644
  830: --- /dev/null
  831: +++ b/simd-md5-parallel-x86_64.cpp
  832: @@ -0,0 +1,1138 @@
  833: +/*
  834: + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
  835: + *
  836: + * Original author: Nicolas Noble, 2017
  837: + * Modifications:   Jorrit Jongma, 2020
  838: + *
  839: + * The original code was released in the public domain by the original author,
  840: + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
  841: + * in case public domain does not apply in your country. These modifications
  842: + * are likewise released in the public domain, with the same MIT license
  843: + * fallback.
  844: + *
  845: + * The original publication can be found at:
  846: + *
  847: + * https://github.com/nicolasnoble/sse-hash
  848: + */
  849: +/*
  850: + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
  851: + * MD5 code has been removed and those code paths rerouted to use the MD5
  852: + * code already present in rsync, and wrapper functions have been added. The
  853: + * MD5P8 code is also new, and is the reason for the new stride parameter.
  854: + *
  855: + * This code allows multiple independent MD5 streams to be processed in
  856: + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
  857: + * lower than that of the original C routines for MD5, the processing of
  858: + * additional streams is "for free".
  859: + *
  860: + * Single streams are rerouted to rsync's normal MD5 code as that is faster
  861: + * for that case. A further optimization is possible by using SSE2 code on
  862: + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
  863: + * implemented here as it would require some restructuring, and in practice
  864: + * the code here is only rarely called with less than the maximum amount of
  865: + * streams (typically once at the end of each checksum2'd file).
  866: + *
  867: + * Benchmarks (in MB/s)            C     ASM  SSE2*1  SSE2*4  AVX2*1  AVX2*8
  868: + * - Intel Atom D2700            302     334     166     664     N/A     N/A
  869: + * - Intel i7-7700hq             351     376     289    1156     273    2184
  870: + * - AMD ThreadRipper 2950x      728     784     568    2272     430    3440
  871: + */
  872: +
  873: +#ifdef __x86_64__
  874: +#ifdef __cplusplus
  875: +
  876: +extern "C" {
  877: +
  878: +#include "rsync.h"
  879: +
  880: +}
  881: +
  882: +#ifdef HAVE_SIMD
  883: +
  884: +#ifndef BENCHMARK_SIMD_CHECKSUM2
  885: +#define PMD5_ALLOW_SSE2 // debugging
  886: +#define PMD5_ALLOW_AVX2 // debugging
  887: +#endif
  888: +
  889: +#ifdef PMD5_ALLOW_AVX2
  890: +#ifndef PMD5_ALLOW_SSE2
  891: +#define PMD5_ALLOW_SSE2
  892: +#endif
  893: +#endif
  894: +
  895: +#include <stdint.h>
  896: +#include <string.h>
  897: +
  898: +#include <immintrin.h>
  899: +
  900: +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
  901: +#ifdef __clang__
  902: +#define MVSTATIC
  903: +#else
  904: +#define MVSTATIC static
  905: +#endif
  906: +
  907: +// Missing from the headers on gcc 6 and older, clang 8 and older
  908: +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  909: +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
  910: +
  911: +#define PMD5_SLOTS_DEFAULT 0
  912: +#define PMD5_SLOTS_SSE2 4
  913: +#define PMD5_SLOTS_AVX2 8
  914: +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
  915: +
  916: +#ifdef PMD5_ALLOW_SSE2
  917: +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
  918: +{
  919: +    return PMD5_SLOTS_SSE2;
  920: +}
  921: +#endif
  922: +
  923: +#ifdef PMD5_ALLOW_AVX2
  924: +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
  925: +{
  926: +    return PMD5_SLOTS_AVX2;
  927: +}
  928: +#endif
  929: +
  930: +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
  931: +{
  932: +    return PMD5_SLOTS_DEFAULT;
  933: +}
  934: +
  935: +/* The parallel MD5 context structure. */
  936: +typedef struct {
  937: +    __m128i state_sse2[4];
  938: +    __m256i state_avx2[4];
  939: +    uint64_t len[PMD5_SLOTS_MAX];
  940: +} pmd5_context;
  941: +
  942: +/* The status returned by the various functions below. */
  943: +typedef enum {
  944: +    PMD5_SUCCESS,
  945: +    PMD5_INVALID_SLOT,
  946: +    PMD5_UNALIGNED_UPDATE,
  947: +} pmd5_status;
  948: +
  949: +/* Initializes all slots in the given pmd5 context. */
  950: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
  951: +
  952: +/* Initializes a single slot out in the given pmd5 context. */
  953: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
  954: +
  955: +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
  956: +   The stream pointers will be incremented accordingly.
  957: +   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
  958: +   The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
  959: +   Stride defaults to 64 if 0 is passed. */
  960: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
  961: +
  962: +/* Makes an MD5 update on all slots in parallel, given different lengths.
  963: +   The stream pointers will be incremented accordingly.
  964: +   The lengths will be decreased accordingly. Not all data might be consumed.
  965: +   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
  966: +   The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
  967: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
  968: +
  969: +/* Finishes all slots at once. Fills in all digests. */
  970: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
  971: +
  972: +/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
  973: +   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
  974: +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
  975: +
  976: +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
  977: +   multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
  978: +   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
  979: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
  980: +
  981: +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
  982: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
  983: +
  984: +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
  985: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
  986: +
  987: +#define S11  7
  988: +#define S12 12
  989: +#define S13 17
  990: +#define S14 22
  991: +#define S21  5
  992: +#define S22  9
  993: +#define S23 14
  994: +#define S24 20
  995: +#define S31  4
  996: +#define S32 11
  997: +#define S33 16
  998: +#define S34 23
  999: +#define S41  6
 1000: +#define S42 10
 1001: +#define S43 15
 1002: +#define S44 21
 1003: +
 1004: +#define T1  0xD76AA478
 1005: +#define T2  0xE8C7B756
 1006: +#define T3  0x242070DB
 1007: +#define T4  0xC1BDCEEE
 1008: +#define T5  0xF57C0FAF
 1009: +#define T6  0x4787C62A
 1010: +#define T7  0xA8304613
 1011: +#define T8  0xFD469501
 1012: +#define T9  0x698098D8
 1013: +#define T10 0x8B44F7AF
 1014: +#define T11 0xFFFF5BB1
 1015: +#define T12 0x895CD7BE
 1016: +#define T13 0x6B901122
 1017: +#define T14 0xFD987193
 1018: +#define T15 0xA679438E
 1019: +#define T16 0x49B40821
 1020: +#define T17 0xF61E2562
 1021: +#define T18 0xC040B340
 1022: +#define T19 0x265E5A51
 1023: +#define T20 0xE9B6C7AA
 1024: +#define T21 0xD62F105D
 1025: +#define T22 0x02441453
 1026: +#define T23 0xD8A1E681
 1027: +#define T24 0xE7D3FBC8
 1028: +#define T25 0x21E1CDE6
 1029: +#define T26 0xC33707D6
 1030: +#define T27 0xF4D50D87
 1031: +#define T28 0x455A14ED
 1032: +#define T29 0xA9E3E905
 1033: +#define T30 0xFCEFA3F8
 1034: +#define T31 0x676F02D9
 1035: +#define T32 0x8D2A4C8A
 1036: +#define T33 0xFFFA3942
 1037: +#define T34 0x8771F681
 1038: +#define T35 0x6D9D6122
 1039: +#define T36 0xFDE5380C
 1040: +#define T37 0xA4BEEA44
 1041: +#define T38 0x4BDECFA9
 1042: +#define T39 0xF6BB4B60
 1043: +#define T40 0xBEBFBC70
 1044: +#define T41 0x289B7EC6
 1045: +#define T42 0xEAA127FA
 1046: +#define T43 0xD4EF3085
 1047: +#define T44 0x04881D05
 1048: +#define T45 0xD9D4D039
 1049: +#define T46 0xE6DB99E5
 1050: +#define T47 0x1FA27CF8
 1051: +#define T48 0xC4AC5665
 1052: +#define T49 0xF4292244
 1053: +#define T50 0x432AFF97
 1054: +#define T51 0xAB9423A7
 1055: +#define T52 0xFC93A039
 1056: +#define T53 0x655B59C3
 1057: +#define T54 0x8F0CCC92
 1058: +#define T55 0xFFEFF47D
 1059: +#define T56 0x85845DD1
 1060: +#define T57 0x6FA87E4F
 1061: +#define T58 0xFE2CE6E0
 1062: +#define T59 0xA3014314
 1063: +#define T60 0x4E0811A1
 1064: +#define T61 0xF7537E82
 1065: +#define T62 0xBD3AF235
 1066: +#define T63 0x2AD7D2BB
 1067: +#define T64 0xEB86D391
 1068: +
 1069: +#define ROTL_SSE2(x, n) { \
 1070: +    __m128i s; \
 1071: +    s = _mm_srli_epi32(x, 32 - n); \
 1072: +    x = _mm_slli_epi32(x, n); \
 1073: +    x = _mm_or_si128(x, s); \
 1074: +};
 1075: +
 1076: +#define ROTL_AVX2(x, n) { \
 1077: +    __m256i s; \
 1078: +    s = _mm256_srli_epi32(x, 32 - n); \
 1079: +    x = _mm256_slli_epi32(x, n); \
 1080: +    x = _mm256_or_si256(x, s); \
 1081: +};
 1082: +
 1083: +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
 1084: +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
 1085: +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
 1086: +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
 1087: +
 1088: +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
 1089: +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
 1090: +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
 1091: +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
 1092: +
 1093: +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
 1094: +    a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
 1095: +    ROTL_SSE2(a, s); \
 1096: +    a = _mm_add_epi32(a, b); \
 1097: +}
 1098: +
 1099: +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
 1100: +    a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
 1101: +    ROTL_AVX2(a, s); \
 1102: +    a = _mm256_add_epi32(a, b); \
 1103: +}
 1104: +
 1105: +#define IA 0x67452301
 1106: +#define IB 0xefcdab89
 1107: +#define IC 0x98badcfe
 1108: +#define ID 0x10325476
 1109: +
 1110: +#define GET_MD5_DATA(dest, src, pos)         \
 1111: +    dest =                                   \
 1112: +        ((uint32_t) src[pos + 0]) <<  0 |    \
 1113: +        ((uint32_t) src[pos + 1]) <<  8 |    \
 1114: +        ((uint32_t) src[pos + 2]) << 16 |    \
 1115: +        ((uint32_t) src[pos + 3]) << 24
 1116: +
 1117: +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
 1118: +    uint32_t v0, v1, v2, v3;                 \
 1119: +    GET_MD5_DATA(v0, src[0], pos);           \
 1120: +    GET_MD5_DATA(v1, src[1], pos);           \
 1121: +    GET_MD5_DATA(v2, src[2], pos);           \
 1122: +    GET_MD5_DATA(v3, src[3], pos);           \
 1123: +    dest = _mm_setr_epi32(v0, v1, v2, v3);   \
 1124: +}
 1125: +
 1126: +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
 1127: +    uint32_t v0, v1, v2, v3;                 \
 1128: +    uint32_t v4, v5, v6, v7;                 \
 1129: +    GET_MD5_DATA(v0, src[0], pos);           \
 1130: +    GET_MD5_DATA(v1, src[1], pos);           \
 1131: +    GET_MD5_DATA(v2, src[2], pos);           \
 1132: +    GET_MD5_DATA(v3, src[3], pos);           \
 1133: +    GET_MD5_DATA(v4, src[4], pos);           \
 1134: +    GET_MD5_DATA(v5, src[5], pos);           \
 1135: +    GET_MD5_DATA(v6, src[6], pos);           \
 1136: +    GET_MD5_DATA(v7, src[7], pos);           \
 1137: +    dest = _mm256_setr_epi32(v0, v1, v2, v3, \
 1138: +                          v4, v5, v6, v7);   \
 1139: +}
 1140: +
 1141: +#define PUT_MD5_DATA(dest, val, pos) {       \
 1142: +    dest[pos + 0] = (val >>  0) & 0xff;      \
 1143: +    dest[pos + 1] = (val >>  8) & 0xff;      \
 1144: +    dest[pos + 2] = (val >> 16) & 0xff;      \
 1145: +    dest[pos + 3] = (val >> 24) & 0xff;      \
 1146: +}
 1147: +
 1148: +const static uint8_t md5_padding[64] = {
 1149: +    0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1150: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1151: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1152: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1153: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1154: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1155: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1156: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1157: +};
 1158: +
 1159: +#ifdef PMD5_ALLOW_SSE2
 1160: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1161: +{
 1162: +    int i;
 1163: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1164: +        ctx->len[i] = 0;
 1165: +    }
 1166: +
 1167: +    ctx->state_sse2[0] = _mm_set1_epi32(IA);
 1168: +    ctx->state_sse2[1] = _mm_set1_epi32(IB);
 1169: +    ctx->state_sse2[2] = _mm_set1_epi32(IC);
 1170: +    ctx->state_sse2[3] = _mm_set1_epi32(ID);
 1171: +
 1172: +    return PMD5_SUCCESS;
 1173: +}
 1174: +#endif
 1175: +
 1176: +#ifdef PMD5_ALLOW_AVX2
 1177: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1178: +{
 1179: +    int i;
 1180: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1181: +        ctx->len[i] = 0;
 1182: +    }
 1183: +
 1184: +    ctx->state_avx2[0] = _mm256_set1_epi32(IA);
 1185: +    ctx->state_avx2[1] = _mm256_set1_epi32(IB);
 1186: +    ctx->state_avx2[2] = _mm256_set1_epi32(IC);
 1187: +    ctx->state_avx2[3] = _mm256_set1_epi32(ID);
 1188: +
 1189: +    return PMD5_SUCCESS;
 1190: +}
 1191: +#endif
 1192: +
 1193: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1194: +{
 1195: +    return PMD5_INVALID_SLOT;
 1196: +}
 1197: +
 1198: +#ifdef PMD5_ALLOW_SSE2
 1199: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1200: +{
 1201: +    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
 1202: +        return PMD5_INVALID_SLOT;
 1203: +
 1204: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
 1205: +    int i;
 1206: +
 1207: +    for (i = 0; i < 4; i++) {
 1208: +        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
 1209: +    }
 1210: +
 1211: +    v[0][slot] = a;
 1212: +    v[1][slot] = b;
 1213: +    v[2][slot] = c;
 1214: +    v[3][slot] = d;
 1215: +
 1216: +    for (i = 0; i < 4; i++) {
 1217: +        ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
 1218: +    }
 1219: +
 1220: +    return PMD5_SUCCESS;
 1221: +}
 1222: +#endif
 1223: +
 1224: +#ifdef PMD5_ALLOW_AVX2
 1225: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1226: +{
 1227: +    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
 1228: +        return PMD5_INVALID_SLOT;
 1229: +
 1230: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
 1231: +    int i;
 1232: +
 1233: +    for (i = 0; i < 4; i++) {
 1234: +        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
 1235: +    }
 1236: +
 1237: +    v[0][slot] = a;
 1238: +    v[1][slot] = b;
 1239: +    v[2][slot] = c;
 1240: +    v[3][slot] = d;
 1241: +
 1242: +    for (i = 0; i < 4; i++) {
 1243: +        ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
 1244: +    }
 1245: +
 1246: +    return PMD5_SUCCESS;
 1247: +}
 1248: +#endif
 1249: +
 1250: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1251: +{
 1252: +    return PMD5_INVALID_SLOT;
 1253: +}
 1254: +
 1255: +#ifdef PMD5_ALLOW_SSE2
 1256: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1257: +{
 1258: +    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
 1259: +        return PMD5_INVALID_SLOT;
 1260: +
 1261: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
 1262: +    int i;
 1263: +
 1264: +    for (i = 0; i < 4; i++) {
 1265: +        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
 1266: +    }
 1267: +
 1268: +    *a = v[0][slot];
 1269: +    *b = v[1][slot];
 1270: +    *c = v[2][slot];
 1271: +    *d = v[3][slot];
 1272: +
 1273: +    return PMD5_SUCCESS;
 1274: +}
 1275: +#endif
 1276: +
 1277: +#ifdef PMD5_ALLOW_AVX2
 1278: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1279: +{
 1280: +    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
 1281: +        return PMD5_INVALID_SLOT;
 1282: +
 1283: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
 1284: +    int i;
 1285: +
 1286: +    for (i = 0; i < 4; i++) {
 1287: +        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
 1288: +    }
 1289: +
 1290: +    *a = v[0][slot];
 1291: +    *b = v[1][slot];
 1292: +    *c = v[2][slot];
 1293: +    *d = v[3][slot];
 1294: +
 1295: +    return PMD5_SUCCESS;
 1296: +}
 1297: +#endif
 1298: +
 1299: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1300: +{
 1301: +    return PMD5_INVALID_SLOT;
 1302: +}
 1303: +
 1304: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
 1305: +{
 1306: +    return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
 1307: +}
 1308: +
 1309: +#ifdef PMD5_ALLOW_SSE2
 1310: +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1311: +{
 1312: +    __m128i W[MD5_DIGEST_LEN], a, b, c, d;
 1313: +
 1314: +    GET_PMD5_DATA_SSE2(W[ 0], data,  0);
 1315: +    GET_PMD5_DATA_SSE2(W[ 1], data,  4);
 1316: +    GET_PMD5_DATA_SSE2(W[ 2], data,  8);
 1317: +    GET_PMD5_DATA_SSE2(W[ 3], data, 12);
 1318: +    GET_PMD5_DATA_SSE2(W[ 4], data, 16);
 1319: +    GET_PMD5_DATA_SSE2(W[ 5], data, 20);
 1320: +    GET_PMD5_DATA_SSE2(W[ 6], data, 24);
 1321: +    GET_PMD5_DATA_SSE2(W[ 7], data, 28);
 1322: +    GET_PMD5_DATA_SSE2(W[ 8], data, 32);
 1323: +    GET_PMD5_DATA_SSE2(W[ 9], data, 36);
 1324: +    GET_PMD5_DATA_SSE2(W[10], data, 40);
 1325: +    GET_PMD5_DATA_SSE2(W[11], data, 44);
 1326: +    GET_PMD5_DATA_SSE2(W[12], data, 48);
 1327: +    GET_PMD5_DATA_SSE2(W[13], data, 52);
 1328: +    GET_PMD5_DATA_SSE2(W[14], data, 56);
 1329: +    GET_PMD5_DATA_SSE2(W[15], data, 60);
 1330: +
 1331: +    a = ctx->state_sse2[0];
 1332: +    b = ctx->state_sse2[1];
 1333: +    c = ctx->state_sse2[2];
 1334: +    d = ctx->state_sse2[3];
 1335: +
 1336: +    SET_SSE2(F, a, b, c, d, W[ 0], S11,  1);
 1337: +    SET_SSE2(F, d, a, b, c, W[ 1], S12,  2);
 1338: +    SET_SSE2(F, c, d, a, b, W[ 2], S13,  3);
 1339: +    SET_SSE2(F, b, c, d, a, W[ 3], S14,  4);
 1340: +    SET_SSE2(F, a, b, c, d, W[ 4], S11,  5);
 1341: +    SET_SSE2(F, d, a, b, c, W[ 5], S12,  6);
 1342: +    SET_SSE2(F, c, d, a, b, W[ 6], S13,  7);
 1343: +    SET_SSE2(F, b, c, d, a, W[ 7], S14,  8);
 1344: +    SET_SSE2(F, a, b, c, d, W[ 8], S11,  9);
 1345: +    SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
 1346: +    SET_SSE2(F, c, d, a, b, W[10], S13, 11);
 1347: +    SET_SSE2(F, b, c, d, a, W[11], S14, 12);
 1348: +    SET_SSE2(F, a, b, c, d, W[12], S11, 13);
 1349: +    SET_SSE2(F, d, a, b, c, W[13], S12, 14);
 1350: +    SET_SSE2(F, c, d, a, b, W[14], S13, 15);
 1351: +    SET_SSE2(F, b, c, d, a, W[15], S14, 16);
 1352: +
 1353: +    SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
 1354: +    SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
 1355: +    SET_SSE2(G, c, d, a, b, W[11], S23, 19);
 1356: +    SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
 1357: +    SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
 1358: +    SET_SSE2(G, d, a, b, c, W[10], S22, 22);
 1359: +    SET_SSE2(G, c, d, a, b, W[15], S23, 23);
 1360: +    SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
 1361: +    SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
 1362: +    SET_SSE2(G, d, a, b, c, W[14], S22, 26);
 1363: +    SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
 1364: +    SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
 1365: +    SET_SSE2(G, a, b, c, d, W[13], S21, 29);
 1366: +    SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
 1367: +    SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
 1368: +    SET_SSE2(G, b, c, d, a, W[12], S24, 32);
 1369: +
 1370: +    SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
 1371: +    SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
 1372: +    SET_SSE2(H, c, d, a, b, W[11], S33, 35);
 1373: +    SET_SSE2(H, b, c, d, a, W[14], S34, 36);
 1374: +    SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
 1375: +    SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
 1376: +    SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
 1377: +    SET_SSE2(H, b, c, d, a, W[10], S34, 40);
 1378: +    SET_SSE2(H, a, b, c, d, W[13], S31, 41);
 1379: +    SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
 1380: +    SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
 1381: +    SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
 1382: +    SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
 1383: +    SET_SSE2(H, d, a, b, c, W[12], S32, 46);
 1384: +    SET_SSE2(H, c, d, a, b, W[15], S33, 47);
 1385: +    SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
 1386: +
 1387: +    SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
 1388: +    SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
 1389: +    SET_SSE2(I, c, d, a, b, W[14], S43, 51);
 1390: +    SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
 1391: +    SET_SSE2(I, a, b, c, d, W[12], S41, 53);
 1392: +    SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
 1393: +    SET_SSE2(I, c, d, a, b, W[10], S43, 55);
 1394: +    SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
 1395: +    SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
 1396: +    SET_SSE2(I, d, a, b, c, W[15], S42, 58);
 1397: +    SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
 1398: +    SET_SSE2(I, b, c, d, a, W[13], S44, 60);
 1399: +    SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
 1400: +    SET_SSE2(I, d, a, b, c, W[11], S42, 62);
 1401: +    SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
 1402: +    SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
 1403: +
 1404: +    ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
 1405: +    ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
 1406: +    ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
 1407: +    ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
 1408: +}
 1409: +#endif
 1410: +
 1411: +#ifdef PMD5_ALLOW_AVX2
 1412: +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1413: +{
 1414: +    __m256i W[MD5_DIGEST_LEN], a, b, c, d;
 1415: +
 1416: +    GET_PMD5_DATA_AVX2(W[ 0], data,  0);
 1417: +    GET_PMD5_DATA_AVX2(W[ 1], data,  4);
 1418: +    GET_PMD5_DATA_AVX2(W[ 2], data,  8);
 1419: +    GET_PMD5_DATA_AVX2(W[ 3], data, 12);
 1420: +    GET_PMD5_DATA_AVX2(W[ 4], data, 16);
 1421: +    GET_PMD5_DATA_AVX2(W[ 5], data, 20);
 1422: +    GET_PMD5_DATA_AVX2(W[ 6], data, 24);
 1423: +    GET_PMD5_DATA_AVX2(W[ 7], data, 28);
 1424: +    GET_PMD5_DATA_AVX2(W[ 8], data, 32);
 1425: +    GET_PMD5_DATA_AVX2(W[ 9], data, 36);
 1426: +    GET_PMD5_DATA_AVX2(W[10], data, 40);
 1427: +    GET_PMD5_DATA_AVX2(W[11], data, 44);
 1428: +    GET_PMD5_DATA_AVX2(W[12], data, 48);
 1429: +    GET_PMD5_DATA_AVX2(W[13], data, 52);
 1430: +    GET_PMD5_DATA_AVX2(W[14], data, 56);
 1431: +    GET_PMD5_DATA_AVX2(W[15], data, 60);
 1432: +
 1433: +    a = ctx->state_avx2[0];
 1434: +    b = ctx->state_avx2[1];
 1435: +    c = ctx->state_avx2[2];
 1436: +    d = ctx->state_avx2[3];
 1437: +
 1438: +    SET_AVX2(F, a, b, c, d, W[ 0], S11,  1);
 1439: +    SET_AVX2(F, d, a, b, c, W[ 1], S12,  2);
 1440: +    SET_AVX2(F, c, d, a, b, W[ 2], S13,  3);
 1441: +    SET_AVX2(F, b, c, d, a, W[ 3], S14,  4);
 1442: +    SET_AVX2(F, a, b, c, d, W[ 4], S11,  5);
 1443: +    SET_AVX2(F, d, a, b, c, W[ 5], S12,  6);
 1444: +    SET_AVX2(F, c, d, a, b, W[ 6], S13,  7);
 1445: +    SET_AVX2(F, b, c, d, a, W[ 7], S14,  8);
 1446: +    SET_AVX2(F, a, b, c, d, W[ 8], S11,  9);
 1447: +    SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
 1448: +    SET_AVX2(F, c, d, a, b, W[10], S13, 11);
 1449: +    SET_AVX2(F, b, c, d, a, W[11], S14, 12);
 1450: +    SET_AVX2(F, a, b, c, d, W[12], S11, 13);
 1451: +    SET_AVX2(F, d, a, b, c, W[13], S12, 14);
 1452: +    SET_AVX2(F, c, d, a, b, W[14], S13, 15);
 1453: +    SET_AVX2(F, b, c, d, a, W[15], S14, 16);
 1454: +
 1455: +    SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
 1456: +    SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
 1457: +    SET_AVX2(G, c, d, a, b, W[11], S23, 19);
 1458: +    SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
 1459: +    SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
 1460: +    SET_AVX2(G, d, a, b, c, W[10], S22, 22);
 1461: +    SET_AVX2(G, c, d, a, b, W[15], S23, 23);
 1462: +    SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
 1463: +    SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
 1464: +    SET_AVX2(G, d, a, b, c, W[14], S22, 26);
 1465: +    SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
 1466: +    SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
 1467: +    SET_AVX2(G, a, b, c, d, W[13], S21, 29);
 1468: +    SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
 1469: +    SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
 1470: +    SET_AVX2(G, b, c, d, a, W[12], S24, 32);
 1471: +
 1472: +    SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
 1473: +    SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
 1474: +    SET_AVX2(H, c, d, a, b, W[11], S33, 35);
 1475: +    SET_AVX2(H, b, c, d, a, W[14], S34, 36);
 1476: +    SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
 1477: +    SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
 1478: +    SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
 1479: +    SET_AVX2(H, b, c, d, a, W[10], S34, 40);
 1480: +    SET_AVX2(H, a, b, c, d, W[13], S31, 41);
 1481: +    SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
 1482: +    SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
 1483: +    SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
 1484: +    SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
 1485: +    SET_AVX2(H, d, a, b, c, W[12], S32, 46);
 1486: +    SET_AVX2(H, c, d, a, b, W[15], S33, 47);
 1487: +    SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
 1488: +
 1489: +    SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
 1490: +    SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
 1491: +    SET_AVX2(I, c, d, a, b, W[14], S43, 51);
 1492: +    SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
 1493: +    SET_AVX2(I, a, b, c, d, W[12], S41, 53);
 1494: +    SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
 1495: +    SET_AVX2(I, c, d, a, b, W[10], S43, 55);
 1496: +    SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
 1497: +    SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
 1498: +    SET_AVX2(I, d, a, b, c, W[15], S42, 58);
 1499: +    SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
 1500: +    SET_AVX2(I, b, c, d, a, W[13], S44, 60);
 1501: +    SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
 1502: +    SET_AVX2(I, d, a, b, c, W[11], S42, 62);
 1503: +    SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
 1504: +    SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
 1505: +
 1506: +    ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
 1507: +    ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
 1508: +    ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
 1509: +    ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
 1510: +}
 1511: +#endif
 1512: +
 1513: +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1514: +{
 1515: +}
 1516: +
 1517: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
 1518: +{
 1519: +    const uint8_t * ptrs[PMD5_SLOTS_MAX];
 1520: +
 1521: +    if (!length) return PMD5_SUCCESS;
 1522: +
 1523: +    int slots = pmd5_slots();
 1524: +
 1525: +    if (!stride) stride = 64;
 1526: +
 1527: +    int i;
 1528: +    for (i = 0; i < slots; i++) {
 1529: +        ptrs[i] = data[i];
 1530: +        ctx->len[i] += length;
 1531: +        if (!ptrs[i]) ptrs[i] = md5_padding;
 1532: +    }
 1533: +
 1534: +    while (length >= 64) {
 1535: +        pmd5_process(ctx, ptrs);
 1536: +        length -= 64;
 1537: +        for (i = 0; i < slots; i++) {
 1538: +            if (data[i]) ptrs[i] += stride;
 1539: +        }
 1540: +    }
 1541: +
 1542: +    if (length) return PMD5_UNALIGNED_UPDATE;
 1543: +
 1544: +    for (i = 0; i < slots; i++) {
 1545: +        if (data[i]) data[i] = ptrs[i];
 1546: +    }
 1547: +
 1548: +    return PMD5_SUCCESS;
 1549: +}
 1550: +
 1551: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
 1552: +{
 1553: +    uint64_t length = 0;
 1554: +    int slots = pmd5_slots();
 1555: +
 1556: +    int i;
 1557: +    for (i = 0; i < slots; i++) {
 1558: +        if ((length == 0) || (lengths[i] < length)) length = lengths[i];
 1559: +    }
 1560: +
 1561: +    for (i = 0; i < slots; i++) {
 1562: +        lengths[i] -= length;
 1563: +    }
 1564: +
 1565: +    return pmd5_update_all_simple(ctx, data, length, 0);
 1566: +}
 1567: +
 1568: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
 1569: +{
 1570: +    MD5_CTX ctx;
 1571: +
 1572: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1573: +        return PMD5_INVALID_SLOT;
 1574: +
 1575: +    pmd5_to_md5(pctx, &ctx, slot);
 1576: +    if (data && length) {
 1577: +        MD5_Update(&ctx, data, length);
 1578: +    }
 1579: +    MD5_Final(digest, &ctx);
 1580: +
 1581: +    return PMD5_SUCCESS;
 1582: +}
 1583: +
 1584: +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
 1585: +{
 1586: +    return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
 1587: +}
 1588: +
 1589: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
 1590: +{
 1591: +    int i;
 1592: +    for (i = 0; i < pmd5_slots(); i++) {
 1593: +        pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
 1594: +    }
 1595: +    return PMD5_SUCCESS;
 1596: +}
 1597: +
 1598: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
 1599: +{
 1600: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1601: +        return PMD5_INVALID_SLOT;
 1602: +
 1603: +    // TODO: This function ignores buffered but not-yet-hashed data. We're not using this function; just noting.
 1604: +
 1605: +#ifdef USE_OPENSSL
 1606: +    pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
 1607: +#else
 1608: +    pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
 1609: +#endif
 1610: +    return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
 1611: +}
 1612: +
 1613: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
 1614: +{
 1615: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1616: +        return PMD5_INVALID_SLOT;
 1617: +
 1618: +    MD5_Init(ctx);
 1619: +
 1620: +#ifdef USE_OPENSSL
 1621: +    ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
 1622: +    ctx->Nh = pctx->len[slot] >> 29;
 1623: +
 1624: +    uint32_t a, b, c, d;
 1625: +    pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
 1626: +    if (ret == PMD5_SUCCESS) {
 1627: +        ctx->A = a;
 1628: +        ctx->B = b;
 1629: +        ctx->C = c;
 1630: +        ctx->D = d;
 1631: +    }
 1632: +    return ret;
 1633: +#else
 1634: +    ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
 1635: +    ctx->totalN2 = pctx->len[slot] >> 32;
 1636: +    return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
 1637: +#endif
 1638: +}
 1639: +
 1640: +/* With GCC 10, putting these implementations inside 'extern "C"' causes an
 1641: +   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 1642: +  */
 1643: +
 1644: +static inline int md5_parallel_slots_cpp()
 1645: +{
 1646: +    int slots = pmd5_slots();
 1647: +    if (slots == 0) return 1;
 1648: +    return slots;
 1649: +}
 1650: +
 1651: +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
 1652: +{
 1653: +    int slots = md5_parallel_slots_cpp();
 1654: +    if ((streams < 1) || (streams > slots)) return 0;
 1655: +    if (pre4 && post4) return 0;
 1656: +
 1657: +    if (slots == 1) {
 1658: +        MD5_CTX ctx;
 1659: +        MD5_Init(&ctx);
 1660: +        if (pre4) {
 1661: +            MD5_Update(&ctx, (const unsigned char*)pre4, 4);
 1662: +        }
 1663: +        MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
 1664: +        if (post4) {
 1665: +            MD5_Update(&ctx, (const unsigned char*)post4, 4);
 1666: +        }
 1667: +        if (sum[0]) {
 1668: +            MD5_Final((uint8_t*)sum[0], &ctx);
 1669: +        }
 1670: +        return 0;
 1671: +    }
 1672: +
 1673: +    int i;
 1674: +    int active[PMD5_SLOTS_MAX];
 1675: +    char* buffers[PMD5_SLOTS_MAX];
 1676: +    uint64_t left[PMD5_SLOTS_MAX];
 1677: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1678: +        active[i] = streams > i;
 1679: +        if (i < streams) {
 1680: +            buffers[i] = buf[i];
 1681: +            left[i] = (uint64_t)len[i];
 1682: +        } else {
 1683: +            buffers[i] = NULL;
 1684: +            left[i] = 0;
 1685: +        }
 1686: +    }
 1687: +    MD5_CTX results[PMD5_SLOTS_MAX];
 1688: +
 1689: +    pmd5_context ctx_simd;
 1690: +    if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
 1691: +
 1692: +    if (pre4) {
 1693: +        char temp_buffers[PMD5_SLOTS_MAX][64];
 1694: +        int have_any = 0;
 1695: +        for (i = 0; i < slots; i++) {
 1696: +            if (active[i]) {
 1697: +                if (left[i] < 60) {
 1698: +                    MD5_Init(&results[i]);
 1699: +                    MD5_Update(&results[i], (const unsigned char*)pre4, 4);
 1700: +                    MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
 1701: +                    active[i] = 0;
 1702: +                    left[i] = 0;
 1703: +                } else {
 1704: +                    memcpy(temp_buffers[i], pre4, 4);
 1705: +                    memcpy(temp_buffers[i] + 4, buffers[i], 60);
 1706: +                    buffers[i] += 60;
 1707: +                    left[i] -= 60;
 1708: +                    have_any = 1;
 1709: +                }
 1710: +            }
 1711: +        }
 1712: +
 1713: +        if (have_any) {
 1714: +            char* ptrs[PMD5_SLOTS_MAX];
 1715: +            for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1716: +                ptrs[i] = &temp_buffers[i][0];
 1717: +            }
 1718: +            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
 1719: +                return 0;
 1720: +            }
 1721: +        }
 1722: +    }
 1723: +
 1724: +    int failed = 0;
 1725: +    while (true) {
 1726: +        for (i = 0; i < slots; i++) {
 1727: +            if (active[i] && (left[i] < 64)) {
 1728: +                if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
 1729: +                    failed = 1;
 1730: +                }
 1731: +                active[i] = 0;
 1732: +            }
 1733: +        }
 1734: +
 1735: +        uint64_t shortest = 0;
 1736: +        for (i = 0; i < slots; i++) {
 1737: +            if (!active[i]) {
 1738: +                buffers[i] = NULL;
 1739: +            } else if ((shortest == 0) || (left[i] < shortest)) {
 1740: +                shortest = left[i];
 1741: +            }
 1742: +        }
 1743: +
 1744: +        if (shortest > 0) {
 1745: +            shortest = shortest & ~63;
 1746: +            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
 1747: +                failed = 1;
 1748: +            }
 1749: +            for (i = 0; i < slots; i++) {
 1750: +                if (active[i]) {
 1751: +                    left[i] -= shortest;
 1752: +                }
 1753: +            }
 1754: +        }
 1755: +
 1756: +        if (failed) {
 1757: +            return 0;
 1758: +        } else {
 1759: +            int have_any = 0;
 1760: +            for (i = 0; i < slots; i++) {
 1761: +                have_any |= active[i];
 1762: +            }
 1763: +            if (!have_any) {
 1764: +                break;
 1765: +            }
 1766: +        }
 1767: +    }
 1768: +
 1769: +    for (i = 0; i < slots; i++) {
 1770: +        if (i < streams) {
 1771: +            if (left[i] > 0) {
 1772: +                // buffers[i] == NULL here
 1773: +                MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
 1774: +            }
 1775: +            if (post4) {
 1776: +                MD5_Update(&results[i], (const unsigned char*)post4, 4);
 1777: +            }
 1778: +            if (sum[i]) {
 1779: +                MD5_Final((uint8_t*)sum[i], &results[i]);
 1780: +            }
 1781: +        }
 1782: +    }
 1783: +
 1784: +    return 1;
 1785: +}
 1786: +
 1787: +// each pmd5_context needs to be 32-byte aligned
 1788: +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
 1789: +
 1790: +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
 1791: +{
 1792: +    int i;
 1793: +    for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
 1794: +        pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
 1795: +    }
 1796: +    ctx->used = 0;
 1797: +    ctx->next = 0;
 1798: +}
 1799: +
 1800: +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
 1801: +{
 1802: +    int slots = pmd5_slots();
 1803: +    uint32 pos = 0;
 1804: +
 1805: +    if ((ctx->used) || (length < 512)) {
 1806: +        int cpy = MIN(length, 512 - ctx->used);
 1807: +        memcpy(&ctx->buffer[ctx->used], input, cpy);
 1808: +        ctx->used += cpy;
 1809: +        length -= cpy;
 1810: +        pos += cpy;
 1811: +
 1812: +        if (ctx->used == 512) {
 1813: +            if (slots == PMD5_SLOTS_AVX2) {
 1814: +                const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
 1815: +                    (uint8_t*)ctx->buffer,
 1816: +                    (uint8_t*)(ctx->buffer + 64),
 1817: +                    (uint8_t*)(ctx->buffer + 128),
 1818: +                    (uint8_t*)(ctx->buffer + 192),
 1819: +                    (uint8_t*)(ctx->buffer + 256),
 1820: +                    (uint8_t*)(ctx->buffer + 320),
 1821: +                    (uint8_t*)(ctx->buffer + 384),
 1822: +                    (uint8_t*)(ctx->buffer + 448)
 1823: +                };
 1824: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
 1825: +            } else {
 1826: +                const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
 1827: +                    (uint8_t*)ctx->buffer,
 1828: +                    (uint8_t*)(ctx->buffer + 64),
 1829: +                    (uint8_t*)(ctx->buffer + 128),
 1830: +                    (uint8_t*)(ctx->buffer + 192)
 1831: +                };
 1832: +                const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
 1833: +                    (uint8_t*)(ctx->buffer + 256),
 1834: +                    (uint8_t*)(ctx->buffer + 320),
 1835: +                    (uint8_t*)(ctx->buffer + 384),
 1836: +                    (uint8_t*)(ctx->buffer + 448)
 1837: +                };
 1838: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
 1839: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
 1840: +            }
 1841: +            ctx->used = 0;
 1842: +        }
 1843: +    }
 1844: +
 1845: +    if (length >= 512) {
 1846: +        uint32 blocks = length / 512;
 1847: +        if (slots == PMD5_SLOTS_AVX2) {
 1848: +            const uint8_t* ptrs[8] = {
 1849: +                (uint8_t*)(input + pos),
 1850: +                (uint8_t*)(input + pos + 64),
 1851: +                (uint8_t*)(input + pos + 128),
 1852: +                (uint8_t*)(input + pos + 192),
 1853: +                (uint8_t*)(input + pos + 256),
 1854: +                (uint8_t*)(input + pos + 320),
 1855: +                (uint8_t*)(input + pos + 384),
 1856: +                (uint8_t*)(input + pos + 448)
 1857: +            };
 1858: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
 1859: +        } else {
 1860: +            const uint8_t* ptrs1[4] = {
 1861: +                (uint8_t*)(input + pos),
 1862: +                (uint8_t*)(input + pos + 64),
 1863: +                (uint8_t*)(input + pos + 128),
 1864: +                (uint8_t*)(input + pos + 192)
 1865: +            };
 1866: +            const uint8_t* ptrs2[4] = {
 1867: +                (uint8_t*)(input + pos + 256),
 1868: +                (uint8_t*)(input + pos + 320),
 1869: +                (uint8_t*)(input + pos + 384),
 1870: +                (uint8_t*)(input + pos + 448)
 1871: +            };
 1872: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
 1873: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
 1874: +        }
 1875: +        pos += blocks * 512;
 1876: +        length -= blocks * 512;
 1877: +    }
 1878: +
 1879: +    if (length) {
 1880: +        memcpy(ctx->buffer, &input[pos], length);
 1881: +        ctx->used = length;
 1882: +    }
 1883: +}
 1884: +
 1885: +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1886: +{
 1887: +    int i;
 1888: +    uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
 1889: +    if (ctx->used) {
 1890: +        uchar tmp[512];
 1891: +        memset(tmp, 0, 512);
 1892: +        MD5P8_Update(ctx, tmp, 512 - ctx->used);
 1893: +    }
 1894: +
 1895: +    uchar state[34*4] = {0};
 1896: +
 1897: +    MD5_CTX tmp;
 1898: +    for (i = 0; i < 8; i++) {
 1899: +        if (pmd5_slots() == PMD5_SLOTS_AVX2) {
 1900: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1901: +        } else if (i < 4) {
 1902: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1903: +        } else {
 1904: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
 1905: +        }
 1906: +#ifdef USE_OPENSSL
 1907: +        if (low + tmp.Nl < low) high++;
 1908: +        low += tmp.Nl;
 1909: +        high += tmp.Nh;
 1910: +#else
 1911: +        if (low + tmp.totalN < low) high++;
 1912: +        low += tmp.totalN;
 1913: +        high += tmp.totalN2;
 1914: +#endif
 1915: +        SIVALu(state, i*16, tmp.A);
 1916: +        SIVALu(state, i*16 + 4, tmp.B);
 1917: +        SIVALu(state, i*16 + 8, tmp.C);
 1918: +        SIVALu(state, i*16 + 12, tmp.D);
 1919: +    }
 1920: +
 1921: +#ifndef USE_OPENSSL
 1922: +    high = (low >> 29) | (high << 3);
 1923: +    low = (low << 3);
 1924: +#endif
 1925: +
 1926: +    sub <<= 3;
 1927: +    if (low - sub > low) high--;
 1928: +    low -= sub;
 1929: +
 1930: +    SIVALu(state, 32*4, low);
 1931: +    SIVALu(state, 33*4, high);
 1932: +
 1933: +    MD5_CTX md;
 1934: +    MD5_Init(&md);
 1935: +    MD5_Update(&md, state, 34*4);
 1936: +    MD5_Final(digest, &md);
 1937: +}
 1938: +
 1939: +extern "C" {
 1940: +
 1941: +int md5_parallel_slots()
 1942: +{
 1943: +    return md5_parallel_slots_cpp();
 1944: +}
 1945: +
 1946: +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
 1947: +{
 1948: +    return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
 1949: +}
 1950: +
 1951: +void MD5P8_Init(MD5P8_CTX *ctx)
 1952: +{
 1953: +    MD5P8_Init_cpp(ctx);
 1954: +}
 1955: +
 1956: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
 1957: +{
 1958: +    MD5P8_Update_cpp(ctx, input, length);
 1959: +}
 1960: +
 1961: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1962: +{
 1963: +    MD5P8_Final_cpp(digest, ctx);
 1964: +}
 1965: +
 1966: +} // "C"
 1967: +
 1968: +#endif /* HAVE_SIMD */
 1969: +#endif /* __cplusplus */
 1970: +#endif /* __x86_64__ */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>