File: embedaddon/rsync/patches/md5p8.diff (ELWIX - Embedded LightWeight unIX)
Revision 1.1.1.1 (vendor branch)
Wed Mar 17 00:32:36 2021 UTC by misho
Branches: rsync, MAIN
CVS tags: v3_2_3, HEAD
rsync 3.2.3

From: Jorrit Jongma <git@jongma.org>

- MD5 optimization in block matching phase:

MD5 hashes computed during rsync's block-matching phase are independent
and can therefore be processed in parallel. This code processes 4 blocks
in parallel if SSE2 is available, or 8 if AVX2 is available. A performance
increase (or CPU-usage decrease) of up to 6x has been measured.
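
For illustration only (not part of the patch), a minimal sketch of how the
md5_parallel() wrapper added below in simd-md5-parallel-x86_64.cpp can be
driven to hash several independent blocks in one call; the helper name and
the block layout are assumptions:

  /* Hypothetical helper: hash 4 equal-sized, independent blocks in one
   * md5_parallel() call (declarations as added by this patch). */
  #include <stddef.h>

  extern int md5_parallel_slots();
  extern int md5_parallel(int streams, char** buf, int* len, char** sum,
                          char* pre4, char* post4);

  static void hash_blocks_example(char *data, int block_len, char sums[4][16])
  {
      char *bufs[4];
      int lens[4];
      char *outs[4];

      if (md5_parallel_slots() < 4)
          return; /* not enough SIMD lanes; a real caller uses the scalar path */

      for (int i = 0; i < 4; i++) {
          bufs[i] = data + (size_t)i * block_len; /* independent input blocks */
          lens[i] = block_len;
          outs[i] = sums[i];                      /* 16-byte MD5 digest per block */
      }
      /* NULL seed buffers: no checksum seed is mixed in for this sketch. */
      if (!md5_parallel(4, bufs, lens, outs, NULL, NULL)) {
          /* 0 means the parallel path could not be used; the patch falls
           * back to get_checksum2_nosimd() in that case. */
      }
  }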

A prefetching scheme is used to predict and load upcoming blocks, which
avoids the need for extensive modifications to other parts of the rsync
sources.
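
As a rough sketch (simplified, not part of the patch), the prefetch cache
behaves roughly as follows; the real logic lives in the patched
get_checksum2() in simd-checksum-x86_64.cpp and the *_example names are
hypothetical:

  #include <string.h>
  #include "rsync.h"   /* OFF_T, int32 */

  extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);

  #define NBLOCKS 8    /* PREFETCH_MAX_BLOCKS in the patch */

  struct cached_sum_example {
      int   in_use;
      OFF_T offset;
      int32 len;
      char  sum[16];
  };
  static struct cached_sum_example cache[NBLOCKS];

  void get_checksum2_example(char *buf, int32 len, char *sum, OFF_T offset)
  {
      if (cache[0].in_use && cache[0].offset == offset && cache[0].len == len) {
          memcpy(sum, cache[0].sum, 16);   /* hit: serve and shift the queue */
          memmove(&cache[0], &cache[1], sizeof cache - sizeof cache[0]);
          cache[NBLOCKS - 1].in_use = 0;
          return;
      }
      /* Miss: the patch hashes the next NBLOCKS blocks in one md5_parallel()
       * call, fills the cache, and serves the first entry (omitted here).
       * Non-sequential accesses reset the cache and use the scalar path. */
      get_checksum2_nosimd(buf, len, sum, offset);
  }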

This remains compatible with existing rsync builds using MD5 checksums.

- MD5P8 whole-file checksum:

Splits the input into 8 independent streams (64-byte interleave) and
produces a final checksum based on the end state of those 8 streams. If
parallelized MD5 hashing is available, the performance gain (or CPU-usage
decrease) is 2x to 6x compared to traditional MD5.
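
For illustration (not part of the patch): 64-byte chunk k of the input is
routed to stream k % 8, the input is zero-padded to the next multiple of
512 bytes, and the final digest is a plain MD5 over the A/B/C/D states of
the 8 streams followed by the unpadded length (see lib/md5p8.c below). A
minimal usage sketch of the MD5P8 API this patch adds:

  #include "rsync.h"   /* uchar, uint32, MD5_DIGEST_LEN, MD5P8_CTX (lib/mdigest.h) */

  /* Hypothetical helper: whole-buffer MD5P8 digest via the new API. */
  static void md5p8_digest_example(const uchar *data, uint32 len,
                                   uchar digest[MD5_DIGEST_LEN])
  {
      MD5P8_CTX ctx;

      MD5P8_Init(&ctx);
      MD5P8_Update(&ctx, data, len);   /* may be called repeatedly while streaming */
      MD5P8_Final(digest, &ctx);
  }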

The rsync versions on both ends of the connection need built-in MD5P8
support for it to be used.

xxHash is still preferred (and faster), but this provides a reasonably
fast fallback for the case where xxHash libraries are not available at
build time.

based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
diff --git a/Makefile.in b/Makefile.in
--- a/Makefile.in
+++ b/Makefile.in
@@ -29,14 +29,14 @@ SHELL=/bin/sh
 .SUFFIXES:
 .SUFFIXES: .c .o
 
-SIMD_x86_64=simd-checksum-x86_64.o
+SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
 ASM_x86_64=lib/md5-asm-x86_64.o
 
 GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
 	 rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
 HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
 	lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
-LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
+LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
 	lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
 zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
 	zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
@@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
 simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
 	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
 
+simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
+	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
+
 lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
 	@$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
 
diff --git a/checksum.c b/checksum.c
--- a/checksum.c
+++ b/checksum.c
@@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
 		{ CSUM_XXH64, "xxh64", NULL },
 		{ CSUM_XXH64, "xxhash", NULL },
 #endif
+		{ CSUM_MD5P8, "md5p8", NULL },
 		{ CSUM_MD5, "md5", NULL },
 		{ CSUM_MD4, "md4", NULL },
 		{ CSUM_NONE, "none", NULL },
@@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
 	  case CSUM_MD4_OLD:
 	  case CSUM_MD4_BUSTED:
 		return MD4_DIGEST_LEN;
+	  case CSUM_MD5P8:
 	  case CSUM_MD5:
 		return MD5_DIGEST_LEN;
 	  case CSUM_XXH64:
@@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
 	  case CSUM_MD4_BUSTED:
 		break;
 	  case CSUM_MD4:
+	  case CSUM_MD5P8:
 	  case CSUM_MD5:
 		return -1;
 	  case CSUM_XXH64:
@@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
 	return 0;
 }
 
-#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
+#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
+#define get_checksum2 get_checksum2_nosimd
+#else
 /*
   a simple 32 bit checksum that can be updated from either end
   (inspired by Mark Adler's Adler-32 checksum)
@@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
 	}
 	return (s1 & 0xffff) + (s2 << 16);
 }
+
+void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
+{
+}
+
+void checksum2_disable_prefetch()
+{
+}
 #endif
 
-void get_checksum2(char *buf, int32 len, char *sum)
+/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
+void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
 {
 	switch (xfersum_type) {
 #ifdef SUPPORT_XXHASH
@@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
 		break;
 	  }
 #endif
+	  case CSUM_MD5P8:  /* == CSUM_MD5 for checksum2 */
 	  case CSUM_MD5: {
 		MD5_CTX m5;
 		uchar seedbuf[4];
@@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
 		break;
 	  }
 #endif
+	  case CSUM_MD5P8: {
+		MD5P8_CTX m5p8;
+
+		MD5P8_Init(&m5p8);
+
+		for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
+			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
+
+		remainder = (int32)(len - i);
+		if (remainder > 0)
+			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
+
+		MD5P8_Final((uchar *)sum, &m5p8);
+		break;
+	  }
 	  case CSUM_MD5: {
 		MD5_CTX m5;
 
@@ -445,6 +475,7 @@ static union {
 #endif
 	MD5_CTX m5;
 } ctx;
+static MD5P8_CTX m5p8;
 #ifdef SUPPORT_XXHASH
 static XXH64_state_t* xxh64_state;
 #endif
@@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
 		XXH3_128bits_reset(xxh3_state);
 		break;
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Init(&m5p8);
+		break;
 	  case CSUM_MD5:
 		MD5_Init(&ctx.m5);
 		break;
@@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
 		XXH3_128bits_update(xxh3_state, p, len);
 		break;
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Update(&m5p8, (uchar *)p, len);
+		break;
 	  case CSUM_MD5:
 		MD5_Update(&ctx.m5, (uchar *)p, len);
 		break;
@@ -596,6 +633,9 @@ int sum_end(char *sum)
 		break;
 	  }
 #endif
+	  case CSUM_MD5P8:
+		MD5P8_Final((uchar *)sum, &m5p8);
+		break;
 	  case CSUM_MD5:
 		MD5_Final((uchar *)sum, &ctx.m5);
 		break;
diff --git a/generator.c b/generator.c
--- a/generator.c
+++ b/generator.c
@@ -729,10 +729,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 	if (append_mode > 0 && f_copy < 0)
 		return 0;
 
-	if (len > 0)
+	if (len > 0) {
 		mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
-	else
+		checksum2_enable_prefetch(mapbuf, len, sum.blength);
+	} else {
 		mapbuf = NULL;
+	}
 
 	for (i = 0; i < sum.count; i++) {
 		int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
@@ -750,7 +752,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 		}
 
 		sum1 = get_checksum1(map, n1);
-		get_checksum2(map, n1, sum2);
+		get_checksum2(map, n1, sum2, offset - n1);
 
 		if (DEBUG_GTE(DELTASUM, 3)) {
 			rprintf(FINFO,
@@ -762,8 +764,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
 		write_buf(f_out, sum2, sum.s2length);
 	}
 
-	if (mapbuf)
+	if (mapbuf) {
 		unmap_file(mapbuf);
+		checksum2_disable_prefetch();
+	}
 
 	return 0;
 }
diff --git a/lib/md-defines.h b/lib/md-defines.h
--- a/lib/md-defines.h
+++ b/lib/md-defines.h
@@ -15,3 +15,4 @@
 #define CSUM_XXH64 6
 #define CSUM_XXH3_64 7
 #define CSUM_XXH3_128 8
+#define CSUM_MD5P8 9
diff --git a/lib/md5p8.c b/lib/md5p8.c
new file mode 100644
--- /dev/null
+++ b/lib/md5p8.c
@@ -0,0 +1,128 @@
+/*
+ * MD5-based hash friendly to parallel processing, reference implementation
+ *
+ * Author: Jorrit Jongma, 2020
+ *
+ * Released in the public domain falling back to the MIT license
+ * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
+ * apply in your country.
+ */
+/*
+ * MD5P8 is an MD5-based hash friendly to parallel processing. The input
+ * stream is divided into 8 independent streams. For each 512 bytes of input,
+ * the first 64 bytes are sent to the first stream, the second 64 bytes to
+ * the second stream, etc. The input stream is padded with zeros to the next
+ * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
+ * containing the A, B, C, and D states of the 8 individual streams, followed
+ * by the (unpadded) length of the input.
+ *
+ * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
+ * than normal MD5 (particularly on files smaller than 10 kB), but with
+ * SIMD-based parallel processing it can be two to six times as fast. Even in
+ * the best-case scenario, xxHash is still at least twice as fast and should
+ * be preferred when available.
+ */
+
+#include "rsync.h"
+
+#ifdef HAVE_SIMD
+#define MD5P8_Init MD5P8_Init_c
+#define MD5P8_Update MD5P8_Update_c
+#define MD5P8_Final MD5P8_Final_c
+#endif
+
+/* each MD5_CTX needs to be 8-byte aligned */
+#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
+
+void MD5P8_Init(MD5P8_CTX *ctx)
+{
+    int i;
+    for (i = 0; i < 8; i++) {
+        MD5_Init(MD5P8_Contexts_c(ctx, i));
+    }
+    ctx->used = 0;
+    ctx->next = 0;
+}
+
+void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
+{
+    uint32 pos = 0;
+
+    if ((ctx->used) || (length < 64)) {
+        int cpy = MIN(length, 64 - ctx->used);
+        memmove(&ctx->buffer[ctx->used], input, cpy);
+        ctx->used += cpy;
+        length -= cpy;
+        pos += cpy;
+
+        if (ctx->used == 64) {
+            MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
+            ctx->used = 0;
+            ctx->next = (ctx->next + 1) % 8;
+        }
+    }
+
+    while (length >= 64) {
+        MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
+        ctx->next = (ctx->next + 1) % 8;
+        pos += 64;
+        length -= 64;
+    }
+
+    if (length) {
+        memcpy(ctx->buffer, &input[pos], length);
+        ctx->used = length;
+    }
+}
+
+void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
+{
+    int i;
+    uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
+    if (ctx->used) {
+        uchar tmp[64];
+        memset(tmp, 0, 64);
+        MD5P8_Update(ctx, tmp, 64 - ctx->used);
+    }
+    memset(ctx->buffer, 0, 64);
+    while (ctx->next != 0) {
+        MD5P8_Update(ctx, ctx->buffer, 64);
+        sub += 64;
+    }
+
+    uchar state[34*4] = {0};
+
+    for (i = 0; i < 8; i++) {
+        MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
+#ifdef USE_OPENSSL
+        if (low + md->Nl < low) high++;
+        low += md->Nl;
+        high += md->Nh;
+#else
+        if (low + md->totalN < low) high++;
+        low += md->totalN;
+        high += md->totalN2;
+#endif
+        SIVALu(state, i*16, md->A);
+        SIVALu(state, i*16 + 4, md->B);
+        SIVALu(state, i*16 + 8, md->C);
+        SIVALu(state, i*16 + 12, md->D);
+    }
+
+#ifndef USE_OPENSSL
+	high = (low >> 29) | (high << 3);
+	low = (low << 3);
+#endif
+
+    sub <<= 3;
+    if (low - sub > low) high--;
+    low -= sub;
+
+    SIVALu(state, 32*4, low);
+    SIVALu(state, 33*4, high);
+
+    MD5_CTX md;
+    MD5_Init(&md);
+    MD5_Update(&md, state, 34*4);
+    MD5_Final(digest, &md);
+}
diff --git a/lib/mdigest.h b/lib/mdigest.h
--- a/lib/mdigest.h
+++ b/lib/mdigest.h
@@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
 void md5_update(md_context *ctx, const uchar *input, uint32 length);
 void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
 #endif
+
+typedef struct {
+    uchar context_storage[1024];
+    uchar buffer[512];
+    unsigned int used;
+    unsigned int next;
+} MD5P8_CTX;
+
+void MD5P8_Init(MD5P8_CTX *ctx);
+void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
+void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
diff --git a/match.c b/match.c
--- a/match.c
+++ b/match.c
@@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
 	if (DEBUG_GTE(DELTASUM, 3))
 		rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
 
+	checksum2_enable_prefetch(buf, len, s->blength);
+
 	offset = aligned_offset = aligned_i = 0;
 
 	end = len + 1 - s->sums[s->count-1].len;
@@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
 
 			if (!done_csum2) {
 				map = (schar *)map_ptr(buf,offset,l);
-				get_checksum2((char *)map,l,sum2);
+				get_checksum2((char *)map, l, sum2, offset);
 				done_csum2 = 1;
 			}
 
@@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
 						sum = get_checksum1((char *)map, l);
 						if (sum != s->sums[i].sum1)
 							goto check_want_i;
-						get_checksum2((char *)map, l, sum2);
+						get_checksum2((char *)map, l, sum2, aligned_offset);
 						if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
 							goto check_want_i;
 						/* OK, we have a re-alignment match.  Bump the offset
@@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
 			matched(f, s, buf, offset - s->blength, -2);
 	} while (++offset < end);
 
+	checksum2_disable_prefetch();
+
 	matched(f, s, buf, len, -1);
 	map_ptr(buf, len-1, 1);
 }
diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
--- a/simd-checksum-x86_64.cpp
+++ b/simd-checksum-x86_64.cpp
@@ -49,13 +49,33 @@
  * use of the target attribute, selecting the fastest code path based on
  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  * GCC 4.x are not supported to ease configure.ac logic.
+ *
+ * ----
+ *
+ * get_checksum2() is optimized for the case where the selected transfer
+ * checksum is MD5. MD5 can't be made significantly faster with SIMD
+ * instructions than the assembly version already included but SIMD
+ * instructions can be used to hash multiple streams in parallel (see
+ * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
+ * block-matching algorithm hashes the blocks independently (in contrast to
+ * the whole-file checksum) this method can be employed here.
+ *
+ * To prevent needing to modify the core rsync sources significantly, a
+ * prefetching strategy is used. When a checksum2 is requested, the code
+ * reads ahead several blocks, creates the MD5 hashes for each block in
+ * parallel, returns the hash for the first block, and caches the results
+ * for the other blocks to return in future calls to get_checksum2().
  */
 
 #ifdef __x86_64__
 #ifdef __cplusplus
 
+extern "C" {
+
 #include "rsync.h"
 
+}
+
 #ifdef HAVE_SIMD
 
 #include <immintrin.h>
@@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
     return get_checksum1_cpp(buf1, len);
 }
 
-} // extern "C"
+#if !defined(BENCHMARK_SIMD_CHECKSUM1)
 
-#ifdef BENCHMARK_SIMD_CHECKSUM1
+// see simd-md5-parallel-x86_64.cpp
+extern int md5_parallel_slots();
+extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
+
+#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
+
+#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
+
+#define PREFETCH_ENABLE 1 // debugging
+
+#if 0 // debugging
+#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
+#else
+#define PREFETCH_PRINTF(f_, ...) (void)0;
+#endif
+
+#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
+#define PREFETCH_MAX_BLOCKS 8
+
+typedef struct {
+    int in_use;
+    OFF_T offset;
+    int32 len;
+    char sum[SUM_LENGTH];
+} prefetch_sum_t;
+
+typedef struct {
+    struct map_struct *map;
+    OFF_T len;
+    OFF_T last;
+    int32 blocklen;
+    int blocks;
+    prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
+} prefetch_t;
+
+prefetch_t *prefetch;
+
+extern int xfersum_type;
+extern int checksum_seed;
+extern int proper_seed_order;
+extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
+
+extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
+
+void checksum2_disable_prefetch()
+{
+    if (prefetch) {
+        PREFETCH_PRINTF("checksum2_disable_prefetch\n");
+        free(prefetch);
+        prefetch = NULL;
+    }
+}
+
+void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
+{
+#ifdef PREFETCH_ENABLE
+    checksum2_disable_prefetch();
+    int slots = md5_parallel_slots();
+    if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
+        prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
+        memset(prefetch, 0, sizeof(prefetch_t));
+        prefetch->map = map;
+        prefetch->len = len;
+        prefetch->last = 0;
+        prefetch->blocklen = blocklen;
+        prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
+        PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
+    }
+#endif
+}
+
+static inline void checksum2_reset_prefetch()
+{
+    for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
+        prefetch->sums[i].in_use = 0;
+    }
+}
+
+static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
+{
+    if (prefetch->sums[0].in_use) {
+        if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
+            memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
+            for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
+                prefetch->sums[i] = prefetch->sums[i + 1];
+            }
+            prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
+            PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
+            return 1;
+        } else {
+            // unexpected access, reset cache
+            PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
+            checksum2_reset_prefetch();
+        }
+    }
+    return 0;
+}
+
+static int checksum2_perform_prefetch(OFF_T prefetch_offset)
+{
+    int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
+    if (blocks < 2) return 0; // fall through to non-simd, probably faster
+
+    int32 total = 0;
+    int i;
+    for (i = 0; i < blocks; i++) {
+        prefetch->sums[i].offset = prefetch_offset + total;
+        prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
+        prefetch->sums[i].in_use = 0;
+        total += prefetch->sums[i].len;
+    }
+    for (; i < PREFETCH_MAX_BLOCKS; i++) {
+        prefetch->sums[i].in_use = 0;
+    }
+
+    uchar seedbuf[4];
+    SIVALu(seedbuf, 0, checksum_seed);
+
+    PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
+    char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
+    char* bufs[PREFETCH_MAX_BLOCKS] = {0};
+    int lens[PREFETCH_MAX_BLOCKS] = {0};
+    char* sums[PREFETCH_MAX_BLOCKS] = {0};
+    for (i = 0; i < blocks; i++) {
+        bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
+        lens[i] = prefetch->sums[i].len;
+        sums[i] = prefetch->sums[i].sum;
+    }
+    if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
+        for (i = 0; i < blocks; i++) {
+            prefetch->sums[i].in_use = 1;
+        }
+        return 1;
+    } else {
+        // this should never be, abort
+        PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
+        checksum2_disable_prefetch();
+    }
+    return 0;
+}
+
+void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
+{
+    if (prefetch) {
+        PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
+        OFF_T last = prefetch->last;
+        prefetch->last = prefetch_offset;
+        if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
+            // we're looking around trying to align blocks, prefetching will slow things down
+            PREFETCH_PRINTF("get_checksum2 SEEK\n");
+            checksum2_reset_prefetch();
+        } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
+            // hit
+            return;
+        } else if (checksum2_perform_prefetch(prefetch_offset)) {
+            if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
+                // hit; should always be as we just fetched this data
+                return;
+            } else {
+                // this should never be, abort
+                PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
+                checksum2_disable_prefetch();
+            }
+        }
+    }
+    get_checksum2_nosimd(buf, len, sum, prefetch_offset);
+}
+#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
+
+} // "C"
+
+/* Benchmark compilation
+
+  The get_checksum1() benchmark runs through all available code paths in a
+  single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
+  recompiled for each code path (it always uses the fastest path available
+  on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
+  be used when applicable regardless of rsync being built with OpenSSL.
+
+  Something like the following should compile and run the benchmarks:
+
+  # if gcc
+  export CC=gcc
+  export CXX=g++
+  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
+
+  # else if clang
+  export CC=clang
+  export CXX=clang++
+  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
+
+  # /if
+
+  export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
+  export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
+  export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
+  export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
+
+  rm bench_csum*
+
+  ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
+
+  $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
+
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
+
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
+
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
+
+  ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
+
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
+  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
+
+  ./bench_csum1.all
+  ./bench_csum2.asm
+  ./bench_csum2.openssl
+  ./bench_csum2.sse2
+  ./bench_csum2.avx2
+
+ */
+
+#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
 #pragma clang optimize off
 #pragma GCC push_options
 #pragma GCC optimize ("O0")
@@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
 #ifndef CLOCK_MONOTONIC_RAW
 #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
 #endif
+#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
 
+#ifdef BENCHMARK_SIMD_CHECKSUM1
 static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
     struct timespec start, end;
     uint64_t us;
@@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
-    printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
+    printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
 }
 
 static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
@@ -533,10 +781,108 @@ int main() {
     free(buf);
     return 0;
 }
+#endif /* BENCHMARK_SIMD_CHECKSUM1 */
 
+#ifdef BENCHMARK_SIMD_CHECKSUM2
+static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
+    struct timespec start, end;
+    uint64_t us;
+    unsigned char cs1[16];
+    unsigned char cs2[16];
+    int i;
+
+    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
+    for (i = 0; i < ROUNDS; i++) {
+        func(buf, len, (char*)cs1);
+    }
+    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
+    us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
+
+    func2(buf, len, (char*)cs2);
+
+    float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
+    printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
+    for (i = 0; i < 16; i++) {
+        printf("%02x", cs1[i] & 0xFF);
+    }
+    printf(" :: ");
+    for (i = 0; i < 16; i++) {
+        printf("%02x", cs2[i] & 0xFF);
+    }
+    printf("\n");
+}
+
+static void benchmark_inner(char* buf, int32 len, char* sum_out) {
+    // This should produce the same output for different optimizations
+    // levels, not the same as sanity_check()
+
+    char* bufs[8] = {0};
+    int lens[8] = {0};
+    char* sums[8] = {0};
+
+    bufs[0] = buf;
+    lens[0] = len;
+    sums[0] = sum_out;
+    md5_parallel(1, bufs, lens, sums, NULL, NULL);
+}
+
+extern "C" {
+extern void MD5P8_Init_c(MD5P8_CTX *ctx);
+extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
+extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
+}
+
+static void sanity_check(char* buf, int32 len, char* sum_out) {
+    // This should produce the same output for different optimizations
+    // levels, not the same as benchmark_inner()
+    if (md5_parallel_slots() <= 1) {
+        MD5P8_CTX m5p8;
+        MD5P8_Init_c(&m5p8);
+        MD5P8_Update_c(&m5p8, (uchar *)buf, len);
+        MD5P8_Final_c((uchar *)sum_out, &m5p8);
+    } else {
+        MD5P8_CTX m5p8;
+        MD5P8_Init(&m5p8);
+        MD5P8_Update(&m5p8, (uchar *)buf, len);
+        MD5P8_Final((uchar *)sum_out, &m5p8);
+    }
+}
+
+int main() {
+    // This benchmarks the parallel MD5 checksum rather than get_checksum2()
+    // as the latter would require compiling in a lot of rsync's code, but
+    // it touches all the same internals so the performance should be nearly
+    // identical.
+
+    int i;
+    char* buf = (char*)malloc(BLOCK_LEN);
+    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
+
+    const char* method = "?";
+    switch (md5_parallel_slots()) {
+        case 8: method = "AVX2"; break;
+        case 4: method = "SSE2"; break;
+#ifdef USE_OPENSSL
+        case 1: method = "OpenSSL"; break;
+#elif (CSUM_CHUNK == 64)
+        case 1: method = "ASM"; break;
+#else
+        // this won't happen unless you modified code somewhere
+        case 1: method = "Raw-C"; break;
+#endif
+    }
+
+    benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
+
+    free(buf);
+    return 0;
+}
+#endif /* BENCHMARK_SIMD_CHECKSUM2 */
+
+#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
 #pragma GCC pop_options
 #pragma clang optimize on
-#endif /* BENCHMARK_SIMD_CHECKSUM1 */
+#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
 
 #endif /* HAVE_SIMD */
 #endif /* __cplusplus */
diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
new file mode 100644
--- /dev/null
+++ b/simd-md5-parallel-x86_64.cpp
@@ -0,0 +1,1138 @@
+/*
+ * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
+ *
+ * Original author: Nicolas Noble, 2017
+ * Modifications:   Jorrit Jongma, 2020
+ *
+ * The original code was released in the public domain by the original author,
+ * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
+ * in case public domain does not apply in your country. These modifications
+ * are likewise released in the public domain, with the same MIT license
+ * fallback.
+ *
+ * The original publication can be found at:
+ *
+ * https://github.com/nicolasnoble/sse-hash
+ */
+/*
+ * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
+ * MD5 code has been removed and those code paths rerouted to use the MD5
+ * code already present in rsync, and wrapper functions have been added. The
+ * MD5P8 code is also new, and is the reason for the new stride parameter.
+ *
+ * This code allows multiple independent MD5 streams to be processed in
+ * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
+ * lower than that of the original C routines for MD5, the processing of
+ * additional streams is "for free".
+ *
+ * Single streams are rerouted to rsync's normal MD5 code as that is faster
+ * for that case. A further optimization is possible by using SSE2 code on
+ * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
+ * implemented here as it would require some restructuring, and in practice
+ * the code here is only rarely called with fewer than the maximum number of
+ * streams (typically once at the end of each checksum2'd file).
+ *
+ * Benchmarks (in MB/s)            C     ASM  SSE2*1  SSE2*4  AVX2*1  AVX2*8
+ * - Intel Atom D2700            302     334     166     664     N/A     N/A
+ * - Intel i7-7700hq             351     376     289    1156     273    2184
+ * - AMD ThreadRipper 2950x      728     784     568    2272     430    3440
+ */
+
+#ifdef __x86_64__
+#ifdef __cplusplus
+
+extern "C" {
+
+#include "rsync.h"
+
+}
+
+#ifdef HAVE_SIMD
+
+#ifndef BENCHMARK_SIMD_CHECKSUM2
+#define PMD5_ALLOW_SSE2 // debugging
+#define PMD5_ALLOW_AVX2 // debugging
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+#ifndef PMD5_ALLOW_SSE2
+#define PMD5_ALLOW_SSE2
+#endif
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#include <immintrin.h>
+
+/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
+#ifdef __clang__
+#define MVSTATIC
+#else
+#define MVSTATIC static
+#endif
+
+// Missing from the headers on gcc 6 and older, clang 8 and older
+typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
+typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
+
+#define PMD5_SLOTS_DEFAULT 0
+#define PMD5_SLOTS_SSE2 4
+#define PMD5_SLOTS_AVX2 8
+#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
+{
+    return PMD5_SLOTS_SSE2;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
+{
+    return PMD5_SLOTS_AVX2;
+}
+#endif
+
+__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
+{
+    return PMD5_SLOTS_DEFAULT;
+}
+
+/* The parallel MD5 context structure. */
+typedef struct {
+    __m128i state_sse2[4];
+    __m256i state_avx2[4];
+    uint64_t len[PMD5_SLOTS_MAX];
+} pmd5_context;
+
+/* The status returned by the various functions below. */
+typedef enum {
+    PMD5_SUCCESS,
+    PMD5_INVALID_SLOT,
+    PMD5_UNALIGNED_UPDATE,
+} pmd5_status;
+
+/* Initializes all slots in the given pmd5 context. */
+__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
+
+/* Initializes a single slot out in the given pmd5 context. */
+static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
+
+/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
+   The stream pointers will be incremented accordingly.
+   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
+   The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
+   Stride defaults to 64 if 0 is passed. */
+static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
+
+/* Makes an MD5 update on all slots in parallel, given different lengths.
+   The stream pointers will be incremented accordingly.
+   The lengths will be decreased accordingly. Not all data might be consumed.
+   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
+   The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
+static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
+
+/* Finishes all slots at once. Fills in all digests. */
+static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
+
+/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
+   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
+static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
+
+/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
+   multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
+   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
+static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
+
+/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
+static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
+
+/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
+static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
+
+#define S11  7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21  5
+#define S22  9
+#define S23 14
+#define S24 20
+#define S31  4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41  6
+#define S42 10
+#define S43 15
+#define S44 21
+
+#define T1  0xD76AA478
+#define T2  0xE8C7B756
+#define T3  0x242070DB
+#define T4  0xC1BDCEEE
+#define T5  0xF57C0FAF
+#define T6  0x4787C62A
+#define T7  0xA8304613
+#define T8  0xFD469501
+#define T9  0x698098D8
+#define T10 0x8B44F7AF
+#define T11 0xFFFF5BB1
+#define T12 0x895CD7BE
+#define T13 0x6B901122
+#define T14 0xFD987193
+#define T15 0xA679438E
+#define T16 0x49B40821
+#define T17 0xF61E2562
+#define T18 0xC040B340
+#define T19 0x265E5A51
+#define T20 0xE9B6C7AA
+#define T21 0xD62F105D
+#define T22 0x02441453
+#define T23 0xD8A1E681
+#define T24 0xE7D3FBC8
+#define T25 0x21E1CDE6
+#define T26 0xC33707D6
+#define T27 0xF4D50D87
+#define T28 0x455A14ED
+#define T29 0xA9E3E905
+#define T30 0xFCEFA3F8
+#define T31 0x676F02D9
+#define T32 0x8D2A4C8A
+#define T33 0xFFFA3942
+#define T34 0x8771F681
+#define T35 0x6D9D6122
+#define T36 0xFDE5380C
+#define T37 0xA4BEEA44
+#define T38 0x4BDECFA9
+#define T39 0xF6BB4B60
+#define T40 0xBEBFBC70
+#define T41 0x289B7EC6
+#define T42 0xEAA127FA
+#define T43 0xD4EF3085
+#define T44 0x04881D05
+#define T45 0xD9D4D039
+#define T46 0xE6DB99E5
+#define T47 0x1FA27CF8
+#define T48 0xC4AC5665
+#define T49 0xF4292244
+#define T50 0x432AFF97
+#define T51 0xAB9423A7
+#define T52 0xFC93A039
+#define T53 0x655B59C3
+#define T54 0x8F0CCC92
+#define T55 0xFFEFF47D
+#define T56 0x85845DD1
+#define T57 0x6FA87E4F
+#define T58 0xFE2CE6E0
+#define T59 0xA3014314
+#define T60 0x4E0811A1
+#define T61 0xF7537E82
+#define T62 0xBD3AF235
+#define T63 0x2AD7D2BB
+#define T64 0xEB86D391
+
+#define ROTL_SSE2(x, n) { \
+    __m128i s; \
+    s = _mm_srli_epi32(x, 32 - n); \
+    x = _mm_slli_epi32(x, n); \
+    x = _mm_or_si128(x, s); \
+};
+
+#define ROTL_AVX2(x, n) { \
+    __m256i s; \
+    s = _mm256_srli_epi32(x, 32 - n); \
+    x = _mm256_slli_epi32(x, n); \
+    x = _mm256_or_si256(x, s); \
+};
+
+#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
+#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
+#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
+#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
+
+#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
+#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
+#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
+#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
+
+#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
+    a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
+    ROTL_SSE2(a, s); \
+    a = _mm_add_epi32(a, b); \
+}
+
+#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
+    a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
+    ROTL_AVX2(a, s); \
+    a = _mm256_add_epi32(a, b); \
+}
+
+#define IA 0x67452301
+#define IB 0xefcdab89
+#define IC 0x98badcfe
+#define ID 0x10325476
+
+#define GET_MD5_DATA(dest, src, pos)         \
+    dest =                                   \
+        ((uint32_t) src[pos + 0]) <<  0 |    \
+        ((uint32_t) src[pos + 1]) <<  8 |    \
+        ((uint32_t) src[pos + 2]) << 16 |    \
+        ((uint32_t) src[pos + 3]) << 24
+
+#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
+    uint32_t v0, v1, v2, v3;                 \
+    GET_MD5_DATA(v0, src[0], pos);           \
+    GET_MD5_DATA(v1, src[1], pos);           \
+    GET_MD5_DATA(v2, src[2], pos);           \
+    GET_MD5_DATA(v3, src[3], pos);           \
+    dest = _mm_setr_epi32(v0, v1, v2, v3);   \
+}
+
+#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
+    uint32_t v0, v1, v2, v3;                 \
+    uint32_t v4, v5, v6, v7;                 \
+    GET_MD5_DATA(v0, src[0], pos);           \
+    GET_MD5_DATA(v1, src[1], pos);           \
+    GET_MD5_DATA(v2, src[2], pos);           \
+    GET_MD5_DATA(v3, src[3], pos);           \
+    GET_MD5_DATA(v4, src[4], pos);           \
+    GET_MD5_DATA(v5, src[5], pos);           \
+    GET_MD5_DATA(v6, src[6], pos);           \
+    GET_MD5_DATA(v7, src[7], pos);           \
+    dest = _mm256_setr_epi32(v0, v1, v2, v3, \
+                          v4, v5, v6, v7);   \
+}
+
+#define PUT_MD5_DATA(dest, val, pos) {       \
+    dest[pos + 0] = (val >>  0) & 0xff;      \
+    dest[pos + 1] = (val >>  8) & 0xff;      \
+    dest[pos + 2] = (val >> 16) & 0xff;      \
+    dest[pos + 3] = (val >> 24) & 0xff;      \
+}
+
+const static uint8_t md5_padding[64] = {
+    0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
+{
+    int i;
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        ctx->len[i] = 0;
+    }
+
+    ctx->state_sse2[0] = _mm_set1_epi32(IA);
+    ctx->state_sse2[1] = _mm_set1_epi32(IB);
+    ctx->state_sse2[2] = _mm_set1_epi32(IC);
+    ctx->state_sse2[3] = _mm_set1_epi32(ID);
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
+{
+    int i;
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        ctx->len[i] = 0;
+    }
+
+    ctx->state_avx2[0] = _mm256_set1_epi32(IA);
+    ctx->state_avx2[1] = _mm256_set1_epi32(IB);
+    ctx->state_avx2[2] = _mm256_set1_epi32(IC);
+    ctx->state_avx2[3] = _mm256_set1_epi32(ID);
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
+{
+    return PMD5_INVALID_SLOT;
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
+    }
+
+    v[0][slot] = a;
+    v[1][slot] = b;
+    v[2][slot] = c;
+    v[3][slot] = d;
+
+    for (i = 0; i < 4; i++) {
+        ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
+    }
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
+    }
+
+    v[0][slot] = a;
+    v[1][slot] = b;
+    v[2][slot] = c;
+    v[3][slot] = d;
+
+    for (i = 0; i < 4; i++) {
+        ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
+    }
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+    return PMD5_INVALID_SLOT;
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
+{
+    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
+    }
+
+    *a = v[0][slot];
+    *b = v[1][slot];
+    *c = v[2][slot];
+    *d = v[3][slot];
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
+{
+    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
+    }
+
+    *a = v[0][slot];
+    *b = v[1][slot];
+    *c = v[2][slot];
+    *d = v[3][slot];
+
+    return PMD5_SUCCESS;
+}
+#endif
+
+__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
+{
+    return PMD5_INVALID_SLOT;
+}
+
+static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
+{
+    return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
+}
+
+#ifdef PMD5_ALLOW_SSE2
+__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
+{
+    __m128i W[MD5_DIGEST_LEN], a, b, c, d;
+
+    GET_PMD5_DATA_SSE2(W[ 0], data,  0);
+    GET_PMD5_DATA_SSE2(W[ 1], data,  4);
+    GET_PMD5_DATA_SSE2(W[ 2], data,  8);
+    GET_PMD5_DATA_SSE2(W[ 3], data, 12);
+    GET_PMD5_DATA_SSE2(W[ 4], data, 16);
+    GET_PMD5_DATA_SSE2(W[ 5], data, 20);
+    GET_PMD5_DATA_SSE2(W[ 6], data, 24);
+    GET_PMD5_DATA_SSE2(W[ 7], data, 28);
+    GET_PMD5_DATA_SSE2(W[ 8], data, 32);
+    GET_PMD5_DATA_SSE2(W[ 9], data, 36);
+    GET_PMD5_DATA_SSE2(W[10], data, 40);
+    GET_PMD5_DATA_SSE2(W[11], data, 44);
+    GET_PMD5_DATA_SSE2(W[12], data, 48);
+    GET_PMD5_DATA_SSE2(W[13], data, 52);
+    GET_PMD5_DATA_SSE2(W[14], data, 56);
+    GET_PMD5_DATA_SSE2(W[15], data, 60);
+
+    a = ctx->state_sse2[0];
+    b = ctx->state_sse2[1];
+    c = ctx->state_sse2[2];
+    d = ctx->state_sse2[3];
+
+    SET_SSE2(F, a, b, c, d, W[ 0], S11,  1);
+    SET_SSE2(F, d, a, b, c, W[ 1], S12,  2);
+    SET_SSE2(F, c, d, a, b, W[ 2], S13,  3);
+    SET_SSE2(F, b, c, d, a, W[ 3], S14,  4);
+    SET_SSE2(F, a, b, c, d, W[ 4], S11,  5);
+    SET_SSE2(F, d, a, b, c, W[ 5], S12,  6);
+    SET_SSE2(F, c, d, a, b, W[ 6], S13,  7);
+    SET_SSE2(F, b, c, d, a, W[ 7], S14,  8);
+    SET_SSE2(F, a, b, c, d, W[ 8], S11,  9);
+    SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
+    SET_SSE2(F, c, d, a, b, W[10], S13, 11);
+    SET_SSE2(F, b, c, d, a, W[11], S14, 12);
+    SET_SSE2(F, a, b, c, d, W[12], S11, 13);
+    SET_SSE2(F, d, a, b, c, W[13], S12, 14);
+    SET_SSE2(F, c, d, a, b, W[14], S13, 15);
+    SET_SSE2(F, b, c, d, a, W[15], S14, 16);
+
+    SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
+    SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
+    SET_SSE2(G, c, d, a, b, W[11], S23, 19);
+    SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
+    SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
+    SET_SSE2(G, d, a, b, c, W[10], S22, 22);
+    SET_SSE2(G, c, d, a, b, W[15], S23, 23);
+    SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
+    SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
+    SET_SSE2(G, d, a, b, c, W[14], S22, 26);
+    SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
+    SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
+    SET_SSE2(G, a, b, c, d, W[13], S21, 29);
+    SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
+    SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
+    SET_SSE2(G, b, c, d, a, W[12], S24, 32);
+
+    SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
+    SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
+    SET_SSE2(H, c, d, a, b, W[11], S33, 35);
+    SET_SSE2(H, b, c, d, a, W[14], S34, 36);
+    SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
+    SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
+    SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
+    SET_SSE2(H, b, c, d, a, W[10], S34, 40);
+    SET_SSE2(H, a, b, c, d, W[13], S31, 41);
+    SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
+    SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
+    SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
+    SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
+    SET_SSE2(H, d, a, b, c, W[12], S32, 46);
+    SET_SSE2(H, c, d, a, b, W[15], S33, 47);
+    SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
+
+    SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
+    SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
+    SET_SSE2(I, c, d, a, b, W[14], S43, 51);
+    SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
+    SET_SSE2(I, a, b, c, d, W[12], S41, 53);
+    SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
+    SET_SSE2(I, c, d, a, b, W[10], S43, 55);
+    SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
+    SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
+    SET_SSE2(I, d, a, b, c, W[15], S42, 58);
+    SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
+    SET_SSE2(I, b, c, d, a, W[13], S44, 60);
+    SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
+    SET_SSE2(I, d, a, b, c, W[11], S42, 62);
+    SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
+    SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
+
+    ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
+    ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
+    ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
+    ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
+}
+#endif
+
+#ifdef PMD5_ALLOW_AVX2
+__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
+{
+    __m256i W[MD5_DIGEST_LEN], a, b, c, d;
+
+    GET_PMD5_DATA_AVX2(W[ 0], data,  0);
+    GET_PMD5_DATA_AVX2(W[ 1], data,  4);
+    GET_PMD5_DATA_AVX2(W[ 2], data,  8);
+    GET_PMD5_DATA_AVX2(W[ 3], data, 12);
+    GET_PMD5_DATA_AVX2(W[ 4], data, 16);
+    GET_PMD5_DATA_AVX2(W[ 5], data, 20);
+    GET_PMD5_DATA_AVX2(W[ 6], data, 24);
+    GET_PMD5_DATA_AVX2(W[ 7], data, 28);
+    GET_PMD5_DATA_AVX2(W[ 8], data, 32);
+    GET_PMD5_DATA_AVX2(W[ 9], data, 36);
+    GET_PMD5_DATA_AVX2(W[10], data, 40);
+    GET_PMD5_DATA_AVX2(W[11], data, 44);
+    GET_PMD5_DATA_AVX2(W[12], data, 48);
+    GET_PMD5_DATA_AVX2(W[13], data, 52);
+    GET_PMD5_DATA_AVX2(W[14], data, 56);
+    GET_PMD5_DATA_AVX2(W[15], data, 60);
+
+    a = ctx->state_avx2[0];
+    b = ctx->state_avx2[1];
+    c = ctx->state_avx2[2];
+    d = ctx->state_avx2[3];
+
+    SET_AVX2(F, a, b, c, d, W[ 0], S11,  1);
+    SET_AVX2(F, d, a, b, c, W[ 1], S12,  2);
+    SET_AVX2(F, c, d, a, b, W[ 2], S13,  3);
+    SET_AVX2(F, b, c, d, a, W[ 3], S14,  4);
+    SET_AVX2(F, a, b, c, d, W[ 4], S11,  5);
+    SET_AVX2(F, d, a, b, c, W[ 5], S12,  6);
+    SET_AVX2(F, c, d, a, b, W[ 6], S13,  7);
+    SET_AVX2(F, b, c, d, a, W[ 7], S14,  8);
+    SET_AVX2(F, a, b, c, d, W[ 8], S11,  9);
+    SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
+    SET_AVX2(F, c, d, a, b, W[10], S13, 11);
+    SET_AVX2(F, b, c, d, a, W[11], S14, 12);
+    SET_AVX2(F, a, b, c, d, W[12], S11, 13);
+    SET_AVX2(F, d, a, b, c, W[13], S12, 14);
+    SET_AVX2(F, c, d, a, b, W[14], S13, 15);
+    SET_AVX2(F, b, c, d, a, W[15], S14, 16);
+
+    SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
+    SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
+    SET_AVX2(G, c, d, a, b, W[11], S23, 19);
+    SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
+    SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
+    SET_AVX2(G, d, a, b, c, W[10], S22, 22);
+    SET_AVX2(G, c, d, a, b, W[15], S23, 23);
+    SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
+    SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
+    SET_AVX2(G, d, a, b, c, W[14], S22, 26);
+    SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
+    SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
+    SET_AVX2(G, a, b, c, d, W[13], S21, 29);
+    SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
+    SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
+    SET_AVX2(G, b, c, d, a, W[12], S24, 32);
+
+    SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
+    SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
+    SET_AVX2(H, c, d, a, b, W[11], S33, 35);
+    SET_AVX2(H, b, c, d, a, W[14], S34, 36);
+    SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
+    SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
+    SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
+    SET_AVX2(H, b, c, d, a, W[10], S34, 40);
+    SET_AVX2(H, a, b, c, d, W[13], S31, 41);
+    SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
+    SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
+    SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
+    SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
+    SET_AVX2(H, d, a, b, c, W[12], S32, 46);
+    SET_AVX2(H, c, d, a, b, W[15], S33, 47);
+    SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
+
+    SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
+    SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
+    SET_AVX2(I, c, d, a, b, W[14], S43, 51);
+    SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
+    SET_AVX2(I, a, b, c, d, W[12], S41, 53);
+    SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
+    SET_AVX2(I, c, d, a, b, W[10], S43, 55);
+    SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
+    SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
+    SET_AVX2(I, d, a, b, c, W[15], S42, 58);
+    SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
+    SET_AVX2(I, b, c, d, a, W[13], S44, 60);
+    SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
+    SET_AVX2(I, d, a, b, c, W[11], S42, 62);
+    SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
+    SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
+
+    ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
+    ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
+    ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
+    ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
+}
+#endif
+
+__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
+{
+}
+
+static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
+{
+    const uint8_t * ptrs[PMD5_SLOTS_MAX];
+
+    if (!length) return PMD5_SUCCESS;
+
+    int slots = pmd5_slots();
+
+    if (!stride) stride = 64;
+
+    int i;
+    for (i = 0; i < slots; i++) {
+        ptrs[i] = data[i];
+        ctx->len[i] += length;
+        if (!ptrs[i]) ptrs[i] = md5_padding;
+    }
+
+    while (length >= 64) {
+        pmd5_process(ctx, ptrs);
+        length -= 64;
+        for (i = 0; i < slots; i++) {
+            if (data[i]) ptrs[i] += stride;
+        }
+    }
+
+    if (length) return PMD5_UNALIGNED_UPDATE;
+
+    for (i = 0; i < slots; i++) {
+        if (data[i]) data[i] = ptrs[i];
+    }
+
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
+{
+    uint64_t length = 0;
+    int slots = pmd5_slots();
+
+    int i;
+    for (i = 0; i < slots; i++) {
+        if ((length == 0) || (lengths[i] < length)) length = lengths[i];
+    }
+
+    for (i = 0; i < slots; i++) {
+        lengths[i] -= length;
+    }
+
+    return pmd5_update_all_simple(ctx, data, length, 0);
+}
+
+static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
+{
+    MD5_CTX ctx;
+
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    pmd5_to_md5(pctx, &ctx, slot);
+    if (data && length) {
+        MD5_Update(&ctx, data, length);
+    }
+    MD5_Final(digest, &ctx);
+
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
+{
+    return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
+}
+
+static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
+{
+    int i;
+    for (i = 0; i < pmd5_slots(); i++) {
+        pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
+    }
+    return PMD5_SUCCESS;
+}
+
+static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
+{
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
+
+#ifdef USE_OPENSSL
+    pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
+#else
+    pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
+#endif
+    return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
+}
+
+static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
+{
+    if ((slot >= pmd5_slots()) || (slot < 0))
+        return PMD5_INVALID_SLOT;
+
+    MD5_Init(ctx);
+
+#ifdef USE_OPENSSL
+    ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
+    ctx->Nh = pctx->len[slot] >> 29;
+
+    uint32_t a, b, c, d;
+    pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
+    if (ret == PMD5_SUCCESS) {
+        ctx->A = a;
+        ctx->B = b;
+        ctx->C = c;
+        ctx->D = d;
+    }
+    return ret;
+#else
+    ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
+    ctx->totalN2 = pctx->len[slot] >> 32;
+    return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
+#endif
+}
+
+/* With GCC 10 putting these implementations inside 'extern "C"' causes an
+   assembler error. That worked fine on GCC 5-9 and clang 6-10...
+  */
+
+static inline int md5_parallel_slots_cpp()
+{
+    int slots = pmd5_slots();
+    if (slots == 0) return 1;
+    return slots;
+}
+
+static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
+{
+    int slots = md5_parallel_slots_cpp();
+    if ((streams < 1) || (streams > slots)) return 0;
+    if (pre4 && post4) return 0;
+
+    if (slots == 1) {
+        MD5_CTX ctx;
+        MD5_Init(&ctx);
+        if (pre4) {
+            MD5_Update(&ctx, (const unsigned char*)pre4, 4);
+        }
+        MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
+        if (post4) {
+            MD5_Update(&ctx, (const unsigned char*)post4, 4);
+        }
+        if (sum[0]) {
+            MD5_Final((uint8_t*)sum[0], &ctx);
+        }
+        return 0;
+    }
+
+    int i;
+    int active[PMD5_SLOTS_MAX];
+    char* buffers[PMD5_SLOTS_MAX];
+    uint64_t left[PMD5_SLOTS_MAX];
+    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+        active[i] = streams > i;
+        if (i < streams) {
+            buffers[i] = buf[i];
+            left[i] = (uint64_t)len[i];
+        } else {
+            buffers[i] = NULL;
+            left[i] = 0;
+        }
+    }
+    MD5_CTX results[PMD5_SLOTS_MAX];
+
+    pmd5_context ctx_simd;
+    if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
+
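+    /* When a 4-byte prefix is given, build a full 64-byte first block per stream
+       out of the prefix plus the stream's first 60 bytes, so every lane stays
+       64-byte aligned. Streams shorter than 60 bytes are hashed entirely with
+       scalar MD5 and retired from the parallel pass. */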
+    if (pre4) {
+        char temp_buffers[PMD5_SLOTS_MAX][64];
+        int have_any = 0;
+        for (i = 0; i < slots; i++) {
+            if (active[i]) {
+                if (left[i] < 60) {
+                    MD5_Init(&results[i]);
+                    MD5_Update(&results[i], (const unsigned char*)pre4, 4);
+                    MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
+                    active[i] = 0;
+                    left[i] = 0;
+                } else {
+                    memcpy(temp_buffers[i], pre4, 4);
+                    memcpy(temp_buffers[i] + 4, buffers[i], 60);
+                    buffers[i] += 60;
+                    left[i] -= 60;
+                    have_any = 1;
+                }
+            }
+        }
+
+        if (have_any) {
+            char* ptrs[PMD5_SLOTS_MAX];
+            for (i = 0; i < PMD5_SLOTS_MAX; i++) {
+                ptrs[i] = &temp_buffers[i][0];
+            }
+            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
+                return 0;
+            }
+        }
+    }
+
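+    /* Main loop: retire any stream with fewer than 64 bytes left by moving its lane
+       state into a scalar MD5_CTX, then advance all remaining streams in lockstep
+       by the shortest remaining length, rounded down to a multiple of 64 bytes. */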
+    int failed = 0;
+    while (true) {
+        for (i = 0; i < slots; i++) {
+            if (active[i] && (left[i] < 64)) {
+                if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
+                    failed = 1;
+                }
+                active[i] = 0;
+            }
+        }
+
+        uint64_t shortest = 0;
+        for (i = 0; i < slots; i++) {
+            if (!active[i]) {
+                buffers[i] = NULL;
+            } else if ((shortest == 0) || (left[i] < shortest)) {
+                shortest = left[i];
+            }
+        }
+
+        if (shortest > 0) {
+            shortest = shortest & ~63;
+            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
+                failed = 1;
+            }
+            for (i = 0; i < slots; i++) {
+                if (active[i]) {
+                    left[i] -= shortest;
+                }
+            }
+        }
+
+        if (failed) {
+            return 0;
+        } else {
+            int have_any = 0;
+            for (i = 0; i < slots; i++) {
+                have_any |= active[i];
+            }
+            if (!have_any) {
+                break;
+            }
+        }
+    }
+
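+    /* Hash each stream's remaining tail (under 64 bytes) and the optional 4-byte
+       suffix with scalar MD5, then emit the digests. */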
+    for (i = 0; i < slots; i++) {
+        if (i < streams) {
+            if (left[i] > 0) {
+                // buffers[i] is NULL here, so address the tail through buf[i] and len[i]
+                MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
+            }
+            if (post4) {
+                MD5_Update(&results[i], (const unsigned char*)post4, 4);
+            }
+            if (sum[i]) {
+                MD5_Final((uint8_t*)sum[i], &results[i]);
+            }
+        }
+    }
+
+    return 1;
+}
+
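+// MD5P8_CTX keeps its parallel contexts in ctx->context_storage; the macro below
+// picks out the index'th one.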
+// each pmd5_context needs to be 32-byte aligned
+#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
+
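+/* With AVX2 all eight lanes live in a single pmd5_context; otherwise two 4-lane
+   contexts are used side by side. */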
+static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
+{
+    int i;
+    for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
+        pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
+    }
+    ctx->used = 0;
+    ctx->next = 0;
+}
+
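+/* Buffer input into 512-byte chunks and hand each chunk to the parallel contexts as
+   eight consecutive 64-byte blocks, one per lane. Any remaining whole 512-byte
+   chunks are fed directly from the caller's buffer; a partial chunk is carried
+   over in ctx->buffer. */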
+static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
+{
+    int slots = pmd5_slots();
+    uint32 pos = 0;
+
+    if ((ctx->used) || (length < 512)) {
+        int cpy = MIN(length, 512 - ctx->used);
+        memcpy(&ctx->buffer[ctx->used], input, cpy);
+        ctx->used += cpy;
+        length -= cpy;
+        pos += cpy;
+
+        if (ctx->used == 512) {
+            if (slots == PMD5_SLOTS_AVX2) {
+                const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)ctx->buffer,
+                    (uint8_t*)(ctx->buffer + 64),
+                    (uint8_t*)(ctx->buffer + 128),
+                    (uint8_t*)(ctx->buffer + 192),
+                    (uint8_t*)(ctx->buffer + 256),
+                    (uint8_t*)(ctx->buffer + 320),
+                    (uint8_t*)(ctx->buffer + 384),
+                    (uint8_t*)(ctx->buffer + 448)
+                };
+                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
+            } else {
+                const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)ctx->buffer,
+                    (uint8_t*)(ctx->buffer + 64),
+                    (uint8_t*)(ctx->buffer + 128),
+                    (uint8_t*)(ctx->buffer + 192)
+                };
+                const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
+                    (uint8_t*)(ctx->buffer + 256),
+                    (uint8_t*)(ctx->buffer + 320),
+                    (uint8_t*)(ctx->buffer + 384),
+                    (uint8_t*)(ctx->buffer + 448)
+                };
+                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
+                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
+            }
+            ctx->used = 0;
+        }
+    }
+
+    if (length >= 512) {
+        uint32 blocks = length / 512;
+        if (slots == PMD5_SLOTS_AVX2) {
+            const uint8_t* ptrs[8] = {
+                (uint8_t*)(input + pos),
+                (uint8_t*)(input + pos + 64),
+                (uint8_t*)(input + pos + 128),
+                (uint8_t*)(input + pos + 192),
+                (uint8_t*)(input + pos + 256),
+                (uint8_t*)(input + pos + 320),
+                (uint8_t*)(input + pos + 384),
+                (uint8_t*)(input + pos + 448)
+            };
+            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
+        } else {
+            const uint8_t* ptrs1[4] = {
+                (uint8_t*)(input + pos),
+                (uint8_t*)(input + pos + 64),
+                (uint8_t*)(input + pos + 128),
+                (uint8_t*)(input + pos + 192)
+            };
+            const uint8_t* ptrs2[4] = {
+                (uint8_t*)(input + pos + 256),
+                (uint8_t*)(input + pos + 320),
+                (uint8_t*)(input + pos + 384),
+                (uint8_t*)(input + pos + 448)
+            };
+            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
+            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
+        }
+        pos += blocks * 512;
+        length -= blocks * 512;
+    }
+
+    if (length) {
+        memcpy(ctx->buffer, &input[pos], length);
+        ctx->used = length;
+    }
+}
+
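+/* Zero-pad the final partial chunk, collect the eight lane states (A..D each) plus
+   the total bit length into a 136-byte block, and MD5 that block to form the final
+   MD5P8 digest. `sub` backs the padding bytes out of the length. */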
+static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
+{
+    int i;
+    uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
+    if (ctx->used) {
+        uchar tmp[512];
+        memset(tmp, 0, 512);
+        MD5P8_Update(ctx, tmp, 512 - ctx->used);
+    }
+
+    uchar state[34*4] = {0};
+
+    MD5_CTX tmp;
+    for (i = 0; i < 8; i++) {
+        if (pmd5_slots() == PMD5_SLOTS_AVX2) {
+            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
+        } else if (i < 4) {
+            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
+        } else {
+            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
+        }
+#ifdef USE_OPENSSL
+        if (low + tmp.Nl < low) high++;
+        low += tmp.Nl;
+        high += tmp.Nh;
+#else
+        if (low + tmp.totalN < low) high++;
+        low += tmp.totalN;
+        high += tmp.totalN2;
+#endif
+        SIVALu(state, i*16, tmp.A);
+        SIVALu(state, i*16 + 4, tmp.B);
+        SIVALu(state, i*16 + 8, tmp.C);
+        SIVALu(state, i*16 + 12, tmp.D);
+    }
+
+#ifndef USE_OPENSSL
+    high = (low >> 29) | (high << 3);
+    low = (low << 3);
+#endif
+
+    sub <<= 3;
+    if (low - sub > low) high--;
+    low -= sub;
+
+    SIVALu(state, 32*4, low);
+    SIVALu(state, 33*4, high);
+
+    MD5_CTX md;
+    MD5_Init(&md);
+    MD5_Update(&md, state, 34*4);
+    MD5_Final(digest, &md);
+}
+
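+/* Plain C entry points wrapping the C++ implementations above. */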
+extern "C" {
+
+int md5_parallel_slots()
+{
+    return md5_parallel_slots_cpp();
+}
+
+int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
+{
+    return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
+}
+
+void MD5P8_Init(MD5P8_CTX *ctx)
+{
+    MD5P8_Init_cpp(ctx);
+}
+
+void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
+{
+    MD5P8_Update_cpp(ctx, input, length);
+}
+
+void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
+{
+    MD5P8_Final_cpp(digest, ctx);
+}
+
+} // "C"
+
+#endif /* HAVE_SIMD */
+#endif /* __cplusplus */
+#endif /* __x86_64__ */
