Annotation of embedaddon/rsync/patches/md5p8.diff, revision 1.1
1.1 ! misho 1: From: Jorrit Jongma <git@jongma.org>
! 2:
! 3: - MD5 optimization in block matching phase:
! 4:
! 5: MD5 hashes computed during rsync's block matching phase are independent
! 6: and thus possible to process in parallel. This code processes 4 blocks
! 7: in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
! 8: of performance (or decrease of CPU usage) of up to 6x has been measured.
! 9:
! 10: A prefetching algorithm is used to predict and load upcoming blocks, as
! 11: this prevents the need for extensive modifications to other parts of
! 12: the rsync sources to get this working.
! 13:
! 14: This remains compatible with existing rsync builds using MD5 checksums.
! 15:
! 16: - MD5P8 whole-file checksum:
! 17:
! 18: Splits the input up into 8 independent streams (64-byte interleave), and
! 19: produces a final checksum based on the end state of those 8 streams. If
! 20: parallelization of MD5 hashing is available, the performance gain (or
! 21: CPU usage decrease) is 2x to 6x compared to traditional MD5.
! 22:
! 23: The rsync version on both ends of the connection need MD5P8 support
! 24: built-in for it to be used.
! 25:
! 26: xxHash is still preferred (and faster), but this provides a reasonably
! 27: fast fallback for the case where xxHash libraries are not available at
! 28: build time.
! 29:
! 30: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
! 31: diff --git a/Makefile.in b/Makefile.in
! 32: --- a/Makefile.in
! 33: +++ b/Makefile.in
! 34: @@ -29,14 +29,14 @@ SHELL=/bin/sh
! 35: .SUFFIXES:
! 36: .SUFFIXES: .c .o
! 37:
! 38: -SIMD_x86_64=simd-checksum-x86_64.o
! 39: +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
! 40: ASM_x86_64=lib/md5-asm-x86_64.o
! 41:
! 42: GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
! 43: rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
! 44: HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
! 45: lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
! 46: -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
! 47: +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
! 48: lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
! 49: zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
! 50: zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
! 51: @@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
! 52: simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
! 53: @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
! 54:
! 55: +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
! 56: + @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
! 57: +
! 58: lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
! 59: @$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
! 60:
! 61: diff --git a/checksum.c b/checksum.c
! 62: --- a/checksum.c
! 63: +++ b/checksum.c
! 64: @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
! 65: { CSUM_XXH64, "xxh64", NULL },
! 66: { CSUM_XXH64, "xxhash", NULL },
! 67: #endif
! 68: + { CSUM_MD5P8, "md5p8", NULL },
! 69: { CSUM_MD5, "md5", NULL },
! 70: { CSUM_MD4, "md4", NULL },
! 71: { CSUM_NONE, "none", NULL },
! 72: @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
! 73: case CSUM_MD4_OLD:
! 74: case CSUM_MD4_BUSTED:
! 75: return MD4_DIGEST_LEN;
! 76: + case CSUM_MD5P8:
! 77: case CSUM_MD5:
! 78: return MD5_DIGEST_LEN;
! 79: case CSUM_XXH64:
! 80: @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
! 81: case CSUM_MD4_BUSTED:
! 82: break;
! 83: case CSUM_MD4:
! 84: + case CSUM_MD5P8:
! 85: case CSUM_MD5:
! 86: return -1;
! 87: case CSUM_XXH64:
! 88: @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
! 89: return 0;
! 90: }
! 91:
! 92: -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
! 93: +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
! 94: +#define get_checksum2 get_checksum2_nosimd
! 95: +#else
! 96: /*
! 97: a simple 32 bit checksum that can be updated from either end
! 98: (inspired by Mark Adler's Adler-32 checksum)
! 99: @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
! 100: }
! 101: return (s1 & 0xffff) + (s2 << 16);
! 102: }
! 103: +
! 104: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
! 105: +{
! 106: +}
! 107: +
! 108: +void checksum2_disable_prefetch()
! 109: +{
! 110: +}
! 111: #endif
! 112:
! 113: -void get_checksum2(char *buf, int32 len, char *sum)
! 114: +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
! 115: +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
! 116: {
! 117: switch (xfersum_type) {
! 118: #ifdef SUPPORT_XXHASH
! 119: @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
! 120: break;
! 121: }
! 122: #endif
! 123: + case CSUM_MD5P8: /* == CSUM_MD5 for checksum2 */
! 124: case CSUM_MD5: {
! 125: MD5_CTX m5;
! 126: uchar seedbuf[4];
! 127: @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
! 128: break;
! 129: }
! 130: #endif
! 131: + case CSUM_MD5P8: {
! 132: + MD5P8_CTX m5p8;
! 133: +
! 134: + MD5P8_Init(&m5p8);
! 135: +
! 136: + for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
! 137: + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
! 138: +
! 139: + remainder = (int32)(len - i);
! 140: + if (remainder > 0)
! 141: + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
! 142: +
! 143: + MD5P8_Final((uchar *)sum, &m5p8);
! 144: + break;
! 145: + }
! 146: case CSUM_MD5: {
! 147: MD5_CTX m5;
! 148:
! 149: @@ -445,6 +475,7 @@ static union {
! 150: #endif
! 151: MD5_CTX m5;
! 152: } ctx;
! 153: +static MD5P8_CTX m5p8;
! 154: #ifdef SUPPORT_XXHASH
! 155: static XXH64_state_t* xxh64_state;
! 156: #endif
! 157: @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
! 158: XXH3_128bits_reset(xxh3_state);
! 159: break;
! 160: #endif
! 161: + case CSUM_MD5P8:
! 162: + MD5P8_Init(&m5p8);
! 163: + break;
! 164: case CSUM_MD5:
! 165: MD5_Init(&ctx.m5);
! 166: break;
! 167: @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
! 168: XXH3_128bits_update(xxh3_state, p, len);
! 169: break;
! 170: #endif
! 171: + case CSUM_MD5P8:
! 172: + MD5P8_Update(&m5p8, (uchar *)p, len);
! 173: + break;
! 174: case CSUM_MD5:
! 175: MD5_Update(&ctx.m5, (uchar *)p, len);
! 176: break;
! 177: @@ -596,6 +633,9 @@ int sum_end(char *sum)
! 178: break;
! 179: }
! 180: #endif
! 181: + case CSUM_MD5P8:
! 182: + MD5P8_Final((uchar *)sum, &m5p8);
! 183: + break;
! 184: case CSUM_MD5:
! 185: MD5_Final((uchar *)sum, &ctx.m5);
! 186: break;
! 187: diff --git a/generator.c b/generator.c
! 188: --- a/generator.c
! 189: +++ b/generator.c
! 190: @@ -729,10 +729,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
! 191: if (append_mode > 0 && f_copy < 0)
! 192: return 0;
! 193:
! 194: - if (len > 0)
! 195: + if (len > 0) {
! 196: mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
! 197: - else
! 198: + checksum2_enable_prefetch(mapbuf, len, sum.blength);
! 199: + } else {
! 200: mapbuf = NULL;
! 201: + }
! 202:
! 203: for (i = 0; i < sum.count; i++) {
! 204: int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
! 205: @@ -750,7 +752,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
! 206: }
! 207:
! 208: sum1 = get_checksum1(map, n1);
! 209: - get_checksum2(map, n1, sum2);
! 210: + get_checksum2(map, n1, sum2, offset - n1);
! 211:
! 212: if (DEBUG_GTE(DELTASUM, 3)) {
! 213: rprintf(FINFO,
! 214: @@ -762,8 +764,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
! 215: write_buf(f_out, sum2, sum.s2length);
! 216: }
! 217:
! 218: - if (mapbuf)
! 219: + if (mapbuf) {
! 220: unmap_file(mapbuf);
! 221: + checksum2_disable_prefetch();
! 222: + }
! 223:
! 224: return 0;
! 225: }
! 226: diff --git a/lib/md-defines.h b/lib/md-defines.h
! 227: --- a/lib/md-defines.h
! 228: +++ b/lib/md-defines.h
! 229: @@ -15,3 +15,4 @@
! 230: #define CSUM_XXH64 6
! 231: #define CSUM_XXH3_64 7
! 232: #define CSUM_XXH3_128 8
! 233: +#define CSUM_MD5P8 9
! 234: diff --git a/lib/md5p8.c b/lib/md5p8.c
! 235: new file mode 100644
! 236: --- /dev/null
! 237: +++ b/lib/md5p8.c
! 238: @@ -0,0 +1,128 @@
! 239: +/*
! 240: + * MD5-based hash friendly to parallel processing, reference implementation
! 241: + *
! 242: + * Author: Jorrit Jongma, 2020
! 243: + *
! 244: + * Released in the public domain falling back to the MIT license
! 245: + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
! 246: + * apply in your country.
! 247: + */
! 248: +/*
! 249: + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
! 250: + * stream is divided into 8 independent streams. For each 512 bytes of input,
! 251: + * the first 64 bytes are send to the first stream, the second 64 bytes to
! 252: + * the second stream, etc. The input stream is padded with zeros to the next
! 253: + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
! 254: + * containing the A, B, C, and D states of the 8 individual streams, followed
! 255: + * by the (unpadded) length of the input.
! 256: + *
! 257: + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
! 258: + * than normal MD5 (particularly on files smaller than 10 kB), but with
! 259: + * SIMD-based parallel processing it can be two to six times as fast. Even in
! 260: + * the best-case scenario, xxHash is still at least twice as fast and should
! 261: + * be preferred when available.
! 262: + */
! 263: +
! 264: +#include "rsync.h"
! 265: +
! 266: +#ifdef HAVE_SIMD
! 267: +#define MD5P8_Init MD5P8_Init_c
! 268: +#define MD5P8_Update MD5P8_Update_c
! 269: +#define MD5P8_Final MD5P8_Final_c
! 270: +#endif
! 271: +
! 272: +/* each MD5_CTX needs to be 8-byte aligned */
! 273: +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
! 274: +
! 275: +void MD5P8_Init(MD5P8_CTX *ctx)
! 276: +{
! 277: + int i;
! 278: + for (i = 0; i < 8; i++) {
! 279: + MD5_Init(MD5P8_Contexts_c(ctx, i));
! 280: + }
! 281: + ctx->used = 0;
! 282: + ctx->next = 0;
! 283: +}
! 284: +
! 285: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
! 286: +{
! 287: + uint32 pos = 0;
! 288: +
! 289: + if ((ctx->used) || (length < 64)) {
! 290: + int cpy = MIN(length, 64 - ctx->used);
! 291: + memmove(&ctx->buffer[ctx->used], input, cpy);
! 292: + ctx->used += cpy;
! 293: + length -= cpy;
! 294: + pos += cpy;
! 295: +
! 296: + if (ctx->used == 64) {
! 297: + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
! 298: + ctx->used = 0;
! 299: + ctx->next = (ctx->next + 1) % 8;
! 300: + }
! 301: + }
! 302: +
! 303: + while (length >= 64) {
! 304: + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
! 305: + ctx->next = (ctx->next + 1) % 8;
! 306: + pos += 64;
! 307: + length -= 64;
! 308: + }
! 309: +
! 310: + if (length) {
! 311: + memcpy(ctx->buffer, &input[pos], length);
! 312: + ctx->used = length;
! 313: + }
! 314: +}
! 315: +
! 316: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
! 317: +{
! 318: + int i;
! 319: + uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
! 320: + if (ctx->used) {
! 321: + uchar tmp[64];
! 322: + memset(tmp, 0, 64);
! 323: + MD5P8_Update(ctx, tmp, 64 - ctx->used);
! 324: + }
! 325: + memset(ctx->buffer, 0, 64);
! 326: + while (ctx->next != 0) {
! 327: + MD5P8_Update(ctx, ctx->buffer, 64);
! 328: + sub += 64;
! 329: + }
! 330: +
! 331: + uchar state[34*4] = {0};
! 332: +
! 333: + for (i = 0; i < 8; i++) {
! 334: + MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
! 335: +#ifdef USE_OPENSSL
! 336: + if (low + md->Nl < low) high++;
! 337: + low += md->Nl;
! 338: + high += md->Nh;
! 339: +#else
! 340: + if (low + md->totalN < low) high++;
! 341: + low += md->totalN;
! 342: + high += md->totalN2;
! 343: +#endif
! 344: + SIVALu(state, i*16, md->A);
! 345: + SIVALu(state, i*16 + 4, md->B);
! 346: + SIVALu(state, i*16 + 8, md->C);
! 347: + SIVALu(state, i*16 + 12, md->D);
! 348: + }
! 349: +
! 350: +#ifndef USE_OPENSSL
! 351: + high = (low >> 29) | (high << 3);
! 352: + low = (low << 3);
! 353: +#endif
! 354: +
! 355: + sub <<= 3;
! 356: + if (low - sub > low) high--;
! 357: + low -= sub;
! 358: +
! 359: + SIVALu(state, 32*4, low);
! 360: + SIVALu(state, 33*4, high);
! 361: +
! 362: + MD5_CTX md;
! 363: + MD5_Init(&md);
! 364: + MD5_Update(&md, state, 34*4);
! 365: + MD5_Final(digest, &md);
! 366: +}
! 367: diff --git a/lib/mdigest.h b/lib/mdigest.h
! 368: --- a/lib/mdigest.h
! 369: +++ b/lib/mdigest.h
! 370: @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
! 371: void md5_update(md_context *ctx, const uchar *input, uint32 length);
! 372: void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
! 373: #endif
! 374: +
! 375: +typedef struct {
! 376: + uchar context_storage[1024];
! 377: + uchar buffer[512];
! 378: + unsigned int used;
! 379: + unsigned int next;
! 380: +} MD5P8_CTX;
! 381: +
! 382: +void MD5P8_Init(MD5P8_CTX *ctx);
! 383: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
! 384: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
! 385: diff --git a/match.c b/match.c
! 386: --- a/match.c
! 387: +++ b/match.c
! 388: @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
! 389: if (DEBUG_GTE(DELTASUM, 3))
! 390: rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
! 391:
! 392: + checksum2_enable_prefetch(buf, len, s->blength);
! 393: +
! 394: offset = aligned_offset = aligned_i = 0;
! 395:
! 396: end = len + 1 - s->sums[s->count-1].len;
! 397: @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
! 398:
! 399: if (!done_csum2) {
! 400: map = (schar *)map_ptr(buf,offset,l);
! 401: - get_checksum2((char *)map,l,sum2);
! 402: + get_checksum2((char *)map, l, sum2, offset);
! 403: done_csum2 = 1;
! 404: }
! 405:
! 406: @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
! 407: sum = get_checksum1((char *)map, l);
! 408: if (sum != s->sums[i].sum1)
! 409: goto check_want_i;
! 410: - get_checksum2((char *)map, l, sum2);
! 411: + get_checksum2((char *)map, l, sum2, aligned_offset);
! 412: if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
! 413: goto check_want_i;
! 414: /* OK, we have a re-alignment match. Bump the offset
! 415: @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
! 416: matched(f, s, buf, offset - s->blength, -2);
! 417: } while (++offset < end);
! 418:
! 419: + checksum2_disable_prefetch();
! 420: +
! 421: matched(f, s, buf, len, -1);
! 422: map_ptr(buf, len-1, 1);
! 423: }
! 424: diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
! 425: --- a/simd-checksum-x86_64.cpp
! 426: +++ b/simd-checksum-x86_64.cpp
! 427: @@ -49,13 +49,33 @@
! 428: * use of the target attribute, selecting the fastest code path based on
! 429: * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
! 430: * GCC 4.x are not supported to ease configure.ac logic.
! 431: + *
! 432: + * ----
! 433: + *
! 434: + * get_checksum2() is optimized for the case where the selected transfer
! 435: + * checksum is MD5. MD5 can't be made significantly faster with SIMD
! 436: + * instructions than the assembly version already included but SIMD
! 437: + * instructions can be used to hash multiple streams in parallel (see
! 438: + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
! 439: + * block-matching algorithm hashes the blocks independently (in contrast to
! 440: + * the whole-file checksum) this method can be employed here.
! 441: + *
! 442: + * To prevent needing to modify the core rsync sources significantly, a
! 443: + * prefetching strategy is used. When a checksum2 is requested, the code
! 444: + * reads ahead several blocks, creates the MD5 hashes for each block in
! 445: + * parallel, returns the hash for the first block, and caches the results
! 446: + * for the other blocks to return in future calls to get_checksum2().
! 447: */
! 448:
! 449: #ifdef __x86_64__
! 450: #ifdef __cplusplus
! 451:
! 452: +extern "C" {
! 453: +
! 454: #include "rsync.h"
! 455:
! 456: +}
! 457: +
! 458: #ifdef HAVE_SIMD
! 459:
! 460: #include <immintrin.h>
! 461: @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
! 462: return get_checksum1_cpp(buf1, len);
! 463: }
! 464:
! 465: -} // extern "C"
! 466: +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
! 467:
! 468: -#ifdef BENCHMARK_SIMD_CHECKSUM1
! 469: +// see simd-md5-parallel-x86_64.cpp
! 470: +extern int md5_parallel_slots();
! 471: +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
! 472: +
! 473: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
! 474: +
! 475: +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
! 476: +
! 477: +#define PREFETCH_ENABLE 1 // debugging
! 478: +
! 479: +#if 0 // debugging
! 480: +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
! 481: +#else
! 482: +#define PREFETCH_PRINTF(f_, ...) (void)0;
! 483: +#endif
! 484: +
! 485: +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
! 486: +#define PREFETCH_MAX_BLOCKS 8
! 487: +
! 488: +typedef struct {
! 489: + int in_use;
! 490: + OFF_T offset;
! 491: + int32 len;
! 492: + char sum[SUM_LENGTH];
! 493: +} prefetch_sum_t;
! 494: +
! 495: +typedef struct {
! 496: + struct map_struct *map;
! 497: + OFF_T len;
! 498: + OFF_T last;
! 499: + int32 blocklen;
! 500: + int blocks;
! 501: + prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
! 502: +} prefetch_t;
! 503: +
! 504: +prefetch_t *prefetch;
! 505: +
! 506: +extern int xfersum_type;
! 507: +extern int checksum_seed;
! 508: +extern int proper_seed_order;
! 509: +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
! 510: +
! 511: +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
! 512: +
! 513: +void checksum2_disable_prefetch()
! 514: +{
! 515: + if (prefetch) {
! 516: + PREFETCH_PRINTF("checksum2_disable_prefetch\n");
! 517: + free(prefetch);
! 518: + prefetch = NULL;
! 519: + }
! 520: +}
! 521: +
! 522: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
! 523: +{
! 524: +#ifdef PREFETCH_ENABLE
! 525: + checksum2_disable_prefetch();
! 526: + int slots = md5_parallel_slots();
! 527: + if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
! 528: + prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
! 529: + memset(prefetch, 0, sizeof(prefetch_t));
! 530: + prefetch->map = map;
! 531: + prefetch->len = len;
! 532: + prefetch->last = 0;
! 533: + prefetch->blocklen = blocklen;
! 534: + prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
! 535: + PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
! 536: + }
! 537: +#endif
! 538: +}
! 539: +
! 540: +static inline void checksum2_reset_prefetch()
! 541: +{
! 542: + for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
! 543: + prefetch->sums[i].in_use = 0;
! 544: + }
! 545: +}
! 546: +
! 547: +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
! 548: +{
! 549: + if (prefetch->sums[0].in_use) {
! 550: + if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
! 551: + memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
! 552: + for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
! 553: + prefetch->sums[i] = prefetch->sums[i + 1];
! 554: + }
! 555: + prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
! 556: + PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
! 557: + return 1;
! 558: + } else {
! 559: + // unexpected access, reset cache
! 560: + PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
! 561: + checksum2_reset_prefetch();
! 562: + }
! 563: + }
! 564: + return 0;
! 565: +}
! 566: +
! 567: +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
! 568: +{
! 569: + int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
! 570: + if (blocks < 2) return 0; // fall through to non-simd, probably faster
! 571: +
! 572: + int32 total = 0;
! 573: + int i;
! 574: + for (i = 0; i < blocks; i++) {
! 575: + prefetch->sums[i].offset = prefetch_offset + total;
! 576: + prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
! 577: + prefetch->sums[i].in_use = 0;
! 578: + total += prefetch->sums[i].len;
! 579: + }
! 580: + for (; i < PREFETCH_MAX_BLOCKS; i++) {
! 581: + prefetch->sums[i].in_use = 0;
! 582: + }
! 583: +
! 584: + uchar seedbuf[4];
! 585: + SIVALu(seedbuf, 0, checksum_seed);
! 586: +
! 587: + PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
! 588: + char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
! 589: + char* bufs[PREFETCH_MAX_BLOCKS] = {0};
! 590: + int lens[PREFETCH_MAX_BLOCKS] = {0};
! 591: + char* sums[PREFETCH_MAX_BLOCKS] = {0};
! 592: + for (i = 0; i < blocks; i++) {
! 593: + bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
! 594: + lens[i] = prefetch->sums[i].len;
! 595: + sums[i] = prefetch->sums[i].sum;
! 596: + }
! 597: + if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
! 598: + for (i = 0; i < blocks; i++) {
! 599: + prefetch->sums[i].in_use = 1;
! 600: + }
! 601: + return 1;
! 602: + } else {
! 603: + // this should never be, abort
! 604: + PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
! 605: + checksum2_disable_prefetch();
! 606: + }
! 607: + return 0;
! 608: +}
! 609: +
! 610: +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
! 611: +{
! 612: + if (prefetch) {
! 613: + PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
! 614: + OFF_T last = prefetch->last;
! 615: + prefetch->last = prefetch_offset;
! 616: + if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
! 617: + // we're looking around trying to align blocks, prefetching will slow things down
! 618: + PREFETCH_PRINTF("get_checksum2 SEEK\n");
! 619: + checksum2_reset_prefetch();
! 620: + } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
! 621: + // hit
! 622: + return;
! 623: + } else if (checksum2_perform_prefetch(prefetch_offset)) {
! 624: + if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
! 625: + // hit; should always be as we just fetched this data
! 626: + return;
! 627: + } else {
! 628: + // this should never be, abort
! 629: + PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
! 630: + checksum2_disable_prefetch();
! 631: + }
! 632: + }
! 633: + }
! 634: + get_checksum2_nosimd(buf, len, sum, prefetch_offset);
! 635: +}
! 636: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
! 637: +
! 638: +} // "C"
! 639: +
! 640: +/* Benchmark compilation
! 641: +
! 642: + The get_checksum1() benchmark runs through all available code paths in a
! 643: + single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
! 644: + recompiled for each code path (it always uses the fastest path available
! 645: + on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
! 646: + be used when applicable regardless of rsync being built with OpenSSL.
! 647: +
! 648: + Something like the following should compile and run the benchmarks:
! 649: +
! 650: + # if gcc
! 651: + export CC=gcc
! 652: + export CXX=g++
! 653: + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
! 654: +
! 655: + # else if clang
! 656: + export CC=clang
! 657: + export CXX=clang++
! 658: + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
! 659: +
! 660: + # /if
! 661: +
! 662: + export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
! 663: + export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
! 664: + export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
! 665: + export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
! 666: +
! 667: + rm bench_csum*
! 668: +
! 669: + ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
! 670: +
! 671: + $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
! 672: +
! 673: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
! 674: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
! 675: +
! 676: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
! 677: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
! 678: +
! 679: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
! 680: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
! 681: +
! 682: + ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
! 683: +
! 684: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
! 685: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
! 686: +
! 687: + ./bench_csum1.all
! 688: + ./bench_csum2.asm
! 689: + ./bench_csum2.openssl
! 690: + ./bench_csum2.sse2
! 691: + ./bench_csum2.avx2
! 692: +
! 693: + */
! 694: +
! 695: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
! 696: #pragma clang optimize off
! 697: #pragma GCC push_options
! 698: #pragma GCC optimize ("O0")
! 699: @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
! 700: #ifndef CLOCK_MONOTONIC_RAW
! 701: #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
! 702: #endif
! 703: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
! 704:
! 705: +#ifdef BENCHMARK_SIMD_CHECKSUM1
! 706: static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
! 707: struct timespec start, end;
! 708: uint64_t us;
! 709: @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
! 710: clock_gettime(CLOCK_MONOTONIC_RAW, &end);
! 711: us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
! 712: cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
! 713: - printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
! 714: + printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
! 715: }
! 716:
! 717: static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
! 718: @@ -533,10 +781,108 @@ int main() {
! 719: free(buf);
! 720: return 0;
! 721: }
! 722: +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
! 723:
! 724: +#ifdef BENCHMARK_SIMD_CHECKSUM2
! 725: +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
! 726: + struct timespec start, end;
! 727: + uint64_t us;
! 728: + unsigned char cs1[16];
! 729: + unsigned char cs2[16];
! 730: + int i;
! 731: +
! 732: + clock_gettime(CLOCK_MONOTONIC_RAW, &start);
! 733: + for (i = 0; i < ROUNDS; i++) {
! 734: + func(buf, len, (char*)cs1);
! 735: + }
! 736: + clock_gettime(CLOCK_MONOTONIC_RAW, &end);
! 737: + us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
! 738: +
! 739: + func2(buf, len, (char*)cs2);
! 740: +
! 741: + float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
! 742: + printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
! 743: + for (i = 0; i < 16; i++) {
! 744: + printf("%02x", cs1[i] & 0xFF);
! 745: + }
! 746: + printf(" :: ");
! 747: + for (i = 0; i < 16; i++) {
! 748: + printf("%02x", cs2[i] & 0xFF);
! 749: + }
! 750: + printf("\n");
! 751: +}
! 752: +
! 753: +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
! 754: + // This should produce the same output for different optimizations
! 755: + // levels, not the same as sanity_check()
! 756: +
! 757: + char* bufs[8] = {0};
! 758: + int lens[8] = {0};
! 759: + char* sums[8] = {0};
! 760: +
! 761: + bufs[0] = buf;
! 762: + lens[0] = len;
! 763: + sums[0] = sum_out;
! 764: + md5_parallel(1, bufs, lens, sums, NULL, NULL);
! 765: +}
! 766: +
! 767: +extern "C" {
! 768: +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
! 769: +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
! 770: +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
! 771: +}
! 772: +
! 773: +static void sanity_check(char* buf, int32 len, char* sum_out) {
! 774: + // This should produce the same output for different optimizations
! 775: + // levels, not the same as benchmark_inner()
! 776: + if (md5_parallel_slots() <= 1) {
! 777: + MD5P8_CTX m5p8;
! 778: + MD5P8_Init_c(&m5p8);
! 779: + MD5P8_Update_c(&m5p8, (uchar *)buf, len);
! 780: + MD5P8_Final_c((uchar *)sum_out, &m5p8);
! 781: + } else {
! 782: + MD5P8_CTX m5p8;
! 783: + MD5P8_Init(&m5p8);
! 784: + MD5P8_Update(&m5p8, (uchar *)buf, len);
! 785: + MD5P8_Final((uchar *)sum_out, &m5p8);
! 786: + }
! 787: +}
! 788: +
! 789: +int main() {
! 790: + // This benchmarks the parallel MD5 checksum rather than get_checksum2()
! 791: + // as the latter would require compiling in a lot of rsync's code, but
! 792: + // it touches all the same internals so the performance should be nearly
! 793: + // identical.
! 794: +
! 795: + int i;
! 796: + char* buf = (char*)malloc(BLOCK_LEN);
! 797: + for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
! 798: +
! 799: + const char* method = "?";
! 800: + switch (md5_parallel_slots()) {
! 801: + case 8: method = "AVX2"; break;
! 802: + case 4: method = "SSE2"; break;
! 803: +#ifdef USE_OPENSSL
! 804: + case 1: method = "OpenSSL"; break;
! 805: +#elif (CSUM_CHUNK == 64)
! 806: + case 1: method = "ASM"; break;
! 807: +#else
! 808: + // this won't happen unless you modified code somewhere
! 809: + case 1: method = "Raw-C"; break;
! 810: +#endif
! 811: + }
! 812: +
! 813: + benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
! 814: +
! 815: + free(buf);
! 816: + return 0;
! 817: +}
! 818: +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
! 819: +
! 820: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
! 821: #pragma GCC pop_options
! 822: #pragma clang optimize on
! 823: -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
! 824: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
! 825:
! 826: #endif /* HAVE_SIMD */
! 827: #endif /* __cplusplus */
! 828: diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
! 829: new file mode 100644
! 830: --- /dev/null
! 831: +++ b/simd-md5-parallel-x86_64.cpp
! 832: @@ -0,0 +1,1138 @@
! 833: +/*
! 834: + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
! 835: + *
! 836: + * Original author: Nicolas Noble, 2017
! 837: + * Modifications: Jorrit Jongma, 2020
! 838: + *
! 839: + * The original code was released in the public domain by the original author,
! 840: + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
! 841: + * in case public domain does not apply in your country. These modifications
! 842: + * are likewise released in the public domain, with the same MIT license
! 843: + * fallback.
! 844: + *
! 845: + * The original publication can be found at:
! 846: + *
! 847: + * https://github.com/nicolasnoble/sse-hash
! 848: + */
! 849: +/*
! 850: + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
! 851: + * MD5 code has been removed and those code paths rerouted to use the MD5
! 852: + * code already present in rsync, and wrapper functions have been added. The
! 853: + * MD5P8 code is also new, and is the reason for the new stride parameter.
! 854: + *
! 855: + * This code allows multiple independent MD5 streams to be processed in
! 856: + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
! 857: + * lower than that of the original C routines for MD5, the processing of
! 858: + * additional streams is "for free".
! 859: + *
! 860: + * Single streams are rerouted to rsync's normal MD5 code as that is faster
! 861: + * for that case. A further optimization is possible by using SSE2 code on
! 862: + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
! 863: + * implemented here as it would require some restructuring, and in practise
! 864: + * the code here is only rarely called with less than the maximum amount of
! 865: + * streams (typically once at the end of each checksum2'd file).
! 866: + *
! 867: + * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
! 868: + * - Intel Atom D2700 302 334 166 664 N/A N/A
! 869: + * - Intel i7-7700hq 351 376 289 1156 273 2184
! 870: + * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
! 871: + */
! 872: +
! 873: +#ifdef __x86_64__
! 874: +#ifdef __cplusplus
! 875: +
! 876: +extern "C" {
! 877: +
! 878: +#include "rsync.h"
! 879: +
! 880: +}
! 881: +
! 882: +#ifdef HAVE_SIMD
! 883: +
! 884: +#ifndef BENCHMARK_SIMD_CHECKSUM2
! 885: +#define PMD5_ALLOW_SSE2 // debugging
! 886: +#define PMD5_ALLOW_AVX2 // debugging
! 887: +#endif
! 888: +
! 889: +#ifdef PMD5_ALLOW_AVX2
! 890: +#ifndef PMD5_ALLOW_SSE2
! 891: +#define PMD5_ALLOW_SSE2
! 892: +#endif
! 893: +#endif
! 894: +
! 895: +#include <stdint.h>
! 896: +#include <string.h>
! 897: +
! 898: +#include <immintrin.h>
! 899: +
! 900: +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
! 901: +#ifdef __clang__
! 902: +#define MVSTATIC
! 903: +#else
! 904: +#define MVSTATIC static
! 905: +#endif
! 906: +
! 907: +// Missing from the headers on gcc 6 and older, clang 8 and older
! 908: +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
! 909: +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
! 910: +
! 911: +#define PMD5_SLOTS_DEFAULT 0
! 912: +#define PMD5_SLOTS_SSE2 4
! 913: +#define PMD5_SLOTS_AVX2 8
! 914: +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
! 915: +
! 916: +#ifdef PMD5_ALLOW_SSE2
! 917: +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
! 918: +{
! 919: + return PMD5_SLOTS_SSE2;
! 920: +}
! 921: +#endif
! 922: +
! 923: +#ifdef PMD5_ALLOW_AVX2
! 924: +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
! 925: +{
! 926: + return PMD5_SLOTS_AVX2;
! 927: +}
! 928: +#endif
! 929: +
! 930: +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
! 931: +{
! 932: + return PMD5_SLOTS_DEFAULT;
! 933: +}
! 934: +
! 935: +/* The parallel MD5 context structure. */
! 936: +typedef struct {
! 937: + __m128i state_sse2[4];
! 938: + __m256i state_avx2[4];
! 939: + uint64_t len[PMD5_SLOTS_MAX];
! 940: +} pmd5_context;
! 941: +
! 942: +/* The status returned by the various functions below. */
! 943: +typedef enum {
! 944: + PMD5_SUCCESS,
! 945: + PMD5_INVALID_SLOT,
! 946: + PMD5_UNALIGNED_UPDATE,
! 947: +} pmd5_status;
! 948: +
! 949: +/* Initializes all slots in the given pmd5 context. */
! 950: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
! 951: +
! 952: +/* Initializes a single slot out in the given pmd5 context. */
! 953: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
! 954: +
! 955: +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
! 956: + The stream pointers will be incremented accordingly.
! 957: + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
! 958: + The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
! 959: + Stride defaults to 64 if 0 is passed. */
! 960: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
! 961: +
! 962: +/* Makes an MD5 update on all slots in parallel, given different lengths.
! 963: + The stream pointers will be incremented accordingly.
! 964: + The lengths will be decreased accordingly. Not all data might be consumed.
! 965: + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
! 966: + The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
! 967: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
! 968: +
! 969: +/* Finishes all slots at once. Fills in all digests. */
! 970: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
! 971: +
! 972: +/* Finishes one slot. The other slots will be unnaffected. The finished slot can then continue to hash garbage using
! 973: + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
! 974: +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
! 975: +
! 976: +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
! 977: + multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
! 978: + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
! 979: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
! 980: +
! 981: +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
! 982: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
! 983: +
! 984: +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
! 985: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
! 986: +
! 987: +#define S11 7
! 988: +#define S12 12
! 989: +#define S13 17
! 990: +#define S14 22
! 991: +#define S21 5
! 992: +#define S22 9
! 993: +#define S23 14
! 994: +#define S24 20
! 995: +#define S31 4
! 996: +#define S32 11
! 997: +#define S33 16
! 998: +#define S34 23
! 999: +#define S41 6
! 1000: +#define S42 10
! 1001: +#define S43 15
! 1002: +#define S44 21
! 1003: +
! 1004: +#define T1 0xD76AA478
! 1005: +#define T2 0xE8C7B756
! 1006: +#define T3 0x242070DB
! 1007: +#define T4 0xC1BDCEEE
! 1008: +#define T5 0xF57C0FAF
! 1009: +#define T6 0x4787C62A
! 1010: +#define T7 0xA8304613
! 1011: +#define T8 0xFD469501
! 1012: +#define T9 0x698098D8
! 1013: +#define T10 0x8B44F7AF
! 1014: +#define T11 0xFFFF5BB1
! 1015: +#define T12 0x895CD7BE
! 1016: +#define T13 0x6B901122
! 1017: +#define T14 0xFD987193
! 1018: +#define T15 0xA679438E
! 1019: +#define T16 0x49B40821
! 1020: +#define T17 0xF61E2562
! 1021: +#define T18 0xC040B340
! 1022: +#define T19 0x265E5A51
! 1023: +#define T20 0xE9B6C7AA
! 1024: +#define T21 0xD62F105D
! 1025: +#define T22 0x02441453
! 1026: +#define T23 0xD8A1E681
! 1027: +#define T24 0xE7D3FBC8
! 1028: +#define T25 0x21E1CDE6
! 1029: +#define T26 0xC33707D6
! 1030: +#define T27 0xF4D50D87
! 1031: +#define T28 0x455A14ED
! 1032: +#define T29 0xA9E3E905
! 1033: +#define T30 0xFCEFA3F8
! 1034: +#define T31 0x676F02D9
! 1035: +#define T32 0x8D2A4C8A
! 1036: +#define T33 0xFFFA3942
! 1037: +#define T34 0x8771F681
! 1038: +#define T35 0x6D9D6122
! 1039: +#define T36 0xFDE5380C
! 1040: +#define T37 0xA4BEEA44
! 1041: +#define T38 0x4BDECFA9
! 1042: +#define T39 0xF6BB4B60
! 1043: +#define T40 0xBEBFBC70
! 1044: +#define T41 0x289B7EC6
! 1045: +#define T42 0xEAA127FA
! 1046: +#define T43 0xD4EF3085
! 1047: +#define T44 0x04881D05
! 1048: +#define T45 0xD9D4D039
! 1049: +#define T46 0xE6DB99E5
! 1050: +#define T47 0x1FA27CF8
! 1051: +#define T48 0xC4AC5665
! 1052: +#define T49 0xF4292244
! 1053: +#define T50 0x432AFF97
! 1054: +#define T51 0xAB9423A7
! 1055: +#define T52 0xFC93A039
! 1056: +#define T53 0x655B59C3
! 1057: +#define T54 0x8F0CCC92
! 1058: +#define T55 0xFFEFF47D
! 1059: +#define T56 0x85845DD1
! 1060: +#define T57 0x6FA87E4F
! 1061: +#define T58 0xFE2CE6E0
! 1062: +#define T59 0xA3014314
! 1063: +#define T60 0x4E0811A1
! 1064: +#define T61 0xF7537E82
! 1065: +#define T62 0xBD3AF235
! 1066: +#define T63 0x2AD7D2BB
! 1067: +#define T64 0xEB86D391
! 1068: +
! 1069: +#define ROTL_SSE2(x, n) { \
! 1070: + __m128i s; \
! 1071: + s = _mm_srli_epi32(x, 32 - n); \
! 1072: + x = _mm_slli_epi32(x, n); \
! 1073: + x = _mm_or_si128(x, s); \
! 1074: +};
! 1075: +
! 1076: +#define ROTL_AVX2(x, n) { \
! 1077: + __m256i s; \
! 1078: + s = _mm256_srli_epi32(x, 32 - n); \
! 1079: + x = _mm256_slli_epi32(x, n); \
! 1080: + x = _mm256_or_si256(x, s); \
! 1081: +};
! 1082: +
! 1083: +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
! 1084: +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
! 1085: +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
! 1086: +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
! 1087: +
! 1088: +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
! 1089: +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
! 1090: +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
! 1091: +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
! 1092: +
! 1093: +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
! 1094: + a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
! 1095: + ROTL_SSE2(a, s); \
! 1096: + a = _mm_add_epi32(a, b); \
! 1097: +}
! 1098: +
! 1099: +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
! 1100: + a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
! 1101: + ROTL_AVX2(a, s); \
! 1102: + a = _mm256_add_epi32(a, b); \
! 1103: +}
! 1104: +
! 1105: +#define IA 0x67452301
! 1106: +#define IB 0xefcdab89
! 1107: +#define IC 0x98badcfe
! 1108: +#define ID 0x10325476
! 1109: +
! 1110: +#define GET_MD5_DATA(dest, src, pos) \
! 1111: + dest = \
! 1112: + ((uint32_t) src[pos + 0]) << 0 | \
! 1113: + ((uint32_t) src[pos + 1]) << 8 | \
! 1114: + ((uint32_t) src[pos + 2]) << 16 | \
! 1115: + ((uint32_t) src[pos + 3]) << 24
! 1116: +
! 1117: +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
! 1118: + uint32_t v0, v1, v2, v3; \
! 1119: + GET_MD5_DATA(v0, src[0], pos); \
! 1120: + GET_MD5_DATA(v1, src[1], pos); \
! 1121: + GET_MD5_DATA(v2, src[2], pos); \
! 1122: + GET_MD5_DATA(v3, src[3], pos); \
! 1123: + dest = _mm_setr_epi32(v0, v1, v2, v3); \
! 1124: +}
! 1125: +
! 1126: +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
! 1127: + uint32_t v0, v1, v2, v3; \
! 1128: + uint32_t v4, v5, v6, v7; \
! 1129: + GET_MD5_DATA(v0, src[0], pos); \
! 1130: + GET_MD5_DATA(v1, src[1], pos); \
! 1131: + GET_MD5_DATA(v2, src[2], pos); \
! 1132: + GET_MD5_DATA(v3, src[3], pos); \
! 1133: + GET_MD5_DATA(v4, src[4], pos); \
! 1134: + GET_MD5_DATA(v5, src[5], pos); \
! 1135: + GET_MD5_DATA(v6, src[6], pos); \
! 1136: + GET_MD5_DATA(v7, src[7], pos); \
! 1137: + dest = _mm256_setr_epi32(v0, v1, v2, v3, \
! 1138: + v4, v5, v6, v7); \
! 1139: +}
! 1140: +
! 1141: +#define PUT_MD5_DATA(dest, val, pos) { \
! 1142: + dest[pos + 0] = (val >> 0) & 0xff; \
! 1143: + dest[pos + 1] = (val >> 8) & 0xff; \
! 1144: + dest[pos + 2] = (val >> 16) & 0xff; \
! 1145: + dest[pos + 3] = (val >> 24) & 0xff; \
! 1146: +}
! 1147: +
! 1148: +const static uint8_t md5_padding[64] = {
! 1149: + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1150: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1151: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1152: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1153: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1154: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1155: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1156: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 1157: +};
! 1158: +
! 1159: +#ifdef PMD5_ALLOW_SSE2
! 1160: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 1161: +{
! 1162: + int i;
! 1163: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 1164: + ctx->len[i] = 0;
! 1165: + }
! 1166: +
! 1167: + ctx->state_sse2[0] = _mm_set1_epi32(IA);
! 1168: + ctx->state_sse2[1] = _mm_set1_epi32(IB);
! 1169: + ctx->state_sse2[2] = _mm_set1_epi32(IC);
! 1170: + ctx->state_sse2[3] = _mm_set1_epi32(ID);
! 1171: +
! 1172: + return PMD5_SUCCESS;
! 1173: +}
! 1174: +#endif
! 1175: +
! 1176: +#ifdef PMD5_ALLOW_AVX2
! 1177: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 1178: +{
! 1179: + int i;
! 1180: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 1181: + ctx->len[i] = 0;
! 1182: + }
! 1183: +
! 1184: + ctx->state_avx2[0] = _mm256_set1_epi32(IA);
! 1185: + ctx->state_avx2[1] = _mm256_set1_epi32(IB);
! 1186: + ctx->state_avx2[2] = _mm256_set1_epi32(IC);
! 1187: + ctx->state_avx2[3] = _mm256_set1_epi32(ID);
! 1188: +
! 1189: + return PMD5_SUCCESS;
! 1190: +}
! 1191: +#endif
! 1192: +
! 1193: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 1194: +{
! 1195: + return PMD5_INVALID_SLOT;
! 1196: +}
! 1197: +
! 1198: +#ifdef PMD5_ALLOW_SSE2
! 1199: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 1200: +{
! 1201: + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
! 1202: + return PMD5_INVALID_SLOT;
! 1203: +
! 1204: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
! 1205: + int i;
! 1206: +
! 1207: + for (i = 0; i < 4; i++) {
! 1208: + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
! 1209: + }
! 1210: +
! 1211: + v[0][slot] = a;
! 1212: + v[1][slot] = b;
! 1213: + v[2][slot] = c;
! 1214: + v[3][slot] = d;
! 1215: +
! 1216: + for (i = 0; i < 4; i++) {
! 1217: + ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
! 1218: + }
! 1219: +
! 1220: + return PMD5_SUCCESS;
! 1221: +}
! 1222: +#endif
! 1223: +
! 1224: +#ifdef PMD5_ALLOW_AVX2
! 1225: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 1226: +{
! 1227: + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
! 1228: + return PMD5_INVALID_SLOT;
! 1229: +
! 1230: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
! 1231: + int i;
! 1232: +
! 1233: + for (i = 0; i < 4; i++) {
! 1234: + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
! 1235: + }
! 1236: +
! 1237: + v[0][slot] = a;
! 1238: + v[1][slot] = b;
! 1239: + v[2][slot] = c;
! 1240: + v[3][slot] = d;
! 1241: +
! 1242: + for (i = 0; i < 4; i++) {
! 1243: + ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
! 1244: + }
! 1245: +
! 1246: + return PMD5_SUCCESS;
! 1247: +}
! 1248: +#endif
! 1249: +
! 1250: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 1251: +{
! 1252: + return PMD5_INVALID_SLOT;
! 1253: +}
! 1254: +
! 1255: +#ifdef PMD5_ALLOW_SSE2
! 1256: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 1257: +{
! 1258: + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
! 1259: + return PMD5_INVALID_SLOT;
! 1260: +
! 1261: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
! 1262: + int i;
! 1263: +
! 1264: + for (i = 0; i < 4; i++) {
! 1265: + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
! 1266: + }
! 1267: +
! 1268: + *a = v[0][slot];
! 1269: + *b = v[1][slot];
! 1270: + *c = v[2][slot];
! 1271: + *d = v[3][slot];
! 1272: +
! 1273: + return PMD5_SUCCESS;
! 1274: +}
! 1275: +#endif
! 1276: +
! 1277: +#ifdef PMD5_ALLOW_AVX2
! 1278: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 1279: +{
! 1280: + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
! 1281: + return PMD5_INVALID_SLOT;
! 1282: +
! 1283: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
! 1284: + int i;
! 1285: +
! 1286: + for (i = 0; i < 4; i++) {
! 1287: + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
! 1288: + }
! 1289: +
! 1290: + *a = v[0][slot];
! 1291: + *b = v[1][slot];
! 1292: + *c = v[2][slot];
! 1293: + *d = v[3][slot];
! 1294: +
! 1295: + return PMD5_SUCCESS;
! 1296: +}
! 1297: +#endif
! 1298: +
! 1299: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 1300: +{
! 1301: + return PMD5_INVALID_SLOT;
! 1302: +}
! 1303: +
! 1304: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
! 1305: +{
! 1306: + return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
! 1307: +}
! 1308: +
! 1309: +#ifdef PMD5_ALLOW_SSE2
! 1310: +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 1311: +{
! 1312: + __m128i W[MD5_DIGEST_LEN], a, b, c, d;
! 1313: +
! 1314: + GET_PMD5_DATA_SSE2(W[ 0], data, 0);
! 1315: + GET_PMD5_DATA_SSE2(W[ 1], data, 4);
! 1316: + GET_PMD5_DATA_SSE2(W[ 2], data, 8);
! 1317: + GET_PMD5_DATA_SSE2(W[ 3], data, 12);
! 1318: + GET_PMD5_DATA_SSE2(W[ 4], data, 16);
! 1319: + GET_PMD5_DATA_SSE2(W[ 5], data, 20);
! 1320: + GET_PMD5_DATA_SSE2(W[ 6], data, 24);
! 1321: + GET_PMD5_DATA_SSE2(W[ 7], data, 28);
! 1322: + GET_PMD5_DATA_SSE2(W[ 8], data, 32);
! 1323: + GET_PMD5_DATA_SSE2(W[ 9], data, 36);
! 1324: + GET_PMD5_DATA_SSE2(W[10], data, 40);
! 1325: + GET_PMD5_DATA_SSE2(W[11], data, 44);
! 1326: + GET_PMD5_DATA_SSE2(W[12], data, 48);
! 1327: + GET_PMD5_DATA_SSE2(W[13], data, 52);
! 1328: + GET_PMD5_DATA_SSE2(W[14], data, 56);
! 1329: + GET_PMD5_DATA_SSE2(W[15], data, 60);
! 1330: +
! 1331: + a = ctx->state_sse2[0];
! 1332: + b = ctx->state_sse2[1];
! 1333: + c = ctx->state_sse2[2];
! 1334: + d = ctx->state_sse2[3];
! 1335: +
! 1336: + SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
! 1337: + SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
! 1338: + SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
! 1339: + SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
! 1340: + SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
! 1341: + SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
! 1342: + SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
! 1343: + SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
! 1344: + SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
! 1345: + SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
! 1346: + SET_SSE2(F, c, d, a, b, W[10], S13, 11);
! 1347: + SET_SSE2(F, b, c, d, a, W[11], S14, 12);
! 1348: + SET_SSE2(F, a, b, c, d, W[12], S11, 13);
! 1349: + SET_SSE2(F, d, a, b, c, W[13], S12, 14);
! 1350: + SET_SSE2(F, c, d, a, b, W[14], S13, 15);
! 1351: + SET_SSE2(F, b, c, d, a, W[15], S14, 16);
! 1352: +
! 1353: + SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
! 1354: + SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
! 1355: + SET_SSE2(G, c, d, a, b, W[11], S23, 19);
! 1356: + SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
! 1357: + SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
! 1358: + SET_SSE2(G, d, a, b, c, W[10], S22, 22);
! 1359: + SET_SSE2(G, c, d, a, b, W[15], S23, 23);
! 1360: + SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
! 1361: + SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
! 1362: + SET_SSE2(G, d, a, b, c, W[14], S22, 26);
! 1363: + SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
! 1364: + SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
! 1365: + SET_SSE2(G, a, b, c, d, W[13], S21, 29);
! 1366: + SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
! 1367: + SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
! 1368: + SET_SSE2(G, b, c, d, a, W[12], S24, 32);
! 1369: +
! 1370: + SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
! 1371: + SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
! 1372: + SET_SSE2(H, c, d, a, b, W[11], S33, 35);
! 1373: + SET_SSE2(H, b, c, d, a, W[14], S34, 36);
! 1374: + SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
! 1375: + SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
! 1376: + SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
! 1377: + SET_SSE2(H, b, c, d, a, W[10], S34, 40);
! 1378: + SET_SSE2(H, a, b, c, d, W[13], S31, 41);
! 1379: + SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
! 1380: + SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
! 1381: + SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
! 1382: + SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
! 1383: + SET_SSE2(H, d, a, b, c, W[12], S32, 46);
! 1384: + SET_SSE2(H, c, d, a, b, W[15], S33, 47);
! 1385: + SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
! 1386: +
! 1387: + SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
! 1388: + SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
! 1389: + SET_SSE2(I, c, d, a, b, W[14], S43, 51);
! 1390: + SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
! 1391: + SET_SSE2(I, a, b, c, d, W[12], S41, 53);
! 1392: + SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
! 1393: + SET_SSE2(I, c, d, a, b, W[10], S43, 55);
! 1394: + SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
! 1395: + SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
! 1396: + SET_SSE2(I, d, a, b, c, W[15], S42, 58);
! 1397: + SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
! 1398: + SET_SSE2(I, b, c, d, a, W[13], S44, 60);
! 1399: + SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
! 1400: + SET_SSE2(I, d, a, b, c, W[11], S42, 62);
! 1401: + SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
! 1402: + SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
! 1403: +
! 1404: + ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
! 1405: + ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
! 1406: + ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
! 1407: + ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
! 1408: +}
! 1409: +#endif
! 1410: +
! 1411: +#ifdef PMD5_ALLOW_AVX2
! 1412: +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 1413: +{
! 1414: + __m256i W[MD5_DIGEST_LEN], a, b, c, d;
! 1415: +
! 1416: + GET_PMD5_DATA_AVX2(W[ 0], data, 0);
! 1417: + GET_PMD5_DATA_AVX2(W[ 1], data, 4);
! 1418: + GET_PMD5_DATA_AVX2(W[ 2], data, 8);
! 1419: + GET_PMD5_DATA_AVX2(W[ 3], data, 12);
! 1420: + GET_PMD5_DATA_AVX2(W[ 4], data, 16);
! 1421: + GET_PMD5_DATA_AVX2(W[ 5], data, 20);
! 1422: + GET_PMD5_DATA_AVX2(W[ 6], data, 24);
! 1423: + GET_PMD5_DATA_AVX2(W[ 7], data, 28);
! 1424: + GET_PMD5_DATA_AVX2(W[ 8], data, 32);
! 1425: + GET_PMD5_DATA_AVX2(W[ 9], data, 36);
! 1426: + GET_PMD5_DATA_AVX2(W[10], data, 40);
! 1427: + GET_PMD5_DATA_AVX2(W[11], data, 44);
! 1428: + GET_PMD5_DATA_AVX2(W[12], data, 48);
! 1429: + GET_PMD5_DATA_AVX2(W[13], data, 52);
! 1430: + GET_PMD5_DATA_AVX2(W[14], data, 56);
! 1431: + GET_PMD5_DATA_AVX2(W[15], data, 60);
! 1432: +
! 1433: + a = ctx->state_avx2[0];
! 1434: + b = ctx->state_avx2[1];
! 1435: + c = ctx->state_avx2[2];
! 1436: + d = ctx->state_avx2[3];
! 1437: +
! 1438: + SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
! 1439: + SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
! 1440: + SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
! 1441: + SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
! 1442: + SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
! 1443: + SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
! 1444: + SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
! 1445: + SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
! 1446: + SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
! 1447: + SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
! 1448: + SET_AVX2(F, c, d, a, b, W[10], S13, 11);
! 1449: + SET_AVX2(F, b, c, d, a, W[11], S14, 12);
! 1450: + SET_AVX2(F, a, b, c, d, W[12], S11, 13);
! 1451: + SET_AVX2(F, d, a, b, c, W[13], S12, 14);
! 1452: + SET_AVX2(F, c, d, a, b, W[14], S13, 15);
! 1453: + SET_AVX2(F, b, c, d, a, W[15], S14, 16);
! 1454: +
! 1455: + SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
! 1456: + SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
! 1457: + SET_AVX2(G, c, d, a, b, W[11], S23, 19);
! 1458: + SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
! 1459: + SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
! 1460: + SET_AVX2(G, d, a, b, c, W[10], S22, 22);
! 1461: + SET_AVX2(G, c, d, a, b, W[15], S23, 23);
! 1462: + SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
! 1463: + SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
! 1464: + SET_AVX2(G, d, a, b, c, W[14], S22, 26);
! 1465: + SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
! 1466: + SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
! 1467: + SET_AVX2(G, a, b, c, d, W[13], S21, 29);
! 1468: + SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
! 1469: + SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
! 1470: + SET_AVX2(G, b, c, d, a, W[12], S24, 32);
! 1471: +
! 1472: + SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
! 1473: + SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
! 1474: + SET_AVX2(H, c, d, a, b, W[11], S33, 35);
! 1475: + SET_AVX2(H, b, c, d, a, W[14], S34, 36);
! 1476: + SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
! 1477: + SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
! 1478: + SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
! 1479: + SET_AVX2(H, b, c, d, a, W[10], S34, 40);
! 1480: + SET_AVX2(H, a, b, c, d, W[13], S31, 41);
! 1481: + SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
! 1482: + SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
! 1483: + SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
! 1484: + SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
! 1485: + SET_AVX2(H, d, a, b, c, W[12], S32, 46);
! 1486: + SET_AVX2(H, c, d, a, b, W[15], S33, 47);
! 1487: + SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
! 1488: +
! 1489: + SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
! 1490: + SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
! 1491: + SET_AVX2(I, c, d, a, b, W[14], S43, 51);
! 1492: + SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
! 1493: + SET_AVX2(I, a, b, c, d, W[12], S41, 53);
! 1494: + SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
! 1495: + SET_AVX2(I, c, d, a, b, W[10], S43, 55);
! 1496: + SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
! 1497: + SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
! 1498: + SET_AVX2(I, d, a, b, c, W[15], S42, 58);
! 1499: + SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
! 1500: + SET_AVX2(I, b, c, d, a, W[13], S44, 60);
! 1501: + SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
! 1502: + SET_AVX2(I, d, a, b, c, W[11], S42, 62);
! 1503: + SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
! 1504: + SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
! 1505: +
! 1506: + ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
! 1507: + ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
! 1508: + ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
! 1509: + ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
! 1510: +}
! 1511: +#endif
! 1512: +
! 1513: +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 1514: +{
! 1515: +}
! 1516: +
! 1517: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
! 1518: +{
! 1519: + const uint8_t * ptrs[PMD5_SLOTS_MAX];
! 1520: +
! 1521: + if (!length) return PMD5_SUCCESS;
! 1522: +
! 1523: + int slots = pmd5_slots();
! 1524: +
! 1525: + if (!stride) stride = 64;
! 1526: +
! 1527: + int i;
! 1528: + for (i = 0; i < slots; i++) {
! 1529: + ptrs[i] = data[i];
! 1530: + ctx->len[i] += length;
! 1531: + if (!ptrs[i]) ptrs[i] = md5_padding;
! 1532: + }
! 1533: +
! 1534: + while (length >= 64) {
! 1535: + pmd5_process(ctx, ptrs);
! 1536: + length -= 64;
! 1537: + for (i = 0; i < slots; i++) {
! 1538: + if (data[i]) ptrs[i] += stride;
! 1539: + }
! 1540: + }
! 1541: +
! 1542: + if (length) return PMD5_UNALIGNED_UPDATE;
! 1543: +
! 1544: + for (i = 0; i < slots; i++) {
! 1545: + if (data[i]) data[i] = ptrs[i];
! 1546: + }
! 1547: +
! 1548: + return PMD5_SUCCESS;
! 1549: +}
! 1550: +
! 1551: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
! 1552: +{
! 1553: + uint64_t length = 0;
! 1554: + int slots = pmd5_slots();
! 1555: +
! 1556: + int i;
! 1557: + for (i = 0; i < slots; i++) {
! 1558: + if ((length == 0) || (lengths[i] < length)) length = lengths[i];
! 1559: + }
! 1560: +
! 1561: + for (i = 0; i < slots; i++) {
! 1562: + lengths[i] -= length;
! 1563: + }
! 1564: +
! 1565: + return pmd5_update_all_simple(ctx, data, length, 0);
! 1566: +}
! 1567: +
! 1568: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
! 1569: +{
! 1570: + MD5_CTX ctx;
! 1571: +
! 1572: + if ((slot >= pmd5_slots()) || (slot < 0))
! 1573: + return PMD5_INVALID_SLOT;
! 1574: +
! 1575: + pmd5_to_md5(pctx, &ctx, slot);
! 1576: + if (data && length) {
! 1577: + MD5_Update(&ctx, data, length);
! 1578: + }
! 1579: + MD5_Final(digest, &ctx);
! 1580: +
! 1581: + return PMD5_SUCCESS;
! 1582: +}
! 1583: +
! 1584: +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
! 1585: +{
! 1586: + return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
! 1587: +}
! 1588: +
! 1589: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
! 1590: +{
! 1591: + int i;
! 1592: + for (i = 0; i < pmd5_slots(); i++) {
! 1593: + pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
! 1594: + }
! 1595: + return PMD5_SUCCESS;
! 1596: +}
! 1597: +
! 1598: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
! 1599: +{
! 1600: + if ((slot >= pmd5_slots()) || (slot < 0))
! 1601: + return PMD5_INVALID_SLOT;
! 1602: +
! 1603: + // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
! 1604: +
! 1605: +#ifdef USE_OPENSSL
! 1606: + pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
! 1607: +#else
! 1608: + pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
! 1609: +#endif
! 1610: + return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
! 1611: +}
! 1612: +
! 1613: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
! 1614: +{
! 1615: + if ((slot >= pmd5_slots()) || (slot < 0))
! 1616: + return PMD5_INVALID_SLOT;
! 1617: +
! 1618: + MD5_Init(ctx);
! 1619: +
! 1620: +#ifdef USE_OPENSSL
! 1621: + ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
! 1622: + ctx->Nh = pctx->len[slot] >> 29;
! 1623: +
! 1624: + uint32_t a, b, c, d;
! 1625: + pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
! 1626: + if (ret == PMD5_SUCCESS) {
! 1627: + ctx->A = a;
! 1628: + ctx->B = b;
! 1629: + ctx->C = c;
! 1630: + ctx->D = d;
! 1631: + }
! 1632: + return ret;
! 1633: +#else
! 1634: + ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
! 1635: + ctx->totalN2 = pctx->len[slot] >> 32;
! 1636: + return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
! 1637: +#endif
! 1638: +}
! 1639: +
! 1640: +/* With GCC 10 putting these implementations inside 'extern "C"' causes an
! 1641: + assembler error. That worked fine on GCC 5-9 and clang 6-10...
! 1642: + */
! 1643: +
! 1644: +static inline int md5_parallel_slots_cpp()
! 1645: +{
! 1646: + int slots = pmd5_slots();
! 1647: + if (slots == 0) return 1;
! 1648: + return slots;
! 1649: +}
! 1650: +
! 1651: +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
! 1652: +{
! 1653: + int slots = md5_parallel_slots_cpp();
! 1654: + if ((streams < 1) || (streams > slots)) return 0;
! 1655: + if (pre4 && post4) return 0;
! 1656: +
! 1657: + if (slots == 1) {
! 1658: + MD5_CTX ctx;
! 1659: + MD5_Init(&ctx);
! 1660: + if (pre4) {
! 1661: + MD5_Update(&ctx, (const unsigned char*)pre4, 4);
! 1662: + }
! 1663: + MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
! 1664: + if (post4) {
! 1665: + MD5_Update(&ctx, (const unsigned char*)post4, 4);
! 1666: + }
! 1667: + if (sum[0]) {
! 1668: + MD5_Final((uint8_t*)sum[0], &ctx);
! 1669: + }
! 1670: + return 0;
! 1671: + }
! 1672: +
! 1673: + int i;
! 1674: + int active[PMD5_SLOTS_MAX];
! 1675: + char* buffers[PMD5_SLOTS_MAX];
! 1676: + uint64_t left[PMD5_SLOTS_MAX];
! 1677: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 1678: + active[i] = streams > i;
! 1679: + if (i < streams) {
! 1680: + buffers[i] = buf[i];
! 1681: + left[i] = (uint64_t)len[i];
! 1682: + } else {
! 1683: + buffers[i] = NULL;
! 1684: + left[i] = 0;
! 1685: + }
! 1686: + }
! 1687: + MD5_CTX results[PMD5_SLOTS_MAX];
! 1688: +
! 1689: + pmd5_context ctx_simd;
! 1690: + if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
! 1691: +
! 1692: + if (pre4) {
! 1693: + char temp_buffers[PMD5_SLOTS_MAX][64];
! 1694: + int have_any = 0;
! 1695: + for (i = 0; i < slots; i++) {
! 1696: + if (active[i]) {
! 1697: + if (left[i] < 60) {
! 1698: + MD5_Init(&results[i]);
! 1699: + MD5_Update(&results[i], (const unsigned char*)pre4, 4);
! 1700: + MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
! 1701: + active[i] = 0;
! 1702: + left[i] = 0;
! 1703: + } else {
! 1704: + memcpy(temp_buffers[i], pre4, 4);
! 1705: + memcpy(temp_buffers[i] + 4, buffers[i], 60);
! 1706: + buffers[i] += 60;
! 1707: + left[i] -= 60;
! 1708: + have_any = 1;
! 1709: + }
! 1710: + }
! 1711: + }
! 1712: +
! 1713: + if (have_any) {
! 1714: + char* ptrs[PMD5_SLOTS_MAX];
! 1715: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 1716: + ptrs[i] = &temp_buffers[i][0];
! 1717: + }
! 1718: + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
! 1719: + return 0;
! 1720: + }
! 1721: + }
! 1722: + }
! 1723: +
! 1724: + int failed = 0;
! 1725: + while (true) {
! 1726: + for (i = 0; i < slots; i++) {
! 1727: + if (active[i] && (left[i] < 64)) {
! 1728: + if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
! 1729: + failed = 1;
! 1730: + }
! 1731: + active[i] = 0;
! 1732: + }
! 1733: + }
! 1734: +
! 1735: + uint64_t shortest = 0;
! 1736: + for (i = 0; i < slots; i++) {
! 1737: + if (!active[i]) {
! 1738: + buffers[i] = NULL;
! 1739: + } else if ((shortest == 0) || (left[i] < shortest)) {
! 1740: + shortest = left[i];
! 1741: + }
! 1742: + }
! 1743: +
! 1744: + if (shortest > 0) {
! 1745: + shortest = shortest & ~63;
! 1746: + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
! 1747: + failed = 1;
! 1748: + }
! 1749: + for (i = 0; i < slots; i++) {
! 1750: + if (active[i]) {
! 1751: + left[i] -= shortest;
! 1752: + }
! 1753: + }
! 1754: + }
! 1755: +
! 1756: + if (failed) {
! 1757: + return 0;
! 1758: + } else {
! 1759: + int have_any = 0;
! 1760: + for (i = 0; i < slots; i++) {
! 1761: + have_any |= active[i];
! 1762: + }
! 1763: + if (!have_any) {
! 1764: + break;
! 1765: + }
! 1766: + }
! 1767: + }
! 1768: +
! 1769: + for (i = 0; i < slots; i++) {
! 1770: + if (i < streams) {
! 1771: + if (left[i] > 0) {
! 1772: + // buffer[i] == NULL here
! 1773: + MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
! 1774: + }
! 1775: + if (post4) {
! 1776: + MD5_Update(&results[i], (const unsigned char*)post4, 4);
! 1777: + }
! 1778: + if (sum[i]) {
! 1779: + MD5_Final((uint8_t*)sum[i], &results[i]);
! 1780: + }
! 1781: + }
! 1782: + }
! 1783: +
! 1784: + return 1;
! 1785: +}
! 1786: +
! 1787: +// each pmd5_context needs to be 32-byte aligned
! 1788: +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
! 1789: +
! 1790: +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
! 1791: +{
! 1792: + int i;
! 1793: + for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
! 1794: + pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
! 1795: + }
! 1796: + ctx->used = 0;
! 1797: + ctx->next = 0;
! 1798: +}
! 1799: +
! 1800: +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
! 1801: +{
! 1802: + int slots = pmd5_slots();
! 1803: + uint32 pos = 0;
! 1804: +
! 1805: + if ((ctx->used) || (length < 512)) {
! 1806: + int cpy = MIN(length, 512 - ctx->used);
! 1807: + memcpy(&ctx->buffer[ctx->used], input, cpy);
! 1808: + ctx->used += cpy;
! 1809: + length -= cpy;
! 1810: + pos += cpy;
! 1811: +
! 1812: + if (ctx->used == 512) {
! 1813: + if (slots == PMD5_SLOTS_AVX2) {
! 1814: + const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
! 1815: + (uint8_t*)ctx->buffer,
! 1816: + (uint8_t*)(ctx->buffer + 64),
! 1817: + (uint8_t*)(ctx->buffer + 128),
! 1818: + (uint8_t*)(ctx->buffer + 192),
! 1819: + (uint8_t*)(ctx->buffer + 256),
! 1820: + (uint8_t*)(ctx->buffer + 320),
! 1821: + (uint8_t*)(ctx->buffer + 384),
! 1822: + (uint8_t*)(ctx->buffer + 448)
! 1823: + };
! 1824: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
! 1825: + } else {
! 1826: + const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
! 1827: + (uint8_t*)ctx->buffer,
! 1828: + (uint8_t*)(ctx->buffer + 64),
! 1829: + (uint8_t*)(ctx->buffer + 128),
! 1830: + (uint8_t*)(ctx->buffer + 192)
! 1831: + };
! 1832: + const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
! 1833: + (uint8_t*)(ctx->buffer + 256),
! 1834: + (uint8_t*)(ctx->buffer + 320),
! 1835: + (uint8_t*)(ctx->buffer + 384),
! 1836: + (uint8_t*)(ctx->buffer + 448)
! 1837: + };
! 1838: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
! 1839: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
! 1840: + }
! 1841: + ctx->used = 0;
! 1842: + }
! 1843: + }
! 1844: +
! 1845: + if (length >= 512) {
! 1846: + uint32 blocks = length / 512;
! 1847: + if (slots == PMD5_SLOTS_AVX2) {
! 1848: + const uint8_t* ptrs[8] = {
! 1849: + (uint8_t*)(input + pos),
! 1850: + (uint8_t*)(input + pos + 64),
! 1851: + (uint8_t*)(input + pos + 128),
! 1852: + (uint8_t*)(input + pos + 192),
! 1853: + (uint8_t*)(input + pos + 256),
! 1854: + (uint8_t*)(input + pos + 320),
! 1855: + (uint8_t*)(input + pos + 384),
! 1856: + (uint8_t*)(input + pos + 448)
! 1857: + };
! 1858: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
! 1859: + } else {
! 1860: + const uint8_t* ptrs1[4] = {
! 1861: + (uint8_t*)(input + pos),
! 1862: + (uint8_t*)(input + pos + 64),
! 1863: + (uint8_t*)(input + pos + 128),
! 1864: + (uint8_t*)(input + pos + 192)
! 1865: + };
! 1866: + const uint8_t* ptrs2[4] = {
! 1867: + (uint8_t*)(input + pos + 256),
! 1868: + (uint8_t*)(input + pos + 320),
! 1869: + (uint8_t*)(input + pos + 384),
! 1870: + (uint8_t*)(input + pos + 448)
! 1871: + };
! 1872: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
! 1873: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
! 1874: + }
! 1875: + pos += blocks * 512;
! 1876: + length -= blocks * 512;
! 1877: + }
! 1878: +
! 1879: + if (length) {
! 1880: + memcpy(ctx->buffer, &input[pos], length);
! 1881: + ctx->used = length;
! 1882: + }
! 1883: +}
! 1884: +
! 1885: +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
! 1886: +{
! 1887: + int i;
! 1888: + uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
! 1889: + if (ctx->used) {
! 1890: + uchar tmp[512];
! 1891: + memset(tmp, 0, 512);
! 1892: + MD5P8_Update(ctx, tmp, 512 - ctx->used);
! 1893: + }
! 1894: +
! 1895: + uchar state[34*4] = {0};
! 1896: +
! 1897: + MD5_CTX tmp;
! 1898: + for (i = 0; i < 8; i++) {
! 1899: + if (pmd5_slots() == PMD5_SLOTS_AVX2) {
! 1900: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
! 1901: + } else if (i < 4) {
! 1902: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
! 1903: + } else {
! 1904: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
! 1905: + }
! 1906: +#ifdef USE_OPENSSL
! 1907: + if (low + tmp.Nl < low) high++;
! 1908: + low += tmp.Nl;
! 1909: + high += tmp.Nh;
! 1910: +#else
! 1911: + if (low + tmp.totalN < low) high++;
! 1912: + low += tmp.totalN;
! 1913: + high += tmp.totalN2;
! 1914: +#endif
! 1915: + SIVALu(state, i*16, tmp.A);
! 1916: + SIVALu(state, i*16 + 4, tmp.B);
! 1917: + SIVALu(state, i*16 + 8, tmp.C);
! 1918: + SIVALu(state, i*16 + 12, tmp.D);
! 1919: + }
! 1920: +
! 1921: +#ifndef USE_OPENSSL
! 1922: + high = (low >> 29) | (high << 3);
! 1923: + low = (low << 3);
! 1924: +#endif
! 1925: +
! 1926: + sub <<= 3;
! 1927: + if (low - sub > low) high--;
! 1928: + low -= sub;
! 1929: +
! 1930: + SIVALu(state, 32*4, low);
! 1931: + SIVALu(state, 33*4, high);
! 1932: +
! 1933: + MD5_CTX md;
! 1934: + MD5_Init(&md);
! 1935: + MD5_Update(&md, state, 34*4);
! 1936: + MD5_Final(digest, &md);
! 1937: +}
! 1938: +
! 1939: +extern "C" {
! 1940: +
! 1941: +int md5_parallel_slots()
! 1942: +{
! 1943: + return md5_parallel_slots_cpp();
! 1944: +}
! 1945: +
! 1946: +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
! 1947: +{
! 1948: + return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
! 1949: +}
! 1950: +
! 1951: +void MD5P8_Init(MD5P8_CTX *ctx)
! 1952: +{
! 1953: + MD5P8_Init_cpp(ctx);
! 1954: +}
! 1955: +
! 1956: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
! 1957: +{
! 1958: + MD5P8_Update_cpp(ctx, input, length);
! 1959: +}
! 1960: +
! 1961: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
! 1962: +{
! 1963: + MD5P8_Final_cpp(digest, ctx);
! 1964: +}
! 1965: +
! 1966: +} // "C"
! 1967: +
! 1968: +#endif /* HAVE_SIMD */
! 1969: +#endif /* __cplusplus */
! 1970: +#endif /* __x86_64__ */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>