File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / patches / md5p8.diff
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (3 years, 3 months ago) by misho
CVS tags: MAIN, HEAD
Initial revision

    1: From: Jorrit Jongma <git@jongma.org>
    2: 
    3: - MD5 optimization in block matching phase:
    4: 
    5: MD5 hashes computed during rsync's block matching phase are independent
    6: and thus possible to process in parallel. This code processes 4 blocks
    7: in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
    8: of performance (or decrease of CPU usage) of up to 6x has been measured.
    9: 
   10: A prefetching algorithm is used to predict and load upcoming blocks, as
   11: this prevents the need for extensive modifications to other parts of
   12: the rsync sources to get this working.
   13: 
   14: This remains compatible with existing rsync builds using MD5 checksums.
   15: 
   16: - MD5P8 whole-file checksum:
   17: 
   18: Splits the input up into 8 independent streams (64-byte interleave), and
   19: produces a final checksum based on the end state of those 8 streams. If
   20: parallelization of MD5 hashing is available, the performance gain (or
   21: CPU usage decrease) is 2x to 6x compared to traditional MD5.
   22: 
   23: The rsync versions on both ends of the connection need MD5P8 support
   24: built-in for it to be used.
   25: 
   26: xxHash is still preferred (and faster), but this provides a reasonably
   27: fast fallback for the case where xxHash libraries are not available at
   28: build time.
   29: 
   30: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
   31: diff --git a/Makefile.in b/Makefile.in
   32: --- a/Makefile.in
   33: +++ b/Makefile.in
   34: @@ -29,14 +29,14 @@ SHELL=/bin/sh
   35:  .SUFFIXES:
   36:  .SUFFIXES: .c .o
   37:  
   38: -SIMD_x86_64=simd-checksum-x86_64.o
   39: +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
   40:  ASM_x86_64=lib/md5-asm-x86_64.o
   41:  
   42:  GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
   43:  	 rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
   44:  HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
   45:  	lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
   46: -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
   47: +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
   48:  	lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
   49:  zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
   50:  	zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
   51: @@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
   52:  simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
   53:  	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
   54:  
   55: +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
   56: +	@$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
   57: +
   58:  lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
   59:  	@$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
   60:  
   61: diff --git a/checksum.c b/checksum.c
   62: --- a/checksum.c
   63: +++ b/checksum.c
   64: @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
   65:  		{ CSUM_XXH64, "xxh64", NULL },
   66:  		{ CSUM_XXH64, "xxhash", NULL },
   67:  #endif
   68: +		{ CSUM_MD5P8, "md5p8", NULL },
   69:  		{ CSUM_MD5, "md5", NULL },
   70:  		{ CSUM_MD4, "md4", NULL },
   71:  		{ CSUM_NONE, "none", NULL },
   72: @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
   73:  	  case CSUM_MD4_OLD:
   74:  	  case CSUM_MD4_BUSTED:
   75:  		return MD4_DIGEST_LEN;
   76: +	  case CSUM_MD5P8:
   77:  	  case CSUM_MD5:
   78:  		return MD5_DIGEST_LEN;
   79:  	  case CSUM_XXH64:
   80: @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
   81:  	  case CSUM_MD4_BUSTED:
   82:  		break;
   83:  	  case CSUM_MD4:
   84: +	  case CSUM_MD5P8:
   85:  	  case CSUM_MD5:
   86:  		return -1;
   87:  	  case CSUM_XXH64:
   88: @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
   89:  	return 0;
   90:  }
   91:  
   92: -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
   93: +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
   94: +#define get_checksum2 get_checksum2_nosimd
   95: +#else
   96:  /*
   97:    a simple 32 bit checksum that can be updated from either end
   98:    (inspired by Mark Adler's Adler-32 checksum)
   99: @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
  100:  	}
  101:  	return (s1 & 0xffff) + (s2 << 16);
  102:  }
  103: +
  104: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
  105: +{
  106: +}
  107: +
  108: +void checksum2_disable_prefetch()
  109: +{
  110: +}
  111:  #endif
  112:  
  113: -void get_checksum2(char *buf, int32 len, char *sum)
  114: +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
  115: +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
  116:  {
  117:  	switch (xfersum_type) {
  118:  #ifdef SUPPORT_XXHASH
  119: @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
  120:  		break;
  121:  	  }
  122:  #endif
  123: +	  case CSUM_MD5P8:  /* == CSUM_MD5 for checksum2 */
  124:  	  case CSUM_MD5: {
  125:  		MD5_CTX m5;
  126:  		uchar seedbuf[4];
  127: @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
  128:  		break;
  129:  	  }
  130:  #endif
  131: +	  case CSUM_MD5P8: {
  132: +		MD5P8_CTX m5p8;
  133: +
  134: +		MD5P8_Init(&m5p8);
  135: +
  136: +		for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
  137: +			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
  138: +
  139: +		remainder = (int32)(len - i);
  140: +		if (remainder > 0)
  141: +			MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
  142: +
  143: +		MD5P8_Final((uchar *)sum, &m5p8);
  144: +		break;
  145: +	  }
  146:  	  case CSUM_MD5: {
  147:  		MD5_CTX m5;
  148:  
  149: @@ -445,6 +475,7 @@ static union {
  150:  #endif
  151:  	MD5_CTX m5;
  152:  } ctx;
  153: +static MD5P8_CTX m5p8;
  154:  #ifdef SUPPORT_XXHASH
  155:  static XXH64_state_t* xxh64_state;
  156:  #endif
  157: @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
  158:  		XXH3_128bits_reset(xxh3_state);
  159:  		break;
  160:  #endif
  161: +	  case CSUM_MD5P8:
  162: +		MD5P8_Init(&m5p8);
  163: +		break;
  164:  	  case CSUM_MD5:
  165:  		MD5_Init(&ctx.m5);
  166:  		break;
  167: @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
  168:  		XXH3_128bits_update(xxh3_state, p, len);
  169:  		break;
  170:  #endif
  171: +	  case CSUM_MD5P8:
  172: +		MD5P8_Update(&m5p8, (uchar *)p, len);
  173: +		break;
  174:  	  case CSUM_MD5:
  175:  		MD5_Update(&ctx.m5, (uchar *)p, len);
  176:  		break;
  177: @@ -596,6 +633,9 @@ int sum_end(char *sum)
  178:  		break;
  179:  	  }
  180:  #endif
  181: +	  case CSUM_MD5P8:
  182: +		MD5P8_Final((uchar *)sum, &m5p8);
  183: +		break;
  184:  	  case CSUM_MD5:
  185:  		MD5_Final((uchar *)sum, &ctx.m5);
  186:  		break;
  187: diff --git a/generator.c b/generator.c
  188: --- a/generator.c
  189: +++ b/generator.c
  190: @@ -729,10 +729,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  191:  	if (append_mode > 0 && f_copy < 0)
  192:  		return 0;
  193:  
  194: -	if (len > 0)
  195: +	if (len > 0) {
  196:  		mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
  197: -	else
  198: +		checksum2_enable_prefetch(mapbuf, len, sum.blength);
  199: +	} else {
  200:  		mapbuf = NULL;
  201: +	}
  202:  
  203:  	for (i = 0; i < sum.count; i++) {
  204:  		int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
  205: @@ -750,7 +752,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  206:  		}
  207:  
  208:  		sum1 = get_checksum1(map, n1);
  209: -		get_checksum2(map, n1, sum2);
  210: +		get_checksum2(map, n1, sum2, offset - n1);
  211:  
  212:  		if (DEBUG_GTE(DELTASUM, 3)) {
  213:  			rprintf(FINFO,
  214: @@ -762,8 +764,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
  215:  		write_buf(f_out, sum2, sum.s2length);
  216:  	}
  217:  
  218: -	if (mapbuf)
  219: +	if (mapbuf) {
  220:  		unmap_file(mapbuf);
  221: +		checksum2_disable_prefetch();
  222: +	}
  223:  
  224:  	return 0;
  225:  }
  226: diff --git a/lib/md-defines.h b/lib/md-defines.h
  227: --- a/lib/md-defines.h
  228: +++ b/lib/md-defines.h
  229: @@ -15,3 +15,4 @@
  230:  #define CSUM_XXH64 6
  231:  #define CSUM_XXH3_64 7
  232:  #define CSUM_XXH3_128 8
  233: +#define CSUM_MD5P8 9
  234: diff --git a/lib/md5p8.c b/lib/md5p8.c
  235: new file mode 100644
  236: --- /dev/null
  237: +++ b/lib/md5p8.c
  238: @@ -0,0 +1,128 @@
  239: +/*
  240: + * MD5-based hash friendly to parallel processing, reference implementation
  241: + *
  242: + * Author: Jorrit Jongma, 2020
  243: + *
  244: + * Released in the public domain falling back to the MIT license
  245: + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
  246: + * apply in your country.
  247: + */
  248: +/*
  249: + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
  250: + * stream is divided into 8 independent streams. For each 512 bytes of input,
   251: + * the first 64 bytes are sent to the first stream, the second 64 bytes to
  252: + * the second stream, etc. The input stream is padded with zeros to the next
  253: + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
  254: + * containing the A, B, C, and D states of the 8 individual streams, followed
  255: + * by the (unpadded) length of the input.
  256: + *
  257: + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
  258: + * than normal MD5 (particularly on files smaller than 10 kB), but with
  259: + * SIMD-based parallel processing it can be two to six times as fast. Even in
  260: + * the best-case scenario, xxHash is still at least twice as fast and should
  261: + * be preferred when available.
  262: + */
  263: +
  264: +#include "rsync.h"
  265: +
  266: +#ifdef HAVE_SIMD
  267: +#define MD5P8_Init MD5P8_Init_c
  268: +#define MD5P8_Update MD5P8_Update_c
  269: +#define MD5P8_Final MD5P8_Final_c
  270: +#endif
  271: +
  272: +/* each MD5_CTX needs to be 8-byte aligned */
  273: +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
  274: +
  275: +void MD5P8_Init(MD5P8_CTX *ctx)
  276: +{
  277: +    int i;
  278: +    for (i = 0; i < 8; i++) {
  279: +        MD5_Init(MD5P8_Contexts_c(ctx, i));
  280: +    }
  281: +    ctx->used = 0;
  282: +    ctx->next = 0;
  283: +}
  284: +
  285: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
  286: +{
  287: +    uint32 pos = 0;
  288: +
  289: +    if ((ctx->used) || (length < 64)) {
  290: +        int cpy = MIN(length, 64 - ctx->used);
  291: +        memmove(&ctx->buffer[ctx->used], input, cpy);
  292: +        ctx->used += cpy;
  293: +        length -= cpy;
  294: +        pos += cpy;
  295: +
  296: +        if (ctx->used == 64) {
  297: +            MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
  298: +            ctx->used = 0;
  299: +            ctx->next = (ctx->next + 1) % 8;
  300: +        }
  301: +    }
  302: +
  303: +    while (length >= 64) {
  304: +        MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
  305: +        ctx->next = (ctx->next + 1) % 8;
  306: +        pos += 64;
  307: +        length -= 64;
  308: +    }
  309: +
  310: +    if (length) {
  311: +        memcpy(ctx->buffer, &input[pos], length);
  312: +        ctx->used = length;
  313: +    }
  314: +}
  315: +
  316: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
  317: +{
  318: +    int i;
  319: +    uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
  320: +    if (ctx->used) {
  321: +        uchar tmp[64];
  322: +        memset(tmp, 0, 64);
  323: +        MD5P8_Update(ctx, tmp, 64 - ctx->used);
  324: +    }
  325: +    memset(ctx->buffer, 0, 64);
  326: +    while (ctx->next != 0) {
  327: +        MD5P8_Update(ctx, ctx->buffer, 64);
  328: +        sub += 64;
  329: +    }
  330: +
  331: +    uchar state[34*4] = {0};
  332: +
  333: +    for (i = 0; i < 8; i++) {
  334: +        MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
  335: +#ifdef USE_OPENSSL
  336: +        if (low + md->Nl < low) high++;
  337: +        low += md->Nl;
  338: +        high += md->Nh;
  339: +#else
  340: +        if (low + md->totalN < low) high++;
  341: +        low += md->totalN;
  342: +        high += md->totalN2;
  343: +#endif
  344: +        SIVALu(state, i*16, md->A);
  345: +        SIVALu(state, i*16 + 4, md->B);
  346: +        SIVALu(state, i*16 + 8, md->C);
  347: +        SIVALu(state, i*16 + 12, md->D);
  348: +    }
  349: +
  350: +#ifndef USE_OPENSSL
  351: +	high = (low >> 29) | (high << 3);
  352: +	low = (low << 3);
  353: +#endif
  354: +
  355: +    sub <<= 3;
  356: +    if (low - sub > low) high--;
  357: +    low -= sub;
  358: +
  359: +    SIVALu(state, 32*4, low);
  360: +    SIVALu(state, 33*4, high);
  361: +
  362: +    MD5_CTX md;
  363: +    MD5_Init(&md);
  364: +    MD5_Update(&md, state, 34*4);
  365: +    MD5_Final(digest, &md);
  366: +}
  367: diff --git a/lib/mdigest.h b/lib/mdigest.h
  368: --- a/lib/mdigest.h
  369: +++ b/lib/mdigest.h
  370: @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
  371:  void md5_update(md_context *ctx, const uchar *input, uint32 length);
  372:  void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
  373:  #endif
  374: +
  375: +typedef struct {
  376: +    uchar context_storage[1024];
  377: +    uchar buffer[512];
  378: +    unsigned int used;
  379: +    unsigned int next;
  380: +} MD5P8_CTX;
  381: +
  382: +void MD5P8_Init(MD5P8_CTX *ctx);
  383: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
  384: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
  385: diff --git a/match.c b/match.c
  386: --- a/match.c
  387: +++ b/match.c
  388: @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
  389:  	if (DEBUG_GTE(DELTASUM, 3))
  390:  		rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
  391:  
  392: +	checksum2_enable_prefetch(buf, len, s->blength);
  393: +
  394:  	offset = aligned_offset = aligned_i = 0;
  395:  
  396:  	end = len + 1 - s->sums[s->count-1].len;
  397: @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
  398:  
  399:  			if (!done_csum2) {
  400:  				map = (schar *)map_ptr(buf,offset,l);
  401: -				get_checksum2((char *)map,l,sum2);
  402: +				get_checksum2((char *)map, l, sum2, offset);
  403:  				done_csum2 = 1;
  404:  			}
  405:  
  406: @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
  407:  						sum = get_checksum1((char *)map, l);
  408:  						if (sum != s->sums[i].sum1)
  409:  							goto check_want_i;
  410: -						get_checksum2((char *)map, l, sum2);
  411: +						get_checksum2((char *)map, l, sum2, aligned_offset);
  412:  						if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
  413:  							goto check_want_i;
  414:  						/* OK, we have a re-alignment match.  Bump the offset
  415: @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
  416:  			matched(f, s, buf, offset - s->blength, -2);
  417:  	} while (++offset < end);
  418:  
  419: +	checksum2_disable_prefetch();
  420: +
  421:  	matched(f, s, buf, len, -1);
  422:  	map_ptr(buf, len-1, 1);
  423:  }
  424: diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
  425: --- a/simd-checksum-x86_64.cpp
  426: +++ b/simd-checksum-x86_64.cpp
  427: @@ -49,13 +49,33 @@
  428:   * use of the target attribute, selecting the fastest code path based on
  429:   * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
  430:   * GCC 4.x are not supported to ease configure.ac logic.
  431: + *
  432: + * ----
  433: + *
  434: + * get_checksum2() is optimized for the case where the selected transfer
  435: + * checksum is MD5. MD5 can't be made significantly faster with SIMD
  436: + * instructions than the assembly version already included but SIMD
  437: + * instructions can be used to hash multiple streams in parallel (see
  438: + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
  439: + * block-matching algorithm hashes the blocks independently (in contrast to
  440: + * the whole-file checksum) this method can be employed here.
  441: + *
  442: + * To prevent needing to modify the core rsync sources significantly, a
  443: + * prefetching strategy is used. When a checksum2 is requested, the code
  444: + * reads ahead several blocks, creates the MD5 hashes for each block in
  445: + * parallel, returns the hash for the first block, and caches the results
  446: + * for the other blocks to return in future calls to get_checksum2().
  447:   */
  448:  
  449:  #ifdef __x86_64__
  450:  #ifdef __cplusplus
  451:  
  452: +extern "C" {
  453: +
  454:  #include "rsync.h"
  455:  
  456: +}
  457: +
  458:  #ifdef HAVE_SIMD
  459:  
  460:  #include <immintrin.h>
  461: @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
  462:      return get_checksum1_cpp(buf1, len);
  463:  }
  464:  
  465: -} // extern "C"
  466: +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
  467:  
  468: -#ifdef BENCHMARK_SIMD_CHECKSUM1
  469: +// see simd-md5-parallel-x86_64.cpp
  470: +extern int md5_parallel_slots();
  471: +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
  472: +
  473: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
  474: +
  475: +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
  476: +
  477: +#define PREFETCH_ENABLE 1 // debugging
  478: +
  479: +#if 0 // debugging
  480: +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
  481: +#else
  482: +#define PREFETCH_PRINTF(f_, ...) (void)0;
  483: +#endif
  484: +
  485: +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
  486: +#define PREFETCH_MAX_BLOCKS 8
  487: +
  488: +typedef struct {
  489: +    int in_use;
  490: +    OFF_T offset;
  491: +    int32 len;
  492: +    char sum[SUM_LENGTH];
  493: +} prefetch_sum_t;
  494: +
  495: +typedef struct {
  496: +    struct map_struct *map;
  497: +    OFF_T len;
  498: +    OFF_T last;
  499: +    int32 blocklen;
  500: +    int blocks;
  501: +    prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
  502: +} prefetch_t;
  503: +
  504: +prefetch_t *prefetch;
  505: +
  506: +extern int xfersum_type;
  507: +extern int checksum_seed;
  508: +extern int proper_seed_order;
  509: +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
  510: +
  511: +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
  512: +
  513: +void checksum2_disable_prefetch()
  514: +{
  515: +    if (prefetch) {
  516: +        PREFETCH_PRINTF("checksum2_disable_prefetch\n");
  517: +        free(prefetch);
  518: +        prefetch = NULL;
  519: +    }
  520: +}
  521: +
  522: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
  523: +{
  524: +#ifdef PREFETCH_ENABLE
  525: +    checksum2_disable_prefetch();
  526: +    int slots = md5_parallel_slots();
  527: +    if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
  528: +        prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
  529: +        memset(prefetch, 0, sizeof(prefetch_t));
  530: +        prefetch->map = map;
  531: +        prefetch->len = len;
  532: +        prefetch->last = 0;
  533: +        prefetch->blocklen = blocklen;
  534: +        prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
  535: +        PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
  536: +    }
  537: +#endif
  538: +}
  539: +
  540: +static inline void checksum2_reset_prefetch()
  541: +{
  542: +    for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
  543: +        prefetch->sums[i].in_use = 0;
  544: +    }
  545: +}
  546: +
  547: +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
  548: +{
  549: +    if (prefetch->sums[0].in_use) {
  550: +        if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
  551: +            memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
  552: +            for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
  553: +                prefetch->sums[i] = prefetch->sums[i + 1];
  554: +            }
  555: +            prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
  556: +            PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
  557: +            return 1;
  558: +        } else {
  559: +            // unexpected access, reset cache
  560: +            PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
  561: +            checksum2_reset_prefetch();
  562: +        }
  563: +    }
  564: +    return 0;
  565: +}
  566: +
  567: +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
  568: +{
  569: +    int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
  570: +    if (blocks < 2) return 0; // fall through to non-simd, probably faster
  571: +
  572: +    int32 total = 0;
  573: +    int i;
  574: +    for (i = 0; i < blocks; i++) {
  575: +        prefetch->sums[i].offset = prefetch_offset + total;
  576: +        prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
  577: +        prefetch->sums[i].in_use = 0;
  578: +        total += prefetch->sums[i].len;
  579: +    }
  580: +    for (; i < PREFETCH_MAX_BLOCKS; i++) {
  581: +        prefetch->sums[i].in_use = 0;
  582: +    }
  583: +
  584: +    uchar seedbuf[4];
  585: +    SIVALu(seedbuf, 0, checksum_seed);
  586: +
  587: +    PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
  588: +    char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
  589: +    char* bufs[PREFETCH_MAX_BLOCKS] = {0};
  590: +    int lens[PREFETCH_MAX_BLOCKS] = {0};
  591: +    char* sums[PREFETCH_MAX_BLOCKS] = {0};
  592: +    for (i = 0; i < blocks; i++) {
  593: +        bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
  594: +        lens[i] = prefetch->sums[i].len;
  595: +        sums[i] = prefetch->sums[i].sum;
  596: +    }
  597: +    if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
  598: +        for (i = 0; i < blocks; i++) {
  599: +            prefetch->sums[i].in_use = 1;
  600: +        }
  601: +        return 1;
  602: +    } else {
  603: +        // this should never be, abort
  604: +        PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
  605: +        checksum2_disable_prefetch();
  606: +    }
  607: +    return 0;
  608: +}
  609: +
  610: +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
  611: +{
  612: +    if (prefetch) {
  613: +        PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
  614: +        OFF_T last = prefetch->last;
  615: +        prefetch->last = prefetch_offset;
  616: +        if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
  617: +            // we're looking around trying to align blocks, prefetching will slow things down
  618: +            PREFETCH_PRINTF("get_checksum2 SEEK\n");
  619: +            checksum2_reset_prefetch();
  620: +        } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  621: +            // hit
  622: +            return;
  623: +        } else if (checksum2_perform_prefetch(prefetch_offset)) {
  624: +            if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  625: +                // hit; should always be as we just fetched this data
  626: +                return;
  627: +            } else {
  628: +                // this should never be, abort
  629: +                PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
  630: +                checksum2_disable_prefetch();
  631: +            }
  632: +        }
  633: +    }
  634: +    get_checksum2_nosimd(buf, len, sum, prefetch_offset);
  635: +}
  636: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
  637: +
  638: +} // "C"
  639: +
  640: +/* Benchmark compilation
  641: +
  642: +  The get_checksum1() benchmark runs through all available code paths in a
  643: +  single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
  644: +  recompiled for each code path (it always uses the fastest path available
  645: +  on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
  646: +  be used when applicable regardless of rsync being built with OpenSSL.
  647: +
  648: +  Something like the following should compile and run the benchmarks:
  649: +
  650: +  # if gcc
  651: +  export CC=gcc
  652: +  export CXX=g++
  653: +  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
  654: +
  655: +  # else if clang
  656: +  export CC=clang
  657: +  export CXX=clang++
  658: +  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
  659: +
  660: +  # /if
  661: +
  662: +  export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
  663: +  export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
  664: +  export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
  665: +  export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
  666: +
  667: +  rm bench_csum*
  668: +
  669: +  ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  670: +
  671: +  $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
  672: +
  673: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  674: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
  675: +
  676: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
  677: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
  678: +
  679: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
  680: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
  681: +
  682: +  ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  683: +
  684: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  685: +  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
  686: +
  687: +  ./bench_csum1.all
  688: +  ./bench_csum2.asm
  689: +  ./bench_csum2.openssl
  690: +  ./bench_csum2.sse2
  691: +  ./bench_csum2.avx2
  692: +
  693: + */
  694: +
  695: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  696:  #pragma clang optimize off
  697:  #pragma GCC push_options
  698:  #pragma GCC optimize ("O0")
  699: @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
  700:  #ifndef CLOCK_MONOTONIC_RAW
  701:  #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
  702:  #endif
  703: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  704:  
  705: +#ifdef BENCHMARK_SIMD_CHECKSUM1
  706:  static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
  707:      struct timespec start, end;
  708:      uint64_t us;
  709: @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
  710:      clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  711:      us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  712:      cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
  713: -    printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
  714: +    printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
  715:  }
  716:  
  717:  static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
  718: @@ -533,10 +781,108 @@ int main() {
  719:      free(buf);
  720:      return 0;
  721:  }
  722: +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
  723:  
  724: +#ifdef BENCHMARK_SIMD_CHECKSUM2
  725: +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
  726: +    struct timespec start, end;
  727: +    uint64_t us;
  728: +    unsigned char cs1[16];
  729: +    unsigned char cs2[16];
  730: +    int i;
  731: +
  732: +    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  733: +    for (i = 0; i < ROUNDS; i++) {
  734: +        func(buf, len, (char*)cs1);
  735: +    }
  736: +    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  737: +    us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  738: +
  739: +    func2(buf, len, (char*)cs2);
  740: +
  741: +    float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
  742: +    printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
  743: +    for (i = 0; i < 16; i++) {
  744: +        printf("%02x", cs1[i] & 0xFF);
  745: +    }
  746: +    printf(" :: ");
  747: +    for (i = 0; i < 16; i++) {
  748: +        printf("%02x", cs2[i] & 0xFF);
  749: +    }
  750: +    printf("\n");
  751: +}
  752: +
  753: +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
  754: +    // This should produce the same output for different optimizations
  755: +    // levels, not the same as sanity_check()
  756: +
  757: +    char* bufs[8] = {0};
  758: +    int lens[8] = {0};
  759: +    char* sums[8] = {0};
  760: +
  761: +    bufs[0] = buf;
  762: +    lens[0] = len;
  763: +    sums[0] = sum_out;
  764: +    md5_parallel(1, bufs, lens, sums, NULL, NULL);
  765: +}
  766: +
  767: +extern "C" {
  768: +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
  769: +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
  770: +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
  771: +}
  772: +
  773: +static void sanity_check(char* buf, int32 len, char* sum_out) {
  774: +    // This should produce the same output for different optimizations
  775: +    // levels, not the same as benchmark_inner()
  776: +    if (md5_parallel_slots() <= 1) {
  777: +        MD5P8_CTX m5p8;
  778: +        MD5P8_Init_c(&m5p8);
  779: +        MD5P8_Update_c(&m5p8, (uchar *)buf, len);
  780: +        MD5P8_Final_c((uchar *)sum_out, &m5p8);
  781: +    } else {
  782: +        MD5P8_CTX m5p8;
  783: +        MD5P8_Init(&m5p8);
  784: +        MD5P8_Update(&m5p8, (uchar *)buf, len);
  785: +        MD5P8_Final((uchar *)sum_out, &m5p8);
  786: +    }
  787: +}
  788: +
  789: +int main() {
  790: +    // This benchmarks the parallel MD5 checksum rather than get_checksum2()
  791: +    // as the latter would require compiling in a lot of rsync's code, but
  792: +    // it touches all the same internals so the performance should be nearly
  793: +    // identical.
  794: +
  795: +    int i;
  796: +    char* buf = (char*)malloc(BLOCK_LEN);
  797: +    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
  798: +
  799: +    const char* method = "?";
  800: +    switch (md5_parallel_slots()) {
  801: +        case 8: method = "AVX2"; break;
  802: +        case 4: method = "SSE2"; break;
  803: +#ifdef USE_OPENSSL
  804: +        case 1: method = "OpenSSL"; break;
  805: +#elif (CSUM_CHUNK == 64)
  806: +        case 1: method = "ASM"; break;
  807: +#else
  808: +        // this won't happen unless you modified code somewhere
  809: +        case 1: method = "Raw-C"; break;
  810: +#endif
  811: +    }
  812: +
  813: +    benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
  814: +
  815: +    free(buf);
  816: +    return 0;
  817: +}
  818: +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
  819: +
  820: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  821:  #pragma GCC pop_options
  822:  #pragma clang optimize on
  823: -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
  824: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  825:  
  826:  #endif /* HAVE_SIMD */
  827:  #endif /* __cplusplus */
  828: diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
  829: new file mode 100644
  830: --- /dev/null
  831: +++ b/simd-md5-parallel-x86_64.cpp
  832: @@ -0,0 +1,1138 @@
  833: +/*
  834: + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
  835: + *
  836: + * Original author: Nicolas Noble, 2017
  837: + * Modifications:   Jorrit Jongma, 2020
  838: + *
  839: + * The original code was released in the public domain by the original author,
  840: + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
  841: + * in case public domain does not apply in your country. These modifications
  842: + * are likewise released in the public domain, with the same MIT license
  843: + * fallback.
  844: + *
  845: + * The original publication can be found at:
  846: + *
  847: + * https://github.com/nicolasnoble/sse-hash
  848: + */
  849: +/*
  850: + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
  851: + * MD5 code has been removed and those code paths rerouted to use the MD5
  852: + * code already present in rsync, and wrapper functions have been added. The
  853: + * MD5P8 code is also new, and is the reason for the new stride parameter.
  854: + *
  855: + * This code allows multiple independent MD5 streams to be processed in
  856: + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
  857: + * lower than that of the original C routines for MD5, the processing of
  858: + * additional streams is "for free".
  859: + *
  860: + * Single streams are rerouted to rsync's normal MD5 code as that is faster
  861: + * for that case. A further optimization is possible by using SSE2 code on
  862: + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
  863: + * implemented here as it would require some restructuring, and in practice
  864: + * the code here is only rarely called with less than the maximum amount of
  865: + * streams (typically once at the end of each checksum2'd file).
  866: + *
  867: + * Benchmarks (in MB/s)            C     ASM  SSE2*1  SSE2*4  AVX2*1  AVX2*8
  868: + * - Intel Atom D2700            302     334     166     664     N/A     N/A
  869: + * - Intel i7-7700hq             351     376     289    1156     273    2184
  870: + * - AMD ThreadRipper 2950x      728     784     568    2272     430    3440
  871: + */
  872: +
  873: +#ifdef __x86_64__
  874: +#ifdef __cplusplus
  875: +
  876: +extern "C" {
  877: +
  878: +#include "rsync.h"
  879: +
  880: +}
  881: +
  882: +#ifdef HAVE_SIMD
  883: +
  884: +#ifndef BENCHMARK_SIMD_CHECKSUM2
  885: +#define PMD5_ALLOW_SSE2 // debugging
  886: +#define PMD5_ALLOW_AVX2 // debugging
  887: +#endif
  888: +
  889: +#ifdef PMD5_ALLOW_AVX2
  890: +#ifndef PMD5_ALLOW_SSE2
  891: +#define PMD5_ALLOW_SSE2
  892: +#endif
  893: +#endif
  894: +
  895: +#include <stdint.h>
  896: +#include <string.h>
  897: +
  898: +#include <immintrin.h>
  899: +
  900: +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
  901: +#ifdef __clang__
  902: +#define MVSTATIC
  903: +#else
  904: +#define MVSTATIC static
  905: +#endif
  906: +
  907: +// Missing from the headers on gcc 6 and older, clang 8 and older
  908: +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  909: +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
  910: +
  911: +#define PMD5_SLOTS_DEFAULT 0
  912: +#define PMD5_SLOTS_SSE2 4
  913: +#define PMD5_SLOTS_AVX2 8
  914: +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
  915: +
  916: +#ifdef PMD5_ALLOW_SSE2
  917: +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
  918: +{
  919: +    return PMD5_SLOTS_SSE2;
  920: +}
  921: +#endif
  922: +
  923: +#ifdef PMD5_ALLOW_AVX2
  924: +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
  925: +{
  926: +    return PMD5_SLOTS_AVX2;
  927: +}
  928: +#endif
  929: +
  930: +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
  931: +{
  932: +    return PMD5_SLOTS_DEFAULT;
  933: +}
  934: +
  935: +/* The parallel MD5 context structure. */
  936: +typedef struct {
  937: +    __m128i state_sse2[4];
  938: +    __m256i state_avx2[4];
  939: +    uint64_t len[PMD5_SLOTS_MAX];
  940: +} pmd5_context;
  941: +
  942: +/* The status returned by the various functions below. */
  943: +typedef enum {
  944: +    PMD5_SUCCESS,
  945: +    PMD5_INVALID_SLOT,
  946: +    PMD5_UNALIGNED_UPDATE,
  947: +} pmd5_status;
  948: +
  949: +/* Initializes all slots in the given pmd5 context. */
  950: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
  951: +
  952: +/* Initializes a single slot out in the given pmd5 context. */
  953: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
  954: +
  955: +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
  956: +   The stream pointers will be incremented accordingly.
  957: +   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
  958: +   The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
  959: +   Stride defaults to 64 if 0 is passed. */
  960: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
  961: +
  962: +/* Makes an MD5 update on all slots in parallel, given different lengths.
  963: +   The stream pointers will be incremented accordingly.
  964: +   The lengths will be decreased accordingly. Not all data might be consumed.
  965: +   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
  966: +   The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
  967: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
  968: +
  969: +/* Finishes all slots at once. Fills in all digests. */
  970: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
  971: +
  972: +/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
  973: +   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
  974: +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
  975: +
  976: +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
  977: +   multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
  978: +   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
  979: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
  980: +
  981: +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
  982: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
  983: +
  984: +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
  985: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
  986: +
  987: +#define S11  7
  988: +#define S12 12
  989: +#define S13 17
  990: +#define S14 22
  991: +#define S21  5
  992: +#define S22  9
  993: +#define S23 14
  994: +#define S24 20
  995: +#define S31  4
  996: +#define S32 11
  997: +#define S33 16
  998: +#define S34 23
  999: +#define S41  6
 1000: +#define S42 10
 1001: +#define S43 15
 1002: +#define S44 21
 1003: +
 1004: +#define T1  0xD76AA478
 1005: +#define T2  0xE8C7B756
 1006: +#define T3  0x242070DB
 1007: +#define T4  0xC1BDCEEE
 1008: +#define T5  0xF57C0FAF
 1009: +#define T6  0x4787C62A
 1010: +#define T7  0xA8304613
 1011: +#define T8  0xFD469501
 1012: +#define T9  0x698098D8
 1013: +#define T10 0x8B44F7AF
 1014: +#define T11 0xFFFF5BB1
 1015: +#define T12 0x895CD7BE
 1016: +#define T13 0x6B901122
 1017: +#define T14 0xFD987193
 1018: +#define T15 0xA679438E
 1019: +#define T16 0x49B40821
 1020: +#define T17 0xF61E2562
 1021: +#define T18 0xC040B340
 1022: +#define T19 0x265E5A51
 1023: +#define T20 0xE9B6C7AA
 1024: +#define T21 0xD62F105D
 1025: +#define T22 0x02441453
 1026: +#define T23 0xD8A1E681
 1027: +#define T24 0xE7D3FBC8
 1028: +#define T25 0x21E1CDE6
 1029: +#define T26 0xC33707D6
 1030: +#define T27 0xF4D50D87
 1031: +#define T28 0x455A14ED
 1032: +#define T29 0xA9E3E905
 1033: +#define T30 0xFCEFA3F8
 1034: +#define T31 0x676F02D9
 1035: +#define T32 0x8D2A4C8A
 1036: +#define T33 0xFFFA3942
 1037: +#define T34 0x8771F681
 1038: +#define T35 0x6D9D6122
 1039: +#define T36 0xFDE5380C
 1040: +#define T37 0xA4BEEA44
 1041: +#define T38 0x4BDECFA9
 1042: +#define T39 0xF6BB4B60
 1043: +#define T40 0xBEBFBC70
 1044: +#define T41 0x289B7EC6
 1045: +#define T42 0xEAA127FA
 1046: +#define T43 0xD4EF3085
 1047: +#define T44 0x04881D05
 1048: +#define T45 0xD9D4D039
 1049: +#define T46 0xE6DB99E5
 1050: +#define T47 0x1FA27CF8
 1051: +#define T48 0xC4AC5665
 1052: +#define T49 0xF4292244
 1053: +#define T50 0x432AFF97
 1054: +#define T51 0xAB9423A7
 1055: +#define T52 0xFC93A039
 1056: +#define T53 0x655B59C3
 1057: +#define T54 0x8F0CCC92
 1058: +#define T55 0xFFEFF47D
 1059: +#define T56 0x85845DD1
 1060: +#define T57 0x6FA87E4F
 1061: +#define T58 0xFE2CE6E0
 1062: +#define T59 0xA3014314
 1063: +#define T60 0x4E0811A1
 1064: +#define T61 0xF7537E82
 1065: +#define T62 0xBD3AF235
 1066: +#define T63 0x2AD7D2BB
 1067: +#define T64 0xEB86D391
 1068: +
 1069: +#define ROTL_SSE2(x, n) { \
 1070: +    __m128i s; \
 1071: +    s = _mm_srli_epi32(x, 32 - n); \
 1072: +    x = _mm_slli_epi32(x, n); \
 1073: +    x = _mm_or_si128(x, s); \
 1074: +};
 1075: +
 1076: +#define ROTL_AVX2(x, n) { \
 1077: +    __m256i s; \
 1078: +    s = _mm256_srli_epi32(x, 32 - n); \
 1079: +    x = _mm256_slli_epi32(x, n); \
 1080: +    x = _mm256_or_si256(x, s); \
 1081: +};
 1082: +
 1083: +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
 1084: +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
 1085: +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
 1086: +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
 1087: +
 1088: +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
 1089: +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
 1090: +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
 1091: +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
 1092: +
 1093: +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
 1094: +    a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
 1095: +    ROTL_SSE2(a, s); \
 1096: +    a = _mm_add_epi32(a, b); \
 1097: +}
 1098: +
 1099: +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
 1100: +    a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
 1101: +    ROTL_AVX2(a, s); \
 1102: +    a = _mm256_add_epi32(a, b); \
 1103: +}
 1104: +
 1105: +#define IA 0x67452301
 1106: +#define IB 0xefcdab89
 1107: +#define IC 0x98badcfe
 1108: +#define ID 0x10325476
 1109: +
 1110: +#define GET_MD5_DATA(dest, src, pos)         \
 1111: +    dest =                                   \
 1112: +        ((uint32_t) src[pos + 0]) <<  0 |    \
 1113: +        ((uint32_t) src[pos + 1]) <<  8 |    \
 1114: +        ((uint32_t) src[pos + 2]) << 16 |    \
 1115: +        ((uint32_t) src[pos + 3]) << 24
 1116: +
 1117: +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
 1118: +    uint32_t v0, v1, v2, v3;                 \
 1119: +    GET_MD5_DATA(v0, src[0], pos);           \
 1120: +    GET_MD5_DATA(v1, src[1], pos);           \
 1121: +    GET_MD5_DATA(v2, src[2], pos);           \
 1122: +    GET_MD5_DATA(v3, src[3], pos);           \
 1123: +    dest = _mm_setr_epi32(v0, v1, v2, v3);   \
 1124: +}
 1125: +
 1126: +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
 1127: +    uint32_t v0, v1, v2, v3;                 \
 1128: +    uint32_t v4, v5, v6, v7;                 \
 1129: +    GET_MD5_DATA(v0, src[0], pos);           \
 1130: +    GET_MD5_DATA(v1, src[1], pos);           \
 1131: +    GET_MD5_DATA(v2, src[2], pos);           \
 1132: +    GET_MD5_DATA(v3, src[3], pos);           \
 1133: +    GET_MD5_DATA(v4, src[4], pos);           \
 1134: +    GET_MD5_DATA(v5, src[5], pos);           \
 1135: +    GET_MD5_DATA(v6, src[6], pos);           \
 1136: +    GET_MD5_DATA(v7, src[7], pos);           \
 1137: +    dest = _mm256_setr_epi32(v0, v1, v2, v3, \
 1138: +                          v4, v5, v6, v7);   \
 1139: +}
 1140: +
 1141: +#define PUT_MD5_DATA(dest, val, pos) {       \
 1142: +    dest[pos + 0] = (val >>  0) & 0xff;      \
 1143: +    dest[pos + 1] = (val >>  8) & 0xff;      \
 1144: +    dest[pos + 2] = (val >> 16) & 0xff;      \
 1145: +    dest[pos + 3] = (val >> 24) & 0xff;      \
 1146: +}
 1147: +
 1148: +const static uint8_t md5_padding[64] = {
 1149: +    0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1150: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1151: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1152: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1153: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1154: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1155: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1156: +    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 1157: +};
 1158: +
 1159: +#ifdef PMD5_ALLOW_SSE2
 1160: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1161: +{
 1162: +    int i;
 1163: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1164: +        ctx->len[i] = 0;
 1165: +    }
 1166: +
 1167: +    ctx->state_sse2[0] = _mm_set1_epi32(IA);
 1168: +    ctx->state_sse2[1] = _mm_set1_epi32(IB);
 1169: +    ctx->state_sse2[2] = _mm_set1_epi32(IC);
 1170: +    ctx->state_sse2[3] = _mm_set1_epi32(ID);
 1171: +
 1172: +    return PMD5_SUCCESS;
 1173: +}
 1174: +#endif
 1175: +
 1176: +#ifdef PMD5_ALLOW_AVX2
 1177: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1178: +{
 1179: +    int i;
 1180: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1181: +        ctx->len[i] = 0;
 1182: +    }
 1183: +
 1184: +    ctx->state_avx2[0] = _mm256_set1_epi32(IA);
 1185: +    ctx->state_avx2[1] = _mm256_set1_epi32(IB);
 1186: +    ctx->state_avx2[2] = _mm256_set1_epi32(IC);
 1187: +    ctx->state_avx2[3] = _mm256_set1_epi32(ID);
 1188: +
 1189: +    return PMD5_SUCCESS;
 1190: +}
 1191: +#endif
 1192: +
 1193: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
 1194: +{
 1195: +    return PMD5_INVALID_SLOT;
 1196: +}
 1197: +
 1198: +#ifdef PMD5_ALLOW_SSE2
 1199: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1200: +{
 1201: +    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
 1202: +        return PMD5_INVALID_SLOT;
 1203: +
 1204: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
 1205: +    int i;
 1206: +
 1207: +    for (i = 0; i < 4; i++) {
 1208: +        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
 1209: +    }
 1210: +
 1211: +    v[0][slot] = a;
 1212: +    v[1][slot] = b;
 1213: +    v[2][slot] = c;
 1214: +    v[3][slot] = d;
 1215: +
 1216: +    for (i = 0; i < 4; i++) {
 1217: +        ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
 1218: +    }
 1219: +
 1220: +    return PMD5_SUCCESS;
 1221: +}
 1222: +#endif
 1223: +
 1224: +#ifdef PMD5_ALLOW_AVX2
 1225: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1226: +{
 1227: +    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
 1228: +        return PMD5_INVALID_SLOT;
 1229: +
 1230: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
 1231: +    int i;
 1232: +
 1233: +    for (i = 0; i < 4; i++) {
 1234: +        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
 1235: +    }
 1236: +
 1237: +    v[0][slot] = a;
 1238: +    v[1][slot] = b;
 1239: +    v[2][slot] = c;
 1240: +    v[3][slot] = d;
 1241: +
 1242: +    for (i = 0; i < 4; i++) {
 1243: +        ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
 1244: +    }
 1245: +
 1246: +    return PMD5_SUCCESS;
 1247: +}
 1248: +#endif
 1249: +
 1250: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
 1251: +{
 1252: +    return PMD5_INVALID_SLOT;
 1253: +}
 1254: +
 1255: +#ifdef PMD5_ALLOW_SSE2
 1256: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1257: +{
 1258: +    if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
 1259: +        return PMD5_INVALID_SLOT;
 1260: +
 1261: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
 1262: +    int i;
 1263: +
 1264: +    for (i = 0; i < 4; i++) {
 1265: +        _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
 1266: +    }
 1267: +
 1268: +    *a = v[0][slot];
 1269: +    *b = v[1][slot];
 1270: +    *c = v[2][slot];
 1271: +    *d = v[3][slot];
 1272: +
 1273: +    return PMD5_SUCCESS;
 1274: +}
 1275: +#endif
 1276: +
 1277: +#ifdef PMD5_ALLOW_AVX2
 1278: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1279: +{
 1280: +    if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
 1281: +        return PMD5_INVALID_SLOT;
 1282: +
 1283: +    __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
 1284: +    int i;
 1285: +
 1286: +    for (i = 0; i < 4; i++) {
 1287: +        _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
 1288: +    }
 1289: +
 1290: +    *a = v[0][slot];
 1291: +    *b = v[1][slot];
 1292: +    *c = v[2][slot];
 1293: +    *d = v[3][slot];
 1294: +
 1295: +    return PMD5_SUCCESS;
 1296: +}
 1297: +#endif
 1298: +
 1299: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
 1300: +{
 1301: +    return PMD5_INVALID_SLOT;
 1302: +}
 1303: +
 1304: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
 1305: +{
 1306: +    return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
 1307: +}
 1308: +
 1309: +#ifdef PMD5_ALLOW_SSE2
 1310: +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1311: +{
 1312: +    __m128i W[MD5_DIGEST_LEN], a, b, c, d;
 1313: +
 1314: +    GET_PMD5_DATA_SSE2(W[ 0], data,  0);
 1315: +    GET_PMD5_DATA_SSE2(W[ 1], data,  4);
 1316: +    GET_PMD5_DATA_SSE2(W[ 2], data,  8);
 1317: +    GET_PMD5_DATA_SSE2(W[ 3], data, 12);
 1318: +    GET_PMD5_DATA_SSE2(W[ 4], data, 16);
 1319: +    GET_PMD5_DATA_SSE2(W[ 5], data, 20);
 1320: +    GET_PMD5_DATA_SSE2(W[ 6], data, 24);
 1321: +    GET_PMD5_DATA_SSE2(W[ 7], data, 28);
 1322: +    GET_PMD5_DATA_SSE2(W[ 8], data, 32);
 1323: +    GET_PMD5_DATA_SSE2(W[ 9], data, 36);
 1324: +    GET_PMD5_DATA_SSE2(W[10], data, 40);
 1325: +    GET_PMD5_DATA_SSE2(W[11], data, 44);
 1326: +    GET_PMD5_DATA_SSE2(W[12], data, 48);
 1327: +    GET_PMD5_DATA_SSE2(W[13], data, 52);
 1328: +    GET_PMD5_DATA_SSE2(W[14], data, 56);
 1329: +    GET_PMD5_DATA_SSE2(W[15], data, 60);
 1330: +
 1331: +    a = ctx->state_sse2[0];
 1332: +    b = ctx->state_sse2[1];
 1333: +    c = ctx->state_sse2[2];
 1334: +    d = ctx->state_sse2[3];
 1335: +
 1336: +    SET_SSE2(F, a, b, c, d, W[ 0], S11,  1);
 1337: +    SET_SSE2(F, d, a, b, c, W[ 1], S12,  2);
 1338: +    SET_SSE2(F, c, d, a, b, W[ 2], S13,  3);
 1339: +    SET_SSE2(F, b, c, d, a, W[ 3], S14,  4);
 1340: +    SET_SSE2(F, a, b, c, d, W[ 4], S11,  5);
 1341: +    SET_SSE2(F, d, a, b, c, W[ 5], S12,  6);
 1342: +    SET_SSE2(F, c, d, a, b, W[ 6], S13,  7);
 1343: +    SET_SSE2(F, b, c, d, a, W[ 7], S14,  8);
 1344: +    SET_SSE2(F, a, b, c, d, W[ 8], S11,  9);
 1345: +    SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
 1346: +    SET_SSE2(F, c, d, a, b, W[10], S13, 11);
 1347: +    SET_SSE2(F, b, c, d, a, W[11], S14, 12);
 1348: +    SET_SSE2(F, a, b, c, d, W[12], S11, 13);
 1349: +    SET_SSE2(F, d, a, b, c, W[13], S12, 14);
 1350: +    SET_SSE2(F, c, d, a, b, W[14], S13, 15);
 1351: +    SET_SSE2(F, b, c, d, a, W[15], S14, 16);
 1352: +
 1353: +    SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
 1354: +    SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
 1355: +    SET_SSE2(G, c, d, a, b, W[11], S23, 19);
 1356: +    SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
 1357: +    SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
 1358: +    SET_SSE2(G, d, a, b, c, W[10], S22, 22);
 1359: +    SET_SSE2(G, c, d, a, b, W[15], S23, 23);
 1360: +    SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
 1361: +    SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
 1362: +    SET_SSE2(G, d, a, b, c, W[14], S22, 26);
 1363: +    SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
 1364: +    SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
 1365: +    SET_SSE2(G, a, b, c, d, W[13], S21, 29);
 1366: +    SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
 1367: +    SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
 1368: +    SET_SSE2(G, b, c, d, a, W[12], S24, 32);
 1369: +
 1370: +    SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
 1371: +    SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
 1372: +    SET_SSE2(H, c, d, a, b, W[11], S33, 35);
 1373: +    SET_SSE2(H, b, c, d, a, W[14], S34, 36);
 1374: +    SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
 1375: +    SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
 1376: +    SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
 1377: +    SET_SSE2(H, b, c, d, a, W[10], S34, 40);
 1378: +    SET_SSE2(H, a, b, c, d, W[13], S31, 41);
 1379: +    SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
 1380: +    SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
 1381: +    SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
 1382: +    SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
 1383: +    SET_SSE2(H, d, a, b, c, W[12], S32, 46);
 1384: +    SET_SSE2(H, c, d, a, b, W[15], S33, 47);
 1385: +    SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
 1386: +
 1387: +    SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
 1388: +    SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
 1389: +    SET_SSE2(I, c, d, a, b, W[14], S43, 51);
 1390: +    SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
 1391: +    SET_SSE2(I, a, b, c, d, W[12], S41, 53);
 1392: +    SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
 1393: +    SET_SSE2(I, c, d, a, b, W[10], S43, 55);
 1394: +    SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
 1395: +    SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
 1396: +    SET_SSE2(I, d, a, b, c, W[15], S42, 58);
 1397: +    SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
 1398: +    SET_SSE2(I, b, c, d, a, W[13], S44, 60);
 1399: +    SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
 1400: +    SET_SSE2(I, d, a, b, c, W[11], S42, 62);
 1401: +    SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
 1402: +    SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
 1403: +
 1404: +    ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
 1405: +    ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
 1406: +    ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
 1407: +    ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
 1408: +}
 1409: +#endif
 1410: +
 1411: +#ifdef PMD5_ALLOW_AVX2
 1412: +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1413: +{
 1414: +    __m256i W[MD5_DIGEST_LEN], a, b, c, d;
 1415: +
 1416: +    GET_PMD5_DATA_AVX2(W[ 0], data,  0);
 1417: +    GET_PMD5_DATA_AVX2(W[ 1], data,  4);
 1418: +    GET_PMD5_DATA_AVX2(W[ 2], data,  8);
 1419: +    GET_PMD5_DATA_AVX2(W[ 3], data, 12);
 1420: +    GET_PMD5_DATA_AVX2(W[ 4], data, 16);
 1421: +    GET_PMD5_DATA_AVX2(W[ 5], data, 20);
 1422: +    GET_PMD5_DATA_AVX2(W[ 6], data, 24);
 1423: +    GET_PMD5_DATA_AVX2(W[ 7], data, 28);
 1424: +    GET_PMD5_DATA_AVX2(W[ 8], data, 32);
 1425: +    GET_PMD5_DATA_AVX2(W[ 9], data, 36);
 1426: +    GET_PMD5_DATA_AVX2(W[10], data, 40);
 1427: +    GET_PMD5_DATA_AVX2(W[11], data, 44);
 1428: +    GET_PMD5_DATA_AVX2(W[12], data, 48);
 1429: +    GET_PMD5_DATA_AVX2(W[13], data, 52);
 1430: +    GET_PMD5_DATA_AVX2(W[14], data, 56);
 1431: +    GET_PMD5_DATA_AVX2(W[15], data, 60);
 1432: +
 1433: +    a = ctx->state_avx2[0];
 1434: +    b = ctx->state_avx2[1];
 1435: +    c = ctx->state_avx2[2];
 1436: +    d = ctx->state_avx2[3];
 1437: +
 1438: +    SET_AVX2(F, a, b, c, d, W[ 0], S11,  1);
 1439: +    SET_AVX2(F, d, a, b, c, W[ 1], S12,  2);
 1440: +    SET_AVX2(F, c, d, a, b, W[ 2], S13,  3);
 1441: +    SET_AVX2(F, b, c, d, a, W[ 3], S14,  4);
 1442: +    SET_AVX2(F, a, b, c, d, W[ 4], S11,  5);
 1443: +    SET_AVX2(F, d, a, b, c, W[ 5], S12,  6);
 1444: +    SET_AVX2(F, c, d, a, b, W[ 6], S13,  7);
 1445: +    SET_AVX2(F, b, c, d, a, W[ 7], S14,  8);
 1446: +    SET_AVX2(F, a, b, c, d, W[ 8], S11,  9);
 1447: +    SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
 1448: +    SET_AVX2(F, c, d, a, b, W[10], S13, 11);
 1449: +    SET_AVX2(F, b, c, d, a, W[11], S14, 12);
 1450: +    SET_AVX2(F, a, b, c, d, W[12], S11, 13);
 1451: +    SET_AVX2(F, d, a, b, c, W[13], S12, 14);
 1452: +    SET_AVX2(F, c, d, a, b, W[14], S13, 15);
 1453: +    SET_AVX2(F, b, c, d, a, W[15], S14, 16);
 1454: +
 1455: +    SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
 1456: +    SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
 1457: +    SET_AVX2(G, c, d, a, b, W[11], S23, 19);
 1458: +    SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
 1459: +    SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
 1460: +    SET_AVX2(G, d, a, b, c, W[10], S22, 22);
 1461: +    SET_AVX2(G, c, d, a, b, W[15], S23, 23);
 1462: +    SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
 1463: +    SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
 1464: +    SET_AVX2(G, d, a, b, c, W[14], S22, 26);
 1465: +    SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
 1466: +    SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
 1467: +    SET_AVX2(G, a, b, c, d, W[13], S21, 29);
 1468: +    SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
 1469: +    SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
 1470: +    SET_AVX2(G, b, c, d, a, W[12], S24, 32);
 1471: +
 1472: +    SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
 1473: +    SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
 1474: +    SET_AVX2(H, c, d, a, b, W[11], S33, 35);
 1475: +    SET_AVX2(H, b, c, d, a, W[14], S34, 36);
 1476: +    SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
 1477: +    SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
 1478: +    SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
 1479: +    SET_AVX2(H, b, c, d, a, W[10], S34, 40);
 1480: +    SET_AVX2(H, a, b, c, d, W[13], S31, 41);
 1481: +    SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
 1482: +    SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
 1483: +    SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
 1484: +    SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
 1485: +    SET_AVX2(H, d, a, b, c, W[12], S32, 46);
 1486: +    SET_AVX2(H, c, d, a, b, W[15], S33, 47);
 1487: +    SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
 1488: +
 1489: +    SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
 1490: +    SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
 1491: +    SET_AVX2(I, c, d, a, b, W[14], S43, 51);
 1492: +    SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
 1493: +    SET_AVX2(I, a, b, c, d, W[12], S41, 53);
 1494: +    SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
 1495: +    SET_AVX2(I, c, d, a, b, W[10], S43, 55);
 1496: +    SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
 1497: +    SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
 1498: +    SET_AVX2(I, d, a, b, c, W[15], S42, 58);
 1499: +    SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
 1500: +    SET_AVX2(I, b, c, d, a, W[13], S44, 60);
 1501: +    SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
 1502: +    SET_AVX2(I, d, a, b, c, W[11], S42, 62);
 1503: +    SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
 1504: +    SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
 1505: +
 1506: +    ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
 1507: +    ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
 1508: +    ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
 1509: +    ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
 1510: +}
 1511: +#endif
 1512: +
 1513: +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
 1514: +{
 1515: +}
 1516: +
 1517: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
 1518: +{
 1519: +    const uint8_t * ptrs[PMD5_SLOTS_MAX];
 1520: +
 1521: +    if (!length) return PMD5_SUCCESS;
 1522: +
 1523: +    int slots = pmd5_slots();
 1524: +
 1525: +    if (!stride) stride = 64;
 1526: +
 1527: +    int i;
 1528: +    for (i = 0; i < slots; i++) {
 1529: +        ptrs[i] = data[i];
 1530: +        ctx->len[i] += length;
 1531: +        if (!ptrs[i]) ptrs[i] = md5_padding;
 1532: +    }
 1533: +
 1534: +    while (length >= 64) {
 1535: +        pmd5_process(ctx, ptrs);
 1536: +        length -= 64;
 1537: +        for (i = 0; i < slots; i++) {
 1538: +            if (data[i]) ptrs[i] += stride;
 1539: +        }
 1540: +    }
 1541: +
 1542: +    if (length) return PMD5_UNALIGNED_UPDATE;
 1543: +
 1544: +    for (i = 0; i < slots; i++) {
 1545: +        if (data[i]) data[i] = ptrs[i];
 1546: +    }
 1547: +
 1548: +    return PMD5_SUCCESS;
 1549: +}
 1550: +
 1551: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
 1552: +{
 1553: +    uint64_t length = 0;
 1554: +    int slots = pmd5_slots();
 1555: +
 1556: +    int i;
 1557: +    for (i = 0; i < slots; i++) {
 1558: +        if ((length == 0) || (lengths[i] < length)) length = lengths[i];
 1559: +    }
 1560: +
 1561: +    for (i = 0; i < slots; i++) {
 1562: +        lengths[i] -= length;
 1563: +    }
 1564: +
 1565: +    return pmd5_update_all_simple(ctx, data, length, 0);
 1566: +}
 1567: +
 1568: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
 1569: +{
 1570: +    MD5_CTX ctx;
 1571: +
 1572: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1573: +        return PMD5_INVALID_SLOT;
 1574: +
 1575: +    pmd5_to_md5(pctx, &ctx, slot);
 1576: +    if (data && length) {
 1577: +        MD5_Update(&ctx, data, length);
 1578: +    }
 1579: +    MD5_Final(digest, &ctx);
 1580: +
 1581: +    return PMD5_SUCCESS;
 1582: +}
 1583: +
 1584: +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
 1585: +{
 1586: +    return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
 1587: +}
 1588: +
 1589: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
 1590: +{
 1591: +    int i;
 1592: +    for (i = 0; i < pmd5_slots(); i++) {
 1593: +        pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
 1594: +    }
 1595: +    return PMD5_SUCCESS;
 1596: +}
 1597: +
 1598: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
 1599: +{
 1600: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1601: +        return PMD5_INVALID_SLOT;
 1602: +
 1603: +    // TODO: This function ignores buffered but not-yet-hashed data. We're not using this function; just noting.
 1604: +
 1605: +#ifdef USE_OPENSSL
 1606: +    pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
 1607: +#else
 1608: +    pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
 1609: +#endif
 1610: +    return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
 1611: +}
 1612: +
 1613: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
 1614: +{
 1615: +    if ((slot >= pmd5_slots()) || (slot < 0))
 1616: +        return PMD5_INVALID_SLOT;
 1617: +
 1618: +    MD5_Init(ctx);
 1619: +
 1620: +#ifdef USE_OPENSSL
 1621: +    ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
 1622: +    ctx->Nh = pctx->len[slot] >> 29;
 1623: +
 1624: +    uint32_t a, b, c, d;
 1625: +    pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
 1626: +    if (ret == PMD5_SUCCESS) {
 1627: +        ctx->A = a;
 1628: +        ctx->B = b;
 1629: +        ctx->C = c;
 1630: +        ctx->D = d;
 1631: +    }
 1632: +    return ret;
 1633: +#else
 1634: +    ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
 1635: +    ctx->totalN2 = pctx->len[slot] >> 32;
 1636: +    return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
 1637: +#endif
 1638: +}
 1639: +
 1640: +/* With GCC 10, putting these implementations inside 'extern "C"' causes an
 1641: +   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 1642: +  */
 1643: +
 1644: +static inline int md5_parallel_slots_cpp()
 1645: +{
 1646: +    int slots = pmd5_slots();
 1647: +    if (slots == 0) return 1;
 1648: +    return slots;
 1649: +}
 1650: +
 1651: +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
 1652: +{
 1653: +    int slots = md5_parallel_slots_cpp();
 1654: +    if ((streams < 1) || (streams > slots)) return 0;
 1655: +    if (pre4 && post4) return 0;
 1656: +
 1657: +    if (slots == 1) {
 1658: +        MD5_CTX ctx;
 1659: +        MD5_Init(&ctx);
 1660: +        if (pre4) {
 1661: +            MD5_Update(&ctx, (const unsigned char*)pre4, 4);
 1662: +        }
 1663: +        MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
 1664: +        if (post4) {
 1665: +            MD5_Update(&ctx, (const unsigned char*)post4, 4);
 1666: +        }
 1667: +        if (sum[0]) {
 1668: +            MD5_Final((uint8_t*)sum[0], &ctx);
 1669: +        }
 1670: +        return 0;
 1671: +    }
 1672: +
 1673: +    int i;
 1674: +    int active[PMD5_SLOTS_MAX];
 1675: +    char* buffers[PMD5_SLOTS_MAX];
 1676: +    uint64_t left[PMD5_SLOTS_MAX];
 1677: +    for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1678: +        active[i] = streams > i;
 1679: +        if (i < streams) {
 1680: +            buffers[i] = buf[i];
 1681: +            left[i] = (uint64_t)len[i];
 1682: +        } else {
 1683: +            buffers[i] = NULL;
 1684: +            left[i] = 0;
 1685: +        }
 1686: +    }
 1687: +    MD5_CTX results[PMD5_SLOTS_MAX];
 1688: +
 1689: +    pmd5_context ctx_simd;
 1690: +    if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
 1691: +
 1692: +    if (pre4) {
 1693: +        char temp_buffers[PMD5_SLOTS_MAX][64];
 1694: +        int have_any = 0;
 1695: +        for (i = 0; i < slots; i++) {
 1696: +            if (active[i]) {
 1697: +                if (left[i] < 60) {
 1698: +                    MD5_Init(&results[i]);
 1699: +                    MD5_Update(&results[i], (const unsigned char*)pre4, 4);
 1700: +                    MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
 1701: +                    active[i] = 0;
 1702: +                    left[i] = 0;
 1703: +                } else {
 1704: +                    memcpy(temp_buffers[i], pre4, 4);
 1705: +                    memcpy(temp_buffers[i] + 4, buffers[i], 60);
 1706: +                    buffers[i] += 60;
 1707: +                    left[i] -= 60;
 1708: +                    have_any = 1;
 1709: +                }
 1710: +            }
 1711: +        }
 1712: +
 1713: +        if (have_any) {
 1714: +            char* ptrs[PMD5_SLOTS_MAX];
 1715: +            for (i = 0; i < PMD5_SLOTS_MAX; i++) {
 1716: +                ptrs[i] = &temp_buffers[i][0];
 1717: +            }
 1718: +            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
 1719: +                return 0;
 1720: +            }
 1721: +        }
 1722: +    }
 1723: +
 1724: +    int failed = 0;
 1725: +    while (true) {
 1726: +        for (i = 0; i < slots; i++) {
 1727: +            if (active[i] && (left[i] < 64)) {
 1728: +                if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
 1729: +                    failed = 1;
 1730: +                }
 1731: +                active[i] = 0;
 1732: +            }
 1733: +        }
 1734: +
 1735: +        uint64_t shortest = 0;
 1736: +        for (i = 0; i < slots; i++) {
 1737: +            if (!active[i]) {
 1738: +                buffers[i] = NULL;
 1739: +            } else if ((shortest == 0) || (left[i] < shortest)) {
 1740: +                shortest = left[i];
 1741: +            }
 1742: +        }
 1743: +
 1744: +        if (shortest > 0) {
 1745: +            shortest = shortest & ~63;
 1746: +            if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
 1747: +                failed = 1;
 1748: +            }
 1749: +            for (i = 0; i < slots; i++) {
 1750: +                if (active[i]) {
 1751: +                    left[i] -= shortest;
 1752: +                }
 1753: +            }
 1754: +        }
 1755: +
 1756: +        if (failed) {
 1757: +            return 0;
 1758: +        } else {
 1759: +            int have_any = 0;
 1760: +            for (i = 0; i < slots; i++) {
 1761: +                have_any |= active[i];
 1762: +            }
 1763: +            if (!have_any) {
 1764: +                break;
 1765: +            }
 1766: +        }
 1767: +    }
 1768: +
 1769: +    for (i = 0; i < slots; i++) {
 1770: +        if (i < streams) {
 1771: +            if (left[i] > 0) {
 1772: +                // buffers[i] == NULL here
 1773: +                MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
 1774: +            }
 1775: +            if (post4) {
 1776: +                MD5_Update(&results[i], (const unsigned char*)post4, 4);
 1777: +            }
 1778: +            if (sum[i]) {
 1779: +                MD5_Final((uint8_t*)sum[i], &results[i]);
 1780: +            }
 1781: +        }
 1782: +    }
 1783: +
 1784: +    return 1;
 1785: +}
 1786: +
 1787: +// each pmd5_context needs to be 32-byte aligned
 1788: +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
 1789: +
 1790: +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
 1791: +{
 1792: +    int i;
 1793: +    for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
 1794: +        pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
 1795: +    }
 1796: +    ctx->used = 0;
 1797: +    ctx->next = 0;
 1798: +}
 1799: +
 1800: +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
 1801: +{
 1802: +    int slots = pmd5_slots();
 1803: +    uint32 pos = 0;
 1804: +
 1805: +    if ((ctx->used) || (length < 512)) {
 1806: +        int cpy = MIN(length, 512 - ctx->used);
 1807: +        memcpy(&ctx->buffer[ctx->used], input, cpy);
 1808: +        ctx->used += cpy;
 1809: +        length -= cpy;
 1810: +        pos += cpy;
 1811: +
 1812: +        if (ctx->used == 512) {
 1813: +            if (slots == PMD5_SLOTS_AVX2) {
 1814: +                const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
 1815: +                    (uint8_t*)ctx->buffer,
 1816: +                    (uint8_t*)(ctx->buffer + 64),
 1817: +                    (uint8_t*)(ctx->buffer + 128),
 1818: +                    (uint8_t*)(ctx->buffer + 192),
 1819: +                    (uint8_t*)(ctx->buffer + 256),
 1820: +                    (uint8_t*)(ctx->buffer + 320),
 1821: +                    (uint8_t*)(ctx->buffer + 384),
 1822: +                    (uint8_t*)(ctx->buffer + 448)
 1823: +                };
 1824: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
 1825: +            } else {
 1826: +                const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
 1827: +                    (uint8_t*)ctx->buffer,
 1828: +                    (uint8_t*)(ctx->buffer + 64),
 1829: +                    (uint8_t*)(ctx->buffer + 128),
 1830: +                    (uint8_t*)(ctx->buffer + 192)
 1831: +                };
 1832: +                const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
 1833: +                    (uint8_t*)(ctx->buffer + 256),
 1834: +                    (uint8_t*)(ctx->buffer + 320),
 1835: +                    (uint8_t*)(ctx->buffer + 384),
 1836: +                    (uint8_t*)(ctx->buffer + 448)
 1837: +                };
 1838: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
 1839: +                pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
 1840: +            }
 1841: +            ctx->used = 0;
 1842: +        }
 1843: +    }
 1844: +
 1845: +    if (length >= 512) {
 1846: +        uint32 blocks = length / 512;
 1847: +        if (slots == PMD5_SLOTS_AVX2) {
 1848: +            const uint8_t* ptrs[8] = {
 1849: +                (uint8_t*)(input + pos),
 1850: +                (uint8_t*)(input + pos + 64),
 1851: +                (uint8_t*)(input + pos + 128),
 1852: +                (uint8_t*)(input + pos + 192),
 1853: +                (uint8_t*)(input + pos + 256),
 1854: +                (uint8_t*)(input + pos + 320),
 1855: +                (uint8_t*)(input + pos + 384),
 1856: +                (uint8_t*)(input + pos + 448)
 1857: +            };
 1858: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
 1859: +        } else {
 1860: +            const uint8_t* ptrs1[4] = {
 1861: +                (uint8_t*)(input + pos),
 1862: +                (uint8_t*)(input + pos + 64),
 1863: +                (uint8_t*)(input + pos + 128),
 1864: +                (uint8_t*)(input + pos + 192)
 1865: +            };
 1866: +            const uint8_t* ptrs2[4] = {
 1867: +                (uint8_t*)(input + pos + 256),
 1868: +                (uint8_t*)(input + pos + 320),
 1869: +                (uint8_t*)(input + pos + 384),
 1870: +                (uint8_t*)(input + pos + 448)
 1871: +            };
 1872: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
 1873: +            pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
 1874: +        }
 1875: +        pos += blocks * 512;
 1876: +        length -= blocks * 512;
 1877: +    }
 1878: +
 1879: +    if (length) {
 1880: +        memcpy(ctx->buffer, &input[pos], length);
 1881: +        ctx->used = length;
 1882: +    }
 1883: +}
 1884: +
 1885: +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1886: +{
 1887: +    int i;
 1888: +    uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
 1889: +    if (ctx->used) {
 1890: +        uchar tmp[512];
 1891: +        memset(tmp, 0, 512);
 1892: +        MD5P8_Update(ctx, tmp, 512 - ctx->used);
 1893: +    }
 1894: +
 1895: +    uchar state[34*4] = {0};
 1896: +
 1897: +    MD5_CTX tmp;
 1898: +    for (i = 0; i < 8; i++) {
 1899: +        if (pmd5_slots() == PMD5_SLOTS_AVX2) {
 1900: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1901: +        } else if (i < 4) {
 1902: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1903: +        } else {
 1904: +            pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
 1905: +        }
 1906: +#ifdef USE_OPENSSL
 1907: +        if (low + tmp.Nl < low) high++;
 1908: +        low += tmp.Nl;
 1909: +        high += tmp.Nh;
 1910: +#else
 1911: +        if (low + tmp.totalN < low) high++;
 1912: +        low += tmp.totalN;
 1913: +        high += tmp.totalN2;
 1914: +#endif
 1915: +        SIVALu(state, i*16, tmp.A);
 1916: +        SIVALu(state, i*16 + 4, tmp.B);
 1917: +        SIVALu(state, i*16 + 8, tmp.C);
 1918: +        SIVALu(state, i*16 + 12, tmp.D);
 1919: +    }
 1920: +
 1921: +#ifndef USE_OPENSSL
 1922: +    high = (low >> 29) | (high << 3);
 1923: +    low = (low << 3);
 1924: +#endif
 1925: +
 1926: +    sub <<= 3;
 1927: +    if (low - sub > low) high--;
 1928: +    low -= sub;
 1929: +
 1930: +    SIVALu(state, 32*4, low);
 1931: +    SIVALu(state, 33*4, high);
 1932: +
 1933: +    MD5_CTX md;
 1934: +    MD5_Init(&md);
 1935: +    MD5_Update(&md, state, 34*4);
 1936: +    MD5_Final(digest, &md);
 1937: +}
 1938: +
 1939: +extern "C" {
 1940: +
 1941: +int md5_parallel_slots()
 1942: +{
 1943: +    return md5_parallel_slots_cpp();
 1944: +}
 1945: +
 1946: +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
 1947: +{
 1948: +    return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
 1949: +}
 1950: +
 1951: +void MD5P8_Init(MD5P8_CTX *ctx)
 1952: +{
 1953: +    MD5P8_Init_cpp(ctx);
 1954: +}
 1955: +
 1956: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
 1957: +{
 1958: +    MD5P8_Update_cpp(ctx, input, length);
 1959: +}
 1960: +
 1961: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1962: +{
 1963: +    MD5P8_Final_cpp(digest, ctx);
 1964: +}
 1965: +
 1966: +} // "C"
 1967: +
 1968: +#endif /* HAVE_SIMD */
 1969: +#endif /* __cplusplus */
 1970: +#endif /* __x86_64__ */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>