Annotation of embedaddon/rsync/patches/md5p8.diff, revision 1.1.1.1
1.1 misho 1: From: Jorrit Jongma <git@jongma.org>
2:
3: - MD5 optimization in block matching phase:
4:
5: MD5 hashes computed during rsync's block matching phase are independent
6: and thus possible to process in parallel. This code processes 4 blocks
7: in parallel if SSE2 is available, or 8 if AVX2 is available. An increase
8: of performance (or decrease of CPU usage) of up to 6x has been measured.
9:
10: A prefetching algorithm is used to predict and load upcoming blocks, as
11: this prevents the need for extensive modifications to other parts of
12: the rsync sources to get this working.
13:
14: This remains compatible with existing rsync builds using MD5 checksums.
15:
16: - MD5P8 whole-file checksum:
17:
18: Splits the input up into 8 independent streams (64-byte interleave), and
19: produces a final checksum based on the end state of those 8 streams. If
20: parallelization of MD5 hashing is available, the performance gain (or
21: CPU usage decrease) is 2x to 6x compared to traditional MD5.
22:
23: The rsync version on both ends of the connection need MD5P8 support
24: built-in for it to be used.
25:
26: xxHash is still preferred (and faster), but this provides a reasonably
27: fast fallback for the case where xxHash libraries are not available at
28: build time.
29:
30: based-on: e94bad1c156fc3910f24e2b3b71a81b0b0bdeb70
31: diff --git a/Makefile.in b/Makefile.in
32: --- a/Makefile.in
33: +++ b/Makefile.in
34: @@ -29,14 +29,14 @@ SHELL=/bin/sh
35: .SUFFIXES:
36: .SUFFIXES: .c .o
37:
38: -SIMD_x86_64=simd-checksum-x86_64.o
39: +SIMD_x86_64=simd-checksum-x86_64.o simd-md5-parallel-x86_64.o
40: ASM_x86_64=lib/md5-asm-x86_64.o
41:
42: GENFILES=configure.sh aclocal.m4 config.h.in rsync.1 rsync.1.html \
43: rsync-ssl.1 rsync-ssl.1.html rsyncd.conf.5 rsyncd.conf.5.html
44: HEADERS=byteorder.h config.h errcode.h proto.h rsync.h ifuncs.h itypes.h inums.h \
45: lib/pool_alloc.h lib/mdigest.h lib/md-defines.h version.h
46: -LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o \
47: +LIBOBJ=lib/wildmatch.o lib/compat.o lib/snprintf.o lib/mdfour.o lib/md5.o lib/md5p8.o \
48: lib/permstring.o lib/pool_alloc.o lib/sysacls.o lib/sysxattrs.o @LIBOBJS@
49: zlib_OBJS=zlib/deflate.o zlib/inffast.o zlib/inflate.o zlib/inftrees.o \
50: zlib/trees.o zlib/zutil.o zlib/adler32.o zlib/compress.o zlib/crc32.o
51: @@ -140,6 +140,9 @@ git-version.h: mkgitver $(wildcard $(srcdir)/.git/logs/HEAD)
52: simd-checksum-x86_64.o: simd-checksum-x86_64.cpp
53: @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-checksum-x86_64.cpp
54:
55: +simd-md5-parallel-x86_64.o: simd-md5-parallel-x86_64.cpp
56: + @$(srcdir)/cmdormsg disable-simd $(CXX) -I. $(CXXFLAGS) $(CPPFLAGS) -c -o $@ $(srcdir)/simd-md5-parallel-x86_64.cpp
57: +
58: lib/md5-asm-x86_64.o: lib/md5-asm-x86_64.S config.h lib/md-defines.h
59: @$(srcdir)/cmdormsg disable-asm $(CC) -I. @NOEXECSTACK@ -c -o $@ $(srcdir)/lib/md5-asm-x86_64.S
60:
61: diff --git a/checksum.c b/checksum.c
62: --- a/checksum.c
63: +++ b/checksum.c
64: @@ -52,6 +52,7 @@ struct name_num_obj valid_checksums = {
65: { CSUM_XXH64, "xxh64", NULL },
66: { CSUM_XXH64, "xxhash", NULL },
67: #endif
68: + { CSUM_MD5P8, "md5p8", NULL },
69: { CSUM_MD5, "md5", NULL },
70: { CSUM_MD4, "md4", NULL },
71: { CSUM_NONE, "none", NULL },
72: @@ -141,6 +142,7 @@ int csum_len_for_type(int cst, BOOL flist_csum)
73: case CSUM_MD4_OLD:
74: case CSUM_MD4_BUSTED:
75: return MD4_DIGEST_LEN;
76: + case CSUM_MD5P8:
77: case CSUM_MD5:
78: return MD5_DIGEST_LEN;
79: case CSUM_XXH64:
80: @@ -167,6 +169,7 @@ int canonical_checksum(int csum_type)
81: case CSUM_MD4_BUSTED:
82: break;
83: case CSUM_MD4:
84: + case CSUM_MD5P8:
85: case CSUM_MD5:
86: return -1;
87: case CSUM_XXH64:
88: @@ -179,7 +182,9 @@ int canonical_checksum(int csum_type)
89: return 0;
90: }
91:
92: -#ifndef HAVE_SIMD /* See simd-checksum-*.cpp. */
93: +#ifdef HAVE_SIMD /* See simd-checksum-*.cpp. */
94: +#define get_checksum2 get_checksum2_nosimd
95: +#else
96: /*
97: a simple 32 bit checksum that can be updated from either end
98: (inspired by Mark Adler's Adler-32 checksum)
99: @@ -200,9 +205,18 @@ uint32 get_checksum1(char *buf1, int32 len)
100: }
101: return (s1 & 0xffff) + (s2 << 16);
102: }
103: +
104: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
105: +{
106: +}
107: +
108: +void checksum2_disable_prefetch()
109: +{
110: +}
111: #endif
112:
113: -void get_checksum2(char *buf, int32 len, char *sum)
114: +/* Renamed to get_checksum2_nosimd() with HAVE_SIMD */
115: +void get_checksum2(char *buf, int32 len, char *sum, UNUSED(OFF_T prefetch_offset))
116: {
117: switch (xfersum_type) {
118: #ifdef SUPPORT_XXHASH
119: @@ -221,6 +235,7 @@ void get_checksum2(char *buf, int32 len, char *sum)
120: break;
121: }
122: #endif
123: + case CSUM_MD5P8: /* == CSUM_MD5 for checksum2 */
124: case CSUM_MD5: {
125: MD5_CTX m5;
126: uchar seedbuf[4];
127: @@ -373,6 +388,21 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
128: break;
129: }
130: #endif
131: + case CSUM_MD5P8: {
132: + MD5P8_CTX m5p8;
133: +
134: + MD5P8_Init(&m5p8);
135: +
136: + for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE)
137: + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, CHUNK_SIZE), CHUNK_SIZE);
138: +
139: + remainder = (int32)(len - i);
140: + if (remainder > 0)
141: + MD5P8_Update(&m5p8, (uchar *)map_ptr(buf, i, remainder), remainder);
142: +
143: + MD5P8_Final((uchar *)sum, &m5p8);
144: + break;
145: + }
146: case CSUM_MD5: {
147: MD5_CTX m5;
148:
149: @@ -445,6 +475,7 @@ static union {
150: #endif
151: MD5_CTX m5;
152: } ctx;
153: +static MD5P8_CTX m5p8;
154: #ifdef SUPPORT_XXHASH
155: static XXH64_state_t* xxh64_state;
156: #endif
157: @@ -481,6 +512,9 @@ void sum_init(int csum_type, int seed)
158: XXH3_128bits_reset(xxh3_state);
159: break;
160: #endif
161: + case CSUM_MD5P8:
162: + MD5P8_Init(&m5p8);
163: + break;
164: case CSUM_MD5:
165: MD5_Init(&ctx.m5);
166: break;
167: @@ -531,6 +565,9 @@ void sum_update(const char *p, int32 len)
168: XXH3_128bits_update(xxh3_state, p, len);
169: break;
170: #endif
171: + case CSUM_MD5P8:
172: + MD5P8_Update(&m5p8, (uchar *)p, len);
173: + break;
174: case CSUM_MD5:
175: MD5_Update(&ctx.m5, (uchar *)p, len);
176: break;
177: @@ -596,6 +633,9 @@ int sum_end(char *sum)
178: break;
179: }
180: #endif
181: + case CSUM_MD5P8:
182: + MD5P8_Final((uchar *)sum, &m5p8);
183: + break;
184: case CSUM_MD5:
185: MD5_Final((uchar *)sum, &ctx.m5);
186: break;
187: diff --git a/generator.c b/generator.c
188: --- a/generator.c
189: +++ b/generator.c
190: @@ -729,10 +729,12 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
191: if (append_mode > 0 && f_copy < 0)
192: return 0;
193:
194: - if (len > 0)
195: + if (len > 0) {
196: mapbuf = map_file(fd, len, MAX_MAP_SIZE, sum.blength);
197: - else
198: + checksum2_enable_prefetch(mapbuf, len, sum.blength);
199: + } else {
200: mapbuf = NULL;
201: + }
202:
203: for (i = 0; i < sum.count; i++) {
204: int32 n1 = (int32)MIN(len, (OFF_T)sum.blength);
205: @@ -750,7 +752,7 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
206: }
207:
208: sum1 = get_checksum1(map, n1);
209: - get_checksum2(map, n1, sum2);
210: + get_checksum2(map, n1, sum2, offset - n1);
211:
212: if (DEBUG_GTE(DELTASUM, 3)) {
213: rprintf(FINFO,
214: @@ -762,8 +764,10 @@ static int generate_and_send_sums(int fd, OFF_T len, int f_out, int f_copy)
215: write_buf(f_out, sum2, sum.s2length);
216: }
217:
218: - if (mapbuf)
219: + if (mapbuf) {
220: unmap_file(mapbuf);
221: + checksum2_disable_prefetch();
222: + }
223:
224: return 0;
225: }
226: diff --git a/lib/md-defines.h b/lib/md-defines.h
227: --- a/lib/md-defines.h
228: +++ b/lib/md-defines.h
229: @@ -15,3 +15,4 @@
230: #define CSUM_XXH64 6
231: #define CSUM_XXH3_64 7
232: #define CSUM_XXH3_128 8
233: +#define CSUM_MD5P8 9
234: diff --git a/lib/md5p8.c b/lib/md5p8.c
235: new file mode 100644
236: --- /dev/null
237: +++ b/lib/md5p8.c
238: @@ -0,0 +1,128 @@
239: +/*
240: + * MD5-based hash friendly to parallel processing, reference implementation
241: + *
242: + * Author: Jorrit Jongma, 2020
243: + *
244: + * Released in the public domain falling back to the MIT license
245: + * ( http://www.opensource.org/licenses/MIT ) in case public domain does not
246: + * apply in your country.
247: + */
248: +/*
249: + * MD5P8 is an MD5-based hash friendly to parallel processing. The input
250: + * stream is divided into 8 independent streams. For each 512 bytes of input,
251: + * the first 64 bytes are send to the first stream, the second 64 bytes to
252: + * the second stream, etc. The input stream is padded with zeros to the next
253: + * multiple of 512 bytes, then a normal MD5 hash is computed on a buffer
254: + * containing the A, B, C, and D states of the 8 individual streams, followed
255: + * by the (unpadded) length of the input.
256: + *
257: + * On non-SIMD accelerated CPUs the performance of MD5P8 is slightly lower
258: + * than normal MD5 (particularly on files smaller than 10 kB), but with
259: + * SIMD-based parallel processing it can be two to six times as fast. Even in
260: + * the best-case scenario, xxHash is still at least twice as fast and should
261: + * be preferred when available.
262: + */
263: +
264: +#include "rsync.h"
265: +
266: +#ifdef HAVE_SIMD
267: +#define MD5P8_Init MD5P8_Init_c
268: +#define MD5P8_Update MD5P8_Update_c
269: +#define MD5P8_Final MD5P8_Final_c
270: +#endif
271: +
272: +/* each MD5_CTX needs to be 8-byte aligned */
273: +#define MD5P8_Contexts_c(ctx, index) ((MD5_CTX*)((((uintptr_t)((ctx)->context_storage) + 7) & ~7) + (index)*((sizeof(MD5_CTX) + 7) & ~7)))
274: +
275: +void MD5P8_Init(MD5P8_CTX *ctx)
276: +{
277: + int i;
278: + for (i = 0; i < 8; i++) {
279: + MD5_Init(MD5P8_Contexts_c(ctx, i));
280: + }
281: + ctx->used = 0;
282: + ctx->next = 0;
283: +}
284: +
285: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
286: +{
287: + uint32 pos = 0;
288: +
289: + if ((ctx->used) || (length < 64)) {
290: + int cpy = MIN(length, 64 - ctx->used);
291: + memmove(&ctx->buffer[ctx->used], input, cpy);
292: + ctx->used += cpy;
293: + length -= cpy;
294: + pos += cpy;
295: +
296: + if (ctx->used == 64) {
297: + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), ctx->buffer, 64);
298: + ctx->used = 0;
299: + ctx->next = (ctx->next + 1) % 8;
300: + }
301: + }
302: +
303: + while (length >= 64) {
304: + MD5_Update(MD5P8_Contexts_c(ctx, ctx->next), &input[pos], 64);
305: + ctx->next = (ctx->next + 1) % 8;
306: + pos += 64;
307: + length -= 64;
308: + }
309: +
310: + if (length) {
311: + memcpy(ctx->buffer, &input[pos], length);
312: + ctx->used = length;
313: + }
314: +}
315: +
316: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
317: +{
318: + int i;
319: + uint32 low = 0, high = 0, sub = ctx->used ? 64 - ctx->used : 0;
320: + if (ctx->used) {
321: + uchar tmp[64];
322: + memset(tmp, 0, 64);
323: + MD5P8_Update(ctx, tmp, 64 - ctx->used);
324: + }
325: + memset(ctx->buffer, 0, 64);
326: + while (ctx->next != 0) {
327: + MD5P8_Update(ctx, ctx->buffer, 64);
328: + sub += 64;
329: + }
330: +
331: + uchar state[34*4] = {0};
332: +
333: + for (i = 0; i < 8; i++) {
334: + MD5_CTX* md = MD5P8_Contexts_c(ctx, i);
335: +#ifdef USE_OPENSSL
336: + if (low + md->Nl < low) high++;
337: + low += md->Nl;
338: + high += md->Nh;
339: +#else
340: + if (low + md->totalN < low) high++;
341: + low += md->totalN;
342: + high += md->totalN2;
343: +#endif
344: + SIVALu(state, i*16, md->A);
345: + SIVALu(state, i*16 + 4, md->B);
346: + SIVALu(state, i*16 + 8, md->C);
347: + SIVALu(state, i*16 + 12, md->D);
348: + }
349: +
350: +#ifndef USE_OPENSSL
351: + high = (low >> 29) | (high << 3);
352: + low = (low << 3);
353: +#endif
354: +
355: + sub <<= 3;
356: + if (low - sub > low) high--;
357: + low -= sub;
358: +
359: + SIVALu(state, 32*4, low);
360: + SIVALu(state, 33*4, high);
361: +
362: + MD5_CTX md;
363: + MD5_Init(&md);
364: + MD5_Update(&md, state, 34*4);
365: + MD5_Final(digest, &md);
366: +}
367: diff --git a/lib/mdigest.h b/lib/mdigest.h
368: --- a/lib/mdigest.h
369: +++ b/lib/mdigest.h
370: @@ -27,3 +27,14 @@ void md5_begin(md_context *ctx);
371: void md5_update(md_context *ctx, const uchar *input, uint32 length);
372: void md5_result(md_context *ctx, uchar digest[MD5_DIGEST_LEN]);
373: #endif
374: +
375: +typedef struct {
376: + uchar context_storage[1024];
377: + uchar buffer[512];
378: + unsigned int used;
379: + unsigned int next;
380: +} MD5P8_CTX;
381: +
382: +void MD5P8_Init(MD5P8_CTX *ctx);
383: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length);
384: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
385: diff --git a/match.c b/match.c
386: --- a/match.c
387: +++ b/match.c
388: @@ -164,6 +164,8 @@ static void hash_search(int f,struct sum_struct *s,
389: if (DEBUG_GTE(DELTASUM, 3))
390: rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
391:
392: + checksum2_enable_prefetch(buf, len, s->blength);
393: +
394: offset = aligned_offset = aligned_i = 0;
395:
396: end = len + 1 - s->sums[s->count-1].len;
397: @@ -226,7 +228,7 @@ static void hash_search(int f,struct sum_struct *s,
398:
399: if (!done_csum2) {
400: map = (schar *)map_ptr(buf,offset,l);
401: - get_checksum2((char *)map,l,sum2);
402: + get_checksum2((char *)map, l, sum2, offset);
403: done_csum2 = 1;
404: }
405:
406: @@ -268,7 +270,7 @@ static void hash_search(int f,struct sum_struct *s,
407: sum = get_checksum1((char *)map, l);
408: if (sum != s->sums[i].sum1)
409: goto check_want_i;
410: - get_checksum2((char *)map, l, sum2);
411: + get_checksum2((char *)map, l, sum2, aligned_offset);
412: if (memcmp(sum2, s->sums[i].sum2, s->s2length) != 0)
413: goto check_want_i;
414: /* OK, we have a re-alignment match. Bump the offset
415: @@ -335,6 +337,8 @@ static void hash_search(int f,struct sum_struct *s,
416: matched(f, s, buf, offset - s->blength, -2);
417: } while (++offset < end);
418:
419: + checksum2_disable_prefetch();
420: +
421: matched(f, s, buf, len, -1);
422: map_ptr(buf, len-1, 1);
423: }
424: diff --git a/simd-checksum-x86_64.cpp b/simd-checksum-x86_64.cpp
425: --- a/simd-checksum-x86_64.cpp
426: +++ b/simd-checksum-x86_64.cpp
427: @@ -49,13 +49,33 @@
428: * use of the target attribute, selecting the fastest code path based on
429: * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
430: * GCC 4.x are not supported to ease configure.ac logic.
431: + *
432: + * ----
433: + *
434: + * get_checksum2() is optimized for the case where the selected transfer
435: + * checksum is MD5. MD5 can't be made significantly faster with SIMD
436: + * instructions than the assembly version already included but SIMD
437: + * instructions can be used to hash multiple streams in parallel (see
438: + * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
439: + * block-matching algorithm hashes the blocks independently (in contrast to
440: + * the whole-file checksum) this method can be employed here.
441: + *
442: + * To prevent needing to modify the core rsync sources significantly, a
443: + * prefetching strategy is used. When a checksum2 is requested, the code
444: + * reads ahead several blocks, creates the MD5 hashes for each block in
445: + * parallel, returns the hash for the first block, and caches the results
446: + * for the other blocks to return in future calls to get_checksum2().
447: */
448:
449: #ifdef __x86_64__
450: #ifdef __cplusplus
451:
452: +extern "C" {
453: +
454: #include "rsync.h"
455:
456: +}
457: +
458: #ifdef HAVE_SIMD
459:
460: #include <immintrin.h>
461: @@ -480,9 +500,235 @@ uint32 get_checksum1(char *buf1, int32 len)
462: return get_checksum1_cpp(buf1, len);
463: }
464:
465: -} // extern "C"
466: +#if !defined(BENCHMARK_SIMD_CHECKSUM1)
467:
468: -#ifdef BENCHMARK_SIMD_CHECKSUM1
469: +// see simd-md5-parallel-x86_64.cpp
470: +extern int md5_parallel_slots();
471: +extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
472: +
473: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 */
474: +
475: +#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
476: +
477: +#define PREFETCH_ENABLE 1 // debugging
478: +
479: +#if 0 // debugging
480: +#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
481: +#else
482: +#define PREFETCH_PRINTF(f_, ...) (void)0;
483: +#endif
484: +
485: +#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
486: +#define PREFETCH_MAX_BLOCKS 8
487: +
488: +typedef struct {
489: + int in_use;
490: + OFF_T offset;
491: + int32 len;
492: + char sum[SUM_LENGTH];
493: +} prefetch_sum_t;
494: +
495: +typedef struct {
496: + struct map_struct *map;
497: + OFF_T len;
498: + OFF_T last;
499: + int32 blocklen;
500: + int blocks;
501: + prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
502: +} prefetch_t;
503: +
504: +prefetch_t *prefetch;
505: +
506: +extern int xfersum_type;
507: +extern int checksum_seed;
508: +extern int proper_seed_order;
509: +extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
510: +
511: +extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
512: +
513: +void checksum2_disable_prefetch()
514: +{
515: + if (prefetch) {
516: + PREFETCH_PRINTF("checksum2_disable_prefetch\n");
517: + free(prefetch);
518: + prefetch = NULL;
519: + }
520: +}
521: +
522: +void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
523: +{
524: +#ifdef PREFETCH_ENABLE
525: + checksum2_disable_prefetch();
526: + int slots = md5_parallel_slots();
527: + if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
528: + prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
529: + memset(prefetch, 0, sizeof(prefetch_t));
530: + prefetch->map = map;
531: + prefetch->len = len;
532: + prefetch->last = 0;
533: + prefetch->blocklen = blocklen;
534: + prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
535: + PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
536: + }
537: +#endif
538: +}
539: +
540: +static inline void checksum2_reset_prefetch()
541: +{
542: + for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
543: + prefetch->sums[i].in_use = 0;
544: + }
545: +}
546: +
547: +static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
548: +{
549: + if (prefetch->sums[0].in_use) {
550: + if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
551: + memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
552: + for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
553: + prefetch->sums[i] = prefetch->sums[i + 1];
554: + }
555: + prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
556: + PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
557: + return 1;
558: + } else {
559: + // unexpected access, reset cache
560: + PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
561: + checksum2_reset_prefetch();
562: + }
563: + }
564: + return 0;
565: +}
566: +
567: +static int checksum2_perform_prefetch(OFF_T prefetch_offset)
568: +{
569: + int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
570: + if (blocks < 2) return 0; // fall through to non-simd, probably faster
571: +
572: + int32 total = 0;
573: + int i;
574: + for (i = 0; i < blocks; i++) {
575: + prefetch->sums[i].offset = prefetch_offset + total;
576: + prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
577: + prefetch->sums[i].in_use = 0;
578: + total += prefetch->sums[i].len;
579: + }
580: + for (; i < PREFETCH_MAX_BLOCKS; i++) {
581: + prefetch->sums[i].in_use = 0;
582: + }
583: +
584: + uchar seedbuf[4];
585: + SIVALu(seedbuf, 0, checksum_seed);
586: +
587: + PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
588: + char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
589: + char* bufs[PREFETCH_MAX_BLOCKS] = {0};
590: + int lens[PREFETCH_MAX_BLOCKS] = {0};
591: + char* sums[PREFETCH_MAX_BLOCKS] = {0};
592: + for (i = 0; i < blocks; i++) {
593: + bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
594: + lens[i] = prefetch->sums[i].len;
595: + sums[i] = prefetch->sums[i].sum;
596: + }
597: + if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
598: + for (i = 0; i < blocks; i++) {
599: + prefetch->sums[i].in_use = 1;
600: + }
601: + return 1;
602: + } else {
603: + // this should never be, abort
604: + PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
605: + checksum2_disable_prefetch();
606: + }
607: + return 0;
608: +}
609: +
610: +void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
611: +{
612: + if (prefetch) {
613: + PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
614: + OFF_T last = prefetch->last;
615: + prefetch->last = prefetch_offset;
616: + if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
617: + // we're looking around trying to align blocks, prefetching will slow things down
618: + PREFETCH_PRINTF("get_checksum2 SEEK\n");
619: + checksum2_reset_prefetch();
620: + } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
621: + // hit
622: + return;
623: + } else if (checksum2_perform_prefetch(prefetch_offset)) {
624: + if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
625: + // hit; should always be as we just fetched this data
626: + return;
627: + } else {
628: + // this should never be, abort
629: + PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
630: + checksum2_disable_prefetch();
631: + }
632: + }
633: + }
634: + get_checksum2_nosimd(buf, len, sum, prefetch_offset);
635: +}
636: +#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
637: +
638: +} // "C"
639: +
640: +/* Benchmark compilation
641: +
642: + The get_checksum1() benchmark runs through all available code paths in a
643: + single execution, the get_checksum2()/MD5 and MD5P8 benchmark needs to be
644: + recompiled for each code path (it always uses the fastest path available
645: + on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
646: + be used when applicable regardless of rsync being built with OpenSSL.
647: +
648: + Something like the following should compile and run the benchmarks:
649: +
650: + # if gcc
651: + export CC=gcc
652: + export CXX=g++
653: + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
654: +
655: + # else if clang
656: + export CC=clang
657: + export CXX=clang++
658: + export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
659: +
660: + # /if
661: +
662: + export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
663: + export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
664: + export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
665: + export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
666: +
667: + rm bench_csum*
668: +
669: + ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
670: +
671: + $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
672: +
673: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
674: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
675: +
676: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
677: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
678: +
679: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
680: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
681: +
682: + ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
683: +
684: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
685: + $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
686: +
687: + ./bench_csum1.all
688: + ./bench_csum2.asm
689: + ./bench_csum2.openssl
690: + ./bench_csum2.sse2
691: + ./bench_csum2.avx2
692: +
693: + */
694: +
695: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
696: #pragma clang optimize off
697: #pragma GCC push_options
698: #pragma GCC optimize ("O0")
699: @@ -493,7 +739,9 @@ uint32 get_checksum1(char *buf1, int32 len)
700: #ifndef CLOCK_MONOTONIC_RAW
701: #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
702: #endif
703: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
704:
705: +#ifdef BENCHMARK_SIMD_CHECKSUM1
706: static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
707: struct timespec start, end;
708: uint64_t us;
709: @@ -509,7 +757,7 @@ static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int
710: clock_gettime(CLOCK_MONOTONIC_RAW, &end);
711: us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
712: cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
713: - printf("%-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
714: + printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
715: }
716:
717: static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
718: @@ -533,10 +781,108 @@ int main() {
719: free(buf);
720: return 0;
721: }
722: +#endif /* BENCHMARK_SIMD_CHECKSUM1 */
723:
724: +#ifdef BENCHMARK_SIMD_CHECKSUM2
725: +static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
726: + struct timespec start, end;
727: + uint64_t us;
728: + unsigned char cs1[16];
729: + unsigned char cs2[16];
730: + int i;
731: +
732: + clock_gettime(CLOCK_MONOTONIC_RAW, &start);
733: + for (i = 0; i < ROUNDS; i++) {
734: + func(buf, len, (char*)cs1);
735: + }
736: + clock_gettime(CLOCK_MONOTONIC_RAW, &end);
737: + us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
738: +
739: + func2(buf, len, (char*)cs2);
740: +
741: + float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
742: + printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
743: + for (i = 0; i < 16; i++) {
744: + printf("%02x", cs1[i] & 0xFF);
745: + }
746: + printf(" :: ");
747: + for (i = 0; i < 16; i++) {
748: + printf("%02x", cs2[i] & 0xFF);
749: + }
750: + printf("\n");
751: +}
752: +
753: +static void benchmark_inner(char* buf, int32 len, char* sum_out) {
754: + // This should produce the same output for different optimizations
755: + // levels, not the same as sanity_check()
756: +
757: + char* bufs[8] = {0};
758: + int lens[8] = {0};
759: + char* sums[8] = {0};
760: +
761: + bufs[0] = buf;
762: + lens[0] = len;
763: + sums[0] = sum_out;
764: + md5_parallel(1, bufs, lens, sums, NULL, NULL);
765: +}
766: +
767: +extern "C" {
768: +extern void MD5P8_Init_c(MD5P8_CTX *ctx);
769: +extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
770: +extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
771: +}
772: +
773: +static void sanity_check(char* buf, int32 len, char* sum_out) {
774: + // This should produce the same output for different optimizations
775: + // levels, not the same as benchmark_inner()
776: + if (md5_parallel_slots() <= 1) {
777: + MD5P8_CTX m5p8;
778: + MD5P8_Init_c(&m5p8);
779: + MD5P8_Update_c(&m5p8, (uchar *)buf, len);
780: + MD5P8_Final_c((uchar *)sum_out, &m5p8);
781: + } else {
782: + MD5P8_CTX m5p8;
783: + MD5P8_Init(&m5p8);
784: + MD5P8_Update(&m5p8, (uchar *)buf, len);
785: + MD5P8_Final((uchar *)sum_out, &m5p8);
786: + }
787: +}
788: +
789: +int main() {
790: + // This benchmarks the parallel MD5 checksum rather than get_checksum2()
791: + // as the latter would require compiling in a lot of rsync's code, but
792: + // it touches all the same internals so the performance should be nearly
793: + // identical.
794: +
795: + int i;
796: + char* buf = (char*)malloc(BLOCK_LEN);
797: + for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
798: +
799: + const char* method = "?";
800: + switch (md5_parallel_slots()) {
801: + case 8: method = "AVX2"; break;
802: + case 4: method = "SSE2"; break;
803: +#ifdef USE_OPENSSL
804: + case 1: method = "OpenSSL"; break;
805: +#elif (CSUM_CHUNK == 64)
806: + case 1: method = "ASM"; break;
807: +#else
808: + // this won't happen unless you modified code somewhere
809: + case 1: method = "Raw-C"; break;
810: +#endif
811: + }
812: +
813: + benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
814: +
815: + free(buf);
816: + return 0;
817: +}
818: +#endif /* BENCHMARK_SIMD_CHECKSUM2 */
819: +
820: +#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
821: #pragma GCC pop_options
822: #pragma clang optimize on
823: -#endif /* BENCHMARK_SIMD_CHECKSUM1 */
824: +#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
825:
826: #endif /* HAVE_SIMD */
827: #endif /* __cplusplus */
828: diff --git a/simd-md5-parallel-x86_64.cpp b/simd-md5-parallel-x86_64.cpp
829: new file mode 100644
830: --- /dev/null
831: +++ b/simd-md5-parallel-x86_64.cpp
832: @@ -0,0 +1,1138 @@
833: +/*
834: + * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
835: + *
836: + * Original author: Nicolas Noble, 2017
837: + * Modifications: Jorrit Jongma, 2020
838: + *
839: + * The original code was released in the public domain by the original author,
840: + * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
841: + * in case public domain does not apply in your country. These modifications
842: + * are likewise released in the public domain, with the same MIT license
843: + * fallback.
844: + *
845: + * The original publication can be found at:
846: + *
847: + * https://github.com/nicolasnoble/sse-hash
848: + */
849: +/*
850: + * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
851: + * MD5 code has been removed and those code paths rerouted to use the MD5
852: + * code already present in rsync, and wrapper functions have been added. The
853: + * MD5P8 code is also new, and is the reason for the new stride parameter.
854: + *
855: + * This code allows multiple independent MD5 streams to be processed in
856: + * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
857: + * lower than that of the original C routines for MD5, the processing of
858: + * additional streams is "for free".
859: + *
860: + * Single streams are rerouted to rsync's normal MD5 code as that is faster
861: + * for that case. A further optimization is possible by using SSE2 code on
862: + * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
863: + * implemented here as it would require some restructuring, and in practise
864: + * the code here is only rarely called with less than the maximum amount of
865: + * streams (typically once at the end of each checksum2'd file).
866: + *
867: + * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
868: + * - Intel Atom D2700 302 334 166 664 N/A N/A
869: + * - Intel i7-7700hq 351 376 289 1156 273 2184
870: + * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
871: + */
872: +
873: +#ifdef __x86_64__
874: +#ifdef __cplusplus
875: +
876: +extern "C" {
877: +
878: +#include "rsync.h"
879: +
880: +}
881: +
882: +#ifdef HAVE_SIMD
883: +
884: +#ifndef BENCHMARK_SIMD_CHECKSUM2
885: +#define PMD5_ALLOW_SSE2 // debugging
886: +#define PMD5_ALLOW_AVX2 // debugging
887: +#endif
888: +
889: +#ifdef PMD5_ALLOW_AVX2
890: +#ifndef PMD5_ALLOW_SSE2
891: +#define PMD5_ALLOW_SSE2
892: +#endif
893: +#endif
894: +
895: +#include <stdint.h>
896: +#include <string.h>
897: +
898: +#include <immintrin.h>
899: +
900: +/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
901: +#ifdef __clang__
902: +#define MVSTATIC
903: +#else
904: +#define MVSTATIC static
905: +#endif
906: +
907: +// Missing from the headers on gcc 6 and older, clang 8 and older
908: +typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
909: +typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
910: +
911: +#define PMD5_SLOTS_DEFAULT 0
912: +#define PMD5_SLOTS_SSE2 4
913: +#define PMD5_SLOTS_AVX2 8
914: +#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
915: +
916: +#ifdef PMD5_ALLOW_SSE2
917: +__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
918: +{
919: + return PMD5_SLOTS_SSE2;
920: +}
921: +#endif
922: +
923: +#ifdef PMD5_ALLOW_AVX2
924: +__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
925: +{
926: + return PMD5_SLOTS_AVX2;
927: +}
928: +#endif
929: +
930: +__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
931: +{
932: + return PMD5_SLOTS_DEFAULT;
933: +}
934: +
935: +/* The parallel MD5 context structure. */
936: +typedef struct {
937: + __m128i state_sse2[4];
938: + __m256i state_avx2[4];
939: + uint64_t len[PMD5_SLOTS_MAX];
940: +} pmd5_context;
941: +
942: +/* The status returned by the various functions below. */
943: +typedef enum {
944: + PMD5_SUCCESS,
945: + PMD5_INVALID_SLOT,
946: + PMD5_UNALIGNED_UPDATE,
947: +} pmd5_status;
948: +
949: +/* Initializes all slots in the given pmd5 context. */
950: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
951: +
952: +/* Initializes a single slot out in the given pmd5 context. */
953: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
954: +
955: +/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
956: + The stream pointers will be incremented accordingly.
957: + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
958: + The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
959: + Stride defaults to 64 if 0 is passed. */
960: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
961: +
962: +/* Makes an MD5 update on all slots in parallel, given different lengths.
963: + The stream pointers will be incremented accordingly.
964: + The lengths will be decreased accordingly. Not all data might be consumed.
965: + It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
966: + The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
967: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
968: +
969: +/* Finishes all slots at once. Fills in all digests. */
970: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
971: +
972: +/* Finishes one slot. The other slots will be unnaffected. The finished slot can then continue to hash garbage using
973: + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
974: +static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
975: +
976: +/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
977: + multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
978: + a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
979: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
980: +
981: +/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
982: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
983: +
984: +/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
985: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
986: +
987: +#define S11 7
988: +#define S12 12
989: +#define S13 17
990: +#define S14 22
991: +#define S21 5
992: +#define S22 9
993: +#define S23 14
994: +#define S24 20
995: +#define S31 4
996: +#define S32 11
997: +#define S33 16
998: +#define S34 23
999: +#define S41 6
1000: +#define S42 10
1001: +#define S43 15
1002: +#define S44 21
1003: +
1004: +#define T1 0xD76AA478
1005: +#define T2 0xE8C7B756
1006: +#define T3 0x242070DB
1007: +#define T4 0xC1BDCEEE
1008: +#define T5 0xF57C0FAF
1009: +#define T6 0x4787C62A
1010: +#define T7 0xA8304613
1011: +#define T8 0xFD469501
1012: +#define T9 0x698098D8
1013: +#define T10 0x8B44F7AF
1014: +#define T11 0xFFFF5BB1
1015: +#define T12 0x895CD7BE
1016: +#define T13 0x6B901122
1017: +#define T14 0xFD987193
1018: +#define T15 0xA679438E
1019: +#define T16 0x49B40821
1020: +#define T17 0xF61E2562
1021: +#define T18 0xC040B340
1022: +#define T19 0x265E5A51
1023: +#define T20 0xE9B6C7AA
1024: +#define T21 0xD62F105D
1025: +#define T22 0x02441453
1026: +#define T23 0xD8A1E681
1027: +#define T24 0xE7D3FBC8
1028: +#define T25 0x21E1CDE6
1029: +#define T26 0xC33707D6
1030: +#define T27 0xF4D50D87
1031: +#define T28 0x455A14ED
1032: +#define T29 0xA9E3E905
1033: +#define T30 0xFCEFA3F8
1034: +#define T31 0x676F02D9
1035: +#define T32 0x8D2A4C8A
1036: +#define T33 0xFFFA3942
1037: +#define T34 0x8771F681
1038: +#define T35 0x6D9D6122
1039: +#define T36 0xFDE5380C
1040: +#define T37 0xA4BEEA44
1041: +#define T38 0x4BDECFA9
1042: +#define T39 0xF6BB4B60
1043: +#define T40 0xBEBFBC70
1044: +#define T41 0x289B7EC6
1045: +#define T42 0xEAA127FA
1046: +#define T43 0xD4EF3085
1047: +#define T44 0x04881D05
1048: +#define T45 0xD9D4D039
1049: +#define T46 0xE6DB99E5
1050: +#define T47 0x1FA27CF8
1051: +#define T48 0xC4AC5665
1052: +#define T49 0xF4292244
1053: +#define T50 0x432AFF97
1054: +#define T51 0xAB9423A7
1055: +#define T52 0xFC93A039
1056: +#define T53 0x655B59C3
1057: +#define T54 0x8F0CCC92
1058: +#define T55 0xFFEFF47D
1059: +#define T56 0x85845DD1
1060: +#define T57 0x6FA87E4F
1061: +#define T58 0xFE2CE6E0
1062: +#define T59 0xA3014314
1063: +#define T60 0x4E0811A1
1064: +#define T61 0xF7537E82
1065: +#define T62 0xBD3AF235
1066: +#define T63 0x2AD7D2BB
1067: +#define T64 0xEB86D391
1068: +
1069: +#define ROTL_SSE2(x, n) { \
1070: + __m128i s; \
1071: + s = _mm_srli_epi32(x, 32 - n); \
1072: + x = _mm_slli_epi32(x, n); \
1073: + x = _mm_or_si128(x, s); \
1074: +};
1075: +
1076: +#define ROTL_AVX2(x, n) { \
1077: + __m256i s; \
1078: + s = _mm256_srli_epi32(x, 32 - n); \
1079: + x = _mm256_slli_epi32(x, n); \
1080: + x = _mm256_or_si256(x, s); \
1081: +};
1082: +
1083: +#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
1084: +#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
1085: +#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
1086: +#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))
1087: +
1088: +#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
1089: +#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
1090: +#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
1091: +#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
1092: +
1093: +#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
1094: + a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
1095: + ROTL_SSE2(a, s); \
1096: + a = _mm_add_epi32(a, b); \
1097: +}
1098: +
1099: +#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
1100: + a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
1101: + ROTL_AVX2(a, s); \
1102: + a = _mm256_add_epi32(a, b); \
1103: +}
1104: +
1105: +#define IA 0x67452301
1106: +#define IB 0xefcdab89
1107: +#define IC 0x98badcfe
1108: +#define ID 0x10325476
1109: +
1110: +#define GET_MD5_DATA(dest, src, pos) \
1111: + dest = \
1112: + ((uint32_t) src[pos + 0]) << 0 | \
1113: + ((uint32_t) src[pos + 1]) << 8 | \
1114: + ((uint32_t) src[pos + 2]) << 16 | \
1115: + ((uint32_t) src[pos + 3]) << 24
1116: +
1117: +#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
1118: + uint32_t v0, v1, v2, v3; \
1119: + GET_MD5_DATA(v0, src[0], pos); \
1120: + GET_MD5_DATA(v1, src[1], pos); \
1121: + GET_MD5_DATA(v2, src[2], pos); \
1122: + GET_MD5_DATA(v3, src[3], pos); \
1123: + dest = _mm_setr_epi32(v0, v1, v2, v3); \
1124: +}
1125: +
1126: +#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
1127: + uint32_t v0, v1, v2, v3; \
1128: + uint32_t v4, v5, v6, v7; \
1129: + GET_MD5_DATA(v0, src[0], pos); \
1130: + GET_MD5_DATA(v1, src[1], pos); \
1131: + GET_MD5_DATA(v2, src[2], pos); \
1132: + GET_MD5_DATA(v3, src[3], pos); \
1133: + GET_MD5_DATA(v4, src[4], pos); \
1134: + GET_MD5_DATA(v5, src[5], pos); \
1135: + GET_MD5_DATA(v6, src[6], pos); \
1136: + GET_MD5_DATA(v7, src[7], pos); \
1137: + dest = _mm256_setr_epi32(v0, v1, v2, v3, \
1138: + v4, v5, v6, v7); \
1139: +}
1140: +
1141: +#define PUT_MD5_DATA(dest, val, pos) { \
1142: + dest[pos + 0] = (val >> 0) & 0xff; \
1143: + dest[pos + 1] = (val >> 8) & 0xff; \
1144: + dest[pos + 2] = (val >> 16) & 0xff; \
1145: + dest[pos + 3] = (val >> 24) & 0xff; \
1146: +}
1147: +
1148: +const static uint8_t md5_padding[64] = {
1149: + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1150: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1151: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1152: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1153: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1154: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1155: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1156: + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1157: +};
1158: +
1159: +#ifdef PMD5_ALLOW_SSE2
1160: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1161: +{
1162: + int i;
1163: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1164: + ctx->len[i] = 0;
1165: + }
1166: +
1167: + ctx->state_sse2[0] = _mm_set1_epi32(IA);
1168: + ctx->state_sse2[1] = _mm_set1_epi32(IB);
1169: + ctx->state_sse2[2] = _mm_set1_epi32(IC);
1170: + ctx->state_sse2[3] = _mm_set1_epi32(ID);
1171: +
1172: + return PMD5_SUCCESS;
1173: +}
1174: +#endif
1175: +
1176: +#ifdef PMD5_ALLOW_AVX2
1177: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1178: +{
1179: + int i;
1180: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1181: + ctx->len[i] = 0;
1182: + }
1183: +
1184: + ctx->state_avx2[0] = _mm256_set1_epi32(IA);
1185: + ctx->state_avx2[1] = _mm256_set1_epi32(IB);
1186: + ctx->state_avx2[2] = _mm256_set1_epi32(IC);
1187: + ctx->state_avx2[3] = _mm256_set1_epi32(ID);
1188: +
1189: + return PMD5_SUCCESS;
1190: +}
1191: +#endif
1192: +
1193: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
1194: +{
1195: + return PMD5_INVALID_SLOT;
1196: +}
1197: +
1198: +#ifdef PMD5_ALLOW_SSE2
1199: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1200: +{
1201: + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1202: + return PMD5_INVALID_SLOT;
1203: +
1204: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1205: + int i;
1206: +
1207: + for (i = 0; i < 4; i++) {
1208: + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1209: + }
1210: +
1211: + v[0][slot] = a;
1212: + v[1][slot] = b;
1213: + v[2][slot] = c;
1214: + v[3][slot] = d;
1215: +
1216: + for (i = 0; i < 4; i++) {
1217: + ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
1218: + }
1219: +
1220: + return PMD5_SUCCESS;
1221: +}
1222: +#endif
1223: +
1224: +#ifdef PMD5_ALLOW_AVX2
1225: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1226: +{
1227: + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1228: + return PMD5_INVALID_SLOT;
1229: +
1230: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1231: + int i;
1232: +
1233: + for (i = 0; i < 4; i++) {
1234: + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1235: + }
1236: +
1237: + v[0][slot] = a;
1238: + v[1][slot] = b;
1239: + v[2][slot] = c;
1240: + v[3][slot] = d;
1241: +
1242: + for (i = 0; i < 4; i++) {
1243: + ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
1244: + }
1245: +
1246: + return PMD5_SUCCESS;
1247: +}
1248: +#endif
1249: +
1250: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
1251: +{
1252: + return PMD5_INVALID_SLOT;
1253: +}
1254: +
1255: +#ifdef PMD5_ALLOW_SSE2
1256: +__attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1257: +{
1258: + if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
1259: + return PMD5_INVALID_SLOT;
1260: +
1261: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
1262: + int i;
1263: +
1264: + for (i = 0; i < 4; i++) {
1265: + _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
1266: + }
1267: +
1268: + *a = v[0][slot];
1269: + *b = v[1][slot];
1270: + *c = v[2][slot];
1271: + *d = v[3][slot];
1272: +
1273: + return PMD5_SUCCESS;
1274: +}
1275: +#endif
1276: +
1277: +#ifdef PMD5_ALLOW_AVX2
1278: +__attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1279: +{
1280: + if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
1281: + return PMD5_INVALID_SLOT;
1282: +
1283: + __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
1284: + int i;
1285: +
1286: + for (i = 0; i < 4; i++) {
1287: + _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
1288: + }
1289: +
1290: + *a = v[0][slot];
1291: + *b = v[1][slot];
1292: + *c = v[2][slot];
1293: + *d = v[3][slot];
1294: +
1295: + return PMD5_SUCCESS;
1296: +}
1297: +#endif
1298: +
1299: +__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
1300: +{
1301: + return PMD5_INVALID_SLOT;
1302: +}
1303: +
1304: +static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
1305: +{
1306: + return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
1307: +}
1308: +
1309: +#ifdef PMD5_ALLOW_SSE2
1310: +__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1311: +{
1312: + __m128i W[MD5_DIGEST_LEN], a, b, c, d;
1313: +
1314: + GET_PMD5_DATA_SSE2(W[ 0], data, 0);
1315: + GET_PMD5_DATA_SSE2(W[ 1], data, 4);
1316: + GET_PMD5_DATA_SSE2(W[ 2], data, 8);
1317: + GET_PMD5_DATA_SSE2(W[ 3], data, 12);
1318: + GET_PMD5_DATA_SSE2(W[ 4], data, 16);
1319: + GET_PMD5_DATA_SSE2(W[ 5], data, 20);
1320: + GET_PMD5_DATA_SSE2(W[ 6], data, 24);
1321: + GET_PMD5_DATA_SSE2(W[ 7], data, 28);
1322: + GET_PMD5_DATA_SSE2(W[ 8], data, 32);
1323: + GET_PMD5_DATA_SSE2(W[ 9], data, 36);
1324: + GET_PMD5_DATA_SSE2(W[10], data, 40);
1325: + GET_PMD5_DATA_SSE2(W[11], data, 44);
1326: + GET_PMD5_DATA_SSE2(W[12], data, 48);
1327: + GET_PMD5_DATA_SSE2(W[13], data, 52);
1328: + GET_PMD5_DATA_SSE2(W[14], data, 56);
1329: + GET_PMD5_DATA_SSE2(W[15], data, 60);
1330: +
1331: + a = ctx->state_sse2[0];
1332: + b = ctx->state_sse2[1];
1333: + c = ctx->state_sse2[2];
1334: + d = ctx->state_sse2[3];
1335: +
1336: + SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
1337: + SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
1338: + SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
1339: + SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
1340: + SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
1341: + SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
1342: + SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
1343: + SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
1344: + SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
1345: + SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
1346: + SET_SSE2(F, c, d, a, b, W[10], S13, 11);
1347: + SET_SSE2(F, b, c, d, a, W[11], S14, 12);
1348: + SET_SSE2(F, a, b, c, d, W[12], S11, 13);
1349: + SET_SSE2(F, d, a, b, c, W[13], S12, 14);
1350: + SET_SSE2(F, c, d, a, b, W[14], S13, 15);
1351: + SET_SSE2(F, b, c, d, a, W[15], S14, 16);
1352: +
1353: + SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
1354: + SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
1355: + SET_SSE2(G, c, d, a, b, W[11], S23, 19);
1356: + SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
1357: + SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
1358: + SET_SSE2(G, d, a, b, c, W[10], S22, 22);
1359: + SET_SSE2(G, c, d, a, b, W[15], S23, 23);
1360: + SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
1361: + SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
1362: + SET_SSE2(G, d, a, b, c, W[14], S22, 26);
1363: + SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
1364: + SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
1365: + SET_SSE2(G, a, b, c, d, W[13], S21, 29);
1366: + SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
1367: + SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
1368: + SET_SSE2(G, b, c, d, a, W[12], S24, 32);
1369: +
1370: + SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
1371: + SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
1372: + SET_SSE2(H, c, d, a, b, W[11], S33, 35);
1373: + SET_SSE2(H, b, c, d, a, W[14], S34, 36);
1374: + SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
1375: + SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
1376: + SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
1377: + SET_SSE2(H, b, c, d, a, W[10], S34, 40);
1378: + SET_SSE2(H, a, b, c, d, W[13], S31, 41);
1379: + SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
1380: + SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
1381: + SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
1382: + SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
1383: + SET_SSE2(H, d, a, b, c, W[12], S32, 46);
1384: + SET_SSE2(H, c, d, a, b, W[15], S33, 47);
1385: + SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
1386: +
1387: + SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
1388: + SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
1389: + SET_SSE2(I, c, d, a, b, W[14], S43, 51);
1390: + SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
1391: + SET_SSE2(I, a, b, c, d, W[12], S41, 53);
1392: + SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
1393: + SET_SSE2(I, c, d, a, b, W[10], S43, 55);
1394: + SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
1395: + SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
1396: + SET_SSE2(I, d, a, b, c, W[15], S42, 58);
1397: + SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
1398: + SET_SSE2(I, b, c, d, a, W[13], S44, 60);
1399: + SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
1400: + SET_SSE2(I, d, a, b, c, W[11], S42, 62);
1401: + SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
1402: + SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
1403: +
1404: + ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
1405: + ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
1406: + ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
1407: + ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
1408: +}
1409: +#endif
1410: +
1411: +#ifdef PMD5_ALLOW_AVX2
1412: +__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1413: +{
1414: + __m256i W[MD5_DIGEST_LEN], a, b, c, d;
1415: +
1416: + GET_PMD5_DATA_AVX2(W[ 0], data, 0);
1417: + GET_PMD5_DATA_AVX2(W[ 1], data, 4);
1418: + GET_PMD5_DATA_AVX2(W[ 2], data, 8);
1419: + GET_PMD5_DATA_AVX2(W[ 3], data, 12);
1420: + GET_PMD5_DATA_AVX2(W[ 4], data, 16);
1421: + GET_PMD5_DATA_AVX2(W[ 5], data, 20);
1422: + GET_PMD5_DATA_AVX2(W[ 6], data, 24);
1423: + GET_PMD5_DATA_AVX2(W[ 7], data, 28);
1424: + GET_PMD5_DATA_AVX2(W[ 8], data, 32);
1425: + GET_PMD5_DATA_AVX2(W[ 9], data, 36);
1426: + GET_PMD5_DATA_AVX2(W[10], data, 40);
1427: + GET_PMD5_DATA_AVX2(W[11], data, 44);
1428: + GET_PMD5_DATA_AVX2(W[12], data, 48);
1429: + GET_PMD5_DATA_AVX2(W[13], data, 52);
1430: + GET_PMD5_DATA_AVX2(W[14], data, 56);
1431: + GET_PMD5_DATA_AVX2(W[15], data, 60);
1432: +
1433: + a = ctx->state_avx2[0];
1434: + b = ctx->state_avx2[1];
1435: + c = ctx->state_avx2[2];
1436: + d = ctx->state_avx2[3];
1437: +
1438: + SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
1439: + SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
1440: + SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
1441: + SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
1442: + SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
1443: + SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
1444: + SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
1445: + SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
1446: + SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
1447: + SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
1448: + SET_AVX2(F, c, d, a, b, W[10], S13, 11);
1449: + SET_AVX2(F, b, c, d, a, W[11], S14, 12);
1450: + SET_AVX2(F, a, b, c, d, W[12], S11, 13);
1451: + SET_AVX2(F, d, a, b, c, W[13], S12, 14);
1452: + SET_AVX2(F, c, d, a, b, W[14], S13, 15);
1453: + SET_AVX2(F, b, c, d, a, W[15], S14, 16);
1454: +
1455: + SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
1456: + SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
1457: + SET_AVX2(G, c, d, a, b, W[11], S23, 19);
1458: + SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
1459: + SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
1460: + SET_AVX2(G, d, a, b, c, W[10], S22, 22);
1461: + SET_AVX2(G, c, d, a, b, W[15], S23, 23);
1462: + SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
1463: + SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
1464: + SET_AVX2(G, d, a, b, c, W[14], S22, 26);
1465: + SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
1466: + SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
1467: + SET_AVX2(G, a, b, c, d, W[13], S21, 29);
1468: + SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
1469: + SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
1470: + SET_AVX2(G, b, c, d, a, W[12], S24, 32);
1471: +
1472: + SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
1473: + SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
1474: + SET_AVX2(H, c, d, a, b, W[11], S33, 35);
1475: + SET_AVX2(H, b, c, d, a, W[14], S34, 36);
1476: + SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
1477: + SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
1478: + SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
1479: + SET_AVX2(H, b, c, d, a, W[10], S34, 40);
1480: + SET_AVX2(H, a, b, c, d, W[13], S31, 41);
1481: + SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
1482: + SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
1483: + SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
1484: + SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
1485: + SET_AVX2(H, d, a, b, c, W[12], S32, 46);
1486: + SET_AVX2(H, c, d, a, b, W[15], S33, 47);
1487: + SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
1488: +
1489: + SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
1490: + SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
1491: + SET_AVX2(I, c, d, a, b, W[14], S43, 51);
1492: + SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
1493: + SET_AVX2(I, a, b, c, d, W[12], S41, 53);
1494: + SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
1495: + SET_AVX2(I, c, d, a, b, W[10], S43, 55);
1496: + SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
1497: + SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
1498: + SET_AVX2(I, d, a, b, c, W[15], S42, 58);
1499: + SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
1500: + SET_AVX2(I, b, c, d, a, W[13], S44, 60);
1501: + SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
1502: + SET_AVX2(I, d, a, b, c, W[11], S42, 62);
1503: + SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
1504: + SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
1505: +
1506: + ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
1507: + ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
1508: + ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
1509: + ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
1510: +}
1511: +#endif
1512: +
1513: +__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
1514: +{
1515: +}
1516: +
1517: +static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
1518: +{
1519: + const uint8_t * ptrs[PMD5_SLOTS_MAX];
1520: +
1521: + if (!length) return PMD5_SUCCESS;
1522: +
1523: + int slots = pmd5_slots();
1524: +
1525: + if (!stride) stride = 64;
1526: +
1527: + int i;
1528: + for (i = 0; i < slots; i++) {
1529: + ptrs[i] = data[i];
1530: + ctx->len[i] += length;
1531: + if (!ptrs[i]) ptrs[i] = md5_padding;
1532: + }
1533: +
1534: + while (length >= 64) {
1535: + pmd5_process(ctx, ptrs);
1536: + length -= 64;
1537: + for (i = 0; i < slots; i++) {
1538: + if (data[i]) ptrs[i] += stride;
1539: + }
1540: + }
1541: +
1542: + if (length) return PMD5_UNALIGNED_UPDATE;
1543: +
1544: + for (i = 0; i < slots; i++) {
1545: + if (data[i]) data[i] = ptrs[i];
1546: + }
1547: +
1548: + return PMD5_SUCCESS;
1549: +}
1550: +
1551: +static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
1552: +{
1553: + uint64_t length = 0;
1554: + int slots = pmd5_slots();
1555: +
1556: + int i;
1557: + for (i = 0; i < slots; i++) {
1558: + if ((length == 0) || (lengths[i] < length)) length = lengths[i];
1559: + }
1560: +
1561: + for (i = 0; i < slots; i++) {
1562: + lengths[i] -= length;
1563: + }
1564: +
1565: + return pmd5_update_all_simple(ctx, data, length, 0);
1566: +}
1567: +
1568: +static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
1569: +{
1570: + MD5_CTX ctx;
1571: +
1572: + if ((slot >= pmd5_slots()) || (slot < 0))
1573: + return PMD5_INVALID_SLOT;
1574: +
1575: + pmd5_to_md5(pctx, &ctx, slot);
1576: + if (data && length) {
1577: + MD5_Update(&ctx, data, length);
1578: + }
1579: + MD5_Final(digest, &ctx);
1580: +
1581: + return PMD5_SUCCESS;
1582: +}
1583: +
1584: +static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
1585: +{
1586: + return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
1587: +}
1588: +
1589: +static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
1590: +{
1591: + int i;
1592: + for (i = 0; i < pmd5_slots(); i++) {
1593: + pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
1594: + }
1595: + return PMD5_SUCCESS;
1596: +}
1597: +
1598: +static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
1599: +{
1600: + if ((slot >= pmd5_slots()) || (slot < 0))
1601: + return PMD5_INVALID_SLOT;
1602: +
1603: + // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
1604: +
1605: +#ifdef USE_OPENSSL
1606: + pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
1607: +#else
1608: + pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
1609: +#endif
1610: + return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
1611: +}
1612: +
1613: +static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
1614: +{
1615: + if ((slot >= pmd5_slots()) || (slot < 0))
1616: + return PMD5_INVALID_SLOT;
1617: +
1618: + MD5_Init(ctx);
1619: +
1620: +#ifdef USE_OPENSSL
1621: + ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
1622: + ctx->Nh = pctx->len[slot] >> 29;
1623: +
1624: + uint32_t a, b, c, d;
1625: + pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
1626: + if (ret == PMD5_SUCCESS) {
1627: + ctx->A = a;
1628: + ctx->B = b;
1629: + ctx->C = c;
1630: + ctx->D = d;
1631: + }
1632: + return ret;
1633: +#else
1634: + ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
1635: + ctx->totalN2 = pctx->len[slot] >> 32;
1636: + return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
1637: +#endif
1638: +}
1639: +
1640: +/* With GCC 10 putting these implementations inside 'extern "C"' causes an
1641: + assembler error. That worked fine on GCC 5-9 and clang 6-10...
1642: + */
1643: +
1644: +static inline int md5_parallel_slots_cpp()
1645: +{
1646: + int slots = pmd5_slots();
1647: + if (slots == 0) return 1;
1648: + return slots;
1649: +}
1650: +
1651: +static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1652: +{
1653: + int slots = md5_parallel_slots_cpp();
1654: + if ((streams < 1) || (streams > slots)) return 0;
1655: + if (pre4 && post4) return 0;
1656: +
1657: + if (slots == 1) {
1658: + MD5_CTX ctx;
1659: + MD5_Init(&ctx);
1660: + if (pre4) {
1661: + MD5_Update(&ctx, (const unsigned char*)pre4, 4);
1662: + }
1663: + MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
1664: + if (post4) {
1665: + MD5_Update(&ctx, (const unsigned char*)post4, 4);
1666: + }
1667: + if (sum[0]) {
1668: + MD5_Final((uint8_t*)sum[0], &ctx);
1669: + }
1670: + return 0;
1671: + }
1672: +
1673: + int i;
1674: + int active[PMD5_SLOTS_MAX];
1675: + char* buffers[PMD5_SLOTS_MAX];
1676: + uint64_t left[PMD5_SLOTS_MAX];
1677: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1678: + active[i] = streams > i;
1679: + if (i < streams) {
1680: + buffers[i] = buf[i];
1681: + left[i] = (uint64_t)len[i];
1682: + } else {
1683: + buffers[i] = NULL;
1684: + left[i] = 0;
1685: + }
1686: + }
1687: + MD5_CTX results[PMD5_SLOTS_MAX];
1688: +
1689: + pmd5_context ctx_simd;
1690: + if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
1691: +
1692: + if (pre4) {
1693: + char temp_buffers[PMD5_SLOTS_MAX][64];
1694: + int have_any = 0;
1695: + for (i = 0; i < slots; i++) {
1696: + if (active[i]) {
1697: + if (left[i] < 60) {
1698: + MD5_Init(&results[i]);
1699: + MD5_Update(&results[i], (const unsigned char*)pre4, 4);
1700: + MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
1701: + active[i] = 0;
1702: + left[i] = 0;
1703: + } else {
1704: + memcpy(temp_buffers[i], pre4, 4);
1705: + memcpy(temp_buffers[i] + 4, buffers[i], 60);
1706: + buffers[i] += 60;
1707: + left[i] -= 60;
1708: + have_any = 1;
1709: + }
1710: + }
1711: + }
1712: +
1713: + if (have_any) {
1714: + char* ptrs[PMD5_SLOTS_MAX];
1715: + for (i = 0; i < PMD5_SLOTS_MAX; i++) {
1716: + ptrs[i] = &temp_buffers[i][0];
1717: + }
1718: + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
1719: + return 0;
1720: + }
1721: + }
1722: + }
1723: +
1724: + int failed = 0;
1725: + while (true) {
1726: + for (i = 0; i < slots; i++) {
1727: + if (active[i] && (left[i] < 64)) {
1728: + if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
1729: + failed = 1;
1730: + }
1731: + active[i] = 0;
1732: + }
1733: + }
1734: +
1735: + uint64_t shortest = 0;
1736: + for (i = 0; i < slots; i++) {
1737: + if (!active[i]) {
1738: + buffers[i] = NULL;
1739: + } else if ((shortest == 0) || (left[i] < shortest)) {
1740: + shortest = left[i];
1741: + }
1742: + }
1743: +
1744: + if (shortest > 0) {
1745: + shortest = shortest & ~63;
1746: + if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
1747: + failed = 1;
1748: + }
1749: + for (i = 0; i < slots; i++) {
1750: + if (active[i]) {
1751: + left[i] -= shortest;
1752: + }
1753: + }
1754: + }
1755: +
1756: + if (failed) {
1757: + return 0;
1758: + } else {
1759: + int have_any = 0;
1760: + for (i = 0; i < slots; i++) {
1761: + have_any |= active[i];
1762: + }
1763: + if (!have_any) {
1764: + break;
1765: + }
1766: + }
1767: + }
1768: +
1769: + for (i = 0; i < slots; i++) {
1770: + if (i < streams) {
1771: + if (left[i] > 0) {
1772: + // buffer[i] == NULL here
1773: + MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
1774: + }
1775: + if (post4) {
1776: + MD5_Update(&results[i], (const unsigned char*)post4, 4);
1777: + }
1778: + if (sum[i]) {
1779: + MD5_Final((uint8_t*)sum[i], &results[i]);
1780: + }
1781: + }
1782: + }
1783: +
1784: + return 1;
1785: +}
1786: +
1787: +// each pmd5_context needs to be 32-byte aligned
1788: +#define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
1789: +
1790: +static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
1791: +{
1792: + int i;
1793: + for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
1794: + pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
1795: + }
1796: + ctx->used = 0;
1797: + ctx->next = 0;
1798: +}
1799: +
1800: +static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1801: +{
1802: + int slots = pmd5_slots();
1803: + uint32 pos = 0;
1804: +
1805: + if ((ctx->used) || (length < 512)) {
1806: + int cpy = MIN(length, 512 - ctx->used);
1807: + memcpy(&ctx->buffer[ctx->used], input, cpy);
1808: + ctx->used += cpy;
1809: + length -= cpy;
1810: + pos += cpy;
1811: +
1812: + if (ctx->used == 512) {
1813: + if (slots == PMD5_SLOTS_AVX2) {
1814: + const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
1815: + (uint8_t*)ctx->buffer,
1816: + (uint8_t*)(ctx->buffer + 64),
1817: + (uint8_t*)(ctx->buffer + 128),
1818: + (uint8_t*)(ctx->buffer + 192),
1819: + (uint8_t*)(ctx->buffer + 256),
1820: + (uint8_t*)(ctx->buffer + 320),
1821: + (uint8_t*)(ctx->buffer + 384),
1822: + (uint8_t*)(ctx->buffer + 448)
1823: + };
1824: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
1825: + } else {
1826: + const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
1827: + (uint8_t*)ctx->buffer,
1828: + (uint8_t*)(ctx->buffer + 64),
1829: + (uint8_t*)(ctx->buffer + 128),
1830: + (uint8_t*)(ctx->buffer + 192)
1831: + };
1832: + const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
1833: + (uint8_t*)(ctx->buffer + 256),
1834: + (uint8_t*)(ctx->buffer + 320),
1835: + (uint8_t*)(ctx->buffer + 384),
1836: + (uint8_t*)(ctx->buffer + 448)
1837: + };
1838: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
1839: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
1840: + }
1841: + ctx->used = 0;
1842: + }
1843: + }
1844: +
1845: + if (length >= 512) {
1846: + uint32 blocks = length / 512;
1847: + if (slots == PMD5_SLOTS_AVX2) {
1848: + const uint8_t* ptrs[8] = {
1849: + (uint8_t*)(input + pos),
1850: + (uint8_t*)(input + pos + 64),
1851: + (uint8_t*)(input + pos + 128),
1852: + (uint8_t*)(input + pos + 192),
1853: + (uint8_t*)(input + pos + 256),
1854: + (uint8_t*)(input + pos + 320),
1855: + (uint8_t*)(input + pos + 384),
1856: + (uint8_t*)(input + pos + 448)
1857: + };
1858: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
1859: + } else {
1860: + const uint8_t* ptrs1[4] = {
1861: + (uint8_t*)(input + pos),
1862: + (uint8_t*)(input + pos + 64),
1863: + (uint8_t*)(input + pos + 128),
1864: + (uint8_t*)(input + pos + 192)
1865: + };
1866: + const uint8_t* ptrs2[4] = {
1867: + (uint8_t*)(input + pos + 256),
1868: + (uint8_t*)(input + pos + 320),
1869: + (uint8_t*)(input + pos + 384),
1870: + (uint8_t*)(input + pos + 448)
1871: + };
1872: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
1873: + pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
1874: + }
1875: + pos += blocks * 512;
1876: + length -= blocks * 512;
1877: + }
1878: +
1879: + if (length) {
1880: + memcpy(ctx->buffer, &input[pos], length);
1881: + ctx->used = length;
1882: + }
1883: +}
1884: +
1885: +static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1886: +{
1887: + int i;
1888: + uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
1889: + if (ctx->used) {
1890: + uchar tmp[512];
1891: + memset(tmp, 0, 512);
1892: + MD5P8_Update(ctx, tmp, 512 - ctx->used);
1893: + }
1894: +
1895: + uchar state[34*4] = {0};
1896: +
1897: + MD5_CTX tmp;
1898: + for (i = 0; i < 8; i++) {
1899: + if (pmd5_slots() == PMD5_SLOTS_AVX2) {
1900: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1901: + } else if (i < 4) {
1902: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1903: + } else {
1904: + pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
1905: + }
1906: +#ifdef USE_OPENSSL
1907: + if (low + tmp.Nl < low) high++;
1908: + low += tmp.Nl;
1909: + high += tmp.Nh;
1910: +#else
1911: + if (low + tmp.totalN < low) high++;
1912: + low += tmp.totalN;
1913: + high += tmp.totalN2;
1914: +#endif
1915: + SIVALu(state, i*16, tmp.A);
1916: + SIVALu(state, i*16 + 4, tmp.B);
1917: + SIVALu(state, i*16 + 8, tmp.C);
1918: + SIVALu(state, i*16 + 12, tmp.D);
1919: + }
1920: +
1921: +#ifndef USE_OPENSSL
1922: + high = (low >> 29) | (high << 3);
1923: + low = (low << 3);
1924: +#endif
1925: +
1926: + sub <<= 3;
1927: + if (low - sub > low) high--;
1928: + low -= sub;
1929: +
1930: + SIVALu(state, 32*4, low);
1931: + SIVALu(state, 33*4, high);
1932: +
1933: + MD5_CTX md;
1934: + MD5_Init(&md);
1935: + MD5_Update(&md, state, 34*4);
1936: + MD5_Final(digest, &md);
1937: +}
1938: +
1939: +extern "C" {
1940: +
1941: +int md5_parallel_slots()
1942: +{
1943: + return md5_parallel_slots_cpp();
1944: +}
1945: +
1946: +int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1947: +{
1948: + return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
1949: +}
1950: +
1951: +void MD5P8_Init(MD5P8_CTX *ctx)
1952: +{
1953: + MD5P8_Init_cpp(ctx);
1954: +}
1955: +
1956: +void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1957: +{
1958: + MD5P8_Update_cpp(ctx, input, length);
1959: +}
1960: +
1961: +void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1962: +{
1963: + MD5P8_Final_cpp(digest, ctx);
1964: +}
1965: +
1966: +} // "C"
1967: +
1968: +#endif /* HAVE_SIMD */
1969: +#endif /* __cplusplus */
1970: +#endif /* __x86_64__ */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>