embedaddon/rsync/simd-md5-parallel-x86_64.cpp - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / simd-md5-parallel-x86_64.cpp
Revision 1.1: download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (4 years, 3 months ago) by misho
CVS tags: MAIN, HEAD

Initial revision

1: /* 2: * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel. 3: * 4: * Original author: Nicolas Noble, 2017 5: * Modifications: Jorrit Jongma, 2020 6: * 7: * The original code was released in the public domain by the original author, 8: * falling back to the MIT license ( http://www.opensource.org/licenses/MIT ) 9: * in case public domain does not apply in your country. These modifications 10: * are likewise released in the public domain, with the same MIT license 11: * fallback. 12: * 13: * The original publication can be found at: 14: * 15: * https://github.com/nicolasnoble/sse-hash 16: */ 17: /* 18: * Nicolas' original code has been extended to add AVX2 support, all non-SIMD 19: * MD5 code has been removed and those code paths rerouted to use the MD5 20: * code already present in rsync, and wrapper functions have been added. The 21: * MD5P8 code is also new, and is the reason for the new stride parameter. 22: * 23: * This code allows multiple independent MD5 streams to be processed in 24: * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is 25: * lower than that of the original C routines for MD5, the processing of 26: * additional streams is "for free". 27: * 28: * Single streams are rerouted to rsync's normal MD5 code as that is faster 29: * for that case. A further optimization is possible by using SSE2 code on 30: * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not 31: * implemented here as it would require some restructuring, and in practise 32: * the code here is only rarely called with less than the maximum amount of 33: * streams (typically once at the end of each checksum2'd file). 34: * 35: * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8 36: * - Intel Atom D2700 302 334 166 664 N/A N/A 37: * - Intel i7-7700hq 351 376 289 1156 273 2184 38: * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440 39: */ 40: 41: #ifdef __x86_64__ 42: #ifdef __cplusplus 43: 44: extern "C" { 45: 46: #include "rsync.h" 47: 48: } 49: 50: #ifdef HAVE_SIMD 51: 52: #ifndef BENCHMARK_SIMD_CHECKSUM2 53: #define PMD5_ALLOW_SSE2 // debugging 54: #define PMD5_ALLOW_AVX2 // debugging 55: #endif 56: 57: #ifdef PMD5_ALLOW_AVX2 58: #ifndef PMD5_ALLOW_SSE2 59: #define PMD5_ALLOW_SSE2 60: #endif 61: #endif 62: 63: #include <stdint.h> 64: #include <string.h> 65: 66: #include <immintrin.h> 67: 68: /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */ 69: #ifdef __clang__ 70: #define MVSTATIC 71: #else 72: #define MVSTATIC static 73: #endif 74: 75: // Missing from the headers on gcc 6 and older, clang 8 and older 76: typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1))); 77: typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1))); 78: 79: #define PMD5_SLOTS_DEFAULT 0 80: #define PMD5_SLOTS_SSE2 4 81: #define PMD5_SLOTS_AVX2 8 82: #define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2 83: 84: #ifdef PMD5_ALLOW_SSE2 85: __attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots() 86: { 87: return PMD5_SLOTS_SSE2; 88: } 89: #endif 90: 91: #ifdef PMD5_ALLOW_AVX2 92: __attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots() 93: { 94: return PMD5_SLOTS_AVX2; 95: } 96: #endif 97: 98: __attribute__ ((target("default"))) MVSTATIC int pmd5_slots() 99: { 100: return PMD5_SLOTS_DEFAULT; 101: } 102: 103: /* The parallel MD5 context structure. */ 104: typedef struct { 105: __m128i state_sse2[4]; 106: __m256i state_avx2[4]; 107: uint64_t len[PMD5_SLOTS_MAX]; 108: } pmd5_context; 109: 110: /* The status returned by the various functions below. */ 111: typedef enum { 112: PMD5_SUCCESS, 113: PMD5_INVALID_SLOT, 114: PMD5_UNALIGNED_UPDATE, 115: } pmd5_status; 116: 117: /* Initializes all slots in the given pmd5 context. */ 118: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx); 119: 120: /* Initializes a single slot out in the given pmd5 context. */ 121: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot); 122: 123: /* Makes an MD5 update on all slots in parallel, given the same exact length on all streams. 124: The stream pointers will be incremented accordingly. 125: It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot. 126: The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted. 127: Stride defaults to 64 if 0 is passed. */ 128: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride); 129: 130: /* Makes an MD5 update on all slots in parallel, given different lengths. 131: The stream pointers will be incremented accordingly. 132: The lengths will be decreased accordingly. Not all data might be consumed. 133: It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot. 134: The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */ 135: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]); 136: 137: /* Finishes all slots at once. Fills in all digests. */ 138: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]); 139: 140: /* Finishes one slot. The other slots will be unnaffected. The finished slot can then continue to hash garbage using 141: a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */ 142: static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot); 143: 144: /* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a 145: multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using 146: a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */ 147: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length); 148: 149: /* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */ 150: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot); 151: 152: /* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */ 153: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot); 154: 155: #define S11 7 156: #define S12 12 157: #define S13 17 158: #define S14 22 159: #define S21 5 160: #define S22 9 161: #define S23 14 162: #define S24 20 163: #define S31 4 164: #define S32 11 165: #define S33 16 166: #define S34 23 167: #define S41 6 168: #define S42 10 169: #define S43 15 170: #define S44 21 171: 172: #define T1 0xD76AA478 173: #define T2 0xE8C7B756 174: #define T3 0x242070DB 175: #define T4 0xC1BDCEEE 176: #define T5 0xF57C0FAF 177: #define T6 0x4787C62A 178: #define T7 0xA8304613 179: #define T8 0xFD469501 180: #define T9 0x698098D8 181: #define T10 0x8B44F7AF 182: #define T11 0xFFFF5BB1 183: #define T12 0x895CD7BE 184: #define T13 0x6B901122 185: #define T14 0xFD987193 186: #define T15 0xA679438E 187: #define T16 0x49B40821 188: #define T17 0xF61E2562 189: #define T18 0xC040B340 190: #define T19 0x265E5A51 191: #define T20 0xE9B6C7AA 192: #define T21 0xD62F105D 193: #define T22 0x02441453 194: #define T23 0xD8A1E681 195: #define T24 0xE7D3FBC8 196: #define T25 0x21E1CDE6 197: #define T26 0xC33707D6 198: #define T27 0xF4D50D87 199: #define T28 0x455A14ED 200: #define T29 0xA9E3E905 201: #define T30 0xFCEFA3F8 202: #define T31 0x676F02D9 203: #define T32 0x8D2A4C8A 204: #define T33 0xFFFA3942 205: #define T34 0x8771F681 206: #define T35 0x6D9D6122 207: #define T36 0xFDE5380C 208: #define T37 0xA4BEEA44 209: #define T38 0x4BDECFA9 210: #define T39 0xF6BB4B60 211: #define T40 0xBEBFBC70 212: #define T41 0x289B7EC6 213: #define T42 0xEAA127FA 214: #define T43 0xD4EF3085 215: #define T44 0x04881D05 216: #define T45 0xD9D4D039 217: #define T46 0xE6DB99E5 218: #define T47 0x1FA27CF8 219: #define T48 0xC4AC5665 220: #define T49 0xF4292244 221: #define T50 0x432AFF97 222: #define T51 0xAB9423A7 223: #define T52 0xFC93A039 224: #define T53 0x655B59C3 225: #define T54 0x8F0CCC92 226: #define T55 0xFFEFF47D 227: #define T56 0x85845DD1 228: #define T57 0x6FA87E4F 229: #define T58 0xFE2CE6E0 230: #define T59 0xA3014314 231: #define T60 0x4E0811A1 232: #define T61 0xF7537E82 233: #define T62 0xBD3AF235 234: #define T63 0x2AD7D2BB 235: #define T64 0xEB86D391 236: 237: #define ROTL_SSE2(x, n) { \ 238: __m128i s; \ 239: s = _mm_srli_epi32(x, 32 - n); \ 240: x = _mm_slli_epi32(x, n); \ 241: x = _mm_or_si128(x, s); \ 242: }; 243: 244: #define ROTL_AVX2(x, n) { \ 245: __m256i s; \ 246: s = _mm256_srli_epi32(x, 32 - n); \ 247: x = _mm256_slli_epi32(x, n); \ 248: x = _mm256_or_si256(x, s); \ 249: }; 250: 251: #define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z)) 252: #define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y)) 253: #define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z) 254: #define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff)))) 255: 256: #define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z)) 257: #define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y)) 258: #define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z) 259: #define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff)))) 260: 261: #define SET_SSE2(step, a, b, c, d, x, s, ac) { \ 262: a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \ 263: ROTL_SSE2(a, s); \ 264: a = _mm_add_epi32(a, b); \ 265: } 266: 267: #define SET_AVX2(step, a, b, c, d, x, s, ac) { \ 268: a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \ 269: ROTL_AVX2(a, s); \ 270: a = _mm256_add_epi32(a, b); \ 271: } 272: 273: #define IA 0x67452301 274: #define IB 0xefcdab89 275: #define IC 0x98badcfe 276: #define ID 0x10325476 277: 278: #define GET_MD5_DATA(dest, src, pos) \ 279: dest = \ 280: ((uint32_t) src[pos + 0]) << 0 | \ 281: ((uint32_t) src[pos + 1]) << 8 | \ 282: ((uint32_t) src[pos + 2]) << 16 | \ 283: ((uint32_t) src[pos + 3]) << 24 284: 285: #define GET_PMD5_DATA_SSE2(dest, src, pos) { \ 286: uint32_t v0, v1, v2, v3; \ 287: GET_MD5_DATA(v0, src[0], pos); \ 288: GET_MD5_DATA(v1, src[1], pos); \ 289: GET_MD5_DATA(v2, src[2], pos); \ 290: GET_MD5_DATA(v3, src[3], pos); \ 291: dest = _mm_setr_epi32(v0, v1, v2, v3); \ 292: } 293: 294: #define GET_PMD5_DATA_AVX2(dest, src, pos) { \ 295: uint32_t v0, v1, v2, v3; \ 296: uint32_t v4, v5, v6, v7; \ 297: GET_MD5_DATA(v0, src[0], pos); \ 298: GET_MD5_DATA(v1, src[1], pos); \ 299: GET_MD5_DATA(v2, src[2], pos); \ 300: GET_MD5_DATA(v3, src[3], pos); \ 301: GET_MD5_DATA(v4, src[4], pos); \ 302: GET_MD5_DATA(v5, src[5], pos); \ 303: GET_MD5_DATA(v6, src[6], pos); \ 304: GET_MD5_DATA(v7, src[7], pos); \ 305: dest = _mm256_setr_epi32(v0, v1, v2, v3, \ 306: v4, v5, v6, v7); \ 307: } 308: 309: #define PUT_MD5_DATA(dest, val, pos) { \ 310: dest[pos + 0] = (val >> 0) & 0xff; \ 311: dest[pos + 1] = (val >> 8) & 0xff; \ 312: dest[pos + 2] = (val >> 16) & 0xff; \ 313: dest[pos + 3] = (val >> 24) & 0xff; \ 314: } 315: 316: const static uint8_t md5_padding[64] = { 317: 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 318: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 319: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 320: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 321: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 322: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 323: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 324: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 325: }; 326: 327: #ifdef PMD5_ALLOW_SSE2 328: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx) 329: { 330: int i; 331: for (i = 0; i < PMD5_SLOTS_MAX; i++) { 332: ctx->len[i] = 0; 333: } 334: 335: ctx->state_sse2[0] = _mm_set1_epi32(IA); 336: ctx->state_sse2[1] = _mm_set1_epi32(IB); 337: ctx->state_sse2[2] = _mm_set1_epi32(IC); 338: ctx->state_sse2[3] = _mm_set1_epi32(ID); 339: 340: return PMD5_SUCCESS; 341: } 342: #endif 343: 344: #ifdef PMD5_ALLOW_AVX2 345: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx) 346: { 347: int i; 348: for (i = 0; i < PMD5_SLOTS_MAX; i++) { 349: ctx->len[i] = 0; 350: } 351: 352: ctx->state_avx2[0] = _mm256_set1_epi32(IA); 353: ctx->state_avx2[1] = _mm256_set1_epi32(IB); 354: ctx->state_avx2[2] = _mm256_set1_epi32(IC); 355: ctx->state_avx2[3] = _mm256_set1_epi32(ID); 356: 357: return PMD5_SUCCESS; 358: } 359: #endif 360: 361: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx) 362: { 363: return PMD5_INVALID_SLOT; 364: } 365: 366: #ifdef PMD5_ALLOW_SSE2 367: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) 368: { 369: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0)) 370: return PMD5_INVALID_SLOT; 371: 372: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2]; 373: int i; 374: 375: for (i = 0; i < 4; i++) { 376: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]); 377: } 378: 379: v[0][slot] = a; 380: v[1][slot] = b; 381: v[2][slot] = c; 382: v[3][slot] = d; 383: 384: for (i = 0; i < 4; i++) { 385: ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]); 386: } 387: 388: return PMD5_SUCCESS; 389: } 390: #endif 391: 392: #ifdef PMD5_ALLOW_AVX2 393: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) 394: { 395: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0)) 396: return PMD5_INVALID_SLOT; 397: 398: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2]; 399: int i; 400: 401: for (i = 0; i < 4; i++) { 402: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]); 403: } 404: 405: v[0][slot] = a; 406: v[1][slot] = b; 407: v[2][slot] = c; 408: v[3][slot] = d; 409: 410: for (i = 0; i < 4; i++) { 411: ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]); 412: } 413: 414: return PMD5_SUCCESS; 415: } 416: #endif 417: 418: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d) 419: { 420: return PMD5_INVALID_SLOT; 421: } 422: 423: #ifdef PMD5_ALLOW_SSE2 424: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) 425: { 426: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0)) 427: return PMD5_INVALID_SLOT; 428: 429: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2]; 430: int i; 431: 432: for (i = 0; i < 4; i++) { 433: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]); 434: } 435: 436: *a = v[0][slot]; 437: *b = v[1][slot]; 438: *c = v[2][slot]; 439: *d = v[3][slot]; 440: 441: return PMD5_SUCCESS; 442: } 443: #endif 444: 445: #ifdef PMD5_ALLOW_AVX2 446: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) 447: { 448: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0)) 449: return PMD5_INVALID_SLOT; 450: 451: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2]; 452: int i; 453: 454: for (i = 0; i < 4; i++) { 455: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]); 456: } 457: 458: *a = v[0][slot]; 459: *b = v[1][slot]; 460: *c = v[2][slot]; 461: *d = v[3][slot]; 462: 463: return PMD5_SUCCESS; 464: } 465: #endif 466: 467: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d) 468: { 469: return PMD5_INVALID_SLOT; 470: } 471: 472: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot) 473: { 474: return pmd5_set_slot(ctx, slot, IA, IB, IC, ID); 475: } 476: 477: #ifdef PMD5_ALLOW_SSE2 478: __attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) 479: { 480: __m128i W[MD5_DIGEST_LEN], a, b, c, d; 481: 482: GET_PMD5_DATA_SSE2(W[ 0], data, 0); 483: GET_PMD5_DATA_SSE2(W[ 1], data, 4); 484: GET_PMD5_DATA_SSE2(W[ 2], data, 8); 485: GET_PMD5_DATA_SSE2(W[ 3], data, 12); 486: GET_PMD5_DATA_SSE2(W[ 4], data, 16); 487: GET_PMD5_DATA_SSE2(W[ 5], data, 20); 488: GET_PMD5_DATA_SSE2(W[ 6], data, 24); 489: GET_PMD5_DATA_SSE2(W[ 7], data, 28); 490: GET_PMD5_DATA_SSE2(W[ 8], data, 32); 491: GET_PMD5_DATA_SSE2(W[ 9], data, 36); 492: GET_PMD5_DATA_SSE2(W[10], data, 40); 493: GET_PMD5_DATA_SSE2(W[11], data, 44); 494: GET_PMD5_DATA_SSE2(W[12], data, 48); 495: GET_PMD5_DATA_SSE2(W[13], data, 52); 496: GET_PMD5_DATA_SSE2(W[14], data, 56); 497: GET_PMD5_DATA_SSE2(W[15], data, 60); 498: 499: a = ctx->state_sse2[0]; 500: b = ctx->state_sse2[1]; 501: c = ctx->state_sse2[2]; 502: d = ctx->state_sse2[3]; 503: 504: SET_SSE2(F, a, b, c, d, W[ 0], S11, 1); 505: SET_SSE2(F, d, a, b, c, W[ 1], S12, 2); 506: SET_SSE2(F, c, d, a, b, W[ 2], S13, 3); 507: SET_SSE2(F, b, c, d, a, W[ 3], S14, 4); 508: SET_SSE2(F, a, b, c, d, W[ 4], S11, 5); 509: SET_SSE2(F, d, a, b, c, W[ 5], S12, 6); 510: SET_SSE2(F, c, d, a, b, W[ 6], S13, 7); 511: SET_SSE2(F, b, c, d, a, W[ 7], S14, 8); 512: SET_SSE2(F, a, b, c, d, W[ 8], S11, 9); 513: SET_SSE2(F, d, a, b, c, W[ 9], S12, 10); 514: SET_SSE2(F, c, d, a, b, W[10], S13, 11); 515: SET_SSE2(F, b, c, d, a, W[11], S14, 12); 516: SET_SSE2(F, a, b, c, d, W[12], S11, 13); 517: SET_SSE2(F, d, a, b, c, W[13], S12, 14); 518: SET_SSE2(F, c, d, a, b, W[14], S13, 15); 519: SET_SSE2(F, b, c, d, a, W[15], S14, 16); 520: 521: SET_SSE2(G, a, b, c, d, W[ 1], S21, 17); 522: SET_SSE2(G, d, a, b, c, W[ 6], S22, 18); 523: SET_SSE2(G, c, d, a, b, W[11], S23, 19); 524: SET_SSE2(G, b, c, d, a, W[ 0], S24, 20); 525: SET_SSE2(G, a, b, c, d, W[ 5], S21, 21); 526: SET_SSE2(G, d, a, b, c, W[10], S22, 22); 527: SET_SSE2(G, c, d, a, b, W[15], S23, 23); 528: SET_SSE2(G, b, c, d, a, W[ 4], S24, 24); 529: SET_SSE2(G, a, b, c, d, W[ 9], S21, 25); 530: SET_SSE2(G, d, a, b, c, W[14], S22, 26); 531: SET_SSE2(G, c, d, a, b, W[ 3], S23, 27); 532: SET_SSE2(G, b, c, d, a, W[ 8], S24, 28); 533: SET_SSE2(G, a, b, c, d, W[13], S21, 29); 534: SET_SSE2(G, d, a, b, c, W[ 2], S22, 30); 535: SET_SSE2(G, c, d, a, b, W[ 7], S23, 31); 536: SET_SSE2(G, b, c, d, a, W[12], S24, 32); 537: 538: SET_SSE2(H, a, b, c, d, W[ 5], S31, 33); 539: SET_SSE2(H, d, a, b, c, W[ 8], S32, 34); 540: SET_SSE2(H, c, d, a, b, W[11], S33, 35); 541: SET_SSE2(H, b, c, d, a, W[14], S34, 36); 542: SET_SSE2(H, a, b, c, d, W[ 1], S31, 37); 543: SET_SSE2(H, d, a, b, c, W[ 4], S32, 38); 544: SET_SSE2(H, c, d, a, b, W[ 7], S33, 39); 545: SET_SSE2(H, b, c, d, a, W[10], S34, 40); 546: SET_SSE2(H, a, b, c, d, W[13], S31, 41); 547: SET_SSE2(H, d, a, b, c, W[ 0], S32, 42); 548: SET_SSE2(H, c, d, a, b, W[ 3], S33, 43); 549: SET_SSE2(H, b, c, d, a, W[ 6], S34, 44); 550: SET_SSE2(H, a, b, c, d, W[ 9], S31, 45); 551: SET_SSE2(H, d, a, b, c, W[12], S32, 46); 552: SET_SSE2(H, c, d, a, b, W[15], S33, 47); 553: SET_SSE2(H, b, c, d, a, W[ 2], S34, 48); 554: 555: SET_SSE2(I, a, b, c, d, W[ 0], S41, 49); 556: SET_SSE2(I, d, a, b, c, W[ 7], S42, 50); 557: SET_SSE2(I, c, d, a, b, W[14], S43, 51); 558: SET_SSE2(I, b, c, d, a, W[ 5], S44, 52); 559: SET_SSE2(I, a, b, c, d, W[12], S41, 53); 560: SET_SSE2(I, d, a, b, c, W[ 3], S42, 54); 561: SET_SSE2(I, c, d, a, b, W[10], S43, 55); 562: SET_SSE2(I, b, c, d, a, W[ 1], S44, 56); 563: SET_SSE2(I, a, b, c, d, W[ 8], S41, 57); 564: SET_SSE2(I, d, a, b, c, W[15], S42, 58); 565: SET_SSE2(I, c, d, a, b, W[ 6], S43, 59); 566: SET_SSE2(I, b, c, d, a, W[13], S44, 60); 567: SET_SSE2(I, a, b, c, d, W[ 4], S41, 61); 568: SET_SSE2(I, d, a, b, c, W[11], S42, 62); 569: SET_SSE2(I, c, d, a, b, W[ 2], S43, 63); 570: SET_SSE2(I, b, c, d, a, W[ 9], S44, 64); 571: 572: ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a); 573: ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b); 574: ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c); 575: ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d); 576: } 577: #endif 578: 579: #ifdef PMD5_ALLOW_AVX2 580: __attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) 581: { 582: __m256i W[MD5_DIGEST_LEN], a, b, c, d; 583: 584: GET_PMD5_DATA_AVX2(W[ 0], data, 0); 585: GET_PMD5_DATA_AVX2(W[ 1], data, 4); 586: GET_PMD5_DATA_AVX2(W[ 2], data, 8); 587: GET_PMD5_DATA_AVX2(W[ 3], data, 12); 588: GET_PMD5_DATA_AVX2(W[ 4], data, 16); 589: GET_PMD5_DATA_AVX2(W[ 5], data, 20); 590: GET_PMD5_DATA_AVX2(W[ 6], data, 24); 591: GET_PMD5_DATA_AVX2(W[ 7], data, 28); 592: GET_PMD5_DATA_AVX2(W[ 8], data, 32); 593: GET_PMD5_DATA_AVX2(W[ 9], data, 36); 594: GET_PMD5_DATA_AVX2(W[10], data, 40); 595: GET_PMD5_DATA_AVX2(W[11], data, 44); 596: GET_PMD5_DATA_AVX2(W[12], data, 48); 597: GET_PMD5_DATA_AVX2(W[13], data, 52); 598: GET_PMD5_DATA_AVX2(W[14], data, 56); 599: GET_PMD5_DATA_AVX2(W[15], data, 60); 600: 601: a = ctx->state_avx2[0]; 602: b = ctx->state_avx2[1]; 603: c = ctx->state_avx2[2]; 604: d = ctx->state_avx2[3]; 605: 606: SET_AVX2(F, a, b, c, d, W[ 0], S11, 1); 607: SET_AVX2(F, d, a, b, c, W[ 1], S12, 2); 608: SET_AVX2(F, c, d, a, b, W[ 2], S13, 3); 609: SET_AVX2(F, b, c, d, a, W[ 3], S14, 4); 610: SET_AVX2(F, a, b, c, d, W[ 4], S11, 5); 611: SET_AVX2(F, d, a, b, c, W[ 5], S12, 6); 612: SET_AVX2(F, c, d, a, b, W[ 6], S13, 7); 613: SET_AVX2(F, b, c, d, a, W[ 7], S14, 8); 614: SET_AVX2(F, a, b, c, d, W[ 8], S11, 9); 615: SET_AVX2(F, d, a, b, c, W[ 9], S12, 10); 616: SET_AVX2(F, c, d, a, b, W[10], S13, 11); 617: SET_AVX2(F, b, c, d, a, W[11], S14, 12); 618: SET_AVX2(F, a, b, c, d, W[12], S11, 13); 619: SET_AVX2(F, d, a, b, c, W[13], S12, 14); 620: SET_AVX2(F, c, d, a, b, W[14], S13, 15); 621: SET_AVX2(F, b, c, d, a, W[15], S14, 16); 622: 623: SET_AVX2(G, a, b, c, d, W[ 1], S21, 17); 624: SET_AVX2(G, d, a, b, c, W[ 6], S22, 18); 625: SET_AVX2(G, c, d, a, b, W[11], S23, 19); 626: SET_AVX2(G, b, c, d, a, W[ 0], S24, 20); 627: SET_AVX2(G, a, b, c, d, W[ 5], S21, 21); 628: SET_AVX2(G, d, a, b, c, W[10], S22, 22); 629: SET_AVX2(G, c, d, a, b, W[15], S23, 23); 630: SET_AVX2(G, b, c, d, a, W[ 4], S24, 24); 631: SET_AVX2(G, a, b, c, d, W[ 9], S21, 25); 632: SET_AVX2(G, d, a, b, c, W[14], S22, 26); 633: SET_AVX2(G, c, d, a, b, W[ 3], S23, 27); 634: SET_AVX2(G, b, c, d, a, W[ 8], S24, 28); 635: SET_AVX2(G, a, b, c, d, W[13], S21, 29); 636: SET_AVX2(G, d, a, b, c, W[ 2], S22, 30); 637: SET_AVX2(G, c, d, a, b, W[ 7], S23, 31); 638: SET_AVX2(G, b, c, d, a, W[12], S24, 32); 639: 640: SET_AVX2(H, a, b, c, d, W[ 5], S31, 33); 641: SET_AVX2(H, d, a, b, c, W[ 8], S32, 34); 642: SET_AVX2(H, c, d, a, b, W[11], S33, 35); 643: SET_AVX2(H, b, c, d, a, W[14], S34, 36); 644: SET_AVX2(H, a, b, c, d, W[ 1], S31, 37); 645: SET_AVX2(H, d, a, b, c, W[ 4], S32, 38); 646: SET_AVX2(H, c, d, a, b, W[ 7], S33, 39); 647: SET_AVX2(H, b, c, d, a, W[10], S34, 40); 648: SET_AVX2(H, a, b, c, d, W[13], S31, 41); 649: SET_AVX2(H, d, a, b, c, W[ 0], S32, 42); 650: SET_AVX2(H, c, d, a, b, W[ 3], S33, 43); 651: SET_AVX2(H, b, c, d, a, W[ 6], S34, 44); 652: SET_AVX2(H, a, b, c, d, W[ 9], S31, 45); 653: SET_AVX2(H, d, a, b, c, W[12], S32, 46); 654: SET_AVX2(H, c, d, a, b, W[15], S33, 47); 655: SET_AVX2(H, b, c, d, a, W[ 2], S34, 48); 656: 657: SET_AVX2(I, a, b, c, d, W[ 0], S41, 49); 658: SET_AVX2(I, d, a, b, c, W[ 7], S42, 50); 659: SET_AVX2(I, c, d, a, b, W[14], S43, 51); 660: SET_AVX2(I, b, c, d, a, W[ 5], S44, 52); 661: SET_AVX2(I, a, b, c, d, W[12], S41, 53); 662: SET_AVX2(I, d, a, b, c, W[ 3], S42, 54); 663: SET_AVX2(I, c, d, a, b, W[10], S43, 55); 664: SET_AVX2(I, b, c, d, a, W[ 1], S44, 56); 665: SET_AVX2(I, a, b, c, d, W[ 8], S41, 57); 666: SET_AVX2(I, d, a, b, c, W[15], S42, 58); 667: SET_AVX2(I, c, d, a, b, W[ 6], S43, 59); 668: SET_AVX2(I, b, c, d, a, W[13], S44, 60); 669: SET_AVX2(I, a, b, c, d, W[ 4], S41, 61); 670: SET_AVX2(I, d, a, b, c, W[11], S42, 62); 671: SET_AVX2(I, c, d, a, b, W[ 2], S43, 63); 672: SET_AVX2(I, b, c, d, a, W[ 9], S44, 64); 673: 674: ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a); 675: ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b); 676: ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c); 677: ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d); 678: } 679: #endif 680: 681: __attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX]) 682: { 683: } 684: 685: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride) 686: { 687: const uint8_t * ptrs[PMD5_SLOTS_MAX]; 688: 689: if (!length) return PMD5_SUCCESS; 690: 691: int slots = pmd5_slots(); 692: 693: if (!stride) stride = 64; 694: 695: int i; 696: for (i = 0; i < slots; i++) { 697: ptrs[i] = data[i]; 698: ctx->len[i] += length; 699: if (!ptrs[i]) ptrs[i] = md5_padding; 700: } 701: 702: while (length >= 64) { 703: pmd5_process(ctx, ptrs); 704: length -= 64; 705: for (i = 0; i < slots; i++) { 706: if (data[i]) ptrs[i] += stride; 707: } 708: } 709: 710: if (length) return PMD5_UNALIGNED_UPDATE; 711: 712: for (i = 0; i < slots; i++) { 713: if (data[i]) data[i] = ptrs[i]; 714: } 715: 716: return PMD5_SUCCESS; 717: } 718: 719: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]) 720: { 721: uint64_t length = 0; 722: int slots = pmd5_slots(); 723: 724: int i; 725: for (i = 0; i < slots; i++) { 726: if ((length == 0) || (lengths[i] < length)) length = lengths[i]; 727: } 728: 729: for (i = 0; i < slots; i++) { 730: lengths[i] -= length; 731: } 732: 733: return pmd5_update_all_simple(ctx, data, length, 0); 734: } 735: 736: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length) 737: { 738: MD5_CTX ctx; 739: 740: if ((slot >= pmd5_slots()) || (slot < 0)) 741: return PMD5_INVALID_SLOT; 742: 743: pmd5_to_md5(pctx, &ctx, slot); 744: if (data && length) { 745: MD5_Update(&ctx, data, length); 746: } 747: MD5_Final(digest, &ctx); 748: 749: return PMD5_SUCCESS; 750: } 751: 752: static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot) 753: { 754: return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0); 755: } 756: 757: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]) 758: { 759: int i; 760: for (i = 0; i < pmd5_slots(); i++) { 761: pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0); 762: } 763: return PMD5_SUCCESS; 764: } 765: 766: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot) 767: { 768: if ((slot >= pmd5_slots()) || (slot < 0)) 769: return PMD5_INVALID_SLOT; 770: 771: // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting. 772: 773: #ifdef USE_OPENSSL 774: pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29); 775: #else 776: pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32); 777: #endif 778: return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D); 779: } 780: 781: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot) 782: { 783: if ((slot >= pmd5_slots()) || (slot < 0)) 784: return PMD5_INVALID_SLOT; 785: 786: MD5_Init(ctx); 787: 788: #ifdef USE_OPENSSL 789: ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF; 790: ctx->Nh = pctx->len[slot] >> 29; 791: 792: uint32_t a, b, c, d; 793: pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d); 794: if (ret == PMD5_SUCCESS) { 795: ctx->A = a; 796: ctx->B = b; 797: ctx->C = c; 798: ctx->D = d; 799: } 800: return ret; 801: #else 802: ctx->totalN = pctx->len[slot] & 0xFFFFFFFF; 803: ctx->totalN2 = pctx->len[slot] >> 32; 804: return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D); 805: #endif 806: } 807: 808: /* With GCC 10 putting these implementations inside 'extern "C"' causes an 809: assembler error. That worked fine on GCC 5-9 and clang 6-10... 810: */ 811: 812: static inline int md5_parallel_slots_cpp() 813: { 814: int slots = pmd5_slots(); 815: if (slots == 0) return 1; 816: return slots; 817: } 818: 819: static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4) 820: { 821: int slots = md5_parallel_slots_cpp(); 822: if ((streams < 1) || (streams > slots)) return 0; 823: if (pre4 && post4) return 0; 824: 825: if (slots == 1) { 826: MD5_CTX ctx; 827: MD5_Init(&ctx); 828: if (pre4) { 829: MD5_Update(&ctx, (const unsigned char*)pre4, 4); 830: } 831: MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]); 832: if (post4) { 833: MD5_Update(&ctx, (const unsigned char*)post4, 4); 834: } 835: if (sum[0]) { 836: MD5_Final((uint8_t*)sum[0], &ctx); 837: } 838: return 0; 839: } 840: 841: int i; 842: int active[PMD5_SLOTS_MAX]; 843: char* buffers[PMD5_SLOTS_MAX]; 844: uint64_t left[PMD5_SLOTS_MAX]; 845: for (i = 0; i < PMD5_SLOTS_MAX; i++) { 846: active[i] = streams > i; 847: if (i < streams) { 848: buffers[i] = buf[i]; 849: left[i] = (uint64_t)len[i]; 850: } else { 851: buffers[i] = NULL; 852: left[i] = 0; 853: } 854: } 855: MD5_CTX results[PMD5_SLOTS_MAX]; 856: 857: pmd5_context ctx_simd; 858: if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0; 859: 860: if (pre4) { 861: char temp_buffers[PMD5_SLOTS_MAX][64]; 862: int have_any = 0; 863: for (i = 0; i < slots; i++) { 864: if (active[i]) { 865: if (left[i] < 60) { 866: MD5_Init(&results[i]); 867: MD5_Update(&results[i], (const unsigned char*)pre4, 4); 868: MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]); 869: active[i] = 0; 870: left[i] = 0; 871: } else { 872: memcpy(temp_buffers[i], pre4, 4); 873: memcpy(temp_buffers[i] + 4, buffers[i], 60); 874: buffers[i] += 60; 875: left[i] -= 60; 876: have_any = 1; 877: } 878: } 879: } 880: 881: if (have_any) { 882: char* ptrs[PMD5_SLOTS_MAX]; 883: for (i = 0; i < PMD5_SLOTS_MAX; i++) { 884: ptrs[i] = &temp_buffers[i][0]; 885: } 886: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) { 887: return 0; 888: } 889: } 890: } 891: 892: int failed = 0; 893: while (true) { 894: for (i = 0; i < slots; i++) { 895: if (active[i] && (left[i] < 64)) { 896: if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) { 897: failed = 1; 898: } 899: active[i] = 0; 900: } 901: } 902: 903: uint64_t shortest = 0; 904: for (i = 0; i < slots; i++) { 905: if (!active[i]) { 906: buffers[i] = NULL; 907: } else if ((shortest == 0) || (left[i] < shortest)) { 908: shortest = left[i]; 909: } 910: } 911: 912: if (shortest > 0) { 913: shortest = shortest & ~63; 914: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) { 915: failed = 1; 916: } 917: for (i = 0; i < slots; i++) { 918: if (active[i]) { 919: left[i] -= shortest; 920: } 921: } 922: } 923: 924: if (failed) { 925: return 0; 926: } else { 927: int have_any = 0; 928: for (i = 0; i < slots; i++) { 929: have_any |= active[i]; 930: } 931: if (!have_any) { 932: break; 933: } 934: } 935: } 936: 937: for (i = 0; i < slots; i++) { 938: if (i < streams) { 939: if (left[i] > 0) { 940: // buffer[i] == NULL here 941: MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]); 942: } 943: if (post4) { 944: MD5_Update(&results[i], (const unsigned char*)post4, 4); 945: } 946: if (sum[i]) { 947: MD5_Final((uint8_t*)sum[i], &results[i]); 948: } 949: } 950: } 951: 952: return 1; 953: } 954: 955: // each pmd5_context needs to be 32-byte aligned 956: #define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31))) 957: 958: static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx) 959: { 960: int i; 961: for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) { 962: pmd5_init_all(MD5P8_Contexts_simd(ctx, i)); 963: } 964: ctx->used = 0; 965: ctx->next = 0; 966: } 967: 968: static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length) 969: { 970: int slots = pmd5_slots(); 971: uint32 pos = 0; 972: 973: if ((ctx->used) || (length < 512)) { 974: int cpy = MIN(length, 512 - ctx->used); 975: memcpy(&ctx->buffer[ctx->used], input, cpy); 976: ctx->used += cpy; 977: length -= cpy; 978: pos += cpy; 979: 980: if (ctx->used == 512) { 981: if (slots == PMD5_SLOTS_AVX2) { 982: const uint8_t* ptrs[PMD5_SLOTS_MAX] = { 983: (uint8_t*)ctx->buffer, 984: (uint8_t*)(ctx->buffer + 64), 985: (uint8_t*)(ctx->buffer + 128), 986: (uint8_t*)(ctx->buffer + 192), 987: (uint8_t*)(ctx->buffer + 256), 988: (uint8_t*)(ctx->buffer + 320), 989: (uint8_t*)(ctx->buffer + 384), 990: (uint8_t*)(ctx->buffer + 448) 991: }; 992: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0); 993: } else { 994: const uint8_t* ptrs1[PMD5_SLOTS_MAX] = { 995: (uint8_t*)ctx->buffer, 996: (uint8_t*)(ctx->buffer + 64), 997: (uint8_t*)(ctx->buffer + 128), 998: (uint8_t*)(ctx->buffer + 192) 999: }; 1000: const uint8_t* ptrs2[PMD5_SLOTS_MAX] = { 1001: (uint8_t*)(ctx->buffer + 256), 1002: (uint8_t*)(ctx->buffer + 320), 1003: (uint8_t*)(ctx->buffer + 384), 1004: (uint8_t*)(ctx->buffer + 448) 1005: }; 1006: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0); 1007: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0); 1008: } 1009: ctx->used = 0; 1010: } 1011: } 1012: 1013: if (length >= 512) { 1014: uint32 blocks = length / 512; 1015: if (slots == PMD5_SLOTS_AVX2) { 1016: const uint8_t* ptrs[8] = { 1017: (uint8_t*)(input + pos), 1018: (uint8_t*)(input + pos + 64), 1019: (uint8_t*)(input + pos + 128), 1020: (uint8_t*)(input + pos + 192), 1021: (uint8_t*)(input + pos + 256), 1022: (uint8_t*)(input + pos + 320), 1023: (uint8_t*)(input + pos + 384), 1024: (uint8_t*)(input + pos + 448) 1025: }; 1026: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512); 1027: } else { 1028: const uint8_t* ptrs1[4] = { 1029: (uint8_t*)(input + pos), 1030: (uint8_t*)(input + pos + 64), 1031: (uint8_t*)(input + pos + 128), 1032: (uint8_t*)(input + pos + 192) 1033: }; 1034: const uint8_t* ptrs2[4] = { 1035: (uint8_t*)(input + pos + 256), 1036: (uint8_t*)(input + pos + 320), 1037: (uint8_t*)(input + pos + 384), 1038: (uint8_t*)(input + pos + 448) 1039: }; 1040: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512); 1041: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512); 1042: } 1043: pos += blocks * 512; 1044: length -= blocks * 512; 1045: } 1046: 1047: if (length) { 1048: memcpy(ctx->buffer, &input[pos], length); 1049: ctx->used = length; 1050: } 1051: } 1052: 1053: static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx) 1054: { 1055: int i; 1056: uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0; 1057: if (ctx->used) { 1058: uchar tmp[512]; 1059: memset(tmp, 0, 512); 1060: MD5P8_Update(ctx, tmp, 512 - ctx->used); 1061: } 1062: 1063: uchar state[34*4] = {0}; 1064: 1065: MD5_CTX tmp; 1066: for (i = 0; i < 8; i++) { 1067: if (pmd5_slots() == PMD5_SLOTS_AVX2) { 1068: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i); 1069: } else if (i < 4) { 1070: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i); 1071: } else { 1072: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4); 1073: } 1074: #ifdef USE_OPENSSL 1075: if (low + tmp.Nl < low) high++; 1076: low += tmp.Nl; 1077: high += tmp.Nh; 1078: #else 1079: if (low + tmp.totalN < low) high++; 1080: low += tmp.totalN; 1081: high += tmp.totalN2; 1082: #endif 1083: SIVALu(state, i*16, tmp.A); 1084: SIVALu(state, i*16 + 4, tmp.B); 1085: SIVALu(state, i*16 + 8, tmp.C); 1086: SIVALu(state, i*16 + 12, tmp.D); 1087: } 1088: 1089: #ifndef USE_OPENSSL 1090: high = (low >> 29) | (high << 3); 1091: low = (low << 3); 1092: #endif 1093: 1094: sub <<= 3; 1095: if (low - sub > low) high--; 1096: low -= sub; 1097: 1098: SIVALu(state, 32*4, low); 1099: SIVALu(state, 33*4, high); 1100: 1101: MD5_CTX md; 1102: MD5_Init(&md); 1103: MD5_Update(&md, state, 34*4); 1104: MD5_Final(digest, &md); 1105: } 1106: 1107: extern "C" { 1108: 1109: int md5_parallel_slots() 1110: { 1111: return md5_parallel_slots_cpp(); 1112: } 1113: 1114: int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4) 1115: { 1116: return md5_parallel_cpp(streams, buf, len, sum, pre4, post4); 1117: } 1118: 1119: void MD5P8_Init(MD5P8_CTX *ctx) 1120: { 1121: MD5P8_Init_cpp(ctx); 1122: } 1123: 1124: void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length) 1125: { 1126: MD5P8_Update_cpp(ctx, input, length); 1127: } 1128: 1129: void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx) 1130: { 1131: MD5P8_Final_cpp(digest, ctx); 1132: } 1133: 1134: } // "C" 1135: 1136: #endif /* HAVE_SIMD */ 1137: #endif /* __cplusplus */ 1138: #endif /* __x86_64__ */