Annotation of embedaddon/rsync/simd-md5-parallel-x86_64.cpp, revision 1.1
1.1 ! misho 1: /*
! 2: * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
! 3: *
! 4: * Original author: Nicolas Noble, 2017
! 5: * Modifications: Jorrit Jongma, 2020
! 6: *
! 7: * The original code was released in the public domain by the original author,
! 8: * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
! 9: * in case public domain does not apply in your country. These modifications
! 10: * are likewise released in the public domain, with the same MIT license
! 11: * fallback.
! 12: *
! 13: * The original publication can be found at:
! 14: *
! 15: * https://github.com/nicolasnoble/sse-hash
! 16: */
! 17: /*
! 18: * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
! 19: * MD5 code has been removed and those code paths rerouted to use the MD5
! 20: * code already present in rsync, and wrapper functions have been added. The
! 21: * MD5P8 code is also new, and is the reason for the new stride parameter.
! 22: *
! 23: * This code allows multiple independent MD5 streams to be processed in
! 24: * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
! 25: * lower than that of the original C routines for MD5, the processing of
! 26: * additional streams is "for free".
! 27: *
! 28: * Single streams are rerouted to rsync's normal MD5 code as that is faster
! 29: * for that case. A further optimization is possible by using SSE2 code on
! 30: * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
! 31: * implemented here as it would require some restructuring, and in practise
! 32: * the code here is only rarely called with less than the maximum amount of
! 33: * streams (typically once at the end of each checksum2'd file).
! 34: *
! 35: * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
! 36: * - Intel Atom D2700 302 334 166 664 N/A N/A
! 37: * - Intel i7-7700hq 351 376 289 1156 273 2184
! 38: * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
! 39: */
! 40:
! 41: #ifdef __x86_64__
! 42: #ifdef __cplusplus
! 43:
! 44: extern "C" {
! 45:
! 46: #include "rsync.h"
! 47:
! 48: }
! 49:
! 50: #ifdef HAVE_SIMD
! 51:
! 52: #ifndef BENCHMARK_SIMD_CHECKSUM2
! 53: #define PMD5_ALLOW_SSE2 // debugging
! 54: #define PMD5_ALLOW_AVX2 // debugging
! 55: #endif
! 56:
! 57: #ifdef PMD5_ALLOW_AVX2
! 58: #ifndef PMD5_ALLOW_SSE2
! 59: #define PMD5_ALLOW_SSE2
! 60: #endif
! 61: #endif
! 62:
! 63: #include <stdint.h>
! 64: #include <string.h>
! 65:
! 66: #include <immintrin.h>
! 67:
! 68: /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
! 69: #ifdef __clang__
! 70: #define MVSTATIC
! 71: #else
! 72: #define MVSTATIC static
! 73: #endif
! 74:
! 75: // Missing from the headers on gcc 6 and older, clang 8 and older
! 76: typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
! 77: typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
! 78:
/* Number of parallel MD5 streams ("slots") reported by each pmd5_slots()
   variant. PMD5_SLOTS_MAX sizes every per-slot array in this file. */
#define PMD5_SLOTS_DEFAULT	0
#define PMD5_SLOTS_SSE2		4
#define PMD5_SLOTS_AVX2		8
#define PMD5_SLOTS_MAX		PMD5_SLOTS_AVX2
! 83:
! 84: #ifdef PMD5_ALLOW_SSE2
! 85: __attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
! 86: {
! 87: return PMD5_SLOTS_SSE2;
! 88: }
! 89: #endif
! 90:
! 91: #ifdef PMD5_ALLOW_AVX2
! 92: __attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
! 93: {
! 94: return PMD5_SLOTS_AVX2;
! 95: }
! 96: #endif
! 97:
! 98: __attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
! 99: {
! 100: return PMD5_SLOTS_DEFAULT;
! 101: }
! 102:
! 103: /* The parallel MD5 context structure. */
! 104: typedef struct {
! 105: __m128i state_sse2[4];
! 106: __m256i state_avx2[4];
! 107: uint64_t len[PMD5_SLOTS_MAX];
! 108: } pmd5_context;
! 109:
/* The status returned by the various functions below. */
typedef enum {
	PMD5_SUCCESS          = 0,	/* operation completed */
	PMD5_INVALID_SLOT     = 1,	/* slot index out of range, or no SIMD variant available */
	PMD5_UNALIGNED_UPDATE = 2,	/* update length was not a multiple of 64 */
} pmd5_status;
! 116:
! 117: /* Initializes all slots in the given pmd5 context. */
! 118: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);
! 119:
/* Initializes a single slot in the given pmd5 context. */
! 121: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);
! 122:
! 123: /* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
! 124: The stream pointers will be incremented accordingly.
! 125: It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
! 126: The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
! 127: Stride defaults to 64 if 0 is passed. */
! 128: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);
! 129:
! 130: /* Makes an MD5 update on all slots in parallel, given different lengths.
! 131: The stream pointers will be incremented accordingly.
! 132: The lengths will be decreased accordingly. Not all data might be consumed.
! 133: It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
The lengths argument NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
! 135: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);
! 136:
! 137: /* Finishes all slots at once. Fills in all digests. */
! 138: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);
! 139:
/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
! 141: a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
! 142: static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);
! 143:
! 144: /* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
! 145: multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
! 146: a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
! 147: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);
! 148:
! 149: /* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
! 150: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);
! 151:
! 152: /* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
! 153: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
! 154:
/* Left-rotation amounts for the MD5 steps. Sxy is the amount used by the
   y-th step (cycling every four steps) of round x. */

/* Round 1 (F) */
#define S11  7
#define S12 12
#define S13 17
#define S14 22

/* Round 2 (G) */
#define S21  5
#define S22  9
#define S23 14
#define S24 20

/* Round 3 (H) */
#define S31  4
#define S32 11
#define S33 16
#define S34 23

/* Round 4 (I) */
#define S41  6
#define S42 10
#define S43 15
#define S44 21
! 171:
/* Additive constants for the 64 MD5 steps. Tn is added in step n; the SET_*
   macros select it by token-pasting the step number onto "T". */

/* Steps 1-16 (round 1, F) */
#define T1  0xD76AA478
#define T2  0xE8C7B756
#define T3  0x242070DB
#define T4  0xC1BDCEEE
#define T5  0xF57C0FAF
#define T6  0x4787C62A
#define T7  0xA8304613
#define T8  0xFD469501
#define T9  0x698098D8
#define T10 0x8B44F7AF
#define T11 0xFFFF5BB1
#define T12 0x895CD7BE
#define T13 0x6B901122
#define T14 0xFD987193
#define T15 0xA679438E
#define T16 0x49B40821

/* Steps 17-32 (round 2, G) */
#define T17 0xF61E2562
#define T18 0xC040B340
#define T19 0x265E5A51
#define T20 0xE9B6C7AA
#define T21 0xD62F105D
#define T22 0x02441453
#define T23 0xD8A1E681
#define T24 0xE7D3FBC8
#define T25 0x21E1CDE6
#define T26 0xC33707D6
#define T27 0xF4D50D87
#define T28 0x455A14ED
#define T29 0xA9E3E905
#define T30 0xFCEFA3F8
#define T31 0x676F02D9
#define T32 0x8D2A4C8A

/* Steps 33-48 (round 3, H) */
#define T33 0xFFFA3942
#define T34 0x8771F681
#define T35 0x6D9D6122
#define T36 0xFDE5380C
#define T37 0xA4BEEA44
#define T38 0x4BDECFA9
#define T39 0xF6BB4B60
#define T40 0xBEBFBC70
#define T41 0x289B7EC6
#define T42 0xEAA127FA
#define T43 0xD4EF3085
#define T44 0x04881D05
#define T45 0xD9D4D039
#define T46 0xE6DB99E5
#define T47 0x1FA27CF8
#define T48 0xC4AC5665

/* Steps 49-64 (round 4, I) */
#define T49 0xF4292244
#define T50 0x432AFF97
#define T51 0xAB9423A7
#define T52 0xFC93A039
#define T53 0x655B59C3
#define T54 0x8F0CCC92
#define T55 0xFFEFF47D
#define T56 0x85845DD1
#define T57 0x6FA87E4F
#define T58 0xFE2CE6E0
#define T59 0xA3014314
#define T60 0x4E0811A1
#define T61 0xF7537E82
#define T62 0xBD3AF235
#define T63 0x2AD7D2BB
#define T64 0xEB86D391
! 236:
/* Rotates every 32-bit lane of x left by n bits, in place.
   x must be a modifiable vector lvalue (it is evaluated more than once);
   n is the rotation amount, expected to be in 1..31.
   The do { } while (0) wrapper makes each macro behave as one statement, so
   it can be used in an unbraced if/else and takes a trailing semicolon. */
#define ROTL_SSE2(x, n) do { \
	__m128i rotl_hi_; \
	rotl_hi_ = _mm_srli_epi32((x), 32 - (n)); \
	(x) = _mm_slli_epi32((x), (n)); \
	(x) = _mm_or_si128((x), rotl_hi_); \
} while (0)

#define ROTL_AVX2(x, n) do { \
	__m256i rotl_hi_; \
	rotl_hi_ = _mm256_srli_epi32((x), 32 - (n)); \
	(x) = _mm256_slli_epi32((x), (n)); \
	(x) = _mm256_or_si256((x), rotl_hi_); \
} while (0)
! 250:
/* The four MD5 round functions, applied to every 32-bit lane at once:
     F(x, y, z) = (x & y) | (~x & z)
     G(x, y, z) = (x & z) | (y & ~z)
     H(x, y, z) = x ^ y ^ z
     I(x, y, z) = y ^ (x | ~z)
   The andnot intrinsics compute (~first) & second; ~z is obtained by
   and-not-ing z with an all-ones vector. */
#define F_SSE2(x, y, z) \
	_mm_or_si128(_mm_and_si128((x), (y)), _mm_andnot_si128((x), (z)))
#define G_SSE2(x, y, z) \
	_mm_or_si128(_mm_and_si128((x), (z)), _mm_andnot_si128((z), (y)))
#define H_SSE2(x, y, z) \
	_mm_xor_si128(_mm_xor_si128((x), (y)), (z))
#define I_SSE2(x, y, z) \
	_mm_xor_si128((y), _mm_or_si128((x), _mm_andnot_si128((z), _mm_set1_epi32(0xffffffff))))

#define F_AVX2(x, y, z) \
	_mm256_or_si256(_mm256_and_si256((x), (y)), _mm256_andnot_si256((x), (z)))
#define G_AVX2(x, y, z) \
	_mm256_or_si256(_mm256_and_si256((x), (z)), _mm256_andnot_si256((z), (y)))
#define H_AVX2(x, y, z) \
	_mm256_xor_si256(_mm256_xor_si256((x), (y)), (z))
#define I_AVX2(x, y, z) \
	_mm256_xor_si256((y), _mm256_or_si256((x), _mm256_andnot_si256((z), _mm256_set1_epi32(0xffffffff))))
! 260:
/* One MD5 step on all lanes:
     a = b + rotl(a + step(b, c, d) + x + T<ac>, s)
   step is F, G, H or I (pasted onto the _SSE2/_AVX2 suffix), x is the
   message word, s the rotation amount and ac the step number used to pick
   the additive constant T<ac>. a is updated in place. */
#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
	a = _mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))); \
	a = _mm_add_epi32(a, step##_SSE2(b, c, d)); \
	ROTL_SSE2(a, s); \
	a = _mm_add_epi32(a, b); \
}

#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
	a = _mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))); \
	a = _mm256_add_epi32(a, step##_AVX2(b, c, d)); \
	ROTL_AVX2(a, s); \
	a = _mm256_add_epi32(a, b); \
}
! 272:
/* Initial values of the four MD5 state words, broadcast into every slot by
   pmd5_init_all() and written to a single slot by pmd5_init_slot(). */
#define IA 0x67452301	/* state word A */
#define IB 0xefcdab89	/* state word B */
#define IC 0x98badcfe	/* state word C */
#define ID 0x10325476	/* state word D */
! 277:
/* Loads the little-endian 32-bit word found at byte offset pos of the byte
   buffer src into dest. All arguments are parenthesized so that expressions
   such as "buf + 4" or "i * 4" expand as intended, and the whole expansion
   is a single parenthesized assignment expression. */
#define GET_MD5_DATA(dest, src, pos) \
	((dest) = \
		((uint32_t) (src)[(pos) + 0]) <<  0 | \
		((uint32_t) (src)[(pos) + 1]) <<  8 | \
		((uint32_t) (src)[(pos) + 2]) << 16 | \
		((uint32_t) (src)[(pos) + 3]) << 24)

/* Gathers the word at byte offset pos from each of the four streams
   src[0..3] into the four 32-bit lanes of dest (stream 0 in lane 0).
   Wrapped in do { } while (0) so it acts as one statement. */
#define GET_PMD5_DATA_SSE2(dest, src, pos) do { \
	uint32_t v0_, v1_, v2_, v3_; \
	GET_MD5_DATA(v0_, (src)[0], pos); \
	GET_MD5_DATA(v1_, (src)[1], pos); \
	GET_MD5_DATA(v2_, (src)[2], pos); \
	GET_MD5_DATA(v3_, (src)[3], pos); \
	(dest) = _mm_setr_epi32(v0_, v1_, v2_, v3_); \
} while (0)

/* Same as GET_PMD5_DATA_SSE2, for the eight streams src[0..7] and the eight
   32-bit lanes of a 256-bit vector. */
#define GET_PMD5_DATA_AVX2(dest, src, pos) do { \
	uint32_t v0_, v1_, v2_, v3_; \
	uint32_t v4_, v5_, v6_, v7_; \
	GET_MD5_DATA(v0_, (src)[0], pos); \
	GET_MD5_DATA(v1_, (src)[1], pos); \
	GET_MD5_DATA(v2_, (src)[2], pos); \
	GET_MD5_DATA(v3_, (src)[3], pos); \
	GET_MD5_DATA(v4_, (src)[4], pos); \
	GET_MD5_DATA(v5_, (src)[5], pos); \
	GET_MD5_DATA(v6_, (src)[6], pos); \
	GET_MD5_DATA(v7_, (src)[7], pos); \
	(dest) = _mm256_setr_epi32(v0_, v1_, v2_, v3_, \
				   v4_, v5_, v6_, v7_); \
} while (0)
! 308:
/* Stores the 32-bit value val as four little-endian bytes starting at byte
   offset pos of the byte buffer dest. Arguments are parenthesized so that an
   expression such as "hi | lo" is shifted as a whole, and the body is wrapped
   in do { } while (0) so the macro behaves as a single statement. */
#define PUT_MD5_DATA(dest, val, pos) do { \
	(dest)[(pos) + 0] = ((val) >>  0) & 0xff; \
	(dest)[(pos) + 1] = ((val) >>  8) & 0xff; \
	(dest)[(pos) + 2] = ((val) >> 16) & 0xff; \
	(dest)[(pos) + 3] = ((val) >> 24) & 0xff; \
} while (0)
! 315:
! 316: const static uint8_t md5_padding[64] = {
! 317: 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 318: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 319: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 320: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 321: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 322: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 323: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 324: 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
! 325: };
! 326:
! 327: #ifdef PMD5_ALLOW_SSE2
! 328: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 329: {
! 330: int i;
! 331: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 332: ctx->len[i] = 0;
! 333: }
! 334:
! 335: ctx->state_sse2[0] = _mm_set1_epi32(IA);
! 336: ctx->state_sse2[1] = _mm_set1_epi32(IB);
! 337: ctx->state_sse2[2] = _mm_set1_epi32(IC);
! 338: ctx->state_sse2[3] = _mm_set1_epi32(ID);
! 339:
! 340: return PMD5_SUCCESS;
! 341: }
! 342: #endif
! 343:
! 344: #ifdef PMD5_ALLOW_AVX2
! 345: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 346: {
! 347: int i;
! 348: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 349: ctx->len[i] = 0;
! 350: }
! 351:
! 352: ctx->state_avx2[0] = _mm256_set1_epi32(IA);
! 353: ctx->state_avx2[1] = _mm256_set1_epi32(IB);
! 354: ctx->state_avx2[2] = _mm256_set1_epi32(IC);
! 355: ctx->state_avx2[3] = _mm256_set1_epi32(ID);
! 356:
! 357: return PMD5_SUCCESS;
! 358: }
! 359: #endif
! 360:
! 361: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
! 362: {
! 363: return PMD5_INVALID_SLOT;
! 364: }
! 365:
! 366: #ifdef PMD5_ALLOW_SSE2
! 367: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 368: {
! 369: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
! 370: return PMD5_INVALID_SLOT;
! 371:
! 372: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
! 373: int i;
! 374:
! 375: for (i = 0; i < 4; i++) {
! 376: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
! 377: }
! 378:
! 379: v[0][slot] = a;
! 380: v[1][slot] = b;
! 381: v[2][slot] = c;
! 382: v[3][slot] = d;
! 383:
! 384: for (i = 0; i < 4; i++) {
! 385: ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
! 386: }
! 387:
! 388: return PMD5_SUCCESS;
! 389: }
! 390: #endif
! 391:
! 392: #ifdef PMD5_ALLOW_AVX2
! 393: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 394: {
! 395: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
! 396: return PMD5_INVALID_SLOT;
! 397:
! 398: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
! 399: int i;
! 400:
! 401: for (i = 0; i < 4; i++) {
! 402: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
! 403: }
! 404:
! 405: v[0][slot] = a;
! 406: v[1][slot] = b;
! 407: v[2][slot] = c;
! 408: v[3][slot] = d;
! 409:
! 410: for (i = 0; i < 4; i++) {
! 411: ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
! 412: }
! 413:
! 414: return PMD5_SUCCESS;
! 415: }
! 416: #endif
! 417:
! 418: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
! 419: {
! 420: return PMD5_INVALID_SLOT;
! 421: }
! 422:
! 423: #ifdef PMD5_ALLOW_SSE2
! 424: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 425: {
! 426: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
! 427: return PMD5_INVALID_SLOT;
! 428:
! 429: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
! 430: int i;
! 431:
! 432: for (i = 0; i < 4; i++) {
! 433: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
! 434: }
! 435:
! 436: *a = v[0][slot];
! 437: *b = v[1][slot];
! 438: *c = v[2][slot];
! 439: *d = v[3][slot];
! 440:
! 441: return PMD5_SUCCESS;
! 442: }
! 443: #endif
! 444:
! 445: #ifdef PMD5_ALLOW_AVX2
! 446: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 447: {
! 448: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
! 449: return PMD5_INVALID_SLOT;
! 450:
! 451: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
! 452: int i;
! 453:
! 454: for (i = 0; i < 4; i++) {
! 455: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
! 456: }
! 457:
! 458: *a = v[0][slot];
! 459: *b = v[1][slot];
! 460: *c = v[2][slot];
! 461: *d = v[3][slot];
! 462:
! 463: return PMD5_SUCCESS;
! 464: }
! 465: #endif
! 466:
! 467: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
! 468: {
! 469: return PMD5_INVALID_SLOT;
! 470: }
! 471:
! 472: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
! 473: {
! 474: return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
! 475: }
! 476:
! 477: #ifdef PMD5_ALLOW_SSE2
! 478: __attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 479: {
! 480: __m128i W[MD5_DIGEST_LEN], a, b, c, d;
! 481:
! 482: GET_PMD5_DATA_SSE2(W[ 0], data, 0);
! 483: GET_PMD5_DATA_SSE2(W[ 1], data, 4);
! 484: GET_PMD5_DATA_SSE2(W[ 2], data, 8);
! 485: GET_PMD5_DATA_SSE2(W[ 3], data, 12);
! 486: GET_PMD5_DATA_SSE2(W[ 4], data, 16);
! 487: GET_PMD5_DATA_SSE2(W[ 5], data, 20);
! 488: GET_PMD5_DATA_SSE2(W[ 6], data, 24);
! 489: GET_PMD5_DATA_SSE2(W[ 7], data, 28);
! 490: GET_PMD5_DATA_SSE2(W[ 8], data, 32);
! 491: GET_PMD5_DATA_SSE2(W[ 9], data, 36);
! 492: GET_PMD5_DATA_SSE2(W[10], data, 40);
! 493: GET_PMD5_DATA_SSE2(W[11], data, 44);
! 494: GET_PMD5_DATA_SSE2(W[12], data, 48);
! 495: GET_PMD5_DATA_SSE2(W[13], data, 52);
! 496: GET_PMD5_DATA_SSE2(W[14], data, 56);
! 497: GET_PMD5_DATA_SSE2(W[15], data, 60);
! 498:
! 499: a = ctx->state_sse2[0];
! 500: b = ctx->state_sse2[1];
! 501: c = ctx->state_sse2[2];
! 502: d = ctx->state_sse2[3];
! 503:
! 504: SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
! 505: SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
! 506: SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
! 507: SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
! 508: SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
! 509: SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
! 510: SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
! 511: SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
! 512: SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
! 513: SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
! 514: SET_SSE2(F, c, d, a, b, W[10], S13, 11);
! 515: SET_SSE2(F, b, c, d, a, W[11], S14, 12);
! 516: SET_SSE2(F, a, b, c, d, W[12], S11, 13);
! 517: SET_SSE2(F, d, a, b, c, W[13], S12, 14);
! 518: SET_SSE2(F, c, d, a, b, W[14], S13, 15);
! 519: SET_SSE2(F, b, c, d, a, W[15], S14, 16);
! 520:
! 521: SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
! 522: SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
! 523: SET_SSE2(G, c, d, a, b, W[11], S23, 19);
! 524: SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
! 525: SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
! 526: SET_SSE2(G, d, a, b, c, W[10], S22, 22);
! 527: SET_SSE2(G, c, d, a, b, W[15], S23, 23);
! 528: SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
! 529: SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
! 530: SET_SSE2(G, d, a, b, c, W[14], S22, 26);
! 531: SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
! 532: SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
! 533: SET_SSE2(G, a, b, c, d, W[13], S21, 29);
! 534: SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
! 535: SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
! 536: SET_SSE2(G, b, c, d, a, W[12], S24, 32);
! 537:
! 538: SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
! 539: SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
! 540: SET_SSE2(H, c, d, a, b, W[11], S33, 35);
! 541: SET_SSE2(H, b, c, d, a, W[14], S34, 36);
! 542: SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
! 543: SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
! 544: SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
! 545: SET_SSE2(H, b, c, d, a, W[10], S34, 40);
! 546: SET_SSE2(H, a, b, c, d, W[13], S31, 41);
! 547: SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
! 548: SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
! 549: SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
! 550: SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
! 551: SET_SSE2(H, d, a, b, c, W[12], S32, 46);
! 552: SET_SSE2(H, c, d, a, b, W[15], S33, 47);
! 553: SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);
! 554:
! 555: SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
! 556: SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
! 557: SET_SSE2(I, c, d, a, b, W[14], S43, 51);
! 558: SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
! 559: SET_SSE2(I, a, b, c, d, W[12], S41, 53);
! 560: SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
! 561: SET_SSE2(I, c, d, a, b, W[10], S43, 55);
! 562: SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
! 563: SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
! 564: SET_SSE2(I, d, a, b, c, W[15], S42, 58);
! 565: SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
! 566: SET_SSE2(I, b, c, d, a, W[13], S44, 60);
! 567: SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
! 568: SET_SSE2(I, d, a, b, c, W[11], S42, 62);
! 569: SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
! 570: SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);
! 571:
! 572: ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
! 573: ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
! 574: ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
! 575: ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
! 576: }
! 577: #endif
! 578:
! 579: #ifdef PMD5_ALLOW_AVX2
! 580: __attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 581: {
! 582: __m256i W[MD5_DIGEST_LEN], a, b, c, d;
! 583:
! 584: GET_PMD5_DATA_AVX2(W[ 0], data, 0);
! 585: GET_PMD5_DATA_AVX2(W[ 1], data, 4);
! 586: GET_PMD5_DATA_AVX2(W[ 2], data, 8);
! 587: GET_PMD5_DATA_AVX2(W[ 3], data, 12);
! 588: GET_PMD5_DATA_AVX2(W[ 4], data, 16);
! 589: GET_PMD5_DATA_AVX2(W[ 5], data, 20);
! 590: GET_PMD5_DATA_AVX2(W[ 6], data, 24);
! 591: GET_PMD5_DATA_AVX2(W[ 7], data, 28);
! 592: GET_PMD5_DATA_AVX2(W[ 8], data, 32);
! 593: GET_PMD5_DATA_AVX2(W[ 9], data, 36);
! 594: GET_PMD5_DATA_AVX2(W[10], data, 40);
! 595: GET_PMD5_DATA_AVX2(W[11], data, 44);
! 596: GET_PMD5_DATA_AVX2(W[12], data, 48);
! 597: GET_PMD5_DATA_AVX2(W[13], data, 52);
! 598: GET_PMD5_DATA_AVX2(W[14], data, 56);
! 599: GET_PMD5_DATA_AVX2(W[15], data, 60);
! 600:
! 601: a = ctx->state_avx2[0];
! 602: b = ctx->state_avx2[1];
! 603: c = ctx->state_avx2[2];
! 604: d = ctx->state_avx2[3];
! 605:
! 606: SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
! 607: SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
! 608: SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
! 609: SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
! 610: SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
! 611: SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
! 612: SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
! 613: SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
! 614: SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
! 615: SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
! 616: SET_AVX2(F, c, d, a, b, W[10], S13, 11);
! 617: SET_AVX2(F, b, c, d, a, W[11], S14, 12);
! 618: SET_AVX2(F, a, b, c, d, W[12], S11, 13);
! 619: SET_AVX2(F, d, a, b, c, W[13], S12, 14);
! 620: SET_AVX2(F, c, d, a, b, W[14], S13, 15);
! 621: SET_AVX2(F, b, c, d, a, W[15], S14, 16);
! 622:
! 623: SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
! 624: SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
! 625: SET_AVX2(G, c, d, a, b, W[11], S23, 19);
! 626: SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
! 627: SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
! 628: SET_AVX2(G, d, a, b, c, W[10], S22, 22);
! 629: SET_AVX2(G, c, d, a, b, W[15], S23, 23);
! 630: SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
! 631: SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
! 632: SET_AVX2(G, d, a, b, c, W[14], S22, 26);
! 633: SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
! 634: SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
! 635: SET_AVX2(G, a, b, c, d, W[13], S21, 29);
! 636: SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
! 637: SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
! 638: SET_AVX2(G, b, c, d, a, W[12], S24, 32);
! 639:
! 640: SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
! 641: SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
! 642: SET_AVX2(H, c, d, a, b, W[11], S33, 35);
! 643: SET_AVX2(H, b, c, d, a, W[14], S34, 36);
! 644: SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
! 645: SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
! 646: SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
! 647: SET_AVX2(H, b, c, d, a, W[10], S34, 40);
! 648: SET_AVX2(H, a, b, c, d, W[13], S31, 41);
! 649: SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
! 650: SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
! 651: SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
! 652: SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
! 653: SET_AVX2(H, d, a, b, c, W[12], S32, 46);
! 654: SET_AVX2(H, c, d, a, b, W[15], S33, 47);
! 655: SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);
! 656:
! 657: SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
! 658: SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
! 659: SET_AVX2(I, c, d, a, b, W[14], S43, 51);
! 660: SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
! 661: SET_AVX2(I, a, b, c, d, W[12], S41, 53);
! 662: SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
! 663: SET_AVX2(I, c, d, a, b, W[10], S43, 55);
! 664: SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
! 665: SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
! 666: SET_AVX2(I, d, a, b, c, W[15], S42, 58);
! 667: SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
! 668: SET_AVX2(I, b, c, d, a, W[13], S44, 60);
! 669: SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
! 670: SET_AVX2(I, d, a, b, c, W[11], S42, 62);
! 671: SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
! 672: SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);
! 673:
! 674: ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
! 675: ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
! 676: ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
! 677: ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
! 678: }
! 679: #endif
! 680:
! 681: __attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
! 682: {
! 683: }
! 684:
! 685: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
! 686: {
! 687: const uint8_t * ptrs[PMD5_SLOTS_MAX];
! 688:
! 689: if (!length) return PMD5_SUCCESS;
! 690:
! 691: int slots = pmd5_slots();
! 692:
! 693: if (!stride) stride = 64;
! 694:
! 695: int i;
! 696: for (i = 0; i < slots; i++) {
! 697: ptrs[i] = data[i];
! 698: ctx->len[i] += length;
! 699: if (!ptrs[i]) ptrs[i] = md5_padding;
! 700: }
! 701:
! 702: while (length >= 64) {
! 703: pmd5_process(ctx, ptrs);
! 704: length -= 64;
! 705: for (i = 0; i < slots; i++) {
! 706: if (data[i]) ptrs[i] += stride;
! 707: }
! 708: }
! 709:
! 710: if (length) return PMD5_UNALIGNED_UPDATE;
! 711:
! 712: for (i = 0; i < slots; i++) {
! 713: if (data[i]) data[i] = ptrs[i];
! 714: }
! 715:
! 716: return PMD5_SUCCESS;
! 717: }
! 718:
! 719: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
! 720: {
! 721: uint64_t length = 0;
! 722: int slots = pmd5_slots();
! 723:
! 724: int i;
! 725: for (i = 0; i < slots; i++) {
! 726: if ((length == 0) || (lengths[i] < length)) length = lengths[i];
! 727: }
! 728:
! 729: for (i = 0; i < slots; i++) {
! 730: lengths[i] -= length;
! 731: }
! 732:
! 733: return pmd5_update_all_simple(ctx, data, length, 0);
! 734: }
! 735:
! 736: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
! 737: {
! 738: MD5_CTX ctx;
! 739:
! 740: if ((slot >= pmd5_slots()) || (slot < 0))
! 741: return PMD5_INVALID_SLOT;
! 742:
! 743: pmd5_to_md5(pctx, &ctx, slot);
! 744: if (data && length) {
! 745: MD5_Update(&ctx, data, length);
! 746: }
! 747: MD5_Final(digest, &ctx);
! 748:
! 749: return PMD5_SUCCESS;
! 750: }
! 751:
! 752: static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
! 753: {
! 754: return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
! 755: }
! 756:
! 757: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
! 758: {
! 759: int i;
! 760: for (i = 0; i < pmd5_slots(); i++) {
! 761: pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
! 762: }
! 763: return PMD5_SUCCESS;
! 764: }
! 765:
! 766: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
! 767: {
! 768: if ((slot >= pmd5_slots()) || (slot < 0))
! 769: return PMD5_INVALID_SLOT;
! 770:
! 771: // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
! 772:
! 773: #ifdef USE_OPENSSL
! 774: pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
! 775: #else
! 776: pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
! 777: #endif
! 778: return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
! 779: }
! 780:
! 781: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
! 782: {
! 783: if ((slot >= pmd5_slots()) || (slot < 0))
! 784: return PMD5_INVALID_SLOT;
! 785:
! 786: MD5_Init(ctx);
! 787:
! 788: #ifdef USE_OPENSSL
! 789: ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
! 790: ctx->Nh = pctx->len[slot] >> 29;
! 791:
! 792: uint32_t a, b, c, d;
! 793: pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
! 794: if (ret == PMD5_SUCCESS) {
! 795: ctx->A = a;
! 796: ctx->B = b;
! 797: ctx->C = c;
! 798: ctx->D = d;
! 799: }
! 800: return ret;
! 801: #else
! 802: ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
! 803: ctx->totalN2 = pctx->len[slot] >> 32;
! 804: return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
! 805: #endif
! 806: }
! 807:
! 808: /* With GCC 10 putting these implementations inside 'extern "C"' causes an
! 809: assembler error. That worked fine on GCC 5-9 and clang 6-10...
! 810: */
! 811:
! 812: static inline int md5_parallel_slots_cpp()
! 813: {
! 814: int slots = pmd5_slots();
! 815: if (slots == 0) return 1;
! 816: return slots;
! 817: }
! 818:
! 819: static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
! 820: {
! 821: int slots = md5_parallel_slots_cpp();
! 822: if ((streams < 1) || (streams > slots)) return 0;
! 823: if (pre4 && post4) return 0;
! 824:
! 825: if (slots == 1) {
! 826: MD5_CTX ctx;
! 827: MD5_Init(&ctx);
! 828: if (pre4) {
! 829: MD5_Update(&ctx, (const unsigned char*)pre4, 4);
! 830: }
! 831: MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
! 832: if (post4) {
! 833: MD5_Update(&ctx, (const unsigned char*)post4, 4);
! 834: }
! 835: if (sum[0]) {
! 836: MD5_Final((uint8_t*)sum[0], &ctx);
! 837: }
! 838: return 0;
! 839: }
! 840:
! 841: int i;
! 842: int active[PMD5_SLOTS_MAX];
! 843: char* buffers[PMD5_SLOTS_MAX];
! 844: uint64_t left[PMD5_SLOTS_MAX];
! 845: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 846: active[i] = streams > i;
! 847: if (i < streams) {
! 848: buffers[i] = buf[i];
! 849: left[i] = (uint64_t)len[i];
! 850: } else {
! 851: buffers[i] = NULL;
! 852: left[i] = 0;
! 853: }
! 854: }
! 855: MD5_CTX results[PMD5_SLOTS_MAX];
! 856:
! 857: pmd5_context ctx_simd;
! 858: if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
! 859:
! 860: if (pre4) {
! 861: char temp_buffers[PMD5_SLOTS_MAX][64];
! 862: int have_any = 0;
! 863: for (i = 0; i < slots; i++) {
! 864: if (active[i]) {
! 865: if (left[i] < 60) {
! 866: MD5_Init(&results[i]);
! 867: MD5_Update(&results[i], (const unsigned char*)pre4, 4);
! 868: MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
! 869: active[i] = 0;
! 870: left[i] = 0;
! 871: } else {
! 872: memcpy(temp_buffers[i], pre4, 4);
! 873: memcpy(temp_buffers[i] + 4, buffers[i], 60);
! 874: buffers[i] += 60;
! 875: left[i] -= 60;
! 876: have_any = 1;
! 877: }
! 878: }
! 879: }
! 880:
! 881: if (have_any) {
! 882: char* ptrs[PMD5_SLOTS_MAX];
! 883: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
! 884: ptrs[i] = &temp_buffers[i][0];
! 885: }
! 886: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
! 887: return 0;
! 888: }
! 889: }
! 890: }
! 891:
! 892: int failed = 0;
! 893: while (true) {
! 894: for (i = 0; i < slots; i++) {
! 895: if (active[i] && (left[i] < 64)) {
! 896: if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
! 897: failed = 1;
! 898: }
! 899: active[i] = 0;
! 900: }
! 901: }
! 902:
! 903: uint64_t shortest = 0;
! 904: for (i = 0; i < slots; i++) {
! 905: if (!active[i]) {
! 906: buffers[i] = NULL;
! 907: } else if ((shortest == 0) || (left[i] < shortest)) {
! 908: shortest = left[i];
! 909: }
! 910: }
! 911:
! 912: if (shortest > 0) {
! 913: shortest = shortest & ~63;
! 914: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
! 915: failed = 1;
! 916: }
! 917: for (i = 0; i < slots; i++) {
! 918: if (active[i]) {
! 919: left[i] -= shortest;
! 920: }
! 921: }
! 922: }
! 923:
! 924: if (failed) {
! 925: return 0;
! 926: } else {
! 927: int have_any = 0;
! 928: for (i = 0; i < slots; i++) {
! 929: have_any |= active[i];
! 930: }
! 931: if (!have_any) {
! 932: break;
! 933: }
! 934: }
! 935: }
! 936:
! 937: for (i = 0; i < slots; i++) {
! 938: if (i < streams) {
! 939: if (left[i] > 0) {
! 940: // buffer[i] == NULL here
! 941: MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
! 942: }
! 943: if (post4) {
! 944: MD5_Update(&results[i], (const unsigned char*)post4, 4);
! 945: }
! 946: if (sum[i]) {
! 947: MD5_Final((uint8_t*)sum[i], &results[i]);
! 948: }
! 949: }
! 950: }
! 951:
! 952: return 1;
! 953: }
! 954:
// Each pmd5_context needs to be 32-byte aligned. The macro rounds the address
// of (ctx)->context_storage up to the next multiple of 32 and then steps
// forward by `index` times sizeof(pmd5_context), itself rounded up to a
// multiple of 32, yielding a pointer to context number `index`.
#define MD5P8_Contexts_simd(ctx, index) \
    ((pmd5_context*)( \
        (((uintptr_t)((ctx)->context_storage) + 31) & ~31) \
        + (index)*((sizeof(pmd5_context) + 31) & ~31)))
! 957:
! 958: static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
! 959: {
! 960: int i;
! 961: for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
! 962: pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
! 963: }
! 964: ctx->used = 0;
! 965: ctx->next = 0;
! 966: }
! 967:
! 968: static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
! 969: {
! 970: int slots = pmd5_slots();
! 971: uint32 pos = 0;
! 972:
! 973: if ((ctx->used) || (length < 512)) {
! 974: int cpy = MIN(length, 512 - ctx->used);
! 975: memcpy(&ctx->buffer[ctx->used], input, cpy);
! 976: ctx->used += cpy;
! 977: length -= cpy;
! 978: pos += cpy;
! 979:
! 980: if (ctx->used == 512) {
! 981: if (slots == PMD5_SLOTS_AVX2) {
! 982: const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
! 983: (uint8_t*)ctx->buffer,
! 984: (uint8_t*)(ctx->buffer + 64),
! 985: (uint8_t*)(ctx->buffer + 128),
! 986: (uint8_t*)(ctx->buffer + 192),
! 987: (uint8_t*)(ctx->buffer + 256),
! 988: (uint8_t*)(ctx->buffer + 320),
! 989: (uint8_t*)(ctx->buffer + 384),
! 990: (uint8_t*)(ctx->buffer + 448)
! 991: };
! 992: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
! 993: } else {
! 994: const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
! 995: (uint8_t*)ctx->buffer,
! 996: (uint8_t*)(ctx->buffer + 64),
! 997: (uint8_t*)(ctx->buffer + 128),
! 998: (uint8_t*)(ctx->buffer + 192)
! 999: };
! 1000: const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
! 1001: (uint8_t*)(ctx->buffer + 256),
! 1002: (uint8_t*)(ctx->buffer + 320),
! 1003: (uint8_t*)(ctx->buffer + 384),
! 1004: (uint8_t*)(ctx->buffer + 448)
! 1005: };
! 1006: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
! 1007: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
! 1008: }
! 1009: ctx->used = 0;
! 1010: }
! 1011: }
! 1012:
! 1013: if (length >= 512) {
! 1014: uint32 blocks = length / 512;
! 1015: if (slots == PMD5_SLOTS_AVX2) {
! 1016: const uint8_t* ptrs[8] = {
! 1017: (uint8_t*)(input + pos),
! 1018: (uint8_t*)(input + pos + 64),
! 1019: (uint8_t*)(input + pos + 128),
! 1020: (uint8_t*)(input + pos + 192),
! 1021: (uint8_t*)(input + pos + 256),
! 1022: (uint8_t*)(input + pos + 320),
! 1023: (uint8_t*)(input + pos + 384),
! 1024: (uint8_t*)(input + pos + 448)
! 1025: };
! 1026: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
! 1027: } else {
! 1028: const uint8_t* ptrs1[4] = {
! 1029: (uint8_t*)(input + pos),
! 1030: (uint8_t*)(input + pos + 64),
! 1031: (uint8_t*)(input + pos + 128),
! 1032: (uint8_t*)(input + pos + 192)
! 1033: };
! 1034: const uint8_t* ptrs2[4] = {
! 1035: (uint8_t*)(input + pos + 256),
! 1036: (uint8_t*)(input + pos + 320),
! 1037: (uint8_t*)(input + pos + 384),
! 1038: (uint8_t*)(input + pos + 448)
! 1039: };
! 1040: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
! 1041: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
! 1042: }
! 1043: pos += blocks * 512;
! 1044: length -= blocks * 512;
! 1045: }
! 1046:
! 1047: if (length) {
! 1048: memcpy(ctx->buffer, &input[pos], length);
! 1049: ctx->used = length;
! 1050: }
! 1051: }
! 1052:
! 1053: static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
! 1054: {
! 1055: int i;
! 1056: uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
! 1057: if (ctx->used) {
! 1058: uchar tmp[512];
! 1059: memset(tmp, 0, 512);
! 1060: MD5P8_Update(ctx, tmp, 512 - ctx->used);
! 1061: }
! 1062:
! 1063: uchar state[34*4] = {0};
! 1064:
! 1065: MD5_CTX tmp;
! 1066: for (i = 0; i < 8; i++) {
! 1067: if (pmd5_slots() == PMD5_SLOTS_AVX2) {
! 1068: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
! 1069: } else if (i < 4) {
! 1070: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
! 1071: } else {
! 1072: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
! 1073: }
! 1074: #ifdef USE_OPENSSL
! 1075: if (low + tmp.Nl < low) high++;
! 1076: low += tmp.Nl;
! 1077: high += tmp.Nh;
! 1078: #else
! 1079: if (low + tmp.totalN < low) high++;
! 1080: low += tmp.totalN;
! 1081: high += tmp.totalN2;
! 1082: #endif
! 1083: SIVALu(state, i*16, tmp.A);
! 1084: SIVALu(state, i*16 + 4, tmp.B);
! 1085: SIVALu(state, i*16 + 8, tmp.C);
! 1086: SIVALu(state, i*16 + 12, tmp.D);
! 1087: }
! 1088:
! 1089: #ifndef USE_OPENSSL
! 1090: high = (low >> 29) | (high << 3);
! 1091: low = (low << 3);
! 1092: #endif
! 1093:
! 1094: sub <<= 3;
! 1095: if (low - sub > low) high--;
! 1096: low -= sub;
! 1097:
! 1098: SIVALu(state, 32*4, low);
! 1099: SIVALu(state, 33*4, high);
! 1100:
! 1101: MD5_CTX md;
! 1102: MD5_Init(&md);
! 1103: MD5_Update(&md, state, 34*4);
! 1104: MD5_Final(digest, &md);
! 1105: }
! 1106:
! 1107: extern "C" {
! 1108:
! 1109: int md5_parallel_slots()
! 1110: {
! 1111: return md5_parallel_slots_cpp();
! 1112: }
! 1113:
! 1114: int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
! 1115: {
! 1116: return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
! 1117: }
! 1118:
! 1119: void MD5P8_Init(MD5P8_CTX *ctx)
! 1120: {
! 1121: MD5P8_Init_cpp(ctx);
! 1122: }
! 1123:
! 1124: void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
! 1125: {
! 1126: MD5P8_Update_cpp(ctx, input, length);
! 1127: }
! 1128:
! 1129: void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
! 1130: {
! 1131: MD5P8_Final_cpp(digest, ctx);
! 1132: }
! 1133:
! 1134: } // "C"
! 1135:
! 1136: #endif /* HAVE_SIMD */
! 1137: #endif /* __cplusplus */
! 1138: #endif /* __x86_64__ */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>