File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / rsync / simd-md5-parallel-x86_64.cpp
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 00:32:36 2021 UTC (3 years, 3 months ago) by misho
Branches: rsync, MAIN
CVS tags: v3_2_3, HEAD
rsync 3.2.3

    1: /*
    2:  * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
    3:  *
    4:  * Original author: Nicolas Noble, 2017
    5:  * Modifications:   Jorrit Jongma, 2020
    6:  *
    7:  * The original code was released in the public domain by the original author,
    8:  * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
    9:  * in case public domain does not apply in your country. These modifications
   10:  * are likewise released in the public domain, with the same MIT license
   11:  * fallback.
   12:  *
   13:  * The original publication can be found at:
   14:  *
   15:  * https://github.com/nicolasnoble/sse-hash
   16:  */
   17: /*
   18:  * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
   19:  * MD5 code has been removed and those code paths rerouted to use the MD5
   20:  * code already present in rsync, and wrapper functions have been added. The
   21:  * MD5P8 code is also new, and is the reason for the new stride parameter.
   22:  *
   23:  * This code allows multiple independent MD5 streams to be processed in
   24:  * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
   25:  * lower than that of the original C routines for MD5, the processing of
   26:  * additional streams is "for free".
   27:  *
   28:  * Single streams are rerouted to rsync's normal MD5 code as that is faster
   29:  * for that case. A further optimization is possible by using SSE2 code on
   30:  * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
 * implemented here as it would require some restructuring, and in practice
 * the code here is only rarely called with fewer than the maximum number of
 * streams (typically once at the end of each checksum2'd file).
   34:  *
   35:  * Benchmarks (in MB/s)            C     ASM  SSE2*1  SSE2*4  AVX2*1  AVX2*8
   36:  * - Intel Atom D2700            302     334     166     664     N/A     N/A
   37:  * - Intel i7-7700hq             351     376     289    1156     273    2184
   38:  * - AMD ThreadRipper 2950x      728     784     568    2272     430    3440
   39:  */
   40: 
   41: #ifdef __x86_64__
   42: #ifdef __cplusplus
   43: 
   44: extern "C" {
   45: 
   46: #include "rsync.h"
   47: 
   48: }
   49: 
   50: #ifdef HAVE_SIMD
   51: 
/* Unless this is the checksum2 benchmark build, enable both SIMD code paths.
   The SSE2 and AVX2 function variants below are only compiled when the
   matching PMD5_ALLOW_* macro is defined. */
#ifndef BENCHMARK_SIMD_CHECKSUM2
#define PMD5_ALLOW_SSE2 // debugging
#define PMD5_ALLOW_AVX2 // debugging
#endif

/* Enabling the AVX2 path always enables the SSE2 path as well. */
#ifdef PMD5_ALLOW_AVX2
#ifndef PMD5_ALLOW_SSE2
#define PMD5_ALLOW_SSE2
#endif
#endif
   62: 
   63: #include <stdint.h>
   64: #include <string.h>
   65: 
   66: #include <immintrin.h>
   67: 
/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
/* MVSTATIC is therefore empty on clang and expands to `static` elsewhere. */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
// These are the 1-byte-aligned, may-alias variants of the 128-bit and 256-bit
// integer vector types; they are used below as pointer types for loads/stores.
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

/* Number of parallel MD5 streams ("slots") reported by each pmd5_slots()
   variant. PMD5_SLOTS_MAX is the widest of them and sizes the per-slot
   arrays used throughout this file. */
#define PMD5_SLOTS_DEFAULT 0
#define PMD5_SLOTS_SSE2 4
#define PMD5_SLOTS_AVX2 8
#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
   83: 
   84: #ifdef PMD5_ALLOW_SSE2
   85: __attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
   86: {
   87:     return PMD5_SLOTS_SSE2;
   88: }
   89: #endif
   90: 
   91: #ifdef PMD5_ALLOW_AVX2
   92: __attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
   93: {
   94:     return PMD5_SLOTS_AVX2;
   95: }
   96: #endif
   97: 
   98: __attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
   99: {
  100:     return PMD5_SLOTS_DEFAULT;
  101: }
  102: 
/* The parallel MD5 context structure.
   Each state vector holds one MD5 state word (A, B, C or D) for every slot,
   one 32-bit lane per slot. The SSE2 and AVX2 code paths each use their own
   state array; len[] is shared. */
typedef struct {
    __m128i state_sse2[4];          /* state words A..D for the 4 SSE2 slots */
    __m256i state_avx2[4];          /* state words A..D for the 8 AVX2 slots */
    uint64_t len[PMD5_SLOTS_MAX];   /* bytes fed into each slot so far */
} pmd5_context;
  109: 
/* The status returned by the various functions below. */
typedef enum {
    PMD5_SUCCESS,           /* the operation completed */
    PMD5_INVALID_SLOT,      /* slot index out of range; also returned by the "default" (non-SIMD) variants */
    PMD5_UNALIGNED_UPDATE,  /* an update length was not a multiple of 64 */
} pmd5_status;
  116: 
/* Initializes all slots in the given pmd5 context. */
__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);

/* Initializes a single slot in the given pmd5 context. */
static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);

/* Makes an MD5 update on all slots in parallel, given the same exact length on all streams.
   The stream pointers will be incremented accordingly.
   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
   The argument length NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
   Stride is the number of bytes each non-NULL stream pointer advances per 64-byte block.
   Stride defaults to 64 if 0 is passed. */
static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);

/* Makes an MD5 update on all slots in parallel, given different lengths.
   The stream pointers will be incremented accordingly.
   The lengths will be decreased accordingly. Not all data might be consumed.
   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
   The argument lengths NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);

/* Finishes all slots at once. Fills in all digests. */
static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);

/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);

/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
   multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);

/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);

/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
  154: 
/* Left-rotation amounts passed to the SET_* macros. Srk is the k-th of the
   four rotation amounts that round r (1 = F, 2 = G, 3 = H, 4 = I) cycles
   through. */
#define S11  7
#define S12 12
#define S13 17
#define S14 22
#define S21  5
#define S22  9
#define S23 14
#define S24 20
#define S31  4
#define S32 11
#define S33 16
#define S34 23
#define S41  6
#define S42 10
#define S43 15
#define S44 21

/* Additive constants: step i of pmd5_process() adds Ti (selected through the
   `ac` argument of SET_*, which pastes the step number onto `T`).
   These correspond to the T[1..64] table of RFC 1321. */
#define T1  0xD76AA478
#define T2  0xE8C7B756
#define T3  0x242070DB
#define T4  0xC1BDCEEE
#define T5  0xF57C0FAF
#define T6  0x4787C62A
#define T7  0xA8304613
#define T8  0xFD469501
#define T9  0x698098D8
#define T10 0x8B44F7AF
#define T11 0xFFFF5BB1
#define T12 0x895CD7BE
#define T13 0x6B901122
#define T14 0xFD987193
#define T15 0xA679438E
#define T16 0x49B40821
#define T17 0xF61E2562
#define T18 0xC040B340
#define T19 0x265E5A51
#define T20 0xE9B6C7AA
#define T21 0xD62F105D
#define T22 0x02441453
#define T23 0xD8A1E681
#define T24 0xE7D3FBC8
#define T25 0x21E1CDE6
#define T26 0xC33707D6
#define T27 0xF4D50D87
#define T28 0x455A14ED
#define T29 0xA9E3E905
#define T30 0xFCEFA3F8
#define T31 0x676F02D9
#define T32 0x8D2A4C8A
#define T33 0xFFFA3942
#define T34 0x8771F681
#define T35 0x6D9D6122
#define T36 0xFDE5380C
#define T37 0xA4BEEA44
#define T38 0x4BDECFA9
#define T39 0xF6BB4B60
#define T40 0xBEBFBC70
#define T41 0x289B7EC6
#define T42 0xEAA127FA
#define T43 0xD4EF3085
#define T44 0x04881D05
#define T45 0xD9D4D039
#define T46 0xE6DB99E5
#define T47 0x1FA27CF8
#define T48 0xC4AC5665
#define T49 0xF4292244
#define T50 0x432AFF97
#define T51 0xAB9423A7
#define T52 0xFC93A039
#define T53 0x655B59C3
#define T54 0x8F0CCC92
#define T55 0xFFEFF47D
#define T56 0x85845DD1
#define T57 0x6FA87E4F
#define T58 0xFE2CE6E0
#define T59 0xA3014314
#define T60 0x4E0811A1
#define T61 0xF7537E82
#define T62 0xBD3AF235
#define T63 0x2AD7D2BB
#define T64 0xEB86D391
  236: 
  237: #define ROTL_SSE2(x, n) { \
  238:     __m128i s; \
  239:     s = _mm_srli_epi32(x, 32 - n); \
  240:     x = _mm_slli_epi32(x, n); \
  241:     x = _mm_or_si128(x, s); \
  242: };
  243: 
  244: #define ROTL_AVX2(x, n) { \
  245:     __m256i s; \
  246:     s = _mm256_srli_epi32(x, 32 - n); \
  247:     x = _mm256_slli_epi32(x, n); \
  248:     x = _mm256_or_si256(x, s); \
  249: };
  250: 
/* The four MD5 round functions, applied lane-wise to 128-bit vectors:
     F(x, y, z) = (x & y) | (~x & z)
     G(x, y, z) = (x & z) | (y & ~z)
     H(x, y, z) = x ^ y ^ z
     I(x, y, z) = y ^ (x | ~z)
   (~z is computed as andnot(z, all-ones).) */
#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))

/* The same four functions for 256-bit vectors. */
#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))
  260: 
/* One MD5 step on all SSE2 lanes:
     a = b + rotl(a + x + T<ac> + step(b, c, d), s)
   `step` is F, G, H or I (pasted onto _SSE2), x is the message word vector,
   s the rotation amount and ac the step number selecting the T constant.
   a is modified in place. */
#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
    a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
    ROTL_SSE2(a, s); \
    a = _mm_add_epi32(a, b); \
}

/* The same step for the eight AVX2 lanes. */
#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
    a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
    ROTL_AVX2(a, s); \
    a = _mm256_add_epi32(a, b); \
}

/* Initial values of the four MD5 state words, as loaded by pmd5_init_all()
   and pmd5_init_slot(). */
#define IA 0x67452301
#define IB 0xefcdab89
#define IC 0x98badcfe
#define ID 0x10325476
  277: 
/* Assigns to dest the 32-bit word formed from the four bytes
   src[pos] .. src[pos + 3], least significant byte first. */
#define GET_MD5_DATA(dest, src, pos)         \
    dest =                                   \
        ((uint32_t) src[pos + 0]) <<  0 |    \
        ((uint32_t) src[pos + 1]) <<  8 |    \
        ((uint32_t) src[pos + 2]) << 16 |    \
        ((uint32_t) src[pos + 3]) << 24

/* Reads the word at byte offset pos from each of the four streams
   src[0] .. src[3] and packs them into dest, stream 0 in the lowest lane. */
#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
    uint32_t v0, v1, v2, v3;                 \
    GET_MD5_DATA(v0, src[0], pos);           \
    GET_MD5_DATA(v1, src[1], pos);           \
    GET_MD5_DATA(v2, src[2], pos);           \
    GET_MD5_DATA(v3, src[3], pos);           \
    dest = _mm_setr_epi32(v0, v1, v2, v3);   \
}

/* Same as GET_PMD5_DATA_SSE2, for the eight streams src[0] .. src[7]. */
#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
    uint32_t v0, v1, v2, v3;                 \
    uint32_t v4, v5, v6, v7;                 \
    GET_MD5_DATA(v0, src[0], pos);           \
    GET_MD5_DATA(v1, src[1], pos);           \
    GET_MD5_DATA(v2, src[2], pos);           \
    GET_MD5_DATA(v3, src[3], pos);           \
    GET_MD5_DATA(v4, src[4], pos);           \
    GET_MD5_DATA(v5, src[5], pos);           \
    GET_MD5_DATA(v6, src[6], pos);           \
    GET_MD5_DATA(v7, src[7], pos);           \
    dest = _mm256_setr_epi32(v0, v1, v2, v3, \
                          v4, v5, v6, v7);   \
}

/* Stores the 32-bit value val into dest[pos] .. dest[pos + 3],
   least significant byte first. */
#define PUT_MD5_DATA(dest, val, pos) {       \
    dest[pos + 0] = (val >>  0) & 0xff;      \
    dest[pos + 1] = (val >>  8) & 0xff;      \
    dest[pos + 2] = (val >> 16) & 0xff;      \
    dest[pos + 3] = (val >> 24) & 0xff;      \
}
  315: 
  316: const static uint8_t md5_padding[64] = {
  317:     0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  318:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  319:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  320:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  321:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  322:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  323:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  324:     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  325: };
  326: 
  327: #ifdef PMD5_ALLOW_SSE2
  328: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
  329: {
  330:     int i;
  331:     for (i = 0; i < PMD5_SLOTS_MAX; i++) {
  332:         ctx->len[i] = 0;
  333:     }
  334: 
  335:     ctx->state_sse2[0] = _mm_set1_epi32(IA);
  336:     ctx->state_sse2[1] = _mm_set1_epi32(IB);
  337:     ctx->state_sse2[2] = _mm_set1_epi32(IC);
  338:     ctx->state_sse2[3] = _mm_set1_epi32(ID);
  339: 
  340:     return PMD5_SUCCESS;
  341: }
  342: #endif
  343: 
  344: #ifdef PMD5_ALLOW_AVX2
  345: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
  346: {
  347:     int i;
  348:     for (i = 0; i < PMD5_SLOTS_MAX; i++) {
  349:         ctx->len[i] = 0;
  350:     }
  351: 
  352:     ctx->state_avx2[0] = _mm256_set1_epi32(IA);
  353:     ctx->state_avx2[1] = _mm256_set1_epi32(IB);
  354:     ctx->state_avx2[2] = _mm256_set1_epi32(IC);
  355:     ctx->state_avx2[3] = _mm256_set1_epi32(ID);
  356: 
  357:     return PMD5_SUCCESS;
  358: }
  359: #endif
  360: 
  361: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
  362: {
  363:     return PMD5_INVALID_SLOT;
  364: }
  365: 
  366: #ifdef PMD5_ALLOW_SSE2
  367: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
  368: {
  369:     if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
  370:         return PMD5_INVALID_SLOT;
  371: 
  372:     __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
  373:     int i;
  374: 
  375:     for (i = 0; i < 4; i++) {
  376:         _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
  377:     }
  378: 
  379:     v[0][slot] = a;
  380:     v[1][slot] = b;
  381:     v[2][slot] = c;
  382:     v[3][slot] = d;
  383: 
  384:     for (i = 0; i < 4; i++) {
  385:         ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
  386:     }
  387: 
  388:     return PMD5_SUCCESS;
  389: }
  390: #endif
  391: 
  392: #ifdef PMD5_ALLOW_AVX2
  393: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
  394: {
  395:     if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
  396:         return PMD5_INVALID_SLOT;
  397: 
  398:     __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
  399:     int i;
  400: 
  401:     for (i = 0; i < 4; i++) {
  402:         _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
  403:     }
  404: 
  405:     v[0][slot] = a;
  406:     v[1][slot] = b;
  407:     v[2][slot] = c;
  408:     v[3][slot] = d;
  409: 
  410:     for (i = 0; i < 4; i++) {
  411:         ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
  412:     }
  413: 
  414:     return PMD5_SUCCESS;
  415: }
  416: #endif
  417: 
  418: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
  419: {
  420:     return PMD5_INVALID_SLOT;
  421: }
  422: 
  423: #ifdef PMD5_ALLOW_SSE2
  424: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
  425: {
  426:     if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
  427:         return PMD5_INVALID_SLOT;
  428: 
  429:     __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
  430:     int i;
  431: 
  432:     for (i = 0; i < 4; i++) {
  433:         _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
  434:     }
  435: 
  436:     *a = v[0][slot];
  437:     *b = v[1][slot];
  438:     *c = v[2][slot];
  439:     *d = v[3][slot];
  440: 
  441:     return PMD5_SUCCESS;
  442: }
  443: #endif
  444: 
  445: #ifdef PMD5_ALLOW_AVX2
  446: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
  447: {
  448:     if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
  449:         return PMD5_INVALID_SLOT;
  450: 
  451:     __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
  452:     int i;
  453: 
  454:     for (i = 0; i < 4; i++) {
  455:         _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
  456:     }
  457: 
  458:     *a = v[0][slot];
  459:     *b = v[1][slot];
  460:     *c = v[2][slot];
  461:     *d = v[3][slot];
  462: 
  463:     return PMD5_SUCCESS;
  464: }
  465: #endif
  466: 
  467: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
  468: {
  469:     return PMD5_INVALID_SLOT;
  470: }
  471: 
  472: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
  473: {
  474:     return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
  475: }
  476: 
#ifdef PMD5_ALLOW_SSE2
/* SSE2 variant: runs one MD5 block (64 bytes per stream) on four slots at
   once. Reads 64 bytes from each of data[0] .. data[3], applies the 64 MD5
   steps lane-wise and adds the result into ctx->state_sse2. The data
   pointers are not advanced and ctx->len is not updated here; all four
   pointers must be readable (none may be NULL). */
__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
    /* W holds the 16 message words W[0..15] of the block.
       NOTE(review): the array is sized with MD5_DIGEST_LEN, so this relies on
       that constant being at least 16 - confirm against rsync.h. */
    __m128i W[MD5_DIGEST_LEN], a, b, c, d;

    /* Gather message word k (byte offset 4*k) from every stream into W[k]. */
    GET_PMD5_DATA_SSE2(W[ 0], data,  0);
    GET_PMD5_DATA_SSE2(W[ 1], data,  4);
    GET_PMD5_DATA_SSE2(W[ 2], data,  8);
    GET_PMD5_DATA_SSE2(W[ 3], data, 12);
    GET_PMD5_DATA_SSE2(W[ 4], data, 16);
    GET_PMD5_DATA_SSE2(W[ 5], data, 20);
    GET_PMD5_DATA_SSE2(W[ 6], data, 24);
    GET_PMD5_DATA_SSE2(W[ 7], data, 28);
    GET_PMD5_DATA_SSE2(W[ 8], data, 32);
    GET_PMD5_DATA_SSE2(W[ 9], data, 36);
    GET_PMD5_DATA_SSE2(W[10], data, 40);
    GET_PMD5_DATA_SSE2(W[11], data, 44);
    GET_PMD5_DATA_SSE2(W[12], data, 48);
    GET_PMD5_DATA_SSE2(W[13], data, 52);
    GET_PMD5_DATA_SSE2(W[14], data, 56);
    GET_PMD5_DATA_SSE2(W[15], data, 60);

    /* Work on copies of the state; the originals are added back at the end. */
    a = ctx->state_sse2[0];
    b = ctx->state_sse2[1];
    c = ctx->state_sse2[2];
    d = ctx->state_sse2[3];

    /* Round 1: function F, steps 1-16. */
    SET_SSE2(F, a, b, c, d, W[ 0], S11,  1);
    SET_SSE2(F, d, a, b, c, W[ 1], S12,  2);
    SET_SSE2(F, c, d, a, b, W[ 2], S13,  3);
    SET_SSE2(F, b, c, d, a, W[ 3], S14,  4);
    SET_SSE2(F, a, b, c, d, W[ 4], S11,  5);
    SET_SSE2(F, d, a, b, c, W[ 5], S12,  6);
    SET_SSE2(F, c, d, a, b, W[ 6], S13,  7);
    SET_SSE2(F, b, c, d, a, W[ 7], S14,  8);
    SET_SSE2(F, a, b, c, d, W[ 8], S11,  9);
    SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
    SET_SSE2(F, c, d, a, b, W[10], S13, 11);
    SET_SSE2(F, b, c, d, a, W[11], S14, 12);
    SET_SSE2(F, a, b, c, d, W[12], S11, 13);
    SET_SSE2(F, d, a, b, c, W[13], S12, 14);
    SET_SSE2(F, c, d, a, b, W[14], S13, 15);
    SET_SSE2(F, b, c, d, a, W[15], S14, 16);

    /* Round 2: function G, steps 17-32. */
    SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
    SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
    SET_SSE2(G, c, d, a, b, W[11], S23, 19);
    SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
    SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
    SET_SSE2(G, d, a, b, c, W[10], S22, 22);
    SET_SSE2(G, c, d, a, b, W[15], S23, 23);
    SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
    SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
    SET_SSE2(G, d, a, b, c, W[14], S22, 26);
    SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
    SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
    SET_SSE2(G, a, b, c, d, W[13], S21, 29);
    SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
    SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
    SET_SSE2(G, b, c, d, a, W[12], S24, 32);

    /* Round 3: function H, steps 33-48. */
    SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
    SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
    SET_SSE2(H, c, d, a, b, W[11], S33, 35);
    SET_SSE2(H, b, c, d, a, W[14], S34, 36);
    SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
    SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
    SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
    SET_SSE2(H, b, c, d, a, W[10], S34, 40);
    SET_SSE2(H, a, b, c, d, W[13], S31, 41);
    SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
    SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
    SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
    SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
    SET_SSE2(H, d, a, b, c, W[12], S32, 46);
    SET_SSE2(H, c, d, a, b, W[15], S33, 47);
    SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);

    /* Round 4: function I, steps 49-64. */
    SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
    SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
    SET_SSE2(I, c, d, a, b, W[14], S43, 51);
    SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
    SET_SSE2(I, a, b, c, d, W[12], S41, 53);
    SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
    SET_SSE2(I, c, d, a, b, W[10], S43, 55);
    SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
    SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
    SET_SSE2(I, d, a, b, c, W[15], S42, 58);
    SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
    SET_SSE2(I, b, c, d, a, W[13], S44, 60);
    SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
    SET_SSE2(I, d, a, b, c, W[11], S42, 62);
    SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
    SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);

    /* Fold the block result into the running state. */
    ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
    ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
    ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
    ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
}
#endif
  578: 
#ifdef PMD5_ALLOW_AVX2
/* AVX2 variant: runs one MD5 block (64 bytes per stream) on eight slots at
   once. Reads 64 bytes from each of data[0] .. data[7], applies the 64 MD5
   steps lane-wise and adds the result into ctx->state_avx2. The data
   pointers are not advanced and ctx->len is not updated here; all eight
   pointers must be readable (none may be NULL). */
__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
    /* W holds the 16 message words W[0..15] of the block.
       NOTE(review): the array is sized with MD5_DIGEST_LEN, so this relies on
       that constant being at least 16 - confirm against rsync.h. */
    __m256i W[MD5_DIGEST_LEN], a, b, c, d;

    /* Gather message word k (byte offset 4*k) from every stream into W[k]. */
    GET_PMD5_DATA_AVX2(W[ 0], data,  0);
    GET_PMD5_DATA_AVX2(W[ 1], data,  4);
    GET_PMD5_DATA_AVX2(W[ 2], data,  8);
    GET_PMD5_DATA_AVX2(W[ 3], data, 12);
    GET_PMD5_DATA_AVX2(W[ 4], data, 16);
    GET_PMD5_DATA_AVX2(W[ 5], data, 20);
    GET_PMD5_DATA_AVX2(W[ 6], data, 24);
    GET_PMD5_DATA_AVX2(W[ 7], data, 28);
    GET_PMD5_DATA_AVX2(W[ 8], data, 32);
    GET_PMD5_DATA_AVX2(W[ 9], data, 36);
    GET_PMD5_DATA_AVX2(W[10], data, 40);
    GET_PMD5_DATA_AVX2(W[11], data, 44);
    GET_PMD5_DATA_AVX2(W[12], data, 48);
    GET_PMD5_DATA_AVX2(W[13], data, 52);
    GET_PMD5_DATA_AVX2(W[14], data, 56);
    GET_PMD5_DATA_AVX2(W[15], data, 60);

    /* Work on copies of the state; the originals are added back at the end. */
    a = ctx->state_avx2[0];
    b = ctx->state_avx2[1];
    c = ctx->state_avx2[2];
    d = ctx->state_avx2[3];

    /* Round 1: function F, steps 1-16. */
    SET_AVX2(F, a, b, c, d, W[ 0], S11,  1);
    SET_AVX2(F, d, a, b, c, W[ 1], S12,  2);
    SET_AVX2(F, c, d, a, b, W[ 2], S13,  3);
    SET_AVX2(F, b, c, d, a, W[ 3], S14,  4);
    SET_AVX2(F, a, b, c, d, W[ 4], S11,  5);
    SET_AVX2(F, d, a, b, c, W[ 5], S12,  6);
    SET_AVX2(F, c, d, a, b, W[ 6], S13,  7);
    SET_AVX2(F, b, c, d, a, W[ 7], S14,  8);
    SET_AVX2(F, a, b, c, d, W[ 8], S11,  9);
    SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
    SET_AVX2(F, c, d, a, b, W[10], S13, 11);
    SET_AVX2(F, b, c, d, a, W[11], S14, 12);
    SET_AVX2(F, a, b, c, d, W[12], S11, 13);
    SET_AVX2(F, d, a, b, c, W[13], S12, 14);
    SET_AVX2(F, c, d, a, b, W[14], S13, 15);
    SET_AVX2(F, b, c, d, a, W[15], S14, 16);

    /* Round 2: function G, steps 17-32. */
    SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
    SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
    SET_AVX2(G, c, d, a, b, W[11], S23, 19);
    SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
    SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
    SET_AVX2(G, d, a, b, c, W[10], S22, 22);
    SET_AVX2(G, c, d, a, b, W[15], S23, 23);
    SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
    SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
    SET_AVX2(G, d, a, b, c, W[14], S22, 26);
    SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
    SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
    SET_AVX2(G, a, b, c, d, W[13], S21, 29);
    SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
    SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
    SET_AVX2(G, b, c, d, a, W[12], S24, 32);

    /* Round 3: function H, steps 33-48. */
    SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
    SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
    SET_AVX2(H, c, d, a, b, W[11], S33, 35);
    SET_AVX2(H, b, c, d, a, W[14], S34, 36);
    SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
    SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
    SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
    SET_AVX2(H, b, c, d, a, W[10], S34, 40);
    SET_AVX2(H, a, b, c, d, W[13], S31, 41);
    SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
    SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
    SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
    SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
    SET_AVX2(H, d, a, b, c, W[12], S32, 46);
    SET_AVX2(H, c, d, a, b, W[15], S33, 47);
    SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);

    /* Round 4: function I, steps 49-64. */
    SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
    SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
    SET_AVX2(I, c, d, a, b, W[14], S43, 51);
    SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
    SET_AVX2(I, a, b, c, d, W[12], S41, 53);
    SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
    SET_AVX2(I, c, d, a, b, W[10], S43, 55);
    SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
    SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
    SET_AVX2(I, d, a, b, c, W[15], S42, 58);
    SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
    SET_AVX2(I, b, c, d, a, W[13], S44, 60);
    SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
    SET_AVX2(I, d, a, b, c, W[11], S42, 62);
    SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
    SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);

    /* Fold the block result into the running state. */
    ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
    ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
    ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
    ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
}
#endif
  680: 
/* Fallback variant: there are no slots to hash into, so this does nothing. */
__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
}
  684: 
  685: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
  686: {
  687:     const uint8_t * ptrs[PMD5_SLOTS_MAX];
  688: 
  689:     if (!length) return PMD5_SUCCESS;
  690: 
  691:     int slots = pmd5_slots();
  692: 
  693:     if (!stride) stride = 64;
  694: 
  695:     int i;
  696:     for (i = 0; i < slots; i++) {
  697:         ptrs[i] = data[i];
  698:         ctx->len[i] += length;
  699:         if (!ptrs[i]) ptrs[i] = md5_padding;
  700:     }
  701: 
  702:     while (length >= 64) {
  703:         pmd5_process(ctx, ptrs);
  704:         length -= 64;
  705:         for (i = 0; i < slots; i++) {
  706:             if (data[i]) ptrs[i] += stride;
  707:         }
  708:     }
  709: 
  710:     if (length) return PMD5_UNALIGNED_UPDATE;
  711: 
  712:     for (i = 0; i < slots; i++) {
  713:         if (data[i]) data[i] = ptrs[i];
  714:     }
  715: 
  716:     return PMD5_SUCCESS;
  717: }
  718: 
  719: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
  720: {
  721:     uint64_t length = 0;
  722:     int slots = pmd5_slots();
  723: 
  724:     int i;
  725:     for (i = 0; i < slots; i++) {
  726:         if ((length == 0) || (lengths[i] < length)) length = lengths[i];
  727:     }
  728: 
  729:     for (i = 0; i < slots; i++) {
  730:         lengths[i] -= length;
  731:     }
  732: 
  733:     return pmd5_update_all_simple(ctx, data, length, 0);
  734: }
  735: 
  736: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
  737: {
  738:     MD5_CTX ctx;
  739: 
  740:     if ((slot >= pmd5_slots()) || (slot < 0))
  741:         return PMD5_INVALID_SLOT;
  742: 
  743:     pmd5_to_md5(pctx, &ctx, slot);
  744:     if (data && length) {
  745:         MD5_Update(&ctx, data, length);
  746:     }
  747:     MD5_Final(digest, &ctx);
  748: 
  749:     return PMD5_SUCCESS;
  750: }
  751: 
  752: static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
  753: {
  754:     return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
  755: }
  756: 
  757: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
  758: {
  759:     int i;
  760:     for (i = 0; i < pmd5_slots(); i++) {
  761:         pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
  762:     }
  763:     return PMD5_SUCCESS;
  764: }
  765: 
  766: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
  767: {
  768:     if ((slot >= pmd5_slots()) || (slot < 0))
  769:         return PMD5_INVALID_SLOT;
  770: 
  771:     // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
  772: 
  773: #ifdef USE_OPENSSL
  774:     pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
  775: #else
  776:     pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
  777: #endif
  778:     return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
  779: }
  780: 
  781: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
  782: {
  783:     if ((slot >= pmd5_slots()) || (slot < 0))
  784:         return PMD5_INVALID_SLOT;
  785: 
  786:     MD5_Init(ctx);
  787: 
  788: #ifdef USE_OPENSSL
  789:     ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
  790:     ctx->Nh = pctx->len[slot] >> 29;
  791: 
  792:     uint32_t a, b, c, d;
  793:     pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
  794:     if (ret == PMD5_SUCCESS) {
  795:         ctx->A = a;
  796:         ctx->B = b;
  797:         ctx->C = c;
  798:         ctx->D = d;
  799:     }
  800:     return ret;
  801: #else
  802:     ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
  803:     ctx->totalN2 = pctx->len[slot] >> 32;
  804:     return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
  805: #endif
  806: }
  807: 
  808: /* With GCC 10 putting these implementations inside 'extern "C"' causes an
  809:    assembler error. That worked fine on GCC 5-9 and clang 6-10...
  810:   */
  811: 
  812: static inline int md5_parallel_slots_cpp()
  813: {
  814:     int slots = pmd5_slots();
  815:     if (slots == 0) return 1;
  816:     return slots;
  817: }
  818: 
  819: static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
  820: {
  821:     int slots = md5_parallel_slots_cpp();
  822:     if ((streams < 1) || (streams > slots)) return 0;
  823:     if (pre4 && post4) return 0;
  824: 
  825:     if (slots == 1) {
  826:         MD5_CTX ctx;
  827:         MD5_Init(&ctx);
  828:         if (pre4) {
  829:             MD5_Update(&ctx, (const unsigned char*)pre4, 4);
  830:         }
  831:         MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
  832:         if (post4) {
  833:             MD5_Update(&ctx, (const unsigned char*)post4, 4);
  834:         }
  835:         if (sum[0]) {
  836:             MD5_Final((uint8_t*)sum[0], &ctx);
  837:         }
  838:         return 0;
  839:     }
  840: 
  841:     int i;
  842:     int active[PMD5_SLOTS_MAX];
  843:     char* buffers[PMD5_SLOTS_MAX];
  844:     uint64_t left[PMD5_SLOTS_MAX];
  845:     for (i = 0; i < PMD5_SLOTS_MAX; i++) {
  846:         active[i] = streams > i;
  847:         if (i < streams) {
  848:             buffers[i] = buf[i];
  849:             left[i] = (uint64_t)len[i];
  850:         } else {
  851:             buffers[i] = NULL;
  852:             left[i] = 0;
  853:         }
  854:     }
  855:     MD5_CTX results[PMD5_SLOTS_MAX];
  856: 
  857:     pmd5_context ctx_simd;
  858:     if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
  859: 
  860:     if (pre4) {
  861:         char temp_buffers[PMD5_SLOTS_MAX][64];
  862:         int have_any = 0;
  863:         for (i = 0; i < slots; i++) {
  864:             if (active[i]) {
  865:                 if (left[i] < 60) {
  866:                     MD5_Init(&results[i]);
  867:                     MD5_Update(&results[i], (const unsigned char*)pre4, 4);
  868:                     MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
  869:                     active[i] = 0;
  870:                     left[i] = 0;
  871:                 } else {
  872:                     memcpy(temp_buffers[i], pre4, 4);
  873:                     memcpy(temp_buffers[i] + 4, buffers[i], 60);
  874:                     buffers[i] += 60;
  875:                     left[i] -= 60;
  876:                     have_any = 1;
  877:                 }
  878:             }
  879:         }
  880: 
  881:         if (have_any) {
  882:             char* ptrs[PMD5_SLOTS_MAX];
  883:             for (i = 0; i < PMD5_SLOTS_MAX; i++) {
  884:                 ptrs[i] = &temp_buffers[i][0];
  885:             }
  886:             if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
  887:                 return 0;
  888:             }
  889:         }
  890:     }
  891: 
  892:     int failed = 0;
  893:     while (true) {
  894:         for (i = 0; i < slots; i++) {
  895:             if (active[i] && (left[i] < 64)) {
  896:                 if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
  897:                     failed = 1;
  898:                 }
  899:                 active[i] = 0;
  900:             }
  901:         }
  902: 
  903:         uint64_t shortest = 0;
  904:         for (i = 0; i < slots; i++) {
  905:             if (!active[i]) {
  906:                 buffers[i] = NULL;
  907:             } else if ((shortest == 0) || (left[i] < shortest)) {
  908:                 shortest = left[i];
  909:             }
  910:         }
  911: 
  912:         if (shortest > 0) {
  913:             shortest = shortest & ~63;
  914:             if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
  915:                 failed = 1;
  916:             }
  917:             for (i = 0; i < slots; i++) {
  918:                 if (active[i]) {
  919:                     left[i] -= shortest;
  920:                 }
  921:             }
  922:         }
  923: 
  924:         if (failed) {
  925:             return 0;
  926:         } else {
  927:             int have_any = 0;
  928:             for (i = 0; i < slots; i++) {
  929:                 have_any |= active[i];
  930:             }
  931:             if (!have_any) {
  932:                 break;
  933:             }
  934:         }
  935:     }
  936: 
  937:     for (i = 0; i < slots; i++) {
  938:         if (i < streams) {
  939:             if (left[i] > 0) {
  940:                 // buffer[i] == NULL here
  941:                 MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
  942:             }
  943:             if (post4) {
  944:                 MD5_Update(&results[i], (const unsigned char*)post4, 4);
  945:             }
  946:             if (sum[i]) {
  947:                 MD5_Final((uint8_t*)sum[i], &results[i]);
  948:             }
  949:         }
  950:     }
  951: 
  952:     return 1;
  953: }
  954: 
  955: // each pmd5_context needs to be 32-byte aligned
  956: #define MD5P8_Contexts_simd(ctx, index) ((pmd5_context*)((((uintptr_t)((ctx)->context_storage) + 31) & ~31) + (index)*((sizeof(pmd5_context) + 31) & ~31)))
  957: 
  958: static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
  959: {
  960:     int i;
  961:     for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
  962:         pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
  963:     }
  964:     ctx->used = 0;
  965:     ctx->next = 0;
  966: }
  967: 
  968: static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
  969: {
  970:     int slots = pmd5_slots();
  971:     uint32 pos = 0;
  972: 
  973:     if ((ctx->used) || (length < 512)) {
  974:         int cpy = MIN(length, 512 - ctx->used);
  975:         memcpy(&ctx->buffer[ctx->used], input, cpy);
  976:         ctx->used += cpy;
  977:         length -= cpy;
  978:         pos += cpy;
  979: 
  980:         if (ctx->used == 512) {
  981:             if (slots == PMD5_SLOTS_AVX2) {
  982:                 const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
  983:                     (uint8_t*)ctx->buffer,
  984:                     (uint8_t*)(ctx->buffer + 64),
  985:                     (uint8_t*)(ctx->buffer + 128),
  986:                     (uint8_t*)(ctx->buffer + 192),
  987:                     (uint8_t*)(ctx->buffer + 256),
  988:                     (uint8_t*)(ctx->buffer + 320),
  989:                     (uint8_t*)(ctx->buffer + 384),
  990:                     (uint8_t*)(ctx->buffer + 448)
  991:                 };
  992:                 pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
  993:             } else {
  994:                 const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
  995:                     (uint8_t*)ctx->buffer,
  996:                     (uint8_t*)(ctx->buffer + 64),
  997:                     (uint8_t*)(ctx->buffer + 128),
  998:                     (uint8_t*)(ctx->buffer + 192)
  999:                 };
 1000:                 const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
 1001:                     (uint8_t*)(ctx->buffer + 256),
 1002:                     (uint8_t*)(ctx->buffer + 320),
 1003:                     (uint8_t*)(ctx->buffer + 384),
 1004:                     (uint8_t*)(ctx->buffer + 448)
 1005:                 };
 1006:                 pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
 1007:                 pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
 1008:             }
 1009:             ctx->used = 0;
 1010:         }
 1011:     }
 1012: 
 1013:     if (length >= 512) {
 1014:         uint32 blocks = length / 512;
 1015:         if (slots == PMD5_SLOTS_AVX2) {
 1016:             const uint8_t* ptrs[8] = {
 1017:                 (uint8_t*)(input + pos),
 1018:                 (uint8_t*)(input + pos + 64),
 1019:                 (uint8_t*)(input + pos + 128),
 1020:                 (uint8_t*)(input + pos + 192),
 1021:                 (uint8_t*)(input + pos + 256),
 1022:                 (uint8_t*)(input + pos + 320),
 1023:                 (uint8_t*)(input + pos + 384),
 1024:                 (uint8_t*)(input + pos + 448)
 1025:             };
 1026:             pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
 1027:         } else {
 1028:             const uint8_t* ptrs1[4] = {
 1029:                 (uint8_t*)(input + pos),
 1030:                 (uint8_t*)(input + pos + 64),
 1031:                 (uint8_t*)(input + pos + 128),
 1032:                 (uint8_t*)(input + pos + 192)
 1033:             };
 1034:             const uint8_t* ptrs2[4] = {
 1035:                 (uint8_t*)(input + pos + 256),
 1036:                 (uint8_t*)(input + pos + 320),
 1037:                 (uint8_t*)(input + pos + 384),
 1038:                 (uint8_t*)(input + pos + 448)
 1039:             };
 1040:             pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
 1041:             pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
 1042:         }
 1043:         pos += blocks * 512;
 1044:         length -= blocks * 512;
 1045:     }
 1046: 
 1047:     if (length) {
 1048:         memcpy(ctx->buffer, &input[pos], length);
 1049:         ctx->used = length;
 1050:     }
 1051: }
 1052: 
 1053: static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1054: {
 1055:     int i;
 1056:     uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
 1057:     if (ctx->used) {
 1058:         uchar tmp[512];
 1059:         memset(tmp, 0, 512);
 1060:         MD5P8_Update(ctx, tmp, 512 - ctx->used);
 1061:     }
 1062: 
 1063:     uchar state[34*4] = {0};
 1064: 
 1065:     MD5_CTX tmp;
 1066:     for (i = 0; i < 8; i++) {
 1067:         if (pmd5_slots() == PMD5_SLOTS_AVX2) {
 1068:             pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1069:         } else if (i < 4) {
 1070:             pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
 1071:         } else {
 1072:             pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
 1073:         }
 1074: #ifdef USE_OPENSSL
 1075:         if (low + tmp.Nl < low) high++;
 1076:         low += tmp.Nl;
 1077:         high += tmp.Nh;
 1078: #else
 1079:         if (low + tmp.totalN < low) high++;
 1080:         low += tmp.totalN;
 1081:         high += tmp.totalN2;
 1082: #endif
 1083:         SIVALu(state, i*16, tmp.A);
 1084:         SIVALu(state, i*16 + 4, tmp.B);
 1085:         SIVALu(state, i*16 + 8, tmp.C);
 1086:         SIVALu(state, i*16 + 12, tmp.D);
 1087:     }
 1088: 
 1089: #ifndef USE_OPENSSL
 1090:     high = (low >> 29) | (high << 3);
 1091:     low = (low << 3);
 1092: #endif
 1093: 
 1094:     sub <<= 3;
 1095:     if (low - sub > low) high--;
 1096:     low -= sub;
 1097: 
 1098:     SIVALu(state, 32*4, low);
 1099:     SIVALu(state, 33*4, high);
 1100: 
 1101:     MD5_CTX md;
 1102:     MD5_Init(&md);
 1103:     MD5_Update(&md, state, 34*4);
 1104:     MD5_Final(digest, &md);
 1105: }
 1106: 
 1107: extern "C" {
 1108: 
 1109: int md5_parallel_slots()
 1110: {
 1111:     return md5_parallel_slots_cpp();
 1112: }
 1113: 
 1114: int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
 1115: {
 1116:     return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
 1117: }
 1118: 
 1119: void MD5P8_Init(MD5P8_CTX *ctx)
 1120: {
 1121:     MD5P8_Init_cpp(ctx);
 1122: }
 1123: 
 1124: void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
 1125: {
 1126:     MD5P8_Update_cpp(ctx, input, length);
 1127: }
 1128: 
 1129: void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
 1130: {
 1131:     MD5P8_Final_cpp(digest, ctx);
 1132: }
 1133: 
 1134: } // "C"
 1135: 
 1136: #endif /* HAVE_SIMD */
 1137: #endif /* __cplusplus */
 1138: #endif /* __x86_64__ */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>