1: /*
2: * SSE2/AVX2-optimized routines to process multiple MD5 streams in parallel.
3: *
4: * Original author: Nicolas Noble, 2017
5: * Modifications: Jorrit Jongma, 2020
6: *
7: * The original code was released in the public domain by the original author,
8: * falling back to the MIT license ( http://www.opensource.org/licenses/MIT )
9: * in case public domain does not apply in your country. These modifications
10: * are likewise released in the public domain, with the same MIT license
11: * fallback.
12: *
13: * The original publication can be found at:
14: *
15: * https://github.com/nicolasnoble/sse-hash
16: */
17: /*
18: * Nicolas' original code has been extended to add AVX2 support, all non-SIMD
19: * MD5 code has been removed and those code paths rerouted to use the MD5
20: * code already present in rsync, and wrapper functions have been added. The
21: * MD5P8 code is also new, and is the reason for the new stride parameter.
22: *
23: * This code allows multiple independent MD5 streams to be processed in
24: * parallel, 4 with SSE2, 8 with AVX2. While single-stream performance is
25: * lower than that of the original C routines for MD5, the processing of
26: * additional streams is "for free".
27: *
28: * Single streams are rerouted to rsync's normal MD5 code as that is faster
29: * for that case. A further optimization is possible by using SSE2 code on
30: * AVX2-supporting CPUs when the number of streams is 2, 3, or 4. This is not
* implemented here as it would require some restructuring, and in practice
* the code here is only rarely called with fewer than the maximum number of
* streams (typically once at the end of each checksum2'd file).
34: *
35: * Benchmarks (in MB/s) C ASM SSE2*1 SSE2*4 AVX2*1 AVX2*8
36: * - Intel Atom D2700 302 334 166 664 N/A N/A
37: * - Intel i7-7700hq 351 376 289 1156 273 2184
38: * - AMD ThreadRipper 2950x 728 784 568 2272 430 3440
39: */
40:
41: #ifdef __x86_64__
42: #ifdef __cplusplus
43:
44: extern "C" {
45:
46: #include "rsync.h"
47:
48: }
49:
50: #ifdef HAVE_SIMD
51:
/* Compile-time switches selecting which SIMD variants are built.
   Unless BENCHMARK_SIMD_CHECKSUM2 is defined, both the SSE2 and the AVX2
   variants are enabled here. When it is defined, this block enables
   neither of them. */
#ifndef BENCHMARK_SIMD_CHECKSUM2
#define PMD5_ALLOW_SSE2 // debugging
#define PMD5_ALLOW_AVX2 // debugging
#endif

/* Enabling AVX2 always enables SSE2 as well. */
#ifdef PMD5_ALLOW_AVX2
#ifndef PMD5_ALLOW_SSE2
#define PMD5_ALLOW_SSE2
#endif
#endif
62:
63: #include <stdint.h>
64: #include <string.h>
65:
66: #include <immintrin.h>
67:
/* Storage-class specifier used on every multi-versioned function in this file.
   Some clang versions produce linker errors when `static` is combined with
   multi-versioned functions, so MVSTATIC expands to nothing on clang and to
   `static` on every other compiler. */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif
74:
// Vector types declared with 1-byte alignment and may_alias, used below for
// the pointer casts passed to the store/load intrinsics.
// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
78:
/* Number of parallel MD5 streams ("slots") offered by each implementation:
   0 when no SIMD variant is selected, 4 for SSE2 and 8 for AVX2.
   PMD5_SLOTS_MAX sizes every per-slot array in this file. */
#define PMD5_SLOTS_DEFAULT 0
#define PMD5_SLOTS_SSE2 4
#define PMD5_SLOTS_AVX2 8
#define PMD5_SLOTS_MAX PMD5_SLOTS_AVX2
83:
/* pmd5_slots() reports how many streams the selected implementation handles
   in parallel. It is defined once per target attribute ("sse2", "avx2",
   "default"); which definition a call resolves to is decided by the
   compiler's function multi-versioning, not by code in this file. */

#ifdef PMD5_ALLOW_SSE2
/* SSE2 variant: 4 slots. */
__attribute__ ((target("sse2"))) MVSTATIC int pmd5_slots()
{
	return PMD5_SLOTS_SSE2;
}
#endif

#ifdef PMD5_ALLOW_AVX2
/* AVX2 variant: 8 slots. */
__attribute__ ((target("avx2"))) MVSTATIC int pmd5_slots()
{
	return PMD5_SLOTS_AVX2;
}
#endif

/* Fallback variant: 0 slots, i.e. no parallel implementation available. */
__attribute__ ((target("default"))) MVSTATIC int pmd5_slots()
{
	return PMD5_SLOTS_DEFAULT;
}
102:
/* The parallel MD5 context structure.
   state_sse2[k] / state_avx2[k] hold MD5 state word k (A, B, C, D for
   k = 0..3) of every slot, one 32-bit lane per slot. The SSE2 functions only
   touch state_sse2 and the AVX2 functions only touch state_avx2.
   len[slot] accumulates the number of bytes hashed into that slot. */
typedef struct {
	__m128i state_sse2[4];
	__m256i state_avx2[4];
	uint64_t len[PMD5_SLOTS_MAX];
} pmd5_context;
109:
/* The status returned by the various functions below.
   PMD5_SUCCESS          - the operation completed.
   PMD5_INVALID_SLOT     - the slot index is out of range for the selected
                           implementation; also returned by the "default"
                           (non-SIMD) variants of the per-slot functions.
   PMD5_UNALIGNED_UPDATE - an update length was not a multiple of 64. */
typedef enum {
	PMD5_SUCCESS,
	PMD5_INVALID_SLOT,
	PMD5_UNALIGNED_UPDATE,
} pmd5_status;
116:
/* Initializes all slots in the given pmd5 context. */
__attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx);

/* Initializes a single slot in the given pmd5 context. */
static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot);

/* Makes an MD5 update on all slots in parallel, given the exact same length on all streams.
   The stream pointers will be incremented accordingly.
   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
   The length argument NEEDS to be a multiple of 64. If not, an error is returned, and the context is corrupted.
   Stride is the distance in bytes between consecutive 64-byte blocks of a stream; it defaults to 64 if 0 is passed. */
static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride);

/* Makes an MD5 update on all slots in parallel, given different lengths.
   The stream pointers will be incremented accordingly.
   The lengths will be decreased accordingly. Not all data might be consumed.
   It is valid for a stream pointer to be NULL. Garbage will then be hashed into its corresponding slot.
   The lengths argument NEEDS to contain only multiples of 64. If not, an error is returned, and the context is corrupted. */
static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX]);

/* Finishes all slots at once. Fills in all digests. */
static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN]);

/* Finishes one slot. The other slots will be unaffected. The finished slot can then continue to hash garbage using
   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
static pmd5_status pmd5_finish_slot(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot);

/* Finishes one slot. Extra data is allowed to be passed on as an argument. Length DOESN'T need to be a
   multiple of 64. The other slots will be unaffected. The finished slot can then continue to hash garbage using
   a NULL pointer as its stream argument, or needs to be reinitialized using pmd5_init_slot before being usable again. */
static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * ctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length);

/* Insert a normal MD5 context into a given slot of a given parallel MD5 context. */
static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot);

/* Extract a normal MD5 context from a given slot of a given parallel MD5 context. */
static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot);
154:
/* Left-rotate amounts used by the MD5 steps. Sxy is the amount for the y-th
   step pattern of round x (round 1 = F, 2 = G, 3 = H, 4 = I); each round
   cycles through its four amounts four times. */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21
171:
/* Additive constants for the 64 MD5 steps; step i (1-based) adds Ti. The
   SET_* macros select them by token pasting (T##ac).
   NOTE(review): these look like the sine-derived table from RFC 1321 -
   verify against the RFC before changing any value. */
#define T1 0xD76AA478
#define T2 0xE8C7B756
#define T3 0x242070DB
#define T4 0xC1BDCEEE
#define T5 0xF57C0FAF
#define T6 0x4787C62A
#define T7 0xA8304613
#define T8 0xFD469501
#define T9 0x698098D8
#define T10 0x8B44F7AF
#define T11 0xFFFF5BB1
#define T12 0x895CD7BE
#define T13 0x6B901122
#define T14 0xFD987193
#define T15 0xA679438E
#define T16 0x49B40821
#define T17 0xF61E2562
#define T18 0xC040B340
#define T19 0x265E5A51
#define T20 0xE9B6C7AA
#define T21 0xD62F105D
#define T22 0x02441453
#define T23 0xD8A1E681
#define T24 0xE7D3FBC8
#define T25 0x21E1CDE6
#define T26 0xC33707D6
#define T27 0xF4D50D87
#define T28 0x455A14ED
#define T29 0xA9E3E905
#define T30 0xFCEFA3F8
#define T31 0x676F02D9
#define T32 0x8D2A4C8A
#define T33 0xFFFA3942
#define T34 0x8771F681
#define T35 0x6D9D6122
#define T36 0xFDE5380C
#define T37 0xA4BEEA44
#define T38 0x4BDECFA9
#define T39 0xF6BB4B60
#define T40 0xBEBFBC70
#define T41 0x289B7EC6
#define T42 0xEAA127FA
#define T43 0xD4EF3085
#define T44 0x04881D05
#define T45 0xD9D4D039
#define T46 0xE6DB99E5
#define T47 0x1FA27CF8
#define T48 0xC4AC5665
#define T49 0xF4292244
#define T50 0x432AFF97
#define T51 0xAB9423A7
#define T52 0xFC93A039
#define T53 0x655B59C3
#define T54 0x8F0CCC92
#define T55 0xFFEFF47D
#define T56 0x85845DD1
#define T57 0x6FA87E4F
#define T58 0xFE2CE6E0
#define T59 0xA3014314
#define T60 0x4E0811A1
#define T61 0xF7537E82
#define T62 0xBD3AF235
#define T63 0x2AD7D2BB
#define T64 0xEB86D391
236:
/* Rotates every 32-bit lane of x left by n bits, in place (x is assigned).
   Implemented as (x >> (32 - n)) | (x << n) because no rotate intrinsic is
   used here. x must be an lvalue; n is evaluated twice. */
#define ROTL_SSE2(x, n) { \
	__m128i s; \
	s = _mm_srli_epi32(x, 32 - n); \
	x = _mm_slli_epi32(x, n); \
	x = _mm_or_si128(x, s); \
};

/* 256-bit counterpart of ROTL_SSE2: rotates each of the eight lanes. */
#define ROTL_AVX2(x, n) { \
	__m256i s; \
	s = _mm256_srli_epi32(x, 32 - n); \
	x = _mm256_slli_epi32(x, n); \
	x = _mm256_or_si256(x, s); \
};

/* The four MD5 round functions, applied lane-wise:
     F(x, y, z) = (x & y) | (~x & z)
     G(x, y, z) = (x & z) | (~z & y)
     H(x, y, z) = x ^ y ^ z
     I(x, y, z) = y ^ (x | ~z)      (~z is formed as andnot(z, all-ones)) */
#define F_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, y), _mm_andnot_si128(x, z))
#define G_SSE2(x, y, z) _mm_or_si128(_mm_and_si128(x, z), _mm_andnot_si128(z, y))
#define H_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128(x, y), z)
#define I_SSE2(x, y, z) _mm_xor_si128(y, _mm_or_si128(x, _mm_andnot_si128(z, _mm_set1_epi32(0xffffffff))))

/* Same round functions on 256-bit vectors. */
#define F_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
#define G_AVX2(x, y, z) _mm256_or_si256(_mm256_and_si256(x, z), _mm256_andnot_si256(z, y))
#define H_AVX2(x, y, z) _mm256_xor_si256(_mm256_xor_si256(x, y), z)
#define I_AVX2(x, y, z) _mm256_xor_si256(y, _mm256_or_si256(x, _mm256_andnot_si256(z, _mm256_set1_epi32(0xffffffff))))

/* One MD5 step on all lanes:
     a = b + rotl(a + step(b, c, d) + x + T<ac>, s)
   `step` is one of F, G, H, I (pasted with _SSE2), x is the message word
   vector, s the rotate amount and ac the 1-based index of the T constant.
   a is modified in place. */
#define SET_SSE2(step, a, b, c, d, x, s, ac) { \
	a = _mm_add_epi32(_mm_add_epi32(a, _mm_add_epi32(x, _mm_set1_epi32(T##ac))), step##_SSE2(b, c, d)); \
	ROTL_SSE2(a, s); \
	a = _mm_add_epi32(a, b); \
}

/* 256-bit counterpart of SET_SSE2. */
#define SET_AVX2(step, a, b, c, d, x, s, ac) { \
	a = _mm256_add_epi32(_mm256_add_epi32(a, _mm256_add_epi32(x, _mm256_set1_epi32(T##ac))), step##_AVX2(b, c, d)); \
	ROTL_AVX2(a, s); \
	a = _mm256_add_epi32(a, b); \
}
272:
/* Initial values of the four MD5 state words A, B, C and D; broadcast into
   every lane by pmd5_init_all() and written into one lane by
   pmd5_init_slot(). */
#define IA 0x67452301
#define IB 0xefcdab89
#define IC 0x98badcfe
#define ID 0x10325476
277:
/* Assigns to dest the 32-bit little-endian word stored at src[pos..pos+3].
   Expands to an assignment expression without a trailing semicolon; the
   arguments are not parenthesized, so pass simple expressions only. */
#define GET_MD5_DATA(dest, src, pos) \
	dest = \
		((uint32_t) src[pos + 0]) << 0 | \
		((uint32_t) src[pos + 1]) << 8 | \
		((uint32_t) src[pos + 2]) << 16 | \
		((uint32_t) src[pos + 3]) << 24

/* Gathers the little-endian word at byte offset pos from each of the four
   streams src[0..3] and packs them into dest, with stream i in lane i. */
#define GET_PMD5_DATA_SSE2(dest, src, pos) { \
	uint32_t v0, v1, v2, v3; \
	GET_MD5_DATA(v0, src[0], pos); \
	GET_MD5_DATA(v1, src[1], pos); \
	GET_MD5_DATA(v2, src[2], pos); \
	GET_MD5_DATA(v3, src[3], pos); \
	dest = _mm_setr_epi32(v0, v1, v2, v3); \
}

/* Same as GET_PMD5_DATA_SSE2 for the eight streams src[0..7]. */
#define GET_PMD5_DATA_AVX2(dest, src, pos) { \
	uint32_t v0, v1, v2, v3; \
	uint32_t v4, v5, v6, v7; \
	GET_MD5_DATA(v0, src[0], pos); \
	GET_MD5_DATA(v1, src[1], pos); \
	GET_MD5_DATA(v2, src[2], pos); \
	GET_MD5_DATA(v3, src[3], pos); \
	GET_MD5_DATA(v4, src[4], pos); \
	GET_MD5_DATA(v5, src[5], pos); \
	GET_MD5_DATA(v6, src[6], pos); \
	GET_MD5_DATA(v7, src[7], pos); \
	dest = _mm256_setr_epi32(v0, v1, v2, v3, \
		v4, v5, v6, v7); \
}

/* Stores the 32-bit value val at dest[pos..pos+3] in little-endian order. */
#define PUT_MD5_DATA(dest, val, pos) { \
	dest[pos + 0] = (val >> 0) & 0xff; \
	dest[pos + 1] = (val >> 8) & 0xff; \
	dest[pos + 2] = (val >> 16) & 0xff; \
	dest[pos + 3] = (val >> 24) & 0xff; \
}
315:
/* One 64-byte block consisting of 0x80 followed by 63 zero bytes. In this
   file it is the data source substituted for NULL stream pointers. Only the
   first element is spelled out; the remaining elements are zero-initialized. */
static const uint8_t md5_padding[64] = { 0x80 };
326:
327: #ifdef PMD5_ALLOW_SSE2
328: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
329: {
330: int i;
331: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
332: ctx->len[i] = 0;
333: }
334:
335: ctx->state_sse2[0] = _mm_set1_epi32(IA);
336: ctx->state_sse2[1] = _mm_set1_epi32(IB);
337: ctx->state_sse2[2] = _mm_set1_epi32(IC);
338: ctx->state_sse2[3] = _mm_set1_epi32(ID);
339:
340: return PMD5_SUCCESS;
341: }
342: #endif
343:
344: #ifdef PMD5_ALLOW_AVX2
345: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
346: {
347: int i;
348: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
349: ctx->len[i] = 0;
350: }
351:
352: ctx->state_avx2[0] = _mm256_set1_epi32(IA);
353: ctx->state_avx2[1] = _mm256_set1_epi32(IB);
354: ctx->state_avx2[2] = _mm256_set1_epi32(IC);
355: ctx->state_avx2[3] = _mm256_set1_epi32(ID);
356:
357: return PMD5_SUCCESS;
358: }
359: #endif
360:
361: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_init_all(pmd5_context * ctx)
362: {
363: return PMD5_INVALID_SLOT;
364: }
365:
366: #ifdef PMD5_ALLOW_SSE2
367: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
368: {
369: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
370: return PMD5_INVALID_SLOT;
371:
372: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
373: int i;
374:
375: for (i = 0; i < 4; i++) {
376: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
377: }
378:
379: v[0][slot] = a;
380: v[1][slot] = b;
381: v[2][slot] = c;
382: v[3][slot] = d;
383:
384: for (i = 0; i < 4; i++) {
385: ctx->state_sse2[i] = _mm_loadu_si128((__m128i_u*)v[i]);
386: }
387:
388: return PMD5_SUCCESS;
389: }
390: #endif
391:
392: #ifdef PMD5_ALLOW_AVX2
393: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
394: {
395: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
396: return PMD5_INVALID_SLOT;
397:
398: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
399: int i;
400:
401: for (i = 0; i < 4; i++) {
402: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
403: }
404:
405: v[0][slot] = a;
406: v[1][slot] = b;
407: v[2][slot] = c;
408: v[3][slot] = d;
409:
410: for (i = 0; i < 4; i++) {
411: ctx->state_avx2[i] = _mm256_lddqu_si256((__m256i_u*)v[i]);
412: }
413:
414: return PMD5_SUCCESS;
415: }
416: #endif
417:
418: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_set_slot(pmd5_context * ctx, int slot, uint32_t a, uint32_t b, uint32_t c, uint32_t d)
419: {
420: return PMD5_INVALID_SLOT;
421: }
422:
423: #ifdef PMD5_ALLOW_SSE2
424: __attribute__ ((target("sse2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
425: {
426: if ((slot >= PMD5_SLOTS_SSE2) || (slot < 0))
427: return PMD5_INVALID_SLOT;
428:
429: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_SSE2];
430: int i;
431:
432: for (i = 0; i < 4; i++) {
433: _mm_store_si128((__m128i_u*)v[i], ctx->state_sse2[i]);
434: }
435:
436: *a = v[0][slot];
437: *b = v[1][slot];
438: *c = v[2][slot];
439: *d = v[3][slot];
440:
441: return PMD5_SUCCESS;
442: }
443: #endif
444:
445: #ifdef PMD5_ALLOW_AVX2
446: __attribute__ ((target("avx2"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
447: {
448: if ((slot >= PMD5_SLOTS_AVX2) || (slot < 0))
449: return PMD5_INVALID_SLOT;
450:
451: __attribute__ ((aligned(32))) uint32_t v[4][PMD5_SLOTS_AVX2];
452: int i;
453:
454: for (i = 0; i < 4; i++) {
455: _mm256_store_si256((__m256i_u*)v[i], ctx->state_avx2[i]);
456: }
457:
458: *a = v[0][slot];
459: *b = v[1][slot];
460: *c = v[2][slot];
461: *d = v[3][slot];
462:
463: return PMD5_SUCCESS;
464: }
465: #endif
466:
467: __attribute__ ((target("default"))) MVSTATIC pmd5_status pmd5_get_slot(const pmd5_context * ctx, int slot, uint32_t* a, uint32_t* b, uint32_t* c, uint32_t* d)
468: {
469: return PMD5_INVALID_SLOT;
470: }
471:
472: static pmd5_status pmd5_init_slot(pmd5_context * ctx, int slot)
473: {
474: return pmd5_set_slot(ctx, slot, IA, IB, IC, ID);
475: }
476:
#ifdef PMD5_ALLOW_SSE2
/* SSE2 variant of the MD5 compression function: consumes one 64-byte block
   from each of the four streams data[0..3] and folds it into
   ctx->state_sse2, with lane i of every vector belonging to stream i.
   All four pointers are dereferenced, so none of them may be NULL here
   (pmd5_update_all_simple() substitutes md5_padding for NULL streams).
   Neither the data pointers nor ctx->len are modified. */
__attribute__ ((target("sse2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
	/* NOTE(review): W holds the 16 message words of the block and is indexed
	   0..15, but is dimensioned with MD5_DIGEST_LEN - presumably that equals
	   16; confirm against its definition. */
	__m128i W[MD5_DIGEST_LEN], a, b, c, d;

	/* Transpose the input: W[k] holds little-endian word k of every stream. */
	GET_PMD5_DATA_SSE2(W[ 0], data, 0);
	GET_PMD5_DATA_SSE2(W[ 1], data, 4);
	GET_PMD5_DATA_SSE2(W[ 2], data, 8);
	GET_PMD5_DATA_SSE2(W[ 3], data, 12);
	GET_PMD5_DATA_SSE2(W[ 4], data, 16);
	GET_PMD5_DATA_SSE2(W[ 5], data, 20);
	GET_PMD5_DATA_SSE2(W[ 6], data, 24);
	GET_PMD5_DATA_SSE2(W[ 7], data, 28);
	GET_PMD5_DATA_SSE2(W[ 8], data, 32);
	GET_PMD5_DATA_SSE2(W[ 9], data, 36);
	GET_PMD5_DATA_SSE2(W[10], data, 40);
	GET_PMD5_DATA_SSE2(W[11], data, 44);
	GET_PMD5_DATA_SSE2(W[12], data, 48);
	GET_PMD5_DATA_SSE2(W[13], data, 52);
	GET_PMD5_DATA_SSE2(W[14], data, 56);
	GET_PMD5_DATA_SSE2(W[15], data, 60);

	/* Work on copies so the previous state can be added back at the end. */
	a = ctx->state_sse2[0];
	b = ctx->state_sse2[1];
	c = ctx->state_sse2[2];
	d = ctx->state_sse2[3];

	/* Round 1: F, constants T1..T16. */
	SET_SSE2(F, a, b, c, d, W[ 0], S11, 1);
	SET_SSE2(F, d, a, b, c, W[ 1], S12, 2);
	SET_SSE2(F, c, d, a, b, W[ 2], S13, 3);
	SET_SSE2(F, b, c, d, a, W[ 3], S14, 4);
	SET_SSE2(F, a, b, c, d, W[ 4], S11, 5);
	SET_SSE2(F, d, a, b, c, W[ 5], S12, 6);
	SET_SSE2(F, c, d, a, b, W[ 6], S13, 7);
	SET_SSE2(F, b, c, d, a, W[ 7], S14, 8);
	SET_SSE2(F, a, b, c, d, W[ 8], S11, 9);
	SET_SSE2(F, d, a, b, c, W[ 9], S12, 10);
	SET_SSE2(F, c, d, a, b, W[10], S13, 11);
	SET_SSE2(F, b, c, d, a, W[11], S14, 12);
	SET_SSE2(F, a, b, c, d, W[12], S11, 13);
	SET_SSE2(F, d, a, b, c, W[13], S12, 14);
	SET_SSE2(F, c, d, a, b, W[14], S13, 15);
	SET_SSE2(F, b, c, d, a, W[15], S14, 16);

	/* Round 2: G, constants T17..T32. */
	SET_SSE2(G, a, b, c, d, W[ 1], S21, 17);
	SET_SSE2(G, d, a, b, c, W[ 6], S22, 18);
	SET_SSE2(G, c, d, a, b, W[11], S23, 19);
	SET_SSE2(G, b, c, d, a, W[ 0], S24, 20);
	SET_SSE2(G, a, b, c, d, W[ 5], S21, 21);
	SET_SSE2(G, d, a, b, c, W[10], S22, 22);
	SET_SSE2(G, c, d, a, b, W[15], S23, 23);
	SET_SSE2(G, b, c, d, a, W[ 4], S24, 24);
	SET_SSE2(G, a, b, c, d, W[ 9], S21, 25);
	SET_SSE2(G, d, a, b, c, W[14], S22, 26);
	SET_SSE2(G, c, d, a, b, W[ 3], S23, 27);
	SET_SSE2(G, b, c, d, a, W[ 8], S24, 28);
	SET_SSE2(G, a, b, c, d, W[13], S21, 29);
	SET_SSE2(G, d, a, b, c, W[ 2], S22, 30);
	SET_SSE2(G, c, d, a, b, W[ 7], S23, 31);
	SET_SSE2(G, b, c, d, a, W[12], S24, 32);

	/* Round 3: H, constants T33..T48. */
	SET_SSE2(H, a, b, c, d, W[ 5], S31, 33);
	SET_SSE2(H, d, a, b, c, W[ 8], S32, 34);
	SET_SSE2(H, c, d, a, b, W[11], S33, 35);
	SET_SSE2(H, b, c, d, a, W[14], S34, 36);
	SET_SSE2(H, a, b, c, d, W[ 1], S31, 37);
	SET_SSE2(H, d, a, b, c, W[ 4], S32, 38);
	SET_SSE2(H, c, d, a, b, W[ 7], S33, 39);
	SET_SSE2(H, b, c, d, a, W[10], S34, 40);
	SET_SSE2(H, a, b, c, d, W[13], S31, 41);
	SET_SSE2(H, d, a, b, c, W[ 0], S32, 42);
	SET_SSE2(H, c, d, a, b, W[ 3], S33, 43);
	SET_SSE2(H, b, c, d, a, W[ 6], S34, 44);
	SET_SSE2(H, a, b, c, d, W[ 9], S31, 45);
	SET_SSE2(H, d, a, b, c, W[12], S32, 46);
	SET_SSE2(H, c, d, a, b, W[15], S33, 47);
	SET_SSE2(H, b, c, d, a, W[ 2], S34, 48);

	/* Round 4: I, constants T49..T64. */
	SET_SSE2(I, a, b, c, d, W[ 0], S41, 49);
	SET_SSE2(I, d, a, b, c, W[ 7], S42, 50);
	SET_SSE2(I, c, d, a, b, W[14], S43, 51);
	SET_SSE2(I, b, c, d, a, W[ 5], S44, 52);
	SET_SSE2(I, a, b, c, d, W[12], S41, 53);
	SET_SSE2(I, d, a, b, c, W[ 3], S42, 54);
	SET_SSE2(I, c, d, a, b, W[10], S43, 55);
	SET_SSE2(I, b, c, d, a, W[ 1], S44, 56);
	SET_SSE2(I, a, b, c, d, W[ 8], S41, 57);
	SET_SSE2(I, d, a, b, c, W[15], S42, 58);
	SET_SSE2(I, c, d, a, b, W[ 6], S43, 59);
	SET_SSE2(I, b, c, d, a, W[13], S44, 60);
	SET_SSE2(I, a, b, c, d, W[ 4], S41, 61);
	SET_SSE2(I, d, a, b, c, W[11], S42, 62);
	SET_SSE2(I, c, d, a, b, W[ 2], S43, 63);
	SET_SSE2(I, b, c, d, a, W[ 9], S44, 64);

	/* Feed-forward: add the block's result to the previous state. */
	ctx->state_sse2[0] = _mm_add_epi32(ctx->state_sse2[0], a);
	ctx->state_sse2[1] = _mm_add_epi32(ctx->state_sse2[1], b);
	ctx->state_sse2[2] = _mm_add_epi32(ctx->state_sse2[2], c);
	ctx->state_sse2[3] = _mm_add_epi32(ctx->state_sse2[3], d);
}
#endif
578:
#ifdef PMD5_ALLOW_AVX2
/* AVX2 variant of the MD5 compression function: consumes one 64-byte block
   from each of the eight streams data[0..7] and folds it into
   ctx->state_avx2, with lane i of every vector belonging to stream i.
   All eight pointers are dereferenced, so none of them may be NULL here
   (pmd5_update_all_simple() substitutes md5_padding for NULL streams).
   Neither the data pointers nor ctx->len are modified. */
__attribute__ ((target("avx2"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
	/* NOTE(review): W holds the 16 message words of the block and is indexed
	   0..15, but is dimensioned with MD5_DIGEST_LEN - presumably that equals
	   16; confirm against its definition. */
	__m256i W[MD5_DIGEST_LEN], a, b, c, d;

	/* Transpose the input: W[k] holds little-endian word k of every stream. */
	GET_PMD5_DATA_AVX2(W[ 0], data, 0);
	GET_PMD5_DATA_AVX2(W[ 1], data, 4);
	GET_PMD5_DATA_AVX2(W[ 2], data, 8);
	GET_PMD5_DATA_AVX2(W[ 3], data, 12);
	GET_PMD5_DATA_AVX2(W[ 4], data, 16);
	GET_PMD5_DATA_AVX2(W[ 5], data, 20);
	GET_PMD5_DATA_AVX2(W[ 6], data, 24);
	GET_PMD5_DATA_AVX2(W[ 7], data, 28);
	GET_PMD5_DATA_AVX2(W[ 8], data, 32);
	GET_PMD5_DATA_AVX2(W[ 9], data, 36);
	GET_PMD5_DATA_AVX2(W[10], data, 40);
	GET_PMD5_DATA_AVX2(W[11], data, 44);
	GET_PMD5_DATA_AVX2(W[12], data, 48);
	GET_PMD5_DATA_AVX2(W[13], data, 52);
	GET_PMD5_DATA_AVX2(W[14], data, 56);
	GET_PMD5_DATA_AVX2(W[15], data, 60);

	/* Work on copies so the previous state can be added back at the end. */
	a = ctx->state_avx2[0];
	b = ctx->state_avx2[1];
	c = ctx->state_avx2[2];
	d = ctx->state_avx2[3];

	/* Round 1: F, constants T1..T16. */
	SET_AVX2(F, a, b, c, d, W[ 0], S11, 1);
	SET_AVX2(F, d, a, b, c, W[ 1], S12, 2);
	SET_AVX2(F, c, d, a, b, W[ 2], S13, 3);
	SET_AVX2(F, b, c, d, a, W[ 3], S14, 4);
	SET_AVX2(F, a, b, c, d, W[ 4], S11, 5);
	SET_AVX2(F, d, a, b, c, W[ 5], S12, 6);
	SET_AVX2(F, c, d, a, b, W[ 6], S13, 7);
	SET_AVX2(F, b, c, d, a, W[ 7], S14, 8);
	SET_AVX2(F, a, b, c, d, W[ 8], S11, 9);
	SET_AVX2(F, d, a, b, c, W[ 9], S12, 10);
	SET_AVX2(F, c, d, a, b, W[10], S13, 11);
	SET_AVX2(F, b, c, d, a, W[11], S14, 12);
	SET_AVX2(F, a, b, c, d, W[12], S11, 13);
	SET_AVX2(F, d, a, b, c, W[13], S12, 14);
	SET_AVX2(F, c, d, a, b, W[14], S13, 15);
	SET_AVX2(F, b, c, d, a, W[15], S14, 16);

	/* Round 2: G, constants T17..T32. */
	SET_AVX2(G, a, b, c, d, W[ 1], S21, 17);
	SET_AVX2(G, d, a, b, c, W[ 6], S22, 18);
	SET_AVX2(G, c, d, a, b, W[11], S23, 19);
	SET_AVX2(G, b, c, d, a, W[ 0], S24, 20);
	SET_AVX2(G, a, b, c, d, W[ 5], S21, 21);
	SET_AVX2(G, d, a, b, c, W[10], S22, 22);
	SET_AVX2(G, c, d, a, b, W[15], S23, 23);
	SET_AVX2(G, b, c, d, a, W[ 4], S24, 24);
	SET_AVX2(G, a, b, c, d, W[ 9], S21, 25);
	SET_AVX2(G, d, a, b, c, W[14], S22, 26);
	SET_AVX2(G, c, d, a, b, W[ 3], S23, 27);
	SET_AVX2(G, b, c, d, a, W[ 8], S24, 28);
	SET_AVX2(G, a, b, c, d, W[13], S21, 29);
	SET_AVX2(G, d, a, b, c, W[ 2], S22, 30);
	SET_AVX2(G, c, d, a, b, W[ 7], S23, 31);
	SET_AVX2(G, b, c, d, a, W[12], S24, 32);

	/* Round 3: H, constants T33..T48. */
	SET_AVX2(H, a, b, c, d, W[ 5], S31, 33);
	SET_AVX2(H, d, a, b, c, W[ 8], S32, 34);
	SET_AVX2(H, c, d, a, b, W[11], S33, 35);
	SET_AVX2(H, b, c, d, a, W[14], S34, 36);
	SET_AVX2(H, a, b, c, d, W[ 1], S31, 37);
	SET_AVX2(H, d, a, b, c, W[ 4], S32, 38);
	SET_AVX2(H, c, d, a, b, W[ 7], S33, 39);
	SET_AVX2(H, b, c, d, a, W[10], S34, 40);
	SET_AVX2(H, a, b, c, d, W[13], S31, 41);
	SET_AVX2(H, d, a, b, c, W[ 0], S32, 42);
	SET_AVX2(H, c, d, a, b, W[ 3], S33, 43);
	SET_AVX2(H, b, c, d, a, W[ 6], S34, 44);
	SET_AVX2(H, a, b, c, d, W[ 9], S31, 45);
	SET_AVX2(H, d, a, b, c, W[12], S32, 46);
	SET_AVX2(H, c, d, a, b, W[15], S33, 47);
	SET_AVX2(H, b, c, d, a, W[ 2], S34, 48);

	/* Round 4: I, constants T49..T64. */
	SET_AVX2(I, a, b, c, d, W[ 0], S41, 49);
	SET_AVX2(I, d, a, b, c, W[ 7], S42, 50);
	SET_AVX2(I, c, d, a, b, W[14], S43, 51);
	SET_AVX2(I, b, c, d, a, W[ 5], S44, 52);
	SET_AVX2(I, a, b, c, d, W[12], S41, 53);
	SET_AVX2(I, d, a, b, c, W[ 3], S42, 54);
	SET_AVX2(I, c, d, a, b, W[10], S43, 55);
	SET_AVX2(I, b, c, d, a, W[ 1], S44, 56);
	SET_AVX2(I, a, b, c, d, W[ 8], S41, 57);
	SET_AVX2(I, d, a, b, c, W[15], S42, 58);
	SET_AVX2(I, c, d, a, b, W[ 6], S43, 59);
	SET_AVX2(I, b, c, d, a, W[13], S44, 60);
	SET_AVX2(I, a, b, c, d, W[ 4], S41, 61);
	SET_AVX2(I, d, a, b, c, W[11], S42, 62);
	SET_AVX2(I, c, d, a, b, W[ 2], S43, 63);
	SET_AVX2(I, b, c, d, a, W[ 9], S44, 64);

	/* Feed-forward: add the block's result to the previous state. */
	ctx->state_avx2[0] = _mm256_add_epi32(ctx->state_avx2[0], a);
	ctx->state_avx2[1] = _mm256_add_epi32(ctx->state_avx2[1], b);
	ctx->state_avx2[2] = _mm256_add_epi32(ctx->state_avx2[2], c);
	ctx->state_avx2[3] = _mm256_add_epi32(ctx->state_avx2[3], d);
}
#endif
680:
/* Fallback variant of pmd5_process(): does nothing. pmd5_slots() reports 0
   slots for the "default" target, so there is no state to update. */
__attribute__ ((target("default"))) MVSTATIC void pmd5_process(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX])
{
}
684:
685: static pmd5_status pmd5_update_all_simple(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t length, uint64_t stride)
686: {
687: const uint8_t * ptrs[PMD5_SLOTS_MAX];
688:
689: if (!length) return PMD5_SUCCESS;
690:
691: int slots = pmd5_slots();
692:
693: if (!stride) stride = 64;
694:
695: int i;
696: for (i = 0; i < slots; i++) {
697: ptrs[i] = data[i];
698: ctx->len[i] += length;
699: if (!ptrs[i]) ptrs[i] = md5_padding;
700: }
701:
702: while (length >= 64) {
703: pmd5_process(ctx, ptrs);
704: length -= 64;
705: for (i = 0; i < slots; i++) {
706: if (data[i]) ptrs[i] += stride;
707: }
708: }
709:
710: if (length) return PMD5_UNALIGNED_UPDATE;
711:
712: for (i = 0; i < slots; i++) {
713: if (data[i]) data[i] = ptrs[i];
714: }
715:
716: return PMD5_SUCCESS;
717: }
718:
719: static pmd5_status pmd5_update_all(pmd5_context * ctx, const uint8_t * data[PMD5_SLOTS_MAX], uint64_t lengths[PMD5_SLOTS_MAX])
720: {
721: uint64_t length = 0;
722: int slots = pmd5_slots();
723:
724: int i;
725: for (i = 0; i < slots; i++) {
726: if ((length == 0) || (lengths[i] < length)) length = lengths[i];
727: }
728:
729: for (i = 0; i < slots; i++) {
730: lengths[i] -= length;
731: }
732:
733: return pmd5_update_all_simple(ctx, data, length, 0);
734: }
735:
736: static pmd5_status pmd5_finish_slot_with_extra(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot, const uint8_t * data, uint64_t length)
737: {
738: MD5_CTX ctx;
739:
740: if ((slot >= pmd5_slots()) || (slot < 0))
741: return PMD5_INVALID_SLOT;
742:
743: pmd5_to_md5(pctx, &ctx, slot);
744: if (data && length) {
745: MD5_Update(&ctx, data, length);
746: }
747: MD5_Final(digest, &ctx);
748:
749: return PMD5_SUCCESS;
750: }
751:
752: static pmd5_status pmd5_finish_slot(pmd5_context * pctx, uint8_t digest[MD5_DIGEST_LEN], int slot)
753: {
754: return pmd5_finish_slot_with_extra(pctx, digest, slot, NULL, 0);
755: }
756:
757: static pmd5_status pmd5_finish_all(pmd5_context * ctx, uint8_t digests[PMD5_SLOTS_MAX][MD5_DIGEST_LEN])
758: {
759: int i;
760: for (i = 0; i < pmd5_slots(); i++) {
761: pmd5_finish_slot_with_extra(ctx, digests[i], i, NULL, 0);
762: }
763: return PMD5_SUCCESS;
764: }
765:
766: static pmd5_status md5_to_pmd5(const MD5_CTX * ctx, pmd5_context * pctx, int slot)
767: {
768: if ((slot >= pmd5_slots()) || (slot < 0))
769: return PMD5_INVALID_SLOT;
770:
771: // TODO This function ignores buffered but as of yet unhashed data. We're not using this function, just noting.
772:
773: #ifdef USE_OPENSSL
774: pctx->len[slot] = (ctx->Nl >> 3) + ((uint64_t)ctx->Nh << 29);
775: #else
776: pctx->len[slot] = ctx->totalN + ((uint64_t)ctx->totalN2 << 32);
777: #endif
778: return pmd5_set_slot(pctx, slot, (uint32_t)ctx->A, (uint32_t)ctx->B, (uint32_t)ctx->C, (uint32_t)ctx->D);
779: }
780:
781: static pmd5_status pmd5_to_md5(const pmd5_context * pctx, MD5_CTX * ctx, int slot)
782: {
783: if ((slot >= pmd5_slots()) || (slot < 0))
784: return PMD5_INVALID_SLOT;
785:
786: MD5_Init(ctx);
787:
788: #ifdef USE_OPENSSL
789: ctx->Nl = (pctx->len[slot] << 3) & 0xFFFFFFFF;
790: ctx->Nh = pctx->len[slot] >> 29;
791:
792: uint32_t a, b, c, d;
793: pmd5_status ret = pmd5_get_slot(pctx, slot, &a, &b, &c, &d);
794: if (ret == PMD5_SUCCESS) {
795: ctx->A = a;
796: ctx->B = b;
797: ctx->C = c;
798: ctx->D = d;
799: }
800: return ret;
801: #else
802: ctx->totalN = pctx->len[slot] & 0xFFFFFFFF;
803: ctx->totalN2 = pctx->len[slot] >> 32;
804: return pmd5_get_slot(pctx, slot, &ctx->A, &ctx->B, &ctx->C, &ctx->D);
805: #endif
806: }
807:
808: /* With GCC 10 putting these implementations inside 'extern "C"' causes an
809: assembler error. That worked fine on GCC 5-9 and clang 6-10...
810: */
811:
812: static inline int md5_parallel_slots_cpp()
813: {
814: int slots = pmd5_slots();
815: if (slots == 0) return 1;
816: return slots;
817: }
818:
819: static inline int md5_parallel_cpp(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
820: {
821: int slots = md5_parallel_slots_cpp();
822: if ((streams < 1) || (streams > slots)) return 0;
823: if (pre4 && post4) return 0;
824:
825: if (slots == 1) {
826: MD5_CTX ctx;
827: MD5_Init(&ctx);
828: if (pre4) {
829: MD5_Update(&ctx, (const unsigned char*)pre4, 4);
830: }
831: MD5_Update(&ctx, (const unsigned char*)buf[0], len[0]);
832: if (post4) {
833: MD5_Update(&ctx, (const unsigned char*)post4, 4);
834: }
835: if (sum[0]) {
836: MD5_Final((uint8_t*)sum[0], &ctx);
837: }
838: return 0;
839: }
840:
841: int i;
842: int active[PMD5_SLOTS_MAX];
843: char* buffers[PMD5_SLOTS_MAX];
844: uint64_t left[PMD5_SLOTS_MAX];
845: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
846: active[i] = streams > i;
847: if (i < streams) {
848: buffers[i] = buf[i];
849: left[i] = (uint64_t)len[i];
850: } else {
851: buffers[i] = NULL;
852: left[i] = 0;
853: }
854: }
855: MD5_CTX results[PMD5_SLOTS_MAX];
856:
857: pmd5_context ctx_simd;
858: if (pmd5_init_all(&ctx_simd) != PMD5_SUCCESS) return 0;
859:
860: if (pre4) {
861: char temp_buffers[PMD5_SLOTS_MAX][64];
862: int have_any = 0;
863: for (i = 0; i < slots; i++) {
864: if (active[i]) {
865: if (left[i] < 60) {
866: MD5_Init(&results[i]);
867: MD5_Update(&results[i], (const unsigned char*)pre4, 4);
868: MD5_Update(&results[i], (const unsigned char*)buf[i], left[i]);
869: active[i] = 0;
870: left[i] = 0;
871: } else {
872: memcpy(temp_buffers[i], pre4, 4);
873: memcpy(temp_buffers[i] + 4, buffers[i], 60);
874: buffers[i] += 60;
875: left[i] -= 60;
876: have_any = 1;
877: }
878: }
879: }
880:
881: if (have_any) {
882: char* ptrs[PMD5_SLOTS_MAX];
883: for (i = 0; i < PMD5_SLOTS_MAX; i++) {
884: ptrs[i] = &temp_buffers[i][0];
885: }
886: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)ptrs, 64, 0) != PMD5_SUCCESS) {
887: return 0;
888: }
889: }
890: }
891:
892: int failed = 0;
893: while (true) {
894: for (i = 0; i < slots; i++) {
895: if (active[i] && (left[i] < 64)) {
896: if (pmd5_to_md5(&ctx_simd, &results[i], i) != PMD5_SUCCESS) {
897: failed = 1;
898: }
899: active[i] = 0;
900: }
901: }
902:
903: uint64_t shortest = 0;
904: for (i = 0; i < slots; i++) {
905: if (!active[i]) {
906: buffers[i] = NULL;
907: } else if ((shortest == 0) || (left[i] < shortest)) {
908: shortest = left[i];
909: }
910: }
911:
912: if (shortest > 0) {
913: shortest = shortest & ~63;
914: if (pmd5_update_all_simple(&ctx_simd, (const uint8_t**)buffers, shortest, 0) != PMD5_SUCCESS) {
915: failed = 1;
916: }
917: for (i = 0; i < slots; i++) {
918: if (active[i]) {
919: left[i] -= shortest;
920: }
921: }
922: }
923:
924: if (failed) {
925: return 0;
926: } else {
927: int have_any = 0;
928: for (i = 0; i < slots; i++) {
929: have_any |= active[i];
930: }
931: if (!have_any) {
932: break;
933: }
934: }
935: }
936:
937: for (i = 0; i < slots; i++) {
938: if (i < streams) {
939: if (left[i] > 0) {
940: // buffer[i] == NULL here
941: MD5_Update(&results[i], (const unsigned char*)buf[i] + len[i] - left[i], left[i]);
942: }
943: if (post4) {
944: MD5_Update(&results[i], (const unsigned char*)post4, 4);
945: }
946: if (sum[i]) {
947: MD5_Final((uint8_t*)sum[i], &results[i]);
948: }
949: }
950: }
951:
952: return 1;
953: }
954:
/*
 * Every pmd5_context has to start on a 32-byte boundary. The address of
 * ctx->context_storage is rounded up to the next multiple of 32, and
 * successive contexts are spaced by sizeof(pmd5_context) rounded up the
 * same way, so the pointer produced for any index is 32-byte aligned.
 */
#define MD5P8_ALIGN32(value) (((value) + 31) & ~31)
#define MD5P8_Contexts_simd(ctx, index) \
    ((pmd5_context*)(MD5P8_ALIGN32((uintptr_t)((ctx)->context_storage)) + \
                     (index)*MD5P8_ALIGN32(sizeof(pmd5_context))))
957:
958: static inline void MD5P8_Init_cpp(MD5P8_CTX *ctx)
959: {
960: int i;
961: for (i = 0; i < (pmd5_slots() == PMD5_SLOTS_AVX2 ? 1 : 2); i++) {
962: pmd5_init_all(MD5P8_Contexts_simd(ctx, i));
963: }
964: ctx->used = 0;
965: ctx->next = 0;
966: }
967:
968: static inline void MD5P8_Update_cpp(MD5P8_CTX *ctx, const uchar *input, uint32 length)
969: {
970: int slots = pmd5_slots();
971: uint32 pos = 0;
972:
973: if ((ctx->used) || (length < 512)) {
974: int cpy = MIN(length, 512 - ctx->used);
975: memcpy(&ctx->buffer[ctx->used], input, cpy);
976: ctx->used += cpy;
977: length -= cpy;
978: pos += cpy;
979:
980: if (ctx->used == 512) {
981: if (slots == PMD5_SLOTS_AVX2) {
982: const uint8_t* ptrs[PMD5_SLOTS_MAX] = {
983: (uint8_t*)ctx->buffer,
984: (uint8_t*)(ctx->buffer + 64),
985: (uint8_t*)(ctx->buffer + 128),
986: (uint8_t*)(ctx->buffer + 192),
987: (uint8_t*)(ctx->buffer + 256),
988: (uint8_t*)(ctx->buffer + 320),
989: (uint8_t*)(ctx->buffer + 384),
990: (uint8_t*)(ctx->buffer + 448)
991: };
992: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, 64, 0);
993: } else {
994: const uint8_t* ptrs1[PMD5_SLOTS_MAX] = {
995: (uint8_t*)ctx->buffer,
996: (uint8_t*)(ctx->buffer + 64),
997: (uint8_t*)(ctx->buffer + 128),
998: (uint8_t*)(ctx->buffer + 192)
999: };
1000: const uint8_t* ptrs2[PMD5_SLOTS_MAX] = {
1001: (uint8_t*)(ctx->buffer + 256),
1002: (uint8_t*)(ctx->buffer + 320),
1003: (uint8_t*)(ctx->buffer + 384),
1004: (uint8_t*)(ctx->buffer + 448)
1005: };
1006: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, 64, 0);
1007: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, 64, 0);
1008: }
1009: ctx->used = 0;
1010: }
1011: }
1012:
1013: if (length >= 512) {
1014: uint32 blocks = length / 512;
1015: if (slots == PMD5_SLOTS_AVX2) {
1016: const uint8_t* ptrs[8] = {
1017: (uint8_t*)(input + pos),
1018: (uint8_t*)(input + pos + 64),
1019: (uint8_t*)(input + pos + 128),
1020: (uint8_t*)(input + pos + 192),
1021: (uint8_t*)(input + pos + 256),
1022: (uint8_t*)(input + pos + 320),
1023: (uint8_t*)(input + pos + 384),
1024: (uint8_t*)(input + pos + 448)
1025: };
1026: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs, blocks * 64, 512);
1027: } else {
1028: const uint8_t* ptrs1[4] = {
1029: (uint8_t*)(input + pos),
1030: (uint8_t*)(input + pos + 64),
1031: (uint8_t*)(input + pos + 128),
1032: (uint8_t*)(input + pos + 192)
1033: };
1034: const uint8_t* ptrs2[4] = {
1035: (uint8_t*)(input + pos + 256),
1036: (uint8_t*)(input + pos + 320),
1037: (uint8_t*)(input + pos + 384),
1038: (uint8_t*)(input + pos + 448)
1039: };
1040: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 0), ptrs1, blocks * 64, 512);
1041: pmd5_update_all_simple(MD5P8_Contexts_simd(ctx, 1), ptrs2, blocks * 64, 512);
1042: }
1043: pos += blocks * 512;
1044: length -= blocks * 512;
1045: }
1046:
1047: if (length) {
1048: memcpy(ctx->buffer, &input[pos], length);
1049: ctx->used = length;
1050: }
1051: }
1052:
1053: static inline void MD5P8_Final_cpp(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1054: {
1055: int i;
1056: uint32 low = 0, high = 0, sub = ctx->used ? 512 - ctx->used : 0;
1057: if (ctx->used) {
1058: uchar tmp[512];
1059: memset(tmp, 0, 512);
1060: MD5P8_Update(ctx, tmp, 512 - ctx->used);
1061: }
1062:
1063: uchar state[34*4] = {0};
1064:
1065: MD5_CTX tmp;
1066: for (i = 0; i < 8; i++) {
1067: if (pmd5_slots() == PMD5_SLOTS_AVX2) {
1068: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1069: } else if (i < 4) {
1070: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 0), &tmp, i);
1071: } else {
1072: pmd5_to_md5(MD5P8_Contexts_simd(ctx, 1), &tmp, i - 4);
1073: }
1074: #ifdef USE_OPENSSL
1075: if (low + tmp.Nl < low) high++;
1076: low += tmp.Nl;
1077: high += tmp.Nh;
1078: #else
1079: if (low + tmp.totalN < low) high++;
1080: low += tmp.totalN;
1081: high += tmp.totalN2;
1082: #endif
1083: SIVALu(state, i*16, tmp.A);
1084: SIVALu(state, i*16 + 4, tmp.B);
1085: SIVALu(state, i*16 + 8, tmp.C);
1086: SIVALu(state, i*16 + 12, tmp.D);
1087: }
1088:
1089: #ifndef USE_OPENSSL
1090: high = (low >> 29) | (high << 3);
1091: low = (low << 3);
1092: #endif
1093:
1094: sub <<= 3;
1095: if (low - sub > low) high--;
1096: low -= sub;
1097:
1098: SIVALu(state, 32*4, low);
1099: SIVALu(state, 33*4, high);
1100:
1101: MD5_CTX md;
1102: MD5_Init(&md);
1103: MD5_Update(&md, state, 34*4);
1104: MD5_Final(digest, &md);
1105: }
1106:
1107: extern "C" {
1108:
1109: int md5_parallel_slots()
1110: {
1111: return md5_parallel_slots_cpp();
1112: }
1113:
1114: int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4)
1115: {
1116: return md5_parallel_cpp(streams, buf, len, sum, pre4, post4);
1117: }
1118:
1119: void MD5P8_Init(MD5P8_CTX *ctx)
1120: {
1121: MD5P8_Init_cpp(ctx);
1122: }
1123:
1124: void MD5P8_Update(MD5P8_CTX *ctx, const uchar *input, uint32 length)
1125: {
1126: MD5P8_Update_cpp(ctx, input, length);
1127: }
1128:
1129: void MD5P8_Final(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx)
1130: {
1131: MD5P8_Final_cpp(digest, ctx);
1132: }
1133:
1134: } // "C"
1135:
1136: #endif /* HAVE_SIMD */
1137: #endif /* __cplusplus */
1138: #endif /* __x86_64__ */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>