File: embedaddon/rsync/simd-checksum-x86_64.cpp (ELWIX - Embedded LightWeight unIX)
Revision 1.1.1.1 (vendor branch), Wed Mar 17 00:32:36 2021 UTC, by misho
Branches: rsync, MAIN
CVS tags: v3_2_3, HEAD
rsync 3.2.3

    1: /*
    2:  * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
    3:  *
    4:  * Copyright (C) 1996 Andrew Tridgell
    5:  * Copyright (C) 1996 Paul Mackerras
    6:  * Copyright (C) 2004-2020 Wayne Davison
    7:  * Copyright (C) 2020 Jorrit Jongma
    8:  *
    9:  * This program is free software; you can redistribute it and/or modify
   10:  * it under the terms of the GNU General Public License as published by
   11:  * the Free Software Foundation; either version 3 of the License, or
   12:  * (at your option) any later version.
   13:  *
   14:  * This program is distributed in the hope that it will be useful,
   15:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   16:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   17:  * GNU General Public License for more details.
   18:  *
   19:  * You should have received a copy of the GNU General Public License along
   20:  * with this program; if not, visit the http://fsf.org website.
   21:  */
   22: /*
   23:  * The optimization target for get_checksum1() was the Intel Atom D2700, the
   24:  * slowest CPU in the test set and the most likely to be CPU-limited during
   25:  * transfers. The combination of intrinsics was chosen specifically for the
   26:  * most gain on that CPU; other combinations were occasionally slightly
   27:  * faster on the other CPUs.
   28:  *
   29:  * While on more modern CPUs transfers are less likely to be CPU limited
   30:  * (at least by this specific function), lower CPU usage is always better.
   31:  * Improvements may still be seen when matching chunks from NVMe storage
   32:  * even on newer CPUs.
   33:  *
   34:  * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
   35:  * - Intel Atom D2700            550     750    1000     N/A
   36:  * - Intel i7-7700hq            1850    2550    4050    6200
   37:  * - AMD ThreadRipper 2950x     2900    5600    8950    8100
   38:  *
   39:  * Curiously, the AMD is slower with AVX2 than with SSSE3, while the Intel
   40:  * is significantly faster. AVX2 is kept because it's more likely to relieve
   41:  * the bottleneck on the slower CPU.
   42:  *
   43:  * This optimization for get_checksum1() is intentionally limited to x86-64
   44:  * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
   45:  * the available xmm registers, this optimized version may not be faster than
   46:  * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
   47:  *
   48:  * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
   49:  * use of the target attribute, selecting the fastest code path based on
   50:  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
   51:  * GCC 4.x is not supported, to simplify the configure.ac logic.
   52:  *
   53:  * ----
   54:  *
   55:  * get_checksum2() is optimized for the case where the selected transfer
   56:  * checksum is MD5. MD5 itself can't be made significantly faster with SIMD
   57:  * instructions than the assembly version already included, but SIMD
   58:  * instructions can be used to hash multiple streams in parallel (see
   59:  * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
   60:  * block-matching algorithm hashes its blocks independently (in contrast to
   61:  * the whole-file checksum), this method can be employed here.
   62:  *
   63:  * To prevent needing to modify the core rsync sources significantly, a
   64:  * prefetching strategy is used. When a checksum2 is requested, the code
   65:  * reads ahead several blocks, creates the MD5 hashes for each block in
   66:  * parallel, returns the hash for the first block, and caches the results
   67:  * for the other blocks to return in future calls to get_checksum2().
   68:  */
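
/*
 * Illustrative sketch (not part of the original rsync source) of the
 * target-attribute function multiversioning described above: the compiler
 * emits every variant of the hypothetical simd_mv_example() and a resolver
 * picks the best one for the running CPU, so callers just call the plain
 * name. The return value here merely reports which variant was selected.
 */
__attribute__ ((target("default"))) int simd_mv_example(void) { return 0; } /* portable fallback */
__attribute__ ((target("avx2")))    int simd_mv_example(void) { return 1; } /* chosen at runtime when AVX2 is present */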
   69: 
   70: #ifdef __x86_64__
   71: #ifdef __cplusplus
   72: 
   73: extern "C" {
   74: 
   75: #include "rsync.h"
   76: 
   77: }
   78: 
   79: #ifdef HAVE_SIMD
   80: 
   81: #include <immintrin.h>
   82: 
   83: /* Some clang versions hit linker errors when multi-versioned functions are declared static, hence MVSTATIC */
   84: #ifdef __clang__
   85: #define MVSTATIC
   86: #else
   87: #define MVSTATIC static
   88: #endif
   89: 
   90: // Missing from the headers on gcc 6 and older, clang 8 and older
   91: typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
   92: typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
   93: 
   94: /* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
   95:    These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
   96:    as needed. That works perfectly with GCC, but clang fails to inline them properly, leading to a near 50%
   97:    performance drop; combining them with static and inline modifiers gets you linker errors and even compiler crashes...
   98: */
   99: 
  100: #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
  101: #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
  102: #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
  103: #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
  104: 
  105: #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
  106: #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
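
/*
 * Illustrative scalar model (not part of the original source) of what
 * _mm_maddubs_epi16(a, b) computes per 16-bit lane, which the SSE2_* macros
 * above emulate: a holds unsigned bytes, b holds signed bytes, and each pair
 * of byte products is summed with signed saturation. In this file the
 * constant multipliers are passed as a and the data bytes as b.
 */
static inline short maddubs_lane_model(unsigned char a0, signed char b0,
                                       unsigned char a1, signed char b1)
{
    int sum = (int)a0 * b0 + (int)a1 * b1;    // widen before adding
    if (sum > 32767) sum = 32767;             // saturate like _mm_adds_epi16
    if (sum < -32768) sum = -32768;
    return (short)sum;
}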
  107: 
  108: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  109: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  110: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
  111: 
  112: /*
  113:   Original loop per 4 bytes:
  114:     s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
  115:     s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
  116: 
  117:   SSE2/SSSE3 loop per 32 bytes:
  118:     int16 t1[8];
  119:     int16 t2[8];
  120:     for (int j = 0; j < 8; j++) {
  121:       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
  122:       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
  123:     }
  124:     s2 += 32*s1 + (uint32)(
  125:               28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
  126:               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
  127:           ) + 528*CHAR_OFFSET;
  128:     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
  129:           32*CHAR_OFFSET;
  130:  */
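
/*
 * Illustrative scalar reference (not part of the original source) of the
 * 32-byte step described above; handy for checking the vector code against
 * the original 4-byte recurrence. *ps1/*ps2 are the running checksum halves.
 */
static inline void checksum1_32_scalar_model(const schar* buf, uint32* ps1, uint32* ps2)
{
    int16 t1[8], t2[8];
    for (int j = 0; j < 8; j++) {
        t1[j] = buf[j*4] + buf[j*4+1] + buf[j*4+2] + buf[j*4+3];
        t2[j] = 4*buf[j*4] + 3*buf[j*4+1] + 2*buf[j*4+2] + buf[j*4+3];
    }
    *ps2 += 32 * *ps1
          + (uint32)(28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
                   + t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7])
          + 528*CHAR_OFFSET;
    *ps1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7])
          + 32*CHAR_OFFSET;
}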
  131: __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
  132: {
  133:     if (len > 32) {
  134:         int aligned = ((uintptr_t)buf & 15) == 0;
  135: 
  136:         uint32 x[4] = {0};
  137:         x[0] = *ps1;
  138:         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
  139:         x[0] = *ps2;
  140:         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
  141: 
  142:         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
  143:         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
  144: 
  145:         for (; i < (len-32); i+=32) {
  146:             // Load ... 2*[int8*16]
  147:             __m128i in8_1, in8_2;
  148:             if (!aligned) {
  149:                 // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
  150:                 in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
  151:                 in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
  152:             } else {
  153:                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
  154:                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
  155:             }
  156: 
  157:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
  158:             // Fastest, even though multiply by 1
  159:             __m128i mul_one = _mm_set1_epi8(1);
  160:             __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
  161:             __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
  162: 
  163:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
  164:             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
  165:             __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
  166:             __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
  167: 
  168:             // s2 += 32*s1
  169:             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
  170: 
  171:             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
  172:             // Shifting left, then shifting right again and shuffling (rather than just
  173:             // shifting right as with mul32 below) to cheaply end up with the correct sign
  174:             // extension as we go from int16 to int32.
  175:             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
  176:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
  177:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
  178:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
  179:             sum_add32 = _mm_srai_epi32(sum_add32, 16);
  180:             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
  181: 
  182:             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
  183:             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
  184:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
  185:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
  186:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
  187:             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
  188:             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
  189: 
  190:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
  191:             ss1 = _mm_add_epi32(ss1, sum_add32);
  192: 
  193:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
  194:             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
  195: 
  196:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
  197:             // We could have combined this with generating sum_add32 above and
  198:             // saved an instruction, but benchmarking shows that to be slower
  199:             __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
  200: 
  201:             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
  202:             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
  203: 
  204:             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
  205:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
  206:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
  207: 
  208:             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
  209:             ss2 = _mm_add_epi32(ss2, mul32);
  210: 
  211: #if CHAR_OFFSET != 0
  212:             // s1 += 32*CHAR_OFFSET
  213:             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
  214:             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
  215: 
  216:             // s2 += 528*CHAR_OFFSET
  217:             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
  218:             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
  219: #endif
  220:         }
  221: 
  222:         _mm_store_si128((__m128i_u*)x, ss1);
  223:         *ps1 = x[0];
  224:         _mm_store_si128((__m128i_u*)x, ss2);
  225:         *ps2 = x[0];
  226:     }
  227:     return i;
  228: }
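
/*
 * Illustrative scalar model (not part of the original source) of the
 * log2-step horizontal sum used in these functions: adding the register to a
 * copy of itself shifted left by 2, 4, then 8 bytes doubles the summed run
 * each pass, so the top 16-bit lane ends up holding the total, which the
 * real code then extracts with an arithmetic shift and a shuffle.
 */
static inline short hsum8_model(const short in[8])
{
    short v[8];
    for (int k = 0; k < 8; k++) v[k] = in[k];
    for (int lanes = 1; lanes <= 4; lanes *= 2)      // byte shifts 2, 4, 8 == lane shifts 1, 2, 4
        for (int k = 7; k >= lanes; k--) v[k] += v[k - lanes];
    return v[7];                                     // == in[0] + in[1] + ... + in[7]
}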
  229: 
  230: /*
  231:   Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
  232:   For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
  233:   target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
  234:   to a near 50% performance drop.
  235:  */
  236: __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
  237: {
  238:     if (len > 32) {
  239:         int aligned = ((uintptr_t)buf & 15) == 0;
  240: 
  241:         uint32 x[4] = {0};
  242:         x[0] = *ps1;
  243:         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
  244:         x[0] = *ps2;
  245:         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
  246: 
  247:         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
  248:         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
  249: 
  250:         for (; i < (len-32); i+=32) {
  251:             // Load ... 2*[int8*16]
  252:             __m128i in8_1, in8_2;
  253:             if (!aligned) {
  254:                 in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
  255:                 in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
  256:             } else {
  257:                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
  258:                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
  259:             }
  260: 
  261:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
  262:             // Fastest, even though multiply by 1
  263:             __m128i mul_one = _mm_set1_epi8(1);
  264:             __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
  265:             __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
  266: 
  267:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
  268:             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
  269:             __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
  270:             __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
  271: 
  272:             // s2 += 32*s1
  273:             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
  274: 
  275:             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
  276:             // Shifting left, then shifting right again and shuffling (rather than just
  277:             // shifting right as with mul32 below) to cheaply end up with the correct sign
  278:             // extension as we go from int16 to int32.
  279:             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
  280:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
  281:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
  282:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
  283:             sum_add32 = _mm_srai_epi32(sum_add32, 16);
  284:             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
  285: 
  286:             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
  287:             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
  288:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
  289:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
  290:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
  291:             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
  292:             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
  293: 
  294:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
  295:             ss1 = _mm_add_epi32(ss1, sum_add32);
  296: 
  297:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
  298:             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
  299: 
  300:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
  301:             // We could have combined this with generating sum_add32 above and
  302:             // saved an instruction, but benchmarking shows that to be slower
  303:             __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
  304: 
  305:             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
  306:             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
  307: 
  308:             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
  309:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
  310:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
  311: 
  312:             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
  313:             ss2 = _mm_add_epi32(ss2, mul32);
  314: 
  315: #if CHAR_OFFSET != 0
  316:             // s1 += 32*CHAR_OFFSET
  317:             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
  318:             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
  319: 
  320:             // s2 += 528*CHAR_OFFSET
  321:             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
  322:             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
  323: #endif
  324:         }
  325: 
  326:         _mm_store_si128((__m128i_u*)x, ss1);
  327:         *ps1 = x[0];
  328:         _mm_store_si128((__m128i_u*)x, ss2);
  329:         *ps2 = x[0];
  330:     }
  331:     return i;
  332: }
  333: 
  334: /*
  335:   AVX2 loop per 64 bytes:
  336:     int16 t1[16];
  337:     int16 t2[16];
  338:     for (int j = 0; j < 16; j++) {
  339:       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
  340:       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
  341:     }
  342:     s2 += 64*s1 + (uint32)(
  343:               60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
  344:               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
  345:           ) + 2080*CHAR_OFFSET;
  346:     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
  347:           64*CHAR_OFFSET;
  348:  */
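
/*
 * Illustrative derivation (not in the original source) of the CHAR_OFFSET
 * constants in these unrolled loops: each original 4-byte step adds
 * 10*CHAR_OFFSET to s2 and 4*CHAR_OFFSET to s1, and the k-th step's
 * "4*(s1 + ...)" term replays the k*4*CHAR_OFFSET already folded into s1,
 * so unrolling n = N/4 steps adds (10*n + 16*(0+1+...+(n-1)))*CHAR_OFFSET
 * to s2:
 *     N = 32: 10*8  + 16*28  =  528
 *     N = 64: 10*16 + 16*120 = 2080
 */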
  349: __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
  350: {
  351:     if (len > 64) {
  352:         // Instructions reshuffled compared to SSE2 for slightly better performance
  353:         int aligned = ((uintptr_t)buf & 31) == 0;
  354: 
  355:         uint32 x[8] = {0};
  356:         x[0] = *ps1;
  357:         __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
  358:         x[0] = *ps2;
  359:         __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);
  360: 
  361:         // The order gets shuffled compared to SSE2
  362:         const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
  363:         __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);
  364: 
  365:         for (; i < (len-64); i+=64) {
  366:             // Load ... 2*[int8*32]
  367:             __m256i in8_1, in8_2;
  368:             if (!aligned) {
  369:                 in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
  370:                 in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
  371:             } else {
  372:                 in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
  373:                 in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
  374:             }
  375: 
  376:             // Prefetch for the next iterations. This has no observable effect on
  377:             // the tested AMD but makes as much as a 20% difference on the Intel.
  378:             // Curiously, that same Intel sees no benefit from this with SSE2
  379:             // or SSSE3.
  380:             _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
  381:             _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
  382:             _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
  383:             _mm_prefetch(&buf[i + 160], _MM_HINT_T0);
  384: 
  385:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
  386:             // Fastest, even though multiply by 1
  387:             __m256i mul_one = _mm256_set1_epi8(1);
  388:             __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
  389:             __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
  390: 
  391:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
  392:             __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
  393:             __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
  394:             __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
  395: 
  396:             // s2 += 64*s1
  397:             ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));
  398: 
  399:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
  400:             __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
  401: 
  402:             // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
  403:             __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
  404: 
  405:             // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
  406:             __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
  407:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
  408:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
  409:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
  410:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
  411:             sum_add32 = _mm256_srai_epi32(sum_add32, 16);
  412:             sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);
  413: 
  414:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
  415:             ss1 = _mm256_add_epi32(ss1, sum_add32);
  416: 
  417:             // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
  418:             __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
  419:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
  420:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
  421:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
  422:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
  423:             sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
  424:             sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);
  425: 
  426:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
  427:             ss2 = _mm256_add_epi32(ss2, sum_mul_add32);
  428: 
  429:             // [sum(mul32), X, X, X, X, X, X, X] [int32*8]
  430:             mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
  431:             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
  432:             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
  433: 
  434:             // s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
  435:             ss2 = _mm256_add_epi32(ss2, mul32);
  436: 
  437: #if CHAR_OFFSET != 0
  438:             // s1 += 64*CHAR_OFFSET
  439:             __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
  440:             ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);
  441: 
  442:             // s2 += 2080*CHAR_OFFSET
  443:             char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
  444:             ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
  445: #endif
  446:         }
  447: 
  448:         _mm256_store_si256((__m256i_u*)x, ss1);
  449:         *ps1 = x[0];
  450:         _mm256_store_si256((__m256i_u*)x, ss2);
  451:         *ps2 = x[0];
  452:     }
  453:     return i;
  454: }
  455: 
  456: static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
  457: {
  458:     uint32 s1 = *ps1;
  459:     uint32 s2 = *ps2;
  460:     for (; i < (len-4); i+=4) {
  461:         s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
  462:         s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
  463:     }
  464:     for (; i < len; i++) {
  465:         s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
  466:     }
  467:     *ps1 = s1;
  468:     *ps2 = s2;
  469:     return i;
  470: }
  471: 
  472: /* With GCC 10, putting this implementation inside 'extern "C"' causes an
  473:    assembler error. That worked fine on GCC 5-9 and clang 6-10...
  474:   */
  475: static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
  476: {
  477:     int32 i = 0;
  478:     uint32 s1 = 0;
  479:     uint32 s2 = 0;
  480: 
  481:     // multiples of 64 bytes using AVX2 (if available)
  482:     i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
  483: 
  484:     // multiples of 32 bytes using SSSE3 (if available)
  485:     i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
  486: 
  487:     // multiples of 32 bytes using SSE2 (if available)
  488:     i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
  489: 
  490:     // whatever is left
  491:     i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
  492: 
  493:     return (s1 & 0xffff) + (s2 << 16);
  494: }
  495: 
  496: extern "C" {
  497: 
  498: uint32 get_checksum1(char *buf1, int32 len)
  499: {
  500:     return get_checksum1_cpp(buf1, len);
  501: }
  502: 
  503: #if !defined(BENCHMARK_SIMD_CHECKSUM1)
  504: 
  505: // see simd-md5-parallel-x86_64.cpp
  506: extern int md5_parallel_slots();
  507: extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
  508: 
  509: #endif /* !BENCHMARK_SIMD_CHECKSUM1 */
  510: 
  511: #if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
  512: 
  513: #define PREFETCH_ENABLE 1 // debugging
  514: 
  515: #if 0 // debugging
  516: #define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
  517: #else
  518: #define PREFETCH_PRINTF(f_, ...) (void)0;
  519: #endif
  520: 
  521: #define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
  522: #define PREFETCH_MAX_BLOCKS 8
  523: 
  524: typedef struct {
  525:     int in_use;
  526:     OFF_T offset;
  527:     int32 len;
  528:     char sum[SUM_LENGTH];
  529: } prefetch_sum_t;
  530: 
  531: typedef struct {
  532:     struct map_struct *map;
  533:     OFF_T len;
  534:     OFF_T last;
  535:     int32 blocklen;
  536:     int blocks;
  537:     prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
  538: } prefetch_t;
  539: 
  540: prefetch_t *prefetch;
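
/*
 * Illustrative walk-through (not in the original source) of the prefetch
 * cache, assuming a 1024-byte blocklen and 4 parallel MD5 slots (the SSE2
 * path, so 4 blocks are hashed per md5_parallel() call):
 *   get_checksum2(offset 0)    -> cache miss: hash the blocks at 0, 1024,
 *                                 2048 and 3072 in one call, return sums[0],
 *                                 keep the rest cached
 *   get_checksum2(offset 1024) -> sequential access, cache hit: return the
 *                                 cached sum and shift the queue down
 *   get_checksum2(offset 5000) -> not last + blocklen, so the caller is
 *                                 seeking: reset the cache and fall back to
 *                                 get_checksum2_nosimd()
 */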
  541: 
  542: extern int xfersum_type;
  543: extern int checksum_seed;
  544: extern int proper_seed_order;
  545: extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
  546: 
  547: extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
  548: 
  549: void checksum2_disable_prefetch()
  550: {
  551:     if (prefetch) {
  552:         PREFETCH_PRINTF("checksum2_disable_prefetch\n");
  553:         free(prefetch);
  554:         prefetch = NULL;
  555:     }
  556: }
  557: 
  558: void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
  559: {
  560: #ifdef PREFETCH_ENABLE
  561:     checksum2_disable_prefetch();
  562:     int slots = md5_parallel_slots();
  563:     if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
  564:         prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
  565:         memset(prefetch, 0, sizeof(prefetch_t));
  566:         prefetch->map = map;
  567:         prefetch->len = len;
  568:         prefetch->last = 0;
  569:         prefetch->blocklen = blocklen;
  570:         prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
  571:         PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
  572:     }
  573: #endif
  574: }
  575: 
  576: static inline void checksum2_reset_prefetch()
  577: {
  578:     for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
  579:         prefetch->sums[i].in_use = 0;
  580:     }
  581: }
  582: 
  583: static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
  584: {
  585:     if (prefetch->sums[0].in_use) {
  586:         if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
  587:             memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
  588:             for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
  589:                 prefetch->sums[i] = prefetch->sums[i + 1];
  590:             }
  591:             prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
  592:             PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
  593:             return 1;
  594:         } else {
  595:             // unexpected access, reset cache
  596:             PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
  597:             checksum2_reset_prefetch();
  598:         }
  599:     }
  600:     return 0;
  601: }
  602: 
  603: static int checksum2_perform_prefetch(OFF_T prefetch_offset)
  604: {
  605:     int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
  606:     if (blocks < 2) return 0; // fall through to non-simd, probably faster
  607: 
  608:     int32 total = 0;
  609:     int i;
  610:     for (i = 0; i < blocks; i++) {
  611:         prefetch->sums[i].offset = prefetch_offset + total;
  612:         prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
  613:         prefetch->sums[i].in_use = 0;
  614:         total += prefetch->sums[i].len;
  615:     }
  616:     for (; i < PREFETCH_MAX_BLOCKS; i++) {
  617:         prefetch->sums[i].in_use = 0;
  618:     }
  619: 
  620:     uchar seedbuf[4];
  621:     SIVALu(seedbuf, 0, checksum_seed);
  622: 
  623:     PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
  624:     char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
  625:     char* bufs[PREFETCH_MAX_BLOCKS] = {0};
  626:     int lens[PREFETCH_MAX_BLOCKS] = {0};
  627:     char* sums[PREFETCH_MAX_BLOCKS] = {0};
  628:     for (i = 0; i < blocks; i++) {
  629:         bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
  630:         lens[i] = prefetch->sums[i].len;
  631:         sums[i] = prefetch->sums[i].sum;
  632:     }
  633:     if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
  634:         for (i = 0; i < blocks; i++) {
  635:             prefetch->sums[i].in_use = 1;
  636:         }
  637:         return 1;
  638:     } else {
  639:             // this should never happen; abort
  640:         PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
  641:         checksum2_disable_prefetch();
  642:     }
  643:     return 0;
  644: }
  645: 
  646: void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
  647: {
  648:     if (prefetch) {
  649:         PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
  650:         OFF_T last = prefetch->last;
  651:         prefetch->last = prefetch_offset;
  652:         if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
  653:             // we're looking around trying to align blocks, prefetching will slow things down
  654:             PREFETCH_PRINTF("get_checksum2 SEEK\n");
  655:             checksum2_reset_prefetch();
  656:         } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  657:             // hit
  658:             return;
  659:         } else if (checksum2_perform_prefetch(prefetch_offset)) {
  660:             if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
  661:                 // hit; this should always succeed since we just prefetched this data
  662:                 return;
  663:             } else {
  664:                 // this should never happen; abort
  665:                 PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
  666:                 checksum2_disable_prefetch();
  667:             }
  668:         }
  669:     }
  670:     get_checksum2_nosimd(buf, len, sum, prefetch_offset);
  671: }
  672: #endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
  673: 
  674: } // "C"
  675: 
  676: /* Benchmark compilation
  677: 
  678:   The get_checksum1() benchmark runs through all available code paths in a
  679:   single execution, while the get_checksum2()/MD5 and MD5P8 benchmarks need to
  680:   be recompiled for each code path (otherwise they always use the fastest path
  681:   available on the current CPU). Note that the SSE2/AVX2 MD5 optimizations will
  682:   be used when applicable regardless of whether rsync is built with OpenSSL.
  683: 
  684:   Something like the following should compile and run the benchmarks:
  685: 
  686:   # if gcc
  687:   export CC=gcc
  688:   export CXX=g++
  689:   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
  690: 
  691:   # else if clang
  692:   export CC=clang
  693:   export CXX=clang++
  694:   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
  695: 
  696:   # /if
  697: 
  698:   export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
  699:   export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
  700:   export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
  701:   export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
  702: 
  703:   rm bench_csum*
  704: 
  705:   ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  706: 
  707:   $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
  708: 
  709:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  710:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
  711: 
  712:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
  713:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
  714: 
  715:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
  716:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
  717: 
  718:   ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
  719: 
  720:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  721:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
  722: 
  723:   ./bench_csum1.all
  724:   ./bench_csum2.asm
  725:   ./bench_csum2.openssl
  726:   ./bench_csum2.sse2
  727:   ./bench_csum2.avx2
  728: 
  729:  */
  730: 
  731: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  732: #pragma clang optimize off
  733: #pragma GCC push_options
  734: #pragma GCC optimize ("O0")
  735: 
  736: #define ROUNDS 1024
  737: #define BLOCK_LEN 1024*1024
  738: 
  739: #ifndef CLOCK_MONOTONIC_RAW
  740: #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
  741: #endif
  742: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  743: 
  744: #ifdef BENCHMARK_SIMD_CHECKSUM1
  745: static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
  746:     struct timespec start, end;
  747:     uint64_t us;
  748:     uint32_t cs, s1, s2;
  749:     int i, next;
  750: 
  751:     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  752:     for (i = 0; i < ROUNDS; i++) {
  753:         s1 = s2 = 0;
  754:         next = func((schar*)buf, len, 0, &s1, &s2);
  755:         get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
  756:     }
  757:     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  758:     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  759:     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
  760:     printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
  761: }
  762: 
  763: static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
  764:     uint32 cs = get_checksum1((char*)buf, len);
  765:     *ps1 = cs & 0xffff;
  766:     *ps2 = cs >> 16;
  767:     return len;
  768: }
  769: 
  770: int main() {
  771:     int i;
  772:     unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
  773:     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
  774: 
  775:     benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
  776:     benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
  777:     benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
  778:     benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
  779:     benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
  780: 
  781:     free(buf);
  782:     return 0;
  783: }
  784: #endif /* BENCHMARK_SIMD_CHECKSUM1 */
  785: 
  786: #ifdef BENCHMARK_SIMD_CHECKSUM2
  787: static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
  788:     struct timespec start, end;
  789:     uint64_t us;
  790:     unsigned char cs1[16];
  791:     unsigned char cs2[16];
  792:     int i;
  793: 
  794:     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
  795:     for (i = 0; i < ROUNDS; i++) {
  796:         func(buf, len, (char*)cs1);
  797:     }
  798:     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
  799:     us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
  800: 
  801:     func2(buf, len, (char*)cs2);
  802: 
  803:     float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
  804:     printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
  805:     for (i = 0; i < 16; i++) {
  806:         printf("%02x", cs1[i] & 0xFF);
  807:     }
  808:     printf(" :: ");
  809:     for (i = 0; i < 16; i++) {
  810:         printf("%02x", cs2[i] & 0xFF);
  811:     }
  812:     printf("\n");
  813: }
  814: 
  815: static void benchmark_inner(char* buf, int32 len, char* sum_out) {
  816:     // This should produce the same output across different optimization
  817:     // levels, but not the same output as sanity_check()
  818: 
  819:     char* bufs[8] = {0};
  820:     int lens[8] = {0};
  821:     char* sums[8] = {0};
  822: 
  823:     bufs[0] = buf;
  824:     lens[0] = len;
  825:     sums[0] = sum_out;
  826:     md5_parallel(1, bufs, lens, sums, NULL, NULL);
  827: }
  828: 
  829: extern "C" {
  830: extern void MD5P8_Init_c(MD5P8_CTX *ctx);
  831: extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
  832: extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
  833: }
  834: 
  835: static void sanity_check(char* buf, int32 len, char* sum_out) {
  836:     // This should produce the same output across different optimization
  837:     // levels, but not the same output as benchmark_inner()
  838:     if (md5_parallel_slots() <= 1) {
  839:         MD5P8_CTX m5p8;
  840:         MD5P8_Init_c(&m5p8);
  841:         MD5P8_Update_c(&m5p8, (uchar *)buf, len);
  842:         MD5P8_Final_c((uchar *)sum_out, &m5p8);
  843:     } else {
  844:         MD5P8_CTX m5p8;
  845:         MD5P8_Init(&m5p8);
  846:         MD5P8_Update(&m5p8, (uchar *)buf, len);
  847:         MD5P8_Final((uchar *)sum_out, &m5p8);
  848:     }
  849: }
  850: 
  851: int main() {
  852:     // This benchmarks the parallel MD5 checksum rather than get_checksum2()
  853:     // as the latter would require compiling in a lot of rsync's code, but
  854:     // it touches all the same internals so the performance should be nearly
  855:     // identical.
  856: 
  857:     int i;
  858:     char* buf = (char*)malloc(BLOCK_LEN);
  859:     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
  860: 
  861:     const char* method = "?";
  862:     switch (md5_parallel_slots()) {
  863:         case 8: method = "AVX2"; break;
  864:         case 4: method = "SSE2"; break;
  865: #ifdef USE_OPENSSL
  866:         case 1: method = "OpenSSL"; break;
  867: #elif (CSUM_CHUNK == 64)
  868:         case 1: method = "ASM"; break;
  869: #else
  870:         // this won't happen unless you modified code somewhere
  871:         case 1: method = "Raw-C"; break;
  872: #endif
  873:     }
  874: 
  875:     benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
  876: 
  877:     free(buf);
  878:     return 0;
  879: }
  880: #endif /* BENCHMARK_SIMD_CHECKSUM2 */
  881: 
  882: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
  883: #pragma GCC pop_options
  884: #pragma clang optimize on
  885: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
  886: 
  887: #endif /* HAVE_SIMD */
  888: #endif /* __cplusplus */
  889: #endif /* __x86_64__ */
