Annotation of embedaddon/rsync/simd-checksum-x86_64.cpp, revision 1.1.1.1

1.1       misho       1: /*
                      2:  * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
                      3:  *
                      4:  * Copyright (C) 1996 Andrew Tridgell
                      5:  * Copyright (C) 1996 Paul Mackerras
                      6:  * Copyright (C) 2004-2020 Wayne Davison
                      7:  * Copyright (C) 2020 Jorrit Jongma
                      8:  *
                      9:  * This program is free software; you can redistribute it and/or modify
                     10:  * it under the terms of the GNU General Public License as published by
                     11:  * the Free Software Foundation; either version 3 of the License, or
                     12:  * (at your option) any later version.
                     13:  *
                     14:  * This program is distributed in the hope that it will be useful,
                     15:  * but WITHOUT ANY WARRANTY; without even the implied warranty of
                     16:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     17:  * GNU General Public License for more details.
                     18:  *
                     19:  * You should have received a copy of the GNU General Public License along
                     20:  * with this program; if not, visit the http://fsf.org website.
                     21:  */
                     22: /*
                     23:  * Optimization target for get_checksum1() was the Intel Atom D2700, the
                     24:  * slowest CPU in the test set and the most likely to be CPU limited during
                     25:  * transfers. The combination of intrinsics was chosen specifically for the
                      26:  * most gain on that CPU; other combinations were occasionally slightly
                      27:  * faster on the other CPUs tested.
                     28:  *
                     29:  * While on more modern CPUs transfers are less likely to be CPU limited
                     30:  * (at least by this specific function), lower CPU usage is always better.
                     31:  * Improvements may still be seen when matching chunks from NVMe storage
                     32:  * even on newer CPUs.
                     33:  *
                     34:  * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
                     35:  * - Intel Atom D2700            550     750    1000     N/A
                     36:  * - Intel i7-7700hq            1850    2550    4050    6200
                     37:  * - AMD ThreadRipper 2950x     2900    5600    8950    8100
                     38:  *
                      39:  * Curiously, the AMD is slower with AVX2 than with SSSE3, while the Intel
                      40:  * is significantly faster with AVX2. AVX2 is kept because it's more likely
                      41:  * to relieve the bottleneck on the slower CPU.
                     42:  *
                     43:  * This optimization for get_checksum1() is intentionally limited to x86-64
                     44:  * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
                     45:  * the available xmm registers, this optimized version may not be faster than
                     46:  * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
                     47:  *
                     48:  * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
                     49:  * use of the target attribute, selecting the fastest code path based on
                     50:  * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
                      51:  * GCC 4.x is not supported, to keep the configure.ac logic simple.
                     52:  *
                     53:  * ----
                     54:  *
                     55:  * get_checksum2() is optimized for the case where the selected transfer
                     56:  * checksum is MD5. MD5 can't be made significantly faster with SIMD
                      57:  * instructions than the assembly version already included, but SIMD
                     58:  * instructions can be used to hash multiple streams in parallel (see
                     59:  * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
                     60:  * block-matching algorithm hashes the blocks independently (in contrast to
                     61:  * the whole-file checksum) this method can be employed here.
                     62:  *
                     63:  * To prevent needing to modify the core rsync sources significantly, a
                     64:  * prefetching strategy is used. When a checksum2 is requested, the code
                     65:  * reads ahead several blocks, creates the MD5 hashes for each block in
                     66:  * parallel, returns the hash for the first block, and caches the results
                     67:  * for the other blocks to return in future calls to get_checksum2().
                     68:  */
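                          /*
                           * A rough sketch of that prefetch flow (an illustrative outline added for
                           * clarity, not the exact control flow; cache_pop() and hash_ahead_in_parallel()
                           * are hypothetical stand-ins for the real get_checksum2_prefetched() and
                           * checksum2_perform_prefetch() defined further down):
                           *
                           *   void get_checksum2(char *buf, int32 len, char *sum, OFF_T offset)
                           *   {
                           *       if (cache_pop(offset, len, sum))              // hashed on an earlier call
                           *           return;
                           *       hash_ahead_in_parallel(offset);               // md5_parallel() over up to 8 blocks
                           *       if (cache_pop(offset, len, sum))
                           *           return;
                           *       get_checksum2_nosimd(buf, len, sum, offset);  // scalar fallback
                           *   }
                           */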
                     69: 
                     70: #ifdef __x86_64__
                     71: #ifdef __cplusplus
                     72: 
                     73: extern "C" {
                     74: 
                     75: #include "rsync.h"
                     76: 
                     77: }
                     78: 
                     79: #ifdef HAVE_SIMD
                     80: 
                     81: #include <immintrin.h>
                     82: 
                      83: /* Some clang versions produce linker errors when 'static' is combined with multi-versioned functions */
                     84: #ifdef __clang__
                     85: #define MVSTATIC
                     86: #else
                     87: #define MVSTATIC static
                     88: #endif
                     89: 
                     90: // Missing from the headers on gcc 6 and older, clang 8 and older
                     91: typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
                     92: typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
                     93: 
                     94: /* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
                     95:    These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
                      96:    as needed. While that works perfectly with GCC, clang fails to inline those functions properly, leading to a near 50%
                      97:    performance drop; combining them with static and inline modifiers yields linker errors and even compiler crashes...
                     98: */
                     99: 
                    100: #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
                    101: #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
                    102: #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
                    103: #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
                    104: 
                    105: #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
                    106: #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
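                          /*
                             A small worked example of what SSE2_MADDUBS_EPI16 computes (added as an
                             illustration; the byte values below are arbitrary). Like _mm_maddubs_epi16,
                             the first operand is read as unsigned bytes and the second as signed bytes;
                             vertical products are summed pairwise with signed saturation into int16 lanes:

                               a (uint8) :   4     3     2     1   ...
                               b (int8)  :  10    -2     5     7   ...
                               result    :  4*10 + 3*(-2) = 34 | 2*5 + 1*7 = 17 | ...  (8 int16 lanes)

                             The emulation reaches the same values by multiplying the even and odd byte
                             lanes separately (SSE2_MULU_EVEN_EPI8 / SSE2_MULU_ODD_EPI8) and adding the two
                             int16 vectors with _mm_adds_epi16.
                          */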
                    107: 
                    108: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
                    109: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
                    110: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
                    111: 
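                          /*
                             For reference, a minimal sketch of the function multi-versioning pattern used
                             here (illustrative only, not part of rsync): several definitions of the same
                             function carry different target attributes, and with GCC 6+ and recent clang
                             the compiler emits a resolver that picks the best version the CPU supports at
                             load time, so callers never perform manual cpuid dispatch.

                               __attribute__ ((target("default"))) int op(int x) { return x; }
                               __attribute__ ((target("avx2")))    int op(int x) { return x * 2; }
                               // op(5) automatically runs the AVX2 version on an AVX2-capable CPU
                           */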
                    112: /*
                    113:   Original loop per 4 bytes:
                    114:     s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
                    115:     s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
                    116: 
                    117:   SSE2/SSSE3 loop per 32 bytes:
                    118:     int16 t1[8];
                    119:     int16 t2[8];
                    120:     for (int j = 0; j < 8; j++) {
                    121:       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
                    122:       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
                    123:     }
                    124:     s2 += 32*s1 + (uint32)(
                    125:               28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
                    126:               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
                    127:           ) + 528*CHAR_OFFSET;
                    128:     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
                    129:           32*CHAR_OFFSET;
                    130:  */
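                          /*
                            Where those constants come from (a brief derivation added for clarity):
                            unrolling the original 4-byte loop eight times, the j-th group still sees the
                            old s1 plus the totals of the previous groups, so

                              s2 += sum(j=0..7) (4*s1_j + t2[j] + 10*CHAR_OFFSET)
                                  = 32*s1 + sum(j=0..7) 4*(7-j)*t1[j] + sum(j=0..7) t2[j]
                                    + (8*10 + 16*(0+1+...+7))*CHAR_OFFSET
                                  = 32*s1 + 28*t1[0] + 24*t1[1] + ... + 4*t1[6] + 0*t1[7]
                                    + t2[0] + ... + t2[7] + 528*CHAR_OFFSET

                            which is exactly the weight vector {28, 24, 20, 16, 12, 8, 4, 0} loaded into
                            mul_t1 below. The same argument with 16 groups gives the 64*s1 and
                            2080*CHAR_OFFSET (= CHAR_OFFSET * 64*65/2) terms of the AVX2 version.
                           */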
                    131: __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
                    132: {
                    133:     if (len > 32) {
                    134:         int aligned = ((uintptr_t)buf & 15) == 0;
                    135: 
                    136:         uint32 x[4] = {0};
                    137:         x[0] = *ps1;
                    138:         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
                    139:         x[0] = *ps2;
                    140:         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
                    141: 
                    142:         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
                    143:         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
                    144: 
                    145:         for (; i < (len-32); i+=32) {
                    146:             // Load ... 2*[int8*16]
                    147:             __m128i in8_1, in8_2;
                    148:             if (!aligned) {
                    149:                 // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
                    150:                 in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
                    151:                 in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
                    152:             } else {
                    153:                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                    154:                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
                    155:             }
                    156: 
                     157:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
                     158:             // Fastest option, even though it's just a multiply by 1
                    159:             __m128i mul_one = _mm_set1_epi8(1);
                    160:             __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
                    161:             __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
                    162: 
                     163:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
                    164:             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
                    165:             __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
                    166:             __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
                    167: 
                    168:             // s2 += 32*s1
                    169:             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
                    170: 
                    171:             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
                    172:             // Shifting left, then shifting right again and shuffling (rather than just
                    173:             // shifting right as with mul32 below) to cheaply end up with the correct sign
                    174:             // extension as we go from int16 to int32.
                    175:             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
                    176:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
                    177:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
                    178:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
                    179:             sum_add32 = _mm_srai_epi32(sum_add32, 16);
                    180:             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
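                                      // To see the trick on a smaller scale: with four int16 lanes [a, b, c, d]
                                      // (low to high), an analogous shift-and-add sequence gives
                                      //   x += x << (1 lane) : [a, a+b, b+c, c+d]
                                      //   x += x << (2 lanes): [a, a+b, a+b+c, a+b+c+d]
                                      // so the highest lane ends up holding the full sum, which the srai/shuffle
                                      // pair above sign-extends and moves into the low int32 lane.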
                    181: 
                    182:             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
                    183:             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
                    184:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
                    185:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
                    186:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
                    187:             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
                    188:             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
                    189: 
                    190:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
                    191:             ss1 = _mm_add_epi32(ss1, sum_add32);
                    192: 
                    193:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
                    194:             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
                    195: 
                    196:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
                    197:             // We could've combined this with generating sum_add32 above and
                     198:             // saved an instruction, but benchmarking shows that to be slower
                    199:             __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
                    200: 
                    201:             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
                    202:             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
                    203: 
                    204:             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
                    205:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
                    206:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
                    207: 
                    208:             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
                    209:             ss2 = _mm_add_epi32(ss2, mul32);
                    210: 
                    211: #if CHAR_OFFSET != 0
                    212:             // s1 += 32*CHAR_OFFSET
                    213:             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
                    214:             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
                    215: 
                    216:             // s2 += 528*CHAR_OFFSET
                    217:             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
                    218:             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
                    219: #endif
                    220:         }
                    221: 
                    222:         _mm_store_si128((__m128i_u*)x, ss1);
                    223:         *ps1 = x[0];
                    224:         _mm_store_si128((__m128i_u*)x, ss2);
                    225:         *ps2 = x[0];
                    226:     }
                    227:     return i;
                    228: }
                    229: 
                    230: /*
                    231:   Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
                    232:   For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
                    233:   target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
                    234:   to a near 50% performance drop.
                    235:  */
                    236: __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
                    237: {
                    238:     if (len > 32) {
                    239:         int aligned = ((uintptr_t)buf & 15) == 0;
                    240: 
                    241:         uint32 x[4] = {0};
                    242:         x[0] = *ps1;
                    243:         __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
                    244:         x[0] = *ps2;
                    245:         __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
                    246: 
                    247:         const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
                    248:         __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
                    249: 
                    250:         for (; i < (len-32); i+=32) {
                    251:             // Load ... 2*[int8*16]
                    252:             __m128i in8_1, in8_2;
                    253:             if (!aligned) {
                    254:                 in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
                    255:                 in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
                    256:             } else {
                    257:                 in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                    258:                 in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
                    259:             }
                    260: 
                     261:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
                     262:             // Fastest option, even though it's just a multiply by 1
                    263:             __m128i mul_one = _mm_set1_epi8(1);
                    264:             __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
                    265:             __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
                    266: 
                     267:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
                    268:             __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
                    269:             __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
                    270:             __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
                    271: 
                    272:             // s2 += 32*s1
                    273:             ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
                    274: 
                    275:             // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
                    276:             // Shifting left, then shifting right again and shuffling (rather than just
                    277:             // shifting right as with mul32 below) to cheaply end up with the correct sign
                    278:             // extension as we go from int16 to int32.
                    279:             __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
                    280:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
                    281:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
                    282:             sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
                    283:             sum_add32 = _mm_srai_epi32(sum_add32, 16);
                    284:             sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
                    285: 
                    286:             // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
                    287:             __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
                    288:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
                    289:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
                    290:             sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
                    291:             sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
                    292:             sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
                    293: 
                    294:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
                    295:             ss1 = _mm_add_epi32(ss1, sum_add32);
                    296: 
                    297:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
                    298:             ss2 = _mm_add_epi32(ss2, sum_mul_add32);
                    299: 
                    300:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
                    301:             // We could've combined this with generating sum_add32 above and
                     302:             // saved an instruction, but benchmarking shows that to be slower
                    303:             __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
                    304: 
                    305:             // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
                    306:             __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
                    307: 
                    308:             // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
                    309:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
                    310:             mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
                    311: 
                    312:             // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
                    313:             ss2 = _mm_add_epi32(ss2, mul32);
                    314: 
                    315: #if CHAR_OFFSET != 0
                    316:             // s1 += 32*CHAR_OFFSET
                    317:             __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
                    318:             ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
                    319: 
                    320:             // s2 += 528*CHAR_OFFSET
                    321:             char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
                    322:             ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
                    323: #endif
                    324:         }
                    325: 
                    326:         _mm_store_si128((__m128i_u*)x, ss1);
                    327:         *ps1 = x[0];
                    328:         _mm_store_si128((__m128i_u*)x, ss2);
                    329:         *ps2 = x[0];
                    330:     }
                    331:     return i;
                    332: }
                    333: 
                    334: /*
                    335:   AVX2 loop per 64 bytes:
                    336:     int16 t1[16];
                    337:     int16 t2[16];
                    338:     for (int j = 0; j < 16; j++) {
                    339:       t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
                    340:       t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
                    341:     }
                    342:     s2 += 64*s1 + (uint32)(
                    343:               60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
                    344:               t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
                    345:           ) + 2080*CHAR_OFFSET;
                    346:     s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
                    347:           64*CHAR_OFFSET;
                    348:  */
                    349: __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
                    350: {
                    351:     if (len > 64) {
                    352:         // Instructions reshuffled compared to SSE2 for slightly better performance
                    353:         int aligned = ((uintptr_t)buf & 31) == 0;
                    354: 
                    355:         uint32 x[8] = {0};
                    356:         x[0] = *ps1;
                    357:         __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
                    358:         x[0] = *ps2;
                    359:         __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);
                    360: 
                     361:         // The coefficient order is permuted compared to SSE2 (see the note below the load)
                    362:         const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
                    363:         __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);
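                                  // For reference: after the _mm256_hadds_epi16 in the loop below, the 16 int16
                                  // lanes of add16 hold the 4-byte group sums in the order
                                  //   t1[0..3], t1[8..11], t1[4..7], t1[12..15]
                                  // because the 256-bit hadd pairs values within each 128-bit lane, which is why
                                  // the weights above are stored in that same permuted order.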
                    364: 
                    365:         for (; i < (len-64); i+=64) {
                    366:             // Load ... 2*[int8*32]
                    367:             __m256i in8_1, in8_2;
                    368:             if (!aligned) {
                    369:                 in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
                    370:                 in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
                    371:             } else {
                    372:                 in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
                    373:                 in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
                    374:             }
                    375: 
                    376:             // Prefetch for next loops. This has no observable effect on the
                    377:             // tested AMD but makes as much as 20% difference on the Intel.
                    378:             // Curiously that same Intel sees no benefit from this with SSE2
                     379:             // Curiously, that same Intel sees no benefit from this with SSE2
                    380:             _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
                    381:             _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
                    382:             _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
                    383:             _mm_prefetch(&buf[i + 160], _MM_HINT_T0);
                    384: 
                     385:             // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
                     386:             // Fastest option, even though it's just a multiply by 1
                    387:             __m256i mul_one = _mm256_set1_epi8(1);
                    388:             __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
                    389:             __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
                    390: 
                     391:             // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*16]
                    392:             __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
                    393:             __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
                    394:             __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
                    395: 
                    396:             // s2 += 64*s1
                    397:             ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));
                    398: 
                    399:             // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
                    400:             __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
                    401: 
                    402:             // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
                    403:             __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
                    404: 
                    405:             // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
                    406:             __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
                    407:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
                    408:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
                    409:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
                    410:             sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
                    411:             sum_add32 = _mm256_srai_epi32(sum_add32, 16);
                    412:             sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);
                    413: 
                    414:             // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
                    415:             ss1 = _mm256_add_epi32(ss1, sum_add32);
                    416: 
                    417:             // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
                    418:             __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
                    419:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
                    420:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
                    421:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
                    422:             sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
                    423:             sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
                    424:             sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);
                    425: 
                    426:             // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
                    427:             ss2 = _mm256_add_epi32(ss2, sum_mul_add32);
                    428: 
                     429:             // [sum(mul32), X, X, X, X, X, X, X] [int32*8]; the permute first folds the high 128-bit lane onto the low one
                    430:             mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
                    431:             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
                    432:             mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
                    433: 
                    434:             // s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
                    435:             ss2 = _mm256_add_epi32(ss2, mul32);
                    436: 
                    437: #if CHAR_OFFSET != 0
                    438:             // s1 += 64*CHAR_OFFSET
                    439:             __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
                    440:             ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);
                    441: 
                    442:             // s2 += 2080*CHAR_OFFSET
                    443:             char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
                    444:             ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
                    445: #endif
                    446:         }
                    447: 
                    448:         _mm256_store_si256((__m256i_u*)x, ss1);
                    449:         *ps1 = x[0];
                    450:         _mm256_store_si256((__m256i_u*)x, ss2);
                    451:         *ps2 = x[0];
                    452:     }
                    453:     return i;
                    454: }
                    455: 
                    456: static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
                    457: {
                    458:     uint32 s1 = *ps1;
                    459:     uint32 s2 = *ps2;
                    460:     for (; i < (len-4); i+=4) {
                    461:         s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
                    462:         s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
                    463:     }
                    464:     for (; i < len; i++) {
                    465:         s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
                    466:     }
                    467:     *ps1 = s1;
                    468:     *ps2 = s2;
                    469:     return i;
                    470: }
                    471: 
                    472: /* With GCC 10 putting this implementation inside 'extern "C"' causes an
                    473:    assembler error. That worked fine on GCC 5-9 and clang 6-10...
                    474:   */
                    475: static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
                    476: {
                    477:     int32 i = 0;
                    478:     uint32 s1 = 0;
                    479:     uint32 s2 = 0;
                    480: 
                    481:     // multiples of 64 bytes using AVX2 (if available)
                    482:     i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
                    483: 
                    484:     // multiples of 32 bytes using SSSE3 (if available)
                    485:     i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
                    486: 
                    487:     // multiples of 32 bytes using SSE2 (if available)
                    488:     i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
                    489: 
                    490:     // whatever is left
                    491:     i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
                    492: 
                    493:     return (s1 & 0xffff) + (s2 << 16);
                    494: }
                    495: 
                    496: extern "C" {
                    497: 
                    498: uint32 get_checksum1(char *buf1, int32 len)
                    499: {
                    500:     return get_checksum1_cpp(buf1, len);
                    501: }
                    502: 
                    503: #if !defined(BENCHMARK_SIMD_CHECKSUM1)
                    504: 
                    505: // see simd-md5-parallel-x86_64.cpp
                    506: extern int md5_parallel_slots();
                    507: extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
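                          // As used below: md5_parallel() hashes up to 'streams' independent buffers at
                          // once (buf/len/sum are parallel arrays), optionally mixing a 4-byte seed into
                          // each stream before (pre4) or after (post4) the data, and returns nonzero on
                          // success -- see simd-md5-parallel-x86_64.cpp for the implementation.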
                    508: 
                    509: #endif /* !BENCHMARK_SIMD_CHECKSUM1 */
                    510: 
                    511: #if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
                    512: 
                     513: #define PREFETCH_ENABLE 1 // comment out to disable prefetching when debugging
                    514: 
                    515: #if 0 // debugging
                    516: #define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
                    517: #else
                     518: #define PREFETCH_PRINTF(f_, ...) (void)0
                    519: #endif
                    520: 
                    521: #define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
                    522: #define PREFETCH_MAX_BLOCKS 8
                    523: 
                    524: typedef struct {
                    525:     int in_use;
                    526:     OFF_T offset;
                    527:     int32 len;
                    528:     char sum[SUM_LENGTH];
                    529: } prefetch_sum_t;
                    530: 
                    531: typedef struct {
                    532:     struct map_struct *map;
                    533:     OFF_T len;
                    534:     OFF_T last;
                    535:     int32 blocklen;
                    536:     int blocks;
                    537:     prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
                    538: } prefetch_t;
                    539: 
                    540: prefetch_t *prefetch;
                    541: 
                    542: extern int xfersum_type;
                    543: extern int checksum_seed;
                    544: extern int proper_seed_order;
                    545: extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
                    546: 
                    547: extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
                    548: 
                    549: void checksum2_disable_prefetch()
                    550: {
                    551:     if (prefetch) {
                    552:         PREFETCH_PRINTF("checksum2_disable_prefetch\n");
                    553:         free(prefetch);
                    554:         prefetch = NULL;
                    555:     }
                    556: }
                    557: 
                    558: void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
                    559: {
                    560: #ifdef PREFETCH_ENABLE
                    561:     checksum2_disable_prefetch();
                    562:     int slots = md5_parallel_slots();
                    563:     if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
                    564:         prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
                    565:         memset(prefetch, 0, sizeof(prefetch_t));
                    566:         prefetch->map = map;
                    567:         prefetch->len = len;
                    568:         prefetch->last = 0;
                    569:         prefetch->blocklen = blocklen;
                    570:         prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
                    571:         PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
                    572:     }
                    573: #endif
                    574: }
                    575: 
                    576: static inline void checksum2_reset_prefetch()
                    577: {
                    578:     for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
                    579:         prefetch->sums[i].in_use = 0;
                    580:     }
                    581: }
                    582: 
                    583: static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
                    584: {
                    585:     if (prefetch->sums[0].in_use) {
                    586:         if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
                    587:             memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
                    588:             for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
                    589:                 prefetch->sums[i] = prefetch->sums[i + 1];
                    590:             }
                    591:             prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
                    592:             PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
                    593:             return 1;
                    594:         } else {
                    595:             // unexpected access, reset cache
                    596:             PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
                    597:             checksum2_reset_prefetch();
                    598:         }
                    599:     }
                    600:     return 0;
                    601: }
                    602: 
                    603: static int checksum2_perform_prefetch(OFF_T prefetch_offset)
                    604: {
                    605:     int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
                    606:     if (blocks < 2) return 0; // fall through to non-simd, probably faster
                    607: 
                    608:     int32 total = 0;
                    609:     int i;
                    610:     for (i = 0; i < blocks; i++) {
                    611:         prefetch->sums[i].offset = prefetch_offset + total;
                    612:         prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
                    613:         prefetch->sums[i].in_use = 0;
                    614:         total += prefetch->sums[i].len;
                    615:     }
                    616:     for (; i < PREFETCH_MAX_BLOCKS; i++) {
                    617:         prefetch->sums[i].in_use = 0;
                    618:     }
                    619: 
                    620:     uchar seedbuf[4];
                    621:     SIVALu(seedbuf, 0, checksum_seed);
                    622: 
                    623:     PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
                    624:     char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
                    625:     char* bufs[PREFETCH_MAX_BLOCKS] = {0};
                    626:     int lens[PREFETCH_MAX_BLOCKS] = {0};
                    627:     char* sums[PREFETCH_MAX_BLOCKS] = {0};
                    628:     for (i = 0; i < blocks; i++) {
                    629:         bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
                    630:         lens[i] = prefetch->sums[i].len;
                    631:         sums[i] = prefetch->sums[i].sum;
                    632:     }
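                              // The seed is hashed before each block when proper_seed_order is set and after
                              // it otherwise, which the pre4/post4 split below presumably mirrors so that the
                              // parallel path produces the same sums as get_checksum2_nosimd().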
                    633:     if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
                    634:         for (i = 0; i < blocks; i++) {
                    635:             prefetch->sums[i].in_use = 1;
                    636:         }
                    637:         return 1;
                    638:     } else {
                     639:             // this should never happen; abort prefetching
                    640:         PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
                    641:         checksum2_disable_prefetch();
                    642:     }
                    643:     return 0;
                    644: }
                    645: 
                    646: void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
                    647: {
                    648:     if (prefetch) {
                    649:         PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
                    650:         OFF_T last = prefetch->last;
                    651:         prefetch->last = prefetch_offset;
                    652:         if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
                     653:             // we're seeking around trying to align blocks; prefetching would only slow things down
                    654:             PREFETCH_PRINTF("get_checksum2 SEEK\n");
                    655:             checksum2_reset_prefetch();
                    656:         } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
                    657:             // hit
                    658:             return;
                    659:         } else if (checksum2_perform_prefetch(prefetch_offset)) {
                    660:             if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
                     661:                 // hit; this should always succeed since we just fetched this data
                    662:                 return;
                    663:             } else {
                     664:                 // this should never happen; abort prefetching
                    665:                 PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
                    666:                 checksum2_disable_prefetch();
                    667:             }
                    668:         }
                    669:     }
                    670:     get_checksum2_nosimd(buf, len, sum, prefetch_offset);
                    671: }
                    672: #endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
                    673: 
                    674: } // "C"
                    675: 
                    676: /* Benchmark compilation
                    677: 
                    678:   The get_checksum1() benchmark runs through all available code paths in a
                     679:   single execution, while the get_checksum2()/MD5 and MD5P8 benchmark needs to be
                    680:   recompiled for each code path (it always uses the fastest path available
                    681:   on the current CPU otherwise). Note that SSE2/AVX2 MD5 optimizations will
                    682:   be used when applicable regardless of rsync being built with OpenSSL.
                    683: 
                    684:   Something like the following should compile and run the benchmarks:
                    685: 
                    686:   # if gcc
                    687:   export CC=gcc
                    688:   export CXX=g++
                    689:   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
                    690: 
                    691:   # else if clang
                    692:   export CC=clang
                    693:   export CXX=clang++
                    694:   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
                    695: 
                    696:   # /if
                    697: 
                    698:   export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
                    699:   export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
                    700:   export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
                    701:   export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
                    702: 
                    703:   rm bench_csum*
                    704: 
                    705:   ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
                    706: 
                    707:   $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
                    708: 
                    709:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
                    710:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
                    711: 
                    712:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
                    713:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
                    714: 
                    715:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
                    716:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
                    717: 
                    718:   ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
                    719: 
                    720:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
                    721:   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
                    722: 
                    723:   ./bench_csum1.all
                    724:   ./bench_csum2.asm
                    725:   ./bench_csum2.openssl
                    726:   ./bench_csum2.sse2
                    727:   ./bench_csum2.avx2
                    728: 
                    729:  */
                    730: 
                    731: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
                    732: #pragma clang optimize off
                    733: #pragma GCC push_options
                    734: #pragma GCC optimize ("O0")
                    735: 
                    736: #define ROUNDS 1024
                    737: #define BLOCK_LEN 1024*1024
                    738: 
                    739: #ifndef CLOCK_MONOTONIC_RAW
                    740: #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
                    741: #endif
                    742: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
                    743: 
                    744: #ifdef BENCHMARK_SIMD_CHECKSUM1
                    745: static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
                    746:     struct timespec start, end;
                    747:     uint64_t us;
                    748:     uint32_t cs, s1, s2;
                    749:     int i, next;
                    750: 
                    751:     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
                    752:     for (i = 0; i < ROUNDS; i++) {
                    753:         s1 = s2 = 0;
                    754:         next = func((schar*)buf, len, 0, &s1, &s2);
                    755:         get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
                    756:     }
                    757:     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
                    758:     us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
                    759:     cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
                    760:     printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
                    761: }
                    762: 
                    763: static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
                    764:     uint32 cs = get_checksum1((char*)buf, len);
                    765:     *ps1 = cs & 0xffff;
                    766:     *ps2 = cs >> 16;
                    767:     return len;
                    768: }
                    769: 
                    770: int main() {
                    771:     int i;
                    772:     unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
                    773:     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
                    774: 
                    775:     benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
                    776:     benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
                    777:     benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
                    778:     benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
                    779:     benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
                    780: 
                    781:     free(buf);
                    782:     return 0;
                    783: }
                    784: #endif /* BENCHMARK_SIMD_CHECKSUM1 */
                    785: 
                    786: #ifdef BENCHMARK_SIMD_CHECKSUM2
                    787: static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
                    788:     struct timespec start, end;
                    789:     uint64_t us;
                    790:     unsigned char cs1[16];
                    791:     unsigned char cs2[16];
                    792:     int i;
                    793: 
                    794:     clock_gettime(CLOCK_MONOTONIC_RAW, &start);
                    795:     for (i = 0; i < ROUNDS; i++) {
                    796:         func(buf, len, (char*)cs1);
                    797:     }
                    798:     clock_gettime(CLOCK_MONOTONIC_RAW, &end);
                    799:     us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
                    800: 
                    801:     func2(buf, len, (char*)cs2);
                    802: 
                    803:     float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
                    804:     printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
                    805:     for (i = 0; i < 16; i++) {
                    806:         printf("%02x", cs1[i] & 0xFF);
                    807:     }
                    808:     printf(" :: ");
                    809:     for (i = 0; i < 16; i++) {
                    810:         printf("%02x", cs2[i] & 0xFF);
                    811:     }
                    812:     printf("\n");
                    813: }
                    814: 
                    815: static void benchmark_inner(char* buf, int32 len, char* sum_out) {
                     816:     // This should produce the same output for different optimization
                    817:     // levels, not the same as sanity_check()
                    818: 
                    819:     char* bufs[8] = {0};
                    820:     int lens[8] = {0};
                    821:     char* sums[8] = {0};
                    822: 
                    823:     bufs[0] = buf;
                    824:     lens[0] = len;
                    825:     sums[0] = sum_out;
                    826:     md5_parallel(1, bufs, lens, sums, NULL, NULL);
                    827: }
                    828: 
                    829: extern "C" {
                    830: extern void MD5P8_Init_c(MD5P8_CTX *ctx);
                    831: extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
                    832: extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
                    833: }
                    834: 
                    835: static void sanity_check(char* buf, int32 len, char* sum_out) {
                     836:     // This should produce the same output for different optimization
                    837:     // levels, not the same as benchmark_inner()
                    838:     if (md5_parallel_slots() <= 1) {
                    839:         MD5P8_CTX m5p8;
                    840:         MD5P8_Init_c(&m5p8);
                    841:         MD5P8_Update_c(&m5p8, (uchar *)buf, len);
                    842:         MD5P8_Final_c((uchar *)sum_out, &m5p8);
                    843:     } else {
                    844:         MD5P8_CTX m5p8;
                    845:         MD5P8_Init(&m5p8);
                    846:         MD5P8_Update(&m5p8, (uchar *)buf, len);
                    847:         MD5P8_Final((uchar *)sum_out, &m5p8);
                    848:     }
                    849: }
                    850: 
                    851: int main() {
                    852:     // This benchmarks the parallel MD5 checksum rather than get_checksum2()
                    853:     // as the latter would require compiling in a lot of rsync's code, but
                     854:     // it touches all the same internals, so the performance should be nearly
                    855:     // identical.
                    856: 
                    857:     int i;
                    858:     char* buf = (char*)malloc(BLOCK_LEN);
                    859:     for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
                    860: 
                    861:     const char* method = "?";
                    862:     switch (md5_parallel_slots()) {
                    863:         case 8: method = "AVX2"; break;
                    864:         case 4: method = "SSE2"; break;
                    865: #ifdef USE_OPENSSL
                    866:         case 1: method = "OpenSSL"; break;
                    867: #elif (CSUM_CHUNK == 64)
                    868:         case 1: method = "ASM"; break;
                    869: #else
                    870:         // this won't happen unless you modified code somewhere
                    871:         case 1: method = "Raw-C"; break;
                    872: #endif
                    873:     }
                    874: 
                    875:     benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
                    876: 
                    877:     free(buf);
                    878:     return 0;
                    879: }
                    880: #endif /* BENCHMARK_SIMD_CHECKSUM2 */
                    881: 
                    882: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
                    883: #pragma GCC pop_options
                    884: #pragma clang optimize on
                    885: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
                    886: 
                    887: #endif /* HAVE_SIMD */
                    888: #endif /* __cplusplus */
                    889: #endif /* __x86_64__ */
