
/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * The optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously, the AMD is slower with AVX2 than with SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 * GCC 4.x is not supported, to ease the configure.ac logic.
 *
 * ----
 *
 * get_checksum2() is optimized for the case where the selected transfer
 * checksum is MD5. MD5 can't be made significantly faster with SIMD
 * instructions than the assembly version already included, but SIMD
 * instructions can be used to hash multiple streams in parallel (see
 * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
 * block-matching algorithm hashes the blocks independently (in contrast to
 * the whole-file checksum), this method can be employed here.
 *
 * To avoid having to modify the core rsync sources significantly, a
 * prefetching strategy is used. When a checksum2 is requested, the code
 * reads ahead several blocks, creates the MD5 hashes for each block in
 * parallel, returns the hash for the first block, and caches the results
 * for the other blocks to return in future calls to get_checksum2().
 */

#ifdef __x86_64__
#ifdef __cplusplus

extern "C" {

#include "rsync.h"

}

#ifdef HAVE_SIMD

#include <immintrin.h>

/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

/* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
   These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
   as needed, but while this works perfectly with GCC, clang fails to inline those properly, leading to a near 50%
   performance drop - combining it with static and inline modifiers gets you linker errors and even compiler crashes...
*/

#define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
#define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
#define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
#define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))

#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
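
// For reference: SSE2_MADDUBS_EPI16(a, b) emulates _mm_maddubs_epi16(a, b), computing
// a[2k]*b[2k] + a[2k+1]*b[2k+1] per 16-bit lane with 'a' treated as unsigned bytes and
// 'b' as signed bytes (even/odd widening multiplies joined by a saturating add), while
// SSE2_HADDS_EPI16(a, b) emulates _mm_hadds_epi16(a, b), the saturating horizontal add
// of adjacent 16-bit pairs, built from the two interleave macros above.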

__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
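
// The "default"-target stubs above do no work and simply return i unchanged. Function
// multi-versioning selects the most capable variant of each helper - AVX2, SSSE3, SSE2
// or default - by dispatch priority (GCC 5) or runtime CPU detection (GCC 6+/clang; see
// the header comment), so get_checksum1_cpp() below can call all three unconditionally
// and any unsupported path simply falls through to the next one.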

/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
          32*CHAR_OFFSET;
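
  The CHAR_OFFSET factors follow the triangular-number pattern of the scalar loop:
  processing N bytes adds N*CHAR_OFFSET to s1 and (N*(N+1)/2)*CHAR_OFFSET to s2,
  hence 10 for N=4 above, 528 for N=32, and 2080 for N=64 in the AVX2 variant below.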
 */
__attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 32) {
        int aligned = ((uintptr_t)buf & 15) == 0;

        uint32 x[4] = {0};
        x[0] = *ps1;
        __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
        x[0] = *ps2;
        __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

        const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
        __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

        for (; i < (len-32); i+=32) {
            // Load ... 2*[int8*16]
            __m128i in8_1, in8_2;
            if (!aligned) {
                // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
                in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
            } else {
                in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
            }

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though it multiplies by 1
            __m128i mul_one = _mm_set1_epi8(1);
            __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
            __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
            __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
            __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);

            // s2 += 32*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            // Shifting left, then shifting right again and shuffling (rather than just
            // shifting right as with mul32 below) to cheaply end up with the correct sign
            // extension as we go from int16 to int32.
            __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
            sum_add32 = _mm_srai_epi32(sum_add32, 16);
            sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
            sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            ss1 = _mm_add_epi32(ss1, sum_add32);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            ss2 = _mm_add_epi32(ss2, sum_mul_add32);

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // saved an instruction, but benchmarking shows that to be slower
            __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m128i mul32 = _mm_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 32*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 528*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        // x is only 4-byte aligned, so use the unaligned store
        _mm_storeu_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_storeu_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}

/*
  Same as the SSSE3 version, but using the macros defined above to emulate SSSE3 calls that are not available with SSE2.
  For GCC only, the SSE2 and SSSE3 versions could be a single function calling other functions with the right
  target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly, leading
  to a near 50% performance drop.
 */
__attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 32) {
        int aligned = ((uintptr_t)buf & 15) == 0;

        uint32 x[4] = {0};
        x[0] = *ps1;
        __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
        x[0] = *ps2;
        __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

        const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
        __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

        for (; i < (len-32); i+=32) {
            // Load ... 2*[int8*16]
            __m128i in8_1, in8_2;
            if (!aligned) {
                in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
            } else {
                in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
                in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
            }

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
            // Fastest, even though it multiplies by 1
            __m128i mul_one = _mm_set1_epi8(1);
            __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
            __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
            __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
            __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);

            // s2 += 32*s1
            ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

            // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            // Shifting left, then shifting right again and shuffling (rather than just
            // shifting right as with mul32 below) to cheaply end up with the correct sign
            // extension as we go from int16 to int32.
            __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
            sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
            sum_add32 = _mm_srai_epi32(sum_add32, 16);
            sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

            // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
            __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
            sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
            sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
            ss1 = _mm_add_epi32(ss1, sum_add32);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
            ss2 = _mm_add_epi32(ss2, sum_mul_add32);

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
            // We could've combined this with generating sum_add32 above and
            // saved an instruction, but benchmarking shows that to be slower
            __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
            __m128i mul32 = _mm_madd_epi16(add16, mul_t1);

            // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
            mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

            // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
            ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 32*CHAR_OFFSET
            __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
            ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

            // s2 += 528*CHAR_OFFSET
            char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
            ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        // x is only 4-byte aligned, so use the unaligned store
        _mm_storeu_si128((__m128i_u*)x, ss1);
        *ps1 = x[0];
        _mm_storeu_si128((__m128i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}

/*
  AVX2 loop per 64 bytes:
    int16 t1[16];
    int16 t2[16];
    for (int j = 0; j < 16; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 64*s1 + (uint32)(
              60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
          ) + 2080*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
          64*CHAR_OFFSET;
 */
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    if (len > 64) {
        // Instructions reshuffled compared to SSE2 for slightly better performance
        int aligned = ((uintptr_t)buf & 31) == 0;

        uint32 x[8] = {0};
        x[0] = *ps1;
        __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
        x[0] = *ps2;
        __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);

        // The order gets shuffled compared to SSE2
        const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
        __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);

        for (; i < (len-64); i+=64) {
            // Load ... 2*[int8*32]
            __m256i in8_1, in8_2;
            if (!aligned) {
                in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
                in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
            } else {
                in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
                in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
            }

            // Prefetch for the next loops. This has no observable effect on the
            // tested AMD but makes as much as 20% difference on the Intel.
            // Curiously that same Intel sees no benefit from this with SSE2
            // or SSSE3.
            _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
            _mm_prefetch(&buf[i + 160], _MM_HINT_T0);

            // (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
            // Fastest, even though it multiplies by 1
            __m256i mul_one = _mm256_set1_epi8(1);
            __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
            __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);

            // (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*16]
            __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
            __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
            __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);

            // s2 += 64*s1
            ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));

            // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
            __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);

            // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
            __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);

            // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
            __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
            sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
            sum_add32 = _mm256_srai_epi32(sum_add32, 16);
            sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);

            // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
            ss1 = _mm256_add_epi32(ss1, sum_add32);

            // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
            __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
            sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
            sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
            sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);

            // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
            ss2 = _mm256_add_epi32(ss2, sum_mul_add32);

            // [sum(mul32), X, X, X, X, X, X, X] [int32*8]
            mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
            mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));

            // s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
            ss2 = _mm256_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
            // s1 += 64*CHAR_OFFSET
            __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
            ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);

            // s2 += 2080*CHAR_OFFSET
            char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
            ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
#endif
        }

        // x is only 4-byte aligned, so use the unaligned store
        _mm256_storeu_si256((__m256i_u*)x, ss1);
        *ps1 = x[0];
        _mm256_storeu_si256((__m256i_u*)x, ss2);
        *ps2 = x[0];
    }
    return i;
}

static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
    uint32 s1 = *ps1;
    uint32 s2 = *ps2;
    for (; i < (len-4); i+=4) {
        s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
        s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
    }
    for (; i < len; i++) {
        s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
    }
    *ps1 = s1;
    *ps2 = s2;
    return i;
}

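/* Each helper above consumes as many whole vector-width chunks of buf as it can,
   starting at index i, updates *ps1/*ps2, and returns the new i. Variants compiled
   for an instruction set the running CPU lacks resolve to the do-nothing "default"
   stubs, so the chained calls below are always safe and the scalar loop above
   handles whatever tail remains. */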
/* With GCC 10, putting this implementation inside 'extern "C"' causes an
   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 */
static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
{
    int32 i = 0;
    uint32 s1 = 0;
    uint32 s2 = 0;

    // multiples of 64 bytes using AVX2 (if available)
    i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);

    // multiples of 32 bytes using SSSE3 (if available)
    i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);

    // multiples of 32 bytes using SSE2 (if available)
    i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

    // whatever is left
    i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

    return (s1 & 0xffff) + (s2 << 16);
}

extern "C" {

uint32 get_checksum1(char *buf1, int32 len)
{
    return get_checksum1_cpp(buf1, len);
}

#if !defined(BENCHMARK_SIMD_CHECKSUM1)

// see simd-md5-parallel-x86_64.cpp
extern int md5_parallel_slots();
extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);

#endif /* !BENCHMARK_SIMD_CHECKSUM1 */

#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)

#define PREFETCH_ENABLE 1 // debugging

#if 0 // debugging
#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
#else
#define PREFETCH_PRINTF(f_, ...) (void)0;
#endif

#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
#define PREFETCH_MAX_BLOCKS 8

typedef struct {
    int in_use;
    OFF_T offset;
    int32 len;
    char sum[SUM_LENGTH];
} prefetch_sum_t;

typedef struct {
    struct map_struct *map;
    OFF_T len;
    OFF_T last;
    int32 blocklen;
    int blocks;
    prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
} prefetch_t;
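
/* The prefetch cache holds at most PREFETCH_MAX_BLOCKS block hashes. sums[0] is the
   block expected to be requested next; a hit shifts the remaining entries down by one,
   while a request for any other offset/length flushes the whole cache (see
   get_checksum2_prefetched() and checksum2_reset_prefetch() below). */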

prefetch_t *prefetch;

extern int xfersum_type;
extern int checksum_seed;
extern int proper_seed_order;
extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);

extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);

void checksum2_disable_prefetch()
{
    if (prefetch) {
        PREFETCH_PRINTF("checksum2_disable_prefetch\n");
        free(prefetch);
        prefetch = NULL;
    }
}

void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
{
#ifdef PREFETCH_ENABLE
    checksum2_disable_prefetch();
    int slots = md5_parallel_slots();
    if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
        prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
        memset(prefetch, 0, sizeof(prefetch_t));
        prefetch->map = map;
        prefetch->len = len;
        prefetch->last = 0;
        prefetch->blocklen = blocklen;
        prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
        PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
    }
#endif
}

static inline void checksum2_reset_prefetch()
{
    for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
        prefetch->sums[i].in_use = 0;
    }
}

static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
{
    if (prefetch->sums[0].in_use) {
        if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
            memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
            for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
                prefetch->sums[i] = prefetch->sums[i + 1];
            }
            prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
            PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
            return 1;
        } else {
            // unexpected access, reset the cache
            PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
            checksum2_reset_prefetch();
        }
    }
    return 0;
}

static int checksum2_perform_prefetch(OFF_T prefetch_offset)
{
    int blocks = MIN(MAX(1, (prefetch->len + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
    if (blocks < 2) return 0; // fall through to non-SIMD, probably faster

    int32 total = 0;
    int i;
    for (i = 0; i < blocks; i++) {
        prefetch->sums[i].offset = prefetch_offset + total;
        prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
        prefetch->sums[i].in_use = 0;
        total += prefetch->sums[i].len;
    }
    for (; i < PREFETCH_MAX_BLOCKS; i++) {
        prefetch->sums[i].in_use = 0;
    }

    uchar seedbuf[4];
    SIVALu(seedbuf, 0, checksum_seed);

    PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
    char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
    char* bufs[PREFETCH_MAX_BLOCKS] = {0};
    int lens[PREFETCH_MAX_BLOCKS] = {0};
    char* sums[PREFETCH_MAX_BLOCKS] = {0};
    for (i = 0; i < blocks; i++) {
        bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
        lens[i] = prefetch->sums[i].len;
        sums[i] = prefetch->sums[i].sum;
    }
    if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
        for (i = 0; i < blocks; i++) {
            prefetch->sums[i].in_use = 1;
        }
        return 1;
    } else {
        // this should never happen; abort prefetching
        PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
        checksum2_disable_prefetch();
    }
    return 0;
}

void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
{
    if (prefetch) {
        PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
        OFF_T last = prefetch->last;
        prefetch->last = prefetch_offset;
        if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
            // we're looking around trying to align blocks; prefetching would slow things down
            PREFETCH_PRINTF("get_checksum2 SEEK\n");
            checksum2_reset_prefetch();
        } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
            // hit
            return;
        } else if (checksum2_perform_prefetch(prefetch_offset)) {
            if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
                // hit; this should always be the case as we just fetched the data
                return;
            } else {
                // this should never happen; abort prefetching
                PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
                checksum2_disable_prefetch();
            }
        }
    }
    get_checksum2_nosimd(buf, len, sum, prefetch_offset);
}
#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */

} // extern "C"

/* Benchmark compilation

  The get_checksum1() benchmark runs through all available code paths in a
  single execution, while the get_checksum2()/MD5 and MD5P8 benchmark needs to
  be recompiled for each code path (otherwise it always uses the fastest path
  available on the current CPU). Note that SSE2/AVX2 MD5 optimizations will
  be used when applicable regardless of whether rsync is built with OpenSSL.

  Something like the following should compile and run the benchmarks:

  # if gcc
  export CC=gcc
  export CXX=g++
  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"

  # else if clang
  export CC=clang
  export CXX=clang++
  export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"

  # /if

  export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
  export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
  export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
  export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"

  rm bench_csum*

  ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4

  $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all

  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm

  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2

  $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2

  ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4

  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
  $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto

  ./bench_csum1.all
  ./bench_csum2.asm
  ./bench_csum2.openssl
  ./bench_csum2.sse2
  ./bench_csum2.avx2

 */

#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
#pragma clang optimize off
#pragma GCC push_options
#pragma GCC optimize ("O0")

#define ROUNDS 1024
#define BLOCK_LEN 1024*1024

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif
#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */

#ifdef BENCHMARK_SIMD_CHECKSUM1
static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
    struct timespec start, end;
    uint64_t us;
    uint32_t cs, s1, s2;
    int i, next;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    for (i = 0; i < ROUNDS; i++) {
        s1 = s2 = 0;
        next = func((schar*)buf, len, 0, &s1, &s2);
        get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
    }
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
    us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
    cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
    printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
}

static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
    uint32 cs = get_checksum1((char*)buf, len);
    *ps1 = cs & 0xffff;
    *ps2 = cs >> 16;
    return len;
}

int main() {
    int i;
    unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

    benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
    benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
    benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
    benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
    benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);

    free(buf);
    return 0;
}
#endif /* BENCHMARK_SIMD_CHECKSUM1 */

#ifdef BENCHMARK_SIMD_CHECKSUM2
static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
    struct timespec start, end;
    uint64_t us;
    unsigned char cs1[16];
    unsigned char cs2[16];
    int i;

    clock_gettime(CLOCK_MONOTONIC_RAW, &start);
    for (i = 0; i < ROUNDS; i++) {
        func(buf, len, (char*)cs1);
    }
    clock_gettime(CLOCK_MONOTONIC_RAW, &end);
    us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;

    func2(buf, len, (char*)cs2);

    float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
    printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
    for (i = 0; i < 16; i++) {
        printf("%02x", cs1[i] & 0xFF);
    }
    printf(" :: ");
    for (i = 0; i < 16; i++) {
        printf("%02x", cs2[i] & 0xFF);
    }
    printf("\n");
}

static void benchmark_inner(char* buf, int32 len, char* sum_out) {
    // This should produce the same output across different optimization
    // levels, but not the same as sanity_check()

    char* bufs[8] = {0};
    int lens[8] = {0};
    char* sums[8] = {0};

    bufs[0] = buf;
    lens[0] = len;
    sums[0] = sum_out;
    md5_parallel(1, bufs, lens, sums, NULL, NULL);
}

extern "C" {
extern void MD5P8_Init_c(MD5P8_CTX *ctx);
extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
}

static void sanity_check(char* buf, int32 len, char* sum_out) {
    // This should produce the same output across different optimization
    // levels, but not the same as benchmark_inner()
    if (md5_parallel_slots() <= 1) {
        MD5P8_CTX m5p8;
        MD5P8_Init_c(&m5p8);
        MD5P8_Update_c(&m5p8, (uchar *)buf, len);
        MD5P8_Final_c((uchar *)sum_out, &m5p8);
    } else {
        MD5P8_CTX m5p8;
        MD5P8_Init(&m5p8);
        MD5P8_Update(&m5p8, (uchar *)buf, len);
        MD5P8_Final((uchar *)sum_out, &m5p8);
    }
}

int main() {
    // This benchmarks the parallel MD5 checksum rather than get_checksum2()
    // as the latter would require compiling in a lot of rsync's code, but
    // it touches all the same internals so the performance should be nearly
    // identical.

    int i;
    char* buf = (char*)malloc(BLOCK_LEN);
    for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

    const char* method = "?";
    switch (md5_parallel_slots()) {
        case 8: method = "AVX2"; break;
        case 4: method = "SSE2"; break;
#ifdef USE_OPENSSL
        case 1: method = "OpenSSL"; break;
#elif (CSUM_CHUNK == 64)
        case 1: method = "ASM"; break;
#else
        // this won't happen unless you modified code somewhere
        case 1: method = "Raw-C"; break;
#endif
    }

    benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());

    free(buf);
    return 0;
}
#endif /* BENCHMARK_SIMD_CHECKSUM2 */

#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
#pragma GCC pop_options
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */

#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */
