/*
 * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
 *
 * Copyright (C) 1996 Andrew Tridgell
 * Copyright (C) 1996 Paul Mackerras
 * Copyright (C) 2004-2020 Wayne Davison
 * Copyright (C) 2020 Jorrit Jongma
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, visit the http://fsf.org website.
 */
/*
 * Optimization target for get_checksum1() was the Intel Atom D2700, the
 * slowest CPU in the test set and the most likely to be CPU limited during
 * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
 * faster on the others.
 *
 * While on more modern CPUs transfers are less likely to be CPU limited
 * (at least by this specific function), lower CPU usage is always better.
 * Improvements may still be seen when matching chunks from NVMe storage
 * even on newer CPUs.
 *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
 *
 * Curiously, the AMD is slower with AVX2 than SSSE3, while the Intel is
 * significantly faster. AVX2 is kept because it's more likely to relieve
 * the bottleneck on the slower CPU.
 *
 * This optimization for get_checksum1() is intentionally limited to x86-64
 * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
 * the available xmm registers, this optimized version may not be faster than
 * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
 *
 * This file is compiled using GCC 4.8+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 * GCC 4.x is not supported, to keep the configure.ac logic simple.
 *
 * ----
 *
 * get_checksum2() is optimized for the case where the selected transfer
 * checksum is MD5. MD5 can't be made significantly faster with SIMD
 * instructions than the assembly version already included, but SIMD
 * instructions can be used to hash multiple streams in parallel (see
 * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
 * block-matching algorithm hashes the blocks independently (in contrast to
 * the whole-file checksum), this method can be employed here.
 *
 * To avoid modifying the core rsync sources significantly, a
 * prefetching strategy is used. When a checksum2 is requested, the code
 * reads ahead several blocks, creates the MD5 hashes for each block in
 * parallel, returns the hash for the first block, and caches the results
 * for the other blocks to return in future calls to get_checksum2().
 */
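
/*
 * (Overview added for clarity) Call flow of the prefetching path implemented
 * below:
 *   get_checksum2()
 *     -> get_checksum2_prefetched()    cache hit: return the stored digest
 *     -> checksum2_perform_prefetch()  cache miss: hash the next blocks in parallel
 *     -> get_checksum2_nosimd()        fallback when prefetching is off or fails
 */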

#ifdef __x86_64__
#ifdef __cplusplus

extern "C" {

#include "rsync.h"

}

#ifdef HAVE_SIMD

#include <immintrin.h>

/* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
#ifdef __clang__
#define MVSTATIC
#else
#define MVSTATIC static
#endif

// Missing from the headers on gcc 6 and older, clang 8 and older
typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));

/* Compatibility macros that let the SSSE3 algorithm below run with only SSE2.
   These used to be neat individual functions with target attributes switching
   between SSE2 and SSSE3 implementations as needed. That works perfectly with
   GCC, but clang fails to inline them properly, causing a near 50% performance
   drop - and combining them with static and inline modifiers gets you linker
   errors and even compiler crashes...
 */

#define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
#define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
#define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
#define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))

#define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
#define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
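
/* (Note added for clarity) SSE2_MADDUBS_EPI16 emulates SSSE3's
   _mm_maddubs_epi16: multiply the unsigned bytes of `a` by the corresponding
   signed bytes of `b`, then add adjacent 16-bit products with signed
   saturation. SSE2_HADDS_EPI16 emulates _mm_hadds_epi16: a saturating
   horizontal add of adjacent 16-bit lanes of `a` and `b`. */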

__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
__attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
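
/* (Note added for clarity) These "default" versions are the fallbacks the
   function-multi-versioning resolver selects when the running CPU lacks the
   corresponding extension. They return `i` unchanged, so each stage in
   get_checksum1_cpp() below is simply a no-op on CPUs that can't use it. */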

/*
  Original loop per 4 bytes:
    s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
    s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;

  SSE2/SSSE3 loop per 32 bytes:
    int16 t1[8];
    int16 t2[8];
    for (int j = 0; j < 8; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 32*s1 + (uint32)(
              28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
          ) + 528*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
          32*CHAR_OFFSET;
 */
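
/* (Note added for clarity) The CHAR_OFFSET multipliers are triangular
   numbers: 10 == 1+2+3+4 per 4 bytes, 528 == 1+2+...+32 per 32 bytes, and
   2080 == 1+2+...+64 per 64 bytes in the AVX2 version below - the total
   weight CHAR_OFFSET accumulates in s2 across one unrolled block. */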
__attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				// Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
				in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest, even though multiplying by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
			__m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
			__m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that to be slower
			__m128i add16 = _mm_hadds_epi16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
		}

		// unaligned store: x is a uint32 array and not guaranteed 16-byte aligned
		_mm_storeu_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_storeu_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

/*
  Same as the SSSE3 version, but using the macros defined above to emulate SSSE3 calls that are not available with SSE2.
  With GCC alone, the SSE2 and SSSE3 versions could be a single function calling other functions with the right
  target attributes to emulate SSSE3 calls on SSE2 where needed, but clang doesn't inline those properly, leading
  to a near 50% performance drop.
 */
__attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 32) {
		int aligned = ((uintptr_t)buf & 15) == 0;

		uint32 x[4] = {0};
		x[0] = *ps1;
		__m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
		x[0] = *ps2;
		__m128i ss2 = _mm_loadu_si128((__m128i_u*)x);

		const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
		__m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);

		for (; i < (len-32); i+=32) {
			// Load ... 2*[int8*16]
			__m128i in8_1, in8_2;
			if (!aligned) {
				in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
			} else {
				in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
				in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
			}

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
			// Fastest, even though multiplying by 1
			__m128i mul_one = _mm_set1_epi8(1);
			__m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
			__m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
			__m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
			__m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);

			// s2 += 32*s1
			ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));

			// [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			// Shifting left, then shifting right again and shuffling (rather than just
			// shifting right as with mul32 below) to cheaply end up with the correct sign
			// extension as we go from int16 to int32.
			__m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
			sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
			sum_add32 = _mm_srai_epi32(sum_add32, 16);
			sum_add32 = _mm_shuffle_epi32(sum_add32, 3);

			// [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
			__m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
			sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
			sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);

			// s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
			ss1 = _mm_add_epi32(ss1, sum_add32);

			// s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
			ss2 = _mm_add_epi32(ss2, sum_mul_add32);

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
			// We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that to be slower
			__m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
			__m128i mul32 = _mm_madd_epi16(add16, mul_t1);

			// [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
			mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));

			// s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
			ss2 = _mm_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
			// s1 += 32*CHAR_OFFSET
			__m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
			ss1 = _mm_add_epi32(ss1, char_offset_multiplier);

			// s2 += 528*CHAR_OFFSET
			char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
			ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
#endif
		}

		// unaligned store: x is a uint32 array and not guaranteed 16-byte aligned
		_mm_storeu_si128((__m128i_u*)x, ss1);
		*ps1 = x[0];
		_mm_storeu_si128((__m128i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

/*
  AVX2 loop per 64 bytes:
    int16 t1[16];
    int16 t2[16];
    for (int j = 0; j < 16; j++) {
      t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
      t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
    }
    s2 += 64*s1 + (uint32)(
              60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] +
              28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
              t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] +
              t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
          ) + 2080*CHAR_OFFSET;
    s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] +
                   t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
          64*CHAR_OFFSET;
 */
__attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	if (len > 64) {
		// Instructions reshuffled compared to the SSE2 version for slightly better performance
		int aligned = ((uintptr_t)buf & 31) == 0;

		uint32 x[8] = {0};
		x[0] = *ps1;
		__m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
		x[0] = *ps2;
		__m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);

		// The order is shuffled compared to SSE2 because the horizontal ops
		// used below operate within each 128-bit lane
		const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
		__m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);

		for (; i < (len-64); i+=64) {
			// Load ... 2*[int8*32]
			__m256i in8_1, in8_2;
			if (!aligned) {
				in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
				in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
			} else {
				in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
				in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
			}

			// Prefetch for the next loops. This has no observable effect on the
			// tested AMD but makes as much as 20% difference on the Intel.
			// Curiously, that same Intel sees no benefit from this with SSE2
			// or SSSE3.
			_mm_prefetch(&buf[i + 64], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 96], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 128], _MM_HINT_T0);
			_mm_prefetch(&buf[i + 160], _MM_HINT_T0);

			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
			// Fastest, even though multiplying by 1
			__m256i mul_one = _mm256_set1_epi8(1);
			__m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
			__m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);

			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*16]
			__m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
			__m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
			__m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);

			// s2 += 64*s1
			ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));

			// [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
			__m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);

			// [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
			__m256i mul32 = _mm256_madd_epi16(add16, mul_t1);

			// [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
			__m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
			sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
			sum_add32 = _mm256_srai_epi32(sum_add32, 16);
			sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);

			// s1 += t1[0] + t1[1] + ... + t1[15]
			ss1 = _mm256_add_epi32(ss1, sum_add32);

			// [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
			__m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
			sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
			sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
			sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);

			// s2 += t2[0] + t2[1] + ... + t2[15]
			ss2 = _mm256_add_epi32(ss2, sum_mul_add32);

			// [sum(mul32), X, X, X, X, X, X, X] [int32*8]
			mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
			mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
			mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));

			// s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7]
			//     + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
			ss2 = _mm256_add_epi32(ss2, mul32);

#if CHAR_OFFSET != 0
			// s1 += 64*CHAR_OFFSET
			__m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
			ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);

			// s2 += 2080*CHAR_OFFSET
			char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
			ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
#endif
		}

		// unaligned store: x is a uint32 array and not guaranteed 32-byte aligned
		_mm256_storeu_si256((__m256i_u*)x, ss1);
		*ps1 = x[0];
		_mm256_storeu_si256((__m256i_u*)x, ss2);
		*ps2 = x[0];
	}
	return i;
}

static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
{
	uint32 s1 = *ps1;
	uint32 s2 = *ps2;
	for (; i < (len-4); i+=4) {
		s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
		s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
	}
	for (; i < len; i++) {
		s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
	}
	*ps1 = s1;
	*ps2 = s2;
	return i;
}

/* With GCC 10 putting this implementation inside 'extern "C"' causes an
   assembler error. That worked fine on GCC 5-9 and clang 6-10...
 */
static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
{
	int32 i = 0;
	uint32 s1 = 0;
	uint32 s2 = 0;

	// multiples of 64 bytes using AVX2 (if available)
	i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSSE3 (if available)
	i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);

	// multiples of 32 bytes using SSE2 (if available)
	i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);

	// whatever is left
	i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);

	return (s1 & 0xffff) + (s2 << 16);
}
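
/* (Note added for clarity) Each stage above consumes as many whole chunks as
   its instruction set supports and returns the updated index, while the
   "default" stubs pass `i` through untouched, so the stages compose cleanly:
   on an AVX2 machine the AVX2 loop does the bulk of the work and the scalar
   loop finishes the remaining tail bytes. */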

extern "C" {

uint32 get_checksum1(char *buf1, int32 len)
{
	return get_checksum1_cpp(buf1, len);
}

#if !defined(BENCHMARK_SIMD_CHECKSUM1)

// see simd-md5-parallel-x86_64.cpp
extern int md5_parallel_slots();
extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);

#endif /* !BENCHMARK_SIMD_CHECKSUM1 */

#if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)

#define PREFETCH_ENABLE 1 // debugging toggle: comment out to disable the prefetch path

#if 0 // debugging
#define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
#else
#define PREFETCH_PRINTF(f_, ...) (void)0;
#endif

#define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
#define PREFETCH_MAX_BLOCKS 8

typedef struct {
	int in_use;
	OFF_T offset;
	int32 len;
	char sum[SUM_LENGTH];
} prefetch_sum_t;

typedef struct {
	struct map_struct *map;
	OFF_T len;
	OFF_T last;
	int32 blocklen;
	int blocks;
	prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
} prefetch_t;
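
/* (Note added for clarity) sums[] acts as a small FIFO of precomputed block
   digests: sums[0] is always the block expected to be requested next, and
   get_checksum2_prefetched() shifts the array down after each hit. Any
   out-of-order request flushes the whole cache. */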

prefetch_t *prefetch;

extern int xfersum_type;
extern int checksum_seed;
extern int proper_seed_order;
extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);

extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);

void checksum2_disable_prefetch()
{
	if (prefetch) {
		PREFETCH_PRINTF("checksum2_disable_prefetch\n");
		free(prefetch);
		prefetch = NULL;
	}
}

void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
{
#ifdef PREFETCH_ENABLE
	checksum2_disable_prefetch();
	int slots = md5_parallel_slots();
	if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1
	 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
		prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
		if (!prefetch)
			return; // added check: on alloc failure get_checksum2() simply falls back to the non-SIMD path
		memset(prefetch, 0, sizeof(prefetch_t));
		prefetch->map = map;
		prefetch->len = len;
		prefetch->last = 0;
		prefetch->blocklen = blocklen;
		prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
		PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
	}
#endif
}

static inline void checksum2_reset_prefetch()
{
	for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
		prefetch->sums[i].in_use = 0;
	}
}

static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
{
	if (prefetch->sums[0].in_use) {
		if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
			memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
			for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
				prefetch->sums[i] = prefetch->sums[i + 1];
			}
			prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
			PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
			return 1;
		} else {
			// unexpected access, reset the cache
			PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
			checksum2_reset_prefetch();
		}
	}
	return 0;
}

static int checksum2_perform_prefetch(OFF_T prefetch_offset)
{
	// number of whole-or-partial blocks left from this offset, capped at the slot count
	// (computed from the remaining length so the per-block lens below can't go negative near EOF)
	int blocks = MIN(MAX(1, (prefetch->len - prefetch_offset + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
	if (blocks < 2) return 0; // fall through to the non-SIMD path, which is probably faster here

	int32 total = 0;
	int i;
	for (i = 0; i < blocks; i++) {
		prefetch->sums[i].offset = prefetch_offset + total;
		prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
		prefetch->sums[i].in_use = 0;
		total += prefetch->sums[i].len;
	}
	for (; i < PREFETCH_MAX_BLOCKS; i++) {
		prefetch->sums[i].in_use = 0;
	}

	uchar seedbuf[4];
	SIVALu(seedbuf, 0, checksum_seed);

	PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
	char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
	char* bufs[PREFETCH_MAX_BLOCKS] = {0};
	int lens[PREFETCH_MAX_BLOCKS] = {0};
	char* sums[PREFETCH_MAX_BLOCKS] = {0};
	for (i = 0; i < blocks; i++) {
		bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
		lens[i] = prefetch->sums[i].len;
		sums[i] = prefetch->sums[i].sum;
	}
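	// (Note added for clarity) md5_parallel() folds the 4-byte checksum seed
	// into each stream: hashed before the block data when proper_seed_order
	// is set (pre4), or appended after it otherwise (post4).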
	if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
		for (i = 0; i < blocks; i++) {
			prefetch->sums[i].in_use = 1;
		}
		return 1;
	} else {
		// this should never happen; abort
		PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
		checksum2_disable_prefetch();
	}
	return 0;
}

void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
{
	if (prefetch) {
		PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
		OFF_T last = prefetch->last;
		prefetch->last = prefetch_offset;
		if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
			// we're seeking around trying to align blocks; prefetching would slow things down
			PREFETCH_PRINTF("get_checksum2 SEEK\n");
			checksum2_reset_prefetch();
		} else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
			// hit
			return;
		} else if (checksum2_perform_prefetch(prefetch_offset)) {
			if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
				// hit; should always happen as we just fetched this data
				return;
			} else {
				// this should never happen; abort
				PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
				checksum2_disable_prefetch();
			}
		}
	}
	get_checksum2_nosimd(buf, len, sum, prefetch_offset);
}
#endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */

} // "C"

/* Benchmark compilation

   The get_checksum1() benchmark runs through all available code paths in a
   single execution; the get_checksum2()/MD5 and MD5P8 benchmark needs to be
   recompiled for each code path (otherwise it always uses the fastest path
   available on the current CPU). Note that the SSE2/AVX2 MD5 optimizations
   will be used when applicable regardless of rsync being built with OpenSSL.

   Something like the following should compile and run the benchmarks:

   # if gcc
   export CC=gcc
   export CXX=g++
   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"

   # else if clang
   export CC=clang
   export CXX=clang++
   export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"

   # /if

   export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
   export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
   export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
   export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"

   rm bench_csum*

   ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4

   $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all

   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm

   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2

   $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2

   ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4

   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
   $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto

   ./bench_csum1.all
   ./bench_csum2.asm
   ./bench_csum2.openssl
   ./bench_csum2.sse2
   ./bench_csum2.avx2

 */

#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
#pragma clang optimize off
#pragma GCC push_options
#pragma GCC optimize ("O0")

#define ROUNDS 1024
#define BLOCK_LEN (1024*1024)

#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
#endif
#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */

#ifdef BENCHMARK_SIMD_CHECKSUM1
static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
	struct timespec start, end;
	uint64_t us;
	uint32_t cs, s1, s2;
	int i, next;

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
	for (i = 0; i < ROUNDS; i++) {
		s1 = s2 = 0;
		next = func((schar*)buf, len, 0, &s1, &s2);
		get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
	}
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
	cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
	printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
}

static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
	uint32 cs = get_checksum1((char*)buf, len);
	*ps1 = cs & 0xffff;
	*ps2 = cs >> 16;
	return len;
}

int main() {
	int i;
	unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
	for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

	benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
	benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
	benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
	benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
	benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);

	free(buf);
	return 0;
}
#endif /* BENCHMARK_SIMD_CHECKSUM1 */

#ifdef BENCHMARK_SIMD_CHECKSUM2
static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
	struct timespec start, end;
	uint64_t us;
	unsigned char cs1[16];
	unsigned char cs2[16];
	int i;

	clock_gettime(CLOCK_MONOTONIC_RAW, &start);
	for (i = 0; i < ROUNDS; i++) {
		func(buf, len, (char*)cs1);
	}
	clock_gettime(CLOCK_MONOTONIC_RAW, &end);
	us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;

	func2(buf, len, (char*)cs2);

	float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
	printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
	for (i = 0; i < 16; i++) {
		printf("%02x", cs1[i] & 0xFF);
	}
	printf(" :: ");
	for (i = 0; i < 16; i++) {
		printf("%02x", cs2[i] & 0xFF);
	}
	printf("\n");
}

static void benchmark_inner(char* buf, int32 len, char* sum_out) {
	// This should produce the same output across different optimization
	// levels, but not the same as sanity_check()

	char* bufs[8] = {0};
	int lens[8] = {0};
	char* sums[8] = {0};

	bufs[0] = buf;
	lens[0] = len;
	sums[0] = sum_out;
	md5_parallel(1, bufs, lens, sums, NULL, NULL);
}

extern "C" {
extern void MD5P8_Init_c(MD5P8_CTX *ctx);
extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
}

static void sanity_check(char* buf, int32 len, char* sum_out) {
	// This should produce the same output across different optimization
	// levels, but not the same as benchmark_inner()
	if (md5_parallel_slots() <= 1) {
		MD5P8_CTX m5p8;
		MD5P8_Init_c(&m5p8);
		MD5P8_Update_c(&m5p8, (uchar *)buf, len);
		MD5P8_Final_c((uchar *)sum_out, &m5p8);
	} else {
		MD5P8_CTX m5p8;
		MD5P8_Init(&m5p8);
		MD5P8_Update(&m5p8, (uchar *)buf, len);
		MD5P8_Final((uchar *)sum_out, &m5p8);
	}
}

int main() {
	// This benchmarks the parallel MD5 checksum rather than get_checksum2()
	// as the latter would require compiling in a lot of rsync's code, but
	// it touches all the same internals so the performance should be nearly
	// identical.

	int i;
	char* buf = (char*)malloc(BLOCK_LEN);
	for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;

	const char* method = "?";
	switch (md5_parallel_slots()) {
		case 8: method = "AVX2"; break;
		case 4: method = "SSE2"; break;
#ifdef USE_OPENSSL
		case 1: method = "OpenSSL"; break;
#elif (CSUM_CHUNK == 64)
		case 1: method = "ASM"; break;
#else
		// this won't happen unless you modified code somewhere
		case 1: method = "Raw-C"; break;
#endif
	}

	benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());

	free(buf);
	return 0;
}
#endif /* BENCHMARK_SIMD_CHECKSUM2 */

#if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
#pragma GCC pop_options
#pragma clang optimize on
#endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */

#endif /* HAVE_SIMD */
#endif /* __cplusplus */
#endif /* __x86_64__ */