1: /*
2: * SSE2/SSSE3/AVX2-optimized routines to support checksumming of bytes.
3: *
4: * Copyright (C) 1996 Andrew Tridgell
5: * Copyright (C) 1996 Paul Mackerras
6: * Copyright (C) 2004-2020 Wayne Davison
7: * Copyright (C) 2020 Jorrit Jongma
8: *
9: * This program is free software; you can redistribute it and/or modify
10: * it under the terms of the GNU General Public License as published by
11: * the Free Software Foundation; either version 3 of the License, or
12: * (at your option) any later version.
13: *
14: * This program is distributed in the hope that it will be useful,
15: * but WITHOUT ANY WARRANTY; without even the implied warranty of
16: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17: * GNU General Public License for more details.
18: *
19: * You should have received a copy of the GNU General Public License along
20: * with this program; if not, visit the http://fsf.org website.
21: */
22: /*
23: * Optimization target for get_checksum1() was the Intel Atom D2700, the
24: * slowest CPU in the test set and the most likely to be CPU limited during
25: * transfers. The combination of intrinsics was chosen specifically for the
 * most gain on that CPU; other combinations were occasionally slightly
27: * faster on the others.
28: *
29: * While on more modern CPUs transfers are less likely to be CPU limited
30: * (at least by this specific function), lower CPU usage is always better.
31: * Improvements may still be seen when matching chunks from NVMe storage
32: * even on newer CPUs.
33: *
 * Benchmarks (in MB/s)            C    SSE2   SSSE3    AVX2
 * - Intel Atom D2700            550     750    1000     N/A
 * - Intel i7-7700hq            1850    2550    4050    6200
 * - AMD ThreadRipper 2950x     2900    5600    8950    8100
38: *
39: * Curiously the AMD is slower with AVX2 than SSSE3, while the Intel is
40: * significantly faster. AVX2 is kept because it's more likely to relieve
41: * the bottleneck on the slower CPU.
42: *
43: * This optimization for get_checksum1() is intentionally limited to x86-64
44: * as no 32-bit CPU was available for testing. As 32-bit CPUs only have half
45: * the available xmm registers, this optimized version may not be faster than
46: * the pure C version anyway. Note that all x86-64 CPUs support at least SSE2.
47: *
 * This file is compiled using GCC 5+/clang 6+'s C++ front end to allow the
 * use of the target attribute, selecting the fastest code path based on
 * dispatch priority (GCC 5) or runtime detection of CPU capabilities (GCC 6+).
 * GCC 4.x is not supported, to keep the configure.ac logic simple.
52: *
53: * ----
54: *
55: * get_checksum2() is optimized for the case where the selected transfer
56: * checksum is MD5. MD5 can't be made significantly faster with SIMD
 * instructions than the assembly version already included, but SIMD
58: * instructions can be used to hash multiple streams in parallel (see
59: * simd-md5-parallel-x86_64.cpp for details and benchmarks). As rsync's
60: * block-matching algorithm hashes the blocks independently (in contrast to
 * the whole-file checksum), this method can be employed here.
62: *
63: * To prevent needing to modify the core rsync sources significantly, a
64: * prefetching strategy is used. When a checksum2 is requested, the code
65: * reads ahead several blocks, creates the MD5 hashes for each block in
66: * parallel, returns the hash for the first block, and caches the results
67: * for the other blocks to return in future calls to get_checksum2().
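 *
 * Roughly, the flow implemented below (a sketch, not a formal spec):
 *
 *   get_checksum2(buf, len, sum, offset)
 *     - non-sequential offset: reset the cache and hash just this block
 *     - cached sum for (offset, len): return it
 *     - otherwise: md5_parallel() hashes the next few blocks in one pass,
 *       caches them, and the first block's sum is returned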
68: */
69:
70: #ifdef __x86_64__
71: #ifdef __cplusplus
72:
73: extern "C" {
74:
75: #include "rsync.h"
76:
77: }
78:
79: #ifdef HAVE_SIMD
80:
81: #include <immintrin.h>
82:
83: /* Some clang versions don't like it when you use static with multi-versioned functions: linker errors */
84: #ifdef __clang__
85: #define MVSTATIC
86: #else
87: #define MVSTATIC static
88: #endif
89:
90: // Missing from the headers on gcc 6 and older, clang 8 and older
91: typedef long long __m128i_u __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
92: typedef long long __m256i_u __attribute__((__vector_size__(32), __may_alias__, __aligned__(1)));
93:
/* Compatibility macros to let our SSSE3 algorithm run with only SSE2.
These used to be neat individual functions with target attributes switching between SSE2 and SSSE3 implementations
as needed, but while that works perfectly with GCC, clang fails to inline them properly, leading to a near 50%
performance drop. Combining that with static and inline modifiers gets you linker errors and even compiler crashes...
*/
99:
100: #define SSE2_INTERLEAVE_ODD_EPI16(a, b) _mm_packs_epi32(_mm_srai_epi32(a, 16), _mm_srai_epi32(b, 16))
101: #define SSE2_INTERLEAVE_EVEN_EPI16(a, b) SSE2_INTERLEAVE_ODD_EPI16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2))
102: #define SSE2_MULU_ODD_EPI8(a, b) _mm_mullo_epi16(_mm_srli_epi16(a, 8), _mm_srai_epi16(b, 8))
103: #define SSE2_MULU_EVEN_EPI8(a, b) _mm_mullo_epi16(_mm_and_si128(a, _mm_set1_epi16(0xFF)), _mm_srai_epi16(_mm_slli_si128(b, 1), 8))
104:
105: #define SSE2_HADDS_EPI16(a, b) _mm_adds_epi16(SSE2_INTERLEAVE_EVEN_EPI16(a, b), SSE2_INTERLEAVE_ODD_EPI16(a, b))
106: #define SSE2_MADDUBS_EPI16(a, b) _mm_adds_epi16(SSE2_MULU_EVEN_EPI8(a, b), SSE2_MULU_ODD_EPI8(a, b))
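
/* For reference, a scalar sketch of what SSE2_MADDUBS_EPI16(a, b) -- and the
   _mm_maddubs_epi16() it stands in for -- computes for each of the eight
   16-bit output lanes j:

       out[j] = saturate_int16((uint8)a[2*j]   * (int8)b[2*j]
                             + (uint8)a[2*j+1] * (int8)b[2*j+1]);

   i.e. unsigned bytes from the first operand times signed bytes from the
   second, with adjacent products summed. SSE2_HADDS_EPI16 similarly mirrors
   _mm_hadds_epi16(): out[0..3] are the saturating sums of adjacent lanes of
   a, out[4..7] those of b. */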
107:
108: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
109: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
110: __attribute__ ((target("default"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) { return i; }
111:
112: /*
113: Original loop per 4 bytes:
114: s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
115: s1 += buf[i] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET;
116:
117: SSE2/SSSE3 loop per 32 bytes:
118: int16 t1[8];
119: int16 t2[8];
120: for (int j = 0; j < 8; j++) {
121: t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
122: t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
123: }
124: s2 += 32*s1 + (uint32)(
125: 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6] +
126: t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
127: ) + 528*CHAR_OFFSET;
128: s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]) +
129: 32*CHAR_OFFSET;
130: */
131: __attribute__ ((target("ssse3"))) MVSTATIC int32 get_checksum1_ssse3_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
132: {
133: if (len > 32) {
134: int aligned = ((uintptr_t)buf & 15) == 0;
135:
136: uint32 x[4] = {0};
137: x[0] = *ps1;
138: __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
139: x[0] = *ps2;
140: __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
141:
142: const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
143: __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
144:
145: for (; i < (len-32); i+=32) {
146: // Load ... 2*[int8*16]
147: __m128i in8_1, in8_2;
148: if (!aligned) {
149: // Synonymous with _mm_loadu_si128 on all but a handful of old CPUs
150: in8_1 = _mm_lddqu_si128((__m128i_u*)&buf[i]);
151: in8_2 = _mm_lddqu_si128((__m128i_u*)&buf[i + 16]);
152: } else {
153: in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
154: in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
155: }
156:
			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
158: // Fastest, even though multiply by 1
159: __m128i mul_one = _mm_set1_epi8(1);
160: __m128i add16_1 = _mm_maddubs_epi16(mul_one, in8_1);
161: __m128i add16_2 = _mm_maddubs_epi16(mul_one, in8_2);
162:
			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
164: __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
165: __m128i mul_add16_1 = _mm_maddubs_epi16(mul_const, in8_1);
166: __m128i mul_add16_2 = _mm_maddubs_epi16(mul_const, in8_2);
167:
168: // s2 += 32*s1
169: ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
170:
171: // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
172: // Shifting left, then shifting right again and shuffling (rather than just
173: // shifting right as with mul32 below) to cheaply end up with the correct sign
174: // extension as we go from int16 to int32.
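			// In scalar terms (a sketch): the three shift-and-adds below build a
			// prefix sum of the eight int16 lanes, leaving the total in the topmost
			// lane; the arithmetic shift then sign-extends that lane to int32 and
			// the shuffle moves it to lane 0 -- the only lane that is read back
			// into *ps1 at the end.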
175: __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
176: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
177: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
178: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
179: sum_add32 = _mm_srai_epi32(sum_add32, 16);
180: sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
181:
182: // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
183: __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
184: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
185: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
186: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
187: sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
188: sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
189:
190: // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
191: ss1 = _mm_add_epi32(ss1, sum_add32);
192:
193: // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
194: ss2 = _mm_add_epi32(ss2, sum_mul_add32);
195:
196: // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
197: // We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that to be slower
199: __m128i add16 = _mm_hadds_epi16(add16_1, add16_2);
200:
201: // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
202: __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
203:
204: // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
205: mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
206: mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
207:
208: // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
209: ss2 = _mm_add_epi32(ss2, mul32);
210:
211: #if CHAR_OFFSET != 0
212: // s1 += 32*CHAR_OFFSET
213: __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
214: ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
215:
216: // s2 += 528*CHAR_OFFSET
217: char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
218: ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
219: #endif
220: }
221:
		_mm_storeu_si128((__m128i_u*)x, ss1);
223: *ps1 = x[0];
		_mm_storeu_si128((__m128i_u*)x, ss2);
225: *ps2 = x[0];
226: }
227: return i;
228: }
229:
230: /*
231: Same as SSSE3 version, but using macros defined above to emulate SSSE3 calls that are not available with SSE2.
232: For GCC-only the SSE2 and SSSE3 versions could be a single function calling other functions with the right
233: target attributes to emulate SSSE3 calls on SSE2 if needed, but clang doesn't inline those properly leading
234: to a near 50% performance drop.
235: */
236: __attribute__ ((target("sse2"))) MVSTATIC int32 get_checksum1_sse2_32(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
237: {
238: if (len > 32) {
239: int aligned = ((uintptr_t)buf & 15) == 0;
240:
241: uint32 x[4] = {0};
242: x[0] = *ps1;
243: __m128i ss1 = _mm_loadu_si128((__m128i_u*)x);
244: x[0] = *ps2;
245: __m128i ss2 = _mm_loadu_si128((__m128i_u*)x);
246:
247: const int16 mul_t1_buf[8] = {28, 24, 20, 16, 12, 8, 4, 0};
248: __m128i mul_t1 = _mm_loadu_si128((__m128i_u*)mul_t1_buf);
249:
250: for (; i < (len-32); i+=32) {
251: // Load ... 2*[int8*16]
252: __m128i in8_1, in8_2;
253: if (!aligned) {
254: in8_1 = _mm_loadu_si128((__m128i_u*)&buf[i]);
255: in8_2 = _mm_loadu_si128((__m128i_u*)&buf[i + 16]);
256: } else {
257: in8_1 = _mm_load_si128((__m128i_u*)&buf[i]);
258: in8_2 = _mm_load_si128((__m128i_u*)&buf[i + 16]);
259: }
260:
			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*8]
262: // Fastest, even though multiply by 1
263: __m128i mul_one = _mm_set1_epi8(1);
264: __m128i add16_1 = SSE2_MADDUBS_EPI16(mul_one, in8_1);
265: __m128i add16_2 = SSE2_MADDUBS_EPI16(mul_one, in8_2);
266:
			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*8]
268: __m128i mul_const = _mm_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
269: __m128i mul_add16_1 = SSE2_MADDUBS_EPI16(mul_const, in8_1);
270: __m128i mul_add16_2 = SSE2_MADDUBS_EPI16(mul_const, in8_2);
271:
272: // s2 += 32*s1
273: ss2 = _mm_add_epi32(ss2, _mm_slli_epi32(ss1, 5));
274:
275: // [sum(t1[0]..t1[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
276: // Shifting left, then shifting right again and shuffling (rather than just
277: // shifting right as with mul32 below) to cheaply end up with the correct sign
278: // extension as we go from int16 to int32.
279: __m128i sum_add32 = _mm_add_epi16(add16_1, add16_2);
280: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 2));
281: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 4));
282: sum_add32 = _mm_add_epi16(sum_add32, _mm_slli_si128(sum_add32, 8));
283: sum_add32 = _mm_srai_epi32(sum_add32, 16);
284: sum_add32 = _mm_shuffle_epi32(sum_add32, 3);
285:
286: // [sum(t2[0]..t2[7]), X, X, X] [int32*4]; faster than multiple _mm_hadds_epi16
287: __m128i sum_mul_add32 = _mm_add_epi16(mul_add16_1, mul_add16_2);
288: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 2));
289: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 4));
290: sum_mul_add32 = _mm_add_epi16(sum_mul_add32, _mm_slli_si128(sum_mul_add32, 8));
291: sum_mul_add32 = _mm_srai_epi32(sum_mul_add32, 16);
292: sum_mul_add32 = _mm_shuffle_epi32(sum_mul_add32, 3);
293:
294: // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7]
295: ss1 = _mm_add_epi32(ss1, sum_add32);
296:
297: // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7]
298: ss2 = _mm_add_epi32(ss2, sum_mul_add32);
299:
300: // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*8]
301: // We could've combined this with generating sum_add32 above and
			// saved an instruction, but benchmarking shows that to be slower
303: __m128i add16 = SSE2_HADDS_EPI16(add16_1, add16_2);
304:
305: // [t1[0], t1[1], ...] -> [t1[0]*28 + t1[1]*24, ...] [int32*4]
306: __m128i mul32 = _mm_madd_epi16(add16, mul_t1);
307:
308: // [sum(mul32), X, X, X] [int32*4]; faster than multiple _mm_hadd_epi32
309: mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 4));
310: mul32 = _mm_add_epi32(mul32, _mm_srli_si128(mul32, 8));
311:
312: // s2 += 28*t1[0] + 24*t1[1] + 20*t1[2] + 16*t1[3] + 12*t1[4] + 8*t1[5] + 4*t1[6]
313: ss2 = _mm_add_epi32(ss2, mul32);
314:
315: #if CHAR_OFFSET != 0
316: // s1 += 32*CHAR_OFFSET
317: __m128i char_offset_multiplier = _mm_set1_epi32(32 * CHAR_OFFSET);
318: ss1 = _mm_add_epi32(ss1, char_offset_multiplier);
319:
320: // s2 += 528*CHAR_OFFSET
321: char_offset_multiplier = _mm_set1_epi32(528 * CHAR_OFFSET);
322: ss2 = _mm_add_epi32(ss2, char_offset_multiplier);
323: #endif
324: }
325:
		_mm_storeu_si128((__m128i_u*)x, ss1);
327: *ps1 = x[0];
		_mm_storeu_si128((__m128i_u*)x, ss2);
329: *ps2 = x[0];
330: }
331: return i;
332: }
333:
334: /*
335: AVX2 loop per 64 bytes:
336: int16 t1[16];
337: int16 t2[16];
338: for (int j = 0; j < 16; j++) {
339: t1[j] = buf[j*4 + i] + buf[j*4 + i+1] + buf[j*4 + i+2] + buf[j*4 + i+3];
340: t2[j] = 4*buf[j*4 + i] + 3*buf[j*4 + i+1] + 2*buf[j*4 + i+2] + buf[j*4 + i+3];
341: }
342: s2 += 64*s1 + (uint32)(
343: 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14] +
344: t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
345: ) + 2080*CHAR_OFFSET;
346: s1 += (uint32)(t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]) +
347: 64*CHAR_OFFSET;
348: */
349: __attribute__ ((target("avx2"))) MVSTATIC int32 get_checksum1_avx2_64(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
350: {
351: if (len > 64) {
352: // Instructions reshuffled compared to SSE2 for slightly better performance
353: int aligned = ((uintptr_t)buf & 31) == 0;
354:
355: uint32 x[8] = {0};
356: x[0] = *ps1;
357: __m256i ss1 = _mm256_lddqu_si256((__m256i_u*)x);
358: x[0] = *ps2;
359: __m256i ss2 = _mm256_lddqu_si256((__m256i_u*)x);
360:
361: // The order gets shuffled compared to SSE2
362: const int16 mul_t1_buf[16] = {60, 56, 52, 48, 28, 24, 20, 16, 44, 40, 36, 32, 12, 8, 4, 0};
363: __m256i mul_t1 = _mm256_lddqu_si256((__m256i_u*)mul_t1_buf);
364:
365: for (; i < (len-64); i+=64) {
366: // Load ... 2*[int8*32]
367: __m256i in8_1, in8_2;
368: if (!aligned) {
369: in8_1 = _mm256_lddqu_si256((__m256i_u*)&buf[i]);
370: in8_2 = _mm256_lddqu_si256((__m256i_u*)&buf[i + 32]);
371: } else {
372: in8_1 = _mm256_load_si256((__m256i_u*)&buf[i]);
373: in8_2 = _mm256_load_si256((__m256i_u*)&buf[i + 32]);
374: }
375:
376: // Prefetch for next loops. This has no observable effect on the
377: // tested AMD but makes as much as 20% difference on the Intel.
378: // Curiously that same Intel sees no benefit from this with SSE2
379: // or SSSE3.
380: _mm_prefetch(&buf[i + 64], _MM_HINT_T0);
381: _mm_prefetch(&buf[i + 96], _MM_HINT_T0);
382: _mm_prefetch(&buf[i + 128], _MM_HINT_T0);
383: _mm_prefetch(&buf[i + 160], _MM_HINT_T0);
384:
			// (1*buf[i] + 1*buf[i+1]), (1*buf[i+2] + 1*buf[i+3]), ... 2*[int16*16]
386: // Fastest, even though multiply by 1
387: __m256i mul_one = _mm256_set1_epi8(1);
388: __m256i add16_1 = _mm256_maddubs_epi16(mul_one, in8_1);
389: __m256i add16_2 = _mm256_maddubs_epi16(mul_one, in8_2);
390:
			// (4*buf[i] + 3*buf[i+1]), (2*buf[i+2] + buf[i+3]), ... 2*[int16*16]
392: __m256i mul_const = _mm256_set1_epi32(4 + (3 << 8) + (2 << 16) + (1 << 24));
393: __m256i mul_add16_1 = _mm256_maddubs_epi16(mul_const, in8_1);
394: __m256i mul_add16_2 = _mm256_maddubs_epi16(mul_const, in8_2);
395:
396: // s2 += 64*s1
397: ss2 = _mm256_add_epi32(ss2, _mm256_slli_epi32(ss1, 6));
398:
399: // [t1[0] + t1[1], t1[2] + t1[3] ...] [int16*16]
400: __m256i add16 = _mm256_hadds_epi16(add16_1, add16_2);
401:
402: // [t1[0], t1[1], ...] -> [t1[0]*60 + t1[1]*56, ...] [int32*8]
403: __m256i mul32 = _mm256_madd_epi16(add16, mul_t1);
404:
405: // [sum(t1[0]..t1[15]), X, X, X, X, X, X, X] [int32*8]
406: __m256i sum_add32 = _mm256_add_epi16(add16_1, add16_2);
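			// Note that _mm256_slli_si256 only shifts within each 128-bit half, so
			// the permute below swaps the halves and adds them together first; the
			// in-lane prefix sums that follow then cover all 16 lanes.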
407: sum_add32 = _mm256_add_epi16(sum_add32, _mm256_permute4x64_epi64(sum_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
408: sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 2));
409: sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 4));
410: sum_add32 = _mm256_add_epi16(sum_add32, _mm256_slli_si256(sum_add32, 8));
411: sum_add32 = _mm256_srai_epi32(sum_add32, 16);
412: sum_add32 = _mm256_shuffle_epi32(sum_add32, 3);
413:
414: // s1 += t1[0] + t1[1] + t1[2] + t1[3] + t1[4] + t1[5] + t1[6] + t1[7] + t1[8] + t1[9] + t1[10] + t1[11] + t1[12] + t1[13] + t1[14] + t1[15]
415: ss1 = _mm256_add_epi32(ss1, sum_add32);
416:
417: // [sum(t2[0]..t2[15]), X, X, X, X, X, X, X] [int32*8]
418: __m256i sum_mul_add32 = _mm256_add_epi16(mul_add16_1, mul_add16_2);
419: sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_permute4x64_epi64(sum_mul_add32, 2 + (3 << 2) + (0 << 4) + (1 << 6)));
420: sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 2));
421: sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 4));
422: sum_mul_add32 = _mm256_add_epi16(sum_mul_add32, _mm256_slli_si256(sum_mul_add32, 8));
423: sum_mul_add32 = _mm256_srai_epi32(sum_mul_add32, 16);
424: sum_mul_add32 = _mm256_shuffle_epi32(sum_mul_add32, 3);
425:
426: // s2 += t2[0] + t2[1] + t2[2] + t2[3] + t2[4] + t2[5] + t2[6] + t2[7] + t2[8] + t2[9] + t2[10] + t2[11] + t2[12] + t2[13] + t2[14] + t2[15]
427: ss2 = _mm256_add_epi32(ss2, sum_mul_add32);
428:
429: // [sum(mul32), X, X, X, X, X, X, X] [int32*8]
430: mul32 = _mm256_add_epi32(mul32, _mm256_permute2x128_si256(mul32, mul32, 1));
431: mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 4));
432: mul32 = _mm256_add_epi32(mul32, _mm256_srli_si256(mul32, 8));
433:
434: // s2 += 60*t1[0] + 56*t1[1] + 52*t1[2] + 48*t1[3] + 44*t1[4] + 40*t1[5] + 36*t1[6] + 32*t1[7] + 28*t1[8] + 24*t1[9] + 20*t1[10] + 16*t1[11] + 12*t1[12] + 8*t1[13] + 4*t1[14]
435: ss2 = _mm256_add_epi32(ss2, mul32);
436:
437: #if CHAR_OFFSET != 0
438: // s1 += 64*CHAR_OFFSET
439: __m256i char_offset_multiplier = _mm256_set1_epi32(64 * CHAR_OFFSET);
440: ss1 = _mm256_add_epi32(ss1, char_offset_multiplier);
441:
442: // s2 += 2080*CHAR_OFFSET
443: char_offset_multiplier = _mm256_set1_epi32(2080 * CHAR_OFFSET);
444: ss2 = _mm256_add_epi32(ss2, char_offset_multiplier);
445: #endif
446: }
447:
		_mm256_storeu_si256((__m256i_u*)x, ss1);
449: *ps1 = x[0];
		_mm256_storeu_si256((__m256i_u*)x, ss2);
451: *ps2 = x[0];
452: }
453: return i;
454: }
455:
456: static int32 get_checksum1_default_1(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2)
457: {
458: uint32 s1 = *ps1;
459: uint32 s2 = *ps2;
460: for (; i < (len-4); i+=4) {
461: s2 += 4*(s1 + buf[i]) + 3*buf[i+1] + 2*buf[i+2] + buf[i+3] + 10*CHAR_OFFSET;
462: s1 += (buf[i+0] + buf[i+1] + buf[i+2] + buf[i+3] + 4*CHAR_OFFSET);
463: }
464: for (; i < len; i++) {
465: s1 += (buf[i]+CHAR_OFFSET); s2 += s1;
466: }
467: *ps1 = s1;
468: *ps2 = s2;
469: return i;
470: }
471:
/* With GCC 10, putting this implementation inside 'extern "C"' causes an
   assembler error. It worked fine with GCC 5-9 and clang 6-10...
474: */
475: static inline uint32 get_checksum1_cpp(char *buf1, int32 len)
476: {
477: int32 i = 0;
478: uint32 s1 = 0;
479: uint32 s2 = 0;
480:
481: // multiples of 64 bytes using AVX2 (if available)
482: i = get_checksum1_avx2_64((schar*)buf1, len, i, &s1, &s2);
483:
484: // multiples of 32 bytes using SSSE3 (if available)
485: i = get_checksum1_ssse3_32((schar*)buf1, len, i, &s1, &s2);
486:
487: // multiples of 32 bytes using SSE2 (if available)
488: i = get_checksum1_sse2_32((schar*)buf1, len, i, &s1, &s2);
489:
490: // whatever is left
491: i = get_checksum1_default_1((schar*)buf1, len, i, &s1, &s2);
492:
493: return (s1 & 0xffff) + (s2 << 16);
494: }
495:
496: extern "C" {
497:
498: uint32 get_checksum1(char *buf1, int32 len)
499: {
500: return get_checksum1_cpp(buf1, len);
501: }
502:
503: #if !defined(BENCHMARK_SIMD_CHECKSUM1)
504:
505: // see simd-md5-parallel-x86_64.cpp
506: extern int md5_parallel_slots();
507: extern int md5_parallel(int streams, char** buf, int* len, char** sum, char* pre4, char* post4);
508:
509: #endif /* !BENCHMARK_SIMD_CHECKSUM1 */
510:
511: #if !defined(BENCHMARK_SIMD_CHECKSUM1) && !defined(BENCHMARK_SIMD_CHECKSUM2)
512:
513: #define PREFETCH_ENABLE 1 // debugging
514:
515: #if 0 // debugging
516: #define PREFETCH_PRINTF(f_, ...) printf((f_), ##__VA_ARGS__)
517: #else
518: #define PREFETCH_PRINTF(f_, ...) (void)0;
519: #endif
520:
521: #define PREFETCH_MIN_LEN 1024 // the overhead is unlikely to be worth the gain for small blocks
522: #define PREFETCH_MAX_BLOCKS 8
523:
524: typedef struct {
525: int in_use;
526: OFF_T offset;
527: int32 len;
528: char sum[SUM_LENGTH];
529: } prefetch_sum_t;
530:
531: typedef struct {
532: struct map_struct *map;
533: OFF_T len;
534: OFF_T last;
535: int32 blocklen;
536: int blocks;
537: prefetch_sum_t sums[PREFETCH_MAX_BLOCKS];
538: } prefetch_t;
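
/* The sums[] entries act as a small FIFO: sums[0] is the block we expect to
   be asked for next, and on a hit the remaining entries shift down one slot
   (see get_checksum2_prefetched() below). */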
539:
540: prefetch_t *prefetch;
541:
542: extern int xfersum_type;
543: extern int checksum_seed;
544: extern int proper_seed_order;
545: extern void get_checksum2_nosimd(char *buf, int32 len, char *sum, OFF_T prefetch_offset);
546:
547: extern char *map_ptr(struct map_struct *map, OFF_T offset, int32 len);
548:
549: void checksum2_disable_prefetch()
550: {
551: if (prefetch) {
552: PREFETCH_PRINTF("checksum2_disable_prefetch\n");
553: free(prefetch);
554: prefetch = NULL;
555: }
556: }
557:
558: void checksum2_enable_prefetch(UNUSED(struct map_struct *map), UNUSED(OFF_T len), UNUSED(int32 blocklen))
559: {
560: #ifdef PREFETCH_ENABLE
561: checksum2_disable_prefetch();
562: int slots = md5_parallel_slots();
563: if ((xfersum_type == CSUM_MD5 || xfersum_type == CSUM_MD5P8) && slots > 1 && len >= blocklen * PREFETCH_MAX_BLOCKS && blocklen >= PREFETCH_MIN_LEN) {
564: prefetch = (prefetch_t*)malloc(sizeof(prefetch_t));
565: memset(prefetch, 0, sizeof(prefetch_t));
566: prefetch->map = map;
567: prefetch->len = len;
568: prefetch->last = 0;
569: prefetch->blocklen = blocklen;
570: prefetch->blocks = MIN(PREFETCH_MAX_BLOCKS, slots);
571: PREFETCH_PRINTF("checksum2_enable_prefetch len:%ld blocklen:%d blocks:%d\n", prefetch->len, prefetch->blocklen, prefetch->blocks);
572: }
573: #endif
574: }
575:
576: static inline void checksum2_reset_prefetch()
577: {
578: for (int i = 0; i < PREFETCH_MAX_BLOCKS; i++) {
579: prefetch->sums[i].in_use = 0;
580: }
581: }
582:
583: static int get_checksum2_prefetched(int32 len, char* sum, OFF_T prefetch_offset)
584: {
585: if (prefetch->sums[0].in_use) {
586: if ((prefetch->sums[0].offset == prefetch_offset) && (prefetch->sums[0].len == len)) {
587: memcpy(sum, prefetch->sums[0].sum, SUM_LENGTH);
588: for (int i = 0; i < PREFETCH_MAX_BLOCKS - 1; i++) {
589: prefetch->sums[i] = prefetch->sums[i + 1];
590: }
591: prefetch->sums[PREFETCH_MAX_BLOCKS - 1].in_use = 0;
592: PREFETCH_PRINTF("checksum2_prefetch HIT len:%d offset:%ld\n", len, prefetch_offset);
593: return 1;
594: } else {
595: // unexpected access, reset cache
596: PREFETCH_PRINTF("checksum2_prefetch MISS len:%d offset:%ld\n", len, prefetch_offset);
597: checksum2_reset_prefetch();
598: }
599: }
600: return 0;
601: }
602:
603: static int checksum2_perform_prefetch(OFF_T prefetch_offset)
604: {
	int blocks = MIN(MAX(1, (prefetch->len - prefetch_offset + prefetch->blocklen - 1) / prefetch->blocklen), prefetch->blocks);
606: if (blocks < 2) return 0; // fall through to non-simd, probably faster
607:
608: int32 total = 0;
609: int i;
610: for (i = 0; i < blocks; i++) {
611: prefetch->sums[i].offset = prefetch_offset + total;
612: prefetch->sums[i].len = MIN(prefetch->blocklen, prefetch->len - prefetch_offset - total);
613: prefetch->sums[i].in_use = 0;
614: total += prefetch->sums[i].len;
615: }
616: for (; i < PREFETCH_MAX_BLOCKS; i++) {
617: prefetch->sums[i].in_use = 0;
618: }
619:
620: uchar seedbuf[4];
621: SIVALu(seedbuf, 0, checksum_seed);
622:
623: PREFETCH_PRINTF("checksum2_perform_prefetch pos:%ld len:%d blocks:%d\n", prefetch_offset, total, blocks);
624: char* mapbuf = map_ptr(prefetch->map, prefetch_offset, total);
625: char* bufs[PREFETCH_MAX_BLOCKS] = {0};
626: int lens[PREFETCH_MAX_BLOCKS] = {0};
627: char* sums[PREFETCH_MAX_BLOCKS] = {0};
628: for (i = 0; i < blocks; i++) {
629: bufs[i] = mapbuf + prefetch->sums[i].offset - prefetch_offset;
630: lens[i] = prefetch->sums[i].len;
631: sums[i] = prefetch->sums[i].sum;
632: }
633: if (md5_parallel(blocks, bufs, lens, sums, (proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL, (!proper_seed_order && checksum_seed) ? (char*)seedbuf : NULL)) {
634: for (i = 0; i < blocks; i++) {
635: prefetch->sums[i].in_use = 1;
636: }
637: return 1;
638: } else {
		// this should never happen; abort
640: PREFETCH_PRINTF("checksum2_perform_prefetch PMD5 ABORT\n");
641: checksum2_disable_prefetch();
642: }
643: return 0;
644: }
645:
646: void get_checksum2(char *buf, int32 len, char *sum, OFF_T prefetch_offset)
647: {
648: if (prefetch) {
649: PREFETCH_PRINTF("get_checksum2 %d @ %ld\n", len, prefetch_offset);
650: OFF_T last = prefetch->last;
651: prefetch->last = prefetch_offset;
652: if ((prefetch_offset != 0) && (prefetch_offset != last + prefetch->blocklen)) {
653: // we're looking around trying to align blocks, prefetching will slow things down
654: PREFETCH_PRINTF("get_checksum2 SEEK\n");
655: checksum2_reset_prefetch();
656: } else if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
657: // hit
658: return;
659: } else if (checksum2_perform_prefetch(prefetch_offset)) {
660: if (get_checksum2_prefetched(len, sum, prefetch_offset)) {
661: // hit; should always be as we just fetched this data
662: return;
663: } else {
				// this should never happen; abort
665: PREFETCH_PRINTF("get_checksum2 MISSING DATA ABORT\n");
666: checksum2_disable_prefetch();
667: }
668: }
669: }
670: get_checksum2_nosimd(buf, len, sum, prefetch_offset);
671: }
672: #endif /* !BENCHMARK_SIMD_CHECKSUM1 && !BENCHMARK_SIMD_CHECKSUM2 */
673:
674: } // "C"
675:
676: /* Benchmark compilation
677:
The get_checksum1() benchmark runs through all available code paths in a
single execution, while the get_checksum2()/MD5 and MD5P8 benchmark needs to
be recompiled for each code path (otherwise it always uses the fastest path
available on the current CPU). Note that the SSE2/AVX2 MD5 optimizations will
be used when applicable regardless of whether rsync is built with OpenSSL.
683:
684: Something like the following should compile and run the benchmarks:
685:
686: # if gcc
687: export CC=gcc
688: export CXX=g++
689: export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti"
690:
691: # else if clang
692: export CC=clang
693: export CXX=clang++
694: export CXX_BASE="-g -O3 -fno-exceptions -fno-rtti -fno-slp-vectorize"
695:
696: # /if
697:
698: export CONF_EXTRA="--disable-md2man --disable-zstd --disable-lz4 --disable-xxhash"
699: export CXX_CSUM1="$CXX_BASE simd-checksum-x86_64.cpp"
700: export CXX_MD5P="$CXX_BASE -c -o simd-md5-parallel-x86_64.o simd-md5-parallel-x86_64.cpp"
701: export CXX_CSUM2="$CXX_BASE simd-checksum-x86_64.cpp simd-md5-parallel-x86_64.o lib/md5.o lib/md5p8.o lib/md5-asm-x86_64.o"
702:
703: rm bench_csum*
704:
705: ./configure --disable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
706:
707: $CXX -DBENCHMARK_SIMD_CHECKSUM1 $CXX_CSUM1 -o bench_csum1.all
708:
709: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
710: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.asm
711:
712: $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_SSE2 $CXX_MD5P
713: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.sse2
714:
715: $CXX -DBENCHMARK_SIMD_CHECKSUM2 -DPMD5_ALLOW_AVX2 $CXX_MD5P
716: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.avx2
717:
718: ./configure --enable-openssl --enable-simd $CONF_EXTRA && make clean && make -j4
719:
720: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_MD5P
721: $CXX -DBENCHMARK_SIMD_CHECKSUM2 $CXX_CSUM2 -o bench_csum2.openssl -lcrypto
722:
723: ./bench_csum1.all
724: ./bench_csum2.asm
725: ./bench_csum2.openssl
726: ./bench_csum2.sse2
727: ./bench_csum2.avx2
728:
729: */
730:
731: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
732: #pragma clang optimize off
733: #pragma GCC push_options
734: #pragma GCC optimize ("O0")
735:
736: #define ROUNDS 1024
737: #define BLOCK_LEN 1024*1024
738:
739: #ifndef CLOCK_MONOTONIC_RAW
740: #define CLOCK_MONOTONIC_RAW CLOCK_MONOTONIC
741: #endif
742: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
743:
744: #ifdef BENCHMARK_SIMD_CHECKSUM1
745: static void benchmark(const char* desc, int32 (*func)(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2), schar* buf, int32 len) {
746: struct timespec start, end;
747: uint64_t us;
748: uint32_t cs, s1, s2;
749: int i, next;
750:
751: clock_gettime(CLOCK_MONOTONIC_RAW, &start);
752: for (i = 0; i < ROUNDS; i++) {
753: s1 = s2 = 0;
754: next = func((schar*)buf, len, 0, &s1, &s2);
755: get_checksum1_default_1((schar*)buf, len, next, &s1, &s2);
756: }
757: clock_gettime(CLOCK_MONOTONIC_RAW, &end);
758: us = next == 0 ? 0 : (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
759: cs = next == 0 ? 0 : (s1 & 0xffff) + (s2 << 16);
760: printf("CSUM1 :: %-5s :: %5.0f MB/s :: %08x\n", desc, us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0, cs);
761: }
762:
763: static int32 get_checksum1_auto(schar* buf, int32 len, int32 i, uint32* ps1, uint32* ps2) {
764: uint32 cs = get_checksum1((char*)buf, len);
765: *ps1 = cs & 0xffff;
766: *ps2 = cs >> 16;
767: return len;
768: }
769:
770: int main() {
771: int i;
772: unsigned char* buf = (unsigned char*)malloc(BLOCK_LEN);
773: for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
774:
775: benchmark("Auto", get_checksum1_auto, (schar*)buf, BLOCK_LEN);
776: benchmark("Raw-C", get_checksum1_default_1, (schar*)buf, BLOCK_LEN);
777: benchmark("SSE2", get_checksum1_sse2_32, (schar*)buf, BLOCK_LEN);
778: benchmark("SSSE3", get_checksum1_ssse3_32, (schar*)buf, BLOCK_LEN);
779: benchmark("AVX2", get_checksum1_avx2_64, (schar*)buf, BLOCK_LEN);
780:
781: free(buf);
782: return 0;
783: }
784: #endif /* BENCHMARK_SIMD_CHECKSUM1 */
785:
786: #ifdef BENCHMARK_SIMD_CHECKSUM2
787: static void benchmark(const char* desc, void (*func)(char* buf, int32 len, char* sum_out), void (*func2)(char* buf, int32 len, char* sum_out), char* buf, int32 len, int streams) {
788: struct timespec start, end;
789: uint64_t us;
790: unsigned char cs1[16];
791: unsigned char cs2[16];
792: int i;
793:
794: clock_gettime(CLOCK_MONOTONIC_RAW, &start);
795: for (i = 0; i < ROUNDS; i++) {
796: func(buf, len, (char*)cs1);
797: }
798: clock_gettime(CLOCK_MONOTONIC_RAW, &end);
799: us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_nsec - start.tv_nsec) / 1000;
800:
801: func2(buf, len, (char*)cs2);
802:
803: float perf = us ? (float)(len / (1024 * 1024) * ROUNDS) / ((float)us / 1000000.0f) : 0;
804: printf("CSUM2 :: %-7s :: %5.0f to %5.0f MB/s :: ", desc, perf, perf * streams);
805: for (i = 0; i < 16; i++) {
806: printf("%02x", cs1[i] & 0xFF);
807: }
808: printf(" :: ");
809: for (i = 0; i < 16; i++) {
810: printf("%02x", cs2[i] & 0xFF);
811: }
812: printf("\n");
813: }
814:
815: static void benchmark_inner(char* buf, int32 len, char* sum_out) {
	// This should produce the same output across optimization levels,
	// but not the same output as sanity_check()
818:
819: char* bufs[8] = {0};
820: int lens[8] = {0};
821: char* sums[8] = {0};
822:
823: bufs[0] = buf;
824: lens[0] = len;
825: sums[0] = sum_out;
826: md5_parallel(1, bufs, lens, sums, NULL, NULL);
827: }
828:
829: extern "C" {
830: extern void MD5P8_Init_c(MD5P8_CTX *ctx);
831: extern void MD5P8_Update_c(MD5P8_CTX *ctx, const uchar *input, uint32 length);
832: extern void MD5P8_Final_c(uchar digest[MD5_DIGEST_LEN], MD5P8_CTX *ctx);
833: }
834:
835: static void sanity_check(char* buf, int32 len, char* sum_out) {
	// This should produce the same output across optimization levels,
	// but not the same output as benchmark_inner()
838: if (md5_parallel_slots() <= 1) {
839: MD5P8_CTX m5p8;
840: MD5P8_Init_c(&m5p8);
841: MD5P8_Update_c(&m5p8, (uchar *)buf, len);
842: MD5P8_Final_c((uchar *)sum_out, &m5p8);
843: } else {
844: MD5P8_CTX m5p8;
845: MD5P8_Init(&m5p8);
846: MD5P8_Update(&m5p8, (uchar *)buf, len);
847: MD5P8_Final((uchar *)sum_out, &m5p8);
848: }
849: }
850:
851: int main() {
852: // This benchmarks the parallel MD5 checksum rather than get_checksum2()
853: // as the latter would require compiling in a lot of rsync's code, but
854: // it touches all the same internals so the performance should be nearly
855: // identical.
856:
857: int i;
858: char* buf = (char*)malloc(BLOCK_LEN);
859: for (i = 0; i < BLOCK_LEN; i++) buf[i] = (i + (i % 3) + (i % 11)) % 256;
860:
861: const char* method = "?";
862: switch (md5_parallel_slots()) {
863: case 8: method = "AVX2"; break;
864: case 4: method = "SSE2"; break;
865: #ifdef USE_OPENSSL
866: case 1: method = "OpenSSL"; break;
867: #elif (CSUM_CHUNK == 64)
868: case 1: method = "ASM"; break;
869: #else
870: // this won't happen unless you modified code somewhere
871: case 1: method = "Raw-C"; break;
872: #endif
873: }
874:
875: benchmark(method, benchmark_inner, sanity_check, buf, BLOCK_LEN, md5_parallel_slots());
876:
877: free(buf);
878: return 0;
879: }
880: #endif /* BENCHMARK_SIMD_CHECKSUM2 */
881:
882: #if defined(BENCHMARK_SIMD_CHECKSUM1) || defined(BENCHMARK_SIMD_CHECKSUM2)
883: #pragma GCC pop_options
884: #pragma clang optimize on
885: #endif /* BENCHMARK_SIMD_CHECKSUM1 || BENCHMARK_SIMD_CHECKSUM2 */
886:
887: #endif /* HAVE_SIMD */
888: #endif /* __cplusplus */
889: #endif /* __x86_64__ */