Annotation of embedaddon/strongswan/src/libstrongswan/plugins/aesni/aesni_gcm.c, revision 1.1.1.2
1.1 misho 1: /*
2: * Copyright (C) 2015 Martin Willi
3: * Copyright (C) 2015 revosec AG
4: *
5: * This program is free software; you can redistribute it and/or modify it
6: * under the terms of the GNU General Public License as published by the
7: * Free Software Foundation; either version 2 of the License, or (at your
8: * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
9: *
10: * This program is distributed in the hope that it will be useful, but
11: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13: * for more details.
14: */
15:
16: #include "aesni_gcm.h"
17: #include "aesni_key.h"
18:
19: #include <crypto/iv/iv_gen_seq.h>
20:
21: #include <tmmintrin.h>
22:
23: #define NONCE_SIZE 12
24: #define IV_SIZE 8
25: #define SALT_SIZE (NONCE_SIZE - IV_SIZE)
26:
27: /**
28: * Parallel pipelining
29: */
30: #define GCM_CRYPT_PARALLELISM 4
31:
typedef struct private_aesni_gcm_t private_aesni_gcm_t;

/**
 * GCM en/decryption method type
 *
 * Arguments (in order): private AES-NI GCM instance, length of the
 * plain-/ciphertext in bytes, input buffer, output buffer, IV (IV_SIZE
 * bytes), length of associated data in bytes, associated data, and the
 * buffer receiving the computed integrity check value.
 */
typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
							   u_char*, size_t, u_char*, u_char*);
39:
/**
 * Private data of an aesni_gcm_t object.
 */
struct private_aesni_gcm_t {

	/**
	 * Public aesni_gcm_t interface.
	 */
	aesni_gcm_t public;

	/**
	 * Encryption key schedule (expanded AES round keys)
	 */
	aesni_key_t *key;

	/**
	 * IV generator.
	 */
	iv_gen_t *iv_gen;

	/**
	 * Length of the integrity check value in bytes
	 */
	size_t icv_size;

	/**
	 * Length of the key in bytes
	 */
	size_t key_size;

	/**
	 * GCM encryption function (presumably selected by key size in the
	 * constructor, which is outside this view — confirm there)
	 */
	aesni_gcm_fn_t encrypt;

	/**
	 * GCM decryption function
	 */
	aesni_gcm_fn_t decrypt;

	/**
	 * salt to add to nonce, prepended to the per-message IV in J0
	 */
	u_char salt[SALT_SIZE];

	/**
	 * GHASH subkey H, big-endian
	 */
	__m128i h;

	/**
	 * GHASH key H^2, big-endian
	 */
	__m128i hh;

	/**
	 * GHASH key H^3, big-endian
	 */
	__m128i hhh;

	/**
	 * GHASH key H^4, big-endian
	 */
	__m128i hhhh;
};
105:
/**
 * Reverse the byte order of a 128-bit vector (endianness swap)
 */
static inline __m128i swap128(__m128i x)
{
	__m128i reverse_mask;

	/* shuffle mask mapping byte 0 <-> 15, 1 <-> 14, ... */
	reverse_mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
								8, 9, 10, 11, 12, 13, 14, 15);
	return _mm_shuffle_epi8(x, reverse_mask);
}
114:
/**
 * Multiply two blocks in GF(2^128) for GHASH.
 *
 * h is used as-is; y is byte-swapped on entry and the product is swapped
 * back before returning, so the caller works on big-endian blocks.
 */
static __m128i mult_block(__m128i h, __m128i y)
{
	__m128i t1, t2, t3, t4, t5, t6;

	y = swap128(y);

	/* 128x128 -> 256 bit carry-less multiply via four 64x64 CLMULs */
	t1 = _mm_clmulepi64_si128(h, y, 0x00);
	t2 = _mm_clmulepi64_si128(h, y, 0x01);
	t3 = _mm_clmulepi64_si128(h, y, 0x10);
	t4 = _mm_clmulepi64_si128(h, y, 0x11);

	/* fold the two middle products into low (t1) and high (t4) halves */
	t2 = _mm_xor_si128(t2, t3);
	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	t1 = _mm_xor_si128(t1, t3);
	t4 = _mm_xor_si128(t4, t2);

	/* shift the 256-bit product left by one bit; there is no 128-bit
	 * shift, so carries are moved between 32-bit lanes by hand */
	t5 = _mm_srli_epi32(t1, 31);
	t1 = _mm_slli_epi32(t1, 1);
	t6 = _mm_srli_epi32(t4, 31);
	t4 = _mm_slli_epi32(t4, 1);

	t3 = _mm_srli_si128(t5, 12);
	t6 = _mm_slli_si128(t6, 4);
	t5 = _mm_slli_si128(t5, 4);
	t1 = _mm_or_si128(t1, t5);
	t4 = _mm_or_si128(t4, t6);
	t4 = _mm_or_si128(t4, t3);

	/* reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1; the
	 * shift amounts 31/30/25 and 1/2/7 implement the polynomial taps */
	t5 = _mm_slli_epi32(t1, 31);
	t6 = _mm_slli_epi32(t1, 30);
	t3 = _mm_slli_epi32(t1, 25);

	t5 = _mm_xor_si128(t5, t6);
	t5 = _mm_xor_si128(t5, t3);
	t6 = _mm_srli_si128(t5, 4);
	t4 = _mm_xor_si128(t4, t6);
	t5 = _mm_slli_si128(t5, 12);
	t1 = _mm_xor_si128(t1, t5);
	t4 = _mm_xor_si128(t4, t1);

	t5 = _mm_srli_epi32(t1, 1);
	t2 = _mm_srli_epi32(t1, 2);
	t3 = _mm_srli_epi32(t1, 7);
	t4 = _mm_xor_si128(t4, t2);
	t4 = _mm_xor_si128(t4, t3);
	t4 = _mm_xor_si128(t4, t5);

	return swap128(t4);
}
168:
/**
 * Multiply four consecutive blocks by their respective GHASH key, XOR
 *
 * Computes d1*h1 ^ d2*h2 ^ d3*h3 ^ d4*h4 in GF(2^128), aggregating the
 * partial products before doing a single reduction. Callers pass the
 * hash subkeys in descending powers (H^4..H) so one call advances the
 * GHASH state by four data blocks.
 */
static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
							   __m128i d1, __m128i d2, __m128i d3, __m128i d4)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

	d1 = swap128(d1);
	d2 = swap128(d2);
	d3 = swap128(d3);
	d4 = swap128(d4);

	/* low 64x64 products, XORed together */
	t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
	t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
	t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
	t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
	t8 = _mm_xor_si128(t0, t1);
	t8 = _mm_xor_si128(t8, t2);
	t8 = _mm_xor_si128(t8, t3);

	/* high 64x64 products, XORed together */
	t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
	t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
	t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
	t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
	t9 = _mm_xor_si128(t4, t5);
	t9 = _mm_xor_si128(t9, t6);
	t9 = _mm_xor_si128(t9, t7);

	/* Karatsuba middle terms: shuffle 78 (0b01001110) swaps the 64-bit
	 * halves, giving (hi ^ lo) of each operand */
	t0 = _mm_shuffle_epi32(h1, 78);
	t4 = _mm_shuffle_epi32(d1, 78);
	t0 = _mm_xor_si128(t0, h1);
	t4 = _mm_xor_si128(t4, d1);
	t1 = _mm_shuffle_epi32(h2, 78);
	t5 = _mm_shuffle_epi32(d2, 78);
	t1 = _mm_xor_si128(t1, h2);
	t5 = _mm_xor_si128(t5, d2);
	t2 = _mm_shuffle_epi32(h3, 78);
	t6 = _mm_shuffle_epi32(d3, 78);
	t2 = _mm_xor_si128(t2, h3);
	t6 = _mm_xor_si128(t6, d3);
	t3 = _mm_shuffle_epi32(h4, 78);
	t7 = _mm_shuffle_epi32(d4, 78);
	t3 = _mm_xor_si128(t3, h4);
	t7 = _mm_xor_si128(t7, d4);

	t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
	t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
	t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
	t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
	t0 = _mm_xor_si128(t0, t8);
	t0 = _mm_xor_si128(t0, t9);
	t0 = _mm_xor_si128(t1, t0);
	t0 = _mm_xor_si128(t2, t0);

	/* split the aggregated middle term into low/high halves */
	t0 = _mm_xor_si128(t3, t0);
	t4 = _mm_slli_si128(t0, 8);
	t0 = _mm_srli_si128(t0, 8);
	t3 = _mm_xor_si128(t4, t8);
	t6 = _mm_xor_si128(t0, t9);
	/* shift 256-bit result left by one bit, lane carries by hand */
	t7 = _mm_srli_epi32(t3, 31);
	t8 = _mm_srli_epi32(t6, 31);
	t3 = _mm_slli_epi32(t3, 1);
	t6 = _mm_slli_epi32(t6, 1);
	t9 = _mm_srli_si128(t7, 12);
	t8 = _mm_slli_si128(t8, 4);
	t7 = _mm_slli_si128(t7, 4);
	t3 = _mm_or_si128(t3, t7);
	t6 = _mm_or_si128(t6, t8);
	t6 = _mm_or_si128(t6, t9);
	/* reduce modulo x^128 + x^7 + x^2 + x + 1 */
	t7 = _mm_slli_epi32(t3, 31);
	t8 = _mm_slli_epi32(t3, 30);
	t9 = _mm_slli_epi32(t3, 25);
	t7 = _mm_xor_si128(t7, t8);
	t7 = _mm_xor_si128(t7, t9);
	t8 = _mm_srli_si128(t7, 4);
	t7 = _mm_slli_si128(t7, 12);
	t3 = _mm_xor_si128(t3, t7);
	t2 = _mm_srli_epi32(t3, 1);
	t4 = _mm_srli_epi32(t3, 2);
	t5 = _mm_srli_epi32(t3, 7);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	t2 = _mm_xor_si128(t2, t8);
	t3 = _mm_xor_si128(t3, t2);
	t6 = _mm_xor_si128(t6, t3);

	return swap128(t6);
}
258:
259: /**
260: * GHASH on a single block
261: */
262: static __m128i ghash(__m128i h, __m128i y, __m128i x)
263: {
264: return mult_block(h, _mm_xor_si128(y, x));
265: }
266:
267: /**
268: * Start constructing the ICV for the associated data
269: */
270: static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
271: {
272: u_int blocks, pblocks, rem, i;
273: __m128i h1, h2, h3, h4, d1, d2, d3, d4;
274: __m128i y, last, *ab;
275:
276: h1 = this->hhhh;
277: h2 = this->hhh;
278: h3 = this->hh;
279: h4 = this->h;
280:
281: y = _mm_setzero_si128();
282: ab = assoc;
283: blocks = alen / AES_BLOCK_SIZE;
284: pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
285: rem = alen % AES_BLOCK_SIZE;
286: for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
287: {
288: d1 = _mm_loadu_si128(ab + i + 0);
289: d2 = _mm_loadu_si128(ab + i + 1);
290: d3 = _mm_loadu_si128(ab + i + 2);
291: d4 = _mm_loadu_si128(ab + i + 3);
292: y = _mm_xor_si128(y, d1);
293: y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
294: }
295: for (i = pblocks; i < blocks; i++)
296: {
297: y = ghash(this->h, y, _mm_loadu_si128(ab + i));
298: }
299: if (rem)
300: {
301: last = _mm_setzero_si128();
302: memcpy(&last, ab + blocks, rem);
303:
304: y = ghash(this->h, y, last);
305: }
306:
307: return y;
308: }
309:
310: /**
311: * Complete the ICV by hashing a assoc/data length block
312: */
313: static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
314: size_t alen, size_t dlen)
315: {
316: __m128i b;
317:
318: htoun64(&b, alen * 8);
319: htoun64((u_char*)&b + sizeof(uint64_t), dlen * 8);
320:
321: return ghash(this->h, y, b);
322: }
323:
324: /**
325: * En-/Decrypt the ICV, trim and store it
326: */
327: static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
328: u_char *icv)
329: {
330: __m128i *ks, t, b;
331: u_int round;
332:
333: ks = this->key->schedule;
334: t = _mm_xor_si128(j, ks[0]);
335: for (round = 1; round < this->key->rounds; round++)
336: {
337: t = _mm_aesenc_si128(t, ks[round]);
338: }
339: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
340:
341: t = _mm_xor_si128(y, t);
342:
343: _mm_storeu_si128(&b, t);
344: memcpy(icv, &b, this->icv_size);
345: }
346:
347: /**
348: * Do big-endian increment on x
349: */
350: static inline __m128i increment_be(__m128i x)
351: {
352: x = swap128(x);
353: x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
354: x = swap128(x);
355:
356: return x;
357: }
358:
359: /**
360: * Generate the block J0
361: */
362: static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
363: {
364: u_char j[AES_BLOCK_SIZE];
365:
366: memcpy(j, this->salt, SALT_SIZE);
367: memcpy(j + SALT_SIZE, iv, IV_SIZE);
368: htoun32(j + SALT_SIZE + IV_SIZE, 1);
369:
370: return _mm_loadu_si128((__m128i*)j);
371: }
372:
373: /**
374: * Encrypt a remaining incomplete block, return updated Y
375: */
376: static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
377: void *in, void *out, __m128i cb, __m128i y)
378: {
379: __m128i *ks, t, b;
380: u_int round;
381:
382: memset(&b, 0, sizeof(b));
383: memcpy(&b, in, rem);
384:
385: ks = this->key->schedule;
386: t = _mm_xor_si128(cb, ks[0]);
387: for (round = 1; round < this->key->rounds; round++)
388: {
389: t = _mm_aesenc_si128(t, ks[round]);
390: }
391: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
392: b = _mm_xor_si128(t, b);
393:
394: memcpy(out, &b, rem);
395:
396: memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
397: return ghash(this->h, y, b);
398: }
399:
400: /**
401: * Decrypt a remaining incomplete block, return updated Y
402: */
403: static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
404: void *in, void *out, __m128i cb, __m128i y)
405: {
406: __m128i *ks, t, b;
407: u_int round;
408:
409: memset(&b, 0, sizeof(b));
410: memcpy(&b, in, rem);
411:
412: y = ghash(this->h, y, b);
413:
414: ks = this->key->schedule;
415: t = _mm_xor_si128(cb, ks[0]);
416: for (round = 1; round < this->key->rounds; round++)
417: {
418: t = _mm_aesenc_si128(t, ks[round]);
419: }
420: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
421: b = _mm_xor_si128(t, b);
422:
423: memcpy(out, &b, rem);
424:
425: return y;
426: }
427:
/**
 * AES-128 GCM encryption/ICV generation
 *
 * Counter-mode encrypts len bytes from in to out, GHASHing the associated
 * data and the produced ciphertext, then writes the (truncated) ICV. The
 * AES rounds are fully unrolled and four counter blocks are pipelined per
 * iteration to keep the AES-NI units busy.
 */
static void encrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt GCM_CRYPT_PARALLELISM blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* four consecutive counter blocks, whitened with round key 0 */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 9 interleaved AES rounds over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		/* ciphertext = keystream xor plaintext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* GHASH the four ciphertext blocks with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	/* trailing partial block, length block, and final ICV */
	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
553:
/**
 * AES-128 GCM decryption/ICV generation
 *
 * Same structure as encrypt_gcm128(), but GHASH consumes the ciphertext
 * before it is decrypted. The ICV is computed; comparison against the
 * received ICV happens in the caller (outside this view).
 */
static void decrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* decrypt GCM_CRYPT_PARALLELISM blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* GHASH the ciphertext blocks before decrypting them */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 9 interleaved AES rounds over the four counter blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		/* plaintext = keystream xor ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* trailing partial block, length block, and final ICV */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
679:
/**
 * AES-192 GCM encryption/ICV generation
 *
 * Identical structure to encrypt_gcm128(), with 11 unrolled AES rounds
 * plus the final round (12 round keys after whitening).
 */
static void encrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt GCM_CRYPT_PARALLELISM blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* four consecutive counter blocks, whitened with round key 0 */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 11 interleaved AES rounds over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		/* ciphertext = keystream xor plaintext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* GHASH the four ciphertext blocks with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	/* trailing partial block, length block, and final ICV */
	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
815:
/**
 * AES-192 GCM decryption/ICV generation
 *
 * Identical structure to decrypt_gcm128(), with 11 unrolled AES rounds
 * plus the final round. GHASH consumes the ciphertext before decryption.
 */
static void decrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* decrypt GCM_CRYPT_PARALLELISM blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* GHASH the ciphertext blocks before decrypting them */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 11 interleaved AES rounds over the four counter blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		/* plaintext = keystream xor ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* trailing partial block, length block, and final ICV */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
951:
/**
 * AES-256 GCM encryption/ICV generation
 *
 * Identical structure to encrypt_gcm128(), with 13 unrolled AES rounds
 * plus the final round (14 round keys after whitening).
 */
static void encrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt GCM_CRYPT_PARALLELISM blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* four consecutive counter blocks, whitened with round key 0 */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 13 interleaved AES rounds over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		/* ciphertext = keystream xor plaintext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* GHASH the four ciphertext blocks with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	/* trailing partial block, length block, and final ICV */
	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
1097:
/**
 * AES-256 GCM decryption/ICV generation
 *
 * Decrypts len bytes of ciphertext from in to out using the 8 byte IV, and
 * computes the ICV over alen bytes of associated data plus the ciphertext.
 * AES-256 uses a 14-round key schedule, ks[0] through ks[14]. In the decrypt
 * direction the GHASH input is the ciphertext, so authentication runs over
 * the data as received, before keystream XOR.
 */
static void decrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	/* J0 counter block from salt|IV; first data counter block is J0+1 */
	j = create_j(this, iv);
	cb = increment_be(j);
	/* fold the associated data into the GHASH state y */
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	/* number of blocks handled by the 4-way parallel loop below */
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* main loop: GHASH four ciphertext blocks, then generate four
	 * counter-mode keystream blocks with the AES rounds interleaved to keep
	 * the pipelined AES units busy */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* aggregated (4-term) GHASH update over the ciphertext blocks */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		/* whitening: counter XOR round key 0, incrementing per block */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 13 middle AES rounds, unrolled and interleaved over t1..t4 */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		/* final AES round */
		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		/* plaintext = keystream XOR ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining whole blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		/* GHASH the ciphertext block before decrypting it */
		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, if any */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	/* append len(assoc) || len(data) to GHASH, then encrypt it with J0 */
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
1243:
1244: METHOD(aead_t, encrypt, bool,
1245: private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
1246: chunk_t *encr)
1247: {
1248: u_char *out;
1249:
1250: if (!this->key || iv.len != IV_SIZE)
1251: {
1252: return FALSE;
1253: }
1254: out = plain.ptr;
1255: if (encr)
1256: {
1257: *encr = chunk_alloc(plain.len + this->icv_size);
1258: out = encr->ptr;
1259: }
1260: this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
1261: assoc.len, assoc.ptr, out + plain.len);
1262: return TRUE;
1263: }
1264:
1265: METHOD(aead_t, decrypt, bool,
1266: private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
1267: chunk_t *plain)
1268: {
1269: u_char *out, icv[this->icv_size];
1270:
1271: if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
1272: {
1273: return FALSE;
1274: }
1275: encr.len -= this->icv_size;
1276: out = encr.ptr;
1277: if (plain)
1278: {
1279: *plain = chunk_alloc(encr.len);
1280: out = plain->ptr;
1281: }
1282: this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
1283: assoc.len, assoc.ptr, icv);
1284: return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
1285: }
1286:
METHOD(aead_t, get_block_size, size_t,
	private_aesni_gcm_t *this)
{
	/* GCM is a counter/stream mode, callers need no block alignment */
	return 1;
}
1292:
METHOD(aead_t, get_icv_size, size_t,
	private_aesni_gcm_t *this)
{
	/* 8, 12 or 16 bytes, selected via the algorithm at create() time */
	return this->icv_size;
}
1298:
METHOD(aead_t, get_iv_size, size_t,
	private_aesni_gcm_t *this)
{
	/* explicit per-message IV only; the salt part of the nonce is implicit */
	return IV_SIZE;
}
1304:
METHOD(aead_t, get_iv_gen, iv_gen_t*,
	private_aesni_gcm_t *this)
{
	/* sequential IV generator, owned and destroyed by this object */
	return this->iv_gen;
}
1310:
METHOD(aead_t, get_key_size, size_t,
	private_aesni_gcm_t *this)
{
	/* keymat is the AES key followed by the implicit nonce salt */
	return this->key_size + SALT_SIZE;
}
1316:
1317: METHOD(aead_t, set_key, bool,
1318: private_aesni_gcm_t *this, chunk_t key)
1319: {
1320: u_int round;
1321: __m128i *ks, h;
1322:
1323: if (key.len != this->key_size + SALT_SIZE)
1324: {
1325: return FALSE;
1326: }
1327:
1328: memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
1329: key.len -= SALT_SIZE;
1330:
1331: DESTROY_IF(this->key);
1332: this->key = aesni_key_create(TRUE, key);
1333:
1334: ks = this->key->schedule;
1335: h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
1336: for (round = 1; round < this->key->rounds; round++)
1337: {
1338: h = _mm_aesenc_si128(h, ks[round]);
1339: }
1340: h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
1341:
1342: this->h = h;
1343: h = swap128(h);
1344: this->hh = mult_block(h, this->h);
1345: this->hhh = mult_block(h, this->hh);
1346: this->hhhh = mult_block(h, this->hhh);
1347: this->h = swap128(this->h);
1348: this->hh = swap128(this->hh);
1349: this->hhh = swap128(this->hhh);
1350: this->hhhh = swap128(this->hhhh);
1351:
1352: return TRUE;
1353: }
1354:
1355: METHOD(aead_t, destroy, void,
1356: private_aesni_gcm_t *this)
1357: {
1358: DESTROY_IF(this->key);
1359: memwipe(&this->h, sizeof(this->h));
1360: memwipe(&this->hh, sizeof(this->hh));
1361: memwipe(&this->hhh, sizeof(this->hhh));
1362: memwipe(&this->hhhh, sizeof(this->hhhh));
1363: this->iv_gen->destroy(this->iv_gen);
1364: free_align(this);
1365: }
1366:
1367: /**
1368: * See header
1369: */
1370: aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
1371: size_t key_size, size_t salt_size)
1372: {
1373: private_aesni_gcm_t *this;
1374: size_t icv_size;
1375:
1376: switch (key_size)
1377: {
1378: case 0:
1379: key_size = 16;
1380: break;
1381: case 16:
1382: case 24:
1383: case 32:
1384: break;
1385: default:
1386: return NULL;
1387: }
1388: if (salt_size && salt_size != SALT_SIZE)
1389: {
1390: /* currently not supported */
1391: return NULL;
1392: }
1393: switch (algo)
1394: {
1395: case ENCR_AES_GCM_ICV8:
1396: icv_size = 8;
1397: break;
1398: case ENCR_AES_GCM_ICV12:
1399: icv_size = 12;
1400: break;
1401: case ENCR_AES_GCM_ICV16:
1402: icv_size = 16;
1403: break;
1404: default:
1405: return NULL;
1406: }
1407:
1408: INIT_ALIGN(this, sizeof(__m128i),
1409: .public = {
1410: .aead = {
1411: .encrypt = _encrypt,
1412: .decrypt = _decrypt,
1413: .get_block_size = _get_block_size,
1414: .get_icv_size = _get_icv_size,
1415: .get_iv_size = _get_iv_size,
1416: .get_iv_gen = _get_iv_gen,
1417: .get_key_size = _get_key_size,
1418: .set_key = _set_key,
1419: .destroy = _destroy,
1420: },
1421: },
1422: .key_size = key_size,
1423: .iv_gen = iv_gen_seq_create(),
1424: .icv_size = icv_size,
1425: );
1426:
1427: switch (key_size)
1428: {
1429: case 16:
1430: this->encrypt = encrypt_gcm128;
1431: this->decrypt = decrypt_gcm128;
1432: break;
1433: case 24:
1434: this->encrypt = encrypt_gcm192;
1435: this->decrypt = decrypt_gcm192;
1436: break;
1437: case 32:
1438: this->encrypt = encrypt_gcm256;
1439: this->decrypt = decrypt_gcm256;
1440: break;
1441: }
1442:
1443: return &this->public;
1444: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>