1: /*
2: * Copyright (C) 2015 Martin Willi
3: * Copyright (C) 2015 revosec AG
4: *
5: * This program is free software; you can redistribute it and/or modify it
6: * under the terms of the GNU General Public License as published by the
7: * Free Software Foundation; either version 2 of the License, or (at your
8: * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
9: *
10: * This program is distributed in the hope that it will be useful, but
11: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
12: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13: * for more details.
14: */
15:
16: #include "aesni_ctr.h"
17: #include "aesni_key.h"
18:
19: #include <tmmintrin.h>
20:
21: /**
22: * Pipeline parallelism we use for CTR en/decryption
23: */
24: #define CTR_CRYPT_PARALLELISM 4
25:
typedef struct private_aesni_ctr_t private_aesni_ctr_t;

/**
 * CTR en/decryption method type, en/decrypting len bytes from in to out
 */
typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_ctr_t object.
 */
struct private_aesni_ctr_t {

	/**
	 * Public aesni_ctr_t interface.
	 */
	aesni_ctr_t public;

	/**
	 * Key size, in bytes (16, 24 or 32)
	 */
	u_int key_size;

	/**
	 * Expanded key schedule, or NULL until set_key() was called
	 */
	aesni_key_t *key;

	/**
	 * Encryption method, selected by key size during construction
	 */
	aesni_ctr_fn_t crypt;

	/**
	 * Counter state: nonce | IV | counter is exactly 16 bytes, packed and
	 * aligned so it can be loaded as a whole with _mm_load_si128()
	 */
	struct {
		char nonce[4];
		char iv[8];
		uint32_t counter;
	} __attribute__((packed, aligned(sizeof(__m128i)))) state;
};
67:
/**
 * Increment the 128-bit counter block x, interpreted in big-endian order.
 *
 * The block is byte-swapped to little-endian, incremented with a 64-bit
 * lane add, then swapped back.
 */
static inline __m128i increment_be(__m128i x)
{
	/* byte-reversal shuffle mask (maps byte 0 <-> byte 15, etc.) */
	const __m128i bswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
									   8, 9, 10, 11, 12, 13, 14, 15);

	return _mm_shuffle_epi8(
				_mm_add_epi64(_mm_shuffle_epi8(x, bswap),
							  _mm_set_epi32(0, 0, 0, 1)),
				bswap);
}
83:
/**
 * AES-128 CTR en/decryption: XOR the input with the AES-128 keystream.
 *
 * The counter state is loaded once, incremented per block, and the AES
 * rounds for four blocks are interleaved by hand to keep the AES units
 * of the CPU pipeline busy (see CTR_CRYPT_PARALLELISM).
 */
static void encrypt_ctr128(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* state is 16-byte aligned (see struct declaration), load directly */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* number of blocks we can process four at a time */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	/* trailing bytes of a final partial block, if any */
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* four blocks in parallel: whitening, 9 rounds, final round */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);
		/* XOR keystream into data and store (unaligned buffers allowed) */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* final partial block: pad into a local block, encrypt the counter,
	 * then copy back only the rem bytes actually requested */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
218:
/**
 * AES-192 CTR en/decryption: XOR the input with the AES-192 keystream.
 *
 * Same structure as encrypt_ctr128(), but with 11 full rounds and the
 * final round using ks[12].
 */
static void encrypt_ctr192(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* state is 16-byte aligned (see struct declaration), load directly */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* number of blocks we can process four at a time */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	/* trailing bytes of a final partial block, if any */
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* four blocks in parallel: whitening, 11 rounds, final round */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);
		/* XOR keystream into data and store (unaligned buffers allowed) */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* final partial block: pad into a local block, encrypt the counter,
	 * then copy back only the rem bytes actually requested */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
365:
/**
 * AES-256 CTR en/decryption: XOR the input with the AES-256 keystream.
 *
 * Same structure as encrypt_ctr128(), but with 13 full rounds and the
 * final round using ks[14].
 */
static void encrypt_ctr256(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	/* state is 16-byte aligned (see struct declaration), load directly */
	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	/* number of blocks we can process four at a time */
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	/* trailing bytes of a final partial block, if any */
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* four blocks in parallel: whitening, 13 rounds, final round */
	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);
		/* XOR keystream into data and store (unaligned buffers allowed) */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	/* final partial block: pad into a local block, encrypt the counter,
	 * then copy back only the rem bytes actually requested */
	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
524:
525: METHOD(crypter_t, crypt, bool,
526: private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out)
527: {
528: u_char *buf;
529:
530: if (!this->key || iv.len != sizeof(this->state.iv))
531: {
532: return FALSE;
533: }
534: memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv));
535: this->state.counter = htonl(1);
536:
537: buf = in.ptr;
538: if (out)
539: {
540: *out = chunk_alloc(in.len);
541: buf = out->ptr;
542: }
543: this->crypt(this, in.len, in.ptr, buf);
544: return TRUE;
545: }
546:
547: METHOD(crypter_t, get_block_size, size_t,
548: private_aesni_ctr_t *this)
549: {
550: return 1;
551: }
552:
553: METHOD(crypter_t, get_iv_size, size_t,
554: private_aesni_ctr_t *this)
555: {
556: return sizeof(this->state.iv);
557: }
558:
559: METHOD(crypter_t, get_key_size, size_t,
560: private_aesni_ctr_t *this)
561: {
562: return this->key_size + sizeof(this->state.nonce);
563: }
564:
565: METHOD(crypter_t, set_key, bool,
566: private_aesni_ctr_t *this, chunk_t key)
567: {
568: if (key.len != get_key_size(this))
569: {
570: return FALSE;
571: }
572:
573: memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce),
574: sizeof(this->state.nonce));
575: key.len -= sizeof(this->state.nonce);
576:
577: DESTROY_IF(this->key);
578: this->key = aesni_key_create(TRUE, key);
579:
580: return this->key;
581: }
582:
583: METHOD(crypter_t, destroy, void,
584: private_aesni_ctr_t *this)
585: {
586: DESTROY_IF(this->key);
587: free_align(this);
588: }
589:
590: /**
591: * See header
592: */
593: aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size)
594: {
595: private_aesni_ctr_t *this;
596:
597: if (algo != ENCR_AES_CTR)
598: {
599: return NULL;
600: }
601: switch (key_size)
602: {
603: case 0:
604: key_size = 16;
605: break;
606: case 16:
607: case 24:
608: case 32:
609: break;
610: default:
611: return NULL;
612: }
613:
614: INIT_ALIGN(this, sizeof(__m128i),
615: .public = {
616: .crypter = {
617: .encrypt = _crypt,
618: .decrypt = _crypt,
619: .get_block_size = _get_block_size,
620: .get_iv_size = _get_iv_size,
621: .get_key_size = _get_key_size,
622: .set_key = _set_key,
623: .destroy = _destroy,
624: },
625: },
626: .key_size = key_size,
627: );
628:
629: switch (key_size)
630: {
631: case 16:
632: this->crypt = encrypt_ctr128;
633: break;
634: case 24:
635: this->crypt = encrypt_ctr192;
636: break;
637: case 32:
638: this->crypt = encrypt_ctr256;
639: break;
640: }
641:
642: return &this->public;
643: }