Annotation of embedaddon/strongswan/src/libstrongswan/plugins/aesni/aesni_ecb.c, revision 1.1.1.1
1.1 misho 1: /*
2: * Copyright (C) 2015 Martin Willi
3: * Copyright (C) 2015 revosec AG
4: *
5: * Copyright (C) 2019 Andreas Steffen
6: * HSR Hochschule fuer Technik Rapperswil
7: *
8: * This program is free software; you can redistribute it and/or modify it
9: * under the terms of the GNU General Public License as published by the
10: * Free Software Foundation; either version 2 of the License, or (at your
11: * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
12: *
13: * This program is distributed in the hope that it will be useful, but
14: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16: * for more details.
17: */
18:
19: #include "aesni_ecb.h"
20: #include "aesni_key.h"
21:
22: /**
23: * Pipeline parallelism we use for ECB encryption/decryption
24: */
25: #define ECB_PARALLELISM 4
26:
typedef struct private_aesni_ecb_t private_aesni_ecb_t;

/**
 * ECB en/decryption method type
 *
 * @param key		expanded AES key schedule to use
 * @param blocks	number of AES_BLOCK_SIZE blocks to process
 * @param in		input buffer
 * @param out		output buffer (callers may pass out == in for in-place use)
 */
typedef void (*aesni_ecb_fn_t)(aesni_key_t*, u_int, u_char*, u_char*);

/**
 * Private data of an aesni_ecb_t object.
 */
struct private_aesni_ecb_t {

	/**
	 * Public aesni_ecb_t interface.
	 */
	aesni_ecb_t public;

	/**
	 * Key size in bytes (16, 24 or 32)
	 */
	u_int key_size;

	/**
	 * Encryption key schedule
	 */
	aesni_key_t *ekey;

	/**
	 * Decryption key schedule
	 */
	aesni_key_t *dkey;

	/**
	 * Encryption method, selected to match key_size
	 */
	aesni_ecb_fn_t encrypt;

	/**
	 * Decryption method, selected to match key_size
	 */
	aesni_ecb_fn_t decrypt;
};
69:
70: /**
71: * AES-128 ECB encryption
72: */
73: static void encrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
74: u_char *out)
75: {
76: __m128i *ks, *bi, *bo;
77: __m128i t1, t2, t3, t4;
78: u_int i, pblocks;
79:
80: ks = key->schedule;
81: bi = (__m128i*)in;
82: bo = (__m128i*)out;
83: pblocks = blocks - (blocks % ECB_PARALLELISM);
84:
85: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
86: {
87: t1 = _mm_loadu_si128(bi + i + 0);
88: t2 = _mm_loadu_si128(bi + i + 1);
89: t3 = _mm_loadu_si128(bi + i + 2);
90: t4 = _mm_loadu_si128(bi + i + 3);
91:
92: t1 = _mm_xor_si128(t1, ks[0]);
93: t2 = _mm_xor_si128(t2, ks[0]);
94: t3 = _mm_xor_si128(t3, ks[0]);
95: t4 = _mm_xor_si128(t4, ks[0]);
96:
97: t1 = _mm_aesenc_si128(t1, ks[1]);
98: t2 = _mm_aesenc_si128(t2, ks[1]);
99: t3 = _mm_aesenc_si128(t3, ks[1]);
100: t4 = _mm_aesenc_si128(t4, ks[1]);
101: t1 = _mm_aesenc_si128(t1, ks[2]);
102: t2 = _mm_aesenc_si128(t2, ks[2]);
103: t3 = _mm_aesenc_si128(t3, ks[2]);
104: t4 = _mm_aesenc_si128(t4, ks[2]);
105: t1 = _mm_aesenc_si128(t1, ks[3]);
106: t2 = _mm_aesenc_si128(t2, ks[3]);
107: t3 = _mm_aesenc_si128(t3, ks[3]);
108: t4 = _mm_aesenc_si128(t4, ks[3]);
109: t1 = _mm_aesenc_si128(t1, ks[4]);
110: t2 = _mm_aesenc_si128(t2, ks[4]);
111: t3 = _mm_aesenc_si128(t3, ks[4]);
112: t4 = _mm_aesenc_si128(t4, ks[4]);
113: t1 = _mm_aesenc_si128(t1, ks[5]);
114: t2 = _mm_aesenc_si128(t2, ks[5]);
115: t3 = _mm_aesenc_si128(t3, ks[5]);
116: t4 = _mm_aesenc_si128(t4, ks[5]);
117: t1 = _mm_aesenc_si128(t1, ks[6]);
118: t2 = _mm_aesenc_si128(t2, ks[6]);
119: t3 = _mm_aesenc_si128(t3, ks[6]);
120: t4 = _mm_aesenc_si128(t4, ks[6]);
121: t1 = _mm_aesenc_si128(t1, ks[7]);
122: t2 = _mm_aesenc_si128(t2, ks[7]);
123: t3 = _mm_aesenc_si128(t3, ks[7]);
124: t4 = _mm_aesenc_si128(t4, ks[7]);
125: t1 = _mm_aesenc_si128(t1, ks[8]);
126: t2 = _mm_aesenc_si128(t2, ks[8]);
127: t3 = _mm_aesenc_si128(t3, ks[8]);
128: t4 = _mm_aesenc_si128(t4, ks[8]);
129: t1 = _mm_aesenc_si128(t1, ks[9]);
130: t2 = _mm_aesenc_si128(t2, ks[9]);
131: t3 = _mm_aesenc_si128(t3, ks[9]);
132: t4 = _mm_aesenc_si128(t4, ks[9]);
133:
134: t1 = _mm_aesenclast_si128(t1, ks[10]);
135: t2 = _mm_aesenclast_si128(t2, ks[10]);
136: t3 = _mm_aesenclast_si128(t3, ks[10]);
137: t4 = _mm_aesenclast_si128(t4, ks[10]);
138:
139: _mm_storeu_si128(bo + i + 0, t1);
140: _mm_storeu_si128(bo + i + 1, t2);
141: _mm_storeu_si128(bo + i + 2, t3);
142: _mm_storeu_si128(bo + i + 3, t4);
143: }
144:
145: for (i = pblocks; i < blocks; i++)
146: {
147: t1 = _mm_loadu_si128(bi + i);
148: t1 = _mm_xor_si128(t1, ks[0]);
149:
150: t1 = _mm_aesenc_si128(t1, ks[1]);
151: t1 = _mm_aesenc_si128(t1, ks[2]);
152: t1 = _mm_aesenc_si128(t1, ks[3]);
153: t1 = _mm_aesenc_si128(t1, ks[4]);
154: t1 = _mm_aesenc_si128(t1, ks[5]);
155: t1 = _mm_aesenc_si128(t1, ks[6]);
156: t1 = _mm_aesenc_si128(t1, ks[7]);
157: t1 = _mm_aesenc_si128(t1, ks[8]);
158: t1 = _mm_aesenc_si128(t1, ks[9]);
159:
160: t1 = _mm_aesenclast_si128(t1, ks[10]);
161: _mm_storeu_si128(bo + i, t1);
162: }
163: }
164:
165: /**
166: * AES-128 ECB decryption
167: */
168: static void decrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
169: u_char *out)
170: {
171: __m128i *ks, *bi, *bo;
172: __m128i t1, t2, t3, t4;
173: u_int i, pblocks;
174:
175: ks = key->schedule;
176: bi = (__m128i*)in;
177: bo = (__m128i*)out;
178: pblocks = blocks - (blocks % ECB_PARALLELISM);
179:
180: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
181: {
182: t1 = _mm_loadu_si128(bi + i + 0);
183: t2 = _mm_loadu_si128(bi + i + 1);
184: t3 = _mm_loadu_si128(bi + i + 2);
185: t4 = _mm_loadu_si128(bi + i + 3);
186:
187: t1 = _mm_xor_si128(t1, ks[0]);
188: t2 = _mm_xor_si128(t2, ks[0]);
189: t3 = _mm_xor_si128(t3, ks[0]);
190: t4 = _mm_xor_si128(t4, ks[0]);
191:
192: t1 = _mm_aesdec_si128(t1, ks[1]);
193: t2 = _mm_aesdec_si128(t2, ks[1]);
194: t3 = _mm_aesdec_si128(t3, ks[1]);
195: t4 = _mm_aesdec_si128(t4, ks[1]);
196: t1 = _mm_aesdec_si128(t1, ks[2]);
197: t2 = _mm_aesdec_si128(t2, ks[2]);
198: t3 = _mm_aesdec_si128(t3, ks[2]);
199: t4 = _mm_aesdec_si128(t4, ks[2]);
200: t1 = _mm_aesdec_si128(t1, ks[3]);
201: t2 = _mm_aesdec_si128(t2, ks[3]);
202: t3 = _mm_aesdec_si128(t3, ks[3]);
203: t4 = _mm_aesdec_si128(t4, ks[3]);
204: t1 = _mm_aesdec_si128(t1, ks[4]);
205: t2 = _mm_aesdec_si128(t2, ks[4]);
206: t3 = _mm_aesdec_si128(t3, ks[4]);
207: t4 = _mm_aesdec_si128(t4, ks[4]);
208: t1 = _mm_aesdec_si128(t1, ks[5]);
209: t2 = _mm_aesdec_si128(t2, ks[5]);
210: t3 = _mm_aesdec_si128(t3, ks[5]);
211: t4 = _mm_aesdec_si128(t4, ks[5]);
212: t1 = _mm_aesdec_si128(t1, ks[6]);
213: t2 = _mm_aesdec_si128(t2, ks[6]);
214: t3 = _mm_aesdec_si128(t3, ks[6]);
215: t4 = _mm_aesdec_si128(t4, ks[6]);
216: t1 = _mm_aesdec_si128(t1, ks[7]);
217: t2 = _mm_aesdec_si128(t2, ks[7]);
218: t3 = _mm_aesdec_si128(t3, ks[7]);
219: t4 = _mm_aesdec_si128(t4, ks[7]);
220: t1 = _mm_aesdec_si128(t1, ks[8]);
221: t2 = _mm_aesdec_si128(t2, ks[8]);
222: t3 = _mm_aesdec_si128(t3, ks[8]);
223: t4 = _mm_aesdec_si128(t4, ks[8]);
224: t1 = _mm_aesdec_si128(t1, ks[9]);
225: t2 = _mm_aesdec_si128(t2, ks[9]);
226: t3 = _mm_aesdec_si128(t3, ks[9]);
227: t4 = _mm_aesdec_si128(t4, ks[9]);
228:
229: t1 = _mm_aesdeclast_si128(t1, ks[10]);
230: t2 = _mm_aesdeclast_si128(t2, ks[10]);
231: t3 = _mm_aesdeclast_si128(t3, ks[10]);
232: t4 = _mm_aesdeclast_si128(t4, ks[10]);
233:
234: _mm_storeu_si128(bo + i + 0, t1);
235: _mm_storeu_si128(bo + i + 1, t2);
236: _mm_storeu_si128(bo + i + 2, t3);
237: _mm_storeu_si128(bo + i + 3, t4);
238: }
239:
240: for (i = pblocks; i < blocks; i++)
241: {
242: t1 = _mm_loadu_si128(bi + i);
243: t1 = _mm_xor_si128(t1, ks[0]);
244:
245: t1 = _mm_aesdec_si128(t1, ks[1]);
246: t1 = _mm_aesdec_si128(t1, ks[2]);
247: t1 = _mm_aesdec_si128(t1, ks[3]);
248: t1 = _mm_aesdec_si128(t1, ks[4]);
249: t1 = _mm_aesdec_si128(t1, ks[5]);
250: t1 = _mm_aesdec_si128(t1, ks[6]);
251: t1 = _mm_aesdec_si128(t1, ks[7]);
252: t1 = _mm_aesdec_si128(t1, ks[8]);
253: t1 = _mm_aesdec_si128(t1, ks[9]);
254:
255: t1 = _mm_aesdeclast_si128(t1, ks[10]);
256: _mm_storeu_si128(bo + i, t1);
257: }
258: }
259:
260: /**
261: * AES-192 ECB encryption
262: */
263: static void encrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
264: u_char *out)
265: {
266: __m128i *ks, *bi, *bo;
267: __m128i t1, t2, t3, t4;
268: u_int i, pblocks;
269:
270: ks = key->schedule;
271: bi = (__m128i*)in;
272: bo = (__m128i*)out;
273: pblocks = blocks - (blocks % ECB_PARALLELISM);
274:
275: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
276: {
277: t1 = _mm_loadu_si128(bi + i + 0);
278: t2 = _mm_loadu_si128(bi + i + 1);
279: t3 = _mm_loadu_si128(bi + i + 2);
280: t4 = _mm_loadu_si128(bi + i + 3);
281:
282: t1 = _mm_xor_si128(t1, ks[0]);
283: t2 = _mm_xor_si128(t2, ks[0]);
284: t3 = _mm_xor_si128(t3, ks[0]);
285: t4 = _mm_xor_si128(t4, ks[0]);
286:
287: t1 = _mm_aesenc_si128(t1, ks[1]);
288: t2 = _mm_aesenc_si128(t2, ks[1]);
289: t3 = _mm_aesenc_si128(t3, ks[1]);
290: t4 = _mm_aesenc_si128(t4, ks[1]);
291: t1 = _mm_aesenc_si128(t1, ks[2]);
292: t2 = _mm_aesenc_si128(t2, ks[2]);
293: t3 = _mm_aesenc_si128(t3, ks[2]);
294: t4 = _mm_aesenc_si128(t4, ks[2]);
295: t1 = _mm_aesenc_si128(t1, ks[3]);
296: t2 = _mm_aesenc_si128(t2, ks[3]);
297: t3 = _mm_aesenc_si128(t3, ks[3]);
298: t4 = _mm_aesenc_si128(t4, ks[3]);
299: t1 = _mm_aesenc_si128(t1, ks[4]);
300: t2 = _mm_aesenc_si128(t2, ks[4]);
301: t3 = _mm_aesenc_si128(t3, ks[4]);
302: t4 = _mm_aesenc_si128(t4, ks[4]);
303: t1 = _mm_aesenc_si128(t1, ks[5]);
304: t2 = _mm_aesenc_si128(t2, ks[5]);
305: t3 = _mm_aesenc_si128(t3, ks[5]);
306: t4 = _mm_aesenc_si128(t4, ks[5]);
307: t1 = _mm_aesenc_si128(t1, ks[6]);
308: t2 = _mm_aesenc_si128(t2, ks[6]);
309: t3 = _mm_aesenc_si128(t3, ks[6]);
310: t4 = _mm_aesenc_si128(t4, ks[6]);
311: t1 = _mm_aesenc_si128(t1, ks[7]);
312: t2 = _mm_aesenc_si128(t2, ks[7]);
313: t3 = _mm_aesenc_si128(t3, ks[7]);
314: t4 = _mm_aesenc_si128(t4, ks[7]);
315: t1 = _mm_aesenc_si128(t1, ks[8]);
316: t2 = _mm_aesenc_si128(t2, ks[8]);
317: t3 = _mm_aesenc_si128(t3, ks[8]);
318: t4 = _mm_aesenc_si128(t4, ks[8]);
319: t1 = _mm_aesenc_si128(t1, ks[9]);
320: t2 = _mm_aesenc_si128(t2, ks[9]);
321: t3 = _mm_aesenc_si128(t3, ks[9]);
322: t4 = _mm_aesenc_si128(t4, ks[9]);
323: t1 = _mm_aesenc_si128(t1, ks[10]);
324: t2 = _mm_aesenc_si128(t2, ks[10]);
325: t3 = _mm_aesenc_si128(t3, ks[10]);
326: t4 = _mm_aesenc_si128(t4, ks[10]);
327: t1 = _mm_aesenc_si128(t1, ks[11]);
328: t2 = _mm_aesenc_si128(t2, ks[11]);
329: t3 = _mm_aesenc_si128(t3, ks[11]);
330: t4 = _mm_aesenc_si128(t4, ks[11]);
331:
332: t1 = _mm_aesenclast_si128(t1, ks[12]);
333: t2 = _mm_aesenclast_si128(t2, ks[12]);
334: t3 = _mm_aesenclast_si128(t3, ks[12]);
335: t4 = _mm_aesenclast_si128(t4, ks[12]);
336:
337: _mm_storeu_si128(bo + i + 0, t1);
338: _mm_storeu_si128(bo + i + 1, t2);
339: _mm_storeu_si128(bo + i + 2, t3);
340: _mm_storeu_si128(bo + i + 3, t4);
341: }
342:
343: for (i = pblocks; i < blocks; i++)
344: {
345: t1 = _mm_loadu_si128(bi + i);
346: t1 = _mm_xor_si128(t1, ks[0]);
347:
348: t1 = _mm_aesenc_si128(t1, ks[1]);
349: t1 = _mm_aesenc_si128(t1, ks[2]);
350: t1 = _mm_aesenc_si128(t1, ks[3]);
351: t1 = _mm_aesenc_si128(t1, ks[4]);
352: t1 = _mm_aesenc_si128(t1, ks[5]);
353: t1 = _mm_aesenc_si128(t1, ks[6]);
354: t1 = _mm_aesenc_si128(t1, ks[7]);
355: t1 = _mm_aesenc_si128(t1, ks[8]);
356: t1 = _mm_aesenc_si128(t1, ks[9]);
357: t1 = _mm_aesenc_si128(t1, ks[10]);
358: t1 = _mm_aesenc_si128(t1, ks[11]);
359:
360: t1 = _mm_aesenclast_si128(t1, ks[12]);
361: _mm_storeu_si128(bo + i, t1);
362: }
363: }
364:
365: /**
366: * AES-192 ECB decryption
367: */
368: static void decrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
369: u_char *out)
370: {
371: __m128i *ks, *bi, *bo;
372: __m128i t1, t2, t3, t4;
373: u_int i, pblocks;
374:
375: ks = key->schedule;
376: bi = (__m128i*)in;
377: bo = (__m128i*)out;
378: pblocks = blocks - (blocks % ECB_PARALLELISM);
379:
380: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
381: {
382: t1 = _mm_loadu_si128(bi + i + 0);
383: t2 = _mm_loadu_si128(bi + i + 1);
384: t3 = _mm_loadu_si128(bi + i + 2);
385: t4 = _mm_loadu_si128(bi + i + 3);
386:
387: t1 = _mm_xor_si128(t1, ks[0]);
388: t2 = _mm_xor_si128(t2, ks[0]);
389: t3 = _mm_xor_si128(t3, ks[0]);
390: t4 = _mm_xor_si128(t4, ks[0]);
391:
392: t1 = _mm_aesdec_si128(t1, ks[1]);
393: t2 = _mm_aesdec_si128(t2, ks[1]);
394: t3 = _mm_aesdec_si128(t3, ks[1]);
395: t4 = _mm_aesdec_si128(t4, ks[1]);
396: t1 = _mm_aesdec_si128(t1, ks[2]);
397: t2 = _mm_aesdec_si128(t2, ks[2]);
398: t3 = _mm_aesdec_si128(t3, ks[2]);
399: t4 = _mm_aesdec_si128(t4, ks[2]);
400: t1 = _mm_aesdec_si128(t1, ks[3]);
401: t2 = _mm_aesdec_si128(t2, ks[3]);
402: t3 = _mm_aesdec_si128(t3, ks[3]);
403: t4 = _mm_aesdec_si128(t4, ks[3]);
404: t1 = _mm_aesdec_si128(t1, ks[4]);
405: t2 = _mm_aesdec_si128(t2, ks[4]);
406: t3 = _mm_aesdec_si128(t3, ks[4]);
407: t4 = _mm_aesdec_si128(t4, ks[4]);
408: t1 = _mm_aesdec_si128(t1, ks[5]);
409: t2 = _mm_aesdec_si128(t2, ks[5]);
410: t3 = _mm_aesdec_si128(t3, ks[5]);
411: t4 = _mm_aesdec_si128(t4, ks[5]);
412: t1 = _mm_aesdec_si128(t1, ks[6]);
413: t2 = _mm_aesdec_si128(t2, ks[6]);
414: t3 = _mm_aesdec_si128(t3, ks[6]);
415: t4 = _mm_aesdec_si128(t4, ks[6]);
416: t1 = _mm_aesdec_si128(t1, ks[7]);
417: t2 = _mm_aesdec_si128(t2, ks[7]);
418: t3 = _mm_aesdec_si128(t3, ks[7]);
419: t4 = _mm_aesdec_si128(t4, ks[7]);
420: t1 = _mm_aesdec_si128(t1, ks[8]);
421: t2 = _mm_aesdec_si128(t2, ks[8]);
422: t3 = _mm_aesdec_si128(t3, ks[8]);
423: t4 = _mm_aesdec_si128(t4, ks[8]);
424: t1 = _mm_aesdec_si128(t1, ks[9]);
425: t2 = _mm_aesdec_si128(t2, ks[9]);
426: t3 = _mm_aesdec_si128(t3, ks[9]);
427: t4 = _mm_aesdec_si128(t4, ks[9]);
428: t1 = _mm_aesdec_si128(t1, ks[10]);
429: t2 = _mm_aesdec_si128(t2, ks[10]);
430: t3 = _mm_aesdec_si128(t3, ks[10]);
431: t4 = _mm_aesdec_si128(t4, ks[10]);
432: t1 = _mm_aesdec_si128(t1, ks[11]);
433: t2 = _mm_aesdec_si128(t2, ks[11]);
434: t3 = _mm_aesdec_si128(t3, ks[11]);
435: t4 = _mm_aesdec_si128(t4, ks[11]);
436:
437: t1 = _mm_aesdeclast_si128(t1, ks[12]);
438: t2 = _mm_aesdeclast_si128(t2, ks[12]);
439: t3 = _mm_aesdeclast_si128(t3, ks[12]);
440: t4 = _mm_aesdeclast_si128(t4, ks[12]);
441:
442: _mm_storeu_si128(bo + i + 0, t1);
443: _mm_storeu_si128(bo + i + 1, t2);
444: _mm_storeu_si128(bo + i + 2, t3);
445: _mm_storeu_si128(bo + i + 3, t4);
446: }
447:
448: for (i = pblocks; i < blocks; i++)
449: {
450: t1 = _mm_loadu_si128(bi + i);
451: t1 = _mm_xor_si128(t1, ks[0]);
452:
453: t1 = _mm_aesdec_si128(t1, ks[1]);
454: t1 = _mm_aesdec_si128(t1, ks[2]);
455: t1 = _mm_aesdec_si128(t1, ks[3]);
456: t1 = _mm_aesdec_si128(t1, ks[4]);
457: t1 = _mm_aesdec_si128(t1, ks[5]);
458: t1 = _mm_aesdec_si128(t1, ks[6]);
459: t1 = _mm_aesdec_si128(t1, ks[7]);
460: t1 = _mm_aesdec_si128(t1, ks[8]);
461: t1 = _mm_aesdec_si128(t1, ks[9]);
462: t1 = _mm_aesdec_si128(t1, ks[10]);
463: t1 = _mm_aesdec_si128(t1, ks[11]);
464:
465: t1 = _mm_aesdeclast_si128(t1, ks[12]);
466: _mm_storeu_si128(bo + i, t1);
467: }
468: }
469:
470: /**
471: * AES-256 ECB encryption
472: */
473: static void encrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
474: u_char *out)
475: {
476: __m128i *ks, *bi, *bo;
477: __m128i t1, t2, t3, t4;
478: u_int i, pblocks;
479:
480: ks = key->schedule;
481: bi = (__m128i*)in;
482: bo = (__m128i*)out;
483: pblocks = blocks - (blocks % ECB_PARALLELISM);
484:
485: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
486: {
487: t1 = _mm_loadu_si128(bi + i + 0);
488: t2 = _mm_loadu_si128(bi + i + 1);
489: t3 = _mm_loadu_si128(bi + i + 2);
490: t4 = _mm_loadu_si128(bi + i + 3);
491:
492: t1 = _mm_xor_si128(t1, ks[0]);
493: t2 = _mm_xor_si128(t2, ks[0]);
494: t3 = _mm_xor_si128(t3, ks[0]);
495: t4 = _mm_xor_si128(t4, ks[0]);
496:
497: t1 = _mm_aesenc_si128(t1, ks[1]);
498: t2 = _mm_aesenc_si128(t2, ks[1]);
499: t3 = _mm_aesenc_si128(t3, ks[1]);
500: t4 = _mm_aesenc_si128(t4, ks[1]);
501: t1 = _mm_aesenc_si128(t1, ks[2]);
502: t2 = _mm_aesenc_si128(t2, ks[2]);
503: t3 = _mm_aesenc_si128(t3, ks[2]);
504: t4 = _mm_aesenc_si128(t4, ks[2]);
505: t1 = _mm_aesenc_si128(t1, ks[3]);
506: t2 = _mm_aesenc_si128(t2, ks[3]);
507: t3 = _mm_aesenc_si128(t3, ks[3]);
508: t4 = _mm_aesenc_si128(t4, ks[3]);
509: t1 = _mm_aesenc_si128(t1, ks[4]);
510: t2 = _mm_aesenc_si128(t2, ks[4]);
511: t3 = _mm_aesenc_si128(t3, ks[4]);
512: t4 = _mm_aesenc_si128(t4, ks[4]);
513: t1 = _mm_aesenc_si128(t1, ks[5]);
514: t2 = _mm_aesenc_si128(t2, ks[5]);
515: t3 = _mm_aesenc_si128(t3, ks[5]);
516: t4 = _mm_aesenc_si128(t4, ks[5]);
517: t1 = _mm_aesenc_si128(t1, ks[6]);
518: t2 = _mm_aesenc_si128(t2, ks[6]);
519: t3 = _mm_aesenc_si128(t3, ks[6]);
520: t4 = _mm_aesenc_si128(t4, ks[6]);
521: t1 = _mm_aesenc_si128(t1, ks[7]);
522: t2 = _mm_aesenc_si128(t2, ks[7]);
523: t3 = _mm_aesenc_si128(t3, ks[7]);
524: t4 = _mm_aesenc_si128(t4, ks[7]);
525: t1 = _mm_aesenc_si128(t1, ks[8]);
526: t2 = _mm_aesenc_si128(t2, ks[8]);
527: t3 = _mm_aesenc_si128(t3, ks[8]);
528: t4 = _mm_aesenc_si128(t4, ks[8]);
529: t1 = _mm_aesenc_si128(t1, ks[9]);
530: t2 = _mm_aesenc_si128(t2, ks[9]);
531: t3 = _mm_aesenc_si128(t3, ks[9]);
532: t4 = _mm_aesenc_si128(t4, ks[9]);
533: t1 = _mm_aesenc_si128(t1, ks[10]);
534: t2 = _mm_aesenc_si128(t2, ks[10]);
535: t3 = _mm_aesenc_si128(t3, ks[10]);
536: t4 = _mm_aesenc_si128(t4, ks[10]);
537: t1 = _mm_aesenc_si128(t1, ks[11]);
538: t2 = _mm_aesenc_si128(t2, ks[11]);
539: t3 = _mm_aesenc_si128(t3, ks[11]);
540: t4 = _mm_aesenc_si128(t4, ks[11]);
541: t1 = _mm_aesenc_si128(t1, ks[12]);
542: t2 = _mm_aesenc_si128(t2, ks[12]);
543: t3 = _mm_aesenc_si128(t3, ks[12]);
544: t4 = _mm_aesenc_si128(t4, ks[12]);
545: t1 = _mm_aesenc_si128(t1, ks[13]);
546: t2 = _mm_aesenc_si128(t2, ks[13]);
547: t3 = _mm_aesenc_si128(t3, ks[13]);
548: t4 = _mm_aesenc_si128(t4, ks[13]);
549:
550: t1 = _mm_aesenclast_si128(t1, ks[14]);
551: t2 = _mm_aesenclast_si128(t2, ks[14]);
552: t3 = _mm_aesenclast_si128(t3, ks[14]);
553: t4 = _mm_aesenclast_si128(t4, ks[14]);
554:
555: _mm_storeu_si128(bo + i + 0, t1);
556: _mm_storeu_si128(bo + i + 1, t2);
557: _mm_storeu_si128(bo + i + 2, t3);
558: _mm_storeu_si128(bo + i + 3, t4);
559: }
560:
561: for (i = pblocks; i < blocks; i++)
562: {
563: t1 = _mm_loadu_si128(bi + i);
564: t1 = _mm_xor_si128(t1, ks[0]);
565:
566: t1 = _mm_aesenc_si128(t1, ks[1]);
567: t1 = _mm_aesenc_si128(t1, ks[2]);
568: t1 = _mm_aesenc_si128(t1, ks[3]);
569: t1 = _mm_aesenc_si128(t1, ks[4]);
570: t1 = _mm_aesenc_si128(t1, ks[5]);
571: t1 = _mm_aesenc_si128(t1, ks[6]);
572: t1 = _mm_aesenc_si128(t1, ks[7]);
573: t1 = _mm_aesenc_si128(t1, ks[8]);
574: t1 = _mm_aesenc_si128(t1, ks[9]);
575: t1 = _mm_aesenc_si128(t1, ks[10]);
576: t1 = _mm_aesenc_si128(t1, ks[11]);
577: t1 = _mm_aesenc_si128(t1, ks[12]);
578: t1 = _mm_aesenc_si128(t1, ks[13]);
579:
580: t1 = _mm_aesenclast_si128(t1, ks[14]);
581: _mm_storeu_si128(bo + i, t1);
582: }
583: }
584:
585: /**
586: * AES-256 ECB decryption
587: */
588: static void decrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
589: u_char *out)
590: {
591: __m128i *ks, *bi, *bo;
592: __m128i t1, t2, t3, t4;
593: u_int i, pblocks;
594:
595: ks = key->schedule;
596: bi = (__m128i*)in;
597: bo = (__m128i*)out;
598: pblocks = blocks - (blocks % ECB_PARALLELISM);
599:
600: for (i = 0; i < pblocks; i += ECB_PARALLELISM)
601: {
602: t1 = _mm_loadu_si128(bi + i + 0);
603: t2 = _mm_loadu_si128(bi + i + 1);
604: t3 = _mm_loadu_si128(bi + i + 2);
605: t4 = _mm_loadu_si128(bi + i + 3);
606:
607: t1 = _mm_xor_si128(t1, ks[0]);
608: t2 = _mm_xor_si128(t2, ks[0]);
609: t3 = _mm_xor_si128(t3, ks[0]);
610: t4 = _mm_xor_si128(t4, ks[0]);
611:
612: t1 = _mm_aesdec_si128(t1, ks[1]);
613: t2 = _mm_aesdec_si128(t2, ks[1]);
614: t3 = _mm_aesdec_si128(t3, ks[1]);
615: t4 = _mm_aesdec_si128(t4, ks[1]);
616: t1 = _mm_aesdec_si128(t1, ks[2]);
617: t2 = _mm_aesdec_si128(t2, ks[2]);
618: t3 = _mm_aesdec_si128(t3, ks[2]);
619: t4 = _mm_aesdec_si128(t4, ks[2]);
620: t1 = _mm_aesdec_si128(t1, ks[3]);
621: t2 = _mm_aesdec_si128(t2, ks[3]);
622: t3 = _mm_aesdec_si128(t3, ks[3]);
623: t4 = _mm_aesdec_si128(t4, ks[3]);
624: t1 = _mm_aesdec_si128(t1, ks[4]);
625: t2 = _mm_aesdec_si128(t2, ks[4]);
626: t3 = _mm_aesdec_si128(t3, ks[4]);
627: t4 = _mm_aesdec_si128(t4, ks[4]);
628: t1 = _mm_aesdec_si128(t1, ks[5]);
629: t2 = _mm_aesdec_si128(t2, ks[5]);
630: t3 = _mm_aesdec_si128(t3, ks[5]);
631: t4 = _mm_aesdec_si128(t4, ks[5]);
632: t1 = _mm_aesdec_si128(t1, ks[6]);
633: t2 = _mm_aesdec_si128(t2, ks[6]);
634: t3 = _mm_aesdec_si128(t3, ks[6]);
635: t4 = _mm_aesdec_si128(t4, ks[6]);
636: t1 = _mm_aesdec_si128(t1, ks[7]);
637: t2 = _mm_aesdec_si128(t2, ks[7]);
638: t3 = _mm_aesdec_si128(t3, ks[7]);
639: t4 = _mm_aesdec_si128(t4, ks[7]);
640: t1 = _mm_aesdec_si128(t1, ks[8]);
641: t2 = _mm_aesdec_si128(t2, ks[8]);
642: t3 = _mm_aesdec_si128(t3, ks[8]);
643: t4 = _mm_aesdec_si128(t4, ks[8]);
644: t1 = _mm_aesdec_si128(t1, ks[9]);
645: t2 = _mm_aesdec_si128(t2, ks[9]);
646: t3 = _mm_aesdec_si128(t3, ks[9]);
647: t4 = _mm_aesdec_si128(t4, ks[9]);
648: t1 = _mm_aesdec_si128(t1, ks[10]);
649: t2 = _mm_aesdec_si128(t2, ks[10]);
650: t3 = _mm_aesdec_si128(t3, ks[10]);
651: t4 = _mm_aesdec_si128(t4, ks[10]);
652: t1 = _mm_aesdec_si128(t1, ks[11]);
653: t2 = _mm_aesdec_si128(t2, ks[11]);
654: t3 = _mm_aesdec_si128(t3, ks[11]);
655: t4 = _mm_aesdec_si128(t4, ks[11]);
656: t1 = _mm_aesdec_si128(t1, ks[12]);
657: t2 = _mm_aesdec_si128(t2, ks[12]);
658: t3 = _mm_aesdec_si128(t3, ks[12]);
659: t4 = _mm_aesdec_si128(t4, ks[12]);
660: t1 = _mm_aesdec_si128(t1, ks[13]);
661: t2 = _mm_aesdec_si128(t2, ks[13]);
662: t3 = _mm_aesdec_si128(t3, ks[13]);
663: t4 = _mm_aesdec_si128(t4, ks[13]);
664:
665: t1 = _mm_aesdeclast_si128(t1, ks[14]);
666: t2 = _mm_aesdeclast_si128(t2, ks[14]);
667: t3 = _mm_aesdeclast_si128(t3, ks[14]);
668: t4 = _mm_aesdeclast_si128(t4, ks[14]);
669:
670: _mm_storeu_si128(bo + i + 0, t1);
671: _mm_storeu_si128(bo + i + 1, t2);
672: _mm_storeu_si128(bo + i + 2, t3);
673: _mm_storeu_si128(bo + i + 3, t4);
674: }
675:
676: for (i = pblocks; i < blocks; i++)
677: {
678: t1 = _mm_loadu_si128(bi + i);
679: t1 = _mm_xor_si128(t1, ks[0]);
680:
681: t1 = _mm_aesdec_si128(t1, ks[1]);
682: t1 = _mm_aesdec_si128(t1, ks[2]);
683: t1 = _mm_aesdec_si128(t1, ks[3]);
684: t1 = _mm_aesdec_si128(t1, ks[4]);
685: t1 = _mm_aesdec_si128(t1, ks[5]);
686: t1 = _mm_aesdec_si128(t1, ks[6]);
687: t1 = _mm_aesdec_si128(t1, ks[7]);
688: t1 = _mm_aesdec_si128(t1, ks[8]);
689: t1 = _mm_aesdec_si128(t1, ks[9]);
690: t1 = _mm_aesdec_si128(t1, ks[10]);
691: t1 = _mm_aesdec_si128(t1, ks[11]);
692: t1 = _mm_aesdec_si128(t1, ks[12]);
693: t1 = _mm_aesdec_si128(t1, ks[13]);
694:
695: t1 = _mm_aesdeclast_si128(t1, ks[14]);
696: _mm_storeu_si128(bo + i, t1);
697: }
698: }
699:
700: /**
701: * Do inline or allocated de/encryption using key schedule
702: */
703: static bool crypt(aesni_ecb_fn_t fn, aesni_key_t *key, chunk_t data,
704: chunk_t *out)
705: {
706: u_char *buf;
707:
708: if (!key || data.len % AES_BLOCK_SIZE)
709: {
710: return FALSE;
711: }
712: if (out)
713: {
714: *out = chunk_alloc(data.len);
715: buf = out->ptr;
716: }
717: else
718: {
719: buf = data.ptr;
720: }
721: fn(key, data.len / AES_BLOCK_SIZE, data.ptr, buf);
722: return TRUE;
723: }
724:
/**
 * Implements crypter_t.encrypt: the iv argument is unused, ECB has no IV.
 */
METHOD(crypter_t, encrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *encrypted)
{
	return crypt(this->encrypt, this->ekey, data, encrypted);
}
730:
/**
 * Implements crypter_t.decrypt: the iv argument is unused, ECB has no IV.
 */
METHOD(crypter_t, decrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *decrypted)
{
	return crypt(this->decrypt, this->dkey, data, decrypted);
}
736:
/**
 * Implements crypter_t.get_block_size: always the AES block size.
 */
METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ecb_t *this)
{
	return AES_BLOCK_SIZE;
}
742:
/**
 * Implements crypter_t.get_iv_size: zero, ECB mode does not use an IV.
 */
METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ecb_t *this)
{
	return 0;
}
748:
/**
 * Implements crypter_t.get_key_size: the size fixed at construction time.
 */
METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ecb_t *this)
{
	return this->key_size;
}
754:
/**
 * Implements crypter_t.set_key: expand fresh en-/decryption key schedules.
 */
METHOD(crypter_t, set_key, bool,
	private_aesni_ecb_t *this, chunk_t key)
{
	/* the accepted key length is fixed when the crypter is created */
	if (key.len != this->key_size)
	{
		return FALSE;
	}

	/* drop any previously expanded schedules before installing new ones */
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);

	/* AES-NI needs separate schedules for encryption and decryption */
	this->ekey = aesni_key_create(TRUE, key);
	this->dkey = aesni_key_create(FALSE, key);

	return this->ekey && this->dkey;
}
771:
/**
 * Implements crypter_t.destroy: release key schedules and the object itself.
 */
METHOD(crypter_t, destroy, void,
	private_aesni_ecb_t *this)
{
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);
	/* allocated with INIT_ALIGN(), so it must be released with free_align() */
	free_align(this);
}
779:
780: /**
781: * See header
782: */
783: aesni_ecb_t *aesni_ecb_create(encryption_algorithm_t algo, size_t key_size)
784: {
785: private_aesni_ecb_t *this;
786:
787: if (algo != ENCR_AES_ECB)
788: {
789: return NULL;
790: }
791: switch (key_size)
792: {
793: case 0:
794: key_size = 16;
795: break;
796: case 16:
797: case 24:
798: case 32:
799: break;
800: default:
801: return NULL;
802: }
803:
804: INIT_ALIGN(this, sizeof(__m128i),
805: .public = {
806: .crypter = {
807: .encrypt = _encrypt,
808: .decrypt = _decrypt,
809: .get_block_size = _get_block_size,
810: .get_iv_size = _get_iv_size,
811: .get_key_size = _get_key_size,
812: .set_key = _set_key,
813: .destroy = _destroy,
814: },
815: },
816: .key_size = key_size,
817: );
818:
819: switch (key_size)
820: {
821: case 16:
822: this->encrypt = encrypt_ecb128;
823: this->decrypt = decrypt_ecb128;
824: break;
825: case 24:
826: this->encrypt = encrypt_ecb192;
827: this->decrypt = decrypt_ecb192;
828: break;
829: case 32:
830: this->encrypt = encrypt_ecb256;
831: this->decrypt = decrypt_ecb256;
832: break;
833: }
834:
835: return &this->public;
836: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>