/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * Copyright (C) 2019 Andreas Steffen
 * HSR Hochschule fuer Technik Rapperswil
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 */

#include "aesni_ecb.h"
#include "aesni_key.h"

/**
 * Pipeline parallelism we use for ECB encryption/decryption
 */
#define ECB_PARALLELISM 4

typedef struct private_aesni_ecb_t private_aesni_ecb_t;

/**
 * ECB en/decryption method type
 */
typedef void (*aesni_ecb_fn_t)(aesni_key_t*, u_int, u_char*, u_char*);

/**
 * Private data of an aesni_ecb_t object.
 */
struct private_aesni_ecb_t {

	/**
	 * Public aesni_ecb_t interface.
	 */
	aesni_ecb_t public;

	/**
	 * Key size
	 */
	u_int key_size;

	/**
	 * Encryption key schedule
	 */
	aesni_key_t *ekey;

	/**
	 * Decryption key schedule
	 */
	aesni_key_t *dkey;

	/**
	 * Encryption method
	 */
	aesni_ecb_fn_t encrypt;

	/**
	 * Decryption method
	 */
	aesni_ecb_fn_t decrypt;
};

/**
 * AES-128 ECB encryption
 */
static void encrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		_mm_storeu_si128(bo + i, t1);
	}
}
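
/*
 * Note on the structure shared by the six ECB routines: each one loads
 * ECB_PARALLELISM (four) independent blocks, whitens them with ks[0] and
 * pushes them through the rounds in lockstep, so several AESENC/AESDEC
 * instructions are in flight at once and their latency is largely hidden.
 * The variants below differ only in the round count (10/12/14 rounds for
 * AES-128/192/256) and in using the encrypt or decrypt intrinsics; blocks
 * that do not fill a complete group of four are handled one at a time in
 * the trailing loop.
 */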

/**
 * AES-128 ECB decryption
 */
static void decrypt_ecb128(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t2 = _mm_aesdec_si128(t2, ks[1]);
		t3 = _mm_aesdec_si128(t3, ks[1]);
		t4 = _mm_aesdec_si128(t4, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t2 = _mm_aesdec_si128(t2, ks[2]);
		t3 = _mm_aesdec_si128(t3, ks[2]);
		t4 = _mm_aesdec_si128(t4, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t2 = _mm_aesdec_si128(t2, ks[3]);
		t3 = _mm_aesdec_si128(t3, ks[3]);
		t4 = _mm_aesdec_si128(t4, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t2 = _mm_aesdec_si128(t2, ks[4]);
		t3 = _mm_aesdec_si128(t3, ks[4]);
		t4 = _mm_aesdec_si128(t4, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t2 = _mm_aesdec_si128(t2, ks[5]);
		t3 = _mm_aesdec_si128(t3, ks[5]);
		t4 = _mm_aesdec_si128(t4, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t2 = _mm_aesdec_si128(t2, ks[6]);
		t3 = _mm_aesdec_si128(t3, ks[6]);
		t4 = _mm_aesdec_si128(t4, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t2 = _mm_aesdec_si128(t2, ks[7]);
		t3 = _mm_aesdec_si128(t3, ks[7]);
		t4 = _mm_aesdec_si128(t4, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t2 = _mm_aesdec_si128(t2, ks[8]);
		t3 = _mm_aesdec_si128(t3, ks[8]);
		t4 = _mm_aesdec_si128(t4, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);
		t2 = _mm_aesdec_si128(t2, ks[9]);
		t3 = _mm_aesdec_si128(t3, ks[9]);
		t4 = _mm_aesdec_si128(t4, ks[9]);

		t1 = _mm_aesdeclast_si128(t1, ks[10]);
		t2 = _mm_aesdeclast_si128(t2, ks[10]);
		t3 = _mm_aesdeclast_si128(t3, ks[10]);
		t4 = _mm_aesdeclast_si128(t4, ks[10]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);

		t1 = _mm_aesdeclast_si128(t1, ks[10]);
		_mm_storeu_si128(bo + i, t1);
	}
}

/**
 * AES-192 ECB encryption
 */
static void encrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		_mm_storeu_si128(bo + i, t1);
	}
}

/**
 * AES-192 ECB decryption
 */
static void decrypt_ecb192(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t2 = _mm_aesdec_si128(t2, ks[1]);
		t3 = _mm_aesdec_si128(t3, ks[1]);
		t4 = _mm_aesdec_si128(t4, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t2 = _mm_aesdec_si128(t2, ks[2]);
		t3 = _mm_aesdec_si128(t3, ks[2]);
		t4 = _mm_aesdec_si128(t4, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t2 = _mm_aesdec_si128(t2, ks[3]);
		t3 = _mm_aesdec_si128(t3, ks[3]);
		t4 = _mm_aesdec_si128(t4, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t2 = _mm_aesdec_si128(t2, ks[4]);
		t3 = _mm_aesdec_si128(t3, ks[4]);
		t4 = _mm_aesdec_si128(t4, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t2 = _mm_aesdec_si128(t2, ks[5]);
		t3 = _mm_aesdec_si128(t3, ks[5]);
		t4 = _mm_aesdec_si128(t4, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t2 = _mm_aesdec_si128(t2, ks[6]);
		t3 = _mm_aesdec_si128(t3, ks[6]);
		t4 = _mm_aesdec_si128(t4, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t2 = _mm_aesdec_si128(t2, ks[7]);
		t3 = _mm_aesdec_si128(t3, ks[7]);
		t4 = _mm_aesdec_si128(t4, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t2 = _mm_aesdec_si128(t2, ks[8]);
		t3 = _mm_aesdec_si128(t3, ks[8]);
		t4 = _mm_aesdec_si128(t4, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);
		t2 = _mm_aesdec_si128(t2, ks[9]);
		t3 = _mm_aesdec_si128(t3, ks[9]);
		t4 = _mm_aesdec_si128(t4, ks[9]);
		t1 = _mm_aesdec_si128(t1, ks[10]);
		t2 = _mm_aesdec_si128(t2, ks[10]);
		t3 = _mm_aesdec_si128(t3, ks[10]);
		t4 = _mm_aesdec_si128(t4, ks[10]);
		t1 = _mm_aesdec_si128(t1, ks[11]);
		t2 = _mm_aesdec_si128(t2, ks[11]);
		t3 = _mm_aesdec_si128(t3, ks[11]);
		t4 = _mm_aesdec_si128(t4, ks[11]);

		t1 = _mm_aesdeclast_si128(t1, ks[12]);
		t2 = _mm_aesdeclast_si128(t2, ks[12]);
		t3 = _mm_aesdeclast_si128(t3, ks[12]);
		t4 = _mm_aesdeclast_si128(t4, ks[12]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);
		t1 = _mm_aesdec_si128(t1, ks[10]);
		t1 = _mm_aesdec_si128(t1, ks[11]);

		t1 = _mm_aesdeclast_si128(t1, ks[12]);
		_mm_storeu_si128(bo + i, t1);
	}
}

/**
 * AES-256 ECB encryption
 */
static void encrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		_mm_storeu_si128(bo + i, t1);
	}
}

/**
 * AES-256 ECB decryption
 */
static void decrypt_ecb256(aesni_key_t *key, u_int blocks, u_char *in,
						   u_char *out)
{
	__m128i *ks, *bi, *bo;
	__m128i t1, t2, t3, t4;
	u_int i, pblocks;

	ks = key->schedule;
	bi = (__m128i*)in;
	bo = (__m128i*)out;
	pblocks = blocks - (blocks % ECB_PARALLELISM);

	for (i = 0; i < pblocks; i += ECB_PARALLELISM)
	{
		t1 = _mm_loadu_si128(bi + i + 0);
		t2 = _mm_loadu_si128(bi + i + 1);
		t3 = _mm_loadu_si128(bi + i + 2);
		t4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(t1, ks[0]);
		t2 = _mm_xor_si128(t2, ks[0]);
		t3 = _mm_xor_si128(t3, ks[0]);
		t4 = _mm_xor_si128(t4, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t2 = _mm_aesdec_si128(t2, ks[1]);
		t3 = _mm_aesdec_si128(t3, ks[1]);
		t4 = _mm_aesdec_si128(t4, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t2 = _mm_aesdec_si128(t2, ks[2]);
		t3 = _mm_aesdec_si128(t3, ks[2]);
		t4 = _mm_aesdec_si128(t4, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t2 = _mm_aesdec_si128(t2, ks[3]);
		t3 = _mm_aesdec_si128(t3, ks[3]);
		t4 = _mm_aesdec_si128(t4, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t2 = _mm_aesdec_si128(t2, ks[4]);
		t3 = _mm_aesdec_si128(t3, ks[4]);
		t4 = _mm_aesdec_si128(t4, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t2 = _mm_aesdec_si128(t2, ks[5]);
		t3 = _mm_aesdec_si128(t3, ks[5]);
		t4 = _mm_aesdec_si128(t4, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t2 = _mm_aesdec_si128(t2, ks[6]);
		t3 = _mm_aesdec_si128(t3, ks[6]);
		t4 = _mm_aesdec_si128(t4, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t2 = _mm_aesdec_si128(t2, ks[7]);
		t3 = _mm_aesdec_si128(t3, ks[7]);
		t4 = _mm_aesdec_si128(t4, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t2 = _mm_aesdec_si128(t2, ks[8]);
		t3 = _mm_aesdec_si128(t3, ks[8]);
		t4 = _mm_aesdec_si128(t4, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);
		t2 = _mm_aesdec_si128(t2, ks[9]);
		t3 = _mm_aesdec_si128(t3, ks[9]);
		t4 = _mm_aesdec_si128(t4, ks[9]);
		t1 = _mm_aesdec_si128(t1, ks[10]);
		t2 = _mm_aesdec_si128(t2, ks[10]);
		t3 = _mm_aesdec_si128(t3, ks[10]);
		t4 = _mm_aesdec_si128(t4, ks[10]);
		t1 = _mm_aesdec_si128(t1, ks[11]);
		t2 = _mm_aesdec_si128(t2, ks[11]);
		t3 = _mm_aesdec_si128(t3, ks[11]);
		t4 = _mm_aesdec_si128(t4, ks[11]);
		t1 = _mm_aesdec_si128(t1, ks[12]);
		t2 = _mm_aesdec_si128(t2, ks[12]);
		t3 = _mm_aesdec_si128(t3, ks[12]);
		t4 = _mm_aesdec_si128(t4, ks[12]);
		t1 = _mm_aesdec_si128(t1, ks[13]);
		t2 = _mm_aesdec_si128(t2, ks[13]);
		t3 = _mm_aesdec_si128(t3, ks[13]);
		t4 = _mm_aesdec_si128(t4, ks[13]);

		t1 = _mm_aesdeclast_si128(t1, ks[14]);
		t2 = _mm_aesdeclast_si128(t2, ks[14]);
		t3 = _mm_aesdeclast_si128(t3, ks[14]);
		t4 = _mm_aesdeclast_si128(t4, ks[14]);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		t1 = _mm_loadu_si128(bi + i);
		t1 = _mm_xor_si128(t1, ks[0]);

		t1 = _mm_aesdec_si128(t1, ks[1]);
		t1 = _mm_aesdec_si128(t1, ks[2]);
		t1 = _mm_aesdec_si128(t1, ks[3]);
		t1 = _mm_aesdec_si128(t1, ks[4]);
		t1 = _mm_aesdec_si128(t1, ks[5]);
		t1 = _mm_aesdec_si128(t1, ks[6]);
		t1 = _mm_aesdec_si128(t1, ks[7]);
		t1 = _mm_aesdec_si128(t1, ks[8]);
		t1 = _mm_aesdec_si128(t1, ks[9]);
		t1 = _mm_aesdec_si128(t1, ks[10]);
		t1 = _mm_aesdec_si128(t1, ks[11]);
		t1 = _mm_aesdec_si128(t1, ks[12]);
		t1 = _mm_aesdec_si128(t1, ks[13]);

		t1 = _mm_aesdeclast_si128(t1, ks[14]);
		_mm_storeu_si128(bo + i, t1);
	}
}

/**
 * Do inline or allocated de/encryption using key schedule
 */
static bool crypt(aesni_ecb_fn_t fn, aesni_key_t *key, chunk_t data,
				  chunk_t *out)
{
	u_char *buf;

	if (!key || data.len % AES_BLOCK_SIZE)
	{
		return FALSE;
	}
	if (out)
	{
		*out = chunk_alloc(data.len);
		buf = out->ptr;
	}
	else
	{
		buf = data.ptr;
	}
	fn(key, data.len / AES_BLOCK_SIZE, data.ptr, buf);
	return TRUE;
}

METHOD(crypter_t, encrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *encrypted)
{
	return crypt(this->encrypt, this->ekey, data, encrypted);
}

METHOD(crypter_t, decrypt, bool,
	private_aesni_ecb_t *this, chunk_t data, chunk_t iv, chunk_t *decrypted)
{
	return crypt(this->decrypt, this->dkey, data, decrypted);
}

METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ecb_t *this)
{
	return AES_BLOCK_SIZE;
}

METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ecb_t *this)
{
	return 0;
}

METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ecb_t *this)
{
	return this->key_size;
}

METHOD(crypter_t, set_key, bool,
	private_aesni_ecb_t *this, chunk_t key)
{
	if (key.len != this->key_size)
	{
		return FALSE;
	}

	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);

	this->ekey = aesni_key_create(TRUE, key);
	this->dkey = aesni_key_create(FALSE, key);

	return this->ekey && this->dkey;
}

METHOD(crypter_t, destroy, void,
	private_aesni_ecb_t *this)
{
	DESTROY_IF(this->ekey);
	DESTROY_IF(this->dkey);
	free_align(this);
}

/**
 * See header
 */
aesni_ecb_t *aesni_ecb_create(encryption_algorithm_t algo, size_t key_size)
{
	private_aesni_ecb_t *this;

	if (algo != ENCR_AES_ECB)
	{
		return NULL;
	}
	switch (key_size)
	{
		case 0:
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}

	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.crypter = {
				.encrypt = _encrypt,
				.decrypt = _decrypt,
				.get_block_size = _get_block_size,
				.get_iv_size = _get_iv_size,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
	);

	switch (key_size)
	{
		case 16:
			this->encrypt = encrypt_ecb128;
			this->decrypt = decrypt_ecb128;
			break;
		case 24:
			this->encrypt = encrypt_ecb192;
			this->decrypt = decrypt_ecb192;
			break;
		case 32:
			this->encrypt = encrypt_ecb256;
			this->decrypt = decrypt_ecb256;
			break;
	}

	return &this->public;
}
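
/*
 * A minimal usage sketch of this crypter through the generic crypter_t
 * interface, assuming it is driven directly rather than via the plugin
 * framework.  The key and plaintext buffers are placeholders, and
 * chunk_create()/chunk_empty are assumed to be in scope from libstrongswan's
 * chunk utilities.  ECB takes no IV, so chunk_empty is passed, and a NULL
 * output chunk requests in-place operation (see crypt() above).
 */
static bool example_ecb_roundtrip(void)
{
	aesni_ecb_t *ecb;
	crypter_t *crypter;
	u_char key[16] = { 0 };					/* placeholder 128-bit key */
	u_char buf[AES_BLOCK_SIZE * 8] = { 0 };	/* length must be a block multiple */
	chunk_t data = chunk_create(buf, sizeof(buf));
	bool ok;

	ecb = aesni_ecb_create(ENCR_AES_ECB, sizeof(key));
	if (!ecb)
	{
		return FALSE;
	}
	crypter = &ecb->crypter;
	ok = crypter->set_key(crypter, chunk_create(key, sizeof(key))) &&
		 crypter->encrypt(crypter, data, chunk_empty, NULL) &&
		 crypter->decrypt(crypter, data, chunk_empty, NULL);
	crypter->destroy(crypter);
	return ok;
}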