/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * for more details.
 */

#include "aesni_gcm.h"
#include "aesni_key.h"

#include <crypto/iv/iv_gen_seq.h>

#include <tmmintrin.h>

#define NONCE_SIZE 12
#define IV_SIZE 8
#define SALT_SIZE (NONCE_SIZE - IV_SIZE)

/**
 * Parallel pipelining: number of blocks processed at once, to keep the
 * pipelined AES-NI units busy
 */
#define GCM_CRYPT_PARALLELISM 4

typedef struct private_aesni_gcm_t private_aesni_gcm_t;

/**
 * GCM en/decryption method type
 */
typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
							   u_char*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_gcm_t object.
 */
struct private_aesni_gcm_t {

	/**
	 * Public aesni_gcm_t interface.
	 */
	aesni_gcm_t public;

	/**
	 * Encryption key schedule
	 */
	aesni_key_t *key;

	/**
	 * IV generator.
	 */
	iv_gen_t *iv_gen;

	/**
	 * Length of the integrity check value
	 */
	size_t icv_size;

	/**
	 * Length of the key in bytes
	 */
	size_t key_size;

	/**
	 * GCM encryption function
	 */
	aesni_gcm_fn_t encrypt;

	/**
	 * GCM decryption function
	 */
	aesni_gcm_fn_t decrypt;

	/**
	 * salt to add to nonce
	 */
	u_char salt[SALT_SIZE];

	/**
	 * GHASH subkey H, big-endian
	 */
	__m128i h;

	/**
	 * GHASH key H^2, big-endian
	 */
	__m128i hh;

	/**
	 * GHASH key H^3, big-endian
	 */
	__m128i hhh;

	/**
	 * GHASH key H^4, big-endian
	 */
	__m128i hhhh;
};

/**
 * Byte-swap a 128-bit integer
 */
static inline __m128i swap128(__m128i x)
{
	return _mm_shuffle_epi8(x,
			_mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15));
}

/**
 * Multiply two blocks in GF128
 */
static __m128i mult_block(__m128i h, __m128i y)
{
	__m128i t1, t2, t3, t4, t5, t6;

	y = swap128(y);

	/* schoolbook 128x128->256 bit carry-less multiplication, using four
	 * PCLMULQDQ invocations */
	t1 = _mm_clmulepi64_si128(h, y, 0x00);
	t2 = _mm_clmulepi64_si128(h, y, 0x01);
	t3 = _mm_clmulepi64_si128(h, y, 0x10);
	t4 = _mm_clmulepi64_si128(h, y, 0x11);

	t2 = _mm_xor_si128(t2, t3);
	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	t1 = _mm_xor_si128(t1, t3);
	t4 = _mm_xor_si128(t4, t2);

	/* shift the 256-bit product left by one bit to account for GCM's
	 * bit-reflected operands */
	t5 = _mm_srli_epi32(t1, 31);
	t1 = _mm_slli_epi32(t1, 1);
	t6 = _mm_srli_epi32(t4, 31);
	t4 = _mm_slli_epi32(t4, 1);

	t3 = _mm_srli_si128(t5, 12);
	t6 = _mm_slli_si128(t6, 4);
	t5 = _mm_slli_si128(t5, 4);
	t1 = _mm_or_si128(t1, t5);
	t4 = _mm_or_si128(t4, t6);
	t4 = _mm_or_si128(t4, t3);

	/* reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 */
	t5 = _mm_slli_epi32(t1, 31);
	t6 = _mm_slli_epi32(t1, 30);
	t3 = _mm_slli_epi32(t1, 25);

	t5 = _mm_xor_si128(t5, t6);
	t5 = _mm_xor_si128(t5, t3);
	t6 = _mm_srli_si128(t5, 4);
	t4 = _mm_xor_si128(t4, t6);
	t5 = _mm_slli_si128(t5, 12);
	t1 = _mm_xor_si128(t1, t5);
	t4 = _mm_xor_si128(t4, t1);

	t5 = _mm_srli_epi32(t1, 1);
	t2 = _mm_srli_epi32(t1, 2);
	t3 = _mm_srli_epi32(t1, 7);
	t4 = _mm_xor_si128(t4, t2);
	t4 = _mm_xor_si128(t4, t3);
	t4 = _mm_xor_si128(t4, t5);

	return swap128(t4);
}
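
/*
 * mult4xor() below folds four blocks into Y with a single reduction, using
 * the aggregated method
 *   Y' = ((Y ^ C1) x H^4) ^ (C2 x H^3) ^ (C3 x H^2) ^ (C4 x H)
 * which is why set_key() precomputes the powers H^2, H^3 and H^4.
 */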

/**
 * Multiply four consecutive blocks by their respective GHASH key, XOR
 */
static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
							   __m128i d1, __m128i d2, __m128i d3, __m128i d4)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

	d1 = swap128(d1);
	d2 = swap128(d2);
	d3 = swap128(d3);
	d4 = swap128(d4);

	/* low partial products of all four blocks, XORed together */
	t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
	t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
	t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
	t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
	t8 = _mm_xor_si128(t0, t1);
	t8 = _mm_xor_si128(t8, t2);
	t8 = _mm_xor_si128(t8, t3);

	/* high partial products */
	t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
	t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
	t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
	t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
	t9 = _mm_xor_si128(t4, t5);
	t9 = _mm_xor_si128(t9, t6);
	t9 = _mm_xor_si128(t9, t7);

	/* middle products, Karatsuba-style: (h.lo ^ h.hi) x (d.lo ^ d.hi) */
	t0 = _mm_shuffle_epi32(h1, 78);
	t4 = _mm_shuffle_epi32(d1, 78);
	t0 = _mm_xor_si128(t0, h1);
	t4 = _mm_xor_si128(t4, d1);
	t1 = _mm_shuffle_epi32(h2, 78);
	t5 = _mm_shuffle_epi32(d2, 78);
	t1 = _mm_xor_si128(t1, h2);
	t5 = _mm_xor_si128(t5, d2);
	t2 = _mm_shuffle_epi32(h3, 78);
	t6 = _mm_shuffle_epi32(d3, 78);
	t2 = _mm_xor_si128(t2, h3);
	t6 = _mm_xor_si128(t6, d3);
	t3 = _mm_shuffle_epi32(h4, 78);
	t7 = _mm_shuffle_epi32(d4, 78);
	t3 = _mm_xor_si128(t3, h4);
	t7 = _mm_xor_si128(t7, d4);

	t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
	t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
	t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
	t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
	t0 = _mm_xor_si128(t0, t8);
	t0 = _mm_xor_si128(t0, t9);
	t0 = _mm_xor_si128(t1, t0);
	t0 = _mm_xor_si128(t2, t0);

	t0 = _mm_xor_si128(t3, t0);

	/* single shift and reduction for the aggregated 256-bit product */
	t4 = _mm_slli_si128(t0, 8);
	t0 = _mm_srli_si128(t0, 8);
	t3 = _mm_xor_si128(t4, t8);
	t6 = _mm_xor_si128(t0, t9);
	t7 = _mm_srli_epi32(t3, 31);
	t8 = _mm_srli_epi32(t6, 31);
	t3 = _mm_slli_epi32(t3, 1);
	t6 = _mm_slli_epi32(t6, 1);
	t9 = _mm_srli_si128(t7, 12);
	t8 = _mm_slli_si128(t8, 4);
	t7 = _mm_slli_si128(t7, 4);
	t3 = _mm_or_si128(t3, t7);
	t6 = _mm_or_si128(t6, t8);
	t6 = _mm_or_si128(t6, t9);
	t7 = _mm_slli_epi32(t3, 31);
	t8 = _mm_slli_epi32(t3, 30);
	t9 = _mm_slli_epi32(t3, 25);
	t7 = _mm_xor_si128(t7, t8);
	t7 = _mm_xor_si128(t7, t9);
	t8 = _mm_srli_si128(t7, 4);
	t7 = _mm_slli_si128(t7, 12);
	t3 = _mm_xor_si128(t3, t7);
	t2 = _mm_srli_epi32(t3, 1);
	t4 = _mm_srli_epi32(t3, 2);
	t5 = _mm_srli_epi32(t3, 7);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	t2 = _mm_xor_si128(t2, t8);
	t3 = _mm_xor_si128(t3, t2);
	t6 = _mm_xor_si128(t6, t3);

	return swap128(t6);
}

/**
 * GHASH on a single block
 */
static __m128i ghash(__m128i h, __m128i y, __m128i x)
{
	return mult_block(h, _mm_xor_si128(y, x));
}
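
/*
 * The ICV is GHASH(H, assoc || ciphertext || lengths), encrypted with the
 * keystream block E(K, J0): icv_header() hashes the associated data, the
 * en-/decryption routines fold in the ciphertext, icv_tailer() adds the
 * bit-length block, and icv_crypt() applies E(K, J0) and truncates the
 * result to the negotiated ICV size.
 */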

/**
 * Start constructing the ICV for the associated data
 */
static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
{
	u_int blocks, pblocks, rem, i;
	__m128i h1, h2, h3, h4, d1, d2, d3, d4;
	__m128i y, last, *ab;

	h1 = this->hhhh;
	h2 = this->hhh;
	h3 = this->hh;
	h4 = this->h;

	y = _mm_setzero_si128();
	ab = assoc;
	blocks = alen / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = alen % AES_BLOCK_SIZE;
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(ab + i + 0);
		d2 = _mm_loadu_si128(ab + i + 1);
		d3 = _mm_loadu_si128(ab + i + 2);
		d4 = _mm_loadu_si128(ab + i + 3);
		y = _mm_xor_si128(y, d1);
		y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
	}
	for (i = pblocks; i < blocks; i++)
	{
		y = ghash(this->h, y, _mm_loadu_si128(ab + i));
	}
	if (rem)
	{
		last = _mm_setzero_si128();
		memcpy(&last, ab + blocks, rem);

		y = ghash(this->h, y, last);
	}

	return y;
}

/**
 * Complete the ICV by hashing an assoc/data length block
 */
static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
						  size_t alen, size_t dlen)
{
	__m128i b;

	/* the length block encodes the assoc/data lengths in bits */
	htoun64(&b, alen * 8);
	htoun64((u_char*)&b + sizeof(uint64_t), dlen * 8);

	return ghash(this->h, y, b);
}

/**
 * En-/Decrypt the ICV, trim and store it
 */
static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
					  u_char *icv)
{
	__m128i *ks, t, b;
	u_int round;

	/* t = E(K, J0) */
	ks = this->key->schedule;
	t = _mm_xor_si128(j, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);

	t = _mm_xor_si128(y, t);

	_mm_storeu_si128(&b, t);
	memcpy(icv, &b, this->icv_size);
}

/**
 * Do big-endian increment on x
 */
static inline __m128i increment_be(__m128i x)
{
	x = swap128(x);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	x = swap128(x);

	return x;
}

/**
 * Generate the block J0, the salt/IV/counter start block
 */
static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
{
	u_char j[AES_BLOCK_SIZE];

	memcpy(j, this->salt, SALT_SIZE);
	memcpy(j + SALT_SIZE, iv, IV_SIZE);
	htoun32(j + SALT_SIZE + IV_SIZE, 1);

	return _mm_loadu_si128((__m128i*)j);
}

/**
 * Encrypt a remaining incomplete block, return updated Y
 */
static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i *ks, t, b;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

	/* zero the keystream tail before hashing the padded ciphertext */
	memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
	return ghash(this->h, y, b);
}
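
/*
 * GHASH runs over the ciphertext: for decryption the block is hashed before
 * the counter keystream is applied, the inverse order of encrypt_gcm_rem().
 */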

/**
 * Decrypt a remaining incomplete block, return updated Y
 */
static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
							   void *in, void *out, __m128i cb, __m128i y)
{
	__m128i *ks, t, b;
	u_int round;

	memset(&b, 0, sizeof(b));
	memcpy(&b, in, rem);

	y = ghash(this->h, y, b);

	ks = this->key->schedule;
	t = _mm_xor_si128(cb, ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		t = _mm_aesenc_si128(t, ks[round]);
	}
	t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
	b = _mm_xor_si128(t, b);

	memcpy(out, &b, rem);

	return y;
}

/**
 * AES-128 GCM encryption/ICV generation
 */
static void encrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt four blocks per iteration to exploit AES-NI pipelining */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* fold the four ciphertext blocks into Y with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}
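
	/* process remaining whole blocks one at a time, hashing the ciphertext
	 * as it is produced */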
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-128 GCM decryption/ICV generation
 */
static void decrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* hash the four ciphertext blocks before decrypting them */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
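
/*
 * The AES-192 and AES-256 variants below are identical to the AES-128
 * routines, except that they run 12 and 14 AES rounds instead of 10; each is
 * unrolled separately so the round count stays a compile-time constant.
 */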

/**
 * AES-192 GCM encryption/ICV generation
 */
static void encrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-192 GCM decryption/ICV generation
 */
static void decrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-256 GCM encryption/ICV generation
 */
static void encrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

/**
 * AES-256 GCM decryption/ICV generation
 */
static void decrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}

METHOD(aead_t, encrypt, bool,
	private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
	chunk_t *encr)
{
	u_char *out;

	if (!this->key || iv.len != IV_SIZE)
	{
		return FALSE;
	}
	/* encrypt in-place if no output chunk is given */
	out = plain.ptr;
	if (encr)
	{
		*encr = chunk_alloc(plain.len + this->icv_size);
		out = encr->ptr;
	}
	this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, out + plain.len);
	return TRUE;
}

METHOD(aead_t, decrypt, bool,
	private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
	chunk_t *plain)
{
	u_char *out, icv[this->icv_size];

	if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
	{
		return FALSE;
	}
	encr.len -= this->icv_size;
	/* decrypt in-place if no output chunk is given */
	out = encr.ptr;
	if (plain)
	{
		*plain = chunk_alloc(encr.len);
		out = plain->ptr;
	}
	this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
				  assoc.len, assoc.ptr, icv);
	return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
}
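
/*
 * GCM is a counter mode and needs no plaintext padding, hence a block size
 * of one
 */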
METHOD(aead_t, get_block_size, size_t,
	private_aesni_gcm_t *this)
{
	return 1;
}

METHOD(aead_t, get_icv_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->icv_size;
}

METHOD(aead_t, get_iv_size, size_t,
	private_aesni_gcm_t *this)
{
	return IV_SIZE;
}

METHOD(aead_t, get_iv_gen, iv_gen_t*,
	private_aesni_gcm_t *this)
{
	return this->iv_gen;
}

METHOD(aead_t, get_key_size, size_t,
	private_aesni_gcm_t *this)
{
	return this->key_size + SALT_SIZE;
}

METHOD(aead_t, set_key, bool,
	private_aesni_gcm_t *this, chunk_t key)
{
	u_int round;
	__m128i *ks, h;

	if (key.len != this->key_size + SALT_SIZE)
	{
		return FALSE;
	}

	memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
	key.len -= SALT_SIZE;

	DESTROY_IF(this->key);
	this->key = aesni_key_create(TRUE, key);

	/* the GHASH subkey H is the encryption of the all-zero block */
	ks = this->key->schedule;
	h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
	for (round = 1; round < this->key->rounds; round++)
	{
		h = _mm_aesenc_si128(h, ks[round]);
	}
	h = _mm_aesenclast_si128(h, ks[this->key->rounds]);

	/* precompute H^2, H^3 and H^4 for the aggregated four-block GHASH */
	this->h = h;
	h = swap128(h);
	this->hh = mult_block(h, this->h);
	this->hhh = mult_block(h, this->hh);
	this->hhhh = mult_block(h, this->hhh);
	/* store all subkeys byte-swapped, as mult_block()/mult4xor() expect */
	this->h = swap128(this->h);
	this->hh = swap128(this->hh);
	this->hhh = swap128(this->hhh);
	this->hhhh = swap128(this->hhhh);

	return TRUE;
}

METHOD(aead_t, destroy, void,
	private_aesni_gcm_t *this)
{
	DESTROY_IF(this->key);
	memwipe(&this->h, sizeof(this->h));
	memwipe(&this->hh, sizeof(this->hh));
	memwipe(&this->hhh, sizeof(this->hhh));
	memwipe(&this->hhhh, sizeof(this->hhhh));
	this->iv_gen->destroy(this->iv_gen);
	free_align(this);
}

/**
 * See header
 */
aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
							  size_t key_size, size_t salt_size)
{
	private_aesni_gcm_t *this;
	size_t icv_size;

	switch (key_size)
	{
		case 0:
			/* default to AES-128 */
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}
	if (salt_size && salt_size != SALT_SIZE)
	{
		/* currently not supported */
		return NULL;
	}
	switch (algo)
	{
		case ENCR_AES_GCM_ICV8:
			algo = ENCR_AES_CBC;
			icv_size = 8;
			break;
		case ENCR_AES_GCM_ICV12:
			algo = ENCR_AES_CBC;
			icv_size = 12;
			break;
		case ENCR_AES_GCM_ICV16:
			algo = ENCR_AES_CBC;
			icv_size = 16;
			break;
		default:
			return NULL;
	}
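
	/* the struct contains __m128i members, so allocate it 16-byte aligned */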
	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.aead = {
				.encrypt = _encrypt,
				.decrypt = _decrypt,
				.get_block_size = _get_block_size,
				.get_icv_size = _get_icv_size,
				.get_iv_size = _get_iv_size,
				.get_iv_gen = _get_iv_gen,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
		.iv_gen = iv_gen_seq_create(),
		.icv_size = icv_size,
	);

	switch (key_size)
	{
		case 16:
			this->encrypt = encrypt_gcm128;
			this->decrypt = decrypt_gcm128;
			break;
		case 24:
			this->encrypt = encrypt_gcm192;
			this->decrypt = decrypt_gcm192;
			break;
		case 32:
			this->encrypt = encrypt_gcm256;
			this->decrypt = decrypt_gcm256;
			break;
	}

	return &this->public;
}