Annotation of embedaddon/strongswan/src/libstrongswan/plugins/aesni/aesni_gcm.c, revision 1.1
1.1 ! misho 1: /*
! 2: * Copyright (C) 2015 Martin Willi
! 3: * Copyright (C) 2015 revosec AG
! 4: *
! 5: * This program is free software; you can redistribute it and/or modify it
! 6: * under the terms of the GNU General Public License as published by the
! 7: * Free Software Foundation; either version 2 of the License, or (at your
! 8: * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
! 9: *
! 10: * This program is distributed in the hope that it will be useful, but
! 11: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
! 12: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
! 13: * for more details.
! 14: */
! 15:
! 16: #include "aesni_gcm.h"
! 17: #include "aesni_key.h"
! 18:
! 19: #include <crypto/iv/iv_gen_seq.h>
! 20:
! 21: #include <tmmintrin.h>
! 22:
! 23: #define NONCE_SIZE 12
! 24: #define IV_SIZE 8
! 25: #define SALT_SIZE (NONCE_SIZE - IV_SIZE)
! 26:
! 27: /**
! 28: * Parallel pipelining
! 29: */
! 30: #define GCM_CRYPT_PARALLELISM 4
! 31:
typedef struct private_aesni_gcm_t private_aesni_gcm_t;

/**
 * GCM en/decryption method type
 *
 * Bound to one of the key-size specific en-/decrypt functions below:
 * (this, len, in, out, iv, alen, assoc, icv).
 */
typedef void (*aesni_gcm_fn_t)(private_aesni_gcm_t*, size_t, u_char*, u_char*,
							   u_char*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_gcm_t object.
 */
struct private_aesni_gcm_t {

	/**
	 * Public aesni_gcm_t interface.
	 */
	aesni_gcm_t public;

	/**
	 * Encryption key schedule (expanded round keys)
	 */
	aesni_key_t *key;

	/**
	 * IV generator.
	 */
	iv_gen_t *iv_gen;

	/**
	 * Length of the integrity check value, in bytes
	 */
	size_t icv_size;

	/**
	 * Length of the key in bytes
	 */
	size_t key_size;

	/**
	 * GCM encryption function, selected by key size
	 */
	aesni_gcm_fn_t encrypt;

	/**
	 * GCM decryption function, selected by key size
	 */
	aesni_gcm_fn_t decrypt;

	/**
	 * salt to add to nonce (implicit part of the GCM nonce)
	 */
	u_char salt[SALT_SIZE];

	/**
	 * GHASH subkey H, big-endian
	 */
	__m128i h;

	/**
	 * GHASH key H^2, big-endian (for 4-block parallel GHASH)
	 */
	__m128i hh;

	/**
	 * GHASH key H^3, big-endian (for 4-block parallel GHASH)
	 */
	__m128i hhh;

	/**
	 * GHASH key H^4, big-endian (for 4-block parallel GHASH)
	 */
	__m128i hhhh;
};
! 105:
/**
 * Reverse the byte order of a 128-bit value (endian swap via PSHUFB)
 */
static inline __m128i swap128(__m128i x)
{
	const __m128i reverse = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
										 8, 9, 10, 11, 12, 13, 14, 15);

	return _mm_shuffle_epi8(x, reverse);
}
! 114:
/**
 * Multiply two blocks in GF(2^128)
 *
 * h is a GHASH subkey already in big-endian representation, y is a block in
 * wire order (byte-swapped on entry/exit).  The 128x128->256 bit carry-less
 * product is built from four PCLMULQDQ partial products, shifted left by one
 * bit to compensate for GHASH's bit-reflected representation, and reduced
 * modulo the GCM polynomial.  Follows the Intel PCLMULQDQ/GCM white paper
 * construction.
 */
static __m128i mult_block(__m128i h, __m128i y)
{
	__m128i t1, t2, t3, t4, t5, t6;

	/* bring y into the same big-endian representation as h */
	y = swap128(y);

	/* schoolbook carry-less multiply: lo*lo, lo*hi, hi*lo, hi*hi */
	t1 = _mm_clmulepi64_si128(h, y, 0x00);
	t2 = _mm_clmulepi64_si128(h, y, 0x01);
	t3 = _mm_clmulepi64_si128(h, y, 0x10);
	t4 = _mm_clmulepi64_si128(h, y, 0x11);

	/* fold the two middle products into the 256-bit result t4:t1 */
	t2 = _mm_xor_si128(t2, t3);
	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	t1 = _mm_xor_si128(t1, t3);
	t4 = _mm_xor_si128(t4, t2);

	/* shift the whole 256-bit product left by one bit, carrying across
	 * the 32-bit lanes and the t1/t4 boundary */
	t5 = _mm_srli_epi32(t1, 31);
	t1 = _mm_slli_epi32(t1, 1);
	t6 = _mm_srli_epi32(t4, 31);
	t4 = _mm_slli_epi32(t4, 1);

	t3 = _mm_srli_si128(t5, 12);
	t6 = _mm_slli_si128(t6, 4);
	t5 = _mm_slli_si128(t5, 4);
	t1 = _mm_or_si128(t1, t5);
	t4 = _mm_or_si128(t4, t6);
	t4 = _mm_or_si128(t4, t3);

	/* first reduction phase: multiply the low half by x^31 + x^30 + x^25 */
	t5 = _mm_slli_epi32(t1, 31);
	t6 = _mm_slli_epi32(t1, 30);
	t3 = _mm_slli_epi32(t1, 25);

	t5 = _mm_xor_si128(t5, t6);
	t5 = _mm_xor_si128(t5, t3);
	t6 = _mm_srli_si128(t5, 4);
	t4 = _mm_xor_si128(t4, t6);
	t5 = _mm_slli_si128(t5, 12);
	t1 = _mm_xor_si128(t1, t5);
	t4 = _mm_xor_si128(t4, t1);

	/* second reduction phase: shifts by 1, 2 and 7, fold into the result */
	t5 = _mm_srli_epi32(t1, 1);
	t2 = _mm_srli_epi32(t1, 2);
	t3 = _mm_srli_epi32(t1, 7);
	t4 = _mm_xor_si128(t4, t2);
	t4 = _mm_xor_si128(t4, t3);
	t4 = _mm_xor_si128(t4, t5);

	/* return in wire order */
	return swap128(t4);
}
! 168:
/**
 * Multiply four consecutive blocks by their respective GHASH key, XOR
 *
 * Computes d1*h1 ^ d2*h2 ^ d3*h3 ^ d4*h4 in GF(2^128), with h1..h4 the
 * big-endian key powers (H^4..H^1) and d1..d4 blocks in wire order.  Each
 * block is multiplied with a 3-PCLMULQDQ Karatsuba (the shuffle-by-78 swaps
 * qword halves to form the cross terms); the four 256-bit products are
 * XOR-aggregated first so the expensive modular reduction runs only once.
 */
static inline __m128i mult4xor(__m128i h1, __m128i h2, __m128i h3, __m128i h4,
							   __m128i d1, __m128i d2, __m128i d3, __m128i d4)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;

	/* bring data blocks into big-endian representation */
	d1 = swap128(d1);
	d2 = swap128(d2);
	d3 = swap128(d3);
	d4 = swap128(d4);

	/* low halves of all four products, aggregated into t8 */
	t0 = _mm_clmulepi64_si128(h1, d1, 0x00);
	t1 = _mm_clmulepi64_si128(h2, d2, 0x00);
	t2 = _mm_clmulepi64_si128(h3, d3, 0x00);
	t3 = _mm_clmulepi64_si128(h4, d4, 0x00);
	t8 = _mm_xor_si128(t0, t1);
	t8 = _mm_xor_si128(t8, t2);
	t8 = _mm_xor_si128(t8, t3);

	/* high halves of all four products, aggregated into t9 */
	t4 = _mm_clmulepi64_si128(h1, d1, 0x11);
	t5 = _mm_clmulepi64_si128(h2, d2, 0x11);
	t6 = _mm_clmulepi64_si128(h3, d3, 0x11);
	t7 = _mm_clmulepi64_si128(h4, d4, 0x11);
	t9 = _mm_xor_si128(t4, t5);
	t9 = _mm_xor_si128(t9, t6);
	t9 = _mm_xor_si128(t9, t7);

	/* Karatsuba middle terms: (h.lo^h.hi) and (d.lo^d.hi) per block */
	t0 = _mm_shuffle_epi32(h1, 78);
	t4 = _mm_shuffle_epi32(d1, 78);
	t0 = _mm_xor_si128(t0, h1);
	t4 = _mm_xor_si128(t4, d1);
	t1 = _mm_shuffle_epi32(h2, 78);
	t5 = _mm_shuffle_epi32(d2, 78);
	t1 = _mm_xor_si128(t1, h2);
	t5 = _mm_xor_si128(t5, d2);
	t2 = _mm_shuffle_epi32(h3, 78);
	t6 = _mm_shuffle_epi32(d3, 78);
	t2 = _mm_xor_si128(t2, h3);
	t6 = _mm_xor_si128(t6, d3);
	t3 = _mm_shuffle_epi32(h4, 78);
	t7 = _mm_shuffle_epi32(d4, 78);
	t3 = _mm_xor_si128(t3, h4);
	t7 = _mm_xor_si128(t7, d4);

	/* middle products, fold in low/high aggregates */
	t0 = _mm_clmulepi64_si128(t0, t4, 0x00);
	t1 = _mm_clmulepi64_si128(t1, t5, 0x00);
	t2 = _mm_clmulepi64_si128(t2, t6, 0x00);
	t3 = _mm_clmulepi64_si128(t3, t7, 0x00);
	t0 = _mm_xor_si128(t0, t8);
	t0 = _mm_xor_si128(t0, t9);
	t0 = _mm_xor_si128(t1, t0);
	t0 = _mm_xor_si128(t2, t0);

	/* combine into the 256-bit sum t6:t3 */
	t0 = _mm_xor_si128(t3, t0);
	t4 = _mm_slli_si128(t0, 8);
	t0 = _mm_srli_si128(t0, 8);
	t3 = _mm_xor_si128(t4, t8);
	t6 = _mm_xor_si128(t0, t9);
	/* shift left by one bit (bit-reflected representation) */
	t7 = _mm_srli_epi32(t3, 31);
	t8 = _mm_srli_epi32(t6, 31);
	t3 = _mm_slli_epi32(t3, 1);
	t6 = _mm_slli_epi32(t6, 1);
	t9 = _mm_srli_si128(t7, 12);
	t8 = _mm_slli_si128(t8, 4);
	t7 = _mm_slli_si128(t7, 4);
	t3 = _mm_or_si128(t3, t7);
	t6 = _mm_or_si128(t6, t8);
	t6 = _mm_or_si128(t6, t9);
	/* single modular reduction of the aggregated product */
	t7 = _mm_slli_epi32(t3, 31);
	t8 = _mm_slli_epi32(t3, 30);
	t9 = _mm_slli_epi32(t3, 25);
	t7 = _mm_xor_si128(t7, t8);
	t7 = _mm_xor_si128(t7, t9);
	t8 = _mm_srli_si128(t7, 4);
	t7 = _mm_slli_si128(t7, 12);
	t3 = _mm_xor_si128(t3, t7);
	t2 = _mm_srli_epi32(t3, 1);
	t4 = _mm_srli_epi32(t3, 2);
	t5 = _mm_srli_epi32(t3, 7);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	t2 = _mm_xor_si128(t2, t8);
	t3 = _mm_xor_si128(t3, t2);
	t6 = _mm_xor_si128(t6, t3);

	/* return in wire order */
	return swap128(t6);
}
! 258:
! 259: /**
! 260: * GHASH on a single block
! 261: */
! 262: static __m128i ghash(__m128i h, __m128i y, __m128i x)
! 263: {
! 264: return mult_block(h, _mm_xor_si128(y, x));
! 265: }
! 266:
! 267: /**
! 268: * Start constructing the ICV for the associated data
! 269: */
! 270: static __m128i icv_header(private_aesni_gcm_t *this, void *assoc, size_t alen)
! 271: {
! 272: u_int blocks, pblocks, rem, i;
! 273: __m128i h1, h2, h3, h4, d1, d2, d3, d4;
! 274: __m128i y, last, *ab;
! 275:
! 276: h1 = this->hhhh;
! 277: h2 = this->hhh;
! 278: h3 = this->hh;
! 279: h4 = this->h;
! 280:
! 281: y = _mm_setzero_si128();
! 282: ab = assoc;
! 283: blocks = alen / AES_BLOCK_SIZE;
! 284: pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
! 285: rem = alen % AES_BLOCK_SIZE;
! 286: for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
! 287: {
! 288: d1 = _mm_loadu_si128(ab + i + 0);
! 289: d2 = _mm_loadu_si128(ab + i + 1);
! 290: d3 = _mm_loadu_si128(ab + i + 2);
! 291: d4 = _mm_loadu_si128(ab + i + 3);
! 292: y = _mm_xor_si128(y, d1);
! 293: y = mult4xor(h1, h2, h3, h4, y, d2, d3, d4);
! 294: }
! 295: for (i = pblocks; i < blocks; i++)
! 296: {
! 297: y = ghash(this->h, y, _mm_loadu_si128(ab + i));
! 298: }
! 299: if (rem)
! 300: {
! 301: last = _mm_setzero_si128();
! 302: memcpy(&last, ab + blocks, rem);
! 303:
! 304: y = ghash(this->h, y, last);
! 305: }
! 306:
! 307: return y;
! 308: }
! 309:
! 310: /**
! 311: * Complete the ICV by hashing a assoc/data length block
! 312: */
! 313: static __m128i icv_tailer(private_aesni_gcm_t *this, __m128i y,
! 314: size_t alen, size_t dlen)
! 315: {
! 316: __m128i b;
! 317:
! 318: htoun64(&b, alen * 8);
! 319: htoun64((u_char*)&b + sizeof(uint64_t), dlen * 8);
! 320:
! 321: return ghash(this->h, y, b);
! 322: }
! 323:
! 324: /**
! 325: * En-/Decrypt the ICV, trim and store it
! 326: */
! 327: static void icv_crypt(private_aesni_gcm_t *this, __m128i y, __m128i j,
! 328: u_char *icv)
! 329: {
! 330: __m128i *ks, t, b;
! 331: u_int round;
! 332:
! 333: ks = this->key->schedule;
! 334: t = _mm_xor_si128(j, ks[0]);
! 335: for (round = 1; round < this->key->rounds; round++)
! 336: {
! 337: t = _mm_aesenc_si128(t, ks[round]);
! 338: }
! 339: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
! 340:
! 341: t = _mm_xor_si128(y, t);
! 342:
! 343: _mm_storeu_si128(&b, t);
! 344: memcpy(icv, &b, this->icv_size);
! 345: }
! 346:
! 347: /**
! 348: * Do big-endian increment on x
! 349: */
! 350: static inline __m128i increment_be(__m128i x)
! 351: {
! 352: x = swap128(x);
! 353: x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
! 354: x = swap128(x);
! 355:
! 356: return x;
! 357: }
! 358:
! 359: /**
! 360: * Generate the block J0
! 361: */
! 362: static inline __m128i create_j(private_aesni_gcm_t *this, u_char *iv)
! 363: {
! 364: u_char j[AES_BLOCK_SIZE];
! 365:
! 366: memcpy(j, this->salt, SALT_SIZE);
! 367: memcpy(j + SALT_SIZE, iv, IV_SIZE);
! 368: htoun32(j + SALT_SIZE + IV_SIZE, 1);
! 369:
! 370: return _mm_loadu_si128((__m128i*)j);
! 371: }
! 372:
! 373: /**
! 374: * Encrypt a remaining incomplete block, return updated Y
! 375: */
! 376: static __m128i encrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
! 377: void *in, void *out, __m128i cb, __m128i y)
! 378: {
! 379: __m128i *ks, t, b;
! 380: u_int round;
! 381:
! 382: memset(&b, 0, sizeof(b));
! 383: memcpy(&b, in, rem);
! 384:
! 385: ks = this->key->schedule;
! 386: t = _mm_xor_si128(cb, ks[0]);
! 387: for (round = 1; round < this->key->rounds; round++)
! 388: {
! 389: t = _mm_aesenc_si128(t, ks[round]);
! 390: }
! 391: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
! 392: b = _mm_xor_si128(t, b);
! 393:
! 394: memcpy(out, &b, rem);
! 395:
! 396: memset((u_char*)&b + rem, 0, AES_BLOCK_SIZE - rem);
! 397: return ghash(this->h, y, b);
! 398: }
! 399:
! 400: /**
! 401: * Decrypt a remaining incomplete block, return updated Y
! 402: */
! 403: static __m128i decrypt_gcm_rem(private_aesni_gcm_t *this, u_int rem,
! 404: void *in, void *out, __m128i cb, __m128i y)
! 405: {
! 406: __m128i *ks, t, b;
! 407: u_int round;
! 408:
! 409: memset(&b, 0, sizeof(b));
! 410: memcpy(&b, in, rem);
! 411:
! 412: y = ghash(this->h, y, b);
! 413:
! 414: ks = this->key->schedule;
! 415: t = _mm_xor_si128(cb, ks[0]);
! 416: for (round = 1; round < this->key->rounds; round++)
! 417: {
! 418: t = _mm_aesenc_si128(t, ks[round]);
! 419: }
! 420: t = _mm_aesenclast_si128(t, ks[this->key->rounds]);
! 421: b = _mm_xor_si128(t, b);
! 422:
! 423: memcpy(out, &b, rem);
! 424:
! 425: return y;
! 426: }
! 427:
/**
 * AES-128 GCM encryption/ICV generation
 *
 * CTR-encrypts len bytes from in to out with the 10-round AES-128 schedule,
 * GHASHes the associated data and the produced ciphertext, and stores the
 * (truncated) ICV at icv.  The AES rounds of four counter blocks are
 * interleaved manually to keep the AES-NI pipeline filled.
 */
static void encrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt and hash four blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* initial AddRoundKey on four consecutive counter blocks */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* rounds 1-9, interleaved over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		/* final round */
		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		/* keystream XOR plaintext -> ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* GHASH the four ciphertext blocks with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining complete blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, then length block and final ICV */
	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
! 553:
/**
 * AES-128 GCM decryption/ICV generation
 *
 * Mirror of encrypt_gcm128(): GHASHes the ciphertext (before decryption),
 * CTR-decrypts len bytes from in to out, and stores the computed ICV at
 * icv.  The caller is responsible for comparing the ICV.
 */
static void decrypt_gcm128(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* hash and decrypt four blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* GHASH runs over the ciphertext, so before decryption here */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		/* initial AddRoundKey on four consecutive counter blocks */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* rounds 1-9, interleaved over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		/* final round */
		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);

		/* keystream XOR ciphertext -> plaintext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining complete blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenclast_si128(t1, ks[10]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, then length block and final ICV */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
! 679:
/**
 * AES-192 GCM encryption/ICV generation
 *
 * Same structure as encrypt_gcm128(), but with the 12-round AES-192 key
 * schedule (rounds 1-11 plus a final round with ks[12]).
 */
static void encrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* encrypt and hash four blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* initial AddRoundKey on four consecutive counter blocks */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* rounds 1-11, interleaved over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		/* final round */
		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		/* keystream XOR plaintext -> ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		/* GHASH the four ciphertext blocks with a single reduction */
		y = _mm_xor_si128(y, t1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining complete blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		y = ghash(this->h, y, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, then length block and final ICV */
	if (rem)
	{
		y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
! 815:
/**
 * AES-192 GCM decryption/ICV generation
 *
 * Mirror of encrypt_gcm192(): GHASHes the ciphertext (before decryption),
 * CTR-decrypts with the 12-round AES-192 schedule and stores the computed
 * ICV at icv.  The caller is responsible for comparing the ICV.
 */
static void decrypt_gcm192(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	j = create_j(this, iv);
	cb = increment_be(j);
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* hash and decrypt four blocks per iteration */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* GHASH runs over the ciphertext, so before decryption here */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		/* initial AddRoundKey on four consecutive counter blocks */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* rounds 1-11, interleaved over the four blocks */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		/* final round */
		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);

		/* keystream XOR ciphertext -> plaintext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining complete blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenclast_si128(t1, ks[12]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, then length block and final ICV */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
! 951:
! 952: /**
! 953: * AES-256 GCM encryption/ICV generation
! 954: */
! 955: static void encrypt_gcm256(private_aesni_gcm_t *this,
! 956: size_t len, u_char *in, u_char *out, u_char *iv,
! 957: size_t alen, u_char *assoc, u_char *icv)
! 958: {
! 959: __m128i d1, d2, d3, d4, t1, t2, t3, t4;
! 960: __m128i *ks, y, j, cb, *bi, *bo;
! 961: u_int blocks, pblocks, rem, i;
! 962:
! 963: j = create_j(this, iv);
! 964: cb = increment_be(j);
! 965: y = icv_header(this, assoc, alen);
! 966: blocks = len / AES_BLOCK_SIZE;
! 967: pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
! 968: rem = len % AES_BLOCK_SIZE;
! 969: bi = (__m128i*)in;
! 970: bo = (__m128i*)out;
! 971:
! 972: ks = this->key->schedule;
! 973:
! 974: for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
! 975: {
! 976: d1 = _mm_loadu_si128(bi + i + 0);
! 977: d2 = _mm_loadu_si128(bi + i + 1);
! 978: d3 = _mm_loadu_si128(bi + i + 2);
! 979: d4 = _mm_loadu_si128(bi + i + 3);
! 980:
! 981: t1 = _mm_xor_si128(cb, ks[0]);
! 982: cb = increment_be(cb);
! 983: t2 = _mm_xor_si128(cb, ks[0]);
! 984: cb = increment_be(cb);
! 985: t3 = _mm_xor_si128(cb, ks[0]);
! 986: cb = increment_be(cb);
! 987: t4 = _mm_xor_si128(cb, ks[0]);
! 988: cb = increment_be(cb);
! 989:
! 990: t1 = _mm_aesenc_si128(t1, ks[1]);
! 991: t2 = _mm_aesenc_si128(t2, ks[1]);
! 992: t3 = _mm_aesenc_si128(t3, ks[1]);
! 993: t4 = _mm_aesenc_si128(t4, ks[1]);
! 994: t1 = _mm_aesenc_si128(t1, ks[2]);
! 995: t2 = _mm_aesenc_si128(t2, ks[2]);
! 996: t3 = _mm_aesenc_si128(t3, ks[2]);
! 997: t4 = _mm_aesenc_si128(t4, ks[2]);
! 998: t1 = _mm_aesenc_si128(t1, ks[3]);
! 999: t2 = _mm_aesenc_si128(t2, ks[3]);
! 1000: t3 = _mm_aesenc_si128(t3, ks[3]);
! 1001: t4 = _mm_aesenc_si128(t4, ks[3]);
! 1002: t1 = _mm_aesenc_si128(t1, ks[4]);
! 1003: t2 = _mm_aesenc_si128(t2, ks[4]);
! 1004: t3 = _mm_aesenc_si128(t3, ks[4]);
! 1005: t4 = _mm_aesenc_si128(t4, ks[4]);
! 1006: t1 = _mm_aesenc_si128(t1, ks[5]);
! 1007: t2 = _mm_aesenc_si128(t2, ks[5]);
! 1008: t3 = _mm_aesenc_si128(t3, ks[5]);
! 1009: t4 = _mm_aesenc_si128(t4, ks[5]);
! 1010: t1 = _mm_aesenc_si128(t1, ks[6]);
! 1011: t2 = _mm_aesenc_si128(t2, ks[6]);
! 1012: t3 = _mm_aesenc_si128(t3, ks[6]);
! 1013: t4 = _mm_aesenc_si128(t4, ks[6]);
! 1014: t1 = _mm_aesenc_si128(t1, ks[7]);
! 1015: t2 = _mm_aesenc_si128(t2, ks[7]);
! 1016: t3 = _mm_aesenc_si128(t3, ks[7]);
! 1017: t4 = _mm_aesenc_si128(t4, ks[7]);
! 1018: t1 = _mm_aesenc_si128(t1, ks[8]);
! 1019: t2 = _mm_aesenc_si128(t2, ks[8]);
! 1020: t3 = _mm_aesenc_si128(t3, ks[8]);
! 1021: t4 = _mm_aesenc_si128(t4, ks[8]);
! 1022: t1 = _mm_aesenc_si128(t1, ks[9]);
! 1023: t2 = _mm_aesenc_si128(t2, ks[9]);
! 1024: t3 = _mm_aesenc_si128(t3, ks[9]);
! 1025: t4 = _mm_aesenc_si128(t4, ks[9]);
! 1026: t1 = _mm_aesenc_si128(t1, ks[10]);
! 1027: t2 = _mm_aesenc_si128(t2, ks[10]);
! 1028: t3 = _mm_aesenc_si128(t3, ks[10]);
! 1029: t4 = _mm_aesenc_si128(t4, ks[10]);
! 1030: t1 = _mm_aesenc_si128(t1, ks[11]);
! 1031: t2 = _mm_aesenc_si128(t2, ks[11]);
! 1032: t3 = _mm_aesenc_si128(t3, ks[11]);
! 1033: t4 = _mm_aesenc_si128(t4, ks[11]);
! 1034: t1 = _mm_aesenc_si128(t1, ks[12]);
! 1035: t2 = _mm_aesenc_si128(t2, ks[12]);
! 1036: t3 = _mm_aesenc_si128(t3, ks[12]);
! 1037: t4 = _mm_aesenc_si128(t4, ks[12]);
! 1038: t1 = _mm_aesenc_si128(t1, ks[13]);
! 1039: t2 = _mm_aesenc_si128(t2, ks[13]);
! 1040: t3 = _mm_aesenc_si128(t3, ks[13]);
! 1041: t4 = _mm_aesenc_si128(t4, ks[13]);
! 1042:
! 1043: t1 = _mm_aesenclast_si128(t1, ks[14]);
! 1044: t2 = _mm_aesenclast_si128(t2, ks[14]);
! 1045: t3 = _mm_aesenclast_si128(t3, ks[14]);
! 1046: t4 = _mm_aesenclast_si128(t4, ks[14]);
! 1047:
! 1048: t1 = _mm_xor_si128(t1, d1);
! 1049: t2 = _mm_xor_si128(t2, d2);
! 1050: t3 = _mm_xor_si128(t3, d3);
! 1051: t4 = _mm_xor_si128(t4, d4);
! 1052:
! 1053: y = _mm_xor_si128(y, t1);
! 1054: y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, t2, t3, t4);
! 1055:
! 1056: _mm_storeu_si128(bo + i + 0, t1);
! 1057: _mm_storeu_si128(bo + i + 1, t2);
! 1058: _mm_storeu_si128(bo + i + 2, t3);
! 1059: _mm_storeu_si128(bo + i + 3, t4);
! 1060: }
! 1061:
! 1062: for (i = pblocks; i < blocks; i++)
! 1063: {
! 1064: d1 = _mm_loadu_si128(bi + i);
! 1065:
! 1066: t1 = _mm_xor_si128(cb, ks[0]);
! 1067: t1 = _mm_aesenc_si128(t1, ks[1]);
! 1068: t1 = _mm_aesenc_si128(t1, ks[2]);
! 1069: t1 = _mm_aesenc_si128(t1, ks[3]);
! 1070: t1 = _mm_aesenc_si128(t1, ks[4]);
! 1071: t1 = _mm_aesenc_si128(t1, ks[5]);
! 1072: t1 = _mm_aesenc_si128(t1, ks[6]);
! 1073: t1 = _mm_aesenc_si128(t1, ks[7]);
! 1074: t1 = _mm_aesenc_si128(t1, ks[8]);
! 1075: t1 = _mm_aesenc_si128(t1, ks[9]);
! 1076: t1 = _mm_aesenc_si128(t1, ks[10]);
! 1077: t1 = _mm_aesenc_si128(t1, ks[11]);
! 1078: t1 = _mm_aesenc_si128(t1, ks[12]);
! 1079: t1 = _mm_aesenc_si128(t1, ks[13]);
! 1080: t1 = _mm_aesenclast_si128(t1, ks[14]);
! 1081:
! 1082: t1 = _mm_xor_si128(t1, d1);
! 1083: _mm_storeu_si128(bo + i, t1);
! 1084:
! 1085: y = ghash(this->h, y, t1);
! 1086:
! 1087: cb = increment_be(cb);
! 1088: }
! 1089:
! 1090: if (rem)
! 1091: {
! 1092: y = encrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
! 1093: }
! 1094: y = icv_tailer(this, y, alen, len);
! 1095: icv_crypt(this, y, j, icv);
! 1096: }
! 1097:
! 1098: /**
! 1099: * AES-256 GCM decryption/ICV generation
! 1100: */
static void decrypt_gcm256(private_aesni_gcm_t *this,
						   size_t len, u_char *in, u_char *out, u_char *iv,
						   size_t alen, u_char *assoc, u_char *icv)
{
	__m128i d1, d2, d3, d4, t1, t2, t3, t4;
	__m128i *ks, y, j, cb, *bi, *bo;
	u_int blocks, pblocks, rem, i;

	/* J0: initial counter block built from salt + per-message IV */
	j = create_j(this, iv);
	/* payload counters start at J0 + 1; J0 itself encrypts the ICV */
	cb = increment_be(j);
	/* seed the GHASH state with the associated data */
	y = icv_header(this, assoc, alen);
	blocks = len / AES_BLOCK_SIZE;
	/* blocks handled by the 4-way interleaved loop below */
	pblocks = blocks - (blocks % GCM_CRYPT_PARALLELISM);
	/* trailing partial-block bytes, if any */
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	/* Process 4 blocks per iteration. For decryption the ciphertext is
	 * GHASHed first (mult4xor folds 4 blocks using the precomputed
	 * H..H^4 powers), then four AES-256 counter encryptions (14 rounds)
	 * run interleaved to keep the AES-NI pipeline saturated. */
	for (i = 0; i < pblocks; i += GCM_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		/* authenticate the ciphertext blocks before decrypting them */
		y = _mm_xor_si128(y, d1);
		y = mult4xor(this->hhhh, this->hhh, this->hh, this->h, y, d2, d3, d4);

		/* initial AddRoundKey on four consecutive counter values */
		t1 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t2 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t3 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);
		t4 = _mm_xor_si128(cb, ks[0]);
		cb = increment_be(cb);

		/* 13 middle rounds, manually unrolled and interleaved 4-wide */
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		/* final round */
		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);

		/* CTR mode: plaintext = keystream XOR ciphertext */
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);

		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	/* remaining full blocks, one at a time */
	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		/* GHASH the ciphertext block before decrypting */
		y = ghash(this->h, y, d1);

		t1 = _mm_xor_si128(cb, ks[0]);
		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t1 = _mm_aesenclast_si128(t1, ks[14]);

		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);

		cb = increment_be(cb);
	}

	/* partial trailing block, if the payload is not block-aligned */
	if (rem)
	{
		y = decrypt_gcm_rem(this, rem, bi + blocks, bo + blocks, cb, y);
	}
	/* close GHASH with the length block and encrypt the tag with J0 */
	y = icv_tailer(this, y, alen, len);
	icv_crypt(this, y, j, icv);
}
! 1243:
! 1244: METHOD(aead_t, encrypt, bool,
! 1245: private_aesni_gcm_t *this, chunk_t plain, chunk_t assoc, chunk_t iv,
! 1246: chunk_t *encr)
! 1247: {
! 1248: u_char *out;
! 1249:
! 1250: if (!this->key || iv.len != IV_SIZE)
! 1251: {
! 1252: return FALSE;
! 1253: }
! 1254: out = plain.ptr;
! 1255: if (encr)
! 1256: {
! 1257: *encr = chunk_alloc(plain.len + this->icv_size);
! 1258: out = encr->ptr;
! 1259: }
! 1260: this->encrypt(this, plain.len, plain.ptr, out, iv.ptr,
! 1261: assoc.len, assoc.ptr, out + plain.len);
! 1262: return TRUE;
! 1263: }
! 1264:
! 1265: METHOD(aead_t, decrypt, bool,
! 1266: private_aesni_gcm_t *this, chunk_t encr, chunk_t assoc, chunk_t iv,
! 1267: chunk_t *plain)
! 1268: {
! 1269: u_char *out, icv[this->icv_size];
! 1270:
! 1271: if (!this->key || iv.len != IV_SIZE || encr.len < this->icv_size)
! 1272: {
! 1273: return FALSE;
! 1274: }
! 1275: encr.len -= this->icv_size;
! 1276: out = encr.ptr;
! 1277: if (plain)
! 1278: {
! 1279: *plain = chunk_alloc(encr.len);
! 1280: out = plain->ptr;
! 1281: }
! 1282: this->decrypt(this, encr.len, encr.ptr, out, iv.ptr,
! 1283: assoc.len, assoc.ptr, icv);
! 1284: return memeq_const(icv, encr.ptr + encr.len, this->icv_size);
! 1285: }
! 1286:
METHOD(aead_t, get_block_size, size_t,
	private_aesni_gcm_t *this)
{
	/* GCM is a stream mode; no block alignment is required */
	return 1;
}
! 1292:
METHOD(aead_t, get_icv_size, size_t,
	private_aesni_gcm_t *this)
{
	/* 8, 12 or 16 bytes, fixed at construction from the algorithm ID */
	return this->icv_size;
}
! 1298:
METHOD(aead_t, get_iv_size, size_t,
	private_aesni_gcm_t *this)
{
	/* explicit per-message IV; the 4 byte salt completes the 12 byte nonce */
	return IV_SIZE;
}
! 1304:
METHOD(aead_t, get_iv_gen, iv_gen_t*,
	private_aesni_gcm_t *this)
{
	/* sequential IV generator; GCM security requires non-repeating IVs */
	return this->iv_gen;
}
! 1310:
METHOD(aead_t, get_key_size, size_t,
	private_aesni_gcm_t *this)
{
	/* keymat is AES key plus implicit nonce salt */
	return this->key_size + SALT_SIZE;
}
! 1316:
! 1317: METHOD(aead_t, set_key, bool,
! 1318: private_aesni_gcm_t *this, chunk_t key)
! 1319: {
! 1320: u_int round;
! 1321: __m128i *ks, h;
! 1322:
! 1323: if (key.len != this->key_size + SALT_SIZE)
! 1324: {
! 1325: return FALSE;
! 1326: }
! 1327:
! 1328: memcpy(this->salt, key.ptr + key.len - SALT_SIZE, SALT_SIZE);
! 1329: key.len -= SALT_SIZE;
! 1330:
! 1331: DESTROY_IF(this->key);
! 1332: this->key = aesni_key_create(TRUE, key);
! 1333:
! 1334: ks = this->key->schedule;
! 1335: h = _mm_xor_si128(_mm_setzero_si128(), ks[0]);
! 1336: for (round = 1; round < this->key->rounds; round++)
! 1337: {
! 1338: h = _mm_aesenc_si128(h, ks[round]);
! 1339: }
! 1340: h = _mm_aesenclast_si128(h, ks[this->key->rounds]);
! 1341:
! 1342: this->h = h;
! 1343: h = swap128(h);
! 1344: this->hh = mult_block(h, this->h);
! 1345: this->hhh = mult_block(h, this->hh);
! 1346: this->hhhh = mult_block(h, this->hhh);
! 1347: this->h = swap128(this->h);
! 1348: this->hh = swap128(this->hh);
! 1349: this->hhh = swap128(this->hhh);
! 1350: this->hhhh = swap128(this->hhhh);
! 1351:
! 1352: return TRUE;
! 1353: }
! 1354:
METHOD(aead_t, destroy, void,
	private_aesni_gcm_t *this)
{
	DESTROY_IF(this->key);
	/* wipe the GHASH subkey powers: they are key-derived secrets */
	memwipe(&this->h, sizeof(this->h));
	memwipe(&this->hh, sizeof(this->hh));
	memwipe(&this->hhh, sizeof(this->hhh));
	memwipe(&this->hhhh, sizeof(this->hhhh));
	this->iv_gen->destroy(this->iv_gen);
	/* allocated with INIT_ALIGN, so release with free_align */
	free_align(this);
}
! 1366:
! 1367: /**
! 1368: * See header
! 1369: */
! 1370: aesni_gcm_t *aesni_gcm_create(encryption_algorithm_t algo,
! 1371: size_t key_size, size_t salt_size)
! 1372: {
! 1373: private_aesni_gcm_t *this;
! 1374: size_t icv_size;
! 1375:
! 1376: switch (key_size)
! 1377: {
! 1378: case 0:
! 1379: key_size = 16;
! 1380: break;
! 1381: case 16:
! 1382: case 24:
! 1383: case 32:
! 1384: break;
! 1385: default:
! 1386: return NULL;
! 1387: }
! 1388: if (salt_size && salt_size != SALT_SIZE)
! 1389: {
! 1390: /* currently not supported */
! 1391: return NULL;
! 1392: }
! 1393: switch (algo)
! 1394: {
! 1395: case ENCR_AES_GCM_ICV8:
! 1396: algo = ENCR_AES_CBC;
! 1397: icv_size = 8;
! 1398: break;
! 1399: case ENCR_AES_GCM_ICV12:
! 1400: algo = ENCR_AES_CBC;
! 1401: icv_size = 12;
! 1402: break;
! 1403: case ENCR_AES_GCM_ICV16:
! 1404: algo = ENCR_AES_CBC;
! 1405: icv_size = 16;
! 1406: break;
! 1407: default:
! 1408: return NULL;
! 1409: }
! 1410:
! 1411: INIT_ALIGN(this, sizeof(__m128i),
! 1412: .public = {
! 1413: .aead = {
! 1414: .encrypt = _encrypt,
! 1415: .decrypt = _decrypt,
! 1416: .get_block_size = _get_block_size,
! 1417: .get_icv_size = _get_icv_size,
! 1418: .get_iv_size = _get_iv_size,
! 1419: .get_iv_gen = _get_iv_gen,
! 1420: .get_key_size = _get_key_size,
! 1421: .set_key = _set_key,
! 1422: .destroy = _destroy,
! 1423: },
! 1424: },
! 1425: .key_size = key_size,
! 1426: .iv_gen = iv_gen_seq_create(),
! 1427: .icv_size = icv_size,
! 1428: );
! 1429:
! 1430: switch (key_size)
! 1431: {
! 1432: case 16:
! 1433: this->encrypt = encrypt_gcm128;
! 1434: this->decrypt = decrypt_gcm128;
! 1435: break;
! 1436: case 24:
! 1437: this->encrypt = encrypt_gcm192;
! 1438: this->decrypt = decrypt_gcm192;
! 1439: break;
! 1440: case 32:
! 1441: this->encrypt = encrypt_gcm256;
! 1442: this->decrypt = decrypt_gcm256;
! 1443: break;
! 1444: }
! 1445:
! 1446: return &this->public;
! 1447: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>