Annotation of embedaddon/strongswan/src/libstrongswan/plugins/aesni/aesni_ctr.c, revision 1.1.1.1

1.1       misho       1: /*
                      2:  * Copyright (C) 2015 Martin Willi
                      3:  * Copyright (C) 2015 revosec AG
                      4:  *
                      5:  * This program is free software; you can redistribute it and/or modify it
                      6:  * under the terms of the GNU General Public License as published by the
                      7:  * Free Software Foundation; either version 2 of the License, or (at your
                      8:  * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
                      9:  *
                     10:  * This program is distributed in the hope that it will be useful, but
                     11:  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
                     12:  * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
                     13:  * for more details.
                     14:  */
                     15: 
                     16: #include "aesni_ctr.h"
                     17: #include "aesni_key.h"
                     18: 
                     19: #include <tmmintrin.h>
                     20: 
                     21: /**
                     22:  * Pipeline parallelism we use for CTR en/decryption
                     23:  */
                     24: #define CTR_CRYPT_PARALLELISM 4
                     25: 
                     26: typedef struct private_aesni_ctr_t private_aesni_ctr_t;
                     27: 
                     28: /**
                     29:  * CTR en/decryption method type
                     30:  */
                     31: typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);
                     32: 
                     33: /**
                     34:  * Private data of an aesni_ctr_t object.
                     35:  */
                     36: struct private_aesni_ctr_t {
                     37: 
                     38:        /**
                     39:         * Public aesni_ctr_t interface.
                     40:         */
                     41:        aesni_ctr_t public;
                     42: 
                     43:        /**
                     44:         * Key size
                     45:         */
                     46:        u_int key_size;
                     47: 
                     48:        /**
                     49:         * Key schedule
                     50:         */
                     51:        aesni_key_t *key;
                     52: 
                     53:        /**
                     54:         * Encryption method
                     55:         */
                     56:        aesni_ctr_fn_t crypt;
                     57: 
                     58:        /**
                     59:         * Counter state
                     60:         */
                     61:        struct {
                     62:                char nonce[4];
                     63:                char iv[8];
                     64:                uint32_t counter;
                     65:        } __attribute__((packed, aligned(sizeof(__m128i)))) state;
                     66: };
                     67: 
                     /**
                      * Increment the trailing 64 bits of x, read as a big-endian integer,
                      * by one. The leading 8 bytes of x are returned unchanged.
                      */
                     static inline __m128i increment_be(__m128i x)
                     {
                         /* mask reversing all 16 bytes; applying it twice restores the order */
                         const __m128i reverse = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                                               7, 6, 5, 4, 3, 2, 1, 0);
                         /* the value 1 in the low 64-bit lane, 0 in the high lane */
                         const __m128i one = _mm_set_epi32(0, 0, 0, 1);
                         __m128i flipped;

                         /* after the byte reversal the last byte of x is the least significant
                          * byte of the low lane, so a plain 64-bit add does the increment */
                         flipped = _mm_shuffle_epi8(x, reverse);
                         flipped = _mm_add_epi64(flipped, one);

                         return _mm_shuffle_epi8(flipped, reverse);
                     }
                     83: 
                     84: /**
                     85:  * AES-128 CTR encryption
                     86:  */
                     87: static void encrypt_ctr128(private_aesni_ctr_t *this,
                     88:                                                   size_t len, u_char *in, u_char *out)
                     89: {
                     90:        __m128i t1, t2, t3, t4;
                     91:        __m128i d1, d2, d3, d4;
                     92:        __m128i *ks, state, b, *bi, *bo;
                     93:        u_int i, blocks, pblocks, rem;
                     94: 
                     95:        state = _mm_load_si128((__m128i*)&this->state);
                     96:        blocks = len / AES_BLOCK_SIZE;
                     97:        pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
                     98:        rem = len % AES_BLOCK_SIZE;
                     99:        bi = (__m128i*)in;
                    100:        bo = (__m128i*)out;
                    101: 
                    102:        ks = this->key->schedule;
                    103: 
                    104:        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
                    105:        {
                    106:                d1 = _mm_loadu_si128(bi + i + 0);
                    107:                d2 = _mm_loadu_si128(bi + i + 1);
                    108:                d3 = _mm_loadu_si128(bi + i + 2);
                    109:                d4 = _mm_loadu_si128(bi + i + 3);
                    110: 
                    111:                t1 = _mm_xor_si128(state, ks[0]);
                    112:                state = increment_be(state);
                    113:                t2 = _mm_xor_si128(state, ks[0]);
                    114:                state = increment_be(state);
                    115:                t3 = _mm_xor_si128(state, ks[0]);
                    116:                state = increment_be(state);
                    117:                t4 = _mm_xor_si128(state, ks[0]);
                    118:                state = increment_be(state);
                    119: 
                    120:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    121:                t2 = _mm_aesenc_si128(t2, ks[1]);
                    122:                t3 = _mm_aesenc_si128(t3, ks[1]);
                    123:                t4 = _mm_aesenc_si128(t4, ks[1]);
                    124:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    125:                t2 = _mm_aesenc_si128(t2, ks[2]);
                    126:                t3 = _mm_aesenc_si128(t3, ks[2]);
                    127:                t4 = _mm_aesenc_si128(t4, ks[2]);
                    128:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    129:                t2 = _mm_aesenc_si128(t2, ks[3]);
                    130:                t3 = _mm_aesenc_si128(t3, ks[3]);
                    131:                t4 = _mm_aesenc_si128(t4, ks[3]);
                    132:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    133:                t2 = _mm_aesenc_si128(t2, ks[4]);
                    134:                t3 = _mm_aesenc_si128(t3, ks[4]);
                    135:                t4 = _mm_aesenc_si128(t4, ks[4]);
                    136:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    137:                t2 = _mm_aesenc_si128(t2, ks[5]);
                    138:                t3 = _mm_aesenc_si128(t3, ks[5]);
                    139:                t4 = _mm_aesenc_si128(t4, ks[5]);
                    140:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    141:                t2 = _mm_aesenc_si128(t2, ks[6]);
                    142:                t3 = _mm_aesenc_si128(t3, ks[6]);
                    143:                t4 = _mm_aesenc_si128(t4, ks[6]);
                    144:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    145:                t2 = _mm_aesenc_si128(t2, ks[7]);
                    146:                t3 = _mm_aesenc_si128(t3, ks[7]);
                    147:                t4 = _mm_aesenc_si128(t4, ks[7]);
                    148:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    149:                t2 = _mm_aesenc_si128(t2, ks[8]);
                    150:                t3 = _mm_aesenc_si128(t3, ks[8]);
                    151:                t4 = _mm_aesenc_si128(t4, ks[8]);
                    152:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    153:                t2 = _mm_aesenc_si128(t2, ks[9]);
                    154:                t3 = _mm_aesenc_si128(t3, ks[9]);
                    155:                t4 = _mm_aesenc_si128(t4, ks[9]);
                    156: 
                    157:                t1 = _mm_aesenclast_si128(t1, ks[10]);
                    158:                t2 = _mm_aesenclast_si128(t2, ks[10]);
                    159:                t3 = _mm_aesenclast_si128(t3, ks[10]);
                    160:                t4 = _mm_aesenclast_si128(t4, ks[10]);
                    161:                t1 = _mm_xor_si128(t1, d1);
                    162:                t2 = _mm_xor_si128(t2, d2);
                    163:                t3 = _mm_xor_si128(t3, d3);
                    164:                t4 = _mm_xor_si128(t4, d4);
                    165:                _mm_storeu_si128(bo + i + 0, t1);
                    166:                _mm_storeu_si128(bo + i + 1, t2);
                    167:                _mm_storeu_si128(bo + i + 2, t3);
                    168:                _mm_storeu_si128(bo + i + 3, t4);
                    169:        }
                    170: 
                    171:        for (i = pblocks; i < blocks; i++)
                    172:        {
                    173:                d1 = _mm_loadu_si128(bi + i);
                    174: 
                    175:                t1 = _mm_xor_si128(state, ks[0]);
                    176:                state = increment_be(state);
                    177: 
                    178:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    179:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    180:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    181:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    182:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    183:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    184:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    185:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    186:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    187: 
                    188:                t1 = _mm_aesenclast_si128(t1, ks[10]);
                    189:                t1 = _mm_xor_si128(t1, d1);
                    190:                _mm_storeu_si128(bo + i, t1);
                    191:        }
                    192: 
                    193:        if (rem)
                    194:        {
                    195:                memset(&b, 0, sizeof(b));
                    196:                memcpy(&b, bi + blocks, rem);
                    197: 
                    198:                d1 = _mm_loadu_si128(&b);
                    199:                t1 = _mm_xor_si128(state, ks[0]);
                    200: 
                    201:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    202:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    203:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    204:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    205:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    206:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    207:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    208:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    209:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    210: 
                    211:                t1 = _mm_aesenclast_si128(t1, ks[10]);
                    212:                t1 = _mm_xor_si128(t1, d1);
                    213:                _mm_storeu_si128(&b, t1);
                    214: 
                    215:                memcpy(bo + blocks, &b, rem);
                    216:        }
                    217: }
                    218: 
                    219: /**
                    220:  * AES-192 CTR encryption
                    221:  */
                    222: static void encrypt_ctr192(private_aesni_ctr_t *this,
                    223:                                                   size_t len, u_char *in, u_char *out)
                    224: {
                    225:        __m128i t1, t2, t3, t4;
                    226:        __m128i d1, d2, d3, d4;
                    227:        __m128i *ks, state, b, *bi, *bo;
                    228:        u_int i, blocks, pblocks, rem;
                    229: 
                    230:        state = _mm_load_si128((__m128i*)&this->state);
                    231:        blocks = len / AES_BLOCK_SIZE;
                    232:        pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
                    233:        rem = len % AES_BLOCK_SIZE;
                    234:        bi = (__m128i*)in;
                    235:        bo = (__m128i*)out;
                    236: 
                    237:        ks = this->key->schedule;
                    238: 
                    239:        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
                    240:        {
                    241:                d1 = _mm_loadu_si128(bi + i + 0);
                    242:                d2 = _mm_loadu_si128(bi + i + 1);
                    243:                d3 = _mm_loadu_si128(bi + i + 2);
                    244:                d4 = _mm_loadu_si128(bi + i + 3);
                    245: 
                    246:                t1 = _mm_xor_si128(state, ks[0]);
                    247:                state = increment_be(state);
                    248:                t2 = _mm_xor_si128(state, ks[0]);
                    249:                state = increment_be(state);
                    250:                t3 = _mm_xor_si128(state, ks[0]);
                    251:                state = increment_be(state);
                    252:                t4 = _mm_xor_si128(state, ks[0]);
                    253:                state = increment_be(state);
                    254: 
                    255:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    256:                t2 = _mm_aesenc_si128(t2, ks[1]);
                    257:                t3 = _mm_aesenc_si128(t3, ks[1]);
                    258:                t4 = _mm_aesenc_si128(t4, ks[1]);
                    259:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    260:                t2 = _mm_aesenc_si128(t2, ks[2]);
                    261:                t3 = _mm_aesenc_si128(t3, ks[2]);
                    262:                t4 = _mm_aesenc_si128(t4, ks[2]);
                    263:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    264:                t2 = _mm_aesenc_si128(t2, ks[3]);
                    265:                t3 = _mm_aesenc_si128(t3, ks[3]);
                    266:                t4 = _mm_aesenc_si128(t4, ks[3]);
                    267:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    268:                t2 = _mm_aesenc_si128(t2, ks[4]);
                    269:                t3 = _mm_aesenc_si128(t3, ks[4]);
                    270:                t4 = _mm_aesenc_si128(t4, ks[4]);
                    271:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    272:                t2 = _mm_aesenc_si128(t2, ks[5]);
                    273:                t3 = _mm_aesenc_si128(t3, ks[5]);
                    274:                t4 = _mm_aesenc_si128(t4, ks[5]);
                    275:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    276:                t2 = _mm_aesenc_si128(t2, ks[6]);
                    277:                t3 = _mm_aesenc_si128(t3, ks[6]);
                    278:                t4 = _mm_aesenc_si128(t4, ks[6]);
                    279:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    280:                t2 = _mm_aesenc_si128(t2, ks[7]);
                    281:                t3 = _mm_aesenc_si128(t3, ks[7]);
                    282:                t4 = _mm_aesenc_si128(t4, ks[7]);
                    283:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    284:                t2 = _mm_aesenc_si128(t2, ks[8]);
                    285:                t3 = _mm_aesenc_si128(t3, ks[8]);
                    286:                t4 = _mm_aesenc_si128(t4, ks[8]);
                    287:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    288:                t2 = _mm_aesenc_si128(t2, ks[9]);
                    289:                t3 = _mm_aesenc_si128(t3, ks[9]);
                    290:                t4 = _mm_aesenc_si128(t4, ks[9]);
                    291:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    292:                t2 = _mm_aesenc_si128(t2, ks[10]);
                    293:                t3 = _mm_aesenc_si128(t3, ks[10]);
                    294:                t4 = _mm_aesenc_si128(t4, ks[10]);
                    295:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    296:                t2 = _mm_aesenc_si128(t2, ks[11]);
                    297:                t3 = _mm_aesenc_si128(t3, ks[11]);
                    298:                t4 = _mm_aesenc_si128(t4, ks[11]);
                    299: 
                    300:                t1 = _mm_aesenclast_si128(t1, ks[12]);
                    301:                t2 = _mm_aesenclast_si128(t2, ks[12]);
                    302:                t3 = _mm_aesenclast_si128(t3, ks[12]);
                    303:                t4 = _mm_aesenclast_si128(t4, ks[12]);
                    304:                t1 = _mm_xor_si128(t1, d1);
                    305:                t2 = _mm_xor_si128(t2, d2);
                    306:                t3 = _mm_xor_si128(t3, d3);
                    307:                t4 = _mm_xor_si128(t4, d4);
                    308:                _mm_storeu_si128(bo + i + 0, t1);
                    309:                _mm_storeu_si128(bo + i + 1, t2);
                    310:                _mm_storeu_si128(bo + i + 2, t3);
                    311:                _mm_storeu_si128(bo + i + 3, t4);
                    312:        }
                    313: 
                    314:        for (i = pblocks; i < blocks; i++)
                    315:        {
                    316:                d1 = _mm_loadu_si128(bi + i);
                    317: 
                    318:                t1 = _mm_xor_si128(state, ks[0]);
                    319:                state = increment_be(state);
                    320: 
                    321:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    322:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    323:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    324:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    325:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    326:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    327:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    328:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    329:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    330:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    331:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    332: 
                    333:                t1 = _mm_aesenclast_si128(t1, ks[12]);
                    334:                t1 = _mm_xor_si128(t1, d1);
                    335:                _mm_storeu_si128(bo + i, t1);
                    336:        }
                    337: 
                    338:        if (rem)
                    339:        {
                    340:                memset(&b, 0, sizeof(b));
                    341:                memcpy(&b, bi + blocks, rem);
                    342: 
                    343:                d1 = _mm_loadu_si128(&b);
                    344:                t1 = _mm_xor_si128(state, ks[0]);
                    345: 
                    346:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    347:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    348:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    349:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    350:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    351:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    352:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    353:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    354:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    355:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    356:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    357: 
                    358:                t1 = _mm_aesenclast_si128(t1, ks[12]);
                    359:                t1 = _mm_xor_si128(t1, d1);
                    360:                _mm_storeu_si128(&b, t1);
                    361: 
                    362:                memcpy(bo + blocks, &b, rem);
                    363:        }
                    364: }
                    365: 
                    366: /**
                    367:  * AES-256 CTR encryption
                    368:  */
                    369: static void encrypt_ctr256(private_aesni_ctr_t *this,
                    370:                                                   size_t len, u_char *in, u_char *out)
                    371: {
                    372:        __m128i t1, t2, t3, t4;
                    373:        __m128i d1, d2, d3, d4;
                    374:        __m128i *ks, state, b, *bi, *bo;
                    375:        u_int i, blocks, pblocks, rem;
                    376: 
                    377:        state = _mm_load_si128((__m128i*)&this->state);
                    378:        blocks = len / AES_BLOCK_SIZE;
                    379:        pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
                    380:        rem = len % AES_BLOCK_SIZE;
                    381:        bi = (__m128i*)in;
                    382:        bo = (__m128i*)out;
                    383: 
                    384:        ks = this->key->schedule;
                    385: 
                    386:        for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
                    387:        {
                    388:                d1 = _mm_loadu_si128(bi + i + 0);
                    389:                d2 = _mm_loadu_si128(bi + i + 1);
                    390:                d3 = _mm_loadu_si128(bi + i + 2);
                    391:                d4 = _mm_loadu_si128(bi + i + 3);
                    392: 
                    393:                t1 = _mm_xor_si128(state, ks[0]);
                    394:                state = increment_be(state);
                    395:                t2 = _mm_xor_si128(state, ks[0]);
                    396:                state = increment_be(state);
                    397:                t3 = _mm_xor_si128(state, ks[0]);
                    398:                state = increment_be(state);
                    399:                t4 = _mm_xor_si128(state, ks[0]);
                    400:                state = increment_be(state);
                    401: 
                    402:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    403:                t2 = _mm_aesenc_si128(t2, ks[1]);
                    404:                t3 = _mm_aesenc_si128(t3, ks[1]);
                    405:                t4 = _mm_aesenc_si128(t4, ks[1]);
                    406:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    407:                t2 = _mm_aesenc_si128(t2, ks[2]);
                    408:                t3 = _mm_aesenc_si128(t3, ks[2]);
                    409:                t4 = _mm_aesenc_si128(t4, ks[2]);
                    410:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    411:                t2 = _mm_aesenc_si128(t2, ks[3]);
                    412:                t3 = _mm_aesenc_si128(t3, ks[3]);
                    413:                t4 = _mm_aesenc_si128(t4, ks[3]);
                    414:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    415:                t2 = _mm_aesenc_si128(t2, ks[4]);
                    416:                t3 = _mm_aesenc_si128(t3, ks[4]);
                    417:                t4 = _mm_aesenc_si128(t4, ks[4]);
                    418:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    419:                t2 = _mm_aesenc_si128(t2, ks[5]);
                    420:                t3 = _mm_aesenc_si128(t3, ks[5]);
                    421:                t4 = _mm_aesenc_si128(t4, ks[5]);
                    422:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    423:                t2 = _mm_aesenc_si128(t2, ks[6]);
                    424:                t3 = _mm_aesenc_si128(t3, ks[6]);
                    425:                t4 = _mm_aesenc_si128(t4, ks[6]);
                    426:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    427:                t2 = _mm_aesenc_si128(t2, ks[7]);
                    428:                t3 = _mm_aesenc_si128(t3, ks[7]);
                    429:                t4 = _mm_aesenc_si128(t4, ks[7]);
                    430:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    431:                t2 = _mm_aesenc_si128(t2, ks[8]);
                    432:                t3 = _mm_aesenc_si128(t3, ks[8]);
                    433:                t4 = _mm_aesenc_si128(t4, ks[8]);
                    434:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    435:                t2 = _mm_aesenc_si128(t2, ks[9]);
                    436:                t3 = _mm_aesenc_si128(t3, ks[9]);
                    437:                t4 = _mm_aesenc_si128(t4, ks[9]);
                    438:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    439:                t2 = _mm_aesenc_si128(t2, ks[10]);
                    440:                t3 = _mm_aesenc_si128(t3, ks[10]);
                    441:                t4 = _mm_aesenc_si128(t4, ks[10]);
                    442:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    443:                t2 = _mm_aesenc_si128(t2, ks[11]);
                    444:                t3 = _mm_aesenc_si128(t3, ks[11]);
                    445:                t4 = _mm_aesenc_si128(t4, ks[11]);
                    446:                t1 = _mm_aesenc_si128(t1, ks[12]);
                    447:                t2 = _mm_aesenc_si128(t2, ks[12]);
                    448:                t3 = _mm_aesenc_si128(t3, ks[12]);
                    449:                t4 = _mm_aesenc_si128(t4, ks[12]);
                    450:                t1 = _mm_aesenc_si128(t1, ks[13]);
                    451:                t2 = _mm_aesenc_si128(t2, ks[13]);
                    452:                t3 = _mm_aesenc_si128(t3, ks[13]);
                    453:                t4 = _mm_aesenc_si128(t4, ks[13]);
                    454: 
                    455:                t1 = _mm_aesenclast_si128(t1, ks[14]);
                    456:                t2 = _mm_aesenclast_si128(t2, ks[14]);
                    457:                t3 = _mm_aesenclast_si128(t3, ks[14]);
                    458:                t4 = _mm_aesenclast_si128(t4, ks[14]);
                    459:                t1 = _mm_xor_si128(t1, d1);
                    460:                t2 = _mm_xor_si128(t2, d2);
                    461:                t3 = _mm_xor_si128(t3, d3);
                    462:                t4 = _mm_xor_si128(t4, d4);
                    463:                _mm_storeu_si128(bo + i + 0, t1);
                    464:                _mm_storeu_si128(bo + i + 1, t2);
                    465:                _mm_storeu_si128(bo + i + 2, t3);
                    466:                _mm_storeu_si128(bo + i + 3, t4);
                    467:        }
                    468: 
                    469:        for (i = pblocks; i < blocks; i++)
                    470:        {
                    471:                d1 = _mm_loadu_si128(bi + i);
                    472: 
                    473:                t1 = _mm_xor_si128(state, ks[0]);
                    474:                state = increment_be(state);
                    475: 
                    476:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    477:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    478:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    479:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    480:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    481:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    482:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    483:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    484:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    485:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    486:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    487:                t1 = _mm_aesenc_si128(t1, ks[12]);
                    488:                t1 = _mm_aesenc_si128(t1, ks[13]);
                    489: 
                    490:                t1 = _mm_aesenclast_si128(t1, ks[14]);
                    491:                t1 = _mm_xor_si128(t1, d1);
                    492:                _mm_storeu_si128(bo + i, t1);
                    493:        }
                    494: 
                    495:        if (rem)
                    496:        {
                    497:                memset(&b, 0, sizeof(b));
                    498:                memcpy(&b, bi + blocks, rem);
                    499: 
                    500:                d1 = _mm_loadu_si128(&b);
                    501:                t1 = _mm_xor_si128(state, ks[0]);
                    502: 
                    503:                t1 = _mm_aesenc_si128(t1, ks[1]);
                    504:                t1 = _mm_aesenc_si128(t1, ks[2]);
                    505:                t1 = _mm_aesenc_si128(t1, ks[3]);
                    506:                t1 = _mm_aesenc_si128(t1, ks[4]);
                    507:                t1 = _mm_aesenc_si128(t1, ks[5]);
                    508:                t1 = _mm_aesenc_si128(t1, ks[6]);
                    509:                t1 = _mm_aesenc_si128(t1, ks[7]);
                    510:                t1 = _mm_aesenc_si128(t1, ks[8]);
                    511:                t1 = _mm_aesenc_si128(t1, ks[9]);
                    512:                t1 = _mm_aesenc_si128(t1, ks[10]);
                    513:                t1 = _mm_aesenc_si128(t1, ks[11]);
                    514:                t1 = _mm_aesenc_si128(t1, ks[12]);
                    515:                t1 = _mm_aesenc_si128(t1, ks[13]);
                    516: 
                    517:                t1 = _mm_aesenclast_si128(t1, ks[14]);
                    518:                t1 = _mm_xor_si128(t1, d1);
                    519:                _mm_storeu_si128(&b, t1);
                    520: 
                    521:                memcpy(bo + blocks, &b, rem);
                    522:        }
                    523: }
                    524: 
                         /**
                          * Implementation of both crypter_t.encrypt and crypter_t.decrypt; the
                          * constructor installs this single routine for either direction.
                          *
                          * Copies the per-message IV into the counter state, resets the block
                          * counter to 1 (stored through htonl(), i.e. in network byte order) and
                          * runs the key-size specific routine held in this->crypt over the input.
                          * NOTE(review): a counter starting at 1 looks like the RFC 3686 counter
                          * block layout - confirm against the state definition.
                          *
                          * @param in     data to process; in.len bytes are handled
                          * @param iv     per-message IV, must be exactly sizeof(this->state.iv) bytes
                          * @param out    if non-NULL, receives a chunk allocated with chunk_alloc()
                          *               of in.len bytes holding the result; if NULL, the result is
                          *               written in place over in.ptr
                          * @return       FALSE if no key is set or the IV length does not match,
                          *               TRUE otherwise
                          */
                    525: METHOD(crypter_t, crypt, bool,
                    526:        private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out)
                    527: {
                    528:        u_char *buf;
                    529: 
                                /* refuse to run without a key schedule or with a wrongly sized IV */
                    530:        if (!this->key || iv.len != sizeof(this->state.iv))
                    531:        {
                    532:                return FALSE;
                    533:        }
                                /* every call starts a fresh counter sequence for the given IV */
                    534:        memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv));
                    535:        this->state.counter = htonl(1);
                    536: 
                                /* default to in-place operation; switch to a new buffer if requested */
                    537:        buf = in.ptr;
                    538:        if (out)
                    539:        {
                    540:                *out = chunk_alloc(in.len);
                    541:                buf = out->ptr;
                    542:        }
                    543:        this->crypt(this, in.len, in.ptr, buf);
                    544:        return TRUE;
                    545: }
                    546: 
                         /**
                          * Implementation of crypter_t.get_block_size.
                          *
                          * @return       always 1. The CTR routine visible above handles a trailing
                          *               partial block itself (its `rem` branch), so callers
                          *               presumably do not need to pad input to the AES block size.
                          */
                    547: METHOD(crypter_t, get_block_size, size_t,
                    548:        private_aesni_ctr_t *this)
                    549: {
                    550:        return 1;
                    551: }
                    552: 
                         /**
                          * Implementation of crypter_t.get_iv_size.
                          *
                          * @return       size in bytes of the IV field of the counter state; this is
                          *               the exact IV length that crypt() insists on
                          */
                    553: METHOD(crypter_t, get_iv_size, size_t,
                    554:        private_aesni_ctr_t *this)
                    555: {
                    556:        return sizeof(this->state.iv);
                    557: }
                    558: 
                         /**
                          * Implementation of crypter_t.get_key_size.
                          *
                          * @return       length in bytes of the keying material set_key() expects:
                          *               the AES key size chosen at construction plus the size of
                          *               the nonce field of the counter state
                          */
                    559: METHOD(crypter_t, get_key_size, size_t,
                    560:        private_aesni_ctr_t *this)
                    561: {
                    562:        return this->key_size + sizeof(this->state.nonce);
                    563: }
                    564: 
                         /**
                          * Implementation of crypter_t.set_key.
                          *
                          * The keying material is the AES key immediately followed by the nonce.
                          * The trailing nonce bytes are stored in the counter state and the
                          * remaining leading bytes are handed to aesni_key_create(). Any
                          * previously installed key is destroyed first.
                          *
                          * @param key    keying material, must be exactly get_key_size() bytes
                          * @return       FALSE on a length mismatch; otherwise the new key pointer
                          *               converted to bool
                          */
                    565: METHOD(crypter_t, set_key, bool,
                    566:        private_aesni_ctr_t *this, chunk_t key)
                    567: {
                    568:        if (key.len != get_key_size(this))
                    569:        {
                    570:                return FALSE;
                    571:        }
                    572: 
                                /* the nonce occupies the last bytes of the supplied material */
                    573:        memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce),
                    574:                   sizeof(this->state.nonce));
                                /* key is passed by value: shrinking len only trims the local copy */
                    575:        key.len -= sizeof(this->state.nonce);
                    576: 
                                /* NOTE(review): TRUE presumably requests an encryption key schedule;
                                 * the visible CTR code only uses the aesenc intrinsics - confirm
                                 * against aesni_key_create() */
                    577:        DESTROY_IF(this->key);
                    578:        this->key = aesni_key_create(TRUE, key);
                    579: 
                                /* NOTE(review): relies on pointer-to-bool conversion; presumably
                                 * aesni_key_create() returns NULL on failure - confirm */
                    580:        return this->key;
                    581: }
                    582: 
                         /**
                          * Implementation of crypter_t.destroy.
                          *
                          * Releases the key through DESTROY_IF (the key can still be unset here,
                          * which is the same condition crypt() checks for) and then frees the
                          * object with free_align(), the counterpart of the INIT_ALIGN allocation
                          * done in aesni_ctr_create().
                          */
                    583: METHOD(crypter_t, destroy, void,
                    584:        private_aesni_ctr_t *this)
                    585: {
                    586:        DESTROY_IF(this->key);
                    587:        free_align(this);
                    588: }
                    589: 
                    590: /**
                    591:  * See header
                          *
                          * Creates an AES-CTR crypter object. No key is installed here; crypt()
                          * returns FALSE until set_key() has succeeded.
                          *
                          * @param algo      must be ENCR_AES_CTR, anything else yields NULL
                          * @param key_size  AES key size in bytes: 16, 24 or 32; 0 selects the
                          *                  default of 16; any other value yields NULL
                          * @return          pointer to the public interface of the new object, or
                          *                  NULL for an unsupported algorithm or key size
                    592:  */
                    593: aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size)
                    594: {
                    595:        private_aesni_ctr_t *this;
                    596: 
                    597:        if (algo != ENCR_AES_CTR)
                    598:        {
                    599:                return NULL;
                    600:        }
                                /* normalise and validate the key size before allocating anything */
                    601:        switch (key_size)
                    602:        {
                    603:                case 0:
                    604:                        key_size = 16;
                    605:                        break;
                    606:                case 16:
                    607:                case 24:
                    608:                case 32:
                    609:                        break;
                    610:                default:
                    611:                        return NULL;
                    612:        }
                    613: 
                                /* the object is allocated with __m128i alignment; encrypt and decrypt
                                 * share one implementation.
                                 * NOTE(review): alignment is presumably needed because the counter
                                 * state is accessed as an __m128i - confirm against the struct */
                    614:        INIT_ALIGN(this, sizeof(__m128i),
                    615:                .public = {
                    616:                        .crypter = {
                    617:                                .encrypt = _crypt,
                    618:                                .decrypt = _crypt,
                    619:                                .get_block_size = _get_block_size,
                    620:                                .get_iv_size = _get_iv_size,
                    621:                                .get_key_size = _get_key_size,
                    622:                                .set_key = _set_key,
                    623:                                .destroy = _destroy,
                    624:                        },
                    625:                },
                    626:                .key_size = key_size,
                    627:        );
                    628: 
                                /* pick the routine matching the key size; no default case is needed
                                 * because key_size was restricted to 16, 24 or 32 above */
                    629:        switch (key_size)
                    630:        {
                    631:                case 16:
                    632:                        this->crypt = encrypt_ctr128;
                    633:                        break;
                    634:                case 24:
                    635:                        this->crypt = encrypt_ctr192;
                    636:                        break;
                    637:                case 32:
                    638:                        this->crypt = encrypt_ctr256;
                    639:                        break;
                    640:        }
                    641: 
                    642:        return &this->public;
                    643: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>