File: ELWIX (Embedded LightWeight unIX): embedaddon/strongswan/src/libstrongswan/plugins/aesni/aesni_ctr.c
Revision 1.1.1.1 (vendor branch), Wed Jun 3 09:46:44 2020 UTC, by misho
Branches: strongswan, MAIN; CVS tags: v5_9_2p0, v5_8_4p7, HEAD
Commit message: Strongswan

/*
 * Copyright (C) 2015 Martin Willi
 * Copyright (C) 2015 revosec AG
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2 of the License, or (at your
 * option) any later version.  See <http://www.fsf.org/copyleft/gpl.txt>.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 */

#include "aesni_ctr.h"
#include "aesni_key.h"

#include <tmmintrin.h>

/**
 * Pipeline parallelism we use for CTR en/decryption
 */
#define CTR_CRYPT_PARALLELISM 4
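
/*
 * Explanatory note on the constant above: four is a sensible interleave
 * depth on typical AES-NI cores, where AESENC has a latency of several
 * cycles but near single-cycle throughput.  CTR blocks are independent, so
 * working on four counter blocks per loop iteration keeps the pipeline busy.
 */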

typedef struct private_aesni_ctr_t private_aesni_ctr_t;

/**
 * CTR en/decryption method type
 */
typedef void (*aesni_ctr_fn_t)(private_aesni_ctr_t*, size_t, u_char*, u_char*);

/**
 * Private data of an aesni_ctr_t object.
 */
struct private_aesni_ctr_t {

	/**
	 * Public aesni_ctr_t interface.
	 */
	aesni_ctr_t public;

	/**
	 * Key size
	 */
	u_int key_size;

	/**
	 * Key schedule
	 */
	aesni_key_t *key;

	/**
	 * Encryption method
	 */
	aesni_ctr_fn_t crypt;

	/**
	 * Counter state
	 */
	struct {
		char nonce[4];
		char iv[8];
		uint32_t counter;
	} __attribute__((packed, aligned(sizeof(__m128i)))) state;
};
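
/*
 * The packed state above is exactly one 16-byte CTR counter block in the
 * RFC 3686 layout: a 4-byte nonce taken from the end of the keying material,
 * the 8-byte per-message IV, and a 4-byte big-endian block counter.  Aligning
 * it to sizeof(__m128i) is what permits the aligned _mm_load_si128() in the
 * crypt functions below.
 */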

/**
 * Do big-endian increment on x
 */
static inline __m128i increment_be(__m128i x)
{
	__m128i swap;

	swap = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

	x = _mm_shuffle_epi8(x, swap);
	x = _mm_add_epi64(x, _mm_set_epi32(0, 0, 0, 1));
	x = _mm_shuffle_epi8(x, swap);

	return x;
}
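
/*
 * How the increment works: the first PSHUFB reverses all 16 bytes, turning
 * the big-endian counter block into a native little-endian value,
 * _mm_add_epi64 adds 1 to the low 64-bit lane, and the second shuffle
 * restores the original byte order.  For example, a block ending in
 * ...00 00 00 ff becomes ...00 00 01 00.  The carry never leaves the low
 * 64-bit lane, which is ample headroom for the 32-bit block counter used
 * in this mode.
 */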

/**
 * AES-128 CTR encryption
 */
static void encrypt_ctr128(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t2 = _mm_aesenclast_si128(t2, ks[10]);
		t3 = _mm_aesenclast_si128(t3, ks[10]);
		t4 = _mm_aesenclast_si128(t4, ks[10]);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);

		t1 = _mm_aesenclast_si128(t1, ks[10]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}
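
/*
 * The three key-size variants (encrypt_ctr128/192/256) share this structure:
 * the keystream block for counter value c is E_k(nonce | iv | c), and
 * out = in ^ keystream.  A four-way unrolled main loop handles most blocks,
 * a scalar loop the leftover whole blocks, and a zero-padded stack buffer the
 * final partial block; being a stream mode, CTR needs no padding.  The
 * variants differ only in round count: 10 rounds (ks[0..10]) here versus
 * 12 and 14 for AES-192/256.
 */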

/**
 * AES-192 CTR encryption
 */
static void encrypt_ctr192(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t2 = _mm_aesenclast_si128(t2, ks[12]);
		t3 = _mm_aesenclast_si128(t3, ks[12]);
		t4 = _mm_aesenclast_si128(t4, ks[12]);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);

		t1 = _mm_aesenclast_si128(t1, ks[12]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}

/**
 * AES-256 CTR encryption
 */
static void encrypt_ctr256(private_aesni_ctr_t *this,
						   size_t len, u_char *in, u_char *out)
{
	__m128i t1, t2, t3, t4;
	__m128i d1, d2, d3, d4;
	__m128i *ks, state, b, *bi, *bo;
	u_int i, blocks, pblocks, rem;

	state = _mm_load_si128((__m128i*)&this->state);
	blocks = len / AES_BLOCK_SIZE;
	pblocks = blocks - (blocks % CTR_CRYPT_PARALLELISM);
	rem = len % AES_BLOCK_SIZE;
	bi = (__m128i*)in;
	bo = (__m128i*)out;

	ks = this->key->schedule;

	for (i = 0; i < pblocks; i += CTR_CRYPT_PARALLELISM)
	{
		d1 = _mm_loadu_si128(bi + i + 0);
		d2 = _mm_loadu_si128(bi + i + 1);
		d3 = _mm_loadu_si128(bi + i + 2);
		d4 = _mm_loadu_si128(bi + i + 3);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t2 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t3 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);
		t4 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t2 = _mm_aesenc_si128(t2, ks[1]);
		t3 = _mm_aesenc_si128(t3, ks[1]);
		t4 = _mm_aesenc_si128(t4, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t2 = _mm_aesenc_si128(t2, ks[2]);
		t3 = _mm_aesenc_si128(t3, ks[2]);
		t4 = _mm_aesenc_si128(t4, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t2 = _mm_aesenc_si128(t2, ks[3]);
		t3 = _mm_aesenc_si128(t3, ks[3]);
		t4 = _mm_aesenc_si128(t4, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t2 = _mm_aesenc_si128(t2, ks[4]);
		t3 = _mm_aesenc_si128(t3, ks[4]);
		t4 = _mm_aesenc_si128(t4, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t2 = _mm_aesenc_si128(t2, ks[5]);
		t3 = _mm_aesenc_si128(t3, ks[5]);
		t4 = _mm_aesenc_si128(t4, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t2 = _mm_aesenc_si128(t2, ks[6]);
		t3 = _mm_aesenc_si128(t3, ks[6]);
		t4 = _mm_aesenc_si128(t4, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t2 = _mm_aesenc_si128(t2, ks[7]);
		t3 = _mm_aesenc_si128(t3, ks[7]);
		t4 = _mm_aesenc_si128(t4, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t2 = _mm_aesenc_si128(t2, ks[8]);
		t3 = _mm_aesenc_si128(t3, ks[8]);
		t4 = _mm_aesenc_si128(t4, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t2 = _mm_aesenc_si128(t2, ks[9]);
		t3 = _mm_aesenc_si128(t3, ks[9]);
		t4 = _mm_aesenc_si128(t4, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t2 = _mm_aesenc_si128(t2, ks[10]);
		t3 = _mm_aesenc_si128(t3, ks[10]);
		t4 = _mm_aesenc_si128(t4, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t2 = _mm_aesenc_si128(t2, ks[11]);
		t3 = _mm_aesenc_si128(t3, ks[11]);
		t4 = _mm_aesenc_si128(t4, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t2 = _mm_aesenc_si128(t2, ks[12]);
		t3 = _mm_aesenc_si128(t3, ks[12]);
		t4 = _mm_aesenc_si128(t4, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);
		t2 = _mm_aesenc_si128(t2, ks[13]);
		t3 = _mm_aesenc_si128(t3, ks[13]);
		t4 = _mm_aesenc_si128(t4, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t2 = _mm_aesenclast_si128(t2, ks[14]);
		t3 = _mm_aesenclast_si128(t3, ks[14]);
		t4 = _mm_aesenclast_si128(t4, ks[14]);
		t1 = _mm_xor_si128(t1, d1);
		t2 = _mm_xor_si128(t2, d2);
		t3 = _mm_xor_si128(t3, d3);
		t4 = _mm_xor_si128(t4, d4);
		_mm_storeu_si128(bo + i + 0, t1);
		_mm_storeu_si128(bo + i + 1, t2);
		_mm_storeu_si128(bo + i + 2, t3);
		_mm_storeu_si128(bo + i + 3, t4);
	}

	for (i = pblocks; i < blocks; i++)
	{
		d1 = _mm_loadu_si128(bi + i);

		t1 = _mm_xor_si128(state, ks[0]);
		state = increment_be(state);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(bo + i, t1);
	}

	if (rem)
	{
		memset(&b, 0, sizeof(b));
		memcpy(&b, bi + blocks, rem);

		d1 = _mm_loadu_si128(&b);
		t1 = _mm_xor_si128(state, ks[0]);

		t1 = _mm_aesenc_si128(t1, ks[1]);
		t1 = _mm_aesenc_si128(t1, ks[2]);
		t1 = _mm_aesenc_si128(t1, ks[3]);
		t1 = _mm_aesenc_si128(t1, ks[4]);
		t1 = _mm_aesenc_si128(t1, ks[5]);
		t1 = _mm_aesenc_si128(t1, ks[6]);
		t1 = _mm_aesenc_si128(t1, ks[7]);
		t1 = _mm_aesenc_si128(t1, ks[8]);
		t1 = _mm_aesenc_si128(t1, ks[9]);
		t1 = _mm_aesenc_si128(t1, ks[10]);
		t1 = _mm_aesenc_si128(t1, ks[11]);
		t1 = _mm_aesenc_si128(t1, ks[12]);
		t1 = _mm_aesenc_si128(t1, ks[13]);

		t1 = _mm_aesenclast_si128(t1, ks[14]);
		t1 = _mm_xor_si128(t1, d1);
		_mm_storeu_si128(&b, t1);

		memcpy(bo + blocks, &b, rem);
	}
}

METHOD(crypter_t, crypt, bool,
	private_aesni_ctr_t *this, chunk_t in, chunk_t iv, chunk_t *out)
{
	u_char *buf;

	if (!this->key || iv.len != sizeof(this->state.iv))
	{
		return FALSE;
	}
	memcpy(this->state.iv, iv.ptr, sizeof(this->state.iv));
	this->state.counter = htonl(1);

	buf = in.ptr;
	if (out)
	{
		*out = chunk_alloc(in.len);
		buf = out->ptr;
	}
	this->crypt(this, in.len, in.ptr, buf);
	return TRUE;
}
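
/*
 * crypt() serves as both encrypt and decrypt: CTR is an involution, so
 * applying the keystream a second time restores the plaintext.  When out is
 * NULL the data is transformed in place.  The block counter is reset to 1
 * for every message, as in RFC 3686.
 */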

METHOD(crypter_t, get_block_size, size_t,
	private_aesni_ctr_t *this)
{
	return 1;
}

METHOD(crypter_t, get_iv_size, size_t,
	private_aesni_ctr_t *this)
{
	return sizeof(this->state.iv);
}

METHOD(crypter_t, get_key_size, size_t,
	private_aesni_ctr_t *this)
{
	return this->key_size + sizeof(this->state.nonce);
}
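
/*
 * get_block_size() returns 1 because CTR turns AES into a stream cipher,
 * and get_key_size() includes the 4 nonce bytes: the keying material for
 * AES-128-CTR is thus 20 bytes (16 key + 4 nonce), which set_key() below
 * splits apart.
 */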

METHOD(crypter_t, set_key, bool,
	private_aesni_ctr_t *this, chunk_t key)
{
	if (key.len != get_key_size(this))
	{
		return FALSE;
	}

	memcpy(this->state.nonce, key.ptr + key.len - sizeof(this->state.nonce),
		   sizeof(this->state.nonce));
	key.len -= sizeof(this->state.nonce);

	DESTROY_IF(this->key);
	this->key = aesni_key_create(TRUE, key);

	return this->key;
}

METHOD(crypter_t, destroy, void,
	private_aesni_ctr_t *this)
{
	DESTROY_IF(this->key);
	free_align(this);
}

/**
 * See header
 */
aesni_ctr_t *aesni_ctr_create(encryption_algorithm_t algo, size_t key_size)
{
	private_aesni_ctr_t *this;

	if (algo != ENCR_AES_CTR)
	{
		return NULL;
	}
	switch (key_size)
	{
		case 0:
			key_size = 16;
			break;
		case 16:
		case 24:
		case 32:
			break;
		default:
			return NULL;
	}

	INIT_ALIGN(this, sizeof(__m128i),
		.public = {
			.crypter = {
				.encrypt = _crypt,
				.decrypt = _crypt,
				.get_block_size = _get_block_size,
				.get_iv_size = _get_iv_size,
				.get_key_size = _get_key_size,
				.set_key = _set_key,
				.destroy = _destroy,
			},
		},
		.key_size = key_size,
	);
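
	/*
	 * INIT_ALIGN allocates the object on a 16-byte boundary so the
	 * aligned(sizeof(__m128i)) attribute on state actually holds; destroy()
	 * releases it with the matching free_align().
	 */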

	switch (key_size)
	{
		case 16:
			this->crypt = encrypt_ctr128;
			break;
		case 24:
			this->crypt = encrypt_ctr192;
			break;
		case 32:
			this->crypt = encrypt_ctr256;
			break;
	}

	return &this->public;
}
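
For orientation, here is how a caller might drive this crypter. This is a
minimal sketch, not part of the file above: it assumes the usual
libstrongswan environment (chunk_t helpers from library.h, and the crypter_t
interface embedded as the crypter member of aesni_ctr_t), with illustrative
key and IV values.

	#include <library.h>

	#include "aesni_ctr.h"

	static bool ctr_example(void)
	{
		/* 20 bytes of keying material: 16-byte AES key + 4-byte CTR nonce */
		chunk_t key = chunk_from_chars(
			0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
			0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
			0xde,0xad,0xbe,0xef);
		/* 8-byte per-message IV, matching get_iv_size() */
		chunk_t iv = chunk_from_chars(0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08);
		char msg[] = "CTR needs no padding";
		chunk_t pt = chunk_create(msg, sizeof(msg) - 1), ct;
		aesni_ctr_t *ctr;
		bool ok = FALSE;

		ctr = aesni_ctr_create(ENCR_AES_CTR, 16);
		if (!ctr)
		{	/* unsupported algorithm identifier or key size */
			return FALSE;
		}
		if (ctr->crypter.set_key(&ctr->crypter, key) &&
			ctr->crypter.encrypt(&ctr->crypter, pt, iv, &ct))
		{	/* decryption is the same transform with the same key/nonce/IV */
			ok = TRUE;
			chunk_free(&ct);
		}
		ctr->crypter.destroy(&ctr->crypter);
		return ok;
	}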
