diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi index 6970eff7d..03abbefcf 100644 --- a/lib/freebl/freebl_base.gypi +++ b/lib/freebl/freebl_base.gypi @@ -38,6 +38,7 @@ 'blinit.c', 'freeblver.c', 'gcm.c', + 'gcm-hw.c', 'hmacct.c', 'jpake.c', 'ldvector.c', @@ -52,6 +53,7 @@ 'pqg.c', 'rawhash.c', 'rijndael.c', + 'rijndael-hw.c', 'rsa.c', 'rsapkcs.c', 'seed.c', diff --git a/lib/freebl/gcm-hw.c b/lib/freebl/gcm-hw.c new file mode 100644 index 000000000..df77d1eb3 --- /dev/null +++ b/lib/freebl/gcm-hw.c @@ -0,0 +1,151 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "gcm.h" +#include "secerr.h" + +#ifdef NSS_X86_OR_X64 +#include /* clmul */ +#endif + +#define WRITE64(x, bytes) \ + (bytes)[0] = (x) >> 56; \ + (bytes)[1] = (x) >> 48; \ + (bytes)[2] = (x) >> 40; \ + (bytes)[3] = (x) >> 32; \ + (bytes)[4] = (x) >> 24; \ + (bytes)[5] = (x) >> 16; \ + (bytes)[6] = (x) >> 8; \ + (bytes)[7] = (x); + +SECStatus +gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf, + unsigned int maxout) +{ +#ifdef NSS_X86_OR_X64 + uint64_t tmp_out[2]; + _mm_storeu_si128((__m128i *)tmp_out, ghash->x); + PORT_Assert(maxout >= 16); + WRITE64(tmp_out[0], outbuf + 8); + WRITE64(tmp_out[1], outbuf); + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count) +{ +#ifdef NSS_X86_OR_X64 + size_t i; + pre_align __m128i z_high post_align; + pre_align __m128i z_low post_align; + pre_align __m128i C post_align; + pre_align __m128i D post_align; + pre_align __m128i E post_align; + pre_align __m128i F post_align; + pre_align __m128i bin post_align; + pre_align __m128i Ci post_align; + pre_align __m128i tmp post_align; + + for (i = 0; i < count; i++, buf += 16) { + bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], + ((uint16_t)buf[2] << 8) | buf[3], + ((uint16_t)buf[4] << 8) | buf[5], + ((uint16_t)buf[6] << 8) | buf[7], + ((uint16_t)buf[8] << 8) | buf[9], + ((uint16_t)buf[10] << 8) | buf[11], + ((uint16_t)buf[12] << 8) | buf[13], + ((uint16_t)buf[14] << 8) | buf[15]); + Ci = _mm_xor_si128(bin, ghash->x); + + /* Do binary mult ghash->X = Ci * ghash->H. */ + C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); + D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); + E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); + F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); + tmp = _mm_xor_si128(E, F); + z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); + z_high = _mm_unpackhi_epi64(z_high, D); + z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); + z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); + + /* Shift one to the left (multiply by x) as gcm spec is stupid. */ + C = _mm_slli_si128(z_low, 8); + E = _mm_srli_epi64(C, 63); + D = _mm_slli_si128(z_high, 8); + F = _mm_srli_epi64(D, 63); + /* Carry over */ + C = _mm_srli_si128(z_low, 8); + D = _mm_srli_epi64(C, 63); + z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); + z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); + + /* Reduce */ + C = _mm_slli_si128(z_low, 8); + /* D = z_low << 127 */ + D = _mm_slli_epi64(C, 63); + /* E = z_low << 126 */ + E = _mm_slli_epi64(C, 62); + /* F = z_low << 121 */ + F = _mm_slli_epi64(C, 57); + /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ + z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); + C = _mm_srli_si128(z_low, 8); + /* D = z_low >> 1 */ + D = _mm_slli_epi64(C, 63); + D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); + /* E = z_low >> 2 */ + E = _mm_slli_epi64(C, 62); + E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); + /* F = z_low >> 7 */ + F = _mm_slli_epi64(C, 57); + F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); + /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ + ghash->x = _mm_xor_si128(_mm_xor_si128( + _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), + F); + } + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashInit_hw(gcmHashContext *ghash) +{ +#ifdef NSS_X86_OR_X64 + ghash->ghash_mul = gcm_HashMult_hw; + ghash->x = _mm_setzero_si128(); + /* MSVC requires __m64 to load epi64. */ + ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, + ghash->h_low >> 32, (uint32_t)ghash->h_low); + ghash->hw = PR_TRUE; + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashZeroX_hw(gcmHashContext *ghash) +{ +#ifdef NSS_X86_OR_X64 + ghash->x = _mm_setzero_si128(); + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + diff --git a/lib/freebl/gcm.c b/lib/freebl/gcm.c index 780b7a632..0b1df498f 100644 --- a/lib/freebl/gcm.c +++ b/lib/freebl/gcm.c @@ -17,13 +17,13 @@ #include -#ifdef NSS_X86_OR_X64 -#include /* clmul */ -#endif - /* Forward declarations */ +SECStatus gcm_HashInit_hw(gcmHashContext *ghash); +SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf, + unsigned int maxout); SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, unsigned int count); +SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash); SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf, unsigned int count); SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, @@ -46,6 +46,8 @@ get64(const unsigned char *bytes) SECStatus gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) { + SECStatus rv = SECSuccess; + ghash->cLen = 0; ghash->bufLen = 0; PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf)); @@ -53,17 +55,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) ghash->h_low = get64(H + 8); ghash->h_high = get64(H); if (clmul_support() && !sw) { -#ifdef NSS_X86_OR_X64 - ghash->ghash_mul = gcm_HashMult_hw; - ghash->x = _mm_setzero_si128(); - /* MSVC requires __m64 to load epi64. */ - ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, - ghash->h_low >> 32, (uint32_t)ghash->h_low); - ghash->hw = PR_TRUE; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashInit_hw(ghash); } else { /* We fall back to the software implementation if we can't use / don't * want to use pclmul. */ @@ -75,7 +67,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) ghash->x_high = ghash->x_low = 0; ghash->hw = PR_FALSE; } - return SECSuccess; + return rv; } #ifdef HAVE_INT128_SUPPORT @@ -283,102 +275,17 @@ gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, } #endif /* HAVE_INT128_SUPPORT */ -SECStatus -gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, - unsigned int count) -{ -#ifdef NSS_X86_OR_X64 - size_t i; - pre_align __m128i z_high post_align; - pre_align __m128i z_low post_align; - pre_align __m128i C post_align; - pre_align __m128i D post_align; - pre_align __m128i E post_align; - pre_align __m128i F post_align; - pre_align __m128i bin post_align; - pre_align __m128i Ci post_align; - pre_align __m128i tmp post_align; - - for (i = 0; i < count; i++, buf += 16) { - bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], - ((uint16_t)buf[2] << 8) | buf[3], - ((uint16_t)buf[4] << 8) | buf[5], - ((uint16_t)buf[6] << 8) | buf[7], - ((uint16_t)buf[8] << 8) | buf[9], - ((uint16_t)buf[10] << 8) | buf[11], - ((uint16_t)buf[12] << 8) | buf[13], - ((uint16_t)buf[14] << 8) | buf[15]); - Ci = _mm_xor_si128(bin, ghash->x); - - /* Do binary mult ghash->X = Ci * ghash->H. */ - C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); - D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); - E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); - F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); - tmp = _mm_xor_si128(E, F); - z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); - z_high = _mm_unpackhi_epi64(z_high, D); - z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); - z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); - - /* Shift one to the left (multiply by x) as gcm spec is stupid. */ - C = _mm_slli_si128(z_low, 8); - E = _mm_srli_epi64(C, 63); - D = _mm_slli_si128(z_high, 8); - F = _mm_srli_epi64(D, 63); - /* Carry over */ - C = _mm_srli_si128(z_low, 8); - D = _mm_srli_epi64(C, 63); - z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); - z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); - - /* Reduce */ - C = _mm_slli_si128(z_low, 8); - /* D = z_low << 127 */ - D = _mm_slli_epi64(C, 63); - /* E = z_low << 126 */ - E = _mm_slli_epi64(C, 62); - /* F = z_low << 121 */ - F = _mm_slli_epi64(C, 57); - /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ - z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); - C = _mm_srli_si128(z_low, 8); - /* D = z_low >> 1 */ - D = _mm_slli_epi64(C, 63); - D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); - /* E = z_low >> 2 */ - E = _mm_slli_epi64(C, 62); - E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); - /* F = z_low >> 7 */ - F = _mm_slli_epi64(C, 57); - F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); - /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ - ghash->x = _mm_xor_si128(_mm_xor_si128( - _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), - F); - } - return SECSuccess; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ -} - static SECStatus gcm_zeroX(gcmHashContext *ghash) { + SECStatus rv = SECSuccess; + if (ghash->hw) { -#ifdef NSS_X86_OR_X64 - ghash->x = _mm_setzero_si128(); - return SECSuccess; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashZeroX_hw(ghash); } ghash->x_high = ghash->x_low = 0; - return SECSuccess; + return rv; } /* @@ -503,15 +410,10 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf, } if (ghash->hw) { -#ifdef NSS_X86_OR_X64 - uint64_t tmp_out[2]; - _mm_storeu_si128((__m128i *)tmp_out, ghash->x); - WRITE64(tmp_out[0], T + 8); - WRITE64(tmp_out[1], T); -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashWrite_hw(ghash, T, MAX_BLOCK_SIZE); + if (rv != SECSuccess) { + goto cleanup; + } } else { WRITE64(ghash->x_low, T + 8); WRITE64(ghash->x_high, T); diff --git a/lib/freebl/manifest.mn b/lib/freebl/manifest.mn index e4c9ab0b7..6c14da9b0 100644 --- a/lib/freebl/manifest.mn +++ b/lib/freebl/manifest.mn @@ -135,8 +135,10 @@ CSRCS = \ blinit.c \ fipsfreebl.c \ gcm.c \ + gcm-hw.c \ hmacct.c \ rijndael.c \ + rijndael-hw.c \ aeskeywrap.c \ camellia.c \ dh.c \ diff --git a/lib/freebl/rijndael-hw.c b/lib/freebl/rijndael-hw.c new file mode 100644 index 000000000..b9c4b2204 --- /dev/null +++ b/lib/freebl/rijndael-hw.c @@ -0,0 +1,170 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "rijndael.h" +#include "secerr.h" + +#ifdef NSS_X86_OR_X64 +#include /* aes-ni */ +#endif + +#if defined(NSS_X86_OR_X64) +#define EXPAND_KEY128(k, rcon, res) \ + tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ + tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ + tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + res = _mm_xor_si128(tmp, tmp_key) + +static void +native_key_expansion128(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp_key post_align; + pre_align __m128i tmp post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); + EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); + EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); + EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); + EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); + EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); + EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); + EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); + EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); + EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); +} + +#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ + tmp2 = _mm_slli_si128(k0, 4); \ + tmp1 = _mm_xor_si128(k0, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ + res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) + +#define EXPAND_KEY192_PART2(res, k1, k2) \ + tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ + res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) + +#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ + EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ + EXPAND_KEY192_PART2(carry, res1, tmp3); \ + res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ + _mm_castsi128_pd(tmp3), 0)); \ + res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ + _mm_castsi128_pd(carry), 1)); \ + EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) + +static void +native_key_expansion192(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + pre_align __m128i tmp3 post_align; + pre_align __m128i carry post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], + keySchedule[3], carry, 0x1, 0x2); + EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); + EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], + keySchedule[6], carry, 0x4, 0x8); + EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); + EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], + keySchedule[9], carry, 0x10, 0x20); + EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); + EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], + keySchedule[12], carry, 0x40, 0x80); +} + +#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ + tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ + tmp2 = _mm_slli_si128(k1x, 4); \ + tmp1 = _mm_xor_si128(k1x, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + res = _mm_xor_si128(tmp1, tmp_key); + +#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ + EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ + EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) + +static void +native_key_expansion256(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp_key post_align; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], + keySchedule[1], 0x01); + EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], + keySchedule[3], 0x02); + EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], + keySchedule[5], 0x04); + EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], + keySchedule[7], 0x08); + EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], + keySchedule[9], 0x10); + EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], + keySchedule[11], 0x20); + EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], + keySchedule[13], 0xFF); +} + +#endif /* NSS_X86_OR_X64 */ + +/* + * AES key expansion using aes-ni instructions. + */ +void +rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk) +{ +#ifdef NSS_X86_OR_X64 + switch (Nk) { + case 4: + native_key_expansion128(cx, key); + return; + case 6: + native_key_expansion192(cx, key); + return; + case 8: + native_key_expansion256(cx, key); + return; + default: + /* This shouldn't happen. */ + PORT_Assert(0); + } +#else + PORT_Assert(0); +#endif /* NSS_X86_OR_X64 */ +} + +void +rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input) +{ +#ifdef NSS_X86_OR_X64 + int i; + pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); + m = _mm_xor_si128(m, cx->keySchedule[0]); + for (i = 1; i < cx->Nr; ++i) { + m = _mm_aesenc_si128(m, cx->keySchedule[i]); + } + m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); + _mm_storeu_si128((__m128i *)output, m); +#else + PORT_Assert(0); +#endif /* NSS_X86_OR_X64 */ +} diff --git a/lib/freebl/rijndael.c b/lib/freebl/rijndael.c index a09f13098..c13dc61f4 100644 --- a/lib/freebl/rijndael.c +++ b/lib/freebl/rijndael.c @@ -27,6 +27,13 @@ #include "intel-gcm.h" #endif /* INTEL_GCM */ +/* Forward declarations */ +void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk); +void rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input); + /* * There are currently three ways to build this code, varying in performance * and code size. @@ -309,162 +316,6 @@ rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int N } } -#if defined(NSS_X86_OR_X64) -#define EXPAND_KEY128(k, rcon, res) \ - tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ - tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ - tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ - tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ - tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ - res = _mm_xor_si128(tmp, tmp_key) - -static void -native_key_expansion128(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp_key post_align; - pre_align __m128i tmp post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); - EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); - EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); - EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); - EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); - EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); - EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); - EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); - EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); - EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); -} - -#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ - tmp2 = _mm_slli_si128(k0, 4); \ - tmp1 = _mm_xor_si128(k0, tmp2); \ - tmp2 = _mm_slli_si128(tmp2, 4); \ - tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ - tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ - res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) - -#define EXPAND_KEY192_PART2(res, k1, k2) \ - tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ - res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) - -#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ - EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ - EXPAND_KEY192_PART2(carry, res1, tmp3); \ - res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ - _mm_castsi128_pd(tmp3), 0)); \ - res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ - _mm_castsi128_pd(carry), 1)); \ - EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) - -static void -native_key_expansion192(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp1 post_align; - pre_align __m128i tmp2 post_align; - pre_align __m128i tmp3 post_align; - pre_align __m128i carry post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); - EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], - keySchedule[3], carry, 0x1, 0x2); - EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); - EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], - keySchedule[6], carry, 0x4, 0x8); - EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); - EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], - keySchedule[9], carry, 0x10, 0x20); - EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); - EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], - keySchedule[12], carry, 0x40, 0x80); -} - -#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ - tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ - tmp2 = _mm_slli_si128(k1x, 4); \ - tmp1 = _mm_xor_si128(k1x, tmp2); \ - tmp2 = _mm_slli_si128(tmp2, 4); \ - tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ - res = _mm_xor_si128(tmp1, tmp_key); - -#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ - EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ - EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) - -static void -native_key_expansion256(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp_key post_align; - pre_align __m128i tmp1 post_align; - pre_align __m128i tmp2 post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); - EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], - keySchedule[1], 0x01); - EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], - keySchedule[3], 0x02); - EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], - keySchedule[5], 0x04); - EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], - keySchedule[7], 0x08); - EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], - keySchedule[9], 0x10); - EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], - keySchedule[11], 0x20); - EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], - keySchedule[13], 0xFF); -} - -#endif /* NSS_X86_OR_X64 */ - -/* - * AES key expansion using aes-ni instructions. - */ -static void -native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk) -{ -#ifdef NSS_X86_OR_X64 - switch (Nk) { - case 4: - native_key_expansion128(cx, key); - return; - case 6: - native_key_expansion192(cx, key); - return; - case 8: - native_key_expansion256(cx, key); - return; - default: - /* This shouldn't happen. */ - PORT_Assert(0); - } -#else - PORT_Assert(0); -#endif /* NSS_X86_OR_X64 */ -} - -static void -native_encryptBlock(AESContext *cx, - unsigned char *output, - const unsigned char *input) -{ -#ifdef NSS_X86_OR_X64 - int i; - pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); - m = _mm_xor_si128(m, cx->keySchedule[0]); - for (i = 1; i < cx->Nr; ++i) { - m = _mm_aesenc_si128(m, cx->keySchedule[i]); - } - m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); - _mm_storeu_si128((__m128i *)output, m); -#else - PORT_Assert(0); -#endif /* NSS_X86_OR_X64 */ -} - /* rijndael_key_expansion * * Generate the expanded key from the key input by the user. @@ -830,7 +681,7 @@ rijndael_encryptECB(AESContext *cx, unsigned char *output, if (aesni_support()) { /* Use hardware acceleration for normal AES parameters. */ - encryptor = &native_encryptBlock; + encryptor = &rijndael_native_encryptBlock; } else { encryptor = &rijndael_encryptBlock128; } @@ -1026,7 +877,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, cx->mode == NSS_AES_CTR)) { PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32); /* Prepare hardware key for normal AES parameters. */ - native_key_expansion(cx, key, Nk); + rijndael_native_key_expansion(cx, key, Nk); } else { rijndael_key_expansion(cx, key, Nk); } diff --git a/lib/freebl/rijndael.h b/lib/freebl/rijndael.h index 1f4a8a9f7..357f77c0a 100644 --- a/lib/freebl/rijndael.h +++ b/lib/freebl/rijndael.h @@ -9,9 +9,11 @@ #include #ifdef NSS_X86_OR_X64 -#include /* aes-ni */ +#include /* __m128i */ #endif +SEC_BEGIN_PROTOS + typedef void AESBlockFunc(AESContext *cx, unsigned char *output, const unsigned char *input); @@ -66,4 +68,6 @@ struct AESContextStr { void *mem; /* Start of the allocated memory to free. */ }; +SEC_END_PROTOS + #endif /* _RIJNDAEL_H_ */ -- 2.13.5