diff -rauN nss-3.35/nss/lib/freebl/freebl_base.gypi nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/freebl_base.gypi --- nss-3.35/nss/lib/freebl/freebl_base.gypi 2018-01-18 15:19:59.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/freebl_base.gypi 2018-03-13 20:41:50.250412209 +0100 @@ -38,6 +38,7 @@ 'blinit.c', 'freeblver.c', 'gcm.c', + 'gcm-hw.c', 'hmacct.c', 'jpake.c', 'ldvector.c', @@ -52,6 +53,7 @@ 'pqg.c', 'rawhash.c', 'rijndael.c', + 'rijndael-hw.c', 'rsa.c', 'rsapkcs.c', 'seed.c', diff -rauN nss-3.35/nss/lib/freebl/gcm-hw.c nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/gcm-hw.c --- nss-3.35/nss/lib/freebl/gcm-hw.c 1970-01-01 01:00:00.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/gcm-hw.c 2018-03-13 20:41:50.250412209 +0100 @@ -0,0 +1,151 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "gcm.h" +#include "secerr.h" + +#ifdef NSS_X86_OR_X64 +#include <wmmintrin.h> /* clmul */ +#endif + +#define WRITE64(x, bytes) \ + (bytes)[0] = (x) >> 56; \ + (bytes)[1] = (x) >> 48; \ + (bytes)[2] = (x) >> 40; \ + (bytes)[3] = (x) >> 32; \ + (bytes)[4] = (x) >> 24; \ + (bytes)[5] = (x) >> 16; \ + (bytes)[6] = (x) >> 8; \ + (bytes)[7] = (x); + +SECStatus +gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf, + unsigned int maxout) +{ +#ifdef NSS_X86_OR_X64 + uint64_t tmp_out[2]; + _mm_storeu_si128((__m128i *)tmp_out, ghash->x); + PORT_Assert(maxout >= 16); + WRITE64(tmp_out[0], outbuf + 8); + WRITE64(tmp_out[1], outbuf); + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count) +{ +#ifdef NSS_X86_OR_X64 + size_t i; + pre_align __m128i z_high post_align; + pre_align __m128i z_low post_align; + pre_align __m128i C post_align; + pre_align __m128i D post_align; + pre_align __m128i E post_align; + pre_align __m128i F post_align; + pre_align __m128i bin post_align; + pre_align __m128i Ci post_align; + pre_align __m128i tmp post_align; + + for (i = 0; i < count; i++, buf += 16) { + bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], + ((uint16_t)buf[2] << 8) | buf[3], + ((uint16_t)buf[4] << 8) | buf[5], + ((uint16_t)buf[6] << 8) | buf[7], + ((uint16_t)buf[8] << 8) | buf[9], + ((uint16_t)buf[10] << 8) | buf[11], + ((uint16_t)buf[12] << 8) | buf[13], + ((uint16_t)buf[14] << 8) | buf[15]); + Ci = _mm_xor_si128(bin, ghash->x); + + /* Do binary mult ghash->X = Ci * ghash->H. 
*/ + C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); + D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); + E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); + F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); + tmp = _mm_xor_si128(E, F); + z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); + z_high = _mm_unpackhi_epi64(z_high, D); + z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); + z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); + + /* Shift one to the left (multiply by x) as gcm spec is stupid. */ + C = _mm_slli_si128(z_low, 8); + E = _mm_srli_epi64(C, 63); + D = _mm_slli_si128(z_high, 8); + F = _mm_srli_epi64(D, 63); + /* Carry over */ + C = _mm_srli_si128(z_low, 8); + D = _mm_srli_epi64(C, 63); + z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); + z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); + + /* Reduce */ + C = _mm_slli_si128(z_low, 8); + /* D = z_low << 127 */ + D = _mm_slli_epi64(C, 63); + /* E = z_low << 126 */ + E = _mm_slli_epi64(C, 62); + /* F = z_low << 121 */ + F = _mm_slli_epi64(C, 57); + /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ + z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); + C = _mm_srli_si128(z_low, 8); + /* D = z_low >> 1 */ + D = _mm_slli_epi64(C, 63); + D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); + /* E = z_low >> 2 */ + E = _mm_slli_epi64(C, 62); + E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); + /* F = z_low >> 7 */ + F = _mm_slli_epi64(C, 57); + F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); + /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ + ghash->x = _mm_xor_si128(_mm_xor_si128( + _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), + F); + } + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashInit_hw(gcmHashContext *ghash) +{ +#ifdef NSS_X86_OR_X64 + ghash->ghash_mul = gcm_HashMult_hw; + ghash->x = _mm_setzero_si128(); + /* MSVC requires __m64 to load 
epi64. */ + ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, + ghash->h_low >> 32, (uint32_t)ghash->h_low); + ghash->hw = PR_TRUE; + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + +SECStatus +gcm_HashZeroX_hw(gcmHashContext *ghash) +{ +#ifdef NSS_X86_OR_X64 + ghash->x = _mm_setzero_si128(); + return SECSuccess; +#else + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +#endif /* NSS_X86_OR_X64 */ +} + diff -rauN nss-3.35/nss/lib/freebl/gcm.c nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/gcm.c --- nss-3.35/nss/lib/freebl/gcm.c 2018-01-18 15:19:59.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/gcm.c 2018-03-13 20:41:50.251412221 +0100 @@ -303,7 +303,7 @@ ghash->x_high = z_high_h; ghash->x_low = z_high_l; } - return SECSuccess; + return rv; } #endif /* HAVE_INT128_SUPPORT */ diff -rauN nss-3.35/nss/lib/freebl/manifest.mn nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/manifest.mn --- nss-3.35/nss/lib/freebl/manifest.mn 2018-01-18 15:19:59.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/manifest.mn 2018-03-13 20:41:50.252412232 +0100 @@ -135,8 +135,10 @@ blinit.c \ fipsfreebl.c \ gcm.c \ + gcm-hw.c \ hmacct.c \ rijndael.c \ + rijndael-hw.c \ aeskeywrap.c \ camellia.c \ dh.c \ diff -rauN nss-3.35/nss/lib/freebl/rijndael-hw.c nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael-hw.c --- nss-3.35/nss/lib/freebl/rijndael-hw.c 1970-01-01 01:00:00.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael-hw.c 2018-03-13 20:41:50.252412232 +0100 @@ -0,0 +1,170 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "rijndael.h" +#include "secerr.h" + +#ifdef NSS_X86_OR_X64 +#include <wmmintrin.h> /* aes-ni */ +#endif + +#if defined(NSS_X86_OR_X64) +#define EXPAND_KEY128(k, rcon, res) \ + tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ + tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ + tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + res = _mm_xor_si128(tmp, tmp_key) + +static void +native_key_expansion128(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp_key post_align; + pre_align __m128i tmp post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); + EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); + EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); + EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); + EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); + EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); + EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); + EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); + EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); + EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); +} + +#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ + tmp2 = _mm_slli_si128(k0, 4); \ + tmp1 = _mm_xor_si128(k0, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ + res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) + +#define EXPAND_KEY192_PART2(res, k1, k2) \ + tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ + res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) + +#define EXPAND_KEY192(k0, res1, res2, res3, carry, 
rcon1, rcon2) \ + EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ + EXPAND_KEY192_PART2(carry, res1, tmp3); \ + res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ + _mm_castsi128_pd(tmp3), 0)); \ + res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ + _mm_castsi128_pd(carry), 1)); \ + EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) + +static void +native_key_expansion192(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + pre_align __m128i tmp3 post_align; + pre_align __m128i carry post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], + keySchedule[3], carry, 0x1, 0x2); + EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); + EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], + keySchedule[6], carry, 0x4, 0x8); + EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); + EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], + keySchedule[9], carry, 0x10, 0x20); + EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); + EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], + keySchedule[12], carry, 0x40, 0x80); +} + +#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ + tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ + tmp2 = _mm_slli_si128(k1x, 4); \ + tmp1 = _mm_xor_si128(k1x, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + res = _mm_xor_si128(tmp1, tmp_key); + +#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ + EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ + EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) + +static void +native_key_expansion256(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i 
tmp_key post_align; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], + keySchedule[1], 0x01); + EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], + keySchedule[3], 0x02); + EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], + keySchedule[5], 0x04); + EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], + keySchedule[7], 0x08); + EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], + keySchedule[9], 0x10); + EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], + keySchedule[11], 0x20); + EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], + keySchedule[13], 0xFF); +} + +#endif /* NSS_X86_OR_X64 */ + +/* + * AES key expansion using aes-ni instructions. + */ +void +rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk) +{ +#ifdef NSS_X86_OR_X64 + switch (Nk) { + case 4: + native_key_expansion128(cx, key); + return; + case 6: + native_key_expansion192(cx, key); + return; + case 8: + native_key_expansion256(cx, key); + return; + default: + /* This shouldn't happen. 
*/ + PORT_Assert(0); + } +#else + PORT_Assert(0); +#endif /* NSS_X86_OR_X64 */ +} + +void +rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input) +{ +#ifdef NSS_X86_OR_X64 + int i; + pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); + m = _mm_xor_si128(m, cx->keySchedule[0]); + for (i = 1; i < cx->Nr; ++i) { + m = _mm_aesenc_si128(m, cx->keySchedule[i]); + } + m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); + _mm_storeu_si128((__m128i *)output, m); +#else + PORT_Assert(0); +#endif /* NSS_X86_OR_X64 */ +} diff -rauN nss-3.35/nss/lib/freebl/rijndael.c nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael.c --- nss-3.35/nss/lib/freebl/rijndael.c 2018-01-18 15:19:59.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael.c 2018-03-13 20:41:50.253412244 +0100 @@ -55,6 +55,13 @@ } #endif /* NSS_X86_OR_X64 */ +/* Forward declarations */ +void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk); +void rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input); + /* * There are currently three ways to build this code, varying in performance * and code size. diff -rauN nss-3.35/nss/lib/freebl/rijndael.h nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael.h --- nss-3.35/nss/lib/freebl/rijndael.h 2018-01-18 15:19:59.000000000 +0100 +++ nss-3.35-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw-patch/nss/lib/freebl/rijndael.h 2018-03-13 20:41:50.254412255 +0100 @@ -80,4 +80,6 @@ void *mem; /* Start of the allocated memory to free. */ }; +SEC_END_PROTOS + #endif /* _RIJNDAEL_H_ */