summary | refs | log | tree | commit | diff
path: root/i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch
diff options
context:
space:
mode:
Diffstat (limited to 'i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch')
-rw-r--r-- i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch | 780
1 files changed, 780 insertions, 0 deletions
diff --git a/i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch b/i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch
new file mode 100644
index 0000000..313a1b9
--- /dev/null
+++ b/i486-stage3/nss/0001-freebl-Reorganize-AES-GCM-source-code-based-on-hw-sw.patch
@@ -0,0 +1,780 @@
+diff --git a/lib/freebl/freebl_base.gypi b/lib/freebl/freebl_base.gypi
+index 6970eff7d..03abbefcf 100644
+--- a/lib/freebl/freebl_base.gypi
++++ b/lib/freebl/freebl_base.gypi
+@@ -38,6 +38,7 @@
+ 'blinit.c',
+ 'freeblver.c',
+ 'gcm.c',
++ 'gcm-hw.c',
+ 'hmacct.c',
+ 'jpake.c',
+ 'ldvector.c',
+@@ -52,6 +53,7 @@
+ 'pqg.c',
+ 'rawhash.c',
+ 'rijndael.c',
++ 'rijndael-hw.c',
+ 'rsa.c',
+ 'rsapkcs.c',
+ 'seed.c',
+diff --git a/lib/freebl/gcm-hw.c b/lib/freebl/gcm-hw.c
+new file mode 100644
+index 000000000..df77d1eb3
+--- /dev/null
++++ b/lib/freebl/gcm-hw.c
+@@ -0,0 +1,151 @@
++/* This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifdef FREEBL_NO_DEPEND
++#include "stubs.h"
++#endif
++#include "gcm.h"
++#include "secerr.h"
++
++#ifdef NSS_X86_OR_X64
++#include <wmmintrin.h> /* clmul */
++#endif
++
++#define WRITE64(x, bytes) /* store 64-bit x into bytes[0..7], most significant byte first */ \
++ (bytes)[0] = (x) >> 56; /* NOTE: expands to eight statements; use only as a full statement inside braces */ \
++ (bytes)[1] = (x) >> 48; /* x and bytes are evaluated once per statement: pass side-effect-free arguments */ \
++ (bytes)[2] = (x) >> 40; \
++ (bytes)[3] = (x) >> 32; \
++ (bytes)[4] = (x) >> 24; \
++ (bytes)[5] = (x) >> 16; \
++ (bytes)[6] = (x) >> 8; \
++ (bytes)[7] = (x);
++/* Serialize the 128-bit hardware GHASH state ghash->x into outbuf[0..15]; maxout is asserted to be at least 16. */
++SECStatus
++gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf,
++ unsigned int maxout)
++{
++#ifdef NSS_X86_OR_X64
++ uint64_t tmp_out[2];
++ _mm_storeu_si128((__m128i *)tmp_out, ghash->x); /* unaligned 128-bit store of the accumulator */
++ PORT_Assert(maxout >= 16); /* the two WRITE64 calls below fill outbuf[0..15] */
++ WRITE64(tmp_out[0], outbuf + 8); /* tmp_out[0] -> outbuf[8..15], most significant byte first */
++ WRITE64(tmp_out[1], outbuf); /* tmp_out[1] -> outbuf[0..7], most significant byte first */
++ return SECSuccess;
++#else /* built without NSS_X86_OR_X64: no hardware path, report failure */
++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
++ return SECFailure;
++#endif /* NSS_X86_OR_X64 */
++}
++/* Fold `count` 16-byte blocks from buf into ghash->x: per block, x = reduce(clmul(x ^ block, ghash->h)). */
++SECStatus
++gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
++ unsigned int count)
++{
++#ifdef NSS_X86_OR_X64
++ size_t i;
++ pre_align __m128i z_high post_align;
++ pre_align __m128i z_low post_align;
++ pre_align __m128i C post_align;
++ pre_align __m128i D post_align;
++ pre_align __m128i E post_align;
++ pre_align __m128i F post_align;
++ pre_align __m128i bin post_align;
++ pre_align __m128i Ci post_align;
++ pre_align __m128i tmp post_align;
++
++ for (i = 0; i < count; i++, buf += 16) {
++ bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
++ ((uint16_t)buf[2] << 8) | buf[3],
++ ((uint16_t)buf[4] << 8) | buf[5],
++ ((uint16_t)buf[6] << 8) | buf[7],
++ ((uint16_t)buf[8] << 8) | buf[9],
++ ((uint16_t)buf[10] << 8) | buf[11],
++ ((uint16_t)buf[12] << 8) | buf[13],
++ ((uint16_t)buf[14] << 8) | buf[15]); /* buf[0] lands in the most significant byte of bin */
++ Ci = _mm_xor_si128(bin, ghash->x); /* XOR the block into the running state */
++
++ /* Carry-less multiply Ci * ghash->h as four 64x64-bit partial products. */
++ C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); /* low(Ci) * low(h) */
++ D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); /* high(Ci) * high(h) */
++ E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); /* high(Ci) * low(h) */
++ F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); /* low(Ci) * high(h) */
++ tmp = _mm_xor_si128(E, F);
++ z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
++ z_high = _mm_unpackhi_epi64(z_high, D); /* z_high = D ^ (tmp >> 64): upper 128 bits of the product */
++ z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
++ z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); /* z_low = C ^ (tmp << 64): lower 128 bits */
++
++ /* Shift the 256-bit z_high:z_low left by one bit (multiply by x); presumably compensates for GCM's reflected bit order. */
++ C = _mm_slli_si128(z_low, 8);
++ E = _mm_srli_epi64(C, 63);
++ D = _mm_slli_si128(z_high, 8);
++ F = _mm_srli_epi64(D, 63);
++ /* Carry the top bit of z_low into the bottom bit of z_high. */
++ C = _mm_srli_si128(z_low, 8);
++ D = _mm_srli_epi64(C, 63);
++ z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
++ z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
++
++ /* Reduce z_high:z_low to the 128-bit result stored in ghash->x. */
++ C = _mm_slli_si128(z_low, 8);
++ /* D = z_low << 127 */
++ D = _mm_slli_epi64(C, 63);
++ /* E = z_low << 126 */
++ E = _mm_slli_epi64(C, 62);
++ /* F = z_low << 121 */
++ F = _mm_slli_epi64(C, 57);
++ /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
++ z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
++ C = _mm_srli_si128(z_low, 8);
++ /* D = z_low >> 1 */
++ D = _mm_slli_epi64(C, 63);
++ D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
++ /* E = z_low >> 2 */
++ E = _mm_slli_epi64(C, 62);
++ E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
++ /* F = z_low >> 7 */
++ F = _mm_slli_epi64(C, 57);
++ F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
++ /* ghash->x = z_high ^ z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
++ ghash->x = _mm_xor_si128(_mm_xor_si128(
++ _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
++ F);
++ }
++ return SECSuccess;
++#else
++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
++ return SECFailure;
++#endif /* NSS_X86_OR_X64 */
++}
++/* Select the hardware GHASH path: zero x, pack h_high:h_low into ghash->h, set ghash_mul and hw. */
++SECStatus
++gcm_HashInit_hw(gcmHashContext *ghash)
++{
++#ifdef NSS_X86_OR_X64
++ ghash->ghash_mul = gcm_HashMult_hw;
++ ghash->x = _mm_setzero_si128();
++ /* h is assembled from four 32-bit pieces: MSVC requires __m64 to load epi64. */
++ ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
++ ghash->h_low >> 32, (uint32_t)ghash->h_low);
++ ghash->hw = PR_TRUE;
++ return SECSuccess;
++#else /* built without NSS_X86_OR_X64: report failure, context is left untouched */
++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
++ return SECFailure;
++#endif /* NSS_X86_OR_X64 */
++}
++/* Reset the hardware GHASH accumulator ghash->x to all zero bits. */
++SECStatus
++gcm_HashZeroX_hw(gcmHashContext *ghash)
++{
++#ifdef NSS_X86_OR_X64
++ ghash->x = _mm_setzero_si128();
++ return SECSuccess;
++#else /* built without NSS_X86_OR_X64: report failure */
++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
++ return SECFailure;
++#endif /* NSS_X86_OR_X64 */
++}
++
+diff --git a/lib/freebl/gcm.c b/lib/freebl/gcm.c
+index 780b7a632..0b1df498f 100644
+--- a/lib/freebl/gcm.c
++++ b/lib/freebl/gcm.c
+@@ -17,13 +17,13 @@
+
+ #include <limits.h>
+
+-#ifdef NSS_X86_OR_X64
+-#include <wmmintrin.h> /* clmul */
+-#endif
+-
+ /* Forward declarations */
++SECStatus gcm_HashInit_hw(gcmHashContext *ghash);
++SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf,
++ unsigned int maxout);
+ SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+ unsigned int count);
++SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash);
+ SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf,
+ unsigned int count);
+ SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
+@@ -46,6 +46,8 @@ get64(const unsigned char *bytes)
+ SECStatus
+ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
+ {
++ SECStatus rv = SECSuccess;
++
+ ghash->cLen = 0;
+ ghash->bufLen = 0;
+ PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf));
+@@ -53,17 +55,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
+ ghash->h_low = get64(H + 8);
+ ghash->h_high = get64(H);
+ if (clmul_support() && !sw) {
+-#ifdef NSS_X86_OR_X64
+- ghash->ghash_mul = gcm_HashMult_hw;
+- ghash->x = _mm_setzero_si128();
+- /* MSVC requires __m64 to load epi64. */
+- ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high,
+- ghash->h_low >> 32, (uint32_t)ghash->h_low);
+- ghash->hw = PR_TRUE;
+-#else
+- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+- return SECFailure;
+-#endif /* NSS_X86_OR_X64 */
++ rv = gcm_HashInit_hw(ghash);
+ } else {
+ /* We fall back to the software implementation if we can't use / don't
+ * want to use pclmul. */
+@@ -75,7 +67,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw)
+ ghash->x_high = ghash->x_low = 0;
+ ghash->hw = PR_FALSE;
+ }
+- return SECSuccess;
++ return rv;
+ }
+
+ #ifdef HAVE_INT128_SUPPORT
+@@ -283,102 +275,17 @@ gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf,
+ }
+ #endif /* HAVE_INT128_SUPPORT */
+
+-SECStatus
+-gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf,
+- unsigned int count)
+-{
+-#ifdef NSS_X86_OR_X64
+- size_t i;
+- pre_align __m128i z_high post_align;
+- pre_align __m128i z_low post_align;
+- pre_align __m128i C post_align;
+- pre_align __m128i D post_align;
+- pre_align __m128i E post_align;
+- pre_align __m128i F post_align;
+- pre_align __m128i bin post_align;
+- pre_align __m128i Ci post_align;
+- pre_align __m128i tmp post_align;
+-
+- for (i = 0; i < count; i++, buf += 16) {
+- bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1],
+- ((uint16_t)buf[2] << 8) | buf[3],
+- ((uint16_t)buf[4] << 8) | buf[5],
+- ((uint16_t)buf[6] << 8) | buf[7],
+- ((uint16_t)buf[8] << 8) | buf[9],
+- ((uint16_t)buf[10] << 8) | buf[11],
+- ((uint16_t)buf[12] << 8) | buf[13],
+- ((uint16_t)buf[14] << 8) | buf[15]);
+- Ci = _mm_xor_si128(bin, ghash->x);
+-
+- /* Do binary mult ghash->X = Ci * ghash->H. */
+- C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00);
+- D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11);
+- E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01);
+- F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10);
+- tmp = _mm_xor_si128(E, F);
+- z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8));
+- z_high = _mm_unpackhi_epi64(z_high, D);
+- z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C);
+- z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low);
+-
+- /* Shift one to the left (multiply by x) as gcm spec is stupid. */
+- C = _mm_slli_si128(z_low, 8);
+- E = _mm_srli_epi64(C, 63);
+- D = _mm_slli_si128(z_high, 8);
+- F = _mm_srli_epi64(D, 63);
+- /* Carry over */
+- C = _mm_srli_si128(z_low, 8);
+- D = _mm_srli_epi64(C, 63);
+- z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E);
+- z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D);
+-
+- /* Reduce */
+- C = _mm_slli_si128(z_low, 8);
+- /* D = z_low << 127 */
+- D = _mm_slli_epi64(C, 63);
+- /* E = z_low << 126 */
+- E = _mm_slli_epi64(C, 62);
+- /* F = z_low << 121 */
+- F = _mm_slli_epi64(C, 57);
+- /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */
+- z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F);
+- C = _mm_srli_si128(z_low, 8);
+- /* D = z_low >> 1 */
+- D = _mm_slli_epi64(C, 63);
+- D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D);
+- /* E = z_low >> 2 */
+- E = _mm_slli_epi64(C, 62);
+- E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E);
+- /* F = z_low >> 7 */
+- F = _mm_slli_epi64(C, 57);
+- F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F);
+- /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */
+- ghash->x = _mm_xor_si128(_mm_xor_si128(
+- _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E),
+- F);
+- }
+- return SECSuccess;
+-#else
+- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+- return SECFailure;
+-#endif /* NSS_X86_OR_X64 */
+-}
+-
+ static SECStatus
+ gcm_zeroX(gcmHashContext *ghash)
+ {
++ SECStatus rv = SECSuccess;
++
+ if (ghash->hw) {
+-#ifdef NSS_X86_OR_X64
+- ghash->x = _mm_setzero_si128();
+- return SECSuccess;
+-#else
+- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+- return SECFailure;
+-#endif /* NSS_X86_OR_X64 */
++ rv = gcm_HashZeroX_hw(ghash);
+ }
+
+ ghash->x_high = ghash->x_low = 0;
+- return SECSuccess;
++ return rv;
+ }
+
+ /*
+@@ -503,15 +410,10 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf,
+ }
+
+ if (ghash->hw) {
+-#ifdef NSS_X86_OR_X64
+- uint64_t tmp_out[2];
+- _mm_storeu_si128((__m128i *)tmp_out, ghash->x);
+- WRITE64(tmp_out[0], T + 8);
+- WRITE64(tmp_out[1], T);
+-#else
+- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE);
+- return SECFailure;
+-#endif /* NSS_X86_OR_X64 */
++ rv = gcm_HashWrite_hw(ghash, T, MAX_BLOCK_SIZE);
++ if (rv != SECSuccess) {
++ goto cleanup;
++ }
+ } else {
+ WRITE64(ghash->x_low, T + 8);
+ WRITE64(ghash->x_high, T);
+diff --git a/lib/freebl/manifest.mn b/lib/freebl/manifest.mn
+index e4c9ab0b7..6c14da9b0 100644
+--- a/lib/freebl/manifest.mn
++++ b/lib/freebl/manifest.mn
+@@ -135,8 +135,10 @@ CSRCS = \
+ blinit.c \
+ fipsfreebl.c \
+ gcm.c \
++ gcm-hw.c \
+ hmacct.c \
+ rijndael.c \
++ rijndael-hw.c \
+ aeskeywrap.c \
+ camellia.c \
+ dh.c \
+diff --git a/lib/freebl/rijndael-hw.c b/lib/freebl/rijndael-hw.c
+new file mode 100644
+index 000000000..b9c4b2204
+--- /dev/null
++++ b/lib/freebl/rijndael-hw.c
+@@ -0,0 +1,170 @@
++/* This Source Code Form is subject to the terms of the Mozilla Public
++ * License, v. 2.0. If a copy of the MPL was not distributed with this
++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
++
++#ifdef FREEBL_NO_DEPEND
++#include "stubs.h"
++#endif
++#include "rijndael.h"
++#include "secerr.h"
++
++#ifdef NSS_X86_OR_X64
++#include <wmmintrin.h> /* aes-ni */
++#endif
++
++#if defined(NSS_X86_OR_X64)
++#define EXPAND_KEY128(k, rcon, res) /* res = schedule entry derived from k and rcon; uses the caller's tmp and tmp_key */ \
++ tmp_key = _mm_aeskeygenassist_si128(k, rcon); \
++ tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \
++ tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \
++ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
++ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
++ res = _mm_xor_si128(tmp, tmp_key)
++/* Build the 11 entries cx->keySchedule[0..10] from the 16 bytes at key. */
++static void
++native_key_expansion128(AESContext *cx, const unsigned char *key)
++{
++ __m128i *keySchedule = cx->keySchedule;
++ pre_align __m128i tmp_key post_align;
++ pre_align __m128i tmp post_align;
++ keySchedule[0] = _mm_loadu_si128((__m128i *)key); /* entry 0 is the raw key, loaded unaligned */
++ EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); /* each entry is derived from the previous one */
++ EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
++ EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
++ EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
++ EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
++ EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
++ EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
++ EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
++ EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
++ EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
++}
++
++#define EXPAND_KEY192_PART1(res, k0, kt, rcon) /* writes res; uses the caller's tmp1 and tmp2 */ \
++ tmp2 = _mm_slli_si128(k0, 4); \
++ tmp1 = _mm_xor_si128(k0, tmp2); \
++ tmp2 = _mm_slli_si128(tmp2, 4); \
++ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
++ tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \
++ res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
++
++#define EXPAND_KEY192_PART2(res, k1, k2) /* writes res; uses the caller's tmp2 */ \
++ tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
++ res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
++
++#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) /* also uses tmp3; reads res1 and then rewrites it */ \
++ EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \
++ EXPAND_KEY192_PART2(carry, res1, tmp3); \
++ res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \
++ _mm_castsi128_pd(tmp3), 0)); \
++ res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \
++ _mm_castsi128_pd(carry), 1)); \
++ EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
++/* Build the 13 entries cx->keySchedule[0..12] for the 192-bit key case using the EXPAND_KEY192 macros. */
++static void
++native_key_expansion192(AESContext *cx, const unsigned char *key)
++{
++ __m128i *keySchedule = cx->keySchedule;
++ pre_align __m128i tmp1 post_align;
++ pre_align __m128i tmp2 post_align;
++ pre_align __m128i tmp3 post_align;
++ pre_align __m128i carry post_align;
++ keySchedule[0] = _mm_loadu_si128((__m128i *)key);
++ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); /* NOTE(review): reads key[16..31]; a 24-byte key ends at key[23] - confirm the buffer is readable that far */
++ EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
++ keySchedule[3], carry, 0x1, 0x2); /* finalizes entries 1 and 2, produces entry 3 */
++ EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); /* provisional entry 4, rewritten by the next step */
++ EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
++ keySchedule[6], carry, 0x4, 0x8);
++ EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
++ EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
++ keySchedule[9], carry, 0x10, 0x20);
++ EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
++ EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
++ keySchedule[12], carry, 0x40, 0x80);
++}
++
++#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) /* writes res; uses the caller's tmp_key, tmp1 and tmp2 */ \
++ tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \
++ tmp2 = _mm_slli_si128(k1x, 4); \
++ tmp1 = _mm_xor_si128(k1x, tmp2); \
++ tmp2 = _mm_slli_si128(tmp2, 4); \
++ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
++ res = _mm_xor_si128(tmp1, tmp_key); /* NOTE: unlike the sibling macros, this one already ends in ';' */
++
++#define EXPAND_KEY256(res1, res2, k1, k2, rcon) /* two schedule entries per use: res1 first, then res2 from k2 and res1 */ \
++ EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
++ EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
++/* Build the 15 entries cx->keySchedule[0..14] from the 32 bytes at key. */
++static void
++native_key_expansion256(AESContext *cx, const unsigned char *key)
++{
++ __m128i *keySchedule = cx->keySchedule;
++ pre_align __m128i tmp_key post_align;
++ pre_align __m128i tmp1 post_align;
++ pre_align __m128i tmp2 post_align;
++ keySchedule[0] = _mm_loadu_si128((__m128i *)key); /* entries 0 and 1 are the raw key halves */
++ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
++ EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
++ keySchedule[1], 0x01);
++ EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
++ keySchedule[3], 0x02);
++ EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
++ keySchedule[5], 0x04);
++ EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
++ keySchedule[7], 0x08);
++ EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
++ keySchedule[9], 0x10);
++ EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
++ keySchedule[11], 0x20);
++ EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
++ keySchedule[13], 0xFF); /* last entry needs only the first half of EXPAND_KEY256 */
++}
++
++#endif /* NSS_X86_OR_X64 */
++
++/*
++ * AES key expansion using aes-ni instructions: Nk = 4/6/8 picks the 128/192/256-bit routine, else PORT_Assert(0).
++ */
++void
++rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
++ unsigned int Nk)
++{
++#ifdef NSS_X86_OR_X64
++ switch (Nk) {
++ case 4:
++ native_key_expansion128(cx, key);
++ return;
++ case 6:
++ native_key_expansion192(cx, key);
++ return;
++ case 8:
++ native_key_expansion256(cx, key);
++ return;
++ default:
++ /* This shouldn't happen. */
++ PORT_Assert(0);
++ }
++#else /* built without NSS_X86_OR_X64: no expansion is performed */
++ PORT_Assert(0);
++#endif /* NSS_X86_OR_X64 */
++}
++/* Encrypt the 16-byte block at input into output with AES-NI, using the schedule in cx->keySchedule. */
++void
++rijndael_native_encryptBlock(AESContext *cx,
++ unsigned char *output,
++ const unsigned char *input)
++{
++#ifdef NSS_X86_OR_X64
++ int i;
++ pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
++ m = _mm_xor_si128(m, cx->keySchedule[0]); /* XOR in schedule entry 0 before the rounds */
++ for (i = 1; i < cx->Nr; ++i) { /* entries 1 .. Nr-1 */
++ m = _mm_aesenc_si128(m, cx->keySchedule[i]);
++ }
++ m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); /* final round uses entry Nr */
++ _mm_storeu_si128((__m128i *)output, m); /* unaligned store, output need not be 16-byte aligned */
++#else /* built without NSS_X86_OR_X64: output is not written */
++ PORT_Assert(0);
++#endif /* NSS_X86_OR_X64 */
++}
+diff --git a/lib/freebl/rijndael.c b/lib/freebl/rijndael.c
+index a09f13098..c13dc61f4 100644
+--- a/lib/freebl/rijndael.c
++++ b/lib/freebl/rijndael.c
+@@ -27,6 +27,13 @@
+ #include "intel-gcm.h"
+ #endif /* INTEL_GCM */
+
++/* Forward declarations */
++void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key,
++ unsigned int Nk);
++void rijndael_native_encryptBlock(AESContext *cx,
++ unsigned char *output,
++ const unsigned char *input);
++
+ /*
+ * There are currently three ways to build this code, varying in performance
+ * and code size.
+@@ -309,162 +316,6 @@ rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int N
+ }
+ }
+
+-#if defined(NSS_X86_OR_X64)
+-#define EXPAND_KEY128(k, rcon, res) \
+- tmp_key = _mm_aeskeygenassist_si128(k, rcon); \
+- tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \
+- tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \
+- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \
+- res = _mm_xor_si128(tmp, tmp_key)
+-
+-static void
+-native_key_expansion128(AESContext *cx, const unsigned char *key)
+-{
+- __m128i *keySchedule = cx->keySchedule;
+- pre_align __m128i tmp_key post_align;
+- pre_align __m128i tmp post_align;
+- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+- EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]);
+- EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]);
+- EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]);
+- EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]);
+- EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]);
+- EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]);
+- EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]);
+- EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]);
+- EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]);
+- EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]);
+-}
+-
+-#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \
+- tmp2 = _mm_slli_si128(k0, 4); \
+- tmp1 = _mm_xor_si128(k0, tmp2); \
+- tmp2 = _mm_slli_si128(tmp2, 4); \
+- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+- tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \
+- res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55))
+-
+-#define EXPAND_KEY192_PART2(res, k1, k2) \
+- tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \
+- res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF))
+-
+-#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \
+- EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \
+- EXPAND_KEY192_PART2(carry, res1, tmp3); \
+- res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \
+- _mm_castsi128_pd(tmp3), 0)); \
+- res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \
+- _mm_castsi128_pd(carry), 1)); \
+- EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2)
+-
+-static void
+-native_key_expansion192(AESContext *cx, const unsigned char *key)
+-{
+- __m128i *keySchedule = cx->keySchedule;
+- pre_align __m128i tmp1 post_align;
+- pre_align __m128i tmp2 post_align;
+- pre_align __m128i tmp3 post_align;
+- pre_align __m128i carry post_align;
+- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+- EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2],
+- keySchedule[3], carry, 0x1, 0x2);
+- EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]);
+- EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5],
+- keySchedule[6], carry, 0x4, 0x8);
+- EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]);
+- EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8],
+- keySchedule[9], carry, 0x10, 0x20);
+- EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]);
+- EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11],
+- keySchedule[12], carry, 0x40, 0x80);
+-}
+-
+-#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \
+- tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \
+- tmp2 = _mm_slli_si128(k1x, 4); \
+- tmp1 = _mm_xor_si128(k1x, tmp2); \
+- tmp2 = _mm_slli_si128(tmp2, 4); \
+- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \
+- res = _mm_xor_si128(tmp1, tmp_key);
+-
+-#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \
+- EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \
+- EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA)
+-
+-static void
+-native_key_expansion256(AESContext *cx, const unsigned char *key)
+-{
+- __m128i *keySchedule = cx->keySchedule;
+- pre_align __m128i tmp_key post_align;
+- pre_align __m128i tmp1 post_align;
+- pre_align __m128i tmp2 post_align;
+- keySchedule[0] = _mm_loadu_si128((__m128i *)key);
+- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16));
+- EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0],
+- keySchedule[1], 0x01);
+- EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2],
+- keySchedule[3], 0x02);
+- EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4],
+- keySchedule[5], 0x04);
+- EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6],
+- keySchedule[7], 0x08);
+- EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8],
+- keySchedule[9], 0x10);
+- EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10],
+- keySchedule[11], 0x20);
+- EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12],
+- keySchedule[13], 0xFF);
+-}
+-
+-#endif /* NSS_X86_OR_X64 */
+-
+-/*
+- * AES key expansion using aes-ni instructions.
+- */
+-static void
+-native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk)
+-{
+-#ifdef NSS_X86_OR_X64
+- switch (Nk) {
+- case 4:
+- native_key_expansion128(cx, key);
+- return;
+- case 6:
+- native_key_expansion192(cx, key);
+- return;
+- case 8:
+- native_key_expansion256(cx, key);
+- return;
+- default:
+- /* This shouldn't happen. */
+- PORT_Assert(0);
+- }
+-#else
+- PORT_Assert(0);
+-#endif /* NSS_X86_OR_X64 */
+-}
+-
+-static void
+-native_encryptBlock(AESContext *cx,
+- unsigned char *output,
+- const unsigned char *input)
+-{
+-#ifdef NSS_X86_OR_X64
+- int i;
+- pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input);
+- m = _mm_xor_si128(m, cx->keySchedule[0]);
+- for (i = 1; i < cx->Nr; ++i) {
+- m = _mm_aesenc_si128(m, cx->keySchedule[i]);
+- }
+- m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]);
+- _mm_storeu_si128((__m128i *)output, m);
+-#else
+- PORT_Assert(0);
+-#endif /* NSS_X86_OR_X64 */
+-}
+-
+ /* rijndael_key_expansion
+ *
+ * Generate the expanded key from the key input by the user.
+@@ -830,7 +681,7 @@ rijndael_encryptECB(AESContext *cx, unsigned char *output,
+
+ if (aesni_support()) {
+ /* Use hardware acceleration for normal AES parameters. */
+- encryptor = &native_encryptBlock;
++ encryptor = &rijndael_native_encryptBlock;
+ } else {
+ encryptor = &rijndael_encryptBlock128;
+ }
+@@ -1026,7 +877,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize,
+ cx->mode == NSS_AES_CTR)) {
+ PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32);
+ /* Prepare hardware key for normal AES parameters. */
+- native_key_expansion(cx, key, Nk);
++ rijndael_native_key_expansion(cx, key, Nk);
+ } else {
+ rijndael_key_expansion(cx, key, Nk);
+ }
+diff --git a/lib/freebl/rijndael.h b/lib/freebl/rijndael.h
+index 1f4a8a9f7..357f77c0a 100644
+--- a/lib/freebl/rijndael.h
++++ b/lib/freebl/rijndael.h
+@@ -9,9 +9,11 @@
+ #include <stdint.h>
+
+ #ifdef NSS_X86_OR_X64
+-#include <wmmintrin.h> /* aes-ni */
++#include <emmintrin.h> /* __m128i */
+ #endif
+
++SEC_BEGIN_PROTOS
++
+ typedef void AESBlockFunc(AESContext *cx,
+ unsigned char *output,
+ const unsigned char *input);
+@@ -66,4 +68,6 @@ struct AESContextStr {
+ void *mem; /* Start of the allocated memory to free. */
+ };
+
++SEC_END_PROTOS
++
+ #endif /* _RIJNDAEL_H_ */
+--
+2.13.5
+