#!/bin/bash
###############################################################################
# = WireGuard for Linux 5.4.y =
#
# This is a patch that combines all of the patches from [1] into one single
# patch, with the changes to each file merged into a single per-file diff, so
# that patching is fast and conflict-free, and so that this file stays small.
#
# It also doubles as an executable bash script, which you can run to update
# this file to the latest patch series for 5.4.y. It relies on bash, sed,
# coreutils, and patchutils.
#
# [1] https://git.zx2c4.com/wireguard-linux/log/?h=backport-5.4.y
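#
# Example usage (illustrative; "wireguard.patch" stands for whatever name this
# file was saved under, and "linux-5.4.y" for your kernel source tree):
#
#     patch -d linux-5.4.y -p1 < wireguard.patch    # apply to a kernel tree
#     bash wireguard.patch                          # refresh the patch body
#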
###############################################################################
set -e
self="$(readlink -f "${BASH_SOURCE[0]}")"
temp="$(mktemp -d)"
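# Clean up the scratch directory on any exit, including interruption.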
trap 'cd /; rm -rf "$temp";' INT TERM EXIT
cd "$temp"
echo "[+] Fetching WireGuard patches for 5.4.y"
curl -# -f -L -o git-patches.mbox "https://git.zx2c4.com/wireguard-linux/patch/?id2=gregkh/stable-5.4.y&id=backport-5.4.y"
echo "[+] Normalizing patches"
splitdiff -a -p 1 git-patches.mbox
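# Fold the per-commit patches that splitdiff produced into one cumulative diff.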
l=/dev/null
for patch in *.patch; do
combinediff -q -p 1 "$l" "$patch" > next.patch
l=last.patch
mv next.patch last.patch
done
echo "[+] Writing out new patch"
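# Keep everything up to and including the "--" separator below, then append
# the freshly combined patch and replace this file in place.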
sed '/^--$/q' "$self" > "$self.new"
cat last.patch >> "$self.new"
chmod +x "$self.new"
mv "$self.new" "$self"
exit 0
--
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -136,8 +136,6 @@
Userspace configuration for cryptographic instantiations such as
cbc(aes).
-if CRYPTO_MANAGER2
-
config CRYPTO_MANAGER_DISABLE_TESTS
bool "Disable run-time self tests"
default y
@@ -147,7 +145,7 @@
config CRYPTO_MANAGER_EXTRA_TESTS
bool "Enable extra run-time crypto self tests"
- depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS
+ depends on DEBUG_KERNEL && !CRYPTO_MANAGER_DISABLE_TESTS && CRYPTO_MANAGER
help
Enable extra run-time self tests of registered crypto algorithms,
including randomized fuzz tests.
@@ -155,8 +153,6 @@
This is intended for developer use only, as these tests take much
longer to run than the normal self tests.
-endif # if CRYPTO_MANAGER2
-
config CRYPTO_GF128MUL
tristate
@@ -264,6 +260,17 @@
standard algorithms (called GOST algorithms). Only signature verification
is implemented.
+config CRYPTO_CURVE25519
+ tristate "Curve25519 algorithm"
+ select CRYPTO_KPP
+ select CRYPTO_LIB_CURVE25519_GENERIC
+
+config CRYPTO_CURVE25519_X86
+ tristate "x86_64 accelerated Curve25519 scalar multiplication library"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
comment "Authenticated Encryption with Associated Data"
config CRYPTO_CCM
@@ -446,7 +453,7 @@
config CRYPTO_NHPOLY1305
tristate
select CRYPTO_HASH
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
config CRYPTO_NHPOLY1305_SSE2
tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)"
@@ -467,7 +474,7 @@
config CRYPTO_ADIANTUM
tristate "Adiantum support"
select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
select CRYPTO_NHPOLY1305
select CRYPTO_MANAGER
help
@@ -639,6 +646,30 @@
xxHash non-cryptographic hash algorithm. Extremely fast, working at
speeds close to RAM limits.
+config CRYPTO_BLAKE2S
+ tristate "BLAKE2s digest algorithm"
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_HASH
+ help
+ Implementation of cryptographic hash function BLAKE2s
+ optimized for 8-32bit platforms and can produce digests of any size
+ between 1 to 32. The keyed hash is also implemented.
+
+ This module provides the following algorithms:
+
+ - blake2s-128
+ - blake2s-160
+ - blake2s-224
+ - blake2s-256
+
+ See https://blake2.net for further information.
+
+config CRYPTO_BLAKE2S_X86
+ tristate "BLAKE2s digest algorithm (x86 accelerated version)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH
@@ -686,6 +717,7 @@
config CRYPTO_POLY1305
tristate "Poly1305 authenticator algorithm"
select CRYPTO_HASH
+ select CRYPTO_LIB_POLY1305_GENERIC
help
Poly1305 authenticator algorithm, RFC7539.
@@ -696,7 +728,8 @@
config CRYPTO_POLY1305_X86_64
tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)"
depends on X86 && 64BIT
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
help
Poly1305 authenticator algorithm, RFC7539.
@@ -705,6 +738,11 @@
in IETF protocols. This is the x86_64 assembler implementation using SIMD
instructions.
+config CRYPTO_POLY1305_MIPS
+ tristate "Poly1305 authenticator algorithm (MIPS optimized)"
+ depends on MIPS
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
config CRYPTO_MD4
tristate "MD4 digest algorithm"
select CRYPTO_HASH
@@ -878,9 +916,6 @@
SHA-1 secure hash standard (DFIPS 180-4) implemented
using powerpc SPE SIMD instruction set.
-config CRYPTO_LIB_SHA256
- tristate
-
config CRYPTO_SHA256
tristate "SHA224 and SHA256 digest algorithm"
select CRYPTO_HASH
@@ -1019,9 +1054,6 @@
comment "Ciphers"
-config CRYPTO_LIB_AES
- tristate
-
config CRYPTO_AES
tristate "AES cipher algorithms"
select CRYPTO_ALGAPI
@@ -1150,9 +1182,6 @@
-config CRYPTO_LIB_ARC4
- tristate
-
config CRYPTO_ARC4
tristate "ARC4 cipher algorithm"
select CRYPTO_BLKCIPHER
@@ -1339,9 +1368,6 @@
This module provides the Cast6 cipher algorithm that processes
eight blocks parallel using the AVX instruction set.
-config CRYPTO_LIB_DES
- tristate
-
config CRYPTO_DES
tristate "DES and Triple DES EDE cipher algorithms"
select CRYPTO_ALGAPI
@@ -1405,6 +1431,7 @@
config CRYPTO_CHACHA20
tristate "ChaCha stream cipher algorithms"
+ select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_BLKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
@@ -1428,11 +1455,18 @@
tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)"
depends on X86 && 64BIT
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
help
SSSE3, AVX2, and AVX-512VL optimized implementations of the ChaCha20,
XChaCha20, and XChaCha12 stream ciphers.
+config CRYPTO_CHACHA_MIPS
+ tristate "ChaCha stream cipher algorithms (MIPS 32r2 optimized)"
+ depends on CPU_MIPS32_R2
+ select CRYPTO_BLKCIPHER
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
config CRYPTO_SEED
tristate "SEED cipher algorithm"
select CRYPTO_ALGAPI
@@ -1845,6 +1879,7 @@
config CRYPTO_HASH_INFO
bool
+source "lib/crypto/Kconfig"
source "drivers/crypto/Kconfig"
source "crypto/asymmetric_keys/Kconfig"
source "certs/Kconfig"
--- /dev/null
+++ b/lib/crypto/Kconfig
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: GPL-2.0
+
+comment "Crypto library routines"
+
+config CRYPTO_LIB_AES
+ tristate
+
+config CRYPTO_LIB_ARC4
+ tristate
+
+config CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the Blake2s library interface,
+ either builtin or as a module.
+
+config CRYPTO_LIB_BLAKE2S_GENERIC
+ tristate
+ help
+ This symbol can be depended upon by arch implementations of the
+ Blake2s library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_BLAKE2S.
+
+config CRYPTO_LIB_BLAKE2S
+ tristate "BLAKE2s hash function library"
+ depends on CRYPTO_ARCH_HAVE_LIB_BLAKE2S || !CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+ select CRYPTO_LIB_BLAKE2S_GENERIC if CRYPTO_ARCH_HAVE_LIB_BLAKE2S=n
+ help
+ Enable the Blake2s library interface. This interface may be fulfilled
+ by either the generic implementation or an arch-specific one, if one
+ is available and enabled.
+
+config CRYPTO_ARCH_HAVE_LIB_CHACHA
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the ChaCha library interface,
+ either builtin or as a module.
+
+config CRYPTO_LIB_CHACHA_GENERIC
+ tristate
+ select CRYPTO_ALGAPI
+ help
+ This symbol can be depended upon by arch implementations of the
+ ChaCha library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_CHACHA.
+
+config CRYPTO_LIB_CHACHA
+ tristate "ChaCha library interface"
+ depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
+ select CRYPTO_LIB_CHACHA_GENERIC if CRYPTO_ARCH_HAVE_LIB_CHACHA=n
+ help
+ Enable the ChaCha library interface. This interface may be fulfilled
+ by either the generic implementation or an arch-specific one, if one
+ is available and enabled.
+
+config CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the Curve25519 library interface,
+ either builtin or as a module.
+
+config CRYPTO_LIB_CURVE25519_GENERIC
+ tristate
+ help
+ This symbol can be depended upon by arch implementations of the
+ Curve25519 library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_CURVE25519.
+
+config CRYPTO_LIB_CURVE25519
+ tristate "Curve25519 scalar multiplication library"
+ depends on CRYPTO_ARCH_HAVE_LIB_CURVE25519 || !CRYPTO_ARCH_HAVE_LIB_CURVE25519
+ select CRYPTO_LIB_CURVE25519_GENERIC if CRYPTO_ARCH_HAVE_LIB_CURVE25519=n
+ help
+ Enable the Curve25519 library interface. This interface may be
+ fulfilled by either the generic implementation or an arch-specific
+ one, if one is available and enabled.
+
+config CRYPTO_LIB_DES
+ tristate
+
+config CRYPTO_LIB_POLY1305_RSIZE
+ int
+ default 2 if MIPS
+ default 11 if X86_64
+ default 9 if ARM || ARM64
+ default 1
+
+config CRYPTO_ARCH_HAVE_LIB_POLY1305
+ tristate
+ help
+ Declares whether the architecture provides an arch-specific
+ accelerated implementation of the Poly1305 library interface,
+ either builtin or as a module.
+
+config CRYPTO_LIB_POLY1305_GENERIC
+ tristate
+ help
+ This symbol can be depended upon by arch implementations of the
+ Poly1305 library interface that require the generic code as a
+ fallback, e.g., for SIMD implementations. If no arch specific
+ implementation is enabled, this implementation serves the users
+ of CRYPTO_LIB_POLY1305.
+
+config CRYPTO_LIB_POLY1305
+ tristate "Poly1305 library interface"
+ depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
+ select CRYPTO_LIB_POLY1305_GENERIC if CRYPTO_ARCH_HAVE_LIB_POLY1305=n
+ help
+ Enable the Poly1305 library interface. This interface may be fulfilled
+ by either the generic implementation or an arch-specific one, if one
+ is available and enabled.
+
+config CRYPTO_LIB_CHACHA20POLY1305
+ tristate "ChaCha20-Poly1305 AEAD support (8-byte nonce library version)"
+ depends on CRYPTO_ARCH_HAVE_LIB_CHACHA || !CRYPTO_ARCH_HAVE_LIB_CHACHA
+ depends on CRYPTO_ARCH_HAVE_LIB_POLY1305 || !CRYPTO_ARCH_HAVE_LIB_POLY1305
+ select CRYPTO_LIB_CHACHA
+ select CRYPTO_LIB_POLY1305
+
+config CRYPTO_LIB_SHA256
+ tristate
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -1,13 +1,45 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
-libaes-y := aes.o
+# chacha is used by the /dev/random driver which is always builtin
+obj-y += chacha.o
+obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC) += libchacha.o
-obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
-libarc4-y := arc4.o
+obj-$(CONFIG_CRYPTO_LIB_AES) += libaes.o
+libaes-y := aes.o
-obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
-libdes-y := des.o
+obj-$(CONFIG_CRYPTO_LIB_ARC4) += libarc4.o
+libarc4-y := arc4.o
-obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
-libsha256-y := sha256.o
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) += libblake2s-generic.o
+libblake2s-generic-y += blake2s-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_BLAKE2S) += libblake2s.o
+libblake2s-y += blake2s.o
+
+obj-$(CONFIG_CRYPTO_LIB_CHACHA20POLY1305) += libchacha20poly1305.o
+libchacha20poly1305-y += chacha20poly1305.o
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519_GENERIC) += libcurve25519-generic.o
+libcurve25519-generic-y := curve25519-fiat32.o
+libcurve25519-generic-$(CONFIG_ARCH_SUPPORTS_INT128) := curve25519-hacl64.o
+libcurve25519-generic-y += curve25519-generic.o
+
+obj-$(CONFIG_CRYPTO_LIB_CURVE25519) += libcurve25519.o
+libcurve25519-y += curve25519.o
+
+obj-$(CONFIG_CRYPTO_LIB_DES) += libdes.o
+libdes-y := des.o
+
+obj-$(CONFIG_CRYPTO_LIB_POLY1305_GENERIC) += libpoly1305.o
+libpoly1305-y := poly1305-donna32.o
+libpoly1305-$(CONFIG_ARCH_SUPPORTS_INT128) := poly1305-donna64.o
+libpoly1305-y += poly1305.o
+
+obj-$(CONFIG_CRYPTO_LIB_SHA256) += libsha256.o
+libsha256-y := sha256.o
+
+ifneq ($(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS),y)
+libblake2s-y += blake2s-selftest.o
+libchacha20poly1305-y += chacha20poly1305-selftest.o
+libcurve25519-y += curve25519-selftest.o
+endif
--- a/arch/arm/crypto/chacha-neon-glue.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
- * including ChaCha20 (RFC7539)
- *
- * Copyright (C) 2016 Linaro, Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- u8 buf[CHACHA_BLOCK_SIZE];
-
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha_neon_stream_xor(struct skcipher_request *req,
- const struct chacha_ctx *ctx, const u8 *iv)
-{
- struct skcipher_walk walk;
- u32 state[16];
- int err;
-
- err = skcipher_walk_virt(&walk, req, false);
-
- crypto_chacha_init(state, ctx, iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
-
- if (nbytes < walk.total)
- nbytes = round_down(nbytes, walk.stride);
-
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
- err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
- }
-
- return err;
-}
-
-static int chacha_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
- return chacha_neon_stream_xor(req, ctx, req->iv);
-}
-
-static int xchacha_neon(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct chacha_ctx subctx;
- u32 state[16];
- u8 real_iv[16];
-
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
- subctx.nrounds = ctx->nrounds;
-
- memcpy(&real_iv[0], req->iv + 24, 8);
- memcpy(&real_iv[8], req->iv + 16, 8);
- return chacha_neon_stream_xor(req, &subctx, real_iv);
-}
-
-static struct skcipher_alg algs[] = {
- {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = CHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha_neon,
- .decrypt = chacha_neon,
- }, {
- .base.cra_name = "xchacha20",
- .base.cra_driver_name = "xchacha20-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = xchacha_neon,
- .decrypt = xchacha_neon,
- }, {
- .base.cra_name = "xchacha12",
- .base.cra_driver_name = "xchacha12-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA_KEY_SIZE,
- .max_keysize = CHACHA_KEY_SIZE,
- .ivsize = XCHACHA_IV_SIZE,
- .chunksize = CHACHA_BLOCK_SIZE,
- .walksize = 4 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
- .encrypt = xchacha_neon,
- .decrypt = xchacha_neon,
- }
-};
-
-static int __init chacha_simd_mod_init(void)
-{
- if (!(elf_hwcap & HWCAP_NEON))
- return -ENODEV;
-
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit chacha_simd_mod_fini(void)
-{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
-}
-
-module_init(chacha_simd_mod_init);
-module_exit(chacha_simd_mod_fini);
-
-MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha20");
-MODULE_ALIAS_CRYPTO("xchacha20-neon");
-MODULE_ALIAS_CRYPTO("xchacha12");
-MODULE_ALIAS_CRYPTO("xchacha12-neon");
--- a/arch/arm64/crypto/chacha-neon-glue.c
+++ b/arch/arm64/crypto/chacha-neon-glue.c
@@ -1,5 +1,5 @@
/*
- * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd.
@@ -20,9 +20,10 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -36,6 +37,8 @@
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
@@ -52,12 +55,51 @@
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
- bytes -= CHACHA_BLOCK_SIZE * 5;
- src += CHACHA_BLOCK_SIZE * 5;
- dst += CHACHA_BLOCK_SIZE * 5;
- state[12] += 5;
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
}
}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+ !crypto_simd_usable())
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
@@ -68,7 +110,7 @@
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -76,10 +118,17 @@
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
- kernel_neon_begin();
- chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
- kernel_neon_end();
+ if (!static_branch_likely(&have_neon) ||
+ !crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@@ -91,9 +140,6 @@
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@@ -105,14 +151,8 @@
u32 state[16];
u8 real_iv[16];
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
-
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_neon_begin();
- hchacha_block_neon(state, subctx.key, ctx->nrounds);
- kernel_neon_end();
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@@ -134,7 +174,7 @@
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@@ -150,7 +190,7 @@
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@@ -166,7 +206,7 @@
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
@@ -175,14 +215,18 @@
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
- return -ENODEV;
+ return 0;
+
+ static_branch_enable(&have_neon);
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && cpu_have_named_feature(ASIMD))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
--- a/arch/x86/crypto/chacha_glue.c
+++ b/arch/x86/crypto/chacha_glue.c
@@ -7,38 +7,36 @@
*/
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/simd.h>
-#define CHACHA_STATE_ALIGN 16
-
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
-#ifdef CONFIG_AS_AVX2
+
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
-static bool chacha_use_avx2;
-#ifdef CONFIG_AS_AVX512
+
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
-static bool chacha_use_avx512vl;
-#endif
-#endif
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
@@ -49,9 +47,8 @@
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
-#ifdef CONFIG_AS_AVX2
-#ifdef CONFIG_AS_AVX512
- if (chacha_use_avx512vl) {
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ static_branch_likely(&chacha_use_avx512vl)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
@@ -79,8 +76,9 @@
return;
}
}
-#endif
- if (chacha_use_avx2) {
+
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
@@ -104,7 +102,7 @@
return;
}
}
-#endif
+
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
@@ -123,37 +121,75 @@
}
}
-static int chacha_simd_stream_xor(struct skcipher_walk *walk,
- const struct chacha_ctx *ctx, const u8 *iv)
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
- u32 *state, state_buf[16 + 2] __aligned(8);
- int next_yield = 4096; /* bytes until next FPU yield */
- int err = 0;
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) {
+ hchacha_block_generic(state, stream, nrounds);
+ } else {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, stream, nrounds);
+ kernel_fpu_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
- crypto_chacha_init(state, ctx, iv);
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE)
+ return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_fpu_begin();
+ chacha_dosimd(state, dst, src, todo, nrounds);
+ kernel_fpu_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
- while (walk->nbytes > 0) {
- unsigned int nbytes = walk->nbytes;
+static int chacha_simd_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
+ struct skcipher_walk walk;
+ int err;
- if (nbytes < walk->total) {
- nbytes = round_down(nbytes, walk->stride);
- next_yield -= nbytes;
- }
+ err = skcipher_walk_virt(&walk, req, false);
- chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
- nbytes, ctx->nrounds);
+ chacha_init_generic(state, ctx->key, iv);
- if (next_yield <= 0) {
- /* temporarily allow preemption */
- kernel_fpu_end();
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ if (!static_branch_likely(&chacha_use_simd) ||
+ !crypto_simd_usable()) {
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ } else {
kernel_fpu_begin();
- next_yield = 4096;
+ chacha_dosimd(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes,
+ ctx->nrounds);
+ kernel_fpu_end();
}
-
- err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
@@ -163,55 +199,32 @@
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- int err;
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_chacha_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
- if (err)
- return err;
-
- kernel_fpu_begin();
- err = chacha_simd_stream_xor(&walk, ctx, req->iv);
- kernel_fpu_end();
- return err;
+ return chacha_simd_stream_xor(req, ctx, req->iv);
}
static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
+ u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
- u32 *state, state_buf[16 + 2] __aligned(8);
u8 real_iv[16];
- int err;
- if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable())
- return crypto_xchacha_crypt(req);
+ chacha_init_generic(state, ctx->key, req->iv);
- err = skcipher_walk_virt(&walk, req, true);
- if (err)
- return err;
-
- BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
- crypto_chacha_init(state, ctx, req->iv);
-
- kernel_fpu_begin();
-
- hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ kernel_fpu_end();
+ } else {
+ hchacha_block_generic(state, subctx.key, ctx->nrounds);
+ }
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
- err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
-
- kernel_fpu_end();
-
- return err;
+ return chacha_simd_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
@@ -227,7 +240,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = chacha_simd,
.decrypt = chacha_simd,
}, {
@@ -242,7 +255,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
}, {
@@ -257,7 +270,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
},
@@ -266,24 +279,29 @@
static int __init chacha_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return -ENODEV;
+ return 0;
-#ifdef CONFIG_AS_AVX2
- chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#ifdef CONFIG_AS_AVX512
- chacha_use_avx512vl = chacha_use_avx2 &&
- boot_cpu_has(X86_FEATURE_AVX512VL) &&
- boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
-#endif
-#endif
- return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+ static_branch_enable(&chacha_use_simd);
+
+ if (IS_ENABLED(CONFIG_AS_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+ static_branch_enable(&chacha_use_avx2);
+
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
+ static_branch_enable(&chacha_use_avx512vl);
+ }
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
- crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
--- a/crypto/chacha_generic.c
+++ b/crypto/chacha_generic.c
@@ -8,29 +8,10 @@
#include <asm/unaligned.h>
#include <crypto/algapi.h>
-#include <crypto/chacha.h>
+#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
-static void chacha_docrypt(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes, int nrounds)
-{
- /* aligned to potentially speed up crypto_xor() */
- u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
-
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
- bytes -= CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- }
- if (bytes) {
- chacha_block(state, stream, nrounds);
- crypto_xor_cpy(dst, src, stream, bytes);
- }
-}
-
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
@@ -40,7 +21,7 @@
err = skcipher_walk_virt(&walk, req, false);
- crypto_chacha_init(state, ctx, iv);
+ chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
@@ -48,75 +29,23 @@
if (nbytes < walk.total)
nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE);
- chacha_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes, ctx->nrounds);
+ chacha_crypt_generic(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv)
-{
- state[0] = 0x61707865; /* "expa" */
- state[1] = 0x3320646e; /* "nd 3" */
- state[2] = 0x79622d32; /* "2-by" */
- state[3] = 0x6b206574; /* "te k" */
- state[4] = ctx->key[0];
- state[5] = ctx->key[1];
- state[6] = ctx->key[2];
- state[7] = ctx->key[3];
- state[8] = ctx->key[4];
- state[9] = ctx->key[5];
- state[10] = ctx->key[6];
- state[11] = ctx->key[7];
- state[12] = get_unaligned_le32(iv + 0);
- state[13] = get_unaligned_le32(iv + 4);
- state[14] = get_unaligned_le32(iv + 8);
- state[15] = get_unaligned_le32(iv + 12);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha_init);
-
-static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize, int nrounds)
-{
- struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
- int i;
-
- if (keysize != CHACHA_KEY_SIZE)
- return -EINVAL;
-
- for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
- ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
-
- ctx->nrounds = nrounds;
- return 0;
-}
-
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
-{
- return chacha_setkey(tfm, key, keysize, 20);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
-
-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize)
-{
- return chacha_setkey(tfm, key, keysize, 12);
-}
-EXPORT_SYMBOL_GPL(crypto_chacha12_setkey);
-
-int crypto_chacha_crypt(struct skcipher_request *req)
+static int crypto_chacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_stream_xor(req, ctx, req->iv);
}
-EXPORT_SYMBOL_GPL(crypto_chacha_crypt);
-int crypto_xchacha_crypt(struct skcipher_request *req)
+static int crypto_xchacha_crypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
@@ -125,8 +54,8 @@
u8 real_iv[16];
/* Compute the subkey given the original key and first 128 nonce bits */
- crypto_chacha_init(state, ctx, req->iv);
- hchacha_block(state, subctx.key, ctx->nrounds);
+ chacha_init_generic(state, ctx->key, req->iv);
+ hchacha_block_generic(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
/* Build the real IV */
@@ -136,7 +65,6 @@
/* Generate the stream and XOR it with the data */
return chacha_stream_xor(req, &subctx, real_iv);
}
-EXPORT_SYMBOL_GPL(crypto_xchacha_crypt);
static struct skcipher_alg algs[] = {
{
@@ -151,7 +79,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = crypto_chacha_crypt,
.decrypt = crypto_chacha_crypt,
}, {
@@ -166,7 +94,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
+ .setkey = chacha20_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}, {
@@ -181,7 +109,7 @@
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
- .setkey = crypto_chacha12_setkey,
+ .setkey = chacha12_setkey,
.encrypt = crypto_xchacha_crypt,
.decrypt = crypto_xchacha_crypt,
}
--- a/include/crypto/chacha.h
+++ b/include/crypto/chacha.h
@@ -15,9 +15,8 @@
#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H
-#include <crypto/skcipher.h>
+#include <asm/unaligned.h>
#include <linux/types.h>
-#include <linux/crypto.h>
/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
#define CHACHA_IV_SIZE 16
@@ -27,28 +26,74 @@
#define CHACHAPOLY_IV_SIZE 12
+#define CHACHA_STATE_WORDS (CHACHA_BLOCK_SIZE / sizeof(u32))
+
/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE 32
-struct chacha_ctx {
- u32 key[8];
- int nrounds;
-};
-
-void chacha_block(u32 *state, u8 *stream, int nrounds);
+void chacha_block_generic(u32 *state, u8 *stream, int nrounds);
static inline void chacha20_block(u32 *state, u8 *stream)
{
- chacha_block(state, stream, 20);
+ chacha_block_generic(state, stream, 20);
+}
+
+void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
+void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);
+
+static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ hchacha_block_arch(state, out, nrounds);
+ else
+ hchacha_block_generic(state, out, nrounds);
+}
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv);
+static inline void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
+{
+ state[0] = 0x61707865; /* "expa" */
+ state[1] = 0x3320646e; /* "nd 3" */
+ state[2] = 0x79622d32; /* "2-by" */
+ state[3] = 0x6b206574; /* "te k" */
+ state[4] = key[0];
+ state[5] = key[1];
+ state[6] = key[2];
+ state[7] = key[3];
+ state[8] = key[4];
+ state[9] = key[5];
+ state[10] = key[6];
+ state[11] = key[7];
+ state[12] = get_unaligned_le32(iv + 0);
+ state[13] = get_unaligned_le32(iv + 4);
+ state[14] = get_unaligned_le32(iv + 8);
+ state[15] = get_unaligned_le32(iv + 12);
+}
+
+static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ chacha_init_arch(state, key, iv);
+ else
+ chacha_init_generic(state, key, iv);
}
-void hchacha_block(const u32 *in, u32 *out, int nrounds);
-void crypto_chacha_init(u32 *state, const struct chacha_ctx *ctx, const u8 *iv);
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
-int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
-int crypto_chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keysize);
+static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA))
+ chacha_crypt_arch(state, dst, src, bytes, nrounds);
+ else
+ chacha_crypt_generic(state, dst, src, bytes, nrounds);
+}
-int crypto_chacha_crypt(struct skcipher_request *req);
-int crypto_xchacha_crypt(struct skcipher_request *req);
+static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes)
+{
+ chacha_crypt(state, dst, src, bytes, 20);
+}
#endif /* _CRYPTO_CHACHA_H */
--- /dev/null
+++ b/include/crypto/internal/chacha.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CRYPTO_INTERNAL_CHACHA_H
+#define _CRYPTO_INTERNAL_CHACHA_H
+
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/crypto.h>
+
+struct chacha_ctx {
+ u32 key[8];
+ int nrounds;
+};
+
+static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize, int nrounds)
+{
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int i;
+
+ if (keysize != CHACHA_KEY_SIZE)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(ctx->key); i++)
+ ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32));
+
+ ctx->nrounds = nrounds;
+ return 0;
+}
+
+static inline int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 20);
+}
+
+static inline int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int keysize)
+{
+ return chacha_setkey(tfm, key, keysize, 12);
+}
+
+#endif /* _CRYPTO_CHACHA_H */
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,8 +26,7 @@ endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
- idr.o extable.o \
- sha1.o chacha.o irq_regs.o argv_split.o \
+ idr.o extable.o sha1.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
earlycpio.o seq_buf.o siphash.o dec_and_lock.o \
--- a/lib/chacha.c
+++ /dev/null
@@ -1,113 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-#include <linux/cryptohash.h>
-#include <asm/unaligned.h>
-#include <crypto/chacha.h>
-
-static void chacha_permute(u32 *x, int nrounds)
-{
- int i;
-
- /* whitelist the allowed round counts */
- WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
-
- for (i = 0; i < nrounds; i += 2) {
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
-
- x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
- x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
- x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
- x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
-
- x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
- x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
- x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
- x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
-
- x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
- x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
- x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
- x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
-
- x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
- x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
- x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
- x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
- }
-}
-
-/**
- * chacha_block - generate one keystream block and increment block counter
- * @state: input state matrix (16 32-bit words)
- * @stream: output keystream block (64 bytes)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
- * The caller has already converted the endianness of the input. This function
- * also handles incrementing the block counter in the input matrix.
- */
-void chacha_block(u32 *state, u8 *stream, int nrounds)
-{
- u32 x[16];
- int i;
-
- memcpy(x, state, 64);
-
- chacha_permute(x, nrounds);
-
- for (i = 0; i < ARRAY_SIZE(x); i++)
- put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
-
- state[12]++;
-}
-EXPORT_SYMBOL(chacha_block);
-
-/**
- * hchacha_block - abbreviated ChaCha core, for XChaCha
- * @in: input state matrix (16 32-bit words)
- * @out: output (8 32-bit words)
- * @nrounds: number of rounds (20 or 12; 20 is recommended)
- *
- * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
- * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
- * skips the final addition of the initial state, and outputs only certain words
- * of the state. It should not be used for streaming directly.
- */
-void hchacha_block(const u32 *in, u32 *out, int nrounds)
-{
- u32 x[16];
-
- memcpy(x, in, 64);
-
- chacha_permute(x, nrounds);
-
- memcpy(&out[0], &x[0], 16);
- memcpy(&out[4], &x[12], 16);
-}
-EXPORT_SYMBOL(hchacha_block);
--- /dev/null
+++ b/lib/crypto/chacha.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/string.h>
+#include <linux/cryptohash.h>
+#include <asm/unaligned.h>
+#include <crypto/chacha.h>
+
+static void chacha_permute(u32 *x, int nrounds)
+{
+ int i;
+
+ /* whitelist the allowed round counts */
+ WARN_ON_ONCE(nrounds != 20 && nrounds != 12);
+
+ for (i = 0; i < nrounds; i += 2) {
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12);
+
+ x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8);
+ x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8);
+ x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8);
+ x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8);
+
+ x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7);
+ x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7);
+ x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7);
+ x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12);
+
+ x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8);
+ x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8);
+ x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8);
+ x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8);
+
+ x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7);
+ x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7);
+ x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7);
+ x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7);
+ }
+}
+
+/**
+ * chacha_block - generate one keystream block and increment block counter
+ * @state: input state matrix (16 32-bit words)
+ * @stream: output keystream block (64 bytes)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
+ * The caller has already converted the endianness of the input. This function
+ * also handles incrementing the block counter in the input matrix.
+ */
+void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
+{
+ u32 x[16];
+ int i;
+
+ memcpy(x, state, 64);
+
+ chacha_permute(x, nrounds);
+
+ for (i = 0; i < ARRAY_SIZE(x); i++)
+ put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+
+ state[12]++;
+}
+EXPORT_SYMBOL(chacha_block_generic);
+
+/**
+ * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
+ * @state: input state matrix (16 32-bit words)
+ * @out: output (8 32-bit words)
+ * @nrounds: number of rounds (20 or 12; 20 is recommended)
+ *
+ * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
+ * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha
+ * skips the final addition of the initial state, and outputs only certain words
+ * of the state. It should not be used for streaming directly.
+ */
+void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
+{
+ u32 x[16];
+
+ memcpy(x, state, 64);
+
+ chacha_permute(x, nrounds);
+
+ memcpy(&stream[0], &x[0], 16);
+ memcpy(&stream[4], &x[12], 16);
+}
+EXPORT_SYMBOL(hchacha_block_generic);
--- /dev/null
+++ b/lib/crypto/libchacha.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * The ChaCha stream cipher (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/module.h>
+
+#include <crypto/algapi.h> // for crypto_xor_cpy
+#include <crypto/chacha.h>
+
+void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ /* aligned to potentially speed up crypto_xor() */
+ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long));
+
+ while (bytes >= CHACHA_BLOCK_SIZE) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+ bytes -= CHACHA_BLOCK_SIZE;
+ dst += CHACHA_BLOCK_SIZE;
+ src += CHACHA_BLOCK_SIZE;
+ }
+ if (bytes) {
+ chacha_block_generic(state, stream, nrounds);
+ crypto_xor_cpy(dst, src, stream, bytes);
+ }
+}
+EXPORT_SYMBOL(chacha_crypt_generic);
+
+MODULE_LICENSE("GPL");
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -103,7 +103,14 @@
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_LIB_CHACHA_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+ tristate "Poly1305 hash function using scalar or NEON instructions"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
--- /dev/null
+++ b/arch/arm/crypto/chacha-scalar-core.S
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used. So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions. This is faster than using explicit rotate
+ * instructions. To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount. The rotation amount is then fixed up just in time
+ * when the values are used. 'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+ // ChaCha state registers
+ X0 .req r0
+ X1 .req r1
+ X2 .req r2
+ X3 .req r3
+ X4 .req r4
+ X5 .req r5
+ X6 .req r6
+ X7 .req r7
+ X8_X10 .req r8 // shared by x8 and x10
+ X9_X11 .req r9 // shared by x9 and x11
+ X12 .req r10
+ X13 .req r11
+ X14 .req r12
+ X15 .req r14
+
+.macro __rev out, in, t0, t1, t2
+.if __LINUX_ARM_ARCH__ >= 6
+ rev \out, \in
+.else
+ lsl \t0, \in, #24
+ and \t1, \in, #0xff00
+ and \t2, \in, #0xff0000
+ orr \out, \t0, \in, lsr #24
+ orr \out, \out, \t1, lsl #8
+ orr \out, \out, \t2, lsr #8
+.endif
+.endm
+
+.macro _le32_bswap x, t0, t1, t2
+#ifdef __ARMEB__
+ __rev \x, \x, \t0, \t1, \t2
+#endif
+.endm
+
+.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
+ _le32_bswap \a, \t0, \t1, \t2
+ _le32_bswap \b, \t0, \t1, \t2
+ _le32_bswap \c, \t0, \t1, \t2
+ _le32_bswap \d, \t0, \t1, \t2
+.endm
+
+.macro __ldrd a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ ldrd \a, \b, [\src, #\offset]
+#else
+ ldr \a, [\src, #\offset]
+ ldr \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+ strd \a, \b, [\dst, #\offset]
+#else
+ str \a, [\dst, #\offset]
+ str \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
+
+ // a += b; d ^= a; d = rol(d, 16);
+ add \a1, \a1, \b1, ror #brot
+ add \a2, \a2, \b2, ror #brot
+ eor \d1, \a1, \d1, ror #drot
+ eor \d2, \a2, \d2, ror #drot
+ // drot == 32 - 16 == 16
+
+ // c += d; b ^= c; b = rol(b, 12);
+ add \c1, \c1, \d1, ror #16
+ add \c2, \c2, \d2, ror #16
+ eor \b1, \c1, \b1, ror #brot
+ eor \b2, \c2, \b2, ror #brot
+ // brot == 32 - 12 == 20
+
+ // a += b; d ^= a; d = rol(d, 8);
+ add \a1, \a1, \b1, ror #20
+ add \a2, \a2, \b2, ror #20
+ eor \d1, \a1, \d1, ror #16
+ eor \d2, \a2, \d2, ror #16
+ // drot == 32 - 8 == 24
+
+ // c += d; b ^= c; b = rol(b, 7);
+ add \c1, \c1, \d1, ror #24
+ add \c2, \c2, \d2, ror #24
+ eor \b1, \c1, \b1, ror #20
+ eor \b2, \c2, \b2, ror #20
+ // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+ // column round
+
+ // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+ _halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
+
+ // save (x8, x9); restore (x10, x11)
+ __strd X8_X10, X9_X11, sp, 0
+ __ldrd X8_X10, X9_X11, sp, 8
+
+ // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+ _halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
+
+ .set brot, 25
+ .set drot, 24
+
+ // diagonal round
+
+ // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+ _halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
+
+ // save (x10, x11); restore (x8, x9)
+ __strd X8_X10, X9_X11, sp, 8
+ __ldrd X8_X10, X9_X11, sp, 0
+
+ // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+ _halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+ .set brot, 0
+ .set drot, 0
+ .rept \nrounds / 2
+ _doubleround
+ .endr
+.endm
+
+.macro _chacha nrounds
+
+.Lnext_block\@:
+ // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+
+ // Do the core ChaCha permutation to update x0-x15.
+ _chacha_permute \nrounds
+
+ add sp, #8
+ // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers contain x0-x9,x12-x15.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+ push {X8_X10, X9_X11, X12, X13, X14, X15}
+
+ // Load (OUT, IN, LEN).
+ ldr r14, [sp, #96]
+ ldr r12, [sp, #100]
+ ldr r11, [sp, #104]
+
+ orr r10, r14, r12
+
+ // Use slow path if fewer than 64 bytes remain.
+ cmp r11, #64
+ blt .Lxor_slowpath\@
+
+ // Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
+ // ARMv6+, since ldmia and stmia (used below) still require alignment.
+ tst r10, #3
+ bne .Lxor_slowpath\@
+
+ // Fast path: XOR 64 bytes of aligned data.
+
+ // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // x0-x3
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor X0, X0, r8
+ eor X1, X1, r9
+ eor X2, X2, r10
+ eor X3, X3, r11
+ stmia r14!, {X0-X3}
+
+ // x4-x7
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ ldmia r12!, {X0-X3}
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ eor X4, X4, X0
+ eor X5, X5, X1
+ eor X6, X6, X2
+ eor X7, X7, X3
+ stmia r14!, {X4-X7}
+
+ // x8-x15
+ pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 32
+ __ldrd r10, r11, sp, 40
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ ldmia r12!, {r8-r11}
+ eor r0, r0, r8 // x8
+ eor r1, r1, r9 // x9
+ eor r6, r6, r10 // x10
+ eor r7, r7, r11 // x11
+ stmia r14!, {r0,r1,r6,r7}
+ ldmia r12!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 48
+ __ldrd r10, r11, sp, 56
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ ldr r9, [sp, #72] // load LEN
+ eor r2, r2, r0 // x12
+ eor r3, r3, r1 // x13
+ eor r4, r4, r6 // x14
+ eor r5, r5, r7 // x15
+ subs r9, #64 // decrement and check LEN
+ stmia r14!, {r2-r5}
+
+ beq .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+ // Stack: x0-x15 OUT IN LEN
+
+ // Increment block counter (x12)
+ add r8, #1
+
+ // Store updated (OUT, IN, LEN)
+ str r14, [sp, #64]
+ str r12, [sp, #68]
+ str r9, [sp, #72]
+
+ mov r14, sp
+
+ // Store updated block counter (x12)
+ str r8, [sp, #48]
+
+ sub sp, #16
+
+ // Reload state and do next block
+ ldmia r14!, {r0-r11} // load x0-x11
+ __strd r10, r11, sp, 8 // store x10-x11 before state
+ ldmia r14, {r10-r12,r14} // load x12-x15
+ b .Lnext_block\@
+
+.Lxor_slowpath\@:
+ // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+ // We handle it by storing the 64 bytes of keystream to the stack, then
+ // XOR-ing the needed portion with the data.
+
+ // Allocate keystream buffer
+ sub sp, #64
+ mov r14, sp
+
+ // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+ // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+ // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+ // Save keystream for x0-x3
+ __ldrd r8, r9, sp, 96
+ __ldrd r10, r11, sp, 104
+ add X0, X0, r8
+ add X1, X1, r9
+ add X2, X2, r10
+ add X3, X3, r11
+ _le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
+ stmia r14!, {X0-X3}
+
+ // Save keystream for x4-x7
+ __ldrd r8, r9, sp, 112
+ __ldrd r10, r11, sp, 120
+ add X4, r8, X4, ror #brot
+ add X5, r9, X5, ror #brot
+ add X6, r10, X6, ror #brot
+ add X7, r11, X7, ror #brot
+ _le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
+ add r8, sp, #64
+ stmia r14!, {X4-X7}
+
+ // Save keystream for x8-x15
+ ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
+ __ldrd r8, r9, sp, 128
+ __ldrd r10, r11, sp, 136
+ add r0, r0, r8 // x8
+ add r1, r1, r9 // x9
+ add r6, r6, r10 // x10
+ add r7, r7, r11 // x11
+ _le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
+ stmia r14!, {r0,r1,r6,r7}
+ __ldrd r8, r9, sp, 144
+ __ldrd r10, r11, sp, 152
+ add r2, r8, r2, ror #drot // x12
+ add r3, r9, r3, ror #drot // x13
+ add r4, r10, r4, ror #drot // x14
+ add r5, r11, r5, ror #drot // x15
+ _le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
+ stmia r14, {r2-r5}
+
+ // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+ // Registers: r8 is block counter, r12 is IN.
+
+ ldr r9, [sp, #168] // LEN
+ ldr r14, [sp, #160] // OUT
+ cmp r9, #64
+ mov r0, sp
+ movle r1, r9
+ movgt r1, #64
+ // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+ orr r2, r12, r14
+ tst r2, #3 // IN or OUT misaligned?
+ bne .Lxor_next_byte\@
+.endif
+
+ // XOR a word at a time
+.rept 16
+ subs r1, #4
+ blt .Lxor_words_done\@
+ ldr r2, [r12], #4
+ ldr r3, [r0], #4
+ eor r2, r2, r3
+ str r2, [r14], #4
+.endr
+ b .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+ ands r1, r1, #3
+ beq .Lxor_slowpath_done\@
+
+ // XOR a byte at a time
+.Lxor_next_byte\@:
+ ldrb r2, [r12], #1
+ ldrb r3, [r0], #1
+ eor r2, r2, r3
+ strb r2, [r14], #1
+ subs r1, #1
+ bne .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+ subs r9, #64
+ add sp, #96
+ bgt .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm // _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ * const u32 *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+ cmp r2, #0 // len == 0?
+ reteq lr
+
+ ldr ip, [sp]
+ cmp ip, #12
+
+ push {r0-r2,r4-r11,lr}
+
+ // Push state x0-x15 onto stack.
+ // Also store an extra copy of x10-x11 just before the state.
+
+ add X12, r3, #48
+ ldm X12, {X12,X13,X14,X15}
+ push {X12,X13,X14,X15}
+ sub sp, sp, #64
+
+ __ldrd X8_X10, X9_X11, r3, 40
+ __strd X8_X10, X9_X11, sp, 8
+ __strd X8_X10, X9_X11, sp, 56
+ ldm r3, {X0-X9_X11}
+ __strd X0, X1, sp, 16
+ __strd X2, X3, sp, 24
+ __strd X4, X5, sp, 32
+ __strd X6, X7, sp, 40
+ __strd X8_X10, X9_X11, sp, 48
+
+ beq 1f
+ _chacha 20
+
+0: add sp, #76
+ pop {r4-r11, pc}
+
+1: _chacha 12
+ b 0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
+ */
+ENTRY(hchacha_block_arm)
+ push {r1,r4-r11,lr}
+
+ cmp r2, #12 // ChaCha12 ?
+
+ mov r14, r0
+ ldmia r14!, {r0-r11} // load x0-x11
+ push {r10-r11} // store x10-x11 to stack
+ ldm r14, {r10-r12,r14} // load x12-x15
+ sub sp, #8
+
+ beq 1f
+ _chacha_permute 20
+
+ // Skip over (unused0-unused1, x10-x11)
+0: add sp, #16
+
+ // Fix up rotations of x12-x15
+ ror X12, X12, #drot
+ ror X13, X13, #drot
+ pop {r4} // load 'out'
+ ror X14, X14, #drot
+ ror X15, X15, #drot
+
+ // Store (x0-x3,x12-x15) to 'out'
+ stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+ pop {r4-r11,pc}
+
+1: _chacha_permute 12
+ b 0b
+ENDPROC(hchacha_block_arm)
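/*
 * Aside, not part of the patch: a plain-C reference for the quarter round
 * that _halfround above computes two at a time. The scalar ARM code leaves
 * the final rotations of b and d pending (tracked by the brot/drot symbols)
 * and folds them into the shifted operands of later instructions; this
 * eager version is the behavior it must match.
 */
#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
        x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);
}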
--- b/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -129,12 +129,22 @@
config CRYPTO_CHACHA20_NEON
- tristate "NEON accelerated ChaCha stream cipher algorithms"
- depends on KERNEL_MODE_NEON
+ tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
- select CRYPTO_CHACHA20
+ select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_ARM
+ tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
+ select CRYPTO_HASH
+ select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
+config CRYPTO_CURVE25519_NEON
+ tristate "NEON accelerated Curve25519 scalar multiplication library"
+ depends on KERNEL_MODE_NEON
+ select CRYPTO_LIB_CURVE25519_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
endif
--- b/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,7 +10,9 @@
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -53,13 +55,19 @@
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+curve25519-neon-y := curve25519-core.o curve25519-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
+ $(call cmd,perl)
+
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
$(call cmd,perl)
@@ -67,4 +75,9 @@
$(call cmd,perl)
endif
-clean-files += sha256-core.S sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
--- b/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -0,0 +1,358 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+ int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ const u32 *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+ return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+ u8 buf[CHACHA_BLOCK_SIZE];
+
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+ }
+ if (bytes) {
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
+ state[12]++;
+ }
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+ hchacha_block_arm(state, stream, nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, stream, nrounds);
+ kernel_neon_end();
+ }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+ int nrounds)
+{
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+ bytes <= CHACHA_BLOCK_SIZE) {
+ chacha_doarm(dst, src, bytes, state, nrounds);
+ state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+ return;
+ }
+
+ do {
+ unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+ kernel_neon_begin();
+ chacha_doneon(state, dst, src, todo, nrounds);
+ kernel_neon_end();
+
+ bytes -= todo;
+ src += todo;
+ dst += todo;
+ } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+static int chacha_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv,
+ bool neon)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
+ chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, state, ctx->nrounds);
+ state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
+ } else {
+ kernel_neon_begin();
+ chacha_doneon(state, walk.dst.virt.addr,
+ walk.src.virt.addr, nbytes, ctx->nrounds);
+ kernel_neon_end();
+ }
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int do_chacha(struct skcipher_request *req, bool neon)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_stream_xor(req, ctx, req->iv, neon);
+}
+
+static int chacha_arm(struct skcipher_request *req)
+{
+ return do_chacha(req, false);
+}
+
+static int chacha_neon(struct skcipher_request *req)
+{
+ return do_chacha(req, neon_usable());
+}
+
+static int do_xchacha(struct skcipher_request *req, bool neon)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
+ hchacha_block_arm(state, subctx.key, ctx->nrounds);
+ } else {
+ kernel_neon_begin();
+ hchacha_block_neon(state, subctx.key, ctx->nrounds);
+ kernel_neon_end();
+ }
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_stream_xor(req, &subctx, real_iv, neon);
+}
+
+static int xchacha_arm(struct skcipher_request *req)
+{
+ return do_xchacha(req, false);
+}
+
+static int xchacha_neon(struct skcipher_request *req)
+{
+ return do_xchacha(req, neon_usable());
+}
+
+static struct skcipher_alg arm_algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_arm,
+ .decrypt = chacha_arm,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_arm,
+ .decrypt = xchacha_arm,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-arm",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_arm,
+ .decrypt = xchacha_arm,
+ },
+};
+
+static struct skcipher_alg neon_algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_neon,
+ .decrypt = chacha_neon,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-neon",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .walksize = 4 * CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_neon,
+ .decrypt = xchacha_neon,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ int err = 0;
+
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (err)
+ return err;
+ }
+
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+ int i;
+
+ switch (read_cpuid_part()) {
+ case ARM_CPU_PART_CORTEX_A7:
+ case ARM_CPU_PART_CORTEX_A5:
+ /*
+ * The Cortex-A7 and Cortex-A5 do not perform well with
+ * the NEON implementation but do incredibly well with
+ * the scalar one and use less power.
+ */
+ for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
+ neon_algs[i].base.cra_priority = 0;
+ break;
+ default:
+ static_branch_enable(&use_neon);
+ }
+
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ if (err)
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ }
+ }
+ return err;
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
+ crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
+ crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
+ }
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel ");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-arm");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-arm");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-arm");
+#ifdef CONFIG_KERNEL_MODE_NEON
+MODULE_ALIAS_CRYPTO("chacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha20-neon");
+MODULE_ALIAS_CRYPTO("xchacha12-neon");
+#endif
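/*
 * Aside, not part of the patch: a sketch of how an in-kernel caller would
 * drive the ChaCha library interface this glue code backs. chacha_init()
 * and chacha_crypt() are the <crypto/chacha.h> wrappers that dispatch to
 * the *_arch() functions exported above when
 * CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is set; the helper name below is
 * hypothetical.
 */
#include <crypto/chacha.h>

static void chacha20_crypt_inplace(u8 *buf, unsigned int len,
                                   const u32 key[CHACHA_KEY_SIZE / sizeof(u32)],
                                   const u8 iv[CHACHA_IV_SIZE])
{
        u32 state[16];

        chacha_init(state, key, iv);            /* counter + nonce from iv */
        chacha_crypt(state, buf, buf, len, 20); /* XOR keystream in place */
}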
--- b/arch/mips/crypto/chacha-core.S
+++ b/arch/mips/crypto/chacha-core.S
@@ -0,0 +1,497 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define MASK_U32 0x3c
+#define CHACHA20_BLOCK_SIZE 64
+#define STACK_SIZE 32
+
+#define X0 $t0
+#define X1 $t1
+#define X2 $t2
+#define X3 $t3
+#define X4 $t4
+#define X5 $t5
+#define X6 $t6
+#define X7 $t7
+#define X8 $t8
+#define X9 $t9
+#define X10 $v1
+#define X11 $s6
+#define X12 $s5
+#define X13 $s4
+#define X14 $s3
+#define X15 $s2
+/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
+#define T0 $s1
+#define T1 $s0
+#define T(n) T ## n
+#define X(n) X ## n
+
+/* Input arguments */
+#define STATE $a0
+#define OUT $a1
+#define IN $a2
+#define BYTES $a3
+
+/* Output argument */
+/* NONCE[0] is kept in a register and not in memory.
+ * We don't want to touch the original value in memory.
+ * It must be incremented every loop iteration.
+ */
+#define NONCE_0 $v0
+
+/* SAVED_X and SAVED_CA are set in the jump table.
+ * Use regs which are overwritten on exit so we don't leak clear data.
+ * They are used for handling the last bytes, which are not a multiple of 4.
+ */
+#define SAVED_X X15
+#define SAVED_CA $s7
+
+#define IS_UNALIGNED $s7
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define MSB 0
+#define LSB 3
+#define ROTx rotl
+#define ROTR(n) rotr n, 24
+#define CPU_TO_LE32(n) \
+ wsbh n; \
+ rotr n, 16;
+#else
+#define MSB 3
+#define LSB 0
+#define ROTx rotr
+#define CPU_TO_LE32(n)
+#define ROTR(n)
+#endif
+
+#define FOR_EACH_WORD(x) \
+ x( 0); \
+ x( 1); \
+ x( 2); \
+ x( 3); \
+ x( 4); \
+ x( 5); \
+ x( 6); \
+ x( 7); \
+ x( 8); \
+ x( 9); \
+ x(10); \
+ x(11); \
+ x(12); \
+ x(13); \
+ x(14); \
+ x(15);
+
+#define FOR_EACH_WORD_REV(x) \
+ x(15); \
+ x(14); \
+ x(13); \
+ x(12); \
+ x(11); \
+ x(10); \
+ x( 9); \
+ x( 8); \
+ x( 7); \
+ x( 6); \
+ x( 5); \
+ x( 4); \
+ x( 3); \
+ x( 2); \
+ x( 1); \
+ x( 0);
+
+#define PLUS_ONE_0 1
+#define PLUS_ONE_1 2
+#define PLUS_ONE_2 3
+#define PLUS_ONE_3 4
+#define PLUS_ONE_4 5
+#define PLUS_ONE_5 6
+#define PLUS_ONE_6 7
+#define PLUS_ONE_7 8
+#define PLUS_ONE_8 9
+#define PLUS_ONE_9 10
+#define PLUS_ONE_10 11
+#define PLUS_ONE_11 12
+#define PLUS_ONE_12 13
+#define PLUS_ONE_13 14
+#define PLUS_ONE_14 15
+#define PLUS_ONE_15 16
+#define PLUS_ONE(x) PLUS_ONE_ ## x
+#define _CONCAT3(a,b,c) a ## b ## c
+#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
+
+#define STORE_UNALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lwl T1, (x*4)+MSB ## (IN); \
+ lwr T1, (x*4)+LSB ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ swl X ## x, (x*4)+MSB ## (OUT); \
+ swr X ## x, (x*4)+LSB ## (OUT);
+
+#define STORE_ALIGNED(x) \
+CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
+ .if (x != 12); \
+ lw T0, (x*4)(STATE); \
+ .endif; \
+ lw T1, (x*4) ## (IN); \
+ .if (x == 12); \
+ addu X ## x, NONCE_0; \
+ .else; \
+ addu X ## x, T0; \
+ .endif; \
+ CPU_TO_LE32(X ## x); \
+ xor X ## x, T1; \
+ sw X ## x, (x*4) ## (OUT);
+
+/* Jump table macro.
+ * Used for setup and for handling the last bytes, which are not a multiple of 4.
+ * X15 is free to store Xn.
+ * Every jump table entry must be equal in size.
+ */
+#define JMPTBL_ALIGNED(x) \
+.Lchacha_mips_jmptbl_aligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_aligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define JMPTBL_UNALIGNED(x) \
+.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
+ .set noreorder; \
+ b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
+ .if (x == 12); \
+ addu SAVED_X, X ## x, NONCE_0; \
+ .else; \
+ addu SAVED_X, X ## x, SAVED_CA; \
+ .endif; \
+ .set reorder
+
+#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
+ addu X(A), X(K); \
+ addu X(B), X(L); \
+ addu X(C), X(M); \
+ addu X(D), X(N); \
+ xor X(V), X(A); \
+ xor X(W), X(B); \
+ xor X(Y), X(C); \
+ xor X(Z), X(D); \
+ rotl X(V), S; \
+ rotl X(W), S; \
+ rotl X(Y), S; \
+ rotl X(Z), S;
+
+.text
+.set reorder
+.set noat
+.globl chacha_crypt_arch
+.ent chacha_crypt_arch
+chacha_crypt_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ /* Load number of rounds */
+ lw $at, 16($sp)
+
+ addiu $sp, -STACK_SIZE
+
+ /* Return if bytes == 0. */
+ beqz BYTES, .Lchacha_mips_end
+
+ lw NONCE_0, 48(STATE)
+
+ /* Save s0-s7 */
+ sw $s0, 0($sp)
+ sw $s1, 4($sp)
+ sw $s2, 8($sp)
+ sw $s3, 12($sp)
+ sw $s4, 16($sp)
+ sw $s5, 20($sp)
+ sw $s6, 24($sp)
+ sw $s7, 28($sp)
+
+ /* Test whether IN or OUT is unaligned.
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
+ */
+ or IS_UNALIGNED, IN, OUT
+ andi IS_UNALIGNED, 0x3
+
+ b .Lchacha_rounds_start
+
+.align 4
+.Loop_chacha_rounds:
+ addiu IN, CHACHA20_BLOCK_SIZE
+ addiu OUT, CHACHA20_BLOCK_SIZE
+ addiu NONCE_0, 1
+
+.Lchacha_rounds_start:
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+
+ move X12, NONCE_0
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_chacha_xor_rounds:
+ addiu $at, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $at, .Loop_chacha_xor_rounds
+
+ addiu BYTES, -(CHACHA20_BLOCK_SIZE)
+
+ /* Is data src/dst unaligned? Jump */
+ bnez IS_UNALIGNED, .Loop_chacha_unaligned
+
+ /* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+ /* BYTES < 0, it has no full block. */
+ bltz BYTES, .Lchacha_mips_no_full_block_aligned
+
+ FOR_EACH_WORD_REV(STORE_ALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+
+ /* BYTES < 0? Handle last bytes */
+ bltz BYTES, .Lchacha_mips_xor_bytes
+
+.Lchacha_mips_xor_done:
+ /* Restore used registers */
+ lw $s0, 0($sp)
+ lw $s1, 4($sp)
+ lw $s2, 8($sp)
+ lw $s3, 12($sp)
+ lw $s4, 16($sp)
+ lw $s5, 20($sp)
+ lw $s6, 24($sp)
+ lw $s7, 28($sp)
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+.Lchacha_mips_end:
+ addiu $sp, STACK_SIZE
+ jr $ra
+
+.Lchacha_mips_no_full_block_aligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_ALIGNED)
+
+
+.Loop_chacha_unaligned:
+ /* Set the number of rounds here to fill the delay slot. */
+ lw $at, (STACK_SIZE+16)($sp)
+
+ /* BYTES < 0, it has no full block. */
+ bltz BYTES, .Lchacha_mips_no_full_block_unaligned
+
+ FOR_EACH_WORD_REV(STORE_UNALIGNED)
+
+ /* BYTES > 0? Loop again. */
+ bgtz BYTES, .Loop_chacha_rounds
+
+ /* Write NONCE_0 back to right location in state */
+ sw NONCE_0, 48(STATE)
+
+ .set noreorder
+ /* Fall through to byte handling */
+ bgez BYTES, .Lchacha_mips_xor_done
+.Lchacha_mips_xor_unaligned_0_b:
+.Lchacha_mips_xor_aligned_0_b:
+ /* Place this here to fill delay slot */
+ addiu NONCE_0, 1
+ .set reorder
+
+.Lchacha_mips_xor_bytes:
+ addu IN, $at
+ addu OUT, $at
+ /* First byte */
+ lbu T1, 0(IN)
+ addiu $at, BYTES, 1
+ CPU_TO_LE32(SAVED_X)
+ ROTR(SAVED_X)
+ xor T1, SAVED_X
+ sb T1, 0(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Second byte */
+ lbu T1, 1(IN)
+ addiu $at, BYTES, 2
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 1(OUT)
+ beqz $at, .Lchacha_mips_xor_done
+ /* Third byte */
+ lbu T1, 2(IN)
+ ROTx SAVED_X, 8
+ xor T1, SAVED_X
+ sb T1, 2(OUT)
+ b .Lchacha_mips_xor_done
+
+.Lchacha_mips_no_full_block_unaligned:
+ /* Restore the offset on BYTES */
+ addiu BYTES, CHACHA20_BLOCK_SIZE
+
+ /* Get number of full WORDS */
+ andi $at, BYTES, MASK_U32
+
+ /* Load upper half of jump table addr */
+ lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Calculate lower half jump table offset */
+ ins T0, $at, 1, 6
+
+ /* Add offset to STATE */
+ addu T1, STATE, $at
+
+ /* Add lower half jump table addr */
+ addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
+
+ /* Read value from STATE */
+ lw SAVED_CA, 0(T1)
+
+ /* Store remaining bytecounter as negative value */
+ subu BYTES, $at, BYTES
+
+ jr T0
+
+ /* Jump table */
+ FOR_EACH_WORD(JMPTBL_UNALIGNED)
+.end chacha_crypt_arch
+.set at
+
+/* Input arguments
+ * STATE $a0
+ * OUT $a1
+ * NROUND $a2
+ */
+
+#undef X12
+#undef X13
+#undef X14
+#undef X15
+
+#define X12 $a3
+#define X13 $at
+#define X14 $v0
+#define X15 STATE
+
+.set noat
+.globl hchacha_block_arch
+.ent hchacha_block_arch
+hchacha_block_arch:
+ .frame $sp, STACK_SIZE, $ra
+
+ addiu $sp, -STACK_SIZE
+
+ /* Save X11(s6) */
+ sw X11, 0($sp)
+
+ lw X0, 0(STATE)
+ lw X1, 4(STATE)
+ lw X2, 8(STATE)
+ lw X3, 12(STATE)
+ lw X4, 16(STATE)
+ lw X5, 20(STATE)
+ lw X6, 24(STATE)
+ lw X7, 28(STATE)
+ lw X8, 32(STATE)
+ lw X9, 36(STATE)
+ lw X10, 40(STATE)
+ lw X11, 44(STATE)
+ lw X12, 48(STATE)
+ lw X13, 52(STATE)
+ lw X14, 56(STATE)
+ lw X15, 60(STATE)
+
+.Loop_hchacha_xor_rounds:
+ addiu $a2, -2
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
+ AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
+ AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
+ AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
+ AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
+ bnez $a2, .Loop_hchacha_xor_rounds
+
+ /* Restore used register */
+ lw X11, 0($sp)
+
+ sw X0, 0(OUT)
+ sw X1, 4(OUT)
+ sw X2, 8(OUT)
+ sw X3, 12(OUT)
+ sw X12, 16(OUT)
+ sw X13, 20(OUT)
+ sw X14, 24(OUT)
+ sw X15, 28(OUT)
+
+ addiu $sp, STACK_SIZE
+ jr $ra
+.end hchacha_block_arch
+.set at
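/*
 * Aside, not part of the patch: what a single AXR() above does, in C. Each
 * invocation advances four ChaCha quarter rounds in lockstep by one
 * add/xor/rotate step, so eight AXR()s form one double round.
 */
#include <stdint.h>

static inline uint32_t rol32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

static void axr_step(uint32_t x[16], const int dst[4], const int src[4],
                     const int rot[4], int s)
{
        for (int i = 0; i < 4; i++) {
                x[dst[i]] += x[src[i]];                      /* addu X(A), X(K) */
                x[rot[i]] = rol32(x[rot[i]] ^ x[dst[i]], s); /* xor, then rotl */
        }
}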
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -334,7 +334,7 @@ libs-$(CONFIG_MIPS_FP_SUPPORT) += arch/mips/math-emu/
# See arch/mips/Kbuild for content of core part of the kernel
core-y += arch/mips/
-drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
+drivers-y += arch/mips/crypto/
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
# suspend and hibernation support
--- b/arch/mips/crypto/Makefile
+++ b/arch/mips/crypto/Makefile
@@ -4,3 +4,21 @@
#
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
+
+obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+chacha-mips-y := chacha-core.o chacha-glue.o
+AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+
+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
+poly1305-mips-y := poly1305-core.o poly1305-glue.o
+
+perlasm-flavour-$(CONFIG_32BIT) := o32
+perlasm-flavour-$(CONFIG_64BIT) := 64
+
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
+
+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
+ $(call if_changed,perlasm)
+
+targets += poly1305-core.S
--- b/arch/mips/crypto/chacha-glue.c
+++ b/arch/mips/crypto/chacha-glue.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MIPS accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/byteorder.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds);
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
+{
+ chacha_init_generic(state, key, iv);
+}
+EXPORT_SYMBOL(chacha_init_arch);
+
+static int chacha_mips_stream_xor(struct skcipher_request *req,
+ const struct chacha_ctx *ctx, const u8 *iv)
+{
+ struct skcipher_walk walk;
+ u32 state[16];
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+
+ chacha_init_generic(state, ctx->key, iv);
+
+ while (walk.nbytes > 0) {
+ unsigned int nbytes = walk.nbytes;
+
+ if (nbytes < walk.total)
+ nbytes = round_down(nbytes, walk.stride);
+
+ chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes, ctx->nrounds);
+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_mips(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ return chacha_mips_stream_xor(req, ctx, req->iv);
+}
+
+static int xchacha_mips(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct chacha_ctx subctx;
+ u32 state[16];
+ u8 real_iv[16];
+
+ chacha_init_generic(state, ctx->key, req->iv);
+
+ hchacha_block(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ return chacha_mips_stream_xor(req, &subctx, real_iv);
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = chacha_mips,
+ .decrypt = chacha_mips,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha20_setkey,
+ .encrypt = xchacha_mips,
+ .decrypt = xchacha_mips,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = chacha12_setkey,
+ .encrypt = xchacha_mips,
+ .decrypt = xchacha_mips,
+ }
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
+ crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
+MODULE_AUTHOR("Ard Biesheuvel ");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-mips");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-mips");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-mips");
--- b/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -1,131 +1,173 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
 #include <crypto/internal/simd.h>
-#include <crypto/poly1305.h>
 #include <linux/crypto.h>
+#include <linux/jump_label.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <asm/intel-family.h>
 #include <asm/simd.h>
-struct poly1305_simd_desc_ctx {
- struct poly1305_desc_ctx base;
- /* derived key u set? */
- bool uset;
-#ifdef CONFIG_AS_AVX2
- /* derived keys r^3, r^4 set? */
- bool wset;
-#endif
- /* derived Poly1305 key r^2 */
- u32 u[5];
- /* ... silently appended r^3 and r^4 when using AVX2 */
+asmlinkage void poly1305_init_x86_64(void *ctx,
+ const u8 key[POLY1305_BLOCK_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+struct poly1305_arch_internal {
+ union {
+ struct {
+ u32 h[5];
+ u32 is_base2_26;
+ };
+ u64 hs[3];
+ };
+ u64 r[2];
+ u64 pad;
+ struct { u32 r2, r1, r4, r3; } rn[9];
};
-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
- const u32 *r, unsigned int blocks);
-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
- unsigned int blocks, const u32 *u);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
- unsigned int blocks, const u32 *u);
-static bool poly1305_use_avx2;
-#endif
-
-static int poly1305_simd_init(struct shash_desc *desc)
+/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
{
- struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc);
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
- sctx->uset = false;
-#ifdef CONFIG_AS_AVX2
- sctx->wset = false;
-#endif
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
+}
- return crypto_poly1305_init(desc);
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
+{
+ poly1305_init_x86_64(ctx, key);
}
-static void poly1305_simd_mult(u32 *a, const u32 *b)
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+ const u32 padbit)
{
- u8 m[POLY1305_BLOCK_SIZE];
+ struct poly1305_arch_internal *state = ctx;
- memset(m, 0, sizeof(m));
- /* The poly1305 block function adds a hi-bit to the accumulator which
- * we don't need for key multiplication; compensate for it. */
- a[4] -= 1 << 24;
- poly1305_block_sse2(a, m, b, 1);
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K < POLY1305_BLOCK_SIZE ||
+ SZ_4K % POLY1305_BLOCK_SIZE);
+
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+ (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+ !crypto_simd_usable()) {
+ convert_to_base2_64(ctx);
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return;
+ }
+
+ do {
+ const size_t bytes = min_t(size_t, len, SZ_4K);
+
+ kernel_fpu_begin();
+ if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
+ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
+ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ else
+ poly1305_blocks_avx(ctx, inp, bytes, padbit);
+ kernel_fpu_end();
+
+ len -= bytes;
+ inp += bytes;
+ } while (len);
}
-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
+static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4])
{
- struct poly1305_simd_desc_ctx *sctx;
- unsigned int blocks, datalen;
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ else
+ poly1305_emit_avx(ctx, mac, nonce);
+}
- BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base));
- sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base);
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_simd_init(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(&key[16]);
+ dctx->s[1] = get_unaligned_le32(&key[20]);
+ dctx->s[2] = get_unaligned_le32(&key[24]);
+ dctx->s[3] = get_unaligned_le32(&key[28]);
+ dctx->buflen = 0;
+ dctx->sset = true;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+ const u8 *inp, unsigned int len)
+{
+ unsigned int acc = 0;
if (unlikely(!dctx->sset)) {
- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
- src += srclen - datalen;
- srclen = datalen;
- }
-
-#ifdef CONFIG_AS_AVX2
- if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
- if (unlikely(!sctx->wset)) {
- if (!sctx->uset) {
- memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r.r);
- sctx->uset = true;
- }
- memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 5, dctx->r.r);
- memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 10, dctx->r.r);
- sctx->wset = true;
+ if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+ poly1305_simd_init(&dctx->h, inp);
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
}
- blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
- sctx->u);
- src += POLY1305_BLOCK_SIZE * 4 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
- }
-#endif
- if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
- if (unlikely(!sctx->uset)) {
- memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r.r);
- sctx->uset = true;
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(&inp[0]);
+ dctx->s[1] = get_unaligned_le32(&inp[4]);
+ dctx->s[2] = get_unaligned_le32(&inp[8]);
+ dctx->s[3] = get_unaligned_le32(&inp[12]);
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
}
- blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
- sctx->u);
- src += POLY1305_BLOCK_SIZE * 2 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
- }
- if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
- srclen -= POLY1305_BLOCK_SIZE;
}
- return srclen;
+ return acc;
}
-static int poly1305_simd_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int srclen)
{
- struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- unsigned int bytes;
-
- /* kernel_fpu_begin/end is costly, use fallback for small updates */
- if (srclen <= 288 || !crypto_simd_usable())
- return crypto_poly1305_update(desc, src, srclen);
-
- kernel_fpu_begin();
+ unsigned int bytes, used;
if (unlikely(dctx->buflen)) {
bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
@@ -135,34 +177,76 @@
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- poly1305_simd_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE);
+ if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
+ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- bytes = poly1305_simd_blocks(dctx, src, srclen);
- src += srclen - bytes;
- srclen = bytes;
+ bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+ srclen -= bytes;
+ used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+ if (likely(bytes - used))
+ poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
+ src += bytes;
}
- kernel_fpu_end();
-
if (unlikely(srclen)) {
dctx->buflen = srclen;
memcpy(dctx->buf, src, srclen);
}
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_simd_emit(&dctx->h, dst, dctx->s);
+ *dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int crypto_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ *dctx = (struct poly1305_desc_ctx){};
+ return 0;
+}
+
+static int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ poly1305_update_arch(dctx, src, srclen);
+ return 0;
+}
+
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg alg = {
.digestsize = POLY1305_DIGEST_SIZE,
- .init = poly1305_simd_init,
- .update = poly1305_simd_update,
+ .init = crypto_poly1305_init,
+ .update = crypto_poly1305_update,
.final = crypto_poly1305_final,
- .descsize = sizeof(struct poly1305_simd_desc_ctx),
+ .descsize = sizeof(struct poly1305_desc_ctx),
.base = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
@@ -174,30 +258,33 @@
static int __init poly1305_simd_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_XMM2))
- return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
- poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
- alg.descsize = sizeof(struct poly1305_simd_desc_ctx);
- if (poly1305_use_avx2)
- alg.descsize += 10 * sizeof(u32);
-#endif
- return crypto_register_shash(&alg);
+ if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx);
+ if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx2);
+ if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+ static_branch_enable(&poly1305_use_avx512);
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
static void __exit poly1305_simd_mod_exit(void)
{
- crypto_unregister_shash(&alg);
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ crypto_unregister_shash(&alg);
}
module_init(poly1305_simd_mod_init);
module_exit(poly1305_simd_mod_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi ");
+MODULE_AUTHOR("Jason A. Donenfeld ");
MODULE_DESCRIPTION("Poly1305 authenticator");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-simd");
--- b/crypto/adiantum.c
+++ b/crypto/adiantum.c
@@ -33,6 +33,7 @@
 #include <crypto/b128ops.h>
 #include <crypto/chacha.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/nhpoly1305.h>
 #include <crypto/scatterwalk.h>
@@ -71,7 +72,7 @@
struct crypto_skcipher *streamcipher;
struct crypto_cipher *blockcipher;
struct crypto_shash *hash;
- struct poly1305_key header_hash_key;
+ struct poly1305_core_key header_hash_key;
};
struct adiantum_request_ctx {
@@ -242,13 +243,13 @@
BUILD_BUG_ON(sizeof(header) % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key,
- &header, sizeof(header) / POLY1305_BLOCK_SIZE);
+ &header, sizeof(header) / POLY1305_BLOCK_SIZE, 1);
BUILD_BUG_ON(TWEAK_SIZE % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state, &tctx->header_hash_key, req->iv,
- TWEAK_SIZE / POLY1305_BLOCK_SIZE);
+ TWEAK_SIZE / POLY1305_BLOCK_SIZE, 1);
- poly1305_core_emit(&state, &rctx->header_hash);
+ poly1305_core_emit(&state, NULL, &rctx->header_hash);
}
/* Hash the left-hand part (the "bulk") of the message using NHPoly1305 */
--- b/crypto/nhpoly1305.c
+++ b/crypto/nhpoly1305.c
@@ -33,6 +33,7 @@
 #include <asm/unaligned.h>
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
 #include <crypto/nhpoly1305.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
@@ -78,7 +79,7 @@
BUILD_BUG_ON(NH_HASH_BYTES % POLY1305_BLOCK_SIZE != 0);
poly1305_core_blocks(&state->poly_state, &key->poly_key, state->nh_hash,
- NH_HASH_BYTES / POLY1305_BLOCK_SIZE);
+ NH_HASH_BYTES / POLY1305_BLOCK_SIZE, 1);
}
/*
@@ -209,7 +210,7 @@
if (state->nh_remaining)
process_nh_hash_value(state, key);
- poly1305_core_emit(&state->poly_state, dst);
+ poly1305_core_emit(&state->poly_state, NULL, dst);
return 0;
}
EXPORT_SYMBOL(crypto_nhpoly1305_final_helper);
--- b/crypto/poly1305_generic.c
+++ b/crypto/poly1305_generic.c
@@ -13,65 +13,33 @@
 #include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
-#include <crypto/poly1305.h>
+#include <crypto/internal/poly1305.h>
 #include <linux/crypto.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <asm/unaligned.h>
-static inline u64 mlt(u64 a, u64 b)
-{
- return a * b;
-}
-
-static inline u32 sr(u64 v, u_char n)
-{
- return v >> n;
-}
-
-static inline u32 and(u32 v, u32 mask)
-{
- return v & mask;
-}
-
-int crypto_poly1305_init(struct shash_desc *desc)
+static int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
poly1305_core_init(&dctx->h);
dctx->buflen = 0;
- dctx->rset = false;
+ dctx->rset = 0;
dctx->sset = false;
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_init);
-
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key)
-{
- /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
- key->r[0] = (get_unaligned_le32(raw_key + 0) >> 0) & 0x3ffffff;
- key->r[1] = (get_unaligned_le32(raw_key + 3) >> 2) & 0x3ffff03;
- key->r[2] = (get_unaligned_le32(raw_key + 6) >> 4) & 0x3ffc0ff;
- key->r[3] = (get_unaligned_le32(raw_key + 9) >> 6) & 0x3f03fff;
- key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
-}
-EXPORT_SYMBOL_GPL(poly1305_core_setkey);
-/*
- * Poly1305 requires a unique key for each tag, which implies that we can't set
- * it on the tfm that gets accessed by multiple users simultaneously. Instead we
- * expect the key as the first 32 bytes in the update() call.
- */
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
+static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
+ const u8 *src, unsigned int srclen)
{
if (!dctx->sset) {
if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_core_setkey(&dctx->r, src);
+ poly1305_core_setkey(&dctx->core_r, src);
src += POLY1305_BLOCK_SIZE;
srclen -= POLY1305_BLOCK_SIZE;
- dctx->rset = true;
+ dctx->rset = 2;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
@@ -85,86 +53,9 @@
}
return srclen;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_setdesckey);
-static void poly1305_blocks_internal(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks,
- u32 hibit)
-{
- u32 r0, r1, r2, r3, r4;
- u32 s1, s2, s3, s4;
- u32 h0, h1, h2, h3, h4;
- u64 d0, d1, d2, d3, d4;
-
- if (!nblocks)
- return;
-
- r0 = key->r[0];
- r1 = key->r[1];
- r2 = key->r[2];
- r3 = key->r[3];
- r4 = key->r[4];
-
- s1 = r1 * 5;
- s2 = r2 * 5;
- s3 = r3 * 5;
- s4 = r4 * 5;
-
- h0 = state->h[0];
- h1 = state->h[1];
- h2 = state->h[2];
- h3 = state->h[3];
- h4 = state->h[4];
-
- do {
- /* h += m[i] */
- h0 += (get_unaligned_le32(src + 0) >> 0) & 0x3ffffff;
- h1 += (get_unaligned_le32(src + 3) >> 2) & 0x3ffffff;
- h2 += (get_unaligned_le32(src + 6) >> 4) & 0x3ffffff;
- h3 += (get_unaligned_le32(src + 9) >> 6) & 0x3ffffff;
- h4 += (get_unaligned_le32(src + 12) >> 8) | hibit;
-
- /* h *= r */
- d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
- mlt(h3, s2) + mlt(h4, s1);
- d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
- mlt(h3, s3) + mlt(h4, s2);
- d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
- mlt(h3, s4) + mlt(h4, s3);
- d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
- mlt(h3, r0) + mlt(h4, s4);
- d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
- mlt(h3, r1) + mlt(h4, r0);
-
- /* (partial) h %= p */
- d1 += sr(d0, 26); h0 = and(d0, 0x3ffffff);
- d2 += sr(d1, 26); h1 = and(d1, 0x3ffffff);
- d3 += sr(d2, 26); h2 = and(d2, 0x3ffffff);
- d4 += sr(d3, 26); h3 = and(d3, 0x3ffffff);
- h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
- h1 += h0 >> 26; h0 = h0 & 0x3ffffff;
-
- src += POLY1305_BLOCK_SIZE;
- } while (--nblocks);
-
- state->h[0] = h0;
- state->h[1] = h1;
- state->h[2] = h2;
- state->h[3] = h3;
- state->h[4] = h4;
-}
-
-void poly1305_core_blocks(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks)
-{
- poly1305_blocks_internal(state, key, src, nblocks, 1 << 24);
-}
-EXPORT_SYMBOL_GPL(poly1305_core_blocks);
-
-static void poly1305_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen, u32 hibit)
+static void poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int srclen)
{
unsigned int datalen;
@@ -174,12 +65,12 @@
srclen = datalen;
}
- poly1305_blocks_internal(&dctx->h, &dctx->r,
- src, srclen / POLY1305_BLOCK_SIZE, hibit);
+ poly1305_core_blocks(&dctx->h, &dctx->core_r, src,
+ srclen / POLY1305_BLOCK_SIZE, 1);
}
-int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
+static int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
unsigned int bytes;
@@ -193,13 +84,13 @@
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE, 1 << 24);
+ POLY1305_BLOCK_SIZE);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- poly1305_blocks(dctx, src, srclen, 1 << 24);
+ poly1305_blocks(dctx, src, srclen);
src += srclen - (srclen % POLY1305_BLOCK_SIZE);
srclen %= POLY1305_BLOCK_SIZE;
}
@@ -211,87 +102,17 @@
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_update);
-
-void poly1305_core_emit(const struct poly1305_state *state, void *dst)
-{
- u32 h0, h1, h2, h3, h4;
- u32 g0, g1, g2, g3, g4;
- u32 mask;
-
- /* fully carry h */
- h0 = state->h[0];
- h1 = state->h[1];
- h2 = state->h[2];
- h3 = state->h[3];
- h4 = state->h[4];
-
- h2 += (h1 >> 26); h1 = h1 & 0x3ffffff;
- h3 += (h2 >> 26); h2 = h2 & 0x3ffffff;
- h4 += (h3 >> 26); h3 = h3 & 0x3ffffff;
- h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
- h1 += (h0 >> 26); h0 = h0 & 0x3ffffff;
-
- /* compute h + -p */
- g0 = h0 + 5;
- g1 = h1 + (g0 >> 26); g0 &= 0x3ffffff;
- g2 = h2 + (g1 >> 26); g1 &= 0x3ffffff;
- g3 = h3 + (g2 >> 26); g2 &= 0x3ffffff;
- g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
-
- /* select h if h < p, or h + -p if h >= p */
- mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
- g0 &= mask;
- g1 &= mask;
- g2 &= mask;
- g3 &= mask;
- g4 &= mask;
- mask = ~mask;
- h0 = (h0 & mask) | g0;
- h1 = (h1 & mask) | g1;
- h2 = (h2 & mask) | g2;
- h3 = (h3 & mask) | g3;
- h4 = (h4 & mask) | g4;
-
- /* h = h % (2^128) */
- put_unaligned_le32((h0 >> 0) | (h1 << 26), dst + 0);
- put_unaligned_le32((h1 >> 6) | (h2 << 20), dst + 4);
- put_unaligned_le32((h2 >> 12) | (h3 << 14), dst + 8);
- put_unaligned_le32((h3 >> 18) | (h4 << 8), dst + 12);
-}
-EXPORT_SYMBOL_GPL(poly1305_core_emit);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- __le32 digest[4];
- u64 f = 0;
if (unlikely(!dctx->sset))
return -ENOKEY;
- if (unlikely(dctx->buflen)) {
- dctx->buf[dctx->buflen++] = 1;
- memset(dctx->buf + dctx->buflen, 0,
- POLY1305_BLOCK_SIZE - dctx->buflen);
- poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 0);
- }
-
- poly1305_core_emit(&dctx->h, digest);
-
- /* mac = (h + s) % (2^128) */
- f = (f >> 32) + le32_to_cpu(digest[0]) + dctx->s[0];
- put_unaligned_le32(f, dst + 0);
- f = (f >> 32) + le32_to_cpu(digest[1]) + dctx->s[1];
- put_unaligned_le32(f, dst + 4);
- f = (f >> 32) + le32_to_cpu(digest[2]) + dctx->s[2];
- put_unaligned_le32(f, dst + 8);
- f = (f >> 32) + le32_to_cpu(digest[3]) + dctx->s[3];
- put_unaligned_le32(f, dst + 12);
-
+ poly1305_final_generic(dctx, dst);
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_poly1305_final);
static struct shash_alg poly1305_alg = {
.digestsize = POLY1305_DIGEST_SIZE,
--- b/include/crypto/internal/poly1305.h
+++ b/include/crypto/internal/poly1305.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common values for the Poly1305 algorithm
+ */
+
+#ifndef _CRYPTO_INTERNAL_POLY1305_H
+#define _CRYPTO_INTERNAL_POLY1305_H
+
+#include <asm/unaligned.h>
+#include <linux/types.h>
+#include <crypto/poly1305.h>
+
+/*
+ * Poly1305 core functions. These only accept whole blocks; the caller must
+ * handle any needed block buffering and padding. 'hibit' must be 1 for any
+ * full blocks, or 0 for the final block if it had to be padded. If 'nonce' is
+ * non-NULL, then it's added at the end to compute the Poly1305 MAC. Otherwise,
+ * only the ε-almost-∆-universal hash function (not the full MAC) is computed.
+ */
+
+void poly1305_core_setkey(struct poly1305_core_key *key,
+ const u8 raw_key[POLY1305_BLOCK_SIZE]);
+static inline void poly1305_core_init(struct poly1305_state *state)
+{
+ *state = (struct poly1305_state){};
+}
+
+void poly1305_core_blocks(struct poly1305_state *state,
+ const struct poly1305_core_key *key, const void *src,
+ unsigned int nblocks, u32 hibit);
+void poly1305_core_emit(const struct poly1305_state *state, const u32 nonce[4],
+ void *dst);
+
+#endif
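/*
 * Aside, not part of the patch: a sketch of driving the core interface
 * declared above (the helper name is hypothetical). With a NULL nonce,
 * poly1305_core_emit() yields only the ε-almost-∆-universal hash, which is
 * how Adiantum and NHPoly1305 use it; passing the nonce words produces the
 * full one-time MAC.
 */
#include <crypto/internal/poly1305.h>
#include <asm/unaligned.h>

static void poly1305_core_mac(const u8 raw_key[POLY1305_KEY_SIZE],
                              const u8 *msg, unsigned int nblocks,
                              u8 mac[POLY1305_DIGEST_SIZE])
{
        struct poly1305_core_key key;
        struct poly1305_state state;
        u32 nonce[4];
        int i;

        poly1305_core_setkey(&key, raw_key);  /* clamps r = raw_key[0..15] */
        poly1305_core_init(&state);
        /* whole blocks only, so hibit = 1 */
        poly1305_core_blocks(&state, &key, msg, nblocks, 1);

        for (i = 0; i < 4; i++)               /* s = raw_key[16..31] */
                nonce[i] = get_unaligned_le32(raw_key + 16 + 4 * i);
        poly1305_core_emit(&state, nonce, mac);
}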
--- b/include/crypto/poly1305.h
+++ b/include/crypto/poly1305.h
@@ -14,51 +14,86 @@
#define POLY1305_DIGEST_SIZE 16
+/* The poly1305_key and poly1305_state types are mostly opaque and
+ * implementation-defined. Limbs might be in base 2^64 or base 2^26, or
+ * different yet. The union type provided keeps these 64-bit aligned for the
+ * case in which this is implemented using 64x64 multiplies.
+ */
+
struct poly1305_key {
- u32 r[5]; /* key, base 2^26 */
+ union {
+ u32 r[5];
+ u64 r64[3];
+ };
+};
+
+struct poly1305_core_key {
+ struct poly1305_key key;
+ struct poly1305_key precomputed_s;
};
struct poly1305_state {
- u32 h[5]; /* accumulator, base 2^26 */
+ union {
+ u32 h[5];
+ u64 h64[3];
+ };
};
struct poly1305_desc_ctx {
- /* key */
- struct poly1305_key r;
- /* finalize key */
- u32 s[4];
- /* accumulator */
- struct poly1305_state h;
/* partial buffer */
u8 buf[POLY1305_BLOCK_SIZE];
/* bytes used in partial buffer */
unsigned int buflen;
- /* r key has been set */
- bool rset;
- /* s key has been set */
+ /* how many keys have been set in r[] */
+ unsigned short rset;
+ /* whether s[] has been set */
bool sset;
+ /* finalize key */
+ u32 s[4];
+ /* accumulator */
+ struct poly1305_state h;
+ /* key */
+ union {
+ struct poly1305_key opaque_r[CONFIG_CRYPTO_LIB_POLY1305_RSIZE];
+ struct poly1305_core_key core_r;
+ };
};
-/*
- * Poly1305 core functions. These implement the ε-almost-∆-universal hash
- * function underlying the Poly1305 MAC, i.e. they don't add an encrypted nonce
- * ("s key") at the end. They also only support block-aligned inputs.
- */
-void poly1305_core_setkey(struct poly1305_key *key, const u8 *raw_key);
-static inline void poly1305_core_init(struct poly1305_state *state)
+void poly1305_init_arch(struct poly1305_desc_ctx *desc,
+ const u8 key[POLY1305_KEY_SIZE]);
+void poly1305_init_generic(struct poly1305_desc_ctx *desc,
+ const u8 key[POLY1305_KEY_SIZE]);
+
+static inline void poly1305_init(struct poly1305_desc_ctx *desc, const u8 *key)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_init_arch(desc, key);
+ else
+ poly1305_init_generic(desc, key);
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes);
+void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes);
+
+static inline void poly1305_update(struct poly1305_desc_ctx *desc,
+ const u8 *src, unsigned int nbytes)
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_update_arch(desc, src, nbytes);
+ else
+ poly1305_update_generic(desc, src, nbytes);
+}
+
+void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest);
+void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *digest);
+
+static inline void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest)
{
- memset(state->h, 0, sizeof(state->h));
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305))
+ poly1305_final_arch(desc, digest);
+ else
+ poly1305_final_generic(desc, digest);
}
-void poly1305_core_blocks(struct poly1305_state *state,
- const struct poly1305_key *key,
- const void *src, unsigned int nblocks);
-void poly1305_core_emit(const struct poly1305_state *state, void *dst);
-
-/* Crypto API helper functions for the Poly1305 MAC */
-int crypto_poly1305_init(struct shash_desc *desc);
-unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen);
-int crypto_poly1305_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen);
-int crypto_poly1305_final(struct shash_desc *desc, u8 *dst);
#endif
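A hedged usage sketch for the library-facing API declared above (the caller below is hypothetical; kernel context assumed). The inline wrappers select poly1305_*_arch() when CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305 is enabled and poly1305_*_generic() otherwise, so a caller is written once against three functions:

/* Hypothetical caller: one-shot tag over a message. */
static void tag_message(const u8 key[POLY1305_KEY_SIZE], const u8 *msg,
			unsigned int len, u8 tag[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx desc;

	poly1305_init(&desc, key);	/* arch or generic, per config */
	poly1305_update(&desc, msg, len);
	poly1305_final(&desc, tag);	/* implementations wipe the ctx */
}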
--- b/lib/crypto/poly1305.c
+++ b/lib/crypto/poly1305.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Poly1305 authenticator algorithm, RFC7539
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * Based on public domain code by Andrew Moon and Daniel J. Bernstein.
+ */
+
+#include <crypto/internal/poly1305.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/unaligned.h>
+
+void poly1305_init_generic(struct poly1305_desc_ctx *desc,
+ const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_core_setkey(&desc->core_r, key);
+ desc->s[0] = get_unaligned_le32(key + 16);
+ desc->s[1] = get_unaligned_le32(key + 20);
+ desc->s[2] = get_unaligned_le32(key + 24);
+ desc->s[3] = get_unaligned_le32(key + 28);
+ poly1305_core_init(&desc->h);
+ desc->buflen = 0;
+ desc->sset = true;
+ desc->rset = 2;
+}
+EXPORT_SYMBOL_GPL(poly1305_init_generic);
+
+void poly1305_update_generic(struct poly1305_desc_ctx *desc, const u8 *src,
+ unsigned int nbytes)
+{
+ unsigned int bytes;
+
+ if (unlikely(desc->buflen)) {
+ bytes = min(nbytes, POLY1305_BLOCK_SIZE - desc->buflen);
+ memcpy(desc->buf + desc->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ desc->buflen += bytes;
+
+ if (desc->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf,
+ 1, 1);
+ desc->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ poly1305_core_blocks(&desc->h, &desc->core_r, src,
+ nbytes / POLY1305_BLOCK_SIZE, 1);
+ src += nbytes - (nbytes % POLY1305_BLOCK_SIZE);
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ desc->buflen = nbytes;
+ memcpy(desc->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL_GPL(poly1305_update_generic);
+
+void poly1305_final_generic(struct poly1305_desc_ctx *desc, u8 *dst)
+{
+ if (unlikely(desc->buflen)) {
+ desc->buf[desc->buflen++] = 1;
+ memset(desc->buf + desc->buflen, 0,
+ POLY1305_BLOCK_SIZE - desc->buflen);
+ poly1305_core_blocks(&desc->h, &desc->core_r, desc->buf, 1, 0);
+ }
+
+ poly1305_core_emit(&desc->h, desc->s, dst);
+ *desc = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL_GPL(poly1305_final_generic);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi ");
--- a/arch/arm64/crypto/Makefile
+++ b/arch/arm64/crypto/Makefile
@@ -50,6 +50,10 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
+
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -68,11 +72,15 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
+ $(call cmd,perlasm)
+
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
$(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
+
endif
-clean-files += sha256-core.S sha512-core.S
+clean-files += poly1305-core.S sha256-core.S sha512-core.S
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-armv8.pl
@@ -0,0 +1,913 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc-4.9 NEON
+#
+# Apple A7 1.86/+5% 0.72
+# Cortex-A53 2.69/+58% 1.47
+# Cortex-A57 2.70/+7% 1.14
+# Denver 1.64/+50% 1.18(*)
+# X-Gene 2.13/+68% 2.27
+# Mongoose 1.77/+75% 1.12
+# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
+#
+# (*) estimate based on resources availability is less than 1.0,
+# i.e. measured result is worse than expected, presumably binary
+# translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp $r0,$r1,[$inp] // load key
+ mov $s1,#0xfffffffc0fffffff
+ movk $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev $r0,$r0 // flip bytes
+ rev $r1,$r1
+#endif
+ and $r0,$r0,$s1 // &=0ffffffc0fffffff
+ and $s1,$s1,#-4
+ and $r1,$r1,$s1 // &=0ffffffc0ffffffc
+ mov w#$s1,#-1
+ stp $r0,$r1,[$ctx,#32] // save key value
+ str w#$s1,[$ctx,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr $d0,.Lpoly1305_blocks
+ adr $r0,.Lpoly1305_blocks_neon
+ adr $d1,.Lpoly1305_emit
+
+ csel $d0,$d0,$r0,eq
+
+# ifdef __ILP32__
+ stp w#$d0,w#$d1,[$len]
+# else
+ stp $d0,$d1,[$len]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+ ldp $h0,$h1,[$ctx] // load hash value
+ ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+.Loop:
+ ldp $t0,$t1,[$inp],#16 // load input
+ sub $len,$len,#16
+#ifdef __AARCH64EB__
+ rev $t0,$t0
+ rev $t1,$t1
+#endif
+ adds $h0,$h0,$t0 // accumulate input
+ adcs $h1,$h1,$t1
+
+ mul $d0,$h0,$r0 // h0*r0
+ adc $h2,$h2,$padbit
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ cbnz $len,.Loop
+
+ stp $h0,$h1,[$ctx] // store hash value
+ stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
+ ldp $t0,$t1,[$nonce] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr $d0,$h0,#32
+ mov w#$d1,w#$h0
+ lsr $d2,$h1,#32
+ mov w15,w#$h1
+ lsr x16,$h2,#32
+#else
+ mov w#$d0,w#$h0
+ lsr $d1,$h0,#32
+ mov w#$d2,w#$h1
+ lsr x15,$h1,#32
+ mov w16,w#$h2
+#endif
+
+ add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
+ lsr $d1,$d2,#12
+ adds $d0,$d0,$d2,lsl#52
+ add $d1,$d1,x15,lsl#14
+ adc $d1,$d1,xzr
+ lsr $d2,x16,#24
+ adds $d1,$d1,x16,lsl#40
+ adc $d2,$d2,xzr
+
+ cmp $r0,#0 // is_base2_26?
+ csel $h0,$h0,$d0,eq // choose between radixes
+ csel $h1,$h1,$d1,eq
+ csel $h2,$h2,$d2,eq
+
+ adds $d0,$h0,#5 // compare to modulus
+ adcs $d1,$h1,xzr
+ adc $d2,$h2,xzr
+
+ tst $d2,#-4 // see if it's carried/borrowed
+
+ csel $h0,$h0,$d0,eq
+ csel $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+ ror $t0,$t0,#32 // flip nonce words
+ ror $t1,$t1,#32
+#endif
+ adds $h0,$h0,$t0 // accumulate nonce
+ adc $h1,$h1,$t1
+#ifdef __AARCH64EB__
+ rev $h0,$h0 // flip output bytes
+ rev $h1,$h1
+#endif
+ stp $h0,$h1,[$mac] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros; # borrow
+
+$code.=<<___;
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul $d0,$h0,$r0 // h0*r0
+ umulh $d1,$h0,$r0
+
+ mul $t0,$h1,$s1 // h1*5*r1
+ umulh $t1,$h1,$s1
+
+ adds $d0,$d0,$t0
+ mul $t0,$h0,$r1 // h0*r1
+ adc $d1,$d1,$t1
+ umulh $d2,$h0,$r1
+
+ adds $d1,$d1,$t0
+ mul $t0,$h1,$r0 // h1*r0
+ adc $d2,$d2,xzr
+ umulh $t1,$h1,$r0
+
+ adds $d1,$d1,$t0
+ mul $t0,$h2,$s1 // h2*5*r1
+ adc $d2,$d2,$t1
+ mul $t1,$h2,$r0 // h2*r0
+
+ adds $d1,$d1,$t0
+ adc $d2,$d2,$t1
+
+ and $t0,$d2,#-4 // final reduction
+ and $h2,$d2,#3
+ add $t0,$t0,$d2,lsr#2
+ adds $h0,$d0,$t0
+ adcs $h1,$d1,xzr
+ adc $h2,$h2,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,$h0,#26,#26
+ extr x14,$h1,$h0,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,$h1,#14,#26
+ extr x16,$h2,$h1,#40
+
+ str w12,[$ctx,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[$ctx,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[$ctx,#16*2] // s1
+ str w14,[$ctx,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[$ctx,#16*4] // s2
+ str w15,[$ctx,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[$ctx,#16*6] // s3
+ str w16,[$ctx,#16*7] // r4
+ str w15,[$ctx,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz $is_base2_26,.Lbase2_64_neon
+
+ ldp w10,w11,[$ctx] // load hash value base 2^26
+ ldp w12,w13,[$ctx,#8]
+ ldr w14,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Leven_neon
+
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr $h1,x12,#12
+ adds $h0,$h0,x12,lsl#52
+ add $h1,$h1,x13,lsl#14
+ adc $h1,$h1,xzr
+ lsr $h2,x14,#24
+ adds $h1,$h1,x14,lsl#40
+ adc $d2,$h2,xzr // can be partially reduced...
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp $r0,$r1,[$ctx,#32] // load key value
+
+ ldp $h0,$h1,[$ctx] // load hash value base 2^64
+ ldr $h2,[$ctx,#16]
+
+ tst $len,#31
+ b.eq .Linit_neon
+
+ ldp $d0,$d1,[$inp],#16 // load input
+ sub $len,$len,#16
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev $d0,$d0
+ rev $d1,$d1
+#endif
+ adds $h0,$h0,$d0 // accumulate input
+ adcs $h1,$h1,$d1
+ adc $h2,$h2,$padbit
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[$ctx,#48] // first table element
+ and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,$h0,#26,#26
+ extr x12,$h1,$h0,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,$h1,#14,#26
+ extr x14,$h2,$h1,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+ ////////////////////////////////// initialize r^n table
+ mov $h0,$r0 // r^1
+ add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov $h1,$r1
+ mov $h2,xzr
+ add $ctx,$ctx,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub $ctx,$ctx,#4
+ bl poly1305_splat
+ sub $ctx,$ctx,#48 // restore original $ctx
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov ${H0},x10
+ fmov ${H1},x11
+ fmov ${H2},x12
+ fmov ${H3},x13
+ fmov ${H4},x14
+
+.Ldo_neon:
+ ldp x8,x12,[$inp,#32] // inp[2:3]
+ subs $len,$len,#64
+ ldp x9,x13,[$inp,#48]
+ add $in2,$inp,#96
+ adr $zeros,.Lzeros
+
+ lsl $padbit,$padbit,#24
+ add x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN23_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN23_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov $IN23_2,x8
+ fmov $IN23_3,x10
+ fmov $IN23_4,x12
+
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ ldp x9,x13,[$inp],#48
+
+ ld1 {$R0,$R1,$S1,$R2},[x15],#64
+ ld1 {$S2,$R3,$S3,$R4},[x15],#64
+ ld1 {$S4},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov $IN01_0,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,$padbit,x12,lsr#40
+ add x13,$padbit,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov $IN01_1,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi $MASK.2d,#-1
+ fmov $IN01_2,x8
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+ ushr $MASK.2d,$MASK.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // \___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // \___________________/ \____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs $len,$len,#64
+ umull $ACC4,$IN23_0,${R4}[2]
+ csel $in2,$zeros,$in2,lo
+ umull $ACC3,$IN23_0,${R3}[2]
+ umull $ACC2,$IN23_0,${R2}[2]
+ ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
+ umull $ACC1,$IN23_0,${R1}[2]
+ ldp x9,x13,[$in2],#48
+ umull $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal $ACC4,$IN23_1,${R3}[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC3,$IN23_1,${R2}[2]
+ and x5,x9,#0x03ffffff
+ umlal $ACC2,$IN23_1,${R1}[2]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN23_1,${R0}[2]
+ ubfx x7,x9,#26,#26
+ umlal $ACC0,$IN23_1,${S4}[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal $ACC4,$IN23_2,${R2}[2]
+ extr x8,x12,x8,#52
+ umlal $ACC3,$IN23_2,${R1}[2]
+ extr x9,x13,x9,#52
+ umlal $ACC2,$IN23_2,${R0}[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC1,$IN23_2,${S4}[2]
+ fmov $IN23_0,x4
+ umlal $ACC0,$IN23_2,${S3}[2]
+ and x8,x8,#0x03ffffff
+
+ umlal $ACC4,$IN23_3,${R1}[2]
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN23_3,${R0}[2]
+ ubfx x10,x12,#14,#26
+ umlal $ACC2,$IN23_3,${S4}[2]
+ ubfx x11,x13,#14,#26
+ umlal $ACC1,$IN23_3,${S3}[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC0,$IN23_3,${S2}[2]
+ fmov $IN23_1,x6
+
+ add $IN01_2,$IN01_2,$H2
+ add x12,$padbit,x12,lsr#40
+ umlal $ACC4,$IN23_4,${R0}[2]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC3,$IN23_4,${S4}[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC2,$IN23_4,${S3}[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN23_4,${S2}[2]
+ fmov $IN23_2,x8
+ umlal $ACC0,$IN23_4,${S1}[2]
+ fmov $IN23_3,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ fmov $IN23_4,x12
+ umlal $ACC3,$IN01_2,${R1}[0]
+ ldp x8,x12,[$inp],#16 // inp[0:1]
+ umlal $ACC0,$IN01_2,${S3}[0]
+ ldp x9,x13,[$inp],#48
+ umlal $ACC4,$IN01_2,${R2}[0]
+ umlal $ACC1,$IN01_2,${S4}[0]
+ umlal $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}[0]
+ umlal $ACC4,$IN01_0,${R4}[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal $ACC2,$IN01_0,${R2}[0]
+ and x5,x9,#0x03ffffff
+ umlal $ACC0,$IN01_0,${R0}[0]
+ ubfx x6,x8,#26,#26
+ umlal $ACC1,$IN01_0,${R1}[0]
+ ubfx x7,x9,#26,#26
+
+ add $IN01_3,$IN01_3,$H3
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal $ACC3,$IN01_1,${R2}[0]
+ extr x8,x12,x8,#52
+ umlal $ACC4,$IN01_1,${R3}[0]
+ extr x9,x13,x9,#52
+ umlal $ACC0,$IN01_1,${S4}[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal $ACC2,$IN01_1,${R1}[0]
+ fmov $IN01_0,x4
+ umlal $ACC1,$IN01_1,${R0}[0]
+ and x8,x8,#0x03ffffff
+
+ add $IN01_4,$IN01_4,$H4
+ and x9,x9,#0x03ffffff
+ umlal $ACC3,$IN01_3,${R0}[0]
+ ubfx x10,x12,#14,#26
+ umlal $ACC0,$IN01_3,${S2}[0]
+ ubfx x11,x13,#14,#26
+ umlal $ACC4,$IN01_3,${R1}[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal $ACC1,$IN01_3,${S3}[0]
+ fmov $IN01_1,x6
+ umlal $ACC2,$IN01_3,${S4}[0]
+ add x12,$padbit,x12,lsr#40
+
+ umlal $ACC3,$IN01_4,${S4}[0]
+ add x13,$padbit,x13,lsr#40
+ umlal $ACC0,$IN01_4,${S1}[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal $ACC4,$IN01_4,${R0}[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal $ACC1,$IN01_4,${S2}[0]
+ fmov $IN01_2,x8
+ umlal $ACC2,$IN01_4,${S3}[0]
+ fmov $IN01_3,x10
+ fmov $IN01_4,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr $T0.2d,$ACC3,#26
+ xtn $H3,$ACC3
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ bic $H3,#0xfc,lsl#24 // &=0x03ffffff
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ xtn $H4,$ACC4
+ ushr $T1.2d,$ACC1,#26
+ xtn $H1,$ACC1
+ bic $H4,#0xfc,lsl#24
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ shrn $T1.2s,$ACC2,#26
+ xtn $H2,$ACC2
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ bic $H1,#0xfc,lsl#24
+ add $H3,$H3,$T1.2s // h2 -> h3
+ bic $H2,#0xfc,lsl#24
+
+ shrn $T0.2s,$ACC0,#26
+ xtn $H0,$ACC0
+ ushr $T1.2s,$H3,#26
+ bic $H3,#0xfc,lsl#24
+ bic $H0,#0xfc,lsl#24
+ add $H1,$H1,$T0.2s // h0 -> h1
+ add $H4,$H4,$T1.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup $IN23_2,${IN23_2}[0]
+ add $IN01_2,$IN01_2,$H2
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds $len,$len,#32
+ b.ne .Long_tail
+
+ dup $IN23_2,${IN01_2}[0]
+ add $IN23_0,$IN01_0,$H0
+ add $IN23_3,$IN01_3,$H3
+ add $IN23_1,$IN01_1,$H1
+ add $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+ dup $IN23_0,${IN23_0}[0]
+ umull2 $ACC0,$IN23_2,${S3}
+ umull2 $ACC3,$IN23_2,${R1}
+ umull2 $ACC4,$IN23_2,${R2}
+ umull2 $ACC2,$IN23_2,${R0}
+ umull2 $ACC1,$IN23_2,${S4}
+
+ dup $IN23_1,${IN23_1}[0]
+ umlal2 $ACC0,$IN23_0,${R0}
+ umlal2 $ACC2,$IN23_0,${R2}
+ umlal2 $ACC3,$IN23_0,${R3}
+ umlal2 $ACC4,$IN23_0,${R4}
+ umlal2 $ACC1,$IN23_0,${R1}
+
+ dup $IN23_3,${IN23_3}[0]
+ umlal2 $ACC0,$IN23_1,${S4}
+ umlal2 $ACC3,$IN23_1,${R2}
+ umlal2 $ACC2,$IN23_1,${R1}
+ umlal2 $ACC4,$IN23_1,${R3}
+ umlal2 $ACC1,$IN23_1,${R0}
+
+ dup $IN23_4,${IN23_4}[0]
+ umlal2 $ACC3,$IN23_3,${R0}
+ umlal2 $ACC4,$IN23_3,${R1}
+ umlal2 $ACC0,$IN23_3,${S2}
+ umlal2 $ACC1,$IN23_3,${S3}
+ umlal2 $ACC2,$IN23_3,${S4}
+
+ umlal2 $ACC3,$IN23_4,${S4}
+ umlal2 $ACC0,$IN23_4,${S1}
+ umlal2 $ACC4,$IN23_4,${R0}
+ umlal2 $ACC1,$IN23_4,${S2}
+ umlal2 $ACC2,$IN23_4,${S3}
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add $IN01_0,$IN01_0,$H0
+ umlal $ACC3,$IN01_2,${R1}
+ umlal $ACC0,$IN01_2,${S3}
+ umlal $ACC4,$IN01_2,${R2}
+ umlal $ACC1,$IN01_2,${S4}
+ umlal $ACC2,$IN01_2,${R0}
+
+ add $IN01_1,$IN01_1,$H1
+ umlal $ACC3,$IN01_0,${R3}
+ umlal $ACC0,$IN01_0,${R0}
+ umlal $ACC4,$IN01_0,${R4}
+ umlal $ACC1,$IN01_0,${R1}
+ umlal $ACC2,$IN01_0,${R2}
+
+ add $IN01_3,$IN01_3,$H3
+ umlal $ACC3,$IN01_1,${R2}
+ umlal $ACC0,$IN01_1,${S4}
+ umlal $ACC4,$IN01_1,${R3}
+ umlal $ACC1,$IN01_1,${R0}
+ umlal $ACC2,$IN01_1,${R1}
+
+ add $IN01_4,$IN01_4,$H4
+ umlal $ACC3,$IN01_3,${R0}
+ umlal $ACC0,$IN01_3,${S2}
+ umlal $ACC4,$IN01_3,${R1}
+ umlal $ACC1,$IN01_3,${S3}
+ umlal $ACC2,$IN01_3,${S4}
+
+ umlal $ACC3,$IN01_4,${S4}
+ umlal $ACC0,$IN01_4,${S1}
+ umlal $ACC4,$IN01_4,${R0}
+ umlal $ACC1,$IN01_4,${S2}
+ umlal $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp $ACC3,$ACC3,$ACC3
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp $ACC0,$ACC0,$ACC0
+ ldp d10,d11,[sp,#32]
+ addp $ACC4,$ACC4,$ACC4
+ ldp d12,d13,[sp,#48]
+ addp $ACC1,$ACC1,$ACC1
+ ldp d14,d15,[sp,#64]
+ addp $ACC2,$ACC2,$ACC2
+ ldr x30,[sp,#8]
+ .inst 0xd50323bf // autiasp
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr $T0.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ ushr $T1.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+
+ add $ACC4,$ACC4,$T0.2d // h3 -> h4
+ add $ACC1,$ACC1,$T1.2d // h0 -> h1
+
+ ushr $T0.2d,$ACC4,#26
+ and $ACC4,$ACC4,$MASK.2d
+ ushr $T1.2d,$ACC1,#26
+ and $ACC1,$ACC1,$MASK.2d
+ add $ACC2,$ACC2,$T1.2d // h1 -> h2
+
+ add $ACC0,$ACC0,$T0.2d
+ shl $T0.2d,$T0.2d,#2
+ ushr $T1.2d,$ACC2,#26
+ and $ACC2,$ACC2,$MASK.2d
+ add $ACC0,$ACC0,$T0.2d // h4 -> h0
+ add $ACC3,$ACC3,$T1.2d // h2 -> h3
+
+ ushr $T0.2d,$ACC0,#26
+ and $ACC0,$ACC0,$MASK.2d
+ ushr $T1.2d,$ACC3,#26
+ and $ACC3,$ACC3,$MASK.2d
+ add $ACC1,$ACC1,$T0.2d // h0 -> h1
+ add $ACC4,$ACC4,$T1.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+ mov x4,#1
+ st1 {$ACC4}[0],[$ctx]
+ str x4,[$ctx,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+ s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
+ s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
+ (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
+ (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
+ (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
+ (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
+ (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+ s/\.[124]([sd])\[/.$1\[/;
+ s/w#x([0-9]+)/w$1/g;
+
+ print $_,"\n";
+}
+close STDOUT;
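The d0..d4 comment block inside .Loop_neon is the entire algorithm; the NEON code evaluates it for two blocks per iteration in base 2^26. For readers mapping vector lanes back to arithmetic, here is a scalar C sketch of one block step under the same conventions (u32/u64 are assumed typedefs, r[] is assumed clamped; s[i] = 5*r[i] folds the reduction mod 2^130 - 5 into the multiply, and the pad bit lands at bit 24 of the top limb, as in "lsl $padbit,$padbit,#24" above):

/* One base 2^26 block step, matching the d0..d4 comment in the NEON
 * loop. m[] is the 16-byte block already split into 26-bit limbs. */
static void poly1305_block_radix26(u32 h[5], const u32 r[5],
				   const u32 m[5], u32 hibit)
{
	u32 s1 = r[1] * 5, s2 = r[2] * 5, s3 = r[3] * 5, s4 = r[4] * 5;
	u64 d0, d1, d2, d3, d4, c;

	h[0] += m[0];
	h[1] += m[1];
	h[2] += m[2];
	h[3] += m[3];
	h[4] += m[4] + (hibit << 24);	/* 128 = 4*26 + 24 */

	d0 = (u64)h[0] * r[0] + (u64)h[1] * s4 + (u64)h[2] * s3 +
	     (u64)h[3] * s2 + (u64)h[4] * s1;
	d1 = (u64)h[0] * r[1] + (u64)h[1] * r[0] + (u64)h[2] * s4 +
	     (u64)h[3] * s3 + (u64)h[4] * s2;
	d2 = (u64)h[0] * r[2] + (u64)h[1] * r[1] + (u64)h[2] * r[0] +
	     (u64)h[3] * s4 + (u64)h[4] * s3;
	d3 = (u64)h[0] * r[3] + (u64)h[1] * r[2] + (u64)h[2] * r[1] +
	     (u64)h[3] * r[0] + (u64)h[4] * s4;
	d4 = (u64)h[0] * r[4] + (u64)h[1] * r[3] + (u64)h[2] * r[2] +
	     (u64)h[3] * r[1] + (u64)h[4] * r[0];

	/* partial carry propagation; limbs stay just above 26 bits */
	c = d0 >> 26; h[0] = (u32)d0 & 0x3ffffff; d1 += c;
	c = d1 >> 26; h[1] = (u32)d1 & 0x3ffffff; d2 += c;
	c = d2 >> 26; h[2] = (u32)d2 & 0x3ffffff; d3 += c;
	c = d3 >> 26; h[3] = (u32)d3 & 0x3ffffff; d4 += c;
	c = d4 >> 26; h[4] = (u32)d4 & 0x3ffffff;
	c = (u64)h[0] + c * 5;		/* fold 2^130 == 5 back in */
	h[0] = (u32)c & 0x3ffffff;
	h[1] += (u32)(c >> 26);
}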
--- /dev/null
+++ b/arch/arm64/crypto/poly1305-core.S_shipped
@@ -0,0 +1,835 @@
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+ cmp x1,xzr
+ stp xzr,xzr,[x0] // zero hash value
+ stp xzr,xzr,[x0,#16] // [along with is_base2_26]
+
+ csel x0,xzr,x0,eq
+ b.eq .Lno_key
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+ ldp x7,x8,[x1] // load key
+ mov x9,#0xfffffffc0fffffff
+ movk x9,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+ rev x7,x7 // flip bytes
+ rev x8,x8
+#endif
+ and x7,x7,x9 // &=0ffffffc0fffffff
+ and x9,x9,#-4
+ and x8,x8,x9 // &=0ffffffc0ffffffc
+ mov w9,#-1
+ stp x7,x8,[x0,#32] // save key value
+ str w9,[x0,#48] // impossible key power value
+
+#ifndef __KERNEL__
+ tst w17,#ARMV7_NEON
+
+ adr x12,.Lpoly1305_blocks
+ adr x7,.Lpoly1305_blocks_neon
+ adr x13,.Lpoly1305_emit
+
+ csel x12,x12,x7,eq
+
+# ifdef __ILP32__
+ stp w12,w13,[x2]
+# else
+ stp x12,x13,[x2]
+# endif
+#endif
+ mov x0,#1
+.Lno_key:
+ ret
+.size poly1305_init,.-poly1305_init
+
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ ands x2,x2,#-16
+ b.eq .Lno_data
+
+ ldp x4,x5,[x0] // load hash value
+ ldp x6,x17,[x0,#16] // [along with is_base2_26]
+ ldp x7,x8,[x0,#32] // load key value
+
+#ifdef __AARCH64EB__
+ lsr x12,x4,#32
+ mov w13,w4
+ lsr x14,x5,#32
+ mov w15,w5
+ lsr x16,x6,#32
+#else
+ mov w12,w4
+ lsr x13,x4,#32
+ mov w14,w5
+ lsr x15,x5,#32
+ mov w16,w6
+#endif
+
+ add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
+ lsr x13,x14,#12
+ adds x12,x12,x14,lsl#52
+ add x13,x13,x15,lsl#14
+ adc x13,x13,xzr
+ lsr x14,x16,#24
+ adds x13,x13,x16,lsl#40
+ adc x14,x14,xzr
+
+ cmp x17,#0 // is_base2_26?
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ csel x4,x4,x12,eq // choose between radixes
+ csel x5,x5,x13,eq
+ csel x6,x6,x14,eq
+
+.Loop:
+ ldp x10,x11,[x1],#16 // load input
+ sub x2,x2,#16
+#ifdef __AARCH64EB__
+ rev x10,x10
+ rev x11,x11
+#endif
+ adds x4,x4,x10 // accumulate input
+ adcs x5,x5,x11
+
+ mul x12,x4,x7 // h0*r0
+ adc x6,x6,x3
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ cbnz x2,.Loop
+
+ stp x4,x5,[x0] // store hash value
+ stp x6,xzr,[x0,#16] // [and clear is_base2_26]
+
+.Lno_data:
+ ret
+.size poly1305_blocks,.-poly1305_blocks
+
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ ldp x4,x5,[x0] // load hash base 2^64
+ ldp x6,x7,[x0,#16] // [along with is_base2_26]
+ ldp x10,x11,[x2] // load nonce
+
+#ifdef __AARCH64EB__
+ lsr x12,x4,#32
+ mov w13,w4
+ lsr x14,x5,#32
+ mov w15,w5
+ lsr x16,x6,#32
+#else
+ mov w12,w4
+ lsr x13,x4,#32
+ mov w14,w5
+ lsr x15,x5,#32
+ mov w16,w6
+#endif
+
+ add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
+ lsr x13,x14,#12
+ adds x12,x12,x14,lsl#52
+ add x13,x13,x15,lsl#14
+ adc x13,x13,xzr
+ lsr x14,x16,#24
+ adds x13,x13,x16,lsl#40
+ adc x14,x14,xzr
+
+ cmp x7,#0 // is_base2_26?
+ csel x4,x4,x12,eq // choose between radixes
+ csel x5,x5,x13,eq
+ csel x6,x6,x14,eq
+
+ adds x12,x4,#5 // compare to modulus
+ adcs x13,x5,xzr
+ adc x14,x6,xzr
+
+ tst x14,#-4 // see if it's carried/borrowed
+
+ csel x4,x4,x12,eq
+ csel x5,x5,x13,eq
+
+#ifdef __AARCH64EB__
+ ror x10,x10,#32 // flip nonce words
+ ror x11,x11,#32
+#endif
+ adds x4,x4,x10 // accumulate nonce
+ adc x5,x5,x11
+#ifdef __AARCH64EB__
+ rev x4,x4 // flip output bytes
+ rev x5,x5
+#endif
+ stp x4,x5,[x1] // write result
+
+ ret
+.size poly1305_emit,.-poly1305_emit
+.type poly1305_mult,%function
+.align 5
+poly1305_mult:
+ mul x12,x4,x7 // h0*r0
+ umulh x13,x4,x7
+
+ mul x10,x5,x9 // h1*5*r1
+ umulh x11,x5,x9
+
+ adds x12,x12,x10
+ mul x10,x4,x8 // h0*r1
+ adc x13,x13,x11
+ umulh x14,x4,x8
+
+ adds x13,x13,x10
+ mul x10,x5,x7 // h1*r0
+ adc x14,x14,xzr
+ umulh x11,x5,x7
+
+ adds x13,x13,x10
+ mul x10,x6,x9 // h2*5*r1
+ adc x14,x14,x11
+ mul x11,x6,x7 // h2*r0
+
+ adds x13,x13,x10
+ adc x14,x14,x11
+
+ and x10,x14,#-4 // final reduction
+ and x6,x14,#3
+ add x10,x10,x14,lsr#2
+ adds x4,x12,x10
+ adcs x5,x13,xzr
+ adc x6,x6,xzr
+
+ ret
+.size poly1305_mult,.-poly1305_mult
+
+.type poly1305_splat,%function
+.align 4
+poly1305_splat:
+ and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x13,x4,#26,#26
+ extr x14,x5,x4,#52
+ and x14,x14,#0x03ffffff
+ ubfx x15,x5,#14,#26
+ extr x16,x6,x5,#40
+
+ str w12,[x0,#16*0] // r0
+ add w12,w13,w13,lsl#2 // r1*5
+ str w13,[x0,#16*1] // r1
+ add w13,w14,w14,lsl#2 // r2*5
+ str w12,[x0,#16*2] // s1
+ str w14,[x0,#16*3] // r2
+ add w14,w15,w15,lsl#2 // r3*5
+ str w13,[x0,#16*4] // s2
+ str w15,[x0,#16*5] // r3
+ add w15,w16,w16,lsl#2 // r4*5
+ str w14,[x0,#16*6] // s3
+ str w16,[x0,#16*7] // r4
+ str w15,[x0,#16*8] // s4
+
+ ret
+.size poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr x17,[x0,#24]
+ cmp x2,#128
+ b.lo .Lpoly1305_blocks
+
+ .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+ stp d8,d9,[sp,#16] // meet ABI requirements
+ stp d10,d11,[sp,#32]
+ stp d12,d13,[sp,#48]
+ stp d14,d15,[sp,#64]
+
+ cbz x17,.Lbase2_64_neon
+
+ ldp w10,w11,[x0] // load hash value base 2^26
+ ldp w12,w13,[x0,#8]
+ ldr w14,[x0,#16]
+
+ tst x2,#31
+ b.eq .Leven_neon
+
+ ldp x7,x8,[x0,#32] // load key value
+
+ add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
+ lsr x5,x12,#12
+ adds x4,x4,x12,lsl#52
+ add x5,x5,x13,lsl#14
+ adc x5,x5,xzr
+ lsr x6,x14,#24
+ adds x5,x5,x14,lsl#40
+ adc x14,x6,xzr // can be partially reduced...
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ b .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+ ldp x7,x8,[x0,#32] // load key value
+
+ ldp x4,x5,[x0] // load hash value base 2^64
+ ldr x6,[x0,#16]
+
+ tst x2,#31
+ b.eq .Linit_neon
+
+ ldp x12,x13,[x1],#16 // load input
+ sub x2,x2,#16
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+ rev x12,x12
+ rev x13,x13
+#endif
+ adds x4,x4,x12 // accumulate input
+ adcs x5,x5,x13
+ adc x6,x6,x3
+
+ bl poly1305_mult
+
+.Linit_neon:
+ ldr w17,[x0,#48] // first table element
+ and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
+ ubfx x11,x4,#26,#26
+ extr x12,x5,x4,#52
+ and x12,x12,#0x03ffffff
+ ubfx x13,x5,#14,#26
+ extr x14,x6,x5,#40
+
+ cmp w17,#-1 // is value impossible?
+ b.ne .Leven_neon
+
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+ ////////////////////////////////// initialize r^n table
+ mov x4,x7 // r^1
+ add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
+ mov x5,x8
+ mov x6,xzr
+ add x0,x0,#48+12
+ bl poly1305_splat
+
+ bl poly1305_mult // r^2
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^3
+ sub x0,x0,#4
+ bl poly1305_splat
+
+ bl poly1305_mult // r^4
+ sub x0,x0,#4
+ bl poly1305_splat
+ sub x0,x0,#48 // restore original x0
+ b .Ldo_neon
+
+.align 4
+.Leven_neon:
+ fmov d24,x10
+ fmov d25,x11
+ fmov d26,x12
+ fmov d27,x13
+ fmov d28,x14
+
+.Ldo_neon:
+ ldp x8,x12,[x1,#32] // inp[2:3]
+ subs x2,x2,#64
+ ldp x9,x13,[x1,#48]
+ add x16,x1,#96
+ adr x17,.Lzeros
+
+ lsl x3,x3,#24
+ add x15,x0,#48
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d14,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d15,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ fmov d16,x8
+ fmov d17,x10
+ fmov d18,x12
+
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ ldp x9,x13,[x1],#48
+
+ ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
+ ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
+ ld1 {v8.4s},[x15]
+
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ and x5,x9,#0x03ffffff
+ ubfx x6,x8,#26,#26
+ ubfx x7,x9,#26,#26
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ extr x8,x12,x8,#52
+ extr x9,x13,x9,#52
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ fmov d9,x4
+ and x8,x8,#0x03ffffff
+ and x9,x9,#0x03ffffff
+ ubfx x10,x12,#14,#26
+ ubfx x11,x13,#14,#26
+ add x12,x3,x12,lsr#40
+ add x13,x3,x13,lsr#40
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ fmov d10,x6
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ movi v31.2d,#-1
+ fmov d11,x8
+ fmov d12,x10
+ fmov d13,x12
+ ushr v31.2d,v31.2d,#38
+
+ b.ls .Lskip_loop
+
+.align 4
+.Loop_neon:
+ ////////////////////////////////////////////////////////////////
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ // ___________________/
+ // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ // ___________________/ ____________________/
+ //
+ // Note that we start with inp[2:3]*r^2. This is because it
+ // doesn't depend on reduction in previous iteration.
+ ////////////////////////////////////////////////////////////////
+ // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
+ // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
+ // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
+ // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
+ // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
+
+ subs x2,x2,#64
+ umull v23.2d,v14.2s,v7.s[2]
+ csel x16,x17,x16,lo
+ umull v22.2d,v14.2s,v5.s[2]
+ umull v21.2d,v14.2s,v3.s[2]
+ ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
+ umull v20.2d,v14.2s,v1.s[2]
+ ldp x9,x13,[x16],#48
+ umull v19.2d,v14.2s,v0.s[2]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ umlal v23.2d,v15.2s,v5.s[2]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v22.2d,v15.2s,v3.s[2]
+ and x5,x9,#0x03ffffff
+ umlal v21.2d,v15.2s,v1.s[2]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v15.2s,v0.s[2]
+ ubfx x7,x9,#26,#26
+ umlal v19.2d,v15.2s,v8.s[2]
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+
+ umlal v23.2d,v16.2s,v3.s[2]
+ extr x8,x12,x8,#52
+ umlal v22.2d,v16.2s,v1.s[2]
+ extr x9,x13,x9,#52
+ umlal v21.2d,v16.2s,v0.s[2]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v20.2d,v16.2s,v8.s[2]
+ fmov d14,x4
+ umlal v19.2d,v16.2s,v6.s[2]
+ and x8,x8,#0x03ffffff
+
+ umlal v23.2d,v17.2s,v1.s[2]
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v17.2s,v0.s[2]
+ ubfx x10,x12,#14,#26
+ umlal v21.2d,v17.2s,v8.s[2]
+ ubfx x11,x13,#14,#26
+ umlal v20.2d,v17.2s,v6.s[2]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v19.2d,v17.2s,v4.s[2]
+ fmov d15,x6
+
+ add v11.2s,v11.2s,v26.2s
+ add x12,x3,x12,lsr#40
+ umlal v23.2d,v18.2s,v0.s[2]
+ add x13,x3,x13,lsr#40
+ umlal v22.2d,v18.2s,v8.s[2]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v21.2d,v18.2s,v6.s[2]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v18.2s,v4.s[2]
+ fmov d16,x8
+ umlal v19.2d,v18.2s,v2.s[2]
+ fmov d17,x10
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ fmov d18,x12
+ umlal v22.2d,v11.2s,v1.s[0]
+ ldp x8,x12,[x1],#16 // inp[0:1]
+ umlal v19.2d,v11.2s,v6.s[0]
+ ldp x9,x13,[x1],#48
+ umlal v23.2d,v11.2s,v3.s[0]
+ umlal v20.2d,v11.2s,v8.s[0]
+ umlal v21.2d,v11.2s,v0.s[0]
+#ifdef __AARCH64EB__
+ rev x8,x8
+ rev x12,x12
+ rev x9,x9
+ rev x13,x13
+#endif
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.s[0]
+ umlal v23.2d,v9.2s,v7.s[0]
+ and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
+ umlal v21.2d,v9.2s,v3.s[0]
+ and x5,x9,#0x03ffffff
+ umlal v19.2d,v9.2s,v0.s[0]
+ ubfx x6,x8,#26,#26
+ umlal v20.2d,v9.2s,v1.s[0]
+ ubfx x7,x9,#26,#26
+
+ add v12.2s,v12.2s,v27.2s
+ add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
+ umlal v22.2d,v10.2s,v3.s[0]
+ extr x8,x12,x8,#52
+ umlal v23.2d,v10.2s,v5.s[0]
+ extr x9,x13,x9,#52
+ umlal v19.2d,v10.2s,v8.s[0]
+ add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
+ umlal v21.2d,v10.2s,v1.s[0]
+ fmov d9,x4
+ umlal v20.2d,v10.2s,v0.s[0]
+ and x8,x8,#0x03ffffff
+
+ add v13.2s,v13.2s,v28.2s
+ and x9,x9,#0x03ffffff
+ umlal v22.2d,v12.2s,v0.s[0]
+ ubfx x10,x12,#14,#26
+ umlal v19.2d,v12.2s,v4.s[0]
+ ubfx x11,x13,#14,#26
+ umlal v23.2d,v12.2s,v1.s[0]
+ add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
+ umlal v20.2d,v12.2s,v6.s[0]
+ fmov d10,x6
+ umlal v21.2d,v12.2s,v8.s[0]
+ add x12,x3,x12,lsr#40
+
+ umlal v22.2d,v13.2s,v8.s[0]
+ add x13,x3,x13,lsr#40
+ umlal v19.2d,v13.2s,v2.s[0]
+ add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
+ umlal v23.2d,v13.2s,v0.s[0]
+ add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
+ umlal v20.2d,v13.2s,v4.s[0]
+ fmov d11,x8
+ umlal v21.2d,v13.2s,v6.s[0]
+ fmov d12,x10
+ fmov d13,x12
+
+ /////////////////////////////////////////////////////////////////
+ // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ // and P. Schwabe
+ //
+ // [see discussion in poly1305-armv4 module]
+
+ ushr v29.2d,v22.2d,#26
+ xtn v27.2s,v22.2d
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ xtn v28.2s,v23.2d
+ ushr v30.2d,v20.2d,#26
+ xtn v25.2s,v20.2d
+ bic v28.2s,#0xfc,lsl#24
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ shrn v30.2s,v21.2d,#26
+ xtn v26.2s,v21.2d
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ bic v25.2s,#0xfc,lsl#24
+ add v27.2s,v27.2s,v30.2s // h2 -> h3
+ bic v26.2s,#0xfc,lsl#24
+
+ shrn v29.2s,v19.2d,#26
+ xtn v24.2s,v19.2d
+ ushr v30.2s,v27.2s,#26
+ bic v27.2s,#0xfc,lsl#24
+ bic v24.2s,#0xfc,lsl#24
+ add v25.2s,v25.2s,v29.2s // h0 -> h1
+ add v28.2s,v28.2s,v30.2s // h3 -> h4
+
+ b.hi .Loop_neon
+
+.Lskip_loop:
+ dup v16.2d,v16.d[0]
+ add v11.2s,v11.2s,v26.2s
+
+ ////////////////////////////////////////////////////////////////
+ // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ adds x2,x2,#32
+ b.ne .Long_tail
+
+ dup v16.2d,v11.d[0]
+ add v14.2s,v9.2s,v24.2s
+ add v17.2s,v12.2s,v27.2s
+ add v15.2s,v10.2s,v25.2s
+ add v18.2s,v13.2s,v28.2s
+
+.Long_tail:
+ dup v14.2d,v14.d[0]
+ umull2 v19.2d,v16.4s,v6.4s
+ umull2 v22.2d,v16.4s,v1.4s
+ umull2 v23.2d,v16.4s,v3.4s
+ umull2 v21.2d,v16.4s,v0.4s
+ umull2 v20.2d,v16.4s,v8.4s
+
+ dup v15.2d,v15.d[0]
+ umlal2 v19.2d,v14.4s,v0.4s
+ umlal2 v21.2d,v14.4s,v3.4s
+ umlal2 v22.2d,v14.4s,v5.4s
+ umlal2 v23.2d,v14.4s,v7.4s
+ umlal2 v20.2d,v14.4s,v1.4s
+
+ dup v17.2d,v17.d[0]
+ umlal2 v19.2d,v15.4s,v8.4s
+ umlal2 v22.2d,v15.4s,v3.4s
+ umlal2 v21.2d,v15.4s,v1.4s
+ umlal2 v23.2d,v15.4s,v5.4s
+ umlal2 v20.2d,v15.4s,v0.4s
+
+ dup v18.2d,v18.d[0]
+ umlal2 v22.2d,v17.4s,v0.4s
+ umlal2 v23.2d,v17.4s,v1.4s
+ umlal2 v19.2d,v17.4s,v4.4s
+ umlal2 v20.2d,v17.4s,v6.4s
+ umlal2 v21.2d,v17.4s,v8.4s
+
+ umlal2 v22.2d,v18.4s,v8.4s
+ umlal2 v19.2d,v18.4s,v2.4s
+ umlal2 v23.2d,v18.4s,v0.4s
+ umlal2 v20.2d,v18.4s,v4.4s
+ umlal2 v21.2d,v18.4s,v6.4s
+
+ b.eq .Lshort_tail
+
+ ////////////////////////////////////////////////////////////////
+ // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ add v9.2s,v9.2s,v24.2s
+ umlal v22.2d,v11.2s,v1.2s
+ umlal v19.2d,v11.2s,v6.2s
+ umlal v23.2d,v11.2s,v3.2s
+ umlal v20.2d,v11.2s,v8.2s
+ umlal v21.2d,v11.2s,v0.2s
+
+ add v10.2s,v10.2s,v25.2s
+ umlal v22.2d,v9.2s,v5.2s
+ umlal v19.2d,v9.2s,v0.2s
+ umlal v23.2d,v9.2s,v7.2s
+ umlal v20.2d,v9.2s,v1.2s
+ umlal v21.2d,v9.2s,v3.2s
+
+ add v12.2s,v12.2s,v27.2s
+ umlal v22.2d,v10.2s,v3.2s
+ umlal v19.2d,v10.2s,v8.2s
+ umlal v23.2d,v10.2s,v5.2s
+ umlal v20.2d,v10.2s,v0.2s
+ umlal v21.2d,v10.2s,v1.2s
+
+ add v13.2s,v13.2s,v28.2s
+ umlal v22.2d,v12.2s,v0.2s
+ umlal v19.2d,v12.2s,v4.2s
+ umlal v23.2d,v12.2s,v1.2s
+ umlal v20.2d,v12.2s,v6.2s
+ umlal v21.2d,v12.2s,v8.2s
+
+ umlal v22.2d,v13.2s,v8.2s
+ umlal v19.2d,v13.2s,v2.2s
+ umlal v23.2d,v13.2s,v0.2s
+ umlal v20.2d,v13.2s,v4.2s
+ umlal v21.2d,v13.2s,v6.2s
+
+.Lshort_tail:
+ ////////////////////////////////////////////////////////////////
+ // horizontal add
+
+ addp v22.2d,v22.2d,v22.2d
+ ldp d8,d9,[sp,#16] // meet ABI requirements
+ addp v19.2d,v19.2d,v19.2d
+ ldp d10,d11,[sp,#32]
+ addp v23.2d,v23.2d,v23.2d
+ ldp d12,d13,[sp,#48]
+ addp v20.2d,v20.2d,v20.2d
+ ldp d14,d15,[sp,#64]
+ addp v21.2d,v21.2d,v21.2d
+ ldr x30,[sp,#8]
+ .inst 0xd50323bf // autiasp
+
+ ////////////////////////////////////////////////////////////////
+ // lazy reduction, but without narrowing
+
+ ushr v29.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ ushr v30.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+
+ add v23.2d,v23.2d,v29.2d // h3 -> h4
+ add v20.2d,v20.2d,v30.2d // h0 -> h1
+
+ ushr v29.2d,v23.2d,#26
+ and v23.16b,v23.16b,v31.16b
+ ushr v30.2d,v20.2d,#26
+ and v20.16b,v20.16b,v31.16b
+ add v21.2d,v21.2d,v30.2d // h1 -> h2
+
+ add v19.2d,v19.2d,v29.2d
+ shl v29.2d,v29.2d,#2
+ ushr v30.2d,v21.2d,#26
+ and v21.16b,v21.16b,v31.16b
+ add v19.2d,v19.2d,v29.2d // h4 -> h0
+ add v22.2d,v22.2d,v30.2d // h2 -> h3
+
+ ushr v29.2d,v19.2d,#26
+ and v19.16b,v19.16b,v31.16b
+ ushr v30.2d,v22.2d,#26
+ and v22.16b,v22.16b,v31.16b
+ add v20.2d,v20.2d,v29.2d // h0 -> h1
+ add v23.2d,v23.2d,v30.2d // h3 -> h4
+
+ ////////////////////////////////////////////////////////////////
+ // write the result, can be partially reduced
+
+ st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
+ mov x4,#1
+ st1 {v23.s}[0],[x0]
+ str x4,[x0,#8] // set is_base2_26
+
+ ldr x29,[sp],#80
+ ret
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
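Both poly1305_blocks and poly1305_emit above accept the accumulator in either radix: when is_base2_26 is set, the five 26-bit limbs are repacked into three 64-bit words on entry. A C sketch of that conversion (u32/u64 assumed), keeping the carries exactly where the adds/adc pairs put them, since the 2^26 limbs may arrive only partially reduced:

static void poly1305_radix26_to_radix64(const u32 h[5], u64 out[3])
{
	u64 lo, mid, t, c;

	lo  = (u64)h[0] + ((u64)h[1] << 26);	/* bits 0..63 */
	t   = (u64)h[2] << 52;
	c   = lo + t < lo;			/* carry into bit 64 */
	lo += t;

	mid  = ((u64)h[2] >> 12) + ((u64)h[3] << 14) + c;
	t    = (u64)h[4] << 40;
	c    = mid + t < mid;			/* carry into bit 128 */
	mid += t;

	out[0] = lo;				/* h0 */
	out[1] = mid;				/* h1 */
	out[2] = ((u64)h[4] >> 24) + c;		/* h2, a few bits only */
}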
--- b/arch/arm64/crypto/poly1305-glue.c
+++ b/arch/arm64/crypto/poly1305-glue.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/jump_label.h>
+#include <linux/module.h>
+
+asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
+asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_arm64(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(key + 16);
+ dctx->s[1] = get_unaligned_le32(key + 20);
+ dctx->s[2] = get_unaligned_le32(key + 24);
+ dctx->s[3] = get_unaligned_le32(key + 28);
+ dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static int neon_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
+static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ u32 len, u32 hibit, bool do_neon)
+{
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset) {
+ poly1305_init_arch(dctx, src);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ if (len < POLY1305_BLOCK_SIZE)
+ return;
+ }
+
+ len &= ~(POLY1305_BLOCK_SIZE - 1);
+
+ if (static_branch_likely(&have_neon) && likely(do_neon))
+ poly1305_blocks_neon(&dctx->h, src, len, hibit);
+ else
+ poly1305_blocks(&dctx->h, src, len, hibit);
+}
+
+static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
+ const u8 *src, u32 len, bool do_neon)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ len -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ neon_poly1305_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1, false);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
+ neon_poly1305_blocks(dctx, src, len, 1, do_neon);
+ src += round_down(len, POLY1305_BLOCK_SIZE);
+ len %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(len)) {
+ dctx->buflen = len;
+ memcpy(dctx->buf, src, len);
+ }
+}
+
+static int neon_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ bool do_neon = crypto_simd_usable() && srclen > 128;
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_begin();
+ neon_poly1305_do_update(dctx, src, srclen, do_neon);
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_end();
+ return 0;
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int nbytes)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+ if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ do {
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else {
+ poly1305_blocks(&dctx->h, src, len, 1);
+ src += len;
+ }
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ dctx->buflen = nbytes;
+ memcpy(dctx->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_emit(&dctx->h, dst, dctx->s);
+ *dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
+ return 0;
+}
+
+static struct shash_alg neon_poly1305_alg = {
+ .init = neon_poly1305_init,
+ .update = neon_poly1305_update,
+ .final = neon_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init neon_poly1305_mod_init(void)
+{
+ if (!cpu_have_named_feature(ASIMD))
+ return 0;
+
+ static_branch_enable(&have_neon);
+
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shash(&neon_poly1305_alg) : 0;
+}
+
+static void __exit neon_poly1305_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
+ crypto_unregister_shash(&neon_poly1305_alg);
+}
+
+module_init(neon_poly1305_mod_init);
+module_exit(neon_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-neon");
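One detail worth noting in poly1305_update_arch() above: kernel_neon_begin() disables preemption, so the NEON path runs in slices of at most SZ_4K bytes to bound scheduling latency. The pattern, pulled out as a sketch (process_simd() is a hypothetical stand-in for poly1305_blocks_neon(); the caller guarantees len > 0, as in the code above):

static void run_in_4k_slices(const u8 *src, unsigned int len)
{
	do {
		unsigned int todo = min_t(unsigned int, len, SZ_4K);

		kernel_neon_begin();
		process_simd(src, todo);	/* hypothetical worker */
		kernel_neon_end();

		len -= todo;
		src += todo;
	} while (len);
}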
--- /dev/null
+++ b/arch/arm/crypto/poly1305-armv4.pl
@@ -0,0 +1,1236 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# IALU(*)/gcc-4.4 NEON
+#
+# ARM11xx(ARMv6) 7.78/+100% -
+# Cortex-A5 6.35/+130% 3.00
+# Cortex-A8 6.25/+115% 2.36
+# Cortex-A9 5.10/+95% 2.55
+# Cortex-A15 3.85/+85% 1.25(**)
+# Snapdragon S4 5.70/+100% 1.48(**)
+#
+# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**) these are trade-off results, they can be improved by ~8% but at
+# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init poly1305_init_arm
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit poly1305_emit_arm
+.globl poly1305_blocks_neon
+#endif
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp $inp,#0
+ str r3,[$ctx,#0] @ zero hash value
+ str r3,[$ctx,#4]
+ str r3,[$ctx,#8]
+ str r3,[$ctx,#12]
+ str r3,[$ctx,#16]
+ str r3,[$ctx,#36] @ clear is_base2_26
+ add $ctx,$ctx,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ mov r3,#-1
+ str r3,[$ctx,#28] @ impossible key power value
+# ifndef __KERNEL__
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+# endif
+#endif
+ ldrb r4,[$inp,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[$inp,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[$inp,#2]
+ ldrb r7,[$inp,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[$inp,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[$inp,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[$inp,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[$inp,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[$inp,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[$inp,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[$inp,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ it ne
+ movne r11,r9
+ adr r12,.Lpoly1305_emit
+ orr r11,r11,#1 @ thumb-ify addresses
+ orr r12,r12,#1
+# else
+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ ite eq
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+ ldrb r9,[$inp,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[$inp,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[$inp,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[$inp,#14]
+ and r6,r6,r3
+
+ ldrb r10,[$inp,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[$ctx,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[$ctx,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[$ctx,#8]
+ and r7,r7,r3
+ str r7,[$ctx,#12]
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands $len,$len,#-16
+ beq .Lno_data
+
+ add $len,$len,$inp @ end pointer
+ sub sp,sp,#32
+
+#if __ARM_ARCH__<7
+ ldmia $ctx,{$h0-$r3} @ load context
+ add $ctx,$ctx,#20
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+#else
+ ldr lr,[$ctx,#36] @ is_base2_26
+ ldmia $ctx!,{$h0-$h4} @ load hash value
+ str $len,[sp,#16] @ offload stuff
+ str $ctx,[sp,#12]
+
+ adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $r1,$h1,lsr#6
+ adcs $r1,$r1,$h2,lsl#20
+ mov $r2,$h2,lsr#12
+ adcs $r2,$r2,$h3,lsl#14
+ mov $r3,$h3,lsr#18
+ adcs $r3,$r3,$h4,lsl#8
+ mov $len,#0
+ teq lr,#0
+ str $len,[$ctx,#16] @ clear is_base2_26
+ adc $len,$len,$h4,lsr#24
+
+ itttt ne
+ movne $h0,$r0 @ choose between radixes
+ movne $h1,$r1
+ movne $h2,$r2
+ movne $h3,$r3
+ ldmia $ctx,{$r0-$r3} @ load key
+ it ne
+ movne $h4,$len
+#endif
+
+ mov lr,$inp
+ cmp $padbit,#0
+ str $r1,[sp,#20]
+ str $r2,[sp,#24]
+ str $r3,[sp,#28]
+ b .Loop
+
+.align 4
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi $h4,$h4,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds $h0,$h0,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs $h1,$h1,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs $h2,$h2,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add $s1,$r1,$r1,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+ it hi
+ addhi $h4,$h4,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds $h0,$h0,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs $h1,$h1,r1
+ add $s1,$r1,$r1,lsr#2
+ adcs $h2,$h2,r2
+#endif
+ add $s2,$r2,$r2,lsr#2
+ adcs $h3,$h3,r3
+ add $s3,$r3,$r3,lsr#2
+
+ umull r2,r3,$h1,$r0
+ adc $h4,$h4,#0
+ umull r0,r1,$h0,$r0
+ umlal r2,r3,$h4,$s1
+ umlal r0,r1,$h3,$s1
+ ldr $r1,[sp,#20] @ reload $r1
+ umlal r2,r3,$h2,$s3
+ umlal r0,r1,$h1,$s3
+ umlal r2,r3,$h3,$s2
+ umlal r0,r1,$h2,$s2
+ umlal r2,r3,$h0,$r1
+ str r0,[sp,#0] @ future $h0
+ mul r0,$s2,$h4
+ ldr $r2,[sp,#24] @ reload $r2
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future $h2
+ str r2,[sp,#4] @ future $h1
+
+ mul r2,$s3,$h4
+ eor r3,r3,r3
+ umlal r0,r1,$h3,$s3
+ ldr $r3,[sp,#28] @ reload $r3
+ umlal r2,r3,$h3,$r0
+ umlal r0,r1,$h2,$r0
+ umlal r2,r3,$h2,$r1
+ umlal r0,r1,$h1,$r1
+ umlal r2,r3,$h1,$r2
+ umlal r0,r1,$h0,$r2
+ umlal r2,r3,$h0,$r3
+ ldr $h0,[sp,#0]
+ mul $h4,$r0,$h4
+ ldr $h1,[sp,#4]
+
+ adds $h2,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds $h3,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add $h4,$h4,r3 @ h4+=d3>>32
+
+ and r1,$h4,#-4
+ and $h4,$h4,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds $h0,$h0,r1
+ adcs $h1,$h1,#0
+ adcs $h2,$h2,#0
+ adcs $h3,$h3,#0
+ adc $h4,$h4,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr $ctx,[sp,#12]
+ add sp,sp,#32
+ stmdb $ctx,{$h0-$h4} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+___
+}
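
On entry, poly1305_blocks folds a hash that the NEON code may have left in
base 2^26 back into four 32-bit words plus a carry word (the adds/adcs
ladder under the is_base2_26 load). A hedged C model of that conversion,
assuming h[] holds the five 26-bit limbs:

    #include <stdint.h>

    /* Returns the bits above 2^128, which the code keeps via the final adc. */
    static uint32_t base26_to_base32(uint32_t w[4], const uint32_t h[5])
    {
            uint64_t t;

            t = (uint64_t)h[0] + ((uint64_t)h[1] << 26);
            w[0] = (uint32_t)t;
            t = (t >> 32) + (h[1] >> 6) + ((uint64_t)h[2] << 20);
            w[1] = (uint32_t)t;
            t = (t >> 32) + (h[2] >> 12) + ((uint64_t)h[3] << 14);
            w[2] = (uint32_t)t;
            t = (t >> 32) + (h[3] >> 18) + ((uint64_t)h[4] << 8);
            w[3] = (uint32_t)t;
            return (uint32_t)(t >> 32) + (h[4] >> 24);
    }
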
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ stmdb sp!,{r4-r11}
+
+ ldmia $ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
+ mov $g1,$h1,lsr#6
+ adcs $g1,$g1,$h2,lsl#20
+ mov $g2,$h2,lsr#12
+ adcs $g2,$g2,$h3,lsl#14
+ mov $g3,$h3,lsr#18
+ adcs $g3,$g3,$h4,lsl#8
+ mov $g4,#0
+ adc $g4,$g4,$h4,lsr#24
+
+ tst ip,ip
+ itttt ne
+ movne $h0,$g0
+ movne $h1,$g1
+ movne $h2,$g2
+ movne $h3,$g3
+ it ne
+ movne $h4,$g4
+#endif
+
+ adds $g0,$h0,#5 @ compare to modulus
+ adcs $g1,$h1,#0
+ adcs $g2,$h2,#0
+ adcs $g3,$h3,#0
+ adc $g4,$h4,#0
+ tst $g4,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h0,$g0
+ ldr $g0,[$nonce,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h1,$g1
+ ldr $g1,[$nonce,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h2,$g2
+ ldr $g2,[$nonce,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne $h3,$g3
+ ldr $g3,[$nonce,#12]
+
+ adds $h0,$h0,$g0
+ adcs $h1,$h1,$g1
+ adcs $h2,$h2,$g2
+ adc $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev $h0,$h0
+ rev $h1,$h1
+ rev $h2,$h2
+ rev $h3,$h3
+# endif
+ str $h0,[$mac,#0]
+ str $h1,[$mac,#4]
+ str $h2,[$mac,#8]
+ str $h3,[$mac,#12]
+#else
+ strb $h0,[$mac,#0]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#4]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#8]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#12]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#1]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#5]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#9]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#13]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#2]
+ mov $h0,$h0,lsr#8
+ strb $h1,[$mac,#6]
+ mov $h1,$h1,lsr#8
+ strb $h2,[$mac,#10]
+ mov $h2,$h2,lsr#8
+ strb $h3,[$mac,#14]
+ mov $h3,$h3,lsr#8
+
+ strb $h0,[$mac,#3]
+ strb $h1,[$mac,#7]
+ strb $h2,[$mac,#11]
+ strb $h3,[$mac,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ ret @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+___
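
poly1305_emit reduces the accumulator modulo 2^130 - 5 by computing h + 5
and testing whether the sum carried into bit 130 (the tst $g4,#4), then
adds the 128-bit nonce and discards the final carry. The same logic in
portable C, as an illustrative sketch only:

    #include <stdint.h>

    static void poly1305_emit_model(uint8_t mac[16], const uint32_t h[5],
                                    const uint32_t nonce[4])
    {
            uint32_t g[4];
            uint64_t t;
            int i;

            /* g = h + 5; a carry into bit 130 means h >= 2^130 - 5 */
            t = (uint64_t)h[0] + 5;
            g[0] = (uint32_t)t;
            for (i = 1; i < 4; i++) {
                    t = (t >> 32) + h[i];
                    g[i] = (uint32_t)t;
            }
            t = (t >> 32) + h[4];
            if (!((uint32_t)t & 4))         /* no carry: keep h as-is */
                    for (i = 0; i < 4; i++)
                            g[i] = h[i];

            /* mac = (g + nonce) mod 2^128, stored little-endian */
            t = 0;
            for (i = 0; i < 4; i++) {
                    t = (t >> 32) + (uint64_t)g[i] + nonce[i];
                    mac[4 * i + 0] = (uint8_t)t;
                    mac[4 * i + 1] = (uint8_t)(t >> 8);
                    mac[4 * i + 2] = (uint8_t)(t >> 16);
                    mac[4 * i + 3] = (uint8_t)(t >> 24);
            }
    }
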
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+ ldr r3,[$ctx,#48] @ first table element
+ cmp r3,#-1 @ is value impossible?
+ bne .Lno_init_neon
+
+ ldr r4,[$ctx,#20] @ load key base 2^32
+ ldr r5,[$ctx,#24]
+ ldr r6,[$ctx,#28]
+ ldr r7,[$ctx,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 $R0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 $R1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 $S1,r2
+ vdup.32 $R2,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 $S2,r3
+ vdup.32 $R3,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 $S3,r4
+ vdup.32 $R4,r6
+ vdup.32 $S4,r5
+
+ mov $zeros,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 $D0,$R0,${R0}[1]
+ vmull.u32 $D1,$R1,${R0}[1]
+ vmull.u32 $D2,$R2,${R0}[1]
+ vmull.u32 $D3,$R3,${R0}[1]
+ vmull.u32 $D4,$R4,${R0}[1]
+
+ vmlal.u32 $D0,$R4,${S1}[1]
+ vmlal.u32 $D1,$R0,${R1}[1]
+ vmlal.u32 $D2,$R1,${R1}[1]
+ vmlal.u32 $D3,$R2,${R1}[1]
+ vmlal.u32 $D4,$R3,${R1}[1]
+
+ vmlal.u32 $D0,$R3,${S2}[1]
+ vmlal.u32 $D1,$R4,${S2}[1]
+ vmlal.u32 $D3,$R1,${R2}[1]
+ vmlal.u32 $D2,$R0,${R2}[1]
+ vmlal.u32 $D4,$R2,${R2}[1]
+
+ vmlal.u32 $D0,$R2,${S3}[1]
+ vmlal.u32 $D3,$R0,${R3}[1]
+ vmlal.u32 $D1,$R3,${S3}[1]
+ vmlal.u32 $D2,$R4,${S3}[1]
+ vmlal.u32 $D4,$R1,${R3}[1]
+
+ vmlal.u32 $D3,$R4,${S4}[1]
+ vmlal.u32 $D0,$R1,${S4}[1]
+ vmlal.u32 $D1,$R2,${S4}[1]
+ vmlal.u32 $D2,$R3,${S4}[1]
+ vmlal.u32 $D4,$R0,${R4}[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at three times as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that the result of reduction has to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows using paddd in place of paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vbic.i32 $D4#lo,#0xfc000000
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vbic.i32 $D2#lo,#0xfc000000
+
+ vshr.u32 $T0#lo,$D0#lo,#26
+ vbic.i32 $D0#lo,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+
+ subs $zeros,$zeros,#1
+ beq .Lsquare_break_neon
+
+ add $tbl0,$ctx,#(48+0*9*4)
+ add $tbl1,$ctx,#(48+1*9*4)
+
+ vtrn.32 $R0,$D0#lo @ r^2:r^1
+ vtrn.32 $R2,$D2#lo
+ vtrn.32 $R3,$D3#lo
+ vtrn.32 $R1,$D1#lo
+ vtrn.32 $R4,$D4#lo
+
+ vshl.u32 $S2,$R2,#2 @ *5
+ vshl.u32 $S3,$R3,#2
+ vshl.u32 $S1,$R1,#2
+ vshl.u32 $S4,$R4,#2
+ vadd.i32 $S2,$S2,$R2
+ vadd.i32 $S1,$S1,$R1
+ vadd.i32 $S3,$S3,$R3
+ vadd.i32 $S4,$S4,$R4
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0,:32]
+ vst1.32 {${S4}[1]},[$tbl1,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add $tbl0,$ctx,#(48+2*4*9)
+ add $tbl1,$ctx,#(48+3*4*9)
+
+ vmov $R0,$D0#lo @ r^4:r^3
+ vshl.u32 $S1,$D1#lo,#2 @ *5
+ vmov $R1,$D1#lo
+ vshl.u32 $S2,$D2#lo,#2
+ vmov $R2,$D2#lo
+ vshl.u32 $S3,$D3#lo,#2
+ vmov $R3,$D3#lo
+ vshl.u32 $S4,$D4#lo,#2
+ vmov $R4,$D4#lo
+ vadd.i32 $S1,$S1,$D1#lo
+ vadd.i32 $S2,$S2,$D2#lo
+ vadd.i32 $S3,$S3,$D3#lo
+ vadd.i32 $S4,$S4,$D4#lo
+
+ vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+ vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+ vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vst1.32 {${S4}[0]},[$tbl0]
+ vst1.32 {${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+ ret @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr ip,[$ctx,#36] @ is_base2_26
+
+ cmp $len,#64
+ blo .Lpoly1305_blocks
+
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[$ctx,#0] @ load hash value base 2^32
+ ldr r5,[$ctx,#4]
+ ldr r6,[$ctx,#8]
+ ldr r7,[$ctx,#12]
+ ldr ip,[$ctx,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor $D0#lo,$D0#lo,$D0#lo
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor $D1#lo,$D1#lo,$D1#lo
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor $D2#lo,$D2#lo,$D2#lo
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor $D3#lo,$D3#lo,$D3#lo
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor $D4#lo,$D4#lo,$D4#lo
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[$ctx,#36] @ set is_base2_26
+
+ vmov.32 $D0#lo[0],r2
+ vmov.32 $D1#lo[0],r3
+ vmov.32 $D2#lo[0],r4
+ vmov.32 $D3#lo[0],r5
+ vmov.32 $D4#lo[0],r6
+ adr $zeros,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lhash_loaded
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor $D0#lo,$D0#lo,$D0#lo
+ veor $D1#lo,$D1#lo,$D1#lo
+ veor $D2#lo,$D2#lo,$D2#lo
+ veor $D3#lo,$D3#lo,$D3#lo
+ veor $D4#lo,$D4#lo,$D4#lo
+ vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ adr $zeros,.Lzeros
+ vld1.32 {$D4#lo[0]},[$ctx]
+ sub $ctx,$ctx,#16 @ rewind
+
+.Lhash_loaded:
+ add $in2,$inp,#32
+ mov $padbit,$padbit,lsl#24
+ tst $len,#31
+ beq .Leven
+
+ vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+ vmov.32 $H4#lo[0],$padbit
+ sub $len,$len,#16
+ add $in2,$inp,#32
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3#lo,$H3#lo,#18
+
+ vsri.u32 $H3#lo,$H2#lo,#14
+ vshl.u32 $H2#lo,$H2#lo,#12
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
+
+ vbic.i32 $H3#lo,#0xfc000000
+ vsri.u32 $H2#lo,$H1#lo,#20
+ vshl.u32 $H1#lo,$H1#lo,#6
+
+ vbic.i32 $H2#lo,#0xfc000000
+ vsri.u32 $H1#lo,$H0#lo,#26
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+
+ vbic.i32 $H0#lo,#0xfc000000
+ vbic.i32 $H1#lo,#0xfc000000
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo
+
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+
+ mov $tbl1,$zeros
+ add $tbl0,$ctx,#48
+
+ cmp $len,$len
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs $len,$len,#64
+ it lo
+ movlo $in2,$zeros
+
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+ itt hi
+ addhi $tbl1,$ctx,#(48+1*9*4)
+ addhi $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H3,$H3
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+# endif
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vshl.u32 $H3,$H3,#18
+
+ vsri.u32 $H3,$H2,#14
+ vshl.u32 $H2,$H2,#12
+
+ vbic.i32 $H3,#0xfc000000
+ vsri.u32 $H2,$H1,#20
+ vshl.u32 $H1,$H1,#6
+
+ vbic.i32 $H2,#0xfc000000
+ vsri.u32 $H1,$H0,#26
+
+ vbic.i32 $H0,#0xfc000000
+ vbic.i32 $H1,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ \___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ \___________________/ \____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
+ vmull.u32 $D2,$H2#hi,${R0}[1]
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,${R0}[1]
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,${R0}[1]
+ vmlal.u32 $D2,$H1#hi,${R1}[1]
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,${R0}[1]
+
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,${R0}[1]
+ subs $len,$len,#64
+ vmlal.u32 $D0,$H4#hi,${S1}[1]
+ it lo
+ movlo $in2,$zeros
+ vmlal.u32 $D3,$H2#hi,${R1}[1]
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D1,$H0#hi,${R1}[1]
+ vmlal.u32 $D4,$H3#hi,${R1}[1]
+
+ vmlal.u32 $D0,$H3#hi,${S2}[1]
+ vmlal.u32 $D3,$H1#hi,${R2}[1]
+ vmlal.u32 $D4,$H2#hi,${R2}[1]
+ vmlal.u32 $D1,$H4#hi,${S2}[1]
+ vmlal.u32 $D2,$H0#hi,${R2}[1]
+
+ vmlal.u32 $D3,$H0#hi,${R3}[1]
+ vmlal.u32 $D0,$H2#hi,${S3}[1]
+ vmlal.u32 $D4,$H1#hi,${R3}[1]
+ vmlal.u32 $D1,$H3#hi,${S3}[1]
+ vmlal.u32 $D2,$H4#hi,${S3}[1]
+
+ vmlal.u32 $D3,$H4#hi,${S4}[1]
+ vmlal.u32 $D0,$H1#hi,${S4}[1]
+ vmlal.u32 $D4,$H0#hi,${R4}[1]
+ vmlal.u32 $D1,$H2#hi,${S4}[1]
+ vmlal.u32 $D2,$H3#hi,${S4}[1]
+
+ vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
+ add $in2,$in2,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 $D3,$H3#lo,${R0}[0]
+ vmlal.u32 $D0,$H0#lo,${R0}[0]
+ vmlal.u32 $D4,$H4#lo,${R0}[0]
+ vmlal.u32 $D1,$H1#lo,${R0}[0]
+ vmlal.u32 $D2,$H2#lo,${R0}[0]
+ vld1.32 ${S4}[0],[$tbl0,:32]
+
+ vmlal.u32 $D3,$H2#lo,${R1}[0]
+ vmlal.u32 $D0,$H4#lo,${S1}[0]
+ vmlal.u32 $D4,$H3#lo,${R1}[0]
+ vmlal.u32 $D1,$H0#lo,${R1}[0]
+ vmlal.u32 $D2,$H1#lo,${R1}[0]
+
+ vmlal.u32 $D3,$H1#lo,${R2}[0]
+ vmlal.u32 $D0,$H3#lo,${S2}[0]
+ vmlal.u32 $D4,$H2#lo,${R2}[0]
+ vmlal.u32 $D1,$H4#lo,${S2}[0]
+ vmlal.u32 $D2,$H0#lo,${R2}[0]
+
+ vmlal.u32 $D3,$H0#lo,${R3}[0]
+ vmlal.u32 $D0,$H2#lo,${S3}[0]
+ vmlal.u32 $D4,$H1#lo,${R3}[0]
+ vmlal.u32 $D1,$H3#lo,${S3}[0]
+ vmlal.u32 $D3,$H4#lo,${S4}[0]
+
+ vmlal.u32 $D2,$H4#lo,${S3}[0]
+ vmlal.u32 $D0,$H1#lo,${S4}[0]
+ vmlal.u32 $D4,$H0#lo,${R4}[0]
+ vmov.i32 $H4,#1<<24 @ padbit, yes, always
+ vmlal.u32 $D1,$H2#lo,${S4}[0]
+ vmlal.u32 $D2,$H3#lo,${S4}[0]
+
+ vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
+ add $inp,$inp,#64
+# ifdef __ARMEB__
+ vrev32.8 $H0,$H0
+ vrev32.8 $H1,$H1
+ vrev32.8 $H2,$H2
+ vrev32.8 $H3,$H3
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+ vshr.u64 $T0,$D3,#26
+ vmovn.i64 $D3#lo,$D3
+ vshr.u64 $T1,$D0,#26
+ vmovn.i64 $D0#lo,$D0
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vbic.i32 $D3#lo,#0xfc000000
+ vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+ vshl.u32 $H3,$H3,#18
+ vbic.i32 $D0#lo,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D4,#26
+ vmovn.i64 $D4#lo,$D4
+ vshr.u64 $T1,$D1,#26
+ vmovn.i64 $D1#lo,$D1
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+ vsri.u32 $H3,$H2,#14
+ vbic.i32 $D4#lo,#0xfc000000
+ vshl.u32 $H2,$H2,#12
+ vbic.i32 $D1#lo,#0xfc000000
+
+ vadd.i32 $D0#lo,$D0#lo,$T0#lo
+ vshl.u32 $T0#lo,$T0#lo,#2
+ vbic.i32 $H3,#0xfc000000
+ vshrn.u64 $T1#lo,$D2,#26
+ vmovn.i64 $D2#lo,$D2
+ vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec]
+ vsri.u32 $H2,$H1,#20
+ vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
+ vshl.u32 $H1,$H1,#6
+ vbic.i32 $D2#lo,#0xfc000000
+ vbic.i32 $H2,#0xfc000000
+
+ vshrn.u64 $T0#lo,$D0,#26 @ re-narrow
+ vmovn.i64 $D0#lo,$D0
+ vsri.u32 $H1,$H0,#26
+ vbic.i32 $H0,#0xfc000000
+ vshr.u32 $T1#lo,$D3#lo,#26
+ vbic.i32 $D3#lo,#0xfc000000
+ vbic.i32 $D0#lo,#0xfc000000
+ vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
+ vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
+ vbic.i32 $H1,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add $tbl1,$ctx,#(48+0*9*4)
+ add $tbl0,$ctx,#(48+1*9*4)
+ adds $len,$len,#32
+ it ne
+ movne $len,#0
+ bne .Long_tail
+
+ vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
+ vadd.i32 $H0#hi,$H0#lo,$D0#lo
+ vadd.i32 $H3#hi,$H3#lo,$D3#lo
+ vadd.i32 $H1#hi,$H1#lo,$D1#lo
+ vadd.i32 $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
+
+ vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
+ vmull.u32 $D2,$H2#hi,$R0
+ vadd.i32 $H0#lo,$H0#lo,$D0#lo
+ vmull.u32 $D0,$H0#hi,$R0
+ vadd.i32 $H3#lo,$H3#lo,$D3#lo
+ vmull.u32 $D3,$H3#hi,$R0
+ vadd.i32 $H1#lo,$H1#lo,$D1#lo
+ vmull.u32 $D1,$H1#hi,$R0
+ vadd.i32 $H4#lo,$H4#lo,$D4#lo
+ vmull.u32 $D4,$H4#hi,$R0
+
+ vmlal.u32 $D0,$H4#hi,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#hi,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#hi,$R1
+ vmlal.u32 $D4,$H3#hi,$R1
+ vmlal.u32 $D2,$H1#hi,$R1
+
+ vmlal.u32 $D3,$H1#hi,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#hi,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#hi,$R2
+ vmlal.u32 $D1,$H4#hi,$S2
+ vmlal.u32 $D2,$H0#hi,$R2
+
+ vmlal.u32 $D3,$H0#hi,$R3
+ it ne
+ addne $tbl1,$ctx,#(48+2*9*4)
+ vmlal.u32 $D0,$H2#hi,$S3
+ it ne
+ addne $tbl0,$ctx,#(48+3*9*4)
+ vmlal.u32 $D4,$H1#hi,$R3
+ vmlal.u32 $D1,$H3#hi,$S3
+ vmlal.u32 $D2,$H4#hi,$S3
+
+ vmlal.u32 $D3,$H4#hi,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
+ vmlal.u32 $D0,$H1#hi,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#hi,$R4
+ vmlal.u32 $D1,$H2#hi,$S4
+ vmlal.u32 $D2,$H3#hi,$S4
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
+ vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
+
+ vmlal.u32 $D2,$H2#lo,$R0
+ vmlal.u32 $D0,$H0#lo,$R0
+ vmlal.u32 $D3,$H3#lo,$R0
+ vmlal.u32 $D1,$H1#lo,$R0
+ vmlal.u32 $D4,$H4#lo,$R0
+
+ vmlal.u32 $D0,$H4#lo,$S1
+ vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+ vmlal.u32 $D3,$H2#lo,$R1
+ vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+ vmlal.u32 $D1,$H0#lo,$R1
+ vmlal.u32 $D4,$H3#lo,$R1
+ vmlal.u32 $D2,$H1#lo,$R1
+
+ vmlal.u32 $D3,$H1#lo,$R2
+ vld1.32 ${S4}[1],[$tbl1,:32]
+ vmlal.u32 $D0,$H3#lo,$S2
+ vld1.32 ${S4}[0],[$tbl0,:32]
+ vmlal.u32 $D4,$H2#lo,$R2
+ vmlal.u32 $D1,$H4#lo,$S2
+ vmlal.u32 $D2,$H0#lo,$R2
+
+ vmlal.u32 $D3,$H0#lo,$R3
+ vmlal.u32 $D0,$H2#lo,$S3
+ vmlal.u32 $D4,$H1#lo,$R3
+ vmlal.u32 $D1,$H3#lo,$S3
+ vmlal.u32 $D2,$H4#lo,$S3
+
+ vmlal.u32 $D3,$H4#lo,$S4
+ vorn $MASK,$MASK,$MASK @ all-ones
+ vmlal.u32 $D0,$H1#lo,$S4
+ vshr.u64 $MASK,$MASK,#38
+ vmlal.u32 $D4,$H0#lo,$R4
+ vmlal.u32 $D1,$H2#lo,$S4
+ vmlal.u32 $D2,$H3#lo,$S4
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 $D3#lo,$D3#lo,$D3#hi
+ vadd.i64 $D0#lo,$D0#lo,$D0#hi
+ vadd.i64 $D4#lo,$D4#lo,$D4#hi
+ vadd.i64 $D1#lo,$D1#lo,$D1#hi
+ vadd.i64 $D2#lo,$D2#lo,$D2#hi
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 $T0,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vshr.u64 $T1,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vadd.i64 $D4,$D4,$T0 @ h3 -> h4
+ vadd.i64 $D1,$D1,$T1 @ h0 -> h1
+
+ vshr.u64 $T0,$D4,#26
+ vand.i64 $D4,$D4,$MASK
+ vshr.u64 $T1,$D1,#26
+ vand.i64 $D1,$D1,$MASK
+ vadd.i64 $D2,$D2,$T1 @ h1 -> h2
+
+ vadd.i64 $D0,$D0,$T0
+ vshl.u64 $T0,$T0,#2
+ vshr.u64 $T1,$D2,#26
+ vand.i64 $D2,$D2,$MASK
+ vadd.i64 $D0,$D0,$T0 @ h4 -> h0
+ vadd.i64 $D3,$D3,$T1 @ h2 -> h3
+
+ vshr.u64 $T0,$D0,#26
+ vand.i64 $D0,$D0,$MASK
+ vshr.u64 $T1,$D3,#26
+ vand.i64 $D3,$D3,$MASK
+ vadd.i64 $D1,$D1,$T0 @ h0 -> h1
+ vadd.i64 $D4,$D4,$T1 @ h3 -> h4
+
+ cmp $len,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+ vst1.32 {$D4#lo[0]},[$ctx]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+ ret @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
+.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+___
+} }
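
The .Loop_neon comment above describes two interleaved Horner evaluations:
even-numbered blocks are folded with r^2 per step in one lane, odd-numbered
blocks in the other, and one lane carries an extra factor of r when the two
are joined. A self-contained scalar model over a toy prime (the real code
does this on 26-bit limbs spread across NEON lanes; all names here are
invented for the demo):

    #include <stdint.h>
    #include <stdio.h>

    #define P 1000003ULL    /* toy modulus standing in for 2^130 - 5 */

    static uint64_t mulmod(uint64_t a, uint64_t b) { return a * b % P; }
    static uint64_t addmod(uint64_t a, uint64_t b) { return (a + b) % P; }

    /* Reference: plain Horner, one block at a time. */
    static uint64_t horner(const uint64_t *m, int n, uint64_t r)
    {
            uint64_t h = 0;

            for (int i = 0; i < n; i++)
                    h = mulmod(addmod(h, m[i]), r);
            return h;
    }

    /* Two-lane variant, n even: both lanes step by r^2; the even lane
     * finishes on r^2 and the odd lane on r, as in the diagram above. */
    static uint64_t two_lane(const uint64_t *m, int n, uint64_t r)
    {
            uint64_t r2 = mulmod(r, r), a = 0, b = 0;

            for (int i = 0; i < n - 2; i += 2) {
                    a = mulmod(addmod(a, m[i]), r2);     /* blocks 0,2,.. */
                    b = mulmod(addmod(b, m[i + 1]), r2); /* blocks 1,3,.. */
            }
            a = mulmod(addmod(a, m[n - 2]), r2);
            b = mulmod(addmod(b, m[n - 1]), r);
            return addmod(a, b);
    }

    int main(void)
    {
            uint64_t m[4] = { 11, 22, 33, 44 };

            printf("%llu == %llu\n",
                   (unsigned long long)horner(m, 4, 12345),
                   (unsigned long long)two_lane(m, 4, 12345));
            return 0;
    }
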
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+close STDOUT; # enforce flush
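
The foreach loop above post-processes the generated source for kernel use.
The q<N>#lo/#hi substitution relies on the NEON register file layout, in
which quadword qN aliases doublewords d(2N) (low half) and d(2N+1) (high
half); that is why, for instance, $D0 = q5 appears as d10/d11 in the
shipped file below. A trivial sketch of the mapping the regex computes:

    #include <stdio.h>

    int main(void)
    {
            for (int q = 0; q < 16; q++)
                    printf("q%d#lo -> d%d, q%d#hi -> d%d\n",
                           q, 2 * q, q, 2 * q + 1);
            return 0;
    }
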
--- /dev/null
+++ b/arch/arm/crypto/poly1305-core.S_shipped
@@ -0,0 +1,1158 @@
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init poly1305_init_arm
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit poly1305_emit_arm
+.globl poly1305_blocks_neon
+#endif
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+ stmdb sp!,{r4-r11}
+
+ eor r3,r3,r3
+ cmp r1,#0
+ str r3,[r0,#0] @ zero hash value
+ str r3,[r0,#4]
+ str r3,[r0,#8]
+ str r3,[r0,#12]
+ str r3,[r0,#16]
+ str r3,[r0,#36] @ clear is_base2_26
+ add r0,r0,#20
+
+#ifdef __thumb2__
+ it eq
+#endif
+ moveq r0,#0
+ beq .Lno_key
+
+#if __ARM_MAX_ARCH__>=7
+ mov r3,#-1
+ str r3,[r0,#28] @ impossible key power value
+# ifndef __KERNEL__
+ adr r11,.Lpoly1305_init
+ ldr r12,.LOPENSSL_armcap
+# endif
+#endif
+ ldrb r4,[r1,#0]
+ mov r10,#0x0fffffff
+ ldrb r5,[r1,#1]
+ and r3,r10,#-4 @ 0x0ffffffc
+ ldrb r6,[r1,#2]
+ ldrb r7,[r1,#3]
+ orr r4,r4,r5,lsl#8
+ ldrb r5,[r1,#4]
+ orr r4,r4,r6,lsl#16
+ ldrb r6,[r1,#5]
+ orr r4,r4,r7,lsl#24
+ ldrb r7,[r1,#6]
+ and r4,r4,r10
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+ ldr r12,[r11,r12] @ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+ ldr r12,[r12]
+# endif
+#endif
+ ldrb r8,[r1,#7]
+ orr r5,r5,r6,lsl#8
+ ldrb r6,[r1,#8]
+ orr r5,r5,r7,lsl#16
+ ldrb r7,[r1,#9]
+ orr r5,r5,r8,lsl#24
+ ldrb r8,[r1,#10]
+ and r5,r5,r3
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ tst r12,#ARMV7_NEON @ check for NEON
+# ifdef __thumb2__
+ adr r9,.Lpoly1305_blocks_neon
+ adr r11,.Lpoly1305_blocks
+ it ne
+ movne r11,r9
+ adr r12,.Lpoly1305_emit
+ orr r11,r11,#1 @ thumb-ify addresses
+ orr r12,r12,#1
+# else
+ add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+ ite eq
+ addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+ addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+ ldrb r9,[r1,#11]
+ orr r6,r6,r7,lsl#8
+ ldrb r7,[r1,#12]
+ orr r6,r6,r8,lsl#16
+ ldrb r8,[r1,#13]
+ orr r6,r6,r9,lsl#24
+ ldrb r9,[r1,#14]
+ and r6,r6,r3
+
+ ldrb r10,[r1,#15]
+ orr r7,r7,r8,lsl#8
+ str r4,[r0,#0]
+ orr r7,r7,r9,lsl#16
+ str r5,[r0,#4]
+ orr r7,r7,r10,lsl#24
+ str r6,[r0,#8]
+ and r7,r7,r3
+ str r7,[r0,#12]
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ stmia r2,{r11,r12} @ fill functions table
+ mov r0,#1
+#else
+ mov r0,#0
+#endif
+.Lno_key:
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_init,.-poly1305_init
+.type poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+ stmdb sp!,{r3-r11,lr}
+
+ ands r2,r2,#-16
+ beq .Lno_data
+
+ add r2,r2,r1 @ end pointer
+ sub sp,sp,#32
+
+#if __ARM_ARCH__<7
+ ldmia r0,{r4-r12} @ load context
+ add r0,r0,#20
+ str r2,[sp,#16] @ offload stuff
+ str r0,[sp,#12]
+#else
+ ldr lr,[r0,#36] @ is_base2_26
+ ldmia r0!,{r4-r8} @ load hash value
+ str r2,[sp,#16] @ offload stuff
+ str r0,[sp,#12]
+
+ adds r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
+ mov r10,r5,lsr#6
+ adcs r10,r10,r6,lsl#20
+ mov r11,r6,lsr#12
+ adcs r11,r11,r7,lsl#14
+ mov r12,r7,lsr#18
+ adcs r12,r12,r8,lsl#8
+ mov r2,#0
+ teq lr,#0
+ str r2,[r0,#16] @ clear is_base2_26
+ adc r2,r2,r8,lsr#24
+
+ itttt ne
+ movne r4,r9 @ choose between radixes
+ movne r5,r10
+ movne r6,r11
+ movne r7,r12
+ ldmia r0,{r9-r12} @ load key
+ it ne
+ movne r8,r2
+#endif
+
+ mov lr,r1
+ cmp r3,#0
+ str r10,[sp,#20]
+ str r11,[sp,#24]
+ str r12,[sp,#28]
+ b .Loop
+
+.align 4
+.Loop:
+#if __ARM_ARCH__<7
+ ldrb r0,[lr],#16 @ load input
+# ifdef __thumb2__
+ it hi
+# endif
+ addhi r8,r8,#1 @ 1<<128
+ ldrb r1,[lr,#-15]
+ ldrb r2,[lr,#-14]
+ ldrb r3,[lr,#-13]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-12]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-11]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-10]
+ adds r4,r4,r3 @ accumulate input
+
+ ldrb r3,[lr,#-9]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-8]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-7]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-6]
+ adcs r5,r5,r3
+
+ ldrb r3,[lr,#-5]
+ orr r1,r0,r1,lsl#8
+ ldrb r0,[lr,#-4]
+ orr r2,r1,r2,lsl#16
+ ldrb r1,[lr,#-3]
+ orr r3,r2,r3,lsl#24
+ ldrb r2,[lr,#-2]
+ adcs r6,r6,r3
+
+ ldrb r3,[lr,#-1]
+ orr r1,r0,r1,lsl#8
+ str lr,[sp,#8] @ offload input pointer
+ orr r2,r1,r2,lsl#16
+ add r10,r10,r10,lsr#2
+ orr r3,r2,r3,lsl#24
+#else
+ ldr r0,[lr],#16 @ load input
+ it hi
+ addhi r8,r8,#1 @ padbit
+ ldr r1,[lr,#-12]
+ ldr r2,[lr,#-8]
+ ldr r3,[lr,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ adds r4,r4,r0 @ accumulate input
+ str lr,[sp,#8] @ offload input pointer
+ adcs r5,r5,r1
+ add r10,r10,r10,lsr#2
+ adcs r6,r6,r2
+#endif
+ add r11,r11,r11,lsr#2
+ adcs r7,r7,r3
+ add r12,r12,r12,lsr#2
+
+ umull r2,r3,r5,r9
+ adc r8,r8,#0
+ umull r0,r1,r4,r9
+ umlal r2,r3,r8,r10
+ umlal r0,r1,r7,r10
+ ldr r10,[sp,#20] @ reload r10
+ umlal r2,r3,r6,r12
+ umlal r0,r1,r5,r12
+ umlal r2,r3,r7,r11
+ umlal r0,r1,r6,r11
+ umlal r2,r3,r4,r10
+ str r0,[sp,#0] @ future r4
+ mul r0,r11,r8
+ ldr r11,[sp,#24] @ reload r11
+ adds r2,r2,r1 @ d1+=d0>>32
+ eor r1,r1,r1
+ adc lr,r3,#0 @ future r6
+ str r2,[sp,#4] @ future r5
+
+ mul r2,r12,r8
+ eor r3,r3,r3
+ umlal r0,r1,r7,r12
+ ldr r12,[sp,#28] @ reload r12
+ umlal r2,r3,r7,r9
+ umlal r0,r1,r6,r9
+ umlal r2,r3,r6,r10
+ umlal r0,r1,r5,r10
+ umlal r2,r3,r5,r11
+ umlal r0,r1,r4,r11
+ umlal r2,r3,r4,r12
+ ldr r4,[sp,#0]
+ mul r8,r9,r8
+ ldr r5,[sp,#4]
+
+ adds r6,lr,r0 @ d2+=d1>>32
+ ldr lr,[sp,#8] @ reload input pointer
+ adc r1,r1,#0
+ adds r7,r2,r1 @ d3+=d2>>32
+ ldr r0,[sp,#16] @ reload end pointer
+ adc r3,r3,#0
+ add r8,r8,r3 @ h4+=d3>>32
+
+ and r1,r8,#-4
+ and r8,r8,#3
+ add r1,r1,r1,lsr#2 @ *=5
+ adds r4,r4,r1
+ adcs r5,r5,#0
+ adcs r6,r6,#0
+ adcs r7,r7,#0
+ adc r8,r8,#0
+
+ cmp r0,lr @ done yet?
+ bhi .Loop
+
+ ldr r0,[sp,#12]
+ add sp,sp,#32
+ stmdb r0,{r4-r8} @ store the result
+
+.Lno_data:
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r3-r11,pc}
+#else
+ ldmia sp!,{r3-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_blocks,.-poly1305_blocks
+.type poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+ stmdb sp!,{r4-r11}
+
+ ldmia r0,{r3-r7}
+
+#if __ARM_ARCH__>=7
+ ldr ip,[r0,#36] @ is_base2_26
+
+ adds r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
+ mov r9,r4,lsr#6
+ adcs r9,r9,r5,lsl#20
+ mov r10,r5,lsr#12
+ adcs r10,r10,r6,lsl#14
+ mov r11,r6,lsr#18
+ adcs r11,r11,r7,lsl#8
+ mov r0,#0
+ adc r0,r0,r7,lsr#24
+
+ tst ip,ip
+ itttt ne
+ movne r3,r8
+ movne r4,r9
+ movne r5,r10
+ movne r6,r11
+ it ne
+ movne r7,r0
+#endif
+
+ adds r8,r3,#5 @ compare to modulus
+ adcs r9,r4,#0
+ adcs r10,r5,#0
+ adcs r11,r6,#0
+ adc r0,r7,#0
+ tst r0,#4 @ did it carry/borrow?
+
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r3,r8
+ ldr r8,[r2,#0]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r4,r9
+ ldr r9,[r2,#4]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r5,r10
+ ldr r10,[r2,#8]
+#ifdef __thumb2__
+ it ne
+#endif
+ movne r6,r11
+ ldr r11,[r2,#12]
+
+ adds r3,r3,r8
+ adcs r4,r4,r9
+ adcs r5,r5,r10
+ adc r6,r6,r11
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+# endif
+ str r3,[r1,#0]
+ str r4,[r1,#4]
+ str r5,[r1,#8]
+ str r6,[r1,#12]
+#else
+ strb r3,[r1,#0]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#4]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#8]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#12]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#1]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#5]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#9]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#13]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#2]
+ mov r3,r3,lsr#8
+ strb r4,[r1,#6]
+ mov r4,r4,lsr#8
+ strb r5,[r1,#10]
+ mov r5,r5,lsr#8
+ strb r6,[r1,#14]
+ mov r6,r6,lsr#8
+
+ strb r3,[r1,#3]
+ strb r4,[r1,#7]
+ strb r5,[r1,#11]
+ strb r6,[r1,#15]
+#endif
+ ldmia sp!,{r4-r11}
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ .word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size poly1305_emit,.-poly1305_emit
+#if __ARM_MAX_ARCH__>=7
+.fpu neon
+
+.type poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+ ldr r3,[r0,#48] @ first table element
+ cmp r3,#-1 @ is value impossible?
+ bne .Lno_init_neon
+
+ ldr r4,[r0,#20] @ load key base 2^32
+ ldr r5,[r0,#24]
+ ldr r6,[r0,#28]
+ ldr r7,[r0,#32]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ and r3,r3,#0x03ffffff
+ and r4,r4,#0x03ffffff
+ and r5,r5,#0x03ffffff
+
+ vdup.32 d0,r2 @ r^1 in both lanes
+ add r2,r3,r3,lsl#2 @ *5
+ vdup.32 d1,r3
+ add r3,r4,r4,lsl#2
+ vdup.32 d2,r2
+ vdup.32 d3,r4
+ add r4,r5,r5,lsl#2
+ vdup.32 d4,r3
+ vdup.32 d5,r5
+ add r5,r6,r6,lsl#2
+ vdup.32 d6,r4
+ vdup.32 d7,r6
+ vdup.32 d8,r5
+
+ mov r5,#2 @ counter
+
+.Lsquare_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+
+ vmull.u32 q5,d0,d0[1]
+ vmull.u32 q6,d1,d0[1]
+ vmull.u32 q7,d3,d0[1]
+ vmull.u32 q8,d5,d0[1]
+ vmull.u32 q9,d7,d0[1]
+
+ vmlal.u32 q5,d7,d2[1]
+ vmlal.u32 q6,d0,d1[1]
+ vmlal.u32 q7,d1,d1[1]
+ vmlal.u32 q8,d3,d1[1]
+ vmlal.u32 q9,d5,d1[1]
+
+ vmlal.u32 q5,d5,d4[1]
+ vmlal.u32 q6,d7,d4[1]
+ vmlal.u32 q8,d1,d3[1]
+ vmlal.u32 q7,d0,d3[1]
+ vmlal.u32 q9,d3,d3[1]
+
+ vmlal.u32 q5,d3,d6[1]
+ vmlal.u32 q8,d0,d5[1]
+ vmlal.u32 q6,d5,d6[1]
+ vmlal.u32 q7,d7,d6[1]
+ vmlal.u32 q9,d1,d5[1]
+
+ vmlal.u32 q8,d7,d8[1]
+ vmlal.u32 q5,d1,d8[1]
+ vmlal.u32 q6,d3,d8[1]
+ vmlal.u32 q7,d5,d8[1]
+ vmlal.u32 q9,d0,d7[1]
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ @ and P. Schwabe
+ @
+ @ H0>>+H1>>+H2>>+H3>>+H4
+ @ H3>>+H4>>*5+H0>>+H1
+ @
+ @ Trivia.
+ @
+ @ Result of multiplication of n-bit number by m-bit number is
+ @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+ @ m-bit number multiplied by 2^n is still n+m bits wide.
+ @
+ @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+ @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+ @ one is n+1 bits wide.
+ @
+ @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+ @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+ @ can be 27. However! In cases when their width exceeds 26 bits
+ @ they are limited by 2^26+2^6. This in turn means that *sum*
+ @ of the products with these values can still be viewed as sum
+ @ of 52-bit numbers as long as the amount of addends is not a
+ @ power of 2. For example,
+ @
+ @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+ @
+ @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+ @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+ @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+ @ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at three times as many addends...
+ @
+ @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+ @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+ @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+ @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+ @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that the result of reduction has to be compressed upon
+ @ loop wrap-around. This can be done in the process of reduction
+ @ to minimize amount of instructions [as well as amount of
+ @ 128-bit instructions, which benefits low-end processors], but
+ @ one has to watch for H2 (which is narrower than H0) and 5*H4
+ @ not being wider than 58 bits, so that result of right shift
+ @ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows using paddd in place of paddq, which
+ @ benefits Atom, where paddq is ridiculously slow.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vbic.i32 d18,#0xfc000000
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vadd.i32 d10,d10,d30 @ h4 -> h0
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vbic.i32 d14,#0xfc000000
+
+ vshr.u32 d30,d10,#26
+ vbic.i32 d10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+
+ subs r5,r5,#1
+ beq .Lsquare_break_neon
+
+ add r6,r0,#(48+0*9*4)
+ add r7,r0,#(48+1*9*4)
+
+ vtrn.32 d0,d10 @ r^2:r^1
+ vtrn.32 d3,d14
+ vtrn.32 d5,d16
+ vtrn.32 d1,d12
+ vtrn.32 d7,d18
+
+ vshl.u32 d4,d3,#2 @ *5
+ vshl.u32 d6,d5,#2
+ vshl.u32 d2,d1,#2
+ vshl.u32 d8,d7,#2
+ vadd.i32 d4,d4,d3
+ vadd.i32 d2,d2,d1
+ vadd.i32 d6,d6,d5
+ vadd.i32 d8,d8,d7
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6,:32]
+ vst1.32 {d8[1]},[r7,:32]
+
+ b .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+ add r6,r0,#(48+2*4*9)
+ add r7,r0,#(48+3*4*9)
+
+ vmov d0,d10 @ r^4:r^3
+ vshl.u32 d2,d12,#2 @ *5
+ vmov d1,d12
+ vshl.u32 d4,d14,#2
+ vmov d3,d14
+ vshl.u32 d6,d16,#2
+ vmov d5,d16
+ vshl.u32 d8,d18,#2
+ vmov d7,d18
+ vadd.i32 d2,d2,d12
+ vadd.i32 d4,d4,d14
+ vadd.i32 d6,d6,d16
+ vadd.i32 d8,d8,d18
+
+ vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
+ vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
+ vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vst1.32 {d8[0]},[r6]
+ vst1.32 {d8[1]},[r7]
+
+.Lno_init_neon:
+ bx lr @ bx lr
+.size poly1305_init_neon,.-poly1305_init_neon
+
+.type poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+ ldr ip,[r0,#36] @ is_base2_26
+
+ cmp r2,#64
+ blo .Lpoly1305_blocks
+
+ stmdb sp!,{r4-r7}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+
+ tst ip,ip @ is_base2_26?
+ bne .Lbase2_26_neon
+
+ stmdb sp!,{r1-r3,lr}
+ bl .Lpoly1305_init_neon
+
+ ldr r4,[r0,#0] @ load hash value base 2^32
+ ldr r5,[r0,#4]
+ ldr r6,[r0,#8]
+ ldr r7,[r0,#12]
+ ldr ip,[r0,#16]
+
+ and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
+ mov r3,r4,lsr#26
+ veor d10,d10,d10
+ mov r4,r5,lsr#20
+ orr r3,r3,r5,lsl#6
+ veor d12,d12,d12
+ mov r5,r6,lsr#14
+ orr r4,r4,r6,lsl#12
+ veor d14,d14,d14
+ mov r6,r7,lsr#8
+ orr r5,r5,r7,lsl#18
+ veor d16,d16,d16
+ and r3,r3,#0x03ffffff
+ orr r6,r6,ip,lsl#24
+ veor d18,d18,d18
+ and r4,r4,#0x03ffffff
+ mov r1,#1
+ and r5,r5,#0x03ffffff
+ str r1,[r0,#36] @ set is_base2_26
+
+ vmov.32 d10[0],r2
+ vmov.32 d12[0],r3
+ vmov.32 d14[0],r4
+ vmov.32 d16[0],r5
+ vmov.32 d18[0],r6
+ adr r5,.Lzeros
+
+ ldmia sp!,{r1-r3,lr}
+ b .Lhash_loaded
+
+.align 4
+.Lbase2_26_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ load hash value
+
+ veor d10,d10,d10
+ veor d12,d12,d12
+ veor d14,d14,d14
+ veor d16,d16,d16
+ veor d18,d18,d18
+ vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ adr r5,.Lzeros
+ vld1.32 {d18[0]},[r0]
+ sub r0,r0,#16 @ rewind
+
+.Lhash_loaded:
+ add r4,r1,#32
+ mov r3,r3,lsl#24
+ tst r2,#31
+ beq .Leven
+
+ vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
+ vmov.32 d28[0],r3
+ sub r2,r2,#16
+ add r4,r1,#32
+
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+# endif
+ vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
+ vshl.u32 d26,d26,#18
+
+ vsri.u32 d26,d24,#14
+ vshl.u32 d24,d24,#12
+ vadd.i32 d29,d28,d18 @ add hash value and move to #hi
+
+ vbic.i32 d26,#0xfc000000
+ vsri.u32 d24,d22,#20
+ vshl.u32 d22,d22,#6
+
+ vbic.i32 d24,#0xfc000000
+ vsri.u32 d22,d20,#26
+ vadd.i32 d27,d26,d16
+
+ vbic.i32 d20,#0xfc000000
+ vbic.i32 d22,#0xfc000000
+ vadd.i32 d25,d24,d14
+
+ vadd.i32 d21,d20,d10
+ vadd.i32 d23,d22,d12
+
+ mov r7,r5
+ add r6,r0,#48
+
+ cmp r2,r2
+ b .Long_tail
+
+.align 4
+.Leven:
+ subs r2,r2,#64
+ it lo
+ movlo r4,r5
+
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+ itt hi
+ addhi r7,r0,#(48+1*9*4)
+ addhi r6,r0,#(48+3*9*4)
+
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q13,q13
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+# endif
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vshl.u32 q13,q13,#18
+
+ vsri.u32 q13,q12,#14
+ vshl.u32 q12,q12,#12
+
+ vbic.i32 q13,#0xfc000000
+ vsri.u32 q12,q11,#20
+ vshl.u32 q11,q11,#6
+
+ vbic.i32 q12,#0xfc000000
+ vsri.u32 q11,q10,#26
+
+ vbic.i32 q10,#0xfc000000
+ vbic.i32 q11,#0xfc000000
+
+ bls .Lskip_loop
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ b .Loop_neon
+
+.align 5
+.Loop_neon:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ @ ___________________/
+ @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ @ ___________________/ ____________________/
+ @
+ @ Note that we start with inp[2:3]*r^2. This is because it
+ @ doesn't depend on reduction in previous iteration.
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ inp[2:3]*r^2
+
+ vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
+ vmull.u32 q7,d25,d0[1]
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0[1]
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0[1]
+ vmlal.u32 q7,d23,d1[1]
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0[1]
+
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0[1]
+ subs r2,r2,#64
+ vmlal.u32 q5,d29,d2[1]
+ it lo
+ movlo r4,r5
+ vmlal.u32 q8,d25,d1[1]
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q6,d21,d1[1]
+ vmlal.u32 q9,d27,d1[1]
+
+ vmlal.u32 q5,d27,d4[1]
+ vmlal.u32 q8,d23,d3[1]
+ vmlal.u32 q9,d25,d3[1]
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d21,d3[1]
+
+ vmlal.u32 q8,d21,d5[1]
+ vmlal.u32 q5,d25,d6[1]
+ vmlal.u32 q9,d23,d5[1]
+ vmlal.u32 q6,d27,d6[1]
+ vmlal.u32 q7,d29,d6[1]
+
+ vmlal.u32 q8,d29,d8[1]
+ vmlal.u32 q5,d23,d8[1]
+ vmlal.u32 q9,d21,d7[1]
+ vmlal.u32 q6,d25,d8[1]
+ vmlal.u32 q7,d27,d8[1]
+
+ vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
+ add r4,r4,#64
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4 and accumulate
+
+ vmlal.u32 q8,d26,d0[0]
+ vmlal.u32 q5,d20,d0[0]
+ vmlal.u32 q9,d28,d0[0]
+ vmlal.u32 q6,d22,d0[0]
+ vmlal.u32 q7,d24,d0[0]
+ vld1.32 d8[0],[r6,:32]
+
+ vmlal.u32 q8,d24,d1[0]
+ vmlal.u32 q5,d28,d2[0]
+ vmlal.u32 q9,d26,d1[0]
+ vmlal.u32 q6,d20,d1[0]
+ vmlal.u32 q7,d22,d1[0]
+
+ vmlal.u32 q8,d22,d3[0]
+ vmlal.u32 q5,d26,d4[0]
+ vmlal.u32 q9,d24,d3[0]
+ vmlal.u32 q6,d28,d4[0]
+ vmlal.u32 q7,d20,d3[0]
+
+ vmlal.u32 q8,d20,d5[0]
+ vmlal.u32 q5,d24,d6[0]
+ vmlal.u32 q9,d22,d5[0]
+ vmlal.u32 q6,d26,d6[0]
+ vmlal.u32 q8,d28,d8[0]
+
+ vmlal.u32 q7,d28,d6[0]
+ vmlal.u32 q5,d22,d8[0]
+ vmlal.u32 q9,d20,d7[0]
+ vmov.i32 q14,#1<<24 @ padbit, yes, always
+ vmlal.u32 q6,d24,d8[0]
+ vmlal.u32 q7,d26,d8[0]
+
+ vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
+ add r1,r1,#64
+# ifdef __ARMEB__
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vrev32.8 q12,q12
+ vrev32.8 q13,q13
+# endif
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+ @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
+
+ vshr.u64 q15,q8,#26
+ vmovn.i64 d16,q8
+ vshr.u64 q4,q5,#26
+ vmovn.i64 d10,q5
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vbic.i32 d16,#0xfc000000
+ vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+ vshl.u32 q13,q13,#18
+ vbic.i32 d10,#0xfc000000
+
+ vshrn.u64 d30,q9,#26
+ vmovn.i64 d18,q9
+ vshr.u64 q4,q6,#26
+ vmovn.i64 d12,q6
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+ vsri.u32 q13,q12,#14
+ vbic.i32 d18,#0xfc000000
+ vshl.u32 q12,q12,#12
+ vbic.i32 d12,#0xfc000000
+
+ vadd.i32 d10,d10,d30
+ vshl.u32 d30,d30,#2
+ vbic.i32 q13,#0xfc000000
+ vshrn.u64 d8,q7,#26
+ vmovn.i64 d14,q7
+ vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
+ vsri.u32 q12,q11,#20
+ vadd.i32 d16,d16,d8 @ h2 -> h3
+ vshl.u32 q11,q11,#6
+ vbic.i32 d14,#0xfc000000
+ vbic.i32 q12,#0xfc000000
+
+ vshrn.u64 d30,q5,#26 @ re-narrow
+ vmovn.i64 d10,q5
+ vsri.u32 q11,q10,#26
+ vbic.i32 q10,#0xfc000000
+ vshr.u32 d8,d16,#26
+ vbic.i32 d16,#0xfc000000
+ vbic.i32 d10,#0xfc000000
+ vadd.i32 d12,d12,d30 @ h0 -> h1
+ vadd.i32 d18,d18,d8 @ h3 -> h4
+ vbic.i32 q11,#0xfc000000
+
+ bhi .Loop_neon
+
+.Lskip_loop:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ add r7,r0,#(48+0*9*4)
+ add r6,r0,#(48+1*9*4)
+ adds r2,r2,#32
+ it ne
+ movne r2,#0
+ bne .Long_tail
+
+ vadd.i32 d25,d24,d14 @ add hash value and move to #hi
+ vadd.i32 d21,d20,d10
+ vadd.i32 d27,d26,d16
+ vadd.i32 d23,d22,d12
+ vadd.i32 d29,d28,d18
+
+.Long_tail:
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2
+
+ vadd.i32 d24,d24,d14 @ can be redundant
+ vmull.u32 q7,d25,d0
+ vadd.i32 d20,d20,d10
+ vmull.u32 q5,d21,d0
+ vadd.i32 d26,d26,d16
+ vmull.u32 q8,d27,d0
+ vadd.i32 d22,d22,d12
+ vmull.u32 q6,d23,d0
+ vadd.i32 d28,d28,d18
+ vmull.u32 q9,d29,d0
+
+ vmlal.u32 q5,d29,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d25,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d21,d1
+ vmlal.u32 q9,d27,d1
+ vmlal.u32 q7,d23,d1
+
+ vmlal.u32 q8,d23,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d27,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d25,d3
+ vmlal.u32 q6,d29,d4
+ vmlal.u32 q7,d21,d3
+
+ vmlal.u32 q8,d21,d5
+ it ne
+ addne r7,r0,#(48+2*9*4)
+ vmlal.u32 q5,d25,d6
+ it ne
+ addne r6,r0,#(48+3*9*4)
+ vmlal.u32 q9,d23,d5
+ vmlal.u32 q6,d27,d6
+ vmlal.u32 q7,d29,d6
+
+ vmlal.u32 q8,d29,d8
+ vorn q0,q0,q0 @ all-ones, can be redundant
+ vmlal.u32 q5,d23,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d21,d7
+ vmlal.u32 q6,d25,d8
+ vmlal.u32 q7,d27,d8
+
+ beq .Lshort_tail
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+ vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
+ vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
+
+ vmlal.u32 q7,d24,d0
+ vmlal.u32 q5,d20,d0
+ vmlal.u32 q8,d26,d0
+ vmlal.u32 q6,d22,d0
+ vmlal.u32 q9,d28,d0
+
+ vmlal.u32 q5,d28,d2
+ vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
+ vmlal.u32 q8,d24,d1
+ vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
+ vmlal.u32 q6,d20,d1
+ vmlal.u32 q9,d26,d1
+ vmlal.u32 q7,d22,d1
+
+ vmlal.u32 q8,d22,d3
+ vld1.32 d8[1],[r7,:32]
+ vmlal.u32 q5,d26,d4
+ vld1.32 d8[0],[r6,:32]
+ vmlal.u32 q9,d24,d3
+ vmlal.u32 q6,d28,d4
+ vmlal.u32 q7,d20,d3
+
+ vmlal.u32 q8,d20,d5
+ vmlal.u32 q5,d24,d6
+ vmlal.u32 q9,d22,d5
+ vmlal.u32 q6,d26,d6
+ vmlal.u32 q7,d28,d6
+
+ vmlal.u32 q8,d28,d8
+ vorn q0,q0,q0 @ all-ones
+ vmlal.u32 q5,d22,d8
+ vshr.u64 q0,q0,#38
+ vmlal.u32 q9,d20,d7
+ vmlal.u32 q6,d24,d8
+ vmlal.u32 q7,d26,d8
+
+.Lshort_tail:
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ horizontal addition
+
+ vadd.i64 d16,d16,d17
+ vadd.i64 d10,d10,d11
+ vadd.i64 d18,d18,d19
+ vadd.i64 d12,d12,d13
+ vadd.i64 d14,d14,d15
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ lazy reduction, but without narrowing
+
+ vshr.u64 q15,q8,#26
+ vand.i64 q8,q8,q0
+ vshr.u64 q4,q5,#26
+ vand.i64 q5,q5,q0
+ vadd.i64 q9,q9,q15 @ h3 -> h4
+ vadd.i64 q6,q6,q4 @ h0 -> h1
+
+ vshr.u64 q15,q9,#26
+ vand.i64 q9,q9,q0
+ vshr.u64 q4,q6,#26
+ vand.i64 q6,q6,q0
+ vadd.i64 q7,q7,q4 @ h1 -> h2
+
+ vadd.i64 q5,q5,q15
+ vshl.u64 q15,q15,#2
+ vshr.u64 q4,q7,#26
+ vand.i64 q7,q7,q0
+ vadd.i64 q5,q5,q15 @ h4 -> h0
+ vadd.i64 q8,q8,q4 @ h2 -> h3
+
+ vshr.u64 q15,q5,#26
+ vand.i64 q5,q5,q0
+ vshr.u64 q4,q8,#26
+ vand.i64 q8,q8,q0
+ vadd.i64 q6,q6,q15 @ h0 -> h1
+ vadd.i64 q9,q9,q4 @ h3 -> h4
+
+ cmp r2,#0
+ bne .Leven
+
+ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+ @ store hash value
+
+ vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
+ vst1.32 {d18[0]},[r0]
+
+ vldmia sp!,{d8-d15} @ epilogue
+ ldmia sp!,{r4-r7}
+ bx lr @ bx lr
+.size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef __KERNEL__
+.LOPENSSL_armcap:
+# ifdef _WIN32
+.word OPENSSL_armcap_P
+# else
+.word OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
+.align 2
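
For reference alongside the long reduction commentary in the code above:
the lazy reduction boils down to the carry chain below, with the h4 -> h0
wrap multiplied by 5 because 2^130 is congruent to 5 (mod 2^130 - 5). A
hedged C sketch on 64-bit accumulators, mirroring the vshr/vbic/vadd
sequence:

    #include <stdint.h>

    #define M26 0x03ffffffULL

    static void lazy_reduce(uint64_t d[5])
    {
            uint64_t c;

            c = d[3] >> 26; d[3] &= M26; d[4] += c;            /* h3 -> h4 */
            c = d[0] >> 26; d[0] &= M26; d[1] += c;            /* h0 -> h1 */
            c = d[4] >> 26; d[4] &= M26; d[0] += c + (c << 2); /* h4 -> h0, *5 */
            c = d[1] >> 26; d[1] &= M26; d[2] += c;            /* h1 -> h2 */
            c = d[0] >> 26; d[0] &= M26; d[1] += c;            /* h0 -> h1 */
            c = d[2] >> 26; d[2] &= M26; d[3] += c;            /* h2 -> h3 */
            c = d[3] >> 26; d[3] &= M26; d[4] += c;            /* h3 -> h4 */
    }
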
--- /dev/null
+++ b/arch/arm/crypto/poly1305-glue.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/jump_label.h>
+#include <linux/module.h>
+
+void poly1305_init_arm(void *state, const u8 *key);
+void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
+
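+/*
+ * Weak stub so kernels built without the NEON assembly still link;
+ * the have_neon static key below keeps it from ever being called then.
+ */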
+void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
+{
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_arm(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(key + 16);
+ dctx->s[1] = get_unaligned_le32(key + 20);
+ dctx->s[2] = get_unaligned_le32(key + 24);
+ dctx->s[3] = get_unaligned_le32(key + 28);
+ dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static int arm_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
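+/*
+ * The shash interface has no setkey: the first 16 bytes fed to update()
+ * become the clamped r key, the next 16 bytes the s portion, and only
+ * the remainder is hashed; dctx->rset/sset track that state.
+ */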
+static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ u32 len, u32 hibit, bool do_neon)
+{
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset) {
+ poly1305_init_arm(&dctx->h, src);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ if (len < POLY1305_BLOCK_SIZE)
+ return;
+ }
+
+ len &= ~(POLY1305_BLOCK_SIZE - 1);
+
+ if (static_branch_likely(&have_neon) && likely(do_neon))
+ poly1305_blocks_neon(&dctx->h, src, len, hibit);
+ else
+ poly1305_blocks_arm(&dctx->h, src, len, hibit);
+}
+
+static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
+ const u8 *src, u32 len, bool do_neon)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ len -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ arm_poly1305_blocks(dctx, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1, false);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
+ arm_poly1305_blocks(dctx, src, len, 1, do_neon);
+ src += round_down(len, POLY1305_BLOCK_SIZE);
+ len %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(len)) {
+ dctx->buflen = len;
+ memcpy(dctx->buf, src, len);
+ }
+}
+
+static int arm_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ arm_poly1305_do_update(dctx, src, srclen, false);
+ return 0;
+}
+
+static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
+ const u8 *src,
+ unsigned int srclen)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+ bool do_neon = crypto_simd_usable() && srclen > 128;
+
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_begin();
+ arm_poly1305_do_update(dctx, src, srclen, do_neon);
+ if (static_branch_likely(&have_neon) && do_neon)
+ kernel_neon_end();
+ return 0;
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int nbytes)
+{
+ bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ crypto_simd_usable();
+
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_blocks_arm(&dctx->h, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+ if (static_branch_likely(&have_neon) && do_neon) {
+ do {
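+				/* Bound the time spent with preemption
+				 * disabled: at most SZ_4K per NEON section. */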
+ unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+ kernel_neon_begin();
+ poly1305_blocks_neon(&dctx->h, src, todo, 1);
+ kernel_neon_end();
+
+ len -= todo;
+ src += todo;
+ } while (len);
+ } else {
+ poly1305_blocks_arm(&dctx->h, src, len, 1);
+ src += len;
+ }
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ dctx->buflen = nbytes;
+ memcpy(dctx->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_emit_arm(&dctx->h, dst, dctx->s);
+ *dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
+ return 0;
+}
+
+static struct shash_alg arm_poly1305_algs[] = {{
+ .init = arm_poly1305_init,
+ .update = arm_poly1305_update,
+ .final = arm_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-arm",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+#ifdef CONFIG_KERNEL_MODE_NEON
+}, {
+ .init = arm_poly1305_init,
+ .update = arm_poly1305_update_neon,
+ .final = arm_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-neon",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+#endif
+}};
+
+static int __init arm_poly1305_mod_init(void)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ (elf_hwcap & HWCAP_NEON))
+ static_branch_enable(&have_neon);
+ else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ /* register only the first entry */
+ return crypto_register_shash(&arm_poly1305_algs[0]);
+
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shashes(arm_poly1305_algs,
+ ARRAY_SIZE(arm_poly1305_algs)) : 0;
+}
+
+static void __exit arm_poly1305_mod_exit(void)
+{
+ if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ return;
+ if (!static_branch_likely(&have_neon)) {
+ crypto_unregister_shash(&arm_poly1305_algs[0]);
+ return;
+ }
+ crypto_unregister_shashes(arm_poly1305_algs,
+ ARRAY_SIZE(arm_poly1305_algs));
+}
+
+module_init(arm_poly1305_mod_init);
+module_exit(arm_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-arm");
+MODULE_ALIAS_CRYPTO("poly1305-neon");
--- /dev/null
+++ b/arch/mips/crypto/poly1305-glue.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void poly1305_init_mips(void *state, const u8 *key);
+asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+ poly1305_init_mips(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(key + 16);
+ dctx->s[1] = get_unaligned_le32(key + 20);
+ dctx->s[2] = get_unaligned_le32(key + 24);
+ dctx->s[3] = get_unaligned_le32(key + 28);
+ dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+static int mips_poly1305_init(struct shash_desc *desc)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ dctx->buflen = 0;
+ dctx->rset = 0;
+ dctx->sset = false;
+
+ return 0;
+}
+
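+/*
+ * For the shash interface the one-time key arrives through ->update()
+ * rather than ->setkey(): the first 16 bytes clamp and set r (tracked by
+ * rset), the next 16 set s (tracked by sset), and only then is real data
+ * processed.
+ */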
+static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
+ u32 len, u32 hibit)
+{
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset) {
+ poly1305_init_mips(&dctx->h, src);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(src + 0);
+ dctx->s[1] = get_unaligned_le32(src + 4);
+ dctx->s[2] = get_unaligned_le32(src + 8);
+ dctx->s[3] = get_unaligned_le32(src + 12);
+ src += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ if (len < POLY1305_BLOCK_SIZE)
+ return;
+ }
+
+ len &= ~(POLY1305_BLOCK_SIZE - 1);
+
+ poly1305_blocks_mips(&dctx->h, src, len, hibit);
+}
+
+static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
+ unsigned int len)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ len -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(len >= POLY1305_BLOCK_SIZE)) {
+ mips_poly1305_blocks(dctx, src, len, 1);
+ src += round_down(len, POLY1305_BLOCK_SIZE);
+ len %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(len)) {
+ dctx->buflen = len;
+ memcpy(dctx->buf, src, len);
+ }
+ return 0;
+}
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+ unsigned int nbytes)
+{
+ if (unlikely(dctx->buflen)) {
+ u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+ memcpy(dctx->buf + dctx->buflen, src, bytes);
+ src += bytes;
+ nbytes -= bytes;
+ dctx->buflen += bytes;
+
+ if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+ poly1305_blocks_mips(&dctx->h, dctx->buf,
+ POLY1305_BLOCK_SIZE, 1);
+ dctx->buflen = 0;
+ }
+ }
+
+ if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+ unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+ poly1305_blocks_mips(&dctx->h, src, len, 1);
+ src += len;
+ nbytes %= POLY1305_BLOCK_SIZE;
+ }
+
+ if (unlikely(nbytes)) {
+ dctx->buflen = nbytes;
+ memcpy(dctx->buf, src, nbytes);
+ }
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_emit_mips(&dctx->h, dst, dctx->s);
+ *dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
+{
+ struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
+ return 0;
+}
+
+static struct shash_alg mips_poly1305_alg = {
+ .init = mips_poly1305_init,
+ .update = mips_poly1305_update,
+ .final = mips_poly1305_final,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .descsize = sizeof(struct poly1305_desc_ctx),
+
+ .base.cra_name = "poly1305",
+ .base.cra_driver_name = "poly1305-mips",
+ .base.cra_priority = 200,
+ .base.cra_blocksize = POLY1305_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+};
+
+static int __init mips_poly1305_mod_init(void)
+{
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shash(&mips_poly1305_alg) : 0;
+}
+
+static void __exit mips_poly1305_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+ crypto_unregister_shash(&mips_poly1305_alg);
+}
+
+module_init(mips_poly1305_mod_init);
+module_exit(mips_poly1305_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("poly1305");
+MODULE_ALIAS_CRYPTO("poly1305-mips");
--- /dev/null
+++ b/arch/mips/crypto/poly1305-mips.pl
@@ -0,0 +1,1273 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
+# project.
+# ====================================================================
+
+# Poly1305 hash for MIPS.
+#
+# May 2016
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+# IALU/gcc
+# R1x000 ~5.5/+130% (big-endian)
+# Octeon II 2.50/+70% (little-endian)
+#
+# March 2019
+#
+# Add 32-bit code path.
+#
+# October 2019
+#
+# Modulo-scheduling the reduction makes it possible to omit the
+# dependency chain at the end of the inner loop and improves
+# performance. Also optimize the MIPS32R2 code path for the MIPS 1004K
+# core. Per René von Dorst's suggestions.
+#
+# IALU/gcc
+# R1x000 ~9.8/? (big-endian)
+# Octeon II 3.65/+140% (little-endian)
+# MT7621/1004K 4.75/? (little-endian)
+#
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in an ABI-neutral
+# manner. Therefore let's stick to the NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. The following coding rules
+# facilitate interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+# excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+# old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+#
+#
+######################################################################
+
+$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64
+
+$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
+
+if ($flavour =~ /64|n32/i) {{{
+######################################################################
+# 64-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
+ defined(_MIPS_ARCH_MIPS64R6)) \\
+ && !defined(_MIPS_ARCH_MIPS64R2)
+# define _MIPS_ARCH_MIPS64R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+# define dmultu(rs,rt)
+# define mflo(rd,rs,rt) dmulu rd,rs,rt
+# define mfhi(rd,rs,rt) dmuhu rd,rs,rt
+#else
+# define dmultu(rs,rt) dmultu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_init_mips
+# define poly1305_blocks poly1305_blocks_mips
+# define poly1305_emit poly1305_emit_mips
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 7
+#else
+# define MSB 7
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sd $zero,0($ctx)
+ sd $zero,8($ctx)
+ sd $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $tmp0,$inp,7 # $inp % 8
+ dsubu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ ld $in0,0($inp)
+ ld $in1,8($inp)
+ beqz $tmp0,.Laligned_key
+ ld $tmp2,16($inp)
+
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ dsllv $in0,$in0,$tmp0
+ dsrlv $tmp3,$in1,$tmp1
+ dsllv $in1,$in1,$tmp0
+ dsrlv $tmp2,$tmp2,$tmp1
+# else
+ dsrlv $in0,$in0,$tmp0
+ dsllv $tmp3,$in1,$tmp1
+ dsrlv $in1,$in1,$tmp0
+ dsllv $tmp2,$tmp2,$tmp1
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_key:
+#else
+ ldl $in0,0+MSB($inp)
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ li $tmp0,1
+ dsll $tmp0,32 # 0x0000000100000000
+ daddiu $tmp0,-63 # 0x00000000ffffffc1
+ dsll $tmp0,28 # 0x0ffffffc10000000
+ daddiu $tmp0,-1 # 0x0ffffffc0fffffff
+
+ and $in0,$tmp0
+ daddiu $tmp0,-3 # 0x0ffffffc0ffffffc
+ and $in1,$tmp0
+
+ sd $in0,24($ctx)
+ dsrl $tmp0,$in1,2
+ sd $in1,32($ctx)
+ daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
+ sd $tmp0,40($ctx)
+
+.Lno_key:
+ li $v0,0 # return 0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";
+
+my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
+ ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);
+my ($shr,$shl) = ($s6,$s7); # used on R6
+
+$code.=<<___;
+.align 5
+.globl poly1305_blocks
+.ent poly1305_blocks
+poly1305_blocks:
+ .set noreorder
+ dsrl $len,4 # number of complete blocks
+ bnez $len,poly1305_blocks_internal
+ nop
+ jr $ra
+ nop
+.end poly1305_blocks
+
+.align 5
+.ent poly1305_blocks_internal
+poly1305_blocks_internal:
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ .frame $sp,8*8,$ra
+ .mask $SAVED_REGS_MASK|0x000c0000,-8
+ dsubu $sp,8*8
+ sd $s7,56($sp)
+ sd $s6,48($sp)
+#else
+ .frame $sp,6*8,$ra
+ .mask $SAVED_REGS_MASK,-8
+ dsubu $sp,6*8
+#endif
+ sd $s5,40($sp)
+ sd $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sd $s3,24($sp)
+ sd $s2,16($sp)
+ sd $s1,8($sp)
+ sd $s0,0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+#if defined(_MIPS_ARCH_MIPS64R6)
+ andi $shr,$inp,7
+ dsubu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+ subu $shl,$zero,$shr
+#endif
+
+ ld $h0,0($ctx) # load hash value
+ ld $h1,8($ctx)
+ ld $h2,16($ctx)
+
+ ld $r0,24($ctx) # load key
+ ld $r1,32($ctx)
+ ld $rs1,40($ctx)
+
+ dsll $len,4
+ daddu $len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $in0,0($inp) # load input
+ ld $in1,8($inp)
+ beqz $shr,.Laligned_inp
+
+ ld $tmp2,16($inp)
+# ifdef MIPSEB
+ dsllv $in0,$in0,$shr
+ dsrlv $tmp3,$in1,$shl
+ dsllv $in1,$in1,$shr
+ dsrlv $tmp2,$tmp2,$shl
+# else
+ dsrlv $in0,$in0,$shr
+ dsllv $tmp3,$in1,$shl
+ dsrlv $in1,$in1,$shr
+ dsllv $tmp2,$tmp2,$shl
+# endif
+ or $in0,$in0,$tmp3
+ or $in1,$in1,$tmp2
+.Laligned_inp:
+#else
+ ldl $in0,0+MSB($inp) # load input
+ ldl $in1,8+MSB($inp)
+ ldr $in0,0+LSB($inp)
+ ldr $in1,8+LSB($inp)
+#endif
+ daddiu $inp,16
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS64R2)
+ dsbh $in0,$in0 # byte swap
+ dsbh $in1,$in1
+ dshd $in0,$in0
+ dshd $in1,$in1
+# else
+ ori $tmp0,$zero,0xFF
+ dsll $tmp2,$tmp0,32
+ or $tmp0,$tmp2 # 0x000000FF000000FF
+
+ and $tmp1,$in0,$tmp0 # byte swap
+ and $tmp3,$in1,$tmp0
+ dsrl $tmp2,$in0,24
+ dsrl $tmp4,$in1,24
+ dsll $tmp1,24
+ dsll $tmp3,24
+ and $tmp2,$tmp0
+ and $tmp4,$tmp0
+ dsll $tmp0,8 # 0x0000FF000000FF00
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ and $tmp2,$in0,$tmp0
+ and $tmp4,$in1,$tmp0
+ dsrl $in0,8
+ dsrl $in1,8
+ dsll $tmp2,8
+ dsll $tmp4,8
+ and $in0,$tmp0
+ and $in1,$tmp0
+ or $tmp1,$tmp2
+ or $tmp3,$tmp4
+ or $in0,$tmp1
+ or $in1,$tmp3
+ dsrl $tmp1,$in0,32
+ dsrl $tmp3,$in1,32
+ dsll $in0,32
+ dsll $in1,32
+ or $in0,$tmp1
+ or $in1,$tmp3
+# endif
+#endif
+ dsrl $tmp1,$h2,2 # modulo-scheduled reduction
+ andi $h2,$h2,3
+ dsll $tmp0,$tmp1,2
+
+ daddu $d0,$h0,$in0 # accumulate input
+ daddu $tmp1,$tmp0
+ sltu $tmp0,$d0,$h0
+ daddu $d0,$d0,$tmp1 # ... and residue
+ sltu $tmp1,$d0,$tmp1
+ daddu $d1,$h1,$in1
+ daddu $tmp0,$tmp1
+ sltu $tmp1,$d1,$h1
+ daddu $d1,$tmp0
+
+ dmultu ($r0,$d0) # h0*r0
+ daddu $d2,$h2,$padbit
+ sltu $tmp0,$d1,$tmp0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ dmultu ($rs1,$d1) # h1*5*r1
+ daddu $d2,$tmp1
+ daddu $d2,$tmp0
+ mflo ($tmp0,$rs1,$d1)
+ mfhi ($tmp1,$rs1,$d1)
+
+ dmultu ($r1,$d0) # h0*r1
+ mflo ($tmp2,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ daddu $h0,$tmp0
+ daddu $h1,$tmp1
+ sltu $tmp0,$h0,$tmp0
+
+ dmultu ($r0,$d1) # h1*r0
+ daddu $h1,$tmp0
+ daddu $h1,$tmp2
+ mflo ($tmp0,$r0,$d1)
+ mfhi ($tmp1,$r0,$d1)
+
+ dmultu ($rs1,$d2) # h2*5*r1
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ mflo ($tmp2,$rs1,$d2)
+
+ dmultu ($r0,$d2) # h2*r0
+ daddu $h1,$tmp0
+ daddu $h2,$tmp1
+ mflo ($tmp3,$r0,$d2)
+ sltu $tmp0,$h1,$tmp0
+ daddu $h2,$tmp0
+
+ daddu $h1,$tmp2
+ sltu $tmp2,$h1,$tmp2
+ daddu $h2,$tmp2
+ daddu $h2,$tmp3
+
+ bne $inp,$len,.Loop
+
+ sd $h0,0($ctx) # store hash value
+ sd $h1,8($ctx)
+ sd $h2,16($ctx)
+
+ .set noreorder
+#if defined(_MIPS_ARCH_MIPS64R6)
+ ld $s7,56($sp)
+ ld $s6,48($sp)
+#endif
+ ld $s5,40($sp) # epilogue
+ ld $s4,32($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
+ ld $s3,24($sp)
+ ld $s2,16($sp)
+ ld $s1,8($sp)
+ ld $s0,0($sp)
+___
+$code.=<<___;
+ jr $ra
+#if defined(_MIPS_ARCH_MIPS64R6)
+ daddu $sp,8*8
+#else
+ daddu $sp,6*8
+#endif
+.end poly1305_blocks_internal
+___
+}
+{
+my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ ld $tmp2,16($ctx)
+ ld $tmp0,0($ctx)
+ ld $tmp1,8($ctx)
+
+ li $in0,-4 # final reduction
+ dsrl $in1,$tmp2,2
+ and $in0,$tmp2
+ andi $tmp2,$tmp2,3
+ daddu $in0,$in1
+
+ daddu $tmp0,$tmp0,$in0
+ sltu $in1,$tmp0,$in0
+ daddiu $in0,$tmp0,5 # compare to modulus
+ daddu $tmp1,$tmp1,$in1
+ sltiu $tmp3,$in0,5
+ sltu $tmp4,$tmp1,$in1
+ daddu $in1,$tmp1,$tmp3
+ daddu $tmp2,$tmp2,$tmp4
+ sltu $tmp3,$in1,$tmp3
+ daddu $tmp2,$tmp2,$tmp3
+
+ dsrl $tmp2,2 # see if it carried/borrowed
+ dsubu $tmp2,$zero,$tmp2
+
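+ # Constant-time select: the mask computed above is all ones if h+5
+ # carried out of 2^130 (take h - (2^130-5)), and all zeros otherwise
+ # (keep h).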
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ and $in0,$tmp2
+ and $in1,$tmp2
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+
+ lwu $tmp0,0($nonce) # load nonce
+ lwu $tmp1,4($nonce)
+ lwu $tmp2,8($nonce)
+ lwu $tmp3,12($nonce)
+ dsll $tmp1,32
+ dsll $tmp3,32
+ or $tmp0,$tmp1
+ or $tmp2,$tmp3
+
+ daddu $in0,$tmp0 # accumulate nonce
+ daddu $in1,$tmp2
+ sltu $tmp0,$in0,$tmp0
+ daddu $in1,$tmp0
+
+ dsrl $tmp0,$in0,8 # write mac value
+ dsrl $tmp1,$in0,16
+ dsrl $tmp2,$in0,24
+ sb $in0,0($mac)
+ dsrl $tmp3,$in0,32
+ sb $tmp0,1($mac)
+ dsrl $tmp0,$in0,40
+ sb $tmp1,2($mac)
+ dsrl $tmp1,$in0,48
+ sb $tmp2,3($mac)
+ dsrl $tmp2,$in0,56
+ sb $tmp3,4($mac)
+ dsrl $tmp3,$in1,8
+ sb $tmp0,5($mac)
+ dsrl $tmp0,$in1,16
+ sb $tmp1,6($mac)
+ dsrl $tmp1,$in1,24
+ sb $tmp2,7($mac)
+
+ sb $in1,8($mac)
+ dsrl $tmp2,$in1,32
+ sb $tmp3,9($mac)
+ dsrl $tmp3,$in1,40
+ sb $tmp0,10($mac)
+ dsrl $tmp0,$in1,48
+ sb $tmp1,11($mac)
+ dsrl $tmp1,$in1,56
+ sb $tmp2,12($mac)
+ sb $tmp3,13($mac)
+ sb $tmp0,14($mac)
+ sb $tmp1,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}} else {{{
+######################################################################
+# 32-bit code path
+#
+
+my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
+my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
+ ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);
+
+$code.=<<___;
+#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
+ defined(_MIPS_ARCH_MIPS32R6)) \\
+ && !defined(_MIPS_ARCH_MIPS32R2)
+# define _MIPS_ARCH_MIPS32R2
+#endif
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+# define multu(rs,rt)
+# define mflo(rd,rs,rt) mulu rd,rs,rt
+# define mfhi(rd,rs,rt) muhu rd,rs,rt
+#else
+# define multu(rs,rt) multu rs,rt
+# define mflo(rd,rs,rt) mflo rd
+# define mfhi(rd,rs,rt) mfhi rd
+#endif
+
+#ifdef __KERNEL__
+# define poly1305_init poly1305_init_mips
+# define poly1305_blocks poly1305_blocks_mips
+# define poly1305_emit poly1305_emit_mips
+#endif
+
+#if defined(__MIPSEB__) && !defined(MIPSEB)
+# define MIPSEB
+#endif
+
+#ifdef MIPSEB
+# define MSB 0
+# define LSB 3
+#else
+# define MSB 3
+# define LSB 0
+#endif
+
+.text
+.set noat
+.set noreorder
+
+.align 5
+.globl poly1305_init
+.ent poly1305_init
+poly1305_init:
+ .frame $sp,0,$ra
+ .set reorder
+
+ sw $zero,0($ctx)
+ sw $zero,4($ctx)
+ sw $zero,8($ctx)
+ sw $zero,12($ctx)
+ sw $zero,16($ctx)
+
+ beqz $inp,.Lno_key
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $tmp0,$inp,3 # $inp % 4
+ subu $inp,$inp,$tmp0 # align $inp
+ sll $tmp0,$tmp0,3 # byte to bit offset
+ lw $in0,0($inp)
+ lw $in1,4($inp)
+ lw $in2,8($inp)
+ lw $in3,12($inp)
+ beqz $tmp0,.Laligned_key
+
+ lw $tmp2,16($inp)
+ subu $tmp1,$zero,$tmp0
+# ifdef MIPSEB
+ sllv $in0,$in0,$tmp0
+ srlv $tmp3,$in1,$tmp1
+ sllv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ srlv $tmp3,$in2,$tmp1
+ sllv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ srlv $tmp3,$in3,$tmp1
+ sllv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ srlv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# else
+ srlv $in0,$in0,$tmp0
+ sllv $tmp3,$in1,$tmp1
+ srlv $in1,$in1,$tmp0
+ or $in0,$in0,$tmp3
+ sllv $tmp3,$in2,$tmp1
+ srlv $in2,$in2,$tmp0
+ or $in1,$in1,$tmp3
+ sllv $tmp3,$in3,$tmp1
+ srlv $in3,$in3,$tmp0
+ or $in2,$in2,$tmp3
+ sllv $tmp2,$tmp2,$tmp1
+ or $in3,$in3,$tmp2
+# endif
+.Laligned_key:
+#else
+ lwl $in0,0+MSB($inp)
+ lwl $in1,4+MSB($inp)
+ lwl $in2,8+MSB($inp)
+ lwl $in3,12+MSB($inp)
+ lwr $in0,0+LSB($inp)
+ lwr $in1,4+LSB($inp)
+ lwr $in2,8+LSB($inp)
+ lwr $in3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $in0,$in0 # byte swap
+ wsbh $in1,$in1
+ wsbh $in2,$in2
+ wsbh $in3,$in3
+ rotr $in0,$in0,16
+ rotr $in1,$in1,16
+ rotr $in2,$in2,16
+ rotr $in3,$in3,16
+# else
+ srl $tmp0,$in0,24 # byte swap
+ srl $tmp1,$in0,8
+ andi $tmp2,$in0,0xFF00
+ sll $in0,$in0,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in0,$tmp0
+ srl $tmp0,$in1,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in1,8
+ or $in0,$tmp1
+ andi $tmp1,$in1,0xFF00
+ sll $in1,$in1,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in1,$tmp0
+ srl $tmp0,$in2,24
+ or $tmp2,$tmp1
+ srl $tmp1,$in2,8
+ or $in1,$tmp2
+ andi $tmp2,$in2,0xFF00
+ sll $in2,$in2,24
+ andi $tmp1,0xFF00
+ sll $tmp2,$tmp2,8
+ or $in2,$tmp0
+ srl $tmp0,$in3,24
+ or $tmp1,$tmp2
+ srl $tmp2,$in3,8
+ or $in2,$tmp1
+ andi $tmp1,$in3,0xFF00
+ sll $in3,$in3,24
+ andi $tmp2,0xFF00
+ sll $tmp1,$tmp1,8
+ or $in3,$tmp0
+ or $tmp2,$tmp1
+ or $in3,$tmp2
+# endif
+#endif
+ lui $tmp0,0x0fff
+ ori $tmp0,0xffff # 0x0fffffff
+ and $in0,$in0,$tmp0
+ subu $tmp0,3 # 0x0ffffffc
+ and $in1,$in1,$tmp0
+ and $in2,$in2,$tmp0
+ and $in3,$in3,$tmp0
+
+ sw $in0,20($ctx)
+ sw $in1,24($ctx)
+ sw $in2,28($ctx)
+ sw $in3,32($ctx)
+
+ srl $tmp1,$in1,2
+ srl $tmp2,$in2,2
+ srl $tmp3,$in3,2
+ addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2)
+ addu $in2,$in2,$tmp2
+ addu $in3,$in3,$tmp3
+ sw $in1,36($ctx)
+ sw $in2,40($ctx)
+ sw $in3,44($ctx)
+.Lno_key:
+ li $v0,0
+ jr $ra
+.end poly1305_init
+___
+{
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";
+
+my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
+ ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);
+my ($d0,$d1,$d2,$d3) =
+ ($a4,$a5,$a6,$a7);
+my $shr = $t2; # used on R6
+my $one = $t2; # used on R2
+
+$code.=<<___;
+.globl poly1305_blocks
+.align 5
+.ent poly1305_blocks
+poly1305_blocks:
+ .frame $sp,16*4,$ra
+ .mask $SAVED_REGS_MASK,-4
+ .set noreorder
+ subu $sp, $sp,4*12
+ sw $s11,4*11($sp)
+ sw $s10,4*10($sp)
+ sw $s9, 4*9($sp)
+ sw $s8, 4*8($sp)
+ sw $s7, 4*7($sp)
+ sw $s6, 4*6($sp)
+ sw $s5, 4*5($sp)
+ sw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ sw $s3, 4*3($sp)
+ sw $s2, 4*2($sp)
+ sw $s1, 4*1($sp)
+ sw $s0, 4*0($sp)
+___
+$code.=<<___;
+ .set reorder
+
+ srl $len,4 # number of complete blocks
+ li $one,1
+ beqz $len,.Labort
+
+#if defined(_MIPS_ARCH_MIPS32R6)
+ andi $shr,$inp,3
+ subu $inp,$inp,$shr # align $inp
+ sll $shr,$shr,3 # byte to bit offset
+#endif
+
+ lw $h0,0($ctx) # load hash value
+ lw $h1,4($ctx)
+ lw $h2,8($ctx)
+ lw $h3,12($ctx)
+ lw $h4,16($ctx)
+
+ lw $r0,20($ctx) # load key
+ lw $r1,24($ctx)
+ lw $r2,28($ctx)
+ lw $r3,32($ctx)
+ lw $rs1,36($ctx)
+ lw $rs2,40($ctx)
+ lw $rs3,44($ctx)
+
+ sll $len,4
+ addu $len,$len,$inp # end of buffer
+ b .Loop
+
+.align 4
+.Loop:
+#if defined(_MIPS_ARCH_MIPS32R6)
+ lw $d0,0($inp) # load input
+ lw $d1,4($inp)
+ lw $d2,8($inp)
+ lw $d3,12($inp)
+ beqz $shr,.Laligned_inp
+
+ lw $t0,16($inp)
+ subu $t1,$zero,$shr
+# ifdef MIPSEB
+ sllv $d0,$d0,$shr
+ srlv $at,$d1,$t1
+ sllv $d1,$d1,$shr
+ or $d0,$d0,$at
+ srlv $at,$d2,$t1
+ sllv $d2,$d2,$shr
+ or $d1,$d1,$at
+ srlv $at,$d3,$t1
+ sllv $d3,$d3,$shr
+ or $d2,$d2,$at
+ srlv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# else
+ srlv $d0,$d0,$shr
+ sllv $at,$d1,$t1
+ srlv $d1,$d1,$shr
+ or $d0,$d0,$at
+ sllv $at,$d2,$t1
+ srlv $d2,$d2,$shr
+ or $d1,$d1,$at
+ sllv $at,$d3,$t1
+ srlv $d3,$d3,$shr
+ or $d2,$d2,$at
+ sllv $t0,$t0,$t1
+ or $d3,$d3,$t0
+# endif
+.Laligned_inp:
+#else
+ lwl $d0,0+MSB($inp) # load input
+ lwl $d1,4+MSB($inp)
+ lwl $d2,8+MSB($inp)
+ lwl $d3,12+MSB($inp)
+ lwr $d0,0+LSB($inp)
+ lwr $d1,4+LSB($inp)
+ lwr $d2,8+LSB($inp)
+ lwr $d3,12+LSB($inp)
+#endif
+#ifdef MIPSEB
+# if defined(_MIPS_ARCH_MIPS32R2)
+ wsbh $d0,$d0 # byte swap
+ wsbh $d1,$d1
+ wsbh $d2,$d2
+ wsbh $d3,$d3
+ rotr $d0,$d0,16
+ rotr $d1,$d1,16
+ rotr $d2,$d2,16
+ rotr $d3,$d3,16
+# else
+ srl $at,$d0,24 # byte swap
+ srl $t0,$d0,8
+ andi $t1,$d0,0xFF00
+ sll $d0,$d0,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d0,$at
+ srl $at,$d1,24
+ or $t0,$t1
+ srl $t1,$d1,8
+ or $d0,$t0
+ andi $t0,$d1,0xFF00
+ sll $d1,$d1,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d1,$at
+ srl $at,$d2,24
+ or $t1,$t0
+ srl $t0,$d2,8
+ or $d1,$t1
+ andi $t1,$d2,0xFF00
+ sll $d2,$d2,24
+ andi $t0,0xFF00
+ sll $t1,$t1,8
+ or $d2,$at
+ srl $at,$d3,24
+ or $t0,$t1
+ srl $t1,$d3,8
+ or $d2,$t0
+ andi $t0,$d3,0xFF00
+ sll $d3,$d3,24
+ andi $t1,0xFF00
+ sll $t0,$t0,8
+ or $d3,$at
+ or $t1,$t0
+ or $d3,$t1
+# endif
+#endif
+ srl $t0,$h4,2 # modulo-scheduled reduction
+ andi $h4,$h4,3
+ sll $at,$t0,2
+
+ addu $d0,$d0,$h0 # accumulate input
+ addu $t0,$t0,$at
+ sltu $h0,$d0,$h0
+ addu $d0,$d0,$t0 # ... and residue
+ sltu $at,$d0,$t0
+
+ addu $d1,$d1,$h1
+ addu $h0,$h0,$at # carry
+ sltu $h1,$d1,$h1
+ addu $d1,$d1,$h0
+ sltu $h0,$d1,$h0
+
+ addu $d2,$d2,$h2
+ addu $h1,$h1,$h0 # carry
+ sltu $h2,$d2,$h2
+ addu $d2,$d2,$h1
+ sltu $h1,$d2,$h1
+
+ addu $d3,$d3,$h3
+ addu $h2,$h2,$h1 # carry
+ sltu $h3,$d3,$h3
+ addu $d3,$d3,$h2
+
+#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
+ multu $r0,$d0 # d0*r0
+ sltu $h2,$d3,$h2
+ maddu $rs3,$d1 # d1*s3
+ addu $h3,$h3,$h2 # carry
+ maddu $rs2,$d2 # d2*s2
+ addu $h4,$h4,$padbit
+ maddu $rs1,$d3 # d3*s1
+ addu $h4,$h4,$h3
+ mfhi $at
+ mflo $h0
+
+ multu $r1,$d0 # d0*r1
+ maddu $r0,$d1 # d1*r0
+ maddu $rs3,$d2 # d2*s3
+ maddu $rs2,$d3 # d3*s2
+ maddu $rs1,$h4 # h4*s1
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h1
+
+ multu $r2,$d0 # d0*r2
+ maddu $r1,$d1 # d1*r1
+ maddu $r0,$d2 # d2*r0
+ maddu $rs3,$d3 # d3*s3
+ maddu $rs2,$h4 # h4*s2
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h2
+
+ mul $t0,$r0,$h4 # h4*r0
+
+ multu $r3,$d0 # d0*r3
+ maddu $r2,$d1 # d1*r2
+ maddu $r1,$d2 # d2*r1
+ maddu $r0,$d3 # d3*r0
+ maddu $rs3,$h4 # h4*s3
+ maddu $at,$one # hi*1
+ mfhi $at
+ mflo $h3
+
+ addiu $inp,$inp,16
+
+ addu $h4,$t0,$at
+#else
+ multu ($r0,$d0) # d0*r0
+ mflo ($h0,$r0,$d0)
+ mfhi ($h1,$r0,$d0)
+
+ sltu $h2,$d3,$h2
+ addu $h3,$h3,$h2 # carry
+
+ multu ($rs3,$d1) # d1*s3
+ mflo ($at,$rs3,$d1)
+ mfhi ($t0,$rs3,$d1)
+
+ addu $h4,$h4,$padbit
+ addiu $inp,$inp,16
+ addu $h4,$h4,$h3
+
+ multu ($rs2,$d2) # d2*s2
+ mflo ($a3,$rs2,$d2)
+ mfhi ($t1,$rs2,$d2)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($rs1,$d3) # d3*s1
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$rs1,$d3)
+ mfhi ($t0,$rs1,$d3)
+ addu $h0,$h0,$a3
+ addu $h1,$h1,$t1
+ multu ($r1,$d0) # d0*r1
+ sltu $a3,$h0,$a3
+ addu $h1,$h1,$a3
+
+
+ mflo ($a3,$r1,$d0)
+ mfhi ($h2,$r1,$d0)
+ addu $h0,$h0,$at
+ addu $h1,$h1,$t0
+ multu ($r0,$d1) # d1*r0
+ sltu $at,$h0,$at
+ addu $h1,$h1,$at
+
+ mflo ($at,$r0,$d1)
+ mfhi ($t0,$r0,$d1)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($rs3,$d2) # d2*s3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs3,$d2)
+ mfhi ($t1,$rs3,$d2)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($rs2,$d3) # d3*s2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+ mflo ($at,$rs2,$d3)
+ mfhi ($t0,$rs2,$d3)
+ addu $h1,$h1,$a3
+ addu $h2,$h2,$t1
+ multu ($rs1,$h4) # h4*s1
+ sltu $a3,$h1,$a3
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$rs1,$h4)
+ addu $h1,$h1,$at
+ addu $h2,$h2,$t0
+ multu ($r2,$d0) # d0*r2
+ sltu $at,$h1,$at
+ addu $h2,$h2,$at
+
+
+ mflo ($at,$r2,$d0)
+ mfhi ($h3,$r2,$d0)
+ addu $h1,$h1,$a3
+ sltu $a3,$h1,$a3
+ multu ($r1,$d1) # d1*r1
+ addu $h2,$h2,$a3
+
+ mflo ($a3,$r1,$d1)
+ mfhi ($t1,$r1,$d1)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r0,$d2) # d2*r0
+ addu $h3,$h3,$at
+
+ mflo ($at,$r0,$d2)
+ mfhi ($t0,$r0,$d2)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($rs3,$d3) # d3*s3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+ mflo ($a3,$rs3,$d3)
+ mfhi ($t1,$rs3,$d3)
+ addu $h2,$h2,$at
+ addu $h3,$h3,$t0
+ multu ($rs2,$h4) # h4*s2
+ sltu $at,$h2,$at
+ addu $h3,$h3,$at
+
+ mflo ($at,$rs2,$h4)
+ addu $h2,$h2,$a3
+ addu $h3,$h3,$t1
+ multu ($r3,$d0) # d0*r3
+ sltu $a3,$h2,$a3
+ addu $h3,$h3,$a3
+
+
+ mflo ($a3,$r3,$d0)
+ mfhi ($t1,$r3,$d0)
+ addu $h2,$h2,$at
+ sltu $at,$h2,$at
+ multu ($r2,$d1) # d1*r2
+ addu $h3,$h3,$at
+
+ mflo ($at,$r2,$d1)
+ mfhi ($t0,$r2,$d1)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ multu ($r0,$d3) # d3*r0
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$r0,$d3)
+ mfhi ($d3,$r0,$d3)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r1,$d2) # d2*r1
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+ mflo ($at,$r1,$d2)
+ mfhi ($t0,$r1,$d2)
+ addu $h3,$h3,$a3
+ addu $t1,$t1,$d3
+ multu ($rs3,$h4) # h4*s3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+
+ mflo ($a3,$rs3,$h4)
+ addu $h3,$h3,$at
+ addu $t1,$t1,$t0
+ multu ($r0,$h4) # h4*r0
+ sltu $at,$h3,$at
+ addu $t1,$t1,$at
+
+
+ mflo ($h4,$r0,$h4)
+ addu $h3,$h3,$a3
+ sltu $a3,$h3,$a3
+ addu $t1,$t1,$a3
+ addu $h4,$h4,$t1
+
+ li $padbit,1 # if we loop, padbit is 1
+#endif
+ bne $inp,$len,.Loop
+
+ sw $h0,0($ctx) # store hash value
+ sw $h1,4($ctx)
+ sw $h2,8($ctx)
+ sw $h3,12($ctx)
+ sw $h4,16($ctx)
+
+ .set noreorder
+.Labort:
+ lw $s11,4*11($sp)
+ lw $s10,4*10($sp)
+ lw $s9, 4*9($sp)
+ lw $s8, 4*8($sp)
+ lw $s7, 4*7($sp)
+ lw $s6, 4*6($sp)
+ lw $s5, 4*5($sp)
+ lw $s4, 4*4($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+ lw $s3, 4*3($sp)
+ lw $s2, 4*2($sp)
+ lw $s1, 4*1($sp)
+ lw $s0, 4*0($sp)
+___
+$code.=<<___;
+ jr $ra
+ addu $sp,$sp,4*12
+.end poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
+
+$code.=<<___;
+.align 5
+.globl poly1305_emit
+.ent poly1305_emit
+poly1305_emit:
+ .frame $sp,0,$ra
+ .set reorder
+
+ lw $tmp4,16($ctx)
+ lw $tmp0,0($ctx)
+ lw $tmp1,4($ctx)
+ lw $tmp2,8($ctx)
+ lw $tmp3,12($ctx)
+
+ li $in0,-4 # final reduction
+ srl $ctx,$tmp4,2
+ and $in0,$in0,$tmp4
+ andi $tmp4,$tmp4,3
+ addu $ctx,$ctx,$in0
+
+ addu $tmp0,$tmp0,$ctx
+ sltu $ctx,$tmp0,$ctx
+ addiu $in0,$tmp0,5 # compare to modulus
+ addu $tmp1,$tmp1,$ctx
+ sltiu $in1,$in0,5
+ sltu $ctx,$tmp1,$ctx
+ addu $in1,$in1,$tmp1
+ addu $tmp2,$tmp2,$ctx
+ sltu $in2,$in1,$tmp1
+ sltu $ctx,$tmp2,$ctx
+ addu $in2,$in2,$tmp2
+ addu $tmp3,$tmp3,$ctx
+ sltu $in3,$in2,$tmp2
+ sltu $ctx,$tmp3,$ctx
+ addu $in3,$in3,$tmp3
+ addu $tmp4,$tmp4,$ctx
+ sltu $ctx,$in3,$tmp3
+ addu $ctx,$tmp4
+
+ srl $ctx,2 # see if it carried/borrowed
+ subu $ctx,$zero,$ctx
+
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+ and $in0,$ctx
+ and $in1,$ctx
+ and $in2,$ctx
+ and $in3,$ctx
+ xor $in0,$tmp0
+ xor $in1,$tmp1
+ xor $in2,$tmp2
+ xor $in3,$tmp3
+
+ lw $tmp0,0($nonce) # load nonce
+ lw $tmp1,4($nonce)
+ lw $tmp2,8($nonce)
+ lw $tmp3,12($nonce)
+
+ addu $in0,$tmp0 # accumulate nonce
+ sltu $ctx,$in0,$tmp0
+
+ addu $in1,$tmp1
+ sltu $tmp1,$in1,$tmp1
+ addu $in1,$ctx
+ sltu $ctx,$in1,$ctx
+ addu $ctx,$tmp1
+
+ addu $in2,$tmp2
+ sltu $tmp2,$in2,$tmp2
+ addu $in2,$ctx
+ sltu $ctx,$in2,$ctx
+ addu $ctx,$tmp2
+
+ addu $in3,$tmp3
+ addu $in3,$ctx
+
+ srl $tmp0,$in0,8 # write mac value
+ srl $tmp1,$in0,16
+ srl $tmp2,$in0,24
+ sb $in0, 0($mac)
+ sb $tmp0,1($mac)
+ srl $tmp0,$in1,8
+ sb $tmp1,2($mac)
+ srl $tmp1,$in1,16
+ sb $tmp2,3($mac)
+ srl $tmp2,$in1,24
+ sb $in1, 4($mac)
+ sb $tmp0,5($mac)
+ srl $tmp0,$in2,8
+ sb $tmp1,6($mac)
+ srl $tmp1,$in2,16
+ sb $tmp2,7($mac)
+ srl $tmp2,$in2,24
+ sb $in2, 8($mac)
+ sb $tmp0,9($mac)
+ srl $tmp0,$in3,8
+ sb $tmp1,10($mac)
+ srl $tmp1,$in3,16
+ sb $tmp2,11($mac)
+ srl $tmp2,$in3,24
+ sb $in3, 12($mac)
+ sb $tmp0,13($mac)
+ sb $tmp1,14($mac)
+ sb $tmp2,15($mac)
+
+ jr $ra
+.end poly1305_emit
+.rdata
+.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+}
+}}}
+
+$output=pop and open STDOUT,">$output";
+print $code;
+close STDOUT;
--- /dev/null
+++ b/include/crypto/blake2s.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef BLAKE2S_H
+#define BLAKE2S_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/bug.h>
+
+enum blake2s_lengths {
+ BLAKE2S_BLOCK_SIZE = 64,
+ BLAKE2S_HASH_SIZE = 32,
+ BLAKE2S_KEY_SIZE = 32,
+
+ BLAKE2S_128_HASH_SIZE = 16,
+ BLAKE2S_160_HASH_SIZE = 20,
+ BLAKE2S_224_HASH_SIZE = 28,
+ BLAKE2S_256_HASH_SIZE = 32,
+};
+
+struct blake2s_state {
+ u32 h[8];
+ u32 t[2];
+ u32 f[2];
+ u8 buf[BLAKE2S_BLOCK_SIZE];
+ unsigned int buflen;
+ unsigned int outlen;
+};
+
+enum blake2s_iv {
+ BLAKE2S_IV0 = 0x6A09E667UL,
+ BLAKE2S_IV1 = 0xBB67AE85UL,
+ BLAKE2S_IV2 = 0x3C6EF372UL,
+ BLAKE2S_IV3 = 0xA54FF53AUL,
+ BLAKE2S_IV4 = 0x510E527FUL,
+ BLAKE2S_IV5 = 0x9B05688CUL,
+ BLAKE2S_IV6 = 0x1F83D9ABUL,
+ BLAKE2S_IV7 = 0x5BE0CD19UL,
+};
+
+void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen);
+void blake2s_final(struct blake2s_state *state, u8 *out);
+
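+/*
+ * The 32-bit parameter word XORed into IV0 encodes the BLAKE2s parameter
+ * block: digest length in the low byte, key length in the next byte, and
+ * fanout = depth = 1 (0x0101) above that, matching the values passed in
+ * by blake2s_init() and blake2s_init_key() below.
+ */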
+static inline void blake2s_init_param(struct blake2s_state *state,
+ const u32 param)
+{
+ *state = (struct blake2s_state){{
+ BLAKE2S_IV0 ^ param,
+ BLAKE2S_IV1,
+ BLAKE2S_IV2,
+ BLAKE2S_IV3,
+ BLAKE2S_IV4,
+ BLAKE2S_IV5,
+ BLAKE2S_IV6,
+ BLAKE2S_IV7,
+ }};
+}
+
+static inline void blake2s_init(struct blake2s_state *state,
+ const size_t outlen)
+{
+ blake2s_init_param(state, 0x01010000 | outlen);
+ state->outlen = outlen;
+}
+
+static inline void blake2s_init_key(struct blake2s_state *state,
+ const size_t outlen, const void *key,
+ const size_t keylen)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE ||
+ !key || !keylen || keylen > BLAKE2S_KEY_SIZE));
+
+ blake2s_init_param(state, 0x01010000 | keylen << 8 | outlen);
+ memcpy(state->buf, key, keylen);
+ state->buflen = BLAKE2S_BLOCK_SIZE;
+ state->outlen = outlen;
+}
+
+static inline void blake2s(u8 *out, const u8 *in, const u8 *key,
+ const size_t outlen, const size_t inlen,
+ const size_t keylen)
+{
+ struct blake2s_state state;
+
+ WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen ||
+ outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE ||
+ (!key && keylen)));
+
+ if (keylen)
+ blake2s_init_key(&state, outlen, key, keylen);
+ else
+ blake2s_init(&state, outlen);
+
+ blake2s_update(&state, in, inlen);
+ blake2s_final(&state, out);
+}
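+
+/*
+ * A minimal usage sketch (hypothetical caller, not part of this API):
+ * hashing a buffer into a full 32-byte unkeyed digest would look like
+ *
+ *	u8 digest[BLAKE2S_HASH_SIZE];
+ *
+ *	blake2s(digest, data, NULL, BLAKE2S_HASH_SIZE, data_len, 0);
+ */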
+
+void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
+ const size_t keylen);
+
+#endif /* BLAKE2S_H */
--- /dev/null
+++ b/include/crypto/internal/blake2s.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+
+#ifndef BLAKE2S_INTERNAL_H
+#define BLAKE2S_INTERNAL_H
+
+#include <crypto/blake2s.h>
+
+struct blake2s_tfm_ctx {
+ u8 key[BLAKE2S_KEY_SIZE];
+ unsigned int keylen;
+};
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc);
+
+void blake2s_compress_arch(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc);
+
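+/*
+ * Setting f[0] to all ones marks the block passed to the next compression
+ * call as the final one.
+ */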
+static inline void blake2s_set_lastblock(struct blake2s_state *state)
+{
+ state->f[0] = -1;
+}
+
+#endif /* BLAKE2S_INTERNAL_H */
--- /dev/null
+++ b/lib/crypto/blake2s-generic.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ *
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+static const u8 blake2s_sigma[10][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+};
+
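+/*
+ * t[0] and t[1] together form the 64-bit message byte counter; the carry
+ * out of the low word is propagated into the high word.
+ */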
+static inline void blake2s_increment_counter(struct blake2s_state *state,
+ const u32 inc)
+{
+ state->t[0] += inc;
+ state->t[1] += (state->t[0] < inc);
+}
+
+void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
+ size_t nblocks, const u32 inc)
+{
+ u32 m[16];
+ u32 v[16];
+ int i;
+
+ WARN_ON(IS_ENABLED(DEBUG) &&
+ (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
+
+ while (nblocks > 0) {
+ blake2s_increment_counter(state, inc);
+ memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+ le32_to_cpu_array(m, ARRAY_SIZE(m));
+ memcpy(v, state->h, 32);
+ v[ 8] = BLAKE2S_IV0;
+ v[ 9] = BLAKE2S_IV1;
+ v[10] = BLAKE2S_IV2;
+ v[11] = BLAKE2S_IV3;
+ v[12] = BLAKE2S_IV4 ^ state->t[0];
+ v[13] = BLAKE2S_IV5 ^ state->t[1];
+ v[14] = BLAKE2S_IV6 ^ state->f[0];
+ v[15] = BLAKE2S_IV7 ^ state->f[1];
+
+#define G(r, i, a, b, c, d) do { \
+ a += b + m[blake2s_sigma[r][2 * i + 0]]; \
+ d = ror32(d ^ a, 16); \
+ c += d; \
+ b = ror32(b ^ c, 12); \
+ a += b + m[blake2s_sigma[r][2 * i + 1]]; \
+ d = ror32(d ^ a, 8); \
+ c += d; \
+ b = ror32(b ^ c, 7); \
+} while (0)
+
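+/*
+ * Each round applies G to the four columns of the 4x4 state matrix
+ * (i = 0..3) and then to the four diagonals (i = 4..7), as specified for
+ * BLAKE2s in RFC 7693.
+ */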
+#define ROUND(r) do { \
+ G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
+ G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
+ G(r, 2, v[2], v[ 6], v[10], v[14]); \
+ G(r, 3, v[3], v[ 7], v[11], v[15]); \
+ G(r, 4, v[0], v[ 5], v[10], v[15]); \
+ G(r, 5, v[1], v[ 6], v[11], v[12]); \
+ G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
+ G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
+} while (0)
+ ROUND(0);
+ ROUND(1);
+ ROUND(2);
+ ROUND(3);
+ ROUND(4);
+ ROUND(5);
+ ROUND(6);
+ ROUND(7);
+ ROUND(8);
+ ROUND(9);
+
+#undef G
+#undef ROUND
+
+ for (i = 0; i < 8; ++i)
+ state->h[i] ^= v[i] ^ v[i + 8];
+
+ block += BLAKE2S_BLOCK_SIZE;
+ --nblocks;
+ }
+}
+
+EXPORT_SYMBOL(blake2s_compress_generic);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("BLAKE2s hash function");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--- /dev/null
+++ b/lib/crypto/blake2s-selftest.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/blake2s.h>
+#include <linux/string.h>
+
+/*
+ * blake2s_testvecs[] generated with the program below (using libb2-dev and
+ * libssl-dev [OpenSSL])
+ *
+ * #include <blake2.h>
+ * #include <stdio.h>
+ * #include <stdint.h>
+ *
+ * #include <openssl/evp.h>
+ * #include <openssl/hmac.h>
+ *
+ * #define BLAKE2S_TESTVEC_COUNT 256
+ *
+ * static void print_vec(const uint8_t vec[], int len)
+ * {
+ * int i;
+ *
+ * printf(" { ");
+ * for (i = 0; i < len; i++) {
+ * if (i && (i % 12) == 0)
+ * printf("\n ");
+ * printf("0x%02x, ", vec[i]);
+ * }
+ * printf("},\n");
+ * }
+ *
+ * int main(void)
+ * {
+ * uint8_t key[BLAKE2S_KEYBYTES];
+ * uint8_t buf[BLAKE2S_TESTVEC_COUNT];
+ * uint8_t hash[BLAKE2S_OUTBYTES];
+ * int i, j;
+ *
+ * key[0] = key[1] = 1;
+ * for (i = 2; i < BLAKE2S_KEYBYTES; ++i)
+ * key[i] = key[i - 2] + key[i - 1];
+ *
+ * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i)
+ * buf[i] = (uint8_t)i;
+ *
+ * printf("static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
+ *
+ * for (i = 0; i < BLAKE2S_TESTVEC_COUNT; ++i) {
+ * int outlen = 1 + i % BLAKE2S_OUTBYTES;
+ * int keylen = (13 * i) % (BLAKE2S_KEYBYTES + 1);
+ *
+ * blake2s(hash, buf, key + BLAKE2S_KEYBYTES - keylen, outlen, i,
+ * keylen);
+ * print_vec(hash, outlen);
+ * }
+ * printf("};\n\n");
+ *
+ * printf("static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {\n");
+ *
+ * HMAC(EVP_blake2s256(), key, sizeof(key), buf, sizeof(buf), hash, NULL);
+ * print_vec(hash, BLAKE2S_OUTBYTES);
+ *
+ * HMAC(EVP_blake2s256(), buf, sizeof(buf), key, sizeof(key), hash, NULL);
+ * print_vec(hash, BLAKE2S_OUTBYTES);
+ *
+ * printf("};\n");
+ *
+ * return 0;
+ *}
+ */
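+/*
+ * The vectors cover every digest length from 1 to 32 bytes, with key
+ * lengths cycling through 0 to 32 bytes, so both the keyed and unkeyed
+ * paths of blake2s() are exercised.
+ */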
+static const u8 blake2s_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
+ { 0xa1, },
+ { 0x7c, 0x89, },
+ { 0x74, 0x0e, 0xd4, },
+ { 0x47, 0x0c, 0x21, 0x15, },
+ { 0x18, 0xd6, 0x9c, 0xa6, 0xc4, },
+ { 0x13, 0x5d, 0x16, 0x63, 0x2e, 0xf9, },
+ { 0x2c, 0xb5, 0x04, 0xb7, 0x99, 0xe2, 0x73, },
+ { 0x9a, 0x0f, 0xd2, 0x39, 0xd6, 0x68, 0x1b, 0x92, },
+ { 0xc8, 0xde, 0x7a, 0xea, 0x2f, 0xf4, 0xd2, 0xe3, 0x2b, },
+ { 0x5b, 0xf9, 0x43, 0x52, 0x0c, 0x12, 0xba, 0xb5, 0x93, 0x9f, },
+ { 0xc6, 0x2c, 0x4e, 0x80, 0xfc, 0x32, 0x5b, 0x33, 0xb8, 0xb8, 0x0a, },
+ { 0xa7, 0x5c, 0xfd, 0x3a, 0xcc, 0xbf, 0x90, 0xca, 0xb7, 0x97, 0xde, 0xd8, },
+ { 0x66, 0xca, 0x3c, 0xc4, 0x19, 0xef, 0x92, 0x66, 0x3f, 0x21, 0x8f, 0xda,
+ 0xb7, },
+ { 0xba, 0xe5, 0xbb, 0x30, 0x25, 0x94, 0x6d, 0xc3, 0x89, 0x09, 0xc4, 0x25,
+ 0x52, 0x3e, },
+ { 0xa2, 0xef, 0x0e, 0x52, 0x0b, 0x5f, 0xa2, 0x01, 0x6d, 0x0a, 0x25, 0xbc,
+ 0x57, 0xe2, 0x27, },
+ { 0x4f, 0xe0, 0xf9, 0x52, 0x12, 0xda, 0x84, 0xb7, 0xab, 0xae, 0xb0, 0xa6,
+ 0x47, 0x2a, 0xc7, 0xf5, },
+ { 0x56, 0xe7, 0xa8, 0x1c, 0x4c, 0xca, 0xed, 0x90, 0x31, 0xec, 0x87, 0x43,
+ 0xe7, 0x72, 0x08, 0xec, 0xbe, },
+ { 0x7e, 0xdf, 0x80, 0x1c, 0x93, 0x33, 0xfd, 0x53, 0x44, 0xba, 0xfd, 0x96,
+ 0xe1, 0xbb, 0xb5, 0x65, 0xa5, 0x00, },
+ { 0xec, 0x6b, 0xed, 0xf7, 0x7b, 0x62, 0x1d, 0x7d, 0xf4, 0x82, 0xf3, 0x1e,
+ 0x18, 0xff, 0x2b, 0xc4, 0x06, 0x20, 0x2a, },
+ { 0x74, 0x98, 0xd7, 0x68, 0x63, 0xed, 0x87, 0xe4, 0x5d, 0x8d, 0x9e, 0x1d,
+ 0xfd, 0x2a, 0xbb, 0x86, 0xac, 0xe9, 0x2a, 0x89, },
+ { 0x89, 0xc3, 0x88, 0xce, 0x2b, 0x33, 0x1e, 0x10, 0xd1, 0x37, 0x20, 0x86,
+ 0x28, 0x43, 0x70, 0xd9, 0xfb, 0x96, 0xd9, 0xb5, 0xd3, },
+ { 0xcb, 0x56, 0x74, 0x41, 0x8d, 0x80, 0x01, 0x9a, 0x6b, 0x38, 0xe1, 0x41,
+ 0xad, 0x9c, 0x62, 0x74, 0xce, 0x35, 0xd5, 0x6c, 0x89, 0x6e, },
+ { 0x79, 0xaf, 0x94, 0x59, 0x99, 0x26, 0xe1, 0xc9, 0x34, 0xfe, 0x7c, 0x22,
+ 0xf7, 0x43, 0xd7, 0x65, 0xd4, 0x48, 0x18, 0xac, 0x3d, 0xfd, 0x93, },
+ { 0x85, 0x0d, 0xff, 0xb8, 0x3e, 0x87, 0x41, 0xb0, 0x95, 0xd3, 0x3d, 0x00,
+ 0x47, 0x55, 0x9e, 0xd2, 0x69, 0xea, 0xbf, 0xe9, 0x7a, 0x2d, 0x61, 0x45, },
+ { 0x03, 0xe0, 0x85, 0xec, 0x54, 0xb5, 0x16, 0x53, 0xa8, 0xc4, 0x71, 0xe9,
+ 0x6a, 0xe7, 0xcb, 0xc4, 0x15, 0x02, 0xfc, 0x34, 0xa4, 0xa4, 0x28, 0x13,
+ 0xd1, },
+ { 0xe3, 0x34, 0x4b, 0xe1, 0xd0, 0x4b, 0x55, 0x61, 0x8f, 0xc0, 0x24, 0x05,
+ 0xe6, 0xe0, 0x3d, 0x70, 0x24, 0x4d, 0xda, 0xb8, 0x91, 0x05, 0x29, 0x07,
+ 0x01, 0x3e, },
+ { 0x61, 0xff, 0x01, 0x72, 0xb1, 0x4d, 0xf6, 0xfe, 0xd1, 0xd1, 0x08, 0x74,
+ 0xe6, 0x91, 0x44, 0xeb, 0x61, 0xda, 0x40, 0xaf, 0xfc, 0x8c, 0x91, 0x6b,
+ 0xec, 0x13, 0xed, },
+ { 0xd4, 0x40, 0xd2, 0xa0, 0x7f, 0xc1, 0x58, 0x0c, 0x85, 0xa0, 0x86, 0xc7,
+ 0x86, 0xb9, 0x61, 0xc9, 0xea, 0x19, 0x86, 0x1f, 0xab, 0x07, 0xce, 0x37,
+ 0x72, 0x67, 0x09, 0xfc, },
+ { 0x9e, 0xf8, 0x18, 0x67, 0x93, 0x10, 0x9b, 0x39, 0x75, 0xe8, 0x8b, 0x38,
+ 0x82, 0x7d, 0xb8, 0xb7, 0xa5, 0xaf, 0xe6, 0x6a, 0x22, 0x5e, 0x1f, 0x9c,
+ 0x95, 0x29, 0x19, 0xf2, 0x4b, },
+ { 0xc8, 0x62, 0x25, 0xf5, 0x98, 0xc9, 0xea, 0xe5, 0x29, 0x3a, 0xd3, 0x22,
+ 0xeb, 0xeb, 0x07, 0x7c, 0x15, 0x07, 0xee, 0x15, 0x61, 0xbb, 0x05, 0x30,
+ 0x99, 0x7f, 0x11, 0xf6, 0x0a, 0x1d, },
+ { 0x68, 0x70, 0xf7, 0x90, 0xa1, 0x8b, 0x1f, 0x0f, 0xbb, 0xce, 0xd2, 0x0e,
+ 0x33, 0x1f, 0x7f, 0xa9, 0x78, 0xa8, 0xa6, 0x81, 0x66, 0xab, 0x8d, 0xcd,
+ 0x58, 0x55, 0x3a, 0x0b, 0x7a, 0xdb, 0xb5, },
+ { 0xdd, 0x35, 0xd2, 0xb4, 0xf6, 0xc7, 0xea, 0xab, 0x64, 0x24, 0x4e, 0xfe,
+ 0xe5, 0x3d, 0x4e, 0x95, 0x8b, 0x6d, 0x6c, 0xbc, 0xb0, 0xf8, 0x88, 0x61,
+ 0x09, 0xb7, 0x78, 0xa3, 0x31, 0xfe, 0xd9, 0x2f, },
+ { 0x0a, },
+ { 0x6e, 0xd4, },
+ { 0x64, 0xe9, 0xd1, },
+ { 0x30, 0xdd, 0x71, 0xef, },
+ { 0x11, 0xb5, 0x0c, 0x87, 0xc9, },
+ { 0x06, 0x1c, 0x6d, 0x04, 0x82, 0xd0, },
+ { 0x5c, 0x42, 0x0b, 0xee, 0xc5, 0x9c, 0xb2, },
+ { 0xe8, 0x29, 0xd6, 0xb4, 0x5d, 0xf7, 0x2b, 0x93, },
+ { 0x18, 0xca, 0x27, 0x72, 0x43, 0x39, 0x16, 0xbc, 0x6a, },
+ { 0x39, 0x8f, 0xfd, 0x64, 0xf5, 0x57, 0x23, 0xb0, 0x45, 0xf8, },
+ { 0xbb, 0x3a, 0x78, 0x6b, 0x02, 0x1d, 0x0b, 0x16, 0xe3, 0xb2, 0x9a, },
+ { 0xb8, 0xb4, 0x0b, 0xe5, 0xd4, 0x1d, 0x0d, 0x85, 0x49, 0x91, 0x35, 0xfa, },
+ { 0x6d, 0x48, 0x2a, 0x0c, 0x42, 0x08, 0xbd, 0xa9, 0x78, 0x6f, 0x18, 0xaf,
+ 0xe2, },
+ { 0x10, 0x45, 0xd4, 0x58, 0x88, 0xec, 0x4e, 0x1e, 0xf6, 0x14, 0x92, 0x64,
+ 0x7e, 0xb0, },
+ { 0x8b, 0x0b, 0x95, 0xee, 0x92, 0xc6, 0x3b, 0x91, 0xf1, 0x1e, 0xeb, 0x51,
+ 0x98, 0x0a, 0x8d, },
+ { 0xa3, 0x50, 0x4d, 0xa5, 0x1d, 0x03, 0x68, 0xe9, 0x57, 0x78, 0xd6, 0x04,
+ 0xf1, 0xc3, 0x94, 0xd8, },
+ { 0xb8, 0x66, 0x6e, 0xdd, 0x46, 0x15, 0xae, 0x3d, 0x83, 0x7e, 0xcf, 0xe7,
+ 0x2c, 0xe8, 0x8f, 0xc7, 0x34, },
+ { 0x2e, 0xc0, 0x1f, 0x29, 0xea, 0xf6, 0xb9, 0xe2, 0xc2, 0x93, 0xeb, 0x41,
+ 0x0d, 0xf0, 0x0a, 0x13, 0x0e, 0xa2, },
+ { 0x71, 0xb8, 0x33, 0xa9, 0x1b, 0xac, 0xf1, 0xb5, 0x42, 0x8f, 0x5e, 0x81,
+ 0x34, 0x43, 0xb7, 0xa4, 0x18, 0x5c, 0x47, },
+ { 0xda, 0x45, 0xb8, 0x2e, 0x82, 0x1e, 0xc0, 0x59, 0x77, 0x9d, 0xfa, 0xb4,
+ 0x1c, 0x5e, 0xa0, 0x2b, 0x33, 0x96, 0x5a, 0x58, },
+ { 0xe3, 0x09, 0x05, 0xa9, 0xeb, 0x48, 0x13, 0xad, 0x71, 0x88, 0x81, 0x9a,
+ 0x3e, 0x2c, 0xe1, 0x23, 0x99, 0x13, 0x35, 0x9f, 0xb5, },
+ { 0xb7, 0x86, 0x2d, 0x16, 0xe1, 0x04, 0x00, 0x47, 0x47, 0x61, 0x31, 0xfb,
+ 0x14, 0xac, 0xd8, 0xe9, 0xe3, 0x49, 0xbd, 0xf7, 0x9c, 0x3f, },
+ { 0x7f, 0xd9, 0x95, 0xa8, 0xa7, 0xa0, 0xcc, 0xba, 0xef, 0xb1, 0x0a, 0xa9,
+ 0x21, 0x62, 0x08, 0x0f, 0x1b, 0xff, 0x7b, 0x9d, 0xae, 0xb2, 0x95, },
+ { 0x85, 0x99, 0xea, 0x33, 0xe0, 0x56, 0xff, 0x13, 0xc6, 0x61, 0x8c, 0xf9,
+ 0x57, 0x05, 0x03, 0x11, 0xf9, 0xfb, 0x3a, 0xf7, 0xce, 0xbb, 0x52, 0x30, },
+ { 0xb2, 0x72, 0x9c, 0xf8, 0x77, 0x4e, 0x8f, 0x6b, 0x01, 0x6c, 0xff, 0x4e,
+ 0x4f, 0x02, 0xd2, 0xbc, 0xeb, 0x51, 0x28, 0x99, 0x50, 0xab, 0xc4, 0x42,
+ 0xe3, },
+ { 0x8b, 0x0a, 0xb5, 0x90, 0x8f, 0xf5, 0x7b, 0xdd, 0xba, 0x47, 0x37, 0xc9,
+ 0x2a, 0xd5, 0x4b, 0x25, 0x08, 0x8b, 0x02, 0x17, 0xa7, 0x9e, 0x6b, 0x6e,
+ 0xe3, 0x90, },
+ { 0x90, 0xdd, 0xf7, 0x75, 0xa7, 0xa3, 0x99, 0x5e, 0x5b, 0x7d, 0x75, 0xc3,
+ 0x39, 0x6b, 0xa0, 0xe2, 0x44, 0x53, 0xb1, 0x9e, 0xc8, 0xf1, 0x77, 0x10,
+ 0x58, 0x06, 0x9a, },
+ { 0x99, 0x52, 0xf0, 0x49, 0xa8, 0x8c, 0xec, 0xa6, 0x97, 0x32, 0x13, 0xb5,
+ 0xf7, 0xa3, 0x8e, 0xfb, 0x4b, 0x59, 0x31, 0x3d, 0x01, 0x59, 0x98, 0x5d,
+ 0x53, 0x03, 0x1a, 0x39, },
+ { 0x9f, 0xe0, 0xc2, 0xe5, 0x5d, 0x93, 0xd6, 0x9b, 0x47, 0x8f, 0x9b, 0xe0,
+ 0x26, 0x35, 0x84, 0x20, 0x1d, 0xc5, 0x53, 0x10, 0x0f, 0x22, 0xb9, 0xb5,
+ 0xd4, 0x36, 0xb1, 0xac, 0x73, },
+ { 0x30, 0x32, 0x20, 0x3b, 0x10, 0x28, 0xec, 0x1f, 0x4f, 0x9b, 0x47, 0x59,
+ 0xeb, 0x7b, 0xee, 0x45, 0xfb, 0x0c, 0x49, 0xd8, 0x3d, 0x69, 0xbd, 0x90,
+ 0x2c, 0xf0, 0x9e, 0x8d, 0xbf, 0xd5, },
+ { 0x2a, 0x37, 0x73, 0x7f, 0xf9, 0x96, 0x19, 0xaa, 0x25, 0xd8, 0x13, 0x28,
+ 0x01, 0x29, 0x89, 0xdf, 0x6e, 0x0c, 0x9b, 0x43, 0x44, 0x51, 0xe9, 0x75,
+ 0x26, 0x0c, 0xb7, 0x87, 0x66, 0x0b, 0x5f, },
+ { 0x23, 0xdf, 0x96, 0x68, 0x91, 0x86, 0xd0, 0x93, 0x55, 0x33, 0x24, 0xf6,
+ 0xba, 0x08, 0x75, 0x5b, 0x59, 0x11, 0x69, 0xb8, 0xb9, 0xe5, 0x2c, 0x77,
+ 0x02, 0xf6, 0x47, 0xee, 0x81, 0xdd, 0xb9, 0x06, },
+ { 0x9d, },
+ { 0x9d, 0x7d, },
+ { 0xfd, 0xc3, 0xda, },
+ { 0xe8, 0x82, 0xcd, 0x21, },
+ { 0xc3, 0x1d, 0x42, 0x4c, 0x74, },
+ { 0xe9, 0xda, 0xf1, 0xa2, 0xe5, 0x7c, },
+ { 0x52, 0xb8, 0x6f, 0x81, 0x5c, 0x3a, 0x4c, },
+ { 0x5b, 0x39, 0x26, 0xfc, 0x92, 0x5e, 0xe0, 0x49, },
+ { 0x59, 0xe4, 0x7c, 0x93, 0x1c, 0xf9, 0x28, 0x93, 0xde, },
+ { 0xde, 0xdf, 0xb2, 0x43, 0x61, 0x0b, 0x86, 0x16, 0x4c, 0x2e, },
+ { 0x14, 0x8f, 0x75, 0x51, 0xaf, 0xb9, 0xee, 0x51, 0x5a, 0xae, 0x23, },
+ { 0x43, 0x5f, 0x50, 0xd5, 0x70, 0xb0, 0x5b, 0x87, 0xf5, 0xd9, 0xb3, 0x6d, },
+ { 0x66, 0x0a, 0x64, 0x93, 0x79, 0x71, 0x94, 0x40, 0xb7, 0x68, 0x2d, 0xd3,
+ 0x63, },
+ { 0x15, 0x00, 0xc4, 0x0c, 0x7d, 0x1b, 0x10, 0xa9, 0x73, 0x1b, 0x90, 0x6f,
+ 0xe6, 0xa9, },
+ { 0x34, 0x75, 0xf3, 0x86, 0x8f, 0x56, 0xcf, 0x2a, 0x0a, 0xf2, 0x62, 0x0a,
+ 0xf6, 0x0e, 0x20, },
+ { 0xb1, 0xde, 0xc9, 0xf5, 0xdb, 0xf3, 0x2f, 0x4c, 0xd6, 0x41, 0x7d, 0x39,
+ 0x18, 0x3e, 0xc7, 0xc3, },
+ { 0xc5, 0x89, 0xb2, 0xf8, 0xb8, 0xc0, 0xa3, 0xb9, 0x3b, 0x10, 0x6d, 0x7c,
+ 0x92, 0xfc, 0x7f, 0x34, 0x41, },
+ { 0xc4, 0xd8, 0xef, 0xba, 0xef, 0xd2, 0xaa, 0xc5, 0x6c, 0x8e, 0x3e, 0xbb,
+ 0x12, 0xfc, 0x0f, 0x72, 0xbf, 0x0f, },
+ { 0xdd, 0x91, 0xd1, 0x15, 0x9e, 0x7d, 0xf8, 0xc1, 0xb9, 0x14, 0x63, 0x96,
+ 0xb5, 0xcb, 0x83, 0x1d, 0x35, 0x1c, 0xec, },
+ { 0xa9, 0xf8, 0x52, 0xc9, 0x67, 0x76, 0x2b, 0xad, 0xfb, 0xd8, 0x3a, 0xa6,
+ 0x74, 0x02, 0xae, 0xb8, 0x25, 0x2c, 0x63, 0x49, },
+ { 0x77, 0x1f, 0x66, 0x70, 0xfd, 0x50, 0x29, 0xaa, 0xeb, 0xdc, 0xee, 0xba,
+ 0x75, 0x98, 0xdc, 0x93, 0x12, 0x3f, 0xdc, 0x7c, 0x38, },
+ { 0xe2, 0xe1, 0x89, 0x5c, 0x37, 0x38, 0x6a, 0xa3, 0x40, 0xac, 0x3f, 0xb0,
+ 0xca, 0xfc, 0xa7, 0xf3, 0xea, 0xf9, 0x0f, 0x5d, 0x8e, 0x39, },
+ { 0x0f, 0x67, 0xc8, 0x38, 0x01, 0xb1, 0xb7, 0xb8, 0xa2, 0xe7, 0x0a, 0x6d,
+ 0xd2, 0x63, 0x69, 0x9e, 0xcc, 0xf0, 0xf2, 0xbe, 0x9b, 0x98, 0xdd, },
+ { 0x13, 0xe1, 0x36, 0x30, 0xfe, 0xc6, 0x01, 0x8a, 0xa1, 0x63, 0x96, 0x59,
+ 0xc2, 0xa9, 0x68, 0x3f, 0x58, 0xd4, 0x19, 0x0c, 0x40, 0xf3, 0xde, 0x02, },
+ { 0xa3, 0x9e, 0xce, 0xda, 0x42, 0xee, 0x8c, 0x6c, 0x5a, 0x7d, 0xdc, 0x89,
+ 0x02, 0x77, 0xdd, 0xe7, 0x95, 0xbb, 0xff, 0x0d, 0xa4, 0xb5, 0x38, 0x1e,
+ 0xaf, },
+ { 0x9a, 0xf6, 0xb5, 0x9a, 0x4f, 0xa9, 0x4f, 0x2c, 0x35, 0x3c, 0x24, 0xdc,
+ 0x97, 0x6f, 0xd9, 0xa1, 0x7d, 0x1a, 0x85, 0x0b, 0xf5, 0xda, 0x2e, 0xe7,
+ 0xb1, 0x1d, },
+ { 0x84, 0x1e, 0x8e, 0x3d, 0x45, 0xa5, 0xf2, 0x27, 0xf3, 0x31, 0xfe, 0xb9,
+ 0xfb, 0xc5, 0x45, 0x99, 0x99, 0xdd, 0x93, 0x43, 0x02, 0xee, 0x58, 0xaf,
+ 0xee, 0x6a, 0xbe, },
+ { 0x07, 0x2f, 0xc0, 0xa2, 0x04, 0xc4, 0xab, 0x7c, 0x26, 0xbb, 0xa8, 0xd8,
+ 0xe3, 0x1c, 0x75, 0x15, 0x64, 0x5d, 0x02, 0x6a, 0xf0, 0x86, 0xe9, 0xcd,
+ 0x5c, 0xef, 0xa3, 0x25, },
+ { 0x2f, 0x3b, 0x1f, 0xb5, 0x91, 0x8f, 0x86, 0xe0, 0xdc, 0x31, 0x48, 0xb6,
+ 0xa1, 0x8c, 0xfd, 0x75, 0xbb, 0x7d, 0x3d, 0xc1, 0xf0, 0x10, 0x9a, 0xd8,
+ 0x4b, 0x0e, 0xe3, 0x94, 0x9f, },
+ { 0x29, 0xbb, 0x8f, 0x6c, 0xd1, 0xf2, 0xb6, 0xaf, 0xe5, 0xe3, 0x2d, 0xdc,
+ 0x6f, 0xa4, 0x53, 0x88, 0xd8, 0xcf, 0x4d, 0x45, 0x42, 0x62, 0xdb, 0xdf,
+ 0xf8, 0x45, 0xc2, 0x13, 0xec, 0x35, },
+ { 0x06, 0x3c, 0xe3, 0x2c, 0x15, 0xc6, 0x43, 0x03, 0x81, 0xfb, 0x08, 0x76,
+ 0x33, 0xcb, 0x02, 0xc1, 0xba, 0x33, 0xe5, 0xe0, 0xd1, 0x92, 0xa8, 0x46,
+ 0x28, 0x3f, 0x3e, 0x9d, 0x2c, 0x44, 0x54, },
+ { 0xea, 0xbb, 0x96, 0xf8, 0xd1, 0x8b, 0x04, 0x11, 0x40, 0x78, 0x42, 0x02,
+ 0x19, 0xd1, 0xbc, 0x65, 0x92, 0xd3, 0xc3, 0xd6, 0xd9, 0x19, 0xe7, 0xc3,
+ 0x40, 0x97, 0xbd, 0xd4, 0xed, 0xfa, 0x5e, 0x28, },
+ { 0x02, },
+ { 0x52, 0xa8, },
+ { 0x38, 0x25, 0x0d, },
+ { 0xe3, 0x04, 0xd4, 0x92, },
+ { 0x97, 0xdb, 0xf7, 0x81, 0xca, },
+ { 0x8a, 0x56, 0x9d, 0x62, 0x56, 0xcc, },
+ { 0xa1, 0x8e, 0x3c, 0x72, 0x8f, 0x63, 0x03, },
+ { 0xf7, 0xf3, 0x39, 0x09, 0x0a, 0xa1, 0xbb, 0x23, },
+ { 0x6b, 0x03, 0xc0, 0xe9, 0xd9, 0x83, 0x05, 0x22, 0x01, },
+ { 0x1b, 0x4b, 0xf5, 0xd6, 0x4f, 0x05, 0x75, 0x91, 0x4c, 0x7f, },
+ { 0x4c, 0x8c, 0x25, 0x20, 0x21, 0xcb, 0xc2, 0x4b, 0x3a, 0x5b, 0x8d, },
+ { 0x56, 0xe2, 0x77, 0xa0, 0xb6, 0x9f, 0x81, 0xec, 0x83, 0x75, 0xc4, 0xf9, },
+ { 0x71, 0x70, 0x0f, 0xad, 0x4d, 0x35, 0x81, 0x9d, 0x88, 0x69, 0xf9, 0xaa,
+ 0xd3, },
+ { 0x50, 0x6e, 0x86, 0x6e, 0x43, 0xc0, 0xc2, 0x44, 0xc2, 0xe2, 0xa0, 0x1c,
+ 0xb7, 0x9a, },
+ { 0xe4, 0x7e, 0x72, 0xc6, 0x12, 0x8e, 0x7c, 0xfc, 0xbd, 0xe2, 0x08, 0x31,
+ 0x3d, 0x47, 0x3d, },
+ { 0x08, 0x97, 0x5b, 0x80, 0xae, 0xc4, 0x1d, 0x50, 0x77, 0xdf, 0x1f, 0xd0,
+ 0x24, 0xf0, 0x17, 0xc0, },
+ { 0x01, 0xb6, 0x29, 0xf4, 0xaf, 0x78, 0x5f, 0xb6, 0x91, 0xdd, 0x76, 0x76,
+ 0xd2, 0xfd, 0x0c, 0x47, 0x40, },
+ { 0xa1, 0xd8, 0x09, 0x97, 0x7a, 0xa6, 0xc8, 0x94, 0xf6, 0x91, 0x7b, 0xae,
+ 0x2b, 0x9f, 0x0d, 0x83, 0x48, 0xf7, },
+ { 0x12, 0xd5, 0x53, 0x7d, 0x9a, 0xb0, 0xbe, 0xd9, 0xed, 0xe9, 0x9e, 0xee,
+ 0x61, 0x5b, 0x42, 0xf2, 0xc0, 0x73, 0xc0, },
+ { 0xd5, 0x77, 0xd6, 0x5c, 0x6e, 0xa5, 0x69, 0x2b, 0x3b, 0x8c, 0xd6, 0x7d,
+ 0x1d, 0xbe, 0x2c, 0xa1, 0x02, 0x21, 0xcd, 0x29, },
+ { 0xa4, 0x98, 0x80, 0xca, 0x22, 0xcf, 0x6a, 0xab, 0x5e, 0x40, 0x0d, 0x61,
+ 0x08, 0x21, 0xef, 0xc0, 0x6c, 0x52, 0xb4, 0xb0, 0x53, },
+ { 0xbf, 0xaf, 0x8f, 0x3b, 0x7a, 0x97, 0x33, 0xe5, 0xca, 0x07, 0x37, 0xfd,
+ 0x15, 0xdf, 0xce, 0x26, 0x2a, 0xb1, 0xa7, 0x0b, 0xb3, 0xac, },
+ { 0x16, 0x22, 0xe1, 0xbc, 0x99, 0x4e, 0x01, 0xf0, 0xfa, 0xff, 0x8f, 0xa5,
+ 0x0c, 0x61, 0xb0, 0xad, 0xcc, 0xb1, 0xe1, 0x21, 0x46, 0xfa, 0x2e, },
+ { 0x11, 0x5b, 0x0b, 0x2b, 0xe6, 0x14, 0xc1, 0xd5, 0x4d, 0x71, 0x5e, 0x17,
+ 0xea, 0x23, 0xdd, 0x6c, 0xbd, 0x1d, 0xbe, 0x12, 0x1b, 0xee, 0x4c, 0x1a, },
+ { 0x40, 0x88, 0x22, 0xf3, 0x20, 0x6c, 0xed, 0xe1, 0x36, 0x34, 0x62, 0x2c,
+ 0x98, 0x83, 0x52, 0xe2, 0x25, 0xee, 0xe9, 0xf5, 0xe1, 0x17, 0xf0, 0x5c,
+ 0xae, },
+ { 0xc3, 0x76, 0x37, 0xde, 0x95, 0x8c, 0xca, 0x2b, 0x0c, 0x23, 0xe7, 0xb5,
+ 0x38, 0x70, 0x61, 0xcc, 0xff, 0xd3, 0x95, 0x7b, 0xf3, 0xff, 0x1f, 0x9d,
+ 0x59, 0x00, },
+ { 0x0c, 0x19, 0x52, 0x05, 0x22, 0x53, 0xcb, 0x48, 0xd7, 0x10, 0x0e, 0x7e,
+ 0x14, 0x69, 0xb5, 0xa2, 0x92, 0x43, 0xa3, 0x9e, 0x4b, 0x8f, 0x51, 0x2c,
+ 0x5a, 0x2c, 0x3b, },
+ { 0xe1, 0x9d, 0x70, 0x70, 0x28, 0xec, 0x86, 0x40, 0x55, 0x33, 0x56, 0xda,
+ 0x88, 0xca, 0xee, 0xc8, 0x6a, 0x20, 0xb1, 0xe5, 0x3d, 0x57, 0xf8, 0x3c,
+ 0x10, 0x07, 0x2a, 0xc4, },
+ { 0x0b, 0xae, 0xf1, 0xc4, 0x79, 0xee, 0x1b, 0x3d, 0x27, 0x35, 0x8d, 0x14,
+ 0xd6, 0xae, 0x4e, 0x3c, 0xe9, 0x53, 0x50, 0xb5, 0xcc, 0x0c, 0xf7, 0xdf,
+ 0xee, 0xa1, 0x74, 0xd6, 0x71, },
+ { 0xe6, 0xa4, 0xf4, 0x99, 0x98, 0xb9, 0x80, 0xea, 0x96, 0x7f, 0x4f, 0x33,
+ 0xcf, 0x74, 0x25, 0x6f, 0x17, 0x6c, 0xbf, 0xf5, 0x5c, 0x38, 0xd0, 0xff,
+ 0x96, 0xcb, 0x13, 0xf9, 0xdf, 0xfd, },
+ { 0xbe, 0x92, 0xeb, 0xba, 0x44, 0x2c, 0x24, 0x74, 0xd4, 0x03, 0x27, 0x3c,
+ 0x5d, 0x5b, 0x03, 0x30, 0x87, 0x63, 0x69, 0xe0, 0xb8, 0x94, 0xf4, 0x44,
+ 0x7e, 0xad, 0xcd, 0x20, 0x12, 0x16, 0x79, },
+ { 0x30, 0xf1, 0xc4, 0x8e, 0x05, 0x90, 0x2a, 0x97, 0x63, 0x94, 0x46, 0xff,
+ 0xce, 0xd8, 0x67, 0xa7, 0xac, 0x33, 0x8c, 0x95, 0xb7, 0xcd, 0xa3, 0x23,
+ 0x98, 0x9d, 0x76, 0x6c, 0x9d, 0xa8, 0xd6, 0x8a, },
+ { 0xbe, },
+ { 0x17, 0x6c, },
+ { 0x1a, 0x42, 0x4f, },
+ { 0xba, 0xaf, 0xb7, 0x65, },
+ { 0xc2, 0x63, 0x43, 0x6a, 0xea, },
+ { 0xe4, 0x4d, 0xad, 0xf2, 0x0b, 0x02, },
+ { 0x04, 0xc7, 0xc4, 0x7f, 0xa9, 0x2b, 0xce, },
+ { 0x66, 0xf6, 0x67, 0xcb, 0x03, 0x53, 0xc8, 0xf1, },
+ { 0x56, 0xa3, 0x60, 0x78, 0xc9, 0x5f, 0x70, 0x1b, 0x5e, },
+ { 0x99, 0xff, 0x81, 0x7c, 0x13, 0x3c, 0x29, 0x79, 0x4b, 0x65, },
+ { 0x51, 0x10, 0x50, 0x93, 0x01, 0x93, 0xb7, 0x01, 0xc9, 0x18, 0xb7, },
+ { 0x8e, 0x3c, 0x42, 0x1e, 0x5e, 0x7d, 0xc1, 0x50, 0x70, 0x1f, 0x00, 0x98, },
+ { 0x5f, 0xd9, 0x9b, 0xc8, 0xd7, 0xb2, 0x72, 0x62, 0x1a, 0x1e, 0xba, 0x92,
+ 0xe9, },
+ { 0x70, 0x2b, 0xba, 0xfe, 0xad, 0x5d, 0x96, 0x3f, 0x27, 0xc2, 0x41, 0x6d,
+ 0xc4, 0xb3, },
+ { 0xae, 0xe0, 0xd5, 0xd4, 0xc7, 0xae, 0x15, 0x5e, 0xdc, 0xdd, 0x33, 0x60,
+ 0xd7, 0xd3, 0x5e, },
+ { 0x79, 0x8e, 0xbc, 0x9e, 0x20, 0xb9, 0x19, 0x4b, 0x63, 0x80, 0xf3, 0x16,
+ 0xaf, 0x39, 0xbd, 0x92, },
+ { 0xc2, 0x0e, 0x85, 0xa0, 0x0b, 0x9a, 0xb0, 0xec, 0xde, 0x38, 0xd3, 0x10,
+ 0xd9, 0xa7, 0x66, 0x27, 0xcf, },
+ { 0x0e, 0x3b, 0x75, 0x80, 0x67, 0x14, 0x0c, 0x02, 0x90, 0xd6, 0xb3, 0x02,
+ 0x81, 0xf6, 0xa6, 0x87, 0xce, 0x58, },
+ { 0x79, 0xb5, 0xe9, 0x5d, 0x52, 0x4d, 0xf7, 0x59, 0xf4, 0x2e, 0x27, 0xdd,
+ 0xb3, 0xed, 0x57, 0x5b, 0x82, 0xea, 0x6f, },
+ { 0xa2, 0x97, 0xf5, 0x80, 0x02, 0x3d, 0xde, 0xa3, 0xf9, 0xf6, 0xab, 0xe3,
+ 0x57, 0x63, 0x7b, 0x9b, 0x10, 0x42, 0x6f, 0xf2, },
+ { 0x12, 0x7a, 0xfc, 0xb7, 0x67, 0x06, 0x0c, 0x78, 0x1a, 0xfe, 0x88, 0x4f,
+ 0xc6, 0xac, 0x52, 0x96, 0x64, 0x28, 0x97, 0x84, 0x06, },
+ { 0xc5, 0x04, 0x44, 0x6b, 0xb2, 0xa5, 0xa4, 0x66, 0xe1, 0x76, 0xa2, 0x51,
+ 0xf9, 0x59, 0x69, 0x97, 0x56, 0x0b, 0xbf, 0x50, 0xb3, 0x34, },
+ { 0x21, 0x32, 0x6b, 0x42, 0xb5, 0xed, 0x71, 0x8d, 0xf7, 0x5a, 0x35, 0xe3,
+ 0x90, 0xe2, 0xee, 0xaa, 0x89, 0xf6, 0xc9, 0x9c, 0x4d, 0x73, 0xf4, },
+ { 0x4c, 0xa6, 0x09, 0xf4, 0x48, 0xe7, 0x46, 0xbc, 0x49, 0xfc, 0xe5, 0xda,
+ 0xd1, 0x87, 0x13, 0x17, 0x4c, 0x59, 0x71, 0x26, 0x5b, 0x2c, 0x42, 0xb7, },
+ { 0x13, 0x63, 0xf3, 0x40, 0x02, 0xe5, 0xa3, 0x3a, 0x5e, 0x8e, 0xf8, 0xb6,
+ 0x8a, 0x49, 0x60, 0x76, 0x34, 0x72, 0x94, 0x73, 0xf6, 0xd9, 0x21, 0x6a,
+ 0x26, },
+ { 0xdf, 0x75, 0x16, 0x10, 0x1b, 0x5e, 0x81, 0xc3, 0xc8, 0xde, 0x34, 0x24,
+ 0xb0, 0x98, 0xeb, 0x1b, 0x8f, 0xa1, 0x9b, 0x05, 0xee, 0xa5, 0xe9, 0x35,
+ 0xf4, 0x1d, },
+ { 0xcd, 0x21, 0x93, 0x6e, 0x5b, 0xa0, 0x26, 0x2b, 0x21, 0x0e, 0xa0, 0xb9,
+ 0x1c, 0xb5, 0xbb, 0xb8, 0xf8, 0x1e, 0xff, 0x5c, 0xa8, 0xf9, 0x39, 0x46,
+ 0x4e, 0x29, 0x26, },
+ { 0x73, 0x7f, 0x0e, 0x3b, 0x0b, 0x5c, 0xf9, 0x60, 0xaa, 0x88, 0xa1, 0x09,
+ 0xb1, 0x5d, 0x38, 0x7b, 0x86, 0x8f, 0x13, 0x7a, 0x8d, 0x72, 0x7a, 0x98,
+ 0x1a, 0x5b, 0xff, 0xc9, },
+ { 0xd3, 0x3c, 0x61, 0x71, 0x44, 0x7e, 0x31, 0x74, 0x98, 0x9d, 0x9a, 0xd2,
+ 0x27, 0xf3, 0x46, 0x43, 0x42, 0x51, 0xd0, 0x5f, 0xe9, 0x1c, 0x5c, 0x69,
+ 0xbf, 0xf6, 0xbe, 0x3c, 0x40, },
+ { 0x31, 0x99, 0x31, 0x9f, 0xaa, 0x43, 0x2e, 0x77, 0x3e, 0x74, 0x26, 0x31,
+ 0x5e, 0x61, 0xf1, 0x87, 0xe2, 0xeb, 0x9b, 0xcd, 0xd0, 0x3a, 0xee, 0x20,
+ 0x7e, 0x10, 0x0a, 0x0b, 0x7e, 0xfa, },
+ { 0xa4, 0x27, 0x80, 0x67, 0x81, 0x2a, 0xa7, 0x62, 0xf7, 0x6e, 0xda, 0xd4,
+ 0x5c, 0x39, 0x74, 0xad, 0x7e, 0xbe, 0xad, 0xa5, 0x84, 0x7f, 0xa9, 0x30,
+ 0x5d, 0xdb, 0xe2, 0x05, 0x43, 0xf7, 0x1b, },
+ { 0x0b, 0x37, 0xd8, 0x02, 0xe1, 0x83, 0xd6, 0x80, 0xf2, 0x35, 0xc2, 0xb0,
+ 0x37, 0xef, 0xef, 0x5e, 0x43, 0x93, 0xf0, 0x49, 0x45, 0x0a, 0xef, 0xb5,
+ 0x76, 0x70, 0x12, 0x44, 0xc4, 0xdb, 0xf5, 0x7a, },
+ { 0x1f, },
+ { 0x82, 0x60, },
+ { 0xcc, 0xe3, 0x08, },
+ { 0x56, 0x17, 0xe4, 0x59, },
+ { 0xe2, 0xd7, 0x9e, 0xc4, 0x4c, },
+ { 0xb2, 0xad, 0xd3, 0x78, 0x58, 0x5a, },
+ { 0xce, 0x43, 0xb4, 0x02, 0x96, 0xab, 0x3c, },
+ { 0xe6, 0x05, 0x1a, 0x73, 0x22, 0x32, 0xbb, 0x77, },
+ { 0x23, 0xe7, 0xda, 0xfe, 0x2c, 0xef, 0x8c, 0x22, 0xec, },
+ { 0xe9, 0x8e, 0x55, 0x38, 0xd1, 0xd7, 0x35, 0x23, 0x98, 0xc7, },
+ { 0xb5, 0x81, 0x1a, 0xe5, 0xb5, 0xa5, 0xd9, 0x4d, 0xca, 0x41, 0xe7, },
+ { 0x41, 0x16, 0x16, 0x95, 0x8d, 0x9e, 0x0c, 0xea, 0x8c, 0x71, 0x9a, 0xc1, },
+ { 0x7c, 0x33, 0xc0, 0xa4, 0x00, 0x62, 0xea, 0x60, 0x67, 0xe4, 0x20, 0xbc,
+ 0x5b, },
+ { 0xdb, 0xb1, 0xdc, 0xfd, 0x08, 0xc0, 0xde, 0x82, 0xd1, 0xde, 0x38, 0xc0,
+ 0x90, 0x48, },
+ { 0x37, 0x18, 0x2e, 0x0d, 0x61, 0xaa, 0x61, 0xd7, 0x86, 0x20, 0x16, 0x60,
+ 0x04, 0xd9, 0xd5, },
+ { 0xb0, 0xcf, 0x2c, 0x4c, 0x5e, 0x5b, 0x4f, 0x2a, 0x23, 0x25, 0x58, 0x47,
+ 0xe5, 0x31, 0x06, 0x70, },
+ { 0x91, 0xa0, 0xa3, 0x86, 0x4e, 0xe0, 0x72, 0x38, 0x06, 0x67, 0x59, 0x5c,
+ 0x70, 0x25, 0xdb, 0x33, 0x27, },
+ { 0x44, 0x58, 0x66, 0xb8, 0x58, 0xc7, 0x13, 0xed, 0x4c, 0xc0, 0xf4, 0x9a,
+ 0x1e, 0x67, 0x75, 0x33, 0xb6, 0xb8, },
+ { 0x7f, 0x98, 0x4a, 0x8e, 0x50, 0xa2, 0x5c, 0xcd, 0x59, 0xde, 0x72, 0xb3,
+ 0x9d, 0xc3, 0x09, 0x8a, 0xab, 0x56, 0xf1, },
+ { 0x80, 0x96, 0x49, 0x1a, 0x59, 0xa2, 0xc5, 0xd5, 0xa7, 0x20, 0x8a, 0xb7,
+ 0x27, 0x62, 0x84, 0x43, 0xc6, 0xe1, 0x1b, 0x5d, },
+ { 0x6b, 0xb7, 0x2b, 0x26, 0x62, 0x14, 0x70, 0x19, 0x3d, 0x4d, 0xac, 0xac,
+ 0x63, 0x58, 0x5e, 0x94, 0xb5, 0xb7, 0xe8, 0xe8, 0xa2, },
+ { 0x20, 0xa8, 0xc0, 0xfd, 0x63, 0x3d, 0x6e, 0x98, 0xcf, 0x0c, 0x49, 0x98,
+ 0xe4, 0x5a, 0xfe, 0x8c, 0xaa, 0x70, 0x82, 0x1c, 0x7b, 0x74, },
+ { 0xc8, 0xe8, 0xdd, 0xdf, 0x69, 0x30, 0x01, 0xc2, 0x0f, 0x7e, 0x2f, 0x11,
+ 0xcc, 0x3e, 0x17, 0xa5, 0x69, 0x40, 0x3f, 0x0e, 0x79, 0x7f, 0xcf, },
+ { 0xdb, 0x61, 0xc0, 0xe2, 0x2e, 0x49, 0x07, 0x31, 0x1d, 0x91, 0x42, 0x8a,
+ 0xfc, 0x5e, 0xd3, 0xf8, 0x56, 0x1f, 0x2b, 0x73, 0xfd, 0x9f, 0xb2, 0x8e, },
+ { 0x0c, 0x89, 0x55, 0x0c, 0x1f, 0x59, 0x2c, 0x9d, 0x1b, 0x29, 0x1d, 0x41,
+ 0x1d, 0xe6, 0x47, 0x8f, 0x8c, 0x2b, 0xea, 0x8f, 0xf0, 0xff, 0x21, 0x70,
+ 0x88, },
+ { 0x12, 0x18, 0x95, 0xa6, 0x59, 0xb1, 0x31, 0x24, 0x45, 0x67, 0x55, 0xa4,
+ 0x1a, 0x2d, 0x48, 0x67, 0x1b, 0x43, 0x88, 0x2d, 0x8e, 0xa0, 0x70, 0xb3,
+ 0xc6, 0xbb, },
+ { 0xe7, 0xb1, 0x1d, 0xb2, 0x76, 0x4d, 0x68, 0x68, 0x68, 0x23, 0x02, 0x55,
+ 0x3a, 0xe2, 0xe5, 0xd5, 0x4b, 0x43, 0xf9, 0x34, 0x77, 0x5c, 0xa1, 0xf5,
+ 0x55, 0xfd, 0x4f, },
+ { 0x8c, 0x87, 0x5a, 0x08, 0x3a, 0x73, 0xad, 0x61, 0xe1, 0xe7, 0x99, 0x7e,
+ 0xf0, 0x5d, 0xe9, 0x5d, 0x16, 0x43, 0x80, 0x2f, 0xd0, 0x66, 0x34, 0xe2,
+ 0x42, 0x64, 0x3b, 0x1a, },
+ { 0x39, 0xc1, 0x99, 0xcf, 0x22, 0xbf, 0x16, 0x8f, 0x9f, 0x80, 0x7f, 0x95,
+ 0x0a, 0x05, 0x67, 0x27, 0xe7, 0x15, 0xdf, 0x9d, 0xb2, 0xfe, 0x1c, 0xb5,
+ 0x1d, 0x60, 0x8f, 0x8a, 0x1d, },
+ { 0x9b, 0x6e, 0x08, 0x09, 0x06, 0x73, 0xab, 0x68, 0x02, 0x62, 0x1a, 0xe4,
+ 0xd4, 0xdf, 0xc7, 0x02, 0x4c, 0x6a, 0x5f, 0xfd, 0x23, 0xac, 0xae, 0x6d,
+ 0x43, 0xa4, 0x7a, 0x50, 0x60, 0x3c, },
+ { 0x1d, 0xb4, 0xc6, 0xe1, 0xb1, 0x4b, 0xe3, 0xf2, 0xe2, 0x1a, 0x73, 0x1b,
+ 0xa0, 0x92, 0xa7, 0xf5, 0xff, 0x8f, 0x8b, 0x5d, 0xdf, 0xa8, 0x04, 0xb3,
+ 0xb0, 0xf7, 0xcc, 0x12, 0xfa, 0x35, 0x46, },
+ { 0x49, 0x45, 0x97, 0x11, 0x0f, 0x1c, 0x60, 0x8e, 0xe8, 0x47, 0x30, 0xcf,
+ 0x60, 0xa8, 0x71, 0xc5, 0x1b, 0xe9, 0x39, 0x4d, 0x49, 0xb6, 0x12, 0x1f,
+ 0x24, 0xab, 0x37, 0xff, 0x83, 0xc2, 0xe1, 0x3a, },
+ { 0x60, },
+ { 0x24, 0x26, },
+ { 0x47, 0xeb, 0xc9, },
+ { 0x4a, 0xd0, 0xbc, 0xf0, },
+ { 0x8e, 0x2b, 0xc9, 0x85, 0x3c, },
+ { 0xa2, 0x07, 0x15, 0xb8, 0x12, 0x74, },
+ { 0x0f, 0xdb, 0x5b, 0x33, 0x69, 0xfe, 0x4b, },
+ { 0xa2, 0x86, 0x54, 0xf4, 0xfd, 0xb2, 0xd4, 0xe6, },
+ { 0xbb, 0x84, 0x78, 0x49, 0x27, 0x8e, 0x61, 0xda, 0x60, },
+ { 0x04, 0xc3, 0xcd, 0xaa, 0x8f, 0xa7, 0x03, 0xc9, 0xf9, 0xb6, },
+ { 0xf8, 0x27, 0x1d, 0x61, 0xdc, 0x21, 0x42, 0xdd, 0xad, 0x92, 0x40, },
+ { 0x12, 0x87, 0xdf, 0xc2, 0x41, 0x45, 0x5a, 0x36, 0x48, 0x5b, 0x51, 0x2b, },
+ { 0xbb, 0x37, 0x5d, 0x1f, 0xf1, 0x68, 0x7a, 0xc4, 0xa5, 0xd2, 0xa4, 0x91,
+ 0x8d, },
+ { 0x5b, 0x27, 0xd1, 0x04, 0x54, 0x52, 0x9f, 0xa3, 0x47, 0x86, 0x33, 0x33,
+ 0xbf, 0xa0, },
+ { 0xcf, 0x04, 0xea, 0xf8, 0x03, 0x2a, 0x43, 0xff, 0xa6, 0x68, 0x21, 0x4c,
+ 0xd5, 0x4b, 0xed, },
+ { 0xaf, 0xb8, 0xbc, 0x63, 0x0f, 0x18, 0x4d, 0xe2, 0x7a, 0xdd, 0x46, 0x44,
+ 0xc8, 0x24, 0x0a, 0xb7, },
+ { 0x3e, 0xdc, 0x36, 0xe4, 0x89, 0xb1, 0xfa, 0xc6, 0x40, 0x93, 0x2e, 0x75,
+ 0xb2, 0x15, 0xd1, 0xb1, 0x10, },
+ { 0x6c, 0xd8, 0x20, 0x3b, 0x82, 0x79, 0xf9, 0xc8, 0xbc, 0x9d, 0xe0, 0x35,
+ 0xbe, 0x1b, 0x49, 0x1a, 0xbc, 0x3a, },
+ { 0x78, 0x65, 0x2c, 0xbe, 0x35, 0x67, 0xdc, 0x78, 0xd4, 0x41, 0xf6, 0xc9,
+ 0xde, 0xde, 0x1f, 0x18, 0x13, 0x31, 0x11, },
+ { 0x8a, 0x7f, 0xb1, 0x33, 0x8f, 0x0c, 0x3c, 0x0a, 0x06, 0x61, 0xf0, 0x47,
+ 0x29, 0x1b, 0x29, 0xbc, 0x1c, 0x47, 0xef, 0x7a, },
+ { 0x65, 0x91, 0xf1, 0xe6, 0xb3, 0x96, 0xd3, 0x8c, 0xc2, 0x4a, 0x59, 0x35,
+ 0x72, 0x8e, 0x0b, 0x9a, 0x87, 0xca, 0x34, 0x7b, 0x63, },
+ { 0x5f, 0x08, 0x87, 0x80, 0x56, 0x25, 0x89, 0x77, 0x61, 0x8c, 0x64, 0xa1,
+ 0x59, 0x6d, 0x59, 0x62, 0xe8, 0x4a, 0xc8, 0x58, 0x99, 0xd1, },
+ { 0x23, 0x87, 0x1d, 0xed, 0x6f, 0xf2, 0x91, 0x90, 0xe2, 0xfe, 0x43, 0x21,
+ 0xaf, 0x97, 0xc6, 0xbc, 0xd7, 0x15, 0xc7, 0x2d, 0x08, 0x77, 0x91, },
+ { 0x90, 0x47, 0x9a, 0x9e, 0x3a, 0xdf, 0xf3, 0xc9, 0x4c, 0x1e, 0xa7, 0xd4,
+ 0x6a, 0x32, 0x90, 0xfe, 0xb7, 0xb6, 0x7b, 0xfa, 0x96, 0x61, 0xfb, 0xa4, },
+ { 0xb1, 0x67, 0x60, 0x45, 0xb0, 0x96, 0xc5, 0x15, 0x9f, 0x4d, 0x26, 0xd7,
+ 0x9d, 0xf1, 0xf5, 0x6d, 0x21, 0x00, 0x94, 0x31, 0x64, 0x94, 0xd3, 0xa7,
+ 0xd3, },
+ { 0x02, 0x3e, 0xaf, 0xf3, 0x79, 0x73, 0xa5, 0xf5, 0xcc, 0x7a, 0x7f, 0xfb,
+ 0x79, 0x2b, 0x85, 0x8c, 0x88, 0x72, 0x06, 0xbe, 0xfe, 0xaf, 0xc1, 0x16,
+ 0xa6, 0xd6, },
+ { 0x2a, 0xb0, 0x1a, 0xe5, 0xaa, 0x6e, 0xb3, 0xae, 0x53, 0x85, 0x33, 0x80,
+ 0x75, 0xae, 0x30, 0xe6, 0xb8, 0x72, 0x42, 0xf6, 0x25, 0x4f, 0x38, 0x88,
+ 0x55, 0xd1, 0xa9, },
+ { 0x90, 0xd8, 0x0c, 0xc0, 0x93, 0x4b, 0x4f, 0x9e, 0x65, 0x6c, 0xa1, 0x54,
+ 0xa6, 0xf6, 0x6e, 0xca, 0xd2, 0xbb, 0x7e, 0x6a, 0x1c, 0xd3, 0xce, 0x46,
+ 0xef, 0xb0, 0x00, 0x8d, },
+ { 0xed, 0x9c, 0x49, 0xcd, 0xc2, 0xde, 0x38, 0x0e, 0xe9, 0x98, 0x6c, 0xc8,
+ 0x90, 0x9e, 0x3c, 0xd4, 0xd3, 0xeb, 0x88, 0x32, 0xc7, 0x28, 0xe3, 0x94,
+ 0x1c, 0x9f, 0x8b, 0xf3, 0xcb, },
+ { 0xac, 0xe7, 0x92, 0x16, 0xb4, 0x14, 0xa0, 0xe4, 0x04, 0x79, 0xa2, 0xf4,
+ 0x31, 0xe6, 0x0c, 0x26, 0xdc, 0xbf, 0x2f, 0x69, 0x1b, 0x55, 0x94, 0x67,
+ 0xda, 0x0c, 0xd7, 0x32, 0x1f, 0xef, },
+ { 0x68, 0x63, 0x85, 0x57, 0x95, 0x9e, 0x42, 0x27, 0x41, 0x43, 0x42, 0x02,
+ 0xa5, 0x78, 0xa7, 0xc6, 0x43, 0xc1, 0x6a, 0xba, 0x70, 0x80, 0xcd, 0x04,
+ 0xb6, 0x78, 0x76, 0x29, 0xf3, 0xe8, 0xa0, },
+ { 0xe6, 0xac, 0x8d, 0x9d, 0xf0, 0xc0, 0xf7, 0xf7, 0xe3, 0x3e, 0x4e, 0x28,
+ 0x0f, 0x59, 0xb2, 0x67, 0x9e, 0x84, 0x34, 0x42, 0x96, 0x30, 0x2b, 0xca,
+ 0x49, 0xb6, 0xc5, 0x9a, 0x84, 0x59, 0xa7, 0x81, },
+ { 0x7e, },
+ { 0x1e, 0x21, },
+ { 0x26, 0xd3, 0xdd, },
+ { 0x2c, 0xd4, 0xb3, 0x3d, },
+ { 0x86, 0x7b, 0x76, 0x3c, 0xf0, },
+ { 0x12, 0xc3, 0x70, 0x1d, 0x55, 0x18, },
+ { 0x96, 0xc2, 0xbd, 0x61, 0x55, 0xf4, 0x24, },
+ { 0x20, 0x51, 0xf7, 0x86, 0x58, 0x8f, 0x07, 0x2a, },
+ { 0x93, 0x15, 0xa8, 0x1d, 0xda, 0x97, 0xee, 0x0e, 0x6c, },
+ { 0x39, 0x93, 0xdf, 0xd5, 0x0e, 0xca, 0xdc, 0x7a, 0x92, 0xce, },
+ { 0x60, 0xd5, 0xfd, 0xf5, 0x1b, 0x26, 0x82, 0x26, 0x73, 0x02, 0xbc, },
+ { 0x98, 0xf2, 0x34, 0xe1, 0xf5, 0xfb, 0x00, 0xac, 0x10, 0x4a, 0x38, 0x9f, },
+ { 0xda, 0x3a, 0x92, 0x8a, 0xd0, 0xcd, 0x12, 0xcd, 0x15, 0xbb, 0xab, 0x77,
+ 0x66, },
+ { 0xa2, 0x92, 0x1a, 0xe5, 0xca, 0x0c, 0x30, 0x75, 0xeb, 0xaf, 0x00, 0x31,
+ 0x55, 0x66, },
+ { 0x06, 0xea, 0xfd, 0x3e, 0x86, 0x38, 0x62, 0x4e, 0xa9, 0x12, 0xa4, 0x12,
+ 0x43, 0xbf, 0xa1, },
+ { 0xe4, 0x71, 0x7b, 0x94, 0xdb, 0xa0, 0xd2, 0xff, 0x9b, 0xeb, 0xad, 0x8e,
+ 0x95, 0x8a, 0xc5, 0xed, },
+ { 0x25, 0x5a, 0x77, 0x71, 0x41, 0x0e, 0x7a, 0xe9, 0xed, 0x0c, 0x10, 0xef,
+ 0xf6, 0x2b, 0x3a, 0xba, 0x60, },
+ { 0xee, 0xe2, 0xa3, 0x67, 0x64, 0x1d, 0xc6, 0x04, 0xc4, 0xe1, 0x68, 0xd2,
+ 0x6e, 0xd2, 0x91, 0x75, 0x53, 0x07, },
+ { 0xe0, 0xf6, 0x4d, 0x8f, 0x68, 0xfc, 0x06, 0x7e, 0x18, 0x79, 0x7f, 0x2b,
+ 0x6d, 0xef, 0x46, 0x7f, 0xab, 0xb2, 0xad, },
+ { 0x3d, 0x35, 0x88, 0x9f, 0x2e, 0xcf, 0x96, 0x45, 0x07, 0x60, 0x71, 0x94,
+ 0x00, 0x8d, 0xbf, 0xf4, 0xef, 0x46, 0x2e, 0x3c, },
+ { 0x43, 0xcf, 0x98, 0xf7, 0x2d, 0xf4, 0x17, 0xe7, 0x8c, 0x05, 0x2d, 0x9b,
+ 0x24, 0xfb, 0x4d, 0xea, 0x4a, 0xec, 0x01, 0x25, 0x29, },
+ { 0x8e, 0x73, 0x9a, 0x78, 0x11, 0xfe, 0x48, 0xa0, 0x3b, 0x1a, 0x26, 0xdf,
+ 0x25, 0xe9, 0x59, 0x1c, 0x70, 0x07, 0x9f, 0xdc, 0xa0, 0xa6, },
+ { 0xe8, 0x47, 0x71, 0xc7, 0x3e, 0xdf, 0xb5, 0x13, 0xb9, 0x85, 0x13, 0xa8,
+ 0x54, 0x47, 0x6e, 0x59, 0x96, 0x09, 0x13, 0x5f, 0x82, 0x16, 0x0b, },
+ { 0xfb, 0xc0, 0x8c, 0x03, 0x21, 0xb3, 0xc4, 0xb5, 0x43, 0x32, 0x6c, 0xea,
+ 0x7f, 0xa8, 0x43, 0x91, 0xe8, 0x4e, 0x3f, 0xbf, 0x45, 0x58, 0x6a, 0xa3, },
+ { 0x55, 0xf8, 0xf3, 0x00, 0x76, 0x09, 0xef, 0x69, 0x5d, 0xd2, 0x8a, 0xf2,
+ 0x65, 0xc3, 0xcb, 0x9b, 0x43, 0xfd, 0xb1, 0x7e, 0x7f, 0xa1, 0x94, 0xb0,
+ 0xd7, },
+ { 0xaa, 0x13, 0xc1, 0x51, 0x40, 0x6d, 0x8d, 0x4c, 0x0a, 0x95, 0x64, 0x7b,
+ 0xd1, 0x96, 0xb6, 0x56, 0xb4, 0x5b, 0xcf, 0xd6, 0xd9, 0x15, 0x97, 0xdd,
+ 0xb6, 0xef, },
+ { 0xaf, 0xb7, 0x36, 0xb0, 0x04, 0xdb, 0xd7, 0x9c, 0x9a, 0x44, 0xc4, 0xf6,
+ 0x1f, 0x12, 0x21, 0x2d, 0x59, 0x30, 0x54, 0xab, 0x27, 0x61, 0xa3, 0x57,
+ 0xef, 0xf8, 0x53, },
+ { 0x97, 0x34, 0x45, 0x3e, 0xce, 0x7c, 0x35, 0xa2, 0xda, 0x9f, 0x4b, 0x46,
+ 0x6c, 0x11, 0x67, 0xff, 0x2f, 0x76, 0x58, 0x15, 0x71, 0xfa, 0x44, 0x89,
+ 0x89, 0xfd, 0xf7, 0x99, },
+ { 0x1f, 0xb1, 0x62, 0xeb, 0x83, 0xc5, 0x9c, 0x89, 0xf9, 0x2c, 0xd2, 0x03,
+ 0x61, 0xbc, 0xbb, 0xa5, 0x74, 0x0e, 0x9b, 0x7e, 0x82, 0x3e, 0x70, 0x0a,
+ 0xa9, 0x8f, 0x2b, 0x59, 0xfb, },
+ { 0xf8, 0xca, 0x5e, 0x3a, 0x4f, 0x9e, 0x10, 0x69, 0x10, 0xd5, 0x4c, 0xeb,
+ 0x1a, 0x0f, 0x3c, 0x6a, 0x98, 0xf5, 0xb0, 0x97, 0x5b, 0x37, 0x2f, 0x0d,
+ 0xbd, 0x42, 0x4b, 0x69, 0xa1, 0x82, },
+ { 0x12, 0x8c, 0x6d, 0x52, 0x08, 0xef, 0x74, 0xb2, 0xe6, 0xaa, 0xd3, 0xb0,
+ 0x26, 0xb0, 0xd9, 0x94, 0xb6, 0x11, 0x45, 0x0e, 0x36, 0x71, 0x14, 0x2d,
+ 0x41, 0x8c, 0x21, 0x53, 0x31, 0xe9, 0x68, },
+ { 0xee, 0xea, 0x0d, 0x89, 0x47, 0x7e, 0x72, 0xd1, 0xd8, 0xce, 0x58, 0x4c,
+ 0x94, 0x1f, 0x0d, 0x51, 0x08, 0xa3, 0xb6, 0x3d, 0xe7, 0x82, 0x46, 0x92,
+ 0xd6, 0x98, 0x6b, 0x07, 0x10, 0x65, 0x52, 0x65, },
+};
+
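+/*
+ * Expected digests for the two blake2s256_hmac() self-test calls in
+ * blake2s_selftest(): the first is keyed with the 32-byte key over the
+ * whole buffer, the second swaps the roles so that the key is longer
+ * than the 64-byte block and exercises the key pre-hashing path.
+ */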
+static const u8 blake2s_hmac_testvecs[][BLAKE2S_HASH_SIZE] __initconst = {
+ { 0xce, 0xe1, 0x57, 0x69, 0x82, 0xdc, 0xbf, 0x43, 0xad, 0x56, 0x4c, 0x70,
+ 0xed, 0x68, 0x16, 0x96, 0xcf, 0xa4, 0x73, 0xe8, 0xe8, 0xfc, 0x32, 0x79,
+ 0x08, 0x0a, 0x75, 0x82, 0xda, 0x3f, 0x05, 0x11, },
+ { 0x77, 0x2f, 0x0c, 0x71, 0x41, 0xf4, 0x4b, 0x2b, 0xb3, 0xc6, 0xb6, 0xf9,
+ 0x60, 0xde, 0xe4, 0x52, 0x38, 0x66, 0xe8, 0xbf, 0x9b, 0x96, 0xc4, 0x9f,
+ 0x60, 0xd9, 0x24, 0x37, 0x99, 0xd6, 0xec, 0x31, },
+};
+
+bool __init blake2s_selftest(void)
+{
+ u8 key[BLAKE2S_KEY_SIZE];
+ u8 buf[ARRAY_SIZE(blake2s_testvecs)];
+ u8 hash[BLAKE2S_HASH_SIZE];
+ struct blake2s_state state;
+ bool success = true;
+ int i, l;
+
+ key[0] = key[1] = 1;
+ for (i = 2; i < sizeof(key); ++i)
+ key[i] = key[i - 2] + key[i - 1];
+
+ for (i = 0; i < sizeof(buf); ++i)
+ buf[i] = (u8)i;
+
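+ /*
+ * For each test vector, derive an output length in 1..BLAKE2S_HASH_SIZE
+ * and a key length in 0..BLAKE2S_KEY_SIZE, and check both the one-shot
+ * blake2s() interface and the incremental init/update/final interface;
+ * l steps pseudo-randomly through split points for the two-part update.
+ */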
+ for (i = l = 0; i < ARRAY_SIZE(blake2s_testvecs); l = (l + 37) % ++i) {
+ int outlen = 1 + i % BLAKE2S_HASH_SIZE;
+ int keylen = (13 * i) % (BLAKE2S_KEY_SIZE + 1);
+
+ blake2s(hash, buf, key + BLAKE2S_KEY_SIZE - keylen, outlen, i,
+ keylen);
+ if (memcmp(hash, blake2s_testvecs[i], outlen)) {
+ pr_err("blake2s self-test %d: FAIL\n", i + 1);
+ success = false;
+ }
+
+ if (!keylen)
+ blake2s_init(&state, outlen);
+ else
+ blake2s_init_key(&state, outlen,
+ key + BLAKE2S_KEY_SIZE - keylen,
+ keylen);
+
+ blake2s_update(&state, buf, l);
+ blake2s_update(&state, buf + l, i - l);
+ blake2s_final(&state, hash);
+ if (memcmp(hash, blake2s_testvecs[i], outlen)) {
+ pr_err("blake2s init/update/final self-test %d: FAIL\n",
+ i + 1);
+ success = false;
+ }
+ }
+
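+ /* Only bother testing the HMAC variant if the plain hashes passed. */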
+ if (success) {
+ blake2s256_hmac(hash, buf, key, sizeof(buf), sizeof(key));
+ success &= !memcmp(hash, blake2s_hmac_testvecs[0], BLAKE2S_HASH_SIZE);
+
+ blake2s256_hmac(hash, key, buf, sizeof(key), sizeof(buf));
+ success &= !memcmp(hash, blake2s_hmac_testvecs[1], BLAKE2S_HASH_SIZE);
+
+ if (!success)
+ pr_err("blake2s256_hmac self-test: FAIL\n");
+ }
+
+ return success;
+}
--- /dev/null
+++ b/lib/crypto/blake2s.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the BLAKE2s hash and PRF functions.
+ *
+ * Information: https://blake2.net/
+ *
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bug.h>
+#include <asm/unaligned.h>
+
+bool blake2s_selftest(void);
+
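+/*
+ * Buffer input in state->buf and compress it in 64-byte blocks. The
+ * strict '>' comparisons ensure the final (possibly full) block is never
+ * compressed here, so blake2s_final() always has a last block on which
+ * to set the finalization flag.
+ */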
+void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+{
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+ if (unlikely(!inlen))
+ return;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
+ blake2s_compress_arch(state, state->buf, 1,
+ BLAKE2S_BLOCK_SIZE);
+ else
+ blake2s_compress_generic(state, state->buf, 1,
+ BLAKE2S_BLOCK_SIZE);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+ /* Hash one less (full) block than strictly possible */
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
+ blake2s_compress_arch(state, in, nblocks - 1,
+ BLAKE2S_BLOCK_SIZE);
+ else
+ blake2s_compress_generic(state, in, nblocks - 1,
+ BLAKE2S_BLOCK_SIZE);
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+}
+EXPORT_SYMBOL(blake2s_update);
+
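+/*
+ * Mark the buffered block as the last one, zero-pad it, compress it with
+ * its true byte count, write out the state words in little-endian order,
+ * and wipe the state from the stack.
+ */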
+void blake2s_final(struct blake2s_state *state, u8 *out)
+{
+ WARN_ON(IS_ENABLED(DEBUG) && !out);
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S))
+ blake2s_compress_arch(state, state->buf, 1, state->buflen);
+ else
+ blake2s_compress_generic(state, state->buf, 1, state->buflen);
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+}
+EXPORT_SYMBOL(blake2s_final);
+
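+/*
+ * Standard HMAC construction (RFC 2104) instantiated with BLAKE2s-256:
+ * keys longer than the 64-byte block are hashed down first, and the
+ * inner/outer passes use the key XORed with the 0x36/0x5c pads. The
+ * second loop XORs with 0x5c ^ 0x36 because x_key already holds
+ * key ^ 0x36.
+ */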
+void blake2s256_hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen,
+ const size_t keylen)
+{
+ struct blake2s_state state;
+ u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 };
+ u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32));
+ int i;
+
+ if (keylen > BLAKE2S_BLOCK_SIZE) {
+ blake2s_init(&state, BLAKE2S_HASH_SIZE);
+ blake2s_update(&state, key, keylen);
+ blake2s_final(&state, x_key);
+ } else
+ memcpy(x_key, key, keylen);
+
+ for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+ x_key[i] ^= 0x36;
+
+ blake2s_init(&state, BLAKE2S_HASH_SIZE);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+ blake2s_update(&state, in, inlen);
+ blake2s_final(&state, i_hash);
+
+ for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i)
+ x_key[i] ^= 0x5c ^ 0x36;
+
+ blake2s_init(&state, BLAKE2S_HASH_SIZE);
+ blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE);
+ blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE);
+ blake2s_final(&state, i_hash);
+
+ memcpy(out, i_hash, BLAKE2S_HASH_SIZE);
+ memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE);
+ memzero_explicit(i_hash, BLAKE2S_HASH_SIZE);
+}
+EXPORT_SYMBOL(blake2s256_hmac);
+
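+/* Run the self-tests on load, unless run-time self tests are disabled. */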
+static int __init mod_init(void)
+{
+ if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
+ WARN_ON(!blake2s_selftest()))
+ return -ENODEV;
+ return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("BLAKE2s hash function");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--- b/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4036,4 +4036,28 @@
.fips_allowed = 1,
}, {
+ .alg = "blake2s-128",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_128_tv_template)
+ }
+ }, {
+ .alg = "blake2s-160",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_160_tv_template)
+ }
+ }, {
+ .alg = "blake2s-224",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_224_tv_template)
+ }
+ }, {
+ .alg = "blake2s-256",
+ .test = alg_test_hash,
+ .suite = {
+ .hash = __VECS(blakes2s_256_tv_template)
+ }
+ }, {
.alg = "cbc(aes)",
.test = alg_test_skcipher,
@@ -4273,4 +4297,10 @@
.fips_allowed = 1,
}, {
+ .alg = "curve25519",
+ .test = alg_test_kpp,
+ .suite = {
+ .kpp = __VECS(curve25519_tv_template)
+ }
+ }, {
.alg = "deflate",
.test = alg_test_comp,
--- b/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1030,6 +1030,1231 @@
}
};
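+/*
+ * Curve25519 KPP test vectors. The first two entries are the ECDH example
+ * from RFC 7748, section 6.1; the entries labelled "wycheproof" are taken
+ * from Project Wycheproof's X25519 suite and cover twist points,
+ * non-canonical public keys >= p, and edge-case shared secrets.
+ */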
+static const struct kpp_testvec curve25519_tv_template[] = {
+{
+ .secret = (u8[32]){ 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d,
+ 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45,
+ 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a,
+ 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a },
+ .b_public = (u8[32]){ 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4,
+ 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37,
+ 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d,
+ 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f },
+ .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
+ 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
+ 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
+ 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b,
+ 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6,
+ 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd,
+ 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb },
+ .b_public = (u8[32]){ 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54,
+ 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a,
+ 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4,
+ 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a },
+ .expected_ss = (u8[32]){ 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1,
+ 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25,
+ 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33,
+ 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 1 },
+ .b_public = (u8[32]){ 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64,
+ 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d,
+ 0x0b, 0x95, 0x48, 0xdc, 0x0c, 0xd8, 0x19, 0x98,
+ 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 1 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f,
+ 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d,
+ 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x08, 0xed, 0xe3,
+ 0x0b, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
+ 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
+ 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
+ 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 },
+ .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
+ 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
+ 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
+ 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
+ .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
+ 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
+ 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
+ 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f },
+ .expected_ss = (u8[32]){ 0x77, 0x52, 0xb6, 0x18, 0xc1, 0x2d, 0x48, 0xd2,
+ 0xc6, 0x93, 0x46, 0x83, 0x81, 0x7c, 0xc6, 0x57,
+ 0xf3, 0x31, 0x03, 0x19, 0x49, 0x48, 0x20, 0x05,
+ 0x42, 0x2b, 0x4e, 0xae, 0x8d, 0x1d, 0x43, 0x23 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+{
+ .secret = (u8[32]){ 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 },
+ .expected_ss = (u8[32]){ 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d,
+ 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12,
+ 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99,
+ 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - normal case */
+{
+ .secret = (u8[32]){ 0x48, 0x52, 0x83, 0x4d, 0x9d, 0x6b, 0x77, 0xda,
+ 0xde, 0xab, 0xaa, 0xf2, 0xe1, 0x1d, 0xca, 0x66,
+ 0xd1, 0x9f, 0xe7, 0x49, 0x93, 0xa7, 0xbe, 0xc3,
+ 0x6c, 0x6e, 0x16, 0xa0, 0x98, 0x3f, 0xea, 0xba },
+ .b_public = (u8[32]){ 0x9c, 0x64, 0x7d, 0x9a, 0xe5, 0x89, 0xb9, 0xf5,
+ 0x8f, 0xdc, 0x3c, 0xa4, 0x94, 0x7e, 0xfb, 0xc9,
+ 0x15, 0xc4, 0xb2, 0xe0, 0x8e, 0x74, 0x4a, 0x0e,
+ 0xdf, 0x46, 0x9d, 0xac, 0x59, 0xc8, 0xf8, 0x5a },
+ .expected_ss = (u8[32]){ 0x87, 0xb7, 0xf2, 0x12, 0xb6, 0x27, 0xf7, 0xa5,
+ 0x4c, 0xa5, 0xe0, 0xbc, 0xda, 0xdd, 0xd5, 0x38,
+ 0x9d, 0x9d, 0xe6, 0x15, 0x6c, 0xdb, 0xcf, 0x8e,
+ 0xbe, 0x14, 0xff, 0xbc, 0xfb, 0x43, 0x65, 0x51 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key on twist */
+{
+ .secret = (u8[32]){ 0x58, 0x8c, 0x06, 0x1a, 0x50, 0x80, 0x4a, 0xc4,
+ 0x88, 0xad, 0x77, 0x4a, 0xc7, 0x16, 0xc3, 0xf5,
+ 0xba, 0x71, 0x4b, 0x27, 0x12, 0xe0, 0x48, 0x49,
+ 0x13, 0x79, 0xa5, 0x00, 0x21, 0x19, 0x98, 0xa8 },
+ .b_public = (u8[32]){ 0x63, 0xaa, 0x40, 0xc6, 0xe3, 0x83, 0x46, 0xc5,
+ 0xca, 0xf2, 0x3a, 0x6d, 0xf0, 0xa5, 0xe6, 0xc8,
+ 0x08, 0x89, 0xa0, 0x86, 0x47, 0xe5, 0x51, 0xb3,
+ 0x56, 0x34, 0x49, 0xbe, 0xfc, 0xfc, 0x97, 0x33 },
+ .expected_ss = (u8[32]){ 0xb1, 0xa7, 0x07, 0x51, 0x94, 0x95, 0xff, 0xff,
+ 0xb2, 0x98, 0xff, 0x94, 0x17, 0x16, 0xb0, 0x6d,
+ 0xfa, 0xb8, 0x7c, 0xf8, 0xd9, 0x11, 0x23, 0xfe,
+ 0x2b, 0xe9, 0xa2, 0x33, 0xdd, 0xa2, 0x22, 0x12 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key on twist */
+{
+ .secret = (u8[32]){ 0xb0, 0x5b, 0xfd, 0x32, 0xe5, 0x53, 0x25, 0xd9,
+ 0xfd, 0x64, 0x8c, 0xb3, 0x02, 0x84, 0x80, 0x39,
+ 0x00, 0x0b, 0x39, 0x0e, 0x44, 0xd5, 0x21, 0xe5,
+ 0x8a, 0xab, 0x3b, 0x29, 0xa6, 0x96, 0x0b, 0xa8 },
+ .b_public = (u8[32]){ 0x0f, 0x83, 0xc3, 0x6f, 0xde, 0xd9, 0xd3, 0x2f,
+ 0xad, 0xf4, 0xef, 0xa3, 0xae, 0x93, 0xa9, 0x0b,
+ 0xb5, 0xcf, 0xa6, 0x68, 0x93, 0xbc, 0x41, 0x2c,
+ 0x43, 0xfa, 0x72, 0x87, 0xdb, 0xb9, 0x97, 0x79 },
+ .expected_ss = (u8[32]){ 0x67, 0xdd, 0x4a, 0x6e, 0x16, 0x55, 0x33, 0x53,
+ 0x4c, 0x0e, 0x3f, 0x17, 0x2e, 0x4a, 0xb8, 0x57,
+ 0x6b, 0xca, 0x92, 0x3a, 0x5f, 0x07, 0xb2, 0xc0,
+ 0x69, 0xb4, 0xc3, 0x10, 0xff, 0x2e, 0x93, 0x5b },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key on twist */
+{
+ .secret = (u8[32]){ 0x70, 0xe3, 0x4b, 0xcb, 0xe1, 0xf4, 0x7f, 0xbc,
+ 0x0f, 0xdd, 0xfd, 0x7c, 0x1e, 0x1a, 0xa5, 0x3d,
+ 0x57, 0xbf, 0xe0, 0xf6, 0x6d, 0x24, 0x30, 0x67,
+ 0xb4, 0x24, 0xbb, 0x62, 0x10, 0xbe, 0xd1, 0x9c },
+ .b_public = (u8[32]){ 0x0b, 0x82, 0x11, 0xa2, 0xb6, 0x04, 0x90, 0x97,
+ 0xf6, 0x87, 0x1c, 0x6c, 0x05, 0x2d, 0x3c, 0x5f,
+ 0xc1, 0xba, 0x17, 0xda, 0x9e, 0x32, 0xae, 0x45,
+ 0x84, 0x03, 0xb0, 0x5b, 0xb2, 0x83, 0x09, 0x2a },
+ .expected_ss = (u8[32]){ 0x4a, 0x06, 0x38, 0xcf, 0xaa, 0x9e, 0xf1, 0x93,
+ 0x3b, 0x47, 0xf8, 0x93, 0x92, 0x96, 0xa6, 0xb2,
+ 0x5b, 0xe5, 0x41, 0xef, 0x7f, 0x70, 0xe8, 0x44,
+ 0xc0, 0xbc, 0xc0, 0x0b, 0x13, 0x4d, 0xe6, 0x4a },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key on twist */
+{
+ .secret = (u8[32]){ 0x68, 0xc1, 0xf3, 0xa6, 0x53, 0xa4, 0xcd, 0xb1,
+ 0xd3, 0x7b, 0xba, 0x94, 0x73, 0x8f, 0x8b, 0x95,
+ 0x7a, 0x57, 0xbe, 0xb2, 0x4d, 0x64, 0x6e, 0x99,
+ 0x4d, 0xc2, 0x9a, 0x27, 0x6a, 0xad, 0x45, 0x8d },
+ .b_public = (u8[32]){ 0x34, 0x3a, 0xc2, 0x0a, 0x3b, 0x9c, 0x6a, 0x27,
+ 0xb1, 0x00, 0x81, 0x76, 0x50, 0x9a, 0xd3, 0x07,
+ 0x35, 0x85, 0x6e, 0xc1, 0xc8, 0xd8, 0xfc, 0xae,
+ 0x13, 0x91, 0x2d, 0x08, 0xd1, 0x52, 0xf4, 0x6c },
+ .expected_ss = (u8[32]){ 0x39, 0x94, 0x91, 0xfc, 0xe8, 0xdf, 0xab, 0x73,
+ 0xb4, 0xf9, 0xf6, 0x11, 0xde, 0x8e, 0xa0, 0xb2,
+ 0x7b, 0x28, 0xf8, 0x59, 0x94, 0x25, 0x0b, 0x0f,
+ 0x47, 0x5d, 0x58, 0x5d, 0x04, 0x2a, 0xc2, 0x07 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key on twist */
+{
+ .secret = (u8[32]){ 0xd8, 0x77, 0xb2, 0x6d, 0x06, 0xdf, 0xf9, 0xd9,
+ 0xf7, 0xfd, 0x4c, 0x5b, 0x37, 0x69, 0xf8, 0xcd,
+ 0xd5, 0xb3, 0x05, 0x16, 0xa5, 0xab, 0x80, 0x6b,
+ 0xe3, 0x24, 0xff, 0x3e, 0xb6, 0x9e, 0xa0, 0xb2 },
+ .b_public = (u8[32]){ 0xfa, 0x69, 0x5f, 0xc7, 0xbe, 0x8d, 0x1b, 0xe5,
+ 0xbf, 0x70, 0x48, 0x98, 0xf3, 0x88, 0xc4, 0x52,
+ 0xba, 0xfd, 0xd3, 0xb8, 0xea, 0xe8, 0x05, 0xf8,
+ 0x68, 0x1a, 0x8d, 0x15, 0xc2, 0xd4, 0xe1, 0x42 },
+ .expected_ss = (u8[32]){ 0x2c, 0x4f, 0xe1, 0x1d, 0x49, 0x0a, 0x53, 0x86,
+ 0x17, 0x76, 0xb1, 0x3b, 0x43, 0x54, 0xab, 0xd4,
+ 0xcf, 0x5a, 0x97, 0x69, 0x9d, 0xb6, 0xe6, 0xc6,
+ 0x8c, 0x16, 0x26, 0xd0, 0x76, 0x62, 0xf7, 0x58 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0x38, 0xdd, 0xe9, 0xf3, 0xe7, 0xb7, 0x99, 0x04,
+ 0x5f, 0x9a, 0xc3, 0x79, 0x3d, 0x4a, 0x92, 0x77,
+ 0xda, 0xde, 0xad, 0xc4, 0x1b, 0xec, 0x02, 0x90,
+ 0xf8, 0x1f, 0x74, 0x4f, 0x73, 0x77, 0x5f, 0x84 },
+ .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x9a, 0x2c, 0xfe, 0x84, 0xff, 0x9c, 0x4a, 0x97,
+ 0x39, 0x62, 0x5c, 0xae, 0x4a, 0x3b, 0x82, 0xa9,
+ 0x06, 0x87, 0x7a, 0x44, 0x19, 0x46, 0xf8, 0xd7,
+ 0xb3, 0xd7, 0x95, 0xfe, 0x8f, 0x5d, 0x16, 0x39 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0x98, 0x57, 0xa9, 0x14, 0xe3, 0xc2, 0x90, 0x36,
+ 0xfd, 0x9a, 0x44, 0x2b, 0xa5, 0x26, 0xb5, 0xcd,
+ 0xcd, 0xf2, 0x82, 0x16, 0x15, 0x3e, 0x63, 0x6c,
+ 0x10, 0x67, 0x7a, 0xca, 0xb6, 0xbd, 0x6a, 0xa5 },
+ .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x4d, 0xa4, 0xe0, 0xaa, 0x07, 0x2c, 0x23, 0x2e,
+ 0xe2, 0xf0, 0xfa, 0x4e, 0x51, 0x9a, 0xe5, 0x0b,
+ 0x52, 0xc1, 0xed, 0xd0, 0x8a, 0x53, 0x4d, 0x4e,
+ 0xf3, 0x46, 0xc2, 0xe1, 0x06, 0xd2, 0x1d, 0x60 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0x48, 0xe2, 0x13, 0x0d, 0x72, 0x33, 0x05, 0xed,
+ 0x05, 0xe6, 0xe5, 0x89, 0x4d, 0x39, 0x8a, 0x5e,
+ 0x33, 0x36, 0x7a, 0x8c, 0x6a, 0xac, 0x8f, 0xcd,
+ 0xf0, 0xa8, 0x8e, 0x4b, 0x42, 0x82, 0x0d, 0xb7 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0x03, 0x00, 0x00, 0xf8, 0xff,
+ 0xff, 0x1f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0xfe, 0xff, 0xff, 0x07, 0x00,
+ 0x00, 0xf0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x9e, 0xd1, 0x0c, 0x53, 0x74, 0x7f, 0x64, 0x7f,
+ 0x82, 0xf4, 0x51, 0x25, 0xd3, 0xde, 0x15, 0xa1,
+ 0xe6, 0xb8, 0x24, 0x49, 0x6a, 0xb4, 0x04, 0x10,
+ 0xff, 0xcc, 0x3c, 0xfe, 0x95, 0x76, 0x0f, 0x3b },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0x28, 0xf4, 0x10, 0x11, 0x69, 0x18, 0x51, 0xb3,
+ 0xa6, 0x2b, 0x64, 0x15, 0x53, 0xb3, 0x0d, 0x0d,
+ 0xfd, 0xdc, 0xb8, 0xff, 0xfc, 0xf5, 0x37, 0x00,
+ 0xa7, 0xbe, 0x2f, 0x6a, 0x87, 0x2e, 0x9f, 0xb0 },
+ .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0xfc, 0xff, 0xff, 0x07, 0x00,
+ 0x00, 0xe0, 0xff, 0xff, 0x3f, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0xf8, 0xff,
+ 0xff, 0x0f, 0x00, 0x00, 0xc0, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0xcf, 0x72, 0xb4, 0xaa, 0x6a, 0xa1, 0xc9, 0xf8,
+ 0x94, 0xf4, 0x16, 0x5b, 0x86, 0x10, 0x9a, 0xa4,
+ 0x68, 0x51, 0x76, 0x48, 0xe1, 0xf0, 0xcc, 0x70,
+ 0xe1, 0xab, 0x08, 0x46, 0x01, 0x76, 0x50, 0x6b },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0x18, 0xa9, 0x3b, 0x64, 0x99, 0xb9, 0xf6, 0xb3,
+ 0x22, 0x5c, 0xa0, 0x2f, 0xef, 0x41, 0x0e, 0x0a,
+ 0xde, 0xc2, 0x35, 0x32, 0x32, 0x1d, 0x2d, 0x8e,
+ 0xf1, 0xa6, 0xd6, 0x02, 0xa8, 0xc6, 0x5b, 0x83 },
+ .b_public = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x5d, 0x50, 0xb6, 0x28, 0x36, 0xbb, 0x69, 0x57,
+ 0x94, 0x10, 0x38, 0x6c, 0xf7, 0xbb, 0x81, 0x1c,
+ 0x14, 0xbf, 0x85, 0xb1, 0xc7, 0xb1, 0x7e, 0x59,
+ 0x24, 0xc7, 0xff, 0xea, 0x91, 0xef, 0x9e, 0x12 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case on twist */
+{
+ .secret = (u8[32]){ 0xc0, 0x1d, 0x13, 0x05, 0xa1, 0x33, 0x8a, 0x1f,
+ 0xca, 0xc2, 0xba, 0x7e, 0x2e, 0x03, 0x2b, 0x42,
+ 0x7e, 0x0b, 0x04, 0x90, 0x31, 0x65, 0xac, 0xa9,
+ 0x57, 0xd8, 0xd0, 0x55, 0x3d, 0x87, 0x17, 0xb0 },
+ .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x19, 0x23, 0x0e, 0xb1, 0x48, 0xd5, 0xd6, 0x7c,
+ 0x3c, 0x22, 0xab, 0x1d, 0xae, 0xff, 0x80, 0xa5,
+ 0x7e, 0xae, 0x42, 0x65, 0xce, 0x28, 0x72, 0x65,
+ 0x7b, 0x2c, 0x80, 0x99, 0xfc, 0x69, 0x8e, 0x50 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0x38, 0x6f, 0x7f, 0x16, 0xc5, 0x07, 0x31, 0xd6,
+ 0x4f, 0x82, 0xe6, 0xa1, 0x70, 0xb1, 0x42, 0xa4,
+ 0xe3, 0x4f, 0x31, 0xfd, 0x77, 0x68, 0xfc, 0xb8,
+ 0x90, 0x29, 0x25, 0xe7, 0xd1, 0xe2, 0x1a, 0xbe },
+ .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x0f, 0xca, 0xb5, 0xd8, 0x42, 0xa0, 0x78, 0xd7,
+ 0xa7, 0x1f, 0xc5, 0x9b, 0x57, 0xbf, 0xb4, 0xca,
+ 0x0b, 0xe6, 0x87, 0x3b, 0x49, 0xdc, 0xdb, 0x9f,
+ 0x44, 0xe1, 0x4a, 0xe8, 0xfb, 0xdf, 0xa5, 0x42 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0xe0, 0x23, 0xa2, 0x89, 0xbd, 0x5e, 0x90, 0xfa,
+ 0x28, 0x04, 0xdd, 0xc0, 0x19, 0xa0, 0x5e, 0xf3,
+ 0xe7, 0x9d, 0x43, 0x4b, 0xb6, 0xea, 0x2f, 0x52,
+ 0x2e, 0xcb, 0x64, 0x3a, 0x75, 0x29, 0x6e, 0x95 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 },
+ .expected_ss = (u8[32]){ 0x54, 0xce, 0x8f, 0x22, 0x75, 0xc0, 0x77, 0xe3,
+ 0xb1, 0x30, 0x6a, 0x39, 0x39, 0xc5, 0xe0, 0x3e,
+ 0xef, 0x6b, 0xbb, 0x88, 0x06, 0x05, 0x44, 0x75,
+ 0x8d, 0x9f, 0xef, 0x59, 0xb0, 0xbc, 0x3e, 0x4f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0x68, 0xf0, 0x10, 0xd6, 0x2e, 0xe8, 0xd9, 0x26,
+ 0x05, 0x3a, 0x36, 0x1c, 0x3a, 0x75, 0xc6, 0xea,
+ 0x4e, 0xbd, 0xc8, 0x60, 0x6a, 0xb2, 0x85, 0x00,
+ 0x3a, 0x6f, 0x8f, 0x40, 0x76, 0xb0, 0x1e, 0x83 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
+ .expected_ss = (u8[32]){ 0xf1, 0x36, 0x77, 0x5c, 0x5b, 0xeb, 0x0a, 0xf8,
+ 0x11, 0x0a, 0xf1, 0x0b, 0x20, 0x37, 0x23, 0x32,
+ 0x04, 0x3c, 0xab, 0x75, 0x24, 0x19, 0x67, 0x87,
+ 0x75, 0xa2, 0x23, 0xdf, 0x57, 0xc9, 0xd3, 0x0d },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0x58, 0xeb, 0xcb, 0x35, 0xb0, 0xf8, 0x84, 0x5c,
+ 0xaf, 0x1e, 0xc6, 0x30, 0xf9, 0x65, 0x76, 0xb6,
+ 0x2c, 0x4b, 0x7b, 0x6c, 0x36, 0xb2, 0x9d, 0xeb,
+ 0x2c, 0xb0, 0x08, 0x46, 0x51, 0x75, 0x5c, 0x96 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xfb, 0xff,
+ 0xff, 0xdf, 0xff, 0xff, 0xdf, 0xff, 0xff, 0xff,
+ 0xfe, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xf7, 0xff,
+ 0xff, 0xf7, 0xff, 0xff, 0xbf, 0xff, 0xff, 0x3f },
+ .expected_ss = (u8[32]){ 0xbf, 0x9a, 0xff, 0xd0, 0x6b, 0x84, 0x40, 0x85,
+ 0x58, 0x64, 0x60, 0x96, 0x2e, 0xf2, 0x14, 0x6f,
+ 0xf3, 0xd4, 0x53, 0x3d, 0x94, 0x44, 0xaa, 0xb0,
+ 0x06, 0xeb, 0x88, 0xcc, 0x30, 0x54, 0x40, 0x7d },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0x18, 0x8c, 0x4b, 0xc5, 0xb9, 0xc4, 0x4b, 0x38,
+ 0xbb, 0x65, 0x8b, 0x9b, 0x2a, 0xe8, 0x2d, 0x5b,
+ 0x01, 0x01, 0x5e, 0x09, 0x31, 0x84, 0xb1, 0x7c,
+ 0xb7, 0x86, 0x35, 0x03, 0xa7, 0x83, 0xe1, 0xbb },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
+ .expected_ss = (u8[32]){ 0xd4, 0x80, 0xde, 0x04, 0xf6, 0x99, 0xcb, 0x3b,
+ 0xe0, 0x68, 0x4a, 0x9c, 0xc2, 0xe3, 0x12, 0x81,
+ 0xea, 0x0b, 0xc5, 0xa9, 0xdc, 0xc1, 0x57, 0xd3,
+ 0xd2, 0x01, 0x58, 0xd4, 0x6c, 0xa5, 0x24, 0x6d },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0xe0, 0x6c, 0x11, 0xbb, 0x2e, 0x13, 0xce, 0x3d,
+ 0xc7, 0x67, 0x3f, 0x67, 0xf5, 0x48, 0x22, 0x42,
+ 0x90, 0x94, 0x23, 0xa9, 0xae, 0x95, 0xee, 0x98,
+ 0x6a, 0x98, 0x8d, 0x98, 0xfa, 0xee, 0x23, 0xa2 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
+ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
+ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f,
+ 0xff, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x4c, 0x44, 0x01, 0xcc, 0xe6, 0xb5, 0x1e, 0x4c,
+ 0xb1, 0x8f, 0x27, 0x90, 0x24, 0x6c, 0x9b, 0xf9,
+ 0x14, 0xdb, 0x66, 0x77, 0x50, 0xa1, 0xcb, 0x89,
+ 0x06, 0x90, 0x92, 0xaf, 0x07, 0x29, 0x22, 0x76 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for public key */
+{
+ .secret = (u8[32]){ 0xc0, 0x65, 0x8c, 0x46, 0xdd, 0xe1, 0x81, 0x29,
+ 0x29, 0x38, 0x77, 0x53, 0x5b, 0x11, 0x62, 0xb6,
+ 0xf9, 0xf5, 0x41, 0x4a, 0x23, 0xcf, 0x4d, 0x2c,
+ 0xbc, 0x14, 0x0a, 0x4d, 0x99, 0xda, 0x2b, 0x8f },
+ .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x57, 0x8b, 0xa8, 0xcc, 0x2d, 0xbd, 0xc5, 0x75,
+ 0xaf, 0xcf, 0x9d, 0xf2, 0xb3, 0xee, 0x61, 0x89,
+ 0xf5, 0x33, 0x7d, 0x68, 0x54, 0xc7, 0x9b, 0x4c,
+ 0xe1, 0x65, 0xea, 0x12, 0x29, 0x3b, 0x3a, 0x0f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xf0, 0x1e, 0x48, 0xda, 0xfa, 0xc9, 0xd7, 0xbc,
+ 0xf5, 0x89, 0xcb, 0xc3, 0x82, 0xc8, 0x78, 0xd1,
+ 0x8b, 0xda, 0x35, 0x50, 0x58, 0x9f, 0xfb, 0x5d,
+ 0x50, 0xb5, 0x23, 0xbe, 0xbe, 0x32, 0x9d, 0xae },
+ .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0xbd, 0x36, 0xa0, 0x79, 0x0e, 0xb8, 0x83, 0x09,
+ 0x8c, 0x98, 0x8b, 0x21, 0x78, 0x67, 0x73, 0xde,
+ 0x0b, 0x3a, 0x4d, 0xf1, 0x62, 0x28, 0x2c, 0xf1,
+ 0x10, 0xde, 0x18, 0xdd, 0x48, 0x4c, 0xe7, 0x4b },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x28, 0x87, 0x96, 0xbc, 0x5a, 0xff, 0x4b, 0x81,
+ 0xa3, 0x75, 0x01, 0x75, 0x7b, 0xc0, 0x75, 0x3a,
+ 0x3c, 0x21, 0x96, 0x47, 0x90, 0xd3, 0x86, 0x99,
+ 0x30, 0x8d, 0xeb, 0xc1, 0x7a, 0x6e, 0xaf, 0x8d },
+ .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0xb4, 0xe0, 0xdd, 0x76, 0xda, 0x7b, 0x07, 0x17,
+ 0x28, 0xb6, 0x1f, 0x85, 0x67, 0x71, 0xaa, 0x35,
+ 0x6e, 0x57, 0xed, 0xa7, 0x8a, 0x5b, 0x16, 0x55,
+ 0xcc, 0x38, 0x20, 0xfb, 0x5f, 0x85, 0x4c, 0x5c },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x98, 0xdf, 0x84, 0x5f, 0x66, 0x51, 0xbf, 0x11,
+ 0x38, 0x22, 0x1f, 0x11, 0x90, 0x41, 0xf7, 0x2b,
+ 0x6d, 0xbc, 0x3c, 0x4a, 0xce, 0x71, 0x43, 0xd9,
+ 0x9f, 0xd5, 0x5a, 0xd8, 0x67, 0x48, 0x0d, 0xa8 },
+ .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x6f, 0xdf, 0x6c, 0x37, 0x61, 0x1d, 0xbd, 0x53,
+ 0x04, 0xdc, 0x0f, 0x2e, 0xb7, 0xc9, 0x51, 0x7e,
+ 0xb3, 0xc5, 0x0e, 0x12, 0xfd, 0x05, 0x0a, 0xc6,
+ 0xde, 0xc2, 0x70, 0x71, 0xd4, 0xbf, 0xc0, 0x34 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xf0, 0x94, 0x98, 0xe4, 0x6f, 0x02, 0xf8, 0x78,
+ 0x82, 0x9e, 0x78, 0xb8, 0x03, 0xd3, 0x16, 0xa2,
+ 0xed, 0x69, 0x5d, 0x04, 0x98, 0xa0, 0x8a, 0xbd,
+ 0xf8, 0x27, 0x69, 0x30, 0xe2, 0x4e, 0xdc, 0xb0 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .expected_ss = (u8[32]){ 0x4c, 0x8f, 0xc4, 0xb1, 0xc6, 0xab, 0x88, 0xfb,
+ 0x21, 0xf1, 0x8f, 0x6d, 0x4c, 0x81, 0x02, 0x40,
+ 0xd4, 0xe9, 0x46, 0x51, 0xba, 0x44, 0xf7, 0xa2,
+ 0xc8, 0x63, 0xce, 0xc7, 0xdc, 0x56, 0x60, 0x2d },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x18, 0x13, 0xc1, 0x0a, 0x5c, 0x7f, 0x21, 0xf9,
+ 0x6e, 0x17, 0xf2, 0x88, 0xc0, 0xcc, 0x37, 0x60,
+ 0x7c, 0x04, 0xc5, 0xf5, 0xae, 0xa2, 0xdb, 0x13,
+ 0x4f, 0x9e, 0x2f, 0xfc, 0x66, 0xbd, 0x9d, 0xb8 },
+ .b_public = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
+ .expected_ss = (u8[32]){ 0x1c, 0xd0, 0xb2, 0x82, 0x67, 0xdc, 0x54, 0x1c,
+ 0x64, 0x2d, 0x6d, 0x7d, 0xca, 0x44, 0xa8, 0xb3,
+ 0x8a, 0x63, 0x73, 0x6e, 0xef, 0x5c, 0x4e, 0x65,
+ 0x01, 0xff, 0xbb, 0xb1, 0x78, 0x0c, 0x03, 0x3c },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x78, 0x57, 0xfb, 0x80, 0x86, 0x53, 0x64, 0x5a,
+ 0x0b, 0xeb, 0x13, 0x8a, 0x64, 0xf5, 0xf4, 0xd7,
+ 0x33, 0xa4, 0x5e, 0xa8, 0x4c, 0x3c, 0xda, 0x11,
+ 0xa9, 0xc0, 0x6f, 0x7e, 0x71, 0x39, 0x14, 0x9e },
+ .b_public = (u8[32]){ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
+ .expected_ss = (u8[32]){ 0x87, 0x55, 0xbe, 0x01, 0xc6, 0x0a, 0x7e, 0x82,
+ 0x5c, 0xff, 0x3e, 0x0e, 0x78, 0xcb, 0x3a, 0xa4,
+ 0x33, 0x38, 0x61, 0x51, 0x6a, 0xa5, 0x9b, 0x1c,
+ 0x51, 0xa8, 0xb2, 0xa5, 0x43, 0xdf, 0xa8, 0x22 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xe0, 0x3a, 0xa8, 0x42, 0xe2, 0xab, 0xc5, 0x6e,
+ 0x81, 0xe8, 0x7b, 0x8b, 0x9f, 0x41, 0x7b, 0x2a,
+ 0x1e, 0x59, 0x13, 0xc7, 0x23, 0xee, 0xd2, 0x8d,
+ 0x75, 0x2f, 0x8d, 0x47, 0xa5, 0x9f, 0x49, 0x8f },
+ .b_public = (u8[32]){ 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 },
+ .expected_ss = (u8[32]){ 0x54, 0xc9, 0xa1, 0xed, 0x95, 0xe5, 0x46, 0xd2,
+ 0x78, 0x22, 0xa3, 0x60, 0x93, 0x1d, 0xda, 0x60,
+ 0xa1, 0xdf, 0x04, 0x9d, 0xa6, 0xf9, 0x04, 0x25,
+ 0x3c, 0x06, 0x12, 0xbb, 0xdc, 0x08, 0x74, 0x76 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xf8, 0xf7, 0x07, 0xb7, 0x99, 0x9b, 0x18, 0xcb,
+ 0x0d, 0x6b, 0x96, 0x12, 0x4f, 0x20, 0x45, 0x97,
+ 0x2c, 0xa2, 0x74, 0xbf, 0xc1, 0x54, 0xad, 0x0c,
+ 0x87, 0x03, 0x8c, 0x24, 0xc6, 0xd0, 0xd4, 0xb2 },
+ .b_public = (u8[32]){ 0xda, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0xcc, 0x1f, 0x40, 0xd7, 0x43, 0xcd, 0xc2, 0x23,
+ 0x0e, 0x10, 0x43, 0xda, 0xba, 0x8b, 0x75, 0xe8,
+ 0x10, 0xf1, 0xfb, 0xab, 0x7f, 0x25, 0x52, 0x69,
+ 0xbd, 0x9e, 0xbb, 0x29, 0xe6, 0xbf, 0x49, 0x4f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xa0, 0x34, 0xf6, 0x84, 0xfa, 0x63, 0x1e, 0x1a,
+ 0x34, 0x81, 0x18, 0xc1, 0xce, 0x4c, 0x98, 0x23,
+ 0x1f, 0x2d, 0x9e, 0xec, 0x9b, 0xa5, 0x36, 0x5b,
+ 0x4a, 0x05, 0xd6, 0x9a, 0x78, 0x5b, 0x07, 0x96 },
+ .b_public = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x54, 0x99, 0x8e, 0xe4, 0x3a, 0x5b, 0x00, 0x7b,
+ 0xf4, 0x99, 0xf0, 0x78, 0xe7, 0x36, 0x52, 0x44,
+ 0x00, 0xa8, 0xb5, 0xc7, 0xe9, 0xb9, 0xb4, 0x37,
+ 0x71, 0x74, 0x8c, 0x7c, 0xdf, 0x88, 0x04, 0x12 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x30, 0xb6, 0xc6, 0xa0, 0xf2, 0xff, 0xa6, 0x80,
+ 0x76, 0x8f, 0x99, 0x2b, 0xa8, 0x9e, 0x15, 0x2d,
+ 0x5b, 0xc9, 0x89, 0x3d, 0x38, 0xc9, 0x11, 0x9b,
+ 0xe4, 0xf7, 0x67, 0xbf, 0xab, 0x6e, 0x0c, 0xa5 },
+ .b_public = (u8[32]){ 0xdc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0xea, 0xd9, 0xb3, 0x8e, 0xfd, 0xd7, 0x23, 0x63,
+ 0x79, 0x34, 0xe5, 0x5a, 0xb7, 0x17, 0xa7, 0xae,
+ 0x09, 0xeb, 0x86, 0xa2, 0x1d, 0xc3, 0x6a, 0x3f,
+ 0xee, 0xb8, 0x8b, 0x75, 0x9e, 0x39, 0x1e, 0x09 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x90, 0x1b, 0x9d, 0xcf, 0x88, 0x1e, 0x01, 0xe0,
+ 0x27, 0x57, 0x50, 0x35, 0xd4, 0x0b, 0x43, 0xbd,
+ 0xc1, 0xc5, 0x24, 0x2e, 0x03, 0x08, 0x47, 0x49,
+ 0x5b, 0x0c, 0x72, 0x86, 0x46, 0x9b, 0x65, 0x91 },
+ .b_public = (u8[32]){ 0xea, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x60, 0x2f, 0xf4, 0x07, 0x89, 0xb5, 0x4b, 0x41,
+ 0x80, 0x59, 0x15, 0xfe, 0x2a, 0x62, 0x21, 0xf0,
+ 0x7a, 0x50, 0xff, 0xc2, 0xc3, 0xfc, 0x94, 0xcf,
+ 0x61, 0xf1, 0x3d, 0x79, 0x04, 0xe8, 0x8e, 0x0e },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x80, 0x46, 0x67, 0x7c, 0x28, 0xfd, 0x82, 0xc9,
+ 0xa1, 0xbd, 0xb7, 0x1a, 0x1a, 0x1a, 0x34, 0xfa,
+ 0xba, 0x12, 0x25, 0xe2, 0x50, 0x7f, 0xe3, 0xf5,
+ 0x4d, 0x10, 0xbd, 0x5b, 0x0d, 0x86, 0x5f, 0x8e },
+ .b_public = (u8[32]){ 0xeb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0xe0, 0x0a, 0xe8, 0xb1, 0x43, 0x47, 0x12, 0x47,
+ 0xba, 0x24, 0xf1, 0x2c, 0x88, 0x55, 0x36, 0xc3,
+ 0xcb, 0x98, 0x1b, 0x58, 0xe1, 0xe5, 0x6b, 0x2b,
+ 0xaf, 0x35, 0xc1, 0x2a, 0xe1, 0xf7, 0x9c, 0x26 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x60, 0x2f, 0x7e, 0x2f, 0x68, 0xa8, 0x46, 0xb8,
+ 0x2c, 0xc2, 0x69, 0xb1, 0xd4, 0x8e, 0x93, 0x98,
+ 0x86, 0xae, 0x54, 0xfd, 0x63, 0x6c, 0x1f, 0xe0,
+ 0x74, 0xd7, 0x10, 0x12, 0x7d, 0x47, 0x24, 0x91 },
+ .b_public = (u8[32]){ 0xef, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x98, 0xcb, 0x9b, 0x50, 0xdd, 0x3f, 0xc2, 0xb0,
+ 0xd4, 0xf2, 0xd2, 0xbf, 0x7c, 0x5c, 0xfd, 0xd1,
+ 0x0c, 0x8f, 0xcd, 0x31, 0xfc, 0x40, 0xaf, 0x1a,
+ 0xd4, 0x4f, 0x47, 0xc1, 0x31, 0x37, 0x63, 0x62 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x60, 0x88, 0x7b, 0x3d, 0xc7, 0x24, 0x43, 0x02,
+ 0x6e, 0xbe, 0xdb, 0xbb, 0xb7, 0x06, 0x65, 0xf4,
+ 0x2b, 0x87, 0xad, 0xd1, 0x44, 0x0e, 0x77, 0x68,
+ 0xfb, 0xd7, 0xe8, 0xe2, 0xce, 0x5f, 0x63, 0x9d },
+ .b_public = (u8[32]){ 0xf0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x38, 0xd6, 0x30, 0x4c, 0x4a, 0x7e, 0x6d, 0x9f,
+ 0x79, 0x59, 0x33, 0x4f, 0xb5, 0x24, 0x5b, 0xd2,
+ 0xc7, 0x54, 0x52, 0x5d, 0x4c, 0x91, 0xdb, 0x95,
+ 0x02, 0x06, 0x92, 0x62, 0x34, 0xc1, 0xf6, 0x33 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0x78, 0xd3, 0x1d, 0xfa, 0x85, 0x44, 0x97, 0xd7,
+ 0x2d, 0x8d, 0xef, 0x8a, 0x1b, 0x7f, 0xb0, 0x06,
+ 0xce, 0xc2, 0xd8, 0xc4, 0x92, 0x46, 0x47, 0xc9,
+ 0x38, 0x14, 0xae, 0x56, 0xfa, 0xed, 0xa4, 0x95 },
+ .b_public = (u8[32]){ 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x78, 0x6c, 0xd5, 0x49, 0x96, 0xf0, 0x14, 0xa5,
+ 0xa0, 0x31, 0xec, 0x14, 0xdb, 0x81, 0x2e, 0xd0,
+ 0x83, 0x55, 0x06, 0x1f, 0xdb, 0x5d, 0xe6, 0x80,
+ 0xa8, 0x00, 0xac, 0x52, 0x1f, 0x31, 0x8e, 0x23 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - public key >= p */
+{
+ .secret = (u8[32]){ 0xc0, 0x4c, 0x5b, 0xae, 0xfa, 0x83, 0x02, 0xdd,
+ 0xde, 0xd6, 0xa4, 0xbb, 0x95, 0x77, 0x61, 0xb4,
+ 0xeb, 0x97, 0xae, 0xfa, 0x4f, 0xc3, 0xb8, 0x04,
+ 0x30, 0x85, 0xf9, 0x6a, 0x56, 0x59, 0xb3, 0xa5 },
+ .b_public = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+ .expected_ss = (u8[32]){ 0x29, 0xae, 0x8b, 0xc7, 0x3e, 0x9b, 0x10, 0xa0,
+ 0x8b, 0x4f, 0x68, 0x1c, 0x43, 0xc3, 0xe0, 0xac,
+ 0x1a, 0x17, 0x1d, 0x31, 0xb3, 0x8f, 0x1a, 0x48,
+ 0xef, 0xba, 0x29, 0xae, 0x63, 0x9e, 0xa1, 0x34 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - RFC 7748 */
+{
+ .secret = (u8[32]){ 0xa0, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d,
+ 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd,
+ 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18,
+ 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0x44 },
+ .b_public = (u8[32]){ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb,
+ 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c,
+ 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b,
+ 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c },
+ .expected_ss = (u8[32]){ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90,
+ 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f,
+ 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7,
+ 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - RFC 7748 */
+{
+ .secret = (u8[32]){ 0x48, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c,
+ 0x5a, 0xd2, 0x26, 0x91, 0x95, 0x7d, 0x6a, 0xf5,
+ 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea, 0x01, 0xd4,
+ 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x4d },
+ .b_public = (u8[32]){ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3,
+ 0xf4, 0xb7, 0x95, 0x9d, 0x05, 0x38, 0xae, 0x2c,
+ 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0, 0x3c, 0x3e,
+ 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x13 },
+ .expected_ss = (u8[32]){ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d,
+ 0x7a, 0xad, 0xe4, 0x5c, 0xb4, 0xb8, 0x73, 0xf8,
+ 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f, 0xa1, 0x52,
+ 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x0a, 0xb4, 0xe7, 0x63, 0x80, 0xd8, 0x4d, 0xde,
+ 0x4f, 0x68, 0x33, 0xc5, 0x8f, 0x2a, 0x9f, 0xb8,
+ 0xf8, 0x3b, 0xb0, 0x16, 0x9b, 0x17, 0x2b, 0xe4,
+ 0xb6, 0xe0, 0x59, 0x28, 0x87, 0x74, 0x1a, 0x36 },
+ .expected_ss = (u8[32]){ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x89, 0xe1, 0x0d, 0x57, 0x01, 0xb4, 0x33, 0x7d,
+ 0x2d, 0x03, 0x21, 0x81, 0x53, 0x8b, 0x10, 0x64,
+ 0xbd, 0x40, 0x84, 0x40, 0x1c, 0xec, 0xa1, 0xfd,
+ 0x12, 0x66, 0x3a, 0x19, 0x59, 0x38, 0x80, 0x00 },
+ .expected_ss = (u8[32]){ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x2b, 0x55, 0xd3, 0xaa, 0x4a, 0x8f, 0x80, 0xc8,
+ 0xc0, 0xb2, 0xae, 0x5f, 0x93, 0x3e, 0x85, 0xaf,
+ 0x49, 0xbe, 0xac, 0x36, 0xc2, 0xfa, 0x73, 0x94,
+ 0xba, 0xb7, 0x6c, 0x89, 0x33, 0xf8, 0xf8, 0x1d },
+ .expected_ss = (u8[32]){ 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x63, 0xe5, 0xb1, 0xfe, 0x96, 0x01, 0xfe, 0x84,
+ 0x38, 0x5d, 0x88, 0x66, 0xb0, 0x42, 0x12, 0x62,
+ 0xf7, 0x8f, 0xbf, 0xa5, 0xaf, 0xf9, 0x58, 0x5e,
+ 0x62, 0x66, 0x79, 0xb1, 0x85, 0x47, 0xd9, 0x59 },
+ .expected_ss = (u8[32]){ 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0xe4, 0x28, 0xf3, 0xda, 0xc1, 0x78, 0x09, 0xf8,
+ 0x27, 0xa5, 0x22, 0xce, 0x32, 0x35, 0x50, 0x58,
+ 0xd0, 0x73, 0x69, 0x36, 0x4a, 0xa7, 0x89, 0x02,
+ 0xee, 0x10, 0x13, 0x9b, 0x9f, 0x9d, 0xd6, 0x53 },
+ .expected_ss = (u8[32]){ 0xfc, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0xb3, 0xb5, 0x0e, 0x3e, 0xd3, 0xa4, 0x07, 0xb9,
+ 0x5d, 0xe9, 0x42, 0xef, 0x74, 0x57, 0x5b, 0x5a,
+ 0xb8, 0xa1, 0x0c, 0x09, 0xee, 0x10, 0x35, 0x44,
+ 0xd6, 0x0b, 0xdf, 0xed, 0x81, 0x38, 0xab, 0x2b },
+ .expected_ss = (u8[32]){ 0xf9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x21, 0x3f, 0xff, 0xe9, 0x3d, 0x5e, 0xa8, 0xcd,
+ 0x24, 0x2e, 0x46, 0x28, 0x44, 0x02, 0x99, 0x22,
+ 0xc4, 0x3c, 0x77, 0xc9, 0xe3, 0xe4, 0x2f, 0x56,
+ 0x2f, 0x48, 0x5d, 0x24, 0xc5, 0x01, 0xa2, 0x0b },
+ .expected_ss = (u8[32]){ 0xf3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x91, 0xb2, 0x32, 0xa1, 0x78, 0xb3, 0xcd, 0x53,
+ 0x09, 0x32, 0x44, 0x1e, 0x61, 0x39, 0x41, 0x8f,
+ 0x72, 0x17, 0x22, 0x92, 0xf1, 0xda, 0x4c, 0x18,
+ 0x34, 0xfc, 0x5e, 0xbf, 0xef, 0xb5, 0x1e, 0x3f },
+ .expected_ss = (u8[32]){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x03 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x04, 0x5c, 0x6e, 0x11, 0xc5, 0xd3, 0x32, 0x55,
+ 0x6c, 0x78, 0x22, 0xfe, 0x94, 0xeb, 0xf8, 0x9b,
+ 0x56, 0xa3, 0x87, 0x8d, 0xc2, 0x7c, 0xa0, 0x79,
+ 0x10, 0x30, 0x58, 0x84, 0x9f, 0xab, 0xcb, 0x4f },
+ .expected_ss = (u8[32]){ 0xe5, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x1c, 0xa2, 0x19, 0x0b, 0x71, 0x16, 0x35, 0x39,
+ 0x06, 0x3c, 0x35, 0x77, 0x3b, 0xda, 0x0c, 0x9c,
+ 0x92, 0x8e, 0x91, 0x36, 0xf0, 0x62, 0x0a, 0xeb,
+ 0x09, 0x3f, 0x09, 0x91, 0x97, 0xb7, 0xf7, 0x4e },
+ .expected_ss = (u8[32]){ 0xe3, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0xf7, 0x6e, 0x90, 0x10, 0xac, 0x33, 0xc5, 0x04,
+ 0x3b, 0x2d, 0x3b, 0x76, 0xa8, 0x42, 0x17, 0x10,
+ 0x00, 0xc4, 0x91, 0x62, 0x22, 0xe9, 0xe8, 0x58,
+ 0x97, 0xa0, 0xae, 0xc7, 0xf6, 0x35, 0x0b, 0x3c },
+ .expected_ss = (u8[32]){ 0xdd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0xbb, 0x72, 0x68, 0x8d, 0x8f, 0x8a, 0xa7, 0xa3,
+ 0x9c, 0xd6, 0x06, 0x0c, 0xd5, 0xc8, 0x09, 0x3c,
+ 0xde, 0xc6, 0xfe, 0x34, 0x19, 0x37, 0xc3, 0x88,
+ 0x6a, 0x99, 0x34, 0x6c, 0xd0, 0x7f, 0xaa, 0x55 },
+ .expected_ss = (u8[32]){ 0xdb, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x7f },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x88, 0xfd, 0xde, 0xa1, 0x93, 0x39, 0x1c, 0x6a,
+ 0x59, 0x33, 0xef, 0x9b, 0x71, 0x90, 0x15, 0x49,
+ 0x44, 0x72, 0x05, 0xaa, 0xe9, 0xda, 0x92, 0x8a,
+ 0x6b, 0x91, 0xa3, 0x52, 0xba, 0x10, 0xf4, 0x1f },
+ .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - edge case for shared secret */
+{
+ .secret = (u8[32]){ 0xa0, 0xa4, 0xf1, 0x30, 0xb9, 0x8a, 0x5b, 0xe4,
+ 0xb1, 0xce, 0xdb, 0x7c, 0xb8, 0x55, 0x84, 0xa3,
+ 0x52, 0x0e, 0x14, 0x2d, 0x47, 0x4d, 0xc9, 0xcc,
+ 0xb9, 0x09, 0xa0, 0x73, 0xa9, 0x76, 0xbf, 0x63 },
+ .b_public = (u8[32]){ 0x30, 0x3b, 0x39, 0x2f, 0x15, 0x31, 0x16, 0xca,
+ 0xd9, 0xcc, 0x68, 0x2a, 0x00, 0xcc, 0xc4, 0x4c,
+ 0x95, 0xff, 0x0d, 0x3b, 0xbe, 0x56, 0x8b, 0xeb,
+ 0x6c, 0x4e, 0x73, 0x9b, 0xaf, 0xdc, 0x2c, 0x68 },
+ .expected_ss = (u8[32]){ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - checking for overflow */
+{
+ .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
+ 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
+ 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
+ 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
+ .b_public = (u8[32]){ 0xfd, 0x30, 0x0a, 0xeb, 0x40, 0xe1, 0xfa, 0x58,
+ 0x25, 0x18, 0x41, 0x2b, 0x49, 0xb2, 0x08, 0xa7,
+ 0x84, 0x2b, 0x1e, 0x1f, 0x05, 0x6a, 0x04, 0x01,
+ 0x78, 0xea, 0x41, 0x41, 0x53, 0x4f, 0x65, 0x2d },
+ .expected_ss = (u8[32]){ 0xb7, 0x34, 0x10, 0x5d, 0xc2, 0x57, 0x58, 0x5d,
+ 0x73, 0xb5, 0x66, 0xcc, 0xb7, 0x6f, 0x06, 0x27,
+ 0x95, 0xcc, 0xbe, 0xc8, 0x91, 0x28, 0xe5, 0x2b,
+ 0x02, 0xf3, 0xe5, 0x96, 0x39, 0xf1, 0x3c, 0x46 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - checking for overflow */
+{
+ .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
+ 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
+ 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
+ 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
+ .b_public = (u8[32]){ 0xc8, 0xef, 0x79, 0xb5, 0x14, 0xd7, 0x68, 0x26,
+ 0x77, 0xbc, 0x79, 0x31, 0xe0, 0x6e, 0xe5, 0xc2,
+ 0x7c, 0x9b, 0x39, 0x2b, 0x4a, 0xe9, 0x48, 0x44,
+ 0x73, 0xf5, 0x54, 0xe6, 0x67, 0x8e, 0xcc, 0x2e },
+ .expected_ss = (u8[32]){ 0x64, 0x7a, 0x46, 0xb6, 0xfc, 0x3f, 0x40, 0xd6,
+ 0x21, 0x41, 0xee, 0x3c, 0xee, 0x70, 0x6b, 0x4d,
+ 0x7a, 0x92, 0x71, 0x59, 0x3a, 0x7b, 0x14, 0x3e,
+ 0x8e, 0x2e, 0x22, 0x79, 0x88, 0x3e, 0x45, 0x50 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - checking for overflow */
+{
+ .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
+ 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
+ 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
+ 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
+ .b_public = (u8[32]){ 0x64, 0xae, 0xac, 0x25, 0x04, 0x14, 0x48, 0x61,
+ 0x53, 0x2b, 0x7b, 0xbc, 0xb6, 0xc8, 0x7d, 0x67,
+ 0xdd, 0x4c, 0x1f, 0x07, 0xeb, 0xc2, 0xe0, 0x6e,
+ 0xff, 0xb9, 0x5a, 0xec, 0xc6, 0x17, 0x0b, 0x2c },
+ .expected_ss = (u8[32]){ 0x4f, 0xf0, 0x3d, 0x5f, 0xb4, 0x3c, 0xd8, 0x65,
+ 0x7a, 0x3c, 0xf3, 0x7c, 0x13, 0x8c, 0xad, 0xce,
+ 0xcc, 0xe5, 0x09, 0xe4, 0xeb, 0xa0, 0x89, 0xd0,
+ 0xef, 0x40, 0xb4, 0xe4, 0xfb, 0x94, 0x61, 0x55 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - checking for overflow */
+{
+ .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
+ 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
+ 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
+ 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
+ .b_public = (u8[32]){ 0xbf, 0x68, 0xe3, 0x5e, 0x9b, 0xdb, 0x7e, 0xee,
+ 0x1b, 0x50, 0x57, 0x02, 0x21, 0x86, 0x0f, 0x5d,
+ 0xcd, 0xad, 0x8a, 0xcb, 0xab, 0x03, 0x1b, 0x14,
+ 0x97, 0x4c, 0xc4, 0x90, 0x13, 0xc4, 0x98, 0x31 },
+ .expected_ss = (u8[32]){ 0x21, 0xce, 0xe5, 0x2e, 0xfd, 0xbc, 0x81, 0x2e,
+ 0x1d, 0x02, 0x1a, 0x4a, 0xf1, 0xe1, 0xd8, 0xbc,
+ 0x4d, 0xb3, 0xc4, 0x00, 0xe4, 0xd2, 0xa2, 0xc5,
+ 0x6a, 0x39, 0x26, 0xdb, 0x4d, 0x99, 0xc6, 0x5b },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - checking for overflow */
+{
+ .secret = (u8[32]){ 0xc8, 0x17, 0x24, 0x70, 0x40, 0x00, 0xb2, 0x6d,
+ 0x31, 0x70, 0x3c, 0xc9, 0x7e, 0x3a, 0x37, 0x8d,
+ 0x56, 0xfa, 0xd8, 0x21, 0x93, 0x61, 0xc8, 0x8c,
+ 0xca, 0x8b, 0xd7, 0xc5, 0x71, 0x9b, 0x12, 0xb2 },
+ .b_public = (u8[32]){ 0x53, 0x47, 0xc4, 0x91, 0x33, 0x1a, 0x64, 0xb4,
+ 0x3d, 0xdc, 0x68, 0x30, 0x34, 0xe6, 0x77, 0xf5,
+ 0x3d, 0xc3, 0x2b, 0x52, 0xa5, 0x2a, 0x57, 0x7c,
+ 0x15, 0xa8, 0x3b, 0xf2, 0x98, 0xe9, 0x9f, 0x19 },
+ .expected_ss = (u8[32]){ 0x18, 0xcb, 0x89, 0xe4, 0xe2, 0x0c, 0x0c, 0x2b,
+ 0xd3, 0x24, 0x30, 0x52, 0x45, 0x26, 0x6c, 0x93,
+ 0x27, 0x69, 0x0b, 0xbe, 0x79, 0xac, 0xb8, 0x8f,
+ 0x5b, 0x8f, 0xb3, 0xf7, 0x4e, 0xca, 0x3e, 0x52 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - private key == -1 (mod order) */
+{
+ .secret = (u8[32]){ 0xa0, 0x23, 0xcd, 0xd0, 0x83, 0xef, 0x5b, 0xb8,
+ 0x2f, 0x10, 0xd6, 0x2e, 0x59, 0xe1, 0x5a, 0x68,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50 },
+ .b_public = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
+ 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
+ 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
+ 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
+ .expected_ss = (u8[32]){ 0x25, 0x8e, 0x04, 0x52, 0x3b, 0x8d, 0x25, 0x3e,
+ 0xe6, 0x57, 0x19, 0xfc, 0x69, 0x06, 0xc6, 0x57,
+ 0x19, 0x2d, 0x80, 0x71, 0x7e, 0xdc, 0x82, 0x8f,
+ 0xa0, 0xaf, 0x21, 0x68, 0x6e, 0x2f, 0xaa, 0x75 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+},
+/* wycheproof - private key == 1 (mod order) on twist */
+{
+ .secret = (u8[32]){ 0x58, 0x08, 0x3d, 0xd2, 0x61, 0xad, 0x91, 0xef,
+ 0xf9, 0x52, 0x32, 0x2e, 0xc8, 0x24, 0xc6, 0x82,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x5f },
+ .b_public = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
+ 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
+ 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
+ 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
+ .expected_ss = (u8[32]){ 0x2e, 0xae, 0x5e, 0xc3, 0xdd, 0x49, 0x4e, 0x9f,
+ 0x2d, 0x37, 0xd2, 0x58, 0xf8, 0x73, 0xa8, 0xe6,
+ 0xe9, 0xd0, 0xdb, 0xd1, 0xe3, 0x83, 0xef, 0x64,
+ 0xd9, 0x8b, 0xb9, 0x1b, 0x3e, 0x0b, 0xe0, 0x35 },
+ .secret_size = 32,
+ .b_public_size = 32,
+ .expected_ss_size = 32,
+
+}
+};
+
static const struct kpp_testvec ecdh_tv_template[] = {
{
#ifndef CONFIG_CRYPTO_FIPS
@@ -31569,2 +32794,253 @@
+static const char blake2_ordered_sequence[] =
+ "\x00\x01\x02\x03\x04\x05\x06\x07"
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+ "\x10\x11\x12\x13\x14\x15\x16\x17"
+ "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+ "\x20\x21\x22\x23\x24\x25\x26\x27"
+ "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
+ "\x30\x31\x32\x33\x34\x35\x36\x37"
+ "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
+ "\x40\x41\x42\x43\x44\x45\x46\x47"
+ "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f"
+ "\x50\x51\x52\x53\x54\x55\x56\x57"
+ "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f"
+ "\x60\x61\x62\x63\x64\x65\x66\x67"
+ "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
+ "\x70\x71\x72\x73\x74\x75\x76\x77"
+ "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
+ "\x80\x81\x82\x83\x84\x85\x86\x87"
+ "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
+ "\x90\x91\x92\x93\x94\x95\x96\x97"
+ "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
+ "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7"
+ "\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
+ "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7"
+ "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
+ "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
+ "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
+ "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7"
+ "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
+ "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7"
+ "\xe8\xe9\xea\xeb\xec\xed\xee\xef"
+ "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"
+ "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff";
+
+static const struct hash_testvec blakes2s_128_tv_template[] = {{
+ .digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01,
+ 0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01,
+ 0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82,
+ 0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd,
+ 0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2,
+ 0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe,
+ 0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6,
+ 0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, },
+}};
+
+static const struct hash_testvec blakes2s_160_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e,
+ 0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62,
+ 0xe3, 0xf2, 0x84, 0xff, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2,
+ 0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1,
+ 0x9b, 0x2d, 0x35, 0x05, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3,
+ 0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1,
+ 0x79, 0x65, 0x32, 0x93, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71,
+ 0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef,
+ 0xa2, 0x3a, 0x56, 0x9c, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19,
+ 0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34,
+ 0x83, 0x39, 0x0f, 0x30, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5,
+ 0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d,
+ 0xac, 0xa6, 0x81, 0x63, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01,
+ 0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10,
+ 0x0a, 0xf6, 0x73, 0xe8, },
+}};
+
+static const struct hash_testvec blakes2s_224_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91,
+ 0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12,
+ 0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc,
+ 0x48, 0x21, 0x97, 0xbb, },
+}, {
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e,
+ 0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4,
+ 0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc,
+ 0x2b, 0xa4, 0xd5, 0xf6, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f,
+ 0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3,
+ 0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32,
+ 0xa7, 0x19, 0xfc, 0xb8, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76,
+ 0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6,
+ 0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8,
+ 0x7b, 0x45, 0xfe, 0x05, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36,
+ 0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43,
+ 0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e,
+ 0x25, 0xab, 0xc5, 0x02, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7,
+ 0xb6, 0x82, 0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c,
+ 0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93,
+ 0x6a, 0x31, 0x83, 0xb5, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86,
+ 0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2,
+ 0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45,
+ 0xb3, 0xd7, 0xec, 0xcc, },
+}};
+
+static const struct hash_testvec blakes2s_256_tv_template[] = {{
+ .plaintext = blake2_ordered_sequence,
+ .psize = 15,
+ .digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
+ 0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
+ 0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
+ 0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b,
+ 0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b,
+ 0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a,
+ 0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 1,
+ .digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03,
+ 0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18,
+ 0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b,
+ 0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 7,
+ .digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03,
+ 0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15,
+ 0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34,
+ 0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, },
+}, {
+ .ksize = 32,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 64,
+ .digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66,
+ 0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26,
+ 0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab,
+ 0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, },
+}, {
+ .ksize = 1,
+ .key = "B",
+ .plaintext = blake2_ordered_sequence,
+ .psize = 247,
+ .digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84,
+ 0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66,
+ 0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0,
+ 0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, },
+}, {
+ .ksize = 16,
+ .key = blake2_ordered_sequence,
+ .plaintext = blake2_ordered_sequence,
+ .psize = 256,
+ .digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b,
+ 0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1,
+ 0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed,
+ 0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, },
+}};
+
#endif /* _CRYPTO_TESTMGR_H */
--- b/crypto/Makefile
+++ b/crypto/Makefile
@@ -74,6 +74,7 @@
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
+obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
obj-$(CONFIG_CRYPTO_ECB) += ecb.o
obj-$(CONFIG_CRYPTO_CBC) += cbc.o
@@ -166,6 +167,7 @@
obj-$(CONFIG_CRYPTO_OFB) += ofb.o
obj-$(CONFIG_CRYPTO_ECC) += ecc.o
obj-$(CONFIG_CRYPTO_ESSIV) += essiv.o
+obj-$(CONFIG_CRYPTO_CURVE25519) += curve25519-generic.o

ecdh_generic-y += ecdh.o
ecdh_generic-y += ecdh_helper.o
--- /dev/null
+++ b/crypto/blake2s_generic.c
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(tctx->key, key, keylen);
+ tctx->keylen = keylen;
+
+ return 0;
+}
+
+static int crypto_blake2s_init(struct shash_desc *desc)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const int outlen = crypto_shash_digestsize(desc->tfm);
+
+ if (tctx->keylen)
+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
+ else
+ blake2s_init(state, outlen);
+
+ return 0;
+}
+
+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
+ unsigned int inlen)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+ if (unlikely(!inlen))
+ return 0;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress_generic(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+ /* Hash one less (full) block than strictly possible */
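+ /* (Keeping data back guarantees final() always has buffered bytes to
+  * compress with the last-block flag set, as BLAKE2s requires.) */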
+ blake2s_compress_generic(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+
+ return 0;
+}
+
+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ blake2s_compress_generic(state, state->buf, 1, state->buflen);
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+
+ return 0;
+}
+
+static struct shash_alg blake2s_algs[] = {{
+ .base.cra_name = "blake2s-128",
+ .base.cra_driver_name = "blake2s-128-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_128_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-160",
+ .base.cra_driver_name = "blake2s-160-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_160_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-224",
+ .base.cra_driver_name = "blake2s-224-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_224_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-256",
+ .base.cra_driver_name = "blake2s-256-generic",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_256_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}};
+
+static int __init blake2s_mod_init(void)
+{
+ return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+subsys_initcall(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-generic");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-generic");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-generic");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-generic");
+MODULE_LICENSE("GPL v2");
--- b/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)

obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -41,4 +42,9 @@
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o

+# These modules require the assembler to support ADX.
+ifeq ($(adx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+endif
+
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -48,6 +54,7 @@
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+ obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif

# These modules require assembler to support AVX2.
@@ -70,6 +77,11 @@
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o

nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
+targets += poly1305-x86_64-cryptogams.S
+endif

ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
@@ -98,10 +110,8 @@
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
ifeq ($(sha1_ni_supported),yes)
sha1-ssse3-y += sha1_ni_asm.o
@@ -115,3 +125,8 @@
endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $< > $@
+$(obj)/%.S: $(src)/%.pl FORCE
+ $(call if_changed,perlasm)
--- /dev/null
+++ b/arch/x86/crypto/blake2s-core.S
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
+.align 32
+IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
+ .octa 0x5BE0CD191F83D9AB9B05688C510E527F
+.section .rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
+ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
+.section .rodata.cst16.ROR328, "aM", @progbits, 16
+.align 16
+ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
+.align 64
+SIGMA:
+.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
+.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
+.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
+.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
+.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
+.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
+.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
+.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
+.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
+#ifdef CONFIG_AS_AVX512
+.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
+.align 64
+SIGMA2:
+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
+#endif /* CONFIG_AS_AVX512 */
+
+.text
+#ifdef CONFIG_AS_SSSE3
+ENTRY(blake2s_compress_ssse3)
+ testq %rdx,%rdx
+ je .Lendofloop
+ movdqu (%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqa ROT16(%rip),%xmm12
+ movdqa ROR328(%rip),%xmm13
+ movdqu 0x20(%rdi),%xmm14
+ movq %rcx,%xmm15
+ leaq SIGMA+0xa0(%rip),%r8
+ jmp .Lbeginofloop
+ .align 32
+.Lbeginofloop:
+ movdqa %xmm0,%xmm10
+ movdqa %xmm1,%xmm11
+ paddq %xmm15,%xmm14
+ movdqa IV(%rip),%xmm2
+ movdqa %xmm14,%xmm3
+ pxor IV+0x10(%rip),%xmm3
+ leaq SIGMA(%rip),%rcx
+.Lroundloop:
+ movzbl (%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0x1(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x2(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x3(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ punpckldq %xmm5,%xmm4
+ punpckldq %xmm7,%xmm6
+ punpcklqdq %xmm6,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movzbl 0x4(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0x5(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x6(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0x7(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ punpckldq %xmm6,%xmm5
+ punpckldq %xmm4,%xmm7
+ punpcklqdq %xmm7,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movzbl 0x8(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ movzbl 0x9(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xa(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xb(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ punpckldq %xmm7,%xmm6
+ punpckldq %xmm5,%xmm4
+ punpcklqdq %xmm4,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movzbl 0xc(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm7
+ movzbl 0xd(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm4
+ movzbl 0xe(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm5
+ movzbl 0xf(%rcx),%eax
+ movd (%rsi,%rax,4),%xmm6
+ punpckldq %xmm4,%xmm7
+ punpckldq %xmm6,%xmm5
+ punpcklqdq %xmm5,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ addq $0x10,%rcx
+ cmpq %r8,%rcx
+ jnz .Lroundloop
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $0x40,%rsi
+ decq %rdx
+ jnz .Lbeginofloop
+ movdqu %xmm0,(%rdi)
+ movdqu %xmm1,0x10(%rdi)
+ movdqu %xmm14,0x20(%rdi)
+.Lendofloop:
+ ret
+ENDPROC(blake2s_compress_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
+
+#ifdef CONFIG_AS_AVX512
+ENTRY(blake2s_compress_avx512)
+ vmovdqu (%rdi),%xmm0
+ vmovdqu 0x10(%rdi),%xmm1
+ vmovdqu 0x20(%rdi),%xmm4
+ vmovq %rcx,%xmm5
+ vmovdqa IV(%rip),%xmm14
+ vmovdqa IV+16(%rip),%xmm15
+ jmp .Lblake2s_compress_avx512_mainloop
+.align 32
+.Lblake2s_compress_avx512_mainloop:
+ vmovdqa %xmm0,%xmm10
+ vmovdqa %xmm1,%xmm11
+ vpaddq %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm14,%xmm2
+ vpxor %xmm15,%xmm4,%xmm3
+ vmovdqu (%rsi),%ymm6
+ vmovdqu 0x20(%rsi),%ymm7
+ addq $0x40,%rsi
+ leaq SIGMA2(%rip),%rax
+ movb $0xa,%cl
+.Lblake2s_compress_avx512_roundloop:
+ addq $0x40,%rax
+ vmovdqa -0x40(%rax),%ymm8
+ vmovdqa -0x20(%rax),%ymm9
+ vpermi2d %ymm7,%ymm6,%ymm8
+ vpermi2d %ymm7,%ymm6,%ymm9
+ vmovdqa %ymm8,%ymm6
+ vmovdqa %ymm9,%ymm7
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm8,%xmm8
+ vpaddd %xmm8,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x10,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0xc,%xmm1,%xmm1
+ vextracti128 $0x1,%ymm9,%xmm9
+ vpaddd %xmm9,%xmm0,%xmm0
+ vpaddd %xmm1,%xmm0,%xmm0
+ vpxor %xmm0,%xmm3,%xmm3
+ vprord $0x8,%xmm3,%xmm3
+ vpaddd %xmm3,%xmm2,%xmm2
+ vpxor %xmm2,%xmm1,%xmm1
+ vprord $0x7,%xmm1,%xmm1
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
+ decb %cl
+ jne .Lblake2s_compress_avx512_roundloop
+ vpxor %xmm10,%xmm0,%xmm0
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm2,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ decq %rdx
+ jne .Lblake2s_compress_avx512_mainloop
+ vmovdqu %xmm0,(%rdi)
+ vmovdqu %xmm1,0x10(%rdi)
+ vmovdqu %xmm4,0x20(%rdi)
+ vzeroupper
+ retq
+ENDPROC(blake2s_compress_avx512)
+#endif /* CONFIG_AS_AVX512 */
--- b/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <crypto/internal/blake2s.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/hash.h>
+
+#include <linux/types.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
+#include <asm/processor.h>
+#include <asm/simd.h>
+
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
+ const u8 *block, const size_t nblocks,
+ const u32 inc);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
+
+void blake2s_compress_arch(struct blake2s_state *state,
+ const u8 *block, size_t nblocks,
+ const u32 inc)
+{
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
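+ /* (i.e. each kernel_fpu_begin()/end() section below covers at least
+  * eight blocks, keeping the FPU save/restore cost amortized) */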
+
+ if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
+ blake2s_compress_generic(state, block, nblocks, inc);
+ return;
+ }
+
+ do {
+ const size_t blocks = min_t(size_t, nblocks,
+ SZ_4K / BLAKE2S_BLOCK_SIZE);
+
+ kernel_fpu_begin();
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ static_branch_likely(&blake2s_use_avx512))
+ blake2s_compress_avx512(state, block, blocks, inc);
+ else
+ blake2s_compress_ssse3(state, block, blocks, inc);
+ kernel_fpu_end();
+
+ nblocks -= blocks;
+ block += blocks * BLAKE2S_BLOCK_SIZE;
+ } while (nblocks);
+}
+EXPORT_SYMBOL(blake2s_compress_arch);
+
+static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(tctx->key, key, keylen);
+ tctx->keylen = keylen;
+
+ return 0;
+}
+
+static int crypto_blake2s_init(struct shash_desc *desc)
+{
+ struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const int outlen = crypto_shash_digestsize(desc->tfm);
+
+ if (tctx->keylen)
+ blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
+ else
+ blake2s_init(state, outlen);
+
+ return 0;
+}
+
+static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
+ unsigned int inlen)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+
+ if (unlikely(!inlen))
+ return 0;
+ if (inlen > fill) {
+ memcpy(state->buf + state->buflen, in, fill);
+ blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
+ state->buflen = 0;
+ in += fill;
+ inlen -= fill;
+ }
+ if (inlen > BLAKE2S_BLOCK_SIZE) {
+ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+ /* Hash one less (full) block than strictly possible */
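+ /* (as in the generic driver, bytes are held back so that final() can
+  * compress the true last block with the last-block flag set) */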
+ blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+ }
+ memcpy(state->buf + state->buflen, in, inlen);
+ state->buflen += inlen;
+
+ return 0;
+}
+
+static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
+{
+ struct blake2s_state *state = shash_desc_ctx(desc);
+
+ blake2s_set_lastblock(state);
+ memset(state->buf + state->buflen, 0,
+ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+ blake2s_compress_arch(state, state->buf, 1, state->buflen);
+ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+
+ return 0;
+}
+
+static struct shash_alg blake2s_algs[] = {{
+ .base.cra_name = "blake2s-128",
+ .base.cra_driver_name = "blake2s-128-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_128_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-160",
+ .base.cra_driver_name = "blake2s-160-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_160_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-224",
+ .base.cra_driver_name = "blake2s-224-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_224_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}, {
+ .base.cra_name = "blake2s-256",
+ .base.cra_driver_name = "blake2s-256-x86",
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
+ .base.cra_priority = 200,
+ .base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+
+ .digestsize = BLAKE2S_256_HASH_SIZE,
+ .setkey = crypto_blake2s_setkey,
+ .init = crypto_blake2s_init,
+ .update = crypto_blake2s_update,
+ .final = crypto_blake2s_final,
+ .descsize = sizeof(struct blake2s_state),
+}};
+
+static int __init blake2s_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return 0;
+
+ static_branch_enable(&blake2s_use_ssse3);
+
+ if (IS_ENABLED(CONFIG_AS_AVX512) &&
+ boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ boot_cpu_has(X86_FEATURE_AVX512F) &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+ XFEATURE_MASK_AVX512, NULL))
+ static_branch_enable(&blake2s_use_avx512);
+
+ return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+ crypto_register_shashes(blake2s_algs,
+ ARRAY_SIZE(blake2s_algs)) : 0;
+}
+
+static void __exit blake2s_mod_exit(void)
+{
+ if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+ crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+}
+
+module_init(blake2s_mod_init);
+module_exit(blake2s_mod_exit);
+
+MODULE_ALIAS_CRYPTO("blake2s-128");
+MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+MODULE_ALIAS_CRYPTO("blake2s-160");
+MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+MODULE_ALIAS_CRYPTO("blake2s-224");
+MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+MODULE_ALIAS_CRYPTO("blake2s-256");
+MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+MODULE_LICENSE("GPL v2");
--- b/include/crypto/curve25519.h
+++ b/include/crypto/curve25519.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#ifndef CURVE25519_H
+#define CURVE25519_H
+
+#include <crypto/algapi.h> // For crypto_memneq.
+#include <linux/types.h>
+#include <linux/random.h>
+
+enum curve25519_lengths {
+ CURVE25519_KEY_SIZE = 32
+};
+
+extern const u8 curve25519_null_point[];
+extern const u8 curve25519_base_point[];
+
+void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE]);
+
+void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE]);
+
+void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE]);
+
+static inline
+bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
+ (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+ curve25519_arch(mypublic, secret, basepoint);
+ else
+ curve25519_generic(mypublic, secret, basepoint);
+ return crypto_memneq(mypublic, curve25519_null_point,
+ CURVE25519_KEY_SIZE);
+}
+
+static inline bool
+__must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE])
+{
+ if (unlikely(!crypto_memneq(secret, curve25519_null_point,
+ CURVE25519_KEY_SIZE)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519) &&
+ (!IS_ENABLED(CONFIG_CRYPTO_CURVE25519_X86) || IS_ENABLED(CONFIG_AS_ADX)))
+ curve25519_base_arch(pub, secret);
+ else
+ curve25519_generic(pub, secret, curve25519_base_point);
+ return crypto_memneq(pub, curve25519_null_point, CURVE25519_KEY_SIZE);
+}
+
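+/* Standard X25519 clamping (RFC 7748): clear the three low bits so the
+ * scalar is a multiple of the cofactor 8, and clear bit 255 while setting
+ * bit 254 to pin the scalar's magnitude.
+ */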
+static inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE])
+{
+ secret[0] &= 248;
+ secret[31] = (secret[31] & 127) | 64;
+}
+
+static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE])
+{
+ get_random_bytes_wait(secret, CURVE25519_KEY_SIZE);
+ curve25519_clamp_secret(secret);
+}
+
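+/*
+ * Typical usage, sketched (the peer key `theirs` and the error handling
+ * are illustrative, not part of this header):
+ *
+ *	u8 secret[CURVE25519_KEY_SIZE], pub[CURVE25519_KEY_SIZE];
+ *	u8 ss[CURVE25519_KEY_SIZE];
+ *
+ *	curve25519_generate_secret(secret);
+ *	if (!curve25519_generate_public(pub, secret))
+ *		return -EKEYREJECTED;
+ *	// ... exchange pub for the peer's public key `theirs` ...
+ *	if (!curve25519(ss, secret, theirs))
+ *		return -EKEYREJECTED; // all-zero shared secret is rejected
+ */
+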
+#endif /* CURVE25519_H */
--- b/lib/crypto/curve25519-fiat32.c
+++ b/lib/crypto/curve25519-fiat32.c
@@ -0,0 +1,864 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2016 The fiat-crypto Authors.
+ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is a machine-generated formally verified implementation of Curve25519
+ * ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
+ * machine generated, it has been tweaked to be suitable for use in the kernel.
+ * It is optimized for 32-bit machines and machines that cannot work efficiently
+ * with 128-bit integer types.
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/curve25519.h>
+#include <linux/string.h>
+
+/* fe means field element. Here the field is \Z/(2^255-19). An element t,
+ * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
+ * t[3]+2^102 t[4]+...+2^230 t[9].
+ * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
+ * Multiplication and carrying produce fe from fe_loose.
+ */
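+/* For example, the integer 1 is the element with v[0] = 1 and all other
+ * limbs zero, while 2^26 has v[1] = 1: limb i carries weight 2^ceil(25.5*i)
+ * in this alternating 26-/25-bit ("radix 2^25.5") encoding.
+ */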
+typedef struct fe { u32 v[10]; } fe;
+
+/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
+ * Addition and subtraction produce fe_loose from (fe, fe).
+ */
+typedef struct fe_loose { u32 v[10]; } fe_loose;
+
+static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s)
+{
+ /* Ignores top bit of s. */
+ u32 a0 = get_unaligned_le32(s);
+ u32 a1 = get_unaligned_le32(s+4);
+ u32 a2 = get_unaligned_le32(s+8);
+ u32 a3 = get_unaligned_le32(s+12);
+ u32 a4 = get_unaligned_le32(s+16);
+ u32 a5 = get_unaligned_le32(s+20);
+ u32 a6 = get_unaligned_le32(s+24);
+ u32 a7 = get_unaligned_le32(s+28);
+ h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */
+ h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */
+ h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */
+ h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */
+ h[4] = (a3>> 6); /* (32- 6) = 26 */
+ h[5] = a4&((1<<25)-1); /* 25 */
+ h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */
+ h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */
+ h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */
+ h[9] = (a7>> 6)&((1<<25)-1); /* 25 */
+}
+
+static __always_inline void fe_frombytes(fe *h, const u8 *s)
+{
+ fe_frombytes_impl(h->v, s);
+}
+
+static __always_inline u8 /*bool*/
+addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 25 bits of result and 1 bit of carry
+ * (26 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a + b + c;
+ *low = x & ((1 << 25) - 1);
+ return (x >> 25) & 1;
+}
+
+static __always_inline u8 /*bool*/
+addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 26 bits of result and 1 bit of carry
+ * (27 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a + b + c;
+ *low = x & ((1 << 26) - 1);
+ return (x >> 26) & 1;
+}
+
+static __always_inline u8 /*bool*/
+subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 25 bits of result and 1 bit of borrow
+ * (26 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a - b - c;
+ *low = x & ((1 << 25) - 1);
+ return x >> 31;
+}
+
+static __always_inline u8 /*bool*/
+subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low)
+{
+ /* This function extracts 26 bits of result and 1 bit of borrow
+ * (27 total), so a 32-bit intermediate is sufficient.
+ */
+ u32 x = a - b - c;
+ *low = x & ((1 << 26) - 1);
+ return x >> 31;
+}
+
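+/* Constant-time select: returns z when t == 0 and nz otherwise, using a
+ * mask rather than a data-dependent branch.
+ */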
+static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz)
+{
+ t = -!!t; /* all set if nonzero, 0 if 0 */
+ return (t&nz) | ((~t)&z);
+}
+
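+/* Reduce to the unique canonical representative modulo p = 2^255 - 19:
+ * subtract p (the 0x3ffffed/0x1ffffff/0x3ffffff limb constants below),
+ * then add it back masked by the final borrow.
+ */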
+static __always_inline void fe_freeze(u32 out[10], const u32 in1[10])
+{
+ { const u32 x17 = in1[9];
+ { const u32 x18 = in1[8];
+ { const u32 x16 = in1[7];
+ { const u32 x14 = in1[6];
+ { const u32 x12 = in1[5];
+ { const u32 x10 = in1[4];
+ { const u32 x8 = in1[3];
+ { const u32 x6 = in1[2];
+ { const u32 x4 = in1[1];
+ { const u32 x2 = in1[0];
+ { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20);
+ { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23);
+ { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26);
+ { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29);
+ { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32);
+ { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35);
+ { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38);
+ { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41);
+ { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44);
+ { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47);
+ { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff);
+ { u32 x50 = (x49 & 0x3ffffed);
+ { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52);
+ { u32 x54 = (x49 & 0x1ffffff);
+ { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56);
+ { u32 x58 = (x49 & 0x3ffffff);
+ { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60);
+ { u32 x62 = (x49 & 0x1ffffff);
+ { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64);
+ { u32 x66 = (x49 & 0x3ffffff);
+ { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68);
+ { u32 x70 = (x49 & 0x1ffffff);
+ { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72);
+ { u32 x74 = (x49 & 0x3ffffff);
+ { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76);
+ { u32 x78 = (x49 & 0x1ffffff);
+ { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80);
+ { u32 x82 = (x49 & 0x3ffffff);
+ { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84);
+ { u32 x86 = (x49 & 0x1ffffff);
+ { u32 x88; addcarryx_u25(x85, x47, x86, &x88);
+ out[0] = x52;
+ out[1] = x56;
+ out[2] = x60;
+ out[3] = x64;
+ out[4] = x68;
+ out[5] = x72;
+ out[6] = x76;
+ out[7] = x80;
+ out[8] = x84;
+ out[9] = x88;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_tobytes(u8 s[32], const fe *f)
+{
+ u32 h[10];
+ fe_freeze(h, f->v);
+ s[0] = h[0] >> 0;
+ s[1] = h[0] >> 8;
+ s[2] = h[0] >> 16;
+ s[3] = (h[0] >> 24) | (h[1] << 2);
+ s[4] = h[1] >> 6;
+ s[5] = h[1] >> 14;
+ s[6] = (h[1] >> 22) | (h[2] << 3);
+ s[7] = h[2] >> 5;
+ s[8] = h[2] >> 13;
+ s[9] = (h[2] >> 21) | (h[3] << 5);
+ s[10] = h[3] >> 3;
+ s[11] = h[3] >> 11;
+ s[12] = (h[3] >> 19) | (h[4] << 6);
+ s[13] = h[4] >> 2;
+ s[14] = h[4] >> 10;
+ s[15] = h[4] >> 18;
+ s[16] = h[5] >> 0;
+ s[17] = h[5] >> 8;
+ s[18] = h[5] >> 16;
+ s[19] = (h[5] >> 24) | (h[6] << 1);
+ s[20] = h[6] >> 7;
+ s[21] = h[6] >> 15;
+ s[22] = (h[6] >> 23) | (h[7] << 3);
+ s[23] = h[7] >> 5;
+ s[24] = h[7] >> 13;
+ s[25] = (h[7] >> 21) | (h[8] << 4);
+ s[26] = h[8] >> 4;
+ s[27] = h[8] >> 12;
+ s[28] = (h[8] >> 20) | (h[9] << 6);
+ s[29] = h[9] >> 2;
+ s[30] = h[9] >> 10;
+ s[31] = h[9] >> 18;
+}
+
+/* h = f */
+static __always_inline void fe_copy(fe *h, const fe *f)
+{
+ memmove(h, f, sizeof(u32) * 10);
+}
+
+static __always_inline void fe_copy_lt(fe_loose *h, const fe *f)
+{
+ memmove(h, f, sizeof(u32) * 10);
+}
+
+/* h = 0 */
+static __always_inline void fe_0(fe *h)
+{
+ memset(h, 0, sizeof(u32) * 10);
+}
+
+/* h = 1 */
+static __always_inline void fe_1(fe *h)
+{
+ memset(h, 0, sizeof(u32) * 10);
+ h->v[0] = 1;
+}
+
+static noinline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ out[0] = (x5 + x23);
+ out[1] = (x7 + x25);
+ out[2] = (x9 + x27);
+ out[3] = (x11 + x29);
+ out[4] = (x13 + x31);
+ out[5] = (x15 + x33);
+ out[6] = (x17 + x35);
+ out[7] = (x19 + x37);
+ out[8] = (x21 + x39);
+ out[9] = (x20 + x38);
+ }}}}}}}}}}}}}}}}}}}}
+}
+
+/* h = f + g
+ * Can overlap h with f or g.
+ */
+static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_add_impl(h->v, f->v, g->v);
+}
+
+static noinline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ out[0] = ((0x7ffffda + x5) - x23);
+ out[1] = ((0x3fffffe + x7) - x25);
+ out[2] = ((0x7fffffe + x9) - x27);
+ out[3] = ((0x3fffffe + x11) - x29);
+ out[4] = ((0x7fffffe + x13) - x31);
+ out[5] = ((0x3fffffe + x15) - x33);
+ out[6] = ((0x7fffffe + x17) - x35);
+ out[7] = ((0x3fffffe + x19) - x37);
+ out[8] = ((0x7fffffe + x21) - x39);
+ out[9] = ((0x3fffffe + x20) - x38);
+ }}}}}}}}}}}}}}}}}}}}
+}
+
+/* h = f - g
+ * Can overlap h with f or g.
+ */
+static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g)
+{
+ fe_sub_impl(h->v, f->v, g->v);
+}
+
+static noinline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = in2[9];
+ { const u32 x39 = in2[8];
+ { const u32 x37 = in2[7];
+ { const u32 x35 = in2[6];
+ { const u32 x33 = in2[5];
+ { const u32 x31 = in2[4];
+ { const u32 x29 = in2[3];
+ { const u32 x27 = in2[2];
+ { const u32 x25 = in2[1];
+ { const u32 x23 = in2[0];
+ { u64 x40 = ((u64)x23 * x5);
+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
+ { u64 x58 = ((u64)(0x2 * x38) * x20);
+ { u64 x59 = (x48 + (x58 << 0x4));
+ { u64 x60 = (x59 + (x58 << 0x1));
+ { u64 x61 = (x60 + x58);
+ { u64 x62 = (x47 + (x57 << 0x4));
+ { u64 x63 = (x62 + (x57 << 0x1));
+ { u64 x64 = (x63 + x57);
+ { u64 x65 = (x46 + (x56 << 0x4));
+ { u64 x66 = (x65 + (x56 << 0x1));
+ { u64 x67 = (x66 + x56);
+ { u64 x68 = (x45 + (x55 << 0x4));
+ { u64 x69 = (x68 + (x55 << 0x1));
+ { u64 x70 = (x69 + x55);
+ { u64 x71 = (x44 + (x54 << 0x4));
+ { u64 x72 = (x71 + (x54 << 0x1));
+ { u64 x73 = (x72 + x54);
+ { u64 x74 = (x43 + (x53 << 0x4));
+ { u64 x75 = (x74 + (x53 << 0x1));
+ { u64 x76 = (x75 + x53);
+ { u64 x77 = (x42 + (x52 << 0x4));
+ { u64 x78 = (x77 + (x52 << 0x1));
+ { u64 x79 = (x78 + x52);
+ { u64 x80 = (x41 + (x51 << 0x4));
+ { u64 x81 = (x80 + (x51 << 0x1));
+ { u64 x82 = (x81 + x51);
+ { u64 x83 = (x40 + (x50 << 0x4));
+ { u64 x84 = (x83 + (x50 << 0x1));
+ { u64 x85 = (x84 + x50);
+ { u64 x86 = (x85 >> 0x1a);
+ { u32 x87 = ((u32)x85 & 0x3ffffff);
+ { u64 x88 = (x86 + x82);
+ { u64 x89 = (x88 >> 0x19);
+ { u32 x90 = ((u32)x88 & 0x1ffffff);
+ { u64 x91 = (x89 + x79);
+ { u64 x92 = (x91 >> 0x1a);
+ { u32 x93 = ((u32)x91 & 0x3ffffff);
+ { u64 x94 = (x92 + x76);
+ { u64 x95 = (x94 >> 0x19);
+ { u32 x96 = ((u32)x94 & 0x1ffffff);
+ { u64 x97 = (x95 + x73);
+ { u64 x98 = (x97 >> 0x1a);
+ { u32 x99 = ((u32)x97 & 0x3ffffff);
+ { u64 x100 = (x98 + x70);
+ { u64 x101 = (x100 >> 0x19);
+ { u32 x102 = ((u32)x100 & 0x1ffffff);
+ { u64 x103 = (x101 + x67);
+ { u64 x104 = (x103 >> 0x1a);
+ { u32 x105 = ((u32)x103 & 0x3ffffff);
+ { u64 x106 = (x104 + x64);
+ { u64 x107 = (x106 >> 0x19);
+ { u32 x108 = ((u32)x106 & 0x1ffffff);
+ { u64 x109 = (x107 + x61);
+ { u64 x110 = (x109 >> 0x1a);
+ { u32 x111 = ((u32)x109 & 0x3ffffff);
+ { u64 x112 = (x110 + x49);
+ { u64 x113 = (x112 >> 0x19);
+ { u32 x114 = ((u32)x112 & 0x1ffffff);
+ { u64 x115 = (x87 + (0x13 * x113));
+ { u32 x116 = (u32) (x115 >> 0x1a);
+ { u32 x117 = ((u32)x115 & 0x3ffffff);
+ { u32 x118 = (x116 + x90);
+ { u32 x119 = (x118 >> 0x19);
+ { u32 x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93);
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
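+/* The suffixes name the operand kinds, output first: 't' is a tight
+ * (fully reduced) fe, 'l' a loose fe_loose with wider limb bounds.
+ */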
+static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static __always_inline void
+fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g)
+{
+ fe_mul_impl(h->v, f->v, g->v);
+}
+
+static noinline void fe_sqr_impl(u32 out[10], const u32 in1[10])
+{
+ { const u32 x17 = in1[9];
+ { const u32 x18 = in1[8];
+ { const u32 x16 = in1[7];
+ { const u32 x14 = in1[6];
+ { const u32 x12 = in1[5];
+ { const u32 x10 = in1[4];
+ { const u32 x8 = in1[3];
+ { const u32 x6 = in1[2];
+ { const u32 x4 = in1[1];
+ { const u32 x2 = in1[0];
+ { u64 x19 = ((u64)x2 * x2);
+ { u64 x20 = ((u64)(0x2 * x2) * x4);
+ { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6)));
+ { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8)));
+ { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10));
+ { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12)));
+ { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12)));
+ { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16)));
+ { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12))))));
+ { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17)));
+ { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17)))));
+ { u64 x30 = (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17)));
+ { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17))))));
+ { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17)));
+ { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17)));
+ { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17)));
+ { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17));
+ { u64 x36 = ((u64)(0x2 * x18) * x17);
+ { u64 x37 = ((u64)(0x2 * x17) * x17);
+ { u64 x38 = (x27 + (x37 << 0x4));
+ { u64 x39 = (x38 + (x37 << 0x1));
+ { u64 x40 = (x39 + x37);
+ { u64 x41 = (x26 + (x36 << 0x4));
+ { u64 x42 = (x41 + (x36 << 0x1));
+ { u64 x43 = (x42 + x36);
+ { u64 x44 = (x25 + (x35 << 0x4));
+ { u64 x45 = (x44 + (x35 << 0x1));
+ { u64 x46 = (x45 + x35);
+ { u64 x47 = (x24 + (x34 << 0x4));
+ { u64 x48 = (x47 + (x34 << 0x1));
+ { u64 x49 = (x48 + x34);
+ { u64 x50 = (x23 + (x33 << 0x4));
+ { u64 x51 = (x50 + (x33 << 0x1));
+ { u64 x52 = (x51 + x33);
+ { u64 x53 = (x22 + (x32 << 0x4));
+ { u64 x54 = (x53 + (x32 << 0x1));
+ { u64 x55 = (x54 + x32);
+ { u64 x56 = (x21 + (x31 << 0x4));
+ { u64 x57 = (x56 + (x31 << 0x1));
+ { u64 x58 = (x57 + x31);
+ { u64 x59 = (x20 + (x30 << 0x4));
+ { u64 x60 = (x59 + (x30 << 0x1));
+ { u64 x61 = (x60 + x30);
+ { u64 x62 = (x19 + (x29 << 0x4));
+ { u64 x63 = (x62 + (x29 << 0x1));
+ { u64 x64 = (x63 + x29);
+ { u64 x65 = (x64 >> 0x1a);
+ { u32 x66 = ((u32)x64 & 0x3ffffff);
+ { u64 x67 = (x65 + x61);
+ { u64 x68 = (x67 >> 0x19);
+ { u32 x69 = ((u32)x67 & 0x1ffffff);
+ { u64 x70 = (x68 + x58);
+ { u64 x71 = (x70 >> 0x1a);
+ { u32 x72 = ((u32)x70 & 0x3ffffff);
+ { u64 x73 = (x71 + x55);
+ { u64 x74 = (x73 >> 0x19);
+ { u32 x75 = ((u32)x73 & 0x1ffffff);
+ { u64 x76 = (x74 + x52);
+ { u64 x77 = (x76 >> 0x1a);
+ { u32 x78 = ((u32)x76 & 0x3ffffff);
+ { u64 x79 = (x77 + x49);
+ { u64 x80 = (x79 >> 0x19);
+ { u32 x81 = ((u32)x79 & 0x1ffffff);
+ { u64 x82 = (x80 + x46);
+ { u64 x83 = (x82 >> 0x1a);
+ { u32 x84 = ((u32)x82 & 0x3ffffff);
+ { u64 x85 = (x83 + x43);
+ { u64 x86 = (x85 >> 0x19);
+ { u32 x87 = ((u32)x85 & 0x1ffffff);
+ { u64 x88 = (x86 + x40);
+ { u64 x89 = (x88 >> 0x1a);
+ { u32 x90 = ((u32)x88 & 0x3ffffff);
+ { u64 x91 = (x89 + x28);
+ { u64 x92 = (x91 >> 0x19);
+ { u32 x93 = ((u32)x91 & 0x1ffffff);
+ { u64 x94 = (x66 + (0x13 * x92));
+ { u32 x95 = (u32) (x94 >> 0x1a);
+ { u32 x96 = ((u32)x94 & 0x3ffffff);
+ { u32 x97 = (x95 + x69);
+ { u32 x98 = (x97 >> 0x19);
+ { u32 x99 = (x97 & 0x1ffffff);
+ out[0] = x96;
+ out[1] = x99;
+ out[2] = (x98 + x72);
+ out[3] = x75;
+ out[4] = x78;
+ out[5] = x81;
+ out[6] = x84;
+ out[7] = x87;
+ out[8] = x90;
+ out[9] = x93;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_sq_tl(fe *h, const fe_loose *f)
+{
+ fe_sqr_impl(h->v, f->v);
+}
+
+static __always_inline void fe_sq_tt(fe *h, const fe *f)
+{
+ fe_sqr_impl(h->v, f->v);
+}
+
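+/* Inversion by Fermat's little theorem: out = z^(p-2) for p = 2^255 - 19,
+ * computed with the standard chain of 254 squarings and 11 multiplications.
+ */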
+static __always_inline void fe_loose_invert(fe *out, const fe_loose *z)
+{
+ fe t0;
+ fe t1;
+ fe t2;
+ fe t3;
+ int i;
+
+ fe_sq_tl(&t0, z);
+ fe_sq_tt(&t1, &t0);
+ for (i = 1; i < 2; ++i)
+ fe_sq_tt(&t1, &t1);
+ fe_mul_tlt(&t1, z, &t1);
+ fe_mul_ttt(&t0, &t0, &t1);
+ fe_sq_tt(&t2, &t0);
+ fe_mul_ttt(&t1, &t1, &t2);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 5; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 10; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t2, &t2, &t1);
+ fe_sq_tt(&t3, &t2);
+ for (i = 1; i < 20; ++i)
+ fe_sq_tt(&t3, &t3);
+ fe_mul_ttt(&t2, &t3, &t2);
+ fe_sq_tt(&t2, &t2);
+ for (i = 1; i < 10; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t2, &t1);
+ for (i = 1; i < 50; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t2, &t2, &t1);
+ fe_sq_tt(&t3, &t2);
+ for (i = 1; i < 100; ++i)
+ fe_sq_tt(&t3, &t3);
+ fe_mul_ttt(&t2, &t3, &t2);
+ fe_sq_tt(&t2, &t2);
+ for (i = 1; i < 50; ++i)
+ fe_sq_tt(&t2, &t2);
+ fe_mul_ttt(&t1, &t2, &t1);
+ fe_sq_tt(&t1, &t1);
+ for (i = 1; i < 5; ++i)
+ fe_sq_tt(&t1, &t1);
+ fe_mul_ttt(out, &t1, &t0);
+}
+
+static __always_inline void fe_invert(fe *out, const fe *z)
+{
+ fe_loose l;
+ fe_copy_lt(&l, z);
+ fe_loose_invert(out, &l);
+}
+
+/* Replace (f,g) with (g,f) if b == 1;
+ * replace (f,g) with (f,g) if b == 0.
+ *
+ * Preconditions: b in {0,1}
+ */
+static noinline void fe_cswap(fe *f, fe *g, unsigned int b)
+{
+ unsigned i;
+ b = 0 - b;
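+ /* b is now either 0 or the all-ones mask 0xffffffff. */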
+ for (i = 0; i < 10; i++) {
+ u32 x = f->v[i] ^ g->v[i];
+ x &= b;
+ f->v[i] ^= x;
+ g->v[i] ^= x;
+ }
+}
+
+/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/
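+/* 121666 = (A + 2) / 4 for the Curve25519 coefficient A = 486662. With
+ * all other in2 limbs zero, the compiler folds most products away.
+ */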
+static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10])
+{
+ { const u32 x20 = in1[9];
+ { const u32 x21 = in1[8];
+ { const u32 x19 = in1[7];
+ { const u32 x17 = in1[6];
+ { const u32 x15 = in1[5];
+ { const u32 x13 = in1[4];
+ { const u32 x11 = in1[3];
+ { const u32 x9 = in1[2];
+ { const u32 x7 = in1[1];
+ { const u32 x5 = in1[0];
+ { const u32 x38 = 0;
+ { const u32 x39 = 0;
+ { const u32 x37 = 0;
+ { const u32 x35 = 0;
+ { const u32 x33 = 0;
+ { const u32 x31 = 0;
+ { const u32 x29 = 0;
+ { const u32 x27 = 0;
+ { const u32 x25 = 0;
+ { const u32 x23 = 121666;
+ { u64 x40 = ((u64)x23 * x5);
+ { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5));
+ { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5));
+ { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5));
+ { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5));
+ { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5));
+ { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5));
+ { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5));
+ { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5));
+ { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5));
+ { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9));
+ { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9));
+ { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13));
+ { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13));
+ { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17));
+ { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17));
+ { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19))));
+ { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21));
+ { u64 x58 = ((u64)(0x2 * x38) * x20);
+ { u64 x59 = (x48 + (x58 << 0x4));
+ { u64 x60 = (x59 + (x58 << 0x1));
+ { u64 x61 = (x60 + x58);
+ { u64 x62 = (x47 + (x57 << 0x4));
+ { u64 x63 = (x62 + (x57 << 0x1));
+ { u64 x64 = (x63 + x57);
+ { u64 x65 = (x46 + (x56 << 0x4));
+ { u64 x66 = (x65 + (x56 << 0x1));
+ { u64 x67 = (x66 + x56);
+ { u64 x68 = (x45 + (x55 << 0x4));
+ { u64 x69 = (x68 + (x55 << 0x1));
+ { u64 x70 = (x69 + x55);
+ { u64 x71 = (x44 + (x54 << 0x4));
+ { u64 x72 = (x71 + (x54 << 0x1));
+ { u64 x73 = (x72 + x54);
+ { u64 x74 = (x43 + (x53 << 0x4));
+ { u64 x75 = (x74 + (x53 << 0x1));
+ { u64 x76 = (x75 + x53);
+ { u64 x77 = (x42 + (x52 << 0x4));
+ { u64 x78 = (x77 + (x52 << 0x1));
+ { u64 x79 = (x78 + x52);
+ { u64 x80 = (x41 + (x51 << 0x4));
+ { u64 x81 = (x80 + (x51 << 0x1));
+ { u64 x82 = (x81 + x51);
+ { u64 x83 = (x40 + (x50 << 0x4));
+ { u64 x84 = (x83 + (x50 << 0x1));
+ { u64 x85 = (x84 + x50);
+ { u64 x86 = (x85 >> 0x1a);
+ { u32 x87 = ((u32)x85 & 0x3ffffff);
+ { u64 x88 = (x86 + x82);
+ { u64 x89 = (x88 >> 0x19);
+ { u32 x90 = ((u32)x88 & 0x1ffffff);
+ { u64 x91 = (x89 + x79);
+ { u64 x92 = (x91 >> 0x1a);
+ { u32 x93 = ((u32)x91 & 0x3ffffff);
+ { u64 x94 = (x92 + x76);
+ { u64 x95 = (x94 >> 0x19);
+ { u32 x96 = ((u32)x94 & 0x1ffffff);
+ { u64 x97 = (x95 + x73);
+ { u64 x98 = (x97 >> 0x1a);
+ { u32 x99 = ((u32)x97 & 0x3ffffff);
+ { u64 x100 = (x98 + x70);
+ { u64 x101 = (x100 >> 0x19);
+ { u32 x102 = ((u32)x100 & 0x1ffffff);
+ { u64 x103 = (x101 + x67);
+ { u64 x104 = (x103 >> 0x1a);
+ { u32 x105 = ((u32)x103 & 0x3ffffff);
+ { u64 x106 = (x104 + x64);
+ { u64 x107 = (x106 >> 0x19);
+ { u32 x108 = ((u32)x106 & 0x1ffffff);
+ { u64 x109 = (x107 + x61);
+ { u64 x110 = (x109 >> 0x1a);
+ { u32 x111 = ((u32)x109 & 0x3ffffff);
+ { u64 x112 = (x110 + x49);
+ { u64 x113 = (x112 >> 0x19);
+ { u32 x114 = ((u32)x112 & 0x1ffffff);
+ { u64 x115 = (x87 + (0x13 * x113));
+ { u32 x116 = (u32) (x115 >> 0x1a);
+ { u32 x117 = ((u32)x115 & 0x3ffffff);
+ { u32 x118 = (x116 + x90);
+ { u32 x119 = (x118 >> 0x19);
+ { u32 x120 = (x118 & 0x1ffffff);
+ out[0] = x117;
+ out[1] = x120;
+ out[2] = (x119 + x93);
+ out[3] = x96;
+ out[4] = x99;
+ out[5] = x102;
+ out[6] = x105;
+ out[7] = x108;
+ out[8] = x111;
+ out[9] = x114;
+ }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}
+}
+
+static __always_inline void fe_mul121666(fe *h, const fe_loose *f)
+{
+ fe_mul_121666_impl(h->v, f->v);
+}
+
+void curve25519_generic(u8 out[CURVE25519_KEY_SIZE],
+ const u8 scalar[CURVE25519_KEY_SIZE],
+ const u8 point[CURVE25519_KEY_SIZE])
+{
+ fe x1, x2, z2, x3, z3;
+ fe_loose x2l, z2l, x3l;
+ unsigned swap = 0;
+ int pos;
+ u8 e[32];
+
+ memcpy(e, scalar, 32);
+ curve25519_clamp_secret(e);
+
+ /* The following implementation was transcribed to Coq and proven to
+ * correspond to unary scalar multiplication in affine coordinates given
+ * that x1 != 0 is the x coordinate of some point on the curve. It was
+ * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
+ * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
+ * quantified over the underlying field, so it applies to Curve25519
+ * itself and the quadratic twist of Curve25519. It was not proven in
+ * Coq that prime-field arithmetic correctly simulates extension-field
+ * arithmetic on prime-field values. The decoding of the byte array
+ * representation of e was not considered.
+ *
+ * The specification of Montgomery curves in affine coordinates, the
+ * proof that they form a group isomorphic to a Weierstrass curve, and
+ * the Coq transcription and correctness proof of the loop (where
+ * scalarbits=255) live in the fiat-crypto Coq development.
+ *
+ * preconditions: 0 <= e < 2^255 (not necessarily e < order),
+ * fe_invert(0) = 0
+ */
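+ /* X-only Montgomery ladder state: (x2, z2) starts at the point at
+ * infinity, projectively (1 : 0), and (x3, z3) at the input point
+ * (x1 : 1).
+ */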
+ fe_frombytes(&x1, point);
+ fe_1(&x2);
+ fe_0(&z2);
+ fe_copy(&x3, &x1);
+ fe_1(&z3);
+
+ for (pos = 254; pos >= 0; --pos) {
+ fe tmp0, tmp1;
+ fe_loose tmp0l, tmp1l;
+ /* loop invariant as of right before the test, for the case
+ * where x1 != 0:
+ * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
+ * is nonzero
+ * let r := e >> (pos+1) in the following equalities of
+ * projective points:
+ * to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
+ * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
+ * x1 is the nonzero x coordinate of the nonzero
+ * point (r*P-(r+1)*P)
+ */
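+ /* b is bit 'pos' of the clamped scalar e. The conditional swap is
+ * deferred: swapping by (previous swap ^ b) applies exactly the net
+ * swap needed when the current bit differs from the previous one.
+ */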
+ unsigned b = 1 & (e[pos / 8] >> (pos & 7));
+ swap ^= b;
+ fe_cswap(&x2, &x3, swap);
+ fe_cswap(&z2, &z3, swap);
+ swap = b;
+ /* Coq transcription of the ladderstep formula (called from the
+ * transcribed loop), with separate correctness proofs for the
+ * x1 != 0 and x1 = 0 cases.
+ */
+ fe_sub(&tmp0l, &x3, &z3);
+ fe_sub(&tmp1l, &x2, &z2);
+ fe_add(&x2l, &x2, &z2);
+ fe_add(&z2l, &x3, &z3);
+ fe_mul_tll(&z3, &tmp0l, &x2l);
+ fe_mul_tll(&z2, &z2l, &tmp1l);
+ fe_sq_tl(&tmp0, &tmp1l);
+ fe_sq_tl(&tmp1, &x2l);
+ fe_add(&x3l, &z3, &z2);
+ fe_sub(&z2l, &z3, &z2);
+ fe_mul_ttt(&x2, &tmp1, &tmp0);
+ fe_sub(&tmp1l, &tmp1, &tmp0);
+ fe_sq_tl(&z2, &z2l);
+ fe_mul121666(&z3, &tmp1l);
+ fe_sq_tl(&x3, &x3l);
+ fe_add(&tmp0l, &tmp0, &z3);
+ fe_mul_ttt(&z3, &x1, &z2);
+ fe_mul_tll(&z2, &tmp1l, &tmp0l);
+ }
+ /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
+ * else (x2, z2)
+ */
+ fe_cswap(&x2, &x3, swap);
+ fe_cswap(&z2, &z3, swap);
+
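+ /* Recover the affine coordinate x = X/Z = X * Z^(p-2). */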
+ fe_invert(&z2, &z2);
+ fe_mul_ttt(&x2, &x2, &z2);
+ fe_tobytes(out, &x2);
+
+ memzero_explicit(&x1, sizeof(x1));
+ memzero_explicit(&x2, sizeof(x2));
+ memzero_explicit(&z2, sizeof(z2));
+ memzero_explicit(&x3, sizeof(x3));
+ memzero_explicit(&z3, sizeof(z3));
+ memzero_explicit(&x2l, sizeof(x2l));
+ memzero_explicit(&z2l, sizeof(z2l));
+ memzero_explicit(&x3l, sizeof(x3l));
+ memzero_explicit(&e, sizeof(e));
+}
--- /dev/null
+++ b/lib/crypto/curve25519-hacl64.c
@@ -0,0 +1,788 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
+ * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is a machine-generated formally verified implementation of Curve25519
+ * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
+ * generated, it has been tweaked to be suitable for use in the kernel. It is
+ * optimized for 64-bit machines that can efficiently work with 128-bit
+ * integer types.
+ */
+
+#include <asm/unaligned.h>
+#include <crypto/curve25519.h>
+#include <linux/string.h>
+
+typedef __uint128_t u128;
+
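+/* Return an all-ones mask if a == b and zero otherwise, without branching. */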
+static __always_inline u64 u64_eq_mask(u64 a, u64 b)
+{
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ u64 c = xnx - (u64)1U;
+ return c;
+}
+
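+/* Return an all-ones mask if a >= b and zero otherwise, without branching. */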
+static __always_inline u64 u64_gte_mask(u64 a, u64 b)
+{
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ u64 c = x_xor_q_ - (u64)1U;
+ return c;
+}
+
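+/* Fold the bits of limb 4 above position 51 back into limb 0, scaled by
+ * 19, using 2^255 = 19 (mod 2^255 - 19).
+ */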
+static __always_inline void modulo_carry_top(u64 *b)
+{
+ u64 b4 = b[4];
+ u64 b0 = b[0];
+ u64 b4_ = b4 & 0x7ffffffffffffLLU;
+ u64 b0_ = b0 + 19 * (b4 >> 51);
+ b[4] = b4_;
+ b[0] = b0_;
+}
+
+static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
+{
+ {
+ u128 xi = input[0];
+ output[0] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[1];
+ output[1] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[2];
+ output[2] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[3];
+ output[3] = ((u64)(xi));
+ }
+ {
+ u128 xi = input[4];
+ output[4] = ((u64)(xi));
+ }
+}
+
+static __always_inline void
+fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
+{
+ output[0] += (u128)input[0] * s;
+ output[1] += (u128)input[1] * s;
+ output[2] += (u128)input[2] * s;
+ output[3] += (u128)input[3] * s;
+ output[4] += (u128)input[4] * s;
+}
+
+static __always_inline void fproduct_carry_wide_(u128 *tmp)
+{
+ {
+ u32 ctr = 0;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 1;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+
+ {
+ u32 ctr = 2;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 3;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+}
+
+static __always_inline void fmul_shift_reduce(u64 *output)
+{
+ u64 tmp = output[4];
+ u64 b0;
+ {
+ u32 ctr = 5 - 0 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 1 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 2 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ {
+ u32 ctr = 5 - 3 - 1;
+ u64 z = output[ctr - 1];
+ output[ctr] = z;
+ }
+ output[0] = tmp;
+ b0 = output[0];
+ output[0] = 19 * b0;
+}
+
+static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
+ u64 *input21)
+{
+ u32 i;
+ u64 input2i;
+ {
+ u64 input2i = input21[0];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[1];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[2];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ {
+ u64 input2i = input21[3];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+ fmul_shift_reduce(input);
+ }
+ i = 4;
+ input2i = input21[i];
+ fproduct_sum_scalar_multiplication_(output, input, input2i);
+}
+
+static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
+{
+ u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
+ {
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ u128 t[5] = { 0 };
+ fmul_mul_shift_reduce_(t, tmp, input21);
+ fproduct_carry_wide_(t);
+ b4 = t[4];
+ b0 = t[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ t[4] = b4_;
+ t[0] = b0_;
+ fproduct_copy_from_wide_(output, t);
+ i0 = output[0];
+ i1 = output[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ output[0] = i0_;
+ output[1] = i1_;
+ }
+}
+
+static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
+{
+ u64 r0 = output[0];
+ u64 r1 = output[1];
+ u64 r2 = output[2];
+ u64 r3 = output[3];
+ u64 r4 = output[4];
+ u64 d0 = r0 * 2;
+ u64 d1 = r1 * 2;
+ u64 d2 = r2 * 2 * 19;
+ u64 d419 = r4 * 19;
+ u64 d4 = d419 * 2;
+ u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
+ (((u128)(d2) * (r3))));
+ u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
+ (((u128)(r3 * 19) * (r3))));
+ u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
+ (((u128)(d4) * (r3))));
+ u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
+ (((u128)(r4) * (d419))));
+ u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
+ (((u128)(r2) * (r2))));
+ tmp[0] = s0;
+ tmp[1] = s1;
+ tmp[2] = s2;
+ tmp[3] = s3;
+ tmp[4] = s4;
+}
+
+static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
+{
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ fsquare_fsquare__(tmp, output);
+ fproduct_carry_wide_(tmp);
+ b4 = tmp[4];
+ b0 = tmp[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ tmp[4] = b4_;
+ tmp[0] = b0_;
+ fproduct_copy_from_wide_(output, tmp);
+ i0 = output[0];
+ i1 = output[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ output[0] = i0_;
+ output[1] = i1_;
+}
+
+static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
+ u32 count1)
+{
+ u32 i;
+ fsquare_fsquare_(tmp, output);
+ for (i = 1; i < count1; ++i)
+ fsquare_fsquare_(tmp, output);
+}
+
+static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
+ u32 count1)
+{
+ u128 t[5];
+ memcpy(output, input, 5 * sizeof(*input));
+ fsquare_fsquare_times_(output, t, count1);
+}
+
+static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
+ u32 count1)
+{
+ u128 t[5];
+ fsquare_fsquare_times_(output, t, count1);
+}
+
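+/* Field inversion out = z^(p-2) = z^(2^255 - 21), via squarings and
+ * multiplications over a scratch buffer of four 5-limb elements.
+ */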
+static __always_inline void crecip_crecip(u64 *out, u64 *z)
+{
+ u64 buf[20] = { 0 };
+ u64 *a0 = buf;
+ u64 *t00 = buf + 5;
+ u64 *b0 = buf + 10;
+ u64 *t01;
+ u64 *b1;
+ u64 *c0;
+ u64 *a;
+ u64 *t0;
+ u64 *b;
+ u64 *c;
+ fsquare_fsquare_times(a0, z, 1);
+ fsquare_fsquare_times(t00, a0, 2);
+ fmul_fmul(b0, t00, z);
+ fmul_fmul(a0, b0, a0);
+ fsquare_fsquare_times(t00, a0, 1);
+ fmul_fmul(b0, t00, b0);
+ fsquare_fsquare_times(t00, b0, 5);
+ t01 = buf + 5;
+ b1 = buf + 10;
+ c0 = buf + 15;
+ fmul_fmul(b1, t01, b1);
+ fsquare_fsquare_times(t01, b1, 10);
+ fmul_fmul(c0, t01, b1);
+ fsquare_fsquare_times(t01, c0, 20);
+ fmul_fmul(t01, t01, c0);
+ fsquare_fsquare_times_inplace(t01, 10);
+ fmul_fmul(b1, t01, b1);
+ fsquare_fsquare_times(t01, b1, 50);
+ a = buf;
+ t0 = buf + 5;
+ b = buf + 10;
+ c = buf + 15;
+ fmul_fmul(c, t0, b);
+ fsquare_fsquare_times(t0, c, 100);
+ fmul_fmul(t0, t0, c);
+ fsquare_fsquare_times_inplace(t0, 50);
+ fmul_fmul(t0, t0, b);
+ fsquare_fsquare_times_inplace(t0, 5);
+ fmul_fmul(out, t0, a);
+}
+
+static __always_inline void fsum(u64 *a, u64 *b)
+{
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ a[4] += b[4];
+}
+
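+/* a := b - a. Each limb of b is first offset by the matching limb of 8*p
+ * so the per-limb subtractions cannot underflow; the result is congruent
+ * to b - a mod p.
+ */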
+static __always_inline void fdifference(u64 *a, u64 *b)
+{
+ u64 tmp[5] = { 0 };
+ u64 b0;
+ u64 b1;
+ u64 b2;
+ u64 b3;
+ u64 b4;
+ memcpy(tmp, b, 5 * sizeof(*b));
+ b0 = tmp[0];
+ b1 = tmp[1];
+ b2 = tmp[2];
+ b3 = tmp[3];
+ b4 = tmp[4];
+ tmp[0] = b0 + 0x3fffffffffff68LLU;
+ tmp[1] = b1 + 0x3ffffffffffff8LLU;
+ tmp[2] = b2 + 0x3ffffffffffff8LLU;
+ tmp[3] = b3 + 0x3ffffffffffff8LLU;
+ tmp[4] = b4 + 0x3ffffffffffff8LLU;
+ {
+ u64 xi = a[0];
+ u64 yi = tmp[0];
+ a[0] = yi - xi;
+ }
+ {
+ u64 xi = a[1];
+ u64 yi = tmp[1];
+ a[1] = yi - xi;
+ }
+ {
+ u64 xi = a[2];
+ u64 yi = tmp[2];
+ a[2] = yi - xi;
+ }
+ {
+ u64 xi = a[3];
+ u64 yi = tmp[3];
+ a[3] = yi - xi;
+ }
+ {
+ u64 xi = a[4];
+ u64 yi = tmp[4];
+ a[4] = yi - xi;
+ }
+}
+
+static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
+{
+ u128 tmp[5];
+ u128 b4;
+ u128 b0;
+ u128 b4_;
+ u128 b0_;
+ {
+ u64 xi = b[0];
+ tmp[0] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[1];
+ tmp[1] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[2];
+ tmp[2] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[3];
+ tmp[3] = ((u128)(xi) * (s));
+ }
+ {
+ u64 xi = b[4];
+ tmp[4] = ((u128)(xi) * (s));
+ }
+ fproduct_carry_wide_(tmp);
+ b4 = tmp[4];
+ b0 = tmp[0];
+ b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
+ b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
+ tmp[4] = b4_;
+ tmp[0] = b0_;
+ fproduct_copy_from_wide_(output, tmp);
+}
+
+static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
+{
+ fmul_fmul(output, a, b);
+}
+
+static __always_inline void crecip(u64 *output, u64 *input)
+{
+ crecip_crecip(output, input);
+}
+
+static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
+ u64 swap1, u32 ctr)
+{
+ u32 i = ctr - 1;
+ u64 ai = a[i];
+ u64 bi = b[i];
+ u64 x = swap1 & (ai ^ bi);
+ u64 ai1 = ai ^ x;
+ u64 bi1 = bi ^ x;
+ a[i] = ai1;
+ b[i] = bi1;
+}
+
+static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
+{
+ point_swap_conditional_step(a, b, swap1, 5);
+ point_swap_conditional_step(a, b, swap1, 4);
+ point_swap_conditional_step(a, b, swap1, 3);
+ point_swap_conditional_step(a, b, swap1, 2);
+ point_swap_conditional_step(a, b, swap1, 1);
+}
+
+static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
+{
+ u64 swap1 = 0 - iswap;
+ point_swap_conditional5(a, b, swap1);
+ point_swap_conditional5(a + 5, b + 5, swap1);
+}
+
+static __always_inline void point_copy(u64 *output, u64 *input)
+{
+ memcpy(output, input, 5 * sizeof(*input));
+ memcpy(output + 5, input + 5, 5 * sizeof(*input));
+}
+
+static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
+ u64 *pq, u64 *qmqp)
+{
+ u64 *qx = qmqp;
+ u64 *x2 = pp;
+ u64 *z2 = pp + 5;
+ u64 *x3 = ppq;
+ u64 *z3 = ppq + 5;
+ u64 *x = p;
+ u64 *z = p + 5;
+ u64 *xprime = pq;
+ u64 *zprime = pq + 5;
+ u64 buf[40] = { 0 };
+ u64 *origx = buf;
+ u64 *origxprime0 = buf + 5;
+ u64 *xxprime0;
+ u64 *zzprime0;
+ u64 *origxprime;
+ xxprime0 = buf + 25;
+ zzprime0 = buf + 30;
+ memcpy(origx, x, 5 * sizeof(*x));
+ fsum(x, z);
+ fdifference(z, origx);
+ memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
+ fsum(xprime, zprime);
+ fdifference(zprime, origxprime0);
+ fmul(xxprime0, xprime, z);
+ fmul(zzprime0, x, zprime);
+ origxprime = buf + 5;
+ {
+ u64 *xx0;
+ u64 *zz0;
+ u64 *xxprime;
+ u64 *zzprime;
+ u64 *zzzprime;
+ xx0 = buf + 15;
+ zz0 = buf + 20;
+ xxprime = buf + 25;
+ zzprime = buf + 30;
+ zzzprime = buf + 35;
+ memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
+ fsum(xxprime, zzprime);
+ fdifference(zzprime, origxprime);
+ fsquare_fsquare_times(x3, xxprime, 1);
+ fsquare_fsquare_times(zzzprime, zzprime, 1);
+ fmul(z3, zzzprime, qx);
+ fsquare_fsquare_times(xx0, x, 1);
+ fsquare_fsquare_times(zz0, z, 1);
+ {
+ u64 *zzz;
+ u64 *xx;
+ u64 *zz;
+ u64 scalar;
+ zzz = buf + 10;
+ xx = buf + 15;
+ zz = buf + 20;
+ fmul(x2, xx, zz);
+ fdifference(zz, xx);
+ scalar = 121665;
+ fscalar(zzz, zz, scalar);
+ fsum(zzz, xx);
+ fmul(z2, zzz, zz);
+ }
+ }
+}
+
+static __always_inline void
+ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
+ u64 *q, u8 byt)
+{
+ u64 bit0 = (u64)(byt >> 7);
+ u64 bit;
+ point_swap_conditional(nq, nqpq, bit0);
+ addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
+ bit = (u64)(byt >> 7);
+ point_swap_conditional(nq2, nqpq2, bit);
+}
+
+static __always_inline void
+ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
+ u64 *nqpq2, u64 *q, u8 byt)
+{
+ u8 byt1;
+ ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
+ byt1 = byt << 1;
+ ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
+}
+
+static __always_inline void
+ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
+ u64 *q, u8 byt, u32 i)
+{
+ while (i--) {
+ ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
+ nqpq2, q, byt);
+ byt <<= 2;
+ }
+}
+
+static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
+ u64 *nqpq, u64 *nq2,
+ u64 *nqpq2, u64 *q,
+ u32 i)
+{
+ while (i--) {
+ u8 byte = n1[i];
+ ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
+ byte, 4);
+ }
+}
+
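+/* Montgomery ladder: nq = (X : Z) starts at the point at infinity (1 : 0)
+ * and nqpq at the input point q. Scalar bytes are consumed most
+ * significant first, four double-steps (two bits each) per byte.
+ */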
+static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
+{
+ u64 point_buf[40] = { 0 };
+ u64 *nq = point_buf;
+ u64 *nqpq = point_buf + 10;
+ u64 *nq2 = point_buf + 20;
+ u64 *nqpq2 = point_buf + 30;
+ point_copy(nqpq, q);
+ nq[0] = 1;
+ ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
+ point_copy(result, nq);
+}
+
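+/* Expand 32 little-endian bytes into five 51-bit limbs, using overlapping
+ * unaligned 64-bit loads at byte offsets 0, 6, 12, 19 and 24.
+ */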
+static __always_inline void format_fexpand(u64 *output, const u8 *input)
+{
+ const u8 *x00 = input + 6;
+ const u8 *x01 = input + 12;
+ const u8 *x02 = input + 19;
+ const u8 *x0 = input + 24;
+ u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
+ i0 = get_unaligned_le64(input);
+ i1 = get_unaligned_le64(x00);
+ i2 = get_unaligned_le64(x01);
+ i3 = get_unaligned_le64(x02);
+ i4 = get_unaligned_le64(x0);
+ output0 = i0 & 0x7ffffffffffffLLU;
+ output1 = i1 >> 3 & 0x7ffffffffffffLLU;
+ output2 = i2 >> 6 & 0x7ffffffffffffLLU;
+ output3 = i3 >> 1 & 0x7ffffffffffffLLU;
+ output4 = i4 >> 12 & 0x7ffffffffffffLLU;
+ output[0] = output0;
+ output[1] = output1;
+ output[2] = output2;
+ output[3] = output3;
+ output[4] = output4;
+}
+
+static __always_inline void format_fcontract_first_carry_pass(u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 t1_ = t1 + (t0 >> 51);
+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
+ u64 t2_ = t2 + (t1_ >> 51);
+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
+ u64 t3_ = t3 + (t2_ >> 51);
+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
+ u64 t4_ = t4 + (t3_ >> 51);
+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
+ input[0] = t0_;
+ input[1] = t1__;
+ input[2] = t2__;
+ input[3] = t3__;
+ input[4] = t4_;
+}
+
+static __always_inline void format_fcontract_first_carry_full(u64 *input)
+{
+ format_fcontract_first_carry_pass(input);
+ modulo_carry_top(input);
+}
+
+static __always_inline void format_fcontract_second_carry_pass(u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 t1_ = t1 + (t0 >> 51);
+ u64 t0_ = t0 & 0x7ffffffffffffLLU;
+ u64 t2_ = t2 + (t1_ >> 51);
+ u64 t1__ = t1_ & 0x7ffffffffffffLLU;
+ u64 t3_ = t3 + (t2_ >> 51);
+ u64 t2__ = t2_ & 0x7ffffffffffffLLU;
+ u64 t4_ = t4 + (t3_ >> 51);
+ u64 t3__ = t3_ & 0x7ffffffffffffLLU;
+ input[0] = t0_;
+ input[1] = t1__;
+ input[2] = t2__;
+ input[3] = t3__;
+ input[4] = t4_;
+}
+
+static __always_inline void format_fcontract_second_carry_full(u64 *input)
+{
+ u64 i0;
+ u64 i1;
+ u64 i0_;
+ u64 i1_;
+ format_fcontract_second_carry_pass(input);
+ modulo_carry_top(input);
+ i0 = input[0];
+ i1 = input[1];
+ i0_ = i0 & 0x7ffffffffffffLLU;
+ i1_ = i1 + (i0 >> 51);
+ input[0] = i0_;
+ input[1] = i1_;
+}
+
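+/* Final reduction: once the limbs are carried, subtract p = 2^255 - 19 in
+ * constant time iff the value is still >= p.
+ */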
+static __always_inline void format_fcontract_trim(u64 *input)
+{
+ u64 a0 = input[0];
+ u64 a1 = input[1];
+ u64 a2 = input[2];
+ u64 a3 = input[3];
+ u64 a4 = input[4];
+ u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
+ u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
+ u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
+ u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
+ u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
+ u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
+ u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
+ u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
+ u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
+ u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
+ u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
+ input[0] = a0_;
+ input[1] = a1_;
+ input[2] = a2_;
+ input[3] = a3_;
+ input[4] = a4_;
+}
+
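+/* Pack five 51-bit limbs into four 64-bit words, stored little-endian. */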
+static __always_inline void format_fcontract_store(u8 *output, u64 *input)
+{
+ u64 t0 = input[0];
+ u64 t1 = input[1];
+ u64 t2 = input[2];
+ u64 t3 = input[3];
+ u64 t4 = input[4];
+ u64 o0 = t1 << 51 | t0;
+ u64 o1 = t2 << 38 | t1 >> 13;
+ u64 o2 = t3 << 25 | t2 >> 26;
+ u64 o3 = t4 << 12 | t3 >> 39;
+ u8 *b0 = output;
+ u8 *b1 = output + 8;
+ u8 *b2 = output + 16;
+ u8 *b3 = output + 24;
+ put_unaligned_le64(o0, b0);
+ put_unaligned_le64(o1, b1);
+ put_unaligned_le64(o2, b2);
+ put_unaligned_le64(o3, b3);
+}
+
+static __always_inline void format_fcontract(u8 *output, u64 *input)
+{
+ format_fcontract_first_carry_full(input);
+ format_fcontract_second_carry_full(input);
+ format_fcontract_trim(input);
+ format_fcontract_store(output, input);
+}
+
+static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
+{
+ u64 *x = point;
+ u64 *z = point + 5;
+ u64 buf[10] __aligned(32) = { 0 };
+ u64 *zmone = buf;
+ u64 *sc = buf + 5;
+ crecip(zmone, z);
+ fmul(sc, x, zmone);
+ format_fcontract(scalar, sc);
+}
+
+void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
+ const u8 secret[CURVE25519_KEY_SIZE],
+ const u8 basepoint[CURVE25519_KEY_SIZE])
+{
+ u64 buf0[10] __aligned(32) = { 0 };
+ u64 *x0 = buf0;
+ u64 *z = buf0 + 5;
+ u64 *q;
+ format_fexpand(x0, basepoint);
+ z[0] = 1;
+ q = buf0;
+ {
+ u8 e[32] __aligned(32) = { 0 };
+ u8 *scalar;
+ memcpy(e, secret, 32);
+ curve25519_clamp_secret(e);
+ scalar = e;
+ {
+ u64 buf[15] = { 0 };
+ u64 *nq = buf;
+ u64 *x = nq;
+ x[0] = 1;
+ ladder_cmult(nq, scalar, q);
+ format_scalar_of_point(mypublic, nq);
+ memzero_explicit(buf, sizeof(buf));
+ }
+ memzero_explicit(e, sizeof(e));
+ }
+ memzero_explicit(buf0, sizeof(buf0));
+}
--- /dev/null
+++ b/lib/crypto/curve25519.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This is an implementation of the Curve25519 ECDH algorithm, using either
+ * a 32-bit implementation or a 64-bit implementation with 128-bit integers,
+ * depending on what is supported by the target compiler.
+ *
+ * Information: https://cr.yp.to/ecdh.html
+ */
+
+#include <crypto/curve25519.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+bool curve25519_selftest(void);
+
+static int __init mod_init(void)
+{
+ if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS) &&
+ WARN_ON(!curve25519_selftest()))
+ return -ENODEV;
+ return 0;
+}
+
+static void __exit mod_exit(void)
+{
+}
+
+module_init(mod_init);
+module_exit(mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Curve25519 scalar multiplication");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
--- /dev/null
+++ b/crypto/curve25519-generic.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/curve25519.h>
+#include <crypto/internal/kpp.h>
+#include <crypto/kpp.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+
+static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
+ unsigned int len)
+{
+ u8 *secret = kpp_tfm_ctx(tfm);
+
+ if (!len)
+ curve25519_generate_secret(secret);
+ else if (len == CURVE25519_KEY_SIZE &&
+ crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
+ memcpy(secret, buf, CURVE25519_KEY_SIZE);
+ else
+ return -EINVAL;
+ return 0;
+}
+
+static int curve25519_compute_value(struct kpp_request *req)
+{
+ struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
+ const u8 *secret = kpp_tfm_ctx(tfm);
+ u8 public_key[CURVE25519_KEY_SIZE];
+ u8 buf[CURVE25519_KEY_SIZE];
+ int copied, nbytes;
+ u8 const *bp;
+
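+ /* A request with src set computes the shared secret from the peer's
+ * public key; without src, the kpp generate_public_key path derives
+ * our public key from the curve's base point.
+ */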
+ if (req->src) {
+ copied = sg_copy_to_buffer(req->src,
+ sg_nents_for_len(req->src,
+ CURVE25519_KEY_SIZE),
+ public_key, CURVE25519_KEY_SIZE);
+ if (copied != CURVE25519_KEY_SIZE)
+ return -EINVAL;
+ bp = public_key;
+ } else {
+ bp = curve25519_base_point;
+ }
+
+ curve25519_generic(buf, secret, bp);
+
+ /* might want less than we've got */
+ nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
+ copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
+ nbytes),
+ buf, nbytes);
+ if (copied != nbytes)
+ return -EINVAL;
+ return 0;
+}
+
+static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
+{
+ return CURVE25519_KEY_SIZE;
+}
+
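+/* Priority 100 marks the generic implementation; accelerated drivers can
+ * register the same cra_name at a higher priority.
+ */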
+static struct kpp_alg curve25519_alg = {
+ .base.cra_name = "curve25519",
+ .base.cra_driver_name = "curve25519-generic",
+ .base.cra_priority = 100,
+ .base.cra_module = THIS_MODULE,
+ .base.cra_ctxsize = CURVE25519_KEY_SIZE,
+
+ .set_secret = curve25519_set_secret,
+ .generate_public_key = curve25519_compute_value,
+ .compute_shared_secret = curve25519_compute_value,
+ .max_size = curve25519_max_size,
+};
+
+static int curve25519_init(void)
+{
+ return crypto_register_kpp(&curve25519_alg);
+}
+
+static void curve25519_exit(void)
+{
+ crypto_unregister_kpp(&curve25519_alg);
+}
+
+subsys_initcall(curve25519_init);
+module_exit(curve25519_exit);
+
+MODULE_ALIAS_CRYPTO("curve25519");
+MODULE_ALIAS_CRYPTO("curve25519-generic");
+MODULE_LICENSE("GPL");
--- /dev/null
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -0,0 +1,1512 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>