diff options
| author | Eric Biggers <ebiggers@kernel.org> | 2025-06-30 09:03:18 -0700 |
|---|---|---|
| committer | Eric Biggers <ebiggers@kernel.org> | 2025-06-30 09:26:20 -0700 |
| commit | 484c18119f4fbc6bca7c41e64b6fa84133e1057b (patch) | |
| tree | 346e22a5915b62021b8fbccb090193cfcff848a3 /lib | |
| parent | lib/crypto: sparc/sha512: Migrate optimized SHA-512 code to library (diff) | |
| download | linux-484c18119f4fbc6bca7c41e64b6fa84133e1057b.tar.gz linux-484c18119f4fbc6bca7c41e64b6fa84133e1057b.zip | |
lib/crypto: x86/sha512: Migrate optimized SHA-512 code to library
Instead of exposing the x86-optimized SHA-512 code via x86-specific
crypto_shash algorithms, instead just implement the sha512_blocks()
library function. This is much simpler, it makes the SHA-512 (and
SHA-384) library functions be x86-optimized, and it fixes the
longstanding issue where the x86-optimized SHA-512 code was disabled by
default. SHA-512 still remains available through crypto_shash, but
individual architectures no longer need to handle it.
To match sha512_blocks(), change the type of the nblocks parameter of
the assembly functions from int to size_t. The assembly functions
actually already treated it as size_t.
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-15-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/crypto/Kconfig | 1 | ||||
| -rw-r--r-- | lib/crypto/Makefile | 3 | ||||
| -rw-r--r-- | lib/crypto/x86/sha512-avx-asm.S | 424 | ||||
| -rw-r--r-- | lib/crypto/x86/sha512-avx2-asm.S | 751 | ||||
| -rw-r--r-- | lib/crypto/x86/sha512-ssse3-asm.S | 423 | ||||
| -rw-r--r-- | lib/crypto/x86/sha512.h | 54 |
6 files changed, 1656 insertions, 0 deletions
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index c3edb78484f0..dce127a69f13 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -183,6 +183,7 @@ config CRYPTO_LIB_SHA512_ARCH default y if RISCV && 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO default y if S390 default y if SPARC64 + default y if X86_64 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 0fb93f2469b3..aaf445a85384 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -95,6 +95,9 @@ endif libsha512-$(CONFIG_RISCV) += riscv/sha512-riscv64-zvknhb-zvkb.o libsha512-$(CONFIG_SPARC) += sparc/sha512_asm.o +libsha512-$(CONFIG_X86) += x86/sha512-ssse3-asm.o \ + x86/sha512-avx-asm.o \ + x86/sha512-avx2-asm.o endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_MPILIB) += mpi/ diff --git a/lib/crypto/x86/sha512-avx-asm.S b/lib/crypto/x86/sha512-avx-asm.S new file mode 100644 index 000000000000..0b5f69179d62 --- /dev/null +++ b/lib/crypto/x86/sha512-avx-asm.S @@ -0,0 +1,424 @@ +######################################################################## +# Implement fast SHA-512 with AVX instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +# Message Schedule +W_SIZE = 80*8 +# W[t] + K[t] | W[t+1] + K[t+1] +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro RORQ p1 p2 + # shld is faster than ror on Sandybridge + shld $(64-\p2), \p1, \p1 +.endm + +.macro SHA512_Round rnd + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + RORQ tmp0, 23 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + RORQ tmp0, 4 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + RORQ tmp0, 14 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + RORQ tmp0, 5 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + RORQ tmp0, 6 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + RORQ tmp0, 28 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_avx rnd + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + + idx = \rnd - 2 + vmovdqa W_t(idx), %xmm4 # XMM4 = W[t-2] + idx = \rnd - 15 + vmovdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov f_64, T1 + vpsrlq $61, %xmm4, %xmm0 # XMM0 = W[t-2]>>61 + mov e_64, tmp0 + vpsrlq $1, %xmm5, %xmm6 # XMM6 = W[t-15]>>1 + xor g_64, T1 + RORQ tmp0, 23 # 41 + vpsrlq $19, %xmm4, %xmm1 # XMM1 = W[t-2]>>19 + and e_64, T1 + xor e_64, tmp0 + vpxor %xmm1, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1# + vpsrlq $8, %xmm5, %xmm7 # XMM7 = W[t-15]>>8 + RORQ tmp0, 4 # 18 + vpsrlq $6, %xmm4, %xmm2 # XMM2 = W[t-2]>>6 + xor e_64, tmp0 + mov a_64, T2 + add h_64, T1 + vpxor %xmm7, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 + RORQ tmp0, 14 # 14 + add tmp0, T1 + vpsrlq $7, %xmm5, %xmm8 # XMM8 = W[t-15]>>7 + mov a_64, tmp0 + xor c_64, T2 + vpsllq $(64-61), %xmm4, %xmm3 # XMM3 = W[t-2]<<3 + and c_64, tmp0 + and b_64, T2 + vpxor %xmm3, %xmm2, %xmm2 # XMM2 = W[t-2]>>6 ^ W[t-2]<<3 + xor tmp0, T2 + mov a_64, tmp0 + vpsllq $(64-1), %xmm5, %xmm9 # XMM9 = W[t-15]<<63 + RORQ tmp0, 5 # 39 + vpxor %xmm9, %xmm8, %xmm8 # XMM8 = W[t-15]>>7 ^ W[t-15]<<63 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + vpxor %xmm8, %xmm6, %xmm6 # XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^ + # W[t-15]>>7 ^ W[t-15]<<63 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + vpsllq $(64-19), %xmm4, %xmm4 # XMM4 = W[t-2]<<25 + add tmp0, h_64 + RotateState + vpxor %xmm4, %xmm0, %xmm0 # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^ + # W[t-2]<<25 + mov f_64, T1 + vpxor %xmm2, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + mov e_64, tmp0 + xor g_64, T1 + idx = \rnd - 16 + vpaddq W_t(idx), %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + idx = \rnd - 7 + vmovdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + RORQ tmp0, 23 # 41 + and e_64, T1 + xor e_64, tmp0 + xor g_64, T1 + vpsllq $(64-8), %xmm5, %xmm5 # XMM5 = W[t-15]<<56 + idx = \rnd + 1 + add WK_2(idx), T1 + vpxor %xmm5, %xmm6, %xmm6 # XMM6 = s0(W[t-15]) + RORQ tmp0, 4 # 18 + vpaddq %xmm6, %xmm0, %xmm0 # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15]) + xor e_64, tmp0 + vpaddq %xmm1, %xmm0, %xmm0 # XMM0 = W[t] = s1(W[t-2]) + W[t-7] + + # s0(W[t-15]) + W[t-16] + mov a_64, T2 + add h_64, T1 + RORQ tmp0, 14 # 14 + add tmp0, T1 + idx = \rnd + vmovdqa %xmm0, W_t(idx) # Store W[t] + vpaddq K_t(idx), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + mov a_64, tmp0 + xor c_64, T2 + and c_64, tmp0 + and b_64, T2 + xor tmp0, T2 + mov a_64, tmp0 + RORQ tmp0, 5 # 39 + xor a_64, tmp0 + add T1, d_64 + RORQ tmp0, 6 # 34 + xor a_64, tmp0 + lea (T1, T2), h_64 + RORQ tmp0, 28 # 28 + add tmp0, h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_avx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_avx) + + test msglen, msglen + je .Lnowork + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + + # Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + vmovdqa XMM_QWORD_BSWAP(%rip), %xmm1 + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + vmovdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + vmovdqu MSG(t), %xmm0 + vpshufb %xmm1, %xmm0, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + vmovdqa %xmm0, W_t(t) # Store Scheduled Pair + vpaddq K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + vmovdqa %xmm0, WK_2(t)# Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_avx t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + +.Lnowork: + RET +SYM_FUNC_END(sha512_transform_avx) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512-avx2-asm.S b/lib/crypto/x86/sha512-avx2-asm.S new file mode 100644 index 000000000000..2309c01e316b --- /dev/null +++ b/lib/crypto/x86/sha512-avx2-asm.S @@ -0,0 +1,751 @@ +######################################################################## +# Implement fast SHA-512 with AVX2 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## +# This code schedules 1 blocks at a time, with 4 lanes per block +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +Y_0 = %ymm4 +Y_1 = %ymm5 +Y_2 = %ymm6 +Y_3 = %ymm7 + +YTMP0 = %ymm0 +YTMP1 = %ymm1 +YTMP2 = %ymm2 +YTMP3 = %ymm3 +YTMP4 = %ymm8 +XFER = YTMP0 + +BYTE_FLIP_MASK = %ymm9 + +# 1st arg is %rdi, which is saved to the stack and accessed later via %r12 +CTX1 = %rdi +CTX2 = %r12 +# 2nd arg +INP = %rsi +# 3rd arg +NUM_BLKS = %rdx + +c = %rcx +d = %r8 +e = %rdx +y3 = %rsi + +TBL = %rdi # clobbers CTX1 + +a = %rax +b = %rbx + +f = %r9 +g = %r10 +h = %r11 +old_h = %r11 + +T1 = %r12 # clobbers CTX2 +y0 = %r13 +y1 = %r14 +y2 = %r15 + +# Local variables (stack frame) +XFER_SIZE = 4*8 +SRND_SIZE = 1*8 +INP_SIZE = 1*8 +INPEND_SIZE = 1*8 +CTX_SIZE = 1*8 + +frame_XFER = 0 +frame_SRND = frame_XFER + XFER_SIZE +frame_INP = frame_SRND + SRND_SIZE +frame_INPEND = frame_INP + INP_SIZE +frame_CTX = frame_INPEND + INPEND_SIZE +frame_size = frame_CTX + CTX_SIZE + +## assume buffers not aligned +#define VMOVDQ vmovdqu + +# addm [mem], reg +# Add reg to mem using reg-mem add and store +.macro addm p1 p2 + add \p1, \p2 + mov \p2, \p1 +.endm + + +# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask +# Load ymm with mem and byte swap each dword +.macro COPY_YMM_AND_BSWAP p1 p2 p3 + VMOVDQ \p2, \p1 + vpshufb \p3, \p1, \p1 +.endm +# rotate_Ys +# Rotate values of symbols Y0...Y3 +.macro rotate_Ys + Y_ = Y_0 + Y_0 = Y_1 + Y_1 = Y_2 + Y_2 = Y_3 + Y_3 = Y_ +.endm + +# RotateState +.macro RotateState + # Rotate symbols a..h right + old_h = h + TMP_ = h + h = g + g = f + f = e + e = d + d = c + c = b + b = a + a = TMP_ +.endm + +# macro MY_VPALIGNR YDST, YSRC1, YSRC2, RVAL +# YDST = {YSRC1, YSRC2} >> RVAL*8 +.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL + vperm2f128 $0x3, \YSRC2, \YSRC1, \YDST # YDST = {YS1_LO, YS2_HI} + vpalignr $\RVAL, \YSRC2, \YDST, \YDST # YDST = {YDS1, YS2} >> RVAL*8 +.endm + +.macro FOUR_ROUNDS_AND_SCHED +################################### RND N + 0 ######################################### + + # Extract w[t-7] + MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7] + # Calculate w[t-16] + w[t-7] + vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16] + # Extract w[t-15] + MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15] + + # Calculate sigma0 + + # Calculate w[t-15] ror 1 + vpsrlq $1, YTMP1, YTMP2 + vpsllq $(64-1), YTMP1, YTMP3 + vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 + # Calculate w[t-15] shr 7 + vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7 + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add frame_XFER(%rsp),h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + rorx $14, e, y1 # y1 = (e >> 14) # S1 + + and e, y2 # y2 = (f^g)&e # CH + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + + add y0, y2 # y2 = S1 + CH # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 1 ######################################### + + # Calculate w[t-15] ror 8 + vpsrlq $8, YTMP1, YTMP2 + vpsllq $(64-8), YTMP1, YTMP1 + vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8 + # XOR the three components + vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7 + vpxor YTMP1, YTMP3, YTMP1 # YTMP1 = s0 + + + # Add three components, w[t-16], w[t-7] and sigma0 + vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 + # Move to appropriate lanes for calculating w[16] and w[17] + vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA} + # Move to appropriate lanes for calculating w[18] and w[19] + vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00} + + # Calculate w[16] and w[17] in both 128 bit lanes + + # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes + vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA} + vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA} + + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 1*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + + and b, y3 # y3 = (a|c)&b # MAJA + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + +################################### RND N + 2 ######################################### + + vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA} + vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA} + vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA} + vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA} + + # Add sigma1 to the other compunents to get w[16] and w[17] + vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]} + + # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane + vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + add 2*8+frame_XFER(%rsp), h # h = k + w + h # -- + + rorx $18, e, y1 # y1 = e >> 18 # S1B + or c, y3 # y3 = a|c # MAJA + mov f, y2 # y2 = f # CH + xor g, y2 # y2 = f^g # CH + + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + and e, y2 # y2 = (f^g)&e # CH + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $39, a, y1 # y1 = a >> 39 # S0A + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +################################### RND N + 3 ######################################### + + vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--} + vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--} + vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--} + vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--} + vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--} + vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^ + # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--} + + # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19] + # to newly calculated sigma1 to get w[18] and w[19] + vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --} + + # Form w[19, w[18], w17], w[16] + vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]} + + mov a, y3 # y3 = a # MAJA + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + add 3*8+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + + mov f, y2 # y2 = f # CH + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + xor g, y2 # y2 = f^g # CH + + + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add h, d # d = k + w + h + d # -- + and b, y3 # y3 = (a|c)&b # MAJA + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + + rorx $39, a, y1 # y1 = a >> 39 # S0A + add y0, y2 # y2 = S1 + CH # -- + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + rorx $28, a, T1 # T1 = (a >> 28) # S0 + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and c, T1 # T1 = a&c # MAJB + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + + add y1, h # h = k + w + h + S0 # -- + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + + rotate_Ys +.endm + +.macro DO_4ROUNDS + +################################### RND N + 0 ######################################### + + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 1 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*1+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 2 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*2+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + RotateState + +################################### RND N + 3 ######################################### + + add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + mov f, y2 # y2 = f # CH + rorx $41, e, y0 # y0 = e >> 41 # S1A + rorx $18, e, y1 # y1 = e >> 18 # S1B + xor g, y2 # y2 = f^g # CH + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) # S1 + rorx $14, e, y1 # y1 = (e >> 14) # S1 + and e, y2 # y2 = (f^g)&e # CH + add y3, old_h # h = t1 + S0 + MAJ # -- + + xor y1, y0 # y0 = (e>>41) ^ (e>>18) ^ (e>>14) # S1 + rorx $34, a, T1 # T1 = a >> 34 # S0B + xor g, y2 # y2 = CH = ((f^g)&e)^g # CH + rorx $39, a, y1 # y1 = a >> 39 # S0A + mov a, y3 # y3 = a # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) # S0 + rorx $28, a, T1 # T1 = (a >> 28) # S0 + add 8*3+frame_XFER(%rsp), h # h = k + w + h # -- + or c, y3 # y3 = a|c # MAJA + + xor T1, y1 # y1 = (a>>39) ^ (a>>34) ^ (a>>28) # S0 + mov a, T1 # T1 = a # MAJB + and b, y3 # y3 = (a|c)&b # MAJA + and c, T1 # T1 = a&c # MAJB + add y0, y2 # y2 = S1 + CH # -- + + + add h, d # d = k + w + h + d # -- + or T1, y3 # y3 = MAJ = (a|c)&b)|(a&c) # MAJ + add y1, h # h = k + w + h + S0 # -- + + add y2, d # d = k + w + h + d + S1 + CH = d + t1 # -- + + add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# -- + + add y3, h # h = t1 + S0 + MAJ # -- + + RotateState + +.endm + +######################################################################## +# void sha512_transform_rorx(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_rorx) + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + + shl $7, NUM_BLKS # convert to bytes + jz .Ldone_hash + add INP, NUM_BLKS # pointer to end of data + mov NUM_BLKS, frame_INPEND(%rsp) + + ## load initial digest + mov 8*0(CTX1), a + mov 8*1(CTX1), b + mov 8*2(CTX1), c + mov 8*3(CTX1), d + mov 8*4(CTX1), e + mov 8*5(CTX1), f + mov 8*6(CTX1), g + mov 8*7(CTX1), h + + # save %rdi (CTX) before it gets clobbered + mov %rdi, frame_CTX(%rsp) + + vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK + +.Lloop0: + lea K512(%rip), TBL + + ## byte swap first 16 dwords + COPY_YMM_AND_BSWAP Y_0, (INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, 1*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, 2*32(INP), BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, 3*32(INP), BYTE_FLIP_MASK + + mov INP, frame_INP(%rsp) + + ## schedule 64 input dwords, by doing 12 rounds of 4 each + movq $4, frame_SRND(%rsp) + +.align 16 +.Lloop1: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 1*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 2*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + FOUR_ROUNDS_AND_SCHED + + vpaddq 3*32(TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(4*32), TBL + FOUR_ROUNDS_AND_SCHED + + subq $1, frame_SRND(%rsp) + jne .Lloop1 + + movq $2, frame_SRND(%rsp) +.Lloop2: + vpaddq (TBL), Y_0, XFER + vmovdqa XFER, frame_XFER(%rsp) + DO_4ROUNDS + vpaddq 1*32(TBL), Y_1, XFER + vmovdqa XFER, frame_XFER(%rsp) + add $(2*32), TBL + DO_4ROUNDS + + vmovdqa Y_2, Y_0 + vmovdqa Y_3, Y_1 + + subq $1, frame_SRND(%rsp) + jne .Lloop2 + + mov frame_CTX(%rsp), CTX2 + addm 8*0(CTX2), a + addm 8*1(CTX2), b + addm 8*2(CTX2), c + addm 8*3(CTX2), d + addm 8*4(CTX2), e + addm 8*5(CTX2), f + addm 8*6(CTX2), g + addm 8*7(CTX2), h + + mov frame_INP(%rsp), INP + add $128, INP + cmp frame_INPEND(%rsp), INP + jne .Lloop0 + +.Ldone_hash: + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + + vzeroupper + RET +SYM_FUNC_END(sha512_transform_rorx) + +######################################################################## +### Binary Data + + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.section .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32 +.align 32 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x08090a0b0c0d0e0f0001020304050607 + .octa 0x18191a1b1c1d1e1f1011121314151617 + +.section .rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32 +.align 32 +MASK_YMM_LO: + .octa 0x00000000000000000000000000000000 + .octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/lib/crypto/x86/sha512-ssse3-asm.S b/lib/crypto/x86/sha512-ssse3-asm.S new file mode 100644 index 000000000000..12e78142f2e3 --- /dev/null +++ b/lib/crypto/x86/sha512-ssse3-asm.S @@ -0,0 +1,423 @@ +######################################################################## +# Implement fast SHA-512 with SSSE3 instructions. (x86_64) +# +# Copyright (C) 2013 Intel Corporation. +# +# Authors: +# James Guilford <james.guilford@intel.com> +# Kirk Yap <kirk.s.yap@intel.com> +# David Cote <david.m.cote@intel.com> +# Tim Chen <tim.c.chen@linux.intel.com> +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +######################################################################## +# +# This code is described in an Intel White-Paper: +# "Fast SHA-512 Implementations on Intel Architecture Processors" +# +# To find it, surf to http://www.intel.com/p/en_US/embedded +# and search for that title. +# +######################################################################## + +#include <linux/linkage.h> + +.text + +# Virtual Registers +# ARG1 +digest = %rdi +# ARG2 +msg = %rsi +# ARG3 +msglen = %rdx +T1 = %rcx +T2 = %r8 +a_64 = %r9 +b_64 = %r10 +c_64 = %r11 +d_64 = %r12 +e_64 = %r13 +f_64 = %r14 +g_64 = %r15 +h_64 = %rbx +tmp0 = %rax + +# Local variables (stack frame) + +W_SIZE = 80*8 +WK_SIZE = 2*8 + +frame_W = 0 +frame_WK = frame_W + W_SIZE +frame_size = frame_WK + WK_SIZE + +# Useful QWORD "arrays" for simpler memory references +# MSG, DIGEST, K_t, W_t are arrays +# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even + +# Input message (arg1) +#define MSG(i) 8*i(msg) + +# Output Digest (arg2) +#define DIGEST(i) 8*i(digest) + +# SHA Constants (static mem) +#define K_t(i) 8*i+K512(%rip) + +# Message Schedule (stack frame) +#define W_t(i) 8*i+frame_W(%rsp) + +# W[t]+K[t] (stack frame) +#define WK_2(i) 8*((i%2))+frame_WK(%rsp) + +.macro RotateState + # Rotate symbols a..h right + TMP = h_64 + h_64 = g_64 + g_64 = f_64 + f_64 = e_64 + e_64 = d_64 + d_64 = c_64 + c_64 = b_64 + b_64 = a_64 + a_64 = TMP +.endm + +.macro SHA512_Round rnd + + # Compute Round %%t + mov f_64, T1 # T1 = f + mov e_64, tmp0 # tmp = e + xor g_64, T1 # T1 = f ^ g + ror $23, tmp0 # 41 # tmp = e ror 23 + and e_64, T1 # T1 = (f ^ g) & e + xor e_64, tmp0 # tmp = (e ror 23) ^ e + xor g_64, T1 # T1 = ((f ^ g) & e) ^ g = CH(e,f,g) + idx = \rnd + add WK_2(idx), T1 # W[t] + K[t] from message scheduler + ror $4, tmp0 # 18 # tmp = ((e ror 23) ^ e) ror 4 + xor e_64, tmp0 # tmp = (((e ror 23) ^ e) ror 4) ^ e + mov a_64, T2 # T2 = a + add h_64, T1 # T1 = CH(e,f,g) + W[t] + K[t] + h + ror $14, tmp0 # 14 # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e) + add tmp0, T1 # T1 = CH(e,f,g) + W[t] + K[t] + S1(e) + mov a_64, tmp0 # tmp = a + xor c_64, T2 # T2 = a ^ c + and c_64, tmp0 # tmp = a & c + and b_64, T2 # T2 = (a ^ c) & b + xor tmp0, T2 # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c) + mov a_64, tmp0 # tmp = a + ror $5, tmp0 # 39 # tmp = a ror 5 + xor a_64, tmp0 # tmp = (a ror 5) ^ a + add T1, d_64 # e(next_state) = d + T1 + ror $6, tmp0 # 34 # tmp = ((a ror 5) ^ a) ror 6 + xor a_64, tmp0 # tmp = (((a ror 5) ^ a) ror 6) ^ a + lea (T1, T2), h_64 # a(next_state) = T1 + Maj(a,b,c) + ror $28, tmp0 # 28 # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a) + add tmp0, h_64 # a(next_state) = T1 + Maj(a,b,c) S0(a) + RotateState +.endm + +.macro SHA512_2Sched_2Round_sse rnd + + # Compute rounds t-2 and t-1 + # Compute message schedule QWORDS t and t+1 + + # Two rounds are computed based on the values for K[t-2]+W[t-2] and + # K[t-1]+W[t-1] which were previously stored at WK_2 by the message + # scheduler. + # The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)]. + # They are then added to their respective SHA512 constants at + # [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)] + # For brievity, the comments following vectored instructions only refer to + # the first of a pair of QWORDS. + # Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]} + # The computation of the message schedule and the rounds are tightly + # stitched to take advantage of instruction-level parallelism. + # For clarity, integer instructions (for the rounds calculation) are indented + # by one tab. Vectored instructions (for the message scheduler) are indented + # by two tabs. + + mov f_64, T1 + idx = \rnd -2 + movdqa W_t(idx), %xmm2 # XMM2 = W[t-2] + xor g_64, T1 + and e_64, T1 + movdqa %xmm2, %xmm0 # XMM0 = W[t-2] + xor g_64, T1 + idx = \rnd + add WK_2(idx), T1 + idx = \rnd - 15 + movdqu W_t(idx), %xmm5 # XMM5 = W[t-15] + mov e_64, tmp0 + ror $23, tmp0 # 41 + movdqa %xmm5, %xmm3 # XMM3 = W[t-15] + xor e_64, tmp0 + ror $4, tmp0 # 18 + psrlq $61-19, %xmm0 # XMM0 = W[t-2] >> 42 + xor e_64, tmp0 + ror $14, tmp0 # 14 + psrlq $(8-7), %xmm3 # XMM3 = W[t-15] >> 1 + add tmp0, T1 + add h_64, T1 + pxor %xmm2, %xmm0 # XMM0 = (W[t-2] >> 42) ^ W[t-2] + mov a_64, T2 + xor c_64, T2 + pxor %xmm5, %xmm3 # XMM3 = (W[t-15] >> 1) ^ W[t-15] + and b_64, T2 + mov a_64, tmp0 + psrlq $(19-6), %xmm0 # XMM0 = ((W[t-2]>>42)^W[t-2])>>13 + and c_64, tmp0 + xor tmp0, T2 + psrlq $(7-1), %xmm3 # XMM3 = ((W[t-15]>>1)^W[t-15])>>6 + mov a_64, tmp0 + ror $5, tmp0 # 39 + pxor %xmm2, %xmm0 # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2] + xor a_64, tmp0 + ror $6, tmp0 # 34 + pxor %xmm5, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15] + xor a_64, tmp0 + ror $28, tmp0 # 28 + psrlq $6, %xmm0 # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6 + add tmp0, T2 + add T1, d_64 + psrlq $1, %xmm3 # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1 + lea (T1, T2), h_64 + RotateState + movdqa %xmm2, %xmm1 # XMM1 = W[t-2] + mov f_64, T1 + xor g_64, T1 + movdqa %xmm5, %xmm4 # XMM4 = W[t-15] + and e_64, T1 + xor g_64, T1 + psllq $(64-19)-(64-61) , %xmm1 # XMM1 = W[t-2] << 42 + idx = \rnd + 1 + add WK_2(idx), T1 + mov e_64, tmp0 + psllq $(64-1)-(64-8), %xmm4 # XMM4 = W[t-15] << 7 + ror $23, tmp0 # 41 + xor e_64, tmp0 + pxor %xmm2, %xmm1 # XMM1 = (W[t-2] << 42)^W[t-2] + ror $4, tmp0 # 18 + xor e_64, tmp0 + pxor %xmm5, %xmm4 # XMM4 = (W[t-15]<<7)^W[t-15] + ror $14, tmp0 # 14 + add tmp0, T1 + psllq $(64-61), %xmm1 # XMM1 = ((W[t-2] << 42)^W[t-2])<<3 + add h_64, T1 + mov a_64, T2 + psllq $(64-8), %xmm4 # XMM4 = ((W[t-15]<<7)^W[t-15])<<56 + xor c_64, T2 + and b_64, T2 + pxor %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + mov a_64, tmp0 + and c_64, tmp0 + idx = \rnd - 7 + movdqu W_t(idx), %xmm1 # XMM1 = W[t-7] + xor tmp0, T2 + pxor %xmm4, %xmm3 # XMM3 = s0(W[t-15]) + mov a_64, tmp0 + paddq %xmm3, %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + ror $5, tmp0 # 39 + idx =\rnd-16 + paddq W_t(idx), %xmm0 # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16] + xor a_64, tmp0 + paddq %xmm1, %xmm0 # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] + ror $6, tmp0 # 34 + movdqa %xmm0, W_t(\rnd) # Store scheduled qwords + xor a_64, tmp0 + paddq K_t(\rnd), %xmm0 # Compute W[t]+K[t] + ror $28, tmp0 # 28 + idx = \rnd + movdqa %xmm0, WK_2(idx) # Store W[t]+K[t] for next rounds + add tmp0, T2 + add T1, d_64 + lea (T1, T2), h_64 + RotateState +.endm + +######################################################################## +# void sha512_transform_ssse3(struct sha512_block_state *state, +# const u8 *data, size_t nblocks); +# Purpose: Updates the SHA512 digest stored at "state" with the message +# stored in "data". +# The size of the message pointed to by "data" must be an integer multiple +# of SHA512 message blocks. +# "nblocks" is the message length in SHA512 blocks +######################################################################## +SYM_FUNC_START(sha512_transform_ssse3) + + test msglen, msglen + je .Lnowork + + # Save GPRs + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # Allocate Stack Space + push %rbp + mov %rsp, %rbp + sub $frame_size, %rsp + and $~(0x20 - 1), %rsp + +.Lupdateblock: + +# Load state variables + mov DIGEST(0), a_64 + mov DIGEST(1), b_64 + mov DIGEST(2), c_64 + mov DIGEST(3), d_64 + mov DIGEST(4), e_64 + mov DIGEST(5), f_64 + mov DIGEST(6), g_64 + mov DIGEST(7), h_64 + + t = 0 + .rept 80/2 + 1 + # (80 rounds) / (2 rounds/iteration) + (1 iteration) + # +1 iteration because the scheduler leads hashing by 1 iteration + .if t < 2 + # BSWAP 2 QWORDS + movdqa XMM_QWORD_BSWAP(%rip), %xmm1 + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + movdqa %xmm0, WK_2(t) # Store into WK for rounds + .elseif t < 16 + # BSWAP 2 QWORDS# Compute 2 Rounds + movdqu MSG(t), %xmm0 + pshufb %xmm1, %xmm0 # BSWAP + SHA512_Round t-2 # Round t-2 + movdqa %xmm0, W_t(t) # Store Scheduled Pair + paddq K_t(t), %xmm0 # Compute W[t]+K[t] + SHA512_Round t-1 # Round t-1 + movdqa %xmm0, WK_2(t) # Store W[t]+K[t] into WK + .elseif t < 79 + # Schedule 2 QWORDS# Compute 2 Rounds + SHA512_2Sched_2Round_sse t + .else + # Compute 2 Rounds + SHA512_Round t-2 + SHA512_Round t-1 + .endif + t = t+2 + .endr + + # Update digest + add a_64, DIGEST(0) + add b_64, DIGEST(1) + add c_64, DIGEST(2) + add d_64, DIGEST(3) + add e_64, DIGEST(4) + add f_64, DIGEST(5) + add g_64, DIGEST(6) + add h_64, DIGEST(7) + + # Advance to next message block + add $16*8, msg + dec msglen + jnz .Lupdateblock + + # Restore Stack Pointer + mov %rbp, %rsp + pop %rbp + + # Restore GPRs + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + +.Lnowork: + RET +SYM_FUNC_END(sha512_transform_ssse3) + +######################################################################## +### Binary Data + +.section .rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16 +.align 16 +# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. +XMM_QWORD_BSWAP: + .octa 0x08090a0b0c0d0e0f0001020304050607 + +# Mergeable 640-byte rodata section. This allows linker to merge the table +# with other, exactly the same 640-byte fragment of another rodata section +# (if such section exists). +.section .rodata.cst640.K512, "aM", @progbits, 640 +.align 64 +# K[t] used in SHA512 hashing +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 diff --git a/lib/crypto/x86/sha512.h b/lib/crypto/x86/sha512.h new file mode 100644 index 000000000000..c13503d9d57d --- /dev/null +++ b/lib/crypto/x86/sha512.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * x86-optimized SHA-512 block function + * + * Copyright 2025 Google LLC + */ + +#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> +#include <linux/static_call.h> + +DEFINE_STATIC_CALL(sha512_blocks_x86, sha512_blocks_generic); + +#define DEFINE_X86_SHA512_FN(c_fn, asm_fn) \ + asmlinkage void asm_fn(struct sha512_block_state *state, \ + const u8 *data, size_t nblocks); \ + static void c_fn(struct sha512_block_state *state, const u8 *data, \ + size_t nblocks) \ + { \ + if (likely(crypto_simd_usable())) { \ + kernel_fpu_begin(); \ + asm_fn(state, data, nblocks); \ + kernel_fpu_end(); \ + } else { \ + sha512_blocks_generic(state, data, nblocks); \ + } \ + } + +DEFINE_X86_SHA512_FN(sha512_blocks_ssse3, sha512_transform_ssse3); +DEFINE_X86_SHA512_FN(sha512_blocks_avx, sha512_transform_avx); +DEFINE_X86_SHA512_FN(sha512_blocks_avx2, sha512_transform_rorx); + +static void sha512_blocks(struct sha512_block_state *state, + const u8 *data, size_t nblocks) +{ + static_call(sha512_blocks_x86)(state, data, nblocks); +} + +#define sha512_mod_init_arch sha512_mod_init_arch +static inline void sha512_mod_init_arch(void) +{ + if (cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL) && + boot_cpu_has(X86_FEATURE_AVX)) { + if (boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_BMI2)) + static_call_update(sha512_blocks_x86, + sha512_blocks_avx2); + else + static_call_update(sha512_blocks_x86, + sha512_blocks_avx); + } else if (boot_cpu_has(X86_FEATURE_SSSE3)) { + static_call_update(sha512_blocks_x86, sha512_blocks_ssse3); + } +} |
