Continue disentangling the crypto library functions from the generic
crypto infrastructure by moving the x86 BLAKE2s, ChaCha, and Poly1305
library functions into a new directory, arch/x86/lib/crypto/, that does
not depend on CRYPTO. This mirrors the distinction between crypto/ and
lib/crypto/.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
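
For context: the two entry points in this file implement the BLAKE2s
compression function behind C declarations of roughly the following
shape (a sketch inferred from the register usage below and the usual
lib/crypto glue convention; the authoritative declarations live in the
accompanying glue code, not in this file):

	asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
					       const u8 *block,
					       const size_t nblocks,
					       const u32 inc);
	asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
						const u8 *block,
						const size_t nblocks,
						const u32 inc);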
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>
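
/*
 * Constant tables. IV holds the eight 32-bit BLAKE2s initialization
 * words. ROT16 and ROR328 are pshufb masks that rotate each 32-bit lane
 * right by 16 and by 8 bits, respectively. SIGMA is a permuted variant
 * of the BLAKE2s message schedule (one 16-entry row per round), with
 * byte indices arranged to match the order in which the SSSE3 code
 * below gathers message words; SIGMA2 holds dword indices playing the
 * same role for the AVX-512 code's vpermi2d gathers.
 */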
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9

.text
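
/*
 * Register roles for both entry points (SysV x86-64 convention, per the
 * loads below):
 *   %rdi: struct blake2s_state * -- h[0..7], then the t/f counter words
 *   %rsi: the 64-byte message block(s)
 *   %rdx: number of blocks
 *   %rcx: count of bytes to add to the t counter for each block
 */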
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
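	/*
	 * Process one 64-byte block per iteration: save h in
	 * %xmm10/%xmm11 for the final feed-forward, add inc to the 64-bit
	 * byte counter t, and build the second half of the working state
	 * from the IV, xoring the last four words against the t/f
	 * counters.
	 */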
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
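	/*
	 * Ten rounds, one SIGMA row (16 bytes) per round. Each quarter of
	 * a round gathers four message words via the row's byte indices,
	 * then runs half a G step on all four columns (or diagonals) at
	 * once: pshufb with ROT16/ROR328 rotates by 16 and 8, the
	 * psrld/pslld/por pairs rotate by 12 and 7, and the pshufd
	 * triples diagonalize and undiagonalize the state in between.
	 */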
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
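	/* Feed-forward h ^= v[0..7] ^ v[8..15], then on to the next block;
	 * the updated state is stored back once all blocks are done. */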
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)
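
/*
 * Same interface as blake2s_compress_ssse3() above, but using AVX512VL
 * features: the 64-byte block is kept in %ymm6/%ymm7 and re-permuted in
 * place each round by vpermi2d using the SIGMA2 dword indices, and the
 * rotates are done directly with vprord instead of shift/shuffle pairs.
 */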
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
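	/*
	 * Ten rounds, counted down in %cl. Each iteration loads the next
	 * 64-byte SIGMA2 row pair, gathers the round's message words into
	 * %ymm8/%ymm9 with vpermi2d, and runs the column and diagonal G
	 * steps on the four lanes.
	 */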
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
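	/* Feed-forward into h, then loop until all blocks are processed. */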
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)