forked from mirrors/linux
		
	crypto: poly1305 - Add a SSE2 SIMD variant for x86_64
Implements an x86_64 assembler driver for the Poly1305 authenticator. This single block variant holds the 130-bit integer in 5 32-bit words, but uses SSE to do two multiplications/additions in parallel. When calling updates with small blocks, the overhead for kernel_fpu_begin()/kernel_fpu_end() negates the performance gain. We therefore use the poly1305-generic fallback for small updates. For large messages, throughput increases by ~5-10% compared to poly1305-generic: testing speed of poly1305 (poly1305-generic) test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 4080026 opers/sec, 391682496 bytes/sec test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 6221094 opers/sec, 597225024 bytes/sec test 2 ( 96 byte blocks, 96 bytes per update, 1 updates): 9609750 opers/sec, 922536057 bytes/sec test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1459379 opers/sec, 420301267 bytes/sec test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2115179 opers/sec, 609171609 bytes/sec test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3729874 opers/sec, 1074203856 bytes/sec test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 593000 opers/sec, 626208000 bytes/sec test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1081536 opers/sec, 1142102332 bytes/sec test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 302077 opers/sec, 628320576 bytes/sec test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 554384 opers/sec, 1153120176 bytes/sec test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 278715 opers/sec, 1150536345 bytes/sec test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 140202 opers/sec, 1153022070 bytes/sec testing speed of poly1305 (poly1305-simd) test 0 ( 96 byte blocks, 16 bytes per update, 6 updates): 3790063 opers/sec, 363846076 bytes/sec test 1 ( 96 byte blocks, 32 bytes per update, 3 updates): 5913378 opers/sec, 567684355 bytes/sec test 2 ( 96 byte blocks, 96 bytes per update, 1 
updates): 9352574 opers/sec, 897847104 bytes/sec test 3 ( 288 byte blocks, 16 bytes per update, 18 updates): 1362145 opers/sec, 392297990 bytes/sec test 4 ( 288 byte blocks, 32 bytes per update, 9 updates): 2007075 opers/sec, 578037628 bytes/sec test 5 ( 288 byte blocks, 288 bytes per update, 1 updates): 3709811 opers/sec, 1068425798 bytes/sec test 6 ( 1056 byte blocks, 32 bytes per update, 33 updates): 566272 opers/sec, 597984182 bytes/sec test 7 ( 1056 byte blocks, 1056 bytes per update, 1 updates): 1111657 opers/sec, 1173910108 bytes/sec test 8 ( 2080 byte blocks, 32 bytes per update, 65 updates): 288857 opers/sec, 600823808 bytes/sec test 9 ( 2080 byte blocks, 2080 bytes per update, 1 updates): 590746 opers/sec, 1228751888 bytes/sec test 10 ( 4128 byte blocks, 4128 bytes per update, 1 updates): 301825 opers/sec, 1245936902 bytes/sec test 11 ( 8224 byte blocks, 8224 bytes per update, 1 updates): 153075 opers/sec, 1258896201 bytes/sec Benchmark results from a Core i5-4670T. Signed-off-by: Martin Willi <martin@strongswan.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
		
							parent
							
								
									2546f811ef
								
							
						
					
					
						commit
						c70f4abef0
					
				
					 4 changed files with 413 additions and 0 deletions
				
			
		|  | @ -31,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o | ||||||
| obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o | obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o | ||||||
| obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o | obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o | ||||||
| obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o | obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o | ||||||
|  | obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o | ||||||
| 
 | 
 | ||||||
| # These modules require assembler to support AVX.
 | # These modules require assembler to support AVX.
 | ||||||
| ifeq ($(avx_supported),yes) | ifeq ($(avx_supported),yes) | ||||||
|  | @ -85,6 +86,7 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | ||||||
| aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o | aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o | ||||||
| ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o | ||||||
| sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o | ||||||
|  | poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o | ||||||
| ifeq ($(avx2_supported),yes) | ifeq ($(avx2_supported),yes) | ||||||
| sha1-ssse3-y += sha1_avx2_x86_64_asm.o | sha1-ssse3-y += sha1_avx2_x86_64_asm.o | ||||||
| endif | endif | ||||||
|  |  | ||||||
							
								
								
									
										276
									
								
								arch/x86/crypto/poly1305-sse2-x86_64.S
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										276
									
								
								arch/x86/crypto/poly1305-sse2-x86_64.S
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,276 @@ | ||||||
|  | /* | ||||||
|  |  * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions | ||||||
|  |  * | ||||||
|  |  * Copyright (C) 2015 Martin Willi | ||||||
|  |  * | ||||||
|  |  * This program is free software; you can redistribute it and/or modify
 | ||||||
|  |  * it under the terms of the GNU General Public License as published by | ||||||
|  |  * the Free Software Foundation; either version 2 of the License, or
 | ||||||
|  |  * (at your option) any later version. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | #include <linux/linkage.h> | ||||||
|  | 
 | ||||||
/*
 * Constant data. The mask is only ever loaded (never stored to), so it is
 * placed in read-only data rather than in writable .data.
 */
.section	.rodata
.align 16

/*
 * Two quadword lanes, each holding 0x03ffffff in its low 32-bit word:
 * ANDing with it keeps the low 26 bits of the low word of each lane.
 */
ANMASK:	.octa 0x0000000003ffffff0000000003ffffff

.text

/* Accumulator words h[0..4], 4 bytes apart, addressed through %rdi. */
#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)

/* Key words r[0..4], 4 bytes apart, addressed through %rdx. */
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)

/* Four 32-bit scratch slots in the 16 bytes reserved at the top of the stack. */
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)

/* Pointer to the current input block. */
#define m %rsi

/* SSE registers: packed accumulator pairs, temporaries and the 26-bit mask. */
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7

/* 64-bit general purpose registers receiving the unreduced sums d0..d4. */
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12
|  | 
 | ||||||
/*
 * void poly1305_block_sse2(u32 *h, const u8 *src, const u32 *r,
 *			    unsigned int blocks)
 *
 * Processes "blocks" consecutive 16 byte blocks starting at src, updating
 * the five 32-bit accumulator words at h in place with the key words at r.
 * Each block gets bit 24 set in its top limb before it is added to h.
 *
 * In:      %rdi = h, %rsi = src, %rdx = r, %ecx = blocks
 * Saved:   %rbx, %r12 (pushed on entry, popped on exit)
 * Clobber: %rax, %rcx, %rsi, %r8-%r11, %xmm0-%xmm7, flags
 * Stack:   16 bytes of scratch below the two saved registers
 */
ENTRY(poly1305_block_sse2)
	# %rdi: Accumulator h[5]
	# %rsi: 16 byte input block m
	# %rdx: Poly1305 key r[5]
	# %ecx: Block count (32-bit value, upper half of %rcx is not used)

	# This single block variant tries to improve performance by doing two
	# multiplications in parallel using SSE instructions. There is quite
	# some quadword packing involved, hence the speedup is marginal.

	# The loop below checks the counter only after a block was processed,
	# so return right away when no block is requested.
	test		%ecx,%ecx
	jz		.Lnoblocks

	push		%rbx
	push		%r12
	sub		$0x10,%rsp

	# s1..s4 = r1..r4 * 5
	# (the 32-bit loads zero-extend into %rax, so 64-bit addressing in
	# the lea yields the same low 32 bits without an address-size prefix)
	mov		r1,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s1
	mov		r2,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s2
	mov		r3,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s3
	mov		r4,%eax
	lea		(%rax,%rax,4),%eax
	mov		%eax,s4

	movdqa		ANMASK(%rip),mask

.Ldoblock:
	# h01 = [0, h1, 0, h0]
	# h23 = [0, h3, 0, h2]
	# h44 = [0, h4, 0, h4]
	movd		h0,h01
	movd		h1,t1
	movd		h2,h23
	movd		h3,t2
	movd		h4,h44
	punpcklqdq	t1,h01
	punpcklqdq	t2,h23
	punpcklqdq	h44,h44

	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
	movd		0x00(m),t1
	movd		0x03(m),t2
	psrld		$2,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h01
	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),t1
	movd		0x09(m),t2
	psrld		$4,t1
	psrld		$6,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h23
	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
	mov		0x0c(m),%eax
	shr		$8,%eax
	or		$0x01000000,%eax
	movd		%eax,t1
	pshufd		$0xc4,t1,t1
	paddd		t1,h44

	# t1[0] = h0 * r0 + h2 * s3
	# t1[1] = h1 * s4 + h3 * s2
	movd		r0,t1
	movd		s4,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		s3,t2
	movd		s2,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r1 + h2 * s4
	# t2[1] = h1 * r0 + h3 * s3
	movd		r1,t2
	movd		r0,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		s4,t3
	movd		s3,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s1
	# t3[1] = h4 * s2
	movd		s1,t3
	movd		s2,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d0 = t1[0] + t1[1] + t3[0]
	# d1 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d0
	psrldq		$8,t1
	movq		t1,d1

	# t1[0] = h0 * r2 + h2 * r0
	# t1[1] = h1 * r1 + h3 * s4
	movd		r2,t1
	movd		r1,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r0,t2
	movd		s4,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r3 + h2 * r1
	# t2[1] = h1 * r2 + h3 * r0
	movd		r3,t2
	movd		r2,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		r1,t3
	movd		r0,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s3
	# t3[1] = h4 * s4
	movd		s3,t3
	movd		s4,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d2 = t1[0] + t1[1] + t3[0]
	# d3 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d2
	psrldq		$8,t1
	movq		t1,d3

	# t1[0] = h0 * r4 + h2 * r2
	# t1[1] = h1 * r3 + h3 * r1
	movd		r4,t1
	movd		r3,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r2,t2
	movd		r1,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t3[0] = h4 * r0
	movd		r0,t3
	pmuludq		h44,t3
	# d4 = t1[0] + t1[1] + t3[0]
	movdqa		t1,t4
	psrldq		$8,t4
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d4

	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff
	# (the 32-bit and also clears the upper half of %rbx)
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	# (kept in 64 bits so that the carry times five is never truncated)
	mov		d4,%rax
	shr		$26,%rax
	lea		(%rax,%rax,4),%rax
	add		%rax,%rbx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%rbx,%rax
	shr		$26,%rax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	and		$0x3ffffff,%ebx
	mov		%ebx,h0

	# Advance to the next block; the count is a 32-bit quantity.
	add		$0x10,m
	dec		%ecx
	jnz		.Ldoblock

	add		$0x10,%rsp
	pop		%r12
	pop		%rbx
.Lnoblocks:
	ret
ENDPROC(poly1305_block_sse2)
							
								
								
									
										123
									
								
								arch/x86/crypto/poly1305_glue.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										123
									
								
								arch/x86/crypto/poly1305_glue.c
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,123 @@ | ||||||
|  | /*
 | ||||||
|  |  * Poly1305 authenticator algorithm, RFC7539, SIMD glue code | ||||||
|  |  * | ||||||
|  |  * Copyright (C) 2015 Martin Willi | ||||||
|  |  * | ||||||
|  |  * This program is free software; you can redistribute it and/or modify | ||||||
|  |  * it under the terms of the GNU General Public License as published by | ||||||
|  |  * the Free Software Foundation; either version 2 of the License, or | ||||||
|  |  * (at your option) any later version. | ||||||
|  |  */ | ||||||
|  | 
 | ||||||
|  | #include <crypto/algapi.h> | ||||||
|  | #include <crypto/internal/hash.h> | ||||||
|  | #include <crypto/poly1305.h> | ||||||
|  | #include <linux/crypto.h> | ||||||
|  | #include <linux/kernel.h> | ||||||
|  | #include <linux/module.h> | ||||||
|  | #include <asm/fpu/api.h> | ||||||
|  | #include <asm/simd.h> | ||||||
|  | 
 | ||||||
|  | asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, | ||||||
|  | 				    const u32 *r, unsigned int blocks); | ||||||
|  | 
 | ||||||
|  | static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, | ||||||
|  | 					 const u8 *src, unsigned int srclen) | ||||||
|  | { | ||||||
|  | 	unsigned int blocks, datalen; | ||||||
|  | 
 | ||||||
|  | 	if (unlikely(!dctx->sset)) { | ||||||
|  | 		datalen = crypto_poly1305_setdesckey(dctx, src, srclen); | ||||||
|  | 		src += srclen - datalen; | ||||||
|  | 		srclen = datalen; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (srclen >= POLY1305_BLOCK_SIZE) { | ||||||
|  | 		blocks = srclen / POLY1305_BLOCK_SIZE; | ||||||
|  | 		poly1305_block_sse2(dctx->h, src, dctx->r, blocks); | ||||||
|  | 		srclen -= POLY1305_BLOCK_SIZE * blocks; | ||||||
|  | 	} | ||||||
|  | 	return srclen; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static int poly1305_simd_update(struct shash_desc *desc, | ||||||
|  | 				const u8 *src, unsigned int srclen) | ||||||
|  | { | ||||||
|  | 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); | ||||||
|  | 	unsigned int bytes; | ||||||
|  | 
 | ||||||
|  | 	/* kernel_fpu_begin/end is costly, use fallback for small updates */ | ||||||
|  | 	if (srclen <= 288 || !may_use_simd()) | ||||||
|  | 		return crypto_poly1305_update(desc, src, srclen); | ||||||
|  | 
 | ||||||
|  | 	kernel_fpu_begin(); | ||||||
|  | 
 | ||||||
|  | 	if (unlikely(dctx->buflen)) { | ||||||
|  | 		bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); | ||||||
|  | 		memcpy(dctx->buf + dctx->buflen, src, bytes); | ||||||
|  | 		src += bytes; | ||||||
|  | 		srclen -= bytes; | ||||||
|  | 		dctx->buflen += bytes; | ||||||
|  | 
 | ||||||
|  | 		if (dctx->buflen == POLY1305_BLOCK_SIZE) { | ||||||
|  | 			poly1305_simd_blocks(dctx, dctx->buf, | ||||||
|  | 					     POLY1305_BLOCK_SIZE); | ||||||
|  | 			dctx->buflen = 0; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (likely(srclen >= POLY1305_BLOCK_SIZE)) { | ||||||
|  | 		bytes = poly1305_simd_blocks(dctx, src, srclen); | ||||||
|  | 		src += srclen - bytes; | ||||||
|  | 		srclen = bytes; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	kernel_fpu_end(); | ||||||
|  | 
 | ||||||
|  | 	if (unlikely(srclen)) { | ||||||
|  | 		dctx->buflen = srclen; | ||||||
|  | 		memcpy(dctx->buf, src, srclen); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static struct shash_alg alg = { | ||||||
|  | 	.digestsize	= POLY1305_DIGEST_SIZE, | ||||||
|  | 	.init		= crypto_poly1305_init, | ||||||
|  | 	.update		= poly1305_simd_update, | ||||||
|  | 	.final		= crypto_poly1305_final, | ||||||
|  | 	.setkey		= crypto_poly1305_setkey, | ||||||
|  | 	.descsize	= sizeof(struct poly1305_desc_ctx), | ||||||
|  | 	.base		= { | ||||||
|  | 		.cra_name		= "poly1305", | ||||||
|  | 		.cra_driver_name	= "poly1305-simd", | ||||||
|  | 		.cra_priority		= 300, | ||||||
|  | 		.cra_flags		= CRYPTO_ALG_TYPE_SHASH, | ||||||
|  | 		.cra_alignmask		= sizeof(u32) - 1, | ||||||
|  | 		.cra_blocksize		= POLY1305_BLOCK_SIZE, | ||||||
|  | 		.cra_module		= THIS_MODULE, | ||||||
|  | 	}, | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static int __init poly1305_simd_mod_init(void) | ||||||
|  | { | ||||||
|  | 	if (!cpu_has_xmm2) | ||||||
|  | 		return -ENODEV; | ||||||
|  | 
 | ||||||
|  | 	return crypto_register_shash(&alg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void __exit poly1305_simd_mod_exit(void) | ||||||
|  | { | ||||||
|  | 	crypto_unregister_shash(&alg); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | module_init(poly1305_simd_mod_init); | ||||||
|  | module_exit(poly1305_simd_mod_exit); | ||||||
|  | 
 | ||||||
|  | MODULE_LICENSE("GPL"); | ||||||
|  | MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); | ||||||
|  | MODULE_DESCRIPTION("Poly1305 authenticator"); | ||||||
|  | MODULE_ALIAS_CRYPTO("poly1305"); | ||||||
|  | MODULE_ALIAS_CRYPTO("poly1305-simd"); | ||||||
|  | @ -470,6 +470,18 @@ config CRYPTO_POLY1305 | ||||||
| 	  It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use | 	  It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use | ||||||
| 	  in IETF protocols. This is the portable C implementation of Poly1305. | 	  in IETF protocols. This is the portable C implementation of Poly1305. | ||||||
| 
 | 
 | ||||||
|  | config CRYPTO_POLY1305_X86_64 | ||||||
|  | 	tristate "Poly1305 authenticator algorithm (x86_64/SSE2)" | ||||||
|  | 	depends on X86 && 64BIT | ||||||
|  | 	select CRYPTO_POLY1305 | ||||||
|  | 	help | ||||||
|  | 	  Poly1305 authenticator algorithm, RFC7539. | ||||||
|  | 
 | ||||||
|  | 	  Poly1305 is an authenticator algorithm designed by Daniel J. Bernstein. | ||||||
|  | 	  It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use | ||||||
|  | 	  in IETF protocols. This is the x86_64 assembler implementation using SIMD | ||||||
|  | 	  instructions. | ||||||
|  | 
 | ||||||
| config CRYPTO_MD4 | config CRYPTO_MD4 | ||||||
| 	tristate "MD4 digest algorithm" | 	tristate "MD4 digest algorithm" | ||||||
| 	select CRYPTO_HASH | 	select CRYPTO_HASH | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue
	
	 Martin Willi
						Martin Willi