forked from mirrors/linux
		
	crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize
The way the KECCAK transform is currently coded involves many references
into the state array using indexes that are calculated at runtime (modulo-5
arithmetic and lookups in the rotation/permutation tables). This forces the
compiler to treat the state matrix as an array in memory rather than keep
it in registers, which results in poor performance.
So instead, let's rephrase the algorithm using fixed array indexes only.
This helps the compiler keep the state matrix in registers, resulting
in the following speedup (SHA3-256 performance in cycles per byte):
                                            before   after   speedup
  Intel Core i7 @ 2.0 GHz (2.9 turbo)        100.6    35.7     2.8x
  Cortex-A57 @ 2.0 GHz (64-bit mode)         101.6    12.7     8.0x
  Cortex-A53 @ 1.0 GHz                       224.4    15.8    14.2x
  Cortex-A57 @ 2.0 GHz (32-bit mode)         201.8    63.0     3.2x
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
			
			
This commit is contained in:
		
							parent
							
								
									c013cee99d
								
							
						
					
					
						commit
						83dee2ce1a
					
				
					 1 changed file with 95 additions and 37 deletions
				
			
		|  | @ -5,6 +5,7 @@ | |||
|  * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
 | ||||
|  * | ||||
|  * SHA-3 code by Jeff Garzik <jeff@garzik.org> | ||||
|  *               Ard Biesheuvel <ard.biesheuvel@linaro.org> | ||||
|  * | ||||
|  * This program is free software; you can redistribute it and/or modify it | ||||
|  * under the terms of the GNU General Public License as published by the Free | ||||
|  | @ -22,8 +23,6 @@ | |||
| 
 | ||||
| #define KECCAK_ROUNDS 24 | ||||
| 
 | ||||
| #define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y)))) | ||||
| 
 | ||||
| static const u64 keccakf_rndc[24] = { | ||||
| 	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, | ||||
| 	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, | ||||
|  | @ -35,53 +34,112 @@ static const u64 keccakf_rndc[24] = { | |||
| 	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL | ||||
| }; | ||||
| 
 | ||||
| static const int keccakf_rotc[24] = { | ||||
| 	1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14, | ||||
| 	27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44 | ||||
| }; | ||||
| 
 | ||||
| static const int keccakf_piln[24] = { | ||||
| 	10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4, | ||||
| 	15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1 | ||||
| }; | ||||
| 
 | ||||
| /* update the state with given number of rounds */ | ||||
| 
 | ||||
| static void keccakf(u64 st[25]) | ||||
| static void __attribute__((__optimize__("O3"))) keccakf(u64 st[25]) | ||||
| { | ||||
| 	int i, j, round; | ||||
| 	u64 t, bc[5]; | ||||
| 	u64 t[5], tt, bc[5]; | ||||
| 	int round; | ||||
| 
 | ||||
| 	for (round = 0; round < KECCAK_ROUNDS; round++) { | ||||
| 
 | ||||
| 		/* Theta */ | ||||
| 		for (i = 0; i < 5; i++) | ||||
| 			bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] | ||||
| 				^ st[i + 20]; | ||||
| 		bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; | ||||
| 		bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; | ||||
| 		bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; | ||||
| 		bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; | ||||
| 		bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; | ||||
| 
 | ||||
| 		for (i = 0; i < 5; i++) { | ||||
| 			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); | ||||
| 			for (j = 0; j < 25; j += 5) | ||||
| 				st[j + i] ^= t; | ||||
| 		} | ||||
| 		t[0] = bc[4] ^ rol64(bc[1], 1); | ||||
| 		t[1] = bc[0] ^ rol64(bc[2], 1); | ||||
| 		t[2] = bc[1] ^ rol64(bc[3], 1); | ||||
| 		t[3] = bc[2] ^ rol64(bc[4], 1); | ||||
| 		t[4] = bc[3] ^ rol64(bc[0], 1); | ||||
| 
 | ||||
| 		st[0] ^= t[0]; | ||||
| 
 | ||||
| 		/* Rho Pi */ | ||||
| 		t = st[1]; | ||||
| 		for (i = 0; i < 24; i++) { | ||||
| 			j = keccakf_piln[i]; | ||||
| 			bc[0] = st[j]; | ||||
| 			st[j] = ROTL64(t, keccakf_rotc[i]); | ||||
| 			t = bc[0]; | ||||
| 		} | ||||
| 		tt = st[1]; | ||||
| 		st[ 1] = rol64(st[ 6] ^ t[1], 44); | ||||
| 		st[ 6] = rol64(st[ 9] ^ t[4], 20); | ||||
| 		st[ 9] = rol64(st[22] ^ t[2], 61); | ||||
| 		st[22] = rol64(st[14] ^ t[4], 39); | ||||
| 		st[14] = rol64(st[20] ^ t[0], 18); | ||||
| 		st[20] = rol64(st[ 2] ^ t[2], 62); | ||||
| 		st[ 2] = rol64(st[12] ^ t[2], 43); | ||||
| 		st[12] = rol64(st[13] ^ t[3], 25); | ||||
| 		st[13] = rol64(st[19] ^ t[4],  8); | ||||
| 		st[19] = rol64(st[23] ^ t[3], 56); | ||||
| 		st[23] = rol64(st[15] ^ t[0], 41); | ||||
| 		st[15] = rol64(st[ 4] ^ t[4], 27); | ||||
| 		st[ 4] = rol64(st[24] ^ t[4], 14); | ||||
| 		st[24] = rol64(st[21] ^ t[1],  2); | ||||
| 		st[21] = rol64(st[ 8] ^ t[3], 55); | ||||
| 		st[ 8] = rol64(st[16] ^ t[1], 45); | ||||
| 		st[16] = rol64(st[ 5] ^ t[0], 36); | ||||
| 		st[ 5] = rol64(st[ 3] ^ t[3], 28); | ||||
| 		st[ 3] = rol64(st[18] ^ t[3], 21); | ||||
| 		st[18] = rol64(st[17] ^ t[2], 15); | ||||
| 		st[17] = rol64(st[11] ^ t[1], 10); | ||||
| 		st[11] = rol64(st[ 7] ^ t[2],  6); | ||||
| 		st[ 7] = rol64(st[10] ^ t[0],  3); | ||||
| 		st[10] = rol64(    tt ^ t[1],  1); | ||||
| 
 | ||||
| 		/* Chi */ | ||||
| 		for (j = 0; j < 25; j += 5) { | ||||
| 			for (i = 0; i < 5; i++) | ||||
| 				bc[i] = st[j + i]; | ||||
| 			for (i = 0; i < 5; i++) | ||||
| 				st[j + i] ^= (~bc[(i + 1) % 5]) & | ||||
| 					     bc[(i + 2) % 5]; | ||||
| 		} | ||||
| 		bc[ 0] = ~st[ 1] & st[ 2]; | ||||
| 		bc[ 1] = ~st[ 2] & st[ 3]; | ||||
| 		bc[ 2] = ~st[ 3] & st[ 4]; | ||||
| 		bc[ 3] = ~st[ 4] & st[ 0]; | ||||
| 		bc[ 4] = ~st[ 0] & st[ 1]; | ||||
| 		st[ 0] ^= bc[ 0]; | ||||
| 		st[ 1] ^= bc[ 1]; | ||||
| 		st[ 2] ^= bc[ 2]; | ||||
| 		st[ 3] ^= bc[ 3]; | ||||
| 		st[ 4] ^= bc[ 4]; | ||||
| 
 | ||||
| 		bc[ 0] = ~st[ 6] & st[ 7]; | ||||
| 		bc[ 1] = ~st[ 7] & st[ 8]; | ||||
| 		bc[ 2] = ~st[ 8] & st[ 9]; | ||||
| 		bc[ 3] = ~st[ 9] & st[ 5]; | ||||
| 		bc[ 4] = ~st[ 5] & st[ 6]; | ||||
| 		st[ 5] ^= bc[ 0]; | ||||
| 		st[ 6] ^= bc[ 1]; | ||||
| 		st[ 7] ^= bc[ 2]; | ||||
| 		st[ 8] ^= bc[ 3]; | ||||
| 		st[ 9] ^= bc[ 4]; | ||||
| 
 | ||||
| 		bc[ 0] = ~st[11] & st[12]; | ||||
| 		bc[ 1] = ~st[12] & st[13]; | ||||
| 		bc[ 2] = ~st[13] & st[14]; | ||||
| 		bc[ 3] = ~st[14] & st[10]; | ||||
| 		bc[ 4] = ~st[10] & st[11]; | ||||
| 		st[10] ^= bc[ 0]; | ||||
| 		st[11] ^= bc[ 1]; | ||||
| 		st[12] ^= bc[ 2]; | ||||
| 		st[13] ^= bc[ 3]; | ||||
| 		st[14] ^= bc[ 4]; | ||||
| 
 | ||||
| 		bc[ 0] = ~st[16] & st[17]; | ||||
| 		bc[ 1] = ~st[17] & st[18]; | ||||
| 		bc[ 2] = ~st[18] & st[19]; | ||||
| 		bc[ 3] = ~st[19] & st[15]; | ||||
| 		bc[ 4] = ~st[15] & st[16]; | ||||
| 		st[15] ^= bc[ 0]; | ||||
| 		st[16] ^= bc[ 1]; | ||||
| 		st[17] ^= bc[ 2]; | ||||
| 		st[18] ^= bc[ 3]; | ||||
| 		st[19] ^= bc[ 4]; | ||||
| 
 | ||||
| 		bc[ 0] = ~st[21] & st[22]; | ||||
| 		bc[ 1] = ~st[22] & st[23]; | ||||
| 		bc[ 2] = ~st[23] & st[24]; | ||||
| 		bc[ 3] = ~st[24] & st[20]; | ||||
| 		bc[ 4] = ~st[20] & st[21]; | ||||
| 		st[20] ^= bc[ 0]; | ||||
| 		st[21] ^= bc[ 1]; | ||||
| 		st[22] ^= bc[ 2]; | ||||
| 		st[23] ^= bc[ 3]; | ||||
| 		st[24] ^= bc[ 4]; | ||||
| 
 | ||||
| 		/* Iota */ | ||||
| 		st[0] ^= keccakf_rndc[round]; | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 Ard Biesheuvel
						Ard Biesheuvel