libsec: unroll portable sha2block functions

- unroll the loops
- rotate the taps on each step, avoiding copies
- simplify boolean formulas for Ch() and Maj()

this yields around a 40% throughput increase on 32/64bit
archs for sha2_256 and sha2_512 on amd64.
This commit is contained in:
cinap_lenrek 2017-11-30 02:16:27 +01:00
parent d51d54442e
commit c09cd2882c
2 changed files with 193 additions and 59 deletions

View file

@ -1,8 +1,6 @@
/*
* sha2_512 block cipher
* sha2_512 block cipher - unrolled version
*
* Implementation straight from Federal Information Processing Standards
* publication 180-2 (+Change Notice to include SHA-224) August 1, 2002
* note: the following upper and lower case macro names are distinct
* and reflect the functions defined in FIPS pub. 180-2.
*/
@ -14,8 +12,8 @@
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x) >> 6))
#define SIGMA0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
#define SIGMA1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/*
 * Ch: for each bit position, yields the bit of y where x is set and
 * the bit of z where x is clear.
 * Maj: for each bit position, yields 1 when at least two of x, y, z
 * have that bit set.
 */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) (((x) | (y)) & ((z) | ((x) & (y))))
/*
* first 64 bits of the fractional parts of cube roots of
@ -41,14 +39,13 @@ static u64int K512[80] = {
0xca273eceea26619cLL, 0xd186b8c721c0c207LL, 0xeada7dd6cde0eb1eLL, 0xf57d4f7fee6ed178LL,
0x06f067aa72176fbaLL, 0x0a637dc5a2c898a6LL, 0x113f9804bef90daeLL, 0x1b710b35131c471bLL,
0x28db77f523047d84LL, 0x32caab7b40c72493LL, 0x3c9ebe0a15c9bebcLL, 0x431d67c49c100d4cLL,
0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL, 0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL };
0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL, 0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL
};
void
_sha2block128(uchar *p, ulong len, u64int *s)
{
u64int a, b, c, d, e, f, g, h, t1, t2;
u64int *kp, *wp;
u64int w[80];
u64int w[16], a, b, c, d, e, f, g, h;
uchar *end;
/* at this point, we have a multiple of 64 bytes */
@ -62,33 +59,111 @@ _sha2block128(uchar *p, ulong len, u64int *s)
g = s[6];
h = s[7];
for(wp = w; wp < &w[16]; wp++, p += 8)
wp[0] = ((vlong)p[0])<<56 | ((vlong)p[1])<<48 |
((vlong)p[2])<<40 | ((vlong)p[3])<<32 |
p[4] << 24 | p[5] << 16 | p[6] << 8 | p[7];
for(; wp < &w[80]; wp++) {
u64int s0, s1;
/*
 * STEP(a,b,c,d,e,f,g,h,i): round i of the compression, done in place.
 *
 * i < 16: assemble w[i] from the next 8 bytes at p, most significant
 * byte first, and advance p by 8.
 * i >= 16: add sigma1(w[(i-2)&15]), w[(i-7)&15] and sigma0(w[(i-15)&15])
 * into w[i&15]. Only 16 schedule words are kept; slot i&15 was last
 * written 16 rounds earlier. Note that `i-2&15' parses as `(i-2)&15'
 * because binary - binds tighter than &.
 *
 * Then h accumulates SIGMA1(e) + Ch(e,f,g) + K512[i] + w[i&15], d += h,
 * and h additionally receives SIGMA0(a) + Maj(a,b,c). No values are
 * copied between the working variables; successive invocations pass
 * them rotated by one position instead.
 *
 * NOTE(review): the low-half expression (p[4]<<24 | ...) is not cast
 * before being OR-ed into the u64int value. If the compiler promotes
 * uchar to signed int, a byte p[4] >= 0x80 would sign-extend into the
 * upper 32 bits -- confirm the promotion rules of the target compiler.
 */
#define STEP(a,b,c,d,e,f,g,h,i) \
if(i < 16) { \
w[i] = (u64int)(p[0]<<24 | p[1]<<16 | p[2]<<8 | p[3])<<32 | \
(p[4]<<24 | p[5]<<16 | p[6]<<8 | p[7]); \
p += 8; \
} else { \
u64int s0, s1; \
s1 = sigma1(w[i-2&15]); \
s0 = sigma0(w[i-15&15]); \
w[i&15] += s1 + w[i-7&15] + s0; \
} \
h += SIGMA1(e) + Ch(e,f,g) + K512[i] + w[i&15]; \
d += h; \
h += SIGMA0(a) + Maj(a,b,c);
s0 = sigma0(wp[-15]);
s1 = sigma1(wp[-2]);
// wp[0] = sigma1(wp[-2]) + wp[-7] + sigma0(wp[-15]) + wp[-16];
wp[0] = s1 + wp[-7] + s0 + wp[-16];
}
STEP(a,b,c,d,e,f,g,h,0);
STEP(h,a,b,c,d,e,f,g,1);
STEP(g,h,a,b,c,d,e,f,2);
STEP(f,g,h,a,b,c,d,e,3);
STEP(e,f,g,h,a,b,c,d,4);
STEP(d,e,f,g,h,a,b,c,5);
STEP(c,d,e,f,g,h,a,b,6);
STEP(b,c,d,e,f,g,h,a,7);
for(kp = K512, wp = w; wp < &w[80]; ) {
t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
t2 = SIGMA0(a) + Maj(a,b,c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
STEP(a,b,c,d,e,f,g,h,8);
STEP(h,a,b,c,d,e,f,g,9);
STEP(g,h,a,b,c,d,e,f,10);
STEP(f,g,h,a,b,c,d,e,11);
STEP(e,f,g,h,a,b,c,d,12);
STEP(d,e,f,g,h,a,b,c,13);
STEP(c,d,e,f,g,h,a,b,14);
STEP(b,c,d,e,f,g,h,a,15);
STEP(a,b,c,d,e,f,g,h,16);
STEP(h,a,b,c,d,e,f,g,17);
STEP(g,h,a,b,c,d,e,f,18);
STEP(f,g,h,a,b,c,d,e,19);
STEP(e,f,g,h,a,b,c,d,20);
STEP(d,e,f,g,h,a,b,c,21);
STEP(c,d,e,f,g,h,a,b,22);
STEP(b,c,d,e,f,g,h,a,23);
STEP(a,b,c,d,e,f,g,h,24);
STEP(h,a,b,c,d,e,f,g,25);
STEP(g,h,a,b,c,d,e,f,26);
STEP(f,g,h,a,b,c,d,e,27);
STEP(e,f,g,h,a,b,c,d,28);
STEP(d,e,f,g,h,a,b,c,29);
STEP(c,d,e,f,g,h,a,b,30);
STEP(b,c,d,e,f,g,h,a,31);
STEP(a,b,c,d,e,f,g,h,32);
STEP(h,a,b,c,d,e,f,g,33);
STEP(g,h,a,b,c,d,e,f,34);
STEP(f,g,h,a,b,c,d,e,35);
STEP(e,f,g,h,a,b,c,d,36);
STEP(d,e,f,g,h,a,b,c,37);
STEP(c,d,e,f,g,h,a,b,38);
STEP(b,c,d,e,f,g,h,a,39);
STEP(a,b,c,d,e,f,g,h,40);
STEP(h,a,b,c,d,e,f,g,41);
STEP(g,h,a,b,c,d,e,f,42);
STEP(f,g,h,a,b,c,d,e,43);
STEP(e,f,g,h,a,b,c,d,44);
STEP(d,e,f,g,h,a,b,c,45);
STEP(c,d,e,f,g,h,a,b,46);
STEP(b,c,d,e,f,g,h,a,47);
STEP(a,b,c,d,e,f,g,h,48);
STEP(h,a,b,c,d,e,f,g,49);
STEP(g,h,a,b,c,d,e,f,50);
STEP(f,g,h,a,b,c,d,e,51);
STEP(e,f,g,h,a,b,c,d,52);
STEP(d,e,f,g,h,a,b,c,53);
STEP(c,d,e,f,g,h,a,b,54);
STEP(b,c,d,e,f,g,h,a,55);
STEP(a,b,c,d,e,f,g,h,56);
STEP(h,a,b,c,d,e,f,g,57);
STEP(g,h,a,b,c,d,e,f,58);
STEP(f,g,h,a,b,c,d,e,59);
STEP(e,f,g,h,a,b,c,d,60);
STEP(d,e,f,g,h,a,b,c,61);
STEP(c,d,e,f,g,h,a,b,62);
STEP(b,c,d,e,f,g,h,a,63);
STEP(a,b,c,d,e,f,g,h,64);
STEP(h,a,b,c,d,e,f,g,65);
STEP(g,h,a,b,c,d,e,f,66);
STEP(f,g,h,a,b,c,d,e,67);
STEP(e,f,g,h,a,b,c,d,68);
STEP(d,e,f,g,h,a,b,c,69);
STEP(c,d,e,f,g,h,a,b,70);
STEP(b,c,d,e,f,g,h,a,71);
STEP(a,b,c,d,e,f,g,h,72);
STEP(h,a,b,c,d,e,f,g,73);
STEP(g,h,a,b,c,d,e,f,74);
STEP(f,g,h,a,b,c,d,e,75);
STEP(e,f,g,h,a,b,c,d,76);
STEP(d,e,f,g,h,a,b,c,77);
STEP(c,d,e,f,g,h,a,b,78);
STEP(b,c,d,e,f,g,h,a,79);
/* save state */
s[0] += a;
s[1] += b;
s[2] += c;

View file

@ -1,8 +1,6 @@
/*
* sha2_256 block cipher
* sha2_256 block cipher - unrolled version
*
* Implementation straight from Federal Information Processing Standards
* publication 180-2 (+Change Notice to include SHA-224) August 1, 2002
* note: the following upper and lower case macro names are distinct
* and reflect the functions defined in FIPS pub. 180-2.
*/
@ -15,8 +13,8 @@
#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x) >> 10))
#define SIGMA0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22))
#define SIGMA1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25))
#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
/*
 * Ch: for each bit position, yields the bit of y where x is set and
 * the bit of z where x is clear.
 * Maj: for each bit position, yields 1 when at least two of x, y, z
 * have that bit set.
 */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) (((x) | (y)) & ((z) | ((x) & (y))))
/*
* first 32 bits of the fractional parts of cube roots of
@ -44,9 +42,7 @@ static u32int K256[64] = {
void
_sha2block64(uchar *p, ulong len, u32int *s)
{
u32int a, b, c, d, e, f, g, h, t1, t2;
u32int *kp, *wp;
u32int w[64];
u32int w[16], a, b, c, d, e, f, g, h;
uchar *end;
/* at this point, we have a multiple of 64 bytes */
@ -60,26 +56,89 @@ _sha2block64(uchar *p, ulong len, u32int *s)
g = s[6];
h = s[7];
for(wp = w; wp < &w[16]; wp++, p += 4)
wp[0] = p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
for(; wp < &w[64]; wp++)
wp[0] = sigma1(wp[-2]) + wp[-7] +
sigma0(wp[-15]) + wp[-16];
/*
 * STEP(a,b,c,d,e,f,g,h,i): round i of the compression, done in place.
 *
 * i < 16: assemble w[i] from the next 4 bytes at p, most significant
 * byte first, and advance p by 4.
 * i >= 16: add sigma1(w[(i-2)&15]), w[(i-7)&15] and sigma0(w[(i-15)&15])
 * into w[i&15]. Only 16 schedule words are kept; slot i&15 was last
 * written 16 rounds earlier. Note that `i-2&15' parses as `(i-2)&15'
 * because binary - binds tighter than &.
 *
 * Then h accumulates SIGMA1(e) + Ch(e,f,g) + K256[i] + w[i&15], d += h,
 * and h additionally receives SIGMA0(a) + Maj(a,b,c). No values are
 * copied between the working variables; successive invocations pass
 * them rotated by one position instead.
 */
#define STEP(a,b,c,d,e,f,g,h,i) \
if(i < 16) {\
w[i] = p[0]<<24 | p[1]<<16 | p[2]<<8 | p[3]; \
p += 4; \
} else { \
w[i&15] += sigma1(w[i-2&15]) + w[i-7&15] + sigma0(w[i-15&15]); \
} \
h += SIGMA1(e) + Ch(e,f,g) + K256[i] + w[i&15]; \
d += h; \
h += SIGMA0(a) + Maj(a,b,c);
for(kp = K256, wp = w; wp < &w[64]; ) {
t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
t2 = SIGMA0(a) + Maj(a,b,c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
}
STEP(a,b,c,d,e,f,g,h,0);
STEP(h,a,b,c,d,e,f,g,1);
STEP(g,h,a,b,c,d,e,f,2);
STEP(f,g,h,a,b,c,d,e,3);
STEP(e,f,g,h,a,b,c,d,4);
STEP(d,e,f,g,h,a,b,c,5);
STEP(c,d,e,f,g,h,a,b,6);
STEP(b,c,d,e,f,g,h,a,7);
STEP(a,b,c,d,e,f,g,h,8);
STEP(h,a,b,c,d,e,f,g,9);
STEP(g,h,a,b,c,d,e,f,10);
STEP(f,g,h,a,b,c,d,e,11);
STEP(e,f,g,h,a,b,c,d,12);
STEP(d,e,f,g,h,a,b,c,13);
STEP(c,d,e,f,g,h,a,b,14);
STEP(b,c,d,e,f,g,h,a,15);
STEP(a,b,c,d,e,f,g,h,16);
STEP(h,a,b,c,d,e,f,g,17);
STEP(g,h,a,b,c,d,e,f,18);
STEP(f,g,h,a,b,c,d,e,19);
STEP(e,f,g,h,a,b,c,d,20);
STEP(d,e,f,g,h,a,b,c,21);
STEP(c,d,e,f,g,h,a,b,22);
STEP(b,c,d,e,f,g,h,a,23);
STEP(a,b,c,d,e,f,g,h,24);
STEP(h,a,b,c,d,e,f,g,25);
STEP(g,h,a,b,c,d,e,f,26);
STEP(f,g,h,a,b,c,d,e,27);
STEP(e,f,g,h,a,b,c,d,28);
STEP(d,e,f,g,h,a,b,c,29);
STEP(c,d,e,f,g,h,a,b,30);
STEP(b,c,d,e,f,g,h,a,31);
STEP(a,b,c,d,e,f,g,h,32);
STEP(h,a,b,c,d,e,f,g,33);
STEP(g,h,a,b,c,d,e,f,34);
STEP(f,g,h,a,b,c,d,e,35);
STEP(e,f,g,h,a,b,c,d,36);
STEP(d,e,f,g,h,a,b,c,37);
STEP(c,d,e,f,g,h,a,b,38);
STEP(b,c,d,e,f,g,h,a,39);
STEP(a,b,c,d,e,f,g,h,40);
STEP(h,a,b,c,d,e,f,g,41);
STEP(g,h,a,b,c,d,e,f,42);
STEP(f,g,h,a,b,c,d,e,43);
STEP(e,f,g,h,a,b,c,d,44);
STEP(d,e,f,g,h,a,b,c,45);
STEP(c,d,e,f,g,h,a,b,46);
STEP(b,c,d,e,f,g,h,a,47);
STEP(a,b,c,d,e,f,g,h,48);
STEP(h,a,b,c,d,e,f,g,49);
STEP(g,h,a,b,c,d,e,f,50);
STEP(f,g,h,a,b,c,d,e,51);
STEP(e,f,g,h,a,b,c,d,52);
STEP(d,e,f,g,h,a,b,c,53);
STEP(c,d,e,f,g,h,a,b,54);
STEP(b,c,d,e,f,g,h,a,55);
STEP(a,b,c,d,e,f,g,h,56);
STEP(h,a,b,c,d,e,f,g,57);
STEP(g,h,a,b,c,d,e,f,58);
STEP(f,g,h,a,b,c,d,e,59);
STEP(e,f,g,h,a,b,c,d,60);
STEP(d,e,f,g,h,a,b,c,61);
STEP(c,d,e,f,g,h,a,b,62);
STEP(b,c,d,e,f,g,h,a,63);
/* save state */
s[0] += a;
s[1] += b;
s[2] += c;