libsec: write optimized _chachablock() function for amd64 / sse2

Do 4 quarterrounds in parallel using 128-bit
vector registers. For the second (diagonal) round, shuffle the
columns and then shuffle them back.

The code is rather obvious. The only trick is that, for the first
rotation of each quarterround, PSHUFLW/PSHUFHW is used to swap the
halfwords for the <<<16 rotation.
This commit is contained in:
cinap_lenrek 2017-11-20 00:10:35 +01:00
parent 1eb3739454
commit 077e719dfb
7 changed files with 112 additions and 36 deletions

View file

@ -3,6 +3,7 @@ APE=/sys/src/ape
LIB=/$objtype/lib/ape/libsec.a
FILES=\
chachablock\
md5block\
sha1block\
aesni\

View file

@ -11,7 +11,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\
sha1pickle.c md5pickle.c\
poly1305.c\
rc4.c\
chacha.c\
chacha.c chachablock.c\
salsa.c\
genrandom.c prng.c fastrand.c nfastrand.c\
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\

View file

@ -0,0 +1,74 @@
/*
 * ROTATE(n, v1, v2): rotate every 32-bit lane of v1 left by n bits
 * and leave the result in v2.  v1 is overwritten with the lanes
 * shifted left by n, so it must be a register the caller can lose.
 */
#define ROTATE(n, v1, v2) \
MOVO v1, v2; \
PSLLL $(n), v1; \
PSRLL $(32-n), v2; \
POR v1, v2
/*
 * _chachablock(x, rounds): run the ChaCha rounds in place over the
 * 16-word state that RARG points to.
 *
 * Each row of four words lives in one vector register
 *	X0 = x[0..3] (a), X1 = x[4..7] (b), X2 = x[8..11] (c), X3 = x[12..15] (d)
 * so every packed instruction performs the same step of four
 * quarterrounds at once.  X4 is scratch.
 *
 * One pass of the loop does a column round followed by a diagonal
 * round, hence the loop count is rounds/2.
 * NOTE(review): the count is only tested after the body, so a value
 * of rounds below 2 (CX == 0 on entry) would wrap and run ~2^32
 * passes; callers presumably always pass an even count >= 2 -- confirm.
 */
TEXT _chachablock(SB), 0, $0
/* load the four rows of the state */
MOVOU 0(RARG), X0
MOVOU 16(RARG), X1
MOVOU 32(RARG), X2
MOVOU 48(RARG), X3
/* CX = rounds/2, the number of double rounds */
MOVL rounds+8(FP), CX
SHRL $1, CX
_loop:
/* column round: a += b; d ^= a; d <<<= 16 */
PADDL X1, X0
PXOR X0, X3
/* ROTATE(16, X3, X3) */
/* swapping the two 16-bit halves of each lane is a rotate by 16 */
PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
/* c += d; b ^= c; b <<<= 12 */
PADDL X3, X2
MOVO X1, X4
PXOR X2, X4
ROTATE(12, X4, X1)
/* a += b; d ^= a; d <<<= 8 */
PADDL X1, X0
MOVO X0, X4
PXOR X3, X4
ROTATE(8, X4, X3)
/* c += d; b ^= c; b <<<= 7 */
PADDL X3, X2
MOVO X1, X4
PXOR X2, X4
ROTATE(7, X4, X1)
/*
 * rotate the lanes of rows b, c and d by 1, 2 and 3 positions so
 * that each diagonal of the state lines up in one column
 */
PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1
PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3
/* diagonal round: same instruction sequence as the column round */
PADDL X1, X0
PXOR X0, X3
/* ROTATE(16, X3, X3) */
PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
PADDL X3, X2
MOVO X1, X4
PXOR X2, X4
ROTATE(12, X4, X1)
PADDL X1, X0
MOVO X0, X4
PXOR X3, X4
ROTATE(8, X4, X3)
PADDL X3, X2
MOVO X1, X4
PXOR X2, X4
ROTATE(7, X4, X1)
/* undo the lane rotation so the rows are back in memory order */
PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1
PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3
DECL CX
JNE _loop
/* store the transformed rows back over the input state */
MOVOU X0, 0(RARG)
MOVOU X1, 16(RARG)
MOVOU X2, 32(RARG)
MOVOU X3, 48(RARG)
RET

View file

@ -3,6 +3,7 @@ objtype=amd64
LIB=/$objtype/lib/libsec.a
FILES=\
chachablock\
md5block\
sha1block\
aesni\

View file

@ -10,26 +10,13 @@ and including the changes to block number and nonce defined in RFC7539
#include "os.h"
#include <libsec.h>
enum{
Blockwords= ChachaBsize/sizeof(u32int)
};
/* from chachablock.$O */
extern void _chachablock(u32int x[16], int rounds);
/*
 * little-endian data order
 *
 * GET4(p) assembles a 32-bit word from the four bytes at p.  Each
 * byte is widened to u32int before shifting so that a top byte of
 * 0x80 or more is never shifted into the sign bit of an int.
 *
 * PUT4(p,v) stores the low 32 bits of v into the four bytes at p.
 * It is wrapped in do{}while(0) so it acts as a single statement,
 * e.g. as the body of an if without braces.  Both macros evaluate
 * their arguments more than once.
 */
#define GET4(p) ((u32int)(p)[0]|((u32int)(p)[1]<<8)|((u32int)(p)[2]<<16)|((u32int)(p)[3]<<24))
#define PUT4(p,v) do{ (p)[0]=(v);(p)[1]=(v)>>8;(p)[2]=(v)>>16;(p)[3]=(v)>>24; }while(0)
/* rotate the 32-bit word v left by c bits */
#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))
/*
 * One quarter round over the words x[ia], x[ib], x[ic] and x[id].
 * Expects an array named x in the enclosing scope; expands to a
 * braced block, so it is invoked without a trailing semicolon.
 */
#define QUARTERROUND(ia,ib,ic,id) { \
	u32int wa, wb, wc, wd; \
	wa = x[ia]; \
	wb = x[ib]; \
	wc = x[ic]; \
	wd = x[id]; \
	wa += wb; wd ^= wa; wd = ROTATE(wd,16); \
	wc += wd; wb ^= wc; wb = ROTATE(wb,12); \
	wa += wb; wd ^= wa; wd = ROTATE(wd, 8); \
	wc += wd; wb ^= wc; wb = ROTATE(wb, 7); \
	x[ia] = wa; \
	x[ib] = wb; \
	x[ic] = wc; \
	x[id] = wd; \
}
#define ENCRYPT(s, x, y, d) {\
u32int v; \
v = GET4(s); \
@ -87,22 +74,6 @@ setupChachastate(Chachastate *s, uchar *key, ulong keylen, uchar *iv, ulong ivle
chacha_setiv(s, iv);
}
/*
 * Run the cipher rounds over the state x in place.  Each pass does
 * a column round followed by a diagonal round, so rounds is used up
 * two at a time; nothing happens when rounds is zero or negative.
 */
static void
dorounds(u32int x[Blockwords], int rounds)
{
	while(rounds > 0){
		/* column round */
		QUARTERROUND(0, 4, 8,12)
		QUARTERROUND(1, 5, 9,13)
		QUARTERROUND(2, 6,10,14)
		QUARTERROUND(3, 7,11,15)

		/* diagonal round */
		QUARTERROUND(0, 5,10,15)
		QUARTERROUND(1, 6,11,12)
		QUARTERROUND(2, 7, 8,13)
		QUARTERROUND(3, 4, 9,14)

		rounds -= 2;
	}
}
static void
hchachablock(uchar h[32], Chachastate *s)
{
@ -125,7 +96,7 @@ hchachablock(uchar h[32], Chachastate *s)
x[14] = s->input[14];
x[15] = s->input[15];
dorounds(x, s->rounds);
_chachablock(x, s->rounds);
PUT4(h+0*4, x[0]);
PUT4(h+1*4, x[1]);
@ -183,7 +154,7 @@ chacha_setblock(Chachastate *s, u64int blockno)
static void
encryptblock(Chachastate *s, uchar *src, uchar *dst)
{
u32int x[Blockwords];
u32int x[16];
int i;
x[0] = s->input[0];
@ -202,7 +173,7 @@ encryptblock(Chachastate *s, uchar *src, uchar *dst)
x[13] = s->input[13];
x[14] = s->input[14];
x[15] = s->input[15];
dorounds(x, s->rounds);
_chachablock(x, s->rounds);
for(i=0; i<nelem(x); i+=4){
ENCRYPT(src, x[i], s->input[i], dst);

View file

@ -0,0 +1,29 @@
#include "os.h"
/* rotate the 32-bit word v left by c bits */
#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))

/*
 * One quarter round over the words x[ia], x[ib], x[ic] and x[id].
 * Expects an array named x in the enclosing scope.
 */
#define QUARTERROUND(ia,ib,ic,id) do{ \
	u32int wa = x[ia], wb = x[ib], wc = x[ic], wd = x[id]; \
	wa += wb; wd ^= wa; wd = ROTATE(wd,16); \
	wc += wd; wb ^= wc; wb = ROTATE(wb,12); \
	wa += wb; wd ^= wa; wd = ROTATE(wd, 8); \
	wc += wd; wb ^= wc; wb = ROTATE(wb, 7); \
	x[ia] = wa; x[ib] = wb; x[ic] = wc; x[id] = wd; \
}while(0)

/*
 * Portable version of the block function: run the cipher rounds
 * over the 16-word state x in place.  Each pass does a column round
 * followed by a diagonal round, so rounds is used up two at a time;
 * nothing happens when rounds is zero or negative.
 */
void
_chachablock(u32int x[16], int rounds)
{
	int left;

	left = rounds;
	while(left > 0){
		/* column round */
		QUARTERROUND(0, 4, 8,12);
		QUARTERROUND(1, 5, 9,13);
		QUARTERROUND(2, 6,10,14);
		QUARTERROUND(3, 7,11,15);

		/* diagonal round */
		QUARTERROUND(0, 5,10,15);
		QUARTERROUND(1, 6,11,12);
		QUARTERROUND(2, 7, 8,13);
		QUARTERROUND(3, 4, 9,14);

		left -= 2;
	}
}

View file

@ -10,7 +10,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\
sha1pickle.c md5pickle.c\
poly1305.c\
rc4.c\
chacha.c\
chacha.c chachablock.c\
salsa.c\
genrandom.c prng.c fastrand.c nfastrand.c\
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\