libsec: write optimized _chachablock() function for amd64 / sse2
Do four quarter rounds in parallel using 128-bit vector registers. For the second (diagonal) half of each double round, shuffle the columns and then shuffle them back afterwards. The code is rather obvious; the only trick is that the <<<16 rotation, the first rotation of each quarter round, is done with PSHUFLW/PSHUFHW, which swap the halfwords of every 32-bit word.
This commit is contained in:
parent
1eb3739454
commit
077e719dfb
|
@ -3,6 +3,7 @@ APE=/sys/src/ape
|
|||
|
||||
LIB=/$objtype/lib/ape/libsec.a
|
||||
FILES=\
|
||||
chachablock\
|
||||
md5block\
|
||||
sha1block\
|
||||
aesni\
|
||||
|
|
|
@ -11,7 +11,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\
|
|||
sha1pickle.c md5pickle.c\
|
||||
poly1305.c\
|
||||
rc4.c\
|
||||
chacha.c\
|
||||
chacha.c chachablock.c\
|
||||
salsa.c\
|
||||
genrandom.c prng.c fastrand.c nfastrand.c\
|
||||
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\
|
||||
|
|
74
sys/src/libsec/amd64/chachablock.s
Normal file
74
sys/src/libsec/amd64/chachablock.s
Normal file
|
@ -0,0 +1,74 @@
|
|||
/*
 * ChaCha block core for amd64 using SSE2.
 *
 * The 16-word state at RARG is held in X0..X3, one row of four 32-bit
 * words per register, so each vector instruction performs the same
 * step of four quarter rounds at once.  X4 is scratch.
 */

/*
 * ROTATE(n, v1, v2): v2 = each 32-bit lane of v1 rotated left by n bits.
 * v1 is clobbered (it is left holding v1<<n).
 */
#define ROTATE(n, v1, v2) \
	MOVO v1, v2; \
	PSLLL $(n), v1; \
	PSRLL $(32-n), v2; \
	POR v1, v2

/*
 * void _chachablock(u32int x[16], int rounds)
 *
 * Runs the ChaCha rounds over x in place.  Each pass of the loop is a
 * column round followed by a diagonal round, i.e. two rounds.  The
 * pass count is (rounds+1)/2 for rounds > 0 and zero otherwise, which
 * matches the portable C implementation of _chachablock.
 */
TEXT _chachablock(SB), 0, $0
	MOVOU 0(RARG), X0
	MOVOU 16(RARG), X1
	MOVOU 32(RARG), X2
	MOVOU 48(RARG), X3

	/*
	 * Without this guard a count of 0 or 1 would enter the loop with
	 * CX == 0 and DECL/JNE would wrap around through 2^32 passes.
	 */
	MOVL rounds+8(FP), CX
	CMPL CX, $0
	JLE _done
	INCL CX			/* round an odd count up */
	SHRL $1, CX		/* two rounds per pass; CX >= 1 here */

_loop:
	/* column round: a += b; d ^= a; d <<<= 16 */
	PADDL X1, X0
	PXOR X0, X3
	/* ROTATE(16, X3, X3): swap the halfwords of every 32-bit lane */
	PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
	PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3

	/* c += d; b ^= c; b <<<= 12 */
	PADDL X3, X2
	MOVO X1, X4
	PXOR X2, X4
	ROTATE(12, X4, X1)

	/* a += b; d ^= a; d <<<= 8 */
	PADDL X1, X0
	MOVO X0, X4
	PXOR X3, X4
	ROTATE(8, X4, X3)

	/* c += d; b ^= c; b <<<= 7 */
	PADDL X3, X2
	MOVO X1, X4
	PXOR X2, X4
	ROTATE(7, X4, X1)

	/*
	 * Rotate the lanes of rows b, c, d by 1, 2, 3 positions so that
	 * the diagonals (0,5,10,15), (1,6,11,12), ... line up as columns.
	 */
	PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1
	PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
	PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3

	/* diagonal round: same four steps as above */
	PADDL X1, X0
	PXOR X0, X3
	/* ROTATE(16, X3, X3) */
	PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
	PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3

	PADDL X3, X2
	MOVO X1, X4
	PXOR X2, X4
	ROTATE(12, X4, X1)

	PADDL X1, X0
	MOVO X0, X4
	PXOR X3, X4
	ROTATE(8, X4, X3)

	PADDL X3, X2
	MOVO X1, X4
	PXOR X2, X4
	ROTATE(7, X4, X1)

	/* undo the lane rotation: rows b, c, d go back by 3, 2, 1 */
	PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1
	PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
	PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3

	DECL CX
	JNE _loop

_done:
	MOVOU X0, 0(RARG)
	MOVOU X1, 16(RARG)
	MOVOU X2, 32(RARG)
	MOVOU X3, 48(RARG)
	RET
|
|
@ -3,6 +3,7 @@ objtype=amd64
|
|||
|
||||
LIB=/$objtype/lib/libsec.a
|
||||
FILES=\
|
||||
chachablock\
|
||||
md5block\
|
||||
sha1block\
|
||||
aesni\
|
||||
|
|
|
@ -10,26 +10,13 @@ and including the changes to block number and nonce defined in RFC7539
|
|||
#include "os.h"
|
||||
#include <libsec.h>
|
||||
|
||||
/* presumably the number of 32-bit words in one ChaCha block; ChachaBsize is declared outside this hunk */
enum{
|
||||
Blockwords= ChachaBsize/sizeof(u32int)
|
||||
};
|
||||
/* from chachablock.$O */
|
||||
/* runs the ChaCha rounds over the 16-word state x in place */
extern void _chachablock(u32int x[16], int rounds);
|
||||
|
||||
/* little-endian data order */
|
||||
/*
 * GET4 assembles four consecutive bytes at p into one value, p[0] being
 * the least significant byte.
 * NOTE(review): if p points at uchar the operands are promoted to int
 * before the shifts; confirm the result is only ever used as u32int.
 */
#define GET4(p) ((p)[0]|((p)[1]<<8)|((p)[2]<<16)|((p)[3]<<24))
|
||||
/*
 * PUT4 stores v at p as four bytes, least significant first.  It expands
 * to four separate statements and evaluates p and v four times each, so
 * it must not be the unbraced body of an if/else or loop.
 */
#define PUT4(p,v) (p)[0]=(v);(p)[1]=(v)>>8;(p)[2]=(v)>>16;(p)[3]=(v)>>24
|
||||
|
||||
/* rotate v left by c bits; used below with c = 16, 12, 8 and 7 on u32int values */
#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))
|
||||
|
||||
/*
 * One quarter round on the four state words x[ia], x[ib], x[ic], x[id].
 * Expects an array named x in the expanding scope; works on local copies
 * and writes them back at the end.  Expands to a braced block, so call
 * sites need no trailing semicolon.
 */
#define QUARTERROUND(ia,ib,ic,id) { \
|
||||
u32int a, b, c, d, t; \
|
||||
a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; \
|
||||
a += b; t = d^a; d = ROTATE(t,16); \
|
||||
c += d; t = b^c; b = ROTATE(t,12); \
|
||||
a += b; t = d^a; d = ROTATE(t, 8); \
|
||||
c += d; t = b^c; b = ROTATE(t, 7); \
|
||||
x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; \
|
||||
}
|
||||
|
||||
#define ENCRYPT(s, x, y, d) {\
|
||||
u32int v; \
|
||||
v = GET4(s); \
|
||||
|
@ -87,22 +74,6 @@ setupChachastate(Chachastate *s, uchar *key, ulong keylen, uchar *iv, ulong ivle
|
|||
chacha_setiv(s, iv);
|
||||
}
|
||||
|
||||
static void
|
||||
dorounds(u32int x[Blockwords], int rounds)
|
||||
{
|
||||
for(; rounds > 0; rounds -= 2) {
|
||||
QUARTERROUND(0, 4, 8,12)
|
||||
QUARTERROUND(1, 5, 9,13)
|
||||
QUARTERROUND(2, 6,10,14)
|
||||
QUARTERROUND(3, 7,11,15)
|
||||
|
||||
QUARTERROUND(0, 5,10,15)
|
||||
QUARTERROUND(1, 6,11,12)
|
||||
QUARTERROUND(2, 7, 8,13)
|
||||
QUARTERROUND(3, 4, 9,14)
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
hchachablock(uchar h[32], Chachastate *s)
|
||||
{
|
||||
|
@ -125,7 +96,7 @@ hchachablock(uchar h[32], Chachastate *s)
|
|||
x[14] = s->input[14];
|
||||
x[15] = s->input[15];
|
||||
|
||||
dorounds(x, s->rounds);
|
||||
_chachablock(x, s->rounds);
|
||||
|
||||
PUT4(h+0*4, x[0]);
|
||||
PUT4(h+1*4, x[1]);
|
||||
|
@ -183,7 +154,7 @@ chacha_setblock(Chachastate *s, u64int blockno)
|
|||
static void
|
||||
encryptblock(Chachastate *s, uchar *src, uchar *dst)
|
||||
{
|
||||
u32int x[Blockwords];
|
||||
u32int x[16];
|
||||
int i;
|
||||
|
||||
x[0] = s->input[0];
|
||||
|
@ -202,7 +173,7 @@ encryptblock(Chachastate *s, uchar *src, uchar *dst)
|
|||
x[13] = s->input[13];
|
||||
x[14] = s->input[14];
|
||||
x[15] = s->input[15];
|
||||
dorounds(x, s->rounds);
|
||||
_chachablock(x, s->rounds);
|
||||
|
||||
for(i=0; i<nelem(x); i+=4){
|
||||
ENCRYPT(src, x[i], s->input[i], dst);
|
||||
|
|
29
sys/src/libsec/port/chachablock.c
Normal file
29
sys/src/libsec/port/chachablock.c
Normal file
|
@ -0,0 +1,29 @@
|
|||
#include "os.h"
|
||||
|
||||
/* rotate v left by c bits; used below with c = 16, 12, 8 and 7 on u32int values */
#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))

/*
 * One ChaCha quarter round on the state words x[ia], x[ib], x[ic], x[id].
 * Expects an array named x in the expanding scope.  The four words are
 * copied into locals, mixed with add / xor / rotate, and stored back.
 */
#define QUARTERROUND(ia,ib,ic,id) do{ \
	u32int wa = x[ia], wb = x[ib], wc = x[ic], wd = x[id]; \
	wa += wb; wd ^= wa; wd = ROTATE(wd, 16); \
	wc += wd; wb ^= wc; wb = ROTATE(wb, 12); \
	wa += wb; wd ^= wa; wd = ROTATE(wd, 8); \
	wc += wd; wb ^= wc; wb = ROTATE(wb, 7); \
	x[ia] = wa; x[ib] = wb; x[ic] = wc; x[id] = wd; \
}while(0)

/*
 * Portable ChaCha block core: run the rounds over the 16-word state x
 * in place.  Each pass of the loop is a column round plus a diagonal
 * round and consumes two rounds of the count, so an odd count gets one
 * extra round and a count <= 0 leaves x untouched.
 */
void
_chachablock(u32int x[16], int rounds)
{
	while(rounds > 0){
		/* column round */
		QUARTERROUND(0, 4, 8, 12);
		QUARTERROUND(1, 5, 9, 13);
		QUARTERROUND(2, 6, 10, 14);
		QUARTERROUND(3, 7, 11, 15);

		/* diagonal round */
		QUARTERROUND(0, 5, 10, 15);
		QUARTERROUND(1, 6, 11, 12);
		QUARTERROUND(2, 7, 8, 13);
		QUARTERROUND(3, 4, 9, 14);

		rounds -= 2;
	}
}
|
|
@ -10,7 +10,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\
|
|||
sha1pickle.c md5pickle.c\
|
||||
poly1305.c\
|
||||
rc4.c\
|
||||
chacha.c\
|
||||
chacha.c chachablock.c\
|
||||
salsa.c\
|
||||
genrandom.c prng.c fastrand.c nfastrand.c\
|
||||
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\
|
||||
|
|
Loading…
Reference in a new issue