8c, 6c: native ROL (cyclic shift) instruction support, improve peephole optimizers

introduce rolor() function to substitute (a << c) | (a >> (bits(a) - c))
with (a <<< c) where <<< is cyclic rotation and c is constant.
this almost doubles the speed of chacha encryption on 386 and amd64.

the peephole optimizer used to stop when it hit a shift or rol
instruction when attempting to eliminate moves by register
substitution. but we do not have to as long as the shift count
operand is not CX (which cannot be substituted) and CX is not
a subject for substitution.
This commit is contained in:
cinap_lenrek 2016-06-09 23:12:46 +02:00
parent 5cdabc5eb1
commit a00b6bdbfa
10 changed files with 119 additions and 33 deletions

View file

@ -159,6 +159,7 @@ cgen(Node *n, Node *nn)
regfree(&nod); regfree(&nod);
break; break;
case OROL:
case OLSHR: case OLSHR:
case OASHL: case OASHL:
case OASHR: case OASHR:

View file

@ -370,15 +370,11 @@ subprop(Reg *r0)
break; break;
p = r->prog; p = r->prog;
switch(p->as) { switch(p->as) {
case ACALL:
return 0;
case AIMULL: case AIMULL:
case AIMULQ: case AIMULQ:
case AIMULW: case AIMULW:
if(p->to.type != D_NONE) if(p->to.type != D_NONE)
break; break;
case ADIVB: case ADIVB:
case ADIVL: case ADIVL:
case ADIVQ: case ADIVQ:
@ -393,6 +389,19 @@ subprop(Reg *r0)
case AMULQ: case AMULQ:
case AMULW: case AMULW:
case ACWD:
case ACDQ:
case ACQO:
case AREP:
case AREPN:
case ALOOP:
case ALOOPEQ:
case ALOOPNE:
case ACALL:
return 0;
case AROLB: case AROLB:
case AROLL: case AROLL:
case AROLQ: case AROLQ:
@ -417,14 +426,9 @@ subprop(Reg *r0)
case ASHRL: case ASHRL:
case ASHRQ: case ASHRQ:
case ASHRW: case ASHRW:
if(p->from.type == D_CX && v1->type == D_CX)
case AREP:
case AREPN:
case ACWD:
case ACDQ:
case ACQO:
return 0; return 0;
break;
case AORL: case AORL:
case AORQ: case AORQ:

View file

@ -120,7 +120,6 @@ xcom(Node *n)
*l = *(n->left); *l = *(n->left);
l->xoffset += r->vconst; l->xoffset += r->vconst;
n->left = l; n->left = l;
r = n->right;
goto brk; goto brk;
} }
break; break;
@ -212,7 +211,6 @@ xcom(Node *n)
if(g >= 0) { if(g >= 0) {
n->left = r; n->left = r;
n->right = l; n->right = l;
l = r;
r = n->right; r = n->right;
} }
g = vlog(r); g = vlog(r);
@ -288,6 +286,12 @@ xcom(Node *n)
indexshift(n); indexshift(n);
break; break;
case OOR:
xcom(l);
xcom(r);
rolor(n);
break;
default: default:
if(l != Z) if(l != Z)
xcom(l); xcom(l);
@ -298,6 +302,8 @@ xcom(Node *n)
brk: brk:
if(n->addable >= 10) if(n->addable >= 10)
return; return;
l = n->left;
r = n->right;
if(l != Z) if(l != Z)
n->complex = l->complex; n->complex = l->complex;
if(r != Z) { if(r != Z) {
@ -344,6 +350,7 @@ brk:
} }
break; break;
case OROL:
case OLSHR: case OLSHR:
case OASHL: case OASHL:
case OASHR: case OASHR:

View file

@ -1305,6 +1305,16 @@ gopcode(int o, Type *ty, Node *f, Node *t)
a = ASALQ; a = ASALQ;
break; break;
case OROL:
a = AROLL;
if(et == TCHAR || et == TUCHAR)
a = AROLB;
if(et == TSHORT || et == TUSHORT)
a = AROLW;
if(et == TVLONG || et == TUVLONG || et == TIND)
a = AROLQ;
break;
case OFUNC: case OFUNC:
a = ACALL; a = ACALL;
break; break;

View file

@ -178,6 +178,7 @@ cgen(Node *n, Node *nn)
regfree(&nod); regfree(&nod);
break; break;
case OROL:
case OLSHR: case OLSHR:
case OASHL: case OASHL:
case OASHR: case OASHR:

View file

@ -264,9 +264,6 @@ subprop(Reg *r0)
break; break;
p = r->prog; p = r->prog;
switch(p->as) { switch(p->as) {
case ACALL:
return 0;
case AIMULL: case AIMULL:
case AIMULW: case AIMULW:
if(p->to.type != D_NONE) if(p->to.type != D_NONE)
@ -283,6 +280,23 @@ subprop(Reg *r0)
case AMULL: case AMULL:
case AMULW: case AMULW:
case AREP:
case AREPN:
case ALOOP:
case ALOOPNE:
case ACWD:
case ACDQ:
case ASTOSB:
case ASTOSL:
case AMOVSB:
case AMOVSL:
case AFSTSW:
case ACALL:
return 0;
case AROLB: case AROLB:
case AROLL: case AROLL:
case AROLW: case AROLW:
@ -301,19 +315,9 @@ subprop(Reg *r0)
case ASHRB: case ASHRB:
case ASHRL: case ASHRL:
case ASHRW: case ASHRW:
if(p->from.type == D_CX && v1->type == D_CX)
case AREP:
case AREPN:
case ACWD:
case ACDQ:
case ASTOSB:
case ASTOSL:
case AMOVSB:
case AMOVSL:
case AFSTSW:
return 0; return 0;
break;
case AORL: case AORL:
case AANDL: case AANDL:

View file

@ -127,7 +127,6 @@ xcom(Node *n)
*l = *(n->left); *l = *(n->left);
l->xoffset += r->vconst; l->xoffset += r->vconst;
n->left = l; n->left = l;
r = n->right;
goto brk; goto brk;
} }
break; break;
@ -219,7 +218,6 @@ xcom(Node *n)
if(g >= 0) { if(g >= 0) {
n->left = r; n->left = r;
n->right = l; n->right = l;
l = r;
r = n->right; r = n->right;
} }
g = vlog(r); g = vlog(r);
@ -230,7 +228,7 @@ xcom(Node *n)
indexshift(n); indexshift(n);
break; break;
} }
commute(n); commute(n);
break; break;
case OASLDIV: case OASLDIV:
@ -295,6 +293,13 @@ commute(n);
indexshift(n); indexshift(n);
break; break;
case OOR:
xcom(l);
xcom(r);
if(typechl[n->type->etype])
rolor(n);
break;
default: default:
if(l != Z) if(l != Z)
xcom(l); xcom(l);
@ -305,6 +310,8 @@ commute(n);
brk: brk:
if(n->addable >= 10) if(n->addable >= 10)
return; return;
l = n->left;
r = n->right;
if(l != Z) if(l != Z)
n->complex = l->complex; n->complex = l->complex;
if(r != Z) { if(r != Z) {
@ -349,6 +356,7 @@ brk:
} }
break; break;
case OROL:
case OLSHR: case OLSHR:
case OASHL: case OASHL:
case OASHR: case OASHR:

View file

@ -1238,6 +1238,14 @@ gopcode(int o, Type *ty, Node *f, Node *t)
a = ASALW; a = ASALW;
break; break;
case OROL:
a = AROLL;
if(et == TCHAR || et == TUCHAR)
a = AROLB;
if(et == TSHORT || et == TUSHORT)
a = AROLW;
break;
case OFUNC: case OFUNC:
a = ACALL; a = ACALL;
break; break;

View file

@ -273,6 +273,7 @@ enum
OPROTO, OPROTO,
OREGISTER, OREGISTER,
ORETURN, ORETURN,
OROL,
OSET, OSET,
OSIGN, OSIGN,
OSIZE, OSIZE,
@ -694,6 +695,7 @@ int log2(uvlong);
int vlog(Node*); int vlog(Node*);
int topbit(ulong); int topbit(ulong);
void simplifyshift(Node*); void simplifyshift(Node*);
void rolor(Node*);
long typebitor(long, long); long typebitor(long, long);
void diag(Node*, char*, ...); void diag(Node*, char*, ...);
void warn(Node*, char*, ...); void warn(Node*, char*, ...);

View file

@ -903,6 +903,46 @@ if(debug['<'])prtree(n, "rewrite2");
n->left->op = o; n->left->op = o;
} }
/*
 * rewrite an or of two opposing constant shifts of the same
 * variable, (x << c) | (x >> d) in either operand order,
 * into a single rotate left (OROL) node.
 *
 * the node is left untouched unless all of these hold:
 *	- typeu[] is set for the type of the or node
 *	- one operand is an OASHL and the other an OLSHR
 *	- both shift counts are OCONST
 *	- the two counts add up to the width of the type in bits
 *	- both shifted operands have the very same type as the or node
 *	- both shifted operands are ONAME nodes with the same sym
 */
void
rolor(Node *n)
{
	Node *shl, *shr;

	if(!typeu[n->type->etype])
		return;

	/* find the left shift and the right shift, whichever side they are on */
	if(n->left->op == OASHL && n->right->op == OLSHR){
		shl = n->left;
		shr = n->right;
	}else if(n->left->op == OLSHR && n->right->op == OASHL){
		shl = n->right;
		shr = n->left;
	}else
		return;

	/* only constant shift counts qualify */
	if(shl->right->op != OCONST)
		return;
	if(shr->right->op != OCONST)
		return;

	/* the counts have to cover the full width of the type */
	if(vconst(shl->right) + vconst(shr->right) != ewidth[n->type->etype]*8)
		return;

	/* both shifts must operate on the type of the result */
	if(shl->left->type != n->type)
		return;
	if(shr->left->type != n->type)
		return;

	/* both shifts must operate on the same named symbol */
	if(shl->left->op != ONAME)
		return;
	if(shr->left->op != ONAME)
		return;
	if(shl->left->sym != shr->left->sym)
		return;

	/*
	 * overwrite the or node with a copy of the left shift node,
	 * keeping its operand and count, and turn it into a rotate.
	 */
	*n = *shl;
	n->op = OROL;
}
int int
side(Node *n) side(Node *n)
{ {
@ -1473,6 +1513,7 @@ Init onamesinit[] =
OPROTO, 0, "PROTO", OPROTO, 0, "PROTO",
OREGISTER, 0, "REGISTER", OREGISTER, 0, "REGISTER",
ORETURN, 0, "RETURN", ORETURN, 0, "RETURN",
OROL, 0, "ROL",
OSET, 0, "SET", OSET, 0, "SET",
OSIGN, 0, "SIGN", OSIGN, 0, "SIGN",
OSIZE, 0, "SIZE", OSIZE, 0, "SIZE",