make all the commands agnostic about Rune width. (from sources)
This commit is contained in:
parent
78c7ba36a1
commit
667010554b
|
@ -46,6 +46,7 @@ enum
|
||||||
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
|
||||||
Runeerror = 0xFFFD, /* decoding error in UTF */
|
Runeerror = 0xFFFD, /* decoding error in UTF */
|
||||||
Runemax = 0xFFFF, /* 16 bit rune */
|
Runemax = 0xFFFF, /* 16 bit rune */
|
||||||
|
Runemask = 0xFFFF, /* bits used by runes (see grep) */
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -20,7 +20,7 @@ Rune *lastregexp;
|
||||||
typedef struct Inst Inst;
|
typedef struct Inst Inst;
|
||||||
struct Inst
|
struct Inst
|
||||||
{
|
{
|
||||||
uint type; /* < 0x10000 ==> literal, otherwise action */
|
uint type; /* <= Runemax ==> literal, otherwise action */
|
||||||
union {
|
union {
|
||||||
int sid;
|
int sid;
|
||||||
int subid;
|
int subid;
|
||||||
|
@ -61,25 +61,28 @@ static Rangeset sempty;
|
||||||
* 0x100xx are operators, value == precedence
|
* 0x100xx are operators, value == precedence
|
||||||
* 0x200xx are tokens, i.e. operands for operators
|
* 0x200xx are tokens, i.e. operands for operators
|
||||||
*/
|
*/
|
||||||
#define OPERATOR 0x10000 /* Bitmask of all operators */
|
enum {
|
||||||
#define START 0x10000 /* Start, used for marker on stack */
|
OPERATOR = Runemask+1, /* Bitmask of all operators */
|
||||||
#define RBRA 0x10001 /* Right bracket, ) */
|
START = OPERATOR, /* Start, used for marker on stack */
|
||||||
#define LBRA 0x10002 /* Left bracket, ( */
|
RBRA, /* Right bracket, ) */
|
||||||
#define OR 0x10003 /* Alternation, | */
|
LBRA, /* Left bracket, ( */
|
||||||
#define CAT 0x10004 /* Concatentation, implicit operator */
|
OR, /* Alternation, | */
|
||||||
#define STAR 0x10005 /* Closure, * */
|
CAT, /* Concatenation, implicit operator */
|
||||||
#define PLUS 0x10006 /* a+ == aa* */
|
STAR, /* Closure, * */
|
||||||
#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
|
PLUS, /* a+ == aa* */
|
||||||
#define ANY 0x20000 /* Any character but newline, . */
|
QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
|
||||||
#define NOP 0x20001 /* No operation, internal use only */
|
|
||||||
#define BOL 0x20002 /* Beginning of line, ^ */
|
|
||||||
#define EOL 0x20003 /* End of line, $ */
|
|
||||||
#define CCLASS 0x20004 /* Character class, [] */
|
|
||||||
#define NCCLASS 0x20005 /* Negated character class, [^] */
|
|
||||||
#define END 0x20077 /* Terminate: match found */
|
|
||||||
|
|
||||||
#define ISATOR 0x10000
|
ANY = OPERATOR<<1, /* Any character but newline, . */
|
||||||
#define ISAND 0x20000
|
NOP, /* No operation, internal use only */
|
||||||
|
BOL, /* Beginning of line, ^ */
|
||||||
|
EOL, /* End of line, $ */
|
||||||
|
CCLASS, /* Character class, [] */
|
||||||
|
NCCLASS, /* Negated character class, [^] */
|
||||||
|
END, /* Terminate: match found */
|
||||||
|
|
||||||
|
ISATOR = OPERATOR,
|
||||||
|
ISAND = OPERATOR<<1,
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Parser Information
|
* Parser Information
|
||||||
|
@ -452,7 +455,7 @@ nextrec(void)
|
||||||
exprp++;
|
exprp++;
|
||||||
return '\n';
|
return '\n';
|
||||||
}
|
}
|
||||||
return *exprp++|0x10000;
|
return *exprp++|(Runemax+1);
|
||||||
}
|
}
|
||||||
return *exprp++;
|
return *exprp++;
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,7 @@ enum
|
||||||
ESIZE = 256, /* max size of reg exp */
|
ESIZE = 256, /* max size of reg exp */
|
||||||
GBSIZE = 256, /* max size of global command */
|
GBSIZE = 256, /* max size of global command */
|
||||||
MAXSUB = 9, /* max number of sub reg exp */
|
MAXSUB = 9, /* max number of sub reg exp */
|
||||||
ESCFLG = 0xFFFF, /* escape Rune - user defined code */
|
ESCFLG = Runemax, /* escape Rune - user defined code */
|
||||||
EOF = -1,
|
EOF = -1,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -737,7 +737,7 @@ gety(void)
|
||||||
if(c == 0)
|
if(c == 0)
|
||||||
continue;
|
continue;
|
||||||
*p++ = c;
|
*p++ = c;
|
||||||
if(p >= &linebuf[LBSIZE-2])
|
if(p >= &linebuf[LBSIZE-sizeof(Rune)])
|
||||||
error(Q);
|
error(Q);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1162,7 +1162,7 @@ join(void)
|
||||||
for(a1=addr1; a1<=addr2; a1++) {
|
for(a1=addr1; a1<=addr2; a1++) {
|
||||||
lp = getline(*a1);
|
lp = getline(*a1);
|
||||||
while(*gp = *lp++)
|
while(*gp = *lp++)
|
||||||
if(gp++ >= &genbuf[LBSIZE-2])
|
if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)])
|
||||||
error(Q);
|
error(Q);
|
||||||
}
|
}
|
||||||
lp = linebuf;
|
lp = linebuf;
|
||||||
|
|
|
@ -273,60 +273,6 @@ type(char *file, int nlen)
|
||||||
close(fd);
|
close(fd);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Unicode 4.0 4-byte runes.
|
|
||||||
*/
|
|
||||||
typedef int Rune1;
|
|
||||||
|
|
||||||
enum {
|
|
||||||
UTFmax1 = 4,
|
|
||||||
};
|
|
||||||
|
|
||||||
int
|
|
||||||
fullrune1(char *p, int n)
|
|
||||||
{
|
|
||||||
int c;
|
|
||||||
|
|
||||||
if(n >= 1) {
|
|
||||||
c = *(uchar*)p;
|
|
||||||
if(c < 0x80)
|
|
||||||
return 1;
|
|
||||||
if(n >= 2 && c < 0xE0)
|
|
||||||
return 1;
|
|
||||||
if(n >= 3 && c < 0xF0)
|
|
||||||
return 1;
|
|
||||||
if(n >= 4)
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
chartorune1(Rune1 *rune, char *str)
|
|
||||||
{
|
|
||||||
int c, c1, c2, c3, n;
|
|
||||||
Rune r;
|
|
||||||
|
|
||||||
c = *(uchar*)str;
|
|
||||||
if(c < 0xF0){
|
|
||||||
r = 0;
|
|
||||||
n = chartorune(&r, str);
|
|
||||||
*rune = r;
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
c &= ~0xF0;
|
|
||||||
c1 = *(uchar*)(str+1) & ~0x80;
|
|
||||||
c2 = *(uchar*)(str+2) & ~0x80;
|
|
||||||
c3 = *(uchar*)(str+3) & ~0x80;
|
|
||||||
n = (c<<18) | (c1<<12) | (c2<<6) | c3;
|
|
||||||
if(n < 0x10000 || n > 0x10FFFF){
|
|
||||||
*rune = Runeerror;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
*rune = n;
|
|
||||||
return 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
utfconv(void)
|
utfconv(void)
|
||||||
{
|
{
|
||||||
|
@ -392,7 +338,7 @@ utfconv(void)
|
||||||
void
|
void
|
||||||
filetype(int fd)
|
filetype(int fd)
|
||||||
{
|
{
|
||||||
Rune1 r;
|
Rune r;
|
||||||
int i, f, n;
|
int i, f, n;
|
||||||
char *p, *eob;
|
char *p, *eob;
|
||||||
|
|
||||||
|
@ -435,9 +381,9 @@ filetype(int fd)
|
||||||
language[i].count = 0;
|
language[i].count = 0;
|
||||||
eob = (char *)buf+nbuf;
|
eob = (char *)buf+nbuf;
|
||||||
for(n = 0, p = (char *)buf; p < eob; n++) {
|
for(n = 0, p = (char *)buf; p < eob; n++) {
|
||||||
if (!fullrune1(p, eob-p) && eob-p < UTFmax1)
|
if (!fullrune(p, eob-p) && eob-p < UTFmax)
|
||||||
break;
|
break;
|
||||||
p += chartorune1(&r, p);
|
p += chartorune(&r, p);
|
||||||
if (r == 0)
|
if (r == 0)
|
||||||
f = Cnull;
|
f = Cnull;
|
||||||
else if (r <= 0x7f) {
|
else if (r <= 0x7f) {
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
#include <libc.h>
|
#include <libc.h>
|
||||||
#include <bio.h>
|
#include <bio.h>
|
||||||
|
|
||||||
uvlong count[1<<16];
|
uvlong count[Runemax+1];
|
||||||
Biobuf bout;
|
Biobuf bout;
|
||||||
|
|
||||||
void usage(void);
|
void usage(void);
|
||||||
|
|
|
@ -275,7 +275,7 @@ re2class(char *s)
|
||||||
x = re2or(x, rclass(ov, p[0]-1));
|
x = re2or(x, rclass(ov, p[0]-1));
|
||||||
ov = p[1]+1;
|
ov = p[1]+1;
|
||||||
}
|
}
|
||||||
x = re2or(x, rclass(ov, 0xffff));
|
x = re2or(x, rclass(ov, Runemask));
|
||||||
} else {
|
} else {
|
||||||
x = rclass(p[0], p[1]);
|
x = rclass(p[0], p[1]);
|
||||||
for(p+=2; *p; p+=2)
|
for(p+=2; *p; p+=2)
|
||||||
|
|
|
@ -53,7 +53,7 @@ enum
|
||||||
|
|
||||||
Caselim = 7,
|
Caselim = 7,
|
||||||
Nhunk = 1<<16,
|
Nhunk = 1<<16,
|
||||||
Cbegin = 0x10000,
|
Cbegin = Runemax+1,
|
||||||
Flshcnt = (1<<9)-1,
|
Flshcnt = (1<<9)-1,
|
||||||
|
|
||||||
Cflag = 1<<0,
|
Cflag = 1<<0,
|
||||||
|
|
|
@ -16,6 +16,12 @@ rune2html(Rune r)
|
||||||
if(r == '\n')
|
if(r == '\n')
|
||||||
return L("\n");
|
return L("\n");
|
||||||
|
|
||||||
|
if(((uint)r&~0xFFFF) != 0){
|
||||||
|
/* Runes above 0xFFFF are not covered: the cache must grow a lot to handle them */
|
||||||
|
fprint(2, "%s: can't handle rune '%C'\n", argv0, r);
|
||||||
|
return L("?");
|
||||||
|
}
|
||||||
|
|
||||||
if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
|
if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
|
||||||
return tcscache[r>>8][r&0xFF];
|
return tcscache[r>>8][r&0xFF];
|
||||||
|
|
||||||
|
@ -59,7 +65,7 @@ rune2html(Rune r)
|
||||||
typedef struct Trtab Trtab;
|
typedef struct Trtab Trtab;
|
||||||
struct Trtab
|
struct Trtab
|
||||||
{
|
{
|
||||||
char t[3];
|
char t[UTFmax];
|
||||||
Rune r;
|
Rune r;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -118,18 +118,16 @@ glob(void *ap)
|
||||||
int
|
int
|
||||||
equtf(uchar *p, uchar *q)
|
equtf(uchar *p, uchar *q)
|
||||||
{
|
{
|
||||||
|
Rune pr, qr;
|
||||||
|
|
||||||
if(*p!=*q)
|
if(*p!=*q)
|
||||||
return 0;
|
return 0;
|
||||||
if(twobyte(*p)) return p[1]==q[1];
|
|
||||||
if(threebyte(*p)){
|
chartorune(&pr, (char*)p);
|
||||||
if(p[1]!=q[1])
|
chartorune(&qr, (char*)q);
|
||||||
return 0;
|
return pr == qr;
|
||||||
if(p[1]=='\0')
|
|
||||||
return 1; /* broken code at end of string! */
|
|
||||||
return p[2]==q[2];
|
|
||||||
}
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Return a pointer to the next utf code in the string,
|
* Return a pointer to the next utf code in the string,
|
||||||
* not jumping past nuls in broken utf codes!
|
* not jumping past nuls in broken utf codes!
|
||||||
|
@ -138,10 +136,11 @@ equtf(uchar *p, uchar *q)
|
||||||
uchar*
|
uchar*
|
||||||
nextutf(uchar *p)
|
nextutf(uchar *p)
|
||||||
{
|
{
|
||||||
if(twobyte(*p)) return p[1]=='\0'?p+1:p+2;
|
Rune dummy;
|
||||||
if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3;
|
|
||||||
return p+1;
|
return p + chartorune(&dummy, (char*)p);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Convert the utf code at *p to a unicode value
|
* Convert the utf code at *p to a unicode value
|
||||||
*/
|
*/
|
||||||
|
@ -149,14 +148,12 @@ nextutf(uchar *p)
|
||||||
int
|
int
|
||||||
unicode(uchar *p)
|
unicode(uchar *p)
|
||||||
{
|
{
|
||||||
int u = *p;
|
Rune r;
|
||||||
|
|
||||||
if(twobyte(u))
|
chartorune(&r, (char*)p);
|
||||||
return ((u&0x1f)<<6)|(p[1]&0x3f);
|
return r;
|
||||||
if(threebyte(u))
|
|
||||||
return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f);
|
|
||||||
return u;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Does the string s match the pattern p
|
* Does the string s match the pattern p
|
||||||
* . and .. are only matched by patterns starting with .
|
* . and .. are only matched by patterns starting with .
|
||||||
|
|
|
@ -166,15 +166,25 @@ addtok(char *p, int val)
|
||||||
char*
|
char*
|
||||||
addutf(char *p, int c)
|
addutf(char *p, int c)
|
||||||
{
|
{
|
||||||
p = addtok(p, c);
|
uchar b, m;
|
||||||
if(twobyte(c)) /* 2-byte escape */
|
int i;
|
||||||
return addtok(p, advance());
|
|
||||||
if(threebyte(c)){ /* 3-byte escape */
|
p = addtok(p, c); /* 1-byte UTF runes are special */
|
||||||
|
if(onebyte(c))
|
||||||
|
return p;
|
||||||
|
|
||||||
|
m = 0xc0;
|
||||||
|
b = 0x80;
|
||||||
|
for(i=1; i < UTFmax; i++){
|
||||||
|
if((c&m) == b)
|
||||||
|
break;
|
||||||
p = addtok(p, advance());
|
p = addtok(p, advance());
|
||||||
return addtok(p, advance());
|
b = m;
|
||||||
|
m = (m >> 1)|0x80;
|
||||||
}
|
}
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
int lastdol; /* was the last token read '$' or '$#' or '"'? */
|
int lastdol; /* was the last token read '$' or '$#' or '"'? */
|
||||||
int lastword; /* was the last token read a word or compound word terminator? */
|
int lastword; /* was the last token read a word or compound word terminator? */
|
||||||
|
|
||||||
|
|
|
@ -123,12 +123,10 @@ int mypid;
|
||||||
*/
|
*/
|
||||||
#define GLOB ((char)0x01)
|
#define GLOB ((char)0x01)
|
||||||
/*
|
/*
|
||||||
* onebyte(c), twobyte(c), threebyte(c)
|
* onebyte(c)
|
||||||
* Is c the first character of a one- two- or three-byte utf sequence?
|
* Is c the first character of a one-byte utf sequence?
|
||||||
*/
|
*/
|
||||||
#define onebyte(c) ((c&0x80)==0x00)
|
#define onebyte(c) ((c&0x80)==0x00)
|
||||||
#define twobyte(c) ((c&0xe0)==0xc0)
|
|
||||||
#define threebyte(c) ((c&0xf0)==0xe0)
|
|
||||||
|
|
||||||
char **argp;
|
char **argp;
|
||||||
char **args;
|
char **args;
|
||||||
|
|
|
@ -9,7 +9,7 @@ typedef struct Inst Inst;
|
||||||
|
|
||||||
struct Inst
|
struct Inst
|
||||||
{
|
{
|
||||||
long type; /* < 0x10000 ==> literal, otherwise action */
|
long type; /* <= Runemax ==> literal, otherwise action */
|
||||||
union {
|
union {
|
||||||
int rsid;
|
int rsid;
|
||||||
int rsubid;
|
int rsubid;
|
||||||
|
@ -56,25 +56,28 @@ static Rangeset sempty;
|
||||||
* 0x100xx are operators, value == precedence
|
* 0x100xx are operators, value == precedence
|
||||||
* 0x200xx are tokens, i.e. operands for operators
|
* 0x200xx are tokens, i.e. operands for operators
|
||||||
*/
|
*/
|
||||||
#define OPERATOR 0x10000 /* Bitmask of all operators */
|
enum {
|
||||||
#define START 0x10000 /* Start, used for marker on stack */
|
OPERATOR = Runemask+1, /* Bitmask of all operators */
|
||||||
#define RBRA 0x10001 /* Right bracket, ) */
|
START = OPERATOR, /* Start, used for marker on stack */
|
||||||
#define LBRA 0x10002 /* Left bracket, ( */
|
RBRA, /* Right bracket, ) */
|
||||||
#define OR 0x10003 /* Alternation, | */
|
LBRA, /* Left bracket, ( */
|
||||||
#define CAT 0x10004 /* Concatentation, implicit operator */
|
OR, /* Alternation, | */
|
||||||
#define STAR 0x10005 /* Closure, * */
|
CAT, /* Concatenation, implicit operator */
|
||||||
#define PLUS 0x10006 /* a+ == aa* */
|
STAR, /* Closure, * */
|
||||||
#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
|
PLUS, /* a+ == aa* */
|
||||||
#define ANY 0x20000 /* Any character but newline, . */
|
QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
|
||||||
#define NOP 0x20001 /* No operation, internal use only */
|
|
||||||
#define BOL 0x20002 /* Beginning of line, ^ */
|
|
||||||
#define EOL 0x20003 /* End of line, $ */
|
|
||||||
#define CCLASS 0x20004 /* Character class, [] */
|
|
||||||
#define NCCLASS 0x20005 /* Negated character class, [^] */
|
|
||||||
#define END 0x20077 /* Terminate: match found */
|
|
||||||
|
|
||||||
#define ISATOR 0x10000
|
ANY = OPERATOR<<1, /* Any character but newline, . */
|
||||||
#define ISAND 0x20000
|
NOP, /* No operation, internal use only */
|
||||||
|
BOL, /* Beginning of line, ^ */
|
||||||
|
EOL, /* End of line, $ */
|
||||||
|
CCLASS, /* Character class, [] */
|
||||||
|
NCCLASS, /* Negated character class, [^] */
|
||||||
|
END, /* Terminate: match found */
|
||||||
|
|
||||||
|
ISATOR = OPERATOR,
|
||||||
|
ISAND = OPERATOR<<1,
|
||||||
|
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Parser Information
|
* Parser Information
|
||||||
|
@ -459,7 +462,7 @@ nextrec(void){
|
||||||
exprp++;
|
exprp++;
|
||||||
return '\n';
|
return '\n';
|
||||||
}
|
}
|
||||||
return *exprp++|0x10000;
|
return *exprp++|(Runemax+1);
|
||||||
}
|
}
|
||||||
return *exprp++;
|
return *exprp++;
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,10 +15,8 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||||
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
|
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
|
||||||
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
|
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
|
||||||
|
|
||||||
#define MAXRUNE Runemax
|
uchar f[(Runemax+1)/8];
|
||||||
|
uchar t[(Runemax+1)/8];
|
||||||
uchar f[(MAXRUNE+1)/8];
|
|
||||||
uchar t[(MAXRUNE+1)/8];
|
|
||||||
char wbuf[4096];
|
char wbuf[4096];
|
||||||
char *wptr;
|
char *wptr;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue