make all the commands agnostic about Rune width. (from sources)

This commit is contained in:
cinap_lenrek 2013-04-24 20:13:18 +02:00
parent 78c7ba36a1
commit 667010554b
13 changed files with 99 additions and 137 deletions

View file

@ -46,6 +46,7 @@ enum
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0xFFFF, /* 16 bit rune */ Runemax = 0xFFFF, /* 16 bit rune */
Runemask = 0xFFFF, /* bits used by runes (see grep) */
}; };
/* /*

View file

@ -20,7 +20,7 @@ Rune *lastregexp;
typedef struct Inst Inst; typedef struct Inst Inst;
struct Inst struct Inst
{ {
uint type; /* < 0x10000 ==> literal, otherwise action */ uint type; /* <= Runemax+1 ==> literal, otherwise action */
union { union {
int sid; int sid;
int subid; int subid;
@ -61,25 +61,28 @@ static Rangeset sempty;
* 0x100xx are operators, value == precedence * 0x100xx are operators, value == precedence
* 0x200xx are tokens, i.e. operands for operators * 0x200xx are tokens, i.e. operands for operators
*/ */
#define OPERATOR 0x10000 /* Bitmask of all operators */ enum {
#define START 0x10000 /* Start, used for marker on stack */ OPERATOR = Runemask+1, /* Bitmask of all operators */
#define RBRA 0x10001 /* Right bracket, ) */ START = OPERATOR, /* Start, used for marker on stack */
#define LBRA 0x10002 /* Left bracket, ( */ RBRA, /* Right bracket, ) */
#define OR 0x10003 /* Alternation, | */ LBRA, /* Left bracket, ( */
#define CAT 0x10004 /* Concatenation, implicit operator */ CAT, /* Concatenation, implicit operator */
#define STAR 0x10005 /* Closure, * */ CAT, /* Concatentation, implicit operator */
#define PLUS 0x10006 /* a+ == aa* */ STAR, /* Closure, * */
#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ PLUS, /* a+ == aa* */
#define ANY 0x20000 /* Any character but newline, . */ QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
#define NOP 0x20001 /* No operation, internal use only */
#define BOL 0x20002 /* Beginning of line, ^ */
#define EOL 0x20003 /* End of line, $ */
#define CCLASS 0x20004 /* Character class, [] */
#define NCCLASS 0x20005 /* Negated character class, [^] */
#define END 0x20077 /* Terminate: match found */
#define ISATOR 0x10000 ANY = OPERATOR<<1, /* Any character but newline, . */
#define ISAND 0x20000 NOP, /* No operation, internal use only */
BOL, /* Beginning of line, ^ */
EOL, /* End of line, $ */
CCLASS, /* Character class, [] */
NCCLASS, /* Negated character class, [^] */
END, /* Terminate: match found */
ISATOR = OPERATOR,
ISAND = OPERATOR<<1,
};
/* /*
* Parser Information * Parser Information
@ -452,7 +455,7 @@ nextrec(void)
exprp++; exprp++;
return '\n'; return '\n';
} }
return *exprp++|0x10000; return *exprp++|(Runemax+1);
} }
return *exprp++; return *exprp++;
} }

View file

@ -15,7 +15,7 @@ enum
ESIZE = 256, /* max size of reg exp */ ESIZE = 256, /* max size of reg exp */
GBSIZE = 256, /* max size of global command */ GBSIZE = 256, /* max size of global command */
MAXSUB = 9, /* max number of sub reg exp */ MAXSUB = 9, /* max number of sub reg exp */
ESCFLG = 0xFFFF, /* escape Rune - user defined code */ ESCFLG = Runemax, /* escape Rune - user defined code */
EOF = -1, EOF = -1,
}; };
@ -737,7 +737,7 @@ gety(void)
if(c == 0) if(c == 0)
continue; continue;
*p++ = c; *p++ = c;
if(p >= &linebuf[LBSIZE-2]) if(p >= &linebuf[LBSIZE-sizeof(Rune)])
error(Q); error(Q);
} }
} }
@ -1162,7 +1162,7 @@ join(void)
for(a1=addr1; a1<=addr2; a1++) { for(a1=addr1; a1<=addr2; a1++) {
lp = getline(*a1); lp = getline(*a1);
while(*gp = *lp++) while(*gp = *lp++)
if(gp++ >= &genbuf[LBSIZE-2]) if(gp++ >= &genbuf[LBSIZE-sizeof(Rune)])
error(Q); error(Q);
} }
lp = linebuf; lp = linebuf;

View file

@ -273,60 +273,6 @@ type(char *file, int nlen)
close(fd); close(fd);
} }
/*
 * Unicode 4.0 4-byte runes.
 *
 * Rune1 is a plain int, wide enough for the code points up to
 * 0x10FFFF that chartorune1() produces from four-byte sequences.
 * UTFmax1 is the length in bytes of the longest UTF sequence
 * handled by fullrune1() and chartorune1().
 */
typedef int Rune1;
enum {
UTFmax1 = 4,
};
/*
 * fullrune1 reports whether the n bytes starting at p are enough to
 * hold one complete UTF sequence of up to four bytes.  Only the lead
 * byte is examined to decide how long the sequence is; the trailing
 * bytes are not inspected.  Returns 1 when enough bytes are present
 * and 0 otherwise (including when n < 1).
 */
int
fullrune1(char *p, int n)
{
	int lead, need;

	if(n < 1)
		return 0;

	/* sequence length implied by the lead byte */
	lead = *(uchar*)p;
	if(lead < 0x80)
		need = 1;
	else if(lead < 0xE0)
		need = 2;
	else if(lead < 0xF0)
		need = 3;
	else
		need = 4;

	return n >= need;
}
/*
 * chartorune1 decodes the UTF sequence at str into *rune and returns
 * the number of bytes it consumed.
 *
 * Lead bytes below 0xF0 are handed to chartorune() unchanged.
 * Otherwise a four-byte sequence is decoded here.  It is accepted only
 * if all three trailing bytes are continuation bytes (10xxxxxx) and
 * the resulting value lies in 0x10000..0x10FFFF; in every other case
 * *rune is set to Runeerror and 1 is returned so the caller can
 * resynchronise on the next byte.
 */
int
chartorune1(Rune1 *rune, char *str)
{
	int c, c1, c2, c3, n;
	Rune r;

	c = *(uchar*)str;
	if(c < 0xF0){
		r = 0;
		n = chartorune(&r, str);
		*rune = r;
		return n;
	}

	/*
	 * Check each trailing byte before looking at the next one, so a
	 * truncated sequence never reads beyond the byte that broke it
	 * (for example a terminating NUL).
	 */
	c1 = *(uchar*)(str+1);
	if((c1 & 0xC0) != 0x80)
		goto bad;
	c2 = *(uchar*)(str+2);
	if((c2 & 0xC0) != 0x80)
		goto bad;
	c3 = *(uchar*)(str+3);
	if((c3 & 0xC0) != 0x80)
		goto bad;

	n = ((c & 0x0F)<<18) | ((c1 & 0x3F)<<12) | ((c2 & 0x3F)<<6) | (c3 & 0x3F);

	/* rejects overlong forms, values past U+10FFFF and lead bytes >= 0xF8 */
	if(n < 0x10000 || n > 0x10FFFF)
		goto bad;

	*rune = n;
	return 4;

bad:
	*rune = Runeerror;
	return 1;
}
void void
utfconv(void) utfconv(void)
{ {
@ -392,7 +338,7 @@ utfconv(void)
void void
filetype(int fd) filetype(int fd)
{ {
Rune1 r; Rune r;
int i, f, n; int i, f, n;
char *p, *eob; char *p, *eob;
@ -435,9 +381,9 @@ filetype(int fd)
language[i].count = 0; language[i].count = 0;
eob = (char *)buf+nbuf; eob = (char *)buf+nbuf;
for(n = 0, p = (char *)buf; p < eob; n++) { for(n = 0, p = (char *)buf; p < eob; n++) {
if (!fullrune1(p, eob-p) && eob-p < UTFmax1) if (!fullrune(p, eob-p) && eob-p < UTFmax)
break; break;
p += chartorune1(&r, p); p += chartorune(&r, p);
if (r == 0) if (r == 0)
f = Cnull; f = Cnull;
else if (r <= 0x7f) { else if (r <= 0x7f) {

View file

@ -2,7 +2,7 @@
#include <libc.h> #include <libc.h>
#include <bio.h> #include <bio.h>
uvlong count[1<<16]; uvlong count[Runemax+1];
Biobuf bout; Biobuf bout;
void usage(void); void usage(void);

View file

@ -275,7 +275,7 @@ re2class(char *s)
x = re2or(x, rclass(ov, p[0]-1)); x = re2or(x, rclass(ov, p[0]-1));
ov = p[1]+1; ov = p[1]+1;
} }
x = re2or(x, rclass(ov, 0xffff)); x = re2or(x, rclass(ov, Runemask));
} else { } else {
x = rclass(p[0], p[1]); x = rclass(p[0], p[1]);
for(p+=2; *p; p+=2) for(p+=2; *p; p+=2)

View file

@ -53,7 +53,7 @@ enum
Caselim = 7, Caselim = 7,
Nhunk = 1<<16, Nhunk = 1<<16,
Cbegin = 0x10000, Cbegin = Runemax+1,
Flshcnt = (1<<9)-1, Flshcnt = (1<<9)-1,
Cflag = 1<<0, Cflag = 1<<0,

View file

@ -16,6 +16,12 @@ rune2html(Rune r)
if(r == '\n') if(r == '\n')
return L("\n"); return L("\n");
if(((uint)r&~0xFFFF) != 0){
/* The cache must grow a lot to handle them */
fprint(2, "%s: can't handle rune '%C'\n", argv0, r);
return L("?");
}
if(tcscache[r>>8] && tcscache[r>>8][r&0xFF]) if(tcscache[r>>8] && tcscache[r>>8][r&0xFF])
return tcscache[r>>8][r&0xFF]; return tcscache[r>>8][r&0xFF];
@ -59,7 +65,7 @@ rune2html(Rune r)
typedef struct Trtab Trtab; typedef struct Trtab Trtab;
struct Trtab struct Trtab
{ {
char t[3]; char t[UTFmax];
Rune r; Rune r;
}; };

View file

@ -118,18 +118,16 @@ glob(void *ap)
int int
equtf(uchar *p, uchar *q) equtf(uchar *p, uchar *q)
{ {
Rune pr, qr;
if(*p!=*q) if(*p!=*q)
return 0; return 0;
if(twobyte(*p)) return p[1]==q[1];
if(threebyte(*p)){ chartorune(&pr, (char*)p);
if(p[1]!=q[1]) chartorune(&qr, (char*)q);
return 0; return pr == qr;
if(p[1]=='\0')
return 1; /* broken code at end of string! */
return p[2]==q[2];
}
return 1;
} }
/* /*
* Return a pointer to the next utf code in the string, * Return a pointer to the next utf code in the string,
* not jumping past nuls in broken utf codes! * not jumping past nuls in broken utf codes!
@ -138,10 +136,11 @@ equtf(uchar *p, uchar *q)
uchar* uchar*
nextutf(uchar *p) nextutf(uchar *p)
{ {
if(twobyte(*p)) return p[1]=='\0'?p+1:p+2; Rune dummy;
if(threebyte(*p)) return p[1]=='\0'?p+1:p[2]=='\0'?p+2:p+3;
return p+1; return p + chartorune(&dummy, (char*)p);
} }
/* /*
* Convert the utf code at *p to a unicode value * Convert the utf code at *p to a unicode value
*/ */
@ -149,14 +148,12 @@ nextutf(uchar *p)
int int
unicode(uchar *p) unicode(uchar *p)
{ {
int u = *p; Rune r;
if(twobyte(u)) chartorune(&r, (char*)p);
return ((u&0x1f)<<6)|(p[1]&0x3f); return r;
if(threebyte(u))
return (u<<12)|((p[1]&0x3f)<<6)|(p[2]&0x3f);
return u;
} }
/* /*
* Does the string s match the pattern p * Does the string s match the pattern p
* . and .. are only matched by patterns starting with . * . and .. are only matched by patterns starting with .

View file

@ -166,15 +166,25 @@ addtok(char *p, int val)
char* char*
addutf(char *p, int c) addutf(char *p, int c)
{ {
p = addtok(p, c); uchar b, m;
if(twobyte(c)) /* 2-byte escape */ int i;
return addtok(p, advance());
if(threebyte(c)){ /* 3-byte escape */ p = addtok(p, c); /* 1-byte UTF runes are special */
if(onebyte(c))
return p;
m = 0xc0;
b = 0x80;
for(i=1; i < UTFmax; i++){
if((c&m) == b)
break;
p = addtok(p, advance()); p = addtok(p, advance());
return addtok(p, advance()); b = m;
m = (m >> 1)|0x80;
} }
return p; return p;
} }
int lastdol; /* was the last token read '$' or '$#' or '"'? */ int lastdol; /* was the last token read '$' or '$#' or '"'? */
int lastword; /* was the last token read a word or compound word terminator? */ int lastword; /* was the last token read a word or compound word terminator? */

View file

@ -123,12 +123,10 @@ int mypid;
*/ */
#define GLOB ((char)0x01) #define GLOB ((char)0x01)
/* /*
* onebyte(c), twobyte(c), threebyte(c) * onebyte(c)
* Is c the first character of a one- two- or three-byte utf sequence? * Is c the first character of a one-byte utf sequence?
*/ */
#define onebyte(c) ((c&0x80)==0x00) #define onebyte(c) ((c&0x80)==0x00)
#define twobyte(c) ((c&0xe0)==0xc0)
#define threebyte(c) ((c&0xf0)==0xe0)
char **argp; char **argp;
char **args; char **args;

View file

@ -9,7 +9,7 @@ typedef struct Inst Inst;
struct Inst struct Inst
{ {
long type; /* < 0x10000 ==> literal, otherwise action */ long type; /* <= Runemax ==> literal, otherwise action */
union { union {
int rsid; int rsid;
int rsubid; int rsubid;
@ -56,25 +56,28 @@ static Rangeset sempty;
* 0x100xx are operators, value == precedence * 0x100xx are operators, value == precedence
* 0x200xx are tokens, i.e. operands for operators * 0x200xx are tokens, i.e. operands for operators
*/ */
#define OPERATOR 0x10000 /* Bitmask of all operators */ enum {
#define START 0x10000 /* Start, used for marker on stack */ OPERATOR = Runemask+1, /* Bitmask of all operators */
#define RBRA 0x10001 /* Right bracket, ) */ START = OPERATOR, /* Start, used for marker on stack */
#define LBRA 0x10002 /* Left bracket, ( */ RBRA, /* Right bracket, ) */
#define OR 0x10003 /* Alternation, | */ LBRA, /* Left bracket, ( */
#define CAT 0x10004 /* Concatenation, implicit operator */ CAT, /* Concatenation, implicit operator */
#define STAR 0x10005 /* Closure, * */ CAT, /* Concatentation, implicit operator */
#define PLUS 0x10006 /* a+ == aa* */ STAR, /* Closure, * */
#define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ PLUS, /* a+ == aa* */
#define ANY 0x20000 /* Any character but newline, . */ QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
#define NOP 0x20001 /* No operation, internal use only */
#define BOL 0x20002 /* Beginning of line, ^ */
#define EOL 0x20003 /* End of line, $ */
#define CCLASS 0x20004 /* Character class, [] */
#define NCCLASS 0x20005 /* Negated character class, [^] */
#define END 0x20077 /* Terminate: match found */
#define ISATOR 0x10000 ANY = OPERATOR<<1, /* Any character but newline, . */
#define ISAND 0x20000 NOP, /* No operation, internal use only */
BOL, /* Beginning of line, ^ */
EOL, /* End of line, $ */
CCLASS, /* Character class, [] */
NCCLASS, /* Negated character class, [^] */
END, /* Terminate: match found */
ISATOR = OPERATOR,
ISAND = OPERATOR<<1,
};
/* /*
* Parser Information * Parser Information
@ -459,7 +462,7 @@ nextrec(void){
exprp++; exprp++;
return '\n'; return '\n';
} }
return *exprp++|0x10000; return *exprp++|(Runemax+1);
} }
return *exprp++; return *exprp++;
} }

View file

@ -15,10 +15,8 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
#define MAXRUNE Runemax uchar f[(Runemax+1)/8];
uchar t[(Runemax+1)/8];
uchar f[(MAXRUNE+1)/8];
uchar t[(MAXRUNE+1)/8];
char wbuf[4096]; char wbuf[4096];
char *wptr; char *wptr;