fix utf and rune handling in preparation for 32bit runes

This commit is contained in:
cinap_lenrek 2012-12-31 21:09:46 +01:00
parent 6d99096136
commit 6cadd03bbe
53 changed files with 582 additions and 283 deletions

View file

@ -14,7 +14,8 @@ enum
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0x80, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0xFFFF, /* 16 bit rune */
}; };
/* /*

View file

@ -45,6 +45,7 @@ enum
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0xFFFF, /* 16 bit rune */
}; };
/* /*

View file

@ -99,7 +99,9 @@ cgascreenputc(Rune c)
int i; int i;
uchar *p; uchar *p;
if(c == '\n'){ if(c == '\0')
return;
else if(c == '\n'){
cgapos = cgapos/Width; cgapos = cgapos/Width;
cgapos = (cgapos+1)*Width; cgapos = (cgapos+1)*Width;
} }
@ -138,8 +140,10 @@ cgascreenputc(Rune c)
static void static void
cgascreenputs(char* s, int n) cgascreenputs(char* s, int n)
{ {
static char rb[UTFmax];
static int nrb;
char *e;
Rune r; Rune r;
int i;
if(!islo()){ if(!islo()){
/* /*
@ -152,11 +156,14 @@ cgascreenputs(char* s, int n)
else else
lock(&cgascreenlock); lock(&cgascreenlock);
while(n > 0){ e = s + n;
i = chartorune(&r, s); while(s < e){
cgascreenputc(r); rb[nrb++] = *s++;
s += i; if(nrb >= UTFmax || fullrune(rb, nrb)){
n -= i; chartorune(&r, rb);
cgascreenputc(r);
nrb = 0;
}
} }
unlock(&cgascreenlock); unlock(&cgascreenlock);

View file

@ -119,9 +119,10 @@ vgascreenputc(VGAscr* scr, char* buf, Rectangle *flushr)
static void static void
vgascreenputs(char* s, int n) vgascreenputs(char* s, int n)
{ {
int i, gotdraw; static char rb[UTFmax+1];
Rune r; static int nrb;
char buf[4]; char *e;
int gotdraw;
VGAscr *scr; VGAscr *scr;
Rectangle flushr; Rectangle flushr;
@ -146,13 +147,14 @@ vgascreenputs(char* s, int n)
flushr = Rect(10000, 10000, -10000, -10000); flushr = Rect(10000, 10000, -10000, -10000);
while(n > 0){ e = s + n;
i = chartorune(&r, s); while(s < e){
memmove(buf, s, i); rb[nrb++] = *s++;
buf[i] = 0; if(nrb >= UTFmax || fullrune(rb, nrb)){
n -= i; rb[nrb] = 0;
s += i; vgascreenputc(scr, rb, &flushr);
vgascreenputc(scr, buf, &flushr); nrb = 0;
}
} }
flushmemscreen(flushr); flushmemscreen(flushr);

View file

@ -38,7 +38,8 @@ enum
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence */ Runesync = 0x80, /* cannot represent part of a UTF sequence */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0x80, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0xFFFF, /* 16 bit rune */
}; };
/* /*

View file

@ -1,4 +1,5 @@
#include <stdlib.h> #include <stdlib.h>
#include <utf.h>
/* /*
* Use the FSS-UTF transformation proposed by posix. * Use the FSS-UTF transformation proposed by posix.
@ -7,12 +8,14 @@
* Tx 10xxxxxx 6 free bits * Tx 10xxxxxx 6 free bits
* T1 110xxxxx 5 free bits * T1 110xxxxx 5 free bits
* T2 1110xxxx 4 free bits * T2 1110xxxx 4 free bits
* T3 11110xxx 3 free bits
* *
* Encoding is as follows. * Encoding is as follows.
* From hex Thru hex Sequence Bits * From hex Thru hex Sequence Bits
* 00000000 0000007F T0 7 * 00000000 0000007F T0 7
* 00000080 000007FF T1 Tx 11 * 00000080 000007FF T1 Tx 11
* 00000800 0000FFFF T2 Tx Tx 16 * 00000800 0000FFFF T2 Tx Tx 16
* 00010000 0010FFFF T3 Tx Tx Tx 20 (and change)
*/ */
int int
@ -25,7 +28,7 @@ mblen(const char *s, size_t n)
int int
mbtowc(wchar_t *pwc, const char *s, size_t n) mbtowc(wchar_t *pwc, const char *s, size_t n)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
if(!s) if(!s)
@ -70,7 +73,25 @@ mbtowc(wchar_t *pwc, const char *s, size_t n)
return 3; return 3;
} }
/* if(n < 4)
goto bad;
if(UTFmax >= 4) {
c3 = (s[3] ^ 0x80) & 0xff;
if(c3 & 0xC0)
goto bad;
if(c < 0xf8) {
l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff;
if(l <= 0x10000)
goto bad;
if(l > Runemax)
goto bad;
if(pwc)
*pwc = l;
return 4;
}
}
/*
* bad decoding * bad decoding
*/ */
bad: bad:
@ -86,7 +107,10 @@ wctomb(char *s, wchar_t wchar)
if(!s) if(!s)
return 0; return 0;
c = wchar & 0xFFFF; c = wchar;
if(c > Runemax)
c = Runeerror;
if(c < 0x80) { if(c < 0x80) {
s[0] = c; s[0] = c;
return 1; return 1;
@ -98,10 +122,18 @@ wctomb(char *s, wchar_t wchar)
return 2; return 2;
} }
s[0] = 0xE0 | (c >> 12); if(c < 0x10000) {
s[1] = 0x80 | ((c >> 6) & 0x3F); s[0] = 0xE0 | (c >> 12);
s[2] = 0x80 | (c & 0x3F); s[1] = 0x80 | ((c >> 6) & 0x3F);
return 3; s[2] = 0x80 | (c & 0x3F);
return 3;
}
s[0] = 0xf0 | c >> 18;
s[1] = 0x80 | (c >> 12) & 0x3F;
s[2] = 0x80 | (c >> 6) & 0x3F;
s[3] = 0x80 | (c & 0x3F);
return 4;
} }
size_t size_t
@ -117,7 +149,7 @@ mbstowcs(wchar_t *pwcs, const char *s, size_t n)
break; break;
s++; s++;
} else { } else {
d = mbtowc(pwcs, s, 3); d = mbtowc(pwcs, s, UTFmax);
if(d <= 0) if(d <= 0)
return (size_t)((d<0) ? -1 : i); return (size_t)((d<0) ? -1 : i);
s += d; s += d;
@ -133,10 +165,10 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
int i, d; int i, d;
long c; long c;
char *p, *pe; char *p, *pe;
char buf[3]; char buf[UTFmax];
p = s; p = s;
pe = p+n-3; pe = p+n-UTFmax;
while(p < pe) { while(p < pe) {
c = *pwcs++; c = *pwcs++;
if(c < 0x80) if(c < 0x80)
@ -146,17 +178,14 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
if(c == 0) if(c == 0)
return p-s; return p-s;
} }
while(p < pe+3) { while(p < pe+UTFmax) {
c = *pwcs++; c = *pwcs++;
d = wctomb(buf, c); d = wctomb(buf, c);
if(p+d <= pe+3) { if(p+d <= pe+UTFmax) {
*p++ = buf[0]; for(i = 0; i < d; i++)
if(d > 1) { p[i] = buf[i];
*p++ = buf[2]; p += d;
if(d > 2) }
*p++ = buf[3];
}
}
if(c == 0) if(c == 0)
break; break;
} }

View file

@ -546,12 +546,15 @@ __flagfmt(Fmt *f)
int int
__badfmt(Fmt *f) __badfmt(Fmt *f)
{ {
char x[3]; char x[2+UTFmax];
Rune r;
int n;
r = f->r;
x[0] = '%'; x[0] = '%';
x[1] = f->r; n = 1+runetochar(x+1, &r);
x[2] = '%'; x[n++] = '%';
f->prec = 3; f->prec = n;
__fmtcpy(f, (const void*)x, 3, 3); __fmtcpy(f, x, n, n);
return 0; return 0;
} }

View file

@ -23,16 +23,19 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
@ -43,7 +46,7 @@ enum
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -88,6 +91,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -101,11 +123,14 @@ runetochar(char *str, Rune *rune)
{ {
long c; long c;
c = *rune;
if(c > Runemax)
c = Runeerror;
/* /*
* one character sequence * one character sequence
* 00000-0007F => 00-7F * 00000-0007F => 00-7F
*/ */
c = *rune;
if(c <= Rune1) { if(c <= Rune1) {
str[0] = c; str[0] = c;
return 1; return 1;
@ -125,17 +150,29 @@ runetochar(char *str, Rune *rune)
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 0800-FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c <= Rune3) {
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[0] = T3 | (c >> 2*Bitx);
str[2] = Tx | (c & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
return 3; str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int
runelen(long c) runelen(long c)
{ {
Rune rune; Rune rune;
char str[10]; char str[UTFmax];
rune = c; rune = c;
return runetochar(str, &rune); return runetochar(str, &rune);
@ -155,7 +192,10 @@ runenlen(Rune *r, int nrune)
if(c <= Rune2) if(c <= Rune2)
nb += 2; nb += 2;
else else
if(c <= Rune3 || c > Runemax)
nb += 3; nb += 3;
else
nb += 4;
} }
return nb; return nb;
} }
@ -165,13 +205,15 @@ fullrune(char *str, int n)
{ {
int c; int c;
if(n > 0) { if(n <= 0)
c = *(uchar*)str; return 0;
if(c < Tx) c = *(uchar*)str;
return 1; if(c < Tx)
if(n > 1) return 1;
if(c < T3 || n > 2) if(c < T3)
return 1; return n >= 2;
} if(UTFmax == 3 || c < T4)
return 0; return n >= 3;
return n >= 4;
} }

View file

@ -244,26 +244,26 @@ outstring(char *s, long n)
} }
long long
outlstring(ushort *s, long n) outlstring(Rune *s, long n)
{ {
char buf[2]; char buf[sizeof(Rune)];
int c; int c, i;
long r; long r;
while(nstring & 1) while(nstring % sizeof buf)
outstring("", 1); outstring("", 1);
r = nstring; r = nstring;
while(n > 0) { while(n > 0) {
c = *s++; c = *s++;
if(align(0, types[TCHAR], Aarg1)) { if(align(0, types[TCHAR], Aarg1)) {
buf[0] = c>>8; for(i = sizeof buf; i > 0; c >>= 8)
buf[1] = c; buf[--i] = c;
} else { } else {
buf[0] = c; for(i = 0; i < sizeof buf; c >>= 8)
buf[1] = c>>8; buf[i++] = c;
} }
outstring(buf, 2); outstring(buf, sizeof buf);
n -= sizeof(ushort); n -= sizeof buf;
} }
return r; return r;
} }

View file

@ -324,26 +324,26 @@ outstring(char *s, long n)
} }
long long
outlstring(ushort *s, long n) outlstring(Rune *s, long n)
{ {
char buf[2]; char buf[sizeof(Rune)];
int c; int c, i;
long r; long r;
while(nstring & 1) while(nstring % sizeof buf)
outstring("", 1); outstring("", 1);
r = nstring; r = nstring;
while(n > 0) { while(n > 0) {
c = *s++; c = *s++;
if(align(0, types[TCHAR], Aarg1)) { if(align(0, types[TCHAR], Aarg1)) {
buf[0] = c>>8; for(i = sizeof buf; i > 0; c >>= 8)
buf[1] = c; buf[--i] = c;
} else { } else {
buf[0] = c; for(i = 0; i < sizeof buf; c >>= 8)
buf[1] = c>>8; buf[i++] = c;
} }
outstring(buf, 2); outstring(buf, sizeof buf);
n -= sizeof(ushort); n -= sizeof buf;
} }
return r; return r;
} }

View file

@ -487,7 +487,7 @@ bldcclass(void)
exprp++; /* eat '-' */ exprp++; /* eat '-' */
if((c2 = nextrec()) == ']') if((c2 = nextrec()) == ']')
goto Error; goto Error;
classp[n+0] = 0xFFFF; classp[n+0] = Runemax;
classp[n+1] = c1; classp[n+1] = c1;
classp[n+2] = c2; classp[n+2] = c2;
n += 3; n += 3;
@ -509,7 +509,7 @@ classmatch(int classno, int c, int negate)
p = class[classno]; p = class[classno];
while(*p){ while(*p){
if(*p == 0xFFFF){ if(*p == Runemax){
if(p[1]<=c && c<=p[2]) if(p[1]<=c && c<=p[2])
return !negate; return !negate;
p += 3; p += 3;

View file

@ -121,7 +121,7 @@ badname(char *s)
for (; *s != '\0'; s += n) { for (; *s != '\0'; s += n) {
n = chartorune(&r, s); n = chartorune(&r, s);
if (n == 1 && r == Runeerror) if (r == Runeerror)
return 1; return 1;
} }
return 0; return 0;

View file

@ -395,7 +395,7 @@ threadmain(int argc, char *argv[])
if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0) if(strcmp(args[0], "keyboard:")==0 || strcmp(args[0], "scribble:")==0)
if(strcmp(args[1], "value") == 0){ if(strcmp(args[1], "value") == 0){
n = atoi(args[2]); n = atoi(args[2]);
if(n <= 0xFFFF){ if(n <= Runemax){
r = n; r = n;
i = runetochar(str, &r); i = runetochar(str, &r);
write(kbdfd, str, i); write(kbdfd, str, i);

View file

@ -282,7 +282,7 @@ threadmain(int argc, char *argv[])
n = atoi(args[2]); n = atoi(args[2]);
if(n == '\033') /* Escape exits */ if(n == '\033') /* Escape exits */
break; break;
if(n <= 0xFFFF){ if(n <= Runemax){
r = n; r = n;
send(kbdctl->c, &r); send(kbdctl->c, &r);
} }

View file

@ -51,7 +51,7 @@ struct Node
double fconst; /* fp constant */ double fconst; /* fp constant */
vlong vconst; /* non fp const */ vlong vconst; /* non fp const */
char* cstring; /* character string */ char* cstring; /* character string */
ushort* rstring; /* rune string */ Rune* rstring; /* rune string */
Sym* sym; Sym* sym;
Type* type; Type* type;
@ -336,6 +336,8 @@ enum
TFILE, TFILE,
TOLD, TOLD,
NALLTYPES, NALLTYPES,
TRUNE = sizeof(Rune)==4? TUINT: TUSHORT,
}; };
enum enum
{ {
@ -740,7 +742,7 @@ void gclean(void);
void gextern(Sym*, Node*, long, long); void gextern(Sym*, Node*, long, long);
void ginit(void); void ginit(void);
long outstring(char*, long); long outstring(char*, long);
long outlstring(ushort*, long); long outlstring(Rune*, long);
void sextern(Sym*, Node*, long, long); void sextern(Sym*, Node*, long, long);
void xcom(Node*); void xcom(Node*);
long exreg(Type*); long exreg(Type*);

View file

@ -855,9 +855,9 @@ lstring:
LLSTRING LLSTRING
{ {
$$ = new(OLSTRING, Z, Z); $$ = new(OLSTRING, Z, Z);
$$->type = typ(TARRAY, types[TUSHORT]); $$->type = typ(TARRAY, types[TRUNE]);
$$->type->width = $1.l + sizeof(ushort); $$->type->width = $1.l + sizeof(Rune);
$$->rstring = (ushort*)$1.s; $$->rstring = (Rune*)$1.s;
$$->sym = symstring; $$->sym = symstring;
$$->etype = TARRAY; $$->etype = TARRAY;
$$->class = CSTATIC; $$->class = CSTATIC;
@ -867,16 +867,16 @@ lstring:
char *s; char *s;
int n; int n;
n = $1->type->width - sizeof(ushort); n = $1->type->width - sizeof(Rune);
s = alloc(n+$2.l+MAXALIGN); s = alloc(n+$2.l+MAXALIGN);
memcpy(s, $1->rstring, n); memcpy(s, $1->rstring, n);
memcpy(s+n, $2.s, $2.l); memcpy(s+n, $2.s, $2.l);
*(ushort*)(s+n+$2.l) = 0; *(Rune*)(s+n+$2.l) = 0;
$$ = $1; $$ = $1;
$$->type->width += $2.l; $$->type->width += $2.l;
$$->rstring = (ushort*)s; $$->rstring = (Rune*)s;
} }
zelist: zelist:

View file

@ -633,10 +633,11 @@ tcomo(Node *n, int f)
break; break;
case OLSTRING: case OLSTRING:
if(n->type->link != types[TUSHORT]) { if(n->type->link != types[TRUNE]) {
o = outstring(0, 0); o = outstring(0, 0);
while(o & 3) { while(o & 3) {
outlstring(L"", sizeof(ushort)); Rune str[1] = {0};
outlstring(str, sizeof(Rune));
o = outlstring(0, 0); o = outlstring(0, 0);
} }
} }

View file

@ -67,13 +67,14 @@ getflag(char *s)
{ {
Bits flag; Bits flag;
int f; int f;
char *fmt; char *fmt, *e;
Rune c; Rune c;
fmt = fmtbuf; fmt = fmtbuf;
e = fmtbuf + sizeof(fmtbuf)-1;
flag = zbits; flag = zbits;
nstar = 0; nstar = 0;
for(;;) { while(fmt < e){
s += chartorune(&c, s); s += chartorune(&c, s);
fmt += runetochar(fmt, &c); fmt += runetochar(fmt, &c);
if(c == 0 || c >= nelem(flagbits)) if(c == 0 || c >= nelem(flagbits))
@ -175,7 +176,7 @@ pragvararg(void)
{ {
Sym *s; Sym *s;
int n, c; int n, c;
char *t; char *t, *e;
Rune r; Rune r;
Type *ty; Type *ty;
@ -225,12 +226,15 @@ cktype:
if(c != '"') if(c != '"')
goto bad; goto bad;
t = fmtbuf; t = fmtbuf;
e = t + sizeof(fmtbuf)-1;
for(;;) { for(;;) {
r = getr(); r = getr();
if(r == ' ' || r == '\n') if(r == ' ' || r == '\n')
goto bad; goto bad;
if(r == '"') if(r == '"')
break; break;
if(t >= e)
goto bad;
t += runetochar(t, &r); t += runetochar(t, &r);
} }
*t = 0; *t = 0;

View file

@ -467,7 +467,7 @@ l1:
yyerror("missing '"); yyerror("missing '");
peekc = c1; peekc = c1;
} }
yylval.vval = convvtox(c, TUSHORT); yylval.vval = convvtox(c, TRUNE);
return LUCONST; return LUCONST;
} }
if(c == '"') { if(c == '"') {
@ -541,15 +541,15 @@ l1:
c = escchar('"', 1, 0); c = escchar('"', 1, 0);
if(c == EOF) if(c == EOF)
break; break;
cp = allocn(cp, c1, sizeof(ushort)); cp = allocn(cp, c1, sizeof(Rune));
*(ushort*)(cp + c1) = c; *(Rune*)(cp + c1) = c;
c1 += sizeof(ushort); c1 += sizeof(Rune);
} }
yylval.sval.l = c1; yylval.sval.l = c1;
do { do {
cp = allocn(cp, c1, sizeof(ushort)); cp = allocn(cp, c1, sizeof(Rune));
*(ushort*)(cp + c1) = 0; *(Rune*)(cp + c1) = 0;
c1 += sizeof(ushort); c1 += sizeof(Rune);
} while(c1 & MAXALIGN); } while(c1 & MAXALIGN);
yylval.sval.s = cp; yylval.sval.s = cp;
return LLSTRING; return LLSTRING;
@ -1027,7 +1027,7 @@ getnsc(void)
} else } else
c = GETC(); c = GETC();
for(;;) { for(;;) {
if(!isspace(c)) if(c >= Runeself || !isspace(c))
return c; return c;
if(c == '\n') { if(c == '\n') {
lineno++; lineno++;

View file

@ -132,28 +132,28 @@ casf(void)
} }
long long
outlstring(ushort *s, long n) outlstring(Rune *s, long n)
{ {
char buf[2]; char buf[sizeof(Rune)];
int c; int c, i;
long r; long r;
if(suppress) if(suppress)
return nstring; return nstring;
while(nstring & 1) while(nstring % sizeof buf)
outstring("", 1); outstring("", 1);
r = nstring; r = nstring;
while(n > 0) { while(n > 0) {
c = *s++; c = *s++;
if(align(0, types[TCHAR], Aarg1)) { if(align(0, types[TCHAR], Aarg1)) {
buf[0] = c>>8; for(i = sizeof buf; i > 0; c >>= 8)
buf[1] = c; buf[--i] = c;
} else { } else {
buf[0] = c; for(i = 0; i < sizeof buf; c >>= 8)
buf[1] = c>>8; buf[i++] = c;
} }
outstring(buf, 2); outstring(buf, sizeof buf);
n -= sizeof(ushort); n -= sizeof buf;
} }
return r; return r;
} }

View file

@ -503,7 +503,6 @@ Cputrscvt(Cdimg *cd, char *s, int size)
{ {
Rune r[256]; Rune r[256];
strtorune(r, s);
Cputrs(cd, strtorune(r, s), size); Cputrs(cd, strtorune(r, s), size);
} }

View file

@ -45,8 +45,7 @@ isbadjoliet(char *s)
if(utflen(s) > 64) if(utflen(s) > 64)
return 1; return 1;
strtorune(r, s); for(p=strtorune(r, s); *p; p++)
for(p=r; *p; p++)
if(isjolietfrog(*p)) if(isjolietfrog(*p))
return 1; return 1;
return 0; return 0;

View file

@ -54,7 +54,7 @@ Reprog *pattern;
int peekc; int peekc;
int pflag; int pflag;
int rescuing; int rescuing;
Rune rhsbuf[LBSIZE/2]; Rune rhsbuf[LBSIZE/sizeof(Rune)];
char savedfile[FNSIZE]; char savedfile[FNSIZE];
jmp_buf savej; jmp_buf savej;
int subnewa; int subnewa;
@ -990,11 +990,11 @@ getline(int tl)
lp = linebuf; lp = linebuf;
bp = getblock(tl, OREAD); bp = getblock(tl, OREAD);
nl = nleft; nl = nleft;
tl &= ~((BLKSIZE/2) - 1); tl &= ~((BLKSIZE/sizeof(Rune)) - 1);
while(*lp++ = *bp++) { while(*lp++ = *bp++) {
nl -= sizeof(Rune); nl -= sizeof(Rune);
if(nl == 0) { if(nl == 0) {
bp = getblock(tl += BLKSIZE/2, OREAD); bp = getblock(tl += BLKSIZE/sizeof(Rune), OREAD);
nl = nleft; nl = nleft;
} }
} }
@ -1012,7 +1012,7 @@ putline(void)
tl = tline; tl = tline;
bp = getblock(tl, OWRITE); bp = getblock(tl, OWRITE);
nl = nleft; nl = nleft;
tl &= ~((BLKSIZE/2)-1); tl &= ~((BLKSIZE/sizeof(Rune))-1);
while(*bp = *lp++) { while(*bp = *lp++) {
if(*bp++ == '\n') { if(*bp++ == '\n') {
bp[-1] = 0; bp[-1] = 0;
@ -1021,7 +1021,7 @@ putline(void)
} }
nl -= sizeof(Rune); nl -= sizeof(Rune);
if(nl == 0) { if(nl == 0) {
tl += BLKSIZE/2; tl += BLKSIZE/sizeof(Rune);
bp = getblock(tl, OWRITE); bp = getblock(tl, OWRITE);
nl = nleft; nl = nleft;
} }
@ -1048,8 +1048,8 @@ getblock(int atl, int iof)
static uchar ibuff[BLKSIZE]; static uchar ibuff[BLKSIZE];
static uchar obuff[BLKSIZE]; static uchar obuff[BLKSIZE];
bno = atl / (BLKSIZE/2); bno = atl / (BLKSIZE/sizeof(Rune));
off = (atl<<1) & (BLKSIZE-1) & ~03; off = (atl*sizeof(Rune)) & (BLKSIZE-1) & ~03;
if(bno >= NBLK) { if(bno >= NBLK) {
lastc = '\n'; lastc = '\n';
error(T); error(T);
@ -1240,7 +1240,7 @@ compsub(void)
if(c == '\\') { if(c == '\\') {
c = getchr(); c = getchr();
*p++ = ESCFLG; *p++ = ESCFLG;
if(p >= &rhsbuf[LBSIZE/2]) if(p >= &rhsbuf[nelem(rhsbuf)])
error(Q); error(Q);
} else } else
if(c == '\n' && (!globp || !globp[0])) { if(c == '\n' && (!globp || !globp[0])) {
@ -1251,7 +1251,7 @@ compsub(void)
if(c == seof) if(c == seof)
break; break;
*p++ = c; *p++ = c;
if(p >= &rhsbuf[LBSIZE/2]) if(p >= &rhsbuf[nelem(rhsbuf)])
error(Q); error(Q);
} }
*p = 0; *p = 0;

View file

@ -359,7 +359,7 @@ utfconv(void)
rb = malloc(nbuf+1); rb = malloc(nbuf+1);
memmove(rb, buf+2, nbuf); memmove(rb, buf+2, nbuf);
p = (char*)buf; p = (char*)buf;
e = p+nbuf-4; e = p+sizeof(buf)-UTFmax-1;
for(i=0; i<nbuf && p < e; i+=2){ for(i=0; i<nbuf && p < e; i+=2){
r = rb[i+1] | rb[i]<<8; r = rb[i+1] | rb[i]<<8;
p += runetochar(p, &r); p += runetochar(p, &r);
@ -376,7 +376,7 @@ utfconv(void)
rb = malloc(nbuf+1); rb = malloc(nbuf+1);
memmove(rb, buf+2, nbuf); memmove(rb, buf+2, nbuf);
p = (char*)buf; p = (char*)buf;
e = p+nbuf-4; e = p+sizeof(buf)-UTFmax-1;
for(i=0; i<nbuf && p < e; i+=2){ for(i=0; i<nbuf && p < e; i+=2){
r = rb[i] | rb[i+1]<<8; r = rb[i] | rb[i+1]<<8;
p += runetochar(p, &r); p += runetochar(p, &r);

View file

@ -1525,7 +1525,7 @@ fromlatin1(char *from)
if(*p == 0) if(*p == 0)
return nil; return nil;
to = malloc(3*strlen(from)+2); to = malloc(UTFmax*strlen(from)+2);
if(to == nil) if(to == nil)
return nil; return nil;
for(p = to; *from; from++){ for(p = to; *from; from++){

View file

@ -59,7 +59,7 @@ _urlunesc(char *s)
t = v; t = v;
while(*s){ while(*s){
/* in decoding error, assume latin1 */ /* in decoding error, assume latin1 */
if((n=chartorune(&r, s)) == 1 && r == 0x80) if((n=chartorune(&r, s)) == 1 && r == Runeerror)
r = *s; r = *s;
s += n; s += n;
t += runetochar(t, &r); t += runetochar(t, &r);

View file

@ -286,7 +286,7 @@ output(int on1, int on2) /* print items from olist */
{ {
int i; int i;
Rune *temp; Rune *temp;
char buf[BUFSIZ]; char buf[BUFSIZ*UTFmax+1];
if (no <= 0) { /* default case */ if (no <= 0) { /* default case */
printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2])); printf("%s", runetostr(buf, on1? ppi[F1][j1]: ppi[F2][j2]));

View file

@ -7,16 +7,19 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
@ -27,7 +30,7 @@ enum
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -72,6 +75,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(unsigned char*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -85,11 +107,14 @@ runetochar(char *str, Rune *rune)
{ {
long c; long c;
c = *rune;
if(c > Runemax)
c = Runeerror;
/* /*
* one character sequence * one character sequence
* 00000-0007F => 00-7F * 00000-0007F => 00-7F
*/ */
c = *rune;
if(c <= Rune1) { if(c <= Rune1) {
str[0] = c; str[0] = c;
return 1; return 1;
@ -109,34 +134,70 @@ runetochar(char *str, Rune *rune)
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 0800-FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c <= Rune3) {
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[0] = T3 | (c >> 2*Bitx);
str[2] = Tx | (c & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
return 3; str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int
runelen(long c) runelen(long c)
{ {
Rune rune; Rune rune;
char str[10]; char str[UTFmax];
rune = c; rune = c;
return runetochar(str, &rune); return runetochar(str, &rune);
} }
int
runenlen(Rune *r, int nrune)
{
int nb, c;
nb = 0;
while(nrune--) {
c = *r++;
if(c <= Rune1)
nb++;
else
if(c <= Rune2)
nb += 2;
else
if(c <= Rune3 || c > Runemax)
nb += 3;
else
nb += 4;
}
return nb;
}
int int
fullrune(char *str, int n) fullrune(char *str, int n)
{ {
int c; int c;
if(n > 0) { if(n <= 0)
c = *(unsigned char*)str; return 0;
if(c < Tx) c = *(unsigned char*)str;
return 1; if(c < Tx)
if(n > 1) return 1;
if(c < T3 || n > 2) if(c < T3)
return 1; return n >= 2;
} if(UTFmax == 3 || c < T4)
return 0; return n >= 3;
return n >= 4;
} }

View file

@ -14,6 +14,7 @@ enum
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a utf sequence (<) */ Runesync = 0x80, /* cannot represent part of a utf sequence (<) */
Runeself = 0x80, /* rune and utf sequences are the same (<) */ Runeself = 0x80, /* rune and utf sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in utf */ Runeerror = 0xFFFD, /* decoding error in utf */
Runemax = 0xFFFF, /* 16 bit rune */
}; };
#endif #endif

View file

@ -71,7 +71,7 @@ int
inputc(void) inputc(void)
{ {
int n, nbuf; int n, nbuf;
char buf[3]; char buf[UTFmax];
Rune r; Rune r;
Again: Again:

View file

@ -494,7 +494,7 @@ bldcclass(void)
exprp++; /* eat '-' */ exprp++; /* eat '-' */
if((c2 = nextrec()) == ']') if((c2 = nextrec()) == ']')
goto Error; goto Error;
classp[n+0] = 0xFFFF; classp[n+0] = Runemax;
classp[n+1] = c1; classp[n+1] = c1;
classp[n+2] = c2; classp[n+2] = c2;
n += 3; n += 3;
@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
p = class[classno]; p = class[classno];
while(*p){ while(*p){
if(*p == 0xFFFF){ if(*p == Runemax){
if(p[1]<=c && c<=p[2]) if(p[1]<=c && c<=p[2])
return !negate; return !negate;
p += 3; p += 3;

View file

@ -429,7 +429,7 @@ outTv(Tmesg type, vlong v1)
void void
outTslS(Tmesg type, int s1, long l1, Rune *s) outTslS(Tmesg type, int s1, long l1, Rune *s)
{ {
char buf[DATASIZE*3+1]; char buf[DATASIZE*UTFmax+1];
char *c; char *c;
outstart(type); outstart(type);

View file

@ -625,7 +625,7 @@ compsub(Rune *rhs, Rune *end)
while ((r = *cp++) != '\0') { while ((r = *cp++) != '\0') {
if(r == '\\') { if(r == '\\') {
if (rhs < end) if (rhs < end)
*rhs++ = 0xFFFF; *rhs++ = Runemax;
else else
return 0; return 0;
r = *cp++; r = *cp++;
@ -1055,7 +1055,7 @@ dosub(Rune *rhsbuf)
sp = place(sp, loc1, loc2); sp = place(sp, loc1, loc2);
continue; continue;
} }
if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') { if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') {
n = c-'0'; n = c-'0';
if (subexp[n].rsp && subexp[n].rep) { if (subexp[n].rsp && subexp[n].rep) {
sp = place(sp, subexp[n].rsp, subexp[n].rep); sp = place(sp, subexp[n].rsp, subexp[n].rep);
@ -1336,7 +1336,7 @@ void
arout(void) arout(void)
{ {
int c; int c;
char *s; char *s, *e;
char buf[128]; char buf[128];
Rune *p1; Rune *p1;
Biobuf *fi; Biobuf *fi;
@ -1347,7 +1347,7 @@ arout(void)
Bputrune(&fout, *p1); Bputrune(&fout, *p1);
Bputc(&fout, '\n'); Bputc(&fout, '\n');
} else { } else {
for(s = buf, p1 = (*aptr)->text; *p1; p1++) for(s = buf, e = buf+sizeof(buf)-UTFmax-1, p1 = (*aptr)->text; *p1 && s < e; p1++)
s += runetochar(s, p1); s += runetochar(s, p1);
*s = '\0'; *s = '\0';
if((fi = Bopen(buf, OREAD)) == 0) if((fi = Bopen(buf, OREAD)) == 0)

View file

@ -93,7 +93,7 @@ isoutf_in(int fd, long *notused, struct convert *out)
if(!fullisorune(buf+i, tot-i)) if(!fullisorune(buf+i, tot-i))
break; break;
c = isochartorune(&runes[j], buf+i); c = isochartorune(&runes[j], buf+i);
if(runes[j] == Runeerror && c == 1){ if(runes[j] == Runeerror){
if(squawk) if(squawk)
EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
if(clean){ if(clean){

View file

@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07])
#define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07])
#define MAXRUNE 0xFFFF #define MAXRUNE Runemax
uchar f[(MAXRUNE+1)/8]; uchar f[(MAXRUNE+1)/8];
uchar t[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8];

View file

@ -803,13 +803,14 @@ attext(Thing *t, Point p, char *buf)
} }
int int
type(char *buf, char *tag) type(char *buf, int nbuf, char *tag)
{ {
Rune r; Rune r;
char *p; char *p, *e;
esetcursor(&busy); esetcursor(&busy);
p = buf; p = buf;
e = buf + nbuf-UTFmax-1;
for(;;){ for(;;){
*p = 0; *p = 0;
mesg("%s: %s", tag, buf); mesg("%s: %s", tag, buf);
@ -827,7 +828,8 @@ type(char *buf, char *tag)
--p; --p;
break; break;
default: default:
p += runetochar(p, &r); if(p < e)
p += runetochar(p, &r);
} }
} }
} }
@ -846,7 +848,7 @@ textedit(Thing *t, char *tag)
Thing *nt; Thing *nt;
buttons(Up); buttons(Up);
if(type(buf, tag) == 0) if(type(buf, sizeof(buf), tag) == 0)
return; return;
if(strcmp(tag, "file") == 0){ if(strcmp(tag, "file") == 0){
for(s=buf; *s; s++) for(s=buf; *s; s++)
@ -1174,7 +1176,7 @@ cntledit(char *tag)
long l; long l;
buttons(Up); buttons(Up);
if(type(buf, tag) == 0) if(type(buf, sizeof(buf), tag) == 0)
return; return;
if(strcmp(tag, "mag") == 0){ if(strcmp(tag, "mag") == 0){
if(buf[0]<'0' || '9'<buf[0] || (l=atoi(buf))<=0 || l>Maxmag){ if(buf[0]<'0' || '9'<buf[0] || (l=atoi(buf))<=0 || l>Maxmag){
@ -1806,7 +1808,7 @@ tchar(Thing *t)
return; return;
} }
} }
if(type(buf, "char (hex or character or hex-hex)") == 0) if(type(buf, sizeof(buf), "char (hex or character or hex-hex)") == 0)
return; return;
if(utflen(buf) == 1){ if(utflen(buf) == 1){
chartorune(&r, buf); chartorune(&r, buf);
@ -2000,7 +2002,7 @@ menu(void)
sel = emenuhit(3, &mouse, &menu3); sel = emenuhit(3, &mouse, &menu3);
switch(sel){ switch(sel){
case Mopen: case Mopen:
if(type(buf, "file")){ if(type(buf, sizeof(buf), "file")){
t = tget(buf); t = tget(buf);
if(t) if(t)
drawthing(t, 1); drawthing(t, 1);

View file

@ -51,13 +51,13 @@ range(char *argv[])
return "bad range"; return "bad range";
} }
min = strtoul(q, &q, 16); min = strtoul(q, &q, 16);
if(min<0 || min>0xFFFF || *q!='-') if(min<0 || min>Runemax || *q!='-')
goto err; goto err;
q++; q++;
if(strchr(hex, *q) == 0) if(strchr(hex, *q) == 0)
goto err; goto err;
max = strtoul(q, &q, 16); max = strtoul(q, &q, 16);
if(max<0 || max>0xFFFF || max<min || *q!=0) if(max<0 || max>Runemax || max<min || *q!=0)
goto err; goto err;
i = 0; i = 0;
do{ do{
@ -111,7 +111,7 @@ chars(char *argv[])
return "bad char"; return "bad char";
} }
m = strtoul(q, &q, 16); m = strtoul(q, &q, 16);
if(m<0 || m>0xFFFF || *q!=0) if(m<0 || m>Runemax || *q!=0)
goto err; goto err;
Bprint(&bout, "%C", m); Bprint(&bout, "%C", m);
if(!text) if(!text)

View file

@ -528,12 +528,15 @@ __flagfmt(Fmt *f)
/*
 * Emit the verb rune stored in f->r bracketed by percent signs
 * (for a verb 'z' the output is "%z%").  Presumably this is the
 * handler used when a format verb has no conversion routine;
 * confirm against the verb table in this file.
 *
 * The rune is encoded with runetochar, so the scratch buffer
 * holds the two '%' bytes plus up to UTFmax bytes of UTF.
 * Always returns 0.
 */
int
__badfmt(Fmt *f)
{
	char x[2+UTFmax];
	Rune r;
	int n;

	r = f->r;
	x[0] = '%';
	n = 1+runetochar(x+1, &r);	/* leading '%' plus the encoded verb */
	x[n++] = '%';
	f->prec = n;
	/* this file uses the double-underscore helpers, not libc's _fmtcpy */
	__fmtcpy(f, (const void*)x, n, n);
	return 0;
}

View file

@ -8,16 +8,19 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
@ -28,7 +31,7 @@ enum
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -73,6 +76,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune)
{ {
long c; long c;
c = *rune;
if(c > Runemax)
c = Runeerror;
/* /*
* one character sequence * one character sequence
* 00000-0007F => 00-7F * 00000-0007F => 00-7F
*/ */
c = *rune;
if(c <= Rune1) { if(c <= Rune1) {
str[0] = c; str[0] = c;
return 1; return 1;
@ -110,10 +135,22 @@ runetochar(char *str, Rune *rune)
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 0800-FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c <= Rune3) {
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[0] = T3 | (c >> 2*Bitx);
str[2] = Tx | (c & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
return 3; str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int
@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune)
if(c <= Rune2) if(c <= Rune2)
nb += 2; nb += 2;
else else
if(c <= Rune3 || c > Runemax)
nb += 3; nb += 3;
else
nb += 4;
} }
return nb; return nb;
} }
@ -150,13 +190,15 @@ fullrune(char *str, int n)
{ {
int c; int c;
if(n > 0) { if(n <= 0)
c = *(uchar*)str; return 0;
if(c < Tx) c = *(uchar*)str;
return 1; if(c < Tx)
if(n > 1) return 1;
if(c < T3 || n > 2) if(c < T3)
return 1; return n >= 2;
} if(UTFmax == 3 || c < T4)
return 0; return n >= 3;
return n >= 4;
} }

View file

@ -8,7 +8,8 @@ enum
UTFmax = 3, /* maximum bytes per rune */ UTFmax = 3, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0x80, /* decoding error in UTF */ Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0xFFFF, /* 16 bit rune */
}; };
/* /*

View file

@ -1,6 +1,7 @@
#include <plan9.h> #include <plan9.h>
char *argv0; char *argv0;
enum enum
{ {
Bit1 = 7, Bit1 = 7,
@ -8,27 +9,30 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror Bad = Runeerror,
}; };
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -73,6 +77,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -86,11 +109,14 @@ runetochar(char *str, Rune *rune)
{ {
long c; long c;
c = *rune;
if(c > Runemax)
c = Runeerror;
/* /*
* one character sequence * one character sequence
* 00000-0007F => 00-7F * 00000-0007F => 00-7F
*/ */
c = *rune;
if(c <= Rune1) { if(c <= Rune1) {
str[0] = c; str[0] = c;
return 1; return 1;
@ -110,39 +136,70 @@ runetochar(char *str, Rune *rune)
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 0800-FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c <= Rune3) {
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[0] = T3 | (c >> 2*Bitx);
str[2] = Tx | (c & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
return 3; str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int
runelen(long c) runelen(long c)
{ {
Rune rune; Rune rune;
char str[10]; char str[UTFmax];
rune = c; rune = c;
return runetochar(str, &rune); return runetochar(str, &rune);
} }
int int
utflen(char *s) runenlen(Rune *r, int nrune)
{
int nb, c;
nb = 0;
while(nrune--) {
c = *r++;
if(c <= Rune1)
nb++;
else
if(c <= Rune2)
nb += 2;
else
if(c <= Rune3 || c > Runemax)
nb += 3;
else
nb += 4;
}
return nb;
}
/*
 * Report whether the n bytes at str are enough to hold one
 * complete UTF sequence.  The required length is decided from
 * the first byte alone; the following bytes are not inspected.
 * Returns 1 when enough bytes are present, 0 otherwise
 * (including n <= 0).
 */
int
fullrune(char *str, int n)
{
	int lead, need;

	if(n <= 0)
		return 0;
	lead = *(uchar*)str;
	if(lead < Tx)
		need = 1;
	else if(lead < T3)
		need = 2;
	else if(UTFmax == 3 || lead < T4)
		need = 3;	/* with UTFmax == 3 nothing longer is recognized */
	else
		need = 4;
	return n >= need;
}

View file

@ -1223,12 +1223,12 @@ latin1toutf(char **out, char *in, char *e)
return 0; return 0;
n += e-in; n += e-in;
*out = p = malloc(n+1); *out = p = malloc(UTFmax*n+1);
if(p == nil) if(p == nil)
return 0; return 0;
for(; in < e; in++){ for(; in < e; in++){
r = (uchar)*in; r = (*in) & 0xff;
p += runetochar(p, &r); p += runetochar(p, &r);
} }
*p = 0; *p = 0;

View file

@ -954,7 +954,7 @@ tokenconvert(String *t)
{ {
String *s; String *s;
char decoded[1024]; char decoded[1024];
char utfbuf[2*1024]; char utfbuf[UTFmax*1024];
int i, len; int i, len;
char *e; char *e;
char *token; char *token;

View file

@ -335,6 +335,8 @@ screenputc(char *buf)
addflush(r); addflush(r);
curpos.x = *xp; curpos.x = *xp;
break; break;
case '\0':
break;
default: default:
p = memsubfontwidth(memdefont, buf); p = memsubfontwidth(memdefont, buf);
w = p.x; w = p.x;
@ -354,23 +356,19 @@ screenputc(char *buf)
void void
screenputs(char *s, int n) screenputs(char *s, int n)
{ {
int i; static char rb[UTFmax+1];
Rune r; static int nrb;
char buf[4]; char *e;
drawlock(); drawlock();
while(n > 0){ e = s + n;
i = chartorune(&r, s); while(s < e){
if(i == 0){ rb[nrb++] = *s++;
s++; if(nrb >= UTFmax || fullrune(rb, nrb)){
--n; rb[nrb] = 0;
continue; screenputc(rb);
nrb = 0;
} }
memmove(buf, s, i);
buf[i] = 0;
n -= i;
s += i;
screenputc(buf);
} }
screenflush(); screenflush();
drawunlock(); drawunlock();

View file

@ -141,7 +141,7 @@ Biobuf* foutput; /* y.output file */
char* infile; /* input file name */ char* infile; /* input file name */
int numbval; /* value of an input number */ int numbval; /* value of an input number */
char tokname[NAMESIZE+4]; /* input token name, slop for runes and 0 */ char tokname[NAMESIZE+UTFmax+1]; /* input token name, slop for runes and 0 */
/* structure declarations */ /* structure declarations */

View file

@ -7,7 +7,7 @@ Bgetrune(Biobufhdr *bp)
{ {
int c, i; int c, i;
Rune rune; Rune rune;
char str[4]; char str[UTFmax];
c = Bgetc(bp); c = Bgetc(bp);
if(c < Runeself) { /* one char */ if(c < Runeself) { /* one char */

View file

@ -6,7 +6,7 @@ int
Bputrune(Biobufhdr *bp, long c) Bputrune(Biobufhdr *bp, long c)
{ {
Rune rune; Rune rune;
char str[4]; char str[UTFmax];
int n; int n;
rune = c; rune = c;

View file

@ -512,12 +512,15 @@ _flagfmt(Fmt *f)
/*
 * Emit the verb rune stored in f->r bracketed by percent signs
 * (for a verb 'z' the output is "%z%").  The rune is encoded
 * with runetochar, so the scratch buffer holds the two '%'
 * bytes plus up to UTFmax bytes of UTF.  Always returns 0.
 */
int
_badfmt(Fmt *f)
{
	char x[2+UTFmax], *p;
	Rune r;
	int n;

	r = f->r;
	p = x;
	*p++ = '%';
	p += runetochar(p, &r);
	*p++ = '%';
	n = p - x;	/* bytes written: '%', encoded verb, '%' */
	f->prec = n;
	_fmtcpy(f, x, n, n);
	return 0;
}

View file

@ -8,16 +8,19 @@ enum
Bit2 = 5, Bit2 = 5,
Bit3 = 4, Bit3 = 4,
Bit4 = 3, Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */ Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */ Testx = Maskx ^ 0xFF, /* 1100 0000 */
@ -28,7 +31,7 @@ enum
int int
chartorune(Rune *rune, char *str) chartorune(Rune *rune, char *str)
{ {
int c, c1, c2; int c, c1, c2, c3;
long l; long l;
/* /*
@ -73,6 +76,25 @@ chartorune(Rune *rune, char *str)
return 3; return 3;
} }
/*
* four character sequence
* 10000-10FFFF => T4 Tx Tx Tx
*/
if(UTFmax >= 4) {
c3 = *(uchar*)(str+3) ^ Tx;
if(c3 & Testx)
goto bad;
if(c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if(l <= Rune3)
goto bad;
if(l > Runemax)
goto bad;
*rune = l;
return 4;
}
}
/* /*
* bad decoding * bad decoding
*/ */
@ -86,11 +108,14 @@ runetochar(char *str, Rune *rune)
{ {
long c; long c;
c = *rune;
if(c > Runemax)
c = Runeerror;
/* /*
* one character sequence * one character sequence
* 00000-0007F => 00-7F * 00000-0007F => 00-7F
*/ */
c = *rune;
if(c <= Rune1) { if(c <= Rune1) {
str[0] = c; str[0] = c;
return 1; return 1;
@ -110,17 +135,29 @@ runetochar(char *str, Rune *rune)
* three character sequence * three character sequence
* 0800-FFFF => T3 Tx Tx * 0800-FFFF => T3 Tx Tx
*/ */
str[0] = T3 | (c >> 2*Bitx); if(c <= Rune3) {
str[1] = Tx | ((c >> 1*Bitx) & Maskx); str[0] = T3 | (c >> 2*Bitx);
str[2] = Tx | (c & Maskx); str[1] = Tx | ((c >> 1*Bitx) & Maskx);
return 3; str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
} }
int int
runelen(long c) runelen(long c)
{ {
Rune rune; Rune rune;
char str[10]; char str[UTFmax];
rune = c; rune = c;
return runetochar(str, &rune); return runetochar(str, &rune);
@ -140,7 +177,10 @@ runenlen(Rune *r, int nrune)
if(c <= Rune2) if(c <= Rune2)
nb += 2; nb += 2;
else else
if(c <= Rune3 || c > Runemax)
nb += 3; nb += 3;
else
nb += 4;
} }
return nb; return nb;
} }
@ -150,13 +190,15 @@ fullrune(char *str, int n)
{ {
int c; int c;
if(n > 0) { if(n <= 0)
c = *(uchar*)str; return 0;
if(c < Tx) c = *(uchar*)str;
return 1; if(c < Tx)
if(n > 1) return 1;
if(c < T3 || n > 2) if(c < T3)
return 1; return n >= 2;
} if(UTFmax == 3 || c < T4)
return 0; return n >= 3;
return n >= 4;
} }

View file

@ -70,7 +70,7 @@ buildfont(Display *d, char *buf, char *name)
} }
max = strtol(s, &s, 0); max = strtol(s, &s, 0);
s = skip(s); s = skip(s);
if(*s==0 || min>=65536 || max>=65536 || min>max){ if(*s==0 || min>Runemax || max>Runemax || min>max){
werrstr("illegal subfont range"); werrstr("illegal subfont range");
Err3: Err3:
freefont(fnt); freefont(fnt);

View file

@ -199,7 +199,7 @@ static void
ekeyslave(int fd) ekeyslave(int fd)
{ {
Rune r; Rune r;
char t[3], k[10]; char t[1+UTFmax], k[10];
int kr, kn, w; int kr, kn, w;
if(eforkslave(Ekeyboard) < MAXSLAVE) if(eforkslave(Ekeyboard) < MAXSLAVE)
@ -215,10 +215,9 @@ ekeyslave(int fd)
} }
w = chartorune(&r, k); w = chartorune(&r, k);
kn -= w; kn -= w;
memmove(t+1, k, w);
memmove(k, &k[w], kn); memmove(k, &k[w], kn);
t[1] = r; if(write(epipe[1], t, sizeof(t)) != sizeof(t))
t[2] = r>>8;
if(write(epipe[1], t, 3) != 3)
break; break;
} }
breakout:; breakout:;
@ -302,7 +301,7 @@ loop:
s->head = (Ebuf *)1; s->head = (Ebuf *)1;
return; return;
} }
if(i == Skeyboard && n != 3) if(i == Skeyboard && n != (1+UTFmax))
drawerror(display, "events: protocol error: keyboard"); drawerror(display, "events: protocol error: keyboard");
if(i == Smouse){ if(i == Smouse){
if(n < 1+1+2*12) if(n < 1+1+2*12)
@ -418,14 +417,13 @@ int
ekbd(void) ekbd(void)
{ {
Ebuf *eb; Ebuf *eb;
int c; Rune r;
if(Skeyboard < 0) if(Skeyboard < 0)
drawerror(display, "events: keyboard not initialzed"); drawerror(display, "events: keyboard not initialzed");
eb = ebread(&eslave[Skeyboard]); eb = ebread(&eslave[Skeyboard]);
c = eb->buf[0] + (eb->buf[1]<<8); chartorune(&r, (char*)eb->buf);
free(eb); return r;
return c;
} }
void void

View file

@ -1310,9 +1310,9 @@ getchar(TokenSource* ts)
break; break;
case UTF_8: case UTF_8:
ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
n = chartorune(&r, (char*)(buf+ts->i));
if(ok) { if(ok) {
if(warn && c == 0x80) n = chartorune(&r, (char*)(buf+ts->i));
if(warn && c == Runeerror)
fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
ts->i += n; ts->i += n;
c = r; c = r;

View file

@ -535,7 +535,7 @@ toStr(uchar* buf, int n, int chset)
// Convert buf[0:n], Unicode characters, // Convert buf[0:n], Unicode characters,
// into an emalloc'd null-terminated string in character set chset. // into an emalloc'd null-terminated string in character set chset.
// Use 0x80 for unconvertable characters. // Use Runeerror for unconvertable characters.
uchar* uchar*
fromStr(Rune* buf, int n, int chset) fromStr(Rune* buf, int n, int chset)
{ {
@ -554,7 +554,7 @@ fromStr(Rune* buf, int n, int chset)
for(i = 0; i < n; i++) { for(i = 0; i < n; i++) {
ch = buf[i]; ch = buf[i];
if(ch > lim) if(ch > lim)
ch = 0x80; ch = Runeerror;
ans[i] = ch; ans[i] = ch;
} }
ans[n] = 0; ans[n] = 0;