From 917da0089dcaa013979a69aaeaeff0c08cbc7e26 Mon Sep 17 00:00:00 2001
From: cinap_lenrek
Date: Thu, 24 Sep 2015 12:23:17 +0200
Subject: [PATCH] cpp: handle 4 byte utf sequences (21-bit runes)

---
 sys/src/cmd/cpp/lex.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/sys/src/cmd/cpp/lex.c b/sys/src/cmd/cpp/lex.c
index e90423e93..226097b35 100644
--- a/sys/src/cmd/cpp/lex.c
+++ b/sys/src/cmd/cpp/lex.c
@@ -29,6 +29,7 @@
 
 #define UTF2(c) ((c)>=0xA0 && (c)<0xE0) /* 2-char UTF seq */
 #define UTF3(c) ((c)>=0xE0 && (c)<0xF0) /* 3-char UTF seq */
+#define UTF4(c) ((c)>=0xF0 && (c)<0xF8) /* 4-char UTF seq */
 
 /* character classes */
 #define C_WS 1
@@ -259,7 +260,7 @@ expandlex(void)
 			case C_ALPH:
 				for (j=0; j<=256; j++)
 					if ('a'<=j&&j<='z' || 'A'<=j&&j<='Z'
-					  || UTF2(j) || UTF3(j) || j=='_')
+					  || UTF2(j) || UTF3(j) || UTF4(j) || j=='_')
 						bigfsm[j][fp->state] = nstate;
 				continue;
 			case C_NUM:
@@ -274,7 +275,7 @@ expandlex(void)
 	/* install special cases for ? (trigraphs), \ (splicing), runes */
 	for (i=0; i<MAXSTATE; i++) {
 		for (j=0; j<0xFF; j++)
-			if (j=='?' || j=='\\' || UTF2(j) || UTF3(j)) {
+			if (j=='?' || j=='\\' || UTF2(j) || UTF3(j) || UTF4(j)) {
 				if (bigfsm[j][i]>0)
 					bigfsm[j][i] = ~bigfsm[j][i];
 				bigfsm[j][i] &= ~QBSBIT;
@@ -393,6 +394,10 @@ gettokens(Tokenrow *trp, int reset)
 					runelen = 3;
 					goto reswitch;
 				}
+				if (UTF4(c)) {
+					runelen = 4;
+					goto reswitch;
+				}
 				error(WARNING, "Lexical botch in cpp");
 				ip += runelen;
 				runelen = 1;