337 lines
4.5 KiB
C
337 lines
4.5 KiB
C
#include <u.h>
|
|
#include <libc.h>
|
|
#include <bio.h>
|
|
#include <ctype.h>
|
|
#include "code.h"
|
|
|
|
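/*
 * Read an annotated spelling list whose lines have the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and write a re-encoded version to standard output.
 * NOTE(review): this comment used to give the output form as
 *	octal <tab> word
 * but pdict() emits the binary layout described in the comment
 * that precedes it.
 */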
|
|
|
|
/*
 * One dictionary entry: the word text and the value typecode()
 * returned for the affix codes on its input line.
 */
typedef struct Dict Dict;
struct Dict
{
	char	*word;		/* NUL-terminated text, stored in space[] */
	int	encode;		/* index into encodes[] */
};
|
|
|
|
Dict words[200000];
|
|
char space[500000];
|
|
long encodes[4094];
|
|
long nspace;
|
|
long nwords;
|
|
int ncodes;
|
|
Biobuf bout;
|
|
|
|
/* forward declarations for routines defined further down */
void	readinput(int f);
long	typecode(char *str);
int	wcmp(void *a, void *b);
void	pdict(void);
void	sput(int s);
|
|
|
|
void
|
|
main(int argc, char *argv[])
|
|
{
|
|
int f;
|
|
|
|
Binit(&bout, 1, OWRITE);
|
|
nwords = 0;
|
|
nspace = 0;
|
|
ncodes = 0;
|
|
if(argc <= 1)
|
|
readinput(0);
|
|
while(argc > 1) {
|
|
f = open(argv[1], 0);
|
|
if(f < 0) {
|
|
fprint(2, "Cannot open %s\n", argv[1]);
|
|
exits("open");
|
|
}
|
|
readinput(f);
|
|
argc--;
|
|
argv++;
|
|
}
|
|
fprint(2, "words = %ld; space = %ld; codes = %d\n",
|
|
nwords, nspace, ncodes);
|
|
qsort(words, nwords, sizeof(words[0]), wcmp);
|
|
pdict();
|
|
exits(0);
|
|
}
|
|
|
|
wcmp(void *a, void *b)
|
|
{
|
|
|
|
return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
|
|
}
|
|
|
|
void
|
|
readinput(int f)
|
|
{
|
|
long i;
|
|
char *code, *line, *bword;
|
|
Biobuf buf;
|
|
long lineno = 0;
|
|
|
|
Binit(&buf, f, OREAD);
|
|
while(line = Brdline(&buf, '\n')) {
|
|
line[Blinelen(&buf)-1] = 0;
|
|
lineno++;
|
|
code = line;
|
|
while(isspace(*code))
|
|
code++;
|
|
bword = code;
|
|
while(*code && !isspace(*code))
|
|
code++;
|
|
|
|
i = code-bword;
|
|
memmove(space+nspace, bword, i);
|
|
words[nwords].word = space+nspace;
|
|
nspace += i;
|
|
space[nspace] = 0;
|
|
nspace++;
|
|
|
|
if(*code) {
|
|
*code++ = 0;
|
|
while(isspace(*code))
|
|
code++;
|
|
}
|
|
words[nwords].encode = typecode(code);
|
|
nwords++;
|
|
if(nwords >= sizeof(words)/sizeof(words[0])) {
|
|
fprint(2, "words array too small\n");
|
|
exits("words");
|
|
}
|
|
if(nspace >= sizeof(space)/sizeof(space[0])) {
|
|
fprint(2, "space array too small\n");
|
|
exits("space");
|
|
}
|
|
}
|
|
Bterm(&buf);
|
|
}
|
|
|
|
|
|
/*
 * One affix code: its spelling in the input file and the bits that
 * typecode() ORs into a word's code when it appears.
 */
typedef struct Class Class;
struct Class
{
	char	*codename;	/* a zero codename ends a table */
	long	bits;
};
|
|
Class codea[] =
|
|
{
|
|
{ "a", ADJ },
|
|
{ "adv", ADV },
|
|
0
|
|
};
|
|
Class codec[] =
|
|
{
|
|
{ "comp", COMP },
|
|
0
|
|
};
|
|
Class coded[] =
|
|
{
|
|
{ "d", DONT_TOUCH},
|
|
0
|
|
};
|
|
|
|
Class codee[] =
|
|
{
|
|
{ "ed", ED },
|
|
{ "er", ACTOR },
|
|
0
|
|
};
|
|
|
|
Class codei[] =
|
|
{
|
|
{ "in", IN },
|
|
{ "ion", ION },
|
|
0
|
|
};
|
|
|
|
Class codem[] =
|
|
{
|
|
{ "man", MAN },
|
|
{ "ms", MONO },
|
|
0
|
|
};
|
|
|
|
Class coden[] =
|
|
{
|
|
{ "n", NOUN },
|
|
{ "na", N_AFFIX },
|
|
{ "nopref", NOPREF },
|
|
0
|
|
};
|
|
|
|
Class codep[] =
|
|
{
|
|
{ "pc", PROP_COLLECT },
|
|
0
|
|
};
|
|
Class codes[] =
|
|
{
|
|
{ "s", STOP },
|
|
0
|
|
};
|
|
|
|
Class codev[] =
|
|
{
|
|
{ "v", VERB },
|
|
{ "va", V_AFFIX },
|
|
{ "vi", V_IRREG },
|
|
0
|
|
};
|
|
|
|
Class codey[] =
|
|
{
|
|
{ "y", _Y },
|
|
0
|
|
};
|
|
|
|
/* empty table, used in codetab for letters that start no affix code */
Class codez[] =
{
	{ 0,		0 }
};
|
|
Class* codetab[] =
|
|
{
|
|
codea,
|
|
codez,
|
|
codec,
|
|
coded,
|
|
codee,
|
|
codez,
|
|
codez,
|
|
codez,
|
|
codei,
|
|
codez,
|
|
codez,
|
|
codez,
|
|
codem,
|
|
coden,
|
|
codez,
|
|
codep,
|
|
codez,
|
|
codez,
|
|
codes,
|
|
codez,
|
|
codez,
|
|
codev,
|
|
codez,
|
|
codez,
|
|
codey,
|
|
codez,
|
|
};
|
|
|
|
long
|
|
typecode(char *str)
|
|
{
|
|
Class *p;
|
|
long code;
|
|
int n, i;
|
|
char *s, *sp, *st;
|
|
|
|
code = 0;
|
|
|
|
loop:
|
|
for(s=str; *s != 0 && *s != ','; s++)
|
|
;
|
|
for(p = codetab[*str-'a']; sp = p->codename; p++) {
|
|
st = str;
|
|
for(n=s-str;; st++,sp++) {
|
|
if(*st != *sp)
|
|
goto cont;
|
|
n--;
|
|
if(n == 0)
|
|
break;
|
|
}
|
|
code |= p->bits;
|
|
if(*s == 0)
|
|
goto out;
|
|
str = s+1;
|
|
goto loop;
|
|
cont:;
|
|
}
|
|
fprint(2, "Unknown affix code \"%s\"\n", str);
|
|
return 0;
|
|
out:
|
|
for(i=0; i<ncodes; i++)
|
|
if(encodes[i] == code)
|
|
return i;
|
|
encodes[i] = code;
|
|
ncodes++;
|
|
return i;
|
|
}
|
|
|
|
void
|
|
sput(int s)
|
|
{
|
|
|
|
Bputc(&bout, s>>8);
|
|
Bputc(&bout, s);
|
|
}
|
|
|
|
void
|
|
lput(long l)
|
|
{
|
|
Bputc(&bout, l>>24);
|
|
Bputc(&bout, l>>16);
|
|
Bputc(&bout, l>>8);
|
|
Bputc(&bout, l);
|
|
}
|
|
|
|
/*
|
|
* spit out the encoded dictionary
|
|
* all numbers are encoded big-endian.
|
|
* struct
|
|
* {
|
|
* short ncodes;
|
|
* long encodes[ncodes];
|
|
* struct
|
|
* {
|
|
* short encode;
|
|
* char word[*];
|
|
* } words[*];
|
|
* };
|
|
* 0x8000 flag for code word
|
|
* 0x7800 count of number of common bytes with previous word
|
|
* 0x07ff index into codes array for affixes
|
|
*/
|
|
void
|
|
pdict(void)
|
|
{
|
|
long i, count;
|
|
int encode, j, c;
|
|
char *lastword, *thisword, *word;
|
|
|
|
sput(ncodes);
|
|
for(i=0; i<ncodes; i++)
|
|
lput(encodes[i]);
|
|
|
|
count = ncodes*4 + 2;
|
|
lastword = "";
|
|
for(i=0; i<nwords; i++) {
|
|
word = words[i].word;
|
|
thisword = word;
|
|
for(j=0; *thisword == *lastword; j++) {
|
|
if(*thisword == 0) {
|
|
fprint(2, "identical words: %s\n", word);
|
|
break;
|
|
}
|
|
thisword++;
|
|
lastword++;
|
|
}
|
|
if(j > 15)
|
|
j = 15;
|
|
encode = words[i].encode;
|
|
c = (1<<15) | (j<<11) | encode;
|
|
sput(c);
|
|
count += 2;
|
|
for(thisword=word+j; c = *thisword; thisword++) {
|
|
Bputc(&bout, c);
|
|
count++;
|
|
}
|
|
lastword = word;
|
|
}
|
|
fprint(2, "output bytes = %ld\n", count);
|
|
}
|