ktrans: 你好

This consolidates jisho and map lookups
to use the same structure and removes
the old jisho code.
This commit is contained in:
Jacob Moody 2022-07-17 14:52:11 +00:00
parent ccbabf1c16
commit c147614656
8 changed files with 86361 additions and 3178 deletions

83100
lib/hanzi.zidian Normal file

File diff suppressed because it is too large Load diff

210
sys/src/cmd/ktrans/hash.c Normal file
View file

@ -0,0 +1,210 @@
#include <u.h>
#include <libc.h>
#include "hash.h"
/*
 * Header placed in front of every value slot of the map.
 * Nodes sit in one flat array (Hmap.nodes) and are chained by
 * array index rather than by pointer.
 */
typedef struct Hnode Hnode;
struct Hnode {
/* zero while the slot holds no entry; lookups only test it against 0 */
int filled;
/* index of the next node in this chain; 0 terminates the chain */
int next;
/* key pointer exactly as passed to hmapset; the string is not copied */
void *key;
};
/* number of header bytes that precede the value inside each node */
enum{
Tagsize = sizeof(Hnode),
};
/*
 * Hash a NUL-terminated string: start from 7 and fold each
 * byte in as acc*31 + byte.
 */
uvlong
shash(char *s)
{
	uvlong acc;
	char *p;

	acc = 7;
	p = s;
	while(*p){
		acc = acc*31 + *p;
		p++;
	}
	return acc;
}
Hmap*
hmapalloc(int nbuckets, int size)
{
void *store;
Hmap *h;
int nsz;
nsz = Tagsize + size;
store = mallocz(sizeof(*h) + (nbuckets * nsz), 1);
if(store == nil)
return nil;
h = store;
h->nbs = nbuckets;
h->nsz = nsz;
h->len = h->cap = nbuckets;
h->nodes = store;
h->nodes += sizeof(*h);
return store;
}
/*
 * Store the value at new under key, replacing any existing entry.
 *
 * store: address of the map pointer; it is updated when the map has
 *        to grow, because header and nodes are one allocation.
 * key:   NUL-terminated string; the pointer itself is kept, not a copy.
 *        On replacement the previously stored key pointer is dropped
 *        without being freed.
 * new:   value to copy in (nsz - Tagsize bytes).
 * old:   if non-nil and key was already present, receives the previous
 *        value.  Must not overlap new.
 *
 * Returns 1 when a previous value was copied to old, 0 otherwise,
 * and -1 if the map needed to grow and memory ran out (the map is
 * left unchanged in that case).
 *
 * NOTE(review): a deleted node in the middle of a chain is reused
 * without checking the rest of the chain for the same key.
 */
int
hmapset(Hmap **store, char *key, void *new, void *old)
{
	Hnode *n;
	uchar *v;
	Hmap *h, *grown;
	int next, newcap, vsz, replaced;
	vlong diff;

	h = *store;
	vsz = h->nsz - Tagsize;
	replaced = 0;
	v = h->nodes + (shash(key)%h->nbs) * h->nsz;
	for(;;){
		n = (Hnode*)v;
		next = n->next;
		if(n->filled == 0)
			goto replace;
		if(strcmp(n->key, key) == 0){
			/* save the previous value before it is overwritten below */
			if(old != nil){
				memmove(old, v + Tagsize, vsz);
				replaced = 1;
			}
			goto replace;
		}
		if(next == 0)
			break;
		v = h->nodes + next*h->nsz;
	}

	/* key absent and chain exhausted: append a node at index len */
	if(h->cap == h->len){
		/* remember the chain tail as an offset; realloc may move the map */
		diff = v - h->nodes;
		newcap = h->cap * 2;
		grown = realloc(h, sizeof(*h) + newcap*h->nsz);
		if(grown == nil)
			return -1;
		h = grown;
		*store = h;
		h->cap = newcap;
		h->nodes = (uchar*)h + sizeof(*h);
		/* realloc does not clear the new tail; zero all of it */
		memset(h->nodes + h->len*h->nsz, 0, (h->cap - h->len)*h->nsz);
		v = h->nodes + diff;
		n = (Hnode*)v;
	}
	n->next = h->len;
	h->len++;
	assert(h->len <= h->cap);
	v = h->nodes + n->next*h->nsz;
	n = (Hnode*)v;

replace:
	memmove(v + Tagsize, new, vsz);
	/* filled is only ever tested against zero */
	n->filled = 1;
	n->key = key;
	n->next = next;
	return replaced;
}
/*
 * Locate the node holding key.  Returns a pointer to the node
 * (header first, value at +Tagsize) or nil when key is absent.
 */
void*
_hmapget(Hmap *h, char *key)
{
	Hnode *node;
	uchar *slot;

	slot = h->nodes + (shash(key)%h->nbs)*h->nsz;
	node = (Hnode*)slot;
	/* walk the chain until a filled node with an equal key turns up */
	while(node->filled == 0 || strcmp(node->key, key) != 0){
		if(node->next == 0)
			return nil;
		slot = h->nodes + node->next*h->nsz;
		node = (Hnode*)slot;
	}
	return slot;
}
/*
 * Look key up; when found, copy its value to dst (if dst is non-nil).
 * Returns 0 when the key exists, -1 otherwise.
 */
int
hmapget(Hmap *h, char *key, void *dst)
{
	uchar *node;

	node = _hmapget(h, key);
	if(node != nil){
		if(dst != nil)
			memmove(dst, node + Tagsize, h->nsz - Tagsize);
		return 0;
	}
	return -1;
}
/*
 * Remove key from the map.  The node stays linked in its chain and is
 * only marked unfilled so that a later hmapset can reuse it.
 *
 * dst:     if non-nil, receives the removed value.
 * freekey: if non-zero, the stored key pointer is passed to free().
 *
 * Returns 0 when the key was present, -1 otherwise.
 */
int
hmapdel(Hmap *h, char *key, void *dst, int freekey)
{
	uchar *v;
	Hnode *n;

	v = _hmapget(h, key);
	if(v == nil)
		return -1;
	n = (Hnode*)v;
	n->filled = 0;
	if(dst != nil)
		memmove(dst, v + Tagsize, h->nsz - Tagsize);
	if(freekey){
		free(n->key);
		/* do not leave a dangling pointer behind in the table */
		n->key = nil;
	}
	return 0;
}
/*
 * Return the key pointer stored in the map for an entry equal to key,
 * or nil when there is no such entry.
 */
char*
hmapkey(Hmap *h, char *key)
{
	Hnode *node;

	node = _hmapget(h, key);
	return node != nil ? node->key : nil;
}
/*
 * Build a new map with the given number of head buckets (0 means
 * "as many as the old map has nodes"), move every live entry over
 * and free the old map.  Keys are carried over by pointer.
 *
 * Returns the new map.  If memory runs out the old map is returned
 * untouched, so the result can always be assigned back.
 */
Hmap*
hmaprehash(Hmap *old, int buckets)
{
	int i;
	uchar *v;
	Hnode *n;
	Hmap *new;

	if(buckets <= 0)
		buckets = old->len > 0 ? old->len : old->nbs;
	new = hmapalloc(buckets, old->nsz - Tagsize);
	if(new == nil)
		return old;
	for(i = 0; i < old->len; i++){
		v = old->nodes + i*old->nsz;
		n = (Hnode*)v;
		/* empty or deleted nodes carry no usable key */
		if(n->filled == 0)
			continue;
		if(hmapset(&new, n->key, v + Tagsize, nil) < 0){
			free(new);
			return old;
		}
	}
	free(old);
	return new;
}
/*
 * Empty the map while keeping its allocation.  With freekeys set,
 * the key pointer of every live entry is passed to free().
 *
 * All nodes are cleared, chain links included, and len goes back to
 * nbs: the head buckets always occupy nodes 0..nbs-1, so overflow
 * nodes must again be appended after them.
 */
void
hmapreset(Hmap *h, int freekeys)
{
	Hnode *n;
	int i;

	if(freekeys){
		for(i = 0; i < h->len; i++){
			n = (Hnode*)(h->nodes + i*h->nsz);
			if(n->filled != 0)
				free(n->key);
		}
	}
	memset(h->nodes, 0, h->cap*h->nsz);
	h->len = h->nbs;
}

23
sys/src/cmd/ktrans/hash.h Normal file
View file

@ -0,0 +1,23 @@
/*
 * A value that is either a pointer or an int.
 * NOTE(review): nothing in hash.c as shown refers to this type.
 */
typedef union Hkey Hkey;
union Hkey {
void *p;
int v;
};
/*
 * String-keyed hash map with fixed-size values.
 * hmapalloc places this header and the node array in one allocation.
 */
typedef struct Hmap Hmap;
struct Hmap {
/* number of head buckets; hashes are reduced modulo this */
int nbs;
/* bytes per node: node header plus the value size given to hmapalloc */
int nsz;
/* nodes in use, head buckets included */
int len;
/* nodes allocated */
int cap;
/* start of the node array, directly after this header */
uchar *nodes;
};
/* allocate a map with nbuckets head slots for values of size bytes; nil on failure */
Hmap* hmapalloc(int nbuckets, int size);
/* copy the value stored under key to dst (may be nil); 0 if found, -1 if not */
int hmapget(Hmap *h, char *key, void *dst);
/* store new under key; *h may change when the map grows; key is kept by pointer */
int hmapset(Hmap **h, char *key, void *new, void *old);
/* mark the entry for key unfilled, optionally freeing the stored key; 0 if found, -1 if not */
int hmapdel(Hmap *h, char *key, void *dst, int freekey);
/* NOTE(review): no definition of hmapfree appears in hash.c as shown */
void hmapfree(Hmap *h, int freekeys);
/* return the key pointer stored for an entry equal to key, or nil */
char* hmapkey(Hmap *h, char *key);
/* mark every entry unfilled, optionally freeing the stored keys */
void hmapreset(Hmap *h, int freekeys);

View file

@ -1,211 +0,0 @@
/*
* open jisho file, and set the size of this jisho etc
*
* Kenji Okamoto August 4, 2000
* Osaka Prefecture Univ.
* okamoto@granite.cias.osakafu-u.ac.jp
*/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "jisho.h"
Dictionary *openQDIC(char *);
void freeQDIC(Dictionary*);
KouhoList *getKouhoHash(Dictionary*, char *);
KouhoList *getKouhoFile(DicList*, char *);
void selectKouho(KouhoList **, KouhoList*);
int hashVal(char *);
void addHash(Hash **, DicList*);
/*
* Open QuickDIC (hashed personal dictionary)
* open skk styled ktrans dictionary file, and make its hash table
* based on individual header kana strings
*
* KouhoList
* |---------|
* Hash |---->kouho---->kouhotop
* |-------| |
* dic---->dhash---->dicindex---->kanahead
* |--------| |--------|
* Dictionary DicList
*
*/
Dictionary *
openQDIC(char *dicname)
{
	Biobuf *f;
	void *Bbuf;
	Dictionary *dic;
	DicList *dicitem, *lastitem;	/* for a future extension */
	char buf[1024], *startstr, *endstr;
	int i, n;

	dic = (Dictionary*)malloc(sizeof(Dictionary));
	if(dic == 0)
		return 0;

	/* start with an empty hash table (HASHSIZE chains) */
	for(i = 0; i < HASHSIZE; i++)
		dic->dhash[i] = 0;
	dic->dlist = 0;	/* for a future extension (more than one dics ^_^ */

	/* a missing dictionary file yields an empty dictionary */
	if((f = Bopen(dicname, OREAD)) == 0)
		return dic;

	/* make hash table by the dic's header word */
	lastitem = 0;
	while((Bbuf = Brdline(f, '\n')) != 0){
		/*
		 * Brdline's buffer is not NUL-terminated; copy at most
		 * what fits and terminate so that text left over from a
		 * previous, longer line is never parsed again.
		 */
		n = Blinelen(f);
		if(n > (int)sizeof(buf)-1)
			n = sizeof(buf)-1;
		memmove(buf, Bbuf, n);
		buf[n] = '\0';

		if(buf[0] == ';')	/* comment line */
			continue;

		/* get header word from jisho */
		startstr = buf;
		/* NOTE(review): a line without a tab ends loading altogether */
		if(!(endstr = utfutf(startstr, "\t")))
			break;
		*endstr = '\0';

		/* dicitem includes each header word from the jisho */
		dicitem = (DicList*)malloc(sizeof(DicList)+(endstr-startstr+1));
		if(dicitem == 0)
			break;
		dicitem->nextitem = 0;	/* for a future extension */
		strcpy(dicitem->kanahead, startstr);
		dicitem->kouho = getKouhoFile(dicitem, endstr);	/* read kouho from jisho */
		addHash(dic->dhash, dicitem);
		lastitem = dicitem;
	}
	/* stays 0 when no entry was read */
	dic->dlist = lastitem;
	Bterm(f);
	return dic;
}
/*
* free dynamically allocated memory
*/
void
freeQDIC(Dictionary *dic)
{
	Hash *h, *hnext;
	KouhoList *k, *knext;
	int l;

	if(dic == 0)
		return;
	/*
	 * Every DicList made by openQDIC hangs off exactly one Hash node,
	 * so releasing through the hash table reaches all entries and
	 * their candidate lists.  dic->dlist is not walked separately:
	 * it points at an entry that is also in the table, and freeing
	 * it again would be a double free.
	 */
	for(l = 0; l < HASHSIZE; l++){
		for(h = dic->dhash[l]; h != 0; h = hnext){
			hnext = h->next;
			if(h->dicindex != 0){
				for(k = h->dicindex->kouho; k != 0; k = knext){
					knext = k->nextkouho;
					free(k);
				}
				free(h->dicindex);
			}
			/* the last node of a chain is freed too */
			free(h);
		}
	}
	free(dic);
}
int
hashVal(char *s)
{
uint h;
h = 0x811c9dc5;
while(*s != 0)
h = (h^(uchar)*s++) * 0x1000193;
return h % HASHSIZE;
}
/*
 * Link ditem into the hash table: a new Hash node is pushed onto the
 * front of the chain selected by the hash of its header word.
 */
void
addHash(Hash **hash, DicList *ditem)
{
	Hash *node;
	int slot;

	node = (Hash*)malloc(sizeof(Hash));
	node->dicindex = ditem;
	node->length = strlen(ditem->kanahead);
	slot = hashVal(ditem->kanahead);
	node->next = hash[slot];
	hash[slot] = node;
}
/*
* read Kouho list from the jisho file defined by Biobuf descriptor f
*
* revised for Plan 9 by K.Okamoto
*/
KouhoList *
getKouhoFile(DicList *dicitem, char * endstr)
{
	char *start, *end;
	KouhoList *item, *head, *tail;

	head = 0;
	tail = 0;
	/* candidates begin right after the terminator of the header word */
	start = endstr + 1;
	for(;;){
		/* a candidate ends at the next space, or else at the newline */
		end = utfutf(start, " ");
		if(end == 0)
			end = utfutf(start, "\n");
		if(end == 0)
			break;
		*end = '\0';

		/* the candidate text is stored in the tail of the node itself */
		item = (KouhoList*)malloc(sizeof(KouhoList)+(end-start+1));
		item->nextkouho = 0;
		item->prevkouho = tail;
		item->dicitem = dicitem;
		strcpy(item->kouhotop, start);

		/* append to the doubly linked list */
		if(tail != 0)
			tail->nextkouho = item;
		else
			head = item;
		tail = item;
		start = end + 1;
	}
	return head;
}
/*
* get matched kouho from the hash table of header word of the dict
* if found, returns pointer to the first candidate in the hash table.
* if not found, returns 0.
*
* from getCand() in skklib.c by Akinori Ito et al.,(aito@ei5sun.yz.yamagata-u.ac.jp)
*/
KouhoList *
getKouhoHash(Dictionary *dic, char *s)
{
	Hash *node;
	int len;

	len = strlen(s);
	node = dic->dhash[hashVal(s)];
	while(node != 0){
		/* the stored length is a cheap reject before the string compare */
		if(node->length == len && strcmp(node->dicindex->kanahead, s) == 0)
			return node->dicindex->kouho;	/* return matched kouho */
		node = node->next;
	}
	return 0;
}
/*
* from skklib.c by Akinori Ito et al.,(aito@ei5sun.yz.yamagata-u.ac.jp)
* just modified to read easier for current purpose
*/
void
selectKouho(KouhoList **first, KouhoList *current)
{
	KouhoList *before, *after;

	/* unlink current from its place in the list, unless it has no predecessor */
	before = current->prevkouho;
	if(before != 0){
		after = current->nextkouho;
		before->nextkouho = after;
		if(after != 0)
			after->prevkouho = before;
		current->prevkouho = 0;
	}

	/* already the head: nothing more to do */
	if(*first == current)
		return;

	/* make current the new head of the list */
	(*first)->prevkouho = current;
	current->nextkouho = *first;
	*first = current;
}

View file

@ -1,41 +0,0 @@
/*
* Kenji Okamoto August 4, 2000
* Osaka Prefecture Univ.
* okamoto@granite.cias.osakafu-u.ac.jp
*/
#define HASHSIZE 257
/*
* Structure for Dictionary's header word (in Hiragana)
*/
typedef struct DicList DicList;
struct DicList {
/* head of this header word's candidate list */
struct KouhoList *kouho;
struct DicList *nextitem; /* for a future extension */
/* header word; openQDIC allocates the node with room for the full string */
char kanahead[1];
};
/*
* Structure for Kouho of each index word in the dictionary
*/
typedef struct KouhoList KouhoList;
struct KouhoList {
/* doubly linked list of candidates belonging to one header word */
struct KouhoList *nextkouho;
struct KouhoList *prevkouho;
/* the header-word entry this candidate belongs to */
struct DicList *dicitem;
/* candidate text; getKouhoFile allocates the node with room for the full string */
char kouhotop[1]; /* top of the kouhos */
} ;
/* One node of a hash chain in Dictionary.dhash. */
typedef struct Hash Hash;
struct Hash {
DicList *dicindex; /* pointer to a KouhoList and kanahead etc */
/* strlen of the header word; getKouhoHash compares it before calling strcmp */
short length;
/* next node in the same chain, 0 at the end */
struct Hash *next;
};
/* An opened dictionary: a hash table keyed by header word. */
typedef struct Dictionary Dictionary;
struct Dictionary {
/* openQDIC stores the last entry it read here */
DicList *dlist; /* for a future extension, having more than one dictionaries */
/* chain heads, indexed by hashVal() of the header word */
Hash *dhash[HASHSIZE];
};

File diff suppressed because it is too large Load diff

View file

@ -6,16 +6,26 @@
* okamoto@granite.cias.osakafu-u.ac.jp
*/
/*
* A glossary on some of the Japanese vocabulary used:
* kana: syllabic lettering, either hiragana(ひらがな) or katakana(カタカナ)
* kanji(漢字): borrowed characters, 楽 in 楽しい
* Okurigana(送り仮名): kana tail to kanji, しい in 楽しい
* Joshi(助詞): particle, は in 私は
* Jisho(辞書): dictionary
* kouho(候補): candidate
*/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "hash.h"
#include "ktrans.h"
#include "jisho.h"
#define LSIZE 256
Rune lbuf[LSIZE]; /* hiragana buffer for key input written by send() */
Map *table = hira; /* default language conversion table */
Hmap *table;
uchar okurigana[LSIZE]; /* buffer for okurigana */
char okuri = 0; /* buffer/flag for capital input char */
int in, out;
@ -23,16 +33,10 @@ int llen, olen, joshi = 0;
int natural = 1; /* not Japanese but English mode */
int changelang(int);
int dotrans(Dictionary*);
int dotrans(Hmap*);
int nrune(char *);
void send(uchar *, int);
Map *match(uchar *p, int *nc, Map *table);
extern Dictionary *openQDIC(char *);
extern KouhoList *getKouhoHash(Dictionary*, char *);
extern KouhoList *getKouhoFile(DicList*, char *);
extern void freeQDIC(Dictionary*);
extern void selectKouho(KouhoList **, KouhoList*);
Hmap* opendict(Hmap *, char *);
void
kbdopen(void)
@ -89,6 +93,50 @@ kbdopen(void)
exits(nil);
}
/*
 * Placeholder value that initmap stores under every proper prefix of a
 * roma string.  Its second field is nil and its third is 1; presumably
 * these are Map.kana and Map.leadstomore (Map is declared in ktrans.h,
 * not shown here), which the input loop tests to detect a partial match.
 */
Map signalmore = {
"_", nil, 1,
};
/*
 * Build a lookup map from the n entries of conversion table m.
 *
 * Each entry is stored under its full roma string.  Every proper
 * prefix of that string is stored too, mapped to signalmore, so the
 * input loop can tell "no match" from "partial match, wait for more".
 * All keys are strdup'ed copies owned by the map.
 *
 * Returns the new map, or nil if it could not be allocated.
 *
 * NOTE(review): when a complete entry is already stored and a later,
 * longer roma has it as a prefix, the complete entry is replaced by
 * signalmore; only the opposite order is handled as a conflict.
 */
Hmap*
initmap(Map *m, int n)
{
	int i, j, exists;
	char buf[16];
	char *s, *oldkey;
	Map prev;
	Hmap *h;

	h = hmapalloc(n, sizeof(Map));
	if(h == nil)
		return nil;
	for(i = 0; i < n; i++){
		if(m[i].roma == nil || m[i].roma[0] == '\0')
			continue;
		/*
		 * We mark all partial strings so we know when
		 * we have partial match when ingesting.
		 * snprint with size j yields the first j-1 bytes of roma.
		 */
		j = 2;
		for(s = m[i].roma; *s && j <= sizeof buf; s++){
			snprint(buf, j, "%s", m[i].roma);
			prev = m[i];
			exists = hmapget(h, buf, &prev) == 0;
			/*
			 * hmapset swaps in the new key pointer without freeing the
			 * one it replaces.  Remember the old key and free it only
			 * after the set: hmapset still compares against it.
			 */
			oldkey = exists ? hmapkey(h, buf) : nil;
			if(exists && prev.leadstomore == 1 && s[1] == '\0'){
				/* conflict; partial & valid input */
				prev = m[i];
				prev.leadstomore = 1;
			}
			if(s[1] == '\0')
				hmapset(&h, strdup(buf), &prev, nil);
			else
				hmapset(&h, strdup(buf), &signalmore, nil);
			free(oldkey);
			j++;
		}
	}
	return h;
}
void
usage(void)
{
@ -101,11 +149,11 @@ main(int argc, char *argv[])
{
uchar *bp, *ep, buf[128];
Map *mp;
int nchar, wantmore;
Map lkup, last;
int wantmore;
int n, c;
char *dictname;
Dictionary *jisho;
char *jishoname, *zidianname;
Hmap *jisho, *zidian;
ARGBEGIN{
default: usage();
@ -113,9 +161,20 @@ main(int argc, char *argv[])
if(argc != 0)
usage();
if((dictname = getenv("jisho")) == nil)
dictname = "/lib/kanji.jisho";
jisho = openQDIC(dictname);
if((jishoname = getenv("jisho")) == nil)
jishoname = "/lib/kanji.jisho";
jisho = opendict(nil, jishoname);
if((zidianname = getenv("zidian")) == nil)
zidianname = "/lib/hanzi.zidian";
zidian = opendict(nil, zidianname);
hira = table = initmap(mhira, nelem(mhira));
kata = initmap(mkata, nelem(mkata));
greek = initmap(mgreek, nelem(mgreek));
cyril = initmap(mcyril, nelem(mcyril));
hangul = initmap(mhangul, nelem(mhangul));
last = (Map){nil, nil, -1};
kbdopen();
if(fork())
@ -147,8 +206,8 @@ main(int argc, char *argv[])
wantmore = 0;
if (*bp=='') { /* ^x read ktrans-jisho once more */
freeQDIC(jisho);
jisho = openQDIC(dictname);
jisho = opendict(jisho, jishoname);
zidian = opendict(zidian, zidianname);
llen = 0;
olen = okuri = joshi = 0;
wantmore=0;
@ -156,6 +215,9 @@ main(int argc, char *argv[])
continue;
}
if (*bp=='') { /* ^\ (start translation command) */
if (table == hanzi)
c = dotrans(zidian);
else
c = dotrans(jisho);
if (c)
*bp = c; /* pointer to translated rune */
@ -167,11 +229,13 @@ main(int argc, char *argv[])
bp++;
llen = 0;
olen = okuri = joshi = 0;
last.kana = nil;
continue;
}
if (changelang(*bp)) { /* change language mode OK */
bp++;
olen = okuri = joshi = 0;
last.kana = nil;
continue;
}
if (natural || *bp<=' ' || *bp>='{') { /* English mode but not ascii */
@ -179,6 +243,7 @@ main(int argc, char *argv[])
int rlen = chartorune(&r, (char *)bp);
send(bp, rlen); /* write bp to /dev/cons */
bp += rlen;
last.kana = nil;
continue;
}
if (table == hira && (*bp >= 'A' && *bp <= 'Z') && (*(bp+1) < 'A'
@ -192,25 +257,31 @@ main(int argc, char *argv[])
joshi = 1;
olen = 0;
}
mp = match(bp, &nchar, table);
if (mp == 0) {
if (nchar>0) { /* match, longer possible */
wantmore++;
if(hmapget(table, (char*)bp, &lkup) < 0){
if(last.kana != nil){
send((uchar*)last.kana, strlen(last.kana));
bp += strlen(last.roma);
} else
send(bp++, 1);
last.kana = nil;
break;
}
send(bp++, 1); /* alphabet in kana mode */
} else {
send((uchar*)mp->kana, strlen(mp->kana));
bp += nchar;
}
}
}
}
/* concatenations; only advance a single character */
if(lkup.kana != nil && strstr("ッっ", lkup.kana))
lkup.roma = "_";
/* partial match */
if(lkup.kana == nil || lkup.leadstomore == 1){
if(lkup.kana != nil)
last = lkup;
int
min(int a, int b)
{
return a<b? a: b;
wantmore = 1;
break;
}
last.kana = nil;
send((uchar*)lkup.kana, strlen(lkup.kana));
bp += strlen(lkup.roma);
}
}
}
/*
@ -232,7 +303,9 @@ send(uchar *p, int n)
llen -= 64;
}
if (table!=hira || natural)
if(table != hira && table != hanzi)
return;
if(natural && table != hanzi)
return;
ep = p+n;
@ -253,49 +326,13 @@ send(uchar *p, int n)
}
}
/*
* Romaji to Hiragana/Katakana conversion
* romaji shoud be input as small letter
* returns the matched address in table, hira, kata, etc.
* nc: number of character (return value)
*/
Map *
match(uchar *p, int *nc, Map *table)
{
	static char last;
	Map *cur, *best;
	int bestlen, romalen, cmplen;

	best = 0;
	bestlen = 0;
	*nc = -1;
	for(cur = table; cur->roma; cur++){
		/* cheap first-byte filter */
		if(*p != *cur->roma)
			continue;
		romalen = strlen(cur->roma);
		cmplen = min(romalen, strlen((char *)p));
		if(strncmp(cur->roma, (char *)p, cmplen) != 0)
			continue;
		/* input is a proper prefix of this entry: more input is needed */
		if(cmplen < romalen){
			*nc = 1;
			return 0;
		}
		/* full match; remember the longest one */
		if(cmplen > bestlen){
			bestlen = cmplen;
			best = cur;
		}
	}
	if(best == 0)
		return 0;
	last = best->roma[bestlen-1];
	*nc = best->advance;
	return best;
}
int
changelang(int c)
{
switch(c){
case '': /* ^t (English mode) */
natural = 1;
table = hira;
llen = 0;
return 1;
break;
@ -334,23 +371,80 @@ changelang(int c)
llen = 0;
return 1;
break;
case '': /* ^c (Chinese mode) */
natural = 1;
table = hanzi;
llen = 0;
return 1;
break;
}
return 0;
}
/*
 * Load the dictionary file name into a map from headword to an array
 * of up to 15 nil-terminated candidate strings.
 *
 * h: an existing map to refill, or nil to allocate a fresh one.
 *
 * Each line has the form "headword<TAB>cand1 cand2 ...".  Empty lines,
 * lines starting with ';' and lines without candidates are skipped.
 * The line buffer returned by Brdstr becomes the key, and the
 * candidates point into that same buffer, so freeing the key releases
 * the whole entry.
 *
 * Returns the map.  If the file cannot be opened, the map passed in is
 * returned unchanged (or a new empty one when h is nil), since callers
 * use the result without checking.  nil is returned only when no map
 * could be allocated.
 */
Hmap*
opendict(Hmap *h, char *name)
{
	Biobuf *b;
	char *p, *oldkey;
	char *dot, *rest;
	char *kouho[16];
	int i;

	b = Bopen(name, OREAD);
	if(b == nil){
		if(h == nil)
			h = hmapalloc(8192, sizeof(kouho));
		return h;
	}

	if(h == nil)
		h = hmapalloc(8192, sizeof(kouho));
	else
		hmapreset(h, 1);
	if(h == nil){
		Bterm(b);
		return nil;
	}

	while((p = Brdstr(b, '\n', 1)) != nil){
		if(p[0] == '\0' || p[0] == ';'){
			free(p);
			continue;
		}
		dot = utfrune(p, '\t');
		if(dot == nil || dot[1] == '\0'){
			free(p);
			continue;
		}
		*dot = '\0';
		rest = dot+1;

		/* split the candidates in place on spaces */
		memset(kouho, 0, sizeof kouho);
		i = 0;
		while(i < nelem(kouho)-1 && (dot = utfrune(rest, ' '))){
			*dot = '\0';
			kouho[i++] = rest;
			rest = dot+1;
		}
		if(i < nelem(kouho)-1)
			kouho[i] = rest;

		/*
		 * key is the base pointer of the line.  hmapset does not free
		 * the key it replaces, so a repeated headword would leak the
		 * earlier line; release it once the new entry is in place.
		 */
		oldkey = hmapkey(h, p);
		hmapset(&h, p, kouho, nil);
		if(oldkey != p)
			free(oldkey);
	}
	Bterm(b);
	return h;
}
/*
* write translated kanji runes to stdout and return last character
* if it's not ctl-\. if the last is ctl-\, proceed with
* translation of the next kouho
*/
int
dotrans(Dictionary *dic)
dotrans(Hmap *dic)
{
Rune *res, r[1];
char v[1024], *p, tbuf[64], hirabuf[64];
int j, lastlen, nokouho = 0;
char ch;
KouhoList *fstkouho, *currentkouho;
int i;
char *kouho[16];
if (llen==0)
return 0; /* don't use kanji transform function */
@ -375,15 +469,13 @@ dotrans(Dictionary *dic)
if (okuri && joshi != 1) /* verb mode */
hirabuf[strlen(hirabuf) - 1] = '\0';
if(!(fstkouho = getKouhoHash(dic, v))) { /* not found */
if(hmapget(dic, v, kouho) < 0){
llen = olen = okuri = joshi = 0;
okurigana[0] = 0;
return 0;
}
currentkouho = fstkouho;
for(;;) {
p = currentkouho->kouhotop; /* p to the head of kanji kouho array */
for(i = 0; i < nelem(kouho) && kouho[i] != nil; i++) {
p = kouho[i];
lastlen = nrune(tbuf); /* number of rune chars */
if (okuri && joshi != 1) /* verb mode */
@ -407,10 +499,9 @@ dotrans(Dictionary *dic)
exits(nil);
if (ch == '') { /* if next input is ^\, once again */
if(currentkouho->nextkouho != 0) { /* have next kouho */
if(i+1 < nelem(kouho) && kouho[i+1] != nil) { /* have next kouho */
nokouho = 0;
strcpy(tbuf, p);
currentkouho = currentkouho->nextkouho;
if (okuri && joshi != 1) /* verb mode */
for (j=0; j<nrune(tbuf); j++)
@ -442,8 +533,12 @@ dotrans(Dictionary *dic)
break;
}
} else {
if(!nokouho) /* learn the previous use of the kouho */
selectKouho(&(fstkouho->dicitem->kouho), currentkouho);
if(!nokouho && i != 0){ /* learn the previous use of the kouho */
p = kouho[0];
kouho[0] = kouho[i];
kouho[i] = p;
hmapset(&dic, hmapkey(dic, v), kouho, nil);
}
olen = okuri = joshi = 0;
okurigana[0] = 0;

View file

@ -2,9 +2,9 @@
BIN=/$objtype/bin
TARG=ktrans
HFILES=jisho.h ktrans.h
HFILES=ktrans.h
OFILES=\
hash.$O\
main.$O\
jisho.$O
</sys/src/cmd/mkone