232 lines
3.7 KiB
C
232 lines
3.7 KiB
C
#include <u.h>
|
|
#include <libc.h>
|
|
#include <bio.h>
|
|
#include "regexp.h"
|
|
#include "hash.h"
|
|
|
|
/* fixed capacities of the static arrays used below */
enum
{
	MAXTAB	= 256,	/* size of tab[] and of the per-table arrays in Word */
	MAXBEST	= 32,	/* size of best[]; upper limit accepted for -m */
};
|
|
|
|
typedef struct Table Table;
|
|
struct Table
|
|
{
|
|
char *file;
|
|
Hash *hash;
|
|
int nmsg;
|
|
};
|
|
|
|
typedef struct Word Word;
|
|
struct Word
|
|
{
|
|
Stringtab *s; /* from hmsg */
|
|
int count[MAXTAB]; /* counts from each table */
|
|
double p[MAXTAB]; /* probabilities from each table */
|
|
double mp; /* max probability */
|
|
int mi; /* w.p[w.mi] = w.mp */
|
|
};
|
|
|
|
Table tab[MAXTAB];
|
|
int ntab;
|
|
|
|
Word best[MAXBEST];
|
|
int mbest;
|
|
int nbest;
|
|
|
|
int debug;
|
|
|
|
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".  The synopsis lists every flag main accepts,
 * including -k.
 */
void
usage(void)
{
	fprint(2, "usage: bayes [-Dk] [-m maxword] boxhash ... ~ msghash ...\n");
	exits("usage");
}
|
|
|
|
void*
|
|
emalloc(int n)
|
|
{
|
|
void *v;
|
|
|
|
v = mallocz(n, 1);
|
|
if(v == nil)
|
|
sysfatal("out of memory");
|
|
return v;
|
|
}
|
|
|
|
void
|
|
noteword(Word *w)
|
|
{
|
|
int i;
|
|
|
|
for(i=nbest-1; i>=0; i--)
|
|
if(w->mp < best[i].mp)
|
|
break;
|
|
i++;
|
|
|
|
if(i >= mbest)
|
|
return;
|
|
if(nbest == mbest)
|
|
nbest--;
|
|
if(i < nbest)
|
|
memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
|
|
best[i] = *w;
|
|
nbest++;
|
|
}
|
|
|
|
Hash*
|
|
hread(char *s)
|
|
{
|
|
Hash *h;
|
|
Biobuf *b;
|
|
|
|
if((b = Bopenlock(s, OREAD)) == nil)
|
|
sysfatal("open %s: %r", s);
|
|
|
|
h = emalloc(sizeof(Hash));
|
|
Breadhash(b, h, 1);
|
|
Bterm(b);
|
|
return h;
|
|
}
|
|
|
|
void
|
|
main(int argc, char **argv)
|
|
{
|
|
int i, j, a, mi, oi, tot, keywords;
|
|
double totp, p, xp[MAXTAB];
|
|
Hash *hmsg;
|
|
Word w;
|
|
Stringtab *s, *t;
|
|
Biobuf bout;
|
|
|
|
mbest = 15;
|
|
keywords = 0;
|
|
ARGBEGIN{
|
|
case 'D':
|
|
debug = 1;
|
|
break;
|
|
case 'k':
|
|
keywords = 1;
|
|
break;
|
|
case 'm':
|
|
mbest = atoi(EARGF(usage()));
|
|
if(mbest > MAXBEST)
|
|
sysfatal("cannot keep more than %d words", MAXBEST);
|
|
break;
|
|
default:
|
|
usage();
|
|
}ARGEND
|
|
|
|
for(i=0; i<argc; i++)
|
|
if(strcmp(argv[i], "~") == 0)
|
|
break;
|
|
|
|
if(i > MAXTAB)
|
|
sysfatal("cannot handle more than %d tables", MAXTAB);
|
|
|
|
if(i+1 >= argc)
|
|
usage();
|
|
|
|
for(i=0; i<argc; i++){
|
|
if(strcmp(argv[i], "~") == 0)
|
|
break;
|
|
tab[ntab].file = argv[i];
|
|
tab[ntab].hash = hread(argv[i]);
|
|
s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);
|
|
if(s == nil || s->count == 0)
|
|
tab[ntab].nmsg = 1;
|
|
else
|
|
tab[ntab].nmsg = s->count;
|
|
ntab++;
|
|
}
|
|
|
|
Binit(&bout, 1, OWRITE);
|
|
|
|
oi = ++i;
|
|
for(a=i; a<argc; a++){
|
|
hmsg = hread(argv[a]);
|
|
nbest = 0;
|
|
for(s=hmsg->all; s; s=s->link){
|
|
w.s = s;
|
|
tot = 0;
|
|
totp = 0.0;
|
|
for(i=0; i<ntab; i++){
|
|
t = findstab(tab[i].hash, s->str, s->n, 0);
|
|
if(t == nil)
|
|
w.count[i] = 0;
|
|
else
|
|
w.count[i] = t->count;
|
|
tot += w.count[i];
|
|
p = w.count[i]/(double)tab[i].nmsg;
|
|
if(p >= 1.0)
|
|
p = 1.0;
|
|
w.p[i] = p;
|
|
totp += p;
|
|
}
|
|
|
|
if(tot < 5){ /* word does not appear enough; give to box 0 */
|
|
w.p[0] = 0.5;
|
|
for(i=1; i<ntab; i++)
|
|
w.p[i] = 0.1;
|
|
w.mp = 0.5;
|
|
w.mi = 0;
|
|
noteword(&w);
|
|
continue;
|
|
}
|
|
|
|
w.mp = 0.0;
|
|
for(i=0; i<ntab; i++){
|
|
p = w.p[i];
|
|
p /= totp;
|
|
if(p < 0.01)
|
|
p = 0.01;
|
|
else if(p > 0.99)
|
|
p = 0.99;
|
|
if(p > w.mp){
|
|
w.mp = p;
|
|
w.mi = i;
|
|
}
|
|
w.p[i] = p;
|
|
}
|
|
noteword(&w);
|
|
}
|
|
|
|
totp = 0.0;
|
|
for(i=0; i<ntab; i++){
|
|
p = 1.0;
|
|
for(j=0; j<nbest; j++)
|
|
p *= best[j].p[i];
|
|
xp[i] = p;
|
|
totp += p;
|
|
}
|
|
for(i=0; i<ntab; i++)
|
|
xp[i] /= totp;
|
|
mi = 0;
|
|
for(i=1; i<ntab; i++)
|
|
if(xp[i] > xp[mi])
|
|
mi = i;
|
|
if(oi != argc-1)
|
|
Bprint(&bout, "%s: ", argv[a]);
|
|
Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);
|
|
if(keywords){
|
|
for(i=0; i<nbest; i++){
|
|
Bprint(&bout, " ");
|
|
Bwrite(&bout, best[i].s->str, best[i].s->n);
|
|
Bprint(&bout, " %f", best[i].p[mi]);
|
|
}
|
|
}
|
|
freehash(hmsg);
|
|
Bprint(&bout, "\n");
|
|
if(debug){
|
|
for(i=0; i<nbest; i++){
|
|
Bwrite(&bout, best[i].s->str, best[i].s->n);
|
|
Bprint(&bout, " %f", best[i].p[mi]);
|
|
if(best[i].p[mi] < best[i].mp)
|
|
Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);
|
|
Bprint(&bout, "\n");
|
|
}
|
|
}
|
|
}
|
|
Bterm(&bout);
|
|
}
|