plan9fox/sys/src/cmd/upas/smtp/rfc822.y

%{
#include "common.h"
#include "smtp.h"
#include <ctype.h>

#define YYMAXDEPTH	500		/* was default 150 */

char	*yylp;		/* next character to be lex'd */
int	yydone;		/* tell yylex to give up */
char	*yybuffer;	/* first parsed character */
char	*yyend;		/* end of buffer to be parsed */
Node	*root;
Field	*firstfield;
Field	*lastfield;
Node	*usender;
Node	*usys;
Node	*udate;
char	*startfield, *endfield;
int	originator;
int	destination;
int	date;
int	received;
int	messageid;
%}

%term WORD
%term DATE
%term RESENT_DATE
%term RETURN_PATH
%term FROM
%term SENDER
%term REPLY_TO
%term RESENT_FROM
%term RESENT_SENDER
%term RESENT_REPLY_TO
%term SUBJECT
%term TO
%term CC
%term BCC
%term RESENT_TO
%term RESENT_CC
%term RESENT_BCC
%term REMOTE
%term PRECEDENCE
%term MIMEVERSION
%term CONTENTTYPE
%term MESSAGEID
%term RECEIVED
%term MAILER
%term BADTOKEN
%start msg
%%

msg		: fields
		| unixfrom '\n' fields
		;
fields		: '\n'
			{ yydone = 1; }
		| field '\n'
		| field '\n' fields
		;
field		: dates
			{ date = 1; }
		| originator
			{ originator = 1; }
		| destination
			{ destination = 1; }
		| subject
		| optional
		| ignored
		| received
		| precedence
		| error '\n' field
		;
unixfrom	: FROM route_addr unix_date_time REMOTE FROM word
			{ freenode($1); freenode($4); freenode($5);
			  usender = $2; udate = $3; usys = $6;
			}
		;
originator	: REPLY_TO ':' address_list
			{ newfield(link3($1, $2, $3), 1); }
		| RETURN_PATH ':' route_addr
			{ newfield(link3($1, $2, $3), 1); }
		| FROM ':' mailbox_list
			{ newfield(link3($1, $2, $3), 1); }
		| SENDER ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_REPLY_TO ':' address_list
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_SENDER ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		| RESENT_FROM ':' mailbox
			{ newfield(link3($1, $2, $3), 1); }
		;
dates 		: DATE ':' date_time
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_DATE ':' date_time
			{ newfield(link3($1, $2, $3), 0); }
		;
destination	: TO ':'
			{ newfield(link2($1, $2), 0); }
		| TO ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_TO ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_TO ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| CC ':'
			{ newfield(link2($1, $2), 0); }
		| CC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_CC ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_CC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| BCC ':'
			{ newfield(link2($1, $2), 0); }
		| BCC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		| RESENT_BCC ':'
			{ newfield(link2($1, $2), 0); }
		| RESENT_BCC ':' address_list
			{ newfield(link3($1, $2, $3), 0); }
		;
subject		: SUBJECT ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| SUBJECT ':'
			{ newfield(link2($1, $2), 0); }
		;
received	: RECEIVED ':' things
			{ newfield(link3($1, $2, $3), 0); received++; }
		| RECEIVED ':'
			{ newfield(link2($1, $2), 0); received++; }
		;
precedence	: PRECEDENCE ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| PRECEDENCE ':'
			{ newfield(link2($1, $2), 0); }
		;
ignored		: ignoredhdr ':' things
			{ newfield(link3($1, $2, $3), 0); }
		| ignoredhdr ':'
			{ newfield(link2($1, $2), 0); }
		;
ignoredhdr	: MIMEVERSION | CONTENTTYPE | MESSAGEID { messageid = 1; } | MAILER
		;
optional	: fieldwords ':' things
			{ /* hack to allow same lex for field names and the rest */
			 if(badfieldname($1)){
				freenode($1);
				freenode($2);
				freenode($3);
				return 1;
			 }
			 newfield(link3($1, $2, $3), 0);
			}
		| fieldwords ':'
			{ /* hack to allow same lex for field names and the rest */
			 if(badfieldname($1)){
				freenode($1);
				freenode($2);
				return 1;
			 }
			 newfield(link2($1, $2), 0);
			}
		;
address_list	: address
		| address_list ',' address
			{ $$ = link3($1, $2, $3); }
		;
address		: mailbox
		| group
		;
group		: phrase ':' address_list ';'
			{ $$ = link2($1, link3($2, $3, $4)); }
		| phrase ':' ';'
			{ $$ = link3($1, $2, $3); }
		;
mailbox_list	: mailbox
		| mailbox_list ',' mailbox
			{ $$ = link3($1, $2, $3); }
		;
mailbox		: route_addr
		| phrase brak_addr
			{ $$ = link2($1, $2); }
		| brak_addr
		;
brak_addr	: '<' route_addr '>'
			{ $$ = link3($1, $2, $3); }
		| '<' '>'
			{ $$ = nobody($2); freenode($1); }
		;
route_addr	: route ':' at_addr
			{ $$ = address(concat($1, concat($2, $3))); }
		| addr_spec
		;
route		: '@' domain
			{ $$ = concat($1, $2); }
		| route ',' '@' domain
			{ $$ = concat($1, concat($2, concat($3, $4))); }
		;
addr_spec	: local_part
			{ $$ = address($1); }
		| at_addr
		;
at_addr		: local_part '@' domain
			{ $$ = address(concat($1, concat($2, $3)));}
		| at_addr '@' domain
			{ $$ = address(concat($1, concat($2, $3)));}
		;
local_part	: word
		;
domain		: word
		;
phrase		: word
		| phrase word
			{ $$ = link2($1, $2); }
		;
things		: thing
		| things thing
			{ $$ = link2($1, $2); }
		;
thing		: word | '<' | '>' | '@' | ':' | ';' | ','
		;
date_time	: things
		;
unix_date_time	: word word word unix_time word word
			{ $$ = link3($1, $3, link3($2, $6, link2($4, $5))); }
		;
unix_time	: word
		| unix_time ':' word
			{ $$ = link3($1, $2, $3); }
		;
word		: WORD | DATE | RESENT_DATE | RETURN_PATH | FROM | SENDER
		| REPLY_TO | RESENT_FROM | RESENT_SENDER | RESENT_REPLY_TO
		| TO | CC | BCC | RESENT_TO | RESENT_CC | RESENT_BCC | REMOTE | SUBJECT
		| PRECEDENCE | MIMEVERSION | CONTENTTYPE | MESSAGEID | RECEIVED | MAILER
		;
fieldwords	: fieldword
		| WORD
		| fieldwords fieldword
			{ $$ = link2($1, $2); }
		| fieldwords word
			{ $$ = link2($1, $2); }
		;
fieldword	: '<' | '>' | '@' | ';' | ','
		;
%%

/*
 *  Initialize the parsing.  Done once for each header field.
 */
void
yyinit(char *p, int len)
{
	yybuffer = p;
	yylp = p;
	yyend = p + len;
	firstfield = lastfield = 0;
	received = 0;
}

/*
 *  keywords identifying header fields we care about
 */
typedef struct Keyword	Keyword;
struct Keyword {
	char	*rep;
	int	val;
};

/* field names that we need to recognize */
Keyword key[] = {
	{ "date", DATE },
	{ "resent-date", RESENT_DATE },
	{ "return_path", RETURN_PATH },
	{ "from", FROM },
	{ "sender", SENDER },
	{ "reply-to", REPLY_TO },
	{ "resent-from", RESENT_FROM },
	{ "resent-sender", RESENT_SENDER },
	{ "resent-reply-to", RESENT_REPLY_TO },
	{ "to", TO },
	{ "cc", CC },
	{ "bcc", BCC },
	{ "resent-to", RESENT_TO },
	{ "resent-cc", RESENT_CC },
	{ "resent-bcc", RESENT_BCC },
	{ "remote", REMOTE },
	{ "subject", SUBJECT },
	{ "precedence", PRECEDENCE },
	{ "mime-version", MIMEVERSION },
	{ "content-type", CONTENTTYPE },
	{ "message-id", MESSAGEID },
	{ "received", RECEIVED },
	{ "mailer", MAILER },
	{ "who-the-hell-cares", WORD }
};

/*
 *  Lexical analysis for an rfc822 header field.  Continuation lines
 *  are handled in yywhite() when skipping over white space.
 *
 */
yylex(void)
{
	String *t;
	int quoting;
	int escaping;
	char *start;
	Keyword *kp;
	int c, d;

/*	print("lexing\n"); /**/
	if(yylp >= yyend)
		return 0;
	if(yydone)
		return 0;

	quoting = escaping = 0;
	start = yylp;
	yylval = malloc(sizeof(Node));
	yylval->white = yylval->s = 0;
	yylval->next = 0;
	yylval->addr = 0;
	yylval->start = yylp;
	for(t = 0; yylp < yyend; yylp++){
		c = *yylp & 0xff;

		/* dump nulls, they can't be in header */
		if(c == 0)
			continue;

		if(escaping) {
			escaping = 0;
		} else if(quoting) {
			switch(c){
			case '\\':
				escaping = 1;
				break;
			case '\n':
				d = (*(yylp+1))&0xff;
				if(d != ' ' && d != '\t'){
					quoting = 0;
					yylp--;
					continue;
				}
				break;
			case '"':
				quoting = 0;
				break;
			}
		} else {
			switch(c){
			case '\\':
				escaping = 1;
				break;
			case '(':
			case ' ':
			case '\t':
			case '\r':
				goto out;
			case '\n':
				if(yylp == start){
					yylp++;
/*					print("lex(c %c)\n", c); /**/
					yylval->end = yylp;
					return yylval->c = c;
				}
				goto out;
			case '@':
			case '>':
			case '<':
			case ':':
			case ',':
			case ';':
				if(yylp == start){
					yylp++;
					yylval->white = yywhite();
/*					print("lex(c %c)\n", c); /**/
					yylval->end = yylp;
					return yylval->c = c;
				}
				goto out;
			case '"':
				quoting = 1;
				break;
			default:
				break;
			}
		}
		if(t == 0)
			t = s_new();
		s_putc(t, c);
	}
out:
	yylval->white = yywhite();
	if(t) {
		s_terminate(t);
	} else				/* message begins with white-space! */
		return yylval->c = '\n';
	yylval->s = t;
	for(kp = key; kp->val != WORD; kp++)
		if(cistrcmp(s_to_c(t), kp->rep)==0)
			break;
/*	print("lex(%d) %s\n", kp->val-WORD, s_to_c(t)); /**/
	yylval->end = yylp;
	return yylval->c = kp->val;
}

void
yyerror(char *x)
{
	USED(x);

	/*fprint(2, "parse err: %s\n", x);/**/
}

/*
 *  parse white space and comments
 */
String *
yywhite(void)
{
	String *w;
	int clevel;
	int c;
	int escaping;

	escaping = clevel = 0;
	for(w = 0; yylp < yyend; yylp++){
		c = *yylp & 0xff;

		/* dump nulls, they can't be in header */
		if(c == 0)
			continue;

		if(escaping){
			escaping = 0;
		} else if(clevel) {
			switch(c){
			case '\n':
				/*
				 *  look for multiline fields
				 */
				if(*(yylp+1)==' ' || *(yylp+1)=='\t')
					break;
				else
					goto out;
			case '\\':
				escaping = 1;
				break;
			case '(':
				clevel++;
				break;
			case ')':
				clevel--;
				break;
			}
		} else {
			switch(c){
			case '\\':
				escaping = 1;
				break;
			case '(':
				clevel++;
				break;
			case ' ':
			case '\t':
			case '\r':
				break;
			case '\n':
				/*
				 *  look for multiline fields
				 */
				if(*(yylp+1)==' ' || *(yylp+1)=='\t')
					break;
				else
					goto out;
			default:
				goto out;
			}
		}
		if(w == 0)
			w = s_new();
		s_putc(w, c);
	}
out:
	if(w)
		s_terminate(w);
	return w;
}

/*
 *  link two parsed entries together
 */
Node*
link2(Node *p1, Node *p2)
{
	Node *p;

	for(p = p1; p->next; p = p->next)
		;
	p->next = p2;
	return p1;
}

/*
 *  link three parsed entries together
 */
Node*
link3(Node *p1, Node *p2, Node *p3)
{
	Node *p;

	for(p = p2; p->next; p = p->next)
		;
	p->next = p3;

	for(p = p1; p->next; p = p->next)
		;
	p->next = p2;

	return p1;
}

/*
 *  make a:b, move all white space after both
 */
Node*
colon(Node *p1, Node *p2)
{
	if(p1->white){
		if(p2->white)
			s_append(p1->white, s_to_c(p2->white));
	} else {
		p1->white = p2->white;
		p2->white = 0;
	}

	s_append(p1->s, ":");
	if(p2->s)
		s_append(p1->s, s_to_c(p2->s));

	if(p1->end < p2->end)
		p1->end = p2->end;
	freenode(p2);
	return p1;
}

/*
 *  concatenate two fields, move all white space after both
 */
Node*
concat(Node *p1, Node *p2)
{
	char buf[2];

	if(p1->white){
		if(p2->white)
			s_append(p1->white, s_to_c(p2->white));
	} else {
		p1->white = p2->white;
		p2->white = 0;
	}

	if(p1->s == nil){
		buf[0] = p1->c;
		buf[1] = 0;
		p1->s = s_new();
		s_append(p1->s, buf);
	}

	if(p2->s)
		s_append(p1->s, s_to_c(p2->s));
	else {
		buf[0] = p2->c;
		buf[1] = 0;
		s_append(p1->s, buf);
	}

	if(p1->end < p2->end)
		p1->end = p2->end;
	freenode(p2);
	return p1;
}

/*
 *  look for disallowed chars in the field name
 */
int
badfieldname(Node *p)
{
	for(; p; p = p->next){
		/* field name can't contain white space */
		if(p->white && p->next)
			return 1;
	}
	return 0;
}

/*
 *  mark as an address
 */
Node *
address(Node *p)
{
	p->addr = 1;
	return p;
}

/*
 *  case independent string compare
 */
int
cistrcmp(char *s1, char *s2)
{
	int c1, c2;

	for(; *s1; s1++, s2++){
		c1 = isupper(*s1) ? tolower(*s1) : *s1;
		c2 = isupper(*s2) ? tolower(*s2) : *s2;
		if (c1 != c2)
			return -1;
	}
	return *s2;
}

/*
 *  free a node
 */
void
freenode(Node *p)
{
	Node *tp;

	while(p){
		tp = p->next;
		if(p->s)
			s_free(p->s);
		if(p->white)
			s_free(p->white);
		free(p);
		p = tp;
	}
}


/*
 *  an anonymous user
 */
Node*
nobody(Node *p)
{
	if(p->s)
		s_free(p->s);
	p->s = s_copy("pOsTmAsTeR");
	p->addr = 1;
	return p;
}

/*
 *  add anything that was dropped because of a parse error
 */
void
missing(Node *p)
{
	Node *np;
	char *start, *end;
	Field *f;
	String *s;

	start = yybuffer;
	if(lastfield != nil){
		for(np = lastfield->node; np; np = np->next)
			start = np->end+1;
	}

	end = p->start-1;

	if(end <= start)
		return;

	if(strncmp(start, "From ", 5) == 0)
		return;

	np = malloc(sizeof(Node));
	np->start = start;
	np->end = end;
	np->white = nil;
	s = s_copy("BadHeader: ");
	np->s = s_nappend(s, start, end-start);
	np->next = nil;

	f = malloc(sizeof(Field));
	f->next = 0;
	f->node = np;
	f->source = 0;
	if(firstfield)
		lastfield->next = f;
	else
		firstfield = f;
	lastfield = f;
}

/*
 *  create a new field
 */
void
newfield(Node *p, int source)
{
	Field *f;

	missing(p);

	f = malloc(sizeof(Field));
	f->next = 0;
	f->node = p;
	f->source = source;
	if(firstfield)
		lastfield->next = f;
	else
		firstfield = f;
	lastfield = f;
	endfield = startfield;
	startfield = yylp;
}

/*
 *  fee a list of fields
 */
void
freefield(Field *f)
{
	Field *tf;

	while(f){
		tf = f->next;
		freenode(f->node);
		free(f);
		f = tf;
	}
}

/*
 *  add some white space to a node
 */
Node*
whiten(Node *p)
{
	Node *tp;

	for(tp = p; tp->next; tp = tp->next)
		;
	if(tp->white == 0)
		tp->white = s_copy(" ");
	return p;
}

void
yycleanup(void)
{
	Field *f, *fnext;
	Node *np, *next;

	for(f = firstfield; f; f = fnext){
		for(np = f->node; np; np = next){
			if(np->s)
				s_free(np->s);
			if(np->white)
				s_free(np->white);
			next = np->next;
			free(np);
		}
		fnext = f->next;
		free(f);
	}
	firstfield = lastfield = 0;
}