webfs: do url escaping/unescaping

2011-10-06 05:14:59 +02:00 · 2011-10-06 05:14:59 +02:00 · 4d696d4141
commit 4d696d4141
parent d0405d6c24
1 changed files with 50 additions and 113 deletions
--- a/sys/src/cmd/webfs/url.c
+++ b/sys/src/cmd/webfs/url.c
@ -112,18 +112,12 @@ ischeme(char *s)

 /* RE character-class components -- these go in brackets */
 #define PUNCT			"\\-_.!~*'()"
-#define RES			";/?:@&=+$,"
 #define ALNUM		"a-zA-Z0-9"
 #define HEX			"0-9a-fA-F"
 #define UNRES			ALNUM PUNCT

 /* RE components; _N => has N parenthesized subexpressions when expanded */
-#define ESCAPED_1			"(%[" HEX "][" HEX "])"
-#define URIC_2			"([" RES UNRES "]|" ESCAPED_1 ")"
-#define URICNOSLASH_2		"([" UNRES ";?:@&=+$,]|" ESCAPED_1 ")"
-#define USERINFO_2		"([" UNRES ";:&=+$,]|" ESCAPED_1 ")"
-#define PCHAR_2			"([" UNRES ":@&=+$,]|" ESCAPED_1 ")"
-#define PSEGCHAR_3		"([/;]|" PCHAR_2 ")"
+#define USERINFO_2		"([" UNRES ";:&=+$,]|(%[" HEX "][" HEX "]))"

 typedef struct Retab Retab;
 struct Retab
@ -138,16 +132,10 @@ enum
 {
 	REsplit = 0,
 	REscheme,
-	REunknowndata,
 	REauthority,
 	REhost,
 	REuserinfo,
-	REabspath,
-	REquery,
-	REfragment,
-	REhttppath,
 	REftppath,
-	REfilepath,

 	MaxResub=	20,
 };
@ -163,10 +151,6 @@ Retab retab[] =	/* view in constant width Font */
 	"^[a-z][a-z0-9+-.]*$", nil, 0,
 	{ 0, },

-[REunknowndata]
-	"^" URICNOSLASH_2 URIC_2 "*$", nil, 0,
-	{ 0, },
-
 [REauthority]
 	"^(((" USERINFO_2 "*)@)?(((\\[[^\\]@]+\\])|([^:\\[@]+))(:([0-9]*))?)?)?$", nil, 0,
 	/* |----user info-----|  |--------host----------------|  |-port-| */
@ -182,30 +166,10 @@ Retab retab[] =	/* view in constant width Font */
 	/* |user-|  |pass-| */
 	{  2,       4, },

-[REabspath]
-	"^/" PSEGCHAR_3 "*$", nil, 0,
-	{ 0, },
-
-[REquery]
-	"^" URIC_2 "*$", nil, 0,
-	{ 0, },
-
-[REfragment]
-	"^" URIC_2 "*$", nil, 0,
-	{ 0, },
-
-[REhttppath]
-	"^.*$", nil, 0,
-	{ 0, },
-
 [REftppath]
 	"^(.+)(;[tT][yY][pP][eE]=([aAiIdD]))?$", nil, 0,
 	/*|--|-path              |ftptype-| */
 	{ 1,                     3, }, 
-
-[REfilepath]
-	"^.*$", nil, 0,
-	{ 0, },
 };

 static int
@ -532,20 +496,6 @@ spliturl(char *url, SplitUrl *su)
 		return -1;
 	}

-	/*
-	 * Because we use NUL-terminated strings, as do many client and server
-	 * implementations, an escaped NUL ("%00") will quite likely cause problems
-	 * when unescaped.  We can check for such a sequence once before examining
- 	 * the components because, per RFC2396 sec. 2.4.1 - 2.4.2, '%' is reserved
-	 * in URIs to _always_ indicate escape sequences.  Something like "%2500"
-	 * will still get by, but that's legitimate, and if it ends up causing
-	 * a NUL then someone is unescaping too many times.
-	 */
-	if(strstr(url, "%00")){
-		werrstr("escaped NUL in URI");
-		return -1;
-	}
-
 	m[0].sp = m[0].ep = nil;
 	t = &retab[REsplit];
 	if(!regx(t->prog, url, m, t->size)){
@ -614,8 +564,6 @@ parse_unknown_part(SplitUrl *su, Url *u)
 		e = s+strlen(s);

 	u->schemedata = estredup(s, e);
-	if(!ismatch(REunknowndata, u->schemedata, "unknown scheme data"))
-		return -1;
 	return 0;
 }

@ -705,39 +653,47 @@ parse_authority(SplitUrl *su, Url *u)
 static int
 parse_abspath(SplitUrl *su, Url *u)
 {
+	char *s;
+
 	if(su->path.s == nil)
 		return 0;
-	u->path = estredup(su->path.s, su->path.e);
-	if(!ismatch(REabspath, u->path, "absolute path"))
-		return -1;
+	s = estredup(su->path.s, su->path.e);
+	u->path = unescapeurl(s, "/");
+	free(s);
 	return 0;
 }

 static int
 parse_query(SplitUrl *su, Url *u)
 {
+	char *s;
+
 	if(su->query.s == nil)
 		return 0;
-	u->query = estredup(su->query.s, su->query.e);
-	if(!ismatch(REquery, u->query, "query"))
-		return -1;
+	s = estredup(su->query.s, su->query.e);
+	u->query = unescapeurl(s, "&=");
+	free(s);
 	return 0;
 }

 static int
 parse_fragment(SplitUrl *su, Url *u)
 {
+	char *s;
+
 	if(su->fragment.s == nil)
 		return 0;
-	u->fragment = estredup(su->fragment.s, su->fragment.e);
-	if(!ismatch(REfragment, u->fragment, "fragment"))
-		return -1;
+	s = estredup(su->fragment.s, su->fragment.e);
+	u->fragment = unescapeurl(s, "");
+	free(s);
 	return 0;
 }

 static int
 postparse_http(Url *u)
 {
+	char *p, *q;
+
 	u->open = httpopen;
 	u->read = httpread;
 	u->close = httpclose;
@ -755,17 +711,17 @@ postparse_http(Url *u)
 		u->http.page_spec = estrdup("/");
 		return 0;
 	}
-
-	if(!ismatch(REhttppath, u->path, "http path"))
-		return -1;
+	p = escapeurl(u->path, " \"<>#%\\");
 	if(u->query){
-		u->http.page_spec = emalloc(strlen(u->path)+1+strlen(u->query)+1);
-		strcpy(u->http.page_spec, u->path);
+		q = escapeurl(u->query, " \"<>#%\\");
+		u->http.page_spec = emalloc(strlen(p)+1+strlen(q)+1);
+		strcpy(u->http.page_spec, p);
 		strcat(u->http.page_spec, "?");
-		strcat(u->http.page_spec, u->query);
+		strcat(u->http.page_spec, q);
+		free(q);
+		free(p);
 	}else
-		u->http.page_spec = estrdup(u->path);
-
+		u->http.page_spec = p;
 	return 0;
 }

@ -840,9 +796,6 @@ postparse_file(Url *u)
 		return -1;
 	}

-	if(!ismatch(REfilepath, u->path, "file path"))
-		return -1;
-
 	/* "localhost" is equivalent to no host spec, we'll chose the latter */
 	if(u->host && cistrcmp(u->host, "localhost") == 0){
 		free(u->host);
@ -886,7 +839,7 @@ parseurl(char *url, Url *base)
 			/* 'u.url' refers to current document; set fragment and return */
 			if(parse_fragment(&su, u) < 0)
 				goto Fail;
-			return u;
+			goto Done;
 		}
 	}

@ -897,7 +850,7 @@ parseurl(char *url, Url *base)
 	if(u->ischeme == USunknown){
 		if(parse_unknown_part(&su, u) < 0)
 			goto Fail;
-		return u;
+		goto Done;
 	}

 	if(parse_query(&su, u) < 0
@ -909,7 +862,9 @@ parseurl(char *url, Url *base)
 		if((*postparse[u->ischeme])(u) < 0)
 			goto Fail;

+Done:
 	setmalloctag(u, getcallerpc(&url));
+	rewriteurl(u);
 	return u;
 }

@ -970,12 +925,8 @@ seturlquery(Url *u, char *query)
 		u->query = nil;
 		return 0;
 	}
-
-	if(!ismatch(REquery, query, "query"))
-		return -1;
-
 	free(u->query);
-	u->query = estrdup(query);
+	u->query = unescapeurl(query, "&=");
 	return 0;
 }

@ -1030,61 +981,47 @@ dhex(char c)
 }

 char*
-escapeurl(char *s, int (*needesc)(int))
+escapeurl(char *s, char *special)
 {
 	int n;
 	char *t, *u;
-	Rune r;
 	static char *hex = "0123456789abcdef";

 	n = 0;
 	for(t=s; *t; t++)
-		if((*needesc)(*t))
+		if(*t <= 0x1F || *t >= 0x7F || strchr(special, *t))
 			n++;
-
 	u = emalloc(strlen(s)+2*n+1);
 	t = u;
 	for(; *s; s++){
-		s += chartorune(&r, s);
-		if(r >= 0xFF){
-			werrstr("URLs cannot contain Runes > 0xFF");
-			free(t);
-			return nil;
-		}
-		if((*needesc)(r)){
+		if(s[0] == '%' && isxdigit(s[1]) && isxdigit(s[2]))
+			*u++ = *s;
+		else if(*s <= 0x1F || *s >= 0x7F || strchr(special, *s)){
 			*u++ = '%';
-			*u++ = hex[(r>>4)&0xF];
-			*u++ = hex[r&0xF];
+			*u++ = hex[(*s>>4)&0xF];
+			*u++ = hex[*s&0xF];
 		}else
-			*u++ = r;
+			*u++ = *s;
 	}
 	*u = '\0';
 	return t;
 }

 char*
-unescapeurl(char *s)
+unescapeurl(char *s, char *special)
 {
-	char *r, *w;
-	Rune rune;
+	char *r, *w, x;

 	s = estrdup(s);
-	for(r=w=s; *r; r++){
-		if(*r=='%'){
-			r++;
-			if(!isxdigit(r[0]) || !isxdigit(r[1])){
-				werrstr("bad escape sequence '%.3s' in URL", r);
-				return nil;
-			}
-			if(r[0]=='0' && r[2]=='0'){
-				werrstr("escaped NUL in URL");
-				return nil;
-			}
-			rune = (dhex(r[0])<<4)|dhex(r[1]);	/* latin1 */
-			w += runetochar(w, &rune);
-			r += 2;
-		}else
-			*w++ = *r;
+	for(r=w=s; x = *r; r++){
+		if(x=='%' && isxdigit(r[1]) && isxdigit(r[2])){
+			x = (dhex(r[1])<<4)|dhex(r[2]);
+			if(x == 0 || (x > 0x1F && x < 0x7F && strchr(special, x)))
+				x = *r;
+			else
+				r += 2;
+		}
+		*w++ = x;
 	}
 	*w = '\0';
 	return s;