/* @(#)match.c 1.27 17/08/13 Copyright 1985, 1995-2017 J. Schilling */ #include #include #define POSIX_CLASS /* Support [[:alpha:]] by default */ #ifdef NO_POSIX_CLASS /* Allow to disable [[:alpha:]] */ #undef POSIX_CLASS #endif #ifdef POSIX_CLASS #include /* With [[:alpha:]], we need wctype() */ #include /* and thus wchar.h and wctype.h */ #endif /* * Pattern matching functions * * Copyright (c) 1985, 1995-2017 J. Schilling */ /* * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * See the file CDDL.Schily.txt in this distribution for details. * A copy of the CDDL is also available via the Internet at * http://www.opensource.org/licenses/cddl1.txt * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file CDDL.Schily.txt from this distribution. */ /* * The pattern matching functions below are based on the algorithm * presented by Martin Richards in: * * "A Compact Function for Regular Expression Pattern Matching", * Software-Practice and Experience, Vol. 9, 527-534 (1979) * * Several changes have been made to the original source which has been * written in BCPL: * * '/' is replaced by '!' (to allow UNIX filenames) * '(',')' are replaced by '{', '}' * '\'' is replaced by '\\' (UNIX compatible quote) * * Character classes have been added to allow "[]" * to be used. * POSIX features like [[:alpha:]] have been added. * Start of line '^' and end of line '$' have been added. */ #undef CHAR #ifdef __LINE_MATCH #ifdef __WIDE_CHAR #define patmatch patwlmatch #else #define opatmatch opatlmatch #define patmatch patlmatch #endif #endif #ifdef __WIDE_CHAR #ifndef __LINE_MATCH #define patcompile patwcompile #define patmatch patwmatch #endif #define CHAR wchar_t #define PCHAR wchar_t #endif #ifdef __MB_CHAR #undef patmatch #ifdef __LINE_MATCH #define patmatch patmblmatch #else #define patmatch patmbmatch #endif #define PCHAR wchar_t #endif #ifndef CHAR typedef unsigned char Uchar; #define DID_UCHAR_TYPE #define CHAR Uchar #endif #ifndef PCHAR #ifndef DID_UCHAR_TYPE typedef unsigned char Uchar; #endif #define PCHAR Uchar #endif #define ENDSTATE (-1) #define CL_SIZE 32 /* Max size for '[: :]' */ /* * The Interpreter */ /* * put adds a new state to the active list */ #define put(ret, state, sp, n) { \ register int *lstate = state; \ register int *lsp = sp; \ register int ln = n; \ \ while (lstate < lsp) { \ if (*lstate++ == ln) { \ ret = lsp; \ lsp = 0; \ break; \ } \ } \ if (lsp) { \ *lstate++ = ln; \ ret = lstate; \ } \ } /* * match a character in class * * Syntax errors do not appear here, they are handled by the compiler, * so in theory we could remove the "return (0)" statements from the * the POSIX class code. */ #ifdef POSIX_CLASS #define CHK_POSIX_CLASS \ else if (*lpat == LCLASS) { \ if (lpat[1] == ':') { \ char class[CL_SIZE+1]; \ char *pc = class; \ \ lpat += 2; /* Eat ':' */ \ for (;;) { \ if (*lpat == '\0') { \ ok = FALSE; \ goto out; \ } \ if (*lpat == ':' && lpat[1] == RCLASS) \ break; \ if (pc >= &class[CL_SIZE]) { \ ok = FALSE; \ goto out; \ } \ *pc++ = *lpat++; \ } \ if (pc == class) { \ ok = FALSE; \ goto out; \ } \ *pc = '\0'; \ lpat += 2; /* Skip ":]" */ \ if (iswctype(lc, wctype(class))) { \ ok = !ok; \ goto out; \ } \ continue; \ } \ } #else #define CHK_POSIX_CLASS #endif #define in_class(found, pat, c) { \ register const PCHAR *lpat = pat; \ register int lc = c; \ int lo_bound; \ int hi_bound; \ BOOL ok = FALSE; \ \ if (*lpat == NOT) { \ lpat++; \ ok = TRUE; \ } \ while (*lpat != RCLASS) { \ if (*lpat == QUOTE) \ lpat++; \ CHK_POSIX_CLASS \ lo_bound = *lpat++; \ if (*lpat == RANGE) { \ lpat++; \ if (*lpat == QUOTE) \ lpat++; \ hi_bound = *lpat++; \ } else { \ hi_bound = lo_bound; \ } \ if (lo_bound <= lc && lc <= hi_bound) { \ ok = !ok; \ goto out; \ } \ } \ out: \ found = ok; \ } /* * opatmatch - the old external interpreter interface. * * Trys to match a string beginning at offset * against the compiled pattern. */ #if !defined(__WIDE_CHAR) && !defined(__MB_CHAR) EXPORT CHAR *opatmatch(pat, aux, str, soff, slen, alt) const PCHAR *pat; const int *aux; const CHAR *str; int soff; int slen; int alt; { int state[MAXPAT]; return (patmatch(pat, aux, str, soff, slen, alt, state)); } #endif /* * patmatch - the external interpreter interface. * * Trys to match a string beginning at offset * against the compiled pattern. */ EXPORT CHAR * patmatch(pat, aux, str, soff, slen, alt, state) const PCHAR *pat; const int *aux; const CHAR *str; int soff; int slen; int alt; int state[]; { register int *sp; register int *n; register int *i; register int p; register int q, s, k; #ifdef __MB_CHAR wchar_t c; int mlen = 1; #else int c; #endif const CHAR *lastp = (CHAR *)NULL; #ifdef __LINE_MATCH for (; soff <= slen; soff++) { #endif sp = state; put(sp, state, state, 0); if (alt != ENDSTATE) put(sp, state, sp, alt); #ifdef __MB_CHAR mbtowc(NULL, NULL, 0); for (s = soff; ; s += mlen) { #else for (s = soff; ; s++) { #endif /* * next char from input string */ if (s >= slen) { c = 0; } else { #ifdef __MB_CHAR mlen = mbtowc(&c, (char *)str, slen - s); if (mlen < 0) { mbtowc(NULL, NULL, 0); c = str[s]; mlen = 1; } #else c = str[s]; #endif } /* * first complete the closure */ for (n = state; n < sp; ) { p = *n++; /* next state number */ if (p == ENDSTATE) continue; q = aux[p]; /* get next state for pat */ k = pat[p]; /* get next char from pat */ switch (k) { case REP: put(sp, state, sp, p+1); /* FALLTHRU */ case NIL: /* NIL matches always */ case STAR: put(sp, state, sp, q); break; case LBRACK: /* alternations */ case ALT: put(sp, state, sp, p+1); if (q != ENDSTATE) put(sp, state, sp, q); break; case START: if (s == 0) put(sp, state, sp, q); break; case END: if (c == '\0') put(sp, state, sp, q); break; } } for (i = state; i < sp; ) { if (*i++ == ENDSTATE) { lastp = &str[s]; break; } } if (c == 0) return ((CHAR *)lastp); /* * now try to match next character */ n = sp; sp = state; for (i = sp; i < n; ) { p = *i++; /* next active state number */ if (p == ENDSTATE) continue; k = pat[p]; switch (k) { case ALT: case REP: case NIL: case LBRACK: case START: case END: continue; case LCLASS: in_class(q, &pat[p+1], c); if (!q) continue; break; case STAR: put(sp, state, sp, p); continue; case QUOTE: k = pat[p+1]; default: if (k != c) continue; /* FALLTHRU */ case ANY: break; } put(sp, state, sp, aux[p]); } if (sp == state) { /* if no new states return */ #ifdef __LINE_MATCH if (lastp || (soff == slen - 1)) return ((CHAR *)lastp); else break; #else return ((CHAR *)lastp); #endif } } #ifdef __LINE_MATCH } return ((CHAR *)lastp); #endif } #if !defined(__LINE_MATCH) && !defined(__MB_CHAR) /* * The Compiler */ typedef struct args { const PCHAR *pattern; int *aux; int patp; int length; PCHAR Ch; } arg_t; LOCAL void nextitem __PR((arg_t *)); LOCAL int prim __PR((arg_t *)); LOCAL int expr __PR((arg_t *, int *)); LOCAL void setexits __PR((int *, int, int)); LOCAL int join __PR((int *, int, int)); /* * 'read' the next character from pattern */ #define rch(ap) \ { \ if (++(ap)->patp >= (ap)->length) \ (ap)->Ch = 0; \ else \ (ap)->Ch = (ap)->pattern[(ap)->patp]; \ } /* * 'peek' the next character from pattern */ #define pch(ap) \ ((((ap)->patp + 1) >= (ap)->length) ? \ 0 \ : \ (ap)->pattern[(ap)->patp+1]) \ /* * get the next item from pattern */ LOCAL void nextitem(ap) arg_t *ap; { if (ap->Ch == QUOTE) rch(ap); rch(ap); } /* * parse a primary */ LOCAL int prim(ap) arg_t *ap; { int a = ap->patp; int op = ap->Ch; int t; nextitem(ap); switch (op) { case '\0': case ALT: case RBRACK: return (ENDSTATE); case LCLASS: while (ap->Ch != RCLASS && ap->Ch != '\0') { #ifdef POSIX_CLASS if (ap->Ch == LCLASS) { if (pch(ap) == ':') { /* [:alpha:] */ char class[CL_SIZE+1]; char *pc = class; nextitem(ap); nextitem(ap); while (ap->Ch != ':' && ap->Ch != '\0') { if (pc > &class[CL_SIZE]) return (ENDSTATE); *pc = ap->Ch; if (*pc++ != ap->Ch) return (ENDSTATE); nextitem(ap); } if (pc == class) return (ENDSTATE); *pc = '\0'; if (ap->Ch == '\0') return (ENDSTATE); if (wctype(class) == 0) return (ENDSTATE); nextitem(ap); } if (ap->Ch != RCLASS) return (ENDSTATE); } #endif nextitem(ap); } if (ap->Ch == '\0') return (ENDSTATE); nextitem(ap); break; case REP: t = prim(ap); if (t == ENDSTATE) return (ENDSTATE); setexits(ap->aux, t, a); break; case LBRACK: a = expr(ap, &ap->aux[a]); if (a == ENDSTATE || ap->Ch != RBRACK) return (ENDSTATE); nextitem(ap); break; } return (a); } /* * parse an expression (a sequence of primaries) */ LOCAL int expr(ap, altp) arg_t *ap; int *altp; { int exits = ENDSTATE; int a; int *aux = ap->aux; PCHAR Ch; for (;;) { a = prim(ap); if (a == ENDSTATE) return (ENDSTATE); Ch = ap->Ch; if (Ch == ALT || Ch == RBRACK || Ch == '\0') { exits = join(aux, exits, a); if (Ch != ALT) return (exits); *altp = ap->patp; altp = &aux[ap->patp]; nextitem(ap); } else setexits(aux, a, ap->patp); } } /* * set all exits in a list to a specified value */ LOCAL void setexits(aux, list, val) int *aux; int list; int val; { int a; while (list != ENDSTATE) { a = aux[list]; aux[list] = val; list = a; } } /* * concatenate two lists */ LOCAL int join(aux, a, b) int *aux; int a; int b; { int t; if (a == ENDSTATE) return (b); t = a; while (aux[t] != ENDSTATE) t = aux[t]; aux[t] = b; return (a); } /* * patcompile - the external compiler interface. * * The pattern is compiled into the aux array. * Return value on success, is the outermost alternate which is != 0. * Error is indicated by return of 0. */ EXPORT int patcompile(pat, len, aux) const PCHAR *pat; int len; int *aux; { arg_t a; int alt = ENDSTATE; int i; a.pattern = pat; a.length = len; a.aux = aux; a.patp = -1; for (i = 0; i < len; i++) aux[i] = ENDSTATE; rch(&a); i = expr(&a, &alt); if (i == ENDSTATE) return (0); setexits(aux, i, ENDSTATE); return (alt); } #endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */