mirror of
https://github.com/reactos/reactos.git
synced 2024-10-31 20:02:55 +00:00
608 lines
11 KiB
C
608 lines
11 KiB
C
/* @(#)match.c 1.27 17/08/13 Copyright 1985, 1995-2017 J. Schilling */
|
|
#include <schily/standard.h>
|
|
#include <schily/patmatch.h>
|
|
#define POSIX_CLASS /* Support [[:alpha:]] by default */
|
|
#ifdef NO_POSIX_CLASS /* Allow to disable [[:alpha:]] */
|
|
#undef POSIX_CLASS
|
|
#endif
|
|
#ifdef POSIX_CLASS
|
|
#include <schily/wchar.h> /* With [[:alpha:]], we need wctype() */
|
|
#include <schily/wctype.h> /* and thus wchar.h and wctype.h */
|
|
#endif
|
|
/*
|
|
* Pattern matching functions
|
|
*
|
|
* Copyright (c) 1985, 1995-2017 J. Schilling
|
|
*/
|
|
/*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License, Version 1.0 only
|
|
* (the "License"). You may not use this file except in compliance
|
|
* with the License.
|
|
*
|
|
* See the file CDDL.Schily.txt in this distribution for details.
|
|
* A copy of the CDDL is also available via the Internet at
|
|
* http://www.opensource.org/licenses/cddl1.txt
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file CDDL.Schily.txt from this distribution.
|
|
*/
|
|
/*
|
|
* The pattern matching functions below are based on the algorithm
|
|
* presented by Martin Richards in:
|
|
*
|
|
* "A Compact Function for Regular Expression Pattern Matching",
|
|
* Software-Practice and Experience, Vol. 9, 527-534 (1979)
|
|
*
|
|
* Several changes have been made to the original source which has been
|
|
* written in BCPL:
|
|
*
|
|
* '/' is replaced by '!' (to allow UNIX filenames)
|
|
* '(',')' are replaced by '{', '}'
|
|
* '\'' is replaced by '\\' (UNIX compatible quote)
|
|
*
|
|
* Character classes have been added to allow "[<character list>]"
|
|
* to be used.
|
|
* POSIX features like [[:alpha:]] have been added.
|
|
* Start of line '^' and end of line '$' have been added.
|
|
*/
|
|
|
|
/*
 * This file is compiled several times with different settings.
 * The defines below select the external function names and the
 * character types that belong to the requested variant:
 *
 *	__LINE_MATCH	build the line matching variant
 *	__WIDE_CHAR	pattern and string are made of wchar_t
 *	__MB_CHAR	pattern is wchar_t, string is multi byte
 *
 * CHAR is the type of a string character,
 * PCHAR is the type of a pattern character.
 */
#undef	CHAR

#ifdef	__LINE_MATCH
#ifdef	__WIDE_CHAR
#define	patmatch	patwlmatch
#else
#define	opatmatch	opatlmatch
#define	patmatch	patlmatch
#endif
#endif

#ifdef	__WIDE_CHAR
#ifndef	__LINE_MATCH
#define	patcompile	patwcompile
#define	patmatch	patwmatch
#endif
#define	CHAR		wchar_t
#define	PCHAR		wchar_t
#endif

#ifdef	__MB_CHAR
#undef	patmatch
#ifdef	__LINE_MATCH
#define	patmatch	patmblmatch
#else
#define	patmatch	patmbmatch
#endif
#define	PCHAR		wchar_t
#endif

#ifndef	CHAR				/* Default string character type */
typedef	unsigned char	Uchar;
#define	DID_UCHAR_TYPE
#define	CHAR		Uchar
#endif

#ifndef	PCHAR				/* Default pattern character type */
#ifndef	DID_UCHAR_TYPE
typedef	unsigned char	Uchar;
#endif
#define	PCHAR		Uchar
#endif

#define	ENDSTATE	(-1)		/* State number that ends a list */

#define	CL_SIZE		32		/* Max size for '[: :]'		*/
|
|
|
|
/*
|
|
* The Interpreter
|
|
*/
|
|
|
|
|
|
/*
 * put appends the state number 'n' to the list of active states that
 * starts at 'state' and ends just before 'sp', unless 'n' is already
 * on that list. The (possibly advanced) end of the list is stored
 * in 'ret'.
 */
#define	put(ret, state, sp, n)	{				\
	register int	*st_cur = state;			\
	register int	*st_end = sp;				\
	register int	st_new = n;				\
								\
	while (st_cur < st_end && *st_cur != st_new)		\
		st_cur++;					\
	if (st_cur < st_end) {					\
		ret = st_end;	/* already on the list */	\
	} else {						\
		*st_cur++ = st_new;				\
		ret = st_cur;					\
	}							\
}
|
|
|
|
|
|
/*
 * in_class checks whether the character 'c' is a member of the
 * character class that starts at 'pat' (the first pattern character
 * behind the opening LCLASS) and stores the result in 'found'.
 *
 * Syntax errors do not appear here, they are handled by the compiler,
 * so in theory the error exits in the POSIX class code could be removed.
 *
 * CHK_POSIX_CLASS is expanded inside the loop of in_class() and thus
 * shares its local variables and its "out" label. As in_class() defines
 * a label, it can be expanded only once per function.
 */
#ifdef	POSIX_CLASS
#define	CHK_POSIX_CLASS							\
	else if (*clp == LCLASS) {					\
		if (clp[1] == ':') {					\
			char	cname[CL_SIZE+1];			\
			char	*np = cname;				\
									\
			clp += 2;	/* Eat "[:" */			\
			while (!(*clp == ':' && clp[1] == RCLASS)) {	\
				if (*clp == '\0' ||			\
				    np >= &cname[CL_SIZE]) {		\
					hit = FALSE;			\
					goto out;			\
				}					\
				*np++ = *clp++;				\
			}						\
			if (np == cname) {	/* Empty name */	\
				hit = FALSE;				\
				goto out;				\
			}						\
			*np = '\0';					\
			clp += 2;	/* Skip ":]" */			\
			if (iswctype(chr, wctype(cname))) {		\
				hit = !hit;				\
				goto out;				\
			}						\
			continue;					\
		}							\
	}
#else
#define	CHK_POSIX_CLASS
#endif
#define	in_class(found, pat, c)	{					\
	register const PCHAR	*clp = pat;				\
	register int		chr = c;				\
	int			lo;					\
	int			hi;					\
	BOOL			hit = FALSE;				\
									\
	if (*clp == NOT) {	/* Inverted class */			\
		clp++;							\
		hit = TRUE;						\
	}								\
	while (*clp != RCLASS) {					\
		if (*clp == QUOTE)					\
			clp++;						\
		CHK_POSIX_CLASS						\
		lo = *clp++;						\
		hi = lo;	/* Single char is range of one */	\
		if (*clp == RANGE) {					\
			clp++;						\
			if (*clp == QUOTE)				\
				clp++;					\
			hi = *clp++;					\
		}							\
		if (chr >= lo && chr <= hi) {				\
			hit = !hit;					\
			goto out;					\
		}							\
	}								\
out:									\
	found = hit;							\
}
|
|
|
|
/*
|
|
* opatmatch - the old external interpreter interface.
|
|
*
|
|
* Trys to match a string beginning at offset
|
|
* against the compiled pattern.
|
|
*/
|
|
#if !defined(__WIDE_CHAR) && !defined(__MB_CHAR)
|
|
EXPORT CHAR
|
|
*opatmatch(pat, aux, str, soff, slen, alt)
|
|
const PCHAR *pat;
|
|
const int *aux;
|
|
const CHAR *str;
|
|
int soff;
|
|
int slen;
|
|
int alt;
|
|
{
|
|
int state[MAXPAT];
|
|
|
|
return (patmatch(pat, aux, str, soff, slen, alt, state));
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* patmatch - the external interpreter interface.
|
|
*
|
|
* Trys to match a string beginning at offset
|
|
* against the compiled pattern.
|
|
*/
|
|
EXPORT CHAR *
|
|
patmatch(pat, aux, str, soff, slen, alt, state)
|
|
const PCHAR *pat;
|
|
const int *aux;
|
|
const CHAR *str;
|
|
int soff;
|
|
int slen;
|
|
int alt;
|
|
int state[];
|
|
{
|
|
register int *sp;
|
|
register int *n;
|
|
register int *i;
|
|
register int p;
|
|
register int q, s, k;
|
|
#ifdef __MB_CHAR
|
|
wchar_t c;
|
|
int mlen = 1;
|
|
#else
|
|
int c;
|
|
#endif
|
|
const CHAR *lastp = (CHAR *)NULL;
|
|
|
|
#ifdef __LINE_MATCH
|
|
for (; soff <= slen; soff++) {
|
|
#endif
|
|
|
|
sp = state;
|
|
put(sp, state, state, 0);
|
|
if (alt != ENDSTATE)
|
|
put(sp, state, sp, alt);
|
|
|
|
#ifdef __MB_CHAR
|
|
mbtowc(NULL, NULL, 0);
|
|
for (s = soff; ; s += mlen) {
|
|
#else
|
|
for (s = soff; ; s++) {
|
|
#endif
|
|
/*
|
|
* next char from input string
|
|
*/
|
|
if (s >= slen) {
|
|
c = 0;
|
|
} else {
|
|
#ifdef __MB_CHAR
|
|
mlen = mbtowc(&c, (char *)str, slen - s);
|
|
if (mlen < 0) {
|
|
mbtowc(NULL, NULL, 0);
|
|
c = str[s];
|
|
mlen = 1;
|
|
}
|
|
#else
|
|
c = str[s];
|
|
#endif
|
|
}
|
|
/*
|
|
* first complete the closure
|
|
*/
|
|
for (n = state; n < sp; ) {
|
|
p = *n++; /* next state number */
|
|
if (p == ENDSTATE)
|
|
continue;
|
|
q = aux[p]; /* get next state for pat */
|
|
k = pat[p]; /* get next char from pat */
|
|
switch (k) {
|
|
|
|
case REP:
|
|
put(sp, state, sp, p+1);
|
|
/* FALLTHRU */
|
|
case NIL: /* NIL matches always */
|
|
case STAR:
|
|
put(sp, state, sp, q);
|
|
break;
|
|
case LBRACK: /* alternations */
|
|
case ALT:
|
|
put(sp, state, sp, p+1);
|
|
if (q != ENDSTATE)
|
|
put(sp, state, sp, q);
|
|
break;
|
|
case START:
|
|
if (s == 0)
|
|
put(sp, state, sp, q);
|
|
break;
|
|
case END:
|
|
if (c == '\0')
|
|
put(sp, state, sp, q);
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (i = state; i < sp; ) {
|
|
if (*i++ == ENDSTATE) {
|
|
lastp = &str[s];
|
|
break;
|
|
}
|
|
}
|
|
if (c == 0)
|
|
return ((CHAR *)lastp);
|
|
|
|
/*
|
|
* now try to match next character
|
|
*/
|
|
n = sp;
|
|
sp = state;
|
|
for (i = sp; i < n; ) {
|
|
p = *i++; /* next active state number */
|
|
if (p == ENDSTATE)
|
|
continue;
|
|
k = pat[p];
|
|
switch (k) {
|
|
|
|
case ALT:
|
|
case REP:
|
|
case NIL:
|
|
case LBRACK:
|
|
case START:
|
|
case END:
|
|
continue;
|
|
case LCLASS:
|
|
in_class(q, &pat[p+1], c);
|
|
if (!q)
|
|
continue;
|
|
break;
|
|
case STAR:
|
|
put(sp, state, sp, p);
|
|
continue;
|
|
case QUOTE:
|
|
k = pat[p+1];
|
|
default:
|
|
if (k != c)
|
|
continue;
|
|
/* FALLTHRU */
|
|
case ANY:
|
|
break;
|
|
}
|
|
put(sp, state, sp, aux[p]);
|
|
}
|
|
if (sp == state) { /* if no new states return */
|
|
#ifdef __LINE_MATCH
|
|
|
|
if (lastp || (soff == slen - 1))
|
|
return ((CHAR *)lastp);
|
|
else
|
|
break;
|
|
#else
|
|
return ((CHAR *)lastp);
|
|
#endif
|
|
}
|
|
}
|
|
#ifdef __LINE_MATCH
|
|
}
|
|
return ((CHAR *)lastp);
|
|
#endif
|
|
}
|
|
|
|
|
|
#if !defined(__LINE_MATCH) && !defined(__MB_CHAR)
|
|
/*
|
|
* The Compiler
|
|
*/
|
|
|
|
typedef struct args {
|
|
const PCHAR *pattern;
|
|
int *aux;
|
|
int patp;
|
|
int length;
|
|
PCHAR Ch;
|
|
} arg_t;
|
|
|
|
LOCAL void nextitem __PR((arg_t *));
|
|
LOCAL int prim __PR((arg_t *));
|
|
LOCAL int expr __PR((arg_t *, int *));
|
|
LOCAL void setexits __PR((int *, int, int));
|
|
LOCAL int join __PR((int *, int, int));
|
|
|
|
/*
 * 'read' the next character from pattern
 *
 * Advances the pattern index and loads the character found there
 * into ap->Ch. Behind the end of the pattern, ap->Ch is set to 0.
 */
#define	rch(ap)								\
{									\
	(ap)->Ch = (++(ap)->patp >= (ap)->length) ?			\
			0 : (ap)->pattern[(ap)->patp];			\
}
|
|
|
|
/*
 * 'peek' the next character from pattern
 *
 * Yields the character behind the current one without advancing
 * the pattern index, or 0 if there is no such character.
 */
#define	pch(ap)								\
	((((ap)->patp + 1) < (ap)->length) ?				\
		(ap)->pattern[(ap)->patp+1] : 0)
|
|
|
|
/*
|
|
* get the next item from pattern
|
|
*/
|
|
LOCAL void
|
|
nextitem(ap)
|
|
arg_t *ap;
|
|
{
|
|
if (ap->Ch == QUOTE)
|
|
rch(ap);
|
|
rch(ap);
|
|
}
|
|
|
|
/*
|
|
* parse a primary
|
|
*/
|
|
LOCAL int
|
|
prim(ap)
|
|
arg_t *ap;
|
|
{
|
|
int a = ap->patp;
|
|
int op = ap->Ch;
|
|
int t;
|
|
|
|
nextitem(ap);
|
|
switch (op) {
|
|
|
|
case '\0':
|
|
case ALT:
|
|
case RBRACK:
|
|
return (ENDSTATE);
|
|
case LCLASS:
|
|
while (ap->Ch != RCLASS && ap->Ch != '\0') {
|
|
#ifdef POSIX_CLASS
|
|
if (ap->Ch == LCLASS) {
|
|
if (pch(ap) == ':') { /* [:alpha:] */
|
|
char class[CL_SIZE+1];
|
|
char *pc = class;
|
|
|
|
nextitem(ap);
|
|
nextitem(ap);
|
|
while (ap->Ch != ':' &&
|
|
ap->Ch != '\0') {
|
|
if (pc > &class[CL_SIZE])
|
|
return (ENDSTATE);
|
|
*pc = ap->Ch;
|
|
if (*pc++ != ap->Ch)
|
|
return (ENDSTATE);
|
|
nextitem(ap);
|
|
}
|
|
if (pc == class)
|
|
return (ENDSTATE);
|
|
*pc = '\0';
|
|
if (ap->Ch == '\0')
|
|
return (ENDSTATE);
|
|
if (wctype(class) == 0)
|
|
return (ENDSTATE);
|
|
nextitem(ap);
|
|
}
|
|
if (ap->Ch != RCLASS)
|
|
return (ENDSTATE);
|
|
}
|
|
#endif
|
|
nextitem(ap);
|
|
}
|
|
if (ap->Ch == '\0')
|
|
return (ENDSTATE);
|
|
nextitem(ap);
|
|
break;
|
|
case REP:
|
|
t = prim(ap);
|
|
if (t == ENDSTATE)
|
|
return (ENDSTATE);
|
|
setexits(ap->aux, t, a);
|
|
break;
|
|
case LBRACK:
|
|
a = expr(ap, &ap->aux[a]);
|
|
if (a == ENDSTATE || ap->Ch != RBRACK)
|
|
return (ENDSTATE);
|
|
nextitem(ap);
|
|
break;
|
|
}
|
|
return (a);
|
|
}
|
|
|
|
/*
|
|
* parse an expression (a sequence of primaries)
|
|
*/
|
|
LOCAL int
|
|
expr(ap, altp)
|
|
arg_t *ap;
|
|
int *altp;
|
|
{
|
|
int exits = ENDSTATE;
|
|
int a;
|
|
int *aux = ap->aux;
|
|
PCHAR Ch;
|
|
|
|
for (;;) {
|
|
a = prim(ap);
|
|
if (a == ENDSTATE)
|
|
return (ENDSTATE);
|
|
Ch = ap->Ch;
|
|
if (Ch == ALT || Ch == RBRACK || Ch == '\0') {
|
|
exits = join(aux, exits, a);
|
|
if (Ch != ALT)
|
|
return (exits);
|
|
*altp = ap->patp;
|
|
altp = &aux[ap->patp];
|
|
nextitem(ap);
|
|
} else
|
|
setexits(aux, a, ap->patp);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* set all exits in a list to a specified value
|
|
*/
|
|
LOCAL void
|
|
setexits(aux, list, val)
|
|
int *aux;
|
|
int list;
|
|
int val;
|
|
{
|
|
int a;
|
|
|
|
while (list != ENDSTATE) {
|
|
a = aux[list];
|
|
aux[list] = val;
|
|
list = a;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* concatenate two lists
|
|
*/
|
|
LOCAL int
|
|
join(aux, a, b)
|
|
int *aux;
|
|
int a;
|
|
int b;
|
|
{
|
|
int t;
|
|
|
|
if (a == ENDSTATE)
|
|
return (b);
|
|
t = a;
|
|
while (aux[t] != ENDSTATE)
|
|
t = aux[t];
|
|
aux[t] = b;
|
|
return (a);
|
|
}
|
|
|
|
/*
|
|
* patcompile - the external compiler interface.
|
|
*
|
|
* The pattern is compiled into the aux array.
|
|
* Return value on success, is the outermost alternate which is != 0.
|
|
* Error is indicated by return of 0.
|
|
*/
|
|
EXPORT int
|
|
patcompile(pat, len, aux)
|
|
const PCHAR *pat;
|
|
int len;
|
|
int *aux;
|
|
{
|
|
arg_t a;
|
|
int alt = ENDSTATE;
|
|
int i;
|
|
|
|
a.pattern = pat;
|
|
a.length = len;
|
|
a.aux = aux;
|
|
a.patp = -1;
|
|
|
|
for (i = 0; i < len; i++)
|
|
aux[i] = ENDSTATE;
|
|
rch(&a);
|
|
i = expr(&a, &alt);
|
|
if (i == ENDSTATE)
|
|
return (0);
|
|
setexits(aux, i, ENDSTATE);
|
|
return (alt);
|
|
}
|
|
#endif /* !defined(__LINE_MATCH) && !defined(__MB_CHAR) */
|