reactos/rosapps/smartpdf/fitz/mupdf/pdf_open.c

#include <fitz.h>
#include <mupdf.h>

static inline int iswhite(int ch)
{
	return	ch == '\000' || ch == '\011' || ch == '\012' ||
			ch == '\014' || ch == '\015' || ch == '\040';
}

/*
 * magic version tag and startxref
 */

static fz_error *
loadversion(pdf_xref *xref)
{
	char buf[20];
	int n;

	n = fz_seek(xref->file, 0, 0);
	if (n < 0)
		return fz_ioerror(xref->file);

	fz_readline(xref->file, buf, sizeof buf);
	if (memcmp(buf, "%PDF-", 5) != 0)
		return fz_throw("syntaxerror: corrupt version marker");

	xref->version = atof(buf + 5);

	pdf_logxref("version %g\n", xref->version);

	return nil;
}

static fz_error *
readstartxref(pdf_xref *xref)
{
	char buf[1024];
	int t, n;
	int i;

	t = fz_seek(xref->file, 0, 2);
	if (t == -1)
		return fz_ioerror(xref->file);

	t = fz_seek(xref->file, MAX(0, t - ((int)sizeof buf)), 0);
	if (t == -1)
		return fz_ioerror(xref->file);

	n = fz_read(xref->file, buf, sizeof buf);
	if (n == -1)
		return fz_ioerror(xref->file);

	for (i = n - 9; i >= 0; i--)
	{
		if (memcmp(buf + i, "startxref", 9) == 0)
		{
			i += 9;
			while (iswhite(buf[i]) && i < n)
				i ++;
			xref->startxref = atoi(buf + i);
			return nil;
		}
	}

	return fz_throw("syntaxerror: could not find startxref");
}

#define WHITE_SPACE_CHARS " \n\t\r"

static const char *str_find_char(const char *txt, char c)
{
    while (*txt != c) {
        if (0 == *txt)
            return NULL;
        ++txt;
    }
    return txt;
}

static int str_contains(const char *str, char c)
{
    const char *pos = str_find_char(str, c);
    if (!pos)
        return 0;
    return 1;
}

static void str_strip_right(char *txt, const char *to_strip)
{
    char * new_end;
    char   c;
    if (!txt || !to_strip)
        return;
    if (0 == *txt)
        return;
    /* point at the last character in the string */
    new_end = txt + strlen(txt) - 1;
    for (;;) {
        c = *new_end;
        if (!str_contains(to_strip, c))
            break;
        if (txt == new_end)
            break;
        --new_end;
    }
    if (str_contains(to_strip, *new_end))
        new_end[0] = 0;
    else
        new_end[1] = 0;
}

static void str_strip_ws_right(char *txt)
{
    str_strip_right(txt, WHITE_SPACE_CHARS);
}


/*
 * trailer dictionary
 */

static fz_error *
readoldtrailer(pdf_xref *xref, char *buf, int cap)
{
	int ofs, len;
	char *s;
	int n;
	int t;
	int c;

	pdf_logxref("load old xref format trailer\n");

	fz_readline(xref->file, buf, cap);
    str_strip_ws_right(buf);
	if (strcmp(buf, "xref") != 0)
		return fz_throw("ioerror: missing xref");

	while (1)
	{
		c = fz_peekbyte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		n = fz_readline(xref->file, buf, cap);
		if (n < 0)
			return fz_ioerror(xref->file);

		s = buf;
		ofs = atoi(strsep(&s, " "));
		len = atoi(strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
			fz_seek(xref->file, -(n + buf - s + 2), 1);

		t = fz_tell(xref->file);
		if (t < 0)
			return fz_ioerror(xref->file);

		n = fz_seek(xref->file, t + 20 * len, 0);
		if (n < 0)
			return fz_ioerror(xref->file);
	}

	t = pdf_lex(xref->file, buf, cap, &n);
	if (t != PDF_TTRAILER)
		return fz_throw("syntaxerror: expected trailer");

	t = pdf_lex(xref->file, buf, cap, &n);
	if (t != PDF_TODICT)
		return fz_throw("syntaxerror: expected trailer dictionary");

	return pdf_parsedict(&xref->trailer, xref->file, buf, cap);
}

static fz_error *
readnewtrailer(pdf_xref *xref, char *buf, int cap)
{
	pdf_logxref("load new xref format trailer\n");
	return pdf_parseindobj(&xref->trailer, xref->file, buf, cap, nil, nil, nil);
}

static fz_error *
readtrailer(pdf_xref *xref, char *buf, int cap)
{
	int n;
	int c;

	n = fz_seek(xref->file, xref->startxref, 0);
	if (n < 0)
		return fz_ioerror(xref->file);

	c = fz_peekbyte(xref->file);
	if (c == 'x')
		return readoldtrailer(xref, buf, cap);
	else if (c >= '0' && c <= '9')
		return readnewtrailer(xref, buf, cap);

	return fz_throw("syntaxerror: could not find xref");
}

/*
 * xref tables
 */

static fz_error *
readoldxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
{
	int ofs, len;
	char *s;
	int n;
	int t;
	int i;
	int c;

	pdf_logxref("load old xref format\n");

	fz_readline(xref->file, buf, cap);
    str_strip_ws_right(buf);
	if (strcmp(buf, "xref") != 0)
		return fz_throw("syntaxerror: expected xref");

	while (1)
	{
		c = fz_peekbyte(xref->file);
		if (!(c >= '0' && c <= '9'))
			break;

		n = fz_readline(xref->file, buf, cap);
		if (n < 0)
			return fz_ioerror(xref->file);

		s = buf;
		ofs = atoi(strsep(&s, " "));
		len = atoi(strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
		{
			fz_warn("syntaxerror: broken xref section");
			fz_seek(xref->file, -(n + buf - s + 2), 1);
		}

		for (i = 0; i < len; i++)
		{
			n = fz_read(xref->file, buf, 20);
			if (n < 0)
				return fz_ioerror(xref->file);
			if (n != 20)
				return fz_throw("syntaxerror: truncated xref table");
			if (!xref->table[ofs + i].type)
			{
				s = buf;
				xref->table[ofs + i].ofs = atoi(s);
				xref->table[ofs + i].gen = atoi(s + 11);
				xref->table[ofs + i].type = s[17];
			}
		}
	}

	t = pdf_lex(xref->file, buf, cap, &n);
	if (t != PDF_TTRAILER)
		return fz_throw("syntaxerror: expected trailer");
	t = pdf_lex(xref->file, buf, cap, &n);
	if (t != PDF_TODICT)
		return fz_throw("syntaxerror: expected trailer dictionary");

	return pdf_parsedict(trailerp, xref->file, buf, cap);
}

static fz_error *
readnewxref(fz_obj **trailerp, pdf_xref *xref, char *buf, int cap)
{
	fz_error *error;
	fz_stream *stm;
	fz_obj *trailer;
	fz_obj *obj;
	int oid, gen, stmofs;
	int size, w0, w1, w2, i0, i1;
	int i, n;

	pdf_logxref("load new xref format\n");

	error = pdf_parseindobj(&trailer, xref->file, buf, cap, &oid, &gen, &stmofs);
	if (error)
		return error;

	if (oid < 0 || oid >= xref->len) {
		error = fz_throw("rangecheck: object id out of range");
		goto cleanup;
	}

	xref->table[oid].type = 'n';
	xref->table[oid].gen = gen;
	xref->table[oid].obj = fz_keepobj(trailer);
	xref->table[oid].stmofs = stmofs;

	obj = fz_dictgets(trailer, "Size");
	if (!obj) {
		error = fz_throw("syntaxerror: xref stream missing Size entry");
		goto cleanup;
	}
	size = fz_toint(obj);

	obj = fz_dictgets(trailer, "W");
	if (!obj) {
		error = fz_throw("syntaxerror: xref stream missing W entry");
		goto cleanup;
	}
	w0 = fz_toint(fz_arrayget(obj, 0));
	w1 = fz_toint(fz_arrayget(obj, 1));
	w2 = fz_toint(fz_arrayget(obj, 2));

	obj = fz_dictgets(trailer, "Index");
	if (obj) {
		i0 = fz_toint(fz_arrayget(obj, 0));
		i1 = fz_toint(fz_arrayget(obj, 1));
	}
	else {
		i0 = 0;
		i1 = size;
	}

	if (i0 < 0 || i1 > xref->len) {
		error = fz_throw("syntaxerror: xref stream has too many entries");
		goto cleanup;
	}

	error = pdf_openstream(&stm, xref, oid, gen);
	if (error)
		goto cleanup;

	for (i = i0; i < i0 + i1; i++)
	{
		int a = 0;
		int b = 0;
		int c = 0;

		if (fz_peekbyte(stm) == EOF)
		{
			error = fz_throw("syntaxerror: truncated xref stream");
			fz_dropstream(stm);
			goto cleanup;
		}

		for (n = 0; n < w0; n++)
			a = (a << 8) + fz_readbyte(stm);
		for (n = 0; n < w1; n++)
			b = (b << 8) + fz_readbyte(stm);
		for (n = 0; n < w2; n++)
			c = (c << 8) + fz_readbyte(stm);

		if (!xref->table[i].type)
		{
			int t = w0 ? a : 1;
			xref->table[i].type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
			xref->table[i].ofs = w2 ? b : 0;
			xref->table[i].gen = w1 ? c : 0;
		}
	}

	fz_dropstream(stm);

	*trailerp = trailer;

	return nil;

cleanup:
	fz_dropobj(trailer);
	return error;
}

static fz_error *
readxref(fz_obj **trailerp, pdf_xref *xref, int ofs, char *buf, int cap)
{
	int n;
	int c;

	n = fz_seek(xref->file, ofs, 0);
	if (n < 0)
		return fz_ioerror(xref->file);

	c = fz_peekbyte(xref->file);
	if (c == 'x')
		return readoldxref(trailerp, xref, buf, cap);
	else if (c >= '0' && c <= '9')
		return readnewxref(trailerp, xref, buf, cap);

	return fz_throw("syntaxerror: expected xref");
}

static fz_error *
readxrefsections(pdf_xref *xref, int ofs, char *buf, int cap)
{
	fz_error *error;
	fz_obj *trailer;
	fz_obj *prev;
	fz_obj *xrefstm;

	error = readxref(&trailer, xref, ofs, buf, cap);
	if (error)
		return error;

	/* FIXME: do we overwrite free entries properly? */
	xrefstm = fz_dictgets(trailer, "XrefStm");
	if (xrefstm)
	{
		pdf_logxref("load xrefstm\n");
		error = readxrefsections(xref, fz_toint(xrefstm), buf, cap);
		if (error)
			goto cleanup;
	}

	prev = fz_dictgets(trailer, "Prev");
	if (prev)
	{
		pdf_logxref("load prev\n");
		error = readxrefsections(xref, fz_toint(prev), buf, cap);
		if (error)
			goto cleanup;
	}

	fz_dropobj(trailer);
	return nil;

cleanup:
	fz_dropobj(trailer);
	return error;
}

/*
 * compressed object streams
 */

fz_error *
pdf_loadobjstm(pdf_xref *xref, int oid, int gen, char *buf, int cap)
{
	fz_error *error;
	fz_stream *stm;
	fz_obj *objstm;
	int *oidbuf;
	int *ofsbuf;

	fz_obj *obj;
	int first;
	int count;
	int i, n, t;

	pdf_logxref("loadobjstm %d %d\n", oid, gen);

	error = pdf_loadobject(&objstm, xref, oid, gen);
	if (error)
		return error;

	count = fz_toint(fz_dictgets(objstm, "N"));
	first = fz_toint(fz_dictgets(objstm, "First"));

	pdf_logxref("  count %d\n", count);

	oidbuf = fz_malloc(count * sizeof(int));
	if (!oidbuf) { error = fz_outofmem; goto cleanupobj; }

	ofsbuf = fz_malloc(count * sizeof(int));
	if (!ofsbuf) { error = fz_outofmem; goto cleanupoid; }

	error = pdf_openstream(&stm, xref, oid, gen);
	if (error)
		goto cleanupofs;

	for (i = 0; i < count; i++)
	{
		t = pdf_lex(stm, buf, cap, &n);
		if (t != PDF_TINT)
		{
			error = fz_throw("syntaxerror: corrupt object stream");
			goto cleanupstm;
		}
		oidbuf[i] = atoi(buf);

		t = pdf_lex(stm, buf, cap, &n);
		if (t != PDF_TINT)
		{
			error = fz_throw("syntaxerror: corrupt object stream");
			goto cleanupstm;
		}
		ofsbuf[i] = atoi(buf);
	}

	n = fz_seek(stm, first, 0);
	if (n < 0)
	{
		error = fz_ioerror(stm);
		goto cleanupstm;
	}

	for (i = 0; i < count; i++)
	{
		/* FIXME: seek to first + ofsbuf[i] */

		error = pdf_parsestmobj(&obj, stm, buf, cap);
		if (error)
			goto cleanupstm;

		if (oidbuf[i] < 1 || oidbuf[i] >= xref->len)
		{
			error = fz_throw("rangecheck: object number out of range");
			goto cleanupstm;
		}

		if (xref->table[oidbuf[i]].obj)
			fz_dropobj(xref->table[oidbuf[i]].obj);
		xref->table[oidbuf[i]].obj = obj;
	}

	fz_dropstream(stm);
	fz_free(ofsbuf);
	fz_free(oidbuf);
	fz_dropobj(objstm);
	return nil;

cleanupstm:
	fz_dropstream(stm);
cleanupofs:
	fz_free(ofsbuf);
cleanupoid:
	fz_free(oidbuf);
cleanupobj:
	fz_dropobj(objstm);
	return error;
}

/*
 * open and load xref tables from pdf
 */

fz_error *
pdf_loadxref(pdf_xref *xref, char *filename)
{
	fz_error *error;
	fz_obj *size;
	int i;

	char buf[65536];	/* yeowch! */

	pdf_logxref("loadxref '%s' %p\n", filename, xref);

	error = fz_openrfile(&xref->file, filename);
	if (error)
		return error;

	error = loadversion(xref);
	if (error)
		return error;

	error = readstartxref(xref);
	if (error)
		return error;

	error = readtrailer(xref, buf, sizeof buf);
	if (error)
		return error;

	size = fz_dictgets(xref->trailer, "Size");
	if (!size)
		return fz_throw("syntaxerror: trailer missing Size entry");

	pdf_logxref("  size %d\n", fz_toint(size));

	assert(xref->table == nil);

	xref->cap = fz_toint(size);
	xref->len = fz_toint(size);
	xref->table = fz_malloc(xref->cap * sizeof(pdf_xrefentry));
	if (!xref->table)
		return fz_outofmem;

	for (i = 0; i < xref->len; i++)
	{
		xref->table[i].ofs = 0;
		xref->table[i].gen = 0;
		xref->table[i].type = 0;
		xref->table[i].mark = 0;
		xref->table[i].stmbuf = nil;
		xref->table[i].stmofs = 0;
		xref->table[i].obj = nil;
	}

	error = readxrefsections(xref, xref->startxref, buf, sizeof buf);
	if (error)
		return error;

	return nil;
}