reactos/rosapps/smartpdf/fitz/mupdf/pdf_unicode.c

#include <fitz.h>
#include <mupdf.h>

/*
 * ToUnicode map for fonts
 */

/*
 * Set up the code-to-unicode mapping for a font.
 *
 * Three sources are tried, in this order:
 *
 *  1. cmapstm is an indirect object: the embedded cmap is loaded and
 *     composed with font->encoding into a new font->tounicode cmap
 *     (cid -> ucs). The embedded cmap itself is dropped before returning.
 *  2. collection names one of the known Adobe CID collections: the
 *     matching "<collection>-UCS2" system cmap is loaded into
 *     font->tounicode. An unrecognised collection falls through to 3.
 *  3. strings is non-nil: a 256 entry font->cidtoucs table is built by
 *     looking each string up with pdf_lookupagl (presumably glyph names
 *     from the Adobe Glyph List). Missing or unknown entries map to '?'.
 *
 * If none of the sources apply, nothing is set up and nil is returned.
 *
 * Returns nil on success, or the error from the failing step
 * (fz_outofmem when the cidtoucs table cannot be allocated).
 */
fz_error *
pdf_loadtounicode(pdf_font *font, pdf_xref *xref,
	char **strings, char *collection, fz_obj *cmapstm)
{
	fz_error *error;
	pdf_cmap *cmap;
	int cid;
	int ucs;
	int i;

	if (fz_isindirect(cmapstm))
	{
		pdf_logfont("tounicode embedded cmap\n");

		error = pdf_loadembeddedcmap(&cmap, xref, cmapstm);
		if (error)
			return error;

		error = pdf_newcmap(&font->tounicode);
		if (error)
			goto cleanup;

		/* simple fonts (with a strings table) only have one-byte codes */
		for (i = 0; i < (strings ? 256 : 65536); i++)
		{
			cid = pdf_lookupcmap(font->encoding, i);
			if (cid > 0)
			{
				ucs = pdf_lookupcmap(cmap, i);
				if (ucs > 0)
				{
					error = pdf_maprangetorange(font->tounicode, cid, cid, ucs);
					if (error)
						goto cleanup;
				}
			}
		}

		error = pdf_sortcmap(font->tounicode);
		if (error)
			goto cleanup;

		/* success falls through with error == nil */
	cleanup:
		pdf_dropcmap(cmap);
		return error;
	}

	else if (collection)
	{
		pdf_logfont("tounicode cid collection\n");

		if (!strcmp(collection, "Adobe-CNS1"))
			return pdf_loadsystemcmap(&font->tounicode, "Adobe-CNS1-UCS2");
		else if (!strcmp(collection, "Adobe-GB1"))
			return pdf_loadsystemcmap(&font->tounicode, "Adobe-GB1-UCS2");
		else if (!strcmp(collection, "Adobe-Japan1"))
			return pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan1-UCS2");
		else if (!strcmp(collection, "Adobe-Japan2"))
			return pdf_loadsystemcmap(&font->tounicode, "Adobe-Japan2-UCS2");
		else if (!strcmp(collection, "Adobe-Korea1"))
			return pdf_loadsystemcmap(&font->tounicode, "Adobe-Korea1-UCS2");

		/* unknown collection: try the strings table below */
	}

	if (strings)
	{
		pdf_logfont("tounicode strings\n");

		/* TODO use tounicode cmap here ... for one-to-many mappings */

		font->cidtoucs = fz_malloc(256 * sizeof(unsigned short));
		if (!font->cidtoucs)
			return fz_outofmem;

		for (i = 0; i < 256; i++)
		{
			if (strings[i])
			{
				int aglbuf[256];
				int aglnum;
				aglnum = pdf_lookupagl(strings[i], aglbuf, nelem(aglbuf));
				/* only the first code point of a multi-code result is kept */
				if (aglnum > 0)
					font->cidtoucs[i] = aglbuf[0];
				else
					font->cidtoucs[i] = '?';
			}
			else
				font->cidtoucs[i] = '?';
		}

		/*
		 * Publish the entry count only once the table exists and is
		 * filled, so a failed allocation never leaves a non-zero count
		 * paired with a nil table.
		 */
		font->ncidtoucs = 256;

		return nil;
	}

	pdf_logfont("tounicode impossible\n");
	return nil;
}

/*
 * Extract lines of text from display tree
 */

/*
 * Allocate an empty text line: no characters, no capacity, no successor.
 *
 * *linep receives the result of the allocation, so it is written even
 * when the allocation fails. Returns nil on success, fz_outofmem otherwise.
 */
fz_error *
pdf_newtextline(pdf_textline **linep)
{
	pdf_textline *fresh = fz_malloc(sizeof(pdf_textline));

	*linep = fresh;
	if (!fresh)
		return fz_outofmem;

	fresh->next = nil;
	fresh->text = nil;
	fresh->cap = 0;
	fresh->len = 0;

	return nil;
}

/*
 * Free a text line and every line chained after it through ->next,
 * including each line's character array.
 *
 * Walks the chain iteratively so that stack use does not grow with the
 * number of lines. Passing a null pointer is a no-op.
 */
void
pdf_droptextline(pdf_textline *line)
{
	pdf_textline *next;

	while (line)
	{
		/* fetch the successor before the node holding it is freed */
		next = line->next;
		fz_free(line->text);
		fz_free(line);
		line = next;
	}
}

/*
 * Append one character with its bounding box to the end of a line.
 *
 * The character array is grown (80 entries at first, doubling after
 * that) whenever len + 1 reaches the current capacity. Returns nil on
 * success, or fz_outofmem if growing fails, in which case the line is
 * left unchanged.
 */
static fz_error *
addtextchar(pdf_textline *line, fz_irect bbox, int c)
{
	pdf_textchar *slot;

	if (line->cap <= line->len + 1)
	{
		pdf_textchar *bigger;
		int grown = 80;

		if (line->cap)
			grown = line->cap * 2;

		bigger = fz_realloc(line->text, sizeof(pdf_textchar) * grown);
		if (!bigger)
			return fz_outofmem;

		line->text = bigger;
		line->cap = grown;
	}

	slot = &line->text[line->len++];
	slot->bbox = bbox;
	slot->c = c;

	return nil;
}

/* XXX global! not reentrant! */
/*
 * Position at which the next glyph is expected to start. It is expressed
 * in the space obtained by applying the inverse of the text node's trm to
 * the glyph position, and is advanced by the glyph's width after each
 * glyph. pdf_loadtextfromtree resets it to (-1, -1) before each run.
 */
static fz_point oldpt = { 0, 0 };

/*
 * Recursively walk a display tree node and append the characters of every
 * text node to the chain of text lines.
 *
 * line: in/out pointer to the line currently being filled; it is advanced
 *       to a newly allocated line whenever a line break is detected.
 * node: the node to visit; all of its children are visited afterwards.
 * ctm:  transform accumulated so far; transform nodes concatenate their
 *       matrix onto it for their children.
 *
 * Returns nil on success or the first error from allocating a line or
 * appending a character. Lines already linked into the chain are left for
 * the caller to release.
 */
static fz_error *
extracttext(pdf_textline **line, fz_node *node, fz_matrix ctm)
{
	fz_error *error;

	if (fz_istextnode(node))
	{
		fz_textnode *text = (fz_textnode*)node;
		pdf_font *font = (pdf_font*)text->font;
		fz_matrix inv = fz_invertmatrix(text->trm);
		fz_matrix tm = text->trm;
		fz_matrix trm;
		float dx, dy, t;
		fz_point p;
		fz_point vx;
		fz_point vy;
		fz_vmtx v;
		fz_hmtx h;
		int i, g;
		int x, y;
		fz_irect box;
		int c;

		for (i = 0; i < text->len; i++)
		{
			g = text->els[i].cid;

			/*
			 * Place the glyph: tm is the node's trm with the translation
			 * replaced by this element's position. After concatenating with
			 * ctm, the translation is kept (truncated to int) in x/y and then
			 * zeroed so trm can be used to transform direction vectors.
			 */
			tm.e = text->els[i].x;
			tm.f = text->els[i].y;
			trm = fz_concat(tm, ctm);
			x = trm.e;
			y = trm.f;
			trm.e = 0;
			trm.f = 0;

			/*
			 * Map the glyph position back through the inverse of trm and
			 * measure how far it is from where the previous glyph ended.
			 */
			p.x = text->els[i].x;
			p.y = text->els[i].y;
			p = fz_transformpoint(inv, p);
			dx = oldpt.x - p.x;
			dy = oldpt.y - p.y;
			oldpt = p;

			if (text->font->wmode == 0)
			{
				/* horizontal writing: advance the expected position along x */
				/* NOTE(review): the 0.001 factor suggests metrics in 1/1000 units -- confirm */
				h = fz_gethmtx(text->font, g);
				oldpt.x += h.w * 0.001;

				/* box edges: glyph width along x, unit height along y */
				vx.x = h.w * 0.001; vx.y = 0;
				vy.x = 0; vy.y = 1;
			}
			else
			{
				/* other writing mode (presumably vertical): advance along y */
				v = fz_getvmtx(text->font, g);
				oldpt.y += v.w * 0.001;
				/* swap so that dx is the offset along the advance axis and dy across it */
				t = dy; dy = dx; dx = t;

				/* box edges: fixed 0.5 along x, glyph advance along y */
				vx.x = 0.5; vx.y = 0;
				vy.x = 0; vy.y = v.w * 0.001;
			}

			if (fabs(dy) > 0.2)
			{
				/* moved across the advance axis: start a new line and make it current */
				pdf_textline *newline;
				error = pdf_newtextline(&newline);
				if (error)
					return error;
				(*line)->next = newline;
				*line = newline;
			}
			else if (fabs(dx) > 0.2)
			{
				/* gap (or overlap) along the advance axis: insert a space with an empty box at the glyph origin */
				box.x0 = x; box.x1 = x;
				box.y0 = y; box.y1 = y;
				error = addtextchar(*line, box, ' ');
				if (error)
					return error;
			}

			/*
			 * Bounding box: extent of the origin and the two transformed
			 * edge vectors, offset by the glyph position.
			 */
			vx = fz_transformpoint(trm, vx);
			vy = fz_transformpoint(trm, vy);
			box.x0 = MIN(0, MIN(vx.x, vy.x)) + x;
			box.x1 = MAX(0, MAX(vx.x, vy.x)) + x;
			box.y0 = MIN(0, MIN(vx.y, vy.y)) + y;
			box.y1 = MAX(0, MAX(vx.y, vy.y)) + y;

			/*
			 * Map the cid to a character: prefer the tounicode cmap, then the
			 * cidtoucs table when the cid is below its size, else use the cid
			 * itself.
			 */
			if (font->tounicode)
				c = pdf_lookupcmap(font->tounicode, g);
			else if (g < font->ncidtoucs)
				c = font->cidtoucs[g];
			else
				c = g;

			error = addtextchar(*line, box, c);
			if (error)
				return error;
		}
	}

	/* ctm is a by-value parameter, so this only affects this node's children */
	if (fz_istransformnode(node))
		ctm = fz_concat(((fz_transformnode*)node)->m, ctm);

	/* visit children in order; node is reused as the iterator */
	for (node = node->first; node; node = node->next)
	{
		error = extracttext(line, node, ctm);
		if (error)
			return error;
	}

	return nil;
}

/*
 * Extract the text of a display tree as a chain of text lines.
 *
 * The shared pen position is reset, a first (initially empty) line is
 * created, and the tree is walked from its root with ctm as the starting
 * transform. On success *outp receives the head of the chain, which the
 * caller releases with pdf_droptextline. On failure everything built so
 * far is dropped, *outp is left untouched and the error is returned.
 */
fz_error *
pdf_loadtextfromtree(pdf_textline **outp, fz_tree *tree, fz_matrix ctm)
{
	pdf_textline *head;
	pdf_textline *tail;
	fz_error *error;

	oldpt.x = oldpt.y = -1;

	error = pdf_newtextline(&head);
	if (error)
		return error;

	/* tail follows the line currently being filled; head stays put */
	tail = head;
	error = extracttext(&tail, tree->root, ctm);
	if (!error)
	{
		*outp = head;
		return nil;
	}

	pdf_droptextline(head);
	return error;
}

/*
 * Print a chain of text lines to stdout, one output line per text line.
 *
 * Character values below 128 are written as a single byte; anything else
 * is encoded with runetochar and written byte by byte.
 */
void
pdf_debugtextline(pdf_textline *line)
{
	char utf[10];
	int rune, nbytes, pos, b;

	do
	{
		for (pos = 0; pos < line->len; pos++)
		{
			rune = line->text[pos].c;
			if (rune >= 128)
			{
				nbytes = runetochar(utf, &rune);
				for (b = 0; b < nbytes; b++)
					putchar(utf[b]);
			}
			else
				putchar(rune);
		}
		putchar('\n');

		line = line->next;
	}
	while (line);
}