Súbor: [Platon] / cpdf / parse.c (stiahnutie)
Revízia 1.20, Mon Dec 23 02:40:05 2002 UTC (21 years, 4 months ago) by lynx
Zmeny od 1.19: +160 -128
[lines]
First-phase parsing code is now heavily optimized: it is faster than ever and
is stable for all valid PDFs.
Fixed problems with decompressing streams.
|
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <assert.h>
#include "cpdf.h"
#include "parse.h"
#include "filter.h"
#include "utils.h"
/*
 * Count how many times character c occurs in buffer.
 *
 * Counting stops at the first occurrence of the character `to` or at the
 * terminating NUL, whichever comes first; the stop character itself is
 * never counted.
 */
int strlenc(char *buffer, char c, char to)
{
    int hits = 0;
    char *cur;

    for (cur = buffer; *cur != '\0' && *cur != to; cur++)
        hits += (*cur == c);

    return hits;
}
/*
 * Return the number of characters in buffer that precede the first
 * occurrence of `to`, or the full string length when `to` does not occur.
 */
int strlento(char *buffer, char to)
{
    int len = 0;

    while (buffer[len] != '\0' && buffer[len] != to)
        len++;

    return len;
}
/*
 * Decode a hexadecimal string of the form "<48656C6C6F>" into raw bytes.
 *
 * buffer must point at the opening '<'.  Digits are consumed in pairs up to
 * the closing '>' or the end of the string, whichever comes first.  When the
 * number of digits is odd, the missing final digit is taken to be '0'.
 *
 * Returns a buffer obtained from xmalloc() holding (digits + 1) / 2 bytes;
 * the caller owns it.  The result is binary data: it is not NUL-terminated
 * and its length is not reported.  Returns NULL when buffer does not start
 * with '<' or contains no digits.
 */
char *hex(char *buffer)
{
    char num[5] = { '0', 'x', '\0', '\0', '\0' };
    char *out, *dst;
    int digits, nbytes, i;

    if (*buffer != '<')
        return NULL;
    buffer++;

    /* strlento() stops at '>' or NUL, so the loop below cannot overrun. */
    digits = strlento(buffer, '>');

    /* Round up: an odd trailing digit still produces one output byte. */
    nbytes = (digits + 1) / 2;
    if (nbytes == 0)
        return NULL;

    out = dst = (char *) xmalloc(nbytes);
    for (i = 0; i < digits; i += 2) {
        num[2] = buffer[i];
        num[3] = (i + 1 < digits) ? buffer[i + 1] : '0';
        *dst++ = (char) strtoul(num, NULL, 16);
    }
    return out;
}
/*
 * Expand "#xx" hexadecimal escapes in a name token.
 *
 * buffer must start with '/'; the name ends at the first space or NUL.
 *
 * Returns:
 *   - NULL when buffer does not start with '/';
 *   - buffer itself (nothing allocated) when no '#' occurs within the first
 *     MAXNAMELEN characters of the string;
 *   - otherwise a new NUL-terminated buffer of MAXNAMELEN bytes from
 *     xmalloc(), owned by the caller, holding the name with every complete
 *     "#xx" escape replaced by the byte it encodes.  Output is truncated to
 *     MAXNAMELEN - 1 characters.
 *
 * A '#' that is not followed by two more characters is copied literally.
 */
char *name(char *buffer)
{
    char num[5] = { '0', 'x', '\0', '\0', '\0' };
    char *out;
    int i, used = 0;

    if (*buffer != '/')
        return NULL;

    /* Look for '#' without reading beyond the end of the string. */
    for (i = 0; i < MAXNAMELEN && buffer[i] != '\0' && buffer[i] != '#'; i++)
        ;
    if (i >= MAXNAMELEN || buffer[i] != '#')
        return buffer;

    out = (char *) xmalloc(MAXNAMELEN);
    while (*buffer != '\0' && *buffer != ' ' && used < MAXNAMELEN - 1) {
        if (*buffer == '#' && buffer[1] != '\0' && buffer[2] != '\0') {
            num[2] = buffer[1];
            num[3] = buffer[2];
            out[used++] = (char) strtoul(num, NULL, 16);
            buffer += 3;
        } else {
            /* Ordinary character, or an incomplete escape at the end. */
            out[used++] = *buffer++;
        }
    }
    out[used] = '\0';
    return out;
}
/*
 * Split a bracketed list such as "[1 0 R 2 0 R]" into a vector of tokens.
 *
 * buffer is expected to point at the opening '['.  Scanning continues until
 * the bracket nesting counter `end` drops back to zero.
 *
 * Layout of the returned vector:
 *   - every '[' and ']' becomes a pointer into the static delim[] table;
 *     those entries are single characters without a NUL of their own and
 *     must not be freed;
 *   - every other token (delimited by ' ', '\n' or ']') is copied into its
 *     own buffer of len + 1 bytes from xcalloc() (NUL-terminated assuming
 *     xcalloc() zero-fills like calloc());
 *   - there is no length field and no NULL terminator; the last entry is
 *     the ']' marker that closed the outermost bracket.
 *
 * NOTE(review): the realloc() result is used unchecked, and none of the
 * scanning loops test for '\0', so input without a closing ']' is read past
 * its end -- confirm callers only pass balanced input.
 * NOTE(review): `last` is not moved when a bracket is consumed, so a token
 * following a nested '[' appears to include that bracket -- confirm.
 */
char **array(char *buffer)
{
char **ret = NULL, *last;
/* shared markers stored in the result for each '[' and ']' */
static char delim[2] = { '[', ']' };
int index = 0, len, end = 0;
/* start of the pending token; assumes buffer[0] is the opening '[' */
last = buffer + 1;
do {
/* grow the vector by one slot for the entry produced in this pass */
ret =
(char **) realloc(ret, (index + 1) * sizeof(char **));
if (*buffer == '[' || *buffer == ']') {
if (*buffer == '[') {
ret[index] = &delim[0];
end++;
} else {
ret[index] = &delim[1];
end--;
}
buffer++;
index++;
/* in a do/while, `continue` jumps to the `while (end)` test below */
continue;
}
/* advance to the next ' ', ']' or '\n'; if that leaves the pending
 * token empty, step over one character and scan again */
while (1) {
while (*buffer != ' ' && *buffer != ']'
&& *buffer != '\n')
buffer++;
if (buffer - last == 0) {
buffer++;
continue;
} else
break;
}
/* copy the token [last, buffer) into its own buffer */
len = buffer - last;
ret[index] = (char *) xcalloc(len + 1, sizeof(char));
strncpy(ret[index], last, len);
index++;
/* skip the separator, but leave a ']' in place so the next pass
 * records it through the bracket branch above */
if (*buffer != ']')
last = ++buffer;
else
last = buffer;
} while (end);
return ret;
}
/* size of the line buffer used while reading an object */
#define BUFLEN 255
/*
 * Read the text of the object stored at file position `offset` in fp.
 *
 * The first line read after seeking is discarded (it is overwritten by the
 * first fget() inside the loop; presumably this is the "N G obj" header --
 * confirm).  Following lines are appended to a growing buffer until a line
 * containing "endobj" is seen; that line itself is not appended.
 *
 * A line containing "stream" switches to binary reading: the value after
 * "Length" in the text collected so far gives the payload size, and that
 * many bytes plus sizeof("endstream") are read with fread().
 *
 * Returns the accumulated buffer (obtained through resize()), or NULL when
 * nothing was appended before "endobj".
 *
 * NOTE(review): behaviour depends on fget() and resize(), which are defined
 * elsewhere; in particular the strncpy() in the non-stream path copies no
 * terminator, so later strstr()/strlen() calls on obj presumably rely on
 * resize() zero-filling new space -- confirm.
 */
char *read_obj(long offset)
{
char entry[BUFLEN], *obj = NULL;
int size = 0;
fseek(fp, offset, SEEK_SET);
memset(entry, '\0', BUFLEN);
/* this line is dropped: the loop's first fget() overwrites entry */
fget(entry, BUFLEN, fp);
do {
fget(entry, BUFLEN, fp);
/* a stream's payload is binary and must be read by byte count, not
 * line by line */
if (strstr(entry, "stream")) {
char *p;
int len;
long off;
/* append the "stream" line itself, including its terminator */
obj = resize(obj, size + strlen(entry));
strncpy(obj + size, entry, strlen(entry) + 1);
size += strlen(entry);
/* NOTE(review): p is NULL if no "Length" precedes the stream, and
 * len stays uninitialised if sscanf() fails -- neither is checked */
p = strstr(obj, "Length");
sscanf(p, "Length %d", &len);
/* if "0 R" follows before the next '/', the number just parsed is
 * an object reference: load that object to get the real length,
 * preserving the current file position around the lookup */
for (p += 6; *p != '/'; p++)
if (!strncmp(p, "0 R", 3)) {
off = ftell(fp);
p = (char *) get_object(len, NULL);
sscanf(p, "%d\nendobj", &len);
fseek(fp, off, SEEK_SET);
break;
}
/* also consume sizeof("endstream") bytes after the payload,
 * presumably so the "endstream" keyword is not mistaken for a new
 * "stream" line by the strstr() test above -- confirm */
len += sizeof("endstream");
size += len;
obj = resize(obj, size);
fread(obj + strlen(obj), sizeof(char), len, fp);
continue;
}
if (strstr(entry, "endobj"))
break;
/* ordinary line: append it (without a terminating NUL) */
obj = resize(obj, size + strlen(entry));
strncpy(obj + size, entry, strlen(entry));
size += strlen(entry);
} while (1);
return obj;
}
/*
 * Classify an object from the raw text of its dictionary.
 *
 * Returns:
 *   CATALOG, PAGES or PAGE  when the name following "Type" starts with
 *                           "Catalog", "Pages" or "Page" respectively;
 *   CONTENTS                when there is no "Type" but the text contains
 *                           "stream";
 *   DATA                    in every other case, including a NULL
 *                           dictionary or a "Type" key with no name after
 *                           it.
 */
int get_typenum(char *dictionary)
{
    char *p;

    if (!dictionary)
        return DATA;

    p = strstr(dictionary, "Type");
    if (!p)
        return strstr(dictionary, "stream") ? CONTENTS : DATA;

    /* Move to the '/' that introduces the type name, never past the NUL. */
    for (p += 4; *p != '\0' && *p != '/'; p++)
        ;
    if (*p == '\0')
        return DATA;
    p++;

    if (!strncmp(p, "Catalog", 7))
        return CATALOG;
    if (!strncmp(p, "Page", 4))
        return (p[4] == 's') ? PAGES : PAGE;
    return DATA;
}
/*
 * Return the parsed form of object number `num`, loading it on first use.
 *
 * num   1-based object number; objects[num - 1] is consulted.
 * type  optional out-parameter receiving the object's type code; may be
 *       NULL.  It is left untouched when the function fails before the
 *       type is known.
 *
 * The result is cached in the object's `parsed` field, so later calls
 * return the same pointer.  For CATALOG, PAGES, PAGE, CONTENTS and
 * RESOURCES the matching fill_*() result is returned and the raw text is
 * freed; for DATA the raw text from read_obj() is itself returned and
 * cached.
 *
 * Returns NULL when num is not positive, the table slot is NULL, the object
 * could not be read, or the fill_*() routine failed.
 *
 * NOTE(review): num is not checked against the size of objects[] here --
 * confirm callers only pass numbers within the table.
 */
void *get_object(int num, int *type)
{
    struct object *obj;
    char *dict;
    void *ret = NULL;

    if (num < 1)
        return NULL;
    obj = (struct object *) objects[num - 1];
    if (!obj)
        return NULL;

    if (obj->parsed) {
        if (type)
            *type = obj->type;
        return obj->parsed;
    }

    dict = read_obj(obj->offset);
    if (!dict)
        return NULL;

    obj->type = get_typenum(dict);
    if (type)
        *type = obj->type;

    if (obj->type == DATA) {
        /* Raw objects are handed out as text; ownership stays in the cache. */
        obj->parsed = dict;
        return dict;
    }

    if (obj->type == CATALOG)
        ret = fill_catalog(dict);
    else if (obj->type == PAGES)
        ret = fill_pages(dict);
    else if (obj->type == PAGE)
        ret = fill_page(dict);
    else if (obj->type == CONTENTS)
        ret = fill_contents(dict);
    else if (obj->type == RESOURCES)
        ret = fill_resources(dict);

    free(dict);
    obj->parsed = ret;
    return ret;
}
/*
 * Create the table entry for object number `num` from one cross-reference
 * line.
 *
 * s    text of the line; its leading decimal number is stored as the
 *      object's file offset.
 * num  1-based object number; the new entry is stored in objects[num - 1],
 *      replacing whatever pointer was there.
 */
void add_obj(char *s, int num)
{
    struct object *entry;

    entry = (struct object *) xcalloc(sizeof(struct object), 1);
    entry->obj_num = num;
    sscanf(s, "%ld 00000 n", &entry->offset);

    objects[num - 1] = entry;
}
/*
 * Map a filter name to its numeric code.
 *
 * Any name beginning with "Fl" yields FLATEDECODE, a name beginning with
 * "DCTDecode" yields DCTDECODE, and everything else yields -1.
 */
char get_filternum(char *filter)
{
    char code = -1;

    if (strncmp(filter, "Fl", 2) == 0)
        code = FLATEDECODE;
    else if (strncmp(filter, "DCTDecode", 9) == 0)
        code = DCTDECODE;

    return code;
}
/*
 * Find the byte offset of the cross-reference table.
 *
 * The file size is taken from fstat() (which also refreshes the global
 * fpstat).  Starting EOFSIZE bytes before the end of the file, characters
 * are read backwards one at a time into the tail of a local buffer until a
 * '\r' or '\n' is met; the digits collected after that line break are then
 * converted to a number.
 *
 * Returns the parsed offset, or 0 when the file cannot be examined or no
 * line break is found within XREFLINE - 1 characters.
 *
 * NOTE(review): fp->_fileno is a C-library-internal field of FILE;
 * fileno(fp) is the portable spelling -- consider switching.
 */
long get_startxref(void)
{
    char startxref[XREFLINE];
    int i = XREFLINE - 2;
    long offset;
    int c;

    startxref[XREFLINE - 1] = '\0';
    if (fstat(fp->_fileno, &fpstat) != 0)
        return 0;

    offset = fpstat.st_size - EOFSIZE;

    /* Bounded on both sides: never write before startxref[0] and never
     * seek before the beginning of the file. */
    while (i >= 0 && offset >= 0) {
        if (fseek(fp, offset, SEEK_SET) != 0)
            return 0;
        c = fgetc(fp);
        if (c == EOF)
            return 0;
        if (c == '\r' || c == '\n')
            return (long) strtoul(&startxref[i + 1], NULL, 10);
        startxref[i] = (char) c;
        i--;
        offset--;
    }
    return 0;
}
/*
 * Descend from page-tree node `num` to a leaf page and record that page's
 * object number in page_tree[current_page].
 *
 * The root node's `kids` pointer is advanced by one (past the leading '['
 * marker) and stays advanced afterwards.  From there the first kid of each
 * node is followed until get_object() reports type PAGE.
 *
 * The function returns without recording anything when an object cannot be
 * loaded or a kid token does not yield a positive object number.
 *
 * NOTE(review): an intermediate object whose type is neither PAGE nor a
 * page-tree node is still treated as a `struct pages` -- confirm the input
 * cannot contain such references.
 * TODO: only one leaf is recorded.  A full traversal (visiting every kid,
 * advancing current_page and climbing back through ->parent) was sketched
 * here in commented-out form but never enabled.
 */
void probe_ptree(int num)
{
    struct pages *pages;
    int type, a;

    pages = (struct pages *) get_object(num, NULL);
    if (!pages || !pages->kids)
        return;

    /* Step over the '[' marker so *kids is the first kid's object number. */
    pages->kids++;
    a = atoi(*pages->kids);

    for (;;) {
        if (a < 1)
            return;
        pages = (struct pages *) get_object(a, &type);
        if (!pages)
            return;
        if (type == PAGE)
            break;
        if (!pages->kids)
            return;
        /* kids[0] is the '[' marker; kids[1] is the first child. */
        a = atoi(pages->kids[1]);
    }

    page_tree[current_page] = a;
}
/*
 * Read the cross-reference information of the file open on fp.
 *
 * Starting at the offset returned by get_startxref(), each section is
 * processed in turn: the object table `objects` is grown to fit, one entry
 * per in-use line is registered through add_obj(), and the Root, Info, Prev
 * and Encrypt values found in the trailer lines are stored in the global
 * `trailer`.  If the trailer gave a non-zero Prev offset, the section at
 * that offset is processed next; otherwise the function returns.
 *
 * When the first line read does not contain "xref" a message is printed to
 * stderr and the function returns before `objects` and `obj_count` are
 * reset.
 *
 * NOTE(review): behaviour depends on fget(), defined elsewhere; none of the
 * reads are checked for end-of-file or failure.
 * NOTE(review): sections are visited newest first and add_obj() stores
 * unconditionally, so entries from an older section (reached through Prev)
 * replace those of a newer one -- confirm this is intended.
 */
void parse_xref(void)
{
char xref[XREFLINE], *p;
int entry[2];
fseek(fp, get_startxref(), SEEK_SET);
fget(xref, XREFLINE, fp);
if (!strstr(xref, "xref")) {
fprintf(stderr, "PDF document si corrupted!\n");
/* reconstruction ? */
return;
}
objects = NULL;
obj_count = 0;
do {
/* subsection header: first object number and number of entries */
fget(xref, XREFLINE, fp);
sscanf(xref, "%d %d", &entry[0], &entry[1]);
/* entry[1] becomes first + count, used as the required table size */
entry[1] += entry[0];
/* object numbers handed to add_obj() are 1-based; the line for number
 * 0 is expected to be skipped by the 'f' test below */
if (entry[0] == 0)
entry[0]++;
/* grow the pointer table; realloc() leaves the new slots
 * uninitialised (sizeof(char *) is used as the pointer size) */
if (entry[1] > obj_count) {
objects = (struct object **) realloc(objects,
entry[1] *
sizeof(char
*));
obj_count = entry[1];
}
/* one line per entry, up to the line starting with "trailer" */
do {
fget(xref, XREFLINE, fp);
if (!strncmp(xref, "trailer", 7))
break;
/* a line with 'f' at this fixed column is skipped; NOTE(review):
 * entry[0] is not advanced for it, which matches only a skipped
 * entry for object 0 -- confirm for other free entries */
if (xref[XREFLINE - 3] == 'f')
continue;
add_obj(xref, entry[0]);
entry[0]++;
} while (1);
{
char *tline = (char *) xmalloc(3 * XREFLINE);
fget(tline, 3 * XREFLINE, fp);
/* reset for every section so a missing Prev ends the outer loop */
trailer.prev = 0;
/* scan trailer lines until one has '>' as its first or second
 * character */
while (tline[0] != '>' && tline[1] != '>') {
if ((p = strstr(tline, "Root")) != NULL)
sscanf(p, "Root %d 0 R",
&trailer.root);
if ((p = strstr(tline, "Info")) != NULL)
sscanf(p, "Info %d 0 R",
&trailer.info);
if ((p = strstr(tline, "Prev")) != NULL)
sscanf(p, "Prev %ld",
&trailer.prev);
if ((p = strstr(tline, "Encrypt")) != NULL)
sscanf(p,
"Encrypt %d 0 R",
&trailer.encrypt);
fget(tline, 3 * XREFLINE, fp);
}
free(tline);
}
if (!trailer.prev)
break;
/* continue with the previous section; the line read here is consumed
 * without inspection (presumably its "xref" keyword line -- confirm) */
fseek(fp, trailer.prev, SEEK_SET);
fget(xref, XREFLINE, fp);
} while (1);
return;
}
/*
 * Load the document catalog named by trailer.root.
 *
 * Returns the object number stored in the catalog's `pages` field, or 0
 * (after printing a message to stderr) when the catalog cannot be loaded.
 * Callers must treat a result below 1 as failure.
 */
int parse_catalog(void)
{
    struct catalog *c;

    c = (struct catalog *) get_object(trailer.root, NULL);
    if (!c) {
        fprintf(stderr, "Cannot read PDF document catalog!\n");
        return 0;
    }
    return c->pages;
}
/*
 * Top-level parse: read the cross-reference data, locate the root of the
 * page tree through the catalog, allocate page_tree with one int per page,
 * probe the tree and then enter loop().
 *
 * Returns early, without calling loop(), when the page-tree root cannot be
 * resolved or reports no pages.
 */
void parse_pdf(void)
{
    struct pages *pages;
    int root;

    parse_xref();

    root = parse_catalog();
    if (root < 1)
        return;

    pages = (struct pages *) get_object(root, NULL);
    if (!pages || pages->count < 1) {
        fprintf(stderr, "PDF document has no pages!\n");
        return;
    }

    /* One int per page: page_tree is indexed by page and holds object
     * numbers, so the element size must be sizeof(int), not 1. */
    page_tree = (int *) xcalloc(pages->count, sizeof(int));

    current_page = 0;
    probe_ptree(root);
    current_page = 0;
    loop();
}
/*
 * Build a struct catalog from the raw text of a catalog dictionary.
 *
 * Only the "Pages N 0 R" reference is extracted; N is stored in ->pages.
 *
 * Returns a new structure from xcalloc(), or NULL (nothing allocated) when
 * the "Pages" reference is missing or cannot be parsed.
 */
struct catalog *fill_catalog(char *dictionary)
{
    struct catalog *catalog;
    char *p;
    int pages_ref;

    /* Validate before allocating so the error path has nothing to free. */
    p = strstr(dictionary, "Pages");
    if (!p || sscanf(p, "Pages %d 0 R", &pages_ref) != 1)
        return NULL;

    catalog = (struct catalog *) xcalloc(sizeof(struct catalog), 1);
    catalog->pages = pages_ref;
    return catalog;
}
/*
 * Build a struct pages from the raw text of a page-tree node dictionary.
 *
 * Extracts:
 *   Count  -> ->count
 *   Kids   -> ->kids and ->old_kids (both point at the vector returned by
 *             array(), whose element 0 is the '[' marker)
 *   Parent -> ->parent (optional; left at 0 when absent)
 *
 * Returns a new structure from xcalloc(), or NULL (nothing allocated) when
 * Count or Kids is missing, Count cannot be parsed, or Kids is not followed
 * by a '['.
 */
struct pages *fill_pages(char *dictionary)
{
    struct pages *pages;
    char *count_at, *kids_at, *parent_at;
    int count;

    count_at = strstr(dictionary, "Count");
    kids_at = strstr(dictionary, "Kids");
    if (!count_at || !kids_at)
        return NULL;
    if (sscanf(count_at, "Count %d", &count) != 1)
        return NULL;

    /* array() needs to start exactly on '['; tolerate any amount of
     * white space (including none) between the key and the bracket. */
    kids_at += 4;
    while (*kids_at == ' ' || *kids_at == '\t' ||
           *kids_at == '\r' || *kids_at == '\n')
        kids_at++;
    if (*kids_at != '[')
        return NULL;

    pages = (struct pages *) xcalloc(sizeof(struct pages), 1);
    pages->count = count;
    pages->old_kids = pages->kids = array(kids_at);

    parent_at = strstr(dictionary, "Parent");
    if (parent_at)
        sscanf(parent_at, "Parent %d 0 R", &pages->parent);
    return pages;
}
/*
 * Build a struct page from the raw text of a page dictionary.
 *
 * Required keys: Contents, Resources, MediaBox and Parent.  Rotate is
 * optional.
 *
 *   Contents / Resources  when the key is followed by " <" the value is
 *                         parsed in place from this dictionary; otherwise
 *                         it is read as an "N 0 R" reference and resolved
 *                         with get_object().  If neither form matches, the
 *                         field is left NULL.
 *   MediaBox              the first four tokens of the bracketed list are
 *                         converted with strtoul() and stored as shorts.
 *   Parent                object number of the parent node.
 *   Rotate                stored in ->rotate when present.
 *
 * Returns a new structure from xcalloc(), or NULL (nothing allocated or
 * leaked) when a required key is missing or MediaBox is not a flat,
 * closed list with at least four tokens.
 */
struct page *fill_page(char *dictionary)
{
    struct page *page;
    char *contents_at, *resources_at, *mediabox_at, *parent_at, *rotate_at;
    char *lbr, *rbr, **mediabox;
    short box[4];
    int ref, ntok, i;

    /* Check every required key up front so failures need no clean-up. */
    contents_at = strstr(dictionary, "Contents");
    resources_at = strstr(dictionary, "Resources");
    mediabox_at = strstr(dictionary, "MediaBox");
    parent_at = strstr(dictionary, "Parent");
    if (!contents_at || !resources_at || !mediabox_at || !parent_at)
        return NULL;

    /* array() must start on '[' and does not stop at NUL, so make sure the
     * list is closed and not nested before handing it over. */
    lbr = mediabox_at + 8;
    while (*lbr == ' ' || *lbr == '\t' || *lbr == '\r' || *lbr == '\n')
        lbr++;
    if (*lbr != '[')
        return NULL;
    rbr = strchr(lbr, ']');
    if (!rbr || memchr(lbr + 1, '[', (size_t) (rbr - lbr - 1)))
        return NULL;

    /* mediabox[0] is the '[' marker; copied tokens follow until the ']'
     * marker.  Only the copied tokens may be freed. */
    mediabox = array(lbr);
    for (ntok = 0; mediabox[ntok + 1][0] != ']'; ntok++)
        ;
    if (ntok >= 4)
        for (i = 0; i < 4; i++)
            box[i] = (short) strtoul(mediabox[i + 1], NULL, 10);
    for (i = 1; i <= ntok; i++)
        free(mediabox[i]);
    free(mediabox);
    if (ntok < 4)
        return NULL;

    page = (struct page *) xcalloc(sizeof(struct page), 1);
    for (i = 0; i < 4; i++)
        page->mediabox[i] = box[i];

    /* "Contents" is 8 characters: [8] is the separator, [9] the value. */
    if (contents_at[8] != '\0' && contents_at[9] == '<')
        page->contents = fill_contents(dictionary);
    else if (sscanf(contents_at, "Contents %d 0 R", &ref) == 1)
        page->contents = (struct contents *) get_object(ref, NULL);

    /* "Resources" is 9 characters: [9] is the separator, [10] the value. */
    if (resources_at[9] != '\0' && resources_at[10] == '<')
        page->resources = fill_resources(dictionary);
    else if (sscanf(resources_at, "Resources %d 0 R", &ref) == 1)
        page->resources = (struct resources *) get_object(ref, NULL);

    sscanf(parent_at, "Parent %d 0 R", &page->parent);

    /* rotate_at points at the 'R', so the format must not start with '/'. */
    rotate_at = strstr(dictionary, "Rotate");
    if (rotate_at)
        sscanf(rotate_at, "Rotate %hd", &page->rotate);

    return page;
}
/*
 * Build a struct contents from the raw text of a stream object.
 *
 * Required: "Length", "Filter" and the "stream" keyword.
 *
 *   Length  either a direct number or an "N 0 R" reference; a reference is
 *           resolved with get_object() and the number is read from the
 *           returned text (this assumes the referenced object is returned
 *           as raw text -- confirm).
 *   Filter  the name following the next '/' is mapped with
 *           get_filternum(); -1 when no name follows.
 *   stream  the payload starts after the keyword and its end-of-line
 *           marker ("\n" or "\r\n"); Length bytes are copied into a new
 *           buffer stored in ->stream.
 *
 * Returns a new structure from xcalloc(), or NULL (nothing allocated) when
 * a required item is missing or the length cannot be determined.
 *
 * NOTE(review): the size of `dictionary` is not known here, so the copy
 * trusts Length not to exceed the data actually read.
 */
struct contents *fill_contents(char *dictionary)
{
    struct contents *content;
    char *length_at, *filter_at, *data, *p;
    int length;

    length_at = strstr(dictionary, "Length");
    filter_at = strstr(dictionary, "Filter");
    data = strstr(dictionary, "stream");
    if (!length_at || !filter_at || !data)
        return NULL;
    if (sscanf(length_at, "Length %d", &length) != 1)
        return NULL;

    /* "0 R" before the next key (or the end of the dictionary) means the
     * number just read is an object reference, not the length itself. */
    for (p = length_at + 6; *p != '\0' && *p != '/' && *p != '>'; p++) {
        if (!strncmp(p, "0 R", 3)) {
            p = (char *) get_object(length, NULL);
            if (!p || sscanf(p, "%d\nendobj", &length) != 1)
                return NULL;
            break;
        }
    }
    if (length < 0)
        return NULL;

    /* Skip the keyword and its line ending to reach the first data byte. */
    data += 6;
    if (*data == '\r')
        data++;
    if (*data == '\n')
        data++;

    content = (struct contents *) xcalloc(sizeof(struct contents), 1);
    content->length = length;

    p = strchr(filter_at + 6, '/');
    content->filter = p ? get_filternum(p + 1) : -1;

    content->stream = (unsigned char *) xcalloc(content->length, 1);
    memcpy(content->stream, data, content->length);
    return content;
}
/*
 * Return a new struct resources obtained from xcalloc().
 *
 * The dictionary text is not examined; no fields are filled in here.
 */
struct resources *fill_resources(char *dictionary)
{
    return (struct resources *) xcalloc(sizeof(struct resources), 1);
}
Platon Group <platon@platon.sk> http://platon.sk/
|