Platon Technologies
neprihlásený Prihlásiť Registrácia
SlovakEnglish
open source software development oslavujeme 10 rokov vývoja otvoreného softvéru! Piatok, 19. apríl 2024

Súbor: [Platon] / cpdf / parse.c (stiahnutie)

Revízia 1.20, Mon Dec 23 02:40:05 2002 UTC (21 years, 4 months ago) by lynx

Zmeny od 1.19: +160 -128 [lines]

The first-phase parsing code is now heavily optimized: it is faster than ever
and is stable for all valid PDF files.
Fixed problems with stream decompression.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <assert.h>

#include "cpdf.h"
#include "parse.h"
#include "filter.h"
#include "utils.h"

/* Count the occurrences of character `c` in `buffer`, scanning from the
 * start of the string up to (not including) the first `to` character or the
 * terminating NUL, whichever comes first.
 */
int strlenc(char *buffer, char c, char to)
{
    int hits = 0;
    char *cur;

    for (cur = buffer; *cur != '\0' && *cur != to; cur++)
        hits += (*cur == c);
    return hits;
}

/* Return the number of characters in `buffer` that precede the first `to`
 * character, or the whole string length when `to` does not occur.
 */
int strlento(char *buffer, char to)
{
    int n = 0;

    while (buffer[n] != '\0' && buffer[n] != to)
        n++;
    return n;
}

/* Decode a hexadecimal string such as "<48656C6C6F>" into raw bytes.
 *
 * buffer must point at the opening '<'.  Digits are taken in pairs up to the
 * closing '>' (or up to the end of the string when '>' is missing); when the
 * number of digits is odd, the last digit is paired with '0'.
 *
 * Returns a buffer obtained from xmalloc() holding (digits + 1) / 2 bytes.
 * The buffer is NOT NUL-terminated and its length is not reported; the
 * returned pointer is the only reference to it, so the caller owns it.
 * Returns NULL when buffer does not start with '<' or contains no digits.
 */
char *hex(char *buffer)
{
    char *out, *start;
    char num[5] = { '0', 'x', '\0', '\0', '\0' };
    int digits, bytes, i;

    if (*buffer != '<')
        return NULL;
    buffer++;
    digits = strlento(buffer, '>');
    /* Round up: an odd trailing digit still produces one output byte. */
    bytes = (digits + 1) / 2;
    if (bytes == 0)
        return NULL;
    start = out = (char *) xmalloc(bytes);
    for (i = 0; i < bytes; i++) {
        num[2] = buffer[2 * i];
        /* Pad a missing low nibble with '0'. */
        num[3] = (2 * i + 1 < digits) ? buffer[2 * i + 1] : '0';
        *out++ = (char) strtoul(num, NULL, 16);
    }
    return start;
}

/* Decode "#xx" escapes in a name token.
 *
 * buffer must point at the leading '/'.  When no '#' occurs within the
 * string (looking at no more than MAXNAMELEN characters), buffer itself is
 * returned and nothing is allocated.  Otherwise the token -- everything up to
 * the first space or the end of the string -- is copied into a fresh
 * xmalloc() buffer of MAXNAMELEN bytes, with each "#xx" replaced by the byte
 * whose hexadecimal value is xx.  The copy is always NUL-terminated and is
 * truncated to MAXNAMELEN - 1 bytes.  A '#' that is not followed by two more
 * characters is copied literally.
 *
 * Returns NULL when buffer does not start with '/'.  Callers cannot tell
 * from the return value alone whether a new buffer was allocated other than
 * by comparing it with the argument.
 */
char *name(char *buffer)
{
    char *ret, *end;
    char num[5] = { '0', 'x', '\0', '\0', '\0' };
    size_t scan;
    int room;

    if (*buffer != '/')
        return NULL;
    /* Never look for '#' beyond the end of the string. */
    scan = strlen(buffer);
    if (scan > (size_t) MAXNAMELEN)
        scan = (size_t) MAXNAMELEN;
    if (memchr(buffer, '#', scan) == NULL)
        return buffer;
    end = ret = (char *) xmalloc(MAXNAMELEN);
    room = MAXNAMELEN - 1;    /* keep one byte for the terminator */
    while (*buffer && *buffer != ' ' && room > 0) {
        if (*buffer == '#' && buffer[1] != '\0' && buffer[2] != '\0') {
            num[2] = buffer[1];
            num[3] = buffer[2];
            *ret++ = (char) strtoul(num, NULL, 16);
            buffer += 3;
        } else {
            *ret++ = *buffer++;
        }
        room--;
    }
    *ret = '\0';
    return end;
}

/* Append item to the pointer vector vec, which currently holds *count
 * entries, and return the (possibly moved) vector.
 */
static char **array_push(char **vec, int *count, char *item)
{
    char **grown = (char **) realloc(vec, (*count + 1) * sizeof(char *));

    if (grown == NULL) {
        /* Out of memory: stop here instead of writing through NULL. */
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    grown[(*count)++] = item;
    return grown;
}

/* Split a bracketed array such as "[3 0 R 7 0 R]" into a vector of tokens.
 *
 * buffer is expected to point at the opening '['.  The returned vector
 * contains, in input order:
 *   - for every '[' or ']' a pointer into a static two-character table
 *     (a single character, NOT a NUL-terminated string, and not to be
 *     freed);
 *   - for every token delimited by space, newline or ']' a NUL-terminated
 *     copy allocated with xcalloc().
 * Parsing stops when the bracket that was opened first is closed.  The
 * vector carries no length; the final entry is the closing ']'.
 *
 * If the input ends before the brackets are balanced, the pending token is
 * stored and the missing ']' entries are appended, so that the result is
 * terminated the same way as for well-formed input.
 *
 * NOTE(review): tokens that follow runs of several separators keep a leading
 * separator character, and the token start is not re-synchronised after a
 * nested '['; both quirks are unchanged here.
 */
char **array(char *buffer)
{
    static char delim[2] = { '[', ']' };
    char **ret = NULL, *last, *token;
    int index = 0, len, end = 0;

    last = buffer + 1;
    do {
        if (*buffer == '[' || *buffer == ']') {
            if (*buffer == '[') {
                ret = array_push(ret, &index, &delim[0]);
                end++;
            } else {
                ret = array_push(ret, &index, &delim[1]);
                end--;
            }
            buffer++;
            continue;
        }
        while (1) {
            /* Advance to the next separator, never past the terminator. */
            while (*buffer != '\0' && *buffer != ' ' && *buffer != ']'
                   && *buffer != '\n')
                buffer++;
            if (buffer == last && *buffer != '\0') {
                /* Empty token: step over the separator and rescan. */
                buffer++;
                continue;
            }
            break;
        }
        len = buffer - last;
        if (len > 0) {
            token = (char *) xcalloc(len + 1, sizeof(char));
            memcpy(token, last, len);
            ret = array_push(ret, &index, token);
        }
        if (*buffer == '\0') {
            /* Unterminated input: close whatever is still open. */
            for (; end > 0; end--)
                ret = array_push(ret, &index, &delim[1]);
            break;
        }
        if (*buffer != ']')
            last = ++buffer;
        else
            last = buffer;
    } while (end);
    return ret;
}

/* Size in bytes of the line buffer used by read_obj(). */
#define BUFLEN 255
/* Read the object stored at file offset `offset` of the global stream fp and
 * return its text in a buffer grown with resize().
 *
 * The first line read at `offset` is thrown away (it is overwritten by the
 * next fget() before anything looks at it).  Every following line is
 * appended to the result until a line containing "endobj" is read; that line
 * is not appended.  A line containing "stream" is handled by the stream
 * branch, which reads the stream body with fread() instead of line by line.
 *
 * NOTE(review): fget() and resize() are project helpers that are not visible
 * here -- presumably an fgets()-like line reader and a realloc()-like
 * grower; confirm.  Their results, and those of fseek()/fread(), are not
 * checked, so the loop has no exit if "endobj" is never read.
 */
char *read_obj(long offset)
{
    char entry[BUFLEN], *obj = NULL;
    int size = 0;    /* number of bytes accounted for in obj so far */
    fseek(fp, offset, SEEK_SET);
    memset(entry, '\0', BUFLEN);
    /* This first line is discarded: the loop reads into entry again. */
    fget(entry, BUFLEN, fp);
    do {
        fget(entry, BUFLEN, fp);
        /* A stream body is read in one piece rather than line by line. */
        if (strstr(entry, "stream")) {
            char *p;
            int len;
            long off;
            obj = resize(obj, size + strlen(entry));
            /* NOTE(review): copies strlen + 1 bytes into a buffer that was
             * asked to grow by strlen bytes -- verify that resize() leaves
             * room for the terminator. */
            strncpy(obj + size, entry, strlen(entry) + 1);
            size += strlen(entry);
            /* The stream length is taken from the text collected so far.
             * NOTE(review): p is NULL when "Length" is absent and is then
             * passed to sscanf() unchecked. */
            p = strstr(obj, "Length");
            sscanf(p, "Length %d", &len);
            /* Look between "Length" and the next '/' for "0 R"; if found,
             * len is treated as an object number and the real length is
             * parsed from that object's text.  The file position is saved
             * and restored around the nested get_object() call. */
            for (p += 6; *p != '/'; p++)
                if (!strncmp(p, "0 R", 3)) {
                    off = ftell(fp);
                    p = (char *) get_object(len, NULL);
                    sscanf(p, "%d\nendobj", &len);
                    fseek(fp, off, SEEK_SET);
                    break;
                }
            /* sizeof("endstream") is 10: nine letters plus the NUL. */
            len += sizeof("endstream");
            size += len;
            obj = resize(obj, size);
            /* The body is appended at the end of the current C string. */
            fread(obj + strlen(obj), sizeof(char), len, fp);
            continue;
        }
        if (strstr(entry, "endobj"))
            break;
        obj = resize(obj, size + strlen(entry));
        /* NOTE(review): no terminator is copied here; later strstr() and
         * strlen() calls on obj rely on resize() providing one -- confirm. */
        strncpy(obj + size, entry, strlen(entry));
        size += strlen(entry);
    } while (1);
    return obj;
}

/* Classify an object from the text of its dictionary.
 *
 * When the text contains "Type", the name that follows the next '/' decides
 * the result: "Catalog" gives CATALOG, "Pages" gives PAGES, any other name
 * starting with "Page" gives PAGE, everything else DATA.  When the text has
 * no "Type", the result is CONTENTS if it contains "stream" and DATA
 * otherwise.  DATA is also returned when no '/' follows "Type".
 *
 * NOTE(review): "Type" is matched as a plain substring, so it is also found
 * inside longer words; this is unchanged.
 */
int get_typenum(char *dictionary)
{
    char *p = strstr(dictionary, "Type");

    if (!p)
        return strstr(dictionary, "stream") != NULL ? CONTENTS : DATA;

    /* Find the '/' that introduces the type name, staying inside the text. */
    for (p += 4; *p != '\0' && *p != '/'; p++)
        ;
    if (*p == '\0')
        return DATA;
    p++;
    if (!strncmp(p, "Catalog", 7))
        return CATALOG;
    if (!strncmp(p, "Page", 4))
        return p[4] == 's' ? PAGES : PAGE;
    return DATA;
}

/* Return the parsed form of object number `num`, parsing it on first use.
 *
 * The object table is indexed with num - 1.  If the entry already has a
 * parsed value, that value is returned.  Otherwise the object text is read
 * with read_obj(), classified with get_typenum() and handed to the matching
 * fill_*() routine; the result is stored in the entry and returned.  Objects
 * classified as DATA keep the raw text itself as their parsed value; for all
 * other types the raw text is freed after parsing.
 *
 * When `type` is not NULL it receives the entry's type.  The result may be
 * NULL when the fill routine fails or the type matches none of the handled
 * ones.
 */
void *get_object(int num, int *type)
{
    struct object *entry = (struct object *) objects[num - 1];
    char *text;
    void *result;

    if (entry->parsed) {
        if (type != NULL)
            *type = entry->type;
        return entry->parsed;
    }

    text = read_obj(entry->offset);
    entry->type = get_typenum(text);
    if (type != NULL)
        *type = entry->type;

    if (entry->type == CATALOG) {
        result = fill_catalog(text);
    } else if (entry->type == PAGES) {
        result = fill_pages(text);
    } else if (entry->type == PAGE) {
        result = fill_page(text);
    } else if (entry->type == CONTENTS) {
        result = fill_contents(text);
    } else if (entry->type == RESOURCES) {
        result = fill_resources(text);
    } else if (entry->type == DATA) {
        /* Raw objects are cached as their own text, which stays allocated. */
        entry->parsed = text;
        return text;
    } else {
        result = NULL;
    }

    free(text);
    entry->parsed = result;
    return result;
}

/* Create the table entry for object number `num` from cross-reference line
 * `s`: the leading number of the line is stored as the object's file offset,
 * and the new entry is placed in objects[num - 1].
 */
void add_obj(char *s, int num)
{
    struct object *entry;

    entry = (struct object *) xcalloc(sizeof(struct object), 1);
    entry->obj_num = num;
    sscanf(s, "%ld 00000 n", &entry->offset);
    objects[num - 1] = entry;
}

/* Map a filter name to its numeric code: names beginning with "Fl" give
 * FLATEDECODE, names beginning with "DCTDecode" give DCTDECODE, and anything
 * else gives -1.
 */
char get_filternum(char *filter)
{
    char code = -1;

    if (strncmp(filter, "Fl", 2) == 0)
        code = FLATEDECODE;
    else if (strncmp(filter, "DCTDecode", 9) == 0)
        code = DCTDECODE;
    return code;
}

/* Return the decimal number written on the line that ends EOFSIZE bytes
 * before the end of the file open on fp.
 *
 * The file is read backwards one byte at a time, starting at
 * (file size - EOFSIZE), and the bytes are stored from the end of a local
 * buffer towards its start until a CR or LF is met; the collected text is
 * then converted with strtoul().  As a side effect the global fpstat is
 * filled in by fstat().
 *
 * Returns 0 when the file cannot be examined, when the start of the file or
 * a read error is reached, or when no line break is found within
 * XREFLINE - 1 bytes.
 *
 * NOTE(review): fp->_fileno reaches into the C library's FILE structure;
 * fileno(fp) is the portable way to obtain the descriptor.
 */
long get_startxref(void)
{
    char startxref[XREFLINE];
    int i = XREFLINE - 2;
    int c;
    long offset;

    startxref[XREFLINE - 1] = '\0';
    if (fstat(fp->_fileno, &fpstat) != 0)
        return 0;
    offset = fpstat.st_size - EOFSIZE;

    /* Stop before the buffer or the file is exhausted. */
    while (i >= 0 && offset >= 0) {
        if (fseek(fp, offset, SEEK_SET) != 0)
            return 0;
        c = fgetc(fp);
        if (c == EOF)
            return 0;
        startxref[i] = (char) c;
        if (c == '\r' || c == '\n')
            return (long) strtoul(&startxref[i + 1], NULL, 10);
        i--;
        offset--;
    }
    return 0;
}

/* Find the first page below the page-tree node `num` and record its object
 * number in page_tree[current_page].
 *
 * The node's kids cursor is advanced by one entry (past the leading '['
 * entry produced by array()), and the first kid is followed.  While the
 * object reached is not of type PAGE, its kids[1] entry is followed in turn.
 * Note that advancing the cursor modifies the cached node, so the effect
 * accumulates across calls for the same `num`.
 *
 * If a node cannot be parsed, has no kids, or names an object number below
 * 1, a message is printed and page_tree is left untouched.
 *
 * NOTE(review): an intermediate object that is neither a page nor a pages
 * node is still treated as a pages node; this is unchanged.
 */
void probe_ptree(int num)
{
    struct pages *pages;
    int type, a;

    pages = (struct pages *) get_object(num, NULL);
    if (!pages || !pages->kids)
        goto corrupted;
    pages->kids++;
    a = atoi(*pages->kids);
    if (a < 1)    /* e.g. the kids list is empty and we hit ']' */
        goto corrupted;
    pages = (struct pages *) get_object(a, &type);
    while (type != PAGE) {
        if (!pages || !pages->kids)
            goto corrupted;
        a = atoi(pages->kids[1]);
        if (a < 1)
            goto corrupted;
        pages = (struct pages *) get_object(a, &type);
    }
    page_tree[current_page] = a;
    return;

      corrupted:
    fprintf(stderr, "PDF page tree is corrupted!\n");
    return;
/*  Disabled draft of a full page-tree traversal, kept for reference:
    while(1) {
        if(**pages->kids == ']' && !pages->parent)
            break;
        if(**pages->kids == ']') {
            free(pages->old_kids);
            pages = (struct pages *) get_object(pages->parent,NULL);
        }
        num = atoi(*pages->kids);
        p = (struct pages *) get_object(num,&type);
        if(type == PAGE) {
            page_tree[current_page] = num;
            current_page++;
            pages->kids += 3;
            continue;
        }
        pages->kids += 3;
        pages = p;            
    }
    return;*/
}

/* Read the cross-reference information of the file open on fp, fill the
 * global object table (objects / obj_count) and the global trailer record.
 *
 * Starting at the offset returned by get_startxref(), each pass of the outer
 * loop handles one table: a header line "first count", the entry lines up to
 * the line starting with "trailer", and then the trailer lines, from which
 * the Root, Info, Prev and Encrypt values are taken.  If the trailer has a
 * non-zero Prev value, the loop continues with the table at that offset.
 *
 * If the first line read does not contain "xref", a message is printed and
 * the function returns without touching the object table.
 *
 * NOTE(review): fget() is a project helper not visible here (presumably an
 * fgets()-like line reader -- confirm); none of the read, seek, sscanf() or
 * realloc() results are checked.
 */
void parse_xref(void)
{
    char xref[XREFLINE], *p;
    int entry[2];    /* [0] current object number, [1] first + count */
    fseek(fp, get_startxref(), SEEK_SET);
    fget(xref, XREFLINE, fp);
    if (!strstr(xref, "xref")) {
        fprintf(stderr, "PDF document si corrupted!\n");
        /* TODO: try to reconstruct the table? */
        return;
    }
    objects = NULL;
    obj_count = 0;
    do {
        /* Header line: first object number and number of entries. */
        fget(xref, XREFLINE, fp);
        sscanf(xref, "%d %d", &entry[0], &entry[1]);
        entry[1] += entry[0];
        /* Numbering starts at 1 here; add_obj() stores at index num - 1. */
        if (entry[0] == 0)
            entry[0]++;
        if (entry[1] > obj_count) {
            /* NOTE(review): the element size is written as sizeof(char *)
             * although the table holds struct object pointers, and the
             * newly added slots are not initialised by realloc(). */
            objects = (struct object **) realloc(objects,
                                 entry[1] *
                                 sizeof(char
                                    *));
            obj_count = entry[1];
        }
        do {
            fget(xref, XREFLINE, fp);
            if (!strncmp(xref, "trailer", 7))
                break;
            /* Lines with 'f' at this position are skipped; note that the
             * object number is not advanced for them. */
            if (xref[XREFLINE - 3] == 'f')
                continue;
            add_obj(xref, entry[0]);
            entry[0]++;
        } while (1);
        {
            char *tline = (char *) xmalloc(3 * XREFLINE);
            fget(tline, 3 * XREFLINE, fp);
            trailer.prev = 0;
            /* Because the two tests are joined with &&, the loop ends as
             * soon as either of the first two characters is '>'. */
            while (tline[0] != '>' && tline[1] != '>') {
                if ((p = strstr(tline, "Root")) != NULL)
                    sscanf(p, "Root %d 0 R",
                           &trailer.root);
                if ((p = strstr(tline, "Info")) != NULL)
                    sscanf(p, "Info %d 0 R",
                           &trailer.info);
                if ((p = strstr(tline, "Prev")) != NULL)
                    sscanf(p, "Prev %ld",
                           &trailer.prev);
                if ((p = strstr(tline, "Encrypt")) != NULL)
                    sscanf(p,
                           "Encrypt %d 0 R",
                           &trailer.encrypt);
                fget(tline, 3 * XREFLINE, fp);
            }
            free(tline);
        }
        if (!trailer.prev)
            break;
        /* Continue with the earlier table; the line read here is replaced
         * by the header line read at the top of the next pass. */
        fseek(fp, trailer.prev, SEEK_SET);
        fget(xref, XREFLINE, fp);
    } while (1);
    return;
}

/* Return the object number of the page-tree root named by the document
 * catalog (the object referenced by trailer.root).
 *
 * Returns 0 and prints a message when the catalog cannot be obtained; 0 is
 * not a usable object number for get_object(), which indexes with num - 1.
 */
int parse_catalog(void)
{
    struct catalog *c;

    c = (struct catalog *) get_object(trailer.root, NULL);
    if (!c) {
        fprintf(stderr, "PDF document catalog is missing or corrupted!\n");
        return 0;
    }
    return c->pages;
}

/* Top-level driver: read the cross-reference data, locate the page-tree
 * root through the catalog, allocate page_tree with one int per page of the
 * root's count, locate the first page with probe_ptree(), and then enter
 * loop() with current_page reset to 0.
 *
 * If the catalog or the page-tree root is unusable, a message is printed and
 * the function returns without calling loop().
 */
void parse_pdf(void)
{
    int root;
    struct pages *pages;

    parse_xref();
    root = parse_catalog();
    if (root < 1) {
        fprintf(stderr, "PDF document has no page tree!\n");
        return;
    }
    pages = (struct pages *) get_object(root, NULL);
    if (!pages || pages->count < 1) {
        fprintf(stderr, "PDF document has no pages!\n");
        return;
    }
    /* One int per page: page_tree is indexed by page and stores ints. */
    page_tree = (int *) xcalloc(pages->count, sizeof(int));
    current_page = 0;
    /* The catalog is cached by get_object(), so `root` is still current
     * and does not need to be looked up a second time. */
    probe_ptree(root);
    current_page = 0;
    loop();
    return;
}

/* Build a catalog record from the text of a catalog dictionary.
 *
 * The object number following "Pages" is stored in the record's pages
 * field.  Returns a record allocated with xcalloc(), or NULL (with nothing
 * left allocated) when "Pages" is missing or is not followed by a number.
 */
struct catalog *fill_catalog(char *dictionary)
{
    struct catalog *catalog;
    char *p;

    p = strstr(dictionary, "Pages");
    if (!p)
        return NULL;
    /* Allocate only once the key is known to exist, so that the error
     * path above has nothing to release. */
    catalog = (struct catalog *) xcalloc(sizeof(struct catalog), 1);
    if (sscanf(p, "Pages %d 0 R", &catalog->pages) != 1) {
        free(catalog);
        return NULL;
    }
    return catalog;
}

/* Build a pages record from the text of a page-tree node dictionary.
 *
 * Stores the number following "Count", the token vector produced by array()
 * for the bracketed list following "Kids" (in both kids and old_kids), and,
 * when present, the object number following "Parent".
 *
 * Returns a record allocated with xcalloc(), or NULL (with nothing left
 * allocated) when "Count" or "Kids" is missing or "Kids" is not followed by
 * a '['.
 */
struct pages *fill_pages(char *dictionary)
{
    struct pages *pages;
    char *count_key, *kids, *p;

    count_key = strstr(dictionary, "Count");
    if (!count_key)
        return NULL;
    kids = strstr(dictionary, "Kids");
    if (!kids)
        return NULL;
    /* array() must be handed the '[' itself; allow any amount of white
     * space (including none) between the key and the bracket. */
    for (kids += 4;
         *kids == ' ' || *kids == '\t' || *kids == '\r' || *kids == '\n';
         kids++)
        ;
    if (*kids != '[')
        return NULL;

    pages = (struct pages *) xcalloc(sizeof(struct pages), 1);
    sscanf(count_key, "Count %d", &pages->count);
    pages->old_kids = pages->kids = array(kids);
    p = strstr(dictionary, "Parent");
    if (p)
        sscanf(p, "Parent %d 0 R", &pages->parent);
    return pages;
}

/* Build a page record from the text of a page dictionary.
 *
 * Contents and Resources are taken either from the dictionary itself (when
 * the key is followed by a separator and '<') or from the referenced object
 * via get_object().  MediaBox supplies four numbers, Parent an object
 * number, and Rotate (optional) a rotation value.
 *
 * Returns a record allocated with xcalloc(), or NULL when Contents,
 * Resources, MediaBox or Parent is missing, when a reference cannot be
 * parsed, or when MediaBox is not a bracketed list of at least four entries.
 * The record itself is released on every failure path; objects obtained
 * through get_object() are not released here.
 */
struct page *fill_page(char *dictionary)
{
    struct page *page;
    char *p, **mediabox;
    int ref, i;

    page = (struct page *) xcalloc(sizeof(struct page), 1);

    p = strstr(dictionary, "Contents");
    if (!p)
        goto fail;
    if (p[8] != '\0' && p[9] == '<') {    /* if /Contents << */
        page->contents = fill_contents(dictionary);
    } else {
        if (sscanf(p, "Contents %d 0 R", &ref) != 1)
            goto fail;
        page->contents = (struct contents *) get_object(ref, NULL);
    }

    p = strstr(dictionary, "Resources");
    if (!p)
        goto fail;
    if (p[9] != '\0' && p[10] == '<') {    /* if /Resources << */
        page->resources = fill_resources(dictionary);
    } else {
        if (sscanf(p, "Resources %d 0 R", &ref) != 1)
            goto fail;
        page->resources =
            (struct resources *) get_object(ref, NULL);
    }

    p = strstr(dictionary, "MediaBox");
    if (!p)
        goto fail;
    /* array() must be handed the '[' itself. */
    for (p += 8; *p == ' ' || *p == '\t' || *p == '\r' || *p == '\n'; p++)
        ;
    if (*p != '[')
        goto fail;
    mediabox = array(p);
    if (!mediabox)
        goto fail;
    /* Entry 0 is the '[' marker; entries 1..4 are the four numbers.  A
     * bracket marker among them means the list is too short. */
    for (i = 0; i < 4; i++) {
        char *tok = mediabox[i + 1];

        if (*tok == '[' || *tok == ']')
            break;
        page->mediabox[i] = (short) strtoul(tok, NULL, 10);
        free(tok);
    }
    free(mediabox);
    if (i < 4)
        goto fail;

    p = strstr(dictionary, "Parent");
    if (!p)
        goto fail;
    sscanf(p, "Parent %d 0 R", &page->parent);

    p = strstr(dictionary, "Rotate");
    if (p)
        /* p points at the 'R', so the format must not start with '/'. */
        sscanf(p, "Rotate %hd", &page->rotate);
    return page;

      fail:
    free(page);
    return NULL;
}

/* Build a contents record from the text of a stream object.
 *
 * The stream length is the number following "Length"; if "0 R" follows it
 * inside the dictionary, that number is an object number and the length is
 * read from the referenced object's text.  The filter code comes from
 * get_filternum() applied to the text 8 characters after the start of
 * "Filter".  The stream bytes are copied from the position
 * sizeof("stream") characters after the first "stream" in the text, i.e.
 * past the keyword and one following character.
 *
 * Returns a record allocated with xcalloc(), or NULL (with the record
 * released) when Length, Filter or the stream keyword is missing or the
 * length cannot be determined.
 *
 * NOTE(review): nothing here can verify that `dictionary` really holds
 * `length` bytes after the stream keyword; that depends on read_obj().
 */
struct contents *fill_contents(char *dictionary)
{
    struct contents *content;
    char *p, *data;

    content = (struct contents *) xcalloc(sizeof(struct contents), 1);

    p = strstr(dictionary, "Length");
    if (!p)
        goto fail;
    if (sscanf(p, "Length %d", &content->length) != 1)
        goto fail;
    /* Look for an indirect reference, but only up to the next key or the
     * end of the dictionary, never into the stream data. */
    for (p += 6; *p != '\0' && *p != '/' && *p != '>'; p++)
        if (!strncmp(p, "0 R", 3)) {
            p = (char *) get_object(content->length, NULL);
            if (!p || sscanf(p, "%d\nendobj", &content->length) != 1)
                goto fail;
            break;
        }
    if (content->length < 0)
        goto fail;

    p = strstr(dictionary, "Filter");
    if (!p)
        goto fail;
    /* The filter name is read at p + 8; make sure the text reaches it. */
    if (p[6] == '\0' || p[7] == '\0' || p[8] == '\0')
        goto fail;
    content->filter = get_filternum(&p[8]);

    /* Test the search result before doing arithmetic on it. */
    data = strstr(dictionary, "stream");
    if (!data)
        goto fail;
    data += sizeof("stream");

    content->stream = (unsigned char *) xcalloc(content->length, 1);
    memcpy(content->stream, data, content->length);
    return content;

      fail:
    free(content);
    return NULL;
}

/* Return a new, zero-filled resources record obtained from xcalloc().
 * The `dictionary` text is not examined.
 */
struct resources *fill_resources(char *dictionary)
{
    return (struct resources *) xcalloc(sizeof(struct resources), 1);
}

Platon Group <platon@platon.sk> http://platon.sk/
Copyright © 2002-2006 Platon Group
Stránka používa redakčný systém Metafox
Na začiatok