% Copyright (C) 2001-2018 Artifex Software, Inc.
% All Rights Reserved.
%
% This software is provided AS-IS with no warranty, either express or
% implied.
%
% This software is distributed under license and may not be copied,
% modified or distributed except as expressly authorized under the terms
% of the license contained in the file LICENSE in this distribution.
%
% Refer to licensing information at http://www.artifex.com or contact
% Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato,
% CA 94945, U.S.A., +1(415)492-9861, for further information.
%
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)
% This module contains routines that are used if we detect an error
% while reading the xref tables. These routines will scan the file and
% build an xref table by finding the objects. We also need to find the
% appropriate trailer dictionary. Note: One procedure is also used
% even if we do not need to rebuild a PDF file.
%
% This module cannot rebuild a PDF file which has had errors created inside
% of objects or binary data streams. It often succeeds with files that
% have had its end of lines converted between unix and dos versions.
% if true --> we have an object with duplicate object and generation numbers.
/dup_obj_gen_num //false def
% Note: This procedure is also used by non-rebuild code.
% Store a line in the xref array (Actually Objects and Generations arrays)
% <obj num> (strm num> <obj loc> <gen num> <rebuild>
% setxrefentry <obj num> strm num> <obj loc> <gen num>
/setxrefentry
{
5 1 roll
dup 65535 or 65535 ne {
( **** Error: Generation number out of 0..65535 range, assuming 0.\n)
pdfformaterror
( Output may be incorrect.\n) pdfformaterror
pop 0
} if
% We store generation numbers as value + 1
% We reserve 0 to indicate an free xref entry
1 add % increment generation number
% To save space, generations numbers are stored in a string unless we
% find a generation number greater than 255. If so then transfer to
% an array.
dup 255 gt {
Generations type /stringtype eq { % Convert Generations to an array.
Generations length array dup % Create new array
0 1 2 index length 1 sub { % Copy from old string to new array
Generations 1 index get put dup
} for
pop
/Generations exch store % Save new Generations array
} if
} if
% Verify that the new values are for a new object. If the current
% entry is null then we have a new entry.
Objects 4 index get //null eq {
ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
Objects 4 index 3 index cvx put % Save object location
Generations 4 index 2 index put % Save geenration number
} {
% Verify that the new entry has at least as high a generaton number
% We accept equal entry number because we have found PDF files in
% which there are multiple objects with the same object and entry
% numbers. The normal xref logic only accepts the first such
% entry that it finds. However the 'rebuild PDF' logic can find
% both such entries. The correct one is usually the last one.
Generations 4 index get 1 index le {
%% Check if the object we already found was at the locaton specified
%% in the original xref (now stored in Orig_Objects). If so, prefer
%% that offset, otherwise prefer the later object.
%% NB check first to see that the object number is in the range of the original
%% xref. If it isn't, set the 'original' Offset to 0.
3 index Orig_Objects length le {Orig_Objects 4 index get}{0}ifelse
Objects 5 index get eq not
{
ObjectStream 4 index 4 index cvx put % Save ObjectStream object number
Objects 4 index 3 index cvx put % Save object location
Generations 4 index 2 index put % Save geenration number
} if
} if
% Set error flag if we have equal object and generation numbers
Generations 4 index get 1 index eq { /dup_obj_gen_num //true def } if
} 8 -1 roll { ifelse } { pop if } ifelse % Run 'else' only when rebuilding.
} bind executeonly def
% Print the contents of the xref array. This actually consists of three
% arrays (Objects, Generations, and ObjectStream).
/print_xref % - print_xref -
{ 0 1 Objects length 1 sub % stack: 0 1 <number of objects - 1>
{ dup =only % print object number
( ) print
dup Generations exch get 1 sub =only % print Generation number
( ) print
dup ObjectStream exch get ==only % print ObjectStream object number
( ) print
Objects exch get === % print object location
} for
flush
} bind executeonly def
% Get token from string and check its type
% <string> <type> typed_token <false> % no token or not match
% <string> <type> typed_token <obj> <last> <true> % matching token type
% Where last is the string remainder
/typed_token
{ exch
token_nofail % get token
{
dup type % stack: type last token type
4 -1 roll eq { % stack: last token bool
exch //true % desired object found - set exit status
} {
pop pop //false % not type - clear stack, set exit status
} ifelse
} {
pop //false % no token - pop type, set exit status
} ifelse % check if we got token
} bind executeonly def
% Allocate space for post_eof_count to be bound into procedures below.
/post_eof_count 0 def
% We want the location of the trailer dictionary at the start of file.
% First we will find the xref. Then we will skip over the xref entries
% to the trailer.
/search_start_trailer % - search_start_trailer <trailer loc>
{ % Read the first 300 bytes and check for xref
PDFfile 0 setfileposition
PDFfile bytesavailable post_eof_count sub % location of end of data
300 .min % block size to read
dup string 0 1 4 -1 roll 1 sub
{ 2 copy PDFfile read pop put pop } for
(xref) search {
% found 'xref'
exch pop exch pop length 4 add PDFfile exch setfileposition
PDFfile token pop % get starting entry - or 'trailer'
(trailer) ne { % if we do not already have 'trailer'
PDFfile token pop % get number of entries
PDFfile token pop pop % this moves us into the middle of the first entry
25 string exch % define working string for readline
{ PDFfile 1 index readline pop pop
} repeat % skip entries
pop % pop working string
PDFfile token pop pop % get 'trailer'
PDFfile fileposition % get file position
} if
} {
pop 0 % no xref, should not happen, report it upstrem
} ifelse
} bind executeonly def
%% Searches backwards from a specified point looking for a 'trailer' keyword.
%% position search_earlier_trailer position or 0
%%
%% Its is just possible that a 'trailer' keyword could straddle a buffer, in which case
%% we wouldn't find it. Given that this only executes for broken files anyway I don't
%% propose to worry about it at the moment, if anyone ever turns up an example we may
%% choose to enhance this routine further.
%%
/search_earlier_trailer {
{ % position
dup 0 gt { % position bool
dup 65535 .min exch % block_size position
1 index sub % block_size position-block_size
dup % block_size new_position new_position
PDFfile exch setfileposition % block_size new position
exch dup % new position block_size block_size
dup string 0 1 4 -1 roll 1 sub %
{2 copy PDFfile read pop put pop } for %
% new_position block size (...string from file....)
(trailer) search {
pop
{ search not { exit } if pop } loop
% determine where the trailer is in the file
% trailer loc = end loc - remaing string length
length % new_position block size string length
3 1 roll % string length new_position block size
add exch sub % string length - (new_position + block size)
} {
pop % discard old block size
pop 0
} ifelse
} {
pop 0 exit
}ifelse
%% We either have a position for a trailer, or 0 if we failed to find one
dup 0 eq not {
exit
}if
pop % renove the zero leaving the new start position
} loop
} bind executeonly def
% We want the location of the trailer dictionary at the end of file.
% We will read the last block of data and search for the final occurance
% of the word 'trailer'
/search_end_trailer % - search_end_trailer <trailer loc>
{ % Position to read block of data from the end of the file. Note: We ignore
% anything past the last %%EOF since this is not PDF data.
PDFfile 0 setfileposition
PDFfile bytesavailable post_eof_count sub % location of end of data
dup 65535 .min % block size to read
% stack: <file end pos> <block size>
% move file position to the start of the block
2 copy sub PDFfile exch setfileposition
% read block of data
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
% search for last occurance of 'trailer'
(trailer) search {
pop
{ search not { exit } if pop } loop
% determine where the trailer is in the file
% trailer loc = end loc - remaing string length
length sub
} {
pop pop 0
} ifelse
} bind executeonly def
% We want to find the trailer dictionary. There is a trailer dictionary
% for each xref object list. We only want the trailer dictionary associated
% with the first xref object list. In theory this can be anywhere in the
% file. However since we are trying to repair a broken file, we cannot simply
% follow the xref links. So we are falling back to a simple strategy. We
% find the specified location of the first xref list. If its location is in
% the first half of the file then we search for the first trailer dictionary
% at the start of the file. Otherwise we search for the last trailer at the
% end of the file.
/search_trailer % - search_trailer -
{ % Find the 'startxref' and associated position at the end of the file.
% Position to read block of data from the end of the file. Note: We
% actually end at the end of the last %%EOF since this is the end of the
% useful PDF data. (Some files contain trailing garbage.)
PDFfile 0 setfileposition
PDFfile bytesavailable % size of file
post_eof_count sub dup % location of end of last %%EOF
dup 4096 .min % block size to read
% stack: <useful file size> <useful file size file> <block size>
% move file position to the start of the block
2 copy sub PDFfile exch setfileposition
% read block of data
dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
% search for last occurance of 'startxref'
//false % Assume that startxref not present
exch (startxref) {
search not { exit } if % Exit loop when no more startxref's
pop 3 -1 roll pop //true 3 1 roll % Idicate that we have found starxref
} loop
exch % Exch last string and 'found' flag
{
% determine where the startxref is in the file
% 'startxref' loc = end loc - remaing string length - 9 bytes
length sub 9 sub
% move the file to this position and read startxref and position
PDFfile exch setfileposition PDFfile token
pop pop PDFfile token_no_close pop
dup type /integertype eq not {
pop
% startxref not followed by integer. We will search the end of the file for trailer.
PDFfilelen
} if
} {
% startxref not found. We will search the end of the file for trailer.
pop pop PDFfilelen
} ifelse
% compare xref position to 1/2 the length of the file and search for trailer
exch 2 div lt {
search_start_trailer dup 0 eq { pop search_end_trailer } if
} {
search_end_trailer dup 0 eq { pop search_start_trailer } if
} ifelse
dup 0 eq {
pop
( **** Error: Trailer dictionary not found.\n) pdfformaterror
( Output may be incorrect.\n) pdfformaterror
}{
% get the trailer
dup
PDFfile exch setfileposition % set to the specified trailer location
/dictlevelcount 0 def
PDFfile traileropdict .pdfrun % read trailer info
{
dup /Root known not {
( **** Warning: This trailer dictionary does not contain a /Root entry\n searching for a prior trailer.\n) pdfformatwarning
%% remove trailer dict copy, duplicate file position, then remove the length of 'trailer' so we don't
%% find the same one again....
pop dup 7 sub
search_earlier_trailer
}{exit}ifelse
dup 0 eq {
pop
( **** Error: Valid trailer not found.\n) pdfformaterror
( Output may be incorrect.\n) pdfformaterror
}{
PDFfile exch setfileposition % set to the specified trailer location
/dictlevelcount 0 def
PDFfile traileropdict .pdfrun % read trailer info
} ifelse
} loop
/Trailer exch def
pop
} ifelse
} bind executeonly def
% This routine will determine if there is stuff after the %%EOF. There is
% supposed to be only a line termination. However many real life files
% contain some garbage. This routine checks how much. We then ignore this
% stuff when we are scanning for objects.
/determine_post_eof_count % - determine_post_eof_count <count>
{ % Position to read block of data from the end of the file.
PDFfilelen % size of file
dup 4096 .min % file_size block_size
dup 3 1 roll sub % block_size file_size-block_size
PDFfile exch setfileposition % block_size
string PDFfile exch readstring pop % ()
% search for last occurance of 'startxref', '%%EOF' is often damaged
(startxref) search {
pop
{ search not { exit } if pop
} loop
% how much is left = remaining string length
% Now search for %%EO or try to read a number after 'startxref'.
(%%EO) search {
pop pop
} {
% Look for a number after startxref
{ dup token { pop exch pop } if
} stopped pop
} ifelse
length
} {
% Can't even find startxref, assume it's all objects
pop 0
} ifelse
} bind executeonly def
% This routine will scan a file searaching for object locations to build
% an alternate version of the data in the xref tables.
% Its purpose is to provide a basis for an xref fixing facility.
/search_objects % - search_objects -
{ % Initialize the Objects, Generations, etc. arrays
Objects dup length array copy /Orig_Objects exch def
initPDFobjects
% reset duplicate object and generation numbers error flag
/dup_obj_gen_num //false def
% Determine how many bytes are in the file after the final %%EOF
/post_eof_count determine_post_eof_count def
% Start at the beginning of the file
PDFfile 0 setfileposition
% Create a working string (and also store its length on stack). We are
% using a maximum size string size the logic below wants a recovered object
% to fit into our working string.
65535 dup string
{ % Now loop through the entire file looking for objects
PDFfile fileposition % save current file position
% When we get near the end of the file, we use a smaller interval of
% our working string to prevent reading past the end. (See comments on
% EOF testing below.)
PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
2 index 0 3 -1 roll getinterval % near EOF, use interval of string
} {
pop 1 index % not near end, use full working string
} ifelse
% Read a line from file. If the line does not fit into our working string,
% or any other error, then we will discard it.
PDFfile exch { readline } .internalstopped
{ pop pop //false } if % indicate no string if we stopped
{ % stack: <length> <working_str> <loc> <string>
% Now that we have line, get obj num, ref num, and 'obj'. Verify that each
% of these is correct type.
dup (obj) search { % preliminary check for obj
pop pop pop
/integertype typed_token { % get obj number
/integertype typed_token { % get ref number
/nametype typed_token { % get 'obj' text
pop % pop remaining string
/obj eq { % verify name is 'obj'
% make sure we have room in the arrays. We work in increments
% of 20 each time we increase the size.
1 index 20 add 20 idiv 20 mul
growPDFobjects
% save xref parameters into ObjectStream, Objects and Generations
1 index 0 % rearrange parms for setxrefentry
4 index PDFoffset sub 3 index
//true setxrefentry % save parameters
pop pop pop pop % clear parameters
} if % check if name is 'obj'
} if % check if we got 'obj" string
pop % remove ref number
} if % check if we got ref number
pop % remove obj number
} if % check if we got object number
} {
pop pop
} ifelse
} if % check if got a string from readline
pop % remove location
% Check if we are approaching the end of the file. We do not want to
% read past the end of the file since that closes it. We actually stop
% 10-20 bytes early since there cannot be an object that close to the end.
% (There is a Trailer dictionary, etc. at the end of the file.)
PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
} loop % loop through the entire file
pop pop % remove working string and its length
% Output warning if we have two objects with the same object and generation
% numbers.
dup_obj_gen_num {
( **** Warning: There are objects with matching object and generation\n)
pdfformatwarning
( **** numbers. The output may be incorrect.\n)
pdfformatwarning
} if
currentdict /Orig_Objects undef
} bind executeonly def
% Print warning message because we found a problem while reading the xref
% tables
/print_xref_warning
{ ( **** Error: An error occurred while reading an XREF table.\n)
pdfformaterror
( **** The file has been damaged. This may have been caused\n)
pdfformaterror
( **** by a problem while converting or transfering the file.\n)
pdfformaterror
( **** Ghostscript will attempt to recover the data.\n)
pdfformaterror
( **** However, the output may be incorrect.\n) pdfformaterror
} bind executeonly def
% Attempt to recover the XRef data. This is called if we have a failure
% while reading the normal XRef tables. This routine usually works
% only for pre PDF1.5 versions of PDF files.
/recover_xref_data % - recover_xref_data -
{ print_xref_warning % Print warning message
count pdfemptycount sub { pop } repeat % remove anything left by readxref
search_objects % Search for objects
} bind executeonly def