Skip to content

Commit

Permalink
#292 PDFs take long time to parse
Browse files Browse the repository at this point in the history
  • Loading branch information
JamesHeinrich committed May 9, 2021
1 parent 422485c commit a5f31b3
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 25 deletions.
2 changes: 1 addition & 1 deletion getid3/getid3.php
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ class getID3
*/
protected $startup_warning = '';

const VERSION = '1.9.20-202104081001';
const VERSION = '1.9.20-202105091633';
const FREAD_BUFFER_SIZE = 32768;

const ATTACHMENTS_NONE = false;
Expand Down
53 changes: 29 additions & 24 deletions getid3/module.misc.pdf.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ class getid3_pdf extends getid3_handler
*/
public function Analyze() {
$info = &$this->getid3->info;

$this->fseek(0);
if (preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
$info['pdf']['header']['version'] = floatval($matches[1]);
Expand Down Expand Up @@ -67,39 +66,45 @@ public function Analyze() {
}
}
}

asort($info['pdf']['xref']['offset']);
$maxObjLengths = array();
$prevOffset = 0;
$prevObjNum = 0;
foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
// walk through all listed offsets to calculate the maximum possible length for each known object
if ($prevObjNum) {
$maxObjLengths[$prevObjNum] = $offset - $prevOffset;
}
$prevOffset = $offset;
$prevObjNum = $objectNumber;
}
ksort($maxObjLengths);
foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
if ($info['pdf']['xref']['entry'][$objectNumber] == 'f') {
// "free" object means "deleted", ignore
continue;
}
$this->fseek($offset);
$line = rtrim($this->fgets());
if (preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
if (strlen($line) > strlen($matches[0])) {
// object header line not actually on its own line, rewind file pointer to start reading data
$this->fseek($offset + strlen($matches[0]));
}
$objectData = '';
while (!$this->feof()) {
$line = $this->fgets();
if (rtrim($line) == 'endobj') {
break;
}
$objectData .= $line;
}
if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
$info['pdf']['pages'] = (int) $matches[1];
break; // for now this is the only data we're looking for in the PDF not need to loop through every object in the file (and a large PDF may contain MANY objects). And it MAY be possible that there are other objects elsewhere in the file that define additional (or removed?) pages
if (($maxObjLengths[$objectNumber] > 0) && ($maxObjLengths[$objectNumber] < $this->getid3->option_fread_buffer_size)) {
// ignore objects that are zero-size or >32kB, they are unlikely to contain the information we're interested in
$this->fseek($offset);
$objBlob = $this->fread($maxObjLengths[$objectNumber]);
if (preg_match('#^'.$objectNumber.'[\\x00 \\r\\n\\t]*([0-9]+)[\\x00 \\r\\n\\t]*obj[\\x00 \\r\\n\\t]*(.*)(endobj)?[\\x00 \\r\\n\\t]*$#s', $objBlob, $matches)) {
list($dummy, $generation, $objectData) = $matches;
if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
$info['pdf']['pages'] = (int) $matches[1];
break; // for now this is the only data we're looking for in the PDF, so there is no need to loop through every object in the file (and a large PDF may contain MANY objects). It MAY be possible that there are other objects elsewhere in the file that define additional (or removed?) pages
}
}
} else {
$this->error('Unexpected structure "'.substr($objBlob, 0, 100).'" at offset '.$offset);
break;
}
} else {
$this->error('Unexpected structure "'.$line.'" at offset '.$offset);
break;
}
}
if (!$this->returnXREF) {
unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry'], $info['pdf']['xref']['xref_offsets']);
}

} else {
Expand Down

0 comments on commit a5f31b3

Please sign in to comment.