forked from rsc/pdf
-
Notifications
You must be signed in to change notification settings - Fork 150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More flexible logic for libtiff #25
Open
lzambarda
wants to merge
6
commits into
ledongthuc:master
Choose a base branch
from
lzambarda:master
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
7dfab7e
fix(read): correctly use logic operator in header condition
lzambarda a962e84
refactor(read): make header validation more flexible
lzambarda 0468b66
chore: add go.mod
lzambarda 7b000bd
refactor(read): make EOF and startxref read more flexible
lzambarda e3c03b5
fix(page): correctly handle panic in GetTextByRow
lzambarda 63ec7d0
docs: add useful references in readme
lzambarda File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
module github.com/ledongthuc/pdf | ||
|
||
go 1.17 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,6 +72,7 @@ import ( | |
"io" | ||
"io/ioutil" | ||
"os" | ||
"regexp" | ||
"sort" | ||
"strconv" | ||
) | ||
|
@@ -122,26 +123,33 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) { | |
return NewReaderEncrypted(f, size, nil) | ||
} | ||
|
||
// headerRegexp is used to check the validity of the header line of a PDF.
// It accepts versions 1.0 through 1.7, optionally followed by blanks (space,
// tab or form feed, as inserted by libtiff/tiff2pdf), and terminated by any
// end-of-line marker: LF, CRLF or a lone CR. Only the start of the input is
// inspected, so whatever follows the first end-of-line byte is ignored.
var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7][ \t\f]*[\r\n]`)
|
||
// NewReaderEncrypted opens a file for reading, using the data in f with the given total size. | ||
// If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords | ||
// to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt | ||
// the file and returns an error. | ||
func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) { | ||
buf := make([]byte, 10) | ||
const headerLen = 11 | ||
buf := make([]byte, 11) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use headerLen here for the buffer size, otherwise delete the const? |
||
f.ReadAt(buf, 0) | ||
if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' { | ||
if !headerRegexp.Match(buf) { | ||
return nil, fmt.Errorf("not a PDF file: invalid header") | ||
} | ||
end := size | ||
const endChunk = 100 | ||
// https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file | ||
const endChunk = 1024 | ||
buf = make([]byte, endChunk) | ||
f.ReadAt(buf, end-endChunk) | ||
for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' { | ||
buf = buf[:len(buf)-1] | ||
_, err := f.ReadAt(buf, end-endChunk) | ||
if err != nil { | ||
return nil, err | ||
} | ||
buf = bytes.TrimRight(buf, "\r\n\t ") | ||
if !bytes.HasSuffix(buf, []byte("%%EOF")) { | ||
return nil, fmt.Errorf("not a PDF file: missing %%%%EOF") | ||
const eof = "%%EOF" | ||
if findLastLine(buf, eof) < 0 { | ||
return nil, fmt.Errorf("not a PDF file: missing %s", eof) | ||
} | ||
i := findLastLine(buf, "startxref") | ||
if i < 0 { | ||
|
@@ -430,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) { | |
return table, nil | ||
} | ||
|
||
// findLastLine looks for the last index of s in the given buffer. The search | ||
// term must be alone in the line (surrounded by newlines). | ||
func findLastLine(buf []byte, s string) int { | ||
bs := []byte(s) | ||
max := len(buf) | ||
|
@@ -438,7 +448,14 @@ func findLastLine(buf []byte, s string) int { | |
if i <= 0 || i+len(bs) >= len(buf) { | ||
return -1 | ||
} | ||
if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') { | ||
if buf[i-1] == '\n' || buf[i-1] == '\r' { | ||
return i | ||
} | ||
if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' { | ||
return i | ||
} | ||
// libtiff/tiff2pdf can add an extra space before the newline | ||
if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' { | ||
return i | ||
} | ||
max = i | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
package pdf | ||
|
||
import ( | ||
"testing" | ||
) | ||
|
||
func TestRead(t *testing.T) { | ||
t.Run("HeaderValidation", testHeaderValidation) | ||
} | ||
|
||
func testHeaderValidation(t *testing.T) { | ||
tscs := map[string]struct { | ||
input []byte | ||
expectedValid bool | ||
}{ | ||
"nil": { | ||
input: nil, | ||
expectedValid: false, | ||
}, | ||
"empty": { | ||
input: []byte{}, | ||
expectedValid: false, | ||
}, | ||
"missing LF": { | ||
input: []byte{37, 80, 68, 70, 45, 49, 46, 55}, | ||
expectedValid: false, | ||
}, | ||
"ok LF": { | ||
input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 10}, | ||
expectedValid: true, | ||
}, | ||
"invalid version 1.8": { | ||
input: []byte{37, 80, 68, 70, 45, 49, 46, 58, 10}, | ||
expectedValid: false, | ||
}, | ||
"ok CRLF": { | ||
input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 13, 10}, | ||
expectedValid: true, | ||
}, | ||
"ok space + CRLF": { | ||
input: []byte{37, 80, 68, 70, 45, 49, 46, 55, 32, 13, 10}, | ||
expectedValid: true, | ||
}, | ||
} | ||
for name, data := range tscs { | ||
data := data | ||
t.Run(name, func(t *testing.T) { | ||
gotValid := headerRegexp.Match(data.input) | ||
if gotValid != data.expectedValid { | ||
t.Errorf("expected %t, got %t", data.expectedValid, gotValid) | ||
} | ||
}) | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think `%PDF-1.7\r` (a header terminated by a lone CR) is a valid case as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In my tests I changed this to
^%PDF-1\.[0-7][ ]*[\r\n]+
I need to check the spec again.