Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More flexible logic for libtiff #25

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,11 @@ func readPdf(path string) (string, error) {

## Demo
![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)

## References

Useful references on how the PDF file format is structured:

* https://web.archive.org/web/20210128014024/https://www.adobe.com/content/dam/acom/en/devnet/pdf/PDF32000_2008.pdf
* https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
* https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art019
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/ledongthuc/pdf

go 1.17
5 changes: 2 additions & 3 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -643,9 +643,8 @@ type Row struct {
type Rows []*Row

// GetTextByRow returns the page's all text grouped by rows
func (p Page) GetTextByRow() (Rows, error) {
result := Rows{}
var err error
func (p Page) GetTextByRow() (result Rows, err error) {
result = Rows{}

defer func() {
if r := recover(); r != nil {
Expand Down
37 changes: 27 additions & 10 deletions read.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ import (
"io"
"io/ioutil"
"os"
"regexp"
"sort"
"strconv"
)
Expand Down Expand Up @@ -122,26 +123,33 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
return NewReaderEncrypted(f, size, nil)
}

// headerRegexp is used to check the validity of the header line of a PDF.
// It accepts versions 1.0 through 1.7, optional whitespace between the
// version and the end of the line (as inserted by libtiff/tiff2pdf), and a
// line terminated by CR, LF or CRLF. A lone CR is a valid PDF end-of-line
// marker, so the terminator is a single CR or LF rather than a mandatory LF.
var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*[\r\n]`)
Copy link

@romanpickl romanpickl Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think %PDF-1.7\r
is a valid case as well.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in my tests i changed this to
^%PDF-1\.[0-7][ ]*[\r\n]+
need to check the spec again.


// NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
// If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords
// to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
// the file and returns an error.
func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
buf := make([]byte, 10)
const headerLen = 11
buf := make([]byte, 11)
Copy link

@romanpickl romanpickl Mar 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use headerLen otherwise delete const?

f.ReadAt(buf, 0)
if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' {
if !headerRegexp.Match(buf) {
return nil, fmt.Errorf("not a PDF file: invalid header")
}
end := size
const endChunk = 100
// https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file
const endChunk = 1024
buf = make([]byte, endChunk)
f.ReadAt(buf, end-endChunk)
for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' {
buf = buf[:len(buf)-1]
_, err := f.ReadAt(buf, end-endChunk)
if err != nil {
return nil, err
}
buf = bytes.TrimRight(buf, "\r\n\t ")
if !bytes.HasSuffix(buf, []byte("%%EOF")) {
return nil, fmt.Errorf("not a PDF file: missing %%%%EOF")
const eof = "%%EOF"
if findLastLine(buf, eof) < 0 {
return nil, fmt.Errorf("not a PDF file: missing %s", eof)
}
i := findLastLine(buf, "startxref")
if i < 0 {
Expand Down Expand Up @@ -430,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) {
return table, nil
}

// findLastLine looks for the last index of s in the given buffer. The search
// term must be alone in the line (surrounded by newlines).
func findLastLine(buf []byte, s string) int {
bs := []byte(s)
max := len(buf)
Expand All @@ -438,7 +448,14 @@ func findLastLine(buf []byte, s string) int {
if i <= 0 || i+len(bs) >= len(buf) {
return -1
}
if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') {
if buf[i-1] == '\n' || buf[i-1] == '\r' {
return i
}
if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' {
return i
}
// libtiff/tiff2pdf can add an extra space before the newline
if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' {
return i
}
max = i
Expand Down
54 changes: 54 additions & 0 deletions read_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package pdf

import (
"testing"
)

// TestRead is the entry point for the reader tests. Each concern is
// registered in the table below and executed as a named subtest.
func TestRead(t *testing.T) {
	subtests := []struct {
		name string
		run  func(*testing.T)
	}{
		{"HeaderValidation", testHeaderValidation},
	}
	for _, st := range subtests {
		t.Run(st.name, st.run)
	}
}

// testHeaderValidation runs headerRegexp against a table of candidate PDF
// header lines and checks whether each one is accepted or rejected.
//
// Inputs are written as string literals so that every case visibly matches
// its name; the previous raw byte lists hid a wrong byte in the
// "invalid version 1.8" case (58 is ':', not '8').
func testHeaderValidation(t *testing.T) {
	tscs := map[string]struct {
		input         []byte
		expectedValid bool
	}{
		"nil": {
			input:         nil,
			expectedValid: false,
		},
		"empty": {
			input:         []byte{},
			expectedValid: false,
		},
		"missing LF": {
			input:         []byte("%PDF-1.7"),
			expectedValid: false,
		},
		"ok LF": {
			input:         []byte("%PDF-1.7\n"),
			expectedValid: true,
		},
		"invalid version 1.8": {
			input:         []byte("%PDF-1.8\n"),
			expectedValid: false,
		},
		"ok CRLF": {
			input:         []byte("%PDF-1.7\r\n"),
			expectedValid: true,
		},
		"ok space + CRLF": {
			input:         []byte("%PDF-1.7 \r\n"),
			expectedValid: true,
		},
	}
	for name, data := range tscs {
		// Copy the loop variable: the module targets a Go version in which
		// closures would otherwise share a single iteration variable.
		data := data
		t.Run(name, func(t *testing.T) {
			gotValid := headerRegexp.Match(data.input)
			if gotValid != data.expectedValid {
				t.Errorf("expected %t, got %t", data.expectedValid, gotValid)
			}
		})
	}
}