ledongthuc · lzambarda · Mar 15, 2022 · Mar 15, 2022 · Mar 15, 2022 · Mar 15, 2022
diff --git a/README.md b/README.md
@@ -136,3 +136,11 @@ func readPdf(path string) (string, error) {
 
 ## Demo
 ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif)
+
+## References
+
+Useful references on how the PDF file format is structured:
+
+* https://web.archive.org/web/20210128014024/https://www.adobe.com/content/dam/acom/en/devnet/pdf/PDF32000_2008.pdf
+* https://www.oreilly.com/library/view/pdf-explained/9781449321581/ch04.html
+* https://commandlinefanatic.com/cgi-bin/showarticle.cgi?article=art019
diff --git a/go.mod b/go.mod
@@ -0,0 +1,3 @@
+module github.com/ledongthuc/pdf
+
+go 1.17
diff --git a/page.go b/page.go
@@ -643,9 +643,8 @@ type Row struct {
 type Rows []*Row
 
 // GetTextByRow returns the page's all text grouped by rows
-func (p Page) GetTextByRow() (Rows, error) {
-	result := Rows{}
-	var err error
+func (p Page) GetTextByRow() (result Rows, err error) {
+	result = Rows{}
 
 	defer func() {
 		if r := recover(); r != nil {

diff --git a/read.go b/read.go
@@ -72,6 +72,7 @@ import (
 	"io"
 	"io/ioutil"
 	"os"
+	"regexp"
 	"sort"
 	"strconv"
 )
@@ -122,26 +123,33 @@ func NewReader(f io.ReaderAt, size int64) (*Reader, error) {
 	return NewReaderEncrypted(f, size, nil)
 }
 
+// headerRegexp validates the header line of a PDF: "%PDF-1." followed by a
+// minor version 0-7. Whitespace before the line ending is tolerated (libtiff/
+// tiff2pdf emits an extra space), and the line may end in either CRLF or LF.
+var headerRegexp = regexp.MustCompile(`^%PDF-1\.[0-7]\s*\r?\n`)
+
 // NewReaderEncrypted opens a file for reading, using the data in f with the given total size.
 // If the PDF is encrypted, NewReaderEncrypted calls pw repeatedly to obtain passwords
 // to try. If pw returns the empty string, NewReaderEncrypted stops trying to decrypt
 // the file and returns an error.
 func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, error) {
-	buf := make([]byte, 10)
+	const headerLen = 11
+	buf := make([]byte, 11)
 	f.ReadAt(buf, 0)
-	if !bytes.HasPrefix(buf, []byte("%PDF-1.")) || buf[7] < '0' || buf[7] > '7' || buf[8] != '\r' && buf[8] != '\n' {
+	if !headerRegexp.Match(buf) {
 		return nil, fmt.Errorf("not a PDF file: invalid header")
 	}
 	end := size
-	const endChunk = 100
+	// https://stackoverflow.com/questions/11896858/does-the-eof-in-a-pdf-have-to-appear-within-the-last-1024-bytes-of-the-file
+	const endChunk = 1024
 	buf = make([]byte, endChunk)
-	f.ReadAt(buf, end-endChunk)
-	for len(buf) > 0 && buf[len(buf)-1] == '\n' || buf[len(buf)-1] == '\r' {
-		buf = buf[:len(buf)-1]
+	_, err := f.ReadAt(buf, end-endChunk)
+	if err != nil {
+		return nil, err
 	}
-	buf = bytes.TrimRight(buf, "\r\n\t ")
-	if !bytes.HasSuffix(buf, []byte("%%EOF")) {
-		return nil, fmt.Errorf("not a PDF file: missing %%%%EOF")
+	const eof = "%%EOF"
+	if findLastLine(buf, eof) < 0 {
+		return nil, fmt.Errorf("not a PDF file: missing %s", eof)
 	}
 	i := findLastLine(buf, "startxref")
 	if i < 0 {
@@ -430,6 +438,8 @@ func readXrefTableData(b *buffer, table []xref) ([]xref, error) {
 	return table, nil
 }
 
+// findLastLine looks for the last index of s in the given buffer. The search
+// term must be alone in the line (surrounded by newlines).
 func findLastLine(buf []byte, s string) int {
 	bs := []byte(s)
 	max := len(buf)
@@ -438,7 +448,14 @@ func findLastLine(buf []byte, s string) int {
 		if i <= 0 || i+len(bs) >= len(buf) {
 			return -1
 		}
-		if (buf[i-1] == '\n' || buf[i-1] == '\r') && (buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r') {
+		if buf[i-1] == '\n' || buf[i-1] == '\r' {
+			return i
+		}
+		if buf[i+len(bs)] == '\n' || buf[i+len(bs)] == '\r' {
+			return i
+		}
+		// libtiff/tiff2pdf can add an extra space before the newline
+		if buf[i+len(bs)] == ' ' || buf[i+len(bs)+1] == '\n' || buf[i+len(bs)+1] == '\r' {
 			return i
 		}
 		max = i

diff --git a/read_test.go b/read_test.go
@@ -0,0 +1,54 @@
+package pdf
+
+import (
+	"testing"
+)
+
+func TestRead(t *testing.T) {
+	t.Run("HeaderValidation", testHeaderValidation) // table-driven checks of headerRegexp
+}
+
+// testHeaderValidation checks headerRegexp against well-formed and malformed PDF header lines.
+func testHeaderValidation(t *testing.T) {
+	tscs := map[string]struct {
+		input         []byte
+		expectedValid bool
+	}{
+		"nil": {
+			input:         nil,
+			expectedValid: false,
+		},
+		"empty": {
+			input:         []byte{},
+			expectedValid: false,
+		},
+		"missing LF": {
+			input:         []byte("%PDF-1.7"),
+			expectedValid: false,
+		},
+		"ok LF": {
+			input:         []byte("%PDF-1.7\n"),
+			expectedValid: true,
+		},
+		"invalid version 1.8": {
+			input:         []byte("%PDF-1.8\n"), // '8' is one past the highest accepted minor version
+			expectedValid: false,
+		},
+		"ok CRLF": {
+			input:         []byte("%PDF-1.7\r\n"),
+			expectedValid: true,
+		},
+		"ok space + CRLF": {
+			input:         []byte("%PDF-1.7 \r\n"),
+			expectedValid: true,
+		},
+	}
+	for name, data := range tscs {
+		data := data
+		t.Run(name, func(t *testing.T) {
+			if gotValid := headerRegexp.Match(data.input); gotValid != data.expectedValid {
+				t.Errorf("expected %t, got %t", data.expectedValid, gotValid)
+			}
+		})
+	}
+}