Skip to content

Commit

Permalink
Improve x-subrip detection performance (#524)
Browse files Browse the repository at this point in the history
* srt: improve performance by using bytes.Cut instead of bufio.Scanner

Avoiding the scanner reduces both memory allocations and time per operation.
Before:
BenchmarkSrt-8   	  946042	      1089 ns/op	    4240 B/op	       5 allocs/op
After:
BenchmarkSrt-8   	 3235448	       368.8 ns/op	      64 B/op	       2 allocs/op
  • Loading branch information
gabriel-vasile authored May 12, 2024
1 parent 043efb9 commit 341c422
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 66 deletions.
34 changes: 24 additions & 10 deletions internal/magic/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,19 +302,21 @@ func Svg(raw []byte, limit uint32) bool {

// Srt matches a SubRip file.
func Srt(in []byte, _ uint32) bool {
s := bufio.NewScanner(bytes.NewReader(in))
if !s.Scan() {
line, in, found := scanLine(in)
if !found {
return false
}

// First line must be 1.
if s.Text() != "1" {
if string(line) != "1" {
return false
}

if !s.Scan() {
line, in, found = scanLine(in)
if !found {
return false
}
secondLine := s.Text()

secondLine := string(line)
// Timestamp format (e.g: 00:02:16,612 --> 00:02:19,376) limits secondLine
// length to exactly 29 characters.
if len(secondLine) != 29 {
Expand All @@ -325,14 +327,12 @@ func Srt(in []byte, _ uint32) bool {
if strings.Contains(secondLine, ".") {
return false
}
// For Go <1.17, comma is not recognised as a decimal separator by `time.Parse`.
secondLine = strings.ReplaceAll(secondLine, ",", ".")
// Second line must be a time range.
ts := strings.Split(secondLine, " --> ")
if len(ts) != 2 {
return false
}
const layout = "15:04:05.000"
const layout = "15:04:05,000"
t0, err := time.Parse(layout, ts[0])
if err != nil {
return false
Expand All @@ -345,8 +345,9 @@ func Srt(in []byte, _ uint32) bool {
return false
}

line, _, found = scanLine(in)
// A third line must exist and not be empty. This is the actual subtitle text.
return s.Scan() && len(s.Bytes()) != 0
return found && len(line) != 0
}

// Vtt matches a Web Video Text Tracks (WebVTT) file. See
Expand All @@ -373,3 +374,16 @@ func Vtt(raw []byte, limit uint32) bool {
return bytes.Equal(raw, []byte{0xEF, 0xBB, 0xBF, 0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) || // UTF-8 BOM and "WEBVTT"
bytes.Equal(raw, []byte{0x57, 0x45, 0x42, 0x56, 0x54, 0x54}) // "WEBVTT"
}

// scanLine splits in at its first '\n' byte.
//
// When a newline is present, line holds the bytes before it (with a single
// trailing '\r' removed, so CRLF input is handled), remainder holds the bytes
// after it, and found is true.
//
// When no newline is present, line is the whole input, remainder is nil, and
// found is false; in that case a trailing '\r' is left in place.
func scanLine(in []byte) (line, remainder []byte, found bool) {
	idx := bytes.IndexByte(in, '\n')
	if idx < 0 {
		// No terminator: hand the input back untouched.
		return in, nil, false
	}

	line, remainder = in[:idx], in[idx+1:]
	// Strip at most one '\r' sitting right before the '\n'.
	line = bytes.TrimSuffix(line, []byte("\r"))
	return line, remainder, true
}
48 changes: 32 additions & 16 deletions mimetype_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,18 +176,13 @@ var files = map[string]string{
"so.so": "application/x-sharedlib",
"sqlite.sqlite": "application/vnd.sqlite3",
"srt.srt": "application/x-subrip",
// not.srt.txt uses periods instead of commas for the decimal separators of
// the timestamps.
"not.srt.txt": "text/plain; charset=utf-8",
// not.srt.2.txt does not specify milliseconds.
"not.srt.2.txt": "text/plain; charset=utf-8",
"svg.1.svg": "image/svg+xml",
"svg.svg": "image/svg+xml",
"swf.swf": "application/x-shockwave-flash",
"tar.tar": "application/x-tar",
"tar.gnu.tar": "application/x-tar",
"tar.oldgnu.tar": "application/x-tar",
"tar.posix.tar": "application/x-tar",
"svg.1.svg": "image/svg+xml",
"svg.svg": "image/svg+xml",
"swf.swf": "application/x-shockwave-flash",
"tar.tar": "application/x-tar",
"tar.gnu.tar": "application/x-tar",
"tar.oldgnu.tar": "application/x-tar",
"tar.posix.tar": "application/x-tar",
// tar.star.tar was generated with star 1.6.
"tar.star.tar": "application/x-tar",
"tar.ustar.tar": "application/x-tar",
Expand Down Expand Up @@ -491,6 +486,7 @@ func BenchmarkSliceRand(b *testing.B) {
}

b.ResetTimer()
b.ReportAllocs()

b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
Expand All @@ -499,6 +495,24 @@ func BenchmarkSliceRand(b *testing.B) {
})
}

// BenchmarkAll runs every detector returned by root.flatten() as its own
// sub-benchmark, each against the same 3072-byte buffer filled from a
// fixed-seed random source, and reports allocations for each one.
func BenchmarkAll(b *testing.B) {
	data := make([]byte, 3072)
	// Seed 0 keeps the generated input the same from run to run.
	src := rand.New(rand.NewSource(0))
	if _, err := io.ReadFull(src, data); err != nil && err != io.ErrUnexpectedEOF {
		b.Fatal(err)
	}

	for _, m := range root.flatten() {
		b.Run(m.String(), func(b *testing.B) {
			b.ReportAllocs()
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				m.detector(data, uint32(len(data)))
			}
		})
	}
}

func BenchmarkCommon(b *testing.B) {
commonFiles := []string{
"xlsx.xlsx",
Expand All @@ -512,13 +526,15 @@ func BenchmarkCommon(b *testing.B) {
"gif.gif",
"xls.xls",
"webm.webm",
"csv.csv",
}
for _, file := range commonFiles {
f, err := os.ReadFile(filepath.Join(testDataDir, file))
if err != nil {
b.Fatal(err)
}
b.Run(filepath.Ext(file), func(b *testing.B) {
f, err := os.ReadFile(testDataDir + file)
if err != nil {
b.Fatal(err)
}
b.ReportAllocs()
b.ResetTimer()
for n := 0; n < b.N; n++ {
Detect(f)
Expand Down
20 changes: 0 additions & 20 deletions testdata/not.srt.2.txt

This file was deleted.

20 changes: 0 additions & 20 deletions testdata/not.srt.txt

This file was deleted.

0 comments on commit 341c422

Please sign in to comment.