-
Notifications
You must be signed in to change notification settings - Fork 250
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use the go standard library implementation of Boyer-Moore string sear…
…ch from the strings package for preamble searching.
- Loading branch information
Showing
2 changed files
with
145 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
// Copyright 2012 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
package decode | ||
|
||
import "bytes" | ||
|
||
// stringFinder efficiently finds strings in a source text. It's implemented | ||
// using the Boyer-Moore string search algorithm: | ||
// http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm | ||
// http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged | ||
// document uses 1-based indexing) | ||
type byteFinder struct { | ||
// pattern is the string that we are searching for in the text. | ||
pattern []byte | ||
|
||
// badCharSkip[b] contains the distance between the last byte of pattern | ||
// and the rightmost occurrence of b in pattern. If b is not in pattern, | ||
// badCharSkip[b] is len(pattern). | ||
// | ||
// Whenever a mismatch is found with byte b in the text, we can safely | ||
// shift the matching frame at least badCharSkip[b] until the next time | ||
// the matching char could be in alignment. | ||
badCharSkip [256]int | ||
|
||
// goodSuffixSkip[i] defines how far we can shift the matching frame given | ||
// that the suffix pattern[i+1:] matches, but the byte pattern[i] does | ||
// not. There are two cases to consider: | ||
// | ||
// 1. The matched suffix occurs elsewhere in pattern (with a different | ||
// byte preceding it that we might possibly match). In this case, we can | ||
// shift the matching frame to align with the next suffix chunk. For | ||
// example, the pattern "mississi" has the suffix "issi" next occurring | ||
// (in right-to-left order) at index 1, so goodSuffixSkip[3] == | ||
// shift+len(suffix) == 3+4 == 7. | ||
// | ||
// 2. If the matched suffix does not occur elsewhere in pattern, then the | ||
// matching frame may share part of its prefix with the end of the | ||
// matching suffix. In this case, goodSuffixSkip[i] will contain how far | ||
// to shift the frame to align this portion of the prefix to the | ||
// suffix. For example, in the pattern "abcxxxabc", when the first | ||
// mismatch from the back is found to be in position 3, the matching | ||
// suffix "xxabc" is not found elsewhere in the pattern. However, its | ||
// rightmost "abc" (at position 6) is a prefix of the whole pattern, so | ||
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11. | ||
goodSuffixSkip []int | ||
} | ||
|
||
func makeByteFinder(pattern []byte) *byteFinder { | ||
f := &byteFinder{ | ||
pattern: pattern, | ||
goodSuffixSkip: make([]int, len(pattern)), | ||
} | ||
// last is the index of the last character in the pattern. | ||
last := len(pattern) - 1 | ||
|
||
// Build bad character table. | ||
// Bytes not in the pattern can skip one pattern's length. | ||
for i := range f.badCharSkip { | ||
f.badCharSkip[i] = len(pattern) | ||
} | ||
// The loop condition is < instead of <= so that the last byte does not | ||
// have a zero distance to itself. Finding this byte out of place implies | ||
// that it is not in the last position. | ||
for i := 0; i < last; i++ { | ||
f.badCharSkip[pattern[i]] = last - i | ||
} | ||
|
||
// Build good suffix table. | ||
// First pass: set each value to the next index which starts a prefix of | ||
// pattern. | ||
lastPrefix := last | ||
for i := last; i >= 0; i-- { | ||
if bytes.HasPrefix(pattern, pattern[i+1:]) { | ||
lastPrefix = i + 1 | ||
} | ||
// lastPrefix is the shift, and (last-i) is len(suffix). | ||
f.goodSuffixSkip[i] = lastPrefix + last - i | ||
} | ||
// Second pass: find repeats of pattern's suffix starting from the front. | ||
for i := 0; i < last; i++ { | ||
lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1]) | ||
if pattern[i-lenSuffix] != pattern[last-lenSuffix] { | ||
// (last-i) is the shift, and lenSuffix is len(suffix). | ||
f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i | ||
} | ||
} | ||
|
||
return f | ||
} | ||
|
||
func longestCommonSuffix(a, b []byte) (i int) { | ||
for ; i < len(a) && i < len(b); i++ { | ||
if a[len(a)-1-i] != b[len(b)-1-i] { | ||
break | ||
} | ||
} | ||
return | ||
} | ||
|
||
// next returns the index in text of the first occurrence of the pattern. If | ||
// the pattern is not found, it returns -1. | ||
func (f *byteFinder) next(text []byte) int { | ||
i := len(f.pattern) - 1 | ||
for i < len(text) { | ||
// Compare backwards from the end until the first unmatching character. | ||
j := len(f.pattern) - 1 | ||
for j >= 0 && text[i] == f.pattern[j] { | ||
i-- | ||
j-- | ||
} | ||
if j < 0 { | ||
return i + 1 // match | ||
} | ||
i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j]) | ||
} | ||
return -1 | ||
} | ||
|
||
func max(a, b int) int { | ||
if a > b { | ||
return a | ||
} | ||
return b | ||
} |