From e7958936e4cce981a25ea8072f33b61d209573df Mon Sep 17 00:00:00 2001
From: mizho
Date: Wed, 16 Oct 2024 01:05:22 +0900
Subject: [PATCH] Subject mime decode functions and test

---
 store/search.go      | 34 ++++++++++++++++++++++++++++++++++
 store/search_test.go | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 store/search_test.go

diff --git a/store/search.go b/store/search.go
index ec4858adde..10a396391c 100644
--- a/store/search.go
+++ b/store/search.go
@@ -2,13 +2,17 @@ package store
 
 import (
 	"bytes"
+	"fmt"
 	"io"
+	"mime"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 
 	"github.com/mjl-/mox/message"
 	"github.com/mjl-/mox/mlog"
+
+	"golang.org/x/text/encoding/japanese"
 )
 
 // WordSearch holds context for a search, with scratch buffers to prevent
@@ -193,3 +197,33 @@ func toLower(buf []byte) []byte {
 	}
 	return r
 }
+
+// decodeRFC2047 decodes the RFC 2047 encoded-words ("=?charset?enc?text?=")
+// in a header value such as a Subject and returns the result as UTF-8.
+//
+// Both the "B" (base64) and "Q" (quoted-printable-like) encodings are
+// accepted, in either case. Text outside of encoded-words is preserved, except
+// for whitespace that only separates two adjacent encoded-words, which is
+// dropped. Input without encoded-words is returned unchanged. A malformed
+// encoded-word is left as is.
+//
+// An error is returned for an encoded-word in a charset other than utf-8,
+// iso-8859-1, us-ascii or iso-2022-jp.
+func decodeRFC2047(encoded string) (string, error) {
+	dec := mime.WordDecoder{CharsetReader: rfc2047CharsetReader}
+	decoded, err := dec.DecodeHeader(encoded)
+	if err != nil {
+		return "", fmt.Errorf("decoding rfc 2047 header: %w", err)
+	}
+	return decoded, nil
+}
+
+// rfc2047CharsetReader returns a reader that converts input from charset to
+// UTF-8. Package mime calls it only for charsets it does not handle itself,
+// and always passes the charset name in lower case.
+func rfc2047CharsetReader(charset string, input io.Reader) (io.Reader, error) {
+	if charset == "iso-2022-jp" {
+		return japanese.ISO2022JP.NewDecoder().Reader(input), nil
+	}
+	return nil, fmt.Errorf("not supported charset: %s", charset)
+}
diff --git a/store/search_test.go b/store/search_test.go
new file mode 100644
index 0000000000..995bb05663
--- /dev/null
+++ b/store/search_test.go
@@ -0,0 +1,42 @@
+package store
+
+import "testing"
+
+// TestSubjectMatch checks decodeRFC2047 on plain and RFC 2047 encoded subjects.
+func TestSubjectMatch(t *testing.T) {
+	const (
+		text     = "テストテキスト Abc 123..."
+		wordUTF8 = "=?UTF-8?B?44OG44K544OI44OG44Kt44K544OIIEFiYyAxMjMuLi4=?="
+		wordJIS  = "=?iso-2022-jp?B?GyRCJUYlOSVIJUYlLSU5JUgbKEIgQWJjIDEyMy4uLg==?="
+	)
+
+	tests := []struct {
+		name    string
+		encoded string
+		want    string
+	}{
+		{"ascii", "test text Abc 123...", "test text Abc 123..."},
+		{"utf-8", wordUTF8, text},
+		{"iso-2022-jp", wordJIS, text},
+		// Whitespace between adjacent encoded-words, e.g. from header folding,
+		// is not part of the text.
+		{"utf-8 folded", wordUTF8 + " \n " + wordUTF8, text + text},
+		{"iso-2022-jp folded", wordJIS + " \n " + wordJIS, text + text},
+		// Text around encoded-words must be kept.
+		{"literal prefix", "Re: " + wordUTF8, "Re: " + text},
+		{"q encoding", "=?utf-8?q?Abc_123?=", "Abc 123"},
+	}
+	for _, tc := range tests {
+		got, err := decodeRFC2047(tc.encoded)
+		if err != nil {
+			t.Fatalf("%s: decode error: %v", tc.name, err)
+		}
+		if got != tc.want {
+			t.Fatalf("%s: got %q, want %q", tc.name, got, tc.want)
+		}
+	}
+
+	if _, err := decodeRFC2047("=?x-unknown?B?QWJj?="); err == nil {
+		t.Fatal("decoding unknown charset: expected error")
+	}
+}