From a6bbc989768ae016da4ff50fb0191c57965f4904 Mon Sep 17 00:00:00 2001 From: Leander Beernaert Date: Fri, 17 Feb 2023 11:07:29 +0100 Subject: [PATCH] feat(GODT-2201): RFC5322 AddressList parser rewritten in Go --- imap/envelope.go | 25 +- rfc5322/address.go | 554 ++++++++++++++++++++++ rfc5322/address_test.go | 52 +++ rfc5322/atom.go | 318 +++++++++++++ rfc5322/atom_test.go | 39 ++ rfc5322/backtracing_scanner.go | 94 ++++ rfc5322/cfws.go | 307 ++++++++++++ rfc5322/cfws_test.go | 58 +++ rfc5322/miscelleaneous.go | 80 ++++ rfc5322/miscelleaneous_test.go | 41 ++ rfc5322/parser.go | 83 ++++ rfc5322/parser_test.go | 832 +++++++++++++++++++++++++++++++++ rfc5322/quoted.go | 95 ++++ rfc5322/quoted_test.go | 22 + rfcparser/parser.go | 23 +- rfcparser/scanner.go | 18 +- 16 files changed, 2622 insertions(+), 19 deletions(-) create mode 100644 rfc5322/address.go create mode 100644 rfc5322/address_test.go create mode 100644 rfc5322/atom.go create mode 100644 rfc5322/atom_test.go create mode 100644 rfc5322/backtracing_scanner.go create mode 100644 rfc5322/cfws.go create mode 100644 rfc5322/cfws_test.go create mode 100644 rfc5322/miscelleaneous.go create mode 100644 rfc5322/miscelleaneous_test.go create mode 100644 rfc5322/parser.go create mode 100644 rfc5322/parser_test.go create mode 100644 rfc5322/quoted.go create mode 100644 rfc5322/quoted_test.go diff --git a/imap/envelope.go b/imap/envelope.go index 4de47adf..8a7b8845 100644 --- a/imap/envelope.go +++ b/imap/envelope.go @@ -1,10 +1,10 @@ package imap import ( + "github.com/ProtonMail/gluon/rfc5322" "net/mail" "strings" - "github.com/ProtonMail/gluon/internal/parser" "github.com/ProtonMail/gluon/rfc822" "github.com/sirupsen/logrus" ) @@ -29,27 +29,24 @@ func envelope(header *rfc822.Header, c *paramList, writer parListWriter) error { addString(writer, header.Get("Date")). addString(writer, header.Get("Subject")) - addressParser := parser.NewRFC5322AddressListParser() - defer addressParser.Close() - if v, ok := header.GetChecked("From"); !ok { fields.addString(writer, "") } else { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } if v, ok := header.GetChecked("Sender"); ok { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } else if v, ok := header.GetChecked("From"); ok { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } else { fields.addString(writer, "") } if v, ok := header.GetChecked("Reply-To"); ok { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } else if v, ok := header.GetChecked("From"); ok { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } else { fields.addString(writer, "") } @@ -57,19 +54,19 @@ func envelope(header *rfc822.Header, c *paramList, writer parListWriter) error { if v, ok := header.GetChecked("To"); !ok { fields.addString(writer, "") } else { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } if v, ok := header.GetChecked("Cc"); !ok { fields.addString(writer, "") } else { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } if v, ok := header.GetChecked("Bcc"); !ok { fields.addString(writer, "") } else { - fields.addAddresses(writer, tryParseAddressList(addressParser, v)) + fields.addAddresses(writer, tryParseAddressList(v)) } fields.addString(writer, header.Get("In-Reply-To")) @@ -79,8 +76,8 @@ func envelope(header *rfc822.Header, c *paramList, writer parListWriter) error { return nil } -func tryParseAddressList(parser *parser.RFC5322AddressListParser, val string) []*mail.Address { - addr, err := parser.Parse(val) +func tryParseAddressList(val string) []*mail.Address { + addr, err := rfc5322.ParseAddressList(val) if err != nil { logrus.WithError(err).Error("Failed to parse address") return []*mail.Address{{Name: val}} diff --git a/rfc5322/address.go b/rfc5322/address.go new file mode 100644 index 00000000..8895eaf2 --- /dev/null +++ b/rfc5322/address.go @@ -0,0 +1,554 @@ +package rfc5322 + +import ( + "net/mail" + + "github.com/ProtonMail/gluon/rfcparser" +) + +// 3.4. Address Specification + +func parseAddressList(p *Parser) ([]*mail.Address, error) { + // address-list = (address *("," address)) / obs-addr-list + // *([CFWS] ",") address *("," [address / CFWS]) + // We extended this rule to allow ';' as separator + var result []*mail.Address + + isSep := func(tokenType rfcparser.TokenType) bool { + return tokenType == rfcparser.TokenTypeComma || tokenType == rfcparser.TokenTypeSemicolon + } + + // *([CFWS] ",") + for { + if _, err := tryParseCFWS(p.parser); err != nil { + return nil, err + } + + if ok, err := p.parser.MatchesWith(isSep); err != nil { + return nil, err + } else if !ok { + break + } + } + + var groupConsumedSemiColon bool + // Address + { + addr, gConsumedSemiColon, err := parseAddress(p) + if err != nil { + return nil, err + } + + groupConsumedSemiColon = gConsumedSemiColon + + result = append(result, addr...) + } + + // *("," [address / CFWS]) + for { + if ok, err := p.parser.MatchesWith(isSep); err != nil { + return nil, err + } else if !ok { // see `parseAddress` comment about why this is necessary. + if !groupConsumedSemiColon || p.parser.CurrentToken().TType == rfcparser.TokenTypeEOF { + break + } + } + + if ok, err := tryParseCFWS(p.parser); err != nil { + return nil, err + } else if ok { + // Only continue if the next input is EOF or comma or we can run into issues with parsring + // the `',' address` rules. + if p.parser.Check(rfcparser.TokenTypeEOF) || p.parser.CheckWith(isSep) { + continue + } + } + + // address + addr, consumedSemiColon, err := parseAddress(p) + if err != nil { + return nil, err + } + + groupConsumedSemiColon = consumedSemiColon + + result = append(result, addr...) + } + + return result, nil +} + +// The boolean parameter represents whether a group consumed a ';' separator. This is necessary to disambiguate +// an address list where we have the sequence ` g:
;
` since we also allow groups to have optional +// `;` terminators. +func parseAddress(p *Parser) ([]*mail.Address, bool, error) { + // address = mailbox / group + // name-addr = [display-name] angle-addr + // group = display-name ":" [group-list] ";" [CFWS] + // + if _, err := tryParseCFWS(p.parser); err != nil { + return nil, false, err + } + + // check addr-spec standalone + if p.parser.Check(rfcparser.TokenTypeLess) { + addr, err := parseAngleAddr(p.parser) + if err != nil { + return nil, false, err + } + + return []*mail.Address{{ + Name: "", + Address: addr, + }}, false, nil + } + + parserState := p.SaveState() + + if address, err := parseMailbox(p); err == nil { + return []*mail.Address{ + address, + }, false, nil + } + + p.RestoreState(parserState) + + group, didConsumeSemicolon, err := parseGroup(p) + if err != nil { + return nil, false, err + } + + return group, didConsumeSemicolon, nil +} + +func parseGroup(p *Parser) ([]*mail.Address, bool, error) { + // nolint:dupword + // group = display-name ":" [group-list] ";" [CFWS] + // group-list = mailbox-list / CFWS / obs-group-list + // obs-group-list = 1*([CFWS] ",") [CFWS] + // + // nolint:dupword + // mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list + // obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS]) + // + // This version has been relaxed so that the ';' is optional. and that a group can be wrapped in `"` + hasQuotes, err := p.parser.Matches(rfcparser.TokenTypeDQuote) + if err != nil { + return nil, false, err + } + + if _, err := parseDisplayName(p.parser); err != nil { + return nil, false, err + } + + if err := p.parser.Consume(rfcparser.TokenTypeColon, "expected ':' for group start"); err != nil { + return nil, false, err + } + + var didConsumeSemicolon bool + + var result []*mail.Address + + if ok, err := p.parser.Matches(rfcparser.TokenTypeSemicolon); err != nil { + return nil, false, err + } else if !ok { + + // *([CFWS] ",") + for { + if _, err := tryParseCFWS(p.parser); err != nil { + return nil, false, err + } + + if ok, err := p.parser.Matches(rfcparser.TokenTypeComma); err != nil { + return nil, false, err + } else if !ok { + break + } + } + + // Mailbox + var parsedFirstMailbox bool + + { + parserState := p.SaveState() + mailbox, err := parseMailbox(p) + if err != nil { + p.RestoreState(parserState) + } else { + parsedFirstMailbox = true + result = append(result, mailbox) + } + } + + // *("," [mailbox / CFWS]) + if parsedFirstMailbox { + for { + if ok, err := p.parser.Matches(rfcparser.TokenTypeComma); err != nil { + return nil, false, err + } else if !ok { + break + } + + if ok, err := tryParseCFWS(p.parser); err != nil { + return nil, false, err + } else if ok { + continue + } + + // Mailbox + mailbox, err := parseMailbox(p) + if err != nil { + return nil, false, err + } + + result = append(result, mailbox) + } + } + + consumedSemicolon, err := p.parser.Matches(rfcparser.TokenTypeSemicolon) + if err != nil { + return nil, false, err + } + + didConsumeSemicolon = consumedSemicolon + } else { + didConsumeSemicolon = true + } + + if _, err := tryParseCFWS(p.parser); err != nil { + return nil, false, err + } + + if hasQuotes { + if err := p.parser.Consume(rfcparser.TokenTypeDQuote, `expected '"' for group end`); err != nil { + return nil, false, err + } + } + + return result, didConsumeSemicolon, nil +} + +func parseMailbox(p *Parser) (*mail.Address, error) { + // mailbox = name-addr / addr-spec + parserState := p.SaveState() + + if addr, err := parseNameAddr(p.parser); err == nil { + return addr, nil + } + + p.RestoreState(parserState) + + addr, err := parseAddrSpec(p.parser) + if err != nil { + return nil, err + } + + return &mail.Address{ + Address: addr, + }, nil +} + +func parseNameAddr(p *rfcparser.Parser) (*mail.Address, error) { + // name-addr = [display-name] angle-addr + if _, err := tryParseCFWS(p); err != nil { + return nil, err + } + + // Only has angle-addr component. + if p.Check(rfcparser.TokenTypeLess) { + address, err := parseAngleAddr(p) + if err != nil { + return nil, err + } + + return &mail.Address{Address: address}, nil + } + + displayName, err := parseDisplayName(p) + if err != nil { + return nil, err + } + + address, err := parseAngleAddr(p) + if err != nil { + return nil, err + } + + return &mail.Address{Address: address, Name: displayName}, nil +} + +func parseAngleAddr(p *rfcparser.Parser) (string, error) { + // angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / + // obs-angle-addr + // + // obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] + // + // obs-route = obs-domain-list ":" + // + // obs-domain-list = *(CFWS / ",") "@" domain + // *("," [CFWS] ["@" domain]) + // + // This version has been extended so that add-rspec is optional + if _, err := tryParseCFWS(p); err != nil { + return "", err + } + + if err := p.Consume(rfcparser.TokenTypeLess, "expected < for angle-addr start"); err != nil { + return "", err + } + + if ok, err := p.Matches(rfcparser.TokenTypeGreater); err != nil { + return "", err + } else if ok { + return "", nil + } + + for { + if ok, err := tryParseCFWS(p); err != nil { + return "", err + } else if !ok { + if ok, err := p.Matches(rfcparser.TokenTypeComma); err != nil { + return "", err + } else if !ok { + break + } + } + } + + if ok, err := p.Matches(rfcparser.TokenTypeAt); err != nil { + return "", err + } else if ok { + if _, err := parseDomain(p); err != nil { + return "", err + } + + for { + if ok, err := p.Matches(rfcparser.TokenTypeComma); err != nil { + return "", err + } else if !ok { + break + } + + if _, err := tryParseCFWS(p); err != nil { + return "", err + } + + if ok, err := p.Matches(rfcparser.TokenTypeAt); err != nil { + return "", err + } else if ok { + if _, err := parseDomain(p); err != nil { + return "", err + } + } + } + + if err := p.Consume(rfcparser.TokenTypeColon, "expected ':' for obs-route end"); err != nil { + return "", err + } + } + + addr, err := parseAddrSpec(p) + if err != nil { + return "", err + } + + if err := p.Consume(rfcparser.TokenTypeGreater, "expected > for angle-addr end"); err != nil { + return "", err + } + + if _, err := tryParseCFWS(p); err != nil { + return "", err + } + + return addr, nil +} + +func parseDisplayName(p *rfcparser.Parser) (string, error) { + // display-name = phrase + phrase, err := parsePhrase(p) + if err != nil { + return "", err + } + + return joinWithSpacingRules(phrase), nil +} + +func parseAddrSpec(p *rfcparser.Parser) (string, error) { + // addr-spec = local-part "@" domain + // This version adds an option port extension : COLON ATOM + localPart, err := parseLocalPart(p) + if err != nil { + return "", err + } + + if err := p.Consume(rfcparser.TokenTypeAt, "expected @ after local-part"); err != nil { + return "", err + } + + domain, err := parseDomain(p) + if err != nil { + return "", err + } + + if ok, err := p.Matches(rfcparser.TokenTypeColon); err != nil { + return "", err + } else if ok { + port, err := parseAtom(p) + if err != nil { + return "", err + } + + return localPart + "@" + domain + ":" + port.String.Value, nil + } + + return localPart + "@" + domain, nil +} + +func parseLocalPart(p *rfcparser.Parser) (string, error) { + // nolint:dupword + // local-part = dot-atom / quoted-string / obs-local-part + // obs-local-part = word *("." word) + // word = atom / quoted-string + // ^ above rule can be relaxed into just the last part, dot-atom just + // Local part extended + var words []parserString + + { + word, err := parseWord(p) + if err != nil { + return "", err + } + + words = append(words, word) + } + + for { + if ok, err := p.Matches(rfcparser.TokenTypePeriod); err != nil { + return "", err + } else if !ok { + break + } + + words = append(words, parserString{ + String: rfcparser.String{ + Value: ".", + Offset: p.PreviousToken().Offset, + }, + Type: parserStringTypeUnspaced, + }) + + word, err := parseWord(p) + if err != nil { + return "", err + } + + words = append(words, word) + } + + return joinWithSpacingRules(words), nil +} + +func parseDomain(p *rfcparser.Parser) (string, error) { + // domain = dot-atom / domain-literal / obs-domain + // + // obs-domain = atom *("." atom) + // + if _, err := tryParseCFWS(p); err != nil { + return "", err + } + + if ok, err := p.Matches(rfcparser.TokenTypeLBracket); err != nil { + return "", err + } else if ok { + return parseDomainLiteral(p) + } + + // obs-domain can be seen as a more restrictive dot-atom so we just use that rule instead. + dotAtom, err := parseDotAtom(p) + if err != nil { + return "", err + } + + return dotAtom.Value, nil +} + +func parseDomainLiteral(p *rfcparser.Parser) (string, error) { + // domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] + // + // [CFWS] and "[" consumed before entry + // + result := []byte{'['} + + for { + if _, err := tryParseFWS(p); err != nil { + return "", err + } + + if ok, err := p.MatchesWith(isDText); err != nil { + return "", err + } else if !ok { + break + } + + result = append(result, p.PreviousToken().Value) + } + + if _, err := tryParseFWS(p); err != nil { + return "", err + } + + if err := p.Consume(rfcparser.TokenTypeRBracket, "expecetd ] for domain-literal end"); err != nil { + return "", err + } + + result = append(result, ']') + + if _, err := tryParseCFWS(p); err != nil { + return "", err + } + + return string(result), nil +} + +func isDText(tokenType rfcparser.TokenType) bool { + // dtext = %d33-90 / ; Printable US-ASCII + // %d94-126 / ; characters not including + // obs-dtext ; "[", "]", or "\" + // + // obs-dtext = obs-NO-WS-CTL / quoted-pair // <- we have not included this + // + if rfcparser.IsCTL(tokenType) || + tokenType == rfcparser.TokenTypeLBracket || + tokenType == rfcparser.TokenTypeRBracket || + tokenType == rfcparser.TokenTypeBackslash { + return false + } + + return true +} + +func joinWithSpacingRules(v []parserString) string { + result := v[0].String.Value + + prevStrType := v[0].Type + + for i := 1; i < len(v); i++ { + curStrType := v[i].Type + + if prevStrType == parserStringTypeEncoded { + if curStrType == parserStringTypeOther { + result += " " + } + } else if prevStrType != parserStringTypeUnspaced { + if curStrType != parserStringTypeUnspaced { + result += " " + } + } + + prevStrType = curStrType + + result += v[i].String.Value + } + + return result +} diff --git a/rfc5322/address_test.go b/rfc5322/address_test.go new file mode 100644 index 00000000..5021e9a8 --- /dev/null +++ b/rfc5322/address_test.go @@ -0,0 +1,52 @@ +package rfc5322 + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseAddrSpec(t *testing.T) { + inputs := map[string]string{ + `pete(his account)@silly.test(his host)`: `pete@silly.test`, + `jdoe@machine.example`: `jdoe@machine.example`, + `john.q.public@example.com`: `john.q.public@example.com`, + `user@example.com`: `user@example.com`, + `user@[10.0.0.1]`: `user@[10.0.0.1]`, + `hořejšek@mail.com `: `hořejšek@mail.com`, + } + + for i, e := range inputs { + t.Run(i, func(t *testing.T) { + p := newTestRFCParser(i) + v, err := parseAddrSpec(p) + require.NoError(t, err) + require.Equal(t, e, v) + }) + } +} + +func TestParseAngleAddr(t *testing.T) { + inputs := map[string]string{ + ``: `pete@silly.test`, + ``: `jdoe@machine.example`, + ``: `john.q.public@example.com`, + ``: `user@example.com`, + ``: `user@[10.0.0.1]`, + ``: `hořejšek@mail.com`, + `<@foo.com:foo@bar.com>`: `foo@bar.com`, + `<,@foo.com:foo@bar.com>`: `foo@bar.com`, + `< @foo.com:foo@bar.com>`: `foo@bar.com`, + `<@foo.com,@bar.bar:foo@bar.com>`: `foo@bar.com`, + "<@foo.com,\r\n @bar.bar:foo@bar.com>": `foo@bar.com`, + } + + for i, e := range inputs { + t.Run(i, func(t *testing.T) { + p := newTestRFCParser(i) + v, err := parseAngleAddr(p) + require.NoError(t, err) + require.Equal(t, e, v) + }) + } +} diff --git a/rfc5322/atom.go b/rfc5322/atom.go new file mode 100644 index 00000000..e4e7b46f --- /dev/null +++ b/rfc5322/atom.go @@ -0,0 +1,318 @@ +package rfc5322 + +// 3.2.4. Quoted Strings + +import ( + "fmt" + "io" + "mime" + + "github.com/ProtonMail/gluon/rfcparser" +) + +func parseDotAtom(p *rfcparser.Parser) (rfcparser.String, error) { + // dot-atom = [CFWS] dot-atom-text [CFWS] + if _, err := tryParseCFWS(p); err != nil { + return rfcparser.String{}, err + } + + atom, err := parseDotAtomText(p) + if err != nil { + return rfcparser.String{}, err + } + + if _, err := tryParseCFWS(p); err != nil { + return rfcparser.String{}, err + } + + return atom, nil +} + +func parseDotAtomText(p *rfcparser.Parser) (rfcparser.String, error) { + // dot-atom-text = 1*atext *("." 1*atext) + // This version has been extended to allow for trailing '.' files. + if err := p.ConsumeWith(isAText, "expected atext char for dot-atom-text"); err != nil { + return rfcparser.String{}, err + } + + atom, err := p.CollectBytesWhileMatchesWithPrevWith(isAText) + if err != nil { + return rfcparser.String{}, err + } + + for { + if ok, err := p.Matches(rfcparser.TokenTypePeriod); err != nil { + return rfcparser.String{}, err + } else if !ok { + break + } + + atom.Value = append(atom.Value, '.') + + if p.Check(rfcparser.TokenTypePeriod) { + return rfcparser.String{}, p.MakeError("invalid token after '.'") + } + + // Early exit to allow trailing '.' + if !p.CheckWith(isAText) { + break + } + + if err := p.ConsumeWith(isAText, "expected atext char for dot-atom-text"); err != nil { + return rfcparser.String{}, err + } + + atomNext, err := p.CollectBytesWhileMatchesWithPrevWith(isAText) + if err != nil { + return rfcparser.String{}, err + } + + atom.Value = append(atom.Value, atomNext.Value...) + } + + return atom.IntoString(), nil +} + +func parseAtom(p *rfcparser.Parser) (parserString, error) { + // atom = [CFWS] 1*atext [CFWS] + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + if err := p.ConsumeWith(isAText, "expected atext char for atom"); err != nil { + return parserString{}, err + } + + atom, err := p.CollectBytesWhileMatchesWithPrevWith(isAText) + if err != nil { + return parserString{}, err + } + + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + return parserString{ + String: atom.IntoString(), + Type: parserStringTypeOther, + }, nil +} + +var CharsetReader func(charset string, input io.Reader) (io.Reader, error) + +func parseEncodedAtom(p *rfcparser.Parser) (parserString, error) { + // encoded-word = "=?" charset "?" encoding "?" encoded-text "?=" + // + // charset = token ; see section 3 + // + // encoding = token ; see section 4 + // + // + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + var fullWord string + + startOffset := p.CurrentToken().Offset + + if err := p.ConsumeBytesFold('=', '?'); err != nil { + return parserString{}, err + } + + fullWord += "=?" + + charset, err := p.CollectBytesWhileMatchesWith(isEncodedAtomToken) + if err != nil { + return parserString{}, err + } + + fullWord += charset.IntoString().Value + + if err := p.Consume(rfcparser.TokenTypeQuestion, "expected '?' after encoding charset"); err != nil { + return parserString{}, err + } + + fullWord += "?" + + if err := p.Consume(rfcparser.TokenTypeChar, "expected char after '?'"); err != nil { + return parserString{}, err + } + + encoding := rfcparser.ByteToLower(p.PreviousToken().Value) + if encoding != 'q' && encoding != 'b' { + return parserString{}, p.MakeError("encoding should either be 'Q' or 'B'") + } + + if err := p.Consume(rfcparser.TokenTypeQuestion, "expected '?' after encoding byte"); err != nil { + return parserString{}, err + } + + if encoding == 'b' { + fullWord += "B" + } else { + fullWord += "Q" + } + + fullWord += "?" + + encodedText, err := p.CollectBytesWhileMatchesWith(isEncodedText) + if err != nil { + return parserString{}, err + } + + fullWord += encodedText.IntoString().Value + + if err := p.ConsumeBytesFold('?', '='); err != nil { + return parserString{}, err + } + + fullWord += "?=" + + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + decoder := mime.WordDecoder{CharsetReader: CharsetReader} + + decoded, err := decoder.Decode(fullWord) + if err != nil { + return parserString{}, p.MakeErrorAtOffset(fmt.Sprintf("failed to decode encoded atom: %v", err), startOffset) + } + + return parserString{ + String: rfcparser.String{Value: decoded, Offset: startOffset}, + Type: parserStringTypeEncoded, + }, nil +} + +func isEncodedAtomToken(tokenType rfcparser.TokenType) bool { + // token = 1* + // + // specials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / " + // <"> / "/" / "[" / "]" / "?" / "." / "=" + if rfcparser.IsCTL(tokenType) { + return false + } + + switch tokenType { //nolint:exhaustive + case rfcparser.TokenTypeEOF: + fallthrough + case rfcparser.TokenTypeError: + fallthrough + case rfcparser.TokenTypeSP: + fallthrough + case rfcparser.TokenTypeLParen: + fallthrough + case rfcparser.TokenTypeRParen: + fallthrough + case rfcparser.TokenTypeLess: + fallthrough + case rfcparser.TokenTypeGreater: + fallthrough + case rfcparser.TokenTypeAt: + fallthrough + case rfcparser.TokenTypeComma: + fallthrough + case rfcparser.TokenTypeSemicolon: + fallthrough + case rfcparser.TokenTypeColon: + fallthrough + case rfcparser.TokenTypeDQuote: + fallthrough + case rfcparser.TokenTypeSlash: + fallthrough + case rfcparser.TokenTypeLBracket: + fallthrough + case rfcparser.TokenTypeRBracket: + fallthrough + case rfcparser.TokenTypeQuestion: + fallthrough + case rfcparser.TokenTypePeriod: + fallthrough + case rfcparser.TokenTypeEqual: + return false + default: + return true + } +} + +func isEncodedText(tokenType rfcparser.TokenType) bool { + // encoded-text = 1* + // ; (but see "Use of encoded-words in message + // ; headers", section 5) + // + if rfcparser.IsCTL(tokenType) || + tokenType == rfcparser.TokenTypeSP || + tokenType == rfcparser.TokenTypeQuestion || + tokenType == rfcparser.TokenTypeEOF || + tokenType == rfcparser.TokenTypeError || + tokenType == rfcparser.TokenTypeExtendedChar { + return false + } + + return true +} + +func isAText(tokenType rfcparser.TokenType) bool { + // atext = ALPHA / DIGIT / ; Printable US-ASCII + // "!" / "#" / ; characters not including + // "$" / "%" / ; specials. Used for atoms. + // "&" / "'" / + // "*" / "+" / + // "-" / "/" / + // "=" / "?" / + // "^" / "_" / + // "`" / "{" / + // "|" / "}" / + // "~" + switch tokenType { //nolint:exhaustive + case rfcparser.TokenTypeDigit: + fallthrough + case rfcparser.TokenTypeChar: + fallthrough + case rfcparser.TokenTypeExclamation: + fallthrough + case rfcparser.TokenTypeHash: + fallthrough + case rfcparser.TokenTypeDollar: + fallthrough + case rfcparser.TokenTypePercent: + fallthrough + case rfcparser.TokenTypeAmpersand: + fallthrough + case rfcparser.TokenTypeSQuote: + fallthrough + case rfcparser.TokenTypeAsterisk: + fallthrough + case rfcparser.TokenTypePlus: + fallthrough + case rfcparser.TokenTypeMinus: + fallthrough + case rfcparser.TokenTypeSlash: + fallthrough + case rfcparser.TokenTypeEqual: + fallthrough + case rfcparser.TokenTypeQuestion: + fallthrough + case rfcparser.TokenTypeCaret: + fallthrough + case rfcparser.TokenTypeUnderscore: + fallthrough + case rfcparser.TokenTyeBacktick: + fallthrough + case rfcparser.TokenTypeLCurly: + fallthrough + case rfcparser.TokenTypeRCurly: + fallthrough + case rfcparser.TokenTypePipe: + fallthrough + case rfcparser.TokenTypeExtendedChar: // RFC6532 + fallthrough + case rfcparser.TokenTypeTilde: + return true + default: + return false + } +} diff --git a/rfc5322/atom_test.go b/rfc5322/atom_test.go new file mode 100644 index 00000000..8c5cbca7 --- /dev/null +++ b/rfc5322/atom_test.go @@ -0,0 +1,39 @@ +package rfc5322 + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseDotAtom(t *testing.T) { + inputs := map[string]string{ + "foobar.!#$%'*+-=?^~_{}`|/": "foobar.!#$%'*+-=?^~_{}`|/", + " f.b ": "f.b", + " \r\n f.b": "f.b", + " \r\n f.b \r\n ": "f.b", + } + + for i, e := range inputs { + p := newTestRFCParser(i) + v, err := parseDotAtom(p) + require.NoError(t, err) + require.Equal(t, e, v.Value) + } +} + +func TestParseAtom(t *testing.T) { + inputs := map[string]string{ + "foobar!#$%'*+-=?^~_{}`|/": "foobar!#$%'*+-=?^~_{}`|/", + " fb ": "fb", + " \r\n fb": "fb", + " \r\n fb \r\n ": "fb", + } + + for i, e := range inputs { + p := newTestRFCParser(i) + v, err := parseDotAtom(p) + require.NoError(t, err) + require.Equal(t, e, v.Value) + } +} diff --git a/rfc5322/backtracing_scanner.go b/rfc5322/backtracing_scanner.go new file mode 100644 index 00000000..83dded87 --- /dev/null +++ b/rfc5322/backtracing_scanner.go @@ -0,0 +1,94 @@ +package rfc5322 + +import ( + "bytes" + "io" +) + +type BacktrackingByteScanner struct { + data []byte + offset int +} + +func NewBacktrackingByteScanner(data []byte) *BacktrackingByteScanner { + return &BacktrackingByteScanner{ + data: data, + } +} + +type BacktrackingByteScannerScope struct { + offset int +} + +func (bs *BacktrackingByteScanner) Read(dst []byte) (int, error) { + thisLen := len(bs.data) + + if bs.offset >= thisLen { + return 0, io.EOF + } + + dstLen := len(dst) + + if bs.offset+dstLen >= thisLen { + bytesRead := thisLen - bs.offset + + copy(dst, bs.data[bs.offset:]) + + return bytesRead, nil + } + + nextOffset := bs.offset + dstLen + + copy(dst, bs.data[bs.offset:nextOffset]) + + bs.offset = nextOffset + + return dstLen, nil +} + +func (bs *BacktrackingByteScanner) ReadByte() (byte, error) { + if bs.offset >= len(bs.data) { + return 0, io.EOF + } + + b := bs.data[bs.offset] + + bs.offset++ + + return b, nil +} + +func (bs *BacktrackingByteScanner) ReadBytes(delim byte) ([]byte, error) { + if bs.offset >= len(bs.data) { + return nil, io.EOF + } + + var result []byte + + index := bytes.IndexByte(bs.data[bs.offset:], delim) + if index < 0 { + copy(result, bs.data[bs.offset:]) + bs.offset = len(bs.data) + + return result, nil + } + + nextOffset := bs.offset + index + 1 + if nextOffset >= len(bs.data) { + copy(result, bs.data[bs.offset:]) + bs.offset = len(bs.data) + } else { + copy(result, bs.data[bs.offset:nextOffset]) + bs.offset = nextOffset + } + + return result, nil +} + +func (bs *BacktrackingByteScanner) SaveState() BacktrackingByteScannerScope { + return BacktrackingByteScannerScope{offset: bs.offset} +} + +func (bs *BacktrackingByteScanner) RestoreState(scope BacktrackingByteScannerScope) { + bs.offset = scope.offset +} diff --git a/rfc5322/cfws.go b/rfc5322/cfws.go new file mode 100644 index 00000000..5caf5d3a --- /dev/null +++ b/rfc5322/cfws.go @@ -0,0 +1,307 @@ +package rfc5322 + +import "github.com/ProtonMail/gluon/rfcparser" + +// Section 3.2.2 White space and Comments + +func tryParseCFWS(p *rfcparser.Parser) (bool, error) { + if !p.CheckWith(func(tokenType rfcparser.TokenType) bool { + return isWSP(tokenType) || tokenType == rfcparser.TokenTypeCR || tokenType == rfcparser.TokenTypeLParen + }) { + return false, nil + } + + return true, parseCFWS(p) +} + +func parseCFWS(p *rfcparser.Parser) error { + // CFWS = (1*([FWS] comment) [FWS]) / FWS + parsedFirstFWS, err := tryParseFWS(p) + if err != nil { + return err + } + + // Handle case where it can just be FWS without comment + if !p.Check(rfcparser.TokenTypeLParen) { + if !parsedFirstFWS { + return p.MakeError("expected FWS or comment for CFWS") + } + + return nil + } + + if err := parseComment(p); err != nil { + return err + } + + // Read remaining [FWS] comment + for { + if _, err := tryParseFWS(p); err != nil { + return err + } + + if !p.Check(rfcparser.TokenTypeLParen) { + break + } + + if err := parseComment(p); err != nil { + return err + } + } + + if _, err := tryParseFWS(p); err != nil { + return err + } + + return nil +} + +func tryParseFWS(p *rfcparser.Parser) (bool, error) { + if !p.CheckWith(func(tokenType rfcparser.TokenType) bool { + return isWSP(tokenType) || tokenType == rfcparser.TokenTypeCR + }) { + return false, nil + } + + return true, parseFWS(p) +} + +func parseFWS(p *rfcparser.Parser) error { + // FWS = ([*WSP CRLF] 1*WSP) / obs-FWS + // ; Folding white space + // obs-FWS = 1*WSP *(CRLF 1*WSP) + // + // Parse 0 or more WSP + for { + if ok, err := p.MatchesWith(isWSP); err != nil { + return err + } else if !ok { + break + } + } + + if !p.Check(rfcparser.TokenTypeCR) { + // Early exit. + return nil + } + + if err := p.ConsumeNewLine(); err != nil { + return err + } + + // Parse one or many WSP. + if err := p.ConsumeWith(isWSP, "expected WSP after CRLF"); err != nil { + return err + } + + for { + if ok, err := p.MatchesWith(isWSP); err != nil { + return err + } else if !ok { + break + } + } + + // Handle obs-FWS case where there can be multiple repeating loops + for { + if !p.Check(rfcparser.TokenTypeCR) { + break + } + + if err := p.ConsumeNewLine(); err != nil { + return err + } + + // Parse one or many WSP. + if err := p.ConsumeWith(isWSP, "expected WSP after CRLF"); err != nil { + return err + } + + for { + if ok, err := p.MatchesWith(isWSP); err != nil { + return err + } else if !ok { + break + } + } + } + + return nil +} + +func parseCContent(p *rfcparser.Parser) error { + if ok, err := p.MatchesWith(isCText); err != nil { + return err + } else if ok { + return nil + } + + if _, ok, err := tryParseQuotedPair(p); err != nil { + return err + } else if ok { + return nil + } + + if p.Check(rfcparser.TokenTypeLParen) { + return parseComment(p) + } + + return p.MakeError("unexpected ccontent token") +} + +func parseComment(p *rfcparser.Parser) error { + if err := p.Consume(rfcparser.TokenTypeLParen, "expected ( for comment start"); err != nil { + return err + } + + for { + if _, err := tryParseFWS(p); err != nil { + return err + } + + if !p.CheckWith(func(tokenType rfcparser.TokenType) bool { + return isCText(tokenType) || tokenType == rfcparser.TokenTypeBackslash || tokenType == rfcparser.TokenTypeLParen + }) { + break + } + + if err := parseCContent(p); err != nil { + return err + } + } + + if _, err := tryParseFWS(p); err != nil { + return err + } + + if err := p.Consume(rfcparser.TokenTypeRParen, "expected ) for comment end"); err != nil { + return err + } + + return nil +} + +func tryParseQuotedPair(p *rfcparser.Parser) (byte, bool, error) { + if !p.Check(rfcparser.TokenTypeBackslash) { + return 0, false, nil + } + + b, err := parseQuotedPair(p) + if err != nil { + return 0, false, err + } + + return b, true, nil +} + +func parseQuotedPair(p *rfcparser.Parser) (byte, error) { + // quoted-pair = ("\" (VCHAR / WSP)) / obs-qp + // + // obs-qp = "\" (%d0 / obs-NO-WS-CTL / LF / CR) + // + if err := p.Consume(rfcparser.TokenTypeBackslash, "expected \\ for quoted pair start"); err != nil { + return 0, err + } + + if ok, err := p.MatchesWith(isVChar); err != nil { + return 0, err + } else if ok { + return p.PreviousToken().Value, nil + } + + if ok, err := p.MatchesWith(isWSP); err != nil { + return 0, err + } else if ok { + return p.PreviousToken().Value, nil + } + + if ok, err := p.MatchesWith(func(tokenType rfcparser.TokenType) bool { + return isObsNoWSCTL(tokenType) || + tokenType == rfcparser.TokenTypeCR || + tokenType == rfcparser.TokenTypeLF || + tokenType == rfcparser.TokenTypeZero + }); err != nil { + return 0, err + } else if ok { + return p.PreviousToken().Value, nil + } + + return 0, p.MakeError("unexpected character for quoted pair") +} + +func isWSP(tokenType rfcparser.TokenType) bool { + return tokenType == rfcparser.TokenTypeSP || tokenType == rfcparser.TokenTypeTab +} + +func isCText(tokenType rfcparser.TokenType) bool { + // ctext = %d33-39 / ; Printable US-ASCII + // %d42-91 / ; characters not including + // %d93-126 / ; "(", ")", or "\" + // obs-ctext + // + // obs-NO-WS-CTL = %d1-8 / ; US-ASCII control + // %d11 / ; characters that do not + // %d12 / ; include the carriage + // %d14-31 / ; return, line feed, and + // %d127 ; white space characters + // + // obs-ctext = obs-NO-WS-CTL + switch tokenType { // nolint:exhaustive + case rfcparser.TokenTypeEOF: + fallthrough + case rfcparser.TokenTypeError: + fallthrough + case rfcparser.TokenTypeLParen: + fallthrough + case rfcparser.TokenTypeRParen: + fallthrough + case rfcparser.TokenTypeCR: + fallthrough + case rfcparser.TokenTypeTab: + fallthrough + case rfcparser.TokenTypeLF: + fallthrough + case rfcparser.TokenTypeSP: + fallthrough + case rfcparser.TokenTypeBackslash: + return false + default: + return true + } +} + +func isObsNoWSCTL(tokenType rfcparser.TokenType) bool { + // obs-NO-WS-CTL = %d1-8 / ; US-ASCII control + // %d11 / ; characters that do not + // %d12 / ; include the carriage + // %d14-31 / ; return, line feed, and + // %d127 ; white space characters + switch tokenType { // nolint:exhaustive + case rfcparser.TokenTypeEOF: + fallthrough + case rfcparser.TokenTypeError: + fallthrough + case rfcparser.TokenTypeCR: + fallthrough + case rfcparser.TokenTypeTab: + fallthrough + case rfcparser.TokenTypeLF: + fallthrough + case rfcparser.TokenTypeSP: + return false + default: + return rfcparser.IsCTL(tokenType) || tokenType == rfcparser.TokenTypeDelete + } +} + +func isVChar(tokenType rfcparser.TokenType) bool { + // VChar %x21-7E + if rfcparser.IsCTL(tokenType) || + tokenType == rfcparser.TokenTypeDelete || + tokenType == rfcparser.TokenTypeError || + tokenType == rfcparser.TokenTypeEOF { + return false + } + + return true +} diff --git a/rfc5322/cfws_test.go b/rfc5322/cfws_test.go new file mode 100644 index 00000000..3f991dc6 --- /dev/null +++ b/rfc5322/cfws_test.go @@ -0,0 +1,58 @@ +package rfc5322 + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestParseFWS(t *testing.T) { + inputs := []string{ + " \t ", + "\r\n\t", + " \r\n\t", + " \r\n \r\n \r\n\t", + " \t\r\n ", + } + + for _, i := range inputs { + p := newTestRFCParser(i) + err := parseFWS(p) + require.NoError(t, err) + } +} + +func TestParserComment(t *testing.T) { + inputs := []string{ + "(my comment here)", + "(my comment here )", + "( my comment here)", + "( my comment here )", + "(my\r\n comment here)", + "(my\r\n (comment) here)", + "(\\my\r\n (comment) here)", + "(" + string([]byte{0x7F, 0x8}) + ")", + } + + for _, i := range inputs { + p := newTestRFCParser(i) + err := parseComment(p) + require.NoError(t, err) + } +} + +func TestParserCFWS(t *testing.T) { + inputs := []string{ + " ", + "(my comment here)", + " (my comment here) ", + " \r\n (my comment here) ", + " \r\n \r\n (my comment here) \r\n ", + } + + for _, i := range inputs { + p := newTestRFCParser(i) + err := parseCFWS(p) + require.NoError(t, err) + } +} diff --git a/rfc5322/miscelleaneous.go b/rfc5322/miscelleaneous.go new file mode 100644 index 00000000..18b80552 --- /dev/null +++ b/rfc5322/miscelleaneous.go @@ -0,0 +1,80 @@ +package rfc5322 + +import ( + "github.com/ProtonMail/gluon/rfcparser" +) + +// 3.2.5. Miscellaneous Tokens + +func parseWord(p *rfcparser.Parser) (parserString, error) { + // word = atom / quoted-string + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + if p.Check(rfcparser.TokenTypeEqual) { + return parseEncodedAtom(p) + } + + if p.Check(rfcparser.TokenTypeDQuote) { + return parseQuotedString(p) + } + + result, err := parseAtom(p) + if err != nil { + return parserString{}, err + } + + return result, nil +} + +func parsePhrase(p *rfcparser.Parser) ([]parserString, error) { + // nolint:dupword + // phrase = 1*word / obs-phrase + // obs-phrase = word *(word / "." / CFWS) + // This version has been extended to allow '@' to appear in obs-phrase + word, err := parseWord(p) + if err != nil { + return nil, err + } + + var result = []parserString{word} + + isSep := func(tokenType rfcparser.TokenType) bool { + return tokenType == rfcparser.TokenTypePeriod || tokenType == rfcparser.TokenTypeAt + } + + for { + // check period case + if ok, err := p.MatchesWith(isSep); err != nil { + return nil, err + } else if ok { + prevToken := p.PreviousToken() + result = append(result, parserString{ + String: rfcparser.String{ + Value: string(prevToken.Value), + Offset: prevToken.Offset, + }, + Type: parserStringTypeUnspaced, + }) + continue + } + + if _, err := tryParseCFWS(p); err != nil { + return nil, err + } + + if !(p.CheckWith(isAText) || p.Check(rfcparser.TokenTypeDQuote)) { + break + } + + nextWord, err := parseWord(p) + if err != nil { + return nil, err + } + + result = append(result, nextWord) + } + + return result, nil +} diff --git a/rfc5322/miscelleaneous_test.go b/rfc5322/miscelleaneous_test.go new file mode 100644 index 00000000..8248c4c9 --- /dev/null +++ b/rfc5322/miscelleaneous_test.go @@ -0,0 +1,41 @@ +package rfc5322 + +import ( + "testing" + + "github.com/bradenaw/juniper/xslices" + "github.com/stretchr/testify/require" +) + +func TestParseWord(t *testing.T) { + inputs := map[string]string{ + `"f\".c"`: "f\".c", + "\" \r\n f\\\".c\r\n \"": " f\".c ", + ` " foo bar derer " `: " foo bar derer ", + `foo`: "foo", + } + + for i, e := range inputs { + p := newTestRFCParser(i) + v, err := parseWord(p) + require.NoError(t, err) + require.Equal(t, e, v.String.Value) + } +} + +func TestParsePhrase(t *testing.T) { + inputs := map[string][]string{ + `foo "quoted"`: {"foo", "quoted"}, + `"f\".c" "quoted"`: {"f\".c", "quoted"}, + `foo bar`: {"foo", "bar"}, + `foo.bar`: {"foo", ".", "bar"}, + `foo . bar`: {"foo", ".", "bar"}, + } + + for i, e := range inputs { + p := newTestRFCParser(i) + v, err := parsePhrase(p) + require.NoError(t, err) + require.Equal(t, e, xslices.Map(v, func(v parserString) string { return v.String.Value })) + } +} diff --git a/rfc5322/parser.go b/rfc5322/parser.go new file mode 100644 index 00000000..250dde14 --- /dev/null +++ b/rfc5322/parser.go @@ -0,0 +1,83 @@ +package rfc5322 + +import ( + "net/mail" + + "github.com/ProtonMail/gluon/rfcparser" +) + +type Parser struct { + source *BacktrackingByteScanner + scanner *rfcparser.Scanner + parser *rfcparser.Parser +} + +type parserStringType int + +const ( + parserStringTypeOther parserStringType = iota + parserStringTypeUnspaced + parserStringTypeEncoded +) + +type parserString struct { + String rfcparser.String + Type parserStringType +} + +func ParseAddress(input string) ([]*mail.Address, error) { + source := NewBacktrackingByteScanner([]byte(input)) + scanner := rfcparser.NewScannerWithReader(source) + parser := rfcparser.NewParser(scanner) + + p := Parser{ + source: source, + scanner: scanner, + parser: parser, + } + + if err := p.parser.Advance(); err != nil { + return nil, err + } + + addr, _, err := parseAddress(&p) + + return addr, err +} + +func ParseAddressList(input string) ([]*mail.Address, error) { + source := NewBacktrackingByteScanner([]byte(input)) + scanner := rfcparser.NewScannerWithReader(source) + parser := rfcparser.NewParser(scanner) + + p := Parser{ + source: source, + scanner: scanner, + parser: parser, + } + + if err := p.parser.Advance(); err != nil { + return nil, err + } + + return parseAddressList(&p) +} + +type ParserState struct { + scanner BacktrackingByteScannerScope + parser rfcparser.ParserState +} + +func (p *Parser) SaveState() ParserState { + scannerScope := p.source.SaveState() + + return ParserState{ + scanner: scannerScope, + parser: p.parser.SaveState(), + } +} + +func (p *Parser) RestoreState(s ParserState) { + p.source.RestoreState(s.scanner) + p.parser.RestoreState(s.parser) +} diff --git a/rfc5322/parser_test.go b/rfc5322/parser_test.go new file mode 100644 index 00000000..a74554e5 --- /dev/null +++ b/rfc5322/parser_test.go @@ -0,0 +1,832 @@ +package rfc5322 + +import ( + "bytes" + "net/mail" + "testing" + + "github.com/ProtonMail/gluon/rfcparser" + "github.com/stretchr/testify/assert" +) + +func newTestRFCParser(s string) *rfcparser.Parser { + p := rfcparser.NewParser(rfcparser.NewScanner(bytes.NewReader([]byte(s)))) + if p.Advance() != nil { + panic("failed to advance parser") + } + + return p +} + +func TestParseAddress(t *testing.T) { + tests := []struct { + input string + addrs []*mail.Address + }{ + { + input: `user@example.com`, + addrs: []*mail.Address{{ + Address: `user@example.com`, + }}, + }, + { + input: `John Doe `, + addrs: []*mail.Address{{ + Name: `John Doe`, + Address: `jdoe@machine.example`, + }}, + }, + { + input: `Mary Smith `, + addrs: []*mail.Address{{ + Name: `Mary Smith`, + Address: `mary@example.net`, + }}, + }, + { + input: `"Joe Q. Public" `, + addrs: []*mail.Address{{ + Name: `Joe Q. Public`, + Address: `john.q.public@example.com`, + }}, + }, + { + input: `Mary Smith `, + addrs: []*mail.Address{{ + Name: `Mary Smith`, + Address: `mary@x.test`, + }}, + }, + { + input: `jdoe@example.org`, + addrs: []*mail.Address{{ + Address: `jdoe@example.org`, + }}, + }, + { + input: `Who? `, + addrs: []*mail.Address{{ + Name: `Who?`, + Address: `one@y.test`, + }}, + }, + { + input: ``, + addrs: []*mail.Address{{ + Address: `boss@nil.test`, + }}, + }, + { + input: `"Giant; \"Big\" Box" `, + addrs: []*mail.Address{{ + Name: `Giant; "Big" Box`, + Address: `sysservices@example.net`, + }}, + }, + { + input: `Pete `, + addrs: []*mail.Address{{ + Name: `Pete`, + Address: `pete@silly.example`, + }}, + }, + { + input: `"Mary Smith: Personal Account" `, + addrs: []*mail.Address{{ + Name: `Mary Smith: Personal Account`, + Address: `smith@home.example`, + }}, + }, + { + input: `Pete(A nice \) chap) `, + addrs: []*mail.Address{{ + Name: `Pete`, + Address: `pete@silly.test`, + }}, + }, + { + input: `Gogh Fir `, + addrs: []*mail.Address{{ + Name: `Gogh Fir`, + Address: `gf@example.com`, + }}, + }, + { + input: `normal name `, + addrs: []*mail.Address{{ + Name: `normal name`, + Address: `username@server.com`, + }}, + }, + { + input: `"comma, name" `, + addrs: []*mail.Address{{ + Name: `comma, name`, + Address: `username@server.com`, + }}, + }, + { + input: `name (ignore comment)`, + addrs: []*mail.Address{{ + Name: `name`, + Address: `username@server.com`, + }}, + }, + { + input: `"Mail Robot" <>`, + addrs: []*mail.Address{{ + Name: `Mail Robot`, + }}, + }, + { + input: `Michal Hořejšek `, + addrs: []*mail.Address{{ + Name: `Michal Hořejšek`, + Address: `hořejšek@mail.com`, // Not his real address. + }}, + }, + { + input: `First Last `, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Last `, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@domain.com.`, + }}, + }, + { + input: `First Last `, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@domain.com.`, + }}, + }, + { + input: `First Last `, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@domain.com:25`, + }}, + }, + { + input: `First Last `, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@[10.0.0.1]`, + }}, + }, + { + input: ``, + addrs: []*mail.Address{{ + Address: `postmaster@[10.10.10.10]`, + }}, + }, + { + input: `First Last < user@domain.com>`, + addrs: []*mail.Address{{ + Name: `First Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `user@domain.com,`, + addrs: []*mail.Address{{ + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle "Last" `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle Last `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle"Last" `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle "Last"`, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First "Middle" "Last" `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First "Middle""Last" `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `first.last `, + addrs: []*mail.Address{{ + Name: `first.last`, + Address: `user@domain.com`, + }}, + }, + { + input: `first . last `, + addrs: []*mail.Address{{ + Name: `first.last`, + Address: `user@domain.com`, + }}, + }, + } + for _, test := range tests { + test := test + + t.Run(test.input, func(t *testing.T) { + addrs, err := ParseAddress(test.input) + assert.NoError(t, err) + assert.ElementsMatch(t, test.addrs, addrs) + }) + } +} + +func TestParseAddressList(t *testing.T) { + tests := []struct { + input string + addrs []*mail.Address + }{ + { + input: `Alice , Bob , Eve `, + addrs: []*mail.Address{ + { + Name: `Alice`, + Address: `alice@example.com`, + }, + { + Name: `Bob`, + Address: `bob@example.com`, + }, + { + Name: `Eve`, + Address: `eve@example.com`, + }, + }, + }, + { + input: `Alice ; Bob ; Eve `, + addrs: []*mail.Address{ + { + Name: `Alice`, + Address: `alice@example.com`, + }, + { + Name: `Bob`, + Address: `bob@example.com`, + }, + { + Name: `Eve`, + Address: `eve@example.com`, + }, + }, + }, + { + input: `Ed Jones ,joe@where.test,John `, + addrs: []*mail.Address{ + { + Name: `Ed Jones`, + Address: `c@a.test`, + }, + { + Address: `joe@where.test`, + }, + { + Name: `John`, + Address: `jdoe@one.test`, + }, + }, + }, + { + input: `name (ignore comment) , (Comment as name) username2@server.com`, + addrs: []*mail.Address{ + { + Name: `name`, + Address: `username@server.com`, + }, + { + Address: `username2@server.com`, + }, + }, + }, + { + input: `"normal name" , "comma, name" `, + addrs: []*mail.Address{ + { + Name: `normal name`, + Address: `username@server.com`, + }, + { + Name: `comma, name`, + Address: `address@server.com`, + }, + }, + }, + { + input: `"comma, one" , "comma, two" `, + addrs: []*mail.Address{ + { + Name: `comma, one`, + Address: `username@server.com`, + }, + { + Name: `comma, two`, + Address: `address@server.com`, + }, + }, + }, + { + input: `normal name , (comment)All.(around)address@(the)server.com`, + addrs: []*mail.Address{ + { + Name: `normal name`, + Address: `username@server.com`, + }, + { + Address: `All.address@server.com`, + }, + }, + }, + { + input: `normal name , All.("comma, in comment")address@(the)server.com`, + addrs: []*mail.Address{ + { + Name: `normal name`, + Address: `username@server.com`, + }, + { + Address: `All.address@server.com`, + }, + }, + }, + { + input: `Alice , Group:foo@bar;, bar@bar`, + addrs: []*mail.Address{ + { + Name: `Alice`, + Address: `alice@example.com`, + }, + { + Name: ``, + Address: `foo@bar`, + }, + { + Name: ``, + Address: `bar@bar`, + }, + }, + }, + { + input: `user@domain `, + addrs: []*mail.Address{{ + Name: `user@domain`, + Address: `user@domain.com`, + }}, + }, + { + input: `user @ domain `, + addrs: []*mail.Address{{ + Name: `user@domain`, + Address: `user@domain.com`, + }}, + }, + } + for _, test := range tests { + test := test + + t.Run(test.input, func(t *testing.T) { + addrs, err := ParseAddressList(test.input) + assert.NoError(t, err) + assert.ElementsMatch(t, test.addrs, addrs) + }) + } +} + +func TestParseGroup(t *testing.T) { + tests := []struct { + input string + addrs []*mail.Address + }{ + { + input: `A Group:Ed Jones ,joe@where.test,John ;`, + addrs: []*mail.Address{ + { + Name: `Ed Jones`, + Address: `c@a.test`, + }, + { + Address: `joe@where.test`, + }, + { + Name: `John`, + Address: `jdoe@one.test`, + }, + }, + }, + { + input: `undisclosed recipients:;`, + addrs: []*mail.Address{}, + }, + { + // We permit the group to not end in a semicolon, although as per RFC5322 it really should. + input: `undisclosed recipients:`, + addrs: []*mail.Address{}, + }, + { + // We permit the group to be surrounded with quotes, although as per RFC5322 it really shouldn't be. + input: `"undisclosed recipients:"`, + addrs: []*mail.Address{}, + }, + { + // We permit the group to be surrounded with quotes, although as per RFC5322 it really shouldn't be. + input: `"undisclosed recipients:;"`, + addrs: []*mail.Address{}, + }, + { + input: `undisclosed recipients:, foo@bar`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + }, + }, + { + input: `undisclosed recipients:;, foo@bar`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + }, + }, + { + input: `undisclosed recipients:bar@bar;, foo@bar`, + addrs: []*mail.Address{ + { + Address: `bar@bar`, + }, + { + Address: `foo@bar`, + }, + }, + }, + { + input: `"undisclosed recipients:", foo@bar`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + }, + }, + { + input: `(Empty list)(start)Hidden recipients :(nobody(that I know)) ;`, + addrs: []*mail.Address{}, + }, + { + input: `foo@bar, g:bar@bar; z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `bar@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, g:bar@bar;; z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `bar@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, g:bar@bar;, z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `bar@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, g:; z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, g:;; z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, g:;, z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + { + input: `foo@bar, "g:;", z@z`, + addrs: []*mail.Address{ + { + Address: `foo@bar`, + }, + { + Address: `z@z`, + }, + }, + }, + } + for _, test := range tests { + test := test + + t.Run(test.input, func(t *testing.T) { + addrs, err := ParseAddressList(test.input) + assert.NoError(t, err) + assert.ElementsMatch(t, test.addrs, addrs) + }) + } +} + +func TestParseSingleAddressEncodedWord(t *testing.T) { + tests := []struct { + input string + addrs []*mail.Address + }{ + { + input: `=?US-ASCII?Q?Keith_Moore?= `, + addrs: []*mail.Address{{ + Name: `Keith Moore`, + Address: `moore@cs.utk.edu`, + }}, + }, + { + input: `=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= `, + addrs: []*mail.Address{{ + Name: `Keld Jørn Simonsen`, + Address: `keld@dkuug.dk`, + }}, + }, + { + input: `=?ISO-8859-1?Q?Andr=E9?= Pirard `, + addrs: []*mail.Address{{ + Name: `André Pirard`, + Address: `PIRARD@vm1.ulg.ac.be`, + }}, + }, + { + input: `=?ISO-8859-1?Q?Olle_J=E4rnefors?= `, + addrs: []*mail.Address{{ + Name: `Olle Järnefors`, + Address: `ojarnef@admin.kth.se`, + }}, + }, + { + input: `=?ISO-8859-1?Q?Patrik_F=E4ltstr=F6m?= `, + addrs: []*mail.Address{{ + Name: `Patrik Fältström`, + Address: `paf@nada.kth.se`, + }}, + }, + { + input: `Nathaniel Borenstein (=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=)`, + addrs: []*mail.Address{{ + Name: `Nathaniel Borenstein`, + Address: `nsb@thumper.bellcore.com`, + }}, + }, + { + input: `=?UTF-8?B?PEJlemUgam3DqW5hPg==?= `, + addrs: []*mail.Address{{ + Name: ``, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle =?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle=?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First Middle=?utf-8?Q?Last?=`, + Address: `user@domain.com`, + }}, + }, + { + input: `First Middle =?utf-8?Q?Last?=`, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First =?utf-8?Q?Middle?= =?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First MiddleLast`, + Address: `user@domain.com`, + }}, + }, + { + input: `First =?utf-8?Q?Middle?==?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First MiddleLast`, + Address: `user@domain.com`, + }}, + }, + { + input: `First "Middle"=?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First "Middle" =?utf-8?Q?Last?= `, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `First "Middle" =?utf-8?Q?Last?=`, + addrs: []*mail.Address{{ + Name: `First Middle Last`, + Address: `user@domain.com`, + }}, + }, + { + input: `=?UTF-8?B?PEJlemUgam3DqW5hPg==?= `, + addrs: []*mail.Address{{ + Name: ``, + Address: `user@domain.com`, + }}, + }, + } + for _, test := range tests { + test := test + + t.Run(test.input, func(t *testing.T) { + addrs, err := ParseAddressList(test.input) + assert.NoError(t, err) + assert.ElementsMatch(t, test.addrs, addrs) + }) + } +} + +func TestParseAddressInvalid(t *testing.T) { + inputs := []string{ + `user@domain...com`, + `"comma, name" , another, name `, + `username`, + `=?ISO-8859-2?Q?First_Last?= , `, + `=?windows-1250?Q?Spr=E1vce_syst=E9mu?=`, + `"'user@domain.com.'"`, + ``, + `"Mail Delivery System <>" <@>`, + } + + for _, test := range inputs { + test := test + + t.Run(test, func(t *testing.T) { + _, err := ParseAddressList(test) + assert.Error(t, err) + assert.True(t, rfcparser.IsError(err)) + }) + } +} + +func TestParseAddressListEmoji(t *testing.T) { + input := `=?utf-8?q?Goce_Test_=F0=9F=A4=A6=F0=9F=8F=BB=E2=99=82=F0=9F=99=88?= =?utf-8?q?=F0=9F=8C=B2=E2=98=98=F0=9F=8C=B4?= , "Proton GMX Edit" , "beta@bar.com" , "testios12" , "random@bar.com" , =?utf-8?q?=C3=9C=C3=A4=C3=B6_Jakdij?= , =?utf-8?q?Q=C3=A4_T=C3=B6=C3=BCst_12_Edit?= , =?utf-8?q?=E2=98=98=EF=B8=8F=F0=9F=8C=B2=F0=9F=8C=B4=F0=9F=99=82=E2=98=BA?= =?utf-8?q?=EF=B8=8F=F0=9F=98=83?= , "Somebody Outlook" ` + expected := []*mail.Address{ + { + Name: "Goce Test 🤦🏻♂🙈🌲☘🌴", + Address: "foo@bar.com", + }, + { + Name: "Proton GMX Edit", + Address: "z@bar.com", + }, + { + Name: "beta@bar.com", + Address: "beta@bar.com", + }, + { + Name: "testios12", + Address: "random@bar.com", + }, + { + Name: "random@bar.com", + Address: "random@bar.com", + }, + { + Name: "Üäö Jakdij", + Address: "another@bar.com", + }, + { + Name: "Qä Töüst 12 Edit", + Address: "random2@bar.com", + }, + { + Name: "☘️🌲🌴🙂☺️😃", + Address: "dust@bar.com", + }, + { + Name: "Somebody Outlook", + Address: "hotmal@bar.com", + }, + } + + addrs, err := ParseAddressList(input) + assert.NoError(t, err) + assert.ElementsMatch(t, expected, addrs) +} + +func TestParserAddressEmailValidation(t *testing.T) { + inputs := []string{ + "test@io", + "test@iana.org", + "test@nominet.org.uk", + "test@about.museum", + "a@iana.org", + "test.test@iana.org", + "!#$%&`*+/=?^`{|}~@iana.org", + "123@iana.org", + "test@123.com", + "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghiklm@iana.org", + "test@mason-dixon.com", + "test@c--n.com", + "test@xn--hxajbheg2az3al.xn--jxalpdlp", + "xn--test@iana.org", + "1@pm.me", + } + + for _, test := range inputs { + test := test + + t.Run(test, func(t *testing.T) { + _, err := ParseAddressList(test) + assert.NoError(t, err) + }) + } +} diff --git a/rfc5322/quoted.go b/rfc5322/quoted.go new file mode 100644 index 00000000..fd9ef9df --- /dev/null +++ b/rfc5322/quoted.go @@ -0,0 +1,95 @@ +package rfc5322 + +// 3.2.4. Quoted Strings + +import "github.com/ProtonMail/gluon/rfcparser" + +func parseQuotedString(p *rfcparser.Parser) (parserString, error) { + var result rfcparser.Bytes + result.Offset = p.CurrentToken().Offset + + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + if err := p.Consume(rfcparser.TokenTypeDQuote, "expected \" for quoted string start"); err != nil { + return parserString{}, err + } + + for { + if ok, err := tryParseFWS(p); err != nil { + return parserString{}, err + } else if ok { + result.Value = append(result.Value, ' ') + } + + if !(p.CheckWith(isQText) || p.Check(rfcparser.TokenTypeBackslash)) { + break + } + + if p.CheckWith(isQText) { + b, err := parseQContent(p) + if err != nil { + return parserString{}, err + } + + result.Value = append(result.Value, b) + } else { + b, err := parseQuotedPair(p) + if err != nil { + return parserString{}, err + } + + result.Value = append(result.Value, b) + } + } + + if ok, err := tryParseFWS(p); err != nil { + return parserString{}, err + } else if ok { + result.Value = append(result.Value, ' ') + } + + if err := p.Consume(rfcparser.TokenTypeDQuote, "expected \" for quoted string end"); err != nil { + return parserString{}, err + } + + if _, err := tryParseCFWS(p); err != nil { + return parserString{}, err + } + + return parserString{ + String: result.IntoString(), + Type: parserStringTypeOther, + }, nil +} + +func parseQContent(p *rfcparser.Parser) (byte, error) { + if ok, err := p.MatchesWith(isQText); err != nil { + return 0, err + } else if ok { + return p.PreviousToken().Value, nil + } + + return parseQuotedPair(p) +} + +func isQText(tokenType rfcparser.TokenType) bool { + // qtext = %d33 / ; Printable US-ASCII + // %d35-91 / ; characters not including + // %d93-126 / ; "\" or the quote character + // obs-qtext + // + // obs-qtext = obs-NO-WS-CTL + // + if (rfcparser.IsCTL(tokenType) && !isObsNoWSCTL(tokenType)) || + tokenType == rfcparser.TokenTypeDQuote || + tokenType == rfcparser.TokenTypeBackslash || + tokenType == rfcparser.TokenTypeSP || + tokenType == rfcparser.TokenTypeEOF || + tokenType == rfcparser.TokenTypeError { + return false + } + + return true +} diff --git a/rfc5322/quoted_test.go b/rfc5322/quoted_test.go new file mode 100644 index 00000000..a66a1eaf --- /dev/null +++ b/rfc5322/quoted_test.go @@ -0,0 +1,22 @@ +package rfc5322 + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestQuotedString(t *testing.T) { + inputs := map[string]string{ + `"f\".c"`: "f\".c", + "\" \r\n f\\\".c\r\n \"": " f\".c ", + ` " foo bar derer " `: " foo bar derer ", + } + + for i, e := range inputs { + p := newTestRFCParser(i) + v, err := parseQuotedString(p) + require.NoError(t, err) + require.Equal(t, e, v.String.Value) + } +} diff --git a/rfcparser/parser.go b/rfcparser/parser.go index 3623f954..ddde24cf 100644 --- a/rfcparser/parser.go +++ b/rfcparser/parser.go @@ -57,6 +57,11 @@ func IsError(err error) bool { return errors.As(err, &perr) } +type ParserState struct { + prevToken Token + curToken Token +} + func NewParser(s *Scanner) *Parser { return &Parser{scanner: s} } @@ -476,6 +481,22 @@ func (p *Parser) MakeErrorAtOffset(err string, offset int) error { } } +// SaveState saves the current and previous token state so it can potentially be restored later with RestoreState. +func (p *Parser) SaveState() ParserState { + return ParserState{ + prevToken: p.previousToken, + curToken: p.currentToken, + } +} + +// RestoreState restores the previous and current tokens from the given state. +// NOTE: If this is called without adjusting the scanner input to the location where these were recorded +// you can break your parsing. +func (p *Parser) RestoreState(state ParserState) { + p.previousToken = state.prevToken + p.currentToken = state.curToken +} + func IsAStringChar(tokenType TokenType) bool { /* ASTRING-CHAR = ATOM-CHAR / resp-specials @@ -519,7 +540,7 @@ func IsQuotedChar(tokenType TokenType) bool { } func IsCTL(tokenType TokenType) bool { - return tokenType == TokenTypeCTL || tokenType == TokenTypeCR || tokenType == TokenTypeLF + return tokenType == TokenTypeCTL || tokenType == TokenTypeCR || tokenType == TokenTypeLF || tokenType == TokenTypeTab } func ByteToInt(b byte) int { diff --git a/rfcparser/scanner.go b/rfcparser/scanner.go index bffdfa43..7cb999b7 100644 --- a/rfcparser/scanner.go +++ b/rfcparser/scanner.go @@ -51,6 +51,9 @@ const ( TokenTypeCR TokenTypeLF TokenTypeCTL + TokenTypeTab + TokenTypeDelete + TokenTypeZero ) type Token struct { @@ -128,13 +131,18 @@ func (s *Scanner) ScanToken() (Token, error) { } if isByteCTL(b) { - if b == '\r' { + switch b { + case 0x0: + return s.makeToken(TokenTypeZero), nil + case '\r': return s.makeToken(TokenTypeCR), nil - } else if b == '\n' { + case '\n': return s.makeToken(TokenTypeLF), nil + case '\t': + return s.makeToken(TokenTypeTab), nil + default: + return s.makeToken(TokenTypeCTL), nil } - - return s.makeToken(TokenTypeCTL), nil } switch b { @@ -204,6 +212,8 @@ func (s *Scanner) ScanToken() (Token, error) { return s.makeToken(TokenTypeRCurly), nil case '~': return s.makeToken(TokenTypeTilde), nil + case 0x7F: + return s.makeToken(TokenTypeDelete), nil } return Token{}, fmt.Errorf("unexpected character %v", b)