Skip to content

Commit

Permalink
Merge pull request #510 from projectdiscovery/issue-441-utf8
Browse files: browse the repository at this point in the history
Adding support for euc-kr charset
  • Loading branch information
ehsandeep authored Feb 7, 2022
2 parents 484e8ee + eb37875 commit bb5de05
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 6 deletions.
6 changes: 6 additions & 0 deletions common/httpx/encodings.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"io/ioutil"

"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/transform"
Expand Down Expand Up @@ -43,3 +44,8 @@ func Encodebig5(s []byte) ([]byte, error) {
}
return d, nil
}

// DecodeKorean interprets s as EUC-KR encoded text and returns the decoded
// bytes (UTF-8, as produced by the x/text decoder). Any error reported by the
// decoder is returned to the caller unchanged.
func DecodeKorean(s []byte) ([]byte, error) {
	// A fresh decoder is created per call, so no state carries over between calls.
	return korean.EUCKR.NewDecoder().Bytes(s)
}
18 changes: 12 additions & 6 deletions common/httpx/title.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"regexp"
"strings"

"github.com/projectdiscovery/stringsutil"
"golang.org/x/net/html"
)

Expand Down Expand Up @@ -39,16 +40,21 @@ func ExtractTitle(r *Response) (title string) {

// Non UTF-8
if contentTypes, ok := r.Headers["Content-Type"]; ok {
contentType := strings.Join(contentTypes, ";")
contentType := strings.ToLower(strings.Join(contentTypes, ";"))

// special cases
if strings.Contains(strings.ToLower(contentType), "charset=gb2312") ||
strings.Contains(strings.ToLower(contentType), "charset=gbk") {
switch {
case stringsutil.ContainsAny(contentType, "charset=gb2312", "charset=gbk"):
titleUtf8, err := Decodegbk([]byte(title))
if err != nil {
return
}

return string(titleUtf8)
case stringsutil.ContainsAny(contentType, "euc-kr"):
titleUtf8, err := DecodeKorean([]byte(title))
if err != nil {
return
}
return string(titleUtf8)
}

Expand All @@ -63,12 +69,12 @@ func ExtractTitle(r *Response) (title string) {
}
mcontentType = strings.ToLower(mcontentType)
}
if strings.Contains(mcontentType, "gb2312") || strings.Contains(mcontentType, "gbk") {
switch {
case stringsutil.ContainsAny(mcontentType, "gb2312", "gbk"):
titleUtf8, err := Decodegbk([]byte(title))
if err != nil {
return
}

return string(titleUtf8)
}
}
Expand Down

0 comments on commit bb5de05

Please sign in to comment.