Skip to content

Commit

Permalink
Add logic: Detect char-encoding and convert to UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
spiegel-im-spiegel committed Dec 4, 2017
1 parent 97b43ec commit fd9bd11
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
language: go

go:
- 1.9.2
- 1.9.x

env:
- DEP_VERSION="0.3.2"
Expand Down
14 changes: 13 additions & 1 deletion Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,11 @@ ignored = [
[[constraint]]
name = "github.com/atotto/clipboard"
branch = "master"

[[constraint]]
name = "github.com/saintfish/chardet"
branch = "master"

[[constraint]]
name = "golang.org/x/text"
branch = "master"
88 changes: 88 additions & 0 deletions charencode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package mklink

import (
	"bytes"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/saintfish/chardet"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/transform"
)

// CharEncode identifies a character encoding that this package can name.
type CharEncode int

const (
	// CharUnknown is the zero value; it stands for any encoding that is not
	// listed below.
	CharUnknown CharEncode = iota
	// CharUTF8 is UTF-8.
	CharUTF8
	// CharISO8859_1 is ISO-8859-1.
	CharISO8859_1
	// CharShiftJIS is Shift_JIS.
	CharShiftJIS
	// CharEUCJP is EUC-JP.
	CharEUCJP
	// CharISO2022JP is ISO-2022-JP.
	CharISO2022JP
)

// charEncodeMap holds the charset name of every known CharEncode.
// CharUnknown deliberately has no entry.
var charEncodeMap = map[CharEncode]string{
	CharUTF8:      "UTF-8",
	CharISO8859_1: "ISO-8859-1",
	CharShiftJIS:  "Shift_JIS",
	CharEUCJP:     "EUC-JP",
	CharISO2022JP: "ISO-2022-JP",
}

// TypeofCharEncode returns the CharEncode whose charset name is s.
//
// Charset names are case-insensitive, so the comparison ignores letter case
// and surrounding white space: "utf-8" and " UTF-8 " both yield CharUTF8.
// CharUnknown is returned when s matches no known name.
func TypeofCharEncode(s string) CharEncode {
	name := strings.TrimSpace(s)
	for enc, label := range charEncodeMap {
		if strings.EqualFold(label, name) {
			return enc
		}
	}
	return CharUnknown
}

// String returns the charset name of e, or "unknown" when e is CharUnknown
// or any other value without a registered name.
func (e CharEncode) String() string {
	if name, ok := charEncodeMap[e]; ok {
		return name
	}
	return "unknown"
}

// DetectCharEncode guesses the character encoding of body with chardet's
// text detector and maps the reported charset name through TypeofCharEncode.
// CharUnknown is returned when the detector reports an error or names a
// charset that TypeofCharEncode does not recognize.
func DetectCharEncode(body []byte) CharEncode {
	best, err := chardet.NewTextDetector().DetectBest(body)
	if err == nil {
		return TypeofCharEncode(best.Charset)
	}
	return CharUnknown
}

// ToUTF8 converts body to a UTF-8 string, using the encoding reported by
// DetectCharEncode.
//
//   - UTF-8 input is returned unchanged.
//   - Input detected as ISO-8859-1 is returned unchanged when it is already
//     valid UTF-8 (which includes plain ASCII); otherwise every byte is
//     mapped to the Unicode code point of the same value.
//   - Shift_JIS, EUC-JP and ISO-2022-JP input is run through the matching
//     golang.org/x/text decoder.
//
// The empty string is returned when the encoding is not recognized or when
// decoding fails.
func ToUTF8(body []byte) string {
	var trans transform.Transformer
	switch DetectCharEncode(body) {
	case CharUTF8:
		return string(body)
	case CharISO8859_1:
		if utf8.Valid(body) {
			// Nothing to convert; re-encoding valid UTF-8 byte by byte
			// would corrupt any multi-byte sequence in it.
			return string(body)
		}
		// ISO-8859-1 byte values coincide with code points U+0000..U+00FF,
		// so widening each byte to a rune is the whole conversion.
		runes := make([]rune, len(body))
		for i, b := range body {
			runes[i] = rune(b)
		}
		return string(runes)
	case CharShiftJIS:
		trans = japanese.ShiftJIS.NewDecoder()
	case CharEUCJP:
		trans = japanese.EUCJP.NewDecoder()
	case CharISO2022JP:
		trans = japanese.ISO2022JP.NewDecoder()
	default:
		return ""
	}

	var buf bytes.Buffer
	if _, err := io.Copy(&buf, transform.NewReader(bytes.NewReader(body), trans)); err != nil {
		// Report failure the same way as an unknown encoding instead of
		// handing back a silently truncated string.
		return ""
	}
	return buf.String()
}
9 changes: 5 additions & 4 deletions mklink.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,16 @@ func New(url string) (*Link, error) {

doc.Find("head").Each(func(_ int, s *goquery.Selection) {
s.Find("title").Each(func(_ int, s *goquery.Selection) {
t := s.Text()
if utf8.ValidString(t) {
t := ToUTF8([]byte(s.Text()))
if len(t) > 0 && utf8.ValidString(t) {
link.Title = trimString(t)
}
})
s.Find("meta[name='description']").Each(func(_ int, s *goquery.Selection) {
if v, ok := s.Attr("content"); ok {
if utf8.ValidString(v) {
link.Description = trimString(v)
d := ToUTF8([]byte(v))
if len(d) > 0 && utf8.ValidString(d) {
link.Description = trimString(d)
}
}
})
Expand Down

0 comments on commit fd9bd11

Please sign in to comment.