Convert page titles and descriptions to UTF-8 before use.

Adds charencode.go, which detects the character encoding of a byte slice with
github.com/saintfish/chardet and decodes Shift_JIS, EUC-JP and ISO-2022-JP via
golang.org/x/text. mklink.go now passes the title and the meta description
through ToUTF8. Both dependencies are recorded in Gopkg.toml and Gopkg.lock,
and Travis builds against Go 1.9.x.

diff --git a/.travis.yml b/.travis.yml
index 0884434..26d5c41 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 language: go
 
 go:
-  - 1.9.2
+  - 1.9.x
 
 env:
   - DEP_VERSION="0.3.2"
diff --git a/Gopkg.lock b/Gopkg.lock
index 45bac48..42c7c1e 100644
--- a/Gopkg.lock
+++ b/Gopkg.lock
@@ -37,6 +37,12 @@
   revision = "645ef00459ed84a119197bfb8d8205042c6df63d"
   version = "v0.8.0"
 
+[[projects]]
+  branch = "master"
+  name = "github.com/saintfish/chardet"
+  packages = ["."]
+  revision = "3af4cd4741ca4f3eb0c407c034571a6fb0ea529c"
+
 [[projects]]
   name = "github.com/spf13/cobra"
   packages = ["."]
@@ -67,9 +73,15 @@
   packages = ["unix"]
   revision = "75813c647272dd855bda156405bf844a5414f5bf"
 
+[[projects]]
+  branch = "master"
+  name = "golang.org/x/text"
+  packages = ["encoding","encoding/internal","encoding/internal/identifier","encoding/japanese","internal/gen","transform","unicode/cldr"]
+  revision = "75cc3cad82b5f47d3fb229ddda8c5167da14f294"
+
 [solve-meta]
   analyzer-name = "dep"
   analyzer-version = 1
-  inputs-digest = "30c51b72b39edcaf014ce774692d9b3e90db85c627a7db67589230d137f57eef"
+  inputs-digest = "f5b4f0260ca993878fb958e8cd706e3762b4b84c4435dab35d75bfe2260366ff"
   solver-name = "gps-cdcl"
   solver-version = 1
diff --git a/Gopkg.toml b/Gopkg.toml
index dfa2dee..f472e92 100644
--- a/Gopkg.toml
+++ b/Gopkg.toml
@@ -54,3 +54,11 @@ ignored = [
 [[constraint]]
   name = "github.com/atotto/clipboard"
   branch = "master"
+
+[[constraint]]
+  name = "github.com/saintfish/chardet"
+  branch = "master"
+
+[[constraint]]
+  name = "golang.org/x/text"
+  branch = "master"
diff --git a/charencode.go b/charencode.go
new file mode 100644
--- /dev/null
+++ b/charencode.go
@@ -0,0 +1,120 @@
+package mklink
+
+import (
+	"bytes"
+	"io"
+	"unicode/utf8"
+
+	"github.com/saintfish/chardet"
+	"golang.org/x/text/encoding/japanese"
+	"golang.org/x/text/transform"
+)
+
+// CharEncode identifies a character encoding handled by this package.
+type CharEncode int
+
+const (
+	// CharUnknown is an undetected or unsupported encoding.
+	CharUnknown CharEncode = iota
+	// CharUTF8 is UTF-8.
+	CharUTF8
+	// CharISO8859_1 is ISO-8859-1.
+	CharISO8859_1
+	// CharShiftJIS is Shift_JIS.
+	CharShiftJIS
+	// CharEUCJP is EUC-JP.
+	CharEUCJP
+	// CharISO2022JP is ISO-2022-JP.
+	CharISO2022JP
+)
+
+// charEncodeMap gives the charset name of every supported CharEncode; it is
+// used for both String and the reverse lookup in TypeofCharEncode.
+var charEncodeMap = map[CharEncode]string{
+	CharUTF8:      "UTF-8",
+	CharISO8859_1: "ISO-8859-1",
+	CharShiftJIS:  "Shift_JIS",
+	CharEUCJP:     "EUC-JP",
+	CharISO2022JP: "ISO-2022-JP",
+}
+
+// TypeofCharEncode returns the CharEncode whose charset name is exactly s, or
+// CharUnknown when s names no supported encoding.
+func TypeofCharEncode(s string) CharEncode {
+	for key, value := range charEncodeMap {
+		if value == s {
+			return key
+		}
+	}
+	return CharUnknown
+}
+
+// String returns the charset name of e, or "unknown" for values that have no
+// entry in charEncodeMap.
+func (e CharEncode) String() string {
+	if name, ok := charEncodeMap[e]; ok {
+		return name
+	}
+	return "unknown"
+}
+
+// DetectCharEncode guesses the character encoding of body with chardet. It
+// returns CharUnknown when detection fails or reports a charset that is not in
+// charEncodeMap.
+func DetectCharEncode(body []byte) CharEncode {
+	det := chardet.NewTextDetector()
+	res, err := det.DetectBest(body)
+	if err != nil {
+		return CharUnknown
+	}
+	return TypeofCharEncode(res.Charset)
+}
+
+// ToUTF8 converts body to a UTF-8 string according to the encoding reported by
+// DetectCharEncode.
+//
+// It returns "" when no conversion is possible: the encoding is unsupported and
+// body is not already valid UTF-8, or the decoder reports an error.
+func ToUTF8(body []byte) string {
+	var trans transform.Transformer
+	switch DetectCharEncode(body) {
+	case CharUTF8:
+		return string(body)
+	case CharISO8859_1:
+		// Detection is only a guess, so text that is already valid UTF-8 is
+		// kept as is instead of being decoded a second time.
+		if utf8.Valid(body) {
+			return string(body)
+		}
+		return latin1ToUTF8(body)
+	case CharShiftJIS:
+		trans = japanese.ShiftJIS.NewDecoder()
+	case CharEUCJP:
+		trans = japanese.EUCJP.NewDecoder()
+	case CharISO2022JP:
+		trans = japanese.ISO2022JP.NewDecoder()
+	default:
+		// Unsupported or undetected charset: keep text that is already valid
+		// UTF-8 rather than discarding it.
+		if utf8.Valid(body) {
+			return string(body)
+		}
+		return ""
+	}
+	buf := new(bytes.Buffer)
+	if _, err := io.Copy(buf, transform.NewReader(bytes.NewReader(body), trans)); err != nil {
+		// A partially decoded string would be misleading; report failure.
+		return ""
+	}
+	return buf.String()
+}
+
+// latin1ToUTF8 decodes ISO-8859-1 bytes. Each byte value equals the Unicode
+// code point of the character it encodes.
+func latin1ToUTF8(body []byte) string {
+	runes := make([]rune, len(body))
+	for i, b := range body {
+		runes[i] = rune(b)
+	}
+	return string(runes)
+}
diff --git a/mklink.go b/mklink.go
index 3dc1597..0a9973f 100644
--- a/mklink.go
+++ b/mklink.go
@@ -32,15 +32,16 @@ func New(url string) (*Link, error) {
 
 	doc.Find("head").Each(func(_ int, s *goquery.Selection) {
 		s.Find("title").Each(func(_ int, s *goquery.Selection) {
-			t := s.Text()
-			if utf8.ValidString(t) {
+			t := ToUTF8([]byte(s.Text()))
+			if len(t) > 0 && utf8.ValidString(t) {
 				link.Title = trimString(t)
 			}
 		})
 		s.Find("meta[name='description']").Each(func(_ int, s *goquery.Selection) {
 			if v, ok := s.Attr("content"); ok {
-				if utf8.ValidString(v) {
-					link.Description = trimString(v)
+				d := ToUTF8([]byte(v))
+				if len(d) > 0 && utf8.ValidString(d) {
+					link.Description = trimString(d)
 				}
 			}
 		})