diff --git a/go.mod b/go.mod
index ecbe34388..f43d30e50 100644
--- a/go.mod
+++ b/go.mod
@@ -22,5 +22,6 @@ require (
 	github.com/sergi/go-diff v1.0.0 // indirect
 	github.com/sirupsen/logrus v1.4.1 // indirect
 	github.com/stretchr/testify v1.3.0 // indirect
+	golang.org/x/net v0.0.0-20181201002055-351d144fa1fc
 	google.golang.org/appengine v1.5.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 7e629bb8c..17e737d3c 100644
--- a/go.sum
+++ b/go.sum
@@ -78,6 +78,7 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTm
 golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33 h1:I6FyU15t786LL7oL/hn43zqTuEGr4PN7F4XJ1p4E3Y8=
 golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 google.golang.org/appengine v1.5.0 h1:KxkO13IPW4Lslp2bz+KHP2E3gtFlrIGNThxkZQ3g+4c=
 google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
diff --git a/pkg/readability/conv.go b/pkg/readability/conv.go
new file mode 100644
index 000000000..d2916494b
--- /dev/null
+++ b/pkg/readability/conv.go
@@ -0,0 +1,15 @@
+package readability
+
+import (
+	"io"
+
+	"golang.org/x/net/html/charset"
+)
+
+// NewUTF8Reader returns a reader that converts the content of reader from
+// sourceCharset to UTF-8. The conversion is streamed: reader is consumed as
+// the returned reader is read. An error is returned when sourceCharset is not
+// a supported charset label.
+func NewUTF8Reader(reader io.Reader, sourceCharset string) (io.Reader, error) {
+	return charset.NewReaderLabel(sourceCharset, reader)
+}
diff --git a/pkg/readability/meta.go b/pkg/readability/meta.go
new file mode 100644
index 000000000..7ed414ed3
--- /dev/null
+++ b/pkg/readability/meta.go
@@ -0,0 +1,76 @@
+package readability
+
+import (
+	"io"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// Meta is a HTML meta tag
+type Meta struct {
+	Name     string
+	Property string
+	Content  string
+}
+
+// Metas is the set of meta tags
+type Metas map[string]*Meta
+
+// GetContent returns the content of the first key that matches a meta tag,
+// or nil when none of the keys is present.
+func (m Metas) GetContent(keys ...string) *string {
+	for _, key := range keys {
+		if m[key] != nil {
+			return &m[key].Content
+		}
+	}
+	return nil
+}
+
+// ExtractMetas extracts meta tags from a HTML document.
+// Parsing stops at the closing head tag or at the end of the document,
+// whichever comes first. Each tag is indexed by its property attribute when
+// set, by its name attribute otherwise; a charset attribute is exposed under
+// the "charset" name.
+func ExtractMetas(doc io.Reader) (Metas, error) {
+	metas := make(Metas)
+	z := html.NewTokenizer(doc)
+	for {
+		tt := z.Next()
+		if tt == html.ErrorToken {
+			if z.Err() == io.EOF {
+				return metas, nil
+			}
+			return nil, z.Err()
+		}
+
+		t := z.Token()
+
+		if t.DataAtom == atom.Head && t.Type == html.EndTagToken {
+			return metas, nil
+		}
+
+		if t.DataAtom == atom.Meta {
+			meta := Meta{}
+			for _, a := range t.Attr {
+				switch a.Key {
+				case "property":
+					meta.Property = a.Val
+				case "name":
+					meta.Name = a.Val
+				case "content":
+					meta.Content = a.Val
+				case "charset":
+					meta.Name = "charset"
+					meta.Content = a.Val
+				}
+			}
+			key := meta.Name
+			if meta.Property != "" {
+				key = meta.Property
+			}
+			metas[key] = &meta
+		}
+	}
+}
diff --git a/pkg/readability/readability.go b/pkg/readability/readability.go
new file mode 100644
index 000000000..ea7c443b2
--- /dev/null
+++ b/pkg/readability/readability.go
@@ -0,0 +1,136 @@
+package readability
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	nurl "net/url"
+	"strings"
+	"time"
+
+	read "github.com/go-shiori/go-readability"
+	"github.com/ncarlier/readflow/pkg/model"
+	"github.com/ncarlier/readflow/pkg/tooling"
+	"golang.org/x/net/html/charset"
+)
+
+// requestTimeout bounds each HTTP request made while fetching an article.
+const requestTimeout = 10 * time.Second
+
+// getContentType sends a HEAD request to url and returns the Content-Type
+// header of the response.
+func getContentType(ctx context.Context, url string) (string, error) {
+	ctx, cancel := context.WithTimeout(ctx, requestTimeout)
+	defer cancel()
+	req, err := http.NewRequest("HEAD", url, nil)
+	if err != nil {
+		return "", err
+	}
+	res, err := http.DefaultClient.Do(req.WithContext(ctx))
+	if err != nil {
+		return "", err
+	}
+	// Always close the body, otherwise the connection is leaked.
+	defer res.Body.Close()
+	return res.Header.Get("Content-type"), nil
+}
+
+// get downloads url and returns its whole body converted to UTF-8.
+// contentType is the Content-Type header used to detect the source charset.
+// The body is read before returning so that the timeout context can be
+// released here and the document can be parsed several times by the caller.
+func get(ctx context.Context, url, contentType string) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(ctx, requestTimeout)
+	defer cancel()
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return nil, err
+	}
+	res, err := http.DefaultClient.Do(req.WithContext(ctx))
+	if err != nil {
+		return nil, err
+	}
+	defer res.Body.Close()
+	body, err := charset.NewReader(res.Body, contentType)
+	if err != nil {
+		return nil, err
+	}
+	return ioutil.ReadAll(body)
+}
+
+// FetchArticle fetches the article located at url.
+// When the page is not readable, or when the content extraction fails, the
+// article built from the meta tags alone is returned together with the error.
+func FetchArticle(ctx context.Context, url string) (*model.Article, error) {
+	// Validate URL
+	if _, err := nurl.ParseRequestURI(url); err != nil {
+		return nil, fmt.Errorf("invalid URL: %v", err)
+	}
+
+	// Get URL content type
+	contentType, err := getContentType(ctx, url)
+	if err != nil {
+		return nil, err
+	}
+
+	if !strings.HasPrefix(contentType, "text/html") {
+		return nil, fmt.Errorf("invalid content-type: %s", contentType)
+	}
+
+	// Get URL content
+	body, err := get(ctx, url, contentType)
+	if err != nil {
+		return nil, err
+	}
+
+	// Extract metas
+	metas, err := ExtractMetas(bytes.NewReader(body))
+	if err != nil {
+		return nil, err
+	}
+
+	// Create article with Open Graph attributes
+	result := &model.Article{
+		Text:  metas.GetContent("og:description", "twitter:description", "description"),
+		Image: metas.GetContent("og:image", "twitter:image"),
+	}
+	if title := metas.GetContent("og:title"); title != nil {
+		result.Title = *title
+	}
+
+	// Test if the HTML page is readable by Shiori readability.
+	// Each parser gets a fresh reader over the whole document: ExtractMetas
+	// consumed its own reader only partially.
+	if !read.IsReadable(bytes.NewReader(body)) {
+		return result, fmt.Errorf("unable to extract content from HTML page")
+	}
+
+	// Extract content from the HTML page
+	article, err := read.FromReader(bytes.NewReader(body), url)
+	if err != nil {
+		return result, err
+	}
+
+	// Complete result with extracted properties
+	result.HTML = &article.Content
+	if result.Title == "" {
+		result.Title = article.Title
+	}
+	if result.Text == nil {
+		// FIXME: readability excerpt don't well support UTF8
+		text := tooling.ToUTF8(article.Excerpt)
+		result.Text = &text
+	}
+	if result.Image == nil {
+		result.Image = &article.Image
+	}
+
+	// TODO: add other properties to the result
+	// article.Favicon
+	// article.Length
+	// article.SiteName
+
+	return result, nil
+}
diff --git a/pkg/readability/test/fetch_test.go b/pkg/readability/test/fetch_test.go
new file mode 100644
index 000000000..cbd72f6c4
--- /dev/null
+++ b/pkg/readability/test/fetch_test.go
@@ -0,0 +1,20 @@
+package test
+
+import (
+	"context"
+	"testing"
+
+	"github.com/ncarlier/readflow/pkg/assert"
+	"github.com/ncarlier/readflow/pkg/readability"
+)
+
+func TestFetchNonReadablePage(t *testing.T) {
+	ctx := context.TODO()
+	article, err := readability.FetchArticle(ctx, "https://about.readflow.app/")
+	assert.NotNil(t, err, "error should not be nil")
+	assert.Equal(t, "unable to extract content from HTML page", err.Error(), "")
+	assert.NotNil(t, article, "article should not be nil")
+	assert.Equal(t, "readflow", article.Title, "")
+	assert.Equal(t, "Read your Internet article flow in one place with complete peace of mind and freedom", *article.Text, "")
+	assert.Equal(t, "https://about.readflow.app/images/readflow.png", *article.Image, "")
+}
diff --git a/pkg/readability/test/meta_test.go b/pkg/readability/test/meta_test.go
new file mode 100644
index 000000000..98faff387
--- /dev/null
+++ b/pkg/readability/test/meta_test.go
@@ -0,0 +1,30 @@
+package test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/ncarlier/readflow/pkg/assert"
+	"github.com/ncarlier/readflow/pkg/readability"
+)
+
+var testCase = `
+Test case
+
+
+
+
+
+
+`
+
+func TestExtract(t *testing.T) {
+	metas, err := readability.ExtractMetas(strings.NewReader(testCase))
+	assert.Nil(t, err, "error should be nil")
+	assert.Equal(t, 6, len(metas), "")
+	assert.Equal(t, "", metas["og:title"].Name, "")
+	assert.Equal(t, "og:title", metas["og:title"].Property, "")
+	assert.Equal(t, "test case", metas["og:title"].Content, "")
+	assert.Equal(t, "twitter description", *metas.GetContent("twitter:description", "description"), "")
+	assert.Equal(t, "iso-8859-1", metas["charset"].Content, "")
+}
diff --git a/pkg/service/articles.go b/pkg/service/articles.go
index 51cb0699d..2ed7948dc 100644
--- a/pkg/service/articles.go
+++ b/pkg/service/articles.go
@@ -3,14 +3,10 @@ package service
 import (
 	"context"
 	"errors"
-	"time"
-
-	"github.com/ncarlier/readflow/pkg/tooling"
-
-	readability "github.com/go-shiori/go-readability"
 
 	"github.com/ncarlier/readflow/pkg/event"
 	"github.com/ncarlier/readflow/pkg/model"
+	"github.com/ncarlier/readflow/pkg/readability"
 )
 
 // ArticleCreationOptions article creation options
@@ -189,23 +185,22 @@ func (reg *Registry) MarkAllArticlesAsRead(ctx context.Context, categoryID *uint
 
 // HydrateArticle add missimg attributes form original article
 func (reg *Registry) HydrateArticle(ctx context.Context, article *model.Article) error {
-	art, err := readability.FromURL(*article.URL, 5*time.Second)
-	if err != nil {
+	art, err := readability.FetchArticle(ctx, *article.URL)
+	if art == nil {
 		return err
 	}
-	if article.HTML == nil {
-		article.HTML = &art.Content
-	}
 	if article.Title == "" {
 		article.Title = art.Title
 	}
-	// FIXME: readability excerpt don't well support UTF8
-	text := tooling.ToUTF8(art.Excerpt)
-	article.Text = &text
-	article.Image = &art.Image
-	// TODO:
-	// article.Favicon = &art.Favicon
-	// article.Length = art.Length
+	if article.HTML == nil {
+		article.HTML = art.HTML
+	}
+	if article.Text == nil {
+		article.Text = art.Text
+	}
+	if article.Image == nil {
+		article.Image = art.Image
+	}
 
-	return nil
+	return err
 }