Skip to content

Commit

Permalink
feat(articles): improve article data extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
ncarlier committed May 12, 2019
1 parent 8b89e71 commit 65ae8ca
Show file tree
Hide file tree
Showing 8 changed files with 275 additions and 18 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,6 @@ require (
github.com/sergi/go-diff v1.0.0 // indirect
github.com/sirupsen/logrus v1.4.1 // indirect
github.com/stretchr/testify v1.3.0 // indirect
golang.org/x/net v0.0.0-20181201002055-351d144fa1fc
google.golang.org/appengine v1.5.0 // indirect
)
1 change: 1 addition & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f h1:Bl/8QSvNqXvPGPGXa2z5xUTm
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33 h1:I6FyU15t786LL7oL/hn43zqTuEGr4PN7F4XJ1p4E3Y8=
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
google.golang.org/appengine v1.5.0 h1:KxkO13IPW4Lslp2bz+KHP2E3gtFlrIGNThxkZQ3g+4c=
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
19 changes: 19 additions & 0 deletions pkg/readability/conv.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package readability

import (
"bytes"
"io"
"io/ioutil"

"golang.org/x/net/html/charset"
)

// NewUTF8Reader reads reader to the end and returns a new reader over that
// content, converted from sourceCharset to UTF-8.
//
// The read error, if any, is returned as is; otherwise the result is whatever
// charset.NewReaderLabel returns for the given charset label.
func NewUTF8Reader(reader io.Reader, sourceCharset string) (io.Reader, error) {
	raw, readErr := ioutil.ReadAll(reader)
	if readErr != nil {
		return nil, readErr
	}
	return charset.NewReaderLabel(sourceCharset, bytes.NewReader(raw))
}
75 changes: 75 additions & 0 deletions pkg/readability/meta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package readability

import (
"bytes"
"io"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

// Meta holds the attributes of one HTML <meta> tag.
type Meta struct {
	// Name is the value of the "name" attribute.
	Name string
	// Property is the value of the "property" attribute.
	Property string
	// Content is the value of the "content" attribute.
	Content string
}

// Metas is a set of meta tags indexed by a string key.
type Metas map[string]*Meta

// GetContent returns the content of the first key, in argument order, that
// maps to a non-nil meta tag. It returns nil when no key matches.
//
// The returned pointer refers to the Content field of the stored Meta, not
// to a copy.
func (m Metas) GetContent(keys ...string) *string {
	for i := range keys {
		meta, found := m[keys[i]]
		if !found || meta == nil {
			continue
		}
		return &meta.Content
	}
	return nil
}

// ExtractMetas tokenizes an HTML document and collects its <meta> tags.
//
// Scanning stops at the closing </head> tag or at the end of the input,
// whichever comes first. Each tag is stored under its "property" attribute
// when that is non-empty, otherwise under its "name" attribute; a
// <meta charset="..."> tag is stored with Name "charset" and the charset as
// Content. A later tag with the same key replaces an earlier one.
//
// A tokenizer error other than io.EOF is returned with a nil set.
func ExtractMetas(doc io.Reader) (Metas, error) {
	// NOTE(review): the tee'd copy is never read back in this function.
	var seen bytes.Buffer
	tokenizer := html.NewTokenizer(io.TeeReader(doc, &seen))

	found := make(map[string]*Meta)
	for tokenizer.Next() != html.ErrorToken {
		token := tokenizer.Token()

		switch token.DataAtom {
		case atom.Head:
			// The end of the head section: nothing more to collect.
			if token.Type == html.EndTagToken {
				return found, nil
			}
		case atom.Meta:
			entry := &Meta{}
			for _, attr := range token.Attr {
				switch attr.Key {
				case "property":
					entry.Property = attr.Val
				case "name":
					entry.Name = attr.Val
				case "content":
					entry.Content = attr.Val
				case "charset":
					entry.Name = "charset"
					entry.Content = attr.Val
				}
			}
			key := entry.Property
			if key == "" {
				key = entry.Name
			}
			found[key] = entry
		}
	}

	// Running out of input is the normal way to finish.
	if err := tokenizer.Err(); err != io.EOF {
		return nil, err
	}
	return found, nil
}
116 changes: 116 additions & 0 deletions pkg/readability/readability.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package readability

import (
"bytes"
"context"
"fmt"
"io"
"net/http"
nurl "net/url"
"strings"
"time"

read "github.com/go-shiori/go-readability"
"github.com/ncarlier/readflow/pkg/model"
"github.com/ncarlier/readflow/pkg/tooling"
"golang.org/x/net/html/charset"
)

// getContentType issues a HEAD request on url and returns the value of the
// Content-Type response header.
//
// The request is bound to ctx with an additional 10 second timeout. An error
// is returned when the request cannot be built (for instance a malformed URL)
// or when the HTTP call fails.
func getContentType(ctx context.Context, url string) (string, error) {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	// Only the headers are needed, so the context can be released on return.
	defer cancel()

	req, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		return "", err
	}
	res, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		return "", err
	}
	// Close the body even though it is not read, so the connection is released.
	defer res.Body.Close()

	return res.Header.Get("Content-type"), nil
}

// cancelOnCloseBody is a response body that releases its request context
// once it has been closed.
type cancelOnCloseBody struct {
	io.ReadCloser
	cancel context.CancelFunc
}

// Close closes the wrapped body, then cancels the request context.
func (b cancelOnCloseBody) Close() error {
	defer b.cancel()
	return b.ReadCloser.Close()
}

// get issues a GET request on url and returns the HTTP response.
//
// The request is bound to ctx with an additional 10 second timeout that also
// covers reading the body. The caller must close the response body; doing so
// releases the timeout context. An error is returned when the request cannot
// be built (for instance a malformed URL) or when the HTTP call fails.
func get(ctx context.Context, url string) (*http.Response, error) {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		cancel()
		return nil, err
	}
	res, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		cancel()
		return nil, err
	}
	// The body is read after this function returns, so cancellation has to
	// wait until the caller is done with it.
	res.Body = cancelOnCloseBody{ReadCloser: res.Body, cancel: cancel}
	return res, nil
}

// FetchArticle downloads the HTML page at url and builds an article from it.
//
// The URL is validated, then probed with a HEAD request: a content type that
// does not start with "text/html" is rejected. The page is then downloaded,
// passed through charset.NewReader together with that content type, and
// buffered in memory so it can be parsed more than once.
//
// The article is first filled from the page meta tags (Open Graph, Twitter
// and plain description), then completed with what the readability extractor
// finds: HTML content, and title, text and image when the meta tags did not
// provide them.
//
// When the readability step fails, the article built from the meta tags is
// returned together with a non-nil error, so callers can still use the
// partial result. On any earlier failure the returned article is nil.
func FetchArticle(ctx context.Context, url string) (*model.Article, error) {
	// Validate URL
	if _, err := nurl.ParseRequestURI(url); err != nil {
		return nil, fmt.Errorf("invalid URL: %v", err)
	}

	// Get URL content type
	contentType, err := getContentType(ctx, url)
	if err != nil {
		return nil, err
	}
	if !strings.HasPrefix(contentType, "text/html") {
		return nil, fmt.Errorf("invalid content-type: %s", contentType)
	}

	// Get URL content
	res, err := get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	body, err := charset.NewReader(res.Body, contentType)
	if err != nil {
		return nil, err
	}

	// Buffer the whole page once. The body is a stream: whatever the meta
	// extraction reads is gone for the readability steps, so each consumer
	// below gets its own reader over the same bytes.
	var page bytes.Buffer
	if _, err := io.Copy(&page, body); err != nil {
		return nil, err
	}
	content := page.Bytes()

	// Extract metas
	metas, err := ExtractMetas(bytes.NewReader(content))
	if err != nil {
		return nil, err
	}

	// Create article with Open Graph attributes
	result := &model.Article{
		Text:  metas.GetContent("og:description", "twitter:description", "description"),
		Image: metas.GetContent("og:image", "twitter:image"),
	}
	if title := metas.GetContent("og:title"); title != nil {
		result.Title = *title
	}

	// Test if the HTML page is readable by Shiori readability
	if !read.IsReadable(bytes.NewReader(content)) {
		return result, fmt.Errorf("unable to extract content from HTML page")
	}

	// Extract content from the HTML page
	article, err := read.FromReader(bytes.NewReader(content), url)
	if err != nil {
		return result, err
	}

	// Complete result with extracted properties, keeping what the meta tags
	// already provided.
	result.HTML = &article.Content
	if result.Title == "" {
		result.Title = article.Title
	}
	if result.Text == nil {
		// FIXME: readability excerpt don't well support UTF8
		text := tooling.ToUTF8(article.Excerpt)
		result.Text = &text
	}
	if result.Image == nil {
		result.Image = &article.Image
	}

	// TODO: add other properties to the result
	// article.Favicon
	// article.Length
	// article.SiteName

	return result, nil
}
20 changes: 20 additions & 0 deletions pkg/readability/test/fetch_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package test

import (
"context"
"testing"

"github.com/ncarlier/readflow/pkg/assert"
"github.com/ncarlier/readflow/pkg/readability"
)

// TestFetchNonReadablePage fetches a live page and expects FetchArticle to
// report the content extraction error while still returning an article whose
// title, text and image are filled in.
func TestFetchNonReadablePage(t *testing.T) {
	const (
		pageURL   = "https://about.readflow.app/"
		wantError = "unable to extract content from HTML page"
		wantTitle = "readflow"
		wantText  = "Read your Internet article flow in one place with complete peace of mind and freedom"
		wantImage = "https://about.readflow.app/images/readflow.png"
	)

	article, err := readability.FetchArticle(context.TODO(), pageURL)

	// The extraction error is expected...
	assert.NotNil(t, err, "error should not be nil")
	assert.Equal(t, wantError, err.Error(), "")

	// ...but the article must still be returned and populated.
	assert.NotNil(t, article, "article should not be nil")
	assert.Equal(t, wantTitle, article.Title, "")
	assert.Equal(t, wantText, *article.Text, "")
	assert.Equal(t, wantImage, *article.Image, "")
}
30 changes: 30 additions & 0 deletions pkg/readability/test/meta_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package test

import (
"strings"
"testing"

"github.com/ncarlier/readflow/pkg/assert"
"github.com/ncarlier/readflow/pkg/readability"
)

// testCase is an HTML head section used as the fixture of TestExtract. It
// holds six meta tags: a charset declaration, two "property" tags
// (og:title, twitter:description) and three "name" tags (description,
// viewport, author).
var testCase = `<head>
<title>Test case</title>
<meta charset="iso-8859-1" />
<meta property="og:title" content="test case" />
<meta name="description" content="general description">
<meta property="twitter:description" content="twitter description" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="author" content="John Doe">
</head>`

// TestExtract runs ExtractMetas on the testCase fixture and checks the number
// of tags found, the fields of the og:title tag, the key precedence of
// GetContent and the charset entry.
func TestExtract(t *testing.T) {
	metas, err := readability.ExtractMetas(strings.NewReader(testCase))
	assert.Nil(t, err, "error should be nil")
	assert.Equal(t, 6, len(metas), "")

	// A "property" tag is stored under its property and has no name.
	ogTitle := metas["og:title"]
	assert.Equal(t, "", ogTitle.Name, "")
	assert.Equal(t, "og:title", ogTitle.Property, "")
	assert.Equal(t, "test case", ogTitle.Content, "")

	// The first key listed wins when several are present.
	description := metas.GetContent("twitter:description", "description")
	assert.Equal(t, "twitter description", *description, "")

	charsetMeta := metas["charset"]
	assert.Equal(t, "iso-8859-1", charsetMeta.Content, "")
}
31 changes: 13 additions & 18 deletions pkg/service/articles.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,10 @@ package service
import (
"context"
"errors"
"time"

"github.com/ncarlier/readflow/pkg/tooling"

readability "github.com/go-shiori/go-readability"

"github.com/ncarlier/readflow/pkg/event"
"github.com/ncarlier/readflow/pkg/model"
"github.com/ncarlier/readflow/pkg/readability"
)

// ArticleCreationOptions article creation options
Expand Down Expand Up @@ -189,23 +185,22 @@ func (reg *Registry) MarkAllArticlesAsRead(ctx context.Context, categoryID *uint

// HydrateArticle adds missing attributes from the original article
func (reg *Registry) HydrateArticle(ctx context.Context, article *model.Article) error {
art, err := readability.FromURL(*article.URL, 5*time.Second)
if err != nil {
art, err := readability.FetchArticle(ctx, *article.URL)
if art == nil {
return err
}
if article.HTML == nil {
article.HTML = &art.Content
}
if article.Title == "" {
article.Title = art.Title
}
// FIXME: readability excerpt don't well support UTF8
text := tooling.ToUTF8(art.Excerpt)
article.Text = &text
article.Image = &art.Image
// TODO:
// article.Favicon = &art.Favicon
// article.Length = art.Length
if article.HTML == nil {
article.HTML = art.HTML
}
if article.Text == nil {
article.Text = art.Text
}
if article.Image == nil {
article.Image = art.Image
}

return nil
return err
}

0 comments on commit 65ae8ca

Please sign in to comment.