-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(articles): improve article data extraction
- Loading branch information
Showing
8 changed files
with
275 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package readability | ||
|
||
import ( | ||
"bytes" | ||
"io" | ||
"io/ioutil" | ||
|
||
"golang.org/x/net/html/charset" | ||
) | ||
|
||
// NewUTF8Reader converts a reader from a charset to UTF-8 | ||
func NewUTF8Reader(reader io.Reader, sourceCharset string) (io.Reader, error) { | ||
b, err := ioutil.ReadAll(reader) | ||
if err != nil { | ||
return nil, err | ||
} | ||
br := bytes.NewReader(b) | ||
return charset.NewReaderLabel(sourceCharset, br) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package readability | ||
|
||
import ( | ||
"bytes" | ||
"io" | ||
|
||
"golang.org/x/net/html" | ||
"golang.org/x/net/html/atom" | ||
) | ||
|
||
// Meta holds the attributes of interest of a single HTML meta tag.
type Meta struct {
	// Name is the tag's "name" attribute.
	Name string
	// Property is the tag's "property" attribute (e.g. "og:title").
	Property string
	// Content is the tag's "content" attribute.
	Content string
}

// Metas is a set of meta tags indexed by a string key.
type Metas map[string]*Meta

// GetContent returns a pointer to the content of the first meta tag found
// among keys, trying them in the order given. It returns nil when none of the
// keys maps to a non-nil entry (including when no key is supplied).
//
// The returned pointer refers to the Content field of the stored Meta, not to
// a copy.
func (m Metas) GetContent(keys ...string) *string {
	for _, k := range keys {
		if found := m[k]; found != nil {
			return &found.Content
		}
	}
	return nil
}
|
||
// ExtractMetas extracts meta tags from a HTML document. | ||
func ExtractMetas(doc io.Reader) (Metas, error) { | ||
var buf bytes.Buffer | ||
tee := io.TeeReader(doc, &buf) | ||
|
||
metas := make(map[string]*Meta) | ||
z := html.NewTokenizer(tee) | ||
for { | ||
tt := z.Next() | ||
if tt == html.ErrorToken { | ||
if z.Err() == io.EOF { | ||
return metas, nil | ||
} | ||
return nil, z.Err() | ||
} | ||
|
||
t := z.Token() | ||
|
||
if t.DataAtom == atom.Head && t.Type == html.EndTagToken { | ||
return metas, nil | ||
} | ||
|
||
if t.DataAtom == atom.Meta { | ||
meta := Meta{} | ||
for _, a := range t.Attr { | ||
switch a.Key { | ||
case "property": | ||
meta.Property = a.Val | ||
case "name": | ||
meta.Name = a.Val | ||
case "content": | ||
meta.Content = a.Val | ||
case "charset": | ||
meta.Name = "charset" | ||
meta.Content = a.Val | ||
} | ||
} | ||
key := meta.Name | ||
if meta.Property != "" { | ||
key = meta.Property | ||
} | ||
metas[key] = &meta | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package readability | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"fmt" | ||
"io" | ||
"net/http" | ||
nurl "net/url" | ||
"strings" | ||
"time" | ||
|
||
read "github.com/go-shiori/go-readability" | ||
"github.com/ncarlier/readflow/pkg/model" | ||
"github.com/ncarlier/readflow/pkg/tooling" | ||
"golang.org/x/net/html/charset" | ||
) | ||
|
||
// getContentType issues a HEAD request to url and returns the value of the
// response's Content-Type header (empty when the header is absent).
//
// The request is bound to ctx with an additional 10 second timeout. An error
// is returned when the request cannot be built (e.g. malformed URL) or when
// the HTTP round trip fails. The response status code is not inspected.
func getContentType(ctx context.Context, url string) (string, error) {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	// Release the timer as soon as we are done instead of waiting for expiry.
	defer cancel()

	req, err := http.NewRequest("HEAD", url, nil)
	if err != nil {
		return "", err
	}

	res, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		return "", err
	}
	// Always close the body so the underlying connection can be reused.
	defer res.Body.Close()

	return res.Header.Get("Content-type"), nil
}
|
||
// cancelOnCloseBody wraps a response body so that closing it also releases
// the context created for the request.
type cancelOnCloseBody struct {
	io.ReadCloser
	cancel context.CancelFunc
}

// Close closes the wrapped body, then cancels the request context.
func (b *cancelOnCloseBody) Close() error {
	err := b.ReadCloser.Close()
	b.cancel()
	return err
}

// get issues a GET request to url and returns the response.
//
// The request is bound to ctx with an additional 10 second timeout that
// covers reading the body as well. The timeout context cannot be cancelled
// when get returns, because the caller still has to read the body; it is
// released when the caller closes res.Body, or immediately on failure.
//
// An error is returned when the request cannot be built (e.g. malformed URL)
// or when the HTTP round trip fails. The status code is not inspected.
func get(ctx context.Context, url string) (*http.Response, error) {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		cancel()
		return nil, err
	}

	res, err := http.DefaultClient.Do(req.WithContext(ctx))
	if err != nil {
		cancel()
		return nil, err
	}

	res.Body = &cancelOnCloseBody{ReadCloser: res.Body, cancel: cancel}
	return res, nil
}
|
||
// FetchArticle fetch article from an URL | ||
func FetchArticle(ctx context.Context, url string) (*model.Article, error) { | ||
// Validate URL | ||
_, err := nurl.ParseRequestURI(url) | ||
if err != nil { | ||
return nil, fmt.Errorf("invalid URL: %v", err) | ||
} | ||
|
||
// Get URL content type | ||
contentType, err := getContentType(ctx, url) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
if !strings.HasPrefix(contentType, "text/html") { | ||
return nil, fmt.Errorf("invalid content-type: %s", contentType) | ||
} | ||
|
||
// Get URL content | ||
res, err := get(ctx, url) | ||
if err != nil { | ||
return nil, err | ||
} | ||
defer res.Body.Close() | ||
body, err := charset.NewReader(res.Body, contentType) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// Extract metas | ||
metas, err := ExtractMetas(body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// Create article with Open Graph atributes | ||
result := &model.Article{ | ||
Text: metas.GetContent("og:description", "twitter:description", "description"), | ||
Image: metas.GetContent("og:image", "twitter:image"), | ||
} | ||
title := metas.GetContent("og:title") | ||
if title != nil { | ||
result.Title = *title | ||
} | ||
|
||
var buffer bytes.Buffer | ||
tee := io.TeeReader(body, &buffer) | ||
|
||
// Test if the HTML page is readable by Shiori readability | ||
if !read.IsReadable(tee) { | ||
return result, fmt.Errorf("unable to extract content from HTML page") | ||
} | ||
|
||
// Extract content from the HTML page | ||
article, err := read.FromReader(&buffer, url) | ||
if err != nil { | ||
return result, err | ||
} | ||
|
||
// Complete result with extracted properties | ||
result.HTML = &article.Content | ||
if result.Title == "" { | ||
result.Title = article.Title | ||
} | ||
if result.Text == nil { | ||
// FIXME: readability excerpt don't well support UTF8 | ||
text := tooling.ToUTF8(article.Excerpt) | ||
result.Text = &text | ||
} | ||
if result.Image == nil { | ||
result.Image = &article.Image | ||
} | ||
|
||
// TODO: add other properties to the result | ||
// article.Favicon | ||
// article.Length | ||
// article.SiteName | ||
|
||
return result, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package test | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
"github.com/ncarlier/readflow/pkg/assert" | ||
"github.com/ncarlier/readflow/pkg/readability" | ||
) | ||
|
||
func TestFetchNonReadablePage(t *testing.T) { | ||
ctx := context.TODO() | ||
article, err := readability.FetchArticle(ctx, "https://about.readflow.app/") | ||
assert.NotNil(t, err, "error should not be nil") | ||
assert.Equal(t, "unable to extract content from HTML page", err.Error(), "") | ||
assert.NotNil(t, article, "article should not be nil") | ||
assert.Equal(t, "readflow", article.Title, "") | ||
assert.Equal(t, "Read your Internet article flow in one place with complete peace of mind and freedom", *article.Text, "") | ||
assert.Equal(t, "https://about.readflow.app/images/readflow.png", *article.Image, "") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package test | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/ncarlier/readflow/pkg/assert" | ||
"github.com/ncarlier/readflow/pkg/readability" | ||
) | ||
|
||
var testCase = `<head> | ||
<title>Test case</title> | ||
<meta charset="iso-8859-1" /> | ||
<meta property="og:title" content="test case" /> | ||
<meta name="description" content="general description"> | ||
<meta property="twitter:description" content="twitter description" /> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | ||
<meta name="author" content="John Doe"> | ||
</head>` | ||
|
||
func TestExtract(t *testing.T) { | ||
metas, err := readability.ExtractMetas(strings.NewReader(testCase)) | ||
assert.Nil(t, err, "error should be nil") | ||
assert.Equal(t, 6, len(metas), "") | ||
assert.Equal(t, "", metas["og:title"].Name, "") | ||
assert.Equal(t, "og:title", metas["og:title"].Property, "") | ||
assert.Equal(t, "test case", metas["og:title"].Content, "") | ||
assert.Equal(t, "twitter description", *metas.GetContent("twitter:description", "description"), "") | ||
assert.Equal(t, "iso-8859-1", metas["charset"].Content, "") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters