-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_clean.go
117 lines (97 loc) · 2.72 KB
/
extract_clean.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
package main
import (
"bytes"
"log"
"net/url"
"strings"
"golang.org/x/net/html"
"gopkg.in/neurosnap/sentences.v1/english"
)
// extract parses the HTML document stored in exInC.body and sends a single
// ExtractResult on exOutC. The result carries:
//   - hrefs: the value of every "href" attribute found on any element,
//   - title: the trimmed text of a <title> element whose parent is <head>,
//   - sentences: the sentences tokenized from every text node whose parent
//     element is neither <style> nor <script>.
//
// The function panics if the English sentence tokenizer cannot be created and
// terminates the process through log.Fatalf if the HTML cannot be parsed. The
// send on exOutC blocks until the value is accepted by the channel.
func extract(exInC *DownloadResult, exOutC chan ExtractResult) {
	var result ExtractResult

	tokenizer, err := english.NewSentenceTokenizer(nil)
	if err != nil {
		panic(err)
	}

	// Parse the HTML content.
	doc, err := html.Parse(bytes.NewReader(exInC.body))
	if err != nil {
		log.Fatalf("Could not parse html %v", err)
	}

	var walk func(*html.Node)
	walk = func(n *html.Node) {
		switch n.Type {
		case html.ElementNode:
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					result.hrefs = append(result.hrefs, attr.Val)
				}
			}
			// Extract the page title. An empty <title></title> has no text
			// child, so FirstChild must be checked before it is dereferenced.
			if n.Data == "title" && n.Parent != nil && n.Parent.Data == "head" && n.FirstChild != nil {
				result.title = strings.TrimSpace(n.FirstChild.Data)
			}
		case html.TextNode:
			// Skip text that belongs to stylesheets and scripts.
			p := n.Parent
			if p != nil && p.Type == html.ElementNode && p.Data != "style" && p.Data != "script" {
				for _, s := range tokenizer.Tokenize(strings.TrimSpace(n.Data)) {
					result.sentences = append(result.sentences, s.Text)
				}
			}
		}
		// Visit the child nodes recursively.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	// Put the results into the extract output channel.
	exOutC <- result
}
// clean turns href, a raw link found on a page of host, into an absolute URL
// that can be crawled, or returns the sentinel string "error" when the link
// must be skipped.
//
// The sentinel is returned when:
//   - href is empty,
//   - href contains "DS_Store", "jpg" or "png",
//   - host or href cannot be parsed as a URL (the parse error is logged),
//   - href is an absolute URL whose host differs from the host of host, or
//     carries neither a scheme nor a path (for example "#top" or a bare "/").
//
// Otherwise a single trailing "/" is stripped from href and:
//   - a link starting with "/" gets the scheme and host of host,
//   - a relative link (no scheme, non-empty path) is appended to host,
//   - an absolute link on the same host is returned as parsed.
func clean(host string, href string) string {
	if len(href) == 0 {
		return "error"
	}
	// If the link has a slash at the end, remove it for concatenation later.
	href = strings.TrimSuffix(href, "/")

	// Never follow .DS_Store artifacts.
	if strings.Contains(href, "DS_Store") {
		return "error"
	}
	// Do not crawl pictures. This runs before any URL is built so that
	// root-relative image links such as "/img/a.png" are rejected as well.
	if strings.Contains(href, "jpg") || strings.Contains(href, "png") {
		return "error"
	}

	hostURL, err := url.Parse(host)
	if err != nil {
		log.Printf("Url could not be parsed %v", err)
		return "error"
	}
	parsedURL, err := url.Parse(href)
	if err != nil {
		log.Printf("Url could not be parsed %v", err)
		return "error"
	}

	// If the link starts with a slash (it is rooted at the host), just add the
	// host URL's scheme and host. href may have become empty after the
	// trailing slash was stripped, hence HasPrefix rather than indexing.
	// NOTE(review): protocol-relative links ("//other.example/x") also take
	// this branch and are rewritten onto the host.
	if strings.HasPrefix(href, "/") {
		parsedURL.Scheme = hostURL.Scheme
		parsedURL.Host = hostURL.Host
		return parsedURL.String()
	}

	// If it is an incomplete URL (e.g. an html file name), add it to the host
	// URL. Trim a trailing slash from host so the join has exactly one "/".
	if parsedURL.Scheme == "" && parsedURL.Path != "" {
		return strings.TrimSuffix(host, "/") + "/" + parsedURL.String()
	}

	// Ignore URLs that point to a different host to prevent crawling beyond
	// the initial host.
	if parsedURL.Host != hostURL.Host {
		return "error"
	}
	return parsedURL.String()
}