-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a90e00b
commit 04867a7
Showing
2 changed files
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"sync" | ||
"time" | ||
) | ||
|
||
// Fetcher abstracts retrieval of a single page so the crawler can be driven
// by any implementation.
type Fetcher interface {
	// Fetch returns the body of the page at url together with a slice of
	// the URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
|
||
// URLCache is a set of URLs paired with the mutex that callers lock while
// reading or writing the set.
type URLCache struct {
	mux  sync.Mutex      // guards urls
	urls map[string]bool // URLs that have already been recorded
}
|
||
var g_urlCache URLCache = URLCache{ | ||
urls: map[string]bool{"https://golang.org/": true}, | ||
} | ||
|
||
// Crawl 使用 fetcher 从某个 URL 开始递归的爬取页面,直到达到最大深度。 | ||
func Crawl(url string, depth int, fetcher Fetcher) { | ||
// TODO: 并行的抓取 URL。 | ||
// TODO: 不重复抓取页面。 | ||
// 下面并没有实现上面两种情况: | ||
if depth <= 0 { | ||
return | ||
} | ||
body, urls, err := fetcher.Fetch(url) | ||
if err != nil { | ||
fmt.Println(err) | ||
return | ||
} | ||
fmt.Printf("found: %s %q\n", url, body) | ||
g_urlCache.mux.Lock() | ||
for _, u := range urls { | ||
_, ok := g_urlCache.urls[u] | ||
// fmt.Printf("current u is: %s, ok is: %v\n", u, ok) | ||
if !ok { | ||
g_urlCache.urls[u] = true | ||
go Crawl(u, depth-1, fetcher) | ||
} | ||
} | ||
g_urlCache.mux.Unlock() | ||
return | ||
} | ||
|
||
func main() { | ||
Crawl("https://golang.org/", 4, fetcher) | ||
time.Sleep(5 * time.Second) | ||
} | ||
|
||
// fakeFetcher is a Fetcher that returns canned results from an in-memory
// table keyed by URL.
type fakeFetcher map[string]*fakeResult

// fakeResult is the canned response stored for one URL.
type fakeResult struct {
	body string   // page body returned by Fetch
	urls []string // links returned by Fetch
}

// Fetch returns the canned body and links stored for url. A URL that is
// absent from the table, or whose entry is nil, yields a "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	// Indexing a missing key yields a nil pointer, so one nil check covers
	// both "no such key" and "key present with a nil entry".
	if res := f[url]; res != nil {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}
|
||
// fetcher 是填充后的 fakeFetcher。 | ||
var fetcher = fakeFetcher{ | ||
"https://golang.org/": &fakeResult{ | ||
"The Go Programming Language", | ||
[]string{ | ||
"https://golang.org/pkg/", | ||
"https://golang.org/cmd/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/": &fakeResult{ | ||
"Packages", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/cmd/", | ||
"https://golang.org/pkg/fmt/", | ||
"https://golang.org/pkg/os/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/fmt/": &fakeResult{ | ||
"Package fmt", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/pkg/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/os/": &fakeResult{ | ||
"Package os", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/pkg/", | ||
}, | ||
}, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"sync" | ||
) | ||
|
||
// urlsCache is a mutex-guarded set of URLs that have already been visited.
// The zero value is an empty cache that is ready to use.
type urlsCache struct {
	mux  sync.Mutex      // guards urls
	urls map[string]bool // URLs marked as visited
}

// setVisited marks name as visited and reports whether it had already been
// visited before this call: true means "seen before, skip it", false means
// "first visit, go ahead". The check and the update happen under the cache's
// mutex, so only one caller gets false for any given name.
func (c *urlsCache) setVisited(name string) bool {
	c.mux.Lock()
	defer c.mux.Unlock()

	if c.urls[name] {
		return true
	}
	// Allocate lazily so that a zero-value urlsCache does not panic on its
	// first write to a nil map.
	if c.urls == nil {
		c.urls = make(map[string]bool)
	}
	c.urls[name] = true
	return false
}
|
||
var cacheInstance = urlsCache{urls: make(map[string]bool)} | ||
|
||
// Fetcher abstracts retrieval of a single page so the crawler can be driven
// by any implementation.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
|
||
func crawlInner(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) { | ||
defer wg.Done() | ||
if depth <= 0 { | ||
return | ||
} | ||
|
||
if cacheInstance.setVisited(url) { // this url is in map, skip | ||
return | ||
} | ||
|
||
body, urls, err := fetcher.Fetch(url) | ||
if err != nil { | ||
fmt.Println(err) | ||
return | ||
} | ||
fmt.Printf("found: %s %q\n", url, body) | ||
for _, u := range urls { | ||
wg.Add(1) | ||
go crawlInner(u, depth-1, fetcher, wg) | ||
} | ||
return | ||
} | ||
|
||
// Crawl uses fetcher to recursively crawl | ||
// pages starting with url, to a maximum of depth. | ||
func Crawl(url string, depth int, fetcher Fetcher) { | ||
waitGroup := &sync.WaitGroup{} | ||
|
||
waitGroup.Add(1) | ||
|
||
go crawlInner(url, depth, fetcher, waitGroup) | ||
|
||
waitGroup.Wait() | ||
} | ||
|
||
func main() { | ||
Crawl("https://golang.org/", 4, fetcher) | ||
} | ||
|
||
// fakeFetcher is a Fetcher that returns canned results from an in-memory
// table keyed by URL.
type fakeFetcher map[string]*fakeResult

// fakeResult is the canned response stored for one URL.
type fakeResult struct {
	body string   // page body returned by Fetch
	urls []string // links returned by Fetch
}

// Fetch returns the canned body and links stored for url. A URL that is
// absent from the table, or whose entry is nil, yields a "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	// Indexing a missing key yields a nil pointer, so one nil check covers
	// both "no such key" and "key present with a nil entry".
	if res := f[url]; res != nil {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}
|
||
// fetcher is a populated fakeFetcher. | ||
var fetcher = fakeFetcher{ | ||
"https://golang.org/": &fakeResult{ | ||
"The Go Programming Language", | ||
[]string{ | ||
"https://golang.org/pkg/", | ||
"https://golang.org/cmd/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/": &fakeResult{ | ||
"Packages", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/cmd/", | ||
"https://golang.org/pkg/fmt/", | ||
"https://golang.org/pkg/os/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/fmt/": &fakeResult{ | ||
"Package fmt", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/pkg/", | ||
}, | ||
}, | ||
"https://golang.org/pkg/os/": &fakeResult{ | ||
"Package os", | ||
[]string{ | ||
"https://golang.org/", | ||
"https://golang.org/pkg/", | ||
}, | ||
}, | ||
} |