diff --git a/Go/A Tour of Go/Exercise_Web_Crawler.go b/Go/A Tour of Go/Exercise_Web_Crawler.go
new file mode 100644
index 0000000..f614f9b
--- /dev/null
+++ b/Go/A Tour of Go/Exercise_Web_Crawler.go
@@ -0,0 +1,102 @@
+package main
+
+import (
+	"fmt"
+	"sync"
+	"time"
+)
+
+type Fetcher interface {
+	// Fetch returns the body of URL and a slice of URLs found on that page.
+	Fetch(url string) (body string, urls []string, err error)
+}
+
+type URLCache struct {
+	urls map[string]bool
+	mux  sync.Mutex
+}
+
+var g_urlCache = URLCache{
+	urls: map[string]bool{"https://golang.org/": true},
+}
+
+// Crawl uses fetcher to recursively crawl pages starting with url, to a maximum of depth.
+func Crawl(url string, depth int, fetcher Fetcher) {
+	// TODO: Fetch URLs in parallel.
+	// TODO: Don't fetch the same URL twice.
+	// Both TODOs are handled below: new URLs are crawled in parallel goroutines, and g_urlCache skips duplicates.
+	if depth <= 0 {
+		return
+	}
+	body, urls, err := fetcher.Fetch(url)
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	fmt.Printf("found: %s %q\n", url, body)
+	g_urlCache.mux.Lock()
+	for _, u := range urls {
+		_, ok := g_urlCache.urls[u]
+		// fmt.Printf("current u is: %s, ok is: %v\n", u, ok)
+		if !ok {
+			g_urlCache.urls[u] = true
+			go Crawl(u, depth-1, fetcher)
+		}
+	}
+	g_urlCache.mux.Unlock()
+	return
+}
+
+func main() {
+	Crawl("https://golang.org/", 4, fetcher)
+	time.Sleep(5 * time.Second) // crude wait for the crawler goroutines; the WaitGroup version below synchronizes properly
+}
+
+// fakeFetcher is Fetcher that returns canned results.
+type fakeFetcher map[string]*fakeResult
+
+type fakeResult struct {
+	body string
+	urls []string
+}
+
+func (f fakeFetcher) Fetch(url string) (string, []string, error) {
+	if res, ok := f[url]; ok {
+		return res.body, res.urls, nil
+	}
+	return "", nil, fmt.Errorf("not found: %s", url)
+}
+
+// fetcher is a populated fakeFetcher.
+var fetcher = fakeFetcher{
+	"https://golang.org/": &fakeResult{
+		"The Go Programming Language",
+		[]string{
+			"https://golang.org/pkg/",
+			"https://golang.org/cmd/",
+		},
+	},
+	"https://golang.org/pkg/": &fakeResult{
+		"Packages",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/cmd/",
+			"https://golang.org/pkg/fmt/",
+			"https://golang.org/pkg/os/",
+		},
+	},
+	"https://golang.org/pkg/fmt/": &fakeResult{
+		"Package fmt",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+	"https://golang.org/pkg/os/": &fakeResult{
+		"Package os",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+}
diff --git a/Go/A Tour of Go/Exercise_Web_Crawler_waitGroup.go b/Go/A Tour of Go/Exercise_Web_Crawler_waitGroup.go
new file mode 100644
index 0000000..59a2027
--- /dev/null
+++ b/Go/A Tour of Go/Exercise_Web_Crawler_waitGroup.go
@@ -0,0 +1,119 @@
+package main
+
+import (
+	"fmt"
+	"sync"
+)
+
+type urlsCache struct {
+	mux  sync.Mutex
+	urls map[string]bool
+}
+
+func (cache *urlsCache) setVisited(name string) bool {
+	cache.mux.Lock()
+	defer cache.mux.Unlock()
+
+	if cache.urls[name] { // this url is already in the map; skip it
+		return true
+	}
+	cache.urls[name] = true // mark it as visited
+	return false // this url has not been crawled yet; the caller will recurse into it
+}
+
+var cacheInstance = urlsCache{urls: make(map[string]bool)}
+
+// Fetcher is the interface the crawler fetches pages through.
+type Fetcher interface {
+	// Fetch returns the body of URL and
+	// a slice of URLs found on that page.
+	Fetch(url string) (body string, urls []string, err error)
+}
+
+func crawlInner(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
+	defer wg.Done()
+	if depth <= 0 {
+		return
+	}
+
+	if cacheInstance.setVisited(url) { // this url was already visited; skip it
+		return
+	}
+
+	body, urls, err := fetcher.Fetch(url)
+	if err != nil {
+		fmt.Println(err)
+		return
+	}
+	fmt.Printf("found: %s %q\n", url, body)
+	for _, u := range urls {
+		wg.Add(1)
+		go crawlInner(u, depth-1, fetcher, wg)
+	}
+	return
+}
+
+// Crawl uses fetcher to recursively crawl
+// pages starting with url, to a maximum of depth.
+func Crawl(url string, depth int, fetcher Fetcher) {
+	waitGroup := &sync.WaitGroup{}
+
+	waitGroup.Add(1)
+
+	go crawlInner(url, depth, fetcher, waitGroup)
+
+	waitGroup.Wait()
+}
+
+func main() {
+	Crawl("https://golang.org/", 4, fetcher)
+}
+
+// fakeFetcher is Fetcher that returns canned results.
+type fakeFetcher map[string]*fakeResult
+
+type fakeResult struct {
+	body string
+	urls []string
+}
+
+func (f fakeFetcher) Fetch(url string) (string, []string, error) {
+	if res, ok := f[url]; ok {
+		return res.body, res.urls, nil
+	}
+	return "", nil, fmt.Errorf("not found: %s", url)
+}
+
+// fetcher is a populated fakeFetcher.
+var fetcher = fakeFetcher{
+	"https://golang.org/": &fakeResult{
+		"The Go Programming Language",
+		[]string{
+			"https://golang.org/pkg/",
+			"https://golang.org/cmd/",
+		},
+	},
+	"https://golang.org/pkg/": &fakeResult{
+		"Packages",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/cmd/",
+			"https://golang.org/pkg/fmt/",
+			"https://golang.org/pkg/os/",
+		},
+	},
+	"https://golang.org/pkg/fmt/": &fakeResult{
+		"Package fmt",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+	"https://golang.org/pkg/os/": &fakeResult{
+		"Package os",
+		[]string{
+			"https://golang.org/",
+			"https://golang.org/pkg/",
+		},
+	},
+}
\ No newline at end of file
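Both files above guard the visited-URL set with a sync.Mutex; they differ only in how main waits for the crawler goroutines (a fixed time.Sleep versus a sync.WaitGroup). A third common approach drops the lock entirely: confine the visited map to a single goroutine and have the workers report what they found over a channel. The sketch below is illustrative and not part of the committed files; the result type, the pending counter, and the trimmed two-page fetcher fixture are names invented here, while the Fetcher interface is kept identical to the versions above.

package main

import "fmt"

// Fetcher matches the interface both committed files define.
type Fetcher interface {
	Fetch(url string) (body string, urls []string, err error)
}

// fakeFetcher is a trimmed two-page stand-in for the fixture in the
// diff, just so this sketch compiles on its own.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
	body string
	urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	if res, ok := f[url]; ok {
		return res.body, res.urls, nil
	}
	return "", nil, fmt.Errorf("not found: %s", url)
}

var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{"https://golang.org/pkg/"},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{"https://golang.org/"},
	},
}

// Crawl confines the visited set to this one goroutine; workers report
// the URLs they found over a channel, so no mutex is required.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}

	type result struct {
		urls  []string
		depth int
	}
	results := make(chan result)

	fetch := func(url string, depth int) {
		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			results <- result{} // still report, so the pending counter stays balanced
			return
		}
		fmt.Printf("found: %s %q\n", url, body)
		results <- result{urls, depth}
	}

	visited := map[string]bool{url: true}
	pending := 1 // number of fetch goroutines that have not reported yet
	go fetch(url, depth)

	for ; pending > 0; pending-- {
		res := <-results
		if res.depth <= 1 {
			continue // children would exceed the maximum depth
		}
		for _, u := range res.urls {
			if !visited[u] {
				visited[u] = true
				pending++
				go fetch(u, res.depth-1)
			}
		}
	}
}

func main() {
	Crawl("https://golang.org/", 4, fetcher)
}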
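Two practical notes on the committed files. Both declare package main with the same top-level identifiers (Fetcher, Crawl, fetcher, fakeFetcher, main), so they must be built one at a time from their directory, e.g. go run Exercise_Web_Crawler.go, rather than compiled together. And the WaitGroup version is the more robust of the two: it returns as soon as the last crawlInner calls Done, whereas the time.Sleep version always waits the full five seconds and would silently cut off any crawl that needed longer.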