Skip to content

Commit

Permalink
Finished A tour of GO
Browse files Browse the repository at this point in the history
  • Loading branch information
UndefinedSy committed Oct 29, 2020
1 parent a90e00b commit 04867a7
Show file tree
Hide file tree
Showing 2 changed files with 221 additions and 0 deletions.
102 changes: 102 additions & 0 deletions Go/A Tour of Go/Exercise_Web_Crawler.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package main

import (
"fmt"
"sync"
"time"
)

// Fetcher is the page source that Crawl reads from.
type Fetcher interface {
	// Fetch returns the body of the page at url, together with a slice
	// holding the URLs that were found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// URLCache records which URLs the crawler has already claimed.
type URLCache struct {
	mux  sync.Mutex      // locked by Crawl around every access to urls
	urls map[string]bool // URL -> true once the URL has been claimed
}

// g_urlCache is the cache shared by all Crawl calls. It starts out already
// containing the site root, "https://golang.org/".
var g_urlCache = URLCache{
	urls: map[string]bool{"https://golang.org/": true},
}

// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth.
//
// Every link on the fetched page that is not yet recorded in g_urlCache is
// recorded there and then crawled in its own goroutine with depth-1. Crawl
// returns only after all goroutines it started (and, transitively, theirs)
// have finished, so the caller knows the crawl is complete when it returns.
//
// Crawl does not record url itself in g_urlCache; the starting URL has to be
// present there beforehand, otherwise a link back to it is fetched again.
// A Fetch error is printed and ends that branch of the crawl.
func Crawl(url string, depth int, fetcher Fetcher) {
	if depth <= 0 {
		return
	}

	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)

	// children tracks the goroutines started by this call only; each
	// recursive call waits for its own children in the same way.
	var children sync.WaitGroup

	g_urlCache.mux.Lock()
	for _, u := range urls {
		if _, seen := g_urlCache.urls[u]; seen {
			continue
		}
		// Claim the URL before starting the goroutine so that no other
		// crawler can pick it up in the meantime.
		g_urlCache.urls[u] = true
		children.Add(1)
		go func(next string) {
			defer children.Done()
			Crawl(next, depth-1, fetcher)
		}(u)
	}
	g_urlCache.mux.Unlock()

	// Wait only after releasing the lock: the children need it themselves.
	children.Wait()
}

// main crawls the canned site held in fetcher, starting at the root URL with
// a depth limit of 4, and then pauses for five seconds before exiting.
func main() {
	const (
		startURL = "https://golang.org/"
		maxDepth = 4
		grace    = 5 * time.Second
	)

	Crawl(startURL, maxDepth, fetcher)

	// Fixed pause before exit so that any crawl goroutines still running in
	// the background get a chance to finish printing.
	time.Sleep(grace)
}

type (
	// fakeFetcher is a Fetcher that returns canned results, looked up by URL.
	fakeFetcher map[string]*fakeResult

	// fakeResult is the canned result stored for one URL.
	fakeResult struct {
		body string   // returned by Fetch as the page body
		urls []string // returned by Fetch as the URLs found on the page
	}
)

// Fetch looks url up in the canned results. For a known url it returns the
// stored body and URL slice with a nil error; for any other url it returns
// an empty body, a nil slice and a "not found" error naming the url.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	canned, known := f[url]
	if !known {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return canned.body, canned.urls, nil
}

// fetcher is a populated fakeFetcher: four canned pages of golang.org that
// link to one another. "https://golang.org/cmd/" is linked to but has no
// entry of its own.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		body: "The Go Programming Language",
		urls: []string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		body: "Packages",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		body: "Package fmt",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		body: "Package os",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}
119 changes: 119 additions & 0 deletions Go/A Tour of Go/Exercise_Web_Crawler_waitGroup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package main

import (
"fmt"
"sync"
)

// urlsCache is a mutex-protected set of URLs that have already been claimed
// for crawling. The zero value is an empty cache that is ready to use.
type urlsCache struct {
	mux  sync.Mutex      // guards urls
	urls map[string]bool // URL -> true once claimed; allocated on first use if nil
}

// setVisited marks name as visited and reports whether it had been marked
// before this call: false means the caller is the first to claim name, true
// means name was already claimed and should be skipped. The check and the
// mark happen under one lock acquisition, so for any given name exactly one
// caller receives false.
func (c *urlsCache) setVisited(name string) bool {
	c.mux.Lock()
	defer c.mux.Unlock()

	if c.urls[name] { // already claimed, skip
		return true
	}
	if c.urls == nil {
		// Writing to a nil map panics; allocate it lazily so that a
		// zero-value urlsCache works too.
		c.urls = make(map[string]bool)
	}
	c.urls[name] = true // claim it for the caller
	return false
}

// cacheInstance is the package-level visited-URL cache. It starts with an
// allocated, empty map.
var cacheInstance = urlsCache{
	urls: map[string]bool{},
}

// Fetcher is the page source that the crawler reads from.
type Fetcher interface {
	// Fetch returns the body of the page at url and a slice of the URLs
	// found on that page.
	Fetch(url string) (body string, urls []string, err error)
}

// crawlInner crawls url and then, each in its own goroutine, every URL found
// on that page with depth reduced by one. It always calls wg.Done exactly
// once, so the caller must have called wg.Add(1) for it beforehand.
//
// Nothing is fetched when depth is exhausted or when cacheInstance reports
// that url was already visited. A Fetch error is printed and ends this
// branch.
func crawlInner(url string, depth int, fetcher Fetcher, wg *sync.WaitGroup) {
	defer wg.Done()

	// The depth test comes first so that a URL reached with no depth left
	// is not marked as visited.
	if depth <= 0 || cacheInstance.setVisited(url) {
		return
	}

	body, links, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)

	remaining := depth - 1
	for i := range links {
		// Register the child before it starts so the counter cannot drop
		// to zero while work is still being scheduled.
		wg.Add(1)
		go crawlInner(links[i], remaining, fetcher, wg)
	}
}

// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth, and returns once every started goroutine has finished.
func Crawl(url string, depth int, fetcher Fetcher) {
	var pending sync.WaitGroup

	pending.Add(1)
	go crawlInner(url, depth, fetcher, &pending)
	pending.Wait()
}

// main crawls the canned site held in fetcher, starting at the root URL with
// a depth limit of 4.
func main() {
	const (
		startURL = "https://golang.org/"
		maxDepth = 4
	)
	Crawl(startURL, maxDepth, fetcher)
}

type (
	// fakeFetcher is a Fetcher that returns canned results, looked up by URL.
	fakeFetcher map[string]*fakeResult

	// fakeResult is the canned result stored for one URL.
	fakeResult struct {
		body string   // returned by Fetch as the page body
		urls []string // returned by Fetch as the URLs found on the page
	}
)

// Fetch serves url from the canned results. A url without an entry yields an
// empty body, a nil URL slice and a "not found" error that names the url.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	page, present := f[url]
	if !present {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return page.body, page.urls, nil
}

// fetcher is a populated fakeFetcher: four canned pages of golang.org that
// link to one another. "https://golang.org/cmd/" is linked to but has no
// entry of its own.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		body: "The Go Programming Language",
		urls: []string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		body: "Packages",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		body: "Package fmt",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		body: "Package os",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}

0 comments on commit 04867a7

Please sign in to comment.