x/net/html update, code grooming, issues badge (#22)
s0rg authored Sep 1, 2022
1 parent bd3f47b commit 191cab7
Showing 3 changed files with 33 additions and 17 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -2,6 +2,7 @@
 [![Go Report Card](https://goreportcard.com/badge/github.com/s0rg/crawley)](https://goreportcard.com/report/github.com/s0rg/crawley)
 [![Maintainability](https://api.codeclimate.com/v1/badges/6542cd90a6c665e4202e/maintainability)](https://codeclimate.com/github/s0rg/crawley/maintainability)
 [![Test Coverage](https://api.codeclimate.com/v1/badges/e1c002df2b4571e01537/test_coverage)](https://codeclimate.com/github/s0rg/crawley/test_coverage)
+![Issues](https://img.shields.io/github/issues/s0rg/crawley)
 
 [![License](https://img.shields.io/badge/license-MIT%20License-blue.svg)](https://github.com/s0rg/crawley/blob/main/LICENSE)
 [![Go Version](https://img.shields.io/github/go-mod/go-version/s0rg/crawley)](go.mod)
@@ -16,7 +17,7 @@ Crawls web pages and prints any link it can find.
 # features
 
 - fast html SAX-parser (powered by `golang.org/x/net/html`)
-- small (<1500 SLOC), idiomatic, 100% test covered codebase
+- small (<1300 SLOC), idiomatic, 100% test covered codebase
 - grabs most of the useful resource urls (pics, videos, audios, forms, etc...)
 - found urls are streamed to stdout and guaranteed to be unique (with fragments omitted)
 - scan depth (limited by starting host and path, by default - 0) can be configured
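The README's feature list credits a SAX-style parser built on `golang.org/x/net/html`. As a minimal sketch of that tokenizer approach (an illustration only, not crawley's actual parser), assuming nothing beyond the public `html.Tokenizer` API:

```go
// Minimal sketch of SAX-style link extraction with golang.org/x/net/html.
// Not crawley's parser — just the tokenizer pattern the README refers to,
// scanning a deliberately reduced attribute set.
package main

import (
	"fmt"
	"strings"

	"golang.org/x/net/html"
)

func main() {
	const page = `<a href="/one">one</a><img src="/pic.png">`

	z := html.NewTokenizer(strings.NewReader(page))

	for {
		switch z.Next() {
		case html.ErrorToken:
			return // io.EOF ends the token stream
		case html.StartTagToken, html.SelfClosingTagToken:
			for _, a := range z.Token().Attr {
				if a.Key == "href" || a.Key == "src" {
					fmt.Println(a.Val)
				}
			}
		}
	}
}
```

Streaming tokens this way never builds a DOM tree, which is what keeps this parsing style fast and memory-light on large pages.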
45 changes: 31 additions & 14 deletions cmd/crawley/main.go
@@ -25,8 +25,7 @@ var (
     gitHash    string
     gitVersion string
     buildDate  string
-    extCookies values.List
-    extHeaders values.List
+    defaultUA = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + gitVersion + "-" + gitHash
     fVersion  = flag.Bool("version", false, "show version")
     fBrute    = flag.Bool("brute", false, "scan html comments")
     fSkipSSL  = flag.Bool("skip-ssl", false, "skip ssl verification")
@@ -35,13 +34,14 @@ var (
     fDepth        = flag.Int("depth", 0, "scan depth (-1 - unlimited)")
     fWorkers      = flag.Int("workers", runtime.NumCPU(), "number of workers")
     fDelay        = flag.Duration("delay", defaultDelay, "per-request delay (0 - disable)")
-    fUA           = flag.String("user-agent", defaultAgent, "user-agent string")
+    fUA           = flag.String("user-agent", defaultUA, "user-agent string")
     fRobotsPolicy = flag.String("robots", "ignore", "policy for robots.txt: ignore / crawl / respect")
     fDirsPolicy   = flag.String("dirs", "show", "policy for non-resource urls: show / hide / only")
-    defaultAgent  = "Mozilla/5.0 (compatible; Win64; x64) Mr." + appName + "/" + gitVersion + "-" + gitHash
+    extCookies    values.List
+    extHeaders    values.List
 )
 
-func callback(s string) {
+func puts(s string) {
     _, _ = os.Stdout.WriteString(s + "\n")
 }

@@ -51,7 +51,7 @@ func crawl(uri string, opts ...crawler.Option) error {
     log.Printf("[*] config: %s", c.DumpConfig())
     log.Printf("[*] crawling url: %s", uri)
 
-    if err := c.Run(uri, callback); err != nil {
+    if err := c.Run(uri, puts); err != nil {
         return fmt.Errorf("run: %w", err)
     }
@@ -60,35 +60,45 @@ return nil
     return nil
 }
 
-func options() (rv []crawler.Option) {
+func initOptions() (rv []crawler.Option, err error) {
     robots, err := crawler.ParseRobotsPolicy(*fRobotsPolicy)
     if err != nil {
-        log.Fatal("robots policy:", err)
+        err = fmt.Errorf("robots policy: %w", err)
+
+        return
     }
 
     dirs, err := crawler.ParseDirsPolicy(*fDirsPolicy)
     if err != nil {
-        log.Fatal("dirs policy:", err)
+        err = fmt.Errorf("dirs policy: %w", err)
+
+        return
     }
 
     workdir, err := os.Getwd()
     if err != nil {
-        log.Fatal("work dir:", err)
+        err = fmt.Errorf("work dir: %w", err)
+
+        return
     }
 
     fs := os.DirFS(workdir)
 
     headers, err := extHeaders.Load(fs)
     if err != nil {
-        log.Fatal("headers:", err)
+        err = fmt.Errorf("headers: %w", err)
+
+        return
     }
 
     cookies, err := extCookies.Load(fs)
     if err != nil {
-        log.Fatal("cookies:", err)
+        err = fmt.Errorf("cookies: %w", err)
+
+        return
     }
 
-    return []crawler.Option{
+    rv = []crawler.Option{
         crawler.WithUserAgent(*fUA),
         crawler.WithDelay(*fDelay),
         crawler.WithMaxCrawlDepth(*fDepth),
@@ -101,6 +111,8 @@ func options() (rv []crawler.Option) {
         crawler.WithExtraHeaders(headers),
         crawler.WithExtraCookies(cookies),
     }
+
+    return rv, nil
 }
 
 func main() {
@@ -128,11 +140,16 @@ func main() {
         return
     }
 
+    opts, err := initOptions()
+    if err != nil {
+        log.Fatal("options:", err)
+    }
+
     if *fSilent {
         log.SetOutput(io.Discard)
     }
 
-    if err := crawl(flag.Arg(0), options()...); err != nil {
+    if err := crawl(flag.Arg(0), opts...); err != nil {
         // forcing back stderr in case of errors, otherwise
         // if 'silent' is on - no one will know what happened.
         log.SetOutput(os.Stderr)
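The `cmd/crawley/main.go` change above replaces the `log.Fatal` calls inside the old `options()` with `%w`-wrapped errors returned from the new `initOptions()`, leaving a single `log.Fatal` in `main()`. A small sketch of why the wrapping matters: the caller keeps control, and the underlying cause still matches through the wrap. The file name and fallback behavior here are illustrative assumptions, not crawley code.

```go
// Sketch: returning a %w-wrapped error instead of calling log.Fatal keeps
// the caller in charge and leaves the cause inspectable via errors.Is.
// "headers.txt" and the fallback below are illustrative assumptions.
package main

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
)

func loadHeaders() error {
	if _, err := os.ReadFile("headers.txt"); err != nil {
		return fmt.Errorf("headers: %w", err) // wrap, don't exit
	}

	return nil
}

func main() {
	if err := loadHeaders(); err != nil {
		if errors.Is(err, fs.ErrNotExist) { // cause survives the wrap
			fmt.Println("no headers file, using defaults")

			return
		}

		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
```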
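`extCookies` and `extHeaders` are declared as `values.List`, and the `Load(fs)` calls above suggest their values can also be read from files under the working directory. The real type lives outside this diff, so the following is only an assumed sketch of a repeatable string flag built on the standard `flag.Value` interface:

```go
// Hypothetical sketch of a repeatable string flag in the spirit of
// values.List; the real implementation is not shown in this diff,
// so the names and behavior here are assumptions.
package main

import (
	"flag"
	"fmt"
	"strings"
)

// list collects every occurrence of its flag.
type list []string

func (l *list) String() string { return strings.Join(*l, ",") }

func (l *list) Set(v string) error {
	*l = append(*l, v)

	return nil
}

func main() {
	var headers list

	flag.Var(&headers, "header", "extra header (can be given multiple times)")
	flag.Parse()

	fmt.Println(headers)
}
```

`flag.Var` calls `Set` once per occurrence, so `-header a -header b` accumulates both values.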
2 changes: 0 additions & 2 deletions go.sum
@@ -1,4 +1,2 @@
-golang.org/x/net v0.0.0-20220607020251-c690dde0001d h1:4SFsTMi4UahlKoloni7L4eYzhFRifURQLw+yv0QDCx8=
-golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
 golang.org/x/net v0.0.0-20220802222814-0bcc04d9c69b h1:3ogNYyK4oIQdIKzTu68hQrr4iuVxF3AxKl9Aj/eDrw0=
 golang.org/x/net v0.0.0-20220802222814-0bcc04d9c69b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk=
