// scrape.go

// Package scrape provides a search API on top of golang.org/x/net/html.
package scrape

import (
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// Matcher is a predicate over HTML nodes. It should return true when the
// node passed to it is one the caller is looking for, and false otherwise.
type Matcher func(node *html.Node) bool

// FindAll walks the tree rooted at node (node itself included) and collects
// every node accepted by matcher. Once a node matches, its descendants are
// _not_ searched; use FindAllNested to also collect matches that live inside
// other matches.
func FindAll(node *html.Node, matcher Matcher) []*html.Node {
	const searchNested = false
	return findAllInternal(node, matcher, searchNested)
}

// FindAllNested walks the tree rooted at node (node itself included) and
// collects every node accepted by matcher. Unlike FindAll, it keeps descending
// below a matching node, so matches nested inside other matches _are_ reported.
func FindAllNested(node *html.Node, matcher Matcher) []*html.Node {
	const searchNested = true
	return findAllInternal(node, matcher, searchNested)
}

// Find performs a depth-first search starting at node and returns the first
// node accepted by matcher. node itself is tested before any of its children.
// When nothing matches, the returned node is nil and ok is false.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     matcher := func(n *html.Node) bool {
//         return n.DataAtom == atom.Body
//     }
//     body, ok := scrape.Find(root, matcher)
func Find(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
	if matcher(node) {
		return node, true
	}

	// Recurse into each child in order; the first hit wins.
	child := node.FirstChild
	for child != nil {
		if hit, found := Find(child, matcher); found {
			return hit, true
		}
		child = child.NextSibling
	}
	return nil, false
}

// FindParent climbs from node towards the root of the tree and returns the
// nearest ancestor accepted by matcher. node itself is never tested. If no
// ancestor matches, the returned node is nil and ok is false.
func FindParent(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
	ancestor := node.Parent
	for ancestor != nil {
		if matcher(ancestor) {
			return ancestor, true
		}
		ancestor = ancestor.Parent
	}
	return nil, false
}

// Text returns the text of all descendant text nodes as a single string.
// Each piece is whitespace-trimmed, pieces that are empty after trimming are
// dropped, and the remainder is joined with a single space.
// For control over the join function, see TextJoin.
func Text(node *html.Node) string {
	return TextJoin(node, func(parts []string) string {
		// Filter in place: kept shares parts' backing array, and it never
		// grows past the element currently being read.
		kept := parts[:0]
		for _, p := range parts {
			if t := strings.TrimSpace(p); t != "" {
				kept = append(kept, t)
			}
		}
		return strings.Join(kept, " ")
	})
}

// TextJoin gathers the Data of every text node found under node (via FindAll)
// and hands the resulting slice to join, returning whatever join produces.
// The slice passed to join is freshly built for this call.
func TextJoin(node *html.Node, join func([]string) string) string {
	isText := func(n *html.Node) bool { return n.Type == html.TextNode }
	textNodes := FindAll(node, isText)

	parts := make([]string, 0, len(textNodes))
	for _, tn := range textNodes {
		parts = append(parts, tn.Data)
	}
	return join(parts)
}

// Attr returns the value of the first attribute on node whose Key equals key.
// If node carries no such attribute, the empty string is returned, so a
// missing attribute is indistinguishable from one whose value is empty.
func Attr(node *html.Node, key string) string {
	for i := range node.Attr {
		if attr := &node.Attr[i]; attr.Key == key {
			return attr.Val
		}
	}
	return ""
}

// ByTag returns a Matcher that accepts every node whose DataAtom equals a.
//
//     root, err := html.Parse(resp.Body)
//     if err != nil {
//         // handle error
//     }
//     title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
func ByTag(a atom.Atom) Matcher {
	matchTag := func(n *html.Node) bool {
		return n.DataAtom == a
	}
	return matchTag
}

// ById returns a Matcher that accepts every node whose "id" attribute equals
// id. Because Attr yields "" for a missing attribute, ById("") also accepts
// nodes that have no id attribute at all.
func ById(id string) Matcher {
	matchID := func(n *html.Node) bool {
		got := Attr(n, "id")
		return got == id
	}
	return matchID
}

// ByClass returns a Matcher that accepts every node whose "class" attribute,
// split on whitespace, contains class as one of its entries. The comparison
// is exact and case-sensitive.
func ByClass(class string) Matcher {
	hasClass := func(n *html.Node) bool {
		for _, name := range strings.Fields(Attr(n, "class")) {
			if name == class {
				return true
			}
		}
		return false
	}
	return hasClass
}

// findAllInternal encapsulates the node tree traversal shared by FindAll and
// FindAllNested.
//
// The tree rooted at node is visited depth-first, parents before children and
// children in sibling order, and every node accepted by matcher is appended to
// the result in that order. When searchNested is false the walk does not
// descend below a matching node; when it is true, descendants of a match are
// searched as well.
//
// The returned slice is always non-nil, even when nothing matched.
func findAllInternal(node *html.Node, matcher Matcher, searchNested bool) []*html.Node {
	matched := []*html.Node{}

	// All matches are appended to the single matched slice, instead of
	// building a fresh slice per visited node and copying it into the
	// parent's slice at every level of the recursion.
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if matcher(n) {
			matched = append(matched, n)

			if !searchNested {
				return
			}
		}

		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}

	walk(node)
	return matched
}

// FindNextSibling scans the siblings that follow node, nearest first, and
// returns the first one accepted by matcher. node itself is never tested.
// If no following sibling matches, the returned node is nil and ok is false.
//
//     heading, ok := scrape.Find(root, scrape.ByTag(atom.H1))
//     if !ok {
//         // handle missing heading
//     }
//     para, ok := scrape.FindNextSibling(heading, scrape.ByTag(atom.P))
func FindNextSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
	sib := node.NextSibling
	for sib != nil {
		if matcher(sib) {
			return sib, true
		}
		sib = sib.NextSibling
	}
	return nil, false
}

// FindPrevSibling scans the siblings that precede node, nearest first, and
// returns the first one accepted by matcher. node itself is never tested.
// If no preceding sibling matches, the returned node is nil and ok is false.
//
//     para, ok := scrape.Find(root, scrape.ByTag(atom.P))
//     if !ok {
//         // handle missing paragraph
//     }
//     heading, ok := scrape.FindPrevSibling(para, scrape.ByTag(atom.H1))
func FindPrevSibling(node *html.Node, matcher Matcher) (n *html.Node, ok bool) {
	for n = node.PrevSibling; n != nil; n = n.PrevSibling {
		if matcher(n) {
			return n, true
		}
	}
	return nil, false
}