forked from jeroenjanssens/data-science-at-the-command-line
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape
executable file
·51 lines (43 loc) · 1.8 KB
/
scrape
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
# scrape: Extract HTML elements using an XPath query or CSS3 selector.
#
# Example usage: curl 'http://en.wikipedia.org/wiki/List_of_sovereign_states' -s \
# | scrape -be 'table.wikitable > tr > td > b > a'
#
# Dependencies: lxml, plus cssselect for CSS3 selectors (including the default '*')
#
# Author: http://jeroenjanssens.com
import sys
import argparse
from lxml import etree
def _serialize(node, as_text, delimiter):
    """Return one XPath result as UTF-8 bytes, without a trailing newline.

    node: an item from the list returned by ``document.xpath()``; either an
        lxml element or a string result (attribute value / text node).
    as_text: when true, elements are rendered as their text content instead
        of serialized markup.
    delimiter: string placed between the text fragments of one element when
        ``as_text`` is true.
    """
    if not etree.iselement(node):
        # Queries such as //a/@href or //p/text() yield strings, which
        # etree.tostring() refuses to serialize.
        rendered = node
    elif as_text:
        # Interpretation of --text/--delimiter: strip each text fragment of
        # the element, drop the empty ones and join the rest.
        fragments = (piece.strip() for piece in node.itertext())
        rendered = delimiter.join(piece for piece in fragments if piece)
    else:
        rendered = etree.tostring(node)
    if isinstance(rendered, bytes):
        return rendered
    return rendered.encode('utf-8')


def main():
    """Print the parts of an HTML document that match an expression.

    Command-line arguments:
      html             file to read (binary mode); standard input if omitted.
      -e/--expression  XPath query when it starts with ``//``, otherwise a
                       CSS3 selector translated to XPath via ``cssselect``.
      -t/--text        print the text of each match instead of its markup.
      -b/--body        wrap the output in DOCTYPE, HTML and BODY tags.
      -d/--delimiter   separator between the text fragments of one match
                       (used only with --text).

    Each match is written on its own line and flushed immediately. Exits
    through ``parser.error`` when the CSS selector is invalid. Returns None.
    """
    parser = argparse.ArgumentParser()
    # The input is handed to lxml as bytes; on Python 3 the byte stream of
    # stdin is sys.stdin.buffer (Python 2's sys.stdin is already bytes).
    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
                        default=getattr(sys.stdin, 'buffer', sys.stdin),
                        help="HTML", metavar="HTML")
    parser.add_argument('-e', '--expression', default='*',
                        help="XPath query or CSS3 selector")
    parser.add_argument('-t', '--text', action='store_true', default=False,
                        help="Output text instead of HTML")
    parser.add_argument('-b', '--body', action='store_true', default=False,
                        help="Enclose output with HTML and BODY tags")
    parser.add_argument('-d', '--delimiter', default=' ',
                        help="Delimiter when output is text")
    args = parser.parse_args()

    if args.expression.startswith('//'):
        expression = args.expression
    else:
        # Imported lazily so that pure XPath usage does not need cssselect.
        from cssselect import GenericTranslator, SelectorError
        try:
            expression = GenericTranslator().css_to_xpath(args.expression)
        except SelectorError:
            parser.error('Invalid CSS selector')

    html_parser = etree.HTMLParser(encoding='utf-8')
    document = etree.parse(args.html, html_parser)

    # Serialized elements are bytes, so write to the binary layer of stdout.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    try:
        if args.body:
            out.write(b"<!DOCTYPE html>\n<html>\n<body>\n")
        for node in document.xpath(expression):
            out.write(_serialize(node, args.text, args.delimiter) + b"\n")
            out.flush()
        if args.body:
            out.write(b"</body>\n</html>\n")
        out.flush()
    except IOError:
        # Best effort, as before: stay quiet when the output cannot be
        # written (e.g. piped into `head`), but stop at the first failure
        # instead of retrying every remaining match.
        pass
# Run the command-line tool when executed as a script. sys.exit is used
# rather than the bare `exit` helper, which is only installed by the `site`
# module and is absent under `python -S` or in frozen executables.
if __name__ == "__main__":
    sys.exit(main())