chapter12-parse HTML using BeautifulSoup
ord() function tells us the numeric value of a single ASCII character; print(ord('\n')) prints 10
Uppercase letters have lower ordinal values than lowercase letters, so 'Hi' compares as less than 'zzz' (all lowercase)
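A quick check of these values (the numbers in the comments are the standard ASCII codes):
print(ord('\n'))           # 10: newline
print(ord('H'), ord('z'))  # 72 122: uppercase codes come before lowercase
print('Hi' < 'zzz')        # True, because ord('H') < ord('z')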
byte string (encoded bytes) vs. unicode string (the regular str type in Python 3): x=u'你好', print(type(x)) -> <class 'str'>; x=b'abc', print(type(x)) -> <class 'bytes'>
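A minimal sketch of converting between the two types with encode() and decode() (UTF-8 is Python's default encoding):
s = '你好'            # regular (unicode) string
b = s.encode()        # bytes: b'\xe4\xbd\xa0\xe5\xa5\xbd' in UTF-8
print(type(b))        # <class 'bytes'>
print(b.decode())     # back to a str: 你好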
# Decode from bytes to strings
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\n\n'.encode()  # encode the request string to bytes
mysock.send(cmd)
while True:
    data = mysock.recv(512)   # bytes
    if len(data) < 1:
        break
    mystring = data.decode()  # unicode
    print(mystring)
mysock.close()
------------------------------------------------------------------------
1. import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')  # open the URL like a file
for line in fhand:
    print(line.decode().strip())  # each line arrives as bytes; decode to a string
2. import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')  # or 'http://www.dr-chuck.com/page1.htm'
counts = dict()
for line in fhand:
    words = line.decode().split()
    for word in words:
        counts[word] = counts.get(word, 0) + 1
print(counts)
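The same counts could also be built with collections.Counter, which handles the get-or-default step itself — a small alternative sketch, assuming the same fhand handle:
from collections import Counter
counts = Counter()
for line in fhand:
    counts.update(line.decode().split())  # add one occurrence per word
print(counts)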
------------------------------------------------------------------------------
BeautifulSoup install: (1) install beautifulsoup4 from https://pypi.python.org/pypi/beautifulsoup4, or download http://www.py4e.com/code3/bs4.zip
and unzip it in the same directory as this file.
PowerShell: PS C:\Users\Sharon> pip install bs4
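One quick sanity check that the install worked is to import the package and parse a tiny snippet:
from bs4 import BeautifulSoup   # raises ImportError if bs4 is not installed
print(BeautifulSoup('<p>hello</p>', 'html.parser').p.string)  # prints: hello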
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
# url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.contents[0])
    print('Attrs:', tag.attrs)
PS C:\Users\Sharon\code\test> py test1.py
Enter - http://www.dr-chuck.com/page1.htm
http://www.dr-chuck.com/page2.htm
TAG: <a href="http://www.dr-chuck.com/page2.htm">
Second Page</a>
URL: http://www.dr-chuck.com/page2.htm
Contents:
Second Page
Attrs: {'href': 'http://www.dr-chuck.com/page2.htm'}
----------------------------------------------------------------
spans = soup('span')
numbers = [int(span.string) for span in spans]  # one number per <span> tag
def sum_list(nums):
    total = 0
    for num in nums:
        total = total + num
    return total
print(sum_list(numbers))
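The same total can come straight from the built-in sum(), which also avoids shadowing the names list and sum:
print(sum(int(span.string) for span in spans))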
-----------------------------------------------------------------
# Follow the link at a given position, repeating the process a given number of times
url = 'http://py4e-data.dr-chuck.net/known_by_Neeve.html'
position = input('Enter position: ')
position = int(position)
times = input('Enter times: ')
times = int(times)
t = 0
while t < times:
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    x = 0
    for tag in tags:
        x = x + 1
        if x == position:
            url = tag.get('href', None)
            print('Retrieving:', url)
            break
    t = t + 1