chapter12-parse HTML using BeautifulSoup
ord() function tells us the numeric value of a single ASCII character; print(ord('\n')) prints 10
Uppercase letters have lower ordinal values than lowercase letters, so 'Hi' compares as less than 'zzz' (all lowercase)
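A quick check of these values (the numbers in the comments are the standard ASCII codes):
print(ord('\n'))           # 10: newline
print(ord('H'), ord('z'))  # 72 122: uppercase codes come before lowercase
print('Hi' < 'zzz')        # True, because ord('H') < ord('z')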
byte string (encoded bytes) vs. unicode string (the regular str type in Python 3): x=u'你好', print(type(x)) -> <class 'str'>; x=b'abc', print(type(x)) -> <class 'bytes'>
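A minimal sketch of converting between the two types with encode() and decode() (UTF-8 is Python's default encoding):
s = '你好'            # regular (unicode) string
b = s.encode()        # bytes: b'\xe4\xbd\xa0\xe5\xa5\xbd' in UTF-8
print(type(b))        # <class 'bytes'>
print(b.decode())     # back to a str: 你好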
# Decode from bytes to strings
import socket
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
mysock.connect(('data.pr4e.org', 80))
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\n\n'.encode()  # encode the request string to bytes
mysock.send(cmd)
while True:
    data = mysock.recv(512)   # bytes
    if len(data) < 1:
        break
    mystring = data.decode()  # unicode
    print(mystring)
mysock.close()
------------------------------------------------------------------------
1. import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')  # open the URL like a file
for line in fhand:
    print(line.decode().strip())  # each line arrives as bytes; decode to a string
2. import urllib.request, urllib.parse, urllib.error
fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')  # or 'http://www.dr-chuck.com/page1.htm'
counts = dict()
for line in fhand:
    words = line.decode().split()
    for word in words:
        counts[word] = counts.get(word, 0) + 1
print(counts)
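The same counts could also be built with collections.Counter, which handles the get-or-default step itself — a small alternative sketch, assuming the same fhand handle:
from collections import Counter
counts = Counter()
for line in fhand:
    counts.update(line.decode().split())  # add one occurrence per word
print(counts)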
------------------------------------------------------------------------------
BeautifulSoup install: (1) install beautifulsoup4 from https://pypi.python.org/pypi/beautifulsoup4, or download http://www.py4e.com/code3/bs4.zip
and unzip it in the same directory as this file.
PowerShell: PS C:\Users\Sharon> pip install bs4
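One quick sanity check that the install worked is to import the package and parse a tiny snippet:
from bs4 import BeautifulSoup   # raises ImportError if bs4 is not installed
print(BeautifulSoup('<p>hello</p>', 'html.parser').p.string)  # prints: hello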
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input('Enter - ')
# url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, 'html.parser')
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.contents[0])
    print('Attrs:', tag.attrs)
PS C:\Users\Sharon\code\test> py test1.py
Enter - http://www.dr-chuck.com/page1.htm
http://www.dr-chuck.com/page2.htm
TAG: <a href="http://www.dr-chuck.com/page2.htm">
Second Page</a>
URL: http://www.dr-chuck.com/page2.htm
Contents:
Second Page
Attrs: {'href': 'http://www.dr-chuck.com/page2.htm'}
----------------------------------------------------------------
spans = soup('span')
numbers = [int(span.string) for span in spans]  # one number per <span> tag
def sum_list(nums):
    total = 0
    for num in nums:
        total = total + num
    return total
print(sum_list(numbers))
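The same total can come straight from the built-in sum(), which also avoids shadowing the names list and sum:
print(sum(int(span.string) for span in spans))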
-----------------------------------------------------------------
# Follow the link at a given position, repeating the process a given number of times
url = 'http://py4e-data.dr-chuck.net/known_by_Neeve.html'
position = input('Enter position: ')
position = int(position)
times = input('Enter times: ')
times = int(times)
t = 0
while t < times:
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    x = 0
    for tag in tags:
        x = x + 1
        if x == position:
            url = tag.get('href', None)
            print('Retrieving:', url)
            break
    t = t + 1