-
Notifications
You must be signed in to change notification settings - Fork 0
/
WebScrapingTeste.py
74 lines (54 loc) · 2.17 KB
/
WebScrapingTeste.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Required third-party packages:
# pip install requests
# pip install pandas
# pip install lxml
# pip install beautifulsoup4
# pip install selenium
import json
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.chrome.options import Options
# Page to scrape: NBA per-player traditional totals, 2019-20 regular season,
# sorted by player name. The table itself is rendered by JavaScript, hence
# the Selenium driver set up further below.
url = "https://stats.nba.com/players/traditional/?PerMode=Totals&Season=2019-20&SeasonType=Regular%20Season&sort=PLAYER_NAME&dir=-1"

# Accumulates, per ranking category, the top-10 rows extracted from the page.
top10ranking = {}

# Each ranking category maps to the table header's sortable `data-field`
# attribute ('field') and the column label shown in the parsed table ('label').
rankings = {
    category: {'field': field, 'label': label}
    for category, field, label in (
        ('3points', 'FG3M', '3PM'),
        ('points', 'PTS', 'PTS'),
        ('assistants', 'AST', 'AST'),
        ('rebounds', 'REB', 'REB'),
        ('steals', 'STL', 'STL'),
        ('blocks', 'BLK', 'BLK'),
    )
}
def buildrank(category):
    """Sort the on-page stats table by one category and return its top 10.

    Args:
        category: key into the module-level ``rankings`` dict
            (e.g. ``'points'``). Renamed from ``type``, which shadowed the
            builtin; the only caller passes it positionally.

    Returns:
        list[dict]: ten records with keys ``pos``, ``player``, ``team``,
        ``total``.

    Relies on the module-level Selenium ``driver`` already pointing at the
    NBA stats page.
    """
    field = rankings[category]['field']
    label = rankings[category]['label']

    # Click the column header to re-sort the table by this stat.
    # Selenium 4 removed find_element_by_xpath; use find_element(By.XPATH, ...).
    driver.find_element(
        By.XPATH,
        f"//div[@class='nba-stat-table']//table//thead//tr//th[@data-field='{field}']",
    ).click()
    element = driver.find_element(
        By.XPATH, "//div[@class='nba-stat-table']//table")
    html_content = element.get_attribute('outerHTML')

    # Parse the table HTML with BeautifulSoup.
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find(name='table')

    # read_html with a literal string is deprecated; pass a file-like object.
    # Keep only the first parsed table and its top 10 rows.
    df_full = pd.read_html(StringIO(str(table)))[0].head(10)

    # Keep rank (the unnamed first column), player, team and the stat column.
    # .copy() detaches from df_full so the rename below doesn't hit
    # pandas' SettingWithCopyWarning on a slice view.
    df = df_full[['Unnamed: 0', 'PLAYER', 'TEAM', label]].copy()
    df.columns = ['pos', 'player', 'team', 'total']

    # One dict per row: {'pos': ..., 'player': ..., 'team': ..., 'total': ...}
    return df.to_dict('records')
# Launch Firefox (uncomment the headless line for a no-window run).
option = Options()
# option.headless = True
driver = webdriver.Firefox(options=option)
# driver = webdriver.Chrome(options=option)  # Chrome alternative

try:
    driver.get(url)
    # Give the JS-rendered stats table time to appear before querying it.
    driver.implicitly_wait(10)  # in seconds

    # Build the top-10 list for every stat category.
    for k in rankings:
        top10ranking[k] = buildrank(k)
finally:
    # Always close the browser, even if scraping fails partway through —
    # otherwise the Firefox process is leaked.
    driver.quit()

# Save the collected rankings as JSON. ensure_ascii=False keeps accented
# player names readable, matching the explicit utf-8 file encoding.
with open('ranking.json', 'w', encoding='utf-8') as jp:
    json.dump(top10ranking, jp, indent=4, ensure_ascii=False)