-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
68 lines (49 loc) · 1.79 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import requests
from bs4 import BeautifulSoup
import json
import sys
###### SCRAPER ######
def clean_text(txt: str) -> str:
    """Collapse every run of whitespace in *txt* into a single space and trim the ends."""
    tokens = txt.strip().split()
    return " ".join(tokens)
def parse_rows(rows):
    """Tokenize each row string on single spaces, dropping empty tokens.

    Parameters
    ----------
    rows : iterable of str
        Raw text content of each table row.

    Returns
    -------
    list[list[str]]
        One list of non-empty tokens per input row.
    """
    # split(" ") yields "" for consecutive spaces, so filter falsy tokens.
    # (A token from split(" ") can never equal " ", so truthiness is an
    # exact replacement for the original `!= '' and != ' '` check.)
    # The original loop variable shadowed the builtin `str`; renamed.
    return [[token for token in row.split(" ") if token] for row in rows]
def scrape(url: str, timeout: float = 10.0) -> 'tuple[str, list[str], list[str]]':
    """Scrape the title, ingredients, and instructions from a recipe page.

    Parameters
    ----------
    url : str
        URL of the recipe page (CSS class names suggest allrecipes.com —
        TODO confirm the target site's current markup).
    timeout : float, optional
        Seconds to wait for the HTTP response; without one, requests can
        block indefinitely on a stalled server.

    Returns
    -------
    tuple[str, list[str], list[str]]
        (title, ingredients, instructions).  The original annotation
        claimed a 2-tuple, but the function has always returned three values.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # First <h1> on the page is assumed to be the recipe title.
    title = soup("h1")[0].text
    ingr_class = "ingredients-item-name"
    ingr_tags = soup("span", class_=ingr_class)
    ingredients = [clean_text(ingr.text) for ingr in ingr_tags]
    instr_class = "subcontainer instructions-section-item"
    instr_tags = soup("li", class_=instr_class)
    # Each instruction <li> is assumed to nest div > div > p — TODO confirm.
    instructions = [clean_text(instr.div.div.p.text) for instr in instr_tags]
    return title, ingredients, instructions
def scrape_ingr_subs(url: str, timeout: float = 10.0):
    """Scrape an ingredient-substitution table and persist it to subs.json.

    Parameters
    ----------
    url : str
        URL of a page containing a substitution table (<tr class="tableRow">).
    timeout : float, optional
        Seconds to wait for the HTTP response; without one, requests can
        block indefinitely on a stalled server.

    Returns
    -------
    dict
        Maps ingredient name -> {"amount": ..., "substitution": ...}.
        Also written to "subs.json" in the working directory as a side effect.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    table_rows = soup("tr", class_="tableRow")
    # Raw text of every row; parse_rows tokenizes it.
    rows = parse_rows([table_row.text for table_row in table_rows])
    sub_dict = {}
    # rows[1:-1] skips the header row and the trailing row, matching the
    # original behavior — presumably site boilerplate; verify against the page.
    for row in rows[1:-1]:
        # Guard against malformed rows: the original raised IndexError on
        # any row with fewer than three tokens.
        if len(row) < 3:
            continue
        sub_dict[row[0]] = {"amount": row[1], "substitution": row[2]}
    # `with` guarantees the file handle is closed even if json.dump raises
    # (the original leaked the handle on error and had a dead `count` var).
    with open("subs.json", "w") as f:
        json.dump(sub_dict, f)
    return sub_dict
#url = "https://www.allrecipes.com/article/common-ingredient-substitutions/"