# Imports: requests + BeautifulSoup for scraping, pandas for tabular data,
# smtplib/email for alerts. The 'lxml' package must be installed for the
# BeautifulSoup parser, but it does not need to be imported directly.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib.parse
import matplotlib.pyplot as plt  # only needed if visualize_data is re-enabled
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import time
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Mobile Safari/537.36',
    'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7,ar;q=0.6'
}
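# Note: the mobile Chrome User-Agent above tends to fetch a lighter page
# layout, and the Accept-Language header requests French first; both are
# choices, not requirements. Amazon may still block or throttle automated
# requests, which is why a polite delay (time.sleep below) is kept between
# page fetches.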
SEARCH_URL = "https://www.amazon.com/s?k=manga&rh=n%3A4367&ref=nb_sb_noss"
MAX_RESULTS = 20
counter = 0  # global count of books scraped so far (used by search_res_listing)
# Get book info from a product URL.
def get_book_info(url):
    """Scrape one Amazon product page and return its details as a dict."""
    page = requests.get(url, headers=HEADER)
    soup = BeautifulSoup(page.text, 'lxml')

    title_elem = soup.select_one("#productTitle")
    title = title_elem.text.strip() if title_elem else None

    author_elem = soup.select_one('span.author')
    author = author_elem.text.strip() if author_elem else None

    date_elem = soup.select_one("#rpi-attribute-book_details-publication_date > div.a-section.a-spacing-none.a-text-center.rpi-attribute-value > span")
    date = date_elem.text.strip() if date_elem else None

    # The displayed price is split across three spans: whole part, fraction,
    # and currency symbol. Only build the price if all three are present.
    full_price_elem = soup.select_one('span.a-price-whole')
    frac_price_elem = soup.select_one('span.a-price-fraction')
    curr_elem = soup.select_one("span.a-price-symbol")
    price = (full_price_elem.text + frac_price_elem.text + curr_elem.text).strip() \
        if full_price_elem and frac_price_elem and curr_elem else None

    # Best-seller badge: rank (e.g. "#1") and its category, when present.
    order_bs_elem = soup.select_one("div.zg-badge-wrapper > a > i")
    order_bs = order_bs_elem.text.strip() if order_bs_elem else None
    cat_bs_elem = soup.select_one("div.zg-badge-wrapper > a > span > span")
    cat_bs = cat_bs_elem.text.strip() if cat_bs_elem else None

    return {
        'title': title,
        'author': author,
        'date': date,
        'price': price,
        'order_bs': order_bs,
        'cat_bs': cat_bs
    }
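# Example usage (sketch; "XXXXXXXXXX" stands in for a real ASIN):
#   info = get_book_info("https://www.amazon.com/dp/XXXXXXXXXX")
#   print(info['title'], info['price'])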
# List search results, following pagination until MAX_RESULTS books are collected.
def search_res_listing(url_search, books_data):
    global counter
    response = requests.get(url_search, headers=HEADER)
    print('The result status:', response.status_code)
    if response.status_code != 200:
        print("Failed to retrieve the page")
        return books_data
    search_soup = BeautifulSoup(response.text, 'lxml')
    links = search_soup.find_all("a", attrs={'class': "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})

    # Keep only relative hrefs, which point at product pages on the same host.
    url_elem_list = []
    for link in links:
        href = link.get('href')
        if href and href.startswith('/'):
            url_elem_list.append(href)

    for url_elem in url_elem_list:
        if counter >= MAX_RESULTS:
            print("Counter reached MAX_RESULTS")
            return books_data
        url = urllib.parse.urljoin(url_search, url_elem)
        print("Processing URL:", url)
        book_info = get_book_info(url)
        books_data.append(book_info)
        counter += 1

    # Pagination handling: recurse into the next results page, if any.
    next_book_el = search_soup.select_one('a.s-pagination-next')
    if next_book_el and counter < MAX_RESULTS:
        next_book_url = next_book_el.attrs.get('href')
        next_book_url = urllib.parse.urljoin(url_search, next_book_url)
        print("Next page URL:", next_book_url)
        time.sleep(2)  # be polite: delay between requests
        books_data = search_res_listing(next_book_url, books_data)
    return books_data
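# Note on pagination: each call handles one results page and recurses into
# the next until MAX_RESULTS items are collected or no "next" link remains.
# With MAX_RESULTS = 20 the recursion depth stays small; a much larger crawl
# would be better served by an iterative loop, to stay clear of Python's
# default recursion limit.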
# Preprocess the data: parse dates, clean author names, normalize best-seller fields.
def preprocess_data(dataframe):
    dataframe['date'] = pd.to_datetime(dataframe['date'], errors='coerce')
    # Keep only the first "Firstname Lastname" of the author cell, dropping
    # role suffixes like "(Author)" and anything after a newline.
    for i, auth in enumerate(dataframe['author']):
        if auth:
            auth = auth.split(" ")
            auth_name = auth[0] + ' ' + (auth[1] if len(auth) > 1 and 'Author' not in auth[1] else "")
            dataframe.at[i, "author"] = auth_name.split("\n")[0]
    # Books without a best-seller badge get the placeholder "No bs"; otherwise
    # the category string is split into words, dropping 'in' and '&'.
    dataframe['cat_bs'] = dataframe['cat_bs'].fillna("No bs")
    dataframe['cat_bs'] = dataframe['cat_bs'].apply(lambda x: [word for word in str(x).split() if word not in ['in', '&']] if x != 'No bs' else ["No bs"])
    dataframe['order_bs'] = dataframe['order_bs'].fillna("No bs")
    return dataframe
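# Illustration (hypothetical values): an author cell scraped as
# "Eiichiro Oda (Author)" becomes "Eiichiro Oda", and a badge category such as
# "in Manga & Comics" becomes ['Manga', 'Comics'].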
# Visualize the data (basic plotting; currently disabled).
"""def visualize_data(dataframe):
    plt.hist(dataframe['date'], bins=20)
    plt.figure()
    plt.plot(dataframe['price'], c='green', marker='o', linestyle='-')
    plt.xlabel('Index')
    plt.ylabel('Price ($)')
    plt.tight_layout()
    plt.show()
"""
# Send an email alert over Gmail SMTP.
def email_alert(subject, body, to):
    msg = MIMEMultipart()
    msg.attach(MIMEText(body, 'plain'))
    msg['subject'] = subject
    msg['to'] = to
    user = '[email protected]'
    msg['from'] = user
    pwd = 'x'  # placeholder: replace with a real credential before use
    server = smtplib.SMTP("smtp.gmail.com", 587)
    server.starttls()
    server.login(user, pwd)
    server.send_message(msg)
    server.quit()
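# A safer pattern (sketch, not what this script currently does): keep the
# credential out of the source by reading it from the environment, e.g.
#   import os
#   pwd = os.environ["GMAIL_APP_PASSWORD"]  # hypothetical variable name
# Note that Gmail SMTP logins typically require an app password rather than
# the account password.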
# Function to analyze variations between two CSV snapshots and send alerts.
def variation_analysis(old_data_path, new_data_path):
    old_data = pd.read_csv(old_data_path)
    new_data = pd.read_csv(new_data_path)
    for ind, row in new_data.iterrows():
        old_row = old_data[old_data['title'] == row['title']]
        if not old_row.empty:
            old_row = old_row.iloc[0]
            if row['price'] != old_row['price']:
                email_alert('Price change', f"The price of {row['title']} has changed from {old_row['price']} to {row['price']}.", '[email protected]')
            if row['order_bs'] != old_row['order_bs']:
                email_alert('Best-seller rank change', f"{row['title']}, previously ranked {old_row['order_bs']} in the {old_row['cat_bs']} category(ies), is now ranked {row['order_bs']} in {row['cat_bs']}.", '[email protected]')
        else:
            email_alert('New manga added', f"A new manga has appeared in your best-sellers search results: {row['title']} by {row['author']}, published {row['date']}, is at position {row['order_bs']} in category {row['cat_bs']}.", '[email protected]')
    for ind, row in old_data.iterrows():
        if row['title'] not in new_data['title'].values:
            email_alert('Manga disappeared!', f"The manga {row['title']} by {row['author']}, published {row['date']}, is no longer a best-seller.", '[email protected]')
    # Save the new snapshot so it can serve as the baseline for the next run.
    new_data.to_csv('m_amazon_2.csv', index=False)
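# The comparison is keyed on 'title': titles present in both snapshots are
# checked for price and best-seller-rank changes, titles only in new_data
# trigger a "new manga" alert, and titles only in old_data trigger a
# "disappeared" alert.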
# Main function to execute the script.
def main():
    books_data = search_res_listing(SEARCH_URL, [])
    dataframe = pd.DataFrame(books_data)
    dataframe.to_csv("m_amazon_1.csv", index=False, sep=',')
    #dataframe = preprocess_data(dataframe)
    #visualize_data(dataframe)
    # Variation analysis example
    old_data_path = 'mangas_amazon_test1.csv'
    new_data_path = 'm_amazon_1.csv'
    #variation_analysis(old_data_path, new_data_path)
if __name__ == "__main__":
    main()
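# To run the full pipeline:  python amazon_scraper.py
# This produces m_amazon_1.csv; uncomment preprocess_data(...) and
# variation_analysis(...) in main() to enable cleaning and change alerts.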