-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathavito_parser.py
115 lines (88 loc) · 4.08 KB
/
avito_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import csv
import time
import random
import urllib3
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HEADERS = {
'User-agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'
}
COLUMNS = ['label', 'model', 'generation', 'modification', 'year', 'mileage', 'condition',
'doors_num', 'body', 'engine', 'transmission', 'color', 'drive', 'wheel', 'package', 'price']
def parse_avito():
    """Entry point: walk the Avito popular-models rubricator for the
    Nizhny Novgorod region and scrape every listed car model."""
    start_url = 'https://www.avito.ru/nizhniy_novgorod/avtomobili?cd=1&radius=200'
    soup = BeautifulSoup(get_request(start_url).text, 'lxml')
    rubricator = soup.find('div', {'class': 'popular-rubricator-links-b0HPS'})
    rows = rubricator.find_all('div', {'class': ['popular-rubricator-row-2oc-J']})
    for row in rows:
        # randomized delay to avoid hammering the site
        time.sleep(1 + random.random())
        href = row.find('a').get('href')
        parse_car_model('https://avito.ru' + href)
def parse_car_model(url):
    """Iterate over every result page of one car model's listing URL,
    scraping all ads on each page. Prints a progress counter."""
    total_pages = get_last_page(url)
    for page_num in range(1, total_pages + 1):
        clear_output(wait=True)
        print(f"Страница: {page_num}/{total_pages}", flush=True)
        # randomized delay between page fetches
        time.sleep(1 + random.random())
        parse_page_with_ads(url + f'&p={page_num}')
def get_last_page(url):
    """Return the number of the last pagination page for a listing URL.

    The pagination widget's second-to-last <span> holds the final page
    number (the last one is the "next" arrow).
    """
    soup = BeautifulSoup(get_request(url).text, 'lxml')
    spans = soup.find('div', {'class': 'pagination-root-2oCjZ'}).find_all('span')
    return int(spans[-2].text)
def parse_page_with_ads(url):
    """Scrape every ad snippet on one search-results page."""
    soup = BeautifulSoup(get_request(url).text, 'lxml')
    listing = soup.find('div', {'class': 'snippet-list js-catalog_serp'})
    for snippet in listing.find_all('div', {'class': ['item__line']}):
        # randomized delay between ad fetches
        time.sleep(1 + random.random())
        title_row = snippet.find('div', {'class': 'snippet-title-row'})
        parse_ad('https://avito.ru' + title_row.find('a').get('href'))
def parse_ad(url):
    """Fetch one ad page and append its parsed parameters to the CSV."""
    write_to_csv(get_car_info(url))
def get_car_info(url):
    """Scrape a single ad page and return its car parameters.

    Returns a dict keyed exactly by COLUMNS; parameters absent from the
    ad stay as empty strings.
    """
    ParamNameToEnglish = {'Марка': 'label', 'Модель': 'model', 'Поколение': 'generation',
                          'Модификация': 'modification', 'Год выпуска': 'year',
                          'Пробег': 'mileage', 'Состояние': 'condition', 'Руль': 'wheel',
                          'Тип кузова': 'body', 'Количество дверей': 'doors_num',
                          'Тип двигателя': 'engine', 'Коробка передач': 'transmission',
                          'Привод': 'drive', 'Цвет': 'color', 'Комплектация': 'package'}
    carParams = {key: '' for key in COLUMNS}
    request = get_request(url)
    soup = BeautifulSoup(request.text, 'lxml')
    for param in soup.find_all('li', {'class': 'item-params-list-item'}):
        # rows look like "Марка: BMW"; \xa0 is a non-breaking space
        parts = param.text.replace(u'\xa0', u' ').split(':')
        if len(parts) < 2:
            continue  # malformed row without ':' — previously raised IndexError
        key = ParamNameToEnglish.get(parts[0].strip())
        if key is None:
            continue  # unknown parameter name — previously wrote carParams[None]
        carParams[key] = parts[1].strip()
    price = soup.find('div', {'class': 'item-price'}).text.strip().split()
    # join the digit groups before the ruble sign: ['250', '000', '₽'] -> '250000'
    if '₽' in price:
        carParams['price'] = ''.join(price[:price.index('₽')])
    return carParams
def write_to_csv(carParams):
    """Append one car record to data/avito_cars.csv.

    Only the keys listed in COLUMNS are written, in COLUMNS order.
    The file must live in an existing ``data/`` directory; no header
    row is emitted (existing behavior).
    """
    # encoding='utf-8' is explicit: the values are Cyrillic/₽ text and the
    # platform default locale encoding may fail to encode them
    with open("data/avito_cars.csv", 'a', newline='', encoding='utf-8') as outputFile:
        writer = csv.DictWriter(outputFile, fieldnames=COLUMNS)
        # project onto the known columns so stray keys cannot raise ValueError
        writer.writerow({param: carParams[param] for param in COLUMNS})
def get_request(url):
    """GET *url* with the scraper's headers, retrying forever until a 200.

    Returns the successful ``requests.Response`` with encoding forced to
    UTF-8. TLS verification is disabled (matching the module-level
    ``urllib3`` warning suppression).
    """
    while True:
        try:
            rs = requests.get(url, headers=HEADERS, verify=False)
            if rs.status_code == 200:
                rs.encoding = 'utf-8'
                return rs
            print("Ошибка, код ответа:", rs.status_code)
            time.sleep(random.random())
        # narrowed from bare `except:` so KeyboardInterrupt/SystemExit can
        # still break the infinite retry loop
        except requests.exceptions.RequestException as e:
            print("Ошибка", e)
            time.sleep(random.random())