-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrapydemo.py
31 lines (26 loc) · 1.39 KB
/
scrapydemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import scrapy
class EPSpeakerSpider(scrapy.Spider):
    """Scrape speaker profiles reachable from the speakers index page.

    Crawling starts at ``start_urls``; every ``<li><a>`` link whose ``href``
    contains ``/conference/`` is followed, and each followed page is turned
    into one plain ``dict`` item by :meth:`parse_speakerdetails`.
    """

    name = 'epspeakers'
    start_urls = ['https://ep2015.europython.eu/en/speakers/']

    def parse(self, response):
        """Yield one request per speaker link found on the index page.

        Args:
            response: The downloaded speakers index page.

        Yields:
            scrapy.Request: A request for each matching link, with
            :meth:`parse_speakerdetails` as its callback.
        """
        hrefs = response.xpath(
            '//li/a[contains(@href, "/conference/")]/@href').extract()
        for href in hrefs:
            # Links may be relative; resolve them against the page URL.
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse_speakerdetails)

    def parse_speakerdetails(self, response):
        """Build the item for a single speaker profile page.

        Elements that are absent from the page produce ``None`` values
        instead of raising ``IndexError``, so a profile with, e.g., no avatar
        or a list entry without a link still yields an item.

        Args:
            response: The downloaded speaker profile page.

        Returns:
            dict: ``url``, ``name`` and ``avatar`` of the speaker, one entry
            per ``<dt>`` label of the ``dl-horizontal`` list (mapped to the
            text of the ``<dd>`` that follows it), and ``talks``, a list of
            ``{'title': ..., 'url': ...}`` dicts.
        """
        item = {'url': response.url}

        speaker_name = response.xpath(
            '//section[@class="profile-name"]//h1/text()').extract_first()
        item['name'] = (
            speaker_name.strip() if speaker_name is not None else None)

        avatar_src = response.xpath(
            '//img[@class="avatar"]/@src').extract_first()
        item['avatar'] = (
            response.urljoin(avatar_src) if avatar_src is not None else None)

        # Getting all attributes about the Speaker.
        # NOTE(review): labels are used verbatim as keys, so a label equal to
        # 'url', 'name' or 'avatar' would overwrite the value set above.
        for term in response.xpath('//dl[@class="dl-horizontal"]//dt'):
            label = term.xpath('.//text()').extract_first()
            if label is None:
                # A <dt> without any text gives no usable key.
                continue
            # "./following-sibling" (not ".//"): only the <dt> itself should
            # be the context node for the sibling axis.
            description = ''.join(
                term.xpath('./following-sibling::dd[1]//text()').extract()
            ).strip()
            item[label.strip()] = description

        # Extracting list of talks.
        talks = []
        for entry in response.xpath('//div[@class="speaker-talks well"]//li'):
            talk_title = entry.xpath('.//text()').extract_first()
            talk_href = entry.xpath('.//a/@href').extract_first()
            talk_url = (
                response.urljoin(talk_href) if talk_href is not None else None)
            talks.append({'title': talk_title, 'url': talk_url})
        item['talks'] = talks

        return item