-
Notifications
You must be signed in to change notification settings - Fork 2
/
nltkdemo.py
38 lines (29 loc) · 1.17 KB
/
nltkdemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from nltk import sent_tokenize
from nltk.tag.stanford import NERTagger
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class EPSpeakerSpider(CrawlSpider):
    """Crawl EuroPython 2015 pages and extract each speaker's name.

    Starting from the speakers listing, every link whose URL matches
    ``/conference/`` is followed and handed to :meth:`parse_speakers`,
    which runs the Stanford NER tagger over the first sentence of the
    page's "Compact biography" section.
    """

    name = 'epspeakers_crawlspider'
    start_urls = ['https://ep2015.europython.eu/en/speakers/']
    rules = [
        Rule(LinkExtractor(allow=('/conference/',)),
             callback='parse_speakers'),
    ]
    # Created once when the class body is executed and shared by every
    # callback invocation. The classifier and jar paths are relative to
    # the current working directory.
    ner = NERTagger('./NER/classifiers/english.all.3class.distsim.crf.ser.gz',
                    './NER/stanford-ner.jar',
                    encoding='utf-8')

    def parse_speakers(self, response):
        """Build an item holding the page URL and the detected speaker name.

        Args:
            response: The Scrapy response for a followed ``/conference/``
                page.

        Returns:
            A dict with keys ``'url'`` and ``'name'``. ``'name'`` is the
            space-joined tokens tagged ``PERSON`` in the first sentence of
            the biography, or ``''`` when none were tagged. Returns
            ``None`` (no item) when the page has no "Compact biography"
            text at all.
        """
        bio_nodes = response.xpath(
            '//*[text()="Compact biography"]/following-sibling::'
            'dd//text()[normalize-space()]').extract()
        if not bio_nodes:
            # The link rule is broad, so some matched pages may not carry a
            # biography; skip them instead of raising IndexError.
            self.logger.debug('No compact biography found on %s',
                              response.url)
            return None

        # Speaker name is usually in the first sentence
        sentences = sent_tokenize(bio_nodes[0])
        tokens = sentences[0].split() if sentences else []
        # Avoid invoking the external tagger when there is nothing to tag.
        tags = self.ner.tag(tokens) if tokens else []

        speaker_name = ' '.join(
            word for word, label in tags if label == 'PERSON')
        return {
            'url': response.url,
            'name': speaker_name.strip(),
        }