Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Named Entity Chunker #120

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
sudo: false
sudo: true
language: python

python:
Expand All @@ -7,9 +7,13 @@ python:
- '2.7'
- pypy

addons:
apt:
packages:
- numpy

install:
- pip install coveralls
- pip install nltk
- pip install -r requirements.txt

services:
Expand Down
48 changes: 48 additions & 0 deletions chatterbot/utils/entity_tagger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import nltk

from chatterbot.utils.pos_tagger import POSTagger


class NamedEntityTagger():
    """
    A wrapper class for ChatterBot's custom implementations
    of:
    1) Named Entity chunking
    """

    def __init__(self):
        """
        Make sure the NLTK data needed for named entity chunking is
        installed, then create the part-of-speech tagger that
        ``ne_chunk`` uses to tokenize and tag its input.
        """
        from nltk.data import find
        from nltk import download

        # nltk.data.find() expects category-qualified resource paths
        # (e.g. 'corpora/words'), while download() takes the bare
        # package name. Looking up the bare '<name>.zip' does not match
        # installed data, which triggers a download on every construction.
        required_resources = (
            ('corpora/words', 'words'),
            ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
        )

        for resource_path, package_name in required_resources:
            try:
                find(resource_path)
            except LookupError:
                download(package_name)

        self.tagger = POSTagger()

    def ne_chunk(self, string):
        """
        Find all of the named entities and return them.

        The text is tokenized and part-of-speech tagged with
        ``self.tagger``, chunked with ``nltk.ne_chunk`` in binary mode
        and flattened with ``nltk.chunk.tree2conlltags``.

        :param string: The text to search for named entities.
        :returns: A list of the words that belong to a named entity,
            in order of first appearance and without duplicates.
        """
        tokens = self.tagger.tokenize(string)
        tagged_tokens = self.tagger.tag(tokens)

        chunk_tree = nltk.ne_chunk(tagged_tokens, binary=True)
        conll_tags = nltk.chunk.tree2conlltags(chunk_tree)

        ne_list = []

        # Each item is a (word, pos_tag, chunk_tag) triple; only words
        # whose chunk tag mentions "NE" are part of a named entity.
        for word, _pos_tag, chunk_tag in conll_tags:
            # Compare the word itself: the list holds words, not triples,
            # so checking the whole triple would never detect a duplicate.
            if "NE" in chunk_tag and word not in ne_list:
                ne_list.append(word)

        return ne_list
14 changes: 14 additions & 0 deletions chatterbot/utils/pos_tagger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from nltk import word_tokenize
from nltk import pos_tag


class POSTagger():

def __init__(self):
from nltk.data import find
from nltk import download
Expand All @@ -11,6 +13,11 @@ def __init__(self):
except LookupError:
download('punkt')

try:
find('averaged_perceptron_tagger.zip')
except LookupError:
download('averaged_perceptron_tagger')

def tokenize(self, text):
"""
Takes an input string and tokenizes that text.
Expand All @@ -19,3 +26,10 @@ def tokenize(self, text):
"""

return word_tokenize(text)

def tag(self, tokens):
    """
    Run the given tokens through ``pos_tag`` and hand back
    whatever tagged result it produces.
    """
    tagged_tokens = pos_tag(tokens)
    return tagged_tokens
9 changes: 9 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from chatterbot.utils.pos_tagger import POSTagger
from chatterbot.utils.stop_words import StopWordsManager
from chatterbot.utils.word_net import Wordnet
from chatterbot.utils.entity_tagger import NamedEntityTagger


class UtilityTests(TestCase):
Expand Down Expand Up @@ -34,6 +35,14 @@ def test_word_net(self):

self.assertEqual(0.06666666666666667, synsets[0].path_similarity(synsets[1]))

def test_ne_tagger(self):
    # One tagger instance is shared by both checks below.
    tagger = NamedEntityTagger()

    # A sentence with no named entities, then one made up only of them.
    without_entities = tagger.ne_chunk("test one, two, and three.")
    with_entities = tagger.ne_chunk("New York Times")

    expected_empty = []
    expected_words = ['New', 'York', 'Times']

    self.assertEqual(without_entities, expected_empty)
    self.assertEqual(with_entities, expected_words)

class CleanWhitespaceTests(TestCase):

def test_clean_whitespace(self):
Expand Down