Example using Seldon for text classification with SpaCy tokenizer #578

Merged · 1 commit · May 21, 2019
23 changes: 23 additions & 0 deletions examples/models/sklearn_spacy_text/RedditClassifier.py
@@ -0,0 +1,23 @@
import dill

from ml_utils import CleanTextTransformer, SpacyTokenTransformer

class RedditClassifier(object):
    def __init__(self):
        self._clean_text_transformer = CleanTextTransformer()
        self._spacy_tokenizer = SpacyTokenTransformer()

        # Load the TF-IDF vectorizer and logistic regression model that were
        # serialised with dill at training time.
        with open('tfidf_vectorizer.model', 'rb') as model_file:
            self._tfidf_vectorizer = dill.load(model_file)

        with open('lr.model', 'rb') as model_file:
            self._lr_model = dill.load(model_file)

    def predict(self, X, feature_names):
        # Seldon calls predict() with the request payload: clean the raw
        # text, tokenize with spaCy, vectorise, and return class
        # probabilities.
        clean_text = self._clean_text_transformer.transform(X)
        spacy_tokens = self._spacy_tokenizer.transform(clean_text)
        tfidf_features = self._tfidf_vectorizer.transform(spacy_tokens)
        predictions = self._lr_model.predict_proba(tfidf_features)
        return predictions
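
Seldon's Python wrapper only requires that the class expose predict(self, X, feature_names), so the classifier can be smoke-tested locally before the image is built. A minimal sketch, assuming the two dill artifacts sit in the working directory and that the sample payload (the array shape and the 'text' feature name) mirrors what the deployment would receive:

import numpy as np

from RedditClassifier import RedditClassifier

# Hypothetical sample input: a 2D array with one text feature per row,
# matching the ndarray payload Seldon passes to predict().
sample = np.array([['This is a test comment about machine learning']])

clf = RedditClassifier()
probabilities = clf.predict(sample, ['text'])
print(probabilities)  # e.g. [[0.23 0.77]] -- one probability per class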

Empty file.
75 changes: 75 additions & 0 deletions examples/models/sklearn_spacy_text/ml_utils.py
@@ -0,0 +1,75 @@
import re

import numpy as np
import spacy
from html import unescape
from sklearn.base import TransformerMixin
from spacy.cli import download

# Download the small English model at import time so the serving container
# needs no separate setup step.
download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


class SpacyTokenTransformer(TransformerMixin):
    __symbols = set("!$%^&*()_+|~-=`{}[]:\";'<>?,./-")

    def transform(self, X, **kwargs):
        f = np.vectorize(SpacyTokenTransformer.transform_to_token,
                         otypes=[object])
        X_tokenized = f(X)
        return X_tokenized

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_to_token(text):
        str_text = str(text)
        doc = nlp(str_text, disable=['parser', 'tagger', 'ner'])
        tokens = []
        for token in doc:
            if token.like_url:
                # Collapse every URL into a single placeholder token.
                clean_token = "URL"
            else:
                clean_token = token.lemma_.lower().strip()
                # Skip empty lemmas and bare punctuation symbols.
                if len(clean_token) < 1 or \
                        clean_token in SpacyTokenTransformer.__symbols:
                    continue
            tokens.append(clean_token)
        return tokens


class CleanTextTransformer(TransformerMixin):
    __uplus_pattern = re.compile(r"\<[uU]\+(?P<digit>[a-zA-Z0-9]+)\>")
    __markup_link_pattern = re.compile(r"\[(.*)\]\((.*)\)")

    def transform(self, X, **kwargs):
        f = np.vectorize(CleanTextTransformer.transform_clean_text)
        X_clean = f(X)
        return X_clean

    def fit(self, X, y=None, **fit_params):
        return self

    @staticmethod
    def transform_clean_text(raw_text):
        # Reddit exports are often mis-encoded: try to recover UTF-8 first,
        # then fall back to cp1252.
        try:
            decoded = raw_text.encode("ISO-8859-1").decode("utf-8")
        except UnicodeDecodeError:
            decoded = raw_text.encode("ISO-8859-1").decode("cp1252")
        html_unescaped = unescape(decoded)
        html_unescaped = re.sub(r"\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r\r\n", " ", html_unescaped)
        html_unescaped = re.sub(r"\r", " ", html_unescaped)
        html_unescaped = html_unescaped.replace("&gt;", " > ")
        html_unescaped = html_unescaped.replace("&lt;", " < ")
        html_unescaped = html_unescaped.replace("--", " - ")
        # Normalise <U+XXXX> escapes to plain "UXXXX" tokens; the raw strings
        # keep the backreferences intact (a plain "\1" would be byte 0x01).
        html_unescaped = CleanTextTransformer.__uplus_pattern.sub(
            r" U\g<digit> ", html_unescaped)
        # Rewrite markdown links [text](url) as "text url".
        html_unescaped = CleanTextTransformer.__markup_link_pattern.sub(
            r" \1 \2 ", html_unescaped)
        html_unescaped = html_unescaped.replace("\\", "")
        return html_unescaped
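
For context, RedditClassifier expects tfidf_vectorizer.model and lr.model to exist at load time. A minimal sketch of how those artifacts might be produced with these transformers; the reddit_train.csv file and its body/label columns are illustrative stand-ins, not part of this PR:

import dill
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from ml_utils import CleanTextTransformer, SpacyTokenTransformer

df = pd.read_csv('reddit_train.csv')  # hypothetical training data

clean = CleanTextTransformer().transform(df['body'].values)
tokens = SpacyTokenTransformer().transform(clean)

# The transformer already returns token lists, so pass them through
# unchanged instead of letting the vectorizer re-tokenize.
tfidf = TfidfVectorizer(preprocessor=lambda x: x,
                        tokenizer=lambda x: x,
                        lowercase=False)
features = tfidf.fit_transform(tokens)

lr = LogisticRegression()
lr.fit(features, df['label'].values)

# Persist both artifacts under the file names RedditClassifier loads.
with open('tfidf_vectorizer.model', 'wb') as f:
    dill.dump(tfidf, f)
with open('lr.model', 'wb') as f:
    dill.dump(lr, f)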

53 changes: 53 additions & 0 deletions examples/models/sklearn_spacy_text/reddit_clf.json
@@ -0,0 +1,53 @@
{
  "apiVersion": "machinelearning.seldon.io/v1alpha2",
  "kind": "SeldonDeployment",
  "metadata": {
    "labels": {
      "app": "seldon"
    },
    "name": "reddit-classifier"
  },
  "spec": {
    "annotations": {
      "project_name": "Reddit classifier",
      "deployment_version": "v1"
    },
    "name": "reddit-classifier",
    "oauth_key": "oauth-key",
    "oauth_secret": "oauth-secret",
    "predictors": [
      {
        "componentSpecs": [{
          "spec": {
            "containers": [
              {
                "image": "reddit-classifier:0.1",
                "imagePullPolicy": "IfNotPresent",
                "name": "classifier",
                "resources": {
                  "requests": {
                    "memory": "1Mi"
                  }
                }
              }
            ],
            "terminationGracePeriodSeconds": 20
          }
        }],
        "graph": {
          "children": [],
          "name": "classifier",
          "endpoint": {
            "type": "REST"
          },
          "type": "MODEL"
        },
        "name": "single-model",
        "replicas": 1,
        "annotations": {
          "predictor_version": "v1"
        }
      }
    ]
  }
}
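
Once applied, the deployment answers prediction requests in Seldon's standard JSON format. A hedged client sketch using requests; the host, port, and URL path below assume an Ambassador-style ingress at localhost:8080, and will differ under the OAuth gateway that this manifest's oauth_key/oauth_secret suggest:

import requests

payload = {
    'data': {
        'names': ['text'],
        'ndarray': [['This is a test comment about machine learning']],
    }
}

# Assumed ingress address and path; adjust to match your gateway setup.
url = 'http://localhost:8080/seldon/reddit-classifier/api/v0.1/predictions'

response = requests.post(url, json=payload)
print(response.json())  # class probabilities under data.ndarray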
4 changes: 4 additions & 0 deletions examples/models/sklearn_spacy_text/requirements.txt
@@ -0,0 +1,4 @@
scipy>=0.13.3
scikit-learn>=0.18
spacy==2.0.18
dill==0.2.9