# features_old.py
# forked from amueller/kaggle_insults
import numpy as np
from scipy import sparse
import re
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer


class TextFeatureTransformer(BaseEstimator):
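    """Turn raw comment strings into a sparse feature matrix.

    Combines TF-IDF word and character n-grams (both optional) with a small
    set of hand-crafted features such as comment length, punctuation counts,
    bad-word counts and the all-caps ratio.
    """
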
def __init__(self, word_range=(1, 1), char_range=(1, 1), char=False,
word=True, designed=True, tokenizer_func=None):
self.word_range = word_range
self.char_range = char_range
self.char = char
self.designed = designed
self.word = word
self.tokenizer_func = tokenizer_func

    def get_feature_names(self):
        feature_names = []
        if self.word:
            # vocabularies of the fitted vectorizers (scikit-learn >= 1.0 API)
            feature_names.append(self.countvect.get_feature_names_out())
        if self.char:
            feature_names.append(self.countvect_char.get_feature_names_out())
        if self.designed:
            # same order as the `designed` array built in transform()
            feature_names.append(['n_words', 'n_chars', 'allcaps', 'max_len',
                'mean_len', '!', '?', '@', 'dots', 'spaces', 'bad_ratio',
                'n_bad', 'capsratio'])
        feature_names = np.hstack(feature_names)
        return feature_names

    def fit(self, comments, y=None):
        # load the bad word list (originally the google bad word list)
        #with open("google_badlist.txt") as f:
        with open("my_badlist.txt") as f:
            badwords = [l.strip() for l in f.readlines()]
        self.badwords_ = badwords
        print("vectorizing")
        if self.word:
            if self.tokenizer_func is not None:
                def build_tokenizer(func):
                    regexp = re.compile(r"\b\w\w+\b")
tokenizer = lambda doc: [func(word) for word in
regexp.findall(doc)]
return tokenizer
tokenizer = build_tokenizer(self.tokenizer_func)
else:
tokenizer = None
countvect = TfidfVectorizer(ngram_range=self.word_range,
binary=False, tokenizer=tokenizer, min_df=2)
countvect.fit(comments)
self.countvect = countvect
if self.char:
countvect_char = TfidfVectorizer(ngram_range=self.char_range,
analyzer="char", binary=False)
countvect_char.fit(comments)
self.countvect_char = countvect_char
return self

    def transform(self, comments):
## some handcrafted features!
n_words = [len(c.split()) for c in comments]
n_chars = [len(c) for c in comments]
# number of uppercase words
allcaps = [np.sum([w.isupper() for w in comment.split()])
for comment in comments]
# longest word
max_word_len = [np.max([len(w) for w in c.split()]) for c in comments]
# average word length
mean_word_len = [np.mean([len(w) for w in c.split()])
for c in comments]
# number of google badwords:
n_bad = [np.sum([c.lower().count(w) for w in self.badwords_])
for c in comments]
exclamation = [c.count("!") for c in comments]
addressing = [c.count("@") for c in comments]
question = [c.count("?") for c in comments]
spaces = [c.count(" ") for c in comments]
dots = [c.count("...") for c in comments]
        allcaps_ratio = np.array(allcaps) / np.array(n_words, dtype=float)
        bad_ratio = np.array(n_bad) / np.array(n_words, dtype=float)
designed = np.array([n_words, n_chars, allcaps, max_word_len,
mean_word_len, exclamation, question, addressing, dots, spaces,
bad_ratio, n_bad, allcaps_ratio]).T
features = []
if self.word:
counts = self.countvect.transform(comments).tocsr()
features.append(counts)
if self.char:
counts_char = self.countvect_char.transform(comments).tocsr()
features.append(counts_char)
if self.designed:
features.append(sparse.csr_matrix(designed))
features = sparse.hstack(features)
return features.tocsr()
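

# A minimal usage sketch (illustrative only, not part of the original script).
# It assumes "my_badlist.txt" is present next to this file and uses two toy
# comments with made-up labels in place of the real Kaggle insults data; the
# LogisticRegression classifier is just an example choice.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression

    comments = [
        "you are a complete idiot, @user!!!",
        "you are doing great work, thanks...",
    ]
    y = [1, 0]

    transformer = TextFeatureTransformer(word_range=(1, 2))
    X = transformer.fit(comments).transform(comments)

    clf = LogisticRegression().fit(X, y)
    print(transformer.get_feature_names())
    print(clf.predict(X))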