config.py
# -*- coding: utf-8 -*-
import nltk
import regex as re
# Data loading
DATA_FILE = "data/train.csv"
TEST_SIZE = 0.2
PROCESSED_DATA_FILE = "data/train_clean.json"
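# Sketch of assumed usage (not part of the original config): DATA_FILE and
# TEST_SIZE would typically feed a pandas load and train/validation split.
# The names train_df/val_df below are illustrative only.
#     import pandas as pd
#     from sklearn.model_selection import train_test_split
#     df = pd.read_csv(DATA_FILE)
#     train_df, val_df = train_test_split(df, test_size=TEST_SIZE, random_state=42)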
# Data Labels
CONTENT = "comment_text"
UNIQUE_ID = "id"
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Plotting
PRECISION = 1e4
# TfidfVectorizer
STOP_WORDS = set(nltk.corpus.stopwords.words("english")) # Requires the NLTK "stopwords" corpus
STRIP_ACCENTS = "unicode"
MAX_FEATURES = 10000
MIN_DF = 1
TFIDF_FILE = "data/word_counts.json"
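# Sketch of assumed usage: these settings map directly onto scikit-learn's
# TfidfVectorizer (the actual pipeline code is not shown in this file;
# train_df is the illustrative frame from the sketch above).
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     vectorizer = TfidfVectorizer(stop_words=list(STOP_WORDS),
#                                  strip_accents=STRIP_ACCENTS,
#                                  max_features=MAX_FEATURES,
#                                  min_df=MIN_DF)
#     X = vectorizer.fit_transform(train_df[CONTENT])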
# Logistic regression
LOG_REGRESSION_SOLVER = "lbfgs" # Optimizer
MAX_ITER = 100 # Maximum number of solver iterations
C = 4 # Inverse of regularization strength
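# Sketch of assumed usage: one binary logistic regression per label on the
# TF-IDF matrix X from the sketch above (illustrative only).
#     from sklearn.linear_model import LogisticRegression
#     classifiers = {}
#     for label in LABELS:
#         clf = LogisticRegression(solver=LOG_REGRESSION_SOLVER, max_iter=MAX_ITER, C=C)
#         clf.fit(X, train_df[label])
#         classifiers[label] = clf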
# LDA model
NUM_TOPICS = 15
LDA_FILE = "data/topic_probabaility.json"
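# Sketch of assumed usage: a topic model with NUM_TOPICS components; the
# per-document topic probabilities would be what ends up in LDA_FILE. The
# scikit-learn backend shown here is an assumption.
#     from sklearn.decomposition import LatentDirichletAllocation
#     lda = LatentDirichletAllocation(n_components=NUM_TOPICS, random_state=42)
#     topic_probs = lda.fit_transform(X)  # shape: (n_documents, NUM_TOPICS)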
# LSTM
DROPOUT_RATE = 0.5
EPOCHS = 50
BATCH_SIZE = 512
LSTM_HIDDEN_STATE = 128
DENSE_UNITS = 50
PREDICTION_THRESHOLD = 0.25
PATIENCE = 3 # Number of epochs for early stopping
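# Sketch of an assumed Keras model wired from these constants. The layer
# layout is illustrative, not necessarily the repo's exact architecture;
# MAX_WORDS and LABELS come from this config, and X_seq/y_train/X_seq_test
# are illustrative arrays (X_seq is built in the Word Embeddings sketch below).
#     from tensorflow.keras.models import Sequential
#     from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
#     from tensorflow.keras.callbacks import EarlyStopping
#     model = Sequential([
#         Embedding(MAX_WORDS, 50),  # 50 = GloVe vector dimension
#         LSTM(LSTM_HIDDEN_STATE),
#         Dropout(DROPOUT_RATE),
#         Dense(DENSE_UNITS, activation="relu"),
#         Dense(len(LABELS), activation="sigmoid"),
#     ])
#     model.compile(loss="binary_crossentropy", optimizer="adam")
#     model.fit(X_seq, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
#               callbacks=[EarlyStopping(patience=PATIENCE)])
#     predictions = model.predict(X_seq_test) > PREDICTION_THRESHOLD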
# Word Embeddings
GLOVE_FILE = "glove/glove.6B.50d.txt"
MAX_WORDS = 20000
MAX_SEQUENCE_LEN = 1000
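# Sketch of assumed usage: load GloVe vectors and pad tokenized comments to
# MAX_SEQUENCE_LEN. Helper names are illustrative, not from the repo.
#     import numpy as np
#     from tensorflow.keras.preprocessing.text import Tokenizer
#     from tensorflow.keras.preprocessing.sequence import pad_sequences
#     embeddings = {}
#     with open(GLOVE_FILE, encoding="utf-8") as f:
#         for line in f:
#             word, *vec = line.split()
#             embeddings[word] = np.asarray(vec, dtype="float32")
#     keras_tok = Tokenizer(num_words=MAX_WORDS)
#     keras_tok.fit_on_texts(train_df[CONTENT])
#     X_seq = pad_sequences(keras_tok.texts_to_sequences(train_df[CONTENT]),
#                           maxlen=MAX_SEQUENCE_LEN)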
# Text preprocessing
LEMMATIZER = nltk.stem.wordnet.WordNetLemmatizer() # Requires the NLTK "wordnet" corpus
PORTER = nltk.stem.PorterStemmer()
TOKENIZER = nltk.tokenize.RegexpTokenizer(r"\w+[']\w*|\w+") # Keeps words and simple contractions
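# Quick illustration of the tools above (assumed usage):
#     TOKENIZER.tokenize("He's got 2 cats")  # -> ["He's", "got", "2", "cats"]
#     LEMMATIZER.lemmatize("cats")           # -> "cat"
#     PORTER.stem("running")                 # -> "run"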
# Replacement tokens generated during text cleaning.
# Do not use special regex characters in these token strings.
re.DEFAULT_VERSION = re.VERSION1 # Version of regex used for patterns
PATTERNS = [None] * 6
PATTERNS[0] = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}") # IPv4 addresses
PATTERNS[1] = re.compile(r"http://.*com") # Simple URLs
PATTERNS[2] = re.compile(r"(?:19|20)\d{2}s?") # Years, e.g. "1995" or "2000s"
PATTERNS[3] = re.compile(r"\d+(?:st|nd|rd|th)") # Ordinals, e.g. "2nd"
PATTERNS[4] = re.compile(r"\d{1,3}(?:[,]\d{3})*(?:[.]\d+)*") # Numbers with separators
PATTERNS[5] = re.compile(r"\d+") # Any remaining digits
# Replacement token for each pattern above, matched by index
TOKENS = ["IP", "URL", "YEAR", "ORDER", "DIGIT", "DIGIT"]
SPAM_TOKEN = "SPAM"
SPAM_CHAR_LIMIT = 50 # Longest English word is 45 characters
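# Sketch of the assumed cleaning step built from PATTERNS, TOKENS and
# SPAM_TOKEN (illustrative helper, not the repo's actual function):
#     def substitute_tokens(text):
#         for pattern, token in zip(PATTERNS, TOKENS):
#             text = pattern.sub(token, text)
#         words = [w if len(w) <= SPAM_CHAR_LIMIT else SPAM_TOKEN for w in text.split()]
#         return " ".join(words)
#     substitute_tokens("See 192.168.0.1 in 1999")  # -> "See IP in YEAR"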
# Apostrophe / contraction expansion dict
# Credits: https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
APPO = {
"aren't": "are not",
"can't": "cannot",
"couldn't": "could not",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'll": "he will",
"he's": "he is",
"i'd": "I would",
"i'd": "I had",
"i'll": "I will",
"i'm": "I am",
"isn't": "is not",
"it's": "it is",
"it'll": "it will",
"i've": "I have",
"let's": "let us",
"mightn't": "might not",
"mustn't": "must not",
"shan't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"shouldn't": "should not",
"that's": "that is",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"we'd": "we would",
"we're": "we are",
"weren't": "were not",
"we've": "we have",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where's": "where is",
"who'd": "who would",
"who'll": "who will",
"who're": "who are",
"who's": "who is",
"who've": "who have",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"you've": "you have",
"'re": " are",
"wasn't": "was not",
"we'll": " will",
"didn't": "did not",
"tryin'": "trying",
}
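# Sketch of assumed usage: expand contractions with APPO after tokenizing
# (illustrative helper, not the repo's actual function):
#     def expand_contractions(words):
#         return [APPO.get(w.lower(), w) for w in words]
#     expand_contractions(["He's", "tryin'", "hard"])  # -> ["he is", "trying", "hard"]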