# data_loading.py
import pandas as pd
import regex as re
import os.path
from pandas import DataFrame
from config import (
    APPO,
    TOKENS,
    PATTERNS,
    SPAM_TOKEN,
    SPAM_CHAR_LIMIT,
    CONTENT,
    STOP_WORDS,
    TEST_SIZE,
    PROCESSED_DATA_FILE,
    LEMMATIZER,
    PORTER,
    TOKENIZER,
)
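
# The constants imported above come from config.py, which is not shown here.
# A minimal sketch of what they might contain (an assumption inferred from how
# the names are used below, e.g. NLTK-style lemmatizer/stemmer/tokenizer; the
# real config may differ), kept as a comment so this module's behaviour is unchanged:
#
#   import regex as re
#   from nltk.stem import WordNetLemmatizer, PorterStemmer
#   from nltk.tokenize import TweetTokenizer
#
#   CONTENT = "comment_text"                     # column holding the raw comments
#   PROCESSED_DATA_FILE = "processed_data.json"  # cache for the cleaned dataframe
#   SPAM_TOKEN, SPAM_CHAR_LIMIT = "_spam_", 30   # flag implausibly long "words"
#   PATTERNS = [re.compile(r"\d+")]              # one compiled regex per token...
#   TOKENS = ["_digit_"]                         # ...and its replacement token
#   APPO = {"aren't": "are not"}                 # apostrophe/contraction expansions
#   LEMMATIZER = WordNetLemmatizer()
#   PORTER = PorterStemmer()
#   TOKENIZER = TweetTokenizer()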


def loadData(file: str, preprocess: bool = True, save_to_file: bool = True) -> pd.DataFrame:
    """Load the train.csv dataset from
    https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge

    Args:
        file: data file stored locally, full or relative path
        preprocess (optional): clean and tokenize the text data. Only applies
            when loading the original file. True for preprocessing, False otherwise.
        save_to_file (optional): save the preprocessed data. Only applies when
            loading the original file. True for saving, False otherwise.

    Returns:
        data: dataframe with comments and labels
    """
    if os.path.exists(PROCESSED_DATA_FILE):
        data = loadProcessedData(PROCESSED_DATA_FILE)
    else:
        data = pd.read_csv(file)
        if preprocess:
            data[CONTENT] = data[CONTENT].apply(cleanAndTokenize)
            if save_to_file:
                saveProcessedData(data, PROCESSED_DATA_FILE)
    return data
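
# Illustrative usage (an assumption, not part of the original module: it presumes
# train.csv has been downloaded from the Kaggle competition into the working
# directory and that config.py resolves as sketched above):
#
#   data = loadData("train.csv", preprocess=True, save_to_file=True)
#   print(data[CONTENT].head())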


def cleanAndTokenize(comment: str) -> str:
    """Preprocess text for NLP applications. Preprocessing includes:

    1. Cleaning:
        - Lowercase
        - Remove apostrophes
        - Detect IPs
        - Detect links
        - Detect years
        - Detect digits
        - Detect ordinals (1st, 22nd, etc.)
        - Detect spam
    2. Lemmatizing
    3. Stemming

    Args:
        comment: text to be processed

    Returns:
        words: cleaned comment, with the resulting tokens joined by spaces
    """
    comment = comment.lower()
    for pattern, token in zip(PATTERNS, TOKENS):
        comment = pattern.sub(token, comment)

    tokens = set(TOKENS)
    words = TOKENIZER.tokenize(comment)
    idx = 0
    while idx < len(words):
        if words[idx] in APPO:
            words[idx] = APPO[words[idx]]
        # elif words[idx] in STOP_WORDS:
        #     del words[idx]
        #     continue
        elif words[idx] not in tokens:
            if len(words[idx]) > SPAM_CHAR_LIMIT:
                words[idx] = SPAM_TOKEN
            else:
                words[idx] = re.sub(r"'", "", words[idx])
                words[idx] = LEMMATIZER.lemmatize(words[idx], "v")
                words[idx] = PORTER.stem(words[idx])
        idx += 1
    return " ".join(words)


def saveProcessedData(df: DataFrame, file: str) -> None:
    """Save dataframe to json file

    Args:
        df: dataframe to be saved
        file: file name to store df, full or relative path

    Returns:
        None
    """
    df.to_json(file)


def loadProcessedData(file: str) -> DataFrame:
    """Load dataframe from json file

    Args:
        file: file name where data is stored, full or relative path

    Returns:
        df: dataframe stored in file
    """
    return pd.read_json(file)
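

if __name__ == "__main__":
    # Minimal smoke test, added here as an illustrative sketch rather than part
    # of the original pipeline: round-trip a tiny dataframe through the JSON
    # helpers. It assumes config.py imports cleanly and the working directory
    # is writable.
    _demo = pd.DataFrame({CONTENT: ["an example comment"]})
    saveProcessedData(_demo, "demo_processed.json")
    print(loadProcessedData("demo_processed.json"))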