-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
145 lines (106 loc) · 3.79 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
"""consumercomplaint.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1UQBwaz-v9phmZn9NDfelKH4RPLRkHwVO
#IMPORTING LIBRARIES
"""
#1.Pandas for dataframe
import pandas as pd
#2. NumPy to perform calculations
import numpy as np
#3. Seaborn to Visualize data
#import seaborn as sns
#4. To split data
from sklearn.model_selection import train_test_split
#5. For Logistic Regression
from sklearn.linear_model import LogisticRegression
#6. For Plotting Graph
import matplotlib.pyplot as plt
#7. For Natural Language Processing
import nltk
from nltk.corpus import stopwords
#8. For Classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
#9. For Analysing Text
import regex as re
import string
#10. For Deployment
import streamlit as st
# **DATA PREPROCESSING**
# Load the complaints workbook and work on a copy so the frame that was read
# from disk stays untouched.
data_complaint = pd.read_excel('Complaints.xlsx')
data = data_complaint.copy()

# Preview the first five rows.
print(data.head())

# Keep only the columns used for prediction.
data = data[["Product", "Issue", "Consumer complaint narrative"]]

# Report missing values per column, drop every row that contains any, then
# report again to confirm none remain.
print(data.isnull().sum())
data = data.dropna(axis=0)
print(data.isnull().sum())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()

# Cast the three retained columns to the pandas 'string' dtype.
for column in ("Product", "Issue", "Consumer complaint narrative"):
    data[column] = data[column].astype('string')

# Show the dtypes again after the cast.
data.info()

# Number of complaints per issue.
print(data["Issue"].value_counts())

# NLP resources read by clean(): the NLTK English stop-word list (fetched with
# nltk.download) held as a set for fast membership tests, and an English
# Snowball stemmer.
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))
# Text-cleaning function
def clean(text, *, stop_words=None, stem=None):
    """Normalise one complaint narrative for bag-of-words vectorisation.

    Steps, in order: convert to ``str`` and lower-case; remove ``[...]``
    spans, URLs (``http(s)://...`` / ``www. ...``) and ``<...>`` tags; remove
    ASCII punctuation; remove every word that contains a digit; drop
    stop-words; stem the remaining words; join them with single spaces.

    Args:
        text: The narrative. Any object is accepted; it is passed through
            ``str()`` first.
        stop_words: Container of words to drop. Defaults to the module-level
            ``stopword`` set.
        stem: Callable mapping a word to its stem. Defaults to the
            module-level ``stemmer.stem``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if stop_words is None:
        stop_words = stopword
    if stem is None:
        stem = stemmer.stem

    text = str(text).lower()
    # Raw strings keep the backslashes for the regex engine and avoid
    # Python's invalid-escape-sequence warnings.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # split() with no argument breaks on any whitespace run, so a newline
    # separates words instead of gluing its neighbours together, and no empty
    # tokens are produced where something was removed.
    return " ".join(stem(word) for word in text.split() if word not in stop_words)
# Clean every narrative with clean() before it is vectorised.
data["Consumer complaint narrative"] = data["Consumer complaint narrative"].apply(clean)

# Inputs are the cleaned narratives; the target is the product category.
x = np.array(data["Consumer complaint narrative"])
y = np.array(data["Product"])

# Split the raw text first, then fit the vectorizer on the training part only.
# Fitting CountVectorizer on the full corpus would let test-set vocabulary
# shape the feature space and make the reported accuracy optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Bag-of-words counts: vocabulary learned from the training narratives, then
# applied unchanged to the held-out narratives.
cv = CountVectorizer()
X_train = cv.fit_transform(x_train)
X_test = cv.transform(x_test)
# **VISUALIZATION**
# The plotting calls below are commented out and kept only for reference.
# Count plot of complaints per product:
#SalStat=sns.countplot(data['Product'])
# Pie chart of the share of each product:
#data.Product.value_counts().plot(kind='pie',autopct='%1.0f%%',figsize=(12, 8))
#plt.title("Complaint Type")
#plt.axis("equal") # Equal aspect ratio ensures a circular pie chart
#plt.show()

# **APPLYING CLASSIFICATION ALGORITHM**
# SGDClassifier with its default settings, fitted on the training split
# (fit() returns the estimator itself).
sgdmodel = SGDClassifier().fit(X_train, y_train)

# **ACCURACY**
# Mean accuracy on the held-out split, reported as a percentage.
accuracy_percent = sgdmodel.score(X_test, y_test) * 100
print("Accuracy of SGD model:", accuracy_percent, "%")
# **PREDICTION OF COMPLAINT**
# Streamlit front end: read a complaint narrative and show the predicted
# product category.
st.header("Consumer Complaint Classification")
user = st.text_input("Enter Complaint Narrative: ")

# Predict only once something has been typed; otherwise the app would show a
# category for an empty narrative on first load.
if user.strip():
    # Run the input through clean(), the same normalisation applied to the
    # training narratives, so the vectorizer sees tokens in the form it was
    # fitted on. The sparse matrix is passed to predict() directly, and a
    # dedicated name is used so the `data` DataFrame is not overwritten.
    user_features = cv.transform([clean(user)])
    output = sgdmodel.predict(user_features)

    # Show the predicted complaint type.
    st.write("Type of Complaint:\n")
    st.write(output[0])