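"""Build a local SQLite database (amazon.db) from the gzipped Amazon review
dumps, streaming records one at a time so the full dataset never has to fit
in memory."""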
import ast
import datetime as dt
import gzip
import os

import pandas as pd
from sqlalchemy import create_engine
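# Columns kept in the SQLite tables. review_length, summary_length, upvotes
# and helpfulness are not in the raw dump; they are derived in get_df().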
REVIEW_COLS = ['reviewerID', 'asin', 'overall', 'review_length', 'summary_length', 'upvotes', 'helpfulness']
METADATA_COLS = ['asin', 'price']
script_dir = os.getcwd()
script_type = "MAC"
if script_type == "MAC":
    DATA_FILE = 'reviews_Amazon_Instant_Video.json.gz'
    METADATA_FILE = 'meta_Amazon_Instant_Video.json.gz'
    # os.chdir('/Volumes/Expansion/Amazon_Review_Data')
elif script_type == 'WIN':
    DATA_FILE = 'reviews_Amazon_Instant_Video.json.gz'
    METADATA_FILE = 'meta_Amazon_Instant_Video.json.gz'
    os.chdir(script_dir)
else:
    DATA_FILE = 'aggressive_dedup.json.gz'
    METADATA_FILE = 'metadata.json.gz'
    os.chdir(script_dir)
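# Everything is appended into one on-disk SQLite database so later analysis
# can run SQL queries instead of re-reading the raw dumps.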
start = dt.datetime.now()
disk_engine = create_engine('sqlite:///amazon.db')
def parse(path):
    """Yield one record (a dict) per line of a gzipped dump."""
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for line in g:
            # The raw dumps are written as Python dict literals rather than
            # strict JSON, so ast.literal_eval is used here; unlike the bare
            # eval() it replaces, it cannot execute arbitrary code.
            yield ast.literal_eval(line)
def get_df(path, metadata=False):
    """Stream records into SQLite one row at a time to keep memory use flat."""
    i = 0
    for d in parse(path):
        if not metadata:
            # Derive length features and the helpfulness ratio before filtering.
            d['review_length'] = len(d['reviewText'])
            d['summary_length'] = len(d['summary'])
            if 'helpful' in d:
                d['upvotes'] = d['helpful'][0]
                try:
                    d['helpfulness'] = round(float(d['helpful'][0]) / float(d['helpful'][1]), 2)
                except ZeroDivisionError:
                    d['helpfulness'] = 0.0
            # Extract only the columns we are interested in to save memory;
            # iterate over a copy of the keys so deletion is safe in Python 3.
            for key in list(d.keys()):
                if key not in REVIEW_COLS:
                    del d[key]
        else:
            if 'price' not in d:
                continue
            for key in list(d.keys()):
                if key not in METADATA_COLS:
                    del d[key]
        try:
            df = pd.DataFrame.from_dict({i: d}, orient='index')
            # reindex() aligns columns by name and fills any missing ones with
            # NaN, instead of blindly relabelling whatever columns happen to
            # exist in this record.
            if not metadata:
                df = df.reindex(columns=REVIEW_COLS)
                df.to_sql('reviews', disk_engine, if_exists='append')
            else:
                df = df.reindex(columns=METADATA_COLS)
                df.to_sql('meta', disk_engine, if_exists='append')
        except Exception as e:
            print(e)
        # Report progress periodically rather than on every record.
        if i % 10000 == 0:
            print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, i))
        i += 1
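# parse_data() below is an alternative loader built on pandas' chunked JSON
# reader: fewer, larger database writes at the cost of holding one
# 20000-row chunk in memory at a time.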
def parse_data(path, metadata=False):
    """Alternative loader: read the line-delimited JSON dump in chunks."""
    chunksize = 20000
    j = 0
    index_start = 1
    # lines=True is required for chunked reads of line-delimited JSON;
    # pandas infers the gzip compression from the .gz suffix.
    for df in pd.read_json(path, lines=True, chunksize=chunksize, encoding='utf-8'):
        df = df.rename(columns={c: c.replace(' ', '') for c in df.columns})  # Remove spaces from column names
        df.index += index_start
        # Drop the columns we are not interested in.
        if not metadata:
            columns = ['reviewerID', 'asin', 'helpful', 'overall', 'review_length', 'summary_length']
            dbname = 'reviews'
        else:
            columns = ['asin', 'price', 'brand']
            dbname = 'meta'
        for c in df.columns:
            if c not in columns:
                df = df.drop(c, axis=1)
        j += 1
        print('{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j * chunksize))
        df.to_sql(dbname, disk_engine, if_exists='append')
        index_start = df.index[-1] + 1
if not os.path.exists(os.path.join(os.getcwd(), 'amazon.db')):
    print("Creating Dataframes...")
    get_df(DATA_FILE)
    get_df(METADATA_FILE, metadata=True)
    # parse_data(METADATA_FILE, metadata=True)
    print("Dataframes created!")
# print(sum(1 for i in parse(DATA_FILE)))
# df = pd.read_sql_query('SELECT count(*) FROM meta', disk_engine)
# print(df.head())
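# Example follow-up query (a sketch, assuming the tables above were built):
# mean rating per product joined with its price from the metadata table.
# query = '''
#     SELECT r.asin, AVG(r.overall) AS avg_rating, m.price
#     FROM reviews r
#     JOIN meta m ON r.asin = m.asin
#     GROUP BY r.asin, m.price
# '''
# print(pd.read_sql_query(query, disk_engine).head())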