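"""process_files.py

Pre-process gzipped Amazon review and metadata dumps into pandas DataFrames.

Each file found under DATA_PATH (and, optionally, METADATA_PATH) is parsed line
by line, reduced to the columns listed in REVIEW_COLS / METADATA_COLS, and
written to the processed_data/ directory as a pickled DataFrame.
"""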
import os
import pandas as pd
import gzip
import sys
script_dir = os.getcwd()
script_type = "SERVER"  # one of "MAC", "WIN", "SERVER"

if script_type == "MAC":
    DATA_FILE = 'reviews_Amazon_Instant_Video.json.gz'
    METADATA_FILE = 'meta_Amazon_Instant_Video.json.gz'
    os.chdir(script_dir)
elif script_type == "WIN":
    DATA_FILE = 'reviews_Amazon_Instant_Video.json.gz'
    METADATA_FILE = 'meta_Amazon_Instant_Video.json.gz'
    os.chdir(script_dir)
else:
    # SERVER: process pre-split chunks; only this branch defines the
    # DATA_PATH / METADATA_PATH directories used by the loop at the bottom.
    DATA_PATH = os.path.join(script_dir, 'userSplit')
    METADATA_PATH = os.path.join(script_dir, 'metasplit')
    os.chdir(script_dir)
REVIEW_COLS = ['reviewerID', 'asin', 'helpful', 'overall', 'review_length', 'summary_length']
METADATA_COLS = ['asin', 'price', 'brand']
# start = dt.datetime.now()
if not os.path.exists(os.path.join(os.getcwd(), 'processed_data')):
    os.mkdir(os.path.join(os.getcwd(), 'processed_data'))
pickle_path = os.path.join(os.getcwd(), 'processed_data')


def parse(path):
    # Each line of the gzipped dump is a Python dict literal, so eval() is used
    # to decode it; lines that fail to parse are skipped.
    with gzip.open(path, 'rb') as g:
        for line in g:
            try:
                yield eval(line)
            except Exception:
                pass


def get_df(path, metadata=False):
    print "Processing file: " + str(path)
    file_name_appender = path[-4:]  # last four characters of the path tag the output file name
    i = 0
    df = {}
    for d in parse(path):
        try:
            if not metadata:  # Extract only the columns we are interested in to save memory
                d['review_length'] = len(d['reviewText'])
                d['summary_length'] = len(d['summary'])
                for key in list(d.keys()):
                    if key not in REVIEW_COLS:
                        del d[key]
                if 'helpful' in d:
                    # 'helpful' is [upvotes, total votes]; derive a 0-1 helpfulness ratio
                    d['upvotes'] = d['helpful'][0]
                    try:
                        d['helpfulness'] = round(float(d['helpful'][0]) / float(d['helpful'][1]), 2)
                    except ZeroDivisionError:
                        d['helpfulness'] = 0.0
            else:
                for key in list(d.keys()):
                    if key not in METADATA_COLS:
                        del d[key]
            df[i] = d
            i += 1  # advance the index so earlier records are not overwritten
        except Exception:
            pass
    print "Storing Dataframe..."
    data_frame = pd.DataFrame.from_dict(df, orient='index')
    if not metadata:
        file_name = "user_data_" + str(file_name_appender) + ".p"
    else:
        file_name = "metadata_" + str(file_name_appender) + ".p"
    data_frame.to_pickle(os.path.join(pickle_path, file_name))
    return  # pd.DataFrame.from_dict(df, orient='index')

# print "Creating Dataframes..."
# get_df(DATA_FILE)
# sys.exit()
# print "Dataframes created!"
# print "Processing MetaData"
# for metadata_file in os.listdir(METADATA_PATH):
#     dir_join = os.path.join(os.getcwd(), METADATA_PATH)
#     get_df(os.path.join(dir_join, metadata_file))
print "Processing Review Data"
for reviewdata_file in os.listdir(DATA_PATH):
    dir_join = os.path.join(os.getcwd(), DATA_PATH)
    get_df(os.path.join(dir_join, reviewdata_file))
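
# Illustrative only (not part of the original pipeline): the pickles written above
# can be loaded back into DataFrames for inspection with pandas, e.g.
#
# sample = pd.read_pickle(os.path.join(pickle_path, os.listdir(pickle_path)[0]))
# print sample.head()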