WSBpmaw.py
# -*- coding: utf-8 -*-
# Required package:
# pip install pmaw
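#
# Downloads r/wallstreetbets submissions (or comments) from Pushshift via PMAW,
# stepping backwards in time in fixed-size windows, keeps only high-score items,
# and writes both the extracted text and the full result table to CSV.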
import datetime as dt
import pandas as pd
import numpy as np
from tqdm import tqdm
from pmaw import PushshiftAPI
# Create time filter
before = int(dt.datetime(2021,1,12,0,0).timestamp()) # timestamp in seconds
# Create subreddit and parameters
subreddit="wallstreetbets"
size = 500 # maximum submission size = 500
api = PushshiftAPI()
limit = 1000000
test_list_data = []
total_results_df = []
#------------------------------------PARAMETERS EDIT------------------------------------#
min_score = '>40' # '>' here behaves as >=; submission score threshold 40, comment score 10
time_range = 365*2
download_per_loop = 10 # days
IS_SUBMISSION = True
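# Set IS_SUBMISSION = False to download comments instead of submissions; the score
# threshold above is then meant to be lowered (10 for comments vs. 40 for submissions).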
# Download in smaller steps to avoid incomplete results (too many results could be lost when Pushshift shards are down)
for i in tqdm(range(int(time_range/download_per_loop)+2)):
    after = before - 60*60*24*download_per_loop  # step back download_per_loop days
    if IS_SUBMISSION:
        file_name = 'submissions'
        results = api.search_submissions(score=min_score, sort='desc',
                                         sort_type='score',
                                         subreddit=subreddit,
                                         size=size,
                                         before=before,
                                         after=after,
                                         limit=limit)
        results_df = pd.DataFrame(results)
        if not results_df.empty:
            # Drop posts removed by moderation; posts that were not removed have
            # removed_by_category set to NaN
            if 'removed_by_category' in results_df.columns:
                results_df = results_df[results_df.removed_by_category.isnull()]
            text_list_title = results_df.title.tolist()
            text_list_content = results_df.selftext.replace('', np.nan)  # replace empty text with NaN
            text_list_content = text_list_content.dropna().tolist()  # drop empty rows (many submissions have no text body)
            test_list_data = test_list_data + text_list_title + text_list_content
    else:
        file_name = 'comments'
        results = api.search_comments(score=min_score, sort='desc',
                                      sort_type='score',
                                      subreddit=subreddit,
                                      size=size,
                                      before=before,
                                      after=after,
                                      limit=limit)
        results_df = pd.DataFrame(results)
        # Extract text to a list if the result is not empty
        if not results_df.empty:
            text_list = results_df.body.tolist()
            test_list_data = test_list_data + text_list
    # Accumulate the full result table across windows
    if not results_df.empty:
        if len(total_results_df) == 0:
            total_results_df = results_df.copy(deep=True)
        else:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
            total_results_df = pd.concat([total_results_df, results_df], ignore_index=True)
    text_date_before = dt.datetime.utcfromtimestamp(before).strftime("%Y-%m-%d")
    text_date_after = dt.datetime.utcfromtimestamp(after).strftime("%Y-%m-%d")
    file_number = str(i).zfill(3)
    before = after  # slide the window backwards for the next iteration
    print("\n---Number: {} before: {}, after: {}, results length: {}".format(file_number,
                                                                             text_date_before,
                                                                             text_date_after,
                                                                             len(test_list_data)))
    # results_df.to_csv(file_name+'-'+text_date_after+'.csv', header=True, index=False,
    #                   columns=list(results_df.axes[1]), encoding='utf-8-sig')

# Save the extracted text and the full result table after the download loop finishes
df = pd.DataFrame(test_list_data, columns=['text'])
df.to_csv(file_name+'.csv', encoding='utf-8-sig')
total_results_df.to_csv(file_name+'-total.csv', header=True, index=False,
                        columns=list(total_results_df.axes[1]),
                        encoding='utf-8-sig')
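
# Usage sketch (run as a plain script after installing the dependencies):
#   pip install pmaw pandas numpy tqdm
#   python WSBpmaw.py
# Output lands in the working directory as submissions.csv / comments.csv (text only)
# and submissions-total.csv / comments-total.csv (full result table).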