crawl_by_subreddit.py
# importing necessary modules
import sys
import praw
import pandas as pd
import datetime as dt
'''
Follows the instructions at http://www.storybench.org/how-to-scrape-reddit-with-python/
Reddit app created at https://ssl.reddit.com/prefs/apps/
    app name: xxx
    description: zzz
    personal use script: yyy
    secret key: www

Example run: python crawl_by_subreddit.py <subreddit_name>
'''
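# A minimal sketch of how the credentials above map onto PRAW (the quoted
# values are placeholders, not real keys); PRAW can also read them from a
# praw.ini file instead of keyword arguments:
#
#   reddit = praw.Reddit(client_id='yyy',      # "personal use script" string
#                        client_secret='www',  # "secret key"
#                        user_agent='xxx',     # app name / description
#                        username='<user_name>', password='<pwd>')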
##########################################################################################
# extract submission info into a dictionary
def get_posts_dict(submissions, cond):
    posts_dict = {"title": [], "score": [], "id": [], "url": [],
                  "comms_num": [], "created": [], "username": [], "body": [], "subreddit": []}
    try:
        for submission in submissions:
            # skip empty or removed submissions
            if submission.selftext and submission.selftext != '[removed]':
                posts_dict["title"].append(submission.title)
                posts_dict["score"].append(submission.score)
                posts_dict["id"].append(submission.id)
                posts_dict["url"].append(submission.url)
                posts_dict["username"].append(str(submission.author))
                posts_dict["subreddit"].append(str(submission.subreddit))
                posts_dict["comms_num"].append(submission.num_comments)
                posts_dict["created"].append(submission.created)
                posts_dict["body"].append(submission.selftext)
    except Exception:
        # log the submission that triggered the failure; fall back to a generic
        # note if its attributes are themselves unavailable
        try:
            with open(cond + '_deleted_posts.csv', 'a+') as fo:
                fo.write('\t'.join([str(submission.subreddit), str(submission.author), submission.id]) + '\n')
        except Exception:
            with open(cond + '_deleted_posts.csv', 'a+') as fo:
                fo.write(cond + ': posts not available\n')
    return posts_dict
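# Example usage of get_posts_dict (hypothetical subreddit name; assumes
# `reddit` is an authenticated praw.Reddit instance):
#   top_posts = reddit.subreddit('python').top(limit=10)
#   df = pd.DataFrame(get_posts_dict(top_posts, 'python'))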
##########################################################################################
# extract comment info into a dictionary
def get_comments_dict(comments, cond):
    comments_dict = {"id": [], "username": [], "subreddit": [], "created": [],
                     "body": [], "link_id": [], "parent_id": []}
    try:
        for comm in comments:
            # skip comments with empty bodies
            if comm.body:
                comments_dict["created"].append(comm.created)
                comments_dict["body"].append(comm.body)
                comments_dict["username"].append(str(comm.author))
                comments_dict["subreddit"].append(str(comm.subreddit))
                comments_dict["id"].append(comm.id)
                comments_dict["link_id"].append(comm.link_id)
                comments_dict["parent_id"].append(comm.parent_id)
    except Exception:
        # log the comment that triggered the failure; fall back to a generic
        # note if its attributes are themselves unavailable
        try:
            with open(cond + '_deleted_comments.csv', 'a+') as fo:
                fo.write('\t'.join(map(str, [comm.subreddit, comm.author, comm.id])) + '\n')
        except Exception:
            with open(cond + '_deleted_comments.csv', 'a+') as fo:
                fo.write(cond + ': comments not available\n')
    return comments_dict
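# Example usage of get_comments_dict (hypothetical subreddit name), mirroring
# how main() calls it below:
#   recent_comments = reddit.subreddit('python').comments(limit=25)
#   df = pd.DataFrame(get_comments_dict(recent_comments, 'python'))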
##########################################################################################
# write a user's submissions and comments to the "control" CSV files
# m: file mode for to_csv ('w' to write, 'a' to append); hbool: whether to write the CSV header
def write_u_subs(reddit, uname, m, hbool, cond):
    u_reddit = reddit.redditor(uname)
    print('getting submissions by user: ' + uname)
    u_submissions = u_reddit.submissions.top(limit=None)
    u_submissions_dict = get_posts_dict(u_submissions, cond)
    u_submissions_data = pd.DataFrame(u_submissions_dict)
    u_submissions_data.to_csv('control_posts_' + cond + '.csv', index=False, mode=m, header=hbool)
    print('getting comments by user: ' + uname)
    u_comments = u_reddit.comments.top(limit=None)
    u_comments_dict = get_comments_dict(u_comments, cond)
    u_comments_data = pd.DataFrame(u_comments_dict)
    u_comments_data.to_csv('control_comments_' + cond + '.csv', index=False, mode=m, header=hbool)
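# Example usage (hypothetical usernames): write the first user's history with
# a header, then append subsequent users without repeating it:
#   write_u_subs(reddit, 'first_user', 'w', True, cond)
#   write_u_subs(reddit, 'next_user', 'a', False, cond)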
##########################################################################################
'''
OBJECTIVE:
    get all posts and comments of users posting in the target subreddit
INPUT:
    subreddit name
OUTPUT:
    files containing all submissions and comments by users posting in the given subreddit
'''
##########################################################################################
def main(cond):
    # create the authenticated Reddit client (replace the placeholders with
    # the app credentials described in the header docstring)
    reddit = praw.Reddit(client_id='<client_id>',
                         client_secret='<secret_key>',
                         user_agent='<app_name>',
                         username='<user_name>',
                         password='<pwd>')
    subreddit = reddit.subreddit(cond)
    print('\ngetting submissions for subreddit: ' + cond)
    # extract all top posts for the given subreddit, convert to a dataframe and write to a CSV file
    s_submissions = subreddit.top(limit=None)
    s_submissions_dict = get_posts_dict(s_submissions, cond)
    s_submissions_data = pd.DataFrame(s_submissions_dict)
    s_submissions_data.to_csv('posts_' + cond + '.csv', index=False)
    print('*' * 50 + '\n')
    # number of unique users who submitted posts
    users_submissions = list(set(s_submissions_data.username))
    print(len(users_submissions))
    print('\ngetting comments for subreddit: ' + cond)
    # extract all recent comments for the given subreddit, convert to a dataframe and write to a CSV file
    s_comments = subreddit.comments(limit=None)
    s_comments_dict = get_comments_dict(s_comments, cond)
    s_comments_data = pd.DataFrame(s_comments_dict)
    s_comments_data.to_csv('comments_' + cond + '.csv', index=False)
    print('*' * 50 + '\n')
    # number of unique users who commented
    users_comments = list(set(s_comments_data.username))
    print(len(users_comments))
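    # The OBJECTIVE above implies a per-user crawl via write_u_subs, which
    # main() never calls. A sketch of that missing step (an assumption, not
    # part of the original flow; the first call writes the CSV header, later
    # calls append without it):
    #   all_users = set(users_submissions) | set(users_comments)
    #   for i, uname in enumerate(sorted(all_users)):
    #       write_u_subs(reddit, uname, 'w' if i == 0 else 'a', i == 0, cond)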
    print('Done.')
##########################################################################################
# entry point: the subreddit name is passed as the first command-line argument
if __name__ == '__main__':
    main(sys.argv[1])
##########################################################################################