postDownloader.py
import requests
from datetime import datetime
import traceback
import time
import json
import sys
import csv

# IMPORTANT READ THIS FIRST
# The pushshift service that this script uses is only available to moderators. Go through this guide to get a token and update the token field below
# https://api.pushshift.io/guide
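# the token is sent with every request as an "Authorization: Bearer ..." header
# (see the requests.get call in download_from_url below)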
token = ""
username = "" # put the username you want to download in the quotes
subreddit = "" # put the subreddit you want to download in the quotes
thread_id = "" # put the id of the thread you want to download in the quotes, it's the first 5 to 7 character string of letters and numbers from the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit
# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
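# for example, a single comment in the "human" format looks roughly like:
#   1 : 2021-10-05 : u/example_user : https://www.reddit.com/r/example/comments/...
#   the comment body text
#   -------------------------------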
output_format = "human"
# default start time is the current time and default end time is all history
# you can change out the below lines to set a custom start and end date. The script works backwards, so the end date has to be before the start date
start_time = datetime.utcnow() #datetime.strptime("10/05/2021", "%m/%d/%Y")
end_time = None #datetime.strptime("09/25/2021", "%m/%d/%Y")
convert_to_ascii = False # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True # don't touch this unless you know what you're doing
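
# for reference, thread ids are base 36: int("107xayi", 36) == 2190093498, which is
# the numeric link_id value used below when convert_thread_id_to_base_ten is True


# writes one comment/submission in the readable "human" format described above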
def write_human_line(handle, obj, is_submission, convert_to_ascii):
    handle.write(str(obj['score']))
    handle.write(" : ")
    handle.write(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        handle.write(" : ")
        if convert_to_ascii:
            handle.write(obj['title'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['title'])
    handle.write(" : u/")
    handle.write(obj['author'])
    handle.write(" : ")
    handle.write(f"https://www.reddit.com{obj['permalink']}")
    handle.write("\n")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                if convert_to_ascii:
                    handle.write(obj['selftext'].encode(encoding='ascii', errors='ignore').decode())
                else:
                    handle.write(obj['selftext'])
        else:
            handle.write(obj['url'])
    else:
        if convert_to_ascii:
            handle.write(obj['body'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['body'])
    handle.write("\n-------------------------------\n")
def write_csv_line(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)
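

# writes the full pushshift object as one json object per line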
def write_json_line(handle, obj):
    handle.write(json.dumps(obj))
    handle.write("\n")
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    print(f"Saving to {filename}")

    count = 0
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')
    else:
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
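
    # results come back newest first (order=desc), so each request asks for everything
    # before previous_epoch and the loop steps that cursor backwards through time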
    while True:
        new_url = url_base + str(previous_epoch)
        json_text = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1", 'Authorization': f"Bearer {token}"})
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        try:
            json_data = json_text.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue
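
        # a response with no 'data' key or an empty result list means we've reached
        # the end of the available history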
        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break
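
        # move the cursor to just before the oldest object seen so the next request
        # picks up where this one left off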
        for obj in objects:
            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1
            try:
                if output_format == "human":
                    write_human_line(handle, obj, is_submission, convert_to_ascii)
                elif output_format == "csv":
                    write_csv_line(writer, obj, is_submission)
                elif output_format == "json":
                    write_json_line(handle, obj)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())

        if break_out:
            break
        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

    print(f"Saved {count}")
    handle.close()
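

# build the pushshift query string from whichever of username/subreddit/thread_id
# were filled in above, then download submissions and/or comments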
if __name__ == "__main__":
    filter_string = None
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(0)
    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(0)

    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time, end_time, True, convert_to_ascii)
    download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time, end_time, False, convert_to_ascii)