#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 Pascal Jürgens and Andreas Jungherr
# See License.txt
"""
Examples for accessing the API
------------------------------
These are some examples demonstrating the use of provided functions for
gathering data from the Twitter API.
Requirements:
- depends on API access modules rest.py and streaming.py
"""
import rest
import streaming
import database
import logging
import json
import datetime
from pytz import timezone
import peewee
from progress.bar import Bar
from collections import Counter, OrderedDict
MST = timezone("MST")
#
# Setup
#
def hydrate(idlist_file="data/example_dataset_tweet_ids.txt"):
"""
This function reads a file with tweet IDs and then loads them
through the API into the database. Prepare to wait quite a bit,
depending on the size of the dataset.
"""
ids_to_fetch = set()
for line in open(idlist_file, "r"):
# Remove newline character through .strip()
# Convert to int since that's what the database uses
ids_to_fetch.add(int(line.strip()))
# Find a list of Tweets that we already have
ids_in_db = set(t.id for t in database.Tweet.select(database.Tweet.id))
# Sets have an efficient .difference() method that returns IDs only present
# in the first set, but not in the second.
ids_to_fetch = ids_to_fetch.difference(ids_in_db)
logging.warning(
"\nLoaded a list of {0} tweet IDs to hydrate".format(len(ids_to_fetch)))
# Set up a progressbar
bar = Bar('Fetching tweets', max=len(ids_to_fetch), suffix='%(eta)ds')
for page in rest.fetch_tweet_list(ids_to_fetch):
bar.next(len(page))
for tweet in page:
database.create_tweet_from_dict(tweet)
bar.finish()
logging.warning("Done hydrating!")
def dehydrate(filename="data/dehydrated_tweet_ids.txt"):
"""
This function writes the Tweet IDs contained in the current database to
a file that allows re-hydration with the above method.
"""
with open(filename, "w") as f:
for tweet in database.Tweet.select(database.Tweet.id):
f.write("{0}\n".format(tweet.id))
#
# Helper Functions
#
def print_tweet(tweet):
"""
Print a tweet as one line:
user: tweet
"""
logging.warning(
u"{0}: {1}".format(tweet["user"]["screen_name"], tweet["text"]))
def print_notice(notice):
"""
This just prints the raw response, such as:
    {u'limit': {u'track': 1, u'timestamp_ms': u'1446089368786'}}
"""
logging.error(u"{0}".format(notice))
#
# Examples
#
def import_json(fi):
"""
    Load JSON data from a file into the database.
    """
    logging.warning("Loading tweets from JSON file {0}".format(fi))
    with open(fi, "rb") as infile:
        for line in infile:
            data = json.loads(line.decode('utf-8'))
            database.create_tweet_from_dict(data)
def print_user_archive():
"""
Fetch all available tweets for one user and print them, line by line
"""
archive_generator = rest.fetch_user_archive("lessig")
for page in archive_generator:
for tweet in page:
print_tweet(tweet)
def save_user_archive_to_file():
"""
Fetch all available tweets for one user and save them to a text file, one tweet per line.
(This is approximately the format that GNIP uses)
"""
with open("lessig-tweets.json", "w") as f:
archive_generator = rest.fetch_user_archive("lessig")
for page in archive_generator:
for tweet in page:
f.write(json.dumps(tweet) + "\n")
logging.warning(u"Wrote tweets from @lessig to file lessig-tweets.json")
def save_user_archive_to_database():
"""
Fetch all available tweets for one user and save them to the database.
"""
archive_generator = rest.fetch_user_archive("lessig")
for page in archive_generator:
for tweet in page:
database.create_tweet_from_dict(tweet)
logging.warning(u"Wrote tweets from @lessig to database")
def print_list_of_tweets():
"""
    Fetch a list of three tweets by ID, then print them line by line.
    This example is easily adapted to write the tweets to a file instead (see above).
"""
list_generator = rest.fetch_tweet_list(
[62154131600224256, 662025716746354688, 661931648171302912, ])
for page in list_generator:
for tweet in page:
print_tweet(tweet)
def track_keywords():
"""
    Track two keywords with a tracking stream and print matching tweets and notices.
To stop the stream, press ctrl-c or kill the python process.
"""
keywords = ["politics", "election"]
stream = streaming.stream(
on_tweet=print_tweet, on_notification=print_notice, track=keywords)
def save_track_keywords():
"""
    Track two keywords with a tracking stream and save matching tweets.
To stop the stream, press ctrl-c or kill the python process.
"""
# Set up file to write to
outfile = open("keywords_example.json", "w")
def save_tweet(tweet):
json.dump(tweet, outfile)
# Insert a newline after one tweet
outfile.write("\n")
keywords = ["politics", "election"]
    try:
        stream = streaming.stream(
            on_tweet=save_tweet, on_notification=print_notice, track=keywords)
    except (KeyboardInterrupt, SystemExit):
        logging.error("User stopped program, exiting!")
    finally:
        # Close the file even if the stream ends for a reason other than ctrl-c
        outfile.flush()
        outfile.close()
def follow_users():
"""
Follow several users, printing their tweets (and retweets) as they arrive.
To stop the stream, press ctrl-c or kill the python process.
"""
# user IDs are: nytimes: 807095, washingtonpost: 2467791
# they can be obtained through:
# users = ["nytimes", "washingtonpost"]
# users_json = rest.fetch_user_list_by_screen_name(screen_names=users)
# for u in users_json:
# print("{0}: {1}".format(u["screen_name"], u["id"]))
users = ["807095", "2467791"]
stream = streaming.stream(
on_tweet=print_tweet, on_notification=print_notice, follow=users)
def save_follow_users():
"""
Follow several users, saving their tweets (and retweets) as they arrive.
To stop the stream, press ctrl-c or kill the python process.
"""
outfile = open("user_example.json", "w")
def save_tweet(tweet):
json.dump(tweet, outfile)
# Insert a newline after one tweet
outfile.write("\n")
users = ["807095", "2467791"]
    try:
        stream = streaming.stream(
            on_tweet=save_tweet, on_notification=print_notice, follow=users)
    except (KeyboardInterrupt, SystemExit):
        logging.error("User stopped program, exiting!")
    finally:
        # Close the file even if the stream ends for a reason other than ctrl-c
        outfile.flush()
        outfile.close()
def export_hashtag_counts(interval="day", hashtags=["Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee", "Kasich", "Paul", "Rubio", "Trump"]):
"""
    Create counts per interval for the given hashtags. A bit slow; an easy
    speedup is to resolve the hashtag strings to Hashtag database objects
    once and query for those (see the sketch after this function).
"""
# Create output file
with open("hashtag_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write(",".join(hashtags))
f.write(",\n")
        # Prepare iterator over intervals.
        # htm is an intermediary model for many-to-many relationships,
        # in this case Tweet -> htm -> Hashtag
htm = database.Tweet.tags.get_through_model()
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
for tag in hashtags:
# Match ignoring case
count = query.join(htm).join(database.Hashtag).where(
peewee.fn.Lower(database.Hashtag.tag) == tag.lower()).count()
f.write("{0},".format(count))
f.write("\n")
def export_mention_counts(interval="day", usernames=["jebbush", "realbencarson", "chrischristie", "tedcruz", "carlyfiorina", "govmikehuckabee", "johnkasich", "randpaul", "marcorubio", "realdonaldtrump"]):
"""
    Create counts per interval for mentions of the given users.
"""
# Create output file
with open("mention_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write(",".join(usernames))
f.write(",\n")
        # Prepare iterator over intervals.
        # mtm is an intermediary model for many-to-many relationships,
        # in this case Tweet -> mtm -> User
mtm = database.Tweet.mentions.get_through_model()
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
for user in usernames:
# Match ignoring case
count = query.join(mtm).join(database.User).where(
peewee.fn.Lower(database.User.username) == user.lower()).count()
f.write("{0},".format(count))
f.write("\n")
def export_keyword_counts(interval="day", keywords=["Bush", "Carson", "Christie", "Cruz", "Fiorina", "Huckabee", "Kasich", "Paul", "Rubio", "Trump"]):
"""
    Create counts per interval for the given keywords.
"""
# Create output file
with open("keyword_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write(",".join(keywords))
f.write(",\n")
        # Prepare iterator over intervals
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
for word in keywords:
# Match ignoring case
kwcount = query.where(
peewee.fn.Lower(database.Tweet.text).contains(word.lower())).count()
f.write("{0},".format(kwcount))
f.write("\n")
def export_user_counts(interval="day", usernames=["JebBush", "RealBenCarson", "ChrisChristie", "tedcruz", "CarlyFiorina", "GovMikeHuckabee", "JohnKasich", "RandPaul", "marcorubio", "realDonaldTrump"]):
"""
    Create counts per interval for tweets from the given users.
"""
# Create output file
with open("user_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write(",".join(usernames))
f.write(",\n")
        # Prepare iterator over intervals
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
for username in usernames:
# Match precise username
ucount = query.join(database.User).where(
database.User.username == username).count()
f.write("{0},".format(ucount))
f.write("\n")
def export_total_counts(interval="day"):
"""
    Create total tweet counts per interval.
"""
# Create output file
with open("total_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write("total,")
f.write("\n")
        # Prepare iterator over intervals
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
f.write("{0},".format(query.count()))
f.write("\n")
def export_featureless_counts(interval="day"):
"""
    Create counts per interval for tweets without mentions, URLs, or replies.
    Complex queries on many-to-many relationships are very
    contrived with peewee. For the sake of simplicity, this
    function instead iterates over the tweets in Python and counts
    those lacking all three features.
"""
# Create output file
with open("featureless_counts.csv", "w") as f:
# Write header line
f.write("{0},".format(interval))
f.write("featureless,")
f.write("\n")
        # Prepare iterator over intervals
intervals = database.objects_by_interval(
database.Tweet, interval=interval, start_date=None, stop_date=None)
for (interval_start, interval_stop), query in intervals:
# Convert the timestamp to Mountain Standard Time which is
# the local timezone for the example data
timestamp = MST.normalize(interval_start).strftime(
"%Y-%m-%d %H:%M:%S %z")
f.write("{0},".format(timestamp))
featureless_count = 0
for t in query:
                if t.mentions.is_null() and t.urls.is_null() and t.reply_to_tweet is None:
featureless_count += 1
f.write("{0},".format(featureless_count))
f.write("\n")
def export_mention_totals(n=50):
"""
Export the N most mentioned users and their respective counts to
a CSV file.
"""
start_date = MST.localize(datetime.datetime(2015, 10, 27, 0))
stop_date = MST.localize(datetime.datetime(2015, 11, 2, 23, 59))
with open("mention_totals.csv", "w") as f:
f.write("user, mentions\n")
        for user in database.mention_counts(start_date, stop_date)[:n]:
f.write("{0}, {1}\n".format(user.username, user.count))
def export_url_totals(n=50):
"""
Export the N most mentioned URLs and their respective counts to
a CSV file.
"""
start_date = MST.localize(datetime.datetime(2015, 10, 27, 0))
stop_date = MST.localize(datetime.datetime(2015, 11, 2, 23, 59))
with open("url_totals.csv", "w") as f:
f.write("url, mentions\n")
        for url in database.url_counts(start_date, stop_date)[:n]:
f.write("{0}, {1}\n".format(url.url, url.count))
def export_hashtag_totals(n=50):
"""
Export the N most mentioned hashtags and their respective counts to
a CSV file.
"""
start_date = MST.localize(datetime.datetime(2015, 10, 27, 0))
stop_date = MST.localize(datetime.datetime(2015, 11, 2, 23, 59))
with open("hashtag_totals.csv", "w") as f:
f.write("hashtag, mentions\n")
        for hashtag in database.hashtag_counts(start_date, stop_date)[:n]:
f.write("{0}, {1}\n".format(hashtag.tag, hashtag.count))
def export_retweet_totals(n=50):
"""
Export the N most retweeted users and their respective counts to
a CSV file.
"""
start_date = MST.localize(datetime.datetime(2015, 10, 27, 0))
stop_date = MST.localize(datetime.datetime(2015, 11, 2, 23, 59))
with open("retweet_totals.csv", "w") as f:
f.write("user, retweets\n")
        retweetcounts = database.retweet_counts(
            start_date, stop_date, n).items()
for username, count in retweetcounts:
f.write("{0}, {1}\n".format(username, count))
def top_retweets(n=50):
"""
Find the most retweeted tweets and display them.
For readability's sake, this is not done through SQL
"""
rt_counts = {}
# all retweets
retweets = database.Tweet.select(database.Tweet.retweet).where(
database.Tweet.retweet.is_null(False)).group_by(database.Tweet.retweet)
for tweet in retweets:
rt_counts[tweet.retweet.id] = tweet.retweet.retweets.count()
    c = Counter(rt_counts)
    results = OrderedDict()
for k, v in c.most_common(n):
results[database.Tweet.get(id=k).text] = v
return results
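# Example usage of top_retweets (a sketch; the output depends entirely on the
# database contents):
#
#     for text, count in top_retweets(10).items():
#         print(count, text)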
def top_retweets_straight(n=50):
"""
Get N most retweeted Tweets directly via the database.
The query logic is a bit contrived.
Returns tweet objects which are actually retweets but contain
the retweet count as attribute "rt_count". To get the original (retweeted) Tweet,
refer to the "retweet_id" and "retweet" fields.
Example:
for tweet in top_retweets_straight():
print(tweet.rt_count, tweet.retweet.id, tweet.retweet.text)
"""
# Alias for RT count
rt_count = peewee.fn.Count(database.Tweet.retweet_id)
# Directly aggregate in DB by counting retweet_id field and then grouping
# by current tweet id.
retweets = (
database.Tweet
.select(database.Tweet, rt_count.alias("rt_count"))
.where(database.Tweet.retweet_id > 0)
.group_by(database.Tweet.retweet_id)
.order_by(rt_count.desc())
)
return retweets.limit(n)
def export_retweet_text(n=50):
"""
Find the most retweeted tweets and export them to a CSV file
"""
rt_counts = {}
# all retweets
retweets = database.Tweet.select(database.Tweet.retweet).where(
database.Tweet.retweet.is_null(False)).group_by(database.Tweet.retweet)
for tweet in retweets:
rt_counts[tweet.retweet.id] = tweet.retweet.retweets.count()
    c = Counter(rt_counts)
with open("retweet_texts.csv", "w") as f:
f.write("tweet text, count\n")
for k, v in c.most_common(n):
tweet_text = database.Tweet.get(id=k).text
f.write("{0},{1}\n".format(
tweet_text.replace("\n", "<newline>"), v))