-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocessTweets.py
43 lines (30 loc) · 1.13 KB
/
processTweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pymongo import MongoClient
import re
# Two client connections, both on port 27017. The host argument is an empty
# placeholder string: supply the server URL for each before running.
RawDbClient = MongoClient('', 27017)  # Add URL
ProcessedDbClient = MongoClient('', 27017)  # Add URL

# Source side: the "tweets" collection of the "RawDb" database.
RawDb = RawDbClient['RawDb']
RawTweets = RawDb['tweets']

# Destination side: the "tweets" collection of the "ProcessedDb" database.
ProcessedDb = ProcessedDbClient['ProcessedDb']
ProcessedTweets = ProcessedDb['tweets']

# Keywords and language codes. Nothing in the visible part of this file reads
# them; presumably they are shared with the collection step -- TODO confirm.
keywords = [
    'Storm',
    'Winter',
    'Canada',
    'Temperature',
    'Flu',
    'Snow',
    'Indoor',
    'Safety',
]
language = ['en']
# Text-cleaning patterns, compiled once instead of once per tweet.
# 'htt' followed by one or more non-whitespace characters (link-like runs).
_LINK_RE = re.compile(r'htt\S+')
# Code points U+10000..U+10FFFF (outside the Basic Multilingual Plane);
# presumably aimed at emoji -- TODO confirm.
_ASTRAL_RE = re.compile('[\U00010000-\U0010ffff]')
# Any run of characters other than ASCII letters and digits.
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z0-9]+")


def _clean_text(text):
    """Strip link-like runs and astral characters, then squash the rest.

    The first two patterns are deleted outright; afterwards every run of
    non-alphanumeric characters is replaced by a single space.
    """
    text = _LINK_RE.sub('', text)
    text = _ASTRAL_RE.sub('', text)
    return _NON_ALNUM_RE.sub(' ', text)


def cleanandstore(RawTweets, ProcessedTweets):
    """Clean every raw tweet and insert a trimmed copy into ProcessedTweets.

    Args:
        RawTweets: source collection; only its ``find()`` method is used, and
            each document it yields is read like a dict.
        ProcessedTweets: destination collection; only ``insert_one()`` is
            used, called once per source document.

    Returns:
        True once every document returned by ``find()`` has been handled.

    Raises:
        KeyError: if a document lacks 'text', 'created_at', 'user', or the
            user's 'screen_name'.

    Each stored document has the keys 'text' (cleaned), 'time', 'location'
    and 'user'. 'location' is the place name, or None when the tweet has no
    place (key missing, value None, or place without a 'name').
    """
    for tweet in RawTweets.find():
        text = _clean_text(tweet['text'])

        # A missing 'place' key is treated the same as an explicit None
        # instead of aborting the whole run with a KeyError.
        place = tweet.get('place')
        location = place.get('name') if place else None

        tweetObj = {
            'text': text,
            'time': tweet['created_at'],
            'location': location,
            'user': tweet['user']['screen_name'],
        }
        ProcessedTweets.insert_one(tweetObj)

    # Returned after the loop so that every tweet is processed, not just
    # the first one.
    return True
if __name__ == '__main__':
    # Script entry point: run the clean-and-store pass using the
    # module-level collection handles.
    cleanandstore(RawTweets, ProcessedTweets)