# addInfo.py
import os
import sys

import pandas as pd

# Command-line arguments: the subject ("side") and a two-word keyword.
side = sys.argv[1]
keyword = sys.argv[2] + ' ' + sys.argv[3]
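# Example invocation (hypothetical arguments; the actual subject/keyword
# pair depends on what TweetCrawler was run with):
#   python addInfo.py left global warming
# gives side = 'left' and keyword = 'global warming'.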
path = "/kaggle/working/TweetCrawler/data"
# Check whether the path exists or not
if not os.path.exists(path):
os.makedirs(path)
### Scraping User Profile (Optional) ###
# Read the crawled JSON-lines file and keep only the embedded user objects.
filename = keyword.replace(' ', '')
info = pd.read_json(
    '/kaggle/working/TweetCrawler/raw_data/{}/{}.json'.format(side, filename),
    lines=True,
)[['user']]
# Expand each user dict into a one-row DataFrame and stack them, keeping
# only the user id and self-reported location.
rows = [pd.DataFrame([row]) for row in info['user']]
info_df = pd.concat(rows, ignore_index=True)[['id', 'location']]
info_df.rename(columns={'id': 'user_id'}, inplace=True)
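# Note: the expand-and-stack loop above can be collapsed into a single
# pandas call. A minimal alternative sketch, assuming every 'user' entry
# is a dict with flat 'id' and 'location' fields:
#   info_df = pd.json_normalize(info['user'].tolist())[['id', 'location']]
#   info_df.rename(columns={'id': 'user_id'}, inplace=True)
# (json_normalize flattens nested fields into dotted columns, which does
# not affect the two columns selected here.)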
### Combine all content of tweets and necessary features ###
# Re-read the same file, this time keeping the tweet-level columns.
tweets_df = pd.read_json(
    '/kaggle/working/TweetCrawler/raw_data/{}/{}.json'.format(side, filename),
    lines=True,
)[['id', 'url', 'date', 'rawContent', 'hashtags',
   'replyCount', 'retweetCount', 'likeCount']]
tweets_df['key'] = keyword
# Both frames come from the same file in the same row order, so a
# side-by-side concat pairs each tweet with its author's profile info.
final = pd.concat([tweets_df, info_df], axis=1)
final['subject'] = side
final.to_csv(f'/kaggle/working/Crawl_{side}_{sys.argv[2]}_{sys.argv[3]}.csv')
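# With the hypothetical arguments above, this writes
# /kaggle/working/Crawl_left_global_warming.csv: one row per crawled tweet
# with its engagement counts plus the author's user_id and location.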