-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_retrieve_cleaning.py
164 lines (135 loc) · 5.53 KB
/
data_retrieve_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
KV Le
CSE 163 AG
Final Project
A script that retrieves and cleans the data needed for my Final Project about
MyAnimeList Users
"""
import pandas as pd
from jikanpy import Jikan
from time import sleep
def clean_user_animelists(
        source_path="data/original_data/users_animelists_azathoth.csv",
        output_path="data/user_animelists_cleaned.csv"):
    """Cleans original user animelists for information relevant to the project

    Only the columns used by the project are loaded, and every row that is
    missing a value in any of those columns is dropped before saving.

    Parameters
    ----------
    source_path : str
        CSV file holding the original user animelists.
    output_path : str
        Where the cleaned CSV is written (without the index column).

    Notes
    -----
    Default File Path: data/user_animelists_cleaned.csv
    """
    wanted_columns = ["username", "anime_id", "my_score",
                      "my_status", "my_watched_episodes"]
    # usecols is applied while reading, so missing values in columns the
    # project ignores never cause a row to be dropped.
    new_animelists = pd.read_csv(source_path, usecols=wanted_columns).dropna()
    new_animelists.to_csv(output_path, index=False)
def clean_animelist(source_path="data/original_data/anime_azathoth.csv",
                    output_path="data/animelist_cleaned.csv"):
    """Cleans original anime info for information relevant to the project

    Only the columns used by the project are loaded, and every row that is
    missing a value in any of those columns is dropped before saving.

    Parameters
    ----------
    source_path : str
        CSV file holding the original anime information.
    output_path : str
        Where the cleaned CSV is written (without the index column).

    Notes
    -----
    Default File Path: data/animelist_cleaned.csv
    """
    wanted_columns = ["anime_id", "title", "image_url", "type",
                      "episodes", "duration_min", "score", "scored_by",
                      "rank", "popularity", "members", "favorites",
                      "related", "studio", "genre", "aired_from_year",
                      "source"]
    # usecols is applied while reading, so missing values in columns the
    # project ignores never cause a row to be dropped.
    new_animelist = pd.read_csv(source_path, usecols=wanted_columns).dropna()
    new_animelist.to_csv(output_path, index=False)
def clean_userlist(source_path="data/original_data/users_azathoth.csv",
                   output_path="data/userlist_cleaned.csv",
                   reference_year=2020):
    """Cleans original MAL user info for information relevant to the project

    Loads only the columns used by the project, drops rows with any missing
    value, adds an ``age`` column and keeps only the users whose gender is
    "Male" or "Female".

    Parameters
    ----------
    source_path : str
        CSV file holding the original user information.
    output_path : str
        Where the cleaned CSV is written (without the index column).
    reference_year : int
        Year that ages are computed against (age = reference_year - birth
        year).

    Notes
    -----
    Default File Path: data/userlist_cleaned.csv
    """
    wanted_columns = ["username", "user_id", "user_watching",
                      "user_completed", "user_onhold", "user_dropped",
                      "user_plantowatch", "user_days_spent_watching",
                      "gender", "location", "birth_date",
                      "stats_mean_score", "stats_episodes"]
    new_userlist = pd.read_csv(source_path, usecols=wanted_columns).dropna()

    # The birth year is whatever precedes the first "-" of birth_date.
    # astype(str) plus .str[0] (instead of expand=True followed by [0])
    # keeps this working even when no rows survive the dropna above.
    birth_year = pd.to_numeric(
        new_userlist["birth_date"].astype(str).str.split("-", n=1).str[0])
    new_userlist["age"] = reference_year - birth_year

    # The project restricts its analysis to users who list their gender as
    # "Male" or "Female"; every other value is filtered out here.
    new_userlist = new_userlist[new_userlist["gender"]
                                .isin(["Male", "Female"])]
    new_userlist.to_csv(output_path, index=False)
def _parse_duration_minutes(duration_text):
    """Converts a duration such as "1 hr 30 min" to a whole minute count

    The number directly before "hr" counts as hours and the number directly
    before "min" as minutes; a missing unit (or missing text) counts as 0.
    """
    if not duration_text:
        return 0
    # Trailing periods are removed so "min." is recognised like "min".
    tokens = [token.rstrip(".") for token in duration_text.split()]
    hours = tokens[tokens.index("hr") - 1] if "hr" in tokens else 0
    minutes = tokens[tokens.index("min") - 1] if "min" in tokens else 0
    return int(hours) * 60 + int(minutes)


def _fetch_anime_details(client, anime_id, title, max_attempts, retry_delay):
    """Requests one anime's details, retrying a bounded number of times

    Returns the details, or None when every attempt raised an exception.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return client.anime(anime_id)
        except Exception:
            # Any failure is treated as retryable because the script cannot
            # tell a temporary block from a permanent error here.
            if attempt < max_attempts:
                print(f"Error in getting {title}. " +
                      "Retrying")
                sleep(retry_delay)
    print(f"Giving up on {title} after {max_attempts} attempts. Skipping")
    return None


def get_2019_mal_data(jikan=None, output_path="data/animelist_2019.csv",
                      request_delay=4, retry_delay=2, max_attempts=10):
    """Uses JikanAPI to retrieve anime info from 2019

    Every anime listed in the four 2019 seasons is combined with its detail
    record (studios, duration, favorites). Rows with a missing value or
    without any studio are dropped before the table is saved.

    Parameters
    ----------
    jikan : object, optional
        Client providing ``season(year, season)`` and ``anime(anime_id)``.
        A new ``Jikan()`` is created when omitted.
    output_path : str
        Where the resulting CSV is written (without the index column).
    request_delay : float
        Seconds to wait after handling each anime.
    retry_delay : float
        Seconds to wait before retrying a failed detail request.
    max_attempts : int
        Detail requests made per anime before it is skipped. The previous
        behaviour of retrying forever could hang the script on a permanent
        error.

    Notes
    -----
    Default File Path: data/animelist_2019.csv
    """
    client = Jikan() if jikan is None else jikan
    # All season listings are requested up front so that a failure here
    # happens before any time is spent on per-anime requests.
    anime_2019 = {
        "anime_2019_spr": client.season(2019, "spring"),
        "anime_2019_sum": client.season(2019, "summer"),
        "anime_2019_fall": client.season(2019, "fall"),
        "anime_2019_win": client.season(2019, "winter")
    }
    columns = ["anime_id", "title", "type", "episodes", "duration_min",
               "source", "genre", "studio", "score", "favorites", "members"]
    animes = {column: [] for column in columns}

    for season, listing in anime_2019.items():
        print(f"Retrieving all {season} Anime")
        for anime in listing["anime"]:
            anime_info = _fetch_anime_details(
                client, anime["mal_id"], anime["title"],
                max_attempts, retry_delay)
            if anime_info is not None:
                # The whole row is built first and appended in one go, so
                # the column lists can never end up with unequal lengths.
                row = {
                    "anime_id": anime["mal_id"],
                    "title": anime["title"],
                    "type": anime["type"],
                    "episodes": anime["episodes"],
                    "duration_min":
                        _parse_duration_minutes(anime_info["duration"]),
                    "source": anime["source"],
                    "genre": ", ".join(
                        genre["name"] for genre in anime["genres"]),
                    "studio": ", ".join(
                        studio["name"] for studio in anime_info["studios"]),
                    "score": anime["score"],
                    "favorites": anime_info["favorites"],
                    "members": anime["members"],
                }
                for column, value in row.items():
                    animes[column].append(value)
            # Sleep is to ensure that we don't make too many requests
            # and get blocked from JikanAPI
            sleep(request_delay)

    result = pd.DataFrame.from_dict(animes).dropna()
    # An empty studio string means the anime had no studio listed.
    result = result[result["studio"].astype(bool)]
    result.to_csv(output_path, index=False)
def main():
    """Executes each retrieval and cleaning step of the project in order"""
    pipeline = (
        clean_user_animelists,
        clean_animelist,
        clean_userlist,
        get_2019_mal_data,
    )
    for step in pipeline:
        step()


if __name__ == "__main__":
    main()