# anime_info_collector.py
# Forked from hoppiece/anime-tinder.
import dataclasses
import json
from urllib.parse import urlencode
from bs4 import BeautifulSoup
from logzero import logger
import requests
@dataclasses.dataclass
class AnimeInfo:
    """Plain record holding the information kept about one anime.

    Attributes
    ----------
    pageid : int
        Numeric page identifier (presumably the MediaWiki page id of the
        article the record was built from -- confirm against the caller).
    title : str
        Title of the anime.
    genre : str
        Genre of the anime.
    """

    pageid: int
    title: str
    genre: str
class InfoCollector:
    """Collect anime information from Japanese Wikipedia via the MediaWiki API."""

    # Endpoint of the MediaWiki API on Japanese Wikipedia.
    _API_URL = "https://ja.wikipedia.org/w/api.php"
    # Seconds to wait for the HTTP response; without a timeout a stalled
    # connection would block the caller indefinitely.
    _REQUEST_TIMEOUT = 10

    def _create_mediawiki_query_form_title(self, title: str) -> str:
        """Build the MediaWiki API URL that fetches the article ``title``.

        Parameters
        ----------
        title : str
            Title of the article to look up.

        Returns
        -------
        str
            Full request URL, with the query string percent-encoded.
        """
        query = urlencode(
            {
                "format": "json",
                "action": "query",
                "prop": "revisions",  # fetch the article's revisions
                "titles": title,
                "rvprop": "content",  # fetch the article body
                # Ask for the body rendered as HTML.
                # NOTE(review): the MediaWiki docs list rvparse as deprecated
                # in favour of action=parse -- confirm before relying on it.
                "rvparse": "",
            }
        )
        return f"{self._API_URL}?{query}"

    def extract_animeinfo_from_html(self, html: str) -> "AnimeInfo | None":
        """Locate the infobox table(s) in rendered article HTML.

        NOTE: building an ``AnimeInfo`` from the infobox is not implemented
        yet. The matched elements are only printed to stdout and ``None`` is
        returned.

        Parameters
        ----------
        html : str
            Article body as HTML.

        Returns
        -------
        AnimeInfo or None
            Currently always ``None``.
        """
        soup = BeautifulSoup(html, "html.parser")
        info_table = soup.select(
            "#mw-content-text > div.mw-parser-output > table.infobox.bordered"
        )
        print(info_table)
        return None

    def from_mediawiki(self, title: str) -> "AnimeInfo | None":
        """
        Crawling data via [MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page/ja)

        Pages that come back without a page id or without revisions (for
        example a nonexistent article) are skipped with a warning instead of
        raising ``KeyError``.

        TODO: handle lookups of nonexistent articles more gracefully, e.g. by
        fetching related articles through a similar-title search.

        Parameters
        ----------
        title : str
            Title of the content

        Returns
        -------
        AnimeInfo or None
            Result of ``extract_animeinfo_from_html`` for the first usable
            page, or ``None`` when the response is empty or holds no usable
            page.

        Raises
        ------
        requests.exceptions.RequestException
            If the HTTP request fails or times out.
        json.JSONDecodeError
            If the response body is not valid JSON.
        """
        logger.info(f"[START] Collecting '{title}' from media wiki api")
        query_url = self._create_mediawiki_query_form_title(title)
        r = requests.get(query_url, timeout=self._REQUEST_TIMEOUT)
        logger.info(f"[FINISH] Collecting '{title}' from media wiki api")
        if not r.text:
            return None

        content = json.loads(r.text)
        # An API-level error response carries no "query" section at all.
        pages = content.get("query", {}).get("pages", {})
        if not pages:
            logger.warning(f"No pages returned for '{title}'")
            return None

        info = None
        for page in pages.values():
            revisions = page.get("revisions")
            if "pageid" not in page or not revisions:
                logger.warning(f"No article content found for '{title}'")
                continue
            html = revisions[0]["*"]
            extracted = self.extract_animeinfo_from_html(html)
            if info is None:
                info = extracted
        return info
def main():
    """Run a sample lookup of one hard-coded title through the collector."""
    collector = InfoCollector()
    collector.from_mediawiki("ソードアート・オンライン")


# Run the sample lookup only when this file is executed as a script.
if __name__ == "__main__":
    main()