-
Notifications
You must be signed in to change notification settings - Fork 0
/
dmhy.py
198 lines (167 loc) · 7.23 KB
/
dmhy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#VERSION: 1.1
import re
from enum import Enum
from html.parser import HTMLParser
from helpers import (
# download_file,
retrieve_url,
)
from novaprinter import prettyPrinter
ENGINE_BASEURL = 'http://dmhy.org'
MAGNET_PATTERN = r'magnet:\?xt=urn:btih:[a-zA-Z0-9]*'
TITLE = 3
MAGLINK = 4
SIZE = 5
SEEDER = 6
LEECH = 7
class dmhy(object):
    """
    qBittorrent search-engine plugin for dmhy.org (a Chinese anime tracker).

    `url`, `name`, `supported_categories` should be static variables of the engine_name class,
    otherwise qbt won't install the plugin.

    `url`: The URL of the search engine.
    `name`: The name of the search engine, spaces and special characters are allowed here.
    `supported_categories`: What categories are supported by the search engine and their
    corresponding id, possible categories are ('all', 'anime', 'books', 'games', 'movies',
    'music', 'pictures', 'software', 'tv').
    """
    url = ENGINE_BASEURL
    name = 'dmhy'
    supported_categories = {
        'all': '0',
    }

    class DmhyParser(HTMLParser):
        """
        Streaming parser for one dmhy search-results page.

        Tracks position inside the results table (id="topic_list") with a set
        of boolean flags plus a 1-based cell counter. Each completed <tr>
        yields one result dict which is handed to prettyPrinter().
        """

        def __init__(self, outer_class):
            super().__init__()
            # Back-reference to the engine instance (kept for symmetry with
            # other plugins; not otherwise used by the parser itself).
            self.outer_class = outer_class
            # Positional state: are we inside the results table / its body /
            # a row / a cell, and which cell number (1-based) we are in.
            self.in_table = False
            self.in_tbody = False
            self.in_row = False
            self.in_cell_num = 0
            self.in_cell = False
            # Fields collected for the row currently being parsed.
            self.result_dict = {}

        def handle_starttag(self, tag, attrs):
            # Find the table that contains the search results.
            # The table has an id of "topic_list".
            if tag == 'table':
                for attr in attrs:
                    if attr[0] == 'id' and attr[1] == "topic_list":
                        self.in_table = True
                return
            if tag == 'tbody' and self.in_table:
                self.in_tbody = True
                return
            # A new row: reset the cell counter and initialize the result
            # dict with the sentinel values qBittorrent expects ("-1" means
            # "unknown/unavailable").
            if tag == 'tr' and self.in_tbody:
                self.in_row = True
                self.in_cell_num = 0
                self.result_dict = {
                    "link": "-1",
                    "name": "",
                    "size": "-1",
                    "seeds": "-1",
                    "leech": "-1",
                    "engine_url": ENGINE_BASEURL,
                    "desc_link": "-1",
                }
                return
            # Count cells so handle_data() knows which column it is in.
            if tag == 'td' and self.in_row:
                self.in_cell_num += 1
                self.in_cell = True
                return
            # The anchor tag in the third cell links to the torrent's
            # description page (a site-relative href).
            if tag == 'a' and self.in_cell_num == TITLE:
                for attr in attrs:
                    if attr[0] == 'href':
                        self.result_dict["desc_link"] = ENGINE_BASEURL + attr[1]
                return
            # Only the first (of two) anchor tags in the fourth cell contains
            # the magnet link, so a regex check is performed to make sure the
            # href really is a magnet URI before keeping it.
            if tag == 'a' and self.in_cell_num == MAGLINK:
                for attr in attrs:
                    if attr[0] == 'href' and re.match(MAGNET_PATTERN, attr[1]):
                        self.result_dict["link"] = attr[1]

        def handle_data(self, data):
            # The third cell contains the name of the torrent, but it may be
            # split into multiple text nodes (category tags, <span>s, ...).
            # Concatenate them, stripping tabs/newlines from the markup.
            if self.in_cell and self.in_cell_num == TITLE:
                self.result_dict["name"] += re.sub(r"[\t\n]", "", data)
                return
            # The fifth cell contains the size of the torrent; usable as is.
            if self.in_cell and self.in_cell_num == SIZE:
                self.result_dict["size"] = data
                return
            # Seeder count; the site shows '-' when the value is unavailable,
            # in which case we keep the "-1" sentinel.
            if self.in_cell and self.in_cell_num == SEEDER and data != '-':
                self.result_dict["seeds"] = data
                return
            # Leecher count; same dash convention as the seeders.
            if self.in_cell and self.in_cell_num == LEECH and data != '-':
                self.result_dict["leech"] = data
                return

        def handle_endtag(self, tag):
            # Reset the cell flag when the cell ends.
            if tag == 'td' and self.in_cell:
                self.in_cell = False
                return
            # End of a row: emit the collected result (if any) and reset.
            if tag == 'tr' and self.in_row:
                self.in_row = False
                self.in_cell_num = 0
                # Skip rows that yielded no data at all (e.g. header or
                # spacer rows inside tbody) instead of emitting an empty
                # result or fetching a bogus "-1" URL.
                if self.result_dict.get("name"):
                    # A magnet link is not always present in the table; fall
                    # back to scraping the description page for one. Guard
                    # against a missing desc_link and against a page with no
                    # magnet link, either of which used to crash the search.
                    if (self.result_dict["link"] == "-1"
                            and self.result_dict["desc_link"] != "-1"):
                        page = retrieve_url(self.result_dict["desc_link"])
                        magnet_links = re.findall(MAGNET_PATTERN, page)
                        if magnet_links:
                            self.result_dict["link"] = magnet_links[0]
                    prettyPrinter(self.result_dict)
                return
            # Reset the tbody flag when the tbody ends.
            if tag == 'tbody' and self.in_tbody:
                self.in_tbody = False
                return
            # Reset the table flag when the table ends.
            if tag == 'table' and self.in_table:
                self.in_table = False
                return

    def __init__(self):
        """
        Some initialization.
        """
        # NOTE(review): result_dicts is never read anywhere in this file;
        # kept for backward compatibility in case external code inspects it.
        self.result_dicts = []

    # def download_torrent(self, info):
    #     """
    #     Providing this function is optional.
    #     It can however be interesting to provide your own torrent download
    #     implementation in case the search engine in question does not allow
    #     traditional downloads (for example, cookie-based download).
    #     """
    #     print(download_file(info))

    # DO NOT CHANGE the name and parameters of this function
    # This function will be the one called by nova2.py
    def search(self, what, cat='all'):
        """
        Here you can do what you want to get the result from the search engine website.
        Everytime you parse a result line, store it in a dictionary
        and call the prettyPrint(your_dict) function.

        `what` is a string with the search tokens, already escaped (e.g. "Ubuntu+Linux")
        `cat` is the name of a search category in ('all', 'anime', 'books', 'games',
        'movies', 'music', 'pictures', 'software', 'tv')
        """
        # Use the shared base-URL constant so changing ENGINE_BASEURL in one
        # place updates both the first page and the pagination links.
        search_url = f"{ENGINE_BASEURL}/topics/list?keyword={what}"
        while True:
            result_page = retrieve_url(search_url)
            parser = self.DmhyParser(outer_class=self)
            parser.feed(result_page)
            # Follow the "next page" link (下一頁) until the last page, which
            # does not contain one.
            pattern = fr'<a\s+href="/topics/list/page/(\d+)\?keyword={re.escape(what)}">下一頁</a>'
            match = re.search(pattern, result_page)
            if match:
                search_url = f"{ENGINE_BASEURL}/topics/list/page/{match.group(1)}?keyword={what}"
                continue
            else:
                break