-
Notifications
You must be signed in to change notification settings - Fork 1
/
fa_crawler.py
147 lines (101 loc) · 5.1 KB
/
fa_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from urllib.parse import urlparse
from kh_common.config.credentials import furaffinity
from kh_common.gateway import ClientResponse, Gateway
from kh_common.hashing import Hashable
from lxml.html import fromstring
class First :
    """
    Callable that returns the first item of an iterable accepted by a predicate.

    With the default `method=None` the predicate is plain truthiness, so the
    first truthy item is returned.
    """

    def __init__(self, method=None) :
        # predicate handed to filter(); None means "keep truthy items"
        self.method = method

    def __call__(self, it) :
        """
        Returns the first item of `it` that passes the predicate, or None when
        nothing passes or `it` is not iterable (e.g. None).
        """
        try :
            # the default of next() covers the "no match" case
            return next(filter(self.method, it), None)

        except TypeError :
            # non-iterable input, or a predicate that rejected the item type
            return None
# shared First instance with no predicate: first(it) returns the first truthy
# item of `it`, or None when there is none (or `it` is not iterable)
first = First()
class SiteNotCrawled(Exception) :
    """
    Raised when a furaffinity page cannot be turned into a crawl result, e.g.
    the site is unavailable, the login failed, or the submission is missing
    or is not an image.
    """
def isint(s) :
    """
    Converts `s` to an int, returning None when it cannot be converted.

    Despite the name this returns the converted value rather than a bool, so
    a successful conversion of '0' yields the falsy value 0; compare the
    result with `is not None` when zero is a valid input.
    """
    try :
        return int(s)

    # only swallow conversion failures; a bare except would also hide
    # KeyboardInterrupt / SystemExit
    except (TypeError, ValueError, OverflowError) :
        return None
async def response_text(response: ClientResponse) -> str :
    """
    Reads the full response body and decodes it to text.

    furaffinity seems to respond with mangled unicode in some places, so
    undecodable bytes are replaced instead of raising a decode error.
    """
    body = await response.read()
    return body.decode(errors='replace')
# gateway for furaffinity submission pages; FurAffinityCrawler.crawl calls it
# with id=<post id> to fill the url template, and bodies are decoded by response_text
FurAffinityGateway: Gateway = Gateway('https://www.furaffinity.net/view/{id}', decoder=response_text)
class FurAffinityCrawler(Hashable) :
    """
    Crawls a single furaffinity submission page and extracts image metadata.

    `crawl` fetches the page for a post id through FurAffinityGateway and
    `parse` turns the parsed document into a plain dict.
    """

    # values of the image url path segment that mark a non-image submission
    submissionTypes = { 'story', 'music' }

    # keyword arguments passed to every lxml xpath() call: no EXSLT regex
    # support and plain (non-"smart") string results
    xpathargs = { 'regexp': False, 'smart_strings': False }

    async def crawl(self: 'FurAffinityCrawler', post_id: int) :
        """
        Fetches the submission page for `post_id` and returns `parse`'s result.

        Raises SiteNotCrawled (from `parse`) when the page is not a usable
        image submission.
        """
        html = await FurAffinityGateway(id=post_id, headers=furaffinity['headers'])
        return self.parse(fromstring(html), post_id)

    def _first_element(self, node, query) :
        """Returns the first element matched by `query` under `node`, or None."""
        # index the result list instead of using first(): first() filters on
        # truthiness and lxml elements without children are falsy
        matches = node.xpath(query, **self.xpathargs)
        return matches[0] if matches else None

    def _raise_if_unavailable(self, document) :
        """Raises SiteNotCrawled when `document` is an error page rather than a submission."""
        if first(document.xpath('//body//div[@class="attribution"]/a/text()', **self.xpathargs)) == 'DDoS protection by Cloudflare' :
            raise SiteNotCrawled('furaffinity is currently behind cloudflare.')

        if first(document.xpath('//body/@id', **self.xpathargs)) == 'pageid-matureimage-error' :
            raise SiteNotCrawled('furaffinity login error.')

        if (
            document.xpath('//head/title[contains(text(), "System Error")]', **self.xpathargs)
            and document.xpath('//body/section/div[@class="section-body" and contains(text(), "The submission you are trying to find is not in our database")]', **self.xpathargs)
        ) :
            raise SiteNotCrawled('url does not have a submission.')

        if document.xpath('//img[@src="/fa_offline.jpg"]', **self.xpathargs) :
            raise SiteNotCrawled('furaffinity is currently offline.')

    @staticmethod
    def _parse_resolution(size_text) :
        """Parses '<width>x<height>' text into an (int, int) tuple, or None when it does not parse."""
        if not size_text :
            return None

        parts = size_text.split('x')
        if len(parts) < 2 :
            # no 'x' separator; indexing parts[1] would raise IndexError
            return None

        width, height = isint(parts[0]), isint(parts[1])
        return (width, height) if width and height else None

    @staticmethod
    def _parse_image_timestamps(image_url) :
        """
        Extracts (timestamp, upload timestamp) from the path of `image_url`.

        Raises SiteNotCrawled when the url does not carry numeric timestamps,
        including when it points at a non-image submission type.
        """
        # presumably //<host>/art/<artist>/<timestamp>/<upload timestamp>.<file>
        # which puts the timestamp at index 5 of the split - TODO confirm
        segments = image_url.split('/')
        timestamp = segments[5] if len(segments) > 5 else None

        if isint(timestamp) is None :
            if timestamp in FurAffinityCrawler.submissionTypes :
                raise SiteNotCrawled(f'submission is not an image. type: {timestamp}.')
            raise SiteNotCrawled(f'could not find image id (timestamp) from image url. image_url: {image_url}, timestamp: {timestamp}.')

        # partition keeps the whole name when there is no '.', whereas slicing
        # up to find('.') == -1 would silently drop the last character
        filename = segments[6] if len(segments) > 6 else ''
        upload_timestamp = isint(filename.partition('.')[0])
        if upload_timestamp is None :
            raise SiteNotCrawled(f'could not find upload timestamp from image url. image_url: {image_url}.')

        return int(timestamp), upload_timestamp

    def parse(self: 'FurAffinityCrawler', document, post_id) :
        """
        Extracts submission metadata from a parsed furaffinity submission page.

        `document` is the lxml tree of the page and `post_id` the submission
        id, which is used to build the thumbnail urls.

        Returns a dict with the keys image, title, timestamp, description,
        artist, artist_url, thumbnails and resolution. `resolution` is a
        (width, height) tuple or None and `title` may be None.

        Raises SiteNotCrawled when the site is unavailable or the page does
        not describe an image submission with a resolvable artist.
        """
        # check that the website isn't down and etc etc
        self._raise_if_unavailable(document)

        # now we can actually crawl
        image_url = first(document.xpath('//img[@id="submissionImg"]/@src', **self.xpathargs))
        if not image_url :
            raise SiteNotCrawled('submission does not contain an image.')

        if document.xpath('//div[@class="submission-content"]//center[contains(@class, "p20")]/div[contains(strong/text(), "File type")]', **self.xpathargs) :
            raise SiteNotCrawled('submission is not an image.')

        # the resolution is optional: a missing sidebar or size just yields None
        sidebar = self._first_element(document, '//div[@class="submission-sidebar"]')
        size_text = None
        if sidebar is not None :
            size_text = first(sidebar.xpath('self::*//section[@class="info text"]//span[contains(preceding-sibling::*/text(), "Size")]/text()', **self.xpathargs))
        resolution = self._parse_resolution(size_text)

        timestamp, upload_timestamp = self._parse_image_timestamps(image_url)

        # protocol-relative urls are made absolute
        if image_url.startswith('//') :
            image_url = 'https:' + image_url

        artist_link = self._first_element(document, '//div[@class="submission-id-container"]//a[contains(@href, "/user/") and strong]')
        if artist_link is None :
            raise SiteNotCrawled('could not find artist in html.')

        artist_path = first(artist_link.xpath('@href', **self.xpathargs))
        if not artist_path :
            raise SiteNotCrawled('could not find artist url in html.')
        artist_url = 'https://www.furaffinity.net' + artist_path

        artist = first(artist_link.xpath('strong/text()', **self.xpathargs))
        if not artist :
            raise SiteNotCrawled('could not find artist in html.')

        description = ''.join(document.xpath('//div[@class="submission-content"]/section/div[@class="section-body"]/div[contains(@class, "submission-description")]//text()', **self.xpathargs)).strip()

        title = first(document.xpath('//div[@class="submission-id-container"]//div[@class="submission-title"]//p/text()', **self.xpathargs))
        if not title :
            # NOTE(review): nothing in this file sets self.logger or self.url, so
            # fall back to a module logger and build the url from post_id to keep
            # a missing title non-fatal - confirm whether Hashable provides a logger
            logger = getattr(self, 'logger', None)
            if logger is None :
                import logging
                logger = logging.getLogger(__name__)
            logger.warning(f'could not find submission title in html. url: https://www.furaffinity.net/view/{post_id}')

        # get thumbnail host, falling back to the default host when the page has none
        data_preview_src = first(document.xpath('//img[@id="submissionImg"]/@data-preview-src', **self.xpathargs))
        if not data_preview_src :
            data_preview_src = 'https://t.facdn.net'
        thumbnail_host = urlparse(data_preview_src).netloc

        # one thumbnail url per size, keyed by the post id and image timestamp
        thumbnails = [
            f'https://{thumbnail_host}/{post_id}@{size}-{timestamp}.jpg'
            for size in (200, 300, 400, 600, 800)
        ]

        return {
            'image': image_url,
            'title': title,
            'timestamp': upload_timestamp,
            'description': description,
            'artist': artist,
            'artist_url': artist_url,
            'thumbnails': thumbnails,
            'resolution': resolution,
        }