Issue #59: Error in resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
Nekmo committed Oct 22, 2018
1 parent c551062 commit bbf0fa6
Showing 1 changed file with 10 additions and 1 deletion.
dirhunt/crawler_url.py: 10 additions & 1 deletion
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
+import socket
 
 from bs4 import BeautifulSoup
+from requests import RequestException
+from urllib3.exceptions import ReadTimeoutError
 
 from dirhunt.url import Url
 from dirhunt.url_loop import is_url_loop
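
For context on the new imports: a read from resp.raw can fail at three different layers. requests wraps most transport errors in RequestException, but resp.raw is the underlying urllib3 response, so reading it directly bypasses requests' exception translation and lets urllib3's ReadTimeoutError and the stdlib's socket.timeout escape. A minimal sketch of the situation; the URL and timeout are illustrative, not part of the commit:

import socket

import requests
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

# stream=True defers the body download, so failures can surface later,
# at the moment resp.raw is actually read.
resp = requests.get('http://example.com/', stream=True, timeout=5)
try:
    # Reading the urllib3 response directly bypasses requests' exception
    # wrapping, so all three error types must be caught explicitly.
    chunk = resp.raw.read(1024, decode_content=True)
except (RequestException, ReadTimeoutError, socket.timeout):
    chunk = b''  # fall back to an empty body on a failed read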
@@ -62,7 +65,13 @@ def start(self):

         processor = None
         if resp.status_code < 300 and self.maybe_directory():
-            text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
+            try:
+                text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
+            except (RequestException, ReadTimeoutError, socket.timeout) as e:
+                self.crawler.current_processed_count += 1
+                self.crawler.results.put(Error(self, e))
+                self.close()
+                return self
             soup = BeautifulSoup(text, 'html.parser')
         if self.maybe_directory():
             processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
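
The except branch mirrors how the crawler already handles a failed request: the URL still counts toward current_processed_count so progress tracking stays accurate, the exception is queued as an Error result so it is reported instead of silently dropped, and the early return skips the BeautifulSoup parse, for which text would never have been assigned. A standalone sketch of the same recovery pattern; Error, read_body, and the counters dict are hypothetical stand-ins for dirhunt's internals:

import socket
from queue import Queue

from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

MAX_RESPONSE_SIZE = 512 * 1024  # hypothetical cap; dirhunt defines its own


class Error:
    """Hypothetical result wrapper pairing a crawl task with its exception."""
    def __init__(self, task, exception):
        self.task = task
        self.exception = exception


def read_body(task, resp, results, counters):
    """Return the (possibly truncated) body, or None if the read failed."""
    try:
        return resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
    except (RequestException, ReadTimeoutError, socket.timeout) as e:
        counters['processed'] = counters.get('processed', 0) + 1  # still processed
        results.put(Error(task, e))  # surface the failure as a result item
        return None                  # caller skips parsing for this URL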
