Issue #59: Error in resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
Nekmo committed Oct 22, 2018
1 parent c551062 commit bbf0fa6
Showing 1 changed file with 10 additions and 1 deletion.
dirhunt/crawler_url.py: 10 additions & 1 deletion
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
+import socket
 
 from bs4 import BeautifulSoup
+from requests import RequestException
+from urllib3.exceptions import ReadTimeoutError
 
 from dirhunt.url import Url
 from dirhunt.url_loop import is_url_loop
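
For context on the new imports: a read from resp.raw can fail at three different layers. requests wraps most transport errors in RequestException, but resp.raw is the underlying urllib3 response, so reading it directly bypasses requests' exception translation and lets urllib3's ReadTimeoutError and the stdlib's socket.timeout escape. A minimal sketch of the situation; the URL and timeout are illustrative, not part of the commit:

import socket

import requests
from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

# stream=True defers the body download, so failures can surface later,
# at the moment resp.raw is actually read.
resp = requests.get('http://example.com/', stream=True, timeout=5)
try:
    # Reading the urllib3 response directly bypasses requests' exception
    # wrapping, so all three error types must be caught explicitly.
    chunk = resp.raw.read(1024, decode_content=True)
except (RequestException, ReadTimeoutError, socket.timeout):
    chunk = b''  # fall back to an empty body on a failed read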
@@ -62,7 +65,13 @@ def start(self):

         processor = None
         if resp.status_code < 300 and self.maybe_directory():
-            text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
+            try:
+                text = resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
+            except (RequestException, ReadTimeoutError, socket.timeout) as e:
+                self.crawler.current_processed_count += 1
+                self.crawler.results.put(Error(self, e))
+                self.close()
+                return self
             soup = BeautifulSoup(text, 'html.parser')
         if self.maybe_directory():
             processor = get_processor(resp, text, self, soup) or GenericProcessor(resp, self)
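
The except branch mirrors how the crawler already handles a failed request: the URL still counts toward current_processed_count so progress tracking stays accurate, the exception is queued as an Error result so it is reported instead of silently dropped, and the early return skips the BeautifulSoup parse, for which text would never have been assigned. A standalone sketch of the same recovery pattern; Error, read_body, and the counters dict are hypothetical stand-ins for dirhunt's internals:

import socket
from queue import Queue

from requests import RequestException
from urllib3.exceptions import ReadTimeoutError

MAX_RESPONSE_SIZE = 512 * 1024  # hypothetical cap; dirhunt defines its own


class Error:
    """Hypothetical result wrapper pairing a crawl task with its exception."""
    def __init__(self, task, exception):
        self.task = task
        self.exception = exception


def read_body(task, resp, results, counters):
    """Return the (possibly truncated) body, or None if the read failed."""
    try:
        return resp.raw.read(MAX_RESPONSE_SIZE, decode_content=True)
    except (RequestException, ReadTimeoutError, socket.timeout) as e:
        counters['processed'] = counters.get('processed', 0) + 1  # still processed
        results.put(Error(task, e))  # surface the failure as a result item
        return None                  # caller skips parsing for this URL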
