From 48cef6e931ff73347ad6c8b1b42ed338a2ebd963 Mon Sep 17 00:00:00 2001 From: Francesco Poldi Date: Thu, 19 Dec 2019 21:39:28 +0100 Subject: [PATCH 1/5] Added field has_more_items --- twint/feed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twint/feed.py b/twint/feed.py index 7caea706..50dd9253 100644 --- a/twint/feed.py +++ b/twint/feed.py @@ -43,4 +43,4 @@ def Json(response): html = json_response["items_html"] soup = BeautifulSoup(html, "html.parser") feed = soup.find_all("div", "tweet") - return feed, json_response["min_position"] + return feed, json_response["min_position"], json_response["has_more_items"] From b94c64d873c444d55cbfa99d46969a1689e92005 Mon Sep 17 00:00:00 2001 From: Francesco Poldi Date: Thu, 19 Dec 2019 21:41:09 +0100 Subject: [PATCH 2/5] Cleaned run.Feed, added check has_more_items --- twint/run.py | 113 +++++++++++++++++++++++++++------------------------ 1 file changed, 60 insertions(+), 53 deletions(-) diff --git a/twint/run.py b/twint/run.py index 72fa1bc6..e67eadc8 100644 --- a/twint/run.py +++ b/twint/run.py @@ -21,6 +21,8 @@ def __init__(self, config): self.feed = [-1] self.count = 0 + self.consecutive_errors_count = 0 + self.has_more_items = True self.user_agent = "" self.config = config self.conn = db.Conn(config.Database) @@ -44,64 +46,66 @@ def get_resume(self, resumeFile): async def Feed(self): logme.debug(__name__+':Twint:Feed') - consecutive_errors_count = 0 - while True: - response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)]) - if self.config.Debug: - print(response, file=open("twint-last-request.log", "w", encoding="utf-8")) + + response = await get.RequestUrl(self.config, self.init, headers=[("User-Agent", self.user_agent)]) - if self.config.Resume: - print(self.init, file=open(self.config.Resume, "w", encoding="utf-8")) - - self.feed = [] - try: - if self.config.Favorites: + if self.config.Debug: + print(response, file=open("twint-last-request.log", "w", encoding="utf-8")) + if self.config.Resume: + print(self.init, file=open(self.config.Resume, "w", encoding="utf-8")) + + self.feed = [] + try: + if self.config.Favorites: + self.feed, self.init = feed.Mobile(response) + if not self.count%40: + time.sleep(5) + elif self.config.Followers or self.config.Following: + self.feed, self.init = feed.Follow(response) + if not self.count%40: + time.sleep(5) + elif self.config.Profile: + if self.config.Profile_full: self.feed, self.init = feed.Mobile(response) - if not self.count%40: - time.sleep(5) - elif self.config.Followers or self.config.Following: - self.feed, self.init = feed.Follow(response) - if not self.count%40: - time.sleep(5) - elif self.config.Profile: - if self.config.Profile_full: - self.feed, self.init = feed.Mobile(response) - else: - self.feed, self.init = feed.profile(response) - elif self.config.TwitterSearch: - self.feed, self.init = feed.Json(response) - break - except TimeoutError as e: - if self.config.Proxy_host.lower() == "tor": - print("[?] Timed out, changing Tor identity...") - if self.config.Tor_control_password is None: - logme.critical(__name__+':Twint:Feed:tor-password') - sys.stderr.write("Error: config.Tor_control_password must be set for proxy autorotation!\r\n") - sys.stderr.write("Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly\r\n") - break - else: - get.ForceNewTorIdentity(self.config) - continue else: - logme.critical(__name__+':Twint:Feed:' + str(e)) - print(str(e)) - break - except Exception as e: - if self.config.Profile or self.config.Favorites: - print("[!] Twitter does not return more data, scrape stops here.") - break - logme.critical(__name__+':Twint:Feed:noData' + str(e)) - # Sometimes Twitter says there is no data. But it's a lie. - consecutive_errors_count += 1 - if consecutive_errors_count < self.config.Retries_count: - self.user_agent = await get.RandomUserAgent() - continue - logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e)) - print(str(e) + " [x] run.Feed") - print("[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!") - break + self.feed, self.init = feed.profile(response) + elif self.config.TwitterSearch: + self.feed, self.init, _has_more_items = feed.Json(response) + if (not self.feed) and self.has_more_items: + await self.Feed() + self.has_more_items = _has_more_items + return + except TimeoutError as e: + if self.config.Proxy_host.lower() == "tor": + print("[?] Timed out, changing Tor identity...") + if self.config.Tor_control_password is None: + logme.critical(__name__+':Twint:Feed:tor-password') + sys.stderr.write("Error: config.Tor_control_password must be set for proxy autorotation!\r\n") + sys.stderr.write("Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors-controller-interface-directly\r\n") + exit(1) + else: + get.ForceNewTorIdentity(self.config) + await self.Feed() + else: + logme.critical(__name__+':Twint:Feed:' + str(e)) + exit(str(e)) + except Exception as e: + if self.config.Profile or self.config.Favorites: + exit("[!] Twitter does not return more data, scrape stops here.") + logme.critical(__name__+':Twint:Feed:noData' + str(e)) + # Sometimes Twitter says there is no data. But it's a lie. + self.consecutive_errors_count += 1 + if self.consecutive_errors_count < self.config.Retries_count: + self.user_agent = await get.RandomUserAgent() + time.sleep(5) + await self.Feed() + logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e)) + exit(str(e) + " [x] run.Feed\n"+ + "[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!") + async def follow(self): + self.consecutive_errors_count = 0 await self.Feed() if self.config.User_full: logme.debug(__name__+':Twint:follow:userFull') @@ -114,11 +118,13 @@ async def follow(self): await output.Username(username, self.config, self.conn) async def favorite(self): + self.consecutive_errors_count = 0 logme.debug(__name__+':Twint:favorite') await self.Feed() self.count += await get.Multi(self.feed, self.config, self.conn) async def profile(self): + self.consecutive_errors_count = 0 await self.Feed() if self.config.Profile_full: logme.debug(__name__+':Twint:profileFull') @@ -130,6 +136,7 @@ async def profile(self): await output.Tweets(tweet, self.config, self.conn) async def tweets(self): + self.consecutive_errors_count = 0 await self.Feed() if self.config.Location: logme.debug(__name__+':Twint:tweets:location') From a658326a2c4bfc1169736f7f98bc3c868a26a4dc Mon Sep 17 00:00:00 2001 From: Francesco Poldi Date: Thu, 19 Dec 2019 21:47:54 +0100 Subject: [PATCH 3/5] Basically just wait if TwitterSearch breaks --- twint/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twint/run.py b/twint/run.py index e67eadc8..e4cc6572 100644 --- a/twint/run.py +++ b/twint/run.py @@ -96,7 +96,7 @@ async def Feed(self): # Sometimes Twitter says there is no data. But it's a lie. self.consecutive_errors_count += 1 if self.consecutive_errors_count < self.config.Retries_count: - self.user_agent = await get.RandomUserAgent() + self.user_agent = await get.RandomUserAgent(wa=True if self.config.TwitterSearch else False) time.sleep(5) await self.Feed() logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e)) From bcbd8f49e78eafcf5a7fd56a262a3ec87e056992 Mon Sep 17 00:00:00 2001 From: Francesco Poldi Date: Thu, 19 Dec 2019 22:00:43 +0100 Subject: [PATCH 4/5] Added deep log for requests --- twint/run.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/twint/run.py b/twint/run.py index e4cc6572..a6859975 100644 --- a/twint/run.py +++ b/twint/run.py @@ -23,6 +23,7 @@ def __init__(self, config): self.count = 0 self.consecutive_errors_count = 0 self.has_more_items = True + self._has_more_items = True self.user_agent = "" self.config = config self.conn = db.Conn(config.Database) @@ -51,6 +52,8 @@ async def Feed(self): if self.config.Debug: print(response, file=open("twint-last-request.log", "w", encoding="utf-8")) + print(f"had_more_items:{self._has_more_items};has_more_items:{self.has_more_items};init:{self.init};len_feed:{len(self.feed)}", + file=open("twint-requests-deep.csv", 'a')) if self.config.Resume: print(self.init, file=open(self.config.Resume, "w", encoding="utf-8")) @@ -70,10 +73,10 @@ async def Feed(self): else: self.feed, self.init = feed.profile(response) elif self.config.TwitterSearch: - self.feed, self.init, _has_more_items = feed.Json(response) + self.feed, self.init, self._has_more_items = feed.Json(response) if (not self.feed) and self.has_more_items: await self.Feed() - self.has_more_items = _has_more_items + self.has_more_items = self._has_more_items return except TimeoutError as e: if self.config.Proxy_host.lower() == "tor": From 573fa6c5df70300a6e4917b3229c7986a1137719 Mon Sep 17 00:00:00 2001 From: Francesco Poldi Date: Thu, 19 Dec 2019 23:11:04 +0100 Subject: [PATCH 5/5] Updated exit calls --- twint/run.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/twint/run.py b/twint/run.py index a6859975..531483ca 100644 --- a/twint/run.py +++ b/twint/run.py @@ -94,7 +94,8 @@ async def Feed(self): exit(str(e)) except Exception as e: if self.config.Profile or self.config.Favorites: - exit("[!] Twitter does not return more data, scrape stops here.") + print("[!] Twitter does not return more data, scrape stops here.") + return logme.critical(__name__+':Twint:Feed:noData' + str(e)) # Sometimes Twitter says there is no data. But it's a lie. self.consecutive_errors_count += 1 @@ -103,8 +104,9 @@ async def Feed(self): time.sleep(5) await self.Feed() logme.critical(__name__+':Twint:Feed:Tweets_known_error:' + str(e)) - exit(str(e) + " [x] run.Feed\n"+ + print(str(e) + " [x] run.Feed\n"+ "[!] if get this error but you know for sure that more tweets exist, please open an issue and we will investigate it!") + return async def follow(self):