From 8b9a8c992f31b9eb78cc924fc2b3747856154f85 Mon Sep 17 00:00:00 2001 From: Mennaruuk <52135169+Mennaruuk@users.noreply.github.com> Date: Fri, 11 Feb 2022 00:50:12 -0500 Subject: [PATCH] Fix problems --- twayback.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/twayback.py b/twayback.py index 5fc8fa5..65ef3d5 100644 --- a/twayback.py +++ b/twayback.py @@ -1,11 +1,10 @@ -import requests, re, os, argparse, sys, subprocess, time +import requests, re, os, argparse, sys, waybackpack, time, subprocess from requests import Session session = Session() from tqdm import tqdm import colorama from colorama import Fore, Back, Style colorama.init(autoreset=True) -from rich.progress import track os.system('cls') parser = argparse.ArgumentParser() @@ -21,9 +20,9 @@ data1 =f"https://twitter.com/{username}" results = [] headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'} + response = session.get(data1, headers=headers, allow_redirects=False) status_code = response.status_code - if status_code == 200: print(Back.GREEN + Fore.WHITE + f"Account is ACTIVE\n") time.sleep(1) @@ -31,19 +30,22 @@ print(Back.RED + Fore.WHITE + f"Account is SUSPENDED. This means all of {Back.WHITE + Fore.RED + username + Back.RED + Fore.WHITE}'s Tweets will be downloaded.\n") time.sleep(3) else: - print(Back.YELLOW + Fore.WHITE + f"No one currently has this handle. Twayback will search for a history of this handle's Tweets.\n") + print(Back.RED + Fore.WHITE + f"No one currently has this handle. Twayback will search for a history of this handle's Tweets.\n") time.sleep(2) stuck = "(Don't worry, Twayback isn't stuck!" + print(f"Please wait. Twayback is searching far and wide for deleted tweets from {username}.\nDrink some delicious coffee while this gets done.\n\n{Back.MAGENTA + stuck + Fore.WHITE}\nDepending on the number of Tweets, this step might take several minutes.)\n") +print(f"Grabbing links for Tweets from the Wayback Machine...\n") + link = f"https://web.archive.org/cdx/search/cdx?url=twitter.com/{username}/status&matchType=prefix&filter=statuscode:200&from={fromdate}&to={todate}" data2 = [] -blocklist = [] + c = session.get(link).text urls = re.findall(r'https?://(?:www\.)?(?:mobile\.)?twitter\.com/(?:#!/)?\w+/status(?:es)?/\d+', c) +blocklist = [] blocks = re.findall(r'Blocked', c) - for block in blocks: blocklist.append(f"{block}") if any("Blocked" in s for s in blocklist): @@ -52,9 +54,6 @@ else: pass -for url in urls: - data2.append(f"{url}") - username_character_count = len(username) if username_character_count == 1: data2 = [g for g in data2 if len(str(g)) <= 48] @@ -88,9 +87,12 @@ data2 = [g for g in data2 if len(str(g)) <= 62] else: pass + +for url in urls: + data2.append(f"{url}") # Remove duplicate URLs -data3 = list(dict.fromkeys(data2)) +data3 = list(set(data2)) number_of_elements = len(data3) if number_of_elements >= 1000: @@ -100,9 +102,9 @@ # Obtain status codes results = [] -headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'} +headers = {'user-agent':'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'} -for url in track(data3): +for url in tqdm(data3): response = session.get(url, headers=headers) status_code = response.status_code results.append((url, status_code)) @@ -114,10 +116,10 @@ # Filter for only deleted Tweets data4 = [g for g in data3 if " 404" in g] data5 = [g.replace(' 404', '') for g in data4] -data6 = [g for g in data5 if "," not in g ] + os.system('cls') -number_of_elements = len(data6) +number_of_elements = len(data5) if number_of_elements == 1: answer = input(f"\n{number_of_elements} deleted Tweet has been found.\nWould you like to download it? Type yes or no. Then press Enter. \n") @@ -126,15 +128,13 @@ exit() else: answer = input(f"\n{number_of_elements} deleted Tweets have been found.\nWould you like to download them all? Type yes or no. Then press Enter. \n") - os.system('cls') - # Use waybackpack to download URLs if answer.lower() == 'yes': - for url in tqdm(data6, position=0, leave=True): + for url in tqdm(data5, position=0, leave=True): subprocess.run(f"waybackpack -d {username} --uniques-only --raw {url}", text=True, capture_output=True) with open(f'{username}/{username}.txt', 'w') as file: - for row in data6: + for row in data5: s = "".join(map(str, row)) file.write(s+'\n') print(f"\nAll Tweets have been successfully downloaded!\nThey can be found as HTML files inside the folder {Back.MAGENTA + Fore.WHITE + username + Back.BLACK + Fore.WHITE}.\nAlso, a text file ({username}.txt) is saved, which lists all URLs for the deleted Tweets.")