Commit 8b9a8c9

Fix problems
Mennaruuk authored Feb 11, 2022
1 parent 57afd9e commit 8b9a8c9
Showing 1 changed file with 18 additions and 18 deletions.
36 changes: 18 additions & 18 deletions twayback.py
@@ -1,11 +1,10 @@
-import requests, re, os, argparse, sys, subprocess, time
+import requests, re, os, argparse, sys, waybackpack, time, subprocess
 from requests import Session
 session = Session()
 from tqdm import tqdm
 import colorama
 from colorama import Fore, Back, Style
 colorama.init(autoreset=True)
-from rich.progress import track
 os.system('cls')

 parser = argparse.ArgumentParser()
@@ -21,29 +20,32 @@
 data1 =f"https://twitter.com/{username}"
 results = []
 headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}

 response = session.get(data1, headers=headers, allow_redirects=False)
 status_code = response.status_code

 if status_code == 200:
     print(Back.GREEN + Fore.WHITE + f"Account is ACTIVE\n")
     time.sleep(1)
 elif status_code == 302:
     print(Back.RED + Fore.WHITE + f"Account is SUSPENDED. This means all of {Back.WHITE + Fore.RED + username + Back.RED + Fore.WHITE}'s Tweets will be downloaded.\n")
     time.sleep(3)
 else:
-    print(Back.YELLOW + Fore.WHITE + f"No one currently has this handle. Twayback will search for a history of this handle's Tweets.\n")
+    print(Back.RED + Fore.WHITE + f"No one currently has this handle. Twayback will search for a history of this handle's Tweets.\n")
     time.sleep(2)

 stuck = "(Don't worry, Twayback isn't stuck!"

 print(f"Please wait. Twayback is searching far and wide for deleted tweets from {username}.\nDrink some delicious coffee while this gets done.\n\n{Back.MAGENTA + stuck + Fore.WHITE}\nDepending on the number of Tweets, this step might take several minutes.)\n")

 print(f"Grabbing links for Tweets from the Wayback Machine...\n")

 link = f"https://web.archive.org/cdx/search/cdx?url=twitter.com/{username}/status&matchType=prefix&filter=statuscode:200&from={fromdate}&to={todate}"
 data2 = []
+blocklist = []

 c = session.get(link).text
 urls = re.findall(r'https?://(?:www\.)?(?:mobile\.)?twitter\.com/(?:#!/)?\w+/status(?:es)?/\d+', c)
-blocklist = []
 blocks = re.findall(r'Blocked', c)

 for block in blocks:
     blocklist.append(f"{block}")
 if any("Blocked" in s for s in blocklist):
@@ -52,9 +54,6 @@
 else:
     pass

-for url in urls:
-    data2.append(f"{url}")
-
 username_character_count = len(username)
 if username_character_count == 1:
     data2 = [g for g in data2 if len(str(g)) <= 48]
@@ -88,9 +87,12 @@
     data2 = [g for g in data2 if len(str(g)) <= 62]
 else:
     pass

+for url in urls:
+    data2.append(f"{url}")
+
 # Remove duplicate URLs
-data3 = list(dict.fromkeys(data2))
+data3 = list(set(data2))

 number_of_elements = len(data3)
 if number_of_elements >= 1000:
@@ -100,9 +102,9 @@

 # Obtain status codes
 results = []
-headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}
+headers = {'user-agent':'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'}

-for url in track(data3):
+for url in tqdm(data3):
     response = session.get(url, headers=headers)
     status_code = response.status_code
     results.append((url, status_code))
@@ -114,10 +116,10 @@
 # Filter for only deleted Tweets
 data4 = [g for g in data3 if " 404" in g]
 data5 = [g.replace(' 404', '') for g in data4]
-data6 = [g for g in data5 if "," not in g ]

 os.system('cls')

-number_of_elements = len(data6)
+number_of_elements = len(data5)

 if number_of_elements == 1:
     answer = input(f"\n{number_of_elements} deleted Tweet has been found.\nWould you like to download it? Type yes or no. Then press Enter. \n")
@@ -126,15 +128,13 @@
         exit()
 else:
     answer = input(f"\n{number_of_elements} deleted Tweets have been found.\nWould you like to download them all? Type yes or no. Then press Enter. \n")

 os.system('cls')

 # Use waybackpack to download URLs
 if answer.lower() == 'yes':
-    for url in tqdm(data6, position=0, leave=True):
+    for url in tqdm(data5, position=0, leave=True):
         subprocess.run(f"waybackpack -d {username} --uniques-only --raw {url}", text=True, capture_output=True)
     with open(f'{username}/{username}.txt', 'w') as file:
-        for row in data6:
+        for row in data5:
             s = "".join(map(str, row))
             file.write(s+'\n')
     print(f"\nAll Tweets have been successfully downloaded!\nThey can be found as HTML files inside the folder {Back.MAGENTA + Fore.WHITE + username + Back.BLACK + Fore.WHITE}.\nAlso, a text file ({username}.txt) is saved, which lists all URLs for the deleted Tweets.")
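Note on the deduplication change (list(dict.fromkeys(data2)) to list(set(data2))): both remove duplicates, but dict.fromkeys preserves first-seen order (dicts keep insertion order since Python 3.7), while set makes no ordering guarantee, so the order of downloaded Tweets and of lines in the output text file can vary between runs. A quick illustration:

urls = ["https://twitter.com/a/status/2",
        "https://twitter.com/a/status/1",
        "https://twitter.com/a/status/2"]
print(list(dict.fromkeys(urls)))  # status/2 then status/1: first-seen order kept
print(list(set(urls)))            # the same two URLs, in arbitrary order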

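Note on the download step: the commit shells out to the waybackpack CLI with a single formatted string. A list-form subprocess.run call with the same flags sidesteps shell-quoting surprises if a handle or URL ever contains unusual characters; a sketch, assuming the waybackpack CLI is on PATH, with placeholder values:

import subprocess

username = "example_user"  # placeholder
url = "https://twitter.com/example_user/status/1"  # placeholder
subprocess.run(["waybackpack", "-d", username, "--uniques-only", "--raw", url],
               text=True, capture_output=True)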