Skip to content

Commit

Permalink
Implemented new feature, known urls for domain.
Browse files Browse the repository at this point in the history
  • Loading branch information
Akash Mahanty committed Oct 2, 2020
1 parent c9fa114 commit ce7294d
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 17 deletions.
94 changes: 77 additions & 17 deletions waybackpy/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
import os
import re
import argparse
from waybackpy.wrapper import Url
from waybackpy.__version__ import __version__
Expand Down Expand Up @@ -31,6 +33,36 @@ def _near(obj, args):
_near_args["minute"] = args.minute
return (obj.near(**_near_args))

def _known_urls(obj, args):
    """Fetch the URLs known for the domain and save them to a text file.

    ``obj`` must provide ``known_urls(alive=..., subdomain=...)`` returning a
    list of URL strings; ``args`` carries the ``subdomain`` and ``alive``
    command-line flags.

    When at least one URL is returned, the list is written (one URL per line)
    to ``<domain>-<count>-urls.txt`` in the current working directory, where
    ``<domain>`` is the host part of the first URL, or ``waybackpy-known``
    when no host can be extracted.

    Returns the text to print: the URL list followed by a summary line, or a
    hint to try another domain when nothing was found.
    """
    url_list = obj.known_urls(
        alive=bool(args.alive), subdomain=bool(args.subdomain)
    )
    total_urls = len(url_list)

    if total_urls == 0:
        return "No known URLs found. Please try a different domain!"

    # The host of the first URL names the output file.
    m = re.search(r"https?://([A-Za-z_0-9.-]+).*", url_list[0])
    domain = m.group(1) if m else "waybackpy-known"

    file_name = "%s-%d-urls.txt" % (domain, total_urls)
    file_path = os.path.join(os.path.abspath(os.getcwd()), file_name)
    text = "\n".join(url_list) + "\n"

    # Overwrite instead of append: the file name advertises exactly
    # ``total_urls`` entries, so a repeated run must not duplicate the list.
    with open(file_path, "w") as f:
        f.write(text)

    return text + "%d URLs found and saved in ./%s" % (total_urls, file_name)

def _get(obj, args):
if args.get.lower() == "url":
return (obj.get())
Expand All @@ -52,10 +84,10 @@ def _get(obj, args):

def args_handler(args):
if args.version:
return (__version__)
return ("waybackpy version %s" % __version__)

if not args.url:
return ("Specify an URL. See --help for help using waybackpy.")
return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)

if args.user_agent:
obj = Url(args.url, args.user_agent)
Expand All @@ -72,26 +104,54 @@ def args_handler(args):
return _total_archives(obj)
if args.near:
return _near(obj, args)
if args.known_urls:
return _known_urls(obj, args)
if args.get:
return _get(obj, args)
return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")

def parse_args(argv):
    """Build the waybackpy command-line parser and parse ``argv``.

    ``argv`` is the full argument vector; its first element (the program
    name) is skipped before parsing. Options are organised into argument
    groups so that ``--help`` lists them by operation.

    Returns the ``argparse.Namespace`` produced by the parser.
    """
    parser = argparse.ArgumentParser()

    # Each option is registered exactly once; registering the same option
    # strings a second time makes argparse raise a conflict error.
    required_args = parser.add_argument_group('URL argument (required)')
    required_args.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")

    user_agent_arg = parser.add_argument_group('User Agent')
    user_agent_arg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")

    save_arg = parser.add_argument_group("Create new archive/save URL")
    save_arg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")

    oldest_arg = parser.add_argument_group("Oldest archive")
    oldest_arg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")

    newest_arg = parser.add_argument_group("Newest archive")
    newest_arg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")

    total_arg = parser.add_argument_group("Total number of archives")
    total_arg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")

    get_arg = parser.add_argument_group("Get source code")
    get_arg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")

    known_url_arg = parser.add_argument_group("URLs known and archived to Wayback Machine for the site.")
    known_url_arg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.")
    known_url_arg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.")
    known_url_arg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links.")

    near_arg = parser.add_argument_group('Archive close to time specified')
    near_arg.add_argument("--near", "-N", action='store_true', help="Archive near specified time")

    near_args = parser.add_argument_group('Arguments that are used only with --near')
    near_args.add_argument("--year", "-Y", type=int, help="Year in integer")
    near_args.add_argument("--month", "-M", type=int, help="Month in integer")
    near_args.add_argument("--day", "-D", type=int, help="Day in integer.")
    near_args.add_argument("--hour", "-H", type=int, help="Hour in integer")
    near_args.add_argument("--minute", "-MIN", type=int, help="Minute in integer")

    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")

    # argv[0] is the program name, not an option.
    return parser.parse_args(argv[1:])

def main(argv=None):
Expand Down
49 changes: 49 additions & 0 deletions waybackpy/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,10 @@ def get(self, url="", user_agent="", encoding=""):
"""Return the source code of the supplied URL.
If encoding is not supplied, it is auto-detected from the response.
"""

if not url:
url = self._clean_url()

if not user_agent:
user_agent = self.user_agent

Expand Down Expand Up @@ -173,3 +175,50 @@ def total_archives(self):
response = _get_response(req)
# Most efficient method to count number of archives (yet)
return str(response.read()).count(",")

def known_urls(self, alive=False, subdomain=False):
    """Return the list of URLs known to exist for the given domain name
    because these URLs were crawled by WayBack Machine bots.
    Useful for pen-testers and others.
    Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
    https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050

    alive     -- when True, every URL is opened once and only those that
                 open without raising an exception are kept.
    subdomain -- when True, the query is prefixed with "*." so that URLs
                 of subdomains are included as well.
    """

    # Both variants use https so the two code paths hit the same endpoint.
    if subdomain:
        request_url = (
            "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
            % self._clean_url()
        )
    else:
        request_url = (
            "https://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
            % self._clean_url()
        )

    hdr = {"User-Agent": "%s" % self.user_agent}
    req = Request(request_url, headers=hdr)  # nosec
    response = _get_response(req)

    data = json.loads(response.read().decode("UTF-8"))
    # Each row holds a single field; the row equal to "original" is the
    # column header and is dropped. Empty rows are skipped defensively.
    url_list = [y[0] for y in data if y and y[0] != "original"]

    if not alive:
        return url_list

    # Remove all dead URLs from url_list if alive=True
    live_urls = []
    for url in url_list:
        try:
            probe = urlopen(url)  # nosec
        except Exception:
            # Any failure to open the URL counts as dead. Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt/SystemExit stop a long scan.
            continue
        # Only reachability matters; release the connection right away.
        probe.close()
        live_urls.append(url)

    return live_urls

0 comments on commit ce7294d

Please sign in to comment.