From ce7294d990b4bca98568334d09ab93be11b02376 Mon Sep 17 00:00:00 2001
From: Akash Mahanty
Date: Fri, 2 Oct 2020 20:27:28 +0530
Subject: [PATCH] Implemented new feature, known urls for domain.

Add Url.known_urls() to waybackpy/wrapper.py. It asks the CDX search
endpoint on web.archive.org for the original URLs recorded under the
cleaned URL (optionally under its subdomains as well) and returns them as
a list. With alive=True every URL is probed with urlopen() and the ones
that fail to open are dropped.

Expose the feature in the CLI via --known_urls/-ku, --subdomain/-sub and
--alive/-a. The CLI writes the list to DOMAIN-COUNT-urls.txt in the
current working directory and returns the list followed by a summary.

While here, put the argparse options into argument groups and reword the
messages returned for --version, for a missing URL and for a missing
operation.
---
 waybackpy/cli.py     | 94 ++++++++++++++++++++++++++++++++++++--------
 waybackpy/wrapper.py | 49 +++++++++++++++++++++++
 2 files changed, 126 insertions(+), 17 deletions(-)

diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index 1ad7623..9db0cb3 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import sys
+import os
+import re
 import argparse
 from waybackpy.wrapper import Url
 from waybackpy.__version__ import __version__
@@ -31,6 +33,36 @@ def _near(obj, args):
         _near_args["minute"] = args.minute
     return (obj.near(**_near_args))
 
+def _known_urls(obj, args):
+    sd = False
+    al = False
+    if args.subdomain:
+        sd = True
+    if args.alive:
+        al = True
+    url_list = obj.known_urls(alive=al, subdomain=sd)
+    total_urls = len(url_list)
+
+    if total_urls > 0:
+        m = re.search('https?://([A-Za-z_0-9.-]+).*', url_list[0])
+        if m:
+            domain = m.group(1)
+        else:
+            domain = "waybackpy-known"
+        dir_path = os.path.abspath(os.getcwd())
+        file_name = dir_path + "/%s-%d-urls.txt" % (domain, total_urls)
+        text = "\n".join(url_list) + "\n"
+        with open(file_name, "a+") as f:
+            f.write(text)
+        text = text + "%d URLs found and saved in ./%s-%d-urls.txt" % (
+            total_urls, domain, total_urls
+        )
+
+    else:
+        text = "No known URLs found. Please try a different domain!"
+
+    return text
+
 def _get(obj, args):
     if args.get.lower() == "url":
         return (obj.get())
@@ -52,10 +84,10 @@ def _get(obj, args):
 
 def args_handler(args):
     if args.version:
-        return (__version__)
+        return ("waybackpy version %s" % __version__)
 
     if not args.url:
-        return ("Specify an URL. See --help for help using waybackpy.")
+        return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)
 
     if args.user_agent:
         obj = Url(args.url, args.user_agent)
@@ -72,26 +104,54 @@ def args_handler(args):
         return _total_archives(obj)
     if args.near:
         return _near(obj, args)
+    if args.known_urls:
+        return _known_urls(obj, args)
     if args.get:
         return _get(obj, args)
-    return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
+    return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")
 
 def parse_args(argv):
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
-    parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
-    parser.add_argument("-s", "--save", action='store_true', help="Save the URL on the Wayback machine.")
-    parser.add_argument("-o", "--oldest", action='store_true', help="Oldest archive for the specified URL.")
-    parser.add_argument("-n", "--newest", action='store_true', help="Newest archive for the specified URL.")
-    parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
-    parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
-    parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
-    parser.add_argument("-N", "--near", action='store_true', help="Latest/Newest archive for the specified URL.")
-    parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
-    parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
-    parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
-    parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
-    parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
+
+    requiredArgs = parser.add_argument_group('URL argument (required)')
+    requiredArgs.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")
+
+    userAgentArg = parser.add_argument_group('User Agent')
+    userAgentArg.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
+
+    saveArg = parser.add_argument_group("Create new archive/save URL")
+    saveArg.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
+
+    oldestArg = parser.add_argument_group("Oldest archive")
+    oldestArg.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
+
+    newestArg = parser.add_argument_group("Newest archive")
+    newestArg.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
+
+    totalArg = parser.add_argument_group("Total number of archives")
+    totalArg.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
+
+    getArg = parser.add_argument_group("Get source code")
+    getArg.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
+
+    knownUrlArg = parser.add_argument_group("URLs known and archived to Wayback Machine for the site.")
+    knownUrlArg.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.")
+    knownUrlArg.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.")
+    knownUrlArg.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links.")
+
+
+    nearArg = parser.add_argument_group('Archive close to time specified')
+    nearArg.add_argument("--near", "-N", action='store_true', help="Archive near specified time")
+
+    nearArgs = parser.add_argument_group('Arguments that are used only with --near')
+    nearArgs.add_argument("--year", "-Y", type=int, help="Year in integer")
+    nearArgs.add_argument("--month", "-M", type=int, help="Month in integer")
+    nearArgs.add_argument("--day", "-D", type=int, help="Day in integer.")
+    nearArgs.add_argument("--hour", "-H", type=int, help="Hour in integer")
+    nearArgs.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
+
+    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
+
     return parser.parse_args(argv[1:])
 
 def main(argv=None):
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 1317ac0..8fa1a5b 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -100,8 +100,10 @@ def get(self, url="", user_agent="", encoding=""):
         """Return the source code of the supplied URL.
         If encoding is not supplied, it is auto-detected from the response.
         """
+
         if not url:
             url = self._clean_url()
+
         if not user_agent:
             user_agent = self.user_agent
 
@@ -173,3 +175,50 @@ def total_archives(self):
         response = _get_response(req)
         # Most efficient method to count number of archives (yet)
         return str(response.read()).count(",")
+
+    def known_urls(self, alive=False, subdomain=False):
+        """Returns list of URLs known to exist for given domain name
+        because these URLs were crawled by WayBack Machine bots.
+
+        Useful for pen-testers and others.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        """
+
+        url_list = []
+
+        if subdomain:
+            request_url = (
+                "https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+
+        else:
+            request_url = (
+                "https://web.archive.org/cdx/search/cdx?url=%s/*&output=json&fl=original&collapse=urlkey"
+                % self._clean_url()
+            )
+
+        hdr = {"User-Agent": "%s" % self.user_agent}
+        req = Request(request_url, headers=hdr)  # nosec
+        response = _get_response(req)
+
+        data = json.loads(response.read().decode("UTF-8"))
+        url_list = [y[0] for y in data if y[0] != "original"]
+
+        # Remove all dead URLs from url_list if alive=True
+        if alive:
+            tmp_url_list = []
+            for url in url_list:
+
+                try:
+                    urlopen(url)
+                except Exception:
+                    continue
+
+                tmp_url_list.append(url)
+
+            url_list = tmp_url_list
+
+        return url_list
\ No newline at end of file