From ce7294d990b4bca98568334d09ab93be11b02376 Mon Sep 17 00:00:00 2001
From: Akash Mahanty <akash3pro@gmail.com>
Date: Fri, 2 Oct 2020 20:27:28 +0530
Subject: [PATCH] Implemented new feature, known urls for domain.

---
 waybackpy/cli.py     | 94 ++++++++++++++++++++++++++++++++++++--------
 waybackpy/wrapper.py | 49 +++++++++++++++++++++++
 2 files changed, 126 insertions(+), 17 deletions(-)

diff --git a/waybackpy/cli.py b/waybackpy/cli.py
index 1ad7623..9db0cb3 100644
--- a/waybackpy/cli.py
+++ b/waybackpy/cli.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
 import sys
+import os
+import re
 import argparse
 from waybackpy.wrapper import Url
 from waybackpy.__version__ import __version__
@@ -31,6 +33,36 @@ def _near(obj, args):
         _near_args["minute"] = args.minute
     return (obj.near(**_near_args))
 
+def _known_urls(obj, args):
+    """Save the URLs known for the domain to a file and return a report.
+
+    obj is the Url object whose known_urls() is queried; args supplies the
+    --subdomain and --alive flags. The URLs are written, one per line, to
+    '<domain>-<count>-urls.txt' in the current working directory. Returns
+    the URL listing followed by a summary line, or a hint if none was found.
+    """
+    url_list = obj.known_urls(
+        alive=bool(args.alive), subdomain=bool(args.subdomain)
+    )
+    total_urls = len(url_list)
+
+    if not total_urls:
+        return "No known URLs found. Please try a different domain!"
+
+    # Name the output file after the host of the first URL found.
+    m = re.search(r"https?://([A-Za-z_0-9.-]+).*", url_list[0])
+    domain = m.group(1) if m else "waybackpy-known"
+    file_name = "%s-%d-urls.txt" % (domain, total_urls)
+    file_path = os.path.join(os.path.abspath(os.getcwd()), file_name)
+    text = "\n".join(url_list) + "\n"
+
+    # Truncate rather than append: the file name carries the URL count, so
+    # appending on a repeated run would duplicate every line in the file.
+    with open(file_path, "w") as f:
+        f.write(text)
+
+    return text + "%d URLs found and saved in ./%s" % (total_urls, file_name)
+
 def _get(obj, args):
     if args.get.lower() == "url":
         return (obj.get())
@@ -52,10 +84,10 @@ def _get(obj, args):
 
 def args_handler(args):
     if args.version:
-        return (__version__)
+        return ("waybackpy version %s" % __version__)
 
     if not args.url:
-        return ("Specify an URL. See --help for help using waybackpy.")
+        return ("waybackpy %s \nSee 'waybackpy --help' for help using this tool." % __version__)
 
     if args.user_agent:
         obj = Url(args.url, args.user_agent)
@@ -72,26 +104,54 @@ def args_handler(args):
         return _total_archives(obj)
     if args.near:
         return _near(obj, args)
+    if args.known_urls:
+        return _known_urls(obj, args)
     if args.get:
         return _get(obj, args)
-    return ("Usage: waybackpy --url [URL] --user_agent [USER AGENT] [OPTIONS]. See --help for help using waybackpy.")
+    return ("You only specified the URL. But you also need to specify the operation.\nSee 'waybackpy --help' for help using this tool.")
 
 def parse_args(argv):
+    """Build the waybackpy argument parser and parse argv, skipping argv[0]."""
     parser = argparse.ArgumentParser()
-    parser.add_argument("-u", "--url", help="URL on which Wayback machine operations would occur.")
-    parser.add_argument("-ua", "--user_agent", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\".")
-    parser.add_argument("-s", "--save", action='store_true', help="Save the URL on the Wayback machine.")
-    parser.add_argument("-o", "--oldest", action='store_true', help="Oldest archive for the specified URL.")
-    parser.add_argument("-n", "--newest", action='store_true', help="Newest archive for the specified URL.")
-    parser.add_argument("-t", "--total", action='store_true', help="Total number of archives for the specified URL.")
-    parser.add_argument("-g", "--get", help="Prints the source code of the supplied url. Use '--get help' for extended usage.")
-    parser.add_argument("-v", "--version", action='store_true', help="Prints the waybackpy version.")
-    parser.add_argument("-N", "--near", action='store_true', help="Latest/Newest archive for the specified URL.")
-    parser.add_argument("-Y", "--year", type=int, help="Year in integer. For use with --near.")
-    parser.add_argument("-M", "--month", type=int, help="Month in integer. For use with --near.")
-    parser.add_argument("-D", "--day", type=int, help="Day in integer. For use with --near.")
-    parser.add_argument("-H", "--hour", type=int, help="Hour in integer. For use with --near.")
-    parser.add_argument("-MIN", "--minute", type=int, help="Minute in integer. For use with --near.")
+
+    url_group = parser.add_argument_group('URL argument (required)')
+    url_group.add_argument("--url", "-u", help="URL on which Wayback machine operations would occur")
+
+    user_agent_group = parser.add_argument_group('User Agent')
+    user_agent_group.add_argument("--user_agent", "-ua", help="User agent, default user_agent is \"waybackpy python package - https://github.com/akamhy/waybackpy\"")
+
+    save_group = parser.add_argument_group("Create new archive/save URL")
+    save_group.add_argument("--save", "-s", action='store_true', help="Save the URL on the Wayback machine")
+
+    oldest_group = parser.add_argument_group("Oldest archive")
+    oldest_group.add_argument("--oldest", "-o", action='store_true', help="Oldest archive for the specified URL")
+
+    newest_group = parser.add_argument_group("Newest archive")
+    newest_group.add_argument("--newest", "-n", action='store_true', help="Newest archive for the specified URL")
+
+    total_group = parser.add_argument_group("Total number of archives")
+    total_group.add_argument("--total", "-t", action='store_true', help="Total number of archives for the specified URL")
+
+    get_group = parser.add_argument_group("Get source code")
+    get_group.add_argument("--get", "-g", help="Prints the source code of the supplied url. Use '--get help' for extended usage")
+
+    known_urls_group = parser.add_argument_group("URLs known and archived to Wayback Machine for the site.")
+    known_urls_group.add_argument("--known_urls", "-ku", action='store_true', help="URLs known for the domain.")
+    known_urls_group.add_argument("--subdomain", "-sub", action='store_true', help="Use with '--known_urls' to include known URLs for subdomains.")
+    known_urls_group.add_argument("--alive", "-a", action='store_true', help="Only include live URLs. Will not include dead links.")
+
+    near_group = parser.add_argument_group('Archive close to time specified')
+    near_group.add_argument("--near", "-N", action='store_true', help="Archive near specified time")
+
+    near_time_group = parser.add_argument_group('Arguments that are used only with --near')
+    near_time_group.add_argument("--year", "-Y", type=int, help="Year in integer")
+    near_time_group.add_argument("--month", "-M", type=int, help="Month in integer")
+    near_time_group.add_argument("--day", "-D", type=int, help="Day in integer.")
+    near_time_group.add_argument("--hour", "-H", type=int, help="Hour in integer")
+    near_time_group.add_argument("--minute", "-MIN", type=int, help="Minute in integer")
+
+    parser.add_argument("--version", "-v", action='store_true', help="Waybackpy version")
+
     return parser.parse_args(argv[1:])
 
 def main(argv=None):
diff --git a/waybackpy/wrapper.py b/waybackpy/wrapper.py
index 1317ac0..8fa1a5b 100644
--- a/waybackpy/wrapper.py
+++ b/waybackpy/wrapper.py
@@ -100,8 +100,10 @@ def get(self, url="", user_agent="", encoding=""):
         """Return the source code of the supplied URL.
         If encoding is not supplied, it is auto-detected from the response.
         """
+        
         if not url:
             url = self._clean_url()
+
         if not user_agent:
             user_agent = self.user_agent
 
@@ -173,3 +175,50 @@ def total_archives(self):
         response = _get_response(req)
         # Most efficient method to count number of archives (yet)
         return str(response.read()).count(",")
+
+    def known_urls(self, alive=False, subdomain=False):
+        """Return the list of URLs the Wayback Machine knows for this domain.
+
+        These URLs are known to exist because they were crawled by the
+        Wayback Machine bots. Useful for pen-testers and others.
+
+        alive     -- if True, try to open every URL and drop the ones that
+                     fail (one extra request per URL, so it can be slow).
+        subdomain -- if True, also include URLs found on subdomains.
+
+        Idea by Mohammed Diaa (https://github.com/mhmdiaa) from:
+        https://gist.github.com/mhmdiaa/adf6bff70142e5091792841d4b372050
+        """
+        # A leading "*." asks the CDX server to match subdomains as well.
+        url_pattern = "%s/*" % self._clean_url()
+        if subdomain:
+            url_pattern = "*." + url_pattern
+
+        # One HTTPS endpoint for both modes; only the url pattern differs.
+        request_url = (
+            "https://web.archive.org/cdx/search/cdx"
+            "?url=%s&output=json&fl=original&collapse=urlkey" % url_pattern
+        )
+
+        hdr = {"User-Agent": "%s" % self.user_agent}
+        req = Request(request_url, headers=hdr)  # nosec
+        response = _get_response(req)
+
+        # The reply is a JSON list of rows; skip the "original" header row.
+        data = json.loads(response.read().decode("UTF-8"))
+        url_list = [y[0] for y in data if y[0] != "original"]
+
+        if not alive:
+            return url_list
+
+        # Keep only the URLs that can still be opened.
+        live_urls = []
+        for url in url_list:
+            try:
+                # Close the response at once so connections are not leaked.
+                urlopen(url).close()  # nosec
+            except Exception:
+                # Not a bare except: Ctrl-C must still abort a long scan.
+                continue
+            live_urls.append(url)
+        return live_urls
\ No newline at end of file