Commit

Merge pull request #68 from cgeopapa/master
Save output to path
opsdisk authored Jun 4, 2022
2 parents 68ff778 + bd8c73a commit 1797384
Showing 3 changed files with 60 additions and 32 deletions.
26 changes: 17 additions & 9 deletions README.md
@@ -9,9 +9,9 @@ There are 2 parts. The first is `ghdb_scraper.py` that retrieves the latest Goo
 `pagodo.py` that leverages the information gathered by `ghdb_scraper.py`.
 
 The core Google search library now uses the more flexible [yagooglesearch](https://github.com/opsdisk/yagooglesearch)
-instead of [googlesearch](https://github.com/MarioVilas/googlesearch). Check out the
-[yagooglesearch README](https://github.com/opsdisk/yagooglesearch/blob/master/README.md) for a more in-depth explanation
-of the library differences and capabilities.
+instead of [googlesearch](https://github.com/MarioVilas/googlesearch). Check out the [yagooglesearch
+README](https://github.com/opsdisk/yagooglesearch/blob/master/README.md) for a more in-depth explanation of the library
+differences and capabilities.
 
 This version of `pagodo` also has native HTTP(S) and SOCKS5 proxy support, so no more wrapping it in a tool
 like `proxychains4` if you need proxy support. You can specify multiple proxies to use in a round-robin fashion by
@@ -53,9 +53,9 @@ pip install -r requirements.txt
 
 ## ghdb_scraper.py
 
-To start off, `pagodo.py` needs a list of all the current Google dorks. The repo contains a `dorks/` directory with
-the current dorks when the `ghdb_scraper.py` was last run. It's advised to run `ghdb_scraper.py` to get the freshest
-data before running `pagodo.py`. The `dorks/` directory contains:
+To start off, `pagodo.py` needs a list of all the current Google dorks. The repo contains a `dorks/` directory with the
+current dorks from when `ghdb_scraper.py` was last run. It's advised to run `ghdb_scraper.py` to get the freshest data
+before running `pagodo.py`. The `dorks/` directory contains:
 
 * the `all_google_dorks.txt` file, which contains all the Google dorks, one per line
 * the `all_google_dorks.json` file, which is the JSON response from GHDB
@@ -165,8 +165,8 @@ pg = pagodo.Pagodo(
     google_dorks_file="dorks.txt",
     domain="github.com",
     max_search_result_urls_to_return_per_dork=3,
-    save_pagodo_results_to_json_file=True,
-    save_urls_to_file=True,
+    save_pagodo_results_to_json_file=None,  # None = Auto-generate file name, otherwise pass a string for path and filename.
+    save_urls_to_file=None,  # None = Auto-generate file name, otherwise pass a string for path and filename.
     verbosity=5,
 )
 pagodo_results_dict = pg.go()
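A note on the new interface (not part of the diff): each save parameter is now effectively tri-state. A minimal sketch of the distinction, with illustrative values:

```python
import pagodo

pg = pagodo.Pagodo(
    google_dorks_file="dorks.txt",
    max_search_result_urls_to_return_per_dork=3,
    save_pagodo_results_to_json_file="my_results.json",  # A string: save the JSON to exactly this path.
    save_urls_to_file=None,  # None: save URLs to an auto-generated, timestamped file name.
    # False (the CLI default when -o/-s are omitted) disables that output entirely.
    verbosity=5,
)
pagodo_results_dict = pg.go()
```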
@@ -209,6 +209,14 @@ between each different Google dork search.
 results at a time, so if you pick `-m 500`, 5 separate search queries will have to be made for each Google dork search,
 which will increase the amount of time to complete.
 
+### Save Output
+
+`-o [optional/path/to/results.json]` - Save output to a JSON file. If you do not specify a file name, a timestamped one
+will be generated.
+
+`-s [optional/path/to/results.txt]` - Save URLs to a text file. If you do not specify a file name, a timestamped one
+will be generated.
+
 ## Google is blocking me!
 
 Performing 7300+ search requests to Google as fast as possible will simply not work. Google will rightfully detect it
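A note on consuming the `-o` output: pagodo nests per-dork results under a `dorks` key (see `pagodo_results_dict` in the `pagodo.py` changes below). A minimal sketch, with an illustrative file name:

```python
import json

# Illustrative file name; -o with no argument generates a timestamped one.
with open("pagodo_results_2022_06_04.json", encoding="utf-8") as fh:
    results = json.load(fh)

for dork, data in results["dorks"].items():
    if data["urls_size"]:
        print(f"{dork}: {data['urls_size']} URL(s)")
```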
@@ -260,7 +268,7 @@ Throw `proxychains4` in front of the `pagodo.py` script and each *request* looku
 thus source from a different IP).
 
 ```bash
-proxychains4 python pagodo.py -g dorks/all_google_dorks.txt -o -s
+proxychains4 python pagodo.py -g dorks/all_google_dorks.txt -o [optional/path/to/results.json] -s [optional/path/to/results.txt]
 ```
 
 Note that this may not appear natural to Google if you:
62 changes: 41 additions & 21 deletions pagodo.py
@@ -16,7 +16,7 @@
 
 # Custom Python libraries.
 
-__version__ = "2.2.1"
+__version__ = "2.3.1"
 
 # Logging
 ROOT_LOGGER = logging.getLogger("pagodo")
@@ -42,9 +42,9 @@ def __init__(
         google_dorks_file,
         domain="",
         max_search_result_urls_to_return_per_dork=100,
-        save_pagodo_results_to_json_file=False,
+        save_pagodo_results_to_json_file=None,  # None = Auto-generate file name, otherwise pass a string for path and filename.
         proxies="",
-        save_urls_to_file=False,
+        save_urls_to_file=None,  # None = Auto-generate file name, otherwise pass a string for path and filename.
         minimum_delay_between_dork_searches_in_seconds=37,
         maximum_delay_between_dork_searches_in_seconds=60,
         disable_verify_ssl=False,
@@ -79,7 +79,7 @@ def __init__(
         # All passed parameters look good, assign to the class object.
         self.google_dorks_file = google_dorks_file
         self.google_dorks = []
-        with open(google_dorks_file, "r") as fh:
+        with open(google_dorks_file, "r", encoding="utf-8") as fh:
             for line in fh.read().splitlines():
                 if line.strip():
                     self.google_dorks.append(line)
@@ -122,6 +122,14 @@ def __init__(
         self.total_urls_found = 0
         self.proxy_rotation_index = 0
 
+        # -o with no filename. Desire to save results, don't care about the file name.
+        if self.save_pagodo_results_to_json_file is None:
+            self.save_pagodo_results_to_json_file = f"{self.base_file_name}.json"
+
+        # -s with no filename. Desire to save results, don't care about the file name.
+        if self.save_urls_to_file is None:
+            self.save_urls_to_file = f"{self.base_file_name}.txt"
+
         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
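The two `None` checks added above are the heart of the new tri-state behavior. A standalone sketch of the same idea (function and names illustrative, not from this commit; the real `base_file_name` format may differ):

```python
from datetime import datetime

def resolve_save_target(value, extension):
    """False -> saving disabled; None -> auto-generated, timestamped name; str -> used verbatim."""
    if value is None:
        return f"pagodo_results_{datetime.now():%Y_%m_%d_%H_%M_%S}.{extension}"
    return value  # Either False (disabled) or an explicit path string.

print(resolve_save_target(None, "json"))         # e.g. pagodo_results_2022_06_04_12_00_00.json
print(resolve_save_target("loot/r.txt", "txt"))  # loot/r.txt
print(resolve_save_target(False, "txt"))         # False -> caller skips saving
```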

@@ -146,13 +154,10 @@ def go(self):
 
         for dork in self.google_dorks:
 
-            # fmt: off
             self.pagodo_results_dict["dorks"][dork] = {
                 "urls_size": 0,
                 "urls": [],
-
             }
-            # fmt: on
 
             try:
                 dork = dork.strip()
@@ -241,18 +246,16 @@
 
                 # Save URLs with valid results to an .txt file.
                 if self.save_urls_to_file:
-                    with open(f"{self.base_file_name}.txt", "a") as fh:
+                    with open(self.save_urls_to_file, "a") as fh:
                         fh.write(f"# {dork}\n")
                         for url in dork_urls_list:
                             fh.write(f"{url}\n")
                         fh.write("#" * 50 + "\n")
 
-                # fmt: off
                 self.pagodo_results_dict["dorks"][dork] = {
                     "urls_size": dork_urls_list_size,
                     "urls": dork_urls_list,
                 }
-                # fmt: on
 
             # No Google dork results found.
             else:
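The text file written above groups URLs under a `# <dork>` header and closes each block with a line of 50 `#` characters. A small sketch of reading it back (file name illustrative, not from this commit):

```python
# Parse the per-dork URL blocks written by save_urls_to_file.
results = {}
current_dork = None

with open("pagodo_results_2022_06_04.txt", encoding="utf-8") as fh:  # Illustrative file name.
    for line in fh:
        line = line.rstrip("\n")
        if line.startswith("# "):
            current_dork = line[2:]  # A "# <dork>" header opens a block.
            results[current_dork] = []
        elif line.startswith("#"):
            current_dork = None  # The 50-character "#" separator closes the block.
        elif line and current_dork is not None:
            results[current_dork].append(line)  # URL lines belong to the current dork.

print(sum(len(urls) for urls in results.values()), "URLs parsed")
```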
@@ -290,15 +293,27 @@
 
         # Save pagodo_results_dict to a .json file.
         if self.save_pagodo_results_to_json_file:
-            with open(f"{self.base_file_name}.json", "w") as fh:
+            with open(self.save_pagodo_results_to_json_file, "w") as fh:
                 json.dump(self.pagodo_results_dict, fh, indent=4)
 
         return self.pagodo_results_dict
 
 
+# http://stackoverflow.com/questions/3853722/python-argparse-how-to-insert-newline-in-the-help-text
+class SmartFormatter(argparse.HelpFormatter):
+    def _split_lines(self, text, width):
+        if text.startswith("R|"):
+            return text[2:].splitlines()
+        # This is the RawTextHelpFormatter._split_lines
+        return argparse.HelpFormatter._split_lines(self, text, width)
+
+
 if __name__ == "__main__":
 
-    parser = argparse.ArgumentParser(description=f"pagodo - Passive Google Dork v{__version__}")
+    parser = argparse.ArgumentParser(
+        description=f"pagodo - Passive Google Dork v{__version__}",
+        formatter_class=SmartFormatter,
+    )
     parser.add_argument(
         "-g", dest="google_dorks_file", action="store", required=True, help="File containing Google dorks, 1 per line."
     )
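`SmartFormatter` keeps argparse's default help wrapping unless a help string opts in with the `R|` prefix, in which case its embedded newlines are preserved. A self-contained demo (the `-x` flag is illustrative):

```python
import argparse

class SmartFormatter(argparse.HelpFormatter):
    def _split_lines(self, text, width):
        # "R|" opts this help string out of argparse's re-wrapping.
        if text.startswith("R|"):
            return text[2:].splitlines()
        return argparse.HelpFormatter._split_lines(self, text, width)

parser = argparse.ArgumentParser(formatter_class=SmartFormatter)
parser.add_argument("-x", help="R|line one\nline two stays on its own line")
parser.print_help()
```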
@@ -360,22 +375,27 @@
     )
     parser.add_argument(
         "-o",
+        nargs="?",
+        metavar="JSON_FILE",
         dest="save_pagodo_results_to_json_file",
-        action="store_true",
         required=False,
+        action="store",
+        default=False,
-        help=(
-            "Save JSON dictionary to pagodo_results_<TIMESTAMP>.json file. Contains more information than "
-            "pagodo_results_<TIMESTAMP>.txt"
-        ),
+        help="R|Save URL dork data to a JSON file. Contains more information than the .txt version.\n"
+        "no -o = Do not save dork data to a JSON file\n"
+        "-o = Save dork data to pagodo_results_<TIMESTAMP>.json\n"
+        "-o JSON_FILE = Save dork data to JSON_FILE",
     )
     parser.add_argument(
         "-s",
+        nargs="?",
+        metavar="URL_FILE",
         dest="save_urls_to_file",
-        action="store_true",
         required=False,
+        action="store",
+        default=False,
-        help="Save any URLS found for a dork to the pagodo_results_<TIMESTAMP>.txt file.",
+        help="R|Save URL dork data to a text file.\n"
+        "no -s = Do not save dork data to a file\n"
+        "-s = Save dork data to pagodo_results_<TIMESTAMP>.txt\n"
+        "-s URL_FILE = Save dork data to URL_FILE",
     )
     parser.add_argument(
         "-v",
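With `nargs="?"`, `action="store"`, and `default=False`, each flag yields a three-way value: `False` when absent, `None` when given without an argument (later swapped for an auto-generated name in `__init__`), and the supplied string otherwise. A minimal demonstration (flag and dest are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-o", nargs="?", metavar="JSON_FILE", dest="save_json", action="store", default=False)

print(parser.parse_args([]).save_json)                   # False: flag absent, saving disabled.
print(parser.parse_args(["-o"]).save_json)               # None: flag without a value, auto-generate name.
print(parser.parse_args(["-o", "loot.json"]).save_json)  # 'loot.json': explicit path.
```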
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,3 +1,3 @@
-beautifulsoup4==4.10.0
-yagooglesearch==1.6.0
+beautifulsoup4==4.11.1
 requests==2.27.1
+yagooglesearch==1.6.0
