From cabe0527024e32c58a1558d11e6fbff98d7d4272 Mon Sep 17 00:00:00 2001 From: "Sean T. Allen" Date: Sat, 24 Feb 2024 23:00:28 +0000 Subject: [PATCH] Optimization --- htmlproofer/plugin.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/htmlproofer/plugin.py b/htmlproofer/plugin.py index 7451f9c..a35872e 100644 --- a/htmlproofer/plugin.py +++ b/htmlproofer/plugin.py @@ -3,7 +3,7 @@ import os.path import pathlib import re -from typing import List, Optional, Set +from typing import Dict, List, Optional, Set import urllib.parse import uuid @@ -100,6 +100,13 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: use_directory_urls = config.data["use_directory_urls"] + # Optimization: At this point, we have all the files, so we can create + # a dictionary for faster lookups. Prior to this point, files are + # still being updated, so creating a dictionary before now would result + # in incorrect values appearing as keys. 
+ opt_files = {} + opt_files.update({os.path.normpath(file.url): file for file in self.files}) + # Optimization: only parse links and headings # li, sup are used for footnotes strainer = SoupStrainer(('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'sup', 'img')) @@ -122,7 +129,7 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: log_warning(f"ignoring URL {url} from {page.file.src_path}") continue - url_status = self.get_url_status(url, page.file.src_path, all_element_ids, self.files, use_directory_urls) + url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files, use_directory_urls) if self.bad_url(url_status) and self.is_error(self.config, url, url_status): self.report_invalid_url(url, url_status, page.file.src_path) @@ -161,7 +168,7 @@ def get_url_status( url: str, src_path: str, all_element_ids: Set[str], - files: List[File], + files: Dict[str, File], use_directory_urls: bool ) -> int: if any(pat.match(url) for pat in LOCAL_PATTERNS): @@ -188,7 +195,7 @@ def get_url_status( return 0 @staticmethod - def is_url_target_valid(url: str, src_path: str, files: List[File]) -> bool: + def is_url_target_valid(url: str, src_path: str, files: Dict[str, File]) -> bool: match = MARKDOWN_ANCHOR_PATTERN.match(url) if match is None: return True @@ -209,7 +216,7 @@ def is_url_target_valid(url: str, src_path: str, files: List[File]) -> bool: return True @staticmethod - def find_target_markdown(url: str, src_path: str, files: List[File]) -> Optional[str]: + def find_target_markdown(url: str, src_path: str, files: Dict[str, File]) -> Optional[str]: """From a built URL, find the original Markdown source from the project that built it.""" file = HtmlProoferPlugin.find_source_file(url, src_path, files) @@ -218,7 +225,7 @@ def find_target_markdown(url: str, src_path: str, files: List[File]) -> Optional return None @staticmethod - def find_source_file(url: str, src_path: str, files: List[File]) -> Optional[File]: + def 
find_source_file(url: str, src_path: str, files: Dict[str, File]) -> Optional[File]: """From a built URL, find the original file from the project that built it.""" if len(url) > 1 and url[0] == '/': @@ -229,13 +236,9 @@ def find_source_file(url: str, src_path: str, files: List[File]) -> Optional[Fil src_dir = urllib.parse.quote(str(pathlib.Path(src_path).parent), safe='/\\') search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url))) - for file in files: - # Need to call normpath on the url to get the Windows tests to - # pass. This might be required for other platforms as well, but - # based on the tests, it seems to be required for Windows only. - if os.path.normpath(file.url) == search_path: - return file - else: + try: + return files[search_path] + except KeyError: return None @staticmethod