From cabe0527024e32c58a1558d11e6fbff98d7d4272 Mon Sep 17 00:00:00 2001 From: "Sean T. Allen" Date: Sat, 24 Feb 2024 23:00:28 +0000 Subject: [PATCH] Optimization --- htmlproofer/plugin.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/htmlproofer/plugin.py b/htmlproofer/plugin.py index 7451f9c..a35872e 100644 --- a/htmlproofer/plugin.py +++ b/htmlproofer/plugin.py @@ -3,7 +3,7 @@ import os.path import pathlib import re -from typing import List, Optional, Set +from typing import Dict, List, Optional, Set import urllib.parse import uuid @@ -100,6 +100,13 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: use_directory_urls = config.data["use_directory_urls"] + # Optimization: At this point, we have all the files, so we can create + # a dictionary for faster lookups. Prior to this point, files are + # still being updated, so creating a dictionary before now would result + # in incorrect values appearing as keys. 
+ opt_files = {} + opt_files.update({os.path.normpath(file.url): file for file in self.files}) + # Optimization: only parse links and headings # li, sup are used for footnotes strainer = SoupStrainer(('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'sup', 'img')) @@ -122,7 +129,7 @@ def on_post_page(self, output_content: str, page: Page, config: Config) -> None: log_warning(f"ignoring URL {url} from {page.file.src_path}") continue - url_status = self.get_url_status(url, page.file.src_path, all_element_ids, self.files, use_directory_urls) + url_status = self.get_url_status(url, page.file.src_path, all_element_ids, opt_files, use_directory_urls) if self.bad_url(url_status) and self.is_error(self.config, url, url_status): self.report_invalid_url(url, url_status, page.file.src_path) @@ -161,7 +168,7 @@ def get_url_status( url: str, src_path: str, all_element_ids: Set[str], - files: List[File], + files: Dict[str, File], use_directory_urls: bool ) -> int: if any(pat.match(url) for pat in LOCAL_PATTERNS): @@ -188,7 +195,7 @@ def get_url_status( return 0 @staticmethod - def is_url_target_valid(url: str, src_path: str, files: List[File]) -> bool: + def is_url_target_valid(url: str, src_path: str, files: Dict[str, File]) -> bool: match = MARKDOWN_ANCHOR_PATTERN.match(url) if match is None: return True @@ -209,7 +216,7 @@ def is_url_target_valid(url: str, src_path: str, files: List[File]) -> bool: return True @staticmethod - def find_target_markdown(url: str, src_path: str, files: List[File]) -> Optional[str]: + def find_target_markdown(url: str, src_path: str, files: Dict[str, File]) -> Optional[str]: """From a built URL, find the original Markdown source from the project that built it.""" file = HtmlProoferPlugin.find_source_file(url, src_path, files) @@ -218,7 +225,7 @@ def find_target_markdown(url: str, src_path: str, files: List[File]) -> Optional return None @staticmethod - def find_source_file(url: str, src_path: str, files: List[File]) -> Optional[File]: + def 
find_source_file(url: str, src_path: str, files: Dict[str, File]) -> Optional[File]: """From a built URL, find the original file from the project that built it.""" if len(url) > 1 and url[0] == '/': @@ -229,13 +236,9 @@ def find_source_file(url: str, src_path: str, files: List[File]) -> Optional[Fil src_dir = urllib.parse.quote(str(pathlib.Path(src_path).parent), safe='/\\') search_path = os.path.normpath(str(pathlib.Path(src_dir) / pathlib.Path(url))) - for file in files: - # Need to call normpath on the url to get the Windows tests to - # pass. This might be required for other platforms as well, but - # based on the tests, it seems to be required for Windows only. - if os.path.normpath(file.url) == search_path: - return file - else: + try: + return files[search_path] + except KeyError: return None @staticmethod