From 80d3b1eba1e2819ff24ee428b145006669eef424 Mon Sep 17 00:00:00 2001
From: mholt <mholt@pacificbiosciences.com>
Date: Fri, 28 Jul 2023 11:08:49 -0700
Subject: [PATCH 1/3] swaps from github API to expanding the assets

---
 bioconda_utils/hosters.py | 102 +++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 40 deletions(-)

diff --git a/bioconda_utils/hosters.py b/bioconda_utils/hosters.py
index 5fd56c1ef6..c8185247de 100644
--- a/bioconda_utils/hosters.py
+++ b/bioconda_utils/hosters.py
@@ -231,6 +231,36 @@ def error(self, message: str) -> None:
         logger.debug("Error parsing HTML: %s", message)
 
 
+class IncludeFragmentParser(HTMLParser):
+    """Extract include-fragment targets from HTML"""
+    def __init__(self, link_re: Pattern[str]) -> None:
+        super().__init__()
+        self.link_re = link_re
+        self.matches: List[Mapping[str, Any]] = []
+
+    def get_matches(self) -> List[Mapping[str, Any]]:
+        """Return matches found for **link_re** in include-fragment src URLs"""
+        return self.matches
+
+    def handle_starttag(self, tag: str, attrs: List[Tuple[str, str]]) -> None:
+        if tag == "include-fragment":
+            for key, val in attrs:
+                if key == "src":
+                    self.handle_a_href(val)
+                    break
+
+    def handle_a_href(self, href: str) -> None:
+        """Process src attributes of include-fragment tags"""
+        match = self.link_re.search(href)
+        if match:
+            data = match.groupdict()
+            data["href"] = href
+            self.matches.append(data)
+
+    def error(self, message: str) -> None:
+        logger.debug("Error parsing HTML: %s", message)
+
+
 # pylint: disable=abstract-method
 class HTMLHoster(Hoster):
     """Base for Hosters handling release listings in HTML format"""
@@ -326,7 +356,7 @@ class GithubBase(OrderedHTMLHoster):
 class GithubRelease(GithubBase):
     """Matches release artifacts uploaded to Github"""
     link_pattern = r"/{account}/{project}/releases/download/{tag}/{fname}{ext}?"
-    alt_releases_formats = ["https://api.github.com/repos/{account}/{project}/releases"]
+    expanded_assets_pattern = r"https://github.com/{account}/{project}/releases/expanded_assets/{version}"
 
     async def get_versions(self, req, orig_version):
         # first, try the older version when HTML worked
@@ -334,53 +364,45 @@ async def get_versions(self, req, orig_version):
         if len(matches) > 0:
             return matches
 
-        # old version found nothing, try with the alternate github API URLs which return JSON
-        self.releases_urls = [
-            template.format_map(self.vals)
-            for template in self.alt_releases_formats
-        ]
-
-        # this is basically copied from a mixture of the base version and the JSON version
-        # need to compile the link regex
+        # old version found nothing, pull the webpage and expand the assets
+        # this section is basically copied from HTMLHoster, but we need the raw contents of the webpage to look for expanded assets
         exclude = set(self.exclude)
         vals = {key: val
                 for key, val in self.vals.items()
                 if key not in exclude}
+        
+        # this is the pattern for the expanded assets
+        expanded_assets_pattern = replace_named_capture_group(self.expanded_assets_pattern_compiled, vals)
+        expanded_assets_re = re.compile(expanded_assets_pattern)
+
+        # after we expand an asset, we still need to look for the original link pattern within the asset
         link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
         link_re = re.compile(link_pattern)
-
-        # now iterate over the alter release URLs
-        matches = []
+        
+        result = []
         for url in self.releases_urls:
-            text = await req.get_text_from_url(url)
-            data = json.loads(text)
-
-            # structured as an array of tagged releases
-            for tag_dict in data:
-                # each release has an asset dict
-                for asset_dict in tag_dict.get('assets', []):
-                    # there is a direct download link for each asset, which should make the typical pattern for an HTML user
-                    download_url = asset_dict['browser_download_url']
-                    re_match = link_re.search(download_url)
+            # we cannot use the HrefParser because it's not in an <a> tag
+            parser = IncludeFragmentParser(expanded_assets_re)
+            parser.feed(await req.get_text_from_url(url))
             
-                    if re_match:
-                        # this one matches the pattern
-                        # link - just copy the download_url in full
-                        # version - pull out of the regex match
-                        data = re_match.groupdict()
-                        matches.append({
-                            'link' : download_url,
-                            'version' : data['version']
-                        })
-
-        # now strip down to the version(s) that are more recent than what currently is in bioconda
-        num = None
-        for num, match in enumerate(matches):
-            if match["version"] == self.vals["version"]:
-                break
-        if num is None:
-            return matches
-        return matches[:num + 1]
+            # now iterate over each expanded asset we find
+            for match in parser.get_matches():
+                # fetch the expansion and look for the primary URL
+                link_parser = HrefParser(link_re)
+                link_parser.feed(await req.get_text_from_url(match["href"]))
+                
+                for lp_match in link_parser.get_matches():
+                    # we found a match in the expansion
+                    result.append({
+                        'link' : urljoin(url, lp_match["href"]),
+                        'version' : lp_match['version']
+                    })
+
+                if match["version"] == self.vals["version"]:
+                    # we hit the current version, early exit so we do not fetch every expanded asset on the full page
+                    break
+
+        return result
 
 class GithubTag(GithubBase):
     """Matches GitHub repository archives created automatically from tags"""

From 9fc5b49dfa62a04bd6a3aa183fc1b9164822d936 Mon Sep 17 00:00:00 2001
From: mholt <mholt@pacificbiosciences.com>
Date: Mon, 31 Jul 2023 06:01:57 -0700
Subject: [PATCH 2/3] comment to rebump

---
 bioconda_utils/hosters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bioconda_utils/hosters.py b/bioconda_utils/hosters.py
index c8185247de..2e5e3712a1 100644
--- a/bioconda_utils/hosters.py
+++ b/bioconda_utils/hosters.py
@@ -371,7 +371,7 @@ async def get_versions(self, req, orig_version):
                 for key, val in self.vals.items()
                 if key not in exclude}
         
-        # this is the pattern for the expanded assets
+        # this is the pattern for the expanded assets, which auto-expand when viewed via web
         expanded_assets_pattern = replace_named_capture_group(self.expanded_assets_pattern_compiled, vals)
         expanded_assets_re = re.compile(expanded_assets_pattern)
 

From 5193ff2300487a411bae3ae0ca36147ff5896089 Mon Sep 17 00:00:00 2001
From: mholt <mholt@pacificbiosciences.com>
Date: Wed, 9 Aug 2023 06:23:25 -0700
Subject: [PATCH 3/3] adds in the API version for records and future mods

---
 bioconda_utils/hosters.py | 67 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/bioconda_utils/hosters.py b/bioconda_utils/hosters.py
index 2e5e3712a1..0f61c23be0 100644
--- a/bioconda_utils/hosters.py
+++ b/bioconda_utils/hosters.py
@@ -357,14 +357,25 @@ class GithubRelease(GithubBase):
     """Matches release artifacts uploaded to Github"""
     link_pattern = r"/{account}/{project}/releases/download/{tag}/{fname}{ext}?"
     expanded_assets_pattern = r"https://github.com/{account}/{project}/releases/expanded_assets/{version}"
+    alt_releases_formats = ["https://api.github.com/repos/{account}/{project}/releases"]
 
     async def get_versions(self, req, orig_version):
         # first, try the older version when HTML worked
         matches = await super().get_versions(req, orig_version)
         if len(matches) > 0:
             return matches
-
-        # old version found nothing, pull the webpage and expand the assets
+        
+        # now try the expanded webpage parsing, this may break if the HTML page changes in the future
+        matches = await self.get_expanded_versions(req, orig_version)
+        if len(matches) > 0:
+            return matches
+        
+        # now try the github API parsing, this will hit the API rate limit
+        matches = await self.get_api_versions(req, orig_version)
+        return matches
+    
+    async def get_expanded_versions(self, req, orig_version):
+        # this version will parse the releases page and expand sub-pages that are collapsed in the initial download
         # this section is basically copied from HTMLHoster, but we need the raw contents of the webpage to look for expanded assets
         exclude = set(self.exclude)
         vals = {key: val
@@ -404,6 +415,58 @@ async def get_versions(self, req, orig_version):
 
         return result
 
+    async def get_api_versions(self, req, orig_version):
+        # this version searches using the API for releases
+        # TODO: we basically immediately hit the rate limit with this version, we eventually need some long-term persistent memory
+        #   that can track the etags or last-modified so we do not hit this limit except in the initial spin-up
+        #   more information on etags: https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#conditional-requests
+        self.releases_urls = [
+            template.format_map(self.vals)
+            for template in self.alt_releases_formats
+        ]
+
+        # this is basically copied from a mixture of the base version and the JSON version
+        # need to compile the link regex
+        exclude = set(self.exclude)
+        vals = {key: val
+                for key, val in self.vals.items()
+                if key not in exclude}
+        link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
+        link_re = re.compile(link_pattern)
+
+        # now iterate over the alternate release URLs
+        matches = []
+        for url in self.releases_urls:
+            text = await req.get_text_from_url(url)
+            data = json.loads(text)
+
+            # structured as an array of tagged releases
+            for tag_dict in data:
+                # each release has an asset dict
+                for asset_dict in tag_dict.get('assets', []):
+                    # there is a direct download link for each asset, which should match the typical pattern for an HTML user
+                    download_url = asset_dict['browser_download_url']
+                    re_match = link_re.search(download_url)
+            
+                    if re_match:
+                        # this one matches the pattern
+                        # link - just copy the download_url in full
+                        # version - pull out of the regex match
+                        data = re_match.groupdict()
+                        matches.append({
+                            'link' : download_url,
+                            'version' : data['version']
+                        })
+
+        # now strip down to the version(s) that are more recent than what currently is in bioconda
+        num = None
+        for num, match in enumerate(matches):
+            if match["version"] == self.vals["version"]:
+                break
+        if num is None:
+            return matches
+        return matches[:num + 1]
+
 class GithubTag(GithubBase):
     """Matches GitHub repository archives created automatically from tags"""
     link_pattern = r"/{account}/{project}/archive(/refs/tags)?/{tag}{ext}"