Skip to content

Commit

Permalink
Merge pull request #113 from nexB/purl2sym-metadata
Browse files Browse the repository at this point in the history
Add metadata support for linux, mtd-utils, barebox, e2fsprogs and erofs-utils
  • Loading branch information
keshav-space authored Mar 28, 2024
2 parents 16b267d + 2170bfe commit 4f957a8
Show file tree
Hide file tree
Showing 85 changed files with 242,884 additions and 2,482 deletions.
164 changes: 133 additions & 31 deletions src/fetchcode/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,22 @@ def get_github_data_for_miniupnp(purl):
)


@router.route("pkg:generic/erofs-utils.*")
def get_github_data_for_erofs_utils(purl):
    """
    Return the package info for erofs-utils from its GitHub repository.

    The generic ``purl`` string is parsed only for its version; that version
    is carried over to a ``github`` purl for the ``erofs/erofs-utils``
    repository, which is then handed to ``GitHubSource.get_package_info``.
    """
    requested_version = PackageURL.from_string(purl).version

    # Same coordinates as the upstream GitHub repository: erofs/erofs-utils.
    repo_purl = PackageURL(
        type="github",
        namespace="erofs",
        name="erofs-utils",
        version=requested_version,
    )
    return GitHubSource.get_package_info(repo_purl)


@router.route("pkg:bitbucket/.*")
def get_bitbucket_data_from_purl(purl):
"""
Expand Down Expand Up @@ -317,9 +333,12 @@ def get_gnu_data_from_purl(purl):
"""Generate `Package` object from the `purl` string of gnu type"""
purl = PackageURL.from_string(purl)
source_archive_url = f"https://ftp.gnu.org/pub/gnu/{purl.name}/"
regex = r"^({}-)([\w.-]*)(.tar.gz)$".format(purl.name)
version_regex_template = r"^({}-)(?P<version>[\w.-]*)(.tar.gz)$"
version_regex = re.compile(version_regex_template.format(re.escape(purl.name)))

yield from extract_packages_from_listing(purl, source_archive_url, regex, [])
yield from extract_packages_from_listing(
purl, source_archive_url, version_regex, []
)


@dataclasses.dataclass
Expand All @@ -333,8 +352,8 @@ class DirectoryListedSource:
"description": "Flag indicating whether the archives are nested within another directory"
},
)
source_archive_regex: str = dataclasses.field(
default="",
source_archive_regex: re.Pattern = dataclasses.field(
default=None,
metadata={
"description": "Regular expression pattern to match files in the directory listing."
},
Expand Down Expand Up @@ -368,7 +387,9 @@ def get_package_info(cls, package_url):
class IpkgDirectoryListedSource(DirectoryListedSource):
source_url = "https://web.archive.org/web/20090326020239/http://handhelds.org/download/packages/ipkg/"
is_nested = False
source_archive_regex = r"^(ipkg[-_])([\w.-]*)(_arm.ipk|.tar.gz)$"
source_archive_regex = re.compile(
r"^(ipkg[-_])(?P<version>[\w.-]*)(_arm.ipk|.tar.gz)$"
)
ignored_files_and_dir = []

@classmethod
Expand Down Expand Up @@ -398,132 +419,163 @@ def get_package_info(cls, package_url):
class UtilLinuxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for util-linux release archives."""

    source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/util-linux/"
    # Archives are nested within sub-directories of ``source_url``.
    is_nested = True
    # Source archive ex: util-linux-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(util-linux-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    ignored_files_and_dir = []


class BusyBoxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for BusyBox release archives."""

    source_url = "https://www.busybox.net/downloads/"
    # Source archive ex: busybox-1.2.3.tar.bz2
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(busybox-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class UclibcDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for uClibc release archives."""

    source_url = "https://www.uclibc.org/downloads/"
    # Source archive ex: uClibc-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(uClibc-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class UclibcNGDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for uClibc-ng release archives."""

    source_url = "https://downloads.uclibc-ng.org/releases/"
    # Source archive ex: uClibc-ng-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(uClibc-ng-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are nested within sub-directories of ``source_url``.
    is_nested = True
    ignored_files_and_dir = []


class Bzip2DirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for bzip2 release archives."""

    source_url = "https://sourceware.org/pub/bzip2/"
    # Source archive ex: bzip2-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(bzip2-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class OpenSSHDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for OpenSSH release archives."""

    source_url = "https://cdn.openbsd.org/pub/OpenBSD/OpenSSH/"
    # Source archive ex: openssh-1.2.3.tar.gz or openssh-1.2.3.tgz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(openssh-)(?P<version>[\w.-]*)(\.tgz|\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class DnsmasqDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for dnsmasq release archives."""

    source_url = "https://thekelleys.org.uk/dnsmasq/"
    # Source archive ex: dnsmasq-1.2.3.tar.gz or dnsmasq-1.2.3.tar.xz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(dnsmasq-)(?P<version>[\w.-]*)(\.tar\.xz|\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class EbtablesDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for ebtables release archives."""

    source_url = "https://www.netfilter.org/pub/ebtables/"
    # Source archive ex: ebtables-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(ebtables-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class HostapdDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for hostapd release archives."""

    source_url = "https://w1.fi/releases/"
    # Source archive ex: hostapd-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(hostapd-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class Iproute2DirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for iproute2 release archives."""

    source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/net/iproute2/"
    # Source archive ex: iproute2-1.2.3.tar.gz or iproute2-1.2.3.tar.xz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(iproute2-)(?P<version>[\w.-]*)(\.tar\.xz|\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class IptablesDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for iptables release archives."""

    source_url = "https://www.netfilter.org/pub/iptables/"
    # Source archive ex: iptables-1.2.3.tar.bz2
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(iptables-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class LibnlDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for libnl release archives."""

    source_url = "https://www.infradead.org/~tgr/libnl/files/"
    # Source archive ex: libnl-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(libnl-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class LighttpdDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for lighttpd 1.4.x release archives."""

    source_url = "https://download.lighttpd.net/lighttpd/releases-1.4.x/"
    # Source archive ex: lighttpd-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(lighttpd-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class NftablesDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for nftables release archives."""

    source_url = "https://www.netfilter.org/pub/nftables/"
    # Source archive ex: nftables-1.2.3.tar.bz2 or nftables-1.2.3.tar.xz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(nftables-)(?P<version>[\w.-]*)(\.tar\.xz|\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class WpaSupplicantDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for wpa_supplicant release archives."""

    source_url = "https://w1.fi/releases/"
    # Source archive ex: wpa_supplicant-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(wpa_supplicant-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class SyslinuxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for syslinux release archives."""

    # NOTE: this class used to be defined twice with identical bodies; the
    # second definition merely rebound the same name, so one copy is enough.
    source_url = "https://mirrors.edge.kernel.org/pub/linux/utils/boot/syslinux/"
    # Source archive ex: syslinux-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(syslinux-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class ToyboxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for toybox release archives."""

    source_url = "http://www.landley.net/toybox/downloads/"
    # Source archive ex: toybox-1.2.3.tar.gz or toybox-1.2.3.tar.bz2
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(toybox-)(?P<version>[\w.-]*)(\.tar\.gz|\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class DropbearDirectoryListedSource(DirectoryListedSource):
source_url = "https://matt.ucc.asn.au/dropbear/releases/"
source_archive_regex = r"^(dropbear-)([\w.-]*)(.tar.bz2|_i386.deb)$"
# Source archive ex: dropbear-1.2.3.tar.bz2
source_archive_regex = re.compile(
r"^(dropbear-)(?P<version>[\w.-]*)(.tar.bz2|_i386.deb)$"
)
is_nested = False
ignored_files_and_dir = [
"dropbear-0.44test1.tar.bz2",
Expand All @@ -539,10 +591,53 @@ class DropbearDirectoryListedSource(DirectoryListedSource):

class SambaDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for stable Samba release archives."""

    source_url = "https://download.samba.org/pub/samba/stable/"
    # Source archive ex: samba-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(samba-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class MtdUtilsDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for mtd-utils release archives."""

    source_url = "https://infraroot.at/pub/mtd/"
    # Source archive ex: mtd-utils-1.2.3.tar.bz2
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(mtd-utils-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []


class BareboxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for barebox release archives."""

    source_url = "https://www.barebox.org/download/"
    # Source archive ex: barebox-1.2.3.tar.bz2
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(barebox-)(?P<version>[\w.-]*)(\.tar\.bz2)$"
    )
    # Archives are listed directly under ``source_url``.
    is_nested = False
    ignored_files_and_dir = []

class LinuxDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for Linux kernel release archives."""

    source_url = "https://cdn.kernel.org/pub/linux/kernel/"
    # Source archive ex: linux-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(linux-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are nested within sub-directories of ``source_url``.
    is_nested = True
    # Entries of the listing that are skipped by name; the trailing "/" is
    # part of the directory name as it appears in the listing.
    ignored_files_and_dir = [
        "Historic/",
        "SillySounds/",
        "crypto/",
        "firmware/",
        "next/",
        "people/",
        "ports/",
        "projects/",
        "testing/",
        "tools/",
        "uemacs/",
    ]

class E2fsprogsDirectoryListedSource(DirectoryListedSource):
    """Directory-listed source for e2fsprogs release archives."""

    source_url = (
        "https://mirrors.edge.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/"
    )
    # Source archive ex: e2fsprogs-1.2.3.tar.gz
    # The extension dots are escaped so they only match a literal ".".
    source_archive_regex = re.compile(
        r"^(e2fsprogs-)(?P<version>[\w.-]*)(\.tar\.gz)$"
    )
    # Archives are nested within sub-directories of ``source_url``.
    is_nested = True
    # Entries of the listing that are skipped by name.
    ignored_files_and_dir = ["testing/"]


DIR_SUPPORTED_PURLS = [
"pkg:generic/busybox.*",
Expand All @@ -566,6 +661,10 @@ class SambaDirectoryListedSource(DirectoryListedSource):
"pkg:generic/util-linux.*",
"pkg:generic/wpa_supplicant.*",
"pkg:generic/ipkg.*",
"pkg:generic/mtd-utils.*",
"pkg:generic/barebox.*",
"pkg:generic/linux.*",
"pkg:generic/e2fsprogs.*",
]

DIR_LISTED_SOURCE_BY_PACKAGE_NAME = {
Expand All @@ -589,6 +688,10 @@ class SambaDirectoryListedSource(DirectoryListedSource):
"util-linux": UtilLinuxDirectoryListedSource,
"wpa_supplicant": WpaSupplicantDirectoryListedSource,
"ipkg": IpkgDirectoryListedSource,
"mtd-utils": MtdUtilsDirectoryListedSource,
"barebox": BareboxDirectoryListedSource,
"linux": LinuxDirectoryListedSource,
"e2fsprogs": E2fsprogsDirectoryListedSource,
}


Expand All @@ -605,18 +708,17 @@ def get_packages_from_listing(purl, source_archive_url, regex, ignored_files_and
"""
Return list of package data from a directory listing based on the specified regex.
"""
pattern = re.compile(regex)
_, listing = htmllistparse.fetch_listing(source_archive_url)

packages = []
for file in listing:
if not pattern.match(file.name) or file.name in ignored_files_and_dir:
match = regex.match(file.name)
if not match or file.name in ignored_files_and_dir:
continue

match = re.search(regex, file.name)
version = match.group(2)
version = match.group("version")
version = version.strip("v").strip()
if not version:
if not version or not version[0].isdigit():
continue

modified_time = file.modified
Expand Down Expand Up @@ -668,7 +770,7 @@ def extract_package_from_nested_listing(purl, source_url, regex, ignored_files_a
"""
_, listing = htmllistparse.fetch_listing(source_url)
for directory in listing:
if not directory.name.endswith("/"):
if not directory.name.endswith("/") or directory.name in ignored_files_and_dir:
continue

directory_url = urljoin(source_url, directory.name)
Expand Down
2 changes: 1 addition & 1 deletion src/fetchcode/package_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def _get_github_packages(purl, version_regex, ignored_tag_regex, default_package
version = tag

version = version.strip("Vv").strip()
if not version:
if not version or not version[0].isdigit():
continue

download_url = archive_download_url.format(
Expand Down
Loading

0 comments on commit 4f957a8

Please sign in to comment.