From 9517a91192c9de50d58df5c6f41eb5cb370a7ce2 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 22 Nov 2023 13:49:39 -0800 Subject: [PATCH 01/54] Create new django app matchcodeio #224 * Add scancodeio under extra_requires Signed-off-by: Jono Yang --- MANIFEST.in | 7 ++++++- matchcodeio/__init__.py | 0 matchcodeio/dbrouter.py | 32 ++++++++++++++++++++++++++++++++ matchcodeio/settings.py | 35 +++++++++++++++++++++++++++++++++++ matchcodeio/wsgi.py | 25 +++++++++++++++++++++++++ setup.cfg | 12 ++++++++++-- 6 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 matchcodeio/__init__.py create mode 100644 matchcodeio/dbrouter.py create mode 100644 matchcodeio/settings.py create mode 100644 matchcodeio/wsgi.py diff --git a/MANIFEST.in b/MANIFEST.in index ef3721e8..ac7bf4f4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,9 @@ -graft src +graft clearcode +graft clearindex +graft matchcode +graft minecode +graft packagedb +graft purldb include *.LICENSE include NOTICE diff --git a/matchcodeio/__init__.py b/matchcodeio/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcodeio/dbrouter.py b/matchcodeio/dbrouter.py new file mode 100644 index 00000000..fb438c7e --- /dev/null +++ b/matchcodeio/dbrouter.py @@ -0,0 +1,32 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +class PackageDBReadOnlyRouter(object): + app_labels = [ + 'clearcode', + 'clearindex', + 'minecode', + 'matchcode', + 'packagedb', + ] + + def db_for_read(self, model, **hints): + if model._meta.app_label in self.app_labels: + return 'packagedb' + return None + + def db_for_write(self, model, **hints): + return None + + def allow_relation(self, obj1, obj2, **hints): + return None + + def allow_migrate(self, db, app_label, model_name=None, **hints): + return None diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py new file mode 100644 index 00000000..2b1c263c --- /dev/null +++ b/matchcodeio/settings.py @@ -0,0 +1,35 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from scancodeio.settings import * + + +INSTALLED_APPS += [ + "packagedb", + "matchcode", +] + + +# Database + +DATABASES.update( + { + 'packagedb': { + 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), + 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), + 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), + 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), + 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), + 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), + 'ATOMIC_REQUESTS': True, + } + } +) + +DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] diff --git a/matchcodeio/wsgi.py b/matchcodeio/wsgi.py new file mode 100644 index 00000000..2c570b4e --- /dev/null +++ b/matchcodeio/wsgi.py @@ -0,0 +1,25 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. 
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +""" +WSGI config for MatchCode.io. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/dev/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcodeio.settings") + +application = get_wsgi_application() diff --git a/setup.cfg b/setup.cfg index d0889f09..b57377f5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -62,9 +62,12 @@ setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 [options.packages.find] -where = src +where = . [options.extras_require] +matchcodeio = + scancodeio + testing = pytest >= 6, != 7.0.0 pytest-xdist >= 2 @@ -72,8 +75,13 @@ testing = aboutcode-toolkit >= 6.0.0 black mock + scancodeio docs = Sphinx==5.0.2 sphinx-rtd-theme==1.0.0 - doc8==0.11.2 \ No newline at end of file + doc8==0.11.2 + +[options.entry_points] +scancodeio_pipelines = + scan_and_fingerprint_package = matchcode_toolkit.pipelines.scan_and_fingerprint_package:ScanAndFingerprintPackage From 104a8afb6ee659dcba62e878a91dfa60a87e35ba Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 22 Nov 2023 19:17:18 -0800 Subject: [PATCH 02/54] Create Package matching pipeline #224 * Drop aboutcode-toolkit version Signed-off-by: Jono Yang --- matchcode/pipelines/__init__.py | 0 matchcode/pipelines/matching.py | 109 ++++++++ matchcode/pipes/__init__.py | 0 matchcode/pipes/matching.py | 425 ++++++++++++++++++++++++++++++++ matchcodeio/settings.py | 1 - requirements-dev.txt | 2 +- setup.cfg | 2 +- 7 files changed, 536 insertions(+), 3 deletions(-) create mode 100644 matchcode/pipelines/__init__.py create mode 100644 matchcode/pipelines/matching.py create mode 100644 matchcode/pipes/__init__.py create mode 100644 matchcode/pipes/matching.py diff --git a/matchcode/pipelines/__init__.py b/matchcode/pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcode/pipelines/matching.py b/matchcode/pipelines/matching.py new file mode 100644 index 00000000..2dd06ab6 --- /dev/null +++ b/matchcode/pipelines/matching.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
+# Visit https://github.com/nexB/scancode.io for support and download. + +from scanpipe.pipelines.scan_codebase import ScanCodebase +from matchcode.pipes import matching +from scanpipe.pipes import matchcode + + +class Matching(ScanCodebase): + """ + Establish relationships between two code trees: deployment and development. + + This pipeline is expecting 2 archive files with "from-" and "to-" filename + prefixes as inputs: + - "from-[FILENAME]" archive containing the development source code + - "to-[FILENAME]" archive containing the deployment compiled code + """ + + @classmethod + def steps(cls): + return ( + cls.copy_inputs_to_codebase_directory, + cls.extract_archives, + cls.collect_and_create_codebase_resources, + cls.fingerprint_codebase_directories, + cls.flag_empty_files, + cls.flag_ignored_resources, + cls.match_archives_to_purldb, + cls.match_directories_to_purldb, + cls.match_resources_to_purldb, + cls.match_purldb_resources_post_process, + cls.remove_packages_without_resources, + ) + + purldb_package_extensions = [".jar", ".war", ".zip"] + purldb_resource_extensions = [ + ".map", + ".js", + ".mjs", + ".ts", + ".d.ts", + ".jsx", + ".tsx", + ".css", + ".scss", + ".less", + ".sass", + ".soy", + ".class", + ] + + def fingerprint_codebase_directories(self): + """Compute directory fingerprints for matching""" + matchcode.fingerprint_codebase_directories(self.project) + + def match_archives_to_purldb(self): + """Match selected package archives by extension to PurlDB.""" + matching.match_purldb_resources( + project=self.project, + extensions=self.purldb_package_extensions, + matcher_func=matching.match_purldb_package, + logger=self.log, + ) + + def match_directories_to_purldb(self): + """Match selected directories in PurlDB.""" + matching.match_purldb_directories( + project=self.project, + logger=self.log, + ) + + def match_resources_to_purldb(self): + """Match selected files by extension in PurlDB.""" + matching.match_purldb_resources( + project=self.project, + extensions=self.purldb_resource_extensions, + matcher_func=matching.match_purldb_resource, + logger=self.log, + ) + + def match_purldb_resources_post_process(self): + """Choose the best package for PurlDB matched resources.""" + matching.match_purldb_resources_post_process(self.project, logger=self.log) + + def remove_packages_without_resources(self): + """Remove packages without any resources.""" + package_without_resources = self.project.discoveredpackages.filter( + codebase_resources__isnull=True + ) + package_without_resources.delete() diff --git a/matchcode/pipes/__init__.py b/matchcode/pipes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcode/pipes/matching.py b/matchcode/pipes/matching.py new file mode 100644 index 00000000..566cc1d2 --- /dev/null +++ b/matchcode/pipes/matching.py @@ -0,0 +1,425 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from collections import defaultdict + +from django.db.models import Q +from django.template.defaultfilters import pluralize + +from scanpipe import pipes +from scanpipe.pipes import LoopProgress +from scanpipe.pipes import flag +from scanpipe.pipes import js +from scanpipe.pipes import purldb + +from matchcode.models import ApproximateDirectoryContentIndex +from packagedb.models import Package +from packagedb.models import Resource + + +def get_project_resources_qs(project, resources): + """ + Return a queryset of CodebaseResources from `project` containing the + CodebaseResources from `resources` . If a CodebaseResource in `resources` is + an archive or directory, then their descendants are also included in the + queryset. + + Return None if `resources` is empty or None. + """ + lookups = Q() + for resource in resources or []: + lookups |= Q(path=resource.path) + if resource.is_archive: + # This is done to capture the extracted contents of the archive we + # matched to. Generally, the archive contents are in a directory + # that is the archive path with `-extract` at the end. + lookups |= Q(path__startswith=resource.path) + elif resource.is_dir: + # We add a trailing slash to avoid matching on directories we do not + # intend to. For example, if we have matched on the directory with + # the path `foo/bar/1`, using the __startswith filter without + # including a trailing slash on the path would have us get all + # diretories under `foo/bar/` that start with 1, such as + # `foo/bar/10001`, `foo/bar/123`, etc., when we just want `foo/bar/1` + # and its descendants. + path = f"{resource.path}/" + lookups |= Q(path__startswith=path) + if lookups: + return project.codebaseresources.filter(lookups) + + +def create_package_from_purldb_data(project, resources, package_data, status): + """ + Create a DiscoveredPackage instance from PurlDB ``package_data``. + + Return a tuple, containing the created DiscoveredPackage and the number of + CodebaseResources matched to PurlDB that are part of that DiscoveredPackage. + """ + package_data = package_data.copy() + # Do not re-use uuid from PurlDB as DiscoveredPackage.uuid is unique and a + # PurlDB match can be found in different projects. + package_data.pop("uuid", None) + package_data.pop("dependencies", None) + + resources_qs = get_project_resources_qs(project, resources) + package = pipes.update_or_create_package( + project=project, + package_data=package_data, + codebase_resources=resources_qs, + ) + # Get the number of already matched CodebaseResources from `resources_qs` + # before we update the status of all CodebaseResources from `resources_qs`, + # then subtract the number of already matched CodebaseResources from the + # total number of CodebaseResources updated. 
This is to prevent + # double-counting of CodebaseResources that were matched to purldb + purldb_statuses = [ + flag.MATCHED_TO_PURLDB_PACKAGE, + flag.MATCHED_TO_PURLDB_RESOURCE, + flag.MATCHED_TO_PURLDB_DIRECTORY, + ] + matched_resources_count = resources_qs.exclude(status__in=purldb_statuses).update( + status=status + ) + return package, matched_resources_count + + +def match_purldb_package( + project, resources_by_sha1, enhance_package_data=True, **kwargs +): + """ + Given a mapping of lists of CodebaseResources by their sha1 values, + `resources_by_sha1`, send those sha1 values to purldb packages API endpoint, + process the matched Package data, then return the number of + CodebaseResources that were matched to a Package. + """ + match_count = 0 + sha1_list = list(resources_by_sha1.keys()) + results = Package.objects.using('packagedb').filter(sha1__in=sha1_list) + # Process matched Package data + for package in results: + package_data = package.to_dict() + sha1 = package_data["sha1"] + resources = resources_by_sha1.get(sha1) or [] + if not resources: + continue + _, matched_resources_count = create_package_from_purldb_data( + project=project, + resources=resources, + package_data=package_data, + status=flag.MATCHED_TO_PURLDB_PACKAGE, + ) + match_count += matched_resources_count + return match_count + + +def match_purldb_resource( + project, resources_by_sha1, package_data_by_purldb_urls=None, **kwargs +): + """ + Given a mapping of lists of CodebaseResources by their sha1 values, + `resources_by_sha1`, send those sha1 values to purldb resources API + endpoint, process the matched Package data, then return the number of + CodebaseResources that were matched to a Package. + + `package_data_by_purldb_urls` is a mapping of package data by their purldb + package instance URLs. This is intended to be used as a cache, to avoid + retrieving package data we retrieved before. + """ + package_data_by_purldb_urls = package_data_by_purldb_urls or {} + match_count = 0 + sha1_list = list(resources_by_sha1.keys()) + results = Resource.objects.using('packagedb').filter(sha1__in=sha1_list) + # Process match results + for resource in results: + # Get package data + package_data = resource.package.to_dict() + sha1 = package_data["sha1"] + resources = resources_by_sha1.get(sha1) or [] + if not resources: + continue + _, matched_resources_count = create_package_from_purldb_data( + project=project, + resources=resources, + package_data=package_data, + status=flag.MATCHED_TO_PURLDB_RESOURCE, + ) + match_count += matched_resources_count + return match_count + + +def match_purldb_directory(project, resource): + """Match a single directory resource in the PurlDB.""" + fingerprint = resource.extra_data.get("directory_content", "") + results = ApproximateDirectoryContentIndex.match(directory_fingerprint=fingerprint) + for result in results: + package_data = result.package.to_dict() + return create_package_from_purldb_data( + project, [resource], package_data, flag.MATCHED_TO_PURLDB_DIRECTORY + ) + + +def match_sha1s_to_purldb( + project, resources_by_sha1, matcher_func, package_data_by_purldb_urls +): + """ + Process `resources_by_sha1` with `matcher_func` and return a 3-tuple + contaning an empty defaultdict(list), the number of matches and the number + of sha1s sent to purldb. 
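+
+    The empty mapping is returned so that callers can reset their per-batch
+    `resources_by_sha1` accumulator with a single assignment.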
+ """ + matched_count = matcher_func( + project=project, + resources_by_sha1=resources_by_sha1, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + sha1_count = len(resources_by_sha1) + # Clear out resources_by_sha1 when we are done with the current batch of + # CodebaseResources + resources_by_sha1 = defaultdict(list) + return resources_by_sha1, matched_count, sha1_count + + +def match_purldb_resources( + project, extensions, matcher_func, chunk_size=1000, logger=None +): + """ + Match against PurlDB selecting codebase resources using provided + ``package_extensions`` for archive type files, and ``resource_extensions``. + + Match requests are sent off in batches of 1000 SHA1s. This number is set + using `chunk_size`. + """ + resources = ( + project.codebaseresources.files() + .no_status() + .has_value("sha1") + .filter(extension__in=extensions) + ) + resource_count = resources.count() + + extensions_str = ", ".join(extensions) + if logger: + if resource_count > 0: + logger( + f"Matching {resource_count:,d} {extensions_str} resources in PurlDB, " + "using SHA1" + ) + else: + logger( + f"Skipping matching for {extensions_str} resources, " + f"as there are {resource_count:,d}" + ) + + _match_purldb_resources( + project=project, + resources=resources, + matcher_func=matcher_func, + chunk_size=chunk_size, + logger=logger, + ) + + +def _match_purldb_resources( + project, to_resources, matcher_func, chunk_size=1000, logger=None +): + resource_count = to_resources.count() + resource_iterator = to_resources.iterator(chunk_size=chunk_size) + progress = LoopProgress(resource_count, logger) + total_matched_count = 0 + total_sha1_count = 0 + processed_resources_count = 0 + resources_by_sha1 = defaultdict(list) + package_data_by_purldb_urls = {} + + for to_resource in progress.iter(resource_iterator): + resources_by_sha1[to_resource.sha1].append(to_resource) + if to_resource.path.endswith(".map"): + for js_sha1 in js.source_content_sha1_list(to_resource): + resources_by_sha1[js_sha1].append(to_resource) + processed_resources_count += 1 + + if processed_resources_count % chunk_size == 0: + resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( + project=project, + resources_by_sha1=resources_by_sha1, + matcher_func=matcher_func, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + total_matched_count += matched_count + total_sha1_count += sha1_count + + if resources_by_sha1: + resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( + project=project, + resources_by_sha1=resources_by_sha1, + matcher_func=matcher_func, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + total_matched_count += matched_count + total_sha1_count += sha1_count + + logger( + f"{total_matched_count:,d} resources matched in PurlDB " + f"using {total_sha1_count:,d} SHA1s" + ) + + +def match_purldb_directories(project, logger=None): + """Match against PurlDB selecting codebase directories.""" + # If we are able to get match results for a directory fingerprint, then that + # means every resource and directory under that directory is part of a + # Package. By starting from the root to/ directory, we are attempting to + # match as many files as we can before attempting to match further down. The + # more "higher-up" directories we can match to means that we reduce the + # number of queries made to purldb. 
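+    # Directories are ordered by path, so parent directories are visited before
+    # their children. When a parent matches, create_package_from_purldb_data()
+    # flags its descendants as matched as well, and the refresh_from_db()/status
+    # check in the loop below skips them, avoiding redundant index lookups.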
+ to_directories = ( + project.codebaseresources.directories() + .no_status(status=flag.ABOUT_MAPPED) + .no_status(status=flag.MATCHED_TO_PURLDB_PACKAGE) + .order_by("path") + ) + directory_count = to_directories.count() + + if logger: + logger( + f"Matching {directory_count:,d} " + f"director{pluralize(directory_count, 'y,ies')} from to/ in PurlDB" + ) + + directory_iterator = to_directories.iterator(chunk_size=2000) + progress = LoopProgress(directory_count, logger) + + for directory in progress.iter(directory_iterator): + directory.refresh_from_db() + if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: + match_purldb_directory(project, directory) + + matched_count = ( + project.codebaseresources.directories() + .filter(status=flag.MATCHED_TO_PURLDB_DIRECTORY) + .count() + ) + logger( + f"{matched_count:,d} director{pluralize(matched_count, 'y,ies')} " + f"matched in PurlDB" + ) + + +def match_resources_with_no_java_source(project, logger=None): + """ + Match resources with ``no-java-source`` to PurlDB, if no match + is found update status to ``requires-review``. + """ + project_files = project.codebaseresources.files() + + to_no_java_source = project_files.to_codebase().filter(status=flag.NO_JAVA_SOURCE) + + if to_no_java_source: + resource_count = to_no_java_source.count() + if logger: + logger( + f"Mapping {resource_count:,d} to/ resources with {flag.NO_JAVA_SOURCE} " + "status in PurlDB using SHA1" + ) + + _match_purldb_resources( + project=project, + to_resources=to_no_java_source, + matcher_func=match_purldb_resource, + logger=logger, + ) + to_no_java_source.exclude(status=flag.MATCHED_TO_PURLDB_RESOURCE).update( + status=flag.REQUIRES_REVIEW + ) + + +def match_purldb_resources_post_process(project, logger=None): + """Choose the best package for PurlDB matched resources.""" + to_extract_directories = ( + project.codebaseresources.directories() + .to_codebase() + .filter(path__regex=r"^.*-extract$") + ) + + to_resources = project.codebaseresources.files().filter( + status=flag.MATCHED_TO_PURLDB_RESOURCE + ) + + resource_count = to_extract_directories.count() + + if logger: + logger( + f"Refining matching for {resource_count:,d} " + f"{flag.MATCHED_TO_PURLDB_RESOURCE} archives." + ) + + resource_iterator = to_extract_directories.iterator(chunk_size=2000) + progress = LoopProgress(resource_count, logger) + map_count = 0 + + for directory in progress.iter(resource_iterator): + map_count += _match_purldb_resources_post_process( + directory, to_extract_directories, to_resources + ) + + logger(f"{map_count:,d} resource processed") + + +def _match_purldb_resources_post_process( + directory_path, to_extract_directories, to_resources +): + # Exclude the content of nested archive. + interesting_codebase_resources = ( + to_resources.filter(path__startswith=directory_path) + .filter(status=flag.MATCHED_TO_PURLDB_RESOURCE) + .exclude(path__regex=rf"^{directory_path}.*-extract\/.*$") + ) + + if not interesting_codebase_resources: + return 0 + + packages_map = {} + + for resource in interesting_codebase_resources: + for package in resource.discovered_packages.all(): + if package in packages_map: + packages_map[package].append(resource) + else: + packages_map[package] = [resource] + + # Rank the packages by most number of matched resources. 
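+    # After ranking, the existing package links are cleared and each resource is
+    # re-attached to the highest-ranked package that claims it, so every matched
+    # resource ends up associated with a single best package.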
+ ranked_packages = dict( + sorted(packages_map.items(), key=lambda item: len(item[1]), reverse=True) + ) + + for resource in interesting_codebase_resources: + resource.discovered_packages.clear() + + for package, resources in ranked_packages.items(): + unmapped_resources = [ + resource + for resource in resources + if not resource.discovered_packages.exists() + ] + if unmapped_resources: + package.add_resources(unmapped_resources) + + return interesting_codebase_resources.count() diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py index 2b1c263c..eb14b3f7 100644 --- a/matchcodeio/settings.py +++ b/matchcodeio/settings.py @@ -15,7 +15,6 @@ "matchcode", ] - # Database DATABASES.update( diff --git a/requirements-dev.txt b/requirements-dev.txt index 4c61da6c..5ef9572c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -aboutcode-toolkit==10.1.0 +aboutcode-toolkit==9.0.0 black==23.11.0 et-xmlfile==1.1.0 execnet==2.0.2 diff --git a/setup.cfg b/setup.cfg index b57377f5..8a827456 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,4 +84,4 @@ docs = [options.entry_points] scancodeio_pipelines = - scan_and_fingerprint_package = matchcode_toolkit.pipelines.scan_and_fingerprint_package:ScanAndFingerprintPackage + matching = matchcode.pipelines.matching:Matching From 51118eaae0c5b84647dc8a1b1077f58c2925561e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 29 Nov 2023 16:38:21 -0800 Subject: [PATCH 03/54] Define URL route for matchcode.io #224 * Create MatchingViewSet Signed-off-by: Jono Yang --- .gitignore | 2 ++ matchcode/api.py | 18 ++++++++++++++++++ matchcode/pipes/matching.py | 8 ++++---- matchcodeio/settings.py | 7 ++++++- matchcodeio/urls.py | 36 ++++++++++++++++++++++++++++++++++++ purldb/urls.py | 2 +- 6 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 matchcodeio/urls.py diff --git a/.gitignore b/.gitignore index db8d038c..391f0819 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,5 @@ tcl # Env Files .env + +var/ diff --git a/matchcode/api.py b/matchcode/api.py index 68844b8d..11c6c5db 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -12,6 +12,8 @@ from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet +from rest_framework import mixins +from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField @@ -31,6 +33,10 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex +from scanpipe.api.serializers import ProjectSerializer +from scanpipe.api.views import ProjectFilterSet +from scanpipe.models import Project + class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -307,3 +313,15 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): queryset = ApproximateDirectoryStructureIndex.objects.all() serializer_class = ApproximateDirectoryStructureIndexSerializer filterset_class = ApproximateDirectoryStructureFilterSet + + +class MatchingViewSet( + mixins.CreateModelMixin, + mixins.RetrieveModelMixin, + mixins.DestroyModelMixin, + mixins.ListModelMixin, + viewsets.GenericViewSet, +): + queryset = Project.objects.all() + serializer_class = ProjectSerializer + filterset_class = ProjectFilterSet diff --git a/matchcode/pipes/matching.py b/matchcode/pipes/matching.py index 566cc1d2..49de16d4 100644 
--- a/matchcode/pipes/matching.py +++ b/matchcode/pipes/matching.py @@ -238,10 +238,10 @@ def match_purldb_resources( def _match_purldb_resources( - project, to_resources, matcher_func, chunk_size=1000, logger=None + project, resources, matcher_func, chunk_size=1000, logger=None ): - resource_count = to_resources.count() - resource_iterator = to_resources.iterator(chunk_size=chunk_size) + resource_count = resources.count() + resource_iterator = resources.iterator(chunk_size=chunk_size) progress = LoopProgress(resource_count, logger) total_matched_count = 0 total_sha1_count = 0 @@ -342,7 +342,7 @@ def match_resources_with_no_java_source(project, logger=None): _match_purldb_resources( project=project, - to_resources=to_no_java_source, + resources=to_no_java_source, matcher_func=match_purldb_resource, logger=logger, ) diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py index eb14b3f7..2ba549ff 100644 --- a/matchcodeio/settings.py +++ b/matchcodeio/settings.py @@ -11,8 +11,11 @@ INSTALLED_APPS += [ - "packagedb", + "clearcode", + "clearindex", "matchcode", + "minecode", + "packagedb", ] # Database @@ -32,3 +35,5 @@ ) DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] + +ROOT_URLCONF = 'matchcodeio.urls' diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py new file mode 100644 index 00000000..e0cf419f --- /dev/null +++ b/matchcodeio/urls.py @@ -0,0 +1,36 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from django.urls import include +from django.urls import path +from django.views.generic import RedirectView +from rest_framework import routers + +from clearcode.api import CDitemViewSet +from packagedb.api import PackageViewSet +from packagedb.api import PackageSetViewSet +from packagedb.api import ResourceViewSet +from matchcode.api import MatchingViewSet +from minecode.api import PriorityResourceURIViewSet +from scanpipe.api.views import RunViewSet + + +api_router = routers.DefaultRouter() +api_router.register('packages', PackageViewSet) +api_router.register('package_sets', PackageSetViewSet) +api_router.register('resources', ResourceViewSet) +api_router.register('matching', MatchingViewSet) +api_router.register('runs', RunViewSet) +api_router.register('cditems', CDitemViewSet, 'cditems') +api_router.register('on_demand_queue', PriorityResourceURIViewSet) + +urlpatterns = [ + path('api/', include(api_router.urls)), + path('', RedirectView.as_view(url='api/')), +] diff --git a/purldb/urls.py b/purldb/urls.py index 045a2d34..d0e24d7e 100644 --- a/purldb/urls.py +++ b/purldb/urls.py @@ -36,5 +36,5 @@ urlpatterns = [ path('api/', include((api_router.urls, 'api'))), - path("", RedirectView.as_view(url="api/")), + path('', RedirectView.as_view(url='api/')), ] From 4bbc611bf2bad4b7448116b8330986ee7f875959 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 29 Nov 2023 17:13:52 -0800 Subject: [PATCH 04/54] Create MatchingSerializer #224 Signed-off-by: Jono Yang --- matchcode/api.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 11c6c5db..2361ead1 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -16,6 +16,7 @@ from rest_framework import viewsets from 
rest_framework.decorators import action from rest_framework.response import Response +from rest_framework import serializers from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField @@ -36,7 +37,7 @@ from scanpipe.api.serializers import ProjectSerializer from scanpipe.api.views import ProjectFilterSet from scanpipe.models import Project - +from scanpipe.pipes.fetch import fetch_urls class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -315,6 +316,36 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): filterset_class = ApproximateDirectoryStructureFilterSet +class MatchingSerializer(ProjectSerializer): + def create(self, validated_data, matching_pipeline_name='matching2'): + """ + Create a new `project` with `upload_file`, using the `matching2` pipeline + """ + execute_now = True + upload_file = validated_data.pop("upload_file", None) + input_urls = validated_data.pop("input_urls", []) + webhook_url = validated_data.pop("webhook_url", None) + + downloads, errors = fetch_urls(input_urls) + if errors: + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) + + project = super().create(validated_data) + + if upload_file: + project.add_uploads([upload_file]) + + if downloads: + project.add_downloads(downloads) + + if webhook_url: + project.add_webhook_subscription(webhook_url) + + project.add_pipeline(matching_pipeline_name, execute_now) + + return project + + class MatchingViewSet( mixins.CreateModelMixin, mixins.RetrieveModelMixin, @@ -323,5 +354,5 @@ class MatchingViewSet( viewsets.GenericViewSet, ): queryset = Project.objects.all() - serializer_class = ProjectSerializer + serializer_class = MatchingSerializer filterset_class = ProjectFilterSet From bb0e2882f8c668588d93d6a58dc3239dec9a95f0 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 30 Nov 2023 16:43:35 -0800 Subject: [PATCH 05/54] Serialize more fields using MatchingSerializer #224 Signed-off-by: Jono Yang --- matchcode/api.py | 94 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 2361ead1..8afcb498 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -6,13 +6,14 @@ # See https://github.com/nexB/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# - +from uuid import uuid4 from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet from rest_framework import mixins +from rest_framework import renderers from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response @@ -34,10 +35,14 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex -from scanpipe.api.serializers import ProjectSerializer +from scanpipe.api.serializers import RunSerializer +from scanpipe.api.serializers import StrListField from scanpipe.api.views import ProjectFilterSet from scanpipe.models import Project +from scanpipe.pipes import count_group_by from scanpipe.pipes.fetch import fetch_urls +from scanpipe.views import project_results_json_response + class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -316,12 +321,73 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): filterset_class = ApproximateDirectoryStructureFilterSet -class MatchingSerializer(ProjectSerializer): - def create(self, validated_data, matching_pipeline_name='matching2'): +class MatchingSerializer(serializers.ModelSerializer): + upload_file = serializers.FileField(write_only=True, required=False) + input_urls = StrListField( + write_only=True, + required=False, + style={"base_template": "textarea.html"}, + ) + webhook_url = serializers.CharField(write_only=True, required=False) + runs = RunSerializer(many=True, read_only=True) + input_sources = serializers.JSONField(source="input_sources_list", read_only=True) + codebase_resources_summary = serializers.SerializerMethodField() + discovered_packages_summary = serializers.SerializerMethodField() + discovered_dependencies_summary = serializers.SerializerMethodField() + codebase_relations_summary = serializers.SerializerMethodField() + + class Meta: + model = Project + fields = ( + 'url', + 'uuid', + "upload_file", + "input_urls", + "webhook_url", + "created_date", + "input_sources", + "runs", + "resource_count", + "package_count", + "dependency_count", + "relation_count", + "codebase_resources_summary", + "discovered_packages_summary", + "discovered_dependencies_summary", + "codebase_relations_summary", + ) + + def get_codebase_resources_summary(self, project): + queryset = project.codebaseresources.all() + return count_group_by(queryset, "status") + + def get_discovered_packages_summary(self, project): + base_qs = project.discoveredpackages + return { + "total": base_qs.count(), + "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), + "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), + } + + def get_discovered_dependencies_summary(self, project): + base_qs = project.discovereddependencies + return { + "total": base_qs.count(), + "is_runtime": base_qs.filter(is_runtime=True).count(), + "is_optional": base_qs.filter(is_optional=True).count(), + "is_resolved": base_qs.filter(is_resolved=True).count(), + } + + def get_codebase_relations_summary(self, project): + queryset = project.codebaserelations.all() + return count_group_by(queryset, "map_type") + + def create(self, validated_data, matching_pipeline_name='matching'): """ - Create a new `project` with `upload_file`, using the `matching2` pipeline + Create a new `project` with `upload_file`, using 
the `matching` pipeline """ execute_now = True + validated_data['name'] = uuid4() upload_file = validated_data.pop("upload_file", None) input_urls = validated_data.pop("input_urls", []) webhook_url = validated_data.pop("webhook_url", None) @@ -356,3 +422,21 @@ class MatchingViewSet( queryset = Project.objects.all() serializer_class = MatchingSerializer filterset_class = ProjectFilterSet + + def get_queryset(self): + return ( + super() + .get_queryset() + .prefetch_related( + "runs", + ) + ) + + @action(detail=True, renderer_classes=[renderers.JSONRenderer]) + def results(self, request, *args, **kwargs): + """ + Return the results compatible with ScanCode data format. + The content is returned as a stream of JSON content using the + JSONResultsGenerator class. + """ + return project_results_json_response(self.get_object()) From cb7b7637ecd260e71f3b73cc3fa38cee20fa9bfb Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 30 Nov 2023 17:58:36 -0800 Subject: [PATCH 06/54] Update matching pipeline #224 * Use JSON input to build codebase Signed-off-by: Jono Yang --- matchcode/pipelines/matching.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/matchcode/pipelines/matching.py b/matchcode/pipelines/matching.py index 2dd06ab6..3f2b93cb 100644 --- a/matchcode/pipelines/matching.py +++ b/matchcode/pipelines/matching.py @@ -20,12 +20,13 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase from matchcode.pipes import matching from scanpipe.pipes import matchcode -class Matching(ScanCodebase): +class Matching(ScanCodebase, LoadInventory): """ Establish relationships between two code trees: deployment and development. 
@@ -38,9 +39,8 @@ class Matching(ScanCodebase): @classmethod def steps(cls): return ( - cls.copy_inputs_to_codebase_directory, - cls.extract_archives, - cls.collect_and_create_codebase_resources, + cls.get_inputs, + cls.build_inventory_from_scans, cls.fingerprint_codebase_directories, cls.flag_empty_files, cls.flag_ignored_resources, From f4811733f639c10f110100aed3a14f8a52b5d9c9 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 4 Dec 2023 11:35:55 -0800 Subject: [PATCH 07/54] Only expose matching endpoint on matchcodeio #224 Signed-off-by: Jono Yang --- matchcodeio/urls.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py index e0cf419f..489105e3 100644 --- a/matchcodeio/urls.py +++ b/matchcodeio/urls.py @@ -12,25 +12,14 @@ from django.views.generic import RedirectView from rest_framework import routers -from clearcode.api import CDitemViewSet -from packagedb.api import PackageViewSet -from packagedb.api import PackageSetViewSet -from packagedb.api import ResourceViewSet from matchcode.api import MatchingViewSet -from minecode.api import PriorityResourceURIViewSet -from scanpipe.api.views import RunViewSet api_router = routers.DefaultRouter() -api_router.register('packages', PackageViewSet) -api_router.register('package_sets', PackageSetViewSet) -api_router.register('resources', ResourceViewSet) api_router.register('matching', MatchingViewSet) -api_router.register('runs', RunViewSet) -api_router.register('cditems', CDitemViewSet, 'cditems') -api_router.register('on_demand_queue', PriorityResourceURIViewSet) urlpatterns = [ path('api/', include(api_router.urls)), + path("", include("scanpipe.urls")), path('', RedirectView.as_view(url='api/')), ] From 5dc4f44ed5b937cc20df4c3448ca18d7812659d1 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 4 Dec 2023 12:44:18 -0800 Subject: [PATCH 08/54] Move Matching view to matchcodeio #224 * Fix urls.py Signed-off-by: Jono Yang --- matchcode/api.py | 134 ----------------------------------------- matchcodeio/api.py | 144 ++++++++++++++++++++++++++++++++++++++++++++ matchcodeio/urls.py | 4 +- 3 files changed, 147 insertions(+), 135 deletions(-) create mode 100644 matchcodeio/api.py diff --git a/matchcode/api.py b/matchcode/api.py index 8afcb498..9810645a 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -6,18 +6,13 @@ # See https://github.com/nexB/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# -from uuid import uuid4 from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet -from rest_framework import mixins -from rest_framework import renderers -from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response -from rest_framework import serializers from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField @@ -35,14 +30,6 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex -from scanpipe.api.serializers import RunSerializer -from scanpipe.api.serializers import StrListField -from scanpipe.api.views import ProjectFilterSet -from scanpipe.models import Project -from scanpipe.pipes import count_group_by -from scanpipe.pipes.fetch import fetch_urls -from scanpipe.views import project_results_json_response - class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -319,124 +306,3 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): queryset = ApproximateDirectoryStructureIndex.objects.all() serializer_class = ApproximateDirectoryStructureIndexSerializer filterset_class = ApproximateDirectoryStructureFilterSet - - -class MatchingSerializer(serializers.ModelSerializer): - upload_file = serializers.FileField(write_only=True, required=False) - input_urls = StrListField( - write_only=True, - required=False, - style={"base_template": "textarea.html"}, - ) - webhook_url = serializers.CharField(write_only=True, required=False) - runs = RunSerializer(many=True, read_only=True) - input_sources = serializers.JSONField(source="input_sources_list", read_only=True) - codebase_resources_summary = serializers.SerializerMethodField() - discovered_packages_summary = serializers.SerializerMethodField() - discovered_dependencies_summary = serializers.SerializerMethodField() - codebase_relations_summary = serializers.SerializerMethodField() - - class Meta: - model = Project - fields = ( - 'url', - 'uuid', - "upload_file", - "input_urls", - "webhook_url", - "created_date", - "input_sources", - "runs", - "resource_count", - "package_count", - "dependency_count", - "relation_count", - "codebase_resources_summary", - "discovered_packages_summary", - "discovered_dependencies_summary", - "codebase_relations_summary", - ) - - def get_codebase_resources_summary(self, project): - queryset = project.codebaseresources.all() - return count_group_by(queryset, "status") - - def get_discovered_packages_summary(self, project): - base_qs = project.discoveredpackages - return { - "total": base_qs.count(), - "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), - "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), - } - - def get_discovered_dependencies_summary(self, project): - base_qs = project.discovereddependencies - return { - "total": base_qs.count(), - "is_runtime": base_qs.filter(is_runtime=True).count(), - "is_optional": base_qs.filter(is_optional=True).count(), - "is_resolved": base_qs.filter(is_resolved=True).count(), - } - - def get_codebase_relations_summary(self, project): - queryset = project.codebaserelations.all() - return count_group_by(queryset, "map_type") - - def create(self, validated_data, 
matching_pipeline_name='matching'): - """ - Create a new `project` with `upload_file`, using the `matching` pipeline - """ - execute_now = True - validated_data['name'] = uuid4() - upload_file = validated_data.pop("upload_file", None) - input_urls = validated_data.pop("input_urls", []) - webhook_url = validated_data.pop("webhook_url", None) - - downloads, errors = fetch_urls(input_urls) - if errors: - raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) - - project = super().create(validated_data) - - if upload_file: - project.add_uploads([upload_file]) - - if downloads: - project.add_downloads(downloads) - - if webhook_url: - project.add_webhook_subscription(webhook_url) - - project.add_pipeline(matching_pipeline_name, execute_now) - - return project - - -class MatchingViewSet( - mixins.CreateModelMixin, - mixins.RetrieveModelMixin, - mixins.DestroyModelMixin, - mixins.ListModelMixin, - viewsets.GenericViewSet, -): - queryset = Project.objects.all() - serializer_class = MatchingSerializer - filterset_class = ProjectFilterSet - - def get_queryset(self): - return ( - super() - .get_queryset() - .prefetch_related( - "runs", - ) - ) - - @action(detail=True, renderer_classes=[renderers.JSONRenderer]) - def results(self, request, *args, **kwargs): - """ - Return the results compatible with ScanCode data format. - The content is returned as a stream of JSON content using the - JSONResultsGenerator class. - """ - return project_results_json_response(self.get_object()) diff --git a/matchcodeio/api.py b/matchcodeio/api.py new file mode 100644 index 00000000..b32bc74c --- /dev/null +++ b/matchcodeio/api.py @@ -0,0 +1,144 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# +from uuid import uuid4 + +from rest_framework import mixins +from rest_framework import renderers +from rest_framework import serializers +from rest_framework import viewsets +from rest_framework.decorators import action + +from scanpipe.api.serializers import RunSerializer +from scanpipe.api.serializers import StrListField +from scanpipe.api.views import ProjectFilterSet +from scanpipe.models import Project +from scanpipe.pipes import count_group_by +from scanpipe.pipes.fetch import fetch_urls +from scanpipe.views import project_results_json_response + + +class MatchingSerializer(serializers.ModelSerializer): + upload_file = serializers.FileField(write_only=True, required=False) + input_urls = StrListField( + write_only=True, + required=False, + style={"base_template": "textarea.html"}, + ) + webhook_url = serializers.CharField(write_only=True, required=False) + runs = RunSerializer(many=True, read_only=True) + input_sources = serializers.JSONField(source="input_sources_list", read_only=True) + codebase_resources_summary = serializers.SerializerMethodField() + discovered_packages_summary = serializers.SerializerMethodField() + discovered_dependencies_summary = serializers.SerializerMethodField() + codebase_relations_summary = serializers.SerializerMethodField() + + class Meta: + model = Project + fields = ( + 'url', + 'uuid', + "upload_file", + "input_urls", + "webhook_url", + "created_date", + "input_sources", + "runs", + "resource_count", + "package_count", + "dependency_count", + "relation_count", + "codebase_resources_summary", + "discovered_packages_summary", + "discovered_dependencies_summary", + "codebase_relations_summary", + ) + + def get_codebase_resources_summary(self, project): + queryset = project.codebaseresources.all() + return count_group_by(queryset, "status") + + def get_discovered_packages_summary(self, project): + base_qs = project.discoveredpackages + return { + "total": base_qs.count(), + "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), + "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), + } + + def get_discovered_dependencies_summary(self, project): + base_qs = project.discovereddependencies + return { + "total": base_qs.count(), + "is_runtime": base_qs.filter(is_runtime=True).count(), + "is_optional": base_qs.filter(is_optional=True).count(), + "is_resolved": base_qs.filter(is_resolved=True).count(), + } + + def get_codebase_relations_summary(self, project): + queryset = project.codebaserelations.all() + return count_group_by(queryset, "map_type") + + def create(self, validated_data, matching_pipeline_name='matching'): + """ + Create a new `project` with `upload_file`, using the `matching` pipeline + """ + execute_now = True + validated_data['name'] = uuid4() + upload_file = validated_data.pop("upload_file", None) + input_urls = validated_data.pop("input_urls", []) + webhook_url = validated_data.pop("webhook_url", None) + + downloads, errors = fetch_urls(input_urls) + if errors: + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) + + project = super().create(validated_data) + + if upload_file: + project.add_uploads([upload_file]) + + if downloads: + project.add_downloads(downloads) + + if webhook_url: + project.add_webhook_subscription(webhook_url) + + project.add_pipeline(matching_pipeline_name, execute_now) + + return project + + +class MatchingViewSet( + mixins.CreateModelMixin, + mixins.RetrieveModelMixin, + mixins.DestroyModelMixin, + mixins.ListModelMixin, + viewsets.GenericViewSet, 
+): + queryset = Project.objects.all() + serializer_class = MatchingSerializer + filterset_class = ProjectFilterSet + + def get_queryset(self): + return ( + super() + .get_queryset() + .prefetch_related( + "runs", + ) + ) + + @action(detail=True, renderer_classes=[renderers.JSONRenderer]) + def results(self, request, *args, **kwargs): + """ + Return the results compatible with ScanCode data format. + The content is returned as a stream of JSON content using the + JSONResultsGenerator class. + """ + return project_results_json_response(self.get_object()) diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py index 489105e3..45e3d921 100644 --- a/matchcodeio/urls.py +++ b/matchcodeio/urls.py @@ -12,11 +12,13 @@ from django.views.generic import RedirectView from rest_framework import routers -from matchcode.api import MatchingViewSet +from matchcodeio.api import MatchingViewSet +from scanpipe.api.views import RunViewSet api_router = routers.DefaultRouter() api_router.register('matching', MatchingViewSet) +api_router.register('runs', RunViewSet) urlpatterns = [ path('api/', include(api_router.urls)), From aac8a6d72434275608fc7966e215e1172bf25619 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 5 Dec 2023 00:00:02 -0800 Subject: [PATCH 09/54] Move pipeline-related code to matchcodeio #224 Signed-off-by: Jono Yang --- Makefile | 3 ++- {matchcode => matchcodeio}/pipelines/__init__.py | 0 {matchcode => matchcodeio}/pipelines/matching.py | 2 +- {matchcode => matchcodeio}/pipes/__init__.py | 0 {matchcode => matchcodeio}/pipes/matching.py | 1 - 5 files changed, 3 insertions(+), 3 deletions(-) rename {matchcode => matchcodeio}/pipelines/__init__.py (100%) rename {matchcode => matchcodeio}/pipelines/matching.py (99%) rename {matchcode => matchcodeio}/pipes/__init__.py (100%) rename {matchcode => matchcodeio}/pipes/matching.py (99%) diff --git a/Makefile b/Makefile index 859388fa..4c7e3d89 100644 --- a/Makefile +++ b/Makefile @@ -107,7 +107,8 @@ process_scans: test: @echo "-> Run the test suite" - ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit + ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore matchcodeio + ${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcodeio.settings ${PYTHON_EXE} -m pytest -vvs matchcodeio ${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines shell: diff --git a/matchcode/pipelines/__init__.py b/matchcodeio/pipelines/__init__.py similarity index 100% rename from matchcode/pipelines/__init__.py rename to matchcodeio/pipelines/__init__.py diff --git a/matchcode/pipelines/matching.py b/matchcodeio/pipelines/matching.py similarity index 99% rename from matchcode/pipelines/matching.py rename to matchcodeio/pipelines/matching.py index 3f2b93cb..e6d74dc1 100644 --- a/matchcode/pipelines/matching.py +++ b/matchcodeio/pipelines/matching.py @@ -22,7 +22,7 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcode.pipes import matching +from matchcodeio.pipes import matching from scanpipe.pipes import matchcode diff --git a/matchcode/pipes/__init__.py b/matchcodeio/pipes/__init__.py similarity index 100% rename from matchcode/pipes/__init__.py rename to matchcodeio/pipes/__init__.py diff --git a/matchcode/pipes/matching.py b/matchcodeio/pipes/matching.py similarity index 99% rename from matchcode/pipes/matching.py rename to 
matchcodeio/pipes/matching.py index 49de16d4..ee956567 100644 --- a/matchcode/pipes/matching.py +++ b/matchcodeio/pipes/matching.py @@ -29,7 +29,6 @@ from scanpipe.pipes import LoopProgress from scanpipe.pipes import flag from scanpipe.pipes import js -from scanpipe.pipes import purldb from matchcode.models import ApproximateDirectoryContentIndex from packagedb.models import Package From dd68128ca716b8c4a8fd5b22fa65cf10956a966a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 11 Dec 2023 17:15:57 -0800 Subject: [PATCH 10/54] Add traefik-specific files and config #224 Signed-off-by: Jono Yang --- docker-compose.yml | 31 ++++++++++++++++++++++++++++--- traefik.yml | 16 ++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 traefik.yml diff --git a/docker-compose.yml b/docker-compose.yml index 8bcae773..f5ebe8e1 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,26 @@ version: "3" services: + reverse-proxy: + # The official v2 Traefik docker image + image: traefik:v2.10 + # Enables the web UI and tells Traefik to listen to docker + command: + - "--api.insecure=true" + - "--providers.docker" + - "--providers.docker.exposedbydefault=false" + - "--entrypoints.web.address=:80" + - "--entrypoints.websecure.address=:443" + ports: + # The HTTP port + - "80:80" + - "443:443" + # The Web UI (enabled by --api.insecure=true) + - "8080:8080" + volumes: + # So that Traefik can listen to the Docker events + - /var/run/docker.sock:/var/run/docker.sock + db: image: postgres:13 env_file: @@ -23,6 +43,10 @@ services: - static:/var/purldb/static/ depends_on: - db + labels: + - "traefik.enable=true" + - "traefik.http.routers.development.rule=Host(`127.0.0.1`) || Host(`localhost`)" + - "traefik.http.routers.development.entrypoints=web" visitor: build: . 
@@ -115,9 +139,10 @@ services: nginx: image: nginx - ports: - - 80:80 - - 443:443 + labels: + - "traefik.enable=true" + - "traefik.http.routers.staticfiles.rule=PathPrefix(`/static/`) || PathPrefix(`/media/`)" + - "traefik.http.routers.staticfiles.entrypoints=web" volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ - static:/var/purldb/static/ diff --git a/traefik.yml b/traefik.yml new file mode 100644 index 00000000..5e4150f1 --- /dev/null +++ b/traefik.yml @@ -0,0 +1,16 @@ +## STATIC CONFIGURATION +log: + level: INFO + +api: + insecure: true + dashboard: true + +entryPoints: + web: + address: ":80" + +providers: + docker: + endpoint: "unix:///var/run/docker.sock" + exposedByDefault: false From 4659ccbc0871478edacfa83eacd5e02865c4f9fc Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 22 Nov 2023 13:49:39 -0800 Subject: [PATCH 11/54] Create new django app matchcodeio #224 * Add scancodeio under extra_requires Signed-off-by: Jono Yang --- MANIFEST.in | 7 ++++++- matchcodeio/__init__.py | 0 matchcodeio/dbrouter.py | 32 ++++++++++++++++++++++++++++++++ matchcodeio/settings.py | 35 +++++++++++++++++++++++++++++++++++ matchcodeio/wsgi.py | 25 +++++++++++++++++++++++++ setup.cfg | 7 +++++++ 6 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 matchcodeio/__init__.py create mode 100644 matchcodeio/dbrouter.py create mode 100644 matchcodeio/settings.py create mode 100644 matchcodeio/wsgi.py diff --git a/MANIFEST.in b/MANIFEST.in index ef3721e8..ac7bf4f4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,9 @@ -graft src +graft clearcode +graft clearindex +graft matchcode +graft minecode +graft packagedb +graft purldb include *.LICENSE include NOTICE diff --git a/matchcodeio/__init__.py b/matchcodeio/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcodeio/dbrouter.py b/matchcodeio/dbrouter.py new file mode 100644 index 00000000..fb438c7e --- /dev/null +++ b/matchcodeio/dbrouter.py @@ -0,0 +1,32 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +class PackageDBReadOnlyRouter(object): + app_labels = [ + 'clearcode', + 'clearindex', + 'minecode', + 'matchcode', + 'packagedb', + ] + + def db_for_read(self, model, **hints): + if model._meta.app_label in self.app_labels: + return 'packagedb' + return None + + def db_for_write(self, model, **hints): + return None + + def allow_relation(self, obj1, obj2, **hints): + return None + + def allow_migrate(self, db, app_label, model_name=None, **hints): + return None diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py new file mode 100644 index 00000000..2b1c263c --- /dev/null +++ b/matchcodeio/settings.py @@ -0,0 +1,35 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +from scancodeio.settings import * + + +INSTALLED_APPS += [ + "packagedb", + "matchcode", +] + + +# Database + +DATABASES.update( + { + 'packagedb': { + 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), + 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), + 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), + 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), + 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), + 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), + 'ATOMIC_REQUESTS': True, + } + } +) + +DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] diff --git a/matchcodeio/wsgi.py b/matchcodeio/wsgi.py new file mode 100644 index 00000000..2c570b4e --- /dev/null +++ b/matchcodeio/wsgi.py @@ -0,0 +1,25 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +""" +WSGI config for MatchCode.io. + +It exposes the WSGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/dev/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcodeio.settings") + +application = get_wsgi_application() diff --git a/setup.cfg b/setup.cfg index 2e989d7a..1d064975 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,9 @@ python_requires = >=3.8 where = . [options.extras_require] +matchcodeio = + scancodeio + testing = pytest >= 6, != 7.0.0 pytest-xdist >= 2 @@ -72,6 +75,7 @@ testing = aboutcode-toolkit >= 6.0.0 black mock + scancodeio docs = Sphinx==5.0.2 @@ -81,3 +85,6 @@ docs = [options.entry_points] console_scripts = purldb = purldb:command_line + +scancodeio_pipelines = + scan_and_fingerprint_package = matchcode_toolkit.pipelines.scan_and_fingerprint_package:ScanAndFingerprintPackage From c06ea756bcf71fc14417976307f946d338ac9c3a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 22 Nov 2023 19:17:18 -0800 Subject: [PATCH 12/54] Create Package matching pipeline #224 * Drop aboutcode-toolkit version Signed-off-by: Jono Yang --- matchcode/pipelines/__init__.py | 0 matchcode/pipelines/matching.py | 109 ++++++++ matchcode/pipes/__init__.py | 0 matchcode/pipes/matching.py | 425 ++++++++++++++++++++++++++++++++ matchcodeio/settings.py | 1 - requirements-dev.txt | 2 +- setup.cfg | 2 +- 7 files changed, 536 insertions(+), 3 deletions(-) create mode 100644 matchcode/pipelines/__init__.py create mode 100644 matchcode/pipelines/matching.py create mode 100644 matchcode/pipes/__init__.py create mode 100644 matchcode/pipes/matching.py diff --git a/matchcode/pipelines/__init__.py b/matchcode/pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcode/pipelines/matching.py b/matchcode/pipelines/matching.py new file mode 100644 index 00000000..2dd06ab6 --- /dev/null +++ b/matchcode/pipelines/matching.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. 
+# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from scanpipe.pipelines.scan_codebase import ScanCodebase +from matchcode.pipes import matching +from scanpipe.pipes import matchcode + + +class Matching(ScanCodebase): + """ + Establish relationships between two code trees: deployment and development. + + This pipeline is expecting 2 archive files with "from-" and "to-" filename + prefixes as inputs: + - "from-[FILENAME]" archive containing the development source code + - "to-[FILENAME]" archive containing the deployment compiled code + """ + + @classmethod + def steps(cls): + return ( + cls.copy_inputs_to_codebase_directory, + cls.extract_archives, + cls.collect_and_create_codebase_resources, + cls.fingerprint_codebase_directories, + cls.flag_empty_files, + cls.flag_ignored_resources, + cls.match_archives_to_purldb, + cls.match_directories_to_purldb, + cls.match_resources_to_purldb, + cls.match_purldb_resources_post_process, + cls.remove_packages_without_resources, + ) + + purldb_package_extensions = [".jar", ".war", ".zip"] + purldb_resource_extensions = [ + ".map", + ".js", + ".mjs", + ".ts", + ".d.ts", + ".jsx", + ".tsx", + ".css", + ".scss", + ".less", + ".sass", + ".soy", + ".class", + ] + + def fingerprint_codebase_directories(self): + """Compute directory fingerprints for matching""" + matchcode.fingerprint_codebase_directories(self.project) + + def match_archives_to_purldb(self): + """Match selected package archives by extension to PurlDB.""" + matching.match_purldb_resources( + project=self.project, + extensions=self.purldb_package_extensions, + matcher_func=matching.match_purldb_package, + logger=self.log, + ) + + def match_directories_to_purldb(self): + """Match selected directories in PurlDB.""" + matching.match_purldb_directories( + project=self.project, + logger=self.log, + ) + + def match_resources_to_purldb(self): + """Match selected files by extension in PurlDB.""" + matching.match_purldb_resources( + project=self.project, + extensions=self.purldb_resource_extensions, + matcher_func=matching.match_purldb_resource, + logger=self.log, + ) + + def match_purldb_resources_post_process(self): + """Choose the best package for PurlDB matched resources.""" + matching.match_purldb_resources_post_process(self.project, logger=self.log) + + def remove_packages_without_resources(self): + """Remove packages without any resources.""" + package_without_resources = self.project.discoveredpackages.filter( + codebase_resources__isnull=True + ) + package_without_resources.delete() diff --git a/matchcode/pipes/__init__.py b/matchcode/pipes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcode/pipes/matching.py 
b/matchcode/pipes/matching.py new file mode 100644 index 00000000..566cc1d2 --- /dev/null +++ b/matchcode/pipes/matching.py @@ -0,0 +1,425 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from collections import defaultdict + +from django.db.models import Q +from django.template.defaultfilters import pluralize + +from scanpipe import pipes +from scanpipe.pipes import LoopProgress +from scanpipe.pipes import flag +from scanpipe.pipes import js +from scanpipe.pipes import purldb + +from matchcode.models import ApproximateDirectoryContentIndex +from packagedb.models import Package +from packagedb.models import Resource + + +def get_project_resources_qs(project, resources): + """ + Return a queryset of CodebaseResources from `project` containing the + CodebaseResources from `resources` . If a CodebaseResource in `resources` is + an archive or directory, then their descendants are also included in the + queryset. + + Return None if `resources` is empty or None. + """ + lookups = Q() + for resource in resources or []: + lookups |= Q(path=resource.path) + if resource.is_archive: + # This is done to capture the extracted contents of the archive we + # matched to. Generally, the archive contents are in a directory + # that is the archive path with `-extract` at the end. + lookups |= Q(path__startswith=resource.path) + elif resource.is_dir: + # We add a trailing slash to avoid matching on directories we do not + # intend to. For example, if we have matched on the directory with + # the path `foo/bar/1`, using the __startswith filter without + # including a trailing slash on the path would have us get all + # diretories under `foo/bar/` that start with 1, such as + # `foo/bar/10001`, `foo/bar/123`, etc., when we just want `foo/bar/1` + # and its descendants. + path = f"{resource.path}/" + lookups |= Q(path__startswith=path) + if lookups: + return project.codebaseresources.filter(lookups) + + +def create_package_from_purldb_data(project, resources, package_data, status): + """ + Create a DiscoveredPackage instance from PurlDB ``package_data``. + + Return a tuple, containing the created DiscoveredPackage and the number of + CodebaseResources matched to PurlDB that are part of that DiscoveredPackage. + """ + package_data = package_data.copy() + # Do not re-use uuid from PurlDB as DiscoveredPackage.uuid is unique and a + # PurlDB match can be found in different projects. 
+ package_data.pop("uuid", None) + package_data.pop("dependencies", None) + + resources_qs = get_project_resources_qs(project, resources) + package = pipes.update_or_create_package( + project=project, + package_data=package_data, + codebase_resources=resources_qs, + ) + # Get the number of already matched CodebaseResources from `resources_qs` + # before we update the status of all CodebaseResources from `resources_qs`, + # then subtract the number of already matched CodebaseResources from the + # total number of CodebaseResources updated. This is to prevent + # double-counting of CodebaseResources that were matched to purldb + purldb_statuses = [ + flag.MATCHED_TO_PURLDB_PACKAGE, + flag.MATCHED_TO_PURLDB_RESOURCE, + flag.MATCHED_TO_PURLDB_DIRECTORY, + ] + matched_resources_count = resources_qs.exclude(status__in=purldb_statuses).update( + status=status + ) + return package, matched_resources_count + + +def match_purldb_package( + project, resources_by_sha1, enhance_package_data=True, **kwargs +): + """ + Given a mapping of lists of CodebaseResources by their sha1 values, + `resources_by_sha1`, send those sha1 values to purldb packages API endpoint, + process the matched Package data, then return the number of + CodebaseResources that were matched to a Package. + """ + match_count = 0 + sha1_list = list(resources_by_sha1.keys()) + results = Package.objects.using('packagedb').filter(sha1__in=sha1_list) + # Process matched Package data + for package in results: + package_data = package.to_dict() + sha1 = package_data["sha1"] + resources = resources_by_sha1.get(sha1) or [] + if not resources: + continue + _, matched_resources_count = create_package_from_purldb_data( + project=project, + resources=resources, + package_data=package_data, + status=flag.MATCHED_TO_PURLDB_PACKAGE, + ) + match_count += matched_resources_count + return match_count + + +def match_purldb_resource( + project, resources_by_sha1, package_data_by_purldb_urls=None, **kwargs +): + """ + Given a mapping of lists of CodebaseResources by their sha1 values, + `resources_by_sha1`, send those sha1 values to purldb resources API + endpoint, process the matched Package data, then return the number of + CodebaseResources that were matched to a Package. + + `package_data_by_purldb_urls` is a mapping of package data by their purldb + package instance URLs. This is intended to be used as a cache, to avoid + retrieving package data we retrieved before. 
+ """ + package_data_by_purldb_urls = package_data_by_purldb_urls or {} + match_count = 0 + sha1_list = list(resources_by_sha1.keys()) + results = Resource.objects.using('packagedb').filter(sha1__in=sha1_list) + # Process match results + for resource in results: + # Get package data + package_data = resource.package.to_dict() + sha1 = package_data["sha1"] + resources = resources_by_sha1.get(sha1) or [] + if not resources: + continue + _, matched_resources_count = create_package_from_purldb_data( + project=project, + resources=resources, + package_data=package_data, + status=flag.MATCHED_TO_PURLDB_RESOURCE, + ) + match_count += matched_resources_count + return match_count + + +def match_purldb_directory(project, resource): + """Match a single directory resource in the PurlDB.""" + fingerprint = resource.extra_data.get("directory_content", "") + results = ApproximateDirectoryContentIndex.match(directory_fingerprint=fingerprint) + for result in results: + package_data = result.package.to_dict() + return create_package_from_purldb_data( + project, [resource], package_data, flag.MATCHED_TO_PURLDB_DIRECTORY + ) + + +def match_sha1s_to_purldb( + project, resources_by_sha1, matcher_func, package_data_by_purldb_urls +): + """ + Process `resources_by_sha1` with `matcher_func` and return a 3-tuple + contaning an empty defaultdict(list), the number of matches and the number + of sha1s sent to purldb. + """ + matched_count = matcher_func( + project=project, + resources_by_sha1=resources_by_sha1, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + sha1_count = len(resources_by_sha1) + # Clear out resources_by_sha1 when we are done with the current batch of + # CodebaseResources + resources_by_sha1 = defaultdict(list) + return resources_by_sha1, matched_count, sha1_count + + +def match_purldb_resources( + project, extensions, matcher_func, chunk_size=1000, logger=None +): + """ + Match against PurlDB selecting codebase resources using provided + ``package_extensions`` for archive type files, and ``resource_extensions``. + + Match requests are sent off in batches of 1000 SHA1s. This number is set + using `chunk_size`. 
+ """ + resources = ( + project.codebaseresources.files() + .no_status() + .has_value("sha1") + .filter(extension__in=extensions) + ) + resource_count = resources.count() + + extensions_str = ", ".join(extensions) + if logger: + if resource_count > 0: + logger( + f"Matching {resource_count:,d} {extensions_str} resources in PurlDB, " + "using SHA1" + ) + else: + logger( + f"Skipping matching for {extensions_str} resources, " + f"as there are {resource_count:,d}" + ) + + _match_purldb_resources( + project=project, + resources=resources, + matcher_func=matcher_func, + chunk_size=chunk_size, + logger=logger, + ) + + +def _match_purldb_resources( + project, to_resources, matcher_func, chunk_size=1000, logger=None +): + resource_count = to_resources.count() + resource_iterator = to_resources.iterator(chunk_size=chunk_size) + progress = LoopProgress(resource_count, logger) + total_matched_count = 0 + total_sha1_count = 0 + processed_resources_count = 0 + resources_by_sha1 = defaultdict(list) + package_data_by_purldb_urls = {} + + for to_resource in progress.iter(resource_iterator): + resources_by_sha1[to_resource.sha1].append(to_resource) + if to_resource.path.endswith(".map"): + for js_sha1 in js.source_content_sha1_list(to_resource): + resources_by_sha1[js_sha1].append(to_resource) + processed_resources_count += 1 + + if processed_resources_count % chunk_size == 0: + resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( + project=project, + resources_by_sha1=resources_by_sha1, + matcher_func=matcher_func, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + total_matched_count += matched_count + total_sha1_count += sha1_count + + if resources_by_sha1: + resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( + project=project, + resources_by_sha1=resources_by_sha1, + matcher_func=matcher_func, + package_data_by_purldb_urls=package_data_by_purldb_urls, + ) + total_matched_count += matched_count + total_sha1_count += sha1_count + + logger( + f"{total_matched_count:,d} resources matched in PurlDB " + f"using {total_sha1_count:,d} SHA1s" + ) + + +def match_purldb_directories(project, logger=None): + """Match against PurlDB selecting codebase directories.""" + # If we are able to get match results for a directory fingerprint, then that + # means every resource and directory under that directory is part of a + # Package. By starting from the root to/ directory, we are attempting to + # match as many files as we can before attempting to match further down. The + # more "higher-up" directories we can match to means that we reduce the + # number of queries made to purldb. 
+ to_directories = ( + project.codebaseresources.directories() + .no_status(status=flag.ABOUT_MAPPED) + .no_status(status=flag.MATCHED_TO_PURLDB_PACKAGE) + .order_by("path") + ) + directory_count = to_directories.count() + + if logger: + logger( + f"Matching {directory_count:,d} " + f"director{pluralize(directory_count, 'y,ies')} from to/ in PurlDB" + ) + + directory_iterator = to_directories.iterator(chunk_size=2000) + progress = LoopProgress(directory_count, logger) + + for directory in progress.iter(directory_iterator): + directory.refresh_from_db() + if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: + match_purldb_directory(project, directory) + + matched_count = ( + project.codebaseresources.directories() + .filter(status=flag.MATCHED_TO_PURLDB_DIRECTORY) + .count() + ) + logger( + f"{matched_count:,d} director{pluralize(matched_count, 'y,ies')} " + f"matched in PurlDB" + ) + + +def match_resources_with_no_java_source(project, logger=None): + """ + Match resources with ``no-java-source`` to PurlDB, if no match + is found update status to ``requires-review``. + """ + project_files = project.codebaseresources.files() + + to_no_java_source = project_files.to_codebase().filter(status=flag.NO_JAVA_SOURCE) + + if to_no_java_source: + resource_count = to_no_java_source.count() + if logger: + logger( + f"Mapping {resource_count:,d} to/ resources with {flag.NO_JAVA_SOURCE} " + "status in PurlDB using SHA1" + ) + + _match_purldb_resources( + project=project, + to_resources=to_no_java_source, + matcher_func=match_purldb_resource, + logger=logger, + ) + to_no_java_source.exclude(status=flag.MATCHED_TO_PURLDB_RESOURCE).update( + status=flag.REQUIRES_REVIEW + ) + + +def match_purldb_resources_post_process(project, logger=None): + """Choose the best package for PurlDB matched resources.""" + to_extract_directories = ( + project.codebaseresources.directories() + .to_codebase() + .filter(path__regex=r"^.*-extract$") + ) + + to_resources = project.codebaseresources.files().filter( + status=flag.MATCHED_TO_PURLDB_RESOURCE + ) + + resource_count = to_extract_directories.count() + + if logger: + logger( + f"Refining matching for {resource_count:,d} " + f"{flag.MATCHED_TO_PURLDB_RESOURCE} archives." + ) + + resource_iterator = to_extract_directories.iterator(chunk_size=2000) + progress = LoopProgress(resource_count, logger) + map_count = 0 + + for directory in progress.iter(resource_iterator): + map_count += _match_purldb_resources_post_process( + directory, to_extract_directories, to_resources + ) + + logger(f"{map_count:,d} resource processed") + + +def _match_purldb_resources_post_process( + directory_path, to_extract_directories, to_resources +): + # Exclude the content of nested archive. + interesting_codebase_resources = ( + to_resources.filter(path__startswith=directory_path) + .filter(status=flag.MATCHED_TO_PURLDB_RESOURCE) + .exclude(path__regex=rf"^{directory_path}.*-extract\/.*$") + ) + + if not interesting_codebase_resources: + return 0 + + packages_map = {} + + for resource in interesting_codebase_resources: + for package in resource.discovered_packages.all(): + if package in packages_map: + packages_map[package].append(resource) + else: + packages_map[package] = [resource] + + # Rank the packages by most number of matched resources. 
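    # Illustrative example (data made up for clarity): with packages_map equal
    # to {pkg_a: [r1, r2, r3], pkg_b: [r1]}, pkg_a is ranked first; once the
    # existing discovered_packages links are cleared below, r1, r2 and r3 are
    # re-assigned to pkg_a, pkg_b receives nothing because r1 is no longer
    # unmapped, and the pipeline later drops it in
    # remove_packages_without_resources().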
+ ranked_packages = dict( + sorted(packages_map.items(), key=lambda item: len(item[1]), reverse=True) + ) + + for resource in interesting_codebase_resources: + resource.discovered_packages.clear() + + for package, resources in ranked_packages.items(): + unmapped_resources = [ + resource + for resource in resources + if not resource.discovered_packages.exists() + ] + if unmapped_resources: + package.add_resources(unmapped_resources) + + return interesting_codebase_resources.count() diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py index 2b1c263c..eb14b3f7 100644 --- a/matchcodeio/settings.py +++ b/matchcodeio/settings.py @@ -15,7 +15,6 @@ "matchcode", ] - # Database DATABASES.update( diff --git a/requirements-dev.txt b/requirements-dev.txt index 4c61da6c..5ef9572c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -aboutcode-toolkit==10.1.0 +aboutcode-toolkit==9.0.0 black==23.11.0 et-xmlfile==1.1.0 execnet==2.0.2 diff --git a/setup.cfg b/setup.cfg index 1d064975..1d1ff095 100644 --- a/setup.cfg +++ b/setup.cfg @@ -87,4 +87,4 @@ console_scripts = purldb = purldb:command_line scancodeio_pipelines = - scan_and_fingerprint_package = matchcode_toolkit.pipelines.scan_and_fingerprint_package:ScanAndFingerprintPackage + matching = matchcode.pipelines.matching:Matching From 4a585580b22eccf4691d94d1afc754c656d22b61 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 29 Nov 2023 16:38:21 -0800 Subject: [PATCH 13/54] Define URL route for matchcode.io #224 * Create MatchingViewSet Signed-off-by: Jono Yang --- .gitignore | 2 ++ matchcode/api.py | 18 ++++++++++++++++++ matchcode/pipes/matching.py | 8 ++++---- matchcodeio/settings.py | 7 ++++++- matchcodeio/urls.py | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 matchcodeio/urls.py diff --git a/.gitignore b/.gitignore index db8d038c..391f0819 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,5 @@ tcl # Env Files .env + +var/ diff --git a/matchcode/api.py b/matchcode/api.py index 68844b8d..11c6c5db 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -12,6 +12,8 @@ from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet +from rest_framework import mixins +from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField @@ -31,6 +33,10 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex +from scanpipe.api.serializers import ProjectSerializer +from scanpipe.api.views import ProjectFilterSet +from scanpipe.models import Project + class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -307,3 +313,15 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): queryset = ApproximateDirectoryStructureIndex.objects.all() serializer_class = ApproximateDirectoryStructureIndexSerializer filterset_class = ApproximateDirectoryStructureFilterSet + + +class MatchingViewSet( + mixins.CreateModelMixin, + mixins.RetrieveModelMixin, + mixins.DestroyModelMixin, + mixins.ListModelMixin, + viewsets.GenericViewSet, +): + queryset = Project.objects.all() + serializer_class = ProjectSerializer + filterset_class = ProjectFilterSet diff --git a/matchcode/pipes/matching.py b/matchcode/pipes/matching.py index 566cc1d2..49de16d4 100644 --- 
a/matchcode/pipes/matching.py +++ b/matchcode/pipes/matching.py @@ -238,10 +238,10 @@ def match_purldb_resources( def _match_purldb_resources( - project, to_resources, matcher_func, chunk_size=1000, logger=None + project, resources, matcher_func, chunk_size=1000, logger=None ): - resource_count = to_resources.count() - resource_iterator = to_resources.iterator(chunk_size=chunk_size) + resource_count = resources.count() + resource_iterator = resources.iterator(chunk_size=chunk_size) progress = LoopProgress(resource_count, logger) total_matched_count = 0 total_sha1_count = 0 @@ -342,7 +342,7 @@ def match_resources_with_no_java_source(project, logger=None): _match_purldb_resources( project=project, - to_resources=to_no_java_source, + resources=to_no_java_source, matcher_func=match_purldb_resource, logger=logger, ) diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py index eb14b3f7..2ba549ff 100644 --- a/matchcodeio/settings.py +++ b/matchcodeio/settings.py @@ -11,8 +11,11 @@ INSTALLED_APPS += [ - "packagedb", + "clearcode", + "clearindex", "matchcode", + "minecode", + "packagedb", ] # Database @@ -32,3 +35,5 @@ ) DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] + +ROOT_URLCONF = 'matchcodeio.urls' diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py new file mode 100644 index 00000000..e0cf419f --- /dev/null +++ b/matchcodeio/urls.py @@ -0,0 +1,36 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from django.urls import include +from django.urls import path +from django.views.generic import RedirectView +from rest_framework import routers + +from clearcode.api import CDitemViewSet +from packagedb.api import PackageViewSet +from packagedb.api import PackageSetViewSet +from packagedb.api import ResourceViewSet +from matchcode.api import MatchingViewSet +from minecode.api import PriorityResourceURIViewSet +from scanpipe.api.views import RunViewSet + + +api_router = routers.DefaultRouter() +api_router.register('packages', PackageViewSet) +api_router.register('package_sets', PackageSetViewSet) +api_router.register('resources', ResourceViewSet) +api_router.register('matching', MatchingViewSet) +api_router.register('runs', RunViewSet) +api_router.register('cditems', CDitemViewSet, 'cditems') +api_router.register('on_demand_queue', PriorityResourceURIViewSet) + +urlpatterns = [ + path('api/', include(api_router.urls)), + path('', RedirectView.as_view(url='api/')), +] From 91a31a92a80b15210ab24fc403458f893db1dc19 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 29 Nov 2023 17:13:52 -0800 Subject: [PATCH 14/54] Create MatchingSerializer #224 Signed-off-by: Jono Yang --- matchcode/api.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 11c6c5db..2361ead1 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -16,6 +16,7 @@ from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response +from rest_framework import serializers from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField @@ -36,7 +37,7 @@ from 
scanpipe.api.serializers import ProjectSerializer from scanpipe.api.views import ProjectFilterSet from scanpipe.models import Project - +from scanpipe.pipes.fetch import fetch_urls class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -315,6 +316,36 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): filterset_class = ApproximateDirectoryStructureFilterSet +class MatchingSerializer(ProjectSerializer): + def create(self, validated_data, matching_pipeline_name='matching2'): + """ + Create a new `project` with `upload_file`, using the `matching2` pipeline + """ + execute_now = True + upload_file = validated_data.pop("upload_file", None) + input_urls = validated_data.pop("input_urls", []) + webhook_url = validated_data.pop("webhook_url", None) + + downloads, errors = fetch_urls(input_urls) + if errors: + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) + + project = super().create(validated_data) + + if upload_file: + project.add_uploads([upload_file]) + + if downloads: + project.add_downloads(downloads) + + if webhook_url: + project.add_webhook_subscription(webhook_url) + + project.add_pipeline(matching_pipeline_name, execute_now) + + return project + + class MatchingViewSet( mixins.CreateModelMixin, mixins.RetrieveModelMixin, @@ -323,5 +354,5 @@ class MatchingViewSet( viewsets.GenericViewSet, ): queryset = Project.objects.all() - serializer_class = ProjectSerializer + serializer_class = MatchingSerializer filterset_class = ProjectFilterSet From 51a35d21296cf197564a662706af9ee2f21e83ed Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 30 Nov 2023 16:43:35 -0800 Subject: [PATCH 15/54] Serialize more fields using MatchingSerializer #224 Signed-off-by: Jono Yang --- matchcode/api.py | 94 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 5 deletions(-) diff --git a/matchcode/api.py b/matchcode/api.py index 2361ead1..8afcb498 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -6,13 +6,14 @@ # See https://github.com/nexB/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# - +from uuid import uuid4 from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet from rest_framework import mixins +from rest_framework import renderers from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response @@ -34,10 +35,14 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex -from scanpipe.api.serializers import ProjectSerializer +from scanpipe.api.serializers import RunSerializer +from scanpipe.api.serializers import StrListField from scanpipe.api.views import ProjectFilterSet from scanpipe.models import Project +from scanpipe.pipes import count_group_by from scanpipe.pipes.fetch import fetch_urls +from scanpipe.views import project_results_json_response + class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -316,12 +321,73 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): filterset_class = ApproximateDirectoryStructureFilterSet -class MatchingSerializer(ProjectSerializer): - def create(self, validated_data, matching_pipeline_name='matching2'): +class MatchingSerializer(serializers.ModelSerializer): + upload_file = serializers.FileField(write_only=True, required=False) + input_urls = StrListField( + write_only=True, + required=False, + style={"base_template": "textarea.html"}, + ) + webhook_url = serializers.CharField(write_only=True, required=False) + runs = RunSerializer(many=True, read_only=True) + input_sources = serializers.JSONField(source="input_sources_list", read_only=True) + codebase_resources_summary = serializers.SerializerMethodField() + discovered_packages_summary = serializers.SerializerMethodField() + discovered_dependencies_summary = serializers.SerializerMethodField() + codebase_relations_summary = serializers.SerializerMethodField() + + class Meta: + model = Project + fields = ( + 'url', + 'uuid', + "upload_file", + "input_urls", + "webhook_url", + "created_date", + "input_sources", + "runs", + "resource_count", + "package_count", + "dependency_count", + "relation_count", + "codebase_resources_summary", + "discovered_packages_summary", + "discovered_dependencies_summary", + "codebase_relations_summary", + ) + + def get_codebase_resources_summary(self, project): + queryset = project.codebaseresources.all() + return count_group_by(queryset, "status") + + def get_discovered_packages_summary(self, project): + base_qs = project.discoveredpackages + return { + "total": base_qs.count(), + "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), + "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), + } + + def get_discovered_dependencies_summary(self, project): + base_qs = project.discovereddependencies + return { + "total": base_qs.count(), + "is_runtime": base_qs.filter(is_runtime=True).count(), + "is_optional": base_qs.filter(is_optional=True).count(), + "is_resolved": base_qs.filter(is_resolved=True).count(), + } + + def get_codebase_relations_summary(self, project): + queryset = project.codebaserelations.all() + return count_group_by(queryset, "map_type") + + def create(self, validated_data, matching_pipeline_name='matching'): """ - Create a new `project` with `upload_file`, using the `matching2` pipeline + Create a new `project` with `upload_file`, using 
the `matching` pipeline """ execute_now = True + validated_data['name'] = uuid4() upload_file = validated_data.pop("upload_file", None) input_urls = validated_data.pop("input_urls", []) webhook_url = validated_data.pop("webhook_url", None) @@ -356,3 +422,21 @@ class MatchingViewSet( queryset = Project.objects.all() serializer_class = MatchingSerializer filterset_class = ProjectFilterSet + + def get_queryset(self): + return ( + super() + .get_queryset() + .prefetch_related( + "runs", + ) + ) + + @action(detail=True, renderer_classes=[renderers.JSONRenderer]) + def results(self, request, *args, **kwargs): + """ + Return the results compatible with ScanCode data format. + The content is returned as a stream of JSON content using the + JSONResultsGenerator class. + """ + return project_results_json_response(self.get_object()) From faa4e603bb555ae911b3f78150f7c90d7b78d69f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 30 Nov 2023 17:58:36 -0800 Subject: [PATCH 16/54] Update matching pipeline #224 * Use JSON input to build codebase Signed-off-by: Jono Yang --- matchcode/pipelines/matching.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/matchcode/pipelines/matching.py b/matchcode/pipelines/matching.py index 2dd06ab6..3f2b93cb 100644 --- a/matchcode/pipelines/matching.py +++ b/matchcode/pipelines/matching.py @@ -20,12 +20,13 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase from matchcode.pipes import matching from scanpipe.pipes import matchcode -class Matching(ScanCodebase): +class Matching(ScanCodebase, LoadInventory): """ Establish relationships between two code trees: deployment and development. 
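# A client-side sketch, assuming the MatchCode.io API from the preceding
# patches is deployed and reachable at http://127.0.0.1/api/matching/ (the
# URL and file name are illustrative). MatchingSerializer.create() queues
# the "matching" pipeline itself, and with this patch that pipeline builds
# its codebase from the uploaded ScanCode.io JSON scan via get_inputs() and
# build_inventory_from_scans() instead of extracting a source archive.
import requests

with open("scan.json", "rb") as scan_file:
    response = requests.post(
        "http://127.0.0.1/api/matching/",
        files={"upload_file": scan_file},
    )
response.raise_for_status()
print(response.json()["uuid"])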
@@ -38,9 +39,8 @@ class Matching(ScanCodebase): @classmethod def steps(cls): return ( - cls.copy_inputs_to_codebase_directory, - cls.extract_archives, - cls.collect_and_create_codebase_resources, + cls.get_inputs, + cls.build_inventory_from_scans, cls.fingerprint_codebase_directories, cls.flag_empty_files, cls.flag_ignored_resources, From cf7e164a5712051221cfe55d05c0b02e422c56de Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 4 Dec 2023 11:35:55 -0800 Subject: [PATCH 17/54] Only expose matching endpoint on matchcodeio #224 Signed-off-by: Jono Yang --- matchcodeio/urls.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py index e0cf419f..489105e3 100644 --- a/matchcodeio/urls.py +++ b/matchcodeio/urls.py @@ -12,25 +12,14 @@ from django.views.generic import RedirectView from rest_framework import routers -from clearcode.api import CDitemViewSet -from packagedb.api import PackageViewSet -from packagedb.api import PackageSetViewSet -from packagedb.api import ResourceViewSet from matchcode.api import MatchingViewSet -from minecode.api import PriorityResourceURIViewSet -from scanpipe.api.views import RunViewSet api_router = routers.DefaultRouter() -api_router.register('packages', PackageViewSet) -api_router.register('package_sets', PackageSetViewSet) -api_router.register('resources', ResourceViewSet) api_router.register('matching', MatchingViewSet) -api_router.register('runs', RunViewSet) -api_router.register('cditems', CDitemViewSet, 'cditems') -api_router.register('on_demand_queue', PriorityResourceURIViewSet) urlpatterns = [ path('api/', include(api_router.urls)), + path("", include("scanpipe.urls")), path('', RedirectView.as_view(url='api/')), ] From 377cd42fecc8df9e036ce578f5d95ad26b149f2e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 4 Dec 2023 12:44:18 -0800 Subject: [PATCH 18/54] Move Matching view to matchcodeio #224 * Fix urls.py Signed-off-by: Jono Yang --- matchcode/api.py | 134 ----------------------------------------- matchcodeio/api.py | 144 ++++++++++++++++++++++++++++++++++++++++++++ matchcodeio/urls.py | 4 +- 3 files changed, 147 insertions(+), 135 deletions(-) create mode 100644 matchcodeio/api.py diff --git a/matchcode/api.py b/matchcode/api.py index 8afcb498..9810645a 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -6,18 +6,13 @@ # See https://github.com/nexB/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# -from uuid import uuid4 from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet -from rest_framework import mixins -from rest_framework import renderers -from rest_framework import viewsets from rest_framework.decorators import action from rest_framework.response import Response -from rest_framework import serializers from rest_framework.serializers import CharField from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField @@ -35,14 +30,6 @@ from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex -from scanpipe.api.serializers import RunSerializer -from scanpipe.api.serializers import StrListField -from scanpipe.api.views import ProjectFilterSet -from scanpipe.models import Project -from scanpipe.pipes import count_group_by -from scanpipe.pipes.fetch import fetch_urls -from scanpipe.views import project_results_json_response - class BaseFileIndexSerializer(ModelSerializer): sha1 = CharField(source='fingerprint') @@ -319,124 +306,3 @@ class ApproximateDirectoryStructureIndexViewSet(BaseDirectoryIndexViewSet): queryset = ApproximateDirectoryStructureIndex.objects.all() serializer_class = ApproximateDirectoryStructureIndexSerializer filterset_class = ApproximateDirectoryStructureFilterSet - - -class MatchingSerializer(serializers.ModelSerializer): - upload_file = serializers.FileField(write_only=True, required=False) - input_urls = StrListField( - write_only=True, - required=False, - style={"base_template": "textarea.html"}, - ) - webhook_url = serializers.CharField(write_only=True, required=False) - runs = RunSerializer(many=True, read_only=True) - input_sources = serializers.JSONField(source="input_sources_list", read_only=True) - codebase_resources_summary = serializers.SerializerMethodField() - discovered_packages_summary = serializers.SerializerMethodField() - discovered_dependencies_summary = serializers.SerializerMethodField() - codebase_relations_summary = serializers.SerializerMethodField() - - class Meta: - model = Project - fields = ( - 'url', - 'uuid', - "upload_file", - "input_urls", - "webhook_url", - "created_date", - "input_sources", - "runs", - "resource_count", - "package_count", - "dependency_count", - "relation_count", - "codebase_resources_summary", - "discovered_packages_summary", - "discovered_dependencies_summary", - "codebase_relations_summary", - ) - - def get_codebase_resources_summary(self, project): - queryset = project.codebaseresources.all() - return count_group_by(queryset, "status") - - def get_discovered_packages_summary(self, project): - base_qs = project.discoveredpackages - return { - "total": base_qs.count(), - "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), - "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), - } - - def get_discovered_dependencies_summary(self, project): - base_qs = project.discovereddependencies - return { - "total": base_qs.count(), - "is_runtime": base_qs.filter(is_runtime=True).count(), - "is_optional": base_qs.filter(is_optional=True).count(), - "is_resolved": base_qs.filter(is_resolved=True).count(), - } - - def get_codebase_relations_summary(self, project): - queryset = project.codebaserelations.all() - return count_group_by(queryset, "map_type") - - def create(self, validated_data, 
matching_pipeline_name='matching'): - """ - Create a new `project` with `upload_file`, using the `matching` pipeline - """ - execute_now = True - validated_data['name'] = uuid4() - upload_file = validated_data.pop("upload_file", None) - input_urls = validated_data.pop("input_urls", []) - webhook_url = validated_data.pop("webhook_url", None) - - downloads, errors = fetch_urls(input_urls) - if errors: - raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) - - project = super().create(validated_data) - - if upload_file: - project.add_uploads([upload_file]) - - if downloads: - project.add_downloads(downloads) - - if webhook_url: - project.add_webhook_subscription(webhook_url) - - project.add_pipeline(matching_pipeline_name, execute_now) - - return project - - -class MatchingViewSet( - mixins.CreateModelMixin, - mixins.RetrieveModelMixin, - mixins.DestroyModelMixin, - mixins.ListModelMixin, - viewsets.GenericViewSet, -): - queryset = Project.objects.all() - serializer_class = MatchingSerializer - filterset_class = ProjectFilterSet - - def get_queryset(self): - return ( - super() - .get_queryset() - .prefetch_related( - "runs", - ) - ) - - @action(detail=True, renderer_classes=[renderers.JSONRenderer]) - def results(self, request, *args, **kwargs): - """ - Return the results compatible with ScanCode data format. - The content is returned as a stream of JSON content using the - JSONResultsGenerator class. - """ - return project_results_json_response(self.get_object()) diff --git a/matchcodeio/api.py b/matchcodeio/api.py new file mode 100644 index 00000000..b32bc74c --- /dev/null +++ b/matchcodeio/api.py @@ -0,0 +1,144 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# +from uuid import uuid4 + +from rest_framework import mixins +from rest_framework import renderers +from rest_framework import serializers +from rest_framework import viewsets +from rest_framework.decorators import action + +from scanpipe.api.serializers import RunSerializer +from scanpipe.api.serializers import StrListField +from scanpipe.api.views import ProjectFilterSet +from scanpipe.models import Project +from scanpipe.pipes import count_group_by +from scanpipe.pipes.fetch import fetch_urls +from scanpipe.views import project_results_json_response + + +class MatchingSerializer(serializers.ModelSerializer): + upload_file = serializers.FileField(write_only=True, required=False) + input_urls = StrListField( + write_only=True, + required=False, + style={"base_template": "textarea.html"}, + ) + webhook_url = serializers.CharField(write_only=True, required=False) + runs = RunSerializer(many=True, read_only=True) + input_sources = serializers.JSONField(source="input_sources_list", read_only=True) + codebase_resources_summary = serializers.SerializerMethodField() + discovered_packages_summary = serializers.SerializerMethodField() + discovered_dependencies_summary = serializers.SerializerMethodField() + codebase_relations_summary = serializers.SerializerMethodField() + + class Meta: + model = Project + fields = ( + 'url', + 'uuid', + "upload_file", + "input_urls", + "webhook_url", + "created_date", + "input_sources", + "runs", + "resource_count", + "package_count", + "dependency_count", + "relation_count", + "codebase_resources_summary", + "discovered_packages_summary", + "discovered_dependencies_summary", + "codebase_relations_summary", + ) + + def get_codebase_resources_summary(self, project): + queryset = project.codebaseresources.all() + return count_group_by(queryset, "status") + + def get_discovered_packages_summary(self, project): + base_qs = project.discoveredpackages + return { + "total": base_qs.count(), + "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), + "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), + } + + def get_discovered_dependencies_summary(self, project): + base_qs = project.discovereddependencies + return { + "total": base_qs.count(), + "is_runtime": base_qs.filter(is_runtime=True).count(), + "is_optional": base_qs.filter(is_optional=True).count(), + "is_resolved": base_qs.filter(is_resolved=True).count(), + } + + def get_codebase_relations_summary(self, project): + queryset = project.codebaserelations.all() + return count_group_by(queryset, "map_type") + + def create(self, validated_data, matching_pipeline_name='matching'): + """ + Create a new `project` with `upload_file`, using the `matching` pipeline + """ + execute_now = True + validated_data['name'] = uuid4() + upload_file = validated_data.pop("upload_file", None) + input_urls = validated_data.pop("input_urls", []) + webhook_url = validated_data.pop("webhook_url", None) + + downloads, errors = fetch_urls(input_urls) + if errors: + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) + + project = super().create(validated_data) + + if upload_file: + project.add_uploads([upload_file]) + + if downloads: + project.add_downloads(downloads) + + if webhook_url: + project.add_webhook_subscription(webhook_url) + + project.add_pipeline(matching_pipeline_name, execute_now) + + return project + + +class MatchingViewSet( + mixins.CreateModelMixin, + mixins.RetrieveModelMixin, + mixins.DestroyModelMixin, + mixins.ListModelMixin, + viewsets.GenericViewSet, 
+): + queryset = Project.objects.all() + serializer_class = MatchingSerializer + filterset_class = ProjectFilterSet + + def get_queryset(self): + return ( + super() + .get_queryset() + .prefetch_related( + "runs", + ) + ) + + @action(detail=True, renderer_classes=[renderers.JSONRenderer]) + def results(self, request, *args, **kwargs): + """ + Return the results compatible with ScanCode data format. + The content is returned as a stream of JSON content using the + JSONResultsGenerator class. + """ + return project_results_json_response(self.get_object()) diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py index 489105e3..45e3d921 100644 --- a/matchcodeio/urls.py +++ b/matchcodeio/urls.py @@ -12,11 +12,13 @@ from django.views.generic import RedirectView from rest_framework import routers -from matchcode.api import MatchingViewSet +from matchcodeio.api import MatchingViewSet +from scanpipe.api.views import RunViewSet api_router = routers.DefaultRouter() api_router.register('matching', MatchingViewSet) +api_router.register('runs', RunViewSet) urlpatterns = [ path('api/', include(api_router.urls)), From b476ce5dde490de3e76382fc031ba3cd5aa2b0da Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 5 Dec 2023 00:00:02 -0800 Subject: [PATCH 19/54] Move pipeline-related code to matchcodeio #224 Signed-off-by: Jono Yang --- Makefile | 3 ++- {matchcode => matchcodeio}/pipelines/__init__.py | 0 {matchcode => matchcodeio}/pipelines/matching.py | 2 +- {matchcode => matchcodeio}/pipes/__init__.py | 0 {matchcode => matchcodeio}/pipes/matching.py | 1 - 5 files changed, 3 insertions(+), 3 deletions(-) rename {matchcode => matchcodeio}/pipelines/__init__.py (100%) rename {matchcode => matchcodeio}/pipelines/matching.py (99%) rename {matchcode => matchcodeio}/pipes/__init__.py (100%) rename {matchcode => matchcodeio}/pipes/matching.py (99%) diff --git a/Makefile b/Makefile index 41a5073f..6cb5dd45 100644 --- a/Makefile +++ b/Makefile @@ -107,8 +107,9 @@ process_scans: test: @echo "-> Run the test suite" - ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore packagedb/tests/test_throttling.py + ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore packagedb/tests/test_throttling.py --ignore matchcodeio ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs packagedb/tests/test_throttling.py + ${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcodeio.settings ${PYTHON_EXE} -m pytest -vvs matchcodeio ${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines shell: diff --git a/matchcode/pipelines/__init__.py b/matchcodeio/pipelines/__init__.py similarity index 100% rename from matchcode/pipelines/__init__.py rename to matchcodeio/pipelines/__init__.py diff --git a/matchcode/pipelines/matching.py b/matchcodeio/pipelines/matching.py similarity index 99% rename from matchcode/pipelines/matching.py rename to matchcodeio/pipelines/matching.py index 3f2b93cb..e6d74dc1 100644 --- a/matchcode/pipelines/matching.py +++ b/matchcodeio/pipelines/matching.py @@ -22,7 +22,7 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcode.pipes import matching +from matchcodeio.pipes import matching from scanpipe.pipes import matchcode diff --git a/matchcode/pipes/__init__.py b/matchcodeio/pipes/__init__.py similarity index 100% rename 
from matchcode/pipes/__init__.py rename to matchcodeio/pipes/__init__.py diff --git a/matchcode/pipes/matching.py b/matchcodeio/pipes/matching.py similarity index 99% rename from matchcode/pipes/matching.py rename to matchcodeio/pipes/matching.py index 49de16d4..ee956567 100644 --- a/matchcode/pipes/matching.py +++ b/matchcodeio/pipes/matching.py @@ -29,7 +29,6 @@ from scanpipe.pipes import LoopProgress from scanpipe.pipes import flag from scanpipe.pipes import js -from scanpipe.pipes import purldb from matchcode.models import ApproximateDirectoryContentIndex from packagedb.models import Package From 3f1da4e0ba37e76975294170014596ce461e151a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 13 Dec 2023 11:21:35 -0800 Subject: [PATCH 20/54] Create separate manage.py and docker-related files #224 Signed-off-by: Jono Yang --- Dockerfile | 9 +- docker-compose-matchcodeio.yml | 83 +++++++++++ ...r-compose.yml => docker-compose_purldb.yml | 34 ++--- docker-compose_purldb_public.yml | 135 ++++++++++++++++++ docker_matchcode.env | 11 ++ docker.env => docker_purldb.env | 0 manage_matchcode.py | 19 +++ manage.py => manage_purldb.py | 0 manage_purldb_public.py | 19 +++ .../__init__.py | 0 {matchcodeio => matchcode_pipeline}/api.py | 0 .../pipelines/__init__.py | 0 .../pipelines/matching.py | 2 +- .../pipes/__init__.py | 0 .../pipes/matching.py | 0 matchcode_project/__init__.py | 0 .../dbrouter.py | 0 .../settings.py | 0 {matchcodeio => matchcode_project}/urls.py | 2 +- {matchcodeio => matchcode_project}/wsgi.py | 0 setup.cfg | 5 +- 21 files changed, 291 insertions(+), 28 deletions(-) create mode 100644 docker-compose-matchcodeio.yml rename docker-compose.yml => docker-compose_purldb.yml (70%) create mode 100644 docker-compose_purldb_public.yml create mode 100644 docker_matchcode.env rename docker.env => docker_purldb.env (100%) create mode 100644 manage_matchcode.py rename manage.py => manage_purldb.py (100%) create mode 100644 manage_purldb_public.py rename {matchcodeio => matchcode_pipeline}/__init__.py (100%) rename {matchcodeio => matchcode_pipeline}/api.py (100%) rename {matchcodeio => matchcode_pipeline}/pipelines/__init__.py (100%) rename {matchcodeio => matchcode_pipeline}/pipelines/matching.py (98%) rename {matchcodeio => matchcode_pipeline}/pipes/__init__.py (100%) rename {matchcodeio => matchcode_pipeline}/pipes/matching.py (100%) create mode 100644 matchcode_project/__init__.py rename {matchcodeio => matchcode_project}/dbrouter.py (100%) rename {matchcodeio => matchcode_project}/settings.py (100%) rename {matchcodeio => matchcode_project}/urls.py (94%) rename {matchcodeio => matchcode_project}/wsgi.py (100%) diff --git a/Dockerfile b/Dockerfile index 517f6105..2e55d972 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,10 +31,9 @@ RUN apt-get update \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* -COPY setup.cfg setup.py /app/ -RUN mkdir -p /app/matchcode-toolkit/src/ -COPY matchcode-toolkit/setup.cfg matchcode-toolkit/setup.py /app/matchcode-toolkit/ -RUN pip install -e matchcode-toolkit -RUN pip install -e . +# Install the dependencies before the codebase COPY for proper Docker layer caching +COPY setup.cfg setup.py requirements.txt /app/ +RUN pip install --no-cache-dir -c requirements.txt . +# Copy the codebase COPY . 
/app diff --git a/docker-compose-matchcodeio.yml b/docker-compose-matchcodeio.yml new file mode 100644 index 00000000..071f2bd2 --- /dev/null +++ b/docker-compose-matchcodeio.yml @@ -0,0 +1,83 @@ +version: "3" + +services: + db: + image: postgres:13 + env_file: + - docker.env + volumes: + - db_data:/var/lib/postgresql/data/ + shm_size: "1gb" + restart: always + + redis: + image: redis + # Enable redis data persistence using the "Append Only File" with the + # default policy of fsync every second. See https://redis.io/topics/persistence + command: redis-server --appendonly yes + volumes: + - redis_data:/data + restart: always + + web: + build: . + command: wait-for-it --strict --timeout=60 db:5432 -- sh -c " + python manage.py migrate && + python manage.py collectstatic --no-input --verbosity 0 --clear && + gunicorn matchcodeio.wsgi:application --bind :8000 --timeout 600 --workers 8" + environment: + - DJANGO_SETTINGS_MODULE=matchcodeio.settings + env_file: + - docker.env + expose: + - 8000 + volumes: + - .env:/opt/scancodeio/.env + - /etc/scancodeio/:/etc/scancodeio/ + - workspace:/var/scancodeio/workspace/ + - static:/var/scancodeio/static/ + depends_on: + - db + labels: + - "traefik.enable=true" + - "traefik.http.routers.matchcodeio.rule=(Host(`127.0.0.1`) && Path(`/project`))" + - "traefik.http.routers.matchcodeio.entrypoints=web" + + worker: + build: . + # Ensure that potential db migrations run first by waiting until "web" is up + command: wait-for-it --strict --timeout=120 web:8000 -- sh -c " + ./manage.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker + --queue-class scancodeio.worker.ScanCodeIOQueue + --verbosity 1" + env_file: + - docker.env + volumes: + - .env:/opt/scancodeio/.env + - /etc/scancodeio/:/etc/scancodeio/ + - workspace:/var/scancodeio/workspace/ + depends_on: + - redis + - db + - web + + nginx: + image: nginx:alpine + volumes: + - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ + - /var/www/html:/var/www/html + - static:/var/scancodeio/static/ + depends_on: + - web + restart: always + +networks: + default: + external: + name: purldb + +volumes: + db_data: + redis_data: + static: + workspace: diff --git a/docker-compose.yml b/docker-compose_purldb.yml similarity index 70% rename from docker-compose.yml rename to docker-compose_purldb.yml index 8bcae773..1bcc7c39 100644 --- a/docker-compose.yml +++ b/docker-compose_purldb.yml @@ -4,18 +4,18 @@ services: db: image: postgres:13 env_file: - - docker.env + - docker_purldb.env volumes: - db_data:/var/lib/postgresql/data/ web: build: . command: sh -c " - python manage.py migrate && - python manage.py collectstatic --no-input --verbosity 0 --clear && + python manage_purldb.py migrate && + python manage_purldb.py collectstatic --no-input --verbosity 0 --clear && gunicorn purldb.wsgi:application --bind :8000 --timeout 600 --workers 8" env_file: - - docker.env + - docker_purldb.env expose: - 8000 volumes: @@ -27,10 +27,10 @@ services: visitor: build: . command: sh -c " - wait-for-it web:8000 -- python manage.py seed && - python manage.py run_visit --ignore-robots --ignore-throttle" + wait-for-it web:8000 -- python manage_purldb.py seed && + python manage_purldb.py run_visit --ignore-robots --ignore-throttle" env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -41,9 +41,9 @@ services: mapper: build: . 
- command: wait-for-it web:8000 -- python manage.py run_map + command: wait-for-it web:8000 -- python manage_purldb.py run_map env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -56,7 +56,7 @@ services: build: . command: wait-for-it web:8000 -- clearsync --save-to-db --verbose -n 3 env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -67,7 +67,7 @@ services: clearindex: build: . - command: wait-for-it web:8000 -- python manage.py run_clearindex + command: wait-for-it web:8000 -- python manage_purldb.py run_clearindex profiles: - clearsync depends_on: @@ -76,9 +76,9 @@ services: request_scan: build: . - command: wait-for-it web:8000 -- python manage.py request_scans + command: wait-for-it web:8000 -- python manage_purldb.py request_scans env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -89,9 +89,9 @@ services: process_scan: build: . - command: wait-for-it web:8000 -- python manage.py process_scans + command: wait-for-it web:8000 -- python manage_purldb.py process_scans env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -102,9 +102,9 @@ services: priority_queue: build: . - command: wait-for-it web:8000 -- python manage.py priority_queue + command: wait-for-it web:8000 -- python manage_purldb.py priority_queue env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: diff --git a/docker-compose_purldb_public.yml b/docker-compose_purldb_public.yml new file mode 100644 index 00000000..f8bd2fcc --- /dev/null +++ b/docker-compose_purldb_public.yml @@ -0,0 +1,135 @@ +version: "3" + +services: + db: + image: postgres:13 + env_file: + - docker.env + volumes: + - db_data:/var/lib/postgresql/data/ + - ./etc/postgresql/postgresql.conf:/var/lib/postgresql/data/postgresql.conf + + web: + build: . + command: sh -c " + python manage_purldb_public.py migrate && + python manage_purldb_public.py collectstatic --no-input --verbosity 0 --clear && + gunicorn purldb_public_project.wsgi:application --bind :8000 --timeout 600 --workers 8" + env_file: + - docker.env + expose: + - 8000 + volumes: + - /etc/purldb/:/etc/purldb/ + - static:/var/purldb/static/ + depends_on: + - db + + visitor: + build: . + command: sh -c " + wait-for-it web:8000 -- python manage_purldb_public.py seed && + python manage_purldb_public.py run_visit --ignore-robots --ignore-throttle" + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - visit_and_map + depends_on: + - db + - web # Ensure that potential db migrations run first + + mapper: + build: . + command: wait-for-it web:8000 -- python manage_purldb_public.py run_map + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - visit_and_map + depends_on: + - db + - web # Ensure that potential db migrations run first + + clearsync: + build: . + command: wait-for-it web:8000 -- clearsync --save-to-db --verbose -n 3 + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - clearsync + depends_on: + - db + - web # Ensure that potential db migrations run first + + clearindex: + build: . + command: wait-for-it web:8000 -- python manage_purldb_public.py run_clearindex + profiles: + - clearsync + depends_on: + - db + - web # Ensure that potential db migrations run first + + request_scan: + build: . 
+ command: wait-for-it web:8000 -- python manage_purldb_public.py request_scans + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - scan_queue + depends_on: + - db + - web + + process_scan: + build: . + command: wait-for-it web:8000 -- python manage_purldb_public.py process_scans + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - scan_queue + depends_on: + - db + - web + + priority_queue: + build: . + command: wait-for-it web:8000 -- python manage_purldb_public.py priority_queue + env_file: + - docker.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - priority_queue + depends_on: + - db + - web + + nginx: + image: nginx + ports: + - 80:80 + - 443:443 + volumes: + - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ + - static:/var/purldb/static/ + depends_on: + - web + +networks: + default: + name: purldb + external: true + +volumes: + db_data: + static: diff --git a/docker_matchcode.env b/docker_matchcode.env new file mode 100644 index 00000000..15ad29fa --- /dev/null +++ b/docker_matchcode.env @@ -0,0 +1,11 @@ +POSTGRES_DB=packagedb +POSTGRES_USER=packagedb +POSTGRES_PASSWORD=packagedb +POSTGRES_INITDB_ARGS=--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 + +PACKAGEDB_DB_HOST=db + +SCANCODEIO_DB_HOST=db +SCANCODEIO_REDIS_HOST=redis +SCANCODEIO_ASYNC=True +SCANCODEIO_WORKSPACE_LOCATION=/var/scancodeio/workspace/ diff --git a/docker.env b/docker_purldb.env similarity index 100% rename from docker.env rename to docker_purldb.env diff --git a/manage_matchcode.py b/manage_matchcode.py new file mode 100644 index 00000000..b19f6acb --- /dev/null +++ b/manage_matchcode.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os +import sys + + +if __name__ == '__main__': + from django.core.management import execute_from_command_line + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'matchcode_project.settings') + execute_from_command_line(sys.argv) diff --git a/manage.py b/manage_purldb.py similarity index 100% rename from manage.py rename to manage_purldb.py diff --git a/manage_purldb_public.py b/manage_purldb_public.py new file mode 100644 index 00000000..ba7f0bbb --- /dev/null +++ b/manage_purldb_public.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os +import sys + + +if __name__ == '__main__': + from django.core.management import execute_from_command_line + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_public_project.settings') + execute_from_command_line(sys.argv) diff --git a/matchcodeio/__init__.py b/matchcode_pipeline/__init__.py similarity index 100% rename from matchcodeio/__init__.py rename to matchcode_pipeline/__init__.py diff --git a/matchcodeio/api.py b/matchcode_pipeline/api.py similarity index 100% rename from matchcodeio/api.py rename to matchcode_pipeline/api.py diff --git a/matchcodeio/pipelines/__init__.py b/matchcode_pipeline/pipelines/__init__.py similarity index 100% rename from matchcodeio/pipelines/__init__.py rename to matchcode_pipeline/pipelines/__init__.py diff --git a/matchcodeio/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py similarity index 98% rename from matchcodeio/pipelines/matching.py rename to matchcode_pipeline/pipelines/matching.py index e6d74dc1..a605e9c5 100644 --- a/matchcodeio/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -22,7 +22,7 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcodeio.pipes import matching +from matchcodeio_project.pipes import matching from scanpipe.pipes import matchcode diff --git a/matchcodeio/pipes/__init__.py b/matchcode_pipeline/pipes/__init__.py similarity index 100% rename from matchcodeio/pipes/__init__.py rename to matchcode_pipeline/pipes/__init__.py diff --git a/matchcodeio/pipes/matching.py b/matchcode_pipeline/pipes/matching.py similarity index 100% rename from matchcodeio/pipes/matching.py rename to matchcode_pipeline/pipes/matching.py diff --git a/matchcode_project/__init__.py b/matchcode_project/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/matchcodeio/dbrouter.py b/matchcode_project/dbrouter.py similarity index 100% rename from matchcodeio/dbrouter.py rename to matchcode_project/dbrouter.py diff --git a/matchcodeio/settings.py b/matchcode_project/settings.py similarity index 100% rename from matchcodeio/settings.py rename to matchcode_project/settings.py diff --git a/matchcodeio/urls.py b/matchcode_project/urls.py similarity index 94% rename from matchcodeio/urls.py rename to matchcode_project/urls.py index 45e3d921..1f3ad35b 100644 --- a/matchcodeio/urls.py +++ b/matchcode_project/urls.py @@ -12,7 +12,7 @@ from django.views.generic import RedirectView from rest_framework import routers -from matchcodeio.api import MatchingViewSet +from matchcode_pipeline.api import MatchingViewSet from scanpipe.api.views import RunViewSet diff --git a/matchcodeio/wsgi.py b/matchcode_project/wsgi.py similarity index 100% rename from matchcodeio/wsgi.py rename to matchcode_project/wsgi.py diff --git a/setup.cfg b/setup.cfg index 1d1ff095..65feed8c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,7 @@ install_requires = urlpy == 0.5 matchcode-toolkit >= 1.1.1 univers == 30.11.0 + scancodeio setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 @@ -65,9 +66,6 @@ python_requires = >=3.8 where = . 
[options.extras_require] -matchcodeio = - scancodeio - testing = pytest >= 6, != 7.0.0 pytest-xdist >= 2 @@ -75,7 +73,6 @@ testing = aboutcode-toolkit >= 6.0.0 black mock - scancodeio docs = Sphinx==5.0.2 From 7ecc0ad755d89e713fe01dd8db797f0737e2ae3f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 13 Dec 2023 15:58:04 -0800 Subject: [PATCH 21/54] Update module names and references #224 Signed-off-by: Jono Yang --- Makefile | 6 +++--- matchcode_pipeline/pipelines/matching.py | 2 +- matchcode_project/settings.py | 4 ++-- setup.cfg | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 6cb5dd45..efcab3a3 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ # Python version can be specified with `$ PYTHON_EXE=python3.x make conf` PYTHON_EXE?=python3 VENV=venv -MANAGE=${VENV}/bin/python manage.py +MANAGE=${VENV}/bin/python manage_purldb.py ACTIVATE?=. ${VENV}/bin/activate; VIRTUALENV_PYZ=../etc/thirdparty/virtualenv.pyz # Do not depend on Python to generate the SECRET_KEY @@ -107,9 +107,9 @@ process_scans: test: @echo "-> Run the test suite" - ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore packagedb/tests/test_throttling.py --ignore matchcodeio + ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore packagedb/tests/test_throttling.py --ignore matchcode_pipeline --ignore matchcode_project ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs packagedb/tests/test_throttling.py - ${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcodeio.settings ${PYTHON_EXE} -m pytest -vvs matchcodeio + ${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcode_project.settings ${PYTHON_EXE} -m pytest -vvs matchcode_pipeline ${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines shell: diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index a605e9c5..f4461a86 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -22,7 +22,7 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcodeio_project.pipes import matching +from matchcode_pipeline.pipes import matching from scanpipe.pipes import matchcode diff --git a/matchcode_project/settings.py b/matchcode_project/settings.py index 2ba549ff..c3b136a4 100644 --- a/matchcode_project/settings.py +++ b/matchcode_project/settings.py @@ -34,6 +34,6 @@ } ) -DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] +DATABASE_ROUTERS = ["matchcode_project.dbrouter.PackageDBReadOnlyRouter",] -ROOT_URLCONF = 'matchcodeio.urls' +ROOT_URLCONF = 'matchcode_project.urls' diff --git a/setup.cfg b/setup.cfg index 65feed8c..0111f8c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,4 +84,4 @@ console_scripts = purldb = purldb:command_line scancodeio_pipelines = - matching = matchcode.pipelines.matching:Matching + matching = matchcode_pipeline.pipelines.matching:Matching From 8fd48b6714400947fde8f220a4e91fb6d6c30629 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 13 Dec 2023 17:50:25 -0800 Subject: [PATCH 22/54] Fix docker-compose.yml files #224 Signed-off-by: Jono Yang --- ...odeio.yml => docker-compose_matchcodeio.yml | 18 ++++++++++-------- docker-compose_purldb.yml | 2 +- docker-compose_purldb_public.yml | 17 ++++++++--------- 3 files 
changed, 19 insertions(+), 18 deletions(-) rename docker-compose-matchcodeio.yml => docker-compose_matchcodeio.yml (77%) diff --git a/docker-compose-matchcodeio.yml b/docker-compose_matchcodeio.yml similarity index 77% rename from docker-compose-matchcodeio.yml rename to docker-compose_matchcodeio.yml index 071f2bd2..9038d1f0 100644 --- a/docker-compose-matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -4,7 +4,7 @@ services: db: image: postgres:13 env_file: - - docker.env + - docker_matchcode.env volumes: - db_data:/var/lib/postgresql/data/ shm_size: "1gb" @@ -22,13 +22,13 @@ services: web: build: . command: wait-for-it --strict --timeout=60 db:5432 -- sh -c " - python manage.py migrate && - python manage.py collectstatic --no-input --verbosity 0 --clear && - gunicorn matchcodeio.wsgi:application --bind :8000 --timeout 600 --workers 8" + python manage_matchcode.py migrate && + python manage_matchcode.py collectstatic --no-input --verbosity 0 --clear && + gunicorn matchcode_project.wsgi:application --bind :8000 --timeout 600 --workers 8" environment: - - DJANGO_SETTINGS_MODULE=matchcodeio.settings + - DJANGO_SETTINGS_MODULE=matchcode_project.settings env_file: - - docker.env + - docker_matchcode.env expose: - 8000 volumes: @@ -47,11 +47,13 @@ services: build: . # Ensure that potential db migrations run first by waiting until "web" is up command: wait-for-it --strict --timeout=120 web:8000 -- sh -c " - ./manage.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker + ./manage_matchcode.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker --queue-class scancodeio.worker.ScanCodeIOQueue --verbosity 1" + environment: + - DJANGO_SETTINGS_MODULE=matchcode_project.settings env_file: - - docker.env + - docker_matchcode.env volumes: - .env:/opt/scancodeio/.env - /etc/scancodeio/:/etc/scancodeio/ diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index 1bcc7c39..68b1f8ea 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -13,7 +13,7 @@ services: command: sh -c " python manage_purldb.py migrate && python manage_purldb.py collectstatic --no-input --verbosity 0 --clear && - gunicorn purldb.wsgi:application --bind :8000 --timeout 600 --workers 8" + gunicorn purldb_project.wsgi:application --bind :8000 --timeout 600 --workers 8" env_file: - docker_purldb.env expose: diff --git a/docker-compose_purldb_public.yml b/docker-compose_purldb_public.yml index f8bd2fcc..becbd82f 100644 --- a/docker-compose_purldb_public.yml +++ b/docker-compose_purldb_public.yml @@ -4,10 +4,9 @@ services: db: image: postgres:13 env_file: - - docker.env + - docker_purldb.env volumes: - db_data:/var/lib/postgresql/data/ - - ./etc/postgresql/postgresql.conf:/var/lib/postgresql/data/postgresql.conf web: build: . @@ -16,7 +15,7 @@ services: python manage_purldb_public.py collectstatic --no-input --verbosity 0 --clear && gunicorn purldb_public_project.wsgi:application --bind :8000 --timeout 600 --workers 8" env_file: - - docker.env + - docker_purldb.env expose: - 8000 volumes: @@ -31,7 +30,7 @@ services: wait-for-it web:8000 -- python manage_purldb_public.py seed && python manage_purldb_public.py run_visit --ignore-robots --ignore-throttle" env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -44,7 +43,7 @@ services: build: . command: wait-for-it web:8000 -- python manage_purldb_public.py run_map env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -57,7 +56,7 @@ services: build: . 
command: wait-for-it web:8000 -- clearsync --save-to-db --verbose -n 3 env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -79,7 +78,7 @@ services: build: . command: wait-for-it web:8000 -- python manage_purldb_public.py request_scans env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -92,7 +91,7 @@ services: build: . command: wait-for-it web:8000 -- python manage_purldb_public.py process_scans env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: @@ -105,7 +104,7 @@ services: build: . command: wait-for-it web:8000 -- python manage_purldb_public.py priority_queue env_file: - - docker.env + - docker_purldb.env volumes: - /etc/purldb/:/etc/purldb/ profiles: From 7399bc446c151393ac370f14d085899c167f182f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 14 Dec 2023 15:30:10 -0800 Subject: [PATCH 23/54] Update docker-compose.yml files #224 Signed-off-by: Jono Yang --- docker-compose_purldb.yml | 33 +++++++-------------------------- docker-compose_traefik.yml | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 26 deletions(-) create mode 100644 docker-compose_traefik.yml diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index a609c602..3036dcff 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -1,26 +1,6 @@ version: "3" services: - reverse-proxy: - # The official v2 Traefik docker image - image: traefik:v2.10 - # Enables the web UI and tells Traefik to listen to docker - command: - - "--api.insecure=true" - - "--providers.docker" - - "--providers.docker.exposedbydefault=false" - - "--entrypoints.web.address=:80" - - "--entrypoints.websecure.address=:443" - ports: - # The HTTP port - - "80:80" - - "443:443" - # The Web UI (enabled by --api.insecure=true) - - "8080:8080" - volumes: - # So that Traefik can listen to the Docker events - - /var/run/docker.sock:/var/run/docker.sock - db: image: postgres:13 env_file: @@ -43,10 +23,6 @@ services: - static:/var/purldb/static/ depends_on: - db - labels: - - "traefik.enable=true" - - "traefik.http.routers.development.rule=Host(`127.0.0.1`) || Host(`localhost`)" - - "traefik.http.routers.development.entrypoints=web" visitor: build: . 
@@ -141,14 +117,19 @@ services: image: nginx labels: - "traefik.enable=true" - - "traefik.http.routers.staticfiles.rule=PathPrefix(`/static/`) || PathPrefix(`/media/`)" - - "traefik.http.routers.staticfiles.entrypoints=web" + - "traefik.http.routers.web.rule=Host(`127.0.0.1`) || Host(`localhost`)" + - "traefik.http.routers.web.entrypoints=web" volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ - static:/var/purldb/static/ depends_on: - web +networks: + default: + name: purldb + external: true + volumes: db_data: static: diff --git a/docker-compose_traefik.yml b/docker-compose_traefik.yml new file mode 100644 index 00000000..b6ee4c38 --- /dev/null +++ b/docker-compose_traefik.yml @@ -0,0 +1,18 @@ +version: "3.7" + +services: + traefik: + image: "traefik:v2.10" + container_name: "traefik" + hostname: "traefik" + ports: + - "80:80" + - "8080:8080" + volumes: + - "/var/run/docker.sock:/var/run/docker.sock:ro" + - "./traefik.yml:/traefik.yml:ro" + +networks: + default: + external: + name: purldb From f798cea41f12f7d72e8c5be4ca951a351cb8510d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 14 Dec 2023 19:50:33 -0800 Subject: [PATCH 24/54] Create multiple DBs in Postgres container #224 * Update PackageDB router Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 29 +++++++------------ docker-compose_purldb.yml | 1 + docker_matchcode.env | 1 + docker_purldb.env | 1 + .../create-multiple-postgresql-databases.sh | 25 ++++++++++++++++ matchcode_project/dbrouter.py | 11 ++++++- matchcode_project/settings.py | 2 +- matchcodeio/settings.py | 2 +- 8 files changed, 50 insertions(+), 22 deletions(-) create mode 100644 etc/multiple-databases/create-multiple-postgresql-databases.sh diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index 9038d1f0..cf17df63 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -1,16 +1,8 @@ version: "3" services: - db: - image: postgres:13 - env_file: - - docker_matchcode.env - volumes: - - db_data:/var/lib/postgresql/data/ - shm_size: "1gb" - restart: always - redis: + container_name: matchcodeio_redis image: redis # Enable redis data persistence using the "Append Only File" with the # default policy of fsync every second. See https://redis.io/topics/persistence @@ -20,9 +12,10 @@ services: restart: always web: + container_name: matchcodeio_web build: . command: wait-for-it --strict --timeout=60 db:5432 -- sh -c " - python manage_matchcode.py migrate && + python manage_matchcode.py migrate --database default && python manage_matchcode.py collectstatic --no-input --verbosity 0 --clear && gunicorn matchcode_project.wsgi:application --bind :8000 --timeout 600 --workers 8" environment: @@ -36,18 +29,13 @@ services: - /etc/scancodeio/:/etc/scancodeio/ - workspace:/var/scancodeio/workspace/ - static:/var/scancodeio/static/ - depends_on: - - db - labels: - - "traefik.enable=true" - - "traefik.http.routers.matchcodeio.rule=(Host(`127.0.0.1`) && Path(`/project`))" - - "traefik.http.routers.matchcodeio.entrypoints=web" worker: + container_name: matchcodeio_worker build: . 
# Ensure that potential db migrations run first by waiting until "web" is up command: wait-for-it --strict --timeout=120 web:8000 -- sh -c " - ./manage_matchcode.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker + python manage_matchcode.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker --queue-class scancodeio.worker.ScanCodeIOQueue --verbosity 1" environment: @@ -60,10 +48,10 @@ services: - workspace:/var/scancodeio/workspace/ depends_on: - redis - - db - web nginx: + container_name: matchcodeio_nginx image: nginx:alpine volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ @@ -72,6 +60,10 @@ services: depends_on: - web restart: always + labels: + - "traefik.enable=true" + - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`192.168.1.12`) && PathPrefix(`/api/matching`)" + - "traefik.http.routers.matchcodeio.entrypoints=web" networks: default: @@ -79,7 +71,6 @@ networks: name: purldb volumes: - db_data: redis_data: static: workspace: diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index 3036dcff..79634538 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -6,6 +6,7 @@ services: env_file: - docker_purldb.env volumes: + - ./etc/multiple-databases:/docker-entrypoint-initdb.d - db_data:/var/lib/postgresql/data/ web: diff --git a/docker_matchcode.env b/docker_matchcode.env index 15ad29fa..61919222 100644 --- a/docker_matchcode.env +++ b/docker_matchcode.env @@ -1,3 +1,4 @@ +POSTGRES_MULTIPLE_DATABASES=scancodeio:scancodeio:scancodeio POSTGRES_DB=packagedb POSTGRES_USER=packagedb POSTGRES_PASSWORD=packagedb diff --git a/docker_purldb.env b/docker_purldb.env index 1b588cc0..dcdd8fd8 100644 --- a/docker_purldb.env +++ b/docker_purldb.env @@ -1,3 +1,4 @@ +POSTGRES_MULTIPLE_DATABASES=scancodeio:scancodeio:scancodeio POSTGRES_DB=packagedb POSTGRES_USER=packagedb POSTGRES_PASSWORD=packagedb diff --git a/etc/multiple-databases/create-multiple-postgresql-databases.sh b/etc/multiple-databases/create-multiple-postgresql-databases.sh new file mode 100644 index 00000000..b582883a --- /dev/null +++ b/etc/multiple-databases/create-multiple-postgresql-databases.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -e +set -u + +function create_user_and_database() { + local dbinfo=$1 + IFS=":" read -r database user password <<< "$dbinfo" + + echo "Creating database '$database' with user '$user' and password '$password'" + psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL + CREATE USER $user; + ALTER USER $user WITH ENCRYPTED PASSWORD '$password'; + CREATE DATABASE $database; + GRANT ALL PRIVILEGES ON DATABASE $database TO $user; +EOSQL +} + +if [ -n "$POSTGRES_MULTIPLE_DATABASES" ]; then + echo "Multiple database creation requested: $POSTGRES_MULTIPLE_DATABASES" + for db in $(echo $POSTGRES_MULTIPLE_DATABASES | tr ',' ' '); do + create_user_and_database $db + done + echo "Multiple databases created" +fi diff --git a/matchcode_project/dbrouter.py b/matchcode_project/dbrouter.py index fb438c7e..3e837eb8 100644 --- a/matchcode_project/dbrouter.py +++ b/matchcode_project/dbrouter.py @@ -8,7 +8,7 @@ # -class PackageDBReadOnlyRouter(object): +class PackageDBRouter(object): app_labels = [ 'clearcode', 'clearindex', @@ -23,10 +23,19 @@ def db_for_read(self, model, **hints): return None def db_for_write(self, model, **hints): + if model._meta.app_label in self.app_labels: + return 'packagedb' return None def allow_relation(self, obj1, obj2, **hints): + if ( + 
obj1._meta.app_label in self.app_labels + or obj2._meta.app_label in self.app_labels + ): + return True return None def allow_migrate(self, db, app_label, model_name=None, **hints): + if app_label in self.app_labels: + return db == 'packagedb' return None diff --git a/matchcode_project/settings.py b/matchcode_project/settings.py index c3b136a4..5453b628 100644 --- a/matchcode_project/settings.py +++ b/matchcode_project/settings.py @@ -34,6 +34,6 @@ } ) -DATABASE_ROUTERS = ["matchcode_project.dbrouter.PackageDBReadOnlyRouter",] +DATABASE_ROUTERS = ["matchcode_project.dbrouter.PackageDBRouter",] ROOT_URLCONF = 'matchcode_project.urls' diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py index 2ba549ff..60f9b6a9 100644 --- a/matchcodeio/settings.py +++ b/matchcodeio/settings.py @@ -34,6 +34,6 @@ } ) -DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBReadOnlyRouter",] +DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBRouter",] ROOT_URLCONF = 'matchcodeio.urls' From f67041fef6f55d6936c326f1a39b9b84ad6706c4 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 15 Dec 2023 13:59:50 -0800 Subject: [PATCH 25/54] Create nginx conf for matchcode instance #224 * Update docker-compose.yml files Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 28 +- docker-compose_purldb.yml | 2 +- docker_matchcode.env | 2 +- etc/nginx/matchcodeio-conf.d/default.conf | 20 + matchcodeio/__init__.py | 0 matchcodeio/api.py | 144 -------- matchcodeio/dbrouter.py | 32 -- matchcodeio/pipelines/__init__.py | 0 matchcodeio/pipelines/matching.py | 109 ------ matchcodeio/pipes/__init__.py | 0 matchcodeio/pipes/matching.py | 424 ---------------------- matchcodeio/settings.py | 39 -- matchcodeio/urls.py | 27 -- matchcodeio/wsgi.py | 25 -- 14 files changed, 34 insertions(+), 818 deletions(-) create mode 100644 etc/nginx/matchcodeio-conf.d/default.conf delete mode 100644 matchcodeio/__init__.py delete mode 100644 matchcodeio/api.py delete mode 100644 matchcodeio/dbrouter.py delete mode 100644 matchcodeio/pipelines/__init__.py delete mode 100644 matchcodeio/pipelines/matching.py delete mode 100644 matchcodeio/pipes/__init__.py delete mode 100644 matchcodeio/pipes/matching.py delete mode 100644 matchcodeio/settings.py delete mode 100644 matchcodeio/urls.py delete mode 100644 matchcodeio/wsgi.py diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index cf17df63..114f8023 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -1,8 +1,7 @@ version: "3" services: - redis: - container_name: matchcodeio_redis + matchcodeio_redis: image: redis # Enable redis data persistence using the "Append Only File" with the # default policy of fsync every second. See https://redis.io/topics/persistence @@ -11,30 +10,28 @@ services: - redis_data:/data restart: always - web: - container_name: matchcodeio_web + matchcodeio_web: build: . 
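For context on the PackageDBRouter updated in the patch above (matchcode_project/dbrouter.py): once DATABASE_ROUTERS points at it, ORM traffic for the clearcode, clearindex, minecode, matchcode and packagedb apps goes to the "packagedb" connection, while ScanCode.io's own apps stay on the default database. A minimal sketch of that behavior, assuming standard Django router semantics and the models imported elsewhere in this series (packagedb.models.Package, scanpipe.models.Project); the sha1 value is a dummy:

# Sketch: query routing with
# DATABASE_ROUTERS = ["matchcode_project.dbrouter.PackageDBRouter"]
from packagedb.models import Package   # app_label "packagedb" -> routed
from scanpipe.models import Project    # ScanCode.io app -> default database

EMPTY_SHA1 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # sha1 of b"", dummy value

# db_for_read()/db_for_write() send PackageDB models to the "packagedb"
# connection, so the explicit .using("packagedb") seen in the matching pipes
# is equivalent to the plain manager call:
packages = Package.objects.filter(sha1__in=[EMPTY_SHA1])
same_packages = Package.objects.using("packagedb").filter(sha1__in=[EMPTY_SHA1])

# ScanCode.io's own models keep using the default database:
projects = Project.objects.all()

# allow_migrate() only returns True for the PackageDB apps on the "packagedb"
# alias, which is why the matchcodeio web service can run
# "manage_matchcode.py migrate --database default" without touching the
# PackageDB tables.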
command: wait-for-it --strict --timeout=60 db:5432 -- sh -c " python manage_matchcode.py migrate --database default && python manage_matchcode.py collectstatic --no-input --verbosity 0 --clear && - gunicorn matchcode_project.wsgi:application --bind :8000 --timeout 600 --workers 8" + gunicorn matchcode_project.wsgi:application --bind :8001 --timeout 600 --workers 8" environment: - DJANGO_SETTINGS_MODULE=matchcode_project.settings env_file: - docker_matchcode.env expose: - - 8000 + - 8001 volumes: - .env:/opt/scancodeio/.env - /etc/scancodeio/:/etc/scancodeio/ - workspace:/var/scancodeio/workspace/ - static:/var/scancodeio/static/ - worker: - container_name: matchcodeio_worker + matchcodeio_worker: build: . # Ensure that potential db migrations run first by waiting until "web" is up - command: wait-for-it --strict --timeout=120 web:8000 -- sh -c " + command: wait-for-it --strict --timeout=120 matchcodeio_web:8001 -- sh -c " python manage_matchcode.py rqworker --worker-class scancodeio.worker.ScanCodeIOWorker --queue-class scancodeio.worker.ScanCodeIOQueue --verbosity 1" @@ -47,22 +44,21 @@ services: - /etc/scancodeio/:/etc/scancodeio/ - workspace:/var/scancodeio/workspace/ depends_on: - - redis - - web + - matchcodeio_redis + - matchcodeio_web - nginx: - container_name: matchcodeio_nginx + matchcodeio_nginx: image: nginx:alpine volumes: - - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ + - ./etc/nginx/matchcodeio-conf.d/:/etc/nginx/conf.d/ - /var/www/html:/var/www/html - static:/var/scancodeio/static/ depends_on: - - web + - matchcodeio_web restart: always labels: - "traefik.enable=true" - - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`192.168.1.12`) && PathPrefix(`/api/matching`)" + - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`127.0.0.1`) && PathPrefix(`/project`) || Host(`localhost`) && PathPrefix(`/project`)" - "traefik.http.routers.matchcodeio.entrypoints=web" networks: diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index 79634538..dfd0ffe5 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -118,7 +118,7 @@ services: image: nginx labels: - "traefik.enable=true" - - "traefik.http.routers.web.rule=Host(`127.0.0.1`) || Host(`localhost`)" + - "traefik.http.routers.web.rule=Host(`127.0.0.1`) || Host(`localhost`) || Host(`192.168.1.12`)" - "traefik.http.routers.web.entrypoints=web" volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ diff --git a/docker_matchcode.env b/docker_matchcode.env index 61919222..77b4284e 100644 --- a/docker_matchcode.env +++ b/docker_matchcode.env @@ -7,6 +7,6 @@ POSTGRES_INITDB_ARGS=--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US. 
PACKAGEDB_DB_HOST=db SCANCODEIO_DB_HOST=db -SCANCODEIO_REDIS_HOST=redis +SCANCODEIO_REDIS_HOST=matchcodeio_redis SCANCODEIO_ASYNC=True SCANCODEIO_WORKSPACE_LOCATION=/var/scancodeio/workspace/ diff --git a/etc/nginx/matchcodeio-conf.d/default.conf b/etc/nginx/matchcodeio-conf.d/default.conf new file mode 100644 index 00000000..e86c15cf --- /dev/null +++ b/etc/nginx/matchcodeio-conf.d/default.conf @@ -0,0 +1,20 @@ +upstream matchcodeio_app { + server matchcodeio_web:8001; +} + +server { + listen 80; + + location / { + proxy_pass http://matchcodeio_app; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + proxy_redirect off; + client_max_body_size 10G; + proxy_read_timeout 600s; + } + + location /static/ { + alias /var/purldb/static/; + } +} \ No newline at end of file diff --git a/matchcodeio/__init__.py b/matchcodeio/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/matchcodeio/api.py b/matchcodeio/api.py deleted file mode 100644 index b32bc74c..00000000 --- a/matchcodeio/api.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# -from uuid import uuid4 - -from rest_framework import mixins -from rest_framework import renderers -from rest_framework import serializers -from rest_framework import viewsets -from rest_framework.decorators import action - -from scanpipe.api.serializers import RunSerializer -from scanpipe.api.serializers import StrListField -from scanpipe.api.views import ProjectFilterSet -from scanpipe.models import Project -from scanpipe.pipes import count_group_by -from scanpipe.pipes.fetch import fetch_urls -from scanpipe.views import project_results_json_response - - -class MatchingSerializer(serializers.ModelSerializer): - upload_file = serializers.FileField(write_only=True, required=False) - input_urls = StrListField( - write_only=True, - required=False, - style={"base_template": "textarea.html"}, - ) - webhook_url = serializers.CharField(write_only=True, required=False) - runs = RunSerializer(many=True, read_only=True) - input_sources = serializers.JSONField(source="input_sources_list", read_only=True) - codebase_resources_summary = serializers.SerializerMethodField() - discovered_packages_summary = serializers.SerializerMethodField() - discovered_dependencies_summary = serializers.SerializerMethodField() - codebase_relations_summary = serializers.SerializerMethodField() - - class Meta: - model = Project - fields = ( - 'url', - 'uuid', - "upload_file", - "input_urls", - "webhook_url", - "created_date", - "input_sources", - "runs", - "resource_count", - "package_count", - "dependency_count", - "relation_count", - "codebase_resources_summary", - "discovered_packages_summary", - "discovered_dependencies_summary", - "codebase_relations_summary", - ) - - def get_codebase_resources_summary(self, project): - queryset = project.codebaseresources.all() - return count_group_by(queryset, "status") - - def get_discovered_packages_summary(self, project): - base_qs = project.discoveredpackages - return { - "total": base_qs.count(), - "with_missing_resources": base_qs.exclude(missing_resources=[]).count(), - "with_modified_resources": base_qs.exclude(modified_resources=[]).count(), - } - - def 
get_discovered_dependencies_summary(self, project): - base_qs = project.discovereddependencies - return { - "total": base_qs.count(), - "is_runtime": base_qs.filter(is_runtime=True).count(), - "is_optional": base_qs.filter(is_optional=True).count(), - "is_resolved": base_qs.filter(is_resolved=True).count(), - } - - def get_codebase_relations_summary(self, project): - queryset = project.codebaserelations.all() - return count_group_by(queryset, "map_type") - - def create(self, validated_data, matching_pipeline_name='matching'): - """ - Create a new `project` with `upload_file`, using the `matching` pipeline - """ - execute_now = True - validated_data['name'] = uuid4() - upload_file = validated_data.pop("upload_file", None) - input_urls = validated_data.pop("input_urls", []) - webhook_url = validated_data.pop("webhook_url", None) - - downloads, errors = fetch_urls(input_urls) - if errors: - raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) - - project = super().create(validated_data) - - if upload_file: - project.add_uploads([upload_file]) - - if downloads: - project.add_downloads(downloads) - - if webhook_url: - project.add_webhook_subscription(webhook_url) - - project.add_pipeline(matching_pipeline_name, execute_now) - - return project - - -class MatchingViewSet( - mixins.CreateModelMixin, - mixins.RetrieveModelMixin, - mixins.DestroyModelMixin, - mixins.ListModelMixin, - viewsets.GenericViewSet, -): - queryset = Project.objects.all() - serializer_class = MatchingSerializer - filterset_class = ProjectFilterSet - - def get_queryset(self): - return ( - super() - .get_queryset() - .prefetch_related( - "runs", - ) - ) - - @action(detail=True, renderer_classes=[renderers.JSONRenderer]) - def results(self, request, *args, **kwargs): - """ - Return the results compatible with ScanCode data format. - The content is returned as a stream of JSON content using the - JSONResultsGenerator class. - """ - return project_results_json_response(self.get_object()) diff --git a/matchcodeio/dbrouter.py b/matchcodeio/dbrouter.py deleted file mode 100644 index fb438c7e..00000000 --- a/matchcodeio/dbrouter.py +++ /dev/null @@ -1,32 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -class PackageDBReadOnlyRouter(object): - app_labels = [ - 'clearcode', - 'clearindex', - 'minecode', - 'matchcode', - 'packagedb', - ] - - def db_for_read(self, model, **hints): - if model._meta.app_label in self.app_labels: - return 'packagedb' - return None - - def db_for_write(self, model, **hints): - return None - - def allow_relation(self, obj1, obj2, **hints): - return None - - def allow_migrate(self, db, app_label, model_name=None, **hints): - return None diff --git a/matchcodeio/pipelines/__init__.py b/matchcodeio/pipelines/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/matchcodeio/pipelines/matching.py b/matchcodeio/pipelines/matching.py deleted file mode 100644 index e6d74dc1..00000000 --- a/matchcodeio/pipelines/matching.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. 
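The MatchingViewSet and MatchingSerializer shown just above (relocated earlier in this series to matchcode_pipeline/api.py) accept an upload_file or input_urls and queue the "matching" pipeline on a freshly created Project. A rough client-side sketch, assuming the local Traefik/nginx routing from docker-compose_matchcodeio.yml; the base URL, archive name, payload encoding, and the detail-route shape are illustrative assumptions:

import requests

BASE_URL = "http://localhost/api/matching/"  # assumed local deployment behind Traefik

# Upload a codebase archive; the serializer names the Project with a uuid4
# and add_pipeline("matching", execute_now=True) queues the run.
with open("codebase.zip", "rb") as archive:  # placeholder file name
    response = requests.post(BASE_URL, files={"upload_file": archive})
response.raise_for_status()
project = response.json()
print(project["uuid"], project["runs"])

# Or let the server fetch the inputs itself via fetch_urls():
response = requests.post(
    BASE_URL,
    json={"input_urls": ["https://example.com/package-1.0.jar"]},  # example URL
)

# The viewset also exposes a "results" action returning ScanCode-format JSON;
# with DRF's default detail routing that would look like (shape assumed):
# requests.get(BASE_URL + str(project["uuid"]) + "/results/")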
-# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. - -from scanpipe.pipelines.load_inventory import LoadInventory -from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcodeio.pipes import matching -from scanpipe.pipes import matchcode - - -class Matching(ScanCodebase, LoadInventory): - """ - Establish relationships between two code trees: deployment and development. - - This pipeline is expecting 2 archive files with "from-" and "to-" filename - prefixes as inputs: - - "from-[FILENAME]" archive containing the development source code - - "to-[FILENAME]" archive containing the deployment compiled code - """ - - @classmethod - def steps(cls): - return ( - cls.get_inputs, - cls.build_inventory_from_scans, - cls.fingerprint_codebase_directories, - cls.flag_empty_files, - cls.flag_ignored_resources, - cls.match_archives_to_purldb, - cls.match_directories_to_purldb, - cls.match_resources_to_purldb, - cls.match_purldb_resources_post_process, - cls.remove_packages_without_resources, - ) - - purldb_package_extensions = [".jar", ".war", ".zip"] - purldb_resource_extensions = [ - ".map", - ".js", - ".mjs", - ".ts", - ".d.ts", - ".jsx", - ".tsx", - ".css", - ".scss", - ".less", - ".sass", - ".soy", - ".class", - ] - - def fingerprint_codebase_directories(self): - """Compute directory fingerprints for matching""" - matchcode.fingerprint_codebase_directories(self.project) - - def match_archives_to_purldb(self): - """Match selected package archives by extension to PurlDB.""" - matching.match_purldb_resources( - project=self.project, - extensions=self.purldb_package_extensions, - matcher_func=matching.match_purldb_package, - logger=self.log, - ) - - def match_directories_to_purldb(self): - """Match selected directories in PurlDB.""" - matching.match_purldb_directories( - project=self.project, - logger=self.log, - ) - - def match_resources_to_purldb(self): - """Match selected files by extension in PurlDB.""" - matching.match_purldb_resources( - project=self.project, - extensions=self.purldb_resource_extensions, - matcher_func=matching.match_purldb_resource, - logger=self.log, - ) - - def match_purldb_resources_post_process(self): - """Choose the best package for PurlDB matched resources.""" - matching.match_purldb_resources_post_process(self.project, logger=self.log) - - def remove_packages_without_resources(self): - """Remove packages without any resources.""" - package_without_resources = self.project.discoveredpackages.filter( - codebase_resources__isnull=True - ) - package_without_resources.delete() diff --git 
a/matchcodeio/pipes/__init__.py b/matchcodeio/pipes/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/matchcodeio/pipes/matching.py b/matchcodeio/pipes/matching.py deleted file mode 100644 index ee956567..00000000 --- a/matchcodeio/pipes/matching.py +++ /dev/null @@ -1,424 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. - -from collections import defaultdict - -from django.db.models import Q -from django.template.defaultfilters import pluralize - -from scanpipe import pipes -from scanpipe.pipes import LoopProgress -from scanpipe.pipes import flag -from scanpipe.pipes import js - -from matchcode.models import ApproximateDirectoryContentIndex -from packagedb.models import Package -from packagedb.models import Resource - - -def get_project_resources_qs(project, resources): - """ - Return a queryset of CodebaseResources from `project` containing the - CodebaseResources from `resources` . If a CodebaseResource in `resources` is - an archive or directory, then their descendants are also included in the - queryset. - - Return None if `resources` is empty or None. - """ - lookups = Q() - for resource in resources or []: - lookups |= Q(path=resource.path) - if resource.is_archive: - # This is done to capture the extracted contents of the archive we - # matched to. Generally, the archive contents are in a directory - # that is the archive path with `-extract` at the end. - lookups |= Q(path__startswith=resource.path) - elif resource.is_dir: - # We add a trailing slash to avoid matching on directories we do not - # intend to. For example, if we have matched on the directory with - # the path `foo/bar/1`, using the __startswith filter without - # including a trailing slash on the path would have us get all - # diretories under `foo/bar/` that start with 1, such as - # `foo/bar/10001`, `foo/bar/123`, etc., when we just want `foo/bar/1` - # and its descendants. - path = f"{resource.path}/" - lookups |= Q(path__startswith=path) - if lookups: - return project.codebaseresources.filter(lookups) - - -def create_package_from_purldb_data(project, resources, package_data, status): - """ - Create a DiscoveredPackage instance from PurlDB ``package_data``. - - Return a tuple, containing the created DiscoveredPackage and the number of - CodebaseResources matched to PurlDB that are part of that DiscoveredPackage. 
- """ - package_data = package_data.copy() - # Do not re-use uuid from PurlDB as DiscoveredPackage.uuid is unique and a - # PurlDB match can be found in different projects. - package_data.pop("uuid", None) - package_data.pop("dependencies", None) - - resources_qs = get_project_resources_qs(project, resources) - package = pipes.update_or_create_package( - project=project, - package_data=package_data, - codebase_resources=resources_qs, - ) - # Get the number of already matched CodebaseResources from `resources_qs` - # before we update the status of all CodebaseResources from `resources_qs`, - # then subtract the number of already matched CodebaseResources from the - # total number of CodebaseResources updated. This is to prevent - # double-counting of CodebaseResources that were matched to purldb - purldb_statuses = [ - flag.MATCHED_TO_PURLDB_PACKAGE, - flag.MATCHED_TO_PURLDB_RESOURCE, - flag.MATCHED_TO_PURLDB_DIRECTORY, - ] - matched_resources_count = resources_qs.exclude(status__in=purldb_statuses).update( - status=status - ) - return package, matched_resources_count - - -def match_purldb_package( - project, resources_by_sha1, enhance_package_data=True, **kwargs -): - """ - Given a mapping of lists of CodebaseResources by their sha1 values, - `resources_by_sha1`, send those sha1 values to purldb packages API endpoint, - process the matched Package data, then return the number of - CodebaseResources that were matched to a Package. - """ - match_count = 0 - sha1_list = list(resources_by_sha1.keys()) - results = Package.objects.using('packagedb').filter(sha1__in=sha1_list) - # Process matched Package data - for package in results: - package_data = package.to_dict() - sha1 = package_data["sha1"] - resources = resources_by_sha1.get(sha1) or [] - if not resources: - continue - _, matched_resources_count = create_package_from_purldb_data( - project=project, - resources=resources, - package_data=package_data, - status=flag.MATCHED_TO_PURLDB_PACKAGE, - ) - match_count += matched_resources_count - return match_count - - -def match_purldb_resource( - project, resources_by_sha1, package_data_by_purldb_urls=None, **kwargs -): - """ - Given a mapping of lists of CodebaseResources by their sha1 values, - `resources_by_sha1`, send those sha1 values to purldb resources API - endpoint, process the matched Package data, then return the number of - CodebaseResources that were matched to a Package. - - `package_data_by_purldb_urls` is a mapping of package data by their purldb - package instance URLs. This is intended to be used as a cache, to avoid - retrieving package data we retrieved before. 
- """ - package_data_by_purldb_urls = package_data_by_purldb_urls or {} - match_count = 0 - sha1_list = list(resources_by_sha1.keys()) - results = Resource.objects.using('packagedb').filter(sha1__in=sha1_list) - # Process match results - for resource in results: - # Get package data - package_data = resource.package.to_dict() - sha1 = package_data["sha1"] - resources = resources_by_sha1.get(sha1) or [] - if not resources: - continue - _, matched_resources_count = create_package_from_purldb_data( - project=project, - resources=resources, - package_data=package_data, - status=flag.MATCHED_TO_PURLDB_RESOURCE, - ) - match_count += matched_resources_count - return match_count - - -def match_purldb_directory(project, resource): - """Match a single directory resource in the PurlDB.""" - fingerprint = resource.extra_data.get("directory_content", "") - results = ApproximateDirectoryContentIndex.match(directory_fingerprint=fingerprint) - for result in results: - package_data = result.package.to_dict() - return create_package_from_purldb_data( - project, [resource], package_data, flag.MATCHED_TO_PURLDB_DIRECTORY - ) - - -def match_sha1s_to_purldb( - project, resources_by_sha1, matcher_func, package_data_by_purldb_urls -): - """ - Process `resources_by_sha1` with `matcher_func` and return a 3-tuple - contaning an empty defaultdict(list), the number of matches and the number - of sha1s sent to purldb. - """ - matched_count = matcher_func( - project=project, - resources_by_sha1=resources_by_sha1, - package_data_by_purldb_urls=package_data_by_purldb_urls, - ) - sha1_count = len(resources_by_sha1) - # Clear out resources_by_sha1 when we are done with the current batch of - # CodebaseResources - resources_by_sha1 = defaultdict(list) - return resources_by_sha1, matched_count, sha1_count - - -def match_purldb_resources( - project, extensions, matcher_func, chunk_size=1000, logger=None -): - """ - Match against PurlDB selecting codebase resources using provided - ``package_extensions`` for archive type files, and ``resource_extensions``. - - Match requests are sent off in batches of 1000 SHA1s. This number is set - using `chunk_size`. 
- """ - resources = ( - project.codebaseresources.files() - .no_status() - .has_value("sha1") - .filter(extension__in=extensions) - ) - resource_count = resources.count() - - extensions_str = ", ".join(extensions) - if logger: - if resource_count > 0: - logger( - f"Matching {resource_count:,d} {extensions_str} resources in PurlDB, " - "using SHA1" - ) - else: - logger( - f"Skipping matching for {extensions_str} resources, " - f"as there are {resource_count:,d}" - ) - - _match_purldb_resources( - project=project, - resources=resources, - matcher_func=matcher_func, - chunk_size=chunk_size, - logger=logger, - ) - - -def _match_purldb_resources( - project, resources, matcher_func, chunk_size=1000, logger=None -): - resource_count = resources.count() - resource_iterator = resources.iterator(chunk_size=chunk_size) - progress = LoopProgress(resource_count, logger) - total_matched_count = 0 - total_sha1_count = 0 - processed_resources_count = 0 - resources_by_sha1 = defaultdict(list) - package_data_by_purldb_urls = {} - - for to_resource in progress.iter(resource_iterator): - resources_by_sha1[to_resource.sha1].append(to_resource) - if to_resource.path.endswith(".map"): - for js_sha1 in js.source_content_sha1_list(to_resource): - resources_by_sha1[js_sha1].append(to_resource) - processed_resources_count += 1 - - if processed_resources_count % chunk_size == 0: - resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( - project=project, - resources_by_sha1=resources_by_sha1, - matcher_func=matcher_func, - package_data_by_purldb_urls=package_data_by_purldb_urls, - ) - total_matched_count += matched_count - total_sha1_count += sha1_count - - if resources_by_sha1: - resources_by_sha1, matched_count, sha1_count = match_sha1s_to_purldb( - project=project, - resources_by_sha1=resources_by_sha1, - matcher_func=matcher_func, - package_data_by_purldb_urls=package_data_by_purldb_urls, - ) - total_matched_count += matched_count - total_sha1_count += sha1_count - - logger( - f"{total_matched_count:,d} resources matched in PurlDB " - f"using {total_sha1_count:,d} SHA1s" - ) - - -def match_purldb_directories(project, logger=None): - """Match against PurlDB selecting codebase directories.""" - # If we are able to get match results for a directory fingerprint, then that - # means every resource and directory under that directory is part of a - # Package. By starting from the root to/ directory, we are attempting to - # match as many files as we can before attempting to match further down. The - # more "higher-up" directories we can match to means that we reduce the - # number of queries made to purldb. 
- to_directories = ( - project.codebaseresources.directories() - .no_status(status=flag.ABOUT_MAPPED) - .no_status(status=flag.MATCHED_TO_PURLDB_PACKAGE) - .order_by("path") - ) - directory_count = to_directories.count() - - if logger: - logger( - f"Matching {directory_count:,d} " - f"director{pluralize(directory_count, 'y,ies')} from to/ in PurlDB" - ) - - directory_iterator = to_directories.iterator(chunk_size=2000) - progress = LoopProgress(directory_count, logger) - - for directory in progress.iter(directory_iterator): - directory.refresh_from_db() - if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: - match_purldb_directory(project, directory) - - matched_count = ( - project.codebaseresources.directories() - .filter(status=flag.MATCHED_TO_PURLDB_DIRECTORY) - .count() - ) - logger( - f"{matched_count:,d} director{pluralize(matched_count, 'y,ies')} " - f"matched in PurlDB" - ) - - -def match_resources_with_no_java_source(project, logger=None): - """ - Match resources with ``no-java-source`` to PurlDB, if no match - is found update status to ``requires-review``. - """ - project_files = project.codebaseresources.files() - - to_no_java_source = project_files.to_codebase().filter(status=flag.NO_JAVA_SOURCE) - - if to_no_java_source: - resource_count = to_no_java_source.count() - if logger: - logger( - f"Mapping {resource_count:,d} to/ resources with {flag.NO_JAVA_SOURCE} " - "status in PurlDB using SHA1" - ) - - _match_purldb_resources( - project=project, - resources=to_no_java_source, - matcher_func=match_purldb_resource, - logger=logger, - ) - to_no_java_source.exclude(status=flag.MATCHED_TO_PURLDB_RESOURCE).update( - status=flag.REQUIRES_REVIEW - ) - - -def match_purldb_resources_post_process(project, logger=None): - """Choose the best package for PurlDB matched resources.""" - to_extract_directories = ( - project.codebaseresources.directories() - .to_codebase() - .filter(path__regex=r"^.*-extract$") - ) - - to_resources = project.codebaseresources.files().filter( - status=flag.MATCHED_TO_PURLDB_RESOURCE - ) - - resource_count = to_extract_directories.count() - - if logger: - logger( - f"Refining matching for {resource_count:,d} " - f"{flag.MATCHED_TO_PURLDB_RESOURCE} archives." - ) - - resource_iterator = to_extract_directories.iterator(chunk_size=2000) - progress = LoopProgress(resource_count, logger) - map_count = 0 - - for directory in progress.iter(resource_iterator): - map_count += _match_purldb_resources_post_process( - directory, to_extract_directories, to_resources - ) - - logger(f"{map_count:,d} resource processed") - - -def _match_purldb_resources_post_process( - directory_path, to_extract_directories, to_resources -): - # Exclude the content of nested archive. - interesting_codebase_resources = ( - to_resources.filter(path__startswith=directory_path) - .filter(status=flag.MATCHED_TO_PURLDB_RESOURCE) - .exclude(path__regex=rf"^{directory_path}.*-extract\/.*$") - ) - - if not interesting_codebase_resources: - return 0 - - packages_map = {} - - for resource in interesting_codebase_resources: - for package in resource.discovered_packages.all(): - if package in packages_map: - packages_map[package].append(resource) - else: - packages_map[package] = [resource] - - # Rank the packages by most number of matched resources. 
- ranked_packages = dict( - sorted(packages_map.items(), key=lambda item: len(item[1]), reverse=True) - ) - - for resource in interesting_codebase_resources: - resource.discovered_packages.clear() - - for package, resources in ranked_packages.items(): - unmapped_resources = [ - resource - for resource in resources - if not resource.discovered_packages.exists() - ] - if unmapped_resources: - package.add_resources(unmapped_resources) - - return interesting_codebase_resources.count() diff --git a/matchcodeio/settings.py b/matchcodeio/settings.py deleted file mode 100644 index 60f9b6a9..00000000 --- a/matchcodeio/settings.py +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from scancodeio.settings import * - - -INSTALLED_APPS += [ - "clearcode", - "clearindex", - "matchcode", - "minecode", - "packagedb", -] - -# Database - -DATABASES.update( - { - 'packagedb': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, - } - } -) - -DATABASE_ROUTERS = ["matchcodeio.dbrouter.PackageDBRouter",] - -ROOT_URLCONF = 'matchcodeio.urls' diff --git a/matchcodeio/urls.py b/matchcodeio/urls.py deleted file mode 100644 index 45e3d921..00000000 --- a/matchcodeio/urls.py +++ /dev/null @@ -1,27 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from django.urls import include -from django.urls import path -from django.views.generic import RedirectView -from rest_framework import routers - -from matchcodeio.api import MatchingViewSet -from scanpipe.api.views import RunViewSet - - -api_router = routers.DefaultRouter() -api_router.register('matching', MatchingViewSet) -api_router.register('runs', RunViewSet) - -urlpatterns = [ - path('api/', include(api_router.urls)), - path("", include("scanpipe.urls")), - path('', RedirectView.as_view(url='api/')), -] diff --git a/matchcodeio/wsgi.py b/matchcodeio/wsgi.py deleted file mode 100644 index 2c570b4e..00000000 --- a/matchcodeio/wsgi.py +++ /dev/null @@ -1,25 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -""" -WSGI config for MatchCode.io. - -It exposes the WSGI callable as a module-level variable named ``application``. 
- -For more information on this file, see -https://docs.djangoproject.com/en/dev/howto/deployment/wsgi/ -""" - -import os - -from django.core.wsgi import get_wsgi_application - -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcodeio.settings") - -application = get_wsgi_application() From 72a06626360112795102d3af5077f957e1361474 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 18 Dec 2023 18:04:03 -0800 Subject: [PATCH 26/54] Update traefik route #224 Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index 114f8023..65884ca0 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -58,7 +58,7 @@ services: restart: always labels: - "traefik.enable=true" - - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`127.0.0.1`) && PathPrefix(`/project`) || Host(`localhost`) && PathPrefix(`/project`)" + - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`192.168.1.12`) && PathPrefix(`/api/matching`) || Host(`127.0.0.1`) && PathPrefix(`/project`) || Host(`localhost`) && PathPrefix(`/project`) || Host(`192.168.1.12`) && PathPrefix(`/project`) || Host(`127.0.0.1`) && PathPrefix(`/api/runs`) || Host(`localhost`) && PathPrefix(`/api/runs`) || Host(`192.168.1.12`) && PathPrefix(`/api/runs`)" - "traefik.http.routers.matchcodeio.entrypoints=web" networks: From e58f24c5047f4daceae1bc54927dd77560b9d7ab Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 20 Dec 2023 13:38:57 -0800 Subject: [PATCH 27/54] Update traefik routes for local development #224 Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 10 +++++++++- docker-compose_purldb.yml | 4 +++- traefik.yml | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index 65884ca0..663a909d 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -58,7 +58,15 @@ services: restart: always labels: - "traefik.enable=true" - - "traefik.http.routers.matchcodeio.rule=Host(`127.0.0.1`) && PathPrefix(`/api/matching`) || Host(`localhost`) && PathPrefix(`/api/matching`) || Host(`192.168.1.12`) && PathPrefix(`/api/matching`) || Host(`127.0.0.1`) && PathPrefix(`/project`) || Host(`localhost`) && PathPrefix(`/project`) || Host(`192.168.1.12`) && PathPrefix(`/project`) || Host(`127.0.0.1`) && PathPrefix(`/api/runs`) || Host(`localhost`) && PathPrefix(`/api/runs`) || Host(`192.168.1.12`) && PathPrefix(`/api/runs`)" + - "traefik.http.routers.matchcodeio.rule= + Host(`127.0.0.1`) && PathPrefix(`/api/matching`) + || Host(`127.0.0.1`) && PathPrefix(`/api/runs`) + || Host(`127.0.0.1`) && PathPrefix(`/project`) + || Host(`127.0.0.1`) && PathPrefix(`/runs`) + || Host(`localhost`) && PathPrefix(`/api/matching`) + || Host(`localhost`) && PathPrefix(`/api/runs`) + || Host(`localhost`) && PathPrefix(`/project`) + || Host(`localhost`) && PathPrefix(`/runs`)" - "traefik.http.routers.matchcodeio.entrypoints=web" networks: diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index dfd0ffe5..ccf5691c 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -118,7 +118,9 @@ services: image: nginx labels: - "traefik.enable=true" - - "traefik.http.routers.web.rule=Host(`127.0.0.1`) || 
Host(`localhost`) || Host(`192.168.1.12`)" + - "traefik.http.routers.web.rule= + Host(`127.0.0.1`) + || Host(`localhost`)" - "traefik.http.routers.web.entrypoints=web" volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ diff --git a/traefik.yml b/traefik.yml index 5e4150f1..ef22048d 100644 --- a/traefik.yml +++ b/traefik.yml @@ -9,6 +9,8 @@ api: entryPoints: web: address: ":80" + websecure: + address: ":443" providers: docker: From 64af97f95682050350b68363c868154f7b12f1d4 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 20 Dec 2023 17:49:44 -0800 Subject: [PATCH 28/54] Adapt tests from scancode.io d2d pipes #224 * Create router to ensure scanpipe is only on the default db Signed-off-by: Jono Yang --- .../tests/data/d2d-javascript/to/main.js | 3 + .../tests/data/d2d-javascript/to/main.js.map | 14 ++ .../data/d2d/find_java_packages/Foo.java | 3 + .../tests/pipes/test_matching.py | 216 ++++++++++++++++++ matchcode_project/dbrouter.py | 29 +++ matchcode_project/settings.py | 5 +- 6 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 matchcode_pipeline/tests/data/d2d-javascript/to/main.js create mode 100644 matchcode_pipeline/tests/data/d2d-javascript/to/main.js.map create mode 100644 matchcode_pipeline/tests/data/d2d/find_java_packages/Foo.java create mode 100644 matchcode_pipeline/tests/pipes/test_matching.py diff --git a/matchcode_pipeline/tests/data/d2d-javascript/to/main.js b/matchcode_pipeline/tests/data/d2d-javascript/to/main.js new file mode 100644 index 00000000..355446fc --- /dev/null +++ b/matchcode_pipeline/tests/data/d2d-javascript/to/main.js @@ -0,0 +1,3 @@ +const charSet="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_-+=";function generatePassword(r){let e="";for(let s=0;s Date: Thu, 21 Dec 2023 18:06:57 -0800 Subject: [PATCH 29/54] Remove match_resources_with_no_java_source #224 * Update code to remove references to to/ codebase Signed-off-by: Jono Yang --- matchcode_pipeline/pipelines/matching.py | 55 +++++++-------- matchcode_pipeline/pipes/matching.py | 69 +++++-------------- .../tests/pipes/test_matching.py | 64 +++++------------ 3 files changed, 63 insertions(+), 125 deletions(-) diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index f4461a86..98d62916 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -28,12 +28,18 @@ class Matching(ScanCodebase, LoadInventory): """ - Establish relationships between two code trees: deployment and development. + 1. Match archive to Packages + 2. Match archive to Resources + 3. Match directory exactly + 4. Match files exactly + 5. Match directories approximatly + 6. Match files approximately + 7. Matching on similar file attributes (path, type, extension, size, Java classpath, etc.) + 8. 
Return only the best matches (We could inject some user input, policies, we could provide a list of purls to guide matching, ) - This pipeline is expecting 2 archive files with "from-" and "to-" filename - prefixes as inputs: - - "from-[FILENAME]" archive containing the development source code - - "to-[FILENAME]" archive containing the deployment compiled code + new step: + Focused matching based on list of existing packages in the codebase (or sbom input, codebase disclosure, or scan) + validate package exists in purldb (raise warning) """ @classmethod @@ -41,43 +47,35 @@ def steps(cls): return ( cls.get_inputs, cls.build_inventory_from_scans, - cls.fingerprint_codebase_directories, cls.flag_empty_files, - cls.flag_ignored_resources, - cls.match_archives_to_purldb, + cls.match_archives_to_purldb_packages, + cls.match_archives_to_purldb_resources, + cls.fingerprint_codebase_directories, cls.match_directories_to_purldb, cls.match_resources_to_purldb, cls.match_purldb_resources_post_process, cls.remove_packages_without_resources, ) - purldb_package_extensions = [".jar", ".war", ".zip"] - purldb_resource_extensions = [ - ".map", - ".js", - ".mjs", - ".ts", - ".d.ts", - ".jsx", - ".tsx", - ".css", - ".scss", - ".less", - ".sass", - ".soy", - ".class", - ] - def fingerprint_codebase_directories(self): """Compute directory fingerprints for matching""" matchcode.fingerprint_codebase_directories(self.project) - def match_archives_to_purldb(self): - """Match selected package archives by extension to PurlDB.""" + def match_archives_to_purldb_packages(self): + """Match package archives against PurlDB packages""" matching.match_purldb_resources( project=self.project, - extensions=self.purldb_package_extensions, matcher_func=matching.match_purldb_package, + archives_only=True, + logger=self.log, + ) + + def match_archives_to_purldb_resources(self): + """Match package archives against PurlDB resources""" + matching.match_purldb_resources( + project=self.project, + matcher_func=matching.match_purldb_resource, + archives_only=True, logger=self.log, ) @@ -92,7 +90,6 @@ def match_resources_to_purldb(self): """Match selected files by extension in PurlDB.""" matching.match_purldb_resources( project=self.project, - extensions=self.purldb_resource_extensions, matcher_func=matching.match_purldb_resource, logger=self.log, ) diff --git a/matchcode_pipeline/pipes/matching.py b/matchcode_pipeline/pipes/matching.py index ee956567..9846f760 100644 --- a/matchcode_pipeline/pipes/matching.py +++ b/matchcode_pipeline/pipes/matching.py @@ -197,11 +197,12 @@ def match_sha1s_to_purldb( def match_purldb_resources( - project, extensions, matcher_func, chunk_size=1000, logger=None + project, matcher_func, archives_only=False, chunk_size=1000, logger=None ): """ - Match against PurlDB selecting codebase resources using provided - ``package_extensions`` for archive type files, and ``resource_extensions``. + Match CodebaseResources from `project` against the PurlDB. If + `archives_only` is True, then only CodebaseResources where the `is_archive` + flag is True is looked up in the PurlDB. Match requests are sent off in batches of 1000 SHA1s. This number is set using `chunk_size`. 
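# Illustrative sketch, not from the patch series: the docstring above says match
# requests are sent in batches of up to `chunk_size` SHA1s rather than one request
# per file. A rough version of that chunking pattern; `lookup_packages_by_sha1` is
# a hypothetical stand-in for the real PurlDB query and the SHA1 below is a dummy:

def chunked(items, chunk_size=1000):
    """Yield successive chunk_size-sized slices from a list."""
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]


def match_sha1s(sha1s, lookup_packages_by_sha1, chunk_size=1000):
    """Query the index one batch of SHA1s at a time and collect all results."""
    results = []
    for batch in chunked(list(sha1s), chunk_size):
        results.extend(lookup_packages_by_sha1(batch))
    return results


if __name__ == "__main__":
    fake_index = {"da39a3ee5e6b4b0d3255bfef95601890afd80709": "pkg:npm/foo@1.0.0"}
    lookup = lambda batch: [fake_index[sha1] for sha1 in batch if sha1 in fake_index]
    print(match_sha1s(fake_index.keys(), lookup, chunk_size=2))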
@@ -210,21 +211,19 @@ def match_purldb_resources( project.codebaseresources.files() .no_status() .has_value("sha1") - .filter(extension__in=extensions) + .filter(is_archive=archives_only) ) resource_count = resources.count() - extensions_str = ", ".join(extensions) if logger: if resource_count > 0: logger( - f"Matching {resource_count:,d} {extensions_str} resources in PurlDB, " + f"Matching {resource_count:,d} resources in PurlDB, " "using SHA1" ) else: logger( - f"Skipping matching for {extensions_str} resources, " - f"as there are {resource_count:,d}" + f"Skipping resource matching as there are {resource_count:,d}" ) _match_purldb_resources( @@ -282,28 +281,27 @@ def _match_purldb_resources( def match_purldb_directories(project, logger=None): - """Match against PurlDB selecting codebase directories.""" + """Match directory CodebaseResources from `project` against the PurlDB.""" # If we are able to get match results for a directory fingerprint, then that # means every resource and directory under that directory is part of a # Package. By starting from the root to/ directory, we are attempting to # match as many files as we can before attempting to match further down. The # more "higher-up" directories we can match to means that we reduce the # number of queries made to purldb. - to_directories = ( + directories = ( project.codebaseresources.directories() - .no_status(status=flag.ABOUT_MAPPED) .no_status(status=flag.MATCHED_TO_PURLDB_PACKAGE) .order_by("path") ) - directory_count = to_directories.count() + directory_count = directories.count() if logger: logger( f"Matching {directory_count:,d} " - f"director{pluralize(directory_count, 'y,ies')} from to/ in PurlDB" + f"director{pluralize(directory_count, 'y,ies')} against PurlDB" ) - directory_iterator = to_directories.iterator(chunk_size=2000) + directory_iterator = directories.iterator(chunk_size=2000) progress = LoopProgress(directory_count, logger) for directory in progress.iter(directory_iterator): @@ -322,47 +320,18 @@ def match_purldb_directories(project, logger=None): ) -def match_resources_with_no_java_source(project, logger=None): - """ - Match resources with ``no-java-source`` to PurlDB, if no match - is found update status to ``requires-review``. - """ - project_files = project.codebaseresources.files() - - to_no_java_source = project_files.to_codebase().filter(status=flag.NO_JAVA_SOURCE) - - if to_no_java_source: - resource_count = to_no_java_source.count() - if logger: - logger( - f"Mapping {resource_count:,d} to/ resources with {flag.NO_JAVA_SOURCE} " - "status in PurlDB using SHA1" - ) - - _match_purldb_resources( - project=project, - resources=to_no_java_source, - matcher_func=match_purldb_resource, - logger=logger, - ) - to_no_java_source.exclude(status=flag.MATCHED_TO_PURLDB_RESOURCE).update( - status=flag.REQUIRES_REVIEW - ) - - def match_purldb_resources_post_process(project, logger=None): """Choose the best package for PurlDB matched resources.""" - to_extract_directories = ( + extract_directories = ( project.codebaseresources.directories() - .to_codebase() .filter(path__regex=r"^.*-extract$") ) - to_resources = project.codebaseresources.files().filter( + resources = project.codebaseresources.files().filter( status=flag.MATCHED_TO_PURLDB_RESOURCE ) - resource_count = to_extract_directories.count() + resource_count = extract_directories.count() if logger: logger( @@ -370,24 +339,24 @@ def match_purldb_resources_post_process(project, logger=None): f"{flag.MATCHED_TO_PURLDB_RESOURCE} archives." 
) - resource_iterator = to_extract_directories.iterator(chunk_size=2000) + resource_iterator = extract_directories.iterator(chunk_size=2000) progress = LoopProgress(resource_count, logger) map_count = 0 for directory in progress.iter(resource_iterator): map_count += _match_purldb_resources_post_process( - directory, to_extract_directories, to_resources + directory.path, resources ) logger(f"{map_count:,d} resource processed") def _match_purldb_resources_post_process( - directory_path, to_extract_directories, to_resources + directory_path, codebase_resources ): # Exclude the content of nested archive. interesting_codebase_resources = ( - to_resources.filter(path__startswith=directory_path) + codebase_resources.filter(path__startswith=directory_path) .filter(status=flag.MATCHED_TO_PURLDB_RESOURCE) .exclude(path__regex=rf"^{directory_path}.*-extract\/.*$") ) diff --git a/matchcode_pipeline/tests/pipes/test_matching.py b/matchcode_pipeline/tests/pipes/test_matching.py index c77fd9bd..132552f3 100644 --- a/matchcode_pipeline/tests/pipes/test_matching.py +++ b/matchcode_pipeline/tests/pipes/test_matching.py @@ -1,20 +1,22 @@ from pathlib import Path import io +import uuid + from django.test import TestCase -from scanpipe.pipes import flag -from scanpipe.models import CodebaseResource + from scanpipe.models import Project +from scanpipe.pipes import flag from scanpipe.tests import make_resource_directory from scanpipe.tests import make_resource_file from scanpipe.tests import package_data1 from scanpipe.tests import package_data2 + +from matchcode.models import ApproximateDirectoryContentIndex from matchcode_pipeline.pipes import matching -import uuid from packagedb.models import Package -from matchcode.models import ApproximateDirectoryContentIndex from scanpipe import pipes from scanpipe.pipes.input import copy_inputs -from scanpipe.pipes.input import copy_input + class MatchingPipesTest(TestCase): data_location = Path(__file__).parent.parent / "data" @@ -74,12 +76,12 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_resources(self): buffer = io.StringIO() matching.match_purldb_resources( self.project1, - extensions=[".jar"], matcher_func=matching.match_purldb_package, + archives_only=True, logger=buffer.write, ) expected = ( - "Matching 1 .jar resources in PurlDB, using SHA1" + "Matching 1 resources in PurlDB, using SHA1" "3 resources matched in PurlDB using 1 SHA1s" ) self.assertEqual(expected, buffer.getvalue()) @@ -108,7 +110,7 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): ) expected = ( - "Matching 1 directory from to/ in PurlDB" "1 directory matched in PurlDB" + "Matching 1 directory against PurlDB" "1 directory matched in PurlDB" ) self.assertEqual(expected, buffer.getvalue()) @@ -120,43 +122,13 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): self.assertEqual("matched-to-purldb-directory", resource.status) self.assertEqual(package, resource.discovered_packages.get()) - def test_match_resources_with_no_java_source(self): - to_dir = ( - self.project1.codebase_path / "to/project.tar.zst-extract/osgi/marketplace/" - "resources/node_modules/foo-bar" - ) - to_input_location = self.data_location / "d2d/find_java_packages/Foo.java" - to_dir.mkdir(parents=True) - copy_input(to_input_location, to_dir) - - pipes.collect_and_create_codebase_resources(self.project1) - - foo_java = self.project1.codebaseresources.get( - path=( - "to/project.tar.zst-extract/osgi/marketplace/" - "resources/node_modules/foo-bar/Foo.java" - ) - ) - - 
foo_java.update(status=flag.NO_JAVA_SOURCE) - - buffer = io.StringIO() - matching.match_resources_with_no_java_source(self.project1, logger=buffer.write) - foo_java.refresh_from_db() - - expected = ( - f"Mapping 1 to/ resources with {flag.NO_JAVA_SOURCE} " - "status in PurlDB using SHA1" - ) - self.assertIn(expected, buffer.getvalue()) - self.assertEqual(flag.REQUIRES_REVIEW, foo_java.status) - def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self): + def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process(self): to_map = self.data_location / "d2d-javascript" / "to" / "main.js.map" to_mini = self.data_location / "d2d-javascript" / "to" / "main.js" to_dir = ( self.project1.codebase_path - / "to/project.tar.zst/modules/apps/adaptive-media/" + / "project.tar.zst/modules/apps/adaptive-media/" "adaptive-media-web-extract/src/main/resources/META-INF/resources/" "adaptive_media/js" ) @@ -165,17 +137,17 @@ def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self): pipes.collect_and_create_codebase_resources(self.project1) - to_resources = self.project1.codebaseresources.filter( + resources = self.project1.codebaseresources.filter( path__startswith=( - "to/project.tar.zst/modules/apps/adaptive-media/" + "project.tar.zst/modules/apps/adaptive-media/" "adaptive-media-web-extract/src/main/resources/META-INF/resources/" "adaptive_media/js/main.js" ) ) - to_mini_resource = self.project1.codebaseresources.filter( + mini_resource = self.project1.codebaseresources.filter( path=( - "to/project.tar.zst/modules/apps/adaptive-media/" + "project.tar.zst/modules/apps/adaptive-media/" "adaptive-media-web-extract/src/main/resources/META-INF/resources/" "adaptive_media/js/main.js" ) @@ -185,7 +157,7 @@ def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self): dummy_package_data1["uuid"] = uuid.uuid4() package1, _ = matching.create_package_from_purldb_data( self.project1, - to_resources, + resources, dummy_package_data1, flag.MATCHED_TO_PURLDB_RESOURCE, ) @@ -194,7 +166,7 @@ def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self): dummy_package_data2["uuid"] = uuid.uuid4() package2, _ = matching.create_package_from_purldb_data( self.project1, - to_mini_resource, + mini_resource, dummy_package_data2, flag.MATCHED_TO_PURLDB_RESOURCE, ) From 23d96eba00fdc11b296581035d1dcc7c1187669d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 21 Dec 2023 18:10:18 -0800 Subject: [PATCH 30/54] Use matchcode-toolkit v1.1.3 #224 * This is to avoid dependency issues when using scancode.io as a dependency Signed-off-by: Jono Yang --- configure | 6 +++--- setup.cfg | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 4b3cd869..d30b8922 100755 --- a/configure +++ b/configure @@ -30,9 +30,9 @@ CLI_ARGS=$1 CUSTOM_PACKAGES="" # Requirement arguments passed to pip and used by default or with --dev. -REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit --editable . --constraint requirements.txt" -DEV_REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit --editable .[testing] --constraint requirements.txt --constraint requirements-dev.txt" -DOCS_REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit --editable .[docs] --constraint requirements.txt" +REQUIREMENTS="$CUSTOM_PACKAGES --editable . 
--constraint requirements.txt" +DEV_REQUIREMENTS="$CUSTOM_PACKAGES --editable .[testing] --constraint requirements.txt --constraint requirements-dev.txt" +DOCS_REQUIREMENTS="$CUSTOM_PACKAGES --editable .[docs] --constraint requirements.txt" # where we create a virtualenv VIRTUALENV_DIR=venv diff --git a/setup.cfg b/setup.cfg index c2f2b1db..f137b63a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -55,9 +55,9 @@ install_requires = rubymarshal == 1.0.3 scancode-toolkit[full] == 32.0.8 urlpy == 0.5 - matchcode-toolkit >= 2.0.1 + matchcode-toolkit == 1.1.3 univers == 30.11.0 - scancodeio + scancodeio == 32.7.0 setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 From 45d653b75301343bb559f6b3557718901200c4ec Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 22 Dec 2023 10:34:18 -0800 Subject: [PATCH 31/54] Add step for exact directory matching #224 Signed-off-by: Jono Yang --- matchcode/models.py | 16 ++++++++++++++-- matchcode_pipeline/pipelines/matching.py | 11 ++++++++++- matchcode_pipeline/pipes/matching.py | 15 +++++++++++---- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/matchcode/models.py b/matchcode/models.py index dba27aaa..4e55c9f6 100644 --- a/matchcode/models.py +++ b/matchcode/models.py @@ -233,7 +233,7 @@ def index(cls, directory_fingerprint, resource_path, package): logger.error(msg) @classmethod - def match(cls, directory_fingerprint): + def match(cls, directory_fingerprint, exact_directory_match=False): """ Return a list of matched Packages """ @@ -243,9 +243,21 @@ def match(cls, directory_fingerprint): if not directory_fingerprint: return cls.objects.none() - # Step 1: find fingerprints with matching chunks indexed_elements_count, bah128 = split_fingerprint(directory_fingerprint) chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(bah128) + + # Step 0: if exact only, then return a filter + if exact_directory_match: + matches = cls.objects.filter( + indexed_elements_count=indexed_elements_count, + chunk1=chunk1, + chunk2=chunk2, + chunk3=chunk3, + chunk4=chunk4, + ) + return matches + + # Step 1: find fingerprints with matching chunks range = bah128_ranges(indexed_elements_count) matches = cls.objects.filter( models.Q( diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index 98d62916..c7490f50 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -51,8 +51,9 @@ def steps(cls): cls.match_archives_to_purldb_packages, cls.match_archives_to_purldb_resources, cls.fingerprint_codebase_directories, - cls.match_directories_to_purldb, + cls.match_directories_exact_to_purldb, cls.match_resources_to_purldb, + cls.match_directories_to_purldb, cls.match_purldb_resources_post_process, cls.remove_packages_without_resources, ) @@ -86,6 +87,14 @@ def match_directories_to_purldb(self): logger=self.log, ) + def match_directories_exact_to_purldb(self): + """Match selected directories in PurlDB.""" + matching.match_purldb_directories( + project=self.project, + exact_directory_match=True, + logger=self.log, + ) + def match_resources_to_purldb(self): """Match selected files by extension in PurlDB.""" matching.match_purldb_resources( diff --git a/matchcode_pipeline/pipes/matching.py b/matchcode_pipeline/pipes/matching.py index 9846f760..3ac07f78 100644 --- a/matchcode_pipeline/pipes/matching.py +++ b/matchcode_pipeline/pipes/matching.py @@ -165,10 +165,13 @@ def match_purldb_resource( return match_count -def match_purldb_directory(project, resource): +def 
match_purldb_directory(project, resource, exact_directory_match=False): """Match a single directory resource in the PurlDB.""" fingerprint = resource.extra_data.get("directory_content", "") - results = ApproximateDirectoryContentIndex.match(directory_fingerprint=fingerprint) + results = ApproximateDirectoryContentIndex.match( + directory_fingerprint=fingerprint, + exact_directory_match=exact_directory_match + ) for result in results: package_data = result.package.to_dict() return create_package_from_purldb_data( @@ -280,7 +283,7 @@ def _match_purldb_resources( ) -def match_purldb_directories(project, logger=None): +def match_purldb_directories(project, exact_directory_match=False, logger=None): """Match directory CodebaseResources from `project` against the PurlDB.""" # If we are able to get match results for a directory fingerprint, then that # means every resource and directory under that directory is part of a @@ -307,7 +310,11 @@ def match_purldb_directories(project, logger=None): for directory in progress.iter(directory_iterator): directory.refresh_from_db() if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: - match_purldb_directory(project, directory) + match_purldb_directory( + project, + directory, + exact_directory_match + ) matched_count = ( project.codebaseresources.directories() From 4eebe69e14de86368fedd8541dfadfd39093ad25 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 3 Jan 2024 13:42:02 -0800 Subject: [PATCH 32/54] Create test for Matching API endpoint #224 Signed-off-by: Jono Yang --- matchcode_pipeline/tests/test_api.py | 54 ++++++++++++++++++++++++++++ matchcode_project/urls.py | 6 ++-- matchcode_project/wsgi.py | 2 +- 3 files changed, 58 insertions(+), 4 deletions(-) create mode 100644 matchcode_pipeline/tests/test_api.py diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py new file mode 100644 index 00000000..c681f0ca --- /dev/null +++ b/matchcode_pipeline/tests/test_api.py @@ -0,0 +1,54 @@ +from pathlib import Path +from scanpipe.tests import dependency_data1 +from scanpipe.tests import package_data1 +from django.test import TransactionTestCase +from scanpipe.models import CodebaseRelation +from scanpipe.models import DiscoveredDependency +from scanpipe.models import Project +from scanpipe.models import CodebaseResource +from django.urls import reverse +from rest_framework.test import APIClient +from django.contrib.auth.models import User + + +class MatchCodePipelineAPITest(TransactionTestCase): + databases = {'default', 'packagedb'} + data_location = Path(__file__).parent / "data" + + def setUp(self): + self.project1 = Project.objects.create(name="Analysis") + self.resource1 = CodebaseResource.objects.create( + project=self.project1, + path="daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", + ) + self.discovered_package1 = self.resource1.create_and_add_package(package_data1) + self.discovered_dependency1 = DiscoveredDependency.create_from_data( + self.project1, dependency_data1 + ) + self.codebase_relation1 = CodebaseRelation.objects.create( + project=self.project1, + from_resource=self.resource1, + to_resource=self.resource1, + map_type="java_to_class", + ) + + self.matching_list_url = reverse("matching-list") + self.project1_detail_url = reverse("matching-detail", args=[self.project1.uuid]) + + self.user = User.objects.create_user("username", "e@mail.com", "secret") + self.auth = f"Token {self.user.auth_token.key}" + + self.csrf_client = APIClient(enforce_csrf_checks=True) + 
self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) + + def test_scanpipe_api_project_list(self): + response = self.csrf_client.get(self.matching_list_url) + + self.assertContains(response, self.project1_detail_url) + self.assertEqual(1, response.data["count"]) + self.assertNotContains(response, "input_root") + self.assertNotContains(response, "extra_data") + self.assertNotContains(response, "message_count") + self.assertNotContains(response, "resource_count") + self.assertNotContains(response, "package_count") + self.assertNotContains(response, "dependency_count") diff --git a/matchcode_project/urls.py b/matchcode_project/urls.py index 1f3ad35b..f793027a 100644 --- a/matchcode_project/urls.py +++ b/matchcode_project/urls.py @@ -17,11 +17,11 @@ api_router = routers.DefaultRouter() -api_router.register('matching', MatchingViewSet) -api_router.register('runs', RunViewSet) +api_router.register('matching', MatchingViewSet, basename='matching') +api_router.register('runs', RunViewSet, basename='runs') urlpatterns = [ path('api/', include(api_router.urls)), - path("", include("scanpipe.urls")), + path('', include('scanpipe.urls')), path('', RedirectView.as_view(url='api/')), ] diff --git a/matchcode_project/wsgi.py b/matchcode_project/wsgi.py index 2c570b4e..83964488 100644 --- a/matchcode_project/wsgi.py +++ b/matchcode_project/wsgi.py @@ -20,6 +20,6 @@ from django.core.wsgi import get_wsgi_application -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcodeio.settings") +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcode_project.settings") application = get_wsgi_application() From 79d480fc5b0f2479e6422ad5c5f3fcf442d020bd Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 4 Jan 2024 15:51:29 -0800 Subject: [PATCH 33/54] Create test for Matching API view #224 * Update Run Serializer Signed-off-by: Jono Yang --- matchcode_pipeline/api.py | 48 ++++++++++++++++++++++++++-- matchcode_pipeline/tests/test_api.py | 18 +++++++---- matchcode_project/urls.py | 2 +- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/matchcode_pipeline/api.py b/matchcode_pipeline/api.py index b32bc74c..f11e59c8 100644 --- a/matchcode_pipeline/api.py +++ b/matchcode_pipeline/api.py @@ -14,16 +14,44 @@ from rest_framework import viewsets from rest_framework.decorators import action -from scanpipe.api.serializers import RunSerializer +from scanpipe.api import ExcludeFromListViewMixin +from scanpipe.api.serializers import SerializerExcludeFieldsMixin from scanpipe.api.serializers import StrListField from scanpipe.api.views import ProjectFilterSet from scanpipe.models import Project +from scanpipe.models import Run from scanpipe.pipes import count_group_by from scanpipe.pipes.fetch import fetch_urls from scanpipe.views import project_results_json_response -class MatchingSerializer(serializers.ModelSerializer): +class RunSerializer(SerializerExcludeFieldsMixin, serializers.ModelSerializer): + matching_project = serializers.HyperlinkedRelatedField( + view_name="matching-detail", read_only=True + ) + + class Meta: + model = Run + fields = [ + "url", + "pipeline_name", + "status", + "description", + "matching_project", + "uuid", + "created_date", + "scancodeio_version", + "task_id", + "task_start_date", + "task_end_date", + "task_exitcode", + "task_output", + "log", + "execution_time", + ] + + +class MatchingSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): upload_file = serializers.FileField(write_only=True, required=False) input_urls = StrListField( write_only=True, @@ -58,6 +86,22 
@@ class Meta: "discovered_dependencies_summary", "codebase_relations_summary", ) + exclude_from_list_view = [ + "resource_count", + "package_count", + "dependency_count", + "relation_count", + "codebase_resources_summary", + "discovered_packages_summary", + "discovered_dependencies_summary", + "codebase_relations_summary", + ] + extra_kwargs = { + 'url': { + 'view_name': 'matching-detail', + 'lookup_field': 'pk', + }, + } def get_codebase_resources_summary(self, project): queryset = project.codebaseresources.all() diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index c681f0ca..36bb30d6 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -1,14 +1,19 @@ from pathlib import Path -from scanpipe.tests import dependency_data1 -from scanpipe.tests import package_data1 + +from django.contrib.auth.models import User from django.test import TransactionTestCase +from django.urls import reverse +from rest_framework.test import APIClient + +from django.contrib.auth import get_user_model + +from packagedb.models import ApiUser from scanpipe.models import CodebaseRelation +from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredDependency from scanpipe.models import Project -from scanpipe.models import CodebaseResource -from django.urls import reverse -from rest_framework.test import APIClient -from django.contrib.auth.models import User +from scanpipe.tests import dependency_data1 +from scanpipe.tests import package_data1 class MatchCodePipelineAPITest(TransactionTestCase): @@ -52,3 +57,4 @@ def test_scanpipe_api_project_list(self): self.assertNotContains(response, "resource_count") self.assertNotContains(response, "package_count") self.assertNotContains(response, "dependency_count") + diff --git a/matchcode_project/urls.py b/matchcode_project/urls.py index f793027a..ff63c5ee 100644 --- a/matchcode_project/urls.py +++ b/matchcode_project/urls.py @@ -18,7 +18,7 @@ api_router = routers.DefaultRouter() api_router.register('matching', MatchingViewSet, basename='matching') -api_router.register('runs', RunViewSet, basename='runs') +api_router.register('runs', RunViewSet) urlpatterns = [ path('api/', include(api_router.urls)), From 9082e524da3387a36a0fa4fb91e04c73000796f3 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 4 Jan 2024 20:26:24 -0800 Subject: [PATCH 34/54] Create match request creation test #224 Signed-off-by: Jono Yang --- .../data/match-creation-api-results.json | 142 ++++++++++++++++++ matchcode_pipeline/tests/data/test-out.json | 111 ++++++++++++++ matchcode_pipeline/tests/test_api.py | 93 +++++++++++- 3 files changed, 344 insertions(+), 2 deletions(-) create mode 100644 matchcode_pipeline/tests/data/match-creation-api-results.json create mode 100644 matchcode_pipeline/tests/data/test-out.json diff --git a/matchcode_pipeline/tests/data/match-creation-api-results.json b/matchcode_pipeline/tests/data/match-creation-api-results.json new file mode 100644 index 00000000..1ff6d577 --- /dev/null +++ b/matchcode_pipeline/tests/data/match-creation-api-results.json @@ -0,0 +1,142 @@ +{ + "headers": [ + { + "tool_name": "scanpipe", + "tool_version": "v3.0.0-243-g79d480f", + "other_tools": [ + "pkg:pypi/scancode-toolkit@32.0.8" + ], + "notice": "Generated with ScanCode.io and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied.\nNo content created from ScanCode.io should be considered or used as legal advice.\nConsult an Attorney for any 
legal advice.\nScanCode.io is a free software code scanning tool from nexB Inc. and others\nlicensed under the Apache License version 2.0.\nScanCode is a trademark of nexB Inc.\nVisit https://github.com/nexB/scancode.io for support and download.\n", + "uuid": "0d08f9c9-daa7-49b3-b672-570ef9b9c106", + "created_date": "2024-01-05T04:06:23.300Z", + "notes": "", + "settings": {}, + "input_sources": [], + "runs": [], + "extra_data": {} + } + ], + "packages": [ + { + "purl": "pkg:deb/debian/adduser@3.118?arch=all", + "type": "deb", + "namespace": "debian", + "name": "adduser", + "version": "3.118", + "qualifiers": "arch=all", + "subpath": "", + "tag": "", + "primary_language": "bash", + "description": "add and remove users and groups", + "release_date": "1999-10-10", + "parties": [ + { + "url": null, + "name": "Debian Adduser Developers ", + "role": "maintainer", + "type": null, + "email": null + } + ], + "keywords": [ + "admin" + ], + "homepage_url": "https://packages.debian.org", + "download_url": "https://download.url/package.zip", + "bug_tracking_url": "", + "code_view_url": "", + "vcs_url": "https://packages.vcs.url", + "repository_homepage_url": "", + "repository_download_url": "", + "api_data_url": "", + "size": 849, + "md5": "76cf50f29e47676962645632737365a7", + "sha1": "", + "sha256": "", + "sha512": "", + "copyright": "Copyright (c) 2000 Roland Bauerschmidt \nCopyright (c) 1997, 1998, 1999 Guy Maor \nCopyright (c) 1995 Ted Hajek \nportions Copyright (c) 1994 Debian Association, Inc.", + "holder": "", + "declared_license_expression": "gpl-2.0 AND gpl-2.0-plus", + "declared_license_expression_spdx": "GPL-2.0-only AND GPL-2.0-or-later", + "license_detections": [], + "other_license_expression": "", + "other_license_expression_spdx": "", + "other_license_detections": [], + "extracted_license_statement": "", + "compliance_alert": "", + "notice_text": "Notice\nText", + "source_packages": [], + "extra_data": {}, + "package_uid": "pkg:deb/debian/adduser@3.118?uuid=610bed29-ce39-40e7-92d6-fd8b", + "datasource_id": "", + "file_references": [], + "missing_resources": [], + "modified_resources": [], + "affected_by_vulnerabilities": [] + } + ], + "dependencies": [ + { + "purl": "pkg:pypi/dask", + "extracted_requirement": "dask<2023.0.0,>=2022.6.0", + "scope": "install", + "is_runtime": true, + "is_optional": false, + "is_resolved": false, + "dependency_uid": "pkg:pypi/dask?uuid=e656b571-7d3f-46d1-b95b-8f037aef9692", + "for_package_uid": "pkg:deb/debian/adduser@3.118?uuid=610bed29-ce39-40e7-92d6-fd8b", + "datafile_path": "daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", + "datasource_id": "pypi_sdist_pkginfo", + "package_type": "pypi", + "affected_by_vulnerabilities": [] + } + ], + "files": [ + { + "path": "daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", + "type": "", + "name": "", + "status": "", + "tag": "", + "extension": "", + "size": null, + "md5": "", + "sha1": "", + "sha256": "", + "sha512": "", + "mime_type": "", + "file_type": "", + "programming_language": "", + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_key_file": false, + "detected_license_expression": "", + "detected_license_expression_spdx": "", + "license_detections": [], + "license_clues": [], + "percentage_of_license_text": null, + "compliance_alert": "", + "copyrights": [], + "holders": [], + "authors": [], + "package_data": [], + "for_packages": [ + "pkg:deb/debian/adduser@3.118?uuid=610bed29-ce39-40e7-92d6-fd8b" + ], + "emails": [], + "urls": [], + "extra_data": {} + } + ], + 
"relations": [ + { + "to_resource": "daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", + "status": "", + "map_type": "java_to_class", + "score": "", + "from_resource": "daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO" + } + ] +} \ No newline at end of file diff --git a/matchcode_pipeline/tests/data/test-out.json b/matchcode_pipeline/tests/data/test-out.json new file mode 100644 index 00000000..d59cb29e --- /dev/null +++ b/matchcode_pipeline/tests/data/test-out.json @@ -0,0 +1,111 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "tool_version": "32.0.8", + "options": { + "input": [ + "." + ], + "--info": true, + "--json-pp": "/home/jono/test-out.json" + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "start_timestamp": "2024-01-05T014959.288450", + "end_timestamp": "2024-01-05T014959.409373", + "output_format_version": "3.0.0", + "duration": 0.12093544006347656, + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-6.2.16-19-pve-x86_64-with-glibc2.35", + "platform_version": "#1 SMP PREEMPT_DYNAMIC PMX 6.2.16-19 (2023-10-24T12:07Z)", + "python_version": "3.11.4 (main, Aug 3 2023, 18:50:43) [GCC 11.3.0]" + }, + "spdx_license_list_version": "3.21", + "files_count": 1 + } + } + ], + "files": [ + { + "path": "test", + "type": "directory", + "name": "test", + "base_name": "test", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 1, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/foo", + "type": "directory", + "name": "foo", + "base_name": "foo", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "test/foo/test_package", + "type": "file", + "name": "test_package", + "base_name": "test_package", + "extension": "", + "size": 0, + "date": "2024-01-05", + "sha1": null, + "md5": null, + "sha256": null, + "mime_type": "inode/x-empty", + "file_type": "empty", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 36bb30d6..4b05017d 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -1,5 +1,6 @@ from pathlib import Path - +from unittest import mock +import json from django.contrib.auth.models import 
User from django.test import TransactionTestCase from django.urls import reverse @@ -14,6 +15,8 @@ from scanpipe.models import Project from scanpipe.tests import dependency_data1 from scanpipe.tests import package_data1 +from rest_framework import status +from packagedb.models import Package class MatchCodePipelineAPITest(TransactionTestCase): @@ -37,6 +40,14 @@ def setUp(self): map_type="java_to_class", ) + self.package1 = Package.objects.create( + type=package_data1['type'], + namespace=package_data1['namespace'], + name=package_data1['name'], + version=package_data1['version'], + sha1='deadbeef', + ) + self.matching_list_url = reverse("matching-list") self.project1_detail_url = reverse("matching-detail", args=[self.project1.uuid]) @@ -46,7 +57,7 @@ def setUp(self): self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) - def test_scanpipe_api_project_list(self): + def test_matchcode_pipeline_api_matching_list(self): response = self.csrf_client.get(self.matching_list_url) self.assertContains(response, self.project1_detail_url) @@ -58,3 +69,81 @@ def test_scanpipe_api_project_list(self): self.assertNotContains(response, "package_count") self.assertNotContains(response, "dependency_count") + def test_matchcode_pipeline_api_matching_detail(self): + response = self.csrf_client.get(self.project1_detail_url) + self.assertIn(self.project1_detail_url, response.data["url"]) + self.assertEqual(str(self.project1.uuid), response.data["uuid"]) + self.assertEqual([], response.data["input_sources"]) + self.assertEqual([], response.data["runs"]) + self.assertEqual(1, response.data["resource_count"]) + self.assertEqual(1, response.data["package_count"]) + self.assertEqual(1, response.data["dependency_count"]) + self.assertEqual(1, response.data["relation_count"]) + + expected = {"": 1} + self.assertEqual(expected, response.data["codebase_resources_summary"]) + + expected = { + "total": 1, + "with_missing_resources": 0, + "with_modified_resources": 0, + } + self.assertEqual(expected, response.data["discovered_packages_summary"]) + + expected = { + "total": 1, + "is_runtime": 1, + "is_optional": 0, + "is_resolved": 0, + } + self.assertEqual(expected, response.data["discovered_dependencies_summary"]) + + expected = {"java_to_class": 1} + self.assertEqual(expected, response.data["codebase_relations_summary"]) + + self.project1.add_input_source(filename="file1", source="uploaded") + self.project1.add_input_source(filename="file2", source="https://download.url") + self.project1.save() + response = self.csrf_client.get(self.project1_detail_url) + expected = [ + {"filename": "file1", "source": "uploaded"}, + {"filename": "file2", "source": "https://download.url"}, + ] + self.assertEqual(expected, response.data["input_sources"]) + + @mock.patch("scanpipe.models.Run.execute_task_async") + def test_scanpipe_api_project_create(self, mock_execute_pipeline_task): + # load upload_file contents + test_out_loc = self.data_location / "test-out.json" + content = open(test_out_loc, 'r') + data = { + "upload_file": content, + } + + # Send match request + response = self.csrf_client.post(self.matching_list_url, data) + self.assertEqual(status.HTTP_201_CREATED, response.status_code) + self.assertEqual(1, len(response.data["runs"])) + self.assertEqual('matching', response.data["runs"][0]["pipeline_name"]) + mock_execute_pipeline_task.assert_called_once() + + created_matching_project_detail_url = response.data["url"] + matching_project_uuid = response.data['uuid'] + 
results_url = reverse("matching-results", args=[matching_project_uuid]) + + # Check that the file was uploaded + response = self.csrf_client.get(created_matching_project_detail_url) + self.assertEqual('test-out.json', response.data['input_sources'][0]['filename']) + + # Get and check results + response = self.csrf_client.get(results_url) + results = [line.decode('utf-8') for line in list(response.streaming_content)] + results = ''.join(results) + print(results) + results = json.loads(results) + + expected_results_loc = self.data_location / "match-creation-api-results.json" + with open(expected_results_loc, 'r') as f: + expected_results = json.load(f) + + self.assertEqual(expected_results, results) From a8befaa11533c88856bf45223e2167db3dbce632 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 5 Jan 2024 12:20:19 -0800 Subject: [PATCH 35/54] Create test for modified Run detail view #224 Signed-off-by: Jono Yang --- matchcode_pipeline/api.py | 9 +- matchcode_pipeline/tests/test_api.py | 152 +++++++++++++-------------- matchcode_project/urls.py | 2 +- 3 files changed, 84 insertions(+), 79 deletions(-) diff --git a/matchcode_pipeline/api.py b/matchcode_pipeline/api.py index f11e59c8..5d0d62ac 100644 --- a/matchcode_pipeline/api.py +++ b/matchcode_pipeline/api.py @@ -18,6 +18,7 @@ from scanpipe.api.serializers import SerializerExcludeFieldsMixin from scanpipe.api.serializers import StrListField from scanpipe.api.views import ProjectFilterSet +from scanpipe.api.views import RunViewSet from scanpipe.models import Project from scanpipe.models import Run from scanpipe.pipes import count_group_by @@ -26,7 +27,7 @@ class RunSerializer(SerializerExcludeFieldsMixin, serializers.ModelSerializer): - matching_project = serializers.HyperlinkedRelatedField( + project = serializers.HyperlinkedRelatedField( view_name="matching-detail", read_only=True ) @@ -37,7 +38,7 @@ class Meta: "pipeline_name", "status", "description", - "matching_project", + "project", "uuid", "created_date", "scancodeio_version", @@ -51,6 +52,10 @@ class Meta: ] +class RunViewSet(RunViewSet): + serializer_class = RunSerializer + + class MatchingSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): upload_file = serializers.FileField(write_only=True, required=False) input_urls = StrListField( diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 4b05017d..4dbd783a 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -1,33 +1,38 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + from pathlib import Path from unittest import mock -import json + from django.contrib.auth.models import User from django.test import TransactionTestCase from django.urls import reverse from rest_framework.test import APIClient -from django.contrib.auth import get_user_model - -from packagedb.models import ApiUser from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredDependency from scanpipe.models import Project +from scanpipe.models import Run from scanpipe.tests import dependency_data1 from scanpipe.tests import package_data1 from rest_framework import status -from packagedb.models import Package class MatchCodePipelineAPITest(TransactionTestCase): - databases = {'default', 'packagedb'} - data_location = Path(__file__).parent / "data" + data_location = Path(__file__).parent / 'data' def setUp(self): - self.project1 = Project.objects.create(name="Analysis") + self.project1 = Project.objects.create(name='Analysis') self.resource1 = CodebaseResource.objects.create( project=self.project1, - path="daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", + path='daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO', ) self.discovered_package1 = self.resource1.create_and_add_package(package_data1) self.discovered_dependency1 = DiscoveredDependency.create_from_data( @@ -37,22 +42,14 @@ def setUp(self): project=self.project1, from_resource=self.resource1, to_resource=self.resource1, - map_type="java_to_class", + map_type='java_to_class', ) - self.package1 = Package.objects.create( - type=package_data1['type'], - namespace=package_data1['namespace'], - name=package_data1['name'], - version=package_data1['version'], - sha1='deadbeef', - ) + self.matching_list_url = reverse('matching-list') + self.project1_detail_url = reverse('matching-detail', args=[self.project1.uuid]) - self.matching_list_url = reverse("matching-list") - self.project1_detail_url = reverse("matching-detail", args=[self.project1.uuid]) - - self.user = User.objects.create_user("username", "e@mail.com", "secret") - self.auth = f"Token {self.user.auth_token.key}" + self.user = User.objects.create_user('username', 'e@mail.com', 'secret') + self.auth = f'Token {self.user.auth_token.key}' self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) @@ -61,89 +58,92 @@ def test_matchcode_pipeline_api_matching_list(self): response = self.csrf_client.get(self.matching_list_url) self.assertContains(response, self.project1_detail_url) - self.assertEqual(1, response.data["count"]) - self.assertNotContains(response, "input_root") - self.assertNotContains(response, "extra_data") - self.assertNotContains(response, "message_count") - self.assertNotContains(response, "resource_count") - self.assertNotContains(response, "package_count") - self.assertNotContains(response, "dependency_count") + self.assertEqual(1, response.data['count']) + self.assertNotContains(response, 'input_root') + self.assertNotContains(response, 'extra_data') + self.assertNotContains(response, 'message_count') + self.assertNotContains(response, 'resource_count') + self.assertNotContains(response, 'package_count') + self.assertNotContains(response, 'dependency_count') def test_matchcode_pipeline_api_matching_detail(self): response = self.csrf_client.get(self.project1_detail_url) - self.assertIn(self.project1_detail_url, response.data["url"]) - self.assertEqual(str(self.project1.uuid), response.data["uuid"]) - self.assertEqual([], 
response.data["input_sources"]) - self.assertEqual([], response.data["runs"]) - self.assertEqual(1, response.data["resource_count"]) - self.assertEqual(1, response.data["package_count"]) - self.assertEqual(1, response.data["dependency_count"]) - self.assertEqual(1, response.data["relation_count"]) - - expected = {"": 1} - self.assertEqual(expected, response.data["codebase_resources_summary"]) + self.assertIn(self.project1_detail_url, response.data['url']) + self.assertEqual(str(self.project1.uuid), response.data['uuid']) + self.assertEqual([], response.data['input_sources']) + self.assertEqual([], response.data['runs']) + self.assertEqual(1, response.data['resource_count']) + self.assertEqual(1, response.data['package_count']) + self.assertEqual(1, response.data['dependency_count']) + self.assertEqual(1, response.data['relation_count']) + + expected = {'': 1} + self.assertEqual(expected, response.data['codebase_resources_summary']) expected = { - "total": 1, - "with_missing_resources": 0, - "with_modified_resources": 0, + 'total': 1, + 'with_missing_resources': 0, + 'with_modified_resources': 0, } - self.assertEqual(expected, response.data["discovered_packages_summary"]) + self.assertEqual(expected, response.data['discovered_packages_summary']) expected = { - "total": 1, - "is_runtime": 1, - "is_optional": 0, - "is_resolved": 0, + 'total': 1, + 'is_runtime': 1, + 'is_optional': 0, + 'is_resolved': 0, } - self.assertEqual(expected, response.data["discovered_dependencies_summary"]) + self.assertEqual(expected, response.data['discovered_dependencies_summary']) - expected = {"java_to_class": 1} - self.assertEqual(expected, response.data["codebase_relations_summary"]) + expected = {'java_to_class': 1} + self.assertEqual(expected, response.data['codebase_relations_summary']) - self.project1.add_input_source(filename="file1", source="uploaded") - self.project1.add_input_source(filename="file2", source="https://download.url") + self.project1.add_input_source(filename='file1', source='uploaded') + self.project1.add_input_source(filename='file2', source='https://download.url') self.project1.save() response = self.csrf_client.get(self.project1_detail_url) expected = [ - {"filename": "file1", "source": "uploaded"}, - {"filename": "file2", "source": "https://download.url"}, + {'filename': 'file1', 'source': 'uploaded'}, + {'filename': 'file2', 'source': 'https://download.url'}, ] - self.assertEqual(expected, response.data["input_sources"]) + self.assertEqual(expected, response.data['input_sources']) - @mock.patch("scanpipe.models.Run.execute_task_async") - def test_scanpipe_api_project_create(self, mock_execute_pipeline_task): + @mock.patch('scanpipe.models.Run.execute_task_async') + def test_matching_pipeline_api_matching_create(self, mock_execute_pipeline_task): # load upload_file contents - test_out_loc = self.data_location / "test-out.json" + test_out_loc = self.data_location / 'test-out.json' content = open(test_out_loc, 'r') data = { - "upload_file": content, + 'upload_file': content, } # Send match request response = self.csrf_client.post(self.matching_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data["runs"])) - self.assertEqual('matching', response.data["runs"][0]["pipeline_name"]) + self.assertEqual(1, len(response.data['runs'])) + self.assertEqual('matching', response.data['runs'][0]['pipeline_name']) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data["url"] + 
created_matching_project_detail_url = response.data['url'] matching_project_uuid = response.data['uuid'] - results_url = reverse("matching-results", args=[matching_project_uuid]) + results_url = reverse('matching-results', args=[matching_project_uuid]) # Check that the file was uploaded response = self.csrf_client.get(created_matching_project_detail_url) self.assertEqual('test-out.json', response.data['input_sources'][0]['filename']) - # Get and check results - response = self.csrf_client.get(results_url) - results = [line.decode('utf-8') for line in list(response.streaming_content)] - results = ''.join(results) - print(results) - results = json.loads(results) - - expected_results_loc = self.data_location / "match-creation-api-results.json" - with open(expected_results_loc, 'r') as f: - expected_results = json.load(f) - - self.assertEqual(expected_results, results) + def test_matchcode_pipeline_api_run_detail(self): + run1 = self.project1.add_pipeline('matching') + url = reverse('run-detail', args=[run1.uuid]) + response = self.csrf_client.get(url) + self.assertEqual(str(run1.uuid), response.data['uuid']) + self.assertIn(self.project1_detail_url, response.data['project']) + self.assertEqual('matching', response.data['pipeline_name']) + self.assertEqual('', response.data['description']) + self.assertEqual('', response.data['scancodeio_version']) + self.assertIsNone(response.data['task_id']) + self.assertIsNone(response.data['task_start_date']) + self.assertIsNone(response.data['task_end_date']) + self.assertEqual('', response.data['task_output']) + self.assertIsNone(response.data['execution_time']) + self.assertEqual(Run.Status.NOT_STARTED, response.data['status']) diff --git a/matchcode_project/urls.py b/matchcode_project/urls.py index ff63c5ee..5f5cc2fc 100644 --- a/matchcode_project/urls.py +++ b/matchcode_project/urls.py @@ -13,7 +13,7 @@ from rest_framework import routers from matchcode_pipeline.api import MatchingViewSet -from scanpipe.api.views import RunViewSet +from matchcode_pipeline.api import RunViewSet api_router = routers.DefaultRouter() From e61fd1f3d1b493eb51034c04250001b55f288c8c Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 9 Jan 2024 14:46:31 -0800 Subject: [PATCH 36/54] Remove ApiUser model #270 * Use create-user from scancode.io * Update tests Signed-off-by: Jono Yang --- matchcode_project/dbrouter.py | 2 + packagedb/management/commands/create-user.py | 115 ++++++++++++++++++ .../management/commands/create_api_user.py | 56 --------- packagedb/migrations/0082_delete_apiuser.py | 15 +++ packagedb/models.py | 22 ++-- packagedb/tests/test_throttling.py | 21 ++-- 6 files changed, 154 insertions(+), 77 deletions(-) create mode 100644 packagedb/management/commands/create-user.py delete mode 100644 packagedb/management/commands/create_api_user.py create mode 100644 packagedb/migrations/0082_delete_apiuser.py diff --git a/matchcode_project/dbrouter.py b/matchcode_project/dbrouter.py index 330543b9..e5090c35 100644 --- a/matchcode_project/dbrouter.py +++ b/matchcode_project/dbrouter.py @@ -15,6 +15,8 @@ class PackageDBRouter(object): 'minecode', 'matchcode', 'packagedb', + 'auth', + 'authtoken', ] def db_for_read(self, model, **hints): diff --git a/packagedb/management/commands/create-user.py b/packagedb/management/commands/create-user.py new file mode 100644 index 00000000..64d25cdc --- /dev/null +++ b/packagedb/management/commands/create-user.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The 
ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +import getpass + +from django.contrib.auth import get_user_model +from django.contrib.auth.password_validation import validate_password +from django.core import exceptions +from django.core.management.base import BaseCommand +from django.core.management.base import CommandError + +from rest_framework.authtoken.models import Token + + +class Command(BaseCommand): + help = "Create a user and generate an API key for authentication." + requires_migrations_checks = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.UserModel = get_user_model() + self.username_field = self.UserModel._meta.get_field( + self.UserModel.USERNAME_FIELD + ) + + def add_arguments(self, parser): + parser.add_argument("username", help="Specifies the username for the user.") + parser.add_argument( + "--no-input", + action="store_false", + dest="interactive", + help="Do not prompt the user for input of any kind.", + ) + + def handle(self, *args, **options): + username = options["username"] + + error_msg = self._validate_username(username) + if error_msg: + raise CommandError(error_msg) + + password = None + if options["interactive"]: + password = self.get_password_from_stdin(username) + + user = self.UserModel._default_manager.create_user(username, password=password) + token, _ = Token._default_manager.get_or_create(user=user) + + if options["verbosity"] >= 1: + msg = f"User {username} created with API key: {token.key}" + self.stdout.write(msg, self.style.SUCCESS) + + def get_password_from_stdin(self, username): + # Validators, such as UserAttributeSimilarityValidator, depends on other user's + # fields data for password validation. + fake_user_data = { + self.UserModel.USERNAME_FIELD: username, + } + + password = None + while password is None: + password1 = getpass.getpass() + password2 = getpass.getpass("Password (again): ") + if password1 != password2: + self.stderr.write("Error: Your passwords didn't match.") + continue + if password1.strip() == "": + self.stderr.write("Error: Blank passwords aren't allowed.") + continue + try: + validate_password(password2, self.UserModel(**fake_user_data)) + except exceptions.ValidationError as err: + self.stderr.write("\n".join(err.messages)) + response = input( + "Bypass password validation and create user anyway? 
[y/N]: " + ) + if response.lower() != "y": + continue + password = password1 + + return password + + def _validate_username(self, username): + """Validate username. If invalid, return a string error message.""" + if self.username_field.unique: + try: + self.UserModel._default_manager.get_by_natural_key(username) + except self.UserModel.DoesNotExist: + pass + else: + return "Error: That username is already taken." + + try: + self.username_field.clean(username, None) + except exceptions.ValidationError as e: + return "; ".join(e.messages) diff --git a/packagedb/management/commands/create_api_user.py b/packagedb/management/commands/create_api_user.py deleted file mode 100644 index bd079cf9..00000000 --- a/packagedb/management/commands/create_api_user.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# PurlDB is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from django.core import exceptions -from django.core.management.base import BaseCommand -from django.core.management.base import CommandError -from django.core.validators import validate_email - -from packagedb.models import ApiUser - -""" -Create a basic API-only user based on an email. -""" - - -class Command(BaseCommand): - help = "Create a basic passwordless user with an API key for sole API authentication usage." - requires_migrations_checks = True - - def add_arguments(self, parser): - parser.add_argument( - "--email", - help="Specifies the email for the user.", - ) - parser.add_argument( - "--first-name", - default="", - help="First name.", - ) - parser.add_argument( - "--last-name", - default="", - help="Last name.", - ) - - def handle(self, *args, **options): - - email = options["email"] - try: - validate_email(email) - user = ApiUser.objects.create_api_user( - username=email, - first_name=options["first_name"] or "", - last_name=options["last_name"] or "", - ) - except exceptions.ValidationError as e: - raise CommandError(str(e)) - - msg = f"User {user.email} created with API key: {user.auth_token.key}" - self.stdout.write(msg, self.style.SUCCESS) diff --git a/packagedb/migrations/0082_delete_apiuser.py b/packagedb/migrations/0082_delete_apiuser.py new file mode 100644 index 00000000..fc51e923 --- /dev/null +++ b/packagedb/migrations/0082_delete_apiuser.py @@ -0,0 +1,15 @@ +# Generated by Django 4.2.6 on 2024-01-09 22:12 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("packagedb", "0081_apiuser"), + ] + + operations = [ + migrations.DeleteModel( + name="ApiUser", + ), + ] diff --git a/packagedb/models.py b/packagedb/models.py index 6880ea5e..27df8f36 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -14,15 +14,17 @@ import sys import uuid -from django.contrib.auth import get_user_model +from django.conf import settings from django.contrib.auth.models import UserManager from django.contrib.postgres.fields import ArrayField from django.core import exceptions from django.core.paginator import Paginator from django.db import models from django.db import transaction +from django.dispatch import receiver from django.utils import timezone from django.utils.translation import gettext_lazy as _ +from rest_framework.authtoken.models import Token from dateutil.parser import parse as dateutil_parse 
from licensedcode.cache import build_spdx_license_expression @@ -30,7 +32,6 @@ from packageurl import PackageURL from packageurl.contrib.django.models import PackageURLMixin from packageurl.contrib.django.models import PackageURLQuerySetMixin -from rest_framework.authtoken.models import Token TRACE = False @@ -1243,9 +1244,6 @@ def get_package_set_members(self): ) -UserModel = get_user_model() - - class ApiUserManager(UserManager): def create_api_user(self, username, first_name="", last_name="", **extra_fields): """ @@ -1287,12 +1285,8 @@ def _validate_username(self, email): raise exceptions.ValidationError(f"Error: This email already exists: {email}") -class ApiUser(UserModel): - """ - A User proxy model to facilitate simplified admin API user creation. - """ - - objects = ApiUserManager() - - class Meta: - proxy = True +@receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL) +def create_auth_token(sender, instance=None, created=False, **kwargs): + """Create an API key token on user creation, using the signal system.""" + if created: + Token.objects.create(user_id=instance.pk) diff --git a/packagedb/tests/test_throttling.py b/packagedb/tests/test_throttling.py index 1e3d9253..96f16c2c 100644 --- a/packagedb/tests/test_throttling.py +++ b/packagedb/tests/test_throttling.py @@ -11,27 +11,34 @@ from rest_framework.test import APITestCase from unittest.mock import patch -from packagedb.models import ApiUser - +from django.contrib.auth.models import User @patch('rest_framework.throttling.UserRateThrottle.get_rate', lambda x: '20/day') @patch('rest_framework.throttling.AnonRateThrottle.get_rate', lambda x: '10/day') class ThrottleApiTests(APITestCase): def setUp(self): # create a basic user - self.user = ApiUser.objects.create_api_user(username='e@mail.com') - self.auth = f'Token {self.user.auth_token.key}' + self.user = User.objects.create_user( + username="username", + email="e@mail.com", + password="secret" + ) + self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) # create a staff user - self.staff_user = ApiUser.objects.create_api_user(username='staff@mail.com', is_staff=True) - self.staff_auth = f'Token {self.staff_user.auth_token.key}' + self.staff_user = User.objects.create_user( + username="staff_username", + email="staff_e@mail.com", + password="secret", + is_staff=True + ) + self.staff_auth = f"Token {self.staff_user.auth_token.key}" self.staff_csrf_client = APIClient(enforce_csrf_checks=True) self.staff_csrf_client.credentials(HTTP_AUTHORIZATION=self.staff_auth) self.csrf_client_anon = APIClient(enforce_csrf_checks=True) - self.csrf_client_anon_1 = APIClient(enforce_csrf_checks=True) def test_package_endpoint_throttling(self): for i in range(0, 20): From 1f0b07385df8c384956ad9f03c2f79aa28da262e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 9 Jan 2024 16:35:11 -0800 Subject: [PATCH 37/54] Create Make commands for running matchcode server #224 Signed-off-by: Jono Yang --- Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Makefile b/Makefile index efcab3a3..7ae37937 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ PYTHON_EXE?=python3 VENV=venv MANAGE=${VENV}/bin/python manage_purldb.py +MATCHCODE_MANAGE=${VENV}/bin/python manage_matchcode.py ACTIVATE?=. 
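A minimal sketch of what the ``create_auth_token`` receiver above implies, assuming the default ``User`` model and the ``packagedb`` app (where the receiver lives) are installed — this is also why the updated throttling tests can read ``user.auth_token.key`` straight after ``create_user()``::

    from django.contrib.auth.models import User

    # The post_save signal fires on creation and the receiver creates the
    # DRF token, so no explicit Token creation is needed in test setUp().
    user = User.objects.create_user(
        username="demo", email="demo@example.com", password="secret"
    )
    print(user.auth_token.key)  # reverse one-to-one to the auto-created token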
${VENV}/bin/activate; VIRTUALENV_PYZ=../etc/thirdparty/virtualenv.pyz # Do not depend on Python to generate the SECRET_KEY @@ -19,6 +20,7 @@ GET_SECRET_KEY=`base64 /dev/urandom | head -c50` ENV_FILE=.env # Customize with `$ make postgres PACKAGEDB_DB_PASSWORD=YOUR_PASSWORD` PACKAGEDB_DB_PASSWORD=packagedb +SCANCODEIO_DB_PASSWORD=scancodeio # Use sudo for postgres, but only on Linux UNAME := $(shell uname) @@ -87,9 +89,23 @@ postgres: ${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=packagedb packagedb @$(MAKE) migrate +postgres_scancodeio: + @echo "-> Configure PostgreSQL database" + @echo "-> Create database user 'scancodeio'" + ${SUDO_POSTGRES} createuser --no-createrole --no-superuser --login --inherit --createdb scancodeio || true + ${SUDO_POSTGRES} psql -c "alter user scancodeio with encrypted password '${SCANCODEIO_DB_PASSWORD}';" || true + @echo "-> Drop 'scancodeio' database" + ${SUDO_POSTGRES} dropdb scancodeio || true + @echo "-> Create 'scancodeio' database" + ${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=scancodeio scancodeio + ${MATCHCODE_MANAGE} migrate + run: ${MANAGE} runserver 8001 --insecure +run_matchcode: + ${MATCHCODE_MANAGE} runserver 8002 --insecure + seed: ${MANAGE} seed From b5a3c47421cc3b3524f0e2e21c631d01e5b92f61 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Wed, 10 Jan 2024 16:44:13 -0800 Subject: [PATCH 38/54] Copy .env to /etc/scancodeio during testing #224 Signed-off-by: Jono Yang --- .github/workflows/purldb-tests.yml | 54 +++++++++++++------ .../create-multiple-postgresql-databases.sh | 13 +++-- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/.github/workflows/purldb-tests.yml b/.github/workflows/purldb-tests.yml index 3601ccc6..8f0c2bd1 100644 --- a/.github/workflows/purldb-tests.yml +++ b/.github/workflows/purldb-tests.yml @@ -3,35 +3,39 @@ name: PurlDB Tests CI on: [push, pull_request] env: - POSTGRES_DB: packagedb + #POSTGRES_DB: packagedb POSTGRES_USER: packagedb POSTGRES_PASSWORD: packagedb POSTGRES_INITDB_ARGS: --encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 + POSTGRES_MULTIPLE_DATABASES: scancodeio packagedb jobs: build: runs-on: ubuntu-20.04 - services: - postgres: - image: postgres:13 - env: - POSTGRES_DB: ${{ env.POSTGRES_DB }} - POSTGRES_USER: ${{ env.POSTGRES_USER }} - POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }} - POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 + # services: + # postgres: + # image: postgres:13 + # env: + # POSTGRES_DB: ${{ env.POSTGRES_DB }} + # POSTGRES_USER: ${{ env.POSTGRES_USER }} + # POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }} + # POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} + # POSTGRES_MULTIPLE_DATABASES: scancodeio:scancodeio:scancodeio + # options: >- + # --health-cmd pg_isready + # --health-interval 10s + # --health-timeout 5s + # --health-retries 5 + # ports: + # - 5432:5432 + # volumes: + # - ${{ github.workspace }}/etc/multiple-databases:/docker-entrypoint-initdb.d strategy: max-parallel: 4 matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.10"] steps: - name: Checkout code @@ -42,6 +46,20 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Set up Postgres 13 + working-directory: . 
+ run: | + docker run \ + --name postgres \ + -p 5432:5432 \ + -e POSTGRES_USER=${{ env.POSTGRES_USER }} \ + -e POSTGRES_PASSWORD=${{ env.POSTGRES_PASSWORD }} \ + -e POSTGRES_INITDB_ARGS="${{ env.POSTGRES_INITDB_ARGS }}" \ + -e POSTGRES_MULTIPLE_DATABASES="scancodeio packagedb" \ + -v ${{ github.workspace }}/etc/multiple-databases:/docker-entrypoint-initdb.d \ + -d \ + postgres:13 + - name: Install dependencies working-directory: . run: | @@ -51,4 +69,6 @@ jobs: working-directory: . run: | make envfile + sudo mkdir /etc/scancodeio + sudo cp .env /etc/scancodeio make test diff --git a/etc/multiple-databases/create-multiple-postgresql-databases.sh b/etc/multiple-databases/create-multiple-postgresql-databases.sh index b582883a..8f3e64fc 100644 --- a/etc/multiple-databases/create-multiple-postgresql-databases.sh +++ b/etc/multiple-databases/create-multiple-postgresql-databases.sh @@ -4,21 +4,20 @@ set -e set -u function create_user_and_database() { - local dbinfo=$1 - IFS=":" read -r database user password <<< "$dbinfo" + local database=$1 - echo "Creating database '$database' with user '$user' and password '$password'" + echo "Creating database '$database' with user '$database' and password '$database'" psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL - CREATE USER $user; - ALTER USER $user WITH ENCRYPTED PASSWORD '$password'; + CREATE USER $database; + ALTER USER $database WITH ENCRYPTED PASSWORD '$database'; CREATE DATABASE $database; - GRANT ALL PRIVILEGES ON DATABASE $database TO $user; + GRANT ALL PRIVILEGES ON DATABASE $database TO $database; EOSQL } if [ -n "$POSTGRES_MULTIPLE_DATABASES" ]; then echo "Multiple database creation requested: $POSTGRES_MULTIPLE_DATABASES" - for db in $(echo $POSTGRES_MULTIPLE_DATABASES | tr ',' ' '); do + for db in $(echo $POSTGRES_MULTIPLE_DATABASES); do create_user_and_database $db done echo "Multiple databases created" From 7a19e4ce5486d88b4b5512be4447b7a6e8799987 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 12 Jan 2024 22:43:21 -0800 Subject: [PATCH 39/54] Use two databases in testing #224 Signed-off-by: Jono Yang --- .github/workflows/purldb-tests.yml | 67 ++++++++++++++---------------- Makefile | 7 ++++ 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/.github/workflows/purldb-tests.yml b/.github/workflows/purldb-tests.yml index 8f0c2bd1..19d23610 100644 --- a/.github/workflows/purldb-tests.yml +++ b/.github/workflows/purldb-tests.yml @@ -3,34 +3,45 @@ name: PurlDB Tests CI on: [push, pull_request] env: - #POSTGRES_DB: packagedb + POSTGRES_DB: packagedb POSTGRES_USER: packagedb POSTGRES_PASSWORD: packagedb POSTGRES_INITDB_ARGS: --encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 - POSTGRES_MULTIPLE_DATABASES: scancodeio packagedb jobs: build: runs-on: ubuntu-20.04 - # services: - # postgres: - # image: postgres:13 - # env: - # POSTGRES_DB: ${{ env.POSTGRES_DB }} - # POSTGRES_USER: ${{ env.POSTGRES_USER }} - # POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }} - # POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} - # POSTGRES_MULTIPLE_DATABASES: scancodeio:scancodeio:scancodeio - # options: >- - # --health-cmd pg_isready - # --health-interval 10s - # --health-timeout 5s - # --health-retries 5 - # ports: - # - 5432:5432 - # volumes: - # - ${{ github.workspace }}/etc/multiple-databases:/docker-entrypoint-initdb.d + services: + postgres1: + image: postgres:13 + env: + POSTGRES_DB: ${{ env.POSTGRES_DB }} + POSTGRES_USER: ${{ env.POSTGRES_USER }} + POSTGRES_PASSWORD: ${{ env.POSTGRES_PASSWORD }} + 
POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + postgres2: + image: postgres:13 + env: + POSTGRES_DB: scancodeio + POSTGRES_USER: scancodeio + POSTGRES_PASSWORD: scancodeio + POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5433:5432 strategy: max-parallel: 4 @@ -46,20 +57,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Set up Postgres 13 - working-directory: . - run: | - docker run \ - --name postgres \ - -p 5432:5432 \ - -e POSTGRES_USER=${{ env.POSTGRES_USER }} \ - -e POSTGRES_PASSWORD=${{ env.POSTGRES_PASSWORD }} \ - -e POSTGRES_INITDB_ARGS="${{ env.POSTGRES_INITDB_ARGS }}" \ - -e POSTGRES_MULTIPLE_DATABASES="scancodeio packagedb" \ - -v ${{ github.workspace }}/etc/multiple-databases:/docker-entrypoint-initdb.d \ - -d \ - postgres:13 - - name: Install dependencies working-directory: . run: | @@ -68,7 +65,7 @@ jobs: - name: Run tests working-directory: . run: | - make envfile + make envfile_testing sudo mkdir /etc/scancodeio sudo cp .env /etc/scancodeio make test diff --git a/Makefile b/Makefile index 7ae37937..289eef62 100644 --- a/Makefile +++ b/Makefile @@ -48,6 +48,13 @@ envfile: @mkdir -p $(shell dirname ${ENV_FILE}) && touch ${ENV_FILE} @echo SECRET_KEY=\"${GET_SECRET_KEY}\" > ${ENV_FILE} +envfile_testing: + @echo "-> Create the .env file and generate a secret key" + @if test -f ${ENV_FILE}; then echo ".env file exists already"; exit 1; fi + @mkdir -p $(shell dirname ${ENV_FILE}) && touch ${ENV_FILE} + @echo SECRET_KEY=\"${GET_SECRET_KEY}\" >> ${ENV_FILE} + @echo SCANCODEIO_DB_PORT=\"5433\" >> ${ENV_FILE} + isort: @echo "-> Apply isort changes to ensure proper imports ordering" ${VENV}/bin/isort . 
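For local debugging of the dual-database CI layout above, a small connectivity sketch — ports and credentials are taken from the two workflow services, and ``psycopg`` 3 is already a project dependency; nothing here is part of the test suite itself::

    import psycopg

    # postgres1 (packagedb) is published on 5432, postgres2 (scancodeio) on 5433.
    for dsn in (
        "host=localhost port=5432 dbname=packagedb user=packagedb password=packagedb",
        "host=localhost port=5433 dbname=scancodeio user=scancodeio password=scancodeio",
    ):
        with psycopg.connect(dsn) as conn:
            print(conn.execute("SELECT version()").fetchone()[0])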
From c98dad4fac9ae97bb782e859db2f98d48c69c73e Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 15 Jan 2024 11:21:30 -0800 Subject: [PATCH 40/54] Update expected test results #224 Signed-off-by: Jono Yang --- minecode/tests/testfiles/directories/ls-lr-expected.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/minecode/tests/testfiles/directories/ls-lr-expected.json b/minecode/tests/testfiles/directories/ls-lr-expected.json index 9b088b89..1692d6a4 100644 --- a/minecode/tests/testfiles/directories/ls-lr-expected.json +++ b/minecode/tests/testfiles/directories/ls-lr-expected.json @@ -101,14 +101,14 @@ "path":"dists/experimental/InRelease", "type":"f", "size":187349, - "date":"2023-01", + "date":"2024-01", "target":null }, { "path":"dists/experimental/Release.gpg", "type":"f", "size":1554, - "date":"2023-01", + "date":"2024-01", "target":null }, { From 4ecfa10b275d884102bbdfbe469fbf96eb288511 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 15 Jan 2024 12:41:11 -0800 Subject: [PATCH 41/54] Do not route auth apps to PackageDB #224 Signed-off-by: Jono Yang --- matchcode_project/dbrouter.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/matchcode_project/dbrouter.py b/matchcode_project/dbrouter.py index e5090c35..330543b9 100644 --- a/matchcode_project/dbrouter.py +++ b/matchcode_project/dbrouter.py @@ -15,8 +15,6 @@ class PackageDBRouter(object): 'minecode', 'matchcode', 'packagedb', - 'auth', - 'authtoken', ] def db_for_read(self, model, **hints): From d97a5040b594efa617485effc5aac9f649f87f63 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 15 Jan 2024 14:49:49 -0800 Subject: [PATCH 42/54] Get or create Token to avoid creating existing token #224 Signed-off-by: Jono Yang --- .github/workflows/purldb-tests.yml | 2 +- matchcode_pipeline/tests/test_api.py | 1 + packagedb/models.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/purldb-tests.yml b/.github/workflows/purldb-tests.yml index 19d23610..7652ea04 100644 --- a/.github/workflows/purldb-tests.yml +++ b/.github/workflows/purldb-tests.yml @@ -46,7 +46,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.10"] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - name: Checkout code diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 4dbd783a..510d72d9 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -26,6 +26,7 @@ class MatchCodePipelineAPITest(TransactionTestCase): + databases = {'default', 'packagedb'} data_location = Path(__file__).parent / 'data' def setUp(self): diff --git a/packagedb/models.py b/packagedb/models.py index 27df8f36..c9b7bbc6 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -1289,4 +1289,4 @@ def _validate_username(self, email): def create_auth_token(sender, instance=None, created=False, **kwargs): """Create an API key token on user creation, using the signal system.""" if created: - Token.objects.create(user_id=instance.pk) + Token.objects.get_or_create(user_id=instance.pk) From 598ed4f308e3ad95e7afa7b89846140d58bedc6a Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 16 Jan 2024 10:34:30 -0800 Subject: [PATCH 43/54] Use separate databases for purldb and matchcode #224 Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 12 ++++++++-- docker-compose_purldb.yml | 1 - ...er_matchcode.env => docker_matchcodeio.env | 9 ++++--- docker_purldb.env | 1 - .../create-multiple-postgresql-databases.sh | 24 ------------------- 5 
files changed, 14 insertions(+), 33 deletions(-) rename docker_matchcode.env => docker_matchcodeio.env (59%) delete mode 100644 etc/multiple-databases/create-multiple-postgresql-databases.sh diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index 663a909d..dbb86bfa 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -1,6 +1,13 @@ version: "3" services: + matchcodeio_db: + image: postgres:13 + env_file: + - docker_matchcodeio.env + volumes: + - matchcodeio_db_data:/var/lib/postgresql/data/ + matchcodeio_redis: image: redis # Enable redis data persistence using the "Append Only File" with the @@ -19,7 +26,7 @@ services: environment: - DJANGO_SETTINGS_MODULE=matchcode_project.settings env_file: - - docker_matchcode.env + - docker_matchcodeio.env expose: - 8001 volumes: @@ -38,7 +45,7 @@ services: environment: - DJANGO_SETTINGS_MODULE=matchcode_project.settings env_file: - - docker_matchcode.env + - docker_matchcodeio.env volumes: - .env:/opt/scancodeio/.env - /etc/scancodeio/:/etc/scancodeio/ @@ -78,3 +85,4 @@ volumes: redis_data: static: workspace: + matchcodeio_db_data: diff --git a/docker-compose_purldb.yml b/docker-compose_purldb.yml index ccf5691c..6f7da682 100644 --- a/docker-compose_purldb.yml +++ b/docker-compose_purldb.yml @@ -6,7 +6,6 @@ services: env_file: - docker_purldb.env volumes: - - ./etc/multiple-databases:/docker-entrypoint-initdb.d - db_data:/var/lib/postgresql/data/ web: diff --git a/docker_matchcode.env b/docker_matchcodeio.env similarity index 59% rename from docker_matchcode.env rename to docker_matchcodeio.env index 77b4284e..3a0414f9 100644 --- a/docker_matchcode.env +++ b/docker_matchcodeio.env @@ -1,12 +1,11 @@ -POSTGRES_MULTIPLE_DATABASES=scancodeio:scancodeio:scancodeio -POSTGRES_DB=packagedb -POSTGRES_USER=packagedb -POSTGRES_PASSWORD=packagedb +POSTGRES_DB=scancodeio +POSTGRES_USER=scancodeio +POSTGRES_PASSWORD=scancodeio POSTGRES_INITDB_ARGS=--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 PACKAGEDB_DB_HOST=db -SCANCODEIO_DB_HOST=db +SCANCODEIO_DB_HOST=matchcodeio_db SCANCODEIO_REDIS_HOST=matchcodeio_redis SCANCODEIO_ASYNC=True SCANCODEIO_WORKSPACE_LOCATION=/var/scancodeio/workspace/ diff --git a/docker_purldb.env b/docker_purldb.env index dcdd8fd8..1b588cc0 100644 --- a/docker_purldb.env +++ b/docker_purldb.env @@ -1,4 +1,3 @@ -POSTGRES_MULTIPLE_DATABASES=scancodeio:scancodeio:scancodeio POSTGRES_DB=packagedb POSTGRES_USER=packagedb POSTGRES_PASSWORD=packagedb diff --git a/etc/multiple-databases/create-multiple-postgresql-databases.sh b/etc/multiple-databases/create-multiple-postgresql-databases.sh deleted file mode 100644 index 8f3e64fc..00000000 --- a/etc/multiple-databases/create-multiple-postgresql-databases.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -set -e -set -u - -function create_user_and_database() { - local database=$1 - - echo "Creating database '$database' with user '$database' and password '$database'" - psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" <<-EOSQL - CREATE USER $database; - ALTER USER $database WITH ENCRYPTED PASSWORD '$database'; - CREATE DATABASE $database; - GRANT ALL PRIVILEGES ON DATABASE $database TO $database; -EOSQL -} - -if [ -n "$POSTGRES_MULTIPLE_DATABASES" ]; then - echo "Multiple database creation requested: $POSTGRES_MULTIPLE_DATABASES" - for db in $(echo $POSTGRES_MULTIPLE_DATABASES); do - create_user_and_database $db - done - echo "Multiple databases created" -fi From cea92104bcf272088be3f8cda707dac54ee483c0 Mon Sep 17 00:00:00 2001 
From: Jono Yang Date: Tue, 16 Jan 2024 18:52:15 -0800 Subject: [PATCH 44/54] Set up https traefik route #224 Signed-off-by: Jono Yang --- docker-compose_purldb_production.yml | 146 +++++++++++++++++++++++++++ docker-compose_traefik.yml | 1 + 2 files changed, 147 insertions(+) create mode 100644 docker-compose_purldb_production.yml diff --git a/docker-compose_purldb_production.yml b/docker-compose_purldb_production.yml new file mode 100644 index 00000000..b554ee16 --- /dev/null +++ b/docker-compose_purldb_production.yml @@ -0,0 +1,146 @@ +version: "3" + +services: + db: + image: postgres:13 + env_file: + - docker_purldb.env + volumes: + - db_data:/var/lib/postgresql/data/ + + web: + build: . + command: sh -c " + python manage_purldb.py migrate && + python manage_purldb.py collectstatic --no-input --verbosity 0 --clear && + gunicorn purldb_project.wsgi:application --bind :8000 --timeout 600 --workers 8" + env_file: + - docker_purldb.env + expose: + - 8000 + volumes: + - /etc/purldb/:/etc/purldb/ + - static:/var/purldb/static/ + depends_on: + - db + + visitor: + build: . + command: sh -c " + wait-for-it web:8000 -- python manage_purldb.py seed && + python manage_purldb.py run_visit --ignore-robots --ignore-throttle" + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - visit_and_map + depends_on: + - db + - web # Ensure that potential db migrations run first + + mapper: + build: . + command: wait-for-it web:8000 -- python manage_purldb.py run_map + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - visit_and_map + depends_on: + - db + - web # Ensure that potential db migrations run first + + clearsync: + build: . + command: wait-for-it web:8000 -- clearsync --save-to-db --verbose -n 3 + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - clearsync + depends_on: + - db + - web # Ensure that potential db migrations run first + + clearindex: + build: . + command: wait-for-it web:8000 -- python manage_purldb.py run_clearindex + profiles: + - clearsync + depends_on: + - db + - web # Ensure that potential db migrations run first + + request_scan: + build: . + command: wait-for-it web:8000 -- python manage_purldb.py request_scans + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - scan_queue + depends_on: + - db + - web + + process_scan: + build: . + command: wait-for-it web:8000 -- python manage_purldb.py process_scans + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - scan_queue + depends_on: + - db + - web + + priority_queue: + build: . 
+ command: wait-for-it web:8000 -- python manage_purldb.py priority_queue + env_file: + - docker_purldb.env + volumes: + - /etc/purldb/:/etc/purldb/ + profiles: + - priority_queue + depends_on: + - db + - web + + nginx: + image: nginx + labels: + - "traefik.enable=true" + - "traefik.http.middlewares.websecure.redirectscheme.scheme=https" + - "traefik.http.routers.web.entrypoints=web" + - "traefik.http.routers.web.rule= + Host(`127.0.0.1`) + || Host(`localhost`) + || Host(`192.168.1.12`)" + - "traefik.http.routers.web.middlewares=websecure@docker" + - "traefik.http.routers.websecure.entrypoints=websecure" + - "traefik.http.routers.websecure.tls=true" + - "traefik.http.routers.websecure.rule= + Host(`127.0.0.1`) + || Host(`localhost`) + || Host(`192.168.1.12`)" + volumes: + - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ + - static:/var/purldb/static/ + depends_on: + - web + +networks: + default: + name: purldb + external: true + +volumes: + db_data: + static: diff --git a/docker-compose_traefik.yml b/docker-compose_traefik.yml index b6ee4c38..9bd98ad5 100644 --- a/docker-compose_traefik.yml +++ b/docker-compose_traefik.yml @@ -7,6 +7,7 @@ services: hostname: "traefik" ports: - "80:80" + - "443:443" - "8080:8080" volumes: - "/var/run/docker.sock:/var/run/docker.sock:ro" From f8483ce7b1b3df5d869befae9c3e527b0898e655 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 25 Jan 2024 16:18:15 -0800 Subject: [PATCH 45/54] Update docker-compose files #224 * Declare external network differently * Remove production docker-compose file Signed-off-by: Jono Yang --- docker-compose_matchcodeio.yml | 4 +- docker-compose_purldb_production.yml | 146 --------------------------- docker-compose_purldb_public.yml | 9 +- docker-compose_traefik.yml | 5 +- 4 files changed, 10 insertions(+), 154 deletions(-) delete mode 100644 docker-compose_purldb_production.yml diff --git a/docker-compose_matchcodeio.yml b/docker-compose_matchcodeio.yml index dbb86bfa..dedcc8c8 100644 --- a/docker-compose_matchcodeio.yml +++ b/docker-compose_matchcodeio.yml @@ -78,8 +78,8 @@ services: networks: default: - external: - name: purldb + name: purldb + external: true volumes: redis_data: diff --git a/docker-compose_purldb_production.yml b/docker-compose_purldb_production.yml deleted file mode 100644 index b554ee16..00000000 --- a/docker-compose_purldb_production.yml +++ /dev/null @@ -1,146 +0,0 @@ -version: "3" - -services: - db: - image: postgres:13 - env_file: - - docker_purldb.env - volumes: - - db_data:/var/lib/postgresql/data/ - - web: - build: . - command: sh -c " - python manage_purldb.py migrate && - python manage_purldb.py collectstatic --no-input --verbosity 0 --clear && - gunicorn purldb_project.wsgi:application --bind :8000 --timeout 600 --workers 8" - env_file: - - docker_purldb.env - expose: - - 8000 - volumes: - - /etc/purldb/:/etc/purldb/ - - static:/var/purldb/static/ - depends_on: - - db - - visitor: - build: . - command: sh -c " - wait-for-it web:8000 -- python manage_purldb.py seed && - python manage_purldb.py run_visit --ignore-robots --ignore-throttle" - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - visit_and_map - depends_on: - - db - - web # Ensure that potential db migrations run first - - mapper: - build: . 
- command: wait-for-it web:8000 -- python manage_purldb.py run_map - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - visit_and_map - depends_on: - - db - - web # Ensure that potential db migrations run first - - clearsync: - build: . - command: wait-for-it web:8000 -- clearsync --save-to-db --verbose -n 3 - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - clearsync - depends_on: - - db - - web # Ensure that potential db migrations run first - - clearindex: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py run_clearindex - profiles: - - clearsync - depends_on: - - db - - web # Ensure that potential db migrations run first - - request_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py request_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - - process_scan: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py process_scans - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - scan_queue - depends_on: - - db - - web - - priority_queue: - build: . - command: wait-for-it web:8000 -- python manage_purldb.py priority_queue - env_file: - - docker_purldb.env - volumes: - - /etc/purldb/:/etc/purldb/ - profiles: - - priority_queue - depends_on: - - db - - web - - nginx: - image: nginx - labels: - - "traefik.enable=true" - - "traefik.http.middlewares.websecure.redirectscheme.scheme=https" - - "traefik.http.routers.web.entrypoints=web" - - "traefik.http.routers.web.rule= - Host(`127.0.0.1`) - || Host(`localhost`) - || Host(`192.168.1.12`)" - - "traefik.http.routers.web.middlewares=websecure@docker" - - "traefik.http.routers.websecure.entrypoints=websecure" - - "traefik.http.routers.websecure.tls=true" - - "traefik.http.routers.websecure.rule= - Host(`127.0.0.1`) - || Host(`localhost`) - || Host(`192.168.1.12`)" - volumes: - - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ - - static:/var/purldb/static/ - depends_on: - - web - -networks: - default: - name: purldb - external: true - -volumes: - db_data: - static: diff --git a/docker-compose_purldb_public.yml b/docker-compose_purldb_public.yml index becbd82f..4ffbf261 100644 --- a/docker-compose_purldb_public.yml +++ b/docker-compose_purldb_public.yml @@ -115,9 +115,12 @@ services: nginx: image: nginx - ports: - - 80:80 - - 443:443 + labels: + - "traefik.enable=true" + - "traefik.http.routers.web.rule= + Host(`127.0.0.1`) + || Host(`localhost`)" + - "traefik.http.routers.web.entrypoints=web" volumes: - ./etc/nginx/conf.d/:/etc/nginx/conf.d/ - static:/var/purldb/static/ diff --git a/docker-compose_traefik.yml b/docker-compose_traefik.yml index 9bd98ad5..76e706bc 100644 --- a/docker-compose_traefik.yml +++ b/docker-compose_traefik.yml @@ -8,12 +8,11 @@ services: ports: - "80:80" - "443:443" - - "8080:8080" volumes: - "/var/run/docker.sock:/var/run/docker.sock:ro" - "./traefik.yml:/traefik.yml:ro" networks: default: - external: - name: purldb + name: purldb + external: true From 5a0a5e9a5dc24b451c7e28ae460a182492e435f7 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 26 Jan 2024 15:45:56 -0800 Subject: [PATCH 46/54] Create matchcodeio database and use it #224 Signed-off-by: Jono Yang --- Makefile | 18 +++++++------- docker_matchcodeio.env | 6 ++--- matchcode_project/settings.py | 46 ++++++++++++++++++++--------------- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/Makefile 
b/Makefile index 289eef62..dc5b6987 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ GET_SECRET_KEY=`base64 /dev/urandom | head -c50` ENV_FILE=.env # Customize with `$ make postgres PACKAGEDB_DB_PASSWORD=YOUR_PASSWORD` PACKAGEDB_DB_PASSWORD=packagedb -SCANCODEIO_DB_PASSWORD=scancodeio +MATCHCODEIO_DB_PASSWORD=matchcodeio # Use sudo for postgres, but only on Linux UNAME := $(shell uname) @@ -96,15 +96,15 @@ postgres: ${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=packagedb packagedb @$(MAKE) migrate -postgres_scancodeio: +postgres_matchcodeio: @echo "-> Configure PostgreSQL database" - @echo "-> Create database user 'scancodeio'" - ${SUDO_POSTGRES} createuser --no-createrole --no-superuser --login --inherit --createdb scancodeio || true - ${SUDO_POSTGRES} psql -c "alter user scancodeio with encrypted password '${SCANCODEIO_DB_PASSWORD}';" || true - @echo "-> Drop 'scancodeio' database" - ${SUDO_POSTGRES} dropdb scancodeio || true - @echo "-> Create 'scancodeio' database" - ${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=scancodeio scancodeio + @echo "-> Create database user 'matchcodeio'" + ${SUDO_POSTGRES} createuser --no-createrole --no-superuser --login --inherit --createdb matchcodeio || true + ${SUDO_POSTGRES} psql -c "alter user matchcodeio with encrypted password '${MATCHCODEIO_DB_PASSWORD}';" || true + @echo "-> Drop 'matchcodeio' database" + ${SUDO_POSTGRES} dropdb matchcodeio || true + @echo "-> Create 'matchcodeio' database" + ${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=matchcodeio matchcodeio ${MATCHCODE_MANAGE} migrate run: diff --git a/docker_matchcodeio.env b/docker_matchcodeio.env index 3a0414f9..e9f876a1 100644 --- a/docker_matchcodeio.env +++ b/docker_matchcodeio.env @@ -1,6 +1,6 @@ -POSTGRES_DB=scancodeio -POSTGRES_USER=scancodeio -POSTGRES_PASSWORD=scancodeio +POSTGRES_DB=matchcodeio +POSTGRES_USER=matchcodeio +POSTGRES_PASSWORD=matchcodeio POSTGRES_INITDB_ARGS=--encoding=UTF-8 --lc-collate=en_US.UTF-8 --lc-ctype=en_US.UTF-8 PACKAGEDB_DB_HOST=db diff --git a/matchcode_project/settings.py b/matchcode_project/settings.py index 7496b432..72df7878 100644 --- a/matchcode_project/settings.py +++ b/matchcode_project/settings.py @@ -11,32 +11,40 @@ INSTALLED_APPS += [ - "clearcode", - "clearindex", - "matchcode", - "minecode", - "packagedb", + 'clearcode', + 'clearindex', + 'matchcode', + 'minecode', + 'packagedb', ] # Database -DATABASES.update( - { - 'packagedb': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, - } +DATABASES = { + 'default': { + 'ENGINE': env.str('SCANCODEIO_DB_ENGINE', 'django.db.backends.postgresql'), + 'HOST': env.str('SCANCODEIO_DB_HOST', 'localhost'), + 'NAME': env.str('SCANCODEIO_DB_NAME', 'matchcodeio'), + 'USER': env.str('SCANCODEIO_DB_USER', 'matchcodeio'), + 'PASSWORD': env.str('SCANCODEIO_DB_PASSWORD', 'matchcodeio'), + 'PORT': env.str('SCANCODEIO_DB_PORT', '5432'), + 'ATOMIC_REQUESTS': True, + }, + 'packagedb': { + 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), + 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), + 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), + 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), + 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), + 
'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), + 'ATOMIC_REQUESTS': True, } -) +} + DATABASE_ROUTERS = [ - "matchcode_project.dbrouter.PackageDBRouter", - "matchcode_project.dbrouter.ScancodeIORouter", + 'matchcode_project.dbrouter.PackageDBRouter', + 'matchcode_project.dbrouter.ScancodeIORouter', ] ROOT_URLCONF = 'matchcode_project.urls' From bca5869051a5fa931f3a77983d72e976a77a12fe Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 26 Jan 2024 17:51:07 -0800 Subject: [PATCH 47/54] Update README.rst #224 Signed-off-by: Jono Yang --- Makefile | 2 +- README.rst | 49 +++++++++++++++++++++++++++---------------------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index dc5b6987..8cb8f46b 100644 --- a/Makefile +++ b/Makefile @@ -110,7 +110,7 @@ postgres_matchcodeio: run: ${MANAGE} runserver 8001 --insecure -run_matchcode: +run_matchcodeio: ${MATCHCODE_MANAGE} runserver 8002 --insecure seed: diff --git a/README.rst b/README.rst index 2227dc42..2f1f2f6f 100644 --- a/README.rst +++ b/README.rst @@ -8,6 +8,7 @@ This repo consists of four main tools: - MineCode that contains utilities to mine package repositories - MatchCode that contains utilities to index package metadata and resources for matching +- MatchCode.io that provides package matching functionalities for codebases - ClearCode that contains utilities to mine Clearlydefined for package data These are designed to be used first for reference such that one can query for @@ -39,6 +40,7 @@ Once the prerequisites have been installed, set up PurlDB with the following com make dev make envfile make postgres + make postgres_matchcodeio Once PurlDB and the database has been set up, run tests to ensure functionality: :: @@ -53,6 +55,11 @@ Start the PurlDB server by running: make run +Start the MatchCode.io server by running: +:: + + make run_matchcodeio + To start visiting upstream package repositories for package metadata: :: @@ -69,33 +76,13 @@ Populating Package Resource Data The Resources of Packages can be collected using the scan queue. By default, a scan request will be created for each mapped Package. -The following environment variables will have to be set for the scan queue -commands to work: +Given that you have access to a ScanCode.io instance, the following environment +variables will have to be set for the scan queue commands to work: :: SCANCODEIO_URL= SCANCODEIO_API_KEY= -``matchcode-toolkit`` will also have to be installed in the same environment as -ScanCode.io. If running ScanCode.io in a virtual environment from a git -checkout, you can install ``matchcode-toolkit`` in editable mode: -:: - - pip install -e - -Otherwise, you can create a wheel from ``matchcode-toolkit`` and install it in -the ScanCode.io virutal environment or modify the ScanCode.io Dockerfile to -install the ``matchcode-toolkit`` wheel. - -To build the ``matchcode-toolkit`` wheel: -:: - - # From the matchcode-toolkit directory - python setup.py bdist_wheel - -The wheel ``matchcode_toolkit-0.0.1-py3-none-any.whl`` will be created in the -``matchcode-toolkit/dist/`` directory. - The scan queue is run using two commands: :: @@ -136,6 +123,24 @@ matching indices from the collected Package data: make index_packages +MatchCode.io +------------ + +MatchCode.io is a Django app, based off of ScanCode.io, that exposes one API +endpoint, ``api/matching``, which takes a ScanCode.io codebase scan, and +performs Package matching on it. 
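As a concrete illustration of the ``api/matching`` endpoint described above — a hedged sketch, assuming a local MatchCode.io dev server started with ``make run_matchcodeio`` (port 8002), a valid API token, and an existing ScanCode.io JSON scan on disk; the ``upload_file`` field name follows the pipeline API tests earlier in this series::

    import requests

    # Hypothetical client call: upload a codebase scan and let the "matching"
    # pipeline run server-side; the token placeholder must be replaced.
    with open("scan-results.json", "rb") as scan:
        response = requests.post(
            "http://localhost:8002/api/matching/",
            files={"upload_file": scan},
            headers={"Authorization": "Token <your-api-key>"},
        )
    response.raise_for_status()
    print(response.json()["runs"][0]["pipeline_name"])  # expected: "matching"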
+ +Currently, it performs three matching steps: + + * Match codebase resources against the Packages in the PackageDB + * Match codebase resources against the Resources in the PackageDB + * Match codebase directories against the directory matching indices of + MatchCode + +This API endpoint is intended to be used with the ``match_to_purldb`` pipeline +in ScanCode.io. + + API Endpoints ------------- From 614701467d6aeeb45dd642f186daaecee241e10b Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 29 Jan 2024 12:10:47 -0800 Subject: [PATCH 48/54] Update github CI settings #224 Signed-off-by: Jono Yang --- .github/workflows/purldb-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/purldb-tests.yml b/.github/workflows/purldb-tests.yml index 7652ea04..4eb16274 100644 --- a/.github/workflows/purldb-tests.yml +++ b/.github/workflows/purldb-tests.yml @@ -31,9 +31,9 @@ jobs: postgres2: image: postgres:13 env: - POSTGRES_DB: scancodeio - POSTGRES_USER: scancodeio - POSTGRES_PASSWORD: scancodeio + POSTGRES_DB: matchcodeio + POSTGRES_USER: matchcodeio + POSTGRES_PASSWORD: matchcodeio POSTGRES_INITDB_ARGS: ${{ env.POSTGRES_INITDB_ARGS }} options: >- --health-cmd pg_isready From 8745b127dc62c481a44953d81a640ca4a1310343 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 29 Jan 2024 15:59:46 -0800 Subject: [PATCH 49/54] Add Docker instructions #224 Signed-off-by: Jono Yang --- README.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/README.rst b/README.rst index 2f1f2f6f..759352ec 100644 --- a/README.rst +++ b/README.rst @@ -177,6 +177,27 @@ API Endpoints * Used to check the SHA1 values of archives from a scan to determine if they are known Packages +Docker Setup for Local Development and Testing +---------------------------------------------- + +PurlDB and MatchCode.io are two separate Django apps. In order to run both of +these Django apps on the same host, we need to use Traefik. + +Traefik is an edge router that receives requests and finds out which services +are responsible for handling them. In the docker-compose.yml files for PurlDB +and MatchCode.io, we have made these two services part of the same Docker +network and set up the routes for each service. + +All requests to the host go to the PurlDB service, but requests that go to the +``api/matching`` endpoint are routed to the MatchCode.io service. + +To run PurlDB and Matchcode.io with Docker: +:: + + docker compose -f docker-compose_traefik.yml up -d + docker compose -f docker-compose_purldb.yml up -d + docker compose -f docker-compose_matchcodeio.yml up -d + Funding ------- From 9152bdadefeb9aa5ab97744bf1723c299a1ba241 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Mon, 29 Jan 2024 16:54:40 -0800 Subject: [PATCH 50/54] Update README.rst #224 Signed-off-by: Jono Yang --- README.rst | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 759352ec..538b5fd4 100644 --- a/README.rst +++ b/README.rst @@ -123,26 +123,8 @@ matching indices from the collected Package data: make index_packages -MatchCode.io ------------- - -MatchCode.io is a Django app, based off of ScanCode.io, that exposes one API -endpoint, ``api/matching``, which takes a ScanCode.io codebase scan, and -performs Package matching on it. 
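To make the routing rule from the Docker setup section above concrete, a rough smoke check — it assumes the Traefik, PurlDB and MatchCode.io compose stacks are all running on localhost, and both API tokens are placeholders to be replaced::

    import requests

    # Both requests go through Traefik on the same host; /api/matching is
    # routed to MatchCode.io, while /api/packages is served by PurlDB.
    purldb = requests.get(
        "http://localhost/api/packages/",
        headers={"Authorization": "Token <purldb-api-key>"},
    )
    matching = requests.get(
        "http://localhost/api/matching/",
        headers={"Authorization": "Token <matchcodeio-api-key>"},
    )
    print(purldb.status_code, matching.status_code)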
- -Currently, it performs three matching steps: - - * Match codebase resources against the Packages in the PackageDB - * Match codebase resources against the Resources in the PackageDB - * Match codebase directories against the directory matching indices of - MatchCode - -This API endpoint is intended to be used with the ``match_to_purldb`` pipeline -in ScanCode.io. - - -API Endpoints -------------- +PurlDB API Endpoints +-------------------- * ``api/packages`` @@ -177,6 +159,30 @@ API Endpoints * Used to check the SHA1 values of archives from a scan to determine if they are known Packages +MatchCode.io +------------ + +MatchCode.io is a Django app, based off of ScanCode.io, that exposes one API +endpoint, ``api/matching``, which takes a ScanCode.io codebase scan, and +performs Package matching on it. + +Currently, it performs three matching steps: + + * Match codebase resources against the Packages in the PackageDB + * Match codebase resources against the Resources in the PackageDB + * Match codebase directories against the directory matching indices of + MatchCode + + +MatchCode.io API Endpoints +-------------------------- + +* ``api/matching`` + + * Performs Package matching on an uploaded ScanCode.io scan + * Intended to be used with the ``match_to_purldb`` pipeline in ScanCode.io + + Docker Setup for Local Development and Testing ---------------------------------------------- From e5f1e064639b72e77929dfe7c3418cc0e6e91644 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 30 Jan 2024 13:58:48 -0800 Subject: [PATCH 51/54] Update dependencies #224 Signed-off-by: Jono Yang --- matchcode_pipeline/api.py | 7 +- matchcode_pipeline/tests/test_api.py | 22 +++++- ...lete_apiuser.py => 0083_delete_apiuser.py} | 2 +- requirements-dev.txt | 14 ++-- requirements.txt | 77 ++++++++++++++----- setup.cfg | 14 ++-- 6 files changed, 92 insertions(+), 44 deletions(-) rename packagedb/migrations/{0082_delete_apiuser.py => 0083_delete_apiuser.py} (84%) diff --git a/matchcode_pipeline/api.py b/matchcode_pipeline/api.py index 5d0d62ac..b07b05a7 100644 --- a/matchcode_pipeline/api.py +++ b/matchcode_pipeline/api.py @@ -15,6 +15,7 @@ from rest_framework.decorators import action from scanpipe.api import ExcludeFromListViewMixin +from scanpipe.api.serializers import InputSourceSerializer from scanpipe.api.serializers import SerializerExcludeFieldsMixin from scanpipe.api.serializers import StrListField from scanpipe.api.views import ProjectFilterSet @@ -65,7 +66,11 @@ class MatchingSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): ) webhook_url = serializers.CharField(write_only=True, required=False) runs = RunSerializer(many=True, read_only=True) - input_sources = serializers.JSONField(source="input_sources_list", read_only=True) + input_sources = InputSourceSerializer( + source="inputsources", + many=True, + read_only=True, + ) codebase_resources_summary = serializers.SerializerMethodField() discovered_packages_summary = serializers.SerializerMethodField() discovered_dependencies_summary = serializers.SerializerMethodField() diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 510d72d9..ec976b8a 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -99,13 +99,27 @@ def test_matchcode_pipeline_api_matching_detail(self): expected = {'java_to_class': 1} self.assertEqual(expected, response.data['codebase_relations_summary']) - self.project1.add_input_source(filename='file1', source='uploaded') - 
self.project1.add_input_source(filename='file2', source='https://download.url') + input1 = self.project1.add_input_source(filename='file1', is_uploaded=True) + input2 = self.project1.add_input_source(filename='file2', download_url='https://download.url') self.project1.save() response = self.csrf_client.get(self.project1_detail_url) expected = [ - {'filename': 'file1', 'source': 'uploaded'}, - {'filename': 'file2', 'source': 'https://download.url'}, + { + "filename": "file1", + "download_url": "", + "is_uploaded": True, + "tag": "", + "exists": False, + "uuid": str(input1.uuid), + }, + { + "filename": "file2", + "download_url": "https://download.url", + "is_uploaded": False, + "tag": "", + "exists": False, + "uuid": str(input2.uuid), + }, ] self.assertEqual(expected, response.data['input_sources']) diff --git a/packagedb/migrations/0082_delete_apiuser.py b/packagedb/migrations/0083_delete_apiuser.py similarity index 84% rename from packagedb/migrations/0082_delete_apiuser.py rename to packagedb/migrations/0083_delete_apiuser.py index fc51e923..92c16ff3 100644 --- a/packagedb/migrations/0082_delete_apiuser.py +++ b/packagedb/migrations/0083_delete_apiuser.py @@ -5,7 +5,7 @@ class Migration(migrations.Migration): dependencies = [ - ("packagedb", "0081_apiuser"), + ("packagedb", "0082_packagewatch"), ] operations = [ diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ef9572c..e1420b5d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,13 +1,9 @@ -aboutcode-toolkit==9.0.0 -black==23.11.0 -et-xmlfile==1.1.0 +black==24.1.1 execnet==2.0.2 iniconfig==2.0.0 -mock==5.1.0 mypy-extensions==1.0.0 -openpyxl==3.1.2 -pathspec==0.11.2 -platformdirs==4.0.0 -pytest==7.4.3 -pytest-django==4.7.0 +pathspec==0.12.1 +platformdirs==4.1.0 +pytest==8.0.0 +pytest-django==4.8.0 pytest-xdist==3.5.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2a19ae03..9f58e271 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ +aboutcode-toolkit==10.1.0 arrow==1.2.3 asgiref==3.7.2 -attrs==23.1.0 +attrs==23.2.0 banal==1.0.6 -beautifulsoup4==4.12.2 +beautifulsoup4==4.12.3 binaryornot==0.4.4 bitarray==2.6.0 boolean.py==4.0 @@ -13,82 +14,117 @@ charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 commoncode==31.0.3 +construct==2.10.70 container-inspector==32.0.1 -cryptography==41.0.5 +crispy-bootstrap3==2022.1 +crontab==1.0.1 +cryptography==42.0.2 +cyclonedx-python-lib==3.1.5 debian-inspector==31.0.0 Deprecated==1.2.14 -Django==4.2.6 +Django==5.0.1 +django-crispy-forms==2.1 django-environ==0.11.2 -django-filter==23.3 +django-filter==23.5 +django-probes==1.7.0 +django-rq==2.10.1 +django-taggit==5.0.1 djangorestframework==3.14.0 +dnspython==2.5.0 dockerfile-parse==2.0.1 dparse2==0.7.0 +email-validator==2.1.0.post1 +et-xmlfile==1.1.0 extractcode==31.0.0 extractcode-7z==16.5.210531 extractcode-libarchive==3.5.1.210531 fasteners==0.19 fetchcode==0.3.0 +fetchcode-container==1.2.3.210512 fingerprints==1.2.3 +fontawesomefree==6.5.1 +freezegun==1.4.0 ftfy==6.1.3 ftputil==5.0.4 gemfileparser2==0.9.3 +gitdb==4.0.11 +GitPython==3.1.40 gunicorn==21.2.0 +hoppr-cyclonedx-models==0.4.10 html5lib==1.1 -idna==3.4 -importlib-metadata==6.8.0 +idna==3.6 +importlib-metadata==7.0.1 +inflection==0.5.1 intbitset==3.0.2 isodate==0.6.1 jaraco.functools==4.0.0 javaproperties==0.8.1 jawa==2.2.0 -Jinja2==3.1.2 +Jinja2==3.1.3 +jsonschema==4.19.1 +jsonschema-specifications==2023.12.1 jsonstreams==0.6.0 -license-expression==30.1.1 -lxml==4.9.3 +license-expression==30.2.0 +lxml==4.9.4 
Markdown==3.5.1 -MarkupSafe==2.1.3 -more-itertools==10.1.0 +MarkupSafe==2.1.4 +mock==5.1.0 +more-itertools==10.2.0 natsort==8.2.0 normality==2.5.0 +openpyxl==3.1.2 +packagedcode-msitools==0.101.210706 packageurl-python==0.11.2 packaging==23.2 packvers==21.5 parameter-expansion-patched==0.3.1 -pdfminer.six==20221105 +pdfminer.six==20231228 pefile==2023.2.7 pip==22.2.2 pip-requirements-parser==32.0.1 pkginfo2==30.0.0 -pluggy==1.3.0 +pluggy==1.4.0 plugincode==32.0.0 ply==3.11 -psycopg==3.1.12 -psycopg-binary==3.1.12 +psycopg==3.1.17 +psycopg-binary==3.1.17 publicsuffix2==2.20191221 pyahocorasick==2.0.0 pycparser==2.21 +pydantic==1.10.14 PyGithub==1.56 pygmars==0.8.0 Pygments==2.17.2 +pyinstrument==4.6.1 PyJWT==2.8.0 pymaven-patch==0.3.0 PyNaCl==1.5.0 pyparsing==3.1.1 python-dateutil==2.8.2 -pytz==2023.3.post1 +python-inspector==0.10.0 +pytz==2023.4 PyYAML==6.0.1 rdflib==7.0.0 redis==5.0.1 +referencing==0.33.0 +regipy==3.1.6 reppy2==0.3.6 requests==2.31.0 +resolvelib==1.0.1 +rpds-py==0.17.1 +rpm-inspector-rpm==4.16.1.3.210404 rq==1.15.1 +rq-scheduler==0.13.1 rubymarshal==1.0.3 saneyaml==0.6.0 scancode-toolkit==32.0.8 +scancodeio==33.0.0 semantic-version==2.10.0 semver==3.0.2 setuptools==65.3.0 six==1.16.0 +smmap==5.0.1 +sortedcontainers==2.4.0 soupsieve==2.5 spdx-tools==0.7.0rc0 sqlparse==0.4.4 @@ -96,13 +132,14 @@ text-unidecode==1.3 toml==0.10.2 typecode==30.0.1 typecode-libmagic==5.39.210531 -typing_extensions==4.8.0 +typing_extensions==4.9.0 univers==30.11.0 -urllib3==2.1.0 +urllib3==2.2.0 urlpy==0.5 -wcwidth==0.2.12 +wcwidth==0.2.13 webencodings==0.5.1 wheel==0.37.1 wrapt==1.16.0 +XlsxWriter==3.1.9 xmltodict==0.13.0 zipp==3.17.0 diff --git a/setup.cfg b/setup.cfg index 48087317..b29d5883 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,11 +39,11 @@ install_requires = bitarray == 2.6.0 debian-inspector == 31.0.0 commoncode == 31.0.3 - Django == 4.2.6 + Django == 5.0.1 django-environ == 0.11.2 django-rq == 2.10.1 djangorestframework == 3.14.0 - django-filter == 23.3 + django-filter == 23.5 fetchcode == 0.3.0 gunicorn == 21.2.0 ftputil == 5.0.4 @@ -51,16 +51,16 @@ install_requires = markdown == 3.5.1 natsort == 8.2.0 packageurl-python == 0.11.2 - psycopg[binary]==3.1.12 + psycopg[binary]==3.1.17 PyGithub == 1.56 reppy2 == 0.3.6 rq-scheduler == 0.13.1 rubymarshal == 1.0.3 scancode-toolkit[full] == 32.0.8 urlpy == 0.5 - matchcode-toolkit == 1.1.3 + matchcode-toolkit == 2.0.1 univers == 30.11.0 - scancodeio == 32.7.0 + scancodeio == 33.0.0 setup_requires = setuptools_scm[toml] >= 4 python_requires = >=3.8 @@ -69,9 +69,6 @@ python_requires = >=3.8 where = . 
[options.extras_require] -matchcodeio = - scancodeio - testing = pytest >= 6, != 7.0.0 pytest-xdist >= 2 @@ -79,7 +76,6 @@ testing = aboutcode-toolkit >= 6.0.0 black mock - scancodeio docs = Sphinx==5.0.2 From f2edd6679311fc2332c3db19cc086d5a988bf3ec Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 30 Jan 2024 14:10:32 -0800 Subject: [PATCH 52/54] Run tests on Python 3.10 and 3.11 #224 Signed-off-by: Jono Yang --- .github/workflows/purldb-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/purldb-tests.yml b/.github/workflows/purldb-tests.yml index 4eb16274..a88f4026 100644 --- a/.github/workflows/purldb-tests.yml +++ b/.github/workflows/purldb-tests.yml @@ -46,7 +46,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.10", "3.11"] steps: - name: Checkout code From 6f924d9a84db7121c9ea52d6465dd9a9b2d7df87 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 30 Jan 2024 15:24:40 -0800 Subject: [PATCH 53/54] Bump doc requirements #224 Signed-off-by: Jono Yang --- setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index b29d5883..0fec309c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -78,9 +78,9 @@ testing = mock docs = - Sphinx==5.0.2 - sphinx-rtd-theme==1.0.0 - doc8==0.11.2 + Sphinx==7.2.6 + sphinx-rtd-theme==2.0.0 + doc8==1.1.1 [options.entry_points] console_scripts = From f2f4b81c6009590f341881fba2199696f6ba387d Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Tue, 30 Jan 2024 15:41:05 -0800 Subject: [PATCH 54/54] Bump python version used for doc test #224 Signed-off-by: Jono Yang --- .github/workflows/docs-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-ci.yml b/.github/workflows/docs-ci.yml index e4c5cd3c..f6ab1172 100644 --- a/.github/workflows/docs-ci.yml +++ b/.github/workflows/docs-ci.yml @@ -9,7 +9,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.9] + python-version: ["3.10"] steps: - name: Checkout code