Implement improver registry and management command

Improvers look strikingly similar to importers, with some enhancements (e.g. they don't use an importer_yielder alternative, see issue 501). Overview: Improvers maintain a contract (like Advisory) with improve_runner, which is named Inference. The Inference class embeds an advisory and a confidence score for that advisory. It is the job of an improver to fetch the data to improve from the database (probably using some helper functions), then use whatever means necessary to improve that data sample and return Inferences. Do note that Inferences which have already been "imported" by importers will be totally discarded as redundant. Also, in case of two inferences on the same data point, the one with the highest confidence will be taken into the database. Food for thought: Pssst... Probably the Inference class is useless and the Advisory class could itself have that confidence score, but then the importers would have to mention that whatever they import has 100% confidence, which might be susceptible to typos, making some importers not mention their confidence and thus zeroing the confidence. Anyway, importers and improvers should be different and separated. If not, then we could totally discard the idea of improvers and embed everything in an importer with a confidence score. Well, then, where goes the idea of modularity and keeping things simple? Also, data coming from an "import"er should always be absolutely correct. This will also ensure that if downstream doesn't want any "improved" data then they don't get our guesses. The whole point of separating importers and improvers is that running improvers could be totally optional and based on downstream taste. Signed-off-by: Hritik Vijay <[email protected]>
aboutcode-org · Aug 12, 2021 · 331f70c · 331f70c
1 parent c4aaa10
commit 331f70c
Show file tree

Hide file tree

Showing 4 changed files with 170 additions and 0 deletions.
diff --git a/vulnerabilities/data_inference.py b/vulnerabilities/data_inference.py
@@ -0,0 +1,28 @@
+import dataclasses
+import logging
+from vulnerabilities.data_source import Advisory
+
+logger = logging.getLogger(__name__)
+
+class OverConfidenceError(ValueError):
+    """Signals a confidence score that is larger than the permitted maximum."""
+
+class UnderConfidenceError(ValueError):
+    """Signals a confidence score that is smaller than the permitted minimum."""
+
+# Largest confidence score an Inference accepts; anything above it raises OverConfidenceError.
+MAX_CONFIDENCE = 100
+
+@dataclasses.dataclass(order=True)
+class Inference:
+    """
+    This data class expresses the contract between data improvers and the improve runner.
+
+    An Inference pairs an ``advisory`` with an integer ``confidence`` score. The score
+    is validated on construction and must lie between 0 and MAX_CONFIDENCE, inclusive.
+
+    Raises OverConfidenceError when ``confidence`` is above MAX_CONFIDENCE and
+    UnderConfidenceError when ``confidence`` is below 0.
+    """
+
+    # NOTE(review): with order=True instances compare field by field in declaration
+    # order, so ``advisory`` is compared before ``confidence`` - confirm this is intended.
+    advisory: Advisory
+    confidence: int
+
+    def __post_init__(self):
+        # Include the offending value so a failing improver is easy to diagnose.
+        if self.confidence > MAX_CONFIDENCE:
+            raise OverConfidenceError(
+                f"confidence {self.confidence!r} exceeds the maximum of {MAX_CONFIDENCE}"
+            )
+
+        if self.confidence < 0:
+            raise UnderConfidenceError(f"confidence {self.confidence!r} must not be negative")
diff --git a/vulnerabilities/improve_runner.py b/vulnerabilities/improve_runner.py
@@ -0,0 +1,25 @@
+from datetime import datetime
+import dataclasses
+import logging
+
+logger = logging.getLogger(__name__)
+
+class ImproveRunner:
+    """
+    Drives a single improver over data that a datasource has already imported.
+
+    An improver may derive its inferences from multiple factors. Every inference
+    carries a confidence score, whose threshold could be tuned in user settings
+    (.env file).
+    """
+
+    def __init__(self, improver):
+        # The improver must provide ``updated_inferences()``, which run() calls.
+        self.improver = improver
+
+    def run(self) -> None:
+        """Collect the improver's inferences and pass them to process_inferences."""
+        improver_module = self.improver.__module__
+        logger.info("Improving using %s.", improver_module)
+        process_inferences(self.improver.updated_inferences())
+        logger.info("Finished improving using %s.", improver_module)
+
+
+def process_inferences(inferences):
+    """Placeholder for handling the given inferences; currently does nothing."""
diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py
@@ -0,0 +1,8 @@
+# Registered improvers, searched by class_name(); no improver is registered here yet.
+IMPROVER_REGISTRY = []
+
+def class_name(module_name: str) -> object:
+    """
+    Return the first improver in IMPROVER_REGISTRY whose ``__module__`` equals
+    ``module_name``.
+
+    Raises AttributeError when no registered improver matches.
+    """
+    for improver in IMPROVER_REGISTRY:
+        if improver.__module__ == module_name:
+            return improver
+
+    # Same exception type as before, now naming the module that was not found.
+    raise AttributeError(f"No improver registered for module {module_name!r}")
diff --git a/vulnerabilities/management/commands/improve.py b/vulnerabilities/management/commands/improve.py
@@ -0,0 +1,109 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/vulnerablecode/
+# The VulnerableCode software is licensed under the Apache License version 2.0.
+# Data generated with VulnerableCode require an acknowledgment.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with VulnerableCode or any VulnerableCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+#  Generated with VulnerableCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+#  VulnerableCode should be considered or used as legal advice. Consult an Attorney
+#  for any legal advice.
+#  VulnerableCode is a free software code scanning tool from nexB Inc. and others.
+#  Visit https://github.com/nexB/vulnerablecode/ for support and download.
+
+from datetime import datetime
+import traceback
+
+from django.core.management.base import BaseCommand
+from django.core.management.base import CommandError
+
+from vulnerabilities.models import Importer
+from vulnerabilities.import_runner import ImportRunner
+from vulnerabilities.importer_yielder import load_importers
+from vulnerabilities.improvers import IMPROVER_REGISTRY
+from vulnerabilities.improvers import class_name
+from vulnerabilities.improve_runner import ImproveRunner
+
+
+class Command(BaseCommand):
+    """
+    Management command that runs registered improvers over imported vulnerability data.
+    """
+
+    help = "Improve imported vulnerability data"
+
+    def add_arguments(self, parser):
+        """Register the ``--list`` and ``--all`` flags and the positional ``sources``."""
+        parser.add_argument(
+            "--list",
+            action="store_true",
+            help="List available data inferences",
+        )
+        parser.add_argument(
+            "--all", action="store_true", help="Improve data from all available inferences"
+        )
+
+        # Each value is later matched against the module name of a registered improver.
+        parser.add_argument("sources", nargs="*", help="Module names of the improvers to run")
+
+    def handle(self, *args, **options):
+        """
+        Dispatch on the parsed options.
+
+        ``--list`` is checked first, then ``--all``; otherwise the positional
+        ``sources`` are resolved and run. Raises CommandError when none of the
+        three was given.
+        """
+        if options["list"]:
+            self.list_sources()
+            return
+
+        if options["all"]:
+            self.improve_data(IMPROVER_REGISTRY)
+            return
+
+        sources = options["sources"]
+        if not sources:
+            raise CommandError(
+                'Please provide at least one data inference to improve from or use "--all".'
+            )
+
+        self.improve_data(valid_sources(sources))
+
+    def list_sources(self):
+        """Write the module name of every registered improver to stdout."""
+        improvers = [improver.__module__ for improver in IMPROVER_REGISTRY]
+        self.stdout.write("Vulnerability data can be improved from the following sources:")
+        self.stdout.write(", ".join(improvers))
+
+    def improve_data(self, improvers):
+        """
+        Run each of the given improvers through ImproveRunner.
+
+        A failing improver does not stop the remaining ones: its traceback is
+        printed and its module name is recorded. Raises CommandError after the
+        loop if any improver failed.
+        """
+        failed_improvers = []
+
+        for improver in improvers:
+            self.stdout.write(f"Improving data using {improver.__module__}")
+            try:
+                ImproveRunner(improver).run()
+                self.stdout.write(
+                    self.style.SUCCESS(f"Successfully improved data using {improver.__module__}")
+                )
+            except Exception:
+                # Deliberately broad: one broken improver must not abort the others.
+                failed_improvers.append(improver.__module__)
+                traceback.print_exc()
+                self.stdout.write(
+                    self.style.ERROR(f"Failed to run improver {improver.__module__}. Continuing...")
+                )
+
+        if failed_improvers:
+            raise CommandError(f"{len(failed_improvers)} failed!: {','.join(failed_improvers)}")
+
+
+def valid_sources(sources):
+    """
+    Resolve each name in ``sources`` to an improver via class_name().
+
+    Returns the resolved improvers in input order. Raises CommandError listing
+    every name that could not be resolved.
+    """
+    resolved = []
+    unrecognized = []
+    for name in sources:
+        try:
+            found = class_name(name)
+        except AttributeError:
+            unrecognized.append(name)
+        else:
+            resolved.append(found)
+
+    if unrecognized:
+        raise CommandError(f"Unknown sources: {unrecognized}")
+
+    return resolved
+
+