fl support code

mlcommons · Feb 28, 2024 · c837479 · c837479
1 parent 905e9fd
commit c837479
Show file tree

Hide file tree

Showing 108 changed files with 5,902 additions and 45 deletions.
diff --git a/.gitignore b/.gitignore
@@ -147,3 +147,4 @@ cython_debug/
 # Dev Environment Specific
 .vscode
 .venv
+server/keys
diff --git a/TODO b/TODO
@@ -0,0 +1,47 @@
+# TODO: remove me from the repo
+
+FOR TUTORIAL
+
+- stream logs
+- check benchmark execution mlcube training exp ID
+- if association request failed for some reason, delete private key (or at least check if rerunning the request will simply overwrite the key)
+- define output folders in medperf storage (logs for both, weights for agg)
+- adding email to CN currently could be challenging. THINK
+  - ASSUMPTION: emails are not changed after signup
+
+- We now have demo data url and hash in training exp (dummy) that we don't use.
+  - what to say about this in miccai (I think no worries; it's hidden now)
+- rethink/review about the following serializers and if necessary use atomic transactions
+  - association creation (dataset-training, agg-training)
+  - association approval (dataset-training, agg-training)
+  - training experiment creation (creating keypair); this could move to approval
+- public/private keys uniqueness constraint while blank; check django docs on how
+- fix bug about association list; /home/hasan/work/openfl_ws/medperf-private/server/utils/views.py
+- pull latest medperf main
+- test agg and training exp owner being same user
+  - basically, test the tutorial steps EXACTLY
+
+AFTER TUTORIAL
+
+- FOLLOWUP: collaborators doesn't use tensorboard logs.
+- FOLLOWUP: show csr hash on approval is not necessary since now CSRs are transported securely
+- test remote aggregator
+- make network config better structured (URL to file? no, could be annoying.)
+- move key generation after admin approval of training experiments.
+- when the training experiment owner wants to "lock" the experiment
+  - ask for confirmation? it's an easy command and after execution there is no going back; a mess if unintended.
+- secretstorage gcloud
+
+NOT SURE
+
+- consider if we want to enable restarts and epochs/"fresh restarts" for training exps (it's hard)
+- mlcube for agg alone
+
+LATER / FUTURE INVESTIGATIONS
+
+- root key thing.
+- limit network access (for now we can rely on the review of the experiment owner)
+- compatibility tests
+- rethink if keys are always needed (just for exps where they on't need a custom cert)
+- server side verification of CSRs (check common names)
+  - later: the whole design might be changed
diff --git a/cli/medperf/__main__.py b/cli/medperf/__main__.py
@@ -25,7 +25,8 @@
 )
 import medperf.commands.association.association as association
 import medperf.commands.compatibility_test.compatibility_test as compatibility_test
-
+import medperf.commands.training.training as training
+import medperf.commands.aggregator.aggregator as aggregator
 
 app = typer.Typer()
 app.add_typer(mlcube.app, name="mlcube", help="Manage mlcubes")
@@ -38,6 +39,8 @@
 app.add_typer(profile.app, name="profile", help="Manage profiles")
 app.add_typer(compatibility_test.app, name="test", help="Manage compatibility tests")
 app.add_typer(auth.app, name="auth", help="Authentication")
+app.add_typer(training.app, name="training", help="Training")
+app.add_typer(aggregator.app, name="aggregator", help="Aggregator")
 
 
 @app.command("run")

diff --git a/cli/medperf/account_management.py b/cli/medperf/account_management.py
@@ -1,45 +1,82 @@
 import keyring
-from medperf.utils import read_config, write_config
+import os
+import base64
+from medperf.utils import read_config, write_config, base_storage_path, remove_path
 from medperf import config
 from medperf.exceptions import MedperfException
 
 
+# class TokenStore:
+#     def __init__(self):
+#         pass
+
+#     def set_tokens(self, account_id, access_token, refresh_token):
+#         keyring.set_password(
+#             config.keyring_access_token_service_name,
+#             account_id,
+#             access_token,
+#         )
+#         keyring.set_password(
+#             config.keyring_refresh_token_service_name,
+#             account_id,
+#             refresh_token,
+#         )
+
+#     def read_tokens(self, account_id):
+#         access_token = keyring.get_password(
+#             config.keyring_access_token_service_name,
+#             account_id,
+#         )
+#         refresh_token = keyring.get_password(
+#             config.keyring_refresh_token_service_name,
+#             account_id,
+#         )
+#         return access_token, refresh_token
+
+#     def delete_tokens(self, account_id):
+#         keyring.delete_password(
+#             config.keyring_access_token_service_name,
+#             account_id,
+#         )
+#         keyring.delete_password(
+#             config.keyring_refresh_token_service_name,
+#             account_id,
+#         )
+
 class TokenStore:
     def __init__(self):
-        pass
+        self.creds_folder = base_storage_path(config.creds_folder)
+        os.makedirs(self.creds_folder, exist_ok=True)
+
+    def __get_paths(self, account_id):
+        account_id_encoded = base64.b64encode(account_id.encode("utf-8")).decode("utf-8")
+        account_folder = os.path.join(self.creds_folder, account_id_encoded)
+        os.makedirs(account_folder, exist_ok=True)
+
+        access_token_file = os.path.join(account_folder, config.keyring_access_token_service_name)
+        refresh_token_file = os.path.join(account_folder, config.keyring_refresh_token_service_name)
 
+        return access_token_file, refresh_token_file
+
     def set_tokens(self, account_id, access_token, refresh_token):
-        keyring.set_password(
-            config.keyring_access_token_service_name,
-            account_id,
-            access_token,
-        )
-        keyring.set_password(
-            config.keyring_refresh_token_service_name,
-            account_id,
-            refresh_token,
-        )
+        access_token_file, refresh_token_file = self.__get_paths(account_id)
+        with open(access_token_file, "w") as f:
+            f.write(access_token)
+        with open(refresh_token_file, "w") as f:
+            f.write(refresh_token)
 
     def read_tokens(self, account_id):
-        access_token = keyring.get_password(
-            config.keyring_access_token_service_name,
-            account_id,
-        )
-        refresh_token = keyring.get_password(
-            config.keyring_refresh_token_service_name,
-            account_id,
-        )
+        access_token_file, refresh_token_file = self.__get_paths(account_id)
+        with open(access_token_file) as f:
+            access_token = f.read()
+        with open(refresh_token_file) as f:
+            refresh_token = f.read()
         return access_token, refresh_token
 
     def delete_tokens(self, account_id):
-        keyring.delete_password(
-            config.keyring_access_token_service_name,
-            account_id,
-        )
-        keyring.delete_password(
-            config.keyring_refresh_token_service_name,
-            account_id,
-        )
+        access_token_file, refresh_token_file = self.__get_paths(account_id)
+        remove_path(access_token_file)
+        remove_path(refresh_token_file)
 
 
 def read_user_account():

diff --git a/cli/medperf/commands/aggregator/aggregator.py b/cli/medperf/commands/aggregator/aggregator.py
@@ -0,0 +1,107 @@
+from typing import Optional
+from medperf.entities.aggregator import Aggregator
+import typer
+
+import medperf.config as config
+from medperf.decorators import clean_except
+from medperf.commands.aggregator.submit import SubmitAggregator
+from medperf.commands.aggregator.associate import AssociateAggregator
+from medperf.commands.aggregator.run import StartAggregator
+
+from medperf.commands.list import EntityList
+from medperf.commands.view import EntityView
+
+app = typer.Typer()
+
+
+@app.command("submit")
+@clean_except
+def submit(
+    name: str = typer.Option(..., "--name", "-n", help="Name of the agg"),
+    address: str = typer.Option(
+        ..., "--address", "-a", help="UID of benchmark to associate with"
+    ),
+    port: int = typer.Option(
+        ..., "--port", "-p", help="UID of benchmark to associate with"
+    ),
+):
+    """Associates a benchmark with a given mlcube or dataset. Only one option at a time."""
+    SubmitAggregator.run(name, address, port)
+    config.ui.print("✅ Done!")
+
+
+@app.command("associate")
+@clean_except
+def associate(
+    aggregator_id: int = typer.Option(
+        ..., "--aggregator_id", "-a", help="UID of benchmark to associate with"
+    ),
+    training_exp_id: int = typer.Option(
+        ..., "--training_exp_id", "-t", help="UID of benchmark to associate with"
+    ),
+    approval: bool = typer.Option(False, "-y", help="Skip approval step"),
+):
+    """Associates a benchmark with a given mlcube or dataset. Only one option at a time."""
+    AssociateAggregator.run(aggregator_id, training_exp_id, approved=approval)
+    config.ui.print("✅ Done!")
+
+
+@app.command("start")
+@clean_except
+def run(
+    aggregator_id: int = typer.Option(
+        ..., "--aggregator_id", "-a", help="UID of benchmark to associate with"
+    ),
+    training_exp_id: int = typer.Option(
+        ..., "--training_exp_id", "-t", help="UID of benchmark to associate with"
+    ),
+):
+    """Associates a benchmark with a given mlcube or dataset. Only one option at a time."""
+    StartAggregator.run(training_exp_id, aggregator_id)
+    config.ui.print("✅ Done!")
+
+
+@app.command("ls")
+@clean_except
+def list(
+    local: bool = typer.Option(False, "--local", help="Get local aggregators"),
+    mine: bool = typer.Option(False, "--mine", help="Get current-user aggregators"),
+):
+    """List aggregators stored locally and remotely from the user"""
+    EntityList.run(
+        Aggregator,
+        fields=["UID", "Name", "Address", "Port"],
+        local_only=local,
+        mine_only=mine,
+    )
+
+
+@app.command("view")
+@clean_except
+def view(
+    entity_id: Optional[int] = typer.Argument(None, help="Benchmark ID"),
+    format: str = typer.Option(
+        "yaml",
+        "-f",
+        "--format",
+        help="Format to display contents. Available formats: [yaml, json]",
+    ),
+    local: bool = typer.Option(
+        False,
+        "--local",
+        help="Display local benchmarks if benchmark ID is not provided",
+    ),
+    mine: bool = typer.Option(
+        False,
+        "--mine",
+        help="Display current-user benchmarks if benchmark ID is not provided",
+    ),
+    output: str = typer.Option(
+        None,
+        "--output",
+        "-o",
+        help="Output file to store contents. If not provided, the output will be displayed",
+    ),
+):
+    """Displays the information of one or more aggregators"""
+    EntityView.run(entity_id, Aggregator, format, local, mine, output)
diff --git a/cli/medperf/commands/aggregator/associate.py b/cli/medperf/commands/aggregator/associate.py
@@ -0,0 +1,38 @@
+from medperf import config
+from medperf.entities.aggregator import Aggregator
+from medperf.entities.training_exp import TrainingExp
+from medperf.utils import approval_prompt, generate_agg_csr
+from medperf.exceptions import InvalidArgumentError
+
+
+class AssociateAggregator:
+    @staticmethod
+    def run(training_exp_id: int, agg_uid: int, approved=False):
+        """Associates a registered aggregator with a benchmark
+
+        Args:
+            agg_uid (int): UID of the registered aggregator to associate
+            benchmark_uid (int): UID of the benchmark to associate with
+        """
+        comms = config.comms
+        ui = config.ui
+        agg = Aggregator.get(agg_uid)
+        if agg.id is None:
+            msg = "The provided aggregator is not registered."
+            raise InvalidArgumentError(msg)
+
+        training_exp = TrainingExp.get(training_exp_id)
+        csr, csr_hash = generate_agg_csr(training_exp_id, agg.address, agg.id)
+        msg = "Please confirm that you would like to associate"
+        msg += f" the aggregator {agg.name} with the training exp {training_exp.name}."
+        msg += f" The certificate signing request hash is: {csr_hash}"
+        msg += " [Y/n]"
+
+        approved = approved or approval_prompt(msg)
+        if approved:
+            ui.print("Generating aggregator training association")
+            # TODO: delete keys if upload fails
+            #       check if on failure, other (possible) request will overwrite key
+            comms.associate_aggregator(agg.id, training_exp_id, csr)
+        else:
+            ui.print("Aggregator association operation cancelled.")
-Original file line number
+Diff line change
@@ Expand Up / @@ -147,3 +147,4 @@ cython_debug/ @@
     # Dev Environment Specific
     .vscode
     .venv
+    server/keys