Terraform deployment #44

Merged: 32 commits, Sep 4, 2022
Commits
14bf423
Add initial variant for testing
JMGaljaard Aug 22, 2022
3730286
Update to mvp
JMGaljaard Aug 22, 2022
62bcee9
add jupyter
JMGaljaard Aug 22, 2022
e9f4331
Update kubeflow deployment for standalone release
JMGaljaard Aug 23, 2022
9a62edb
Update kubeflow deployment for standalone release
JMGaljaard Aug 23, 2022
8fe63a2
Update codebase to make use of kubeflow 1.5.0 training operator
JMGaljaard Aug 23, 2022
2e9b0ce
Update implementation
JMGaljaard Aug 23, 2022
7633403
Cleanup old code from templates
JMGaljaard Aug 23, 2022
19c3d48
Make use of zonal cluster
JMGaljaard Aug 31, 2022
e7bb81a
Update configuration object examples for new definition
JMGaljaard Sep 2, 2022
d84d67b
Clean up main file for starting/launching
JMGaljaard Sep 2, 2022
57603f5
Update core to differentiate between federated and distributed tasks
JMGaljaard Sep 2, 2022
f6e0e6b
Clean up definition of datasets to differentiate between Distributed …
JMGaljaard Sep 2, 2022
7f5c823
Make intention of distributed datasets more clear
JMGaljaard Sep 2, 2022
21ee969
Make DistributedConfig only import for type checking to prevent cycli…
JMGaljaard Sep 2, 2022
c5e01bc
Update imports and versions of objects for cluster utilities
JMGaljaard Sep 2, 2022
0ab2f26
Update definitions in preparation for restructured configuration objects
JMGaljaard Sep 2, 2022
7989ad6
Remove old objects from distributed configuration object
JMGaljaard Sep 2, 2022
a084e5a
Remove comment
JMGaljaard Sep 2, 2022
ab8d818
Clean up import list of learning config
JMGaljaard Sep 2, 2022
612d4d6
Update parameter to re-allow for complex simulated types of jobs
JMGaljaard Sep 2, 2022
122f013
Make test-ready version of deployment with Fed and Dis tasks
JMGaljaard Sep 2, 2022
bd21e9b
Update notebook
JMGaljaard Sep 2, 2022
c911900
Clean outputs
JMGaljaard Sep 2, 2022
4e23a96
Update description of jupyter notebook
JMGaljaard Sep 2, 2022
1d49bd6
Update references to cost
JMGaljaard Sep 2, 2022
7999dda
Move terraform directory to content root
JMGaljaard Sep 4, 2022
b02138a
Clean up deployment
JMGaljaard Sep 4, 2022
473446a
Update README to use terraform by default
JMGaljaard Sep 4, 2022
31bc22a
Remove old references to seeds, clean up objects
JMGaljaard Sep 4, 2022
45a36c1
Unify reference to project name
JMGaljaard Sep 4, 2022
614e850
Update notebook to reflect project name
JMGaljaard Sep 4, 2022
43 changes: 42 additions & 1 deletion .gitignore
@@ -147,4 +147,45 @@ refactor-notes.md
experiments/**/exps/*
*.test.py

logging/*

### Helm ###
# Chart dependencies
**/charts/*.tgz

### Terraform ###
# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
**/crash.log
**/crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# password, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
**/*.tfvars
**/*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
**/override.tf
**/override.tf.json
**/*_override.tf
**/*_override.tf.json

# Include override files you do wish to add to version control using negated pattern
# !example_override.tf

# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
**/.terraformrc
**/terraform.rc
*.terraform.lock.hcl
352 changes: 88 additions & 264 deletions README.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion charts/fltk-values.yaml
@@ -1,7 +1,8 @@
fltk:
outputDir: output
configDir: config
workDir: /opt/federation-lab
provider:
domain: gcr.io
projectName: test-bed-distml
projectName: test-bed-fltk
imageName: fltk:latest
2 changes: 1 addition & 1 deletion charts/orchestrator/templates/fl-server-pod.yaml
@@ -30,7 +30,7 @@ spec:
memory: {{ (.Values.orchestrator.memory | int) }}
volumeMounts:
- name: fl-server-log-volume
mountPath: {{ .Values.fltk.workDir }}/output
mountPath: {{ .Values.fltk.workDir }}/{{ .Values.fltk.outputDir }}
readOnly: true
- name: fltk-orchestrator-config-volume
mountPath: {{ .Values.fltk.workDir }}/{{ .Values.fltk.configDir }}
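
To see what the updated template composes to with the default chart values above, here is a minimal Python sketch; the values dict simply mirrors charts/fltk-values.yaml and is illustrative, not the chart's actual render logic:

# Illustrative only: mirrors how the chart composes mount paths from the
# values in charts/fltk-values.yaml shown above.
values = {"workDir": "/opt/federation-lab", "outputDir": "output", "configDir": "config"}

# {{ .Values.fltk.workDir }}/{{ .Values.fltk.outputDir }}
output_mount = f"{values['workDir']}/{values['outputDir']}"  # -> /opt/federation-lab/output
# {{ .Values.fltk.workDir }}/{{ .Values.fltk.configDir }}
config_mount = f"{values['workDir']}/{values['configDir']}"  # -> /opt/federation-lab/config
print(output_mount, config_mount)
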
116 changes: 62 additions & 54 deletions configs/distributed_tasks/example_arrival_config.json
@@ -1,58 +1,66 @@
[
{
"type": "distributed",
"jobClassParameters": {
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": 2,
"configurations": {
"default": {
"cores": "1000m",
"memory": "1Gi"
}
}
},
"hyperParameters": {
"default": {
"batchSize": 128,
"testBatchSize": 128,
"learningRateDecay": 0.0002,
"optimizerConfig": {
"type": "Adam",
"learningRate": 0.001,
"betas": [
0.9,
0.999
]
{
"trainTasks": [
{
"type": "distributed",
"lambda": 1.5,
"preemptJobs": false,
"jobClassParameters": [
{
"classProbability": 0.1,
"priorities": [
{
"priority": 1,
"probability": 0.9
},
{
"priority": 0,
"probability": 0.1
}
],
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": 4,
"configurations": {
"default": {
"cores": "1000m",
"memory": "1Gi"
}
}
},
"hyperParameters": {
"default": {
"totalEpochs": 100,
"batchSize": 128,
"testBatchSize": 128,
"learningRateDecay": 0.0002,
"optimizerConfig": {
"type": "Adam",
"learningRate": 0.001,
"betas": [
0.9,
0.999
]
},
"schedulerConfig": {
"schedulerStepSize": 50,
"schedulerGamma": 0.5,
"minimumLearningRate": 1e-10
}
},
"configurations": {
"Master": null,
"Worker": null
}
},
"schedulerConfig": {
"schedulerStepSize": 50,
"schedulerGamma": 0.5,
"minimumLearningRate": 1e-10
"learningParameters": {
"cuda": false
}
},
"configurations": {
"Master": null,
"Worker": null
}
},
"learningParameters": {
"totalEpochs": 100,
"cuda": false
},
"experimentConfiguration": {
"randomSeed": [
1,
41,
42,
43,
430
]
}
]
Review comment from the owner/author (JMGaljaard): This will be addressed in #43
}
}
]
]
}
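
The restructured arrival config nests tasks under trainTasks and turns jobClassParameters into a list carrying a classProbability and prioritized variants. A minimal sketch of consuming it follows; this is not the project's actual loader, and the weighted draw merely illustrates how the probability fields are meant to be used:

import json
import random

# Minimal sketch (hypothetical loader): read the restructured arrival config
# and draw a job priority according to the listed probabilities.
with open("configs/distributed_tasks/example_arrival_config.json") as fp:
    config = json.load(fp)

task = config["trainTasks"][0]          # the single distributed task above
params = task["jobClassParameters"][0]  # classProbability 0.1

priorities = params["priorities"]
weights = [p["probability"] for p in priorities]  # 0.9 / 0.1
chosen = random.choices(priorities, weights=weights, k=1)[0]
print(f"Sampled priority: {chosen['priority']}")
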
7 changes: 3 additions & 4 deletions configs/example_cloud_experiment.json
@@ -1,15 +1,14 @@
{
"cluster": {
"orchestrator": {
"wait_for_clients": true,
"service": "fl-server.test.svc.cluster.local",
"nic": "eth0"
"orchestrator_type": "simulated"
},
"client": {
"prefix": "client",
"tensorboard_active": false
},
"image": "gcr.io/test-bed-distml/fltk:latest"
"image": "gcr.io/test-bed-fltk/fltk:latest",
"namespace": "test"
},
"execution_config": {
"duration": 3600,
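
The cluster section now selects the orchestrator through a single orchestrator_type field instead of the old service/nic settings. A hypothetical dispatch sketch (the branch bodies are illustrative, not the project's API):

import json

# Hypothetical sketch of selecting an orchestrator from "orchestrator_type".
with open("configs/example_cloud_experiment.json") as fp:
    cluster_cfg = json.load(fp)["cluster"]

orchestrator_type = cluster_cfg["orchestrator"]["orchestrator_type"]
if orchestrator_type == "simulated":
    print("Launching simulated arrival-based orchestrator")
else:
    print(f"Launching '{orchestrator_type}' orchestrator")
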
19 changes: 3 additions & 16 deletions configs/federated_tasks/example_arrival_config.json
@@ -1,14 +1,14 @@
[
{
"type": "federated",
"jobClassParameters": {
"jobClassParameters": [{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": null,
"dataParallelism": 4,
"configurations": {
"Master": {
"cores": "1000m",
@@ -62,20 +62,7 @@
"shuffle": true
},
"aggregation": "FedAvg"
},
"experimentConfiguration": {
"randomSeed": [
1,
41,
42,
43,
430
],
"workerReplication": {
"Master": 1,
"Worker": 2
}
}
}
}]
}
]
2 changes: 1 addition & 1 deletion experiments/dist_node.jinja.yaml
@@ -8,7 +8,7 @@ optimizer: {{ task.get_optimizer_param(tpe, 'type').value }}
optimizer_args: {{ task.get_optimizer_args(tpe) }}
model: {{ task.get_net_param('network').value }}
dataset: {{ task.get_net_param('dataset').value }}
max_epoch: {{ task.get_learn_param('total_epochs') }}
max_epoch: {{ task.get_hyper_param(tpe, 'total_epochs') }}
learning_rate: {{ task.get_optimizer_param(tpe, 'lr') }}
learning_rate_decay: {{ task.get_hyper_param(tpe, 'lr_decay') }}
seed: {{ task.get_net_param('seed') }}
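
The template now pulls total_epochs from the per-type hyperparameters rather than the learning parameters. A minimal jinja2 rendering sketch, with a hypothetical stand-in for the task object mimicking the accessor used above:

from jinja2 import Template

# DummyTask is a hypothetical stand-in for the task object the template expects.
class DummyTask:
    def get_hyper_param(self, tpe, name):
        return {"total_epochs": 100}[name]

template = Template("max_epoch: {{ task.get_hyper_param(tpe, 'total_epochs') }}")
print(template.render(task=DummyTask(), tpe="default"))  # max_epoch: 100
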
74 changes: 43 additions & 31 deletions fltk/__main__.py
@@ -1,28 +1,34 @@
import argparse
from argparse import Namespace, ArgumentParser
import logging
from pathlib import Path
from typing import Optional, Any, Dict

import sys

from fltk.launch import launch_extractor, launch_client, launch_single, \
launch_remote, launch_cluster, launch_signature
from fltk.launch import launch_extractor, launch_client, launch_single, launch_remote, launch_cluster, launch_signature
from fltk.util.config import get_distributed_config
from fltk.util.config.arguments import create_all_subparsers
from fltk.util.generate_experiments import generate, run

__run_op_dict: Dict[str, launch_signature] = {
'util-generate': generate,
'util-run': run,
'remote': launch_remote,
'single': launch_single,
'cluster': launch_cluster,
'client': launch_client,
'extractor': launch_extractor
'remote': launch_remote, # Federated experiment (cluster)
'single': launch_single,  # Federated experiment (locally)
'cluster': launch_cluster, # Cluster orchestrator
'client': launch_client, # Distributed client
'extractor': launch_extractor # Extractor (local)
}


def _save_get(args, param) -> Optional[Any]:
def _save_get(args: Namespace, param: str) -> Optional[Any]:
"""
Helper function to retrieve parameters from argument namespace.

@param args: Arguments passed from the commandline.
@type args: Namespace
@param param: Parameter to (safely) retrieve from the passed arguments.
@type param: str
@return: Value that was passed from the CLI if it was provided.
@rtype: Optional[Any]
"""
save_argument = None
if args is not None and hasattr(args, param):
save_argument = args.__dict__[param]
@@ -31,42 +31,45 @@ def _save_get(args, param) -> Optional[Any]:
return save_argument


def __main__():
# noinspection PyBroadException
def main():
"""
Main loop to perform learning (either Federated or Distributed). Note that Orchestrator is part of this setup for
now.
@return: Nothing.
Main loop to perform learning (either Federated or Distributed). Note that Orchestrator is part
of this setup for a unified startup. A future revision may extract the Orchestrator.
@return: None.
@rtype: None
"""
parser = argparse.ArgumentParser(prog='fltk',
description='Experiment launcher for the Federated Learning Testbed (fltk)')
parser = ArgumentParser(prog='fltk',
description='Launcher for the Federated Learning Testbed (fltk)')
subparsers = parser.add_subparsers(dest="action", required=True)
create_all_subparsers(subparsers)
# To create your own parser mirror the construction in the 'client_parser' object.
# Or refer to the ArgumentParser library documentation.
args = parser.parse_args()
distributed_config = get_distributed_config(args)

# Docker based launches rely on different arguments, prepare the placeholder values for a
# unified argument list.
arg_path, conf_path = None, None

try:
arg_path = Path(args.path)
except Exception as _: # pylint: disable=broad-except
except Exception as _:
print('No argument path is provided.')
try:
conf_path = Path(args.config)
except Exception as _: # pylint: disable=broad-except
except Exception as _:
print('No configuration path is provided.')

launch_fn: launch_signature = __run_op_dict[args.action]
launch_fn(arg_path, conf_path,
_save_get(args, 'rank'),
_save_get(args, 'nic'),
_save_get(args, 'host'),
_save_get(args, 'prefix'),
args,
distributed_config)
try:
pass
launch_fn(arg_path, conf_path,
_save_get(args, 'rank'),
_save_get(args, 'nic'),
_save_get(args, 'host'),
_save_get(args, 'prefix'),
args,
distributed_config)
except Exception as e:
print(f"Failed with reason: {e}")
parser.print_help()
Expand All @@ -75,10 +84,13 @@ def __main__():


if __name__ == "__main__":
# Get loger and set format for logging before starting the main loop.
# Get logger and set format for logging before starting the main loop.
root = logging.getLogger()
if root.handlers:
for handler in root.handlers:
root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d-%Y %H:%M:%S', )
__main__()
# noinspection SpellCheckingInspection
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d-%Y %H:%M:%S')

main()
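
The entry point maps each sub-command to a launcher sharing the launch_signature interface. A condensed sketch of that dispatch pattern; the handlers here are stand-ins, not fltk's real launch functions:

from argparse import ArgumentParser

# Illustrative stand-ins for the real launch functions.
def launch_cluster(*_args):
    print("cluster orchestrator started")

def launch_client(*_args):
    print("distributed client started")

run_ops = {'cluster': launch_cluster, 'client': launch_client}

parser = ArgumentParser(prog='fltk')
subparsers = parser.add_subparsers(dest='action', required=True)
for name in run_ops:
    subparsers.add_parser(name)

args = parser.parse_args(['cluster'])  # e.g. `python -m fltk cluster`
run_ops[args.action]()
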
3 changes: 3 additions & 0 deletions fltk/core/__init__.py
@@ -0,0 +1,3 @@
from .client import Client
from .federator import Federator
from .node import Node
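
With these re-exports in place, callers can import the core classes directly from the package root (assuming the fltk package is importable):

# Enabled by the new re-exports in fltk/core/__init__.py above.
from fltk.core import Client, Federator, Node
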