Terraform deployment #44

Merged: 32 commits, Sep 4, 2022
Commits
14bf423
Add initial variant for testing
JMGaljaard Aug 22, 2022
3730286
Update to mvp
JMGaljaard Aug 22, 2022
62bcee9
add jupyter
JMGaljaard Aug 22, 2022
e9f4331
Update kubeflow deployment for standalone release
JMGaljaard Aug 23, 2022
9a62edb
Update kubeflow deployment for standalone release
JMGaljaard Aug 23, 2022
8fe63a2
Update codebase to make use of kubeflow 1.5.0 training operator
JMGaljaard Aug 23, 2022
2e9b0ce
Update implementation
JMGaljaard Aug 23, 2022
7633403
Cleanup old code from templates
JMGaljaard Aug 23, 2022
19c3d48
Make use of zonal cluster
JMGaljaard Aug 31, 2022
e7bb81a
Update configuration object examples for new definition
JMGaljaard Sep 2, 2022
d84d67b
Clean up main file for starting/launching
JMGaljaard Sep 2, 2022
57603f5
Update core to differentiate between federated and distributed tasks
JMGaljaard Sep 2, 2022
f6e0e6b
Clean up definition of datasets to differentiate between Distributed …
JMGaljaard Sep 2, 2022
7f5c823
Make intention of distributed datasets more clear
JMGaljaard Sep 2, 2022
21ee969
Make DistributedConfig only import for type checking to prevent cycli…
JMGaljaard Sep 2, 2022
c5e01bc
Update imports and versions of objects for cluster utilities
JMGaljaard Sep 2, 2022
0ab2f26
Update definitions in preparation for restructured configuration objects
JMGaljaard Sep 2, 2022
7989ad6
Remove old objects from distributed configuration object
JMGaljaard Sep 2, 2022
a084e5a
Remove comment
JMGaljaard Sep 2, 2022
ab8d818
Clean up import list of learning config
JMGaljaard Sep 2, 2022
612d4d6
Update parameter to re-allow for complex simulated types of jobs
JMGaljaard Sep 2, 2022
122f013
Make test-ready version of deployment with Fed and Dis tasks
JMGaljaard Sep 2, 2022
bd21e9b
Update notebook
JMGaljaard Sep 2, 2022
c911900
Clean outputs
JMGaljaard Sep 2, 2022
4e23a96
Update description of jupyter notebook
JMGaljaard Sep 2, 2022
1d49bd6
Update references to cost
JMGaljaard Sep 2, 2022
7999dda
Move terraform directory to content root
JMGaljaard Sep 4, 2022
b02138a
Clean up deployment
JMGaljaard Sep 4, 2022
473446a
Update README to use terraform by default
JMGaljaard Sep 4, 2022
31bc22a
Remove old references to seeds, clean up objects
JMGaljaard Sep 4, 2022
45a36c1
Unify reference to project name
JMGaljaard Sep 4, 2022
614e850
Update notebook to reflect project name
JMGaljaard Sep 4, 2022
43 changes: 42 additions & 1 deletion .gitignore
@@ -147,4 +147,45 @@ refactor-notes.md
experiments/**/exps/*
*.test.py

logging/*

### Helm ###
# Chart dependencies
**/charts/*.tgz

### Terraform ###
# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
**/crash.log
**/crash.*.log

# Exclude all .tfvars files, which are likely to contain sensitive data, such as
# password, private keys, and other secrets. These should not be part of version
# control as they are data points which are potentially sensitive and subject
# to change depending on the environment.
**/*.tfvars
**/*.tfvars.json

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
**/override.tf
**/override.tf.json
**/*_override.tf
**/*_override.tf.json

# Include override files you do wish to add to version control using negated pattern
# !example_override.tf

# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
# example: *tfplan*

# Ignore CLI configuration files
**/.terraformrc
**/terraform.rc
*.terraform.lock.hcl
352 changes: 88 additions & 264 deletions README.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion charts/fltk-values.yaml
@@ -1,7 +1,8 @@
fltk:
outputDir: output
configDir: config
workDir: /opt/federation-lab
provider:
domain: gcr.io
projectName: test-bed-distml
projectName: test-bed-fltk
imageName: fltk:latest
2 changes: 1 addition & 1 deletion charts/orchestrator/templates/fl-server-pod.yaml
@@ -30,7 +30,7 @@ spec:
memory: {{ (.Values.orchestrator.memory | int) }}
volumeMounts:
- name: fl-server-log-volume
mountPath: {{ .Values.fltk.workDir }}/output
mountPath: {{ .Values.fltk.workDir }}/{{ .Values.fltk.outputDir }}
readOnly: true
- name: fltk-orchestrator-config-volume
mountPath: {{ .Values.fltk.workDir }}/{{ .Values.fltk.configDir }}
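
To see what the updated template composes to with the default chart values above, here is a minimal Python sketch; the values dict simply mirrors charts/fltk-values.yaml and is illustrative, not the chart's actual render logic:

# Illustrative only: mirrors how the chart composes mount paths from the
# values in charts/fltk-values.yaml shown above.
values = {"workDir": "/opt/federation-lab", "outputDir": "output", "configDir": "config"}

# {{ .Values.fltk.workDir }}/{{ .Values.fltk.outputDir }}
output_mount = f"{values['workDir']}/{values['outputDir']}"  # -> /opt/federation-lab/output
# {{ .Values.fltk.workDir }}/{{ .Values.fltk.configDir }}
config_mount = f"{values['workDir']}/{values['configDir']}"  # -> /opt/federation-lab/config
print(output_mount, config_mount)
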
116 changes: 62 additions & 54 deletions configs/distributed_tasks/example_arrival_config.json
@@ -1,58 +1,66 @@
[
{
"type": "distributed",
"jobClassParameters": {
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": 2,
"configurations": {
"default": {
"cores": "1000m",
"memory": "1Gi"
}
}
},
"hyperParameters": {
"default": {
"batchSize": 128,
"testBatchSize": 128,
"learningRateDecay": 0.0002,
"optimizerConfig": {
"type": "Adam",
"learningRate": 0.001,
"betas": [
0.9,
0.999
]
{
"trainTasks": [
{
"type": "distributed",
"lambda": 1.5,
"preemptJobs": false,
"jobClassParameters": [
{
"classProbability": 0.1,
"priorities": [
{
"priority": 1,
"probability": 0.9
},
{
"priority": 0,
"probability": 0.1
}
],
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": 4,
"configurations": {
"default": {
"cores": "1000m",
"memory": "1Gi"
}
}
},
"hyperParameters": {
"default": {
"totalEpochs": 100,
"batchSize": 128,
"testBatchSize": 128,
"learningRateDecay": 0.0002,
"optimizerConfig": {
"type": "Adam",
"learningRate": 0.001,
"betas": [
0.9,
0.999
]
},
"schedulerConfig": {
"schedulerStepSize": 50,
"schedulerGamma": 0.5,
"minimumLearningRate": 1e-10
}
},
"configurations": {
"Master": null,
"Worker": null
}
},
"schedulerConfig": {
"schedulerStepSize": 50,
"schedulerGamma": 0.5,
"minimumLearningRate": 1e-10
"learningParameters": {
"cuda": false
}
},
"configurations": {
"Master": null,
"Worker": null
}
},
"learningParameters": {
"totalEpochs": 100,
"cuda": false
},
"experimentConfiguration": {
"randomSeed": [
1,
41,
42,
43,
430
]
}
]
Review comment from the owner/author (JMGaljaard): This will be addressed in #43
}
}
]
]
}
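
The restructured arrival config nests tasks under trainTasks and turns jobClassParameters into a list carrying a classProbability and prioritized variants. A minimal sketch of consuming it follows; this is not the project's actual loader, and the weighted draw merely illustrates how the probability fields are meant to be used:

import json
import random

# Minimal sketch (hypothetical loader): read the restructured arrival config
# and draw a job priority according to the listed probabilities.
with open("configs/distributed_tasks/example_arrival_config.json") as fp:
    config = json.load(fp)

task = config["trainTasks"][0]          # the single distributed task above
params = task["jobClassParameters"][0]  # classProbability 0.1

priorities = params["priorities"]
weights = [p["probability"] for p in priorities]  # 0.9 / 0.1
chosen = random.choices(priorities, weights=weights, k=1)[0]
print(f"Sampled priority: {chosen['priority']}")
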
7 changes: 3 additions & 4 deletions configs/example_cloud_experiment.json
@@ -1,15 +1,14 @@
{
"cluster": {
"orchestrator": {
"wait_for_clients": true,
"service": "fl-server.test.svc.cluster.local",
"nic": "eth0"
"orchestrator_type": "simulated"
},
"client": {
"prefix": "client",
"tensorboard_active": false
},
"image": "gcr.io/test-bed-distml/fltk:latest"
"image": "gcr.io/test-bed-fltk/fltk:latest",
"namespace": "test"
},
"execution_config": {
"duration": 3600,
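
The cluster section now selects the orchestrator through a single orchestrator_type field instead of the old service/nic settings. A hypothetical dispatch sketch (the branch bodies are illustrative, not the project's API):

import json

# Hypothetical sketch of selecting an orchestrator from "orchestrator_type".
with open("configs/example_cloud_experiment.json") as fp:
    cluster_cfg = json.load(fp)["cluster"]

orchestrator_type = cluster_cfg["orchestrator"]["orchestrator_type"]
if orchestrator_type == "simulated":
    print("Launching simulated arrival-based orchestrator")
else:
    print(f"Launching '{orchestrator_type}' orchestrator")
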
19 changes: 3 additions & 16 deletions configs/federated_tasks/example_arrival_config.json
@@ -1,14 +1,14 @@
[
{
"type": "federated",
"jobClassParameters": {
"jobClassParameters": [{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"lossFunction": "CrossEntropyLoss",
"dataset": "mnist"
},
"systemParameters": {
"dataParallelism": null,
"dataParallelism": 4,
"configurations": {
"Master": {
"cores": "1000m",
@@ -62,20 +62,7 @@
"shuffle": true
},
"aggregation": "FedAvg"
},
"experimentConfiguration": {
"randomSeed": [
1,
41,
42,
43,
430
],
"workerReplication": {
"Master": 1,
"Worker": 2
}
}
}
}]
}
]
2 changes: 1 addition & 1 deletion experiments/dist_node.jinja.yaml
@@ -8,7 +8,7 @@ optimizer: {{ task.get_optimizer_param(tpe, 'type').value }}
optimizer_args: {{ task.get_optimizer_args(tpe) }}
model: {{ task.get_net_param('network').value }}
dataset: {{ task.get_net_param('dataset').value }}
max_epoch: {{ task.get_learn_param('total_epochs') }}
max_epoch: {{ task.get_hyper_param(tpe, 'total_epochs') }}
learning_rate: {{ task.get_optimizer_param(tpe, 'lr') }}
learning_rate_decay: {{ task.get_hyper_param(tpe, 'lr_decay') }}
seed: {{ task.get_net_param('seed') }}
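
The template now pulls total_epochs from the per-type hyperparameters rather than the learning parameters. A minimal jinja2 rendering sketch, with a hypothetical stand-in for the task object mimicking the accessor used above:

from jinja2 import Template

# DummyTask is a hypothetical stand-in for the task object the template expects.
class DummyTask:
    def get_hyper_param(self, tpe, name):
        return {"total_epochs": 100}[name]

template = Template("max_epoch: {{ task.get_hyper_param(tpe, 'total_epochs') }}")
print(template.render(task=DummyTask(), tpe="default"))  # max_epoch: 100
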
74 changes: 43 additions & 31 deletions fltk/__main__.py
@@ -1,28 +1,34 @@
import argparse
from argparse import Namespace, ArgumentParser
import logging
from pathlib import Path
from typing import Optional, Any, Dict

import sys

from fltk.launch import launch_extractor, launch_client, launch_single, \
launch_remote, launch_cluster, launch_signature
from fltk.launch import launch_extractor, launch_client, launch_single, launch_remote, launch_cluster, launch_signature
from fltk.util.config import get_distributed_config
from fltk.util.config.arguments import create_all_subparsers
from fltk.util.generate_experiments import generate, run

__run_op_dict: Dict[str, launch_signature] = {
'util-generate': generate,
'util-run': run,
'remote': launch_remote,
'single': launch_single,
'cluster': launch_cluster,
'client': launch_client,
'extractor': launch_extractor
'remote': launch_remote, # Federated experiment (cluster)
'single': launch_single,  # Federated experiment (locally)
'cluster': launch_cluster, # Cluster orchestrator
'client': launch_client, # Distributed client
'extractor': launch_extractor # Extractor (local)
}


def _save_get(args, param) -> Optional[Any]:
def _save_get(args: Namespace, param: str) -> Optional[Any]:
"""
Helper function to retrieve parameters from argument namespace.

@param args: Arguments passed from the commandline.
@type args: Namespace
@param param: Parameter to (safely) retrieve from the passed arguments.
@type param: str
@return: Value that was passed from the CLI if it was provided.
@rtype: Optional[Any]
"""
save_argument = None
if args is not None and hasattr(args, param):
save_argument = args.__dict__[param]
@@ -31,42 +31,45 @@ def _save_get(args, param) -> Optional[Any]:
return save_argument


def __main__():
# noinspection PyBroadException
def main():
"""
Main loop to perform learning (either Federated or Distributed). Note that Orchestrator is part of this setup for
now.
@return: Nothing.
Main loop to perform learning (either Federated or Distributed). Note that Orchestrator is part
of this setup for a unified startup. A future revision may extract the Orchestrator.
@return: None.
@rtype: None
"""
parser = argparse.ArgumentParser(prog='fltk',
description='Experiment launcher for the Federated Learning Testbed (fltk)')
parser = ArgumentParser(prog='fltk',
description='Launcher for the Federated Learning Testbed (fltk)')
subparsers = parser.add_subparsers(dest="action", required=True)
create_all_subparsers(subparsers)
# To create your own parser mirror the construction in the 'client_parser' object.
# Or refer to the ArgumentParser library documentation.
args = parser.parse_args()
distributed_config = get_distributed_config(args)

# Docker based launches rely on different arguments, prepare the placeholder values for a
# unified argument list.
arg_path, conf_path = None, None

try:
arg_path = Path(args.path)
except Exception as _: # pylint: disable=broad-except
except Exception as _:
print('No argument path is provided.')
try:
conf_path = Path(args.config)
except Exception as _: # pylint: disable=broad-except
except Exception as _:
print('No configuration path is provided.')

launch_fn: launch_signature = __run_op_dict[args.action]
launch_fn(arg_path, conf_path,
_save_get(args, 'rank'),
_save_get(args, 'nic'),
_save_get(args, 'host'),
_save_get(args, 'prefix'),
args,
distributed_config)
try:
pass
launch_fn(arg_path, conf_path,
_save_get(args, 'rank'),
_save_get(args, 'nic'),
_save_get(args, 'host'),
_save_get(args, 'prefix'),
args,
distributed_config)
except Exception as e:
print(f"Failed with reason: {e}")
parser.print_help()
Expand All @@ -75,10 +84,13 @@ def __main__():


if __name__ == "__main__":
# Get loger and set format for logging before starting the main loop.
# Get logger and set format for logging before starting the main loop.
root = logging.getLogger()
if root.handlers:
for handler in root.handlers:
root.removeHandler(handler)
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d-%Y %H:%M:%S', )
__main__()
# noinspection SpellCheckingInspection
logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d-%Y %H:%M:%S')

main()
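
The entry point maps each sub-command to a launcher sharing the launch_signature interface. A condensed sketch of that dispatch pattern; the handlers here are stand-ins, not fltk's real launch functions:

from argparse import ArgumentParser

# Illustrative stand-ins for the real launch functions.
def launch_cluster(*_args):
    print("cluster orchestrator started")

def launch_client(*_args):
    print("distributed client started")

run_ops = {'cluster': launch_cluster, 'client': launch_client}

parser = ArgumentParser(prog='fltk')
subparsers = parser.add_subparsers(dest='action', required=True)
for name in run_ops:
    subparsers.add_parser(name)

args = parser.parse_args(['cluster'])  # e.g. `python -m fltk cluster`
run_ops[args.action]()
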
3 changes: 3 additions & 0 deletions fltk/core/__init__.py
@@ -0,0 +1,3 @@
from .client import Client
from .federator import Federator
from .node import Node
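
With these re-exports in place, callers can import the core classes directly from the package root (assuming the fltk package is importable):

# Enabled by the new re-exports in fltk/core/__init__.py above.
from fltk.core import Client, Federator, Node
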