Skip to content

Commit

Permalink
Update CI/CD requirements and tests (NVIDIA#376)
Browse files Browse the repository at this point in the history
* Update CI/CD requirements and tests

* Fix main ci (NVIDIA#377)

* Fix build doc process (NVIDIA#387)

* Fix app testing pt model validator

* Update tests

* Fix format issues

* Fix integration tests

* Update premerge.yml

* Fix format

* Fix comments
  • Loading branch information
YuanTingHsieh authored Apr 19, 2022
1 parent 057aec6 commit 7910aad
Show file tree
Hide file tree
Showing 124 changed files with 521 additions and 731 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ jobs:
python-version: ["3.8"]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
35 changes: 10 additions & 25 deletions .github/workflows/premerge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,57 +6,42 @@ on:
branches:
- main
pull_request:
workflow_dispatch:

jobs:
unit-tests:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: 3.8
- run: pip install -r requirements-dev.txt
- run: PYTHONPATH=$(pwd) ./runtest.sh

example-tests:
integration-tests:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: 3.8
- run: pip install -r requirements-dev.txt
- run: |
export PYTHONPATH=$(pwd)
rm -rf /tmp/snapshot-storage
cd test/app_testing
./run_example_tests.sh
rm -rf /tmp/snapshot-storage
app-tests:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- run: pip install -r requirements-dev.txt
- run: |
export PYTHONPATH=$(pwd)
rm -rf /tmp/snapshot-storage
cd test/app_testing
./run_app_tests.sh
cd tests/integration_test
./run_integration_tests.sh
rm -rf /tmp/snapshot-storage
wheel-build:
runs-on: self-hosted
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: 3.8
- run: pip install -r requirements-dev.txt
Expand Down
3 changes: 1 addition & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
# -- Project information -----------------------------------------------------

project = "NVIDIA FLARE"
copyright = "2021, NVIDIA"
copyright = "2022, NVIDIA"
author = "NVIDIA"

# The full version, including alpha/beta/rc tags
Expand Down Expand Up @@ -76,7 +76,6 @@ def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
"sphinx.ext.autodoc",
"sphinx.ext.viewcode",
"sphinx.ext.autosectionlabel",
"sphinxcontrib.exceltable",
]

autoclass_content = "both"
Expand Down
2 changes: 1 addition & 1 deletion docs/examples/hello_numpy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ In a file called ``np_trainer.py``, import nvflare and numpy.
Now we will implement the ``execute`` function to enable the clients to perform
a simple addition of a diff to represent one calculation of training a round.

See :ref:`hello_numpy_np_trainer` or find the full code of ``np_trainer.py`` at
Find the full code of ``np_trainer.py`` at
``examples/hello-numpy-sag/custom/np_trainer.py`` to follow along.

The server sends either the initial weights or any stored weights to each of the clients
Expand Down
9 changes: 0 additions & 9 deletions docs/examples/hello_numpy_np_trainer.rst

This file was deleted.

24 changes: 14 additions & 10 deletions nvflare/apis/impl/job_def_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import os
import pathlib
import shutil
import tempfile
import time
import uuid
from abc import ABC, abstractmethod
Expand Down Expand Up @@ -87,15 +88,12 @@ def filter_job(self, meta: dict):


class SimpleJobDefManager(JobDefManagerSpec):
def __init__(self, uri_root: str = "jobs", job_store_id: str = "job_store", temp_dir: str = "/tmp"):
def __init__(self, uri_root: str = "jobs", job_store_id: str = "job_store"):
super().__init__()
self.uri_root = uri_root
if not os.path.exists(uri_root):
os.mkdir(uri_root)
self.job_store_id = job_store_id
if not os.path.isdir(temp_dir):
raise ValueError("temp_dir {} is not a valid dir".format(temp_dir))
self.temp_dir = temp_dir

def _get_job_store(self, fl_ctx):
engine = fl_ctx.get_engine()
Expand Down Expand Up @@ -165,23 +163,29 @@ def set_results_uri(self, jid: str, result_uri: str, fl_ctx: FLContext):
return self.get_job(jid, fl_ctx)

def get_app(self, job: Job, app_name: str, fl_ctx: FLContext) -> bytes:
job_id_dir = self._load_job_data_from_store(job.job_id, fl_ctx)
temp_dir = tempfile.mkdtemp()
job_id_dir = self._load_job_data_from_store(job.job_id, temp_dir, fl_ctx)
job_folder = os.path.join(job_id_dir, job.meta[JobMetaKey.JOB_FOLDER_NAME.value])
fullpath_src = os.path.join(job_folder, app_name)
return zip_directory_to_bytes(fullpath_src, "")
result = zip_directory_to_bytes(fullpath_src, "")
shutil.rmtree(temp_dir)
return result

def get_apps(self, job: Job, fl_ctx: FLContext) -> Dict[str, bytes]:
job_id_dir = self._load_job_data_from_store(job.job_id, fl_ctx)
temp_dir = tempfile.mkdtemp()
job_id_dir = self._load_job_data_from_store(job.job_id, temp_dir, fl_ctx)
job_folder = os.path.join(job_id_dir, job.meta[JobMetaKey.JOB_FOLDER_NAME.value])
result_dict = {}
for app in job.get_deployment():
result_dict[app] = zip_directory_to_bytes(job_folder, app)
fullpath_src = os.path.join(job_folder, app)
result_dict[app] = zip_directory_to_bytes(fullpath_src, "")
shutil.rmtree(temp_dir)
return result_dict

def _load_job_data_from_store(self, jid: str, fl_ctx: FLContext):
def _load_job_data_from_store(self, jid: str, temp_dir: str, fl_ctx: FLContext):
store = self._get_job_store(fl_ctx)
data_bytes = store.get_data(self.job_uri(jid))
job_id_dir = os.path.join(self.temp_dir, jid)
job_id_dir = os.path.join(temp_dir, jid)
if os.path.exists(job_id_dir):
shutil.rmtree(job_id_dir)
os.mkdir(job_id_dir)
Expand Down
5 changes: 2 additions & 3 deletions nvflare/app_common/workflows/cross_site_model_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ def start_controller(self, fl_ctx: FLContext):

# If the list of participating clients is not provided, include all clients currently available.
if not self._participating_clients:
clients = engine.get_clients()
self._participating_clients = [c.name for c in clients]
self._participating_clients = [c.name for c in engine.get_clients()]

# Create shareable dirs for models and results
workspace: Workspace = engine.get_workspace()
Expand Down Expand Up @@ -186,7 +185,7 @@ def control_flow(self, abort_signal: Signal, fl_ctx: FLContext):
engine = fl_ctx.get_engine()
start_time = time.time()
while not self._participating_clients:
self._participating_clients = engine.get_clients()
self._participating_clients = [c.name for c in engine.get_clients()]
if time.time() - start_time > self._wait_for_clients_timeout:
self.log_info(fl_ctx, "No clients available - quit model validation.")
return
Expand Down
38 changes: 23 additions & 15 deletions nvflare/fuel/hci/client/file_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,6 @@ def upload_job(self, args, api: AdminAPISpec):
if not os.path.isdir(full_path):
return {"status": APIStatus.ERROR_RUNTIME, "details": f"'{full_path}' is not a valid folder."}

meta = {}
try:
meta_path = os.path.join(full_path, "meta.json")
with open(meta_path) as file:
Expand All @@ -324,23 +323,32 @@ def upload_job(self, args, api: AdminAPISpec):
"status": APIStatus.ERROR_RUNTIME,
"details": "Exception while loading required meta.json in job directory: " + str(e),
}
if not isinstance(meta.get(JobMetaKey.STUDY_NAME), str):
return {"status": APIStatus.ERROR_RUNTIME, "details": "STUDY_NAME is expected in meta.json to be a str"}
if not isinstance(meta.get(JobMetaKey.RESOURCE_SPEC), dict):
return {"status": APIStatus.ERROR_RUNTIME, "details": "RESOURCE_SPEC is expected in meta.json to be a dict"}
if not isinstance(meta.get(JobMetaKey.DEPLOY_MAP), dict):
return {"status": APIStatus.ERROR_RUNTIME, "details": "DEPLOY_MAP is expected in meta.json to be a dict"}
try:
if not isinstance(meta.get(JobMetaKey.STUDY_NAME), str):
return {"status": APIStatus.ERROR_RUNTIME, "details": "STUDY_NAME is expected in meta.json to be a str"}
if not isinstance(meta.get(JobMetaKey.RESOURCE_SPEC), dict):
return {
"status": APIStatus.ERROR_RUNTIME,
"details": "RESOURCE_SPEC is expected in meta.json to be a dict",
}
if not isinstance(meta.get(JobMetaKey.DEPLOY_MAP), dict):
return {
"status": APIStatus.ERROR_RUNTIME,
"details": "DEPLOY_MAP is expected in meta.json to be a dict",
}

# zip the data
data = zip_directory_to_bytes(self.upload_dir, folder_name)
# zip the data
data = zip_directory_to_bytes(self.upload_dir, folder_name)
rel_path = os.path.relpath(full_path, self.upload_dir)
folder_name = _remove_loading_dotdot(rel_path)
meta[JobMetaKey.JOB_FOLDER_NAME.value] = folder_name

rel_path = os.path.relpath(full_path, self.upload_dir)
folder_name = _remove_loading_dotdot(rel_path)
meta[JobMetaKey.JOB_FOLDER_NAME.value] = folder_name
b64str = bytes_to_b64str(data)
serialized_meta = json.dumps(meta).encode("utf-8")
meta_b64str = bytes_to_b64str(serialized_meta)
except Exception as e:
return {"status": APIStatus.ERROR_RUNTIME, "details": f"Exception: {e}"}

b64str = bytes_to_b64str(data)
serialized_meta = json.dumps(meta).encode("utf-8")
meta_b64str = bytes_to_b64str(serialized_meta)
parts = [_server_cmd_name(ftd.SERVER_CMD_UPLOAD_JOB), meta_b64str, b64str]
command = join_args(parts)
return api.server_execute(command)
Expand Down
8 changes: 7 additions & 1 deletion nvflare/fuel/hci/client/fl_admin_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def _get_processed_cmd_reply_data(self, command) -> Tuple[bool, str, Dict[str, A
Tuple of bool to indicate if success is in reply data, str with full response of the reply data, and the raw
reply.
"""
# TODO:: this only get success / string / error from reply message
success_in_data = False
reply = self.do_command(command)
# handle errors from write_error (these can be from FileTransferModule)
Expand Down Expand Up @@ -348,7 +349,12 @@ def upload_job(self, job_folder: str) -> FLAdminAPIResponse:
success, reply_data_full_response, reply = self._get_processed_cmd_reply_data("upload_job " + job_folder)
if reply_data_full_response:
if "Uploaded job" in reply_data_full_response:
return FLAdminAPIResponse(APIStatus.SUCCESS, {"message": reply_data_full_response})
# TODO:: this is a hack to get job id
return FLAdminAPIResponse(
APIStatus.SUCCESS,
{"message": reply_data_full_response, "job_id": reply_data_full_response.split(":")[-1].strip()},
reply,
)
return FLAdminAPIResponse(
APIStatus.ERROR_RUNTIME, {"message": "Runtime error: could not handle server reply."}, reply
)
Expand Down
2 changes: 1 addition & 1 deletion nvflare/fuel/hci/server/file_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def upload_job(self, conn: Connection, args: List[str]):
meta = job_def_manager.create(meta, data_bytes, fl_ctx)
conn.set_prop("meta", meta)
conn.set_prop("upload_job_id", meta.get(JobMetaKey.JOB_ID))
conn.append_string("Uploaded job {}".format(meta.get(JobMetaKey.JOB_ID)))
conn.append_string("Uploaded job: {}".format(meta.get(JobMetaKey.JOB_ID)))
except Exception as e:
conn.append_error("Exception occurred trying to upload job: " + str(e))
return
Expand Down
2 changes: 1 addition & 1 deletion nvflare/lighter/impl/he.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def initialize(self, ctx):
# dynamically call different generate keys method
# getattr(self._context, f'generate_{self.key_type}_keys')()
self._context.generate_relin_keys()
self._context.global_scale = 2 ** self.scale_bits
self._context.global_scale = 2**self.scale_bits

def build(self, project, ctx):
servers = project.get_participants_by_type("server", first_only=False)
Expand Down
1 change: 0 additions & 1 deletion nvflare/lighter/project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,6 @@ builders:
args:
uri_root: /tmp/jobs-storage
job_store_id: job_store
temp_dir: /tmp/jobs_temp
- id: study_manager # This id is reserved by system. Do not change it.
path: nvflare.apis.impl.study_manager.StudyManager
args:
Expand Down
1 change: 0 additions & 1 deletion nvflare/poc/client/startup/sub_start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
client=$1
sp=$2


DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "WORKSPACE set to $DIR/.."
mkdir -p $DIR/../transfer
Expand Down
5 changes: 2 additions & 3 deletions nvflare/poc/server/startup/fed_server.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,7 @@
"path": "nvflare.apis.impl.job_def_manager.SimpleJobDefManager",
"args": {
"uri_root": "/tmp/jobs-storage",
"job_store_id": "job_store",
"temp_dir": "/tmp/jobs_temp"
"job_store_id": "job_store"
}
},
{
Expand All @@ -81,4 +80,4 @@
"args": {}
}
]
}
}
2 changes: 2 additions & 0 deletions nvflare/poc/server/startup/sub_start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ sp=$2
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "WORKSPACE set to $DIR/.."
mkdir -p $DIR/../transfer
export PYTHONPATH=/local/custom:$PYTHONPATH
echo "PYTHONPATH is $PYTHONPATH"

SECONDS=0
lst=-400
Expand Down
9 changes: 7 additions & 2 deletions nvflare/private/fed/server/fed_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,8 +811,13 @@ def _turn_to_hot(self):
self.logger.info(f"Restore the previous snapshot. Run_number: {run_number}")
with self.engine.new_context() as fl_ctx:
job_runner = self.engine.job_runner
job_runner.restore_running_job(run_number=run_number, job_id=job_id,
job_clients=job_clients, snapshot=snapshot, fl_ctx=fl_ctx)
job_runner.restore_running_job(
run_number=run_number,
job_id=job_id,
job_clients=job_clients,
snapshot=snapshot,
fl_ctx=fl_ctx,
)

self.server_state = HotState(
host=self.server_state.host, port=self.server_state.service_port, ssid=self.server_state.ssid
Expand Down
6 changes: 3 additions & 3 deletions nvflare/private/fed/server/job_cmds.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
from nvflare.fuel.hci.conn import Connection
from nvflare.fuel.hci.reg import CommandModuleSpec, CommandSpec
from nvflare.private.fed.server.server_engine import ServerEngine
from nvflare.private.fed.server.server_engine_internal_spec import ServerEngineInternalSpec
from nvflare.security.security import Action

from .training_cmds import TrainingCommandModule
Expand All @@ -31,6 +30,7 @@ class JobCommandModule(TrainingCommandModule):
"""Command module with commands for job management."""

def __init__(self):
super().__init__()
self.logger = logging.getLogger(self.__class__.__name__)

def get_spec(self):
Expand Down Expand Up @@ -152,8 +152,8 @@ def delete_job(self, conn: Connection, args: List[str]):

def abort_job(self, conn: Connection, args: List[str]):
engine = conn.app_ctx
if not isinstance(engine, ServerEngineInternalSpec):
raise TypeError("engine must be ServerEngineInternalSpec but got {}".format(type(engine)))
if not isinstance(engine, ServerEngine):
raise TypeError("engine must be ServerEngine but got {}".format(type(engine)))

run_number = conn.get_prop(self.RUN_NUMBER)
engine.job_runner.stop_run(run_number, engine.new_context())
Expand Down
3 changes: 1 addition & 2 deletions nvflare/private/fed/server/job_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,7 @@ def run(self, fl_ctx: FLContext):

def restore_running_job(self, run_number: str, job_id: str, job_clients, snapshot, fl_ctx: FLContext):
engine = fl_ctx.get_engine()
engine.start_app_on_server(run_number, job_id=job_id,
job_clients=job_clients, snapshot=snapshot)
engine.start_app_on_server(run_number, job_id=job_id, job_clients=job_clients, snapshot=snapshot)

try:
job_manager = engine.get_component(SystemComponents.JOB_MANAGER)
Expand Down
Loading

0 comments on commit 7910aad

Please sign in to comment.