diff --git a/Dockerfile b/Dockerfile index fa8c5bd22..5a7259859 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,8 @@ ARG BASE_IMG=python:3.10-slim FROM $BASE_IMG +ARG GRPC_HEALTH_PROBE_VERSION="" + # Requirements (use MNIST Keras as default) ARG REQUIREMENTS="" @@ -15,6 +17,17 @@ COPY $REQUIREMENTS /app/config/requirements.txt # Install developer tools (needed for psutil) RUN apt-get update && apt-get install -y python3-dev gcc +# Install grpc health probe checker +RUN if [ ! -z "$GRPC_HEALTH_PROBE_VERSION" ]; then \ + apt-get install -y wget && \ + wget -qO/bin/grpc_health_probe https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/${GRPC_HEALTH_PROBE_VERSION}/grpc_health_probe-linux-amd64 && \ + chmod +x /bin/grpc_health_probe && \ + apt-get remove -y wget && apt autoremove -y; \ + else \ + echo "No grpc_health_probe version specified, skipping installation"; \ + fi + + # Create FEDn app directory SHELL ["/bin/bash", "-c"] RUN mkdir -p /app \ diff --git a/docker-compose.yaml b/docker-compose.yaml index 0c36d669e..94f6b8b59 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -91,6 +91,7 @@ services: context: . 
args: BASE_IMG: ${BASE_IMG:-python:3.10-slim} + GRPC_HEALTH_PROBE_VERSION: v0.4.24 working_dir: /app volumes: - ${HOST_REPO_DIR:-.}/fedn:/app/fedn @@ -99,6 +100,16 @@ services: - "/venv/bin/pip install --no-cache-dir -e /app/fedn && /venv/bin/fedn run combiner --init config/settings-combiner.yaml" ports: - 12080:12080 + healthcheck: + test: + [ + "CMD", + "/bin/grpc_health_probe", + "-addr=localhost:12080" + ] + interval: 2s + timeout: 10s + retries: 5 depends_on: - api-server @@ -119,5 +130,5 @@ services: deploy: replicas: 0 depends_on: - - api-server - - combiner + combiner: + condition: service_healthy diff --git a/docs/quickstart.rst b/docs/quickstart.rst index ca60fd149..263eb635c 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -1,6 +1,11 @@ Quickstart Tutorial PyTorch (MNIST) =================================== +.. note:: + This tutorial is a quickstart guide to FEDn where the federated learning project has been defined + using an already implemented ``compute package``. This tutorial is a good starting point for developers, however, + head over to :ref:`tutorial-label` when you are ready to define your own federated learning projects. + This classic example of hand-written text recognition is well suited as a lightweight test when developing on FEDn in pseudo-distributed mode. A normal high-end laptop or a workstation should be able to sustain a few clients. The example automates the partitioning of data and deployment of a variable number of clients on a single host. 
diff --git a/examples/mnist-pytorch/API_Example.ipynb b/examples/mnist-pytorch/API_Example.ipynb index 3ac7b615b..9f13af04f 100644 --- a/examples/mnist-pytorch/API_Example.ipynb +++ b/examples/mnist-pytorch/API_Example.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "743dfe47", "metadata": {}, "outputs": [], @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "1061722d", "metadata": {}, "outputs": [], @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5107f6f9", "metadata": {}, "outputs": [], @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "f0380d35", "metadata": {}, "outputs": [], @@ -90,7 +90,7 @@ " \"session_id\": session_id,\n", " \"aggregator\": \"fedavg\",\n", " \"model_id\": seed_model['model_id'],\n", - " \"rounds\": 10\n", + " \"rounds\": 2\n", " }\n", "\n", "result_fedavg = client.start_session(**session_config_fedavg)" diff --git a/fedn/fedn/network/combiner/combiner.py b/fedn/fedn/network/combiner/combiner.py index b4ea1bc1c..207786e1f 100644 --- a/fedn/fedn/network/combiner/combiner.py +++ b/fedn/fedn/network/combiner/combiner.py @@ -398,7 +398,7 @@ def _flush_model_update_queue(self): ##################################################################################################################### - # Control Service + # Controller Service def Start(self, control: fedn.ControlRequest, context): """ Start a round of federated learning" diff --git a/fedn/fedn/network/grpc/server.py b/fedn/fedn/network/grpc/server.py index acbbd98b1..59ed6b1ba 100644 --- a/fedn/fedn/network/grpc/server.py +++ b/fedn/fedn/network/grpc/server.py @@ -1,6 +1,7 @@ from concurrent import futures import grpc +from grpc_health.v1 import health, health_pb2_grpc import fedn.network.grpc.fedn_pb2_grpc as rpc from fedn.common.log_config import (logger, 
set_log_level_from_string, @@ -17,6 +18,7 @@ def __init__(self, servicer, modelservicer, config): self.server = grpc.server(futures.ThreadPoolExecutor(max_workers=350)) self.certificate = None + self.health_servicer = health.HealthServicer() if isinstance(servicer, rpc.CombinerServicer): rpc.add_CombinerServicer_to_server(servicer, self.server) @@ -29,6 +31,8 @@ def __init__(self, servicer, modelservicer, config): if isinstance(servicer, rpc.CombinerServicer): rpc.add_ControlServicer_to_server(servicer, self.server) + health_pb2_grpc.add_HealthServicer_to_server(self.health_servicer, self.server) + if config['secure']: logger.info(f'Creating secure gRPCS server using certificate: {config["certificate"]}') server_credentials = grpc.ssl_server_credentials( diff --git a/fedn/setup.py b/fedn/setup.py index ffee7d1fc..16cb88238 100644 --- a/fedn/setup.py +++ b/fedn/setup.py @@ -2,13 +2,13 @@ setup( name='fedn', - version='0.7.2', + version='0.8.0', description="""Scaleout Federated Learning""", author='Scaleout Systems AB', author_email='contact@scaleoutsystems.com', url='https://www.scaleoutsystems.com', py_modules=['fedn'], - python_requires='>=3.7,<3.11', + python_requires='>=3.8,<3.11', install_requires=[ "PyYAML>=5.4", "requests", @@ -31,7 +31,8 @@ "plotly", "pandas", "bokeh<3.0.0", - "networkx" + "networkx", + "grpcio-health-checking" ], license='Apache 2.0', zip_safe=False, @@ -42,7 +43,6 @@ packages=find_packages(exclude=["tests", "tests.*"]), classifiers=[ 'Natural Language :: English', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10',