From 6c356296a01ebf3d8ad0cab6058fb8c03ccbf8f6 Mon Sep 17 00:00:00 2001
From: Chen Shen <scv119@gmail.com>
Date: Sat, 22 Apr 2023 01:28:06 -0700
Subject: [PATCH] [autoscaler v2] Interface between autoscaler and gcs (#34680)

Why are these changes needed?
This PR introduce the interface between GCS and Autoscaler.

Specifically it introduces 2 APIs

GetClusterResourceState: Autoscaler will query this interface to get cluster resource usage, which includes nodes (state and resource ulitization), as well as pending requests, which include ResourceRequest, GangResourceRequest, as well as ClusterResourceConstraint.

For NodeState, it includes NodeStatus, which can transit from ALIVE -> DEAD, or ALIVE -> DRAIN_PENDING -> DRAINING -> DRAINED -> DEAD, or ALIVE -> DRAIN_PENDING -> DRAIN_FAILED.
it also includes instance_id where the autoscaler is aware of, this allows autoscaler to do reconsiliation if available.
For ResourceRequest, it comes with a PlacementConstraint which only support AntiAffinityConstraint today, which the semantics the resource request can't be allocated on a node with the same label/value specified in the AntiAffinityConstraint
There is also GangResourceRequest, which has gang scheduling semantics where the requests in the gang should be all fulfilled atomically.
ReportAutoscalingState: Autoscaler will also report its own state back to cluster using this API, where it includes all instances (including both pending launch), as well as infeasible requests.

Instance state could transition from QUEUED -> REQUESTED -> BOOTSTRAPPING -> ALIVE -> TERMINATING -> DEAD.

two special states are TO_BE_PREEMPTED and TO_BE_DRAINED, where one is force preemption, another is collaborating draining (can be reversed).

It also reports back requests that infeasible, associated with a specific request version.
---
 src/ray/protobuf/BUILD                        |  15 ++
 .../protobuf/experimental/autoscaler.proto    | 221 ++++++++++++++++++
 2 files changed, 236 insertions(+)
 create mode 100644 src/ray/protobuf/experimental/autoscaler.proto

diff --git a/src/ray/protobuf/BUILD b/src/ray/protobuf/BUILD
index 22054c994261..3ab65258b760 100644
--- a/src/ray/protobuf/BUILD
+++ b/src/ray/protobuf/BUILD
@@ -347,3 +347,18 @@ cc_proto_library(
     name = "usage_cc_proto",
     deps = [":usage_proto"],
 )
+
+proto_library(
+    name = "autoscaler_proto",
+    srcs = ["experimental/autoscaler.proto"],
+)
+
+python_grpc_compile(
+    name = "autoscaler_py_proto",
+    deps = [":autoscaler_proto"],
+)
+
+cc_proto_library(
+    name = "autoscaler_cc_proto",
+    deps = [":autoscaler_proto"],
+)
diff --git a/src/ray/protobuf/experimental/autoscaler.proto b/src/ray/protobuf/experimental/autoscaler.proto
new file mode 100644
index 000000000000..93bbf98f72df
--- /dev/null
+++ b/src/ray/protobuf/experimental/autoscaler.proto
@@ -0,0 +1,221 @@
+// Copyright 2023 The Ray Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+option cc_enable_arenas = true;
+
+package ray.rpc;
+
+// ============= Cluster Resources ====================
+//
+// Following fields represents the Cluster Resources autoscaler interested
+// in.
+
+// Represents an anti-affinity constraint. A bundle with this constraint
+// can't be allocated to a node that has a label with the same name and
+// value. This is used to implement placement group anti-affinity.
+//
+// For placement group, the label_name is "_PG" (reserved),
+// and the label_value is the placement group id.
+message AntiAffinityConstraint {
+  string label_name = 1;
+  string label_value = 2;
+  // If true, the label will be created on the node
+  // where the request with this constraint is scheduled.
+  bool create_label_on_schedule = 3;
+}
+
+message PlacementConstraint {
+  AntiAffinityConstraint anti_affinity = 1;
+}
+
+message ResourceRequest {
+  // resource requirements for the request.
+  map<string, double> resources_bundle = 1;
+  // placement constraint for the request. multiple constraints
+  // form AND semantics.
+  repeated PlacementConstraint placement_constraints = 2;
+}
+
+message ResourceRequestByCount {
+  ResourceRequest request = 1;
+  int64 count = 2;
+}
+
+// All bundles in the same resource request require gang
+// allocation semantics: they should be allocated all or nothing.
+message GangResourceRequest {
+  // a map from bundles to the number of bundles requested.
+  repeated ResourceRequest requests = 1;
+}
+
+// Cluster resource constraint represents minimial cluster size requirement,
+// this is issued through ray.autoscaler.sdk.request_resources.
+message ClusterResourceConstraint {
+  // If not empty, the cluster should have the capacity (total resource) to
+  // fit the min_resources.
+  map<string, double> min_resources = 1;
+  // If not emtpy, the cluster should have the capacity (total resource) to fit
+  // the min_bundles.
+  repeated ResourceRequest min_bundles = 2;
+  // Id of the job who issued this constraint.
+  string job_id = 3;
+}
+
+message NodeState {
+  enum NodeStatus {
+    // Node is alive.
+    ALIVE = 0;
+    // Node is dead.
+    DEAD = 1;
+    // Node is being drained.
+    DRAIN_PENDING = 2;
+    // Node is being drained.
+    DRAIN_FAILED = 3;
+    // Node is being drained.
+    DRAINING = 4;
+    // Node is already drained, and ready to be removed.
+    DRAINED = 5;
+  }
+  // The node id internal to Ray.
+  string node_id = 11;
+
+  // The instance id that the node is running on.
+  // This is passed in when the node is registered.
+  string instance_id = 12;
+
+  // The available resources on the node.
+  // Reserved resource names: CPU, GPU, MEMORY, OBJECT_STORE_MEMORY
+  map<string, double> available_resources = 13;
+
+  // The corresponding total resources on the node.
+  map<string, double> total_resources = 14;
+
+  // Dynamic labels associated with the node.
+  // Reserved dynamic label names: _PG
+  map<string, string> dynamic_labels = 15;
+
+  // A monotonic increasing version of the node resource state.
+  int64 node_state_version = 16;
+
+  // The status of the node.
+  NodeStatus status = 17;
+}
+
+// ============= Autoscaling State Service API =======================
+//
+// Autoscaler periodically calls to
+// two snapshot APIs, GetClusterResourceState
+// and ReportAutoscalingState.
+// The GetClusterResourceState will return a snapshot
+// of Ray state that Autoscaler interested, along with
+// the cluster_resource_state_version (version).
+//
+// Separately, autoscaler will constantly making decisions
+// based on the latest Ray state, and also change its
+// state based on the information from node provider.
+// Autoscaler will periodically report its state to GCS
+// through ReportAutoscalingState API.
+
+message GetClusterResourceStateRequest {
+  // The last seen cluster resource state version. The default value is reserved for if a
+  // previous scheduling state has never been seen.
+  int64 last_seen_cluster_resource_state_version = 1;
+}
+
+message GetClusterResourceStateReply {
+  // an monotonically increasing version of the cluster resources.
+  int64 cluster_resource_state_version = 1;
+  // last seen autoscaler state.
+  int64 last_seen_autoscaler_state_version = 2;
+  // Current cluster resources.
+  repeated NodeState node_states = 3;
+  // Resource requests pending scheduling.
+  repeated ResourceRequestByCount pending_resource_requests = 4;
+  // Gang resource requests pending scheduling.
+  repeated GangResourceRequest pending_gang_resource_requests = 5;
+  // Cluster resource constraints.
+  // There could be multiple constraints issued by different
+  // jobs. Autoscaler to make sure all constraints are satisfied.
+  repeated ClusterResourceConstraint cluster_resource_constraints = 6;
+}
+
+message Instance {
+  enum InstanceStatus {
+    // The unspecified state - most likey it is queued.
+    INSTANCE_STATUS_UNSPECIFIED = 0;
+    // Instance is starting. The first state update received from the
+    // instance.
+    STARTING = 1;
+    // The instance is running - one of two states of a healthy instance.
+    RUNNING = 2;
+    // The instance is idle - one of two states of a healthy instance.
+    IDLE = 3;
+    // The instance is stopping - usually follows from the RUNNING, IDLE,
+    // PREEMPT_REQUEST or DRAIN_REQUEST state.
+    STOPPING = 4;
+    // The instance is stopped - follows from the STOPPING state.
+    STOPPED = 5;
+    // The instance is in a bad state - but it is still able to send updates.
+    FAILING = 6;
+    // The subscribe service moves instances to this state if they
+    // have been idle for too long. This allows the cluster manager to
+    // make a final decision on whether or not to commence a drain
+    // sequence for this instance.
+    DRAIN_CONFIRMATION_PENDING = 7;
+    // The instance should be drained, Ray should start draining process
+    // but could reject if failed to drain.
+    DRAIN_REQUEST = 8;
+    // The instance will be reempted by the instance manager, regardless
+    // of whether it is drainable or not.
+    PREEMPT_REQUEST = 9;
+  }
+  // an unique id for the instance that's generated by the
+  // instance manager. This may be optional if
+  // the instance hasn't be started yet.
+  string instance_id = 11;
+  // the status of the instance.
+  InstanceStatus status = 12;
+  // the node id of the instance.
+  string node_type = 13;
+  // The corresponding total resources on the node.
+  map<string, double> total_resources = 14;
+  // timestamp of the last state changed.
+  int64 timestamp_since_last_state_change = 15;
+}
+
+message ReportAutoscalingStateRequest {
+  int64 last_seen_cluster_resource_state_version = 1;
+  // A monotonically increasing version identifies
+  // the state of autoscaler.
+  // Note: for the same cluster resource state, the
+  // autoscaler state might be different, since
+  // the autoscaler's state could also be updated by
+  // node provider.
+  int64 autoscaler_state_version = 2;
+  repeated Instance instances = 3;
+  // infeasible resource requests.
+  repeated ResourceRequest infeasible_resource_requests = 4;
+  repeated ClusterResourceConstraint infeasible_gange_resource_requests = 5;
+  repeated ClusterResourceConstraint infeasible_cluster_resource_constraints = 6;
+}
+
+message ReportAutoscalingStateReply {}
+
+service AutoscalerStateService {
+  rpc GetClusterResourceState(GetClusterResourceStateRequest)
+      returns (GetClusterResourceStateReply);
+  rpc ReportAutoscalingState(ReportAutoscalingStateRequest)
+      returns (ReportAutoscalingStateReply);
+}
\ No newline at end of file