[DPE-2564] - Large deployments relations (#190)

## Issue This PR addresses [DPE-2564](https://warthogs.atlassian.net/browse/DPE-2564), namely, this PR addresses: - Implementation of Peer Cluster Relations - Orchestrators: - implementation of the `main` orchestrator logic - implementation of the `failover` orchestrator logic - implementation of the demotion / promotion of either - validation of relations - Management of `Main / Failover`-orchestrators vs regular clusters - propagation of errors from orchestrators to related clusters - various changes to make it work with the charm - changed the previous terminology from `main/failover-cluster-manager` to `main/failover-orchestrator` - fixes unit tests ## Implemented UX: ``` juju deploy tls-certificates-operator --channel stable --show-log --verbose juju config tls-certificates-operator generate-self-signed-certificates=true ca-common-name="CN_CA" # deploy main-orchestrator cluster juju deploy -n 3 ./opensearch.charm \ main \ --config cluster_name="log-app" --config init_hold=false --config roles="cluster_manager" # deploy failover-orchestrator cluster juju deploy -n 2 ./opensearch.charm \ failover \ --config cluster_name="log-app" --config init_hold=true --config roles="cluster_manager" # deploy data-hot cluster juju deploy -n 2 ./opensearch.charm \ data-hot \ --config cluster_name="log-app" --config init_hold=true --config roles="data.hot" # integrate TLS juju integrate tls-certificates-operator main juju integrate tls-certificates-operator failover juju integrate tls-certificates-operator data-hot # integrate the "main"-orchestrator with all clusters: juju integrate main:peer-cluster-orchestrator failover:peer-cluster juju integrate main:peer-cluster-orchestrator data-hot:peer-cluster # integrate the "failover"-orchestrator with rest of clusters: juju integrate failover:peer-cluster-orchestrator data-hot:peer-cluster # trigger the promotion of the "failover" to "main" orchestrator juju remove-relation main:peer-cluster-orchestrator failover:peer-cluster juju 
remove-relation main:peer-cluster-orchestrator data-hot:peer-cluster # have the "old" main orchestrator rejoin the relation and become a "failover" juju integrate failover:peer-cluster-orchestrator main:peer-cluster juju integrate main:peer-cluster-orchestrator data-hot:peer-cluster ``` ### Next steps: - Integration tests + unit tests in another PR - Use secrets to pass credentials between the main orchestrator and the rest of the clusters - Externalize error messages in `constants_charm.py` - Delay the initialization of the security index until the first data node joins the cluster [DPE-2564]: https://warthogs.atlassian.net/browse/DPE-2564?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
canonical · Apr 3, 2024 · 6e4cbee · 6e4cbee
1 parent c022520
commit 6e4cbee
Show file tree

Hide file tree

Showing 15 changed files with 1,454 additions and 153 deletions.
diff --git a/lib/charms/opensearch/v0/constants_charm.py b/lib/charms/opensearch/v0/constants_charm.py
@@ -27,7 +27,9 @@
 ServiceStopped = "The OpenSearch service stopped."
 ServiceStopFailed = "An error occurred while attempting to stop the OpenSearch service."
 ServiceIsStopping = "The OpenSearch service is stopping."
+AdminUserNotConfigured = "Waiting for the admin user to be fully configured..."
 TLSNotFullyConfigured = "Waiting for TLS to be fully configured..."
+TLSRelationMissing = "Missing TLS relation with this cluster."
 TLSRelationBrokenError = (
     "Relation broken with the TLS Operator while TLS not fully configured. Stopping OpenSearch."
 )
@@ -82,6 +84,7 @@
 # Relation Interfaces
 ClientRelationName = "opensearch-client"
 PeerRelationName = "opensearch-peers"
+PeerClusterOrchestratorRelationName = "peer-cluster-orchestrator"
 PeerClusterRelationName = "peer-cluster"
 COSUser = "monitor"
 COSRelationName = "cos-agent"

diff --git a/lib/charms/opensearch/v0/helper_charm.py b/lib/charms/opensearch/v0/helper_charm.py
@@ -3,8 +3,12 @@
 
 """Utility functions for charms related operations."""
 import re
+from datetime import datetime
 
+from charms.data_platform_libs.v0.data_interfaces import Scope
+from charms.opensearch.v0.constants_charm import PeerRelationName
 from charms.opensearch.v0.helper_enums import BaseStrEnum
+from ops import CharmBase
 from ops.model import ActiveStatus, StatusBase
 
 # The unique Charmhub library identifier, never change it
@@ -68,3 +72,43 @@ def set(self, status: StatusBase, app: bool = False):
             return
 
         context.status = status
+
+
+class RelDepartureReason(BaseStrEnum):
+    """Enum of the 3 possible causes of a Relation Departed event.
+
+    The value is computed by `relation_departure_reason` from the units listed
+    for the relation in the goal state.
+    """
+
+    # all the units listed for the relation are dying
+    APP_REMOVAL = "app-removal"
+    # only some of the units listed for the relation are dying
+    SCALE_DOWN = "scale-down"
+    # none of the units listed for the relation is dying
+    REL_BROKEN = "rel-broken"
+
+
+def relation_departure_reason(charm: CharmBase, relation_name: str) -> RelDepartureReason:
+    """Compute the reason behind a relation departed event.
+
+    The decision is based on the output of the `goal-state` hook tool: every entry
+    listed under the relation, other than the one keyed by the relation name itself,
+    is checked for a "dying" status.
+
+    Args:
+        charm: charm whose model backend is used to run `goal-state`.
+        relation_name: key of the relation to look up in the goal state.
+
+    Returns:
+        APP_REMOVAL when at least one entry is listed and all of them are dying,
+        SCALE_DOWN when only some of them are dying, REL_BROKEN otherwise
+        (including when no entry is listed at all).
+
+    Raises:
+        KeyError: if the relation is missing from the goal state, or if an entry
+            carries no "status".
+    """
+    # NOTE: goes through underscore-prefixed (private) attributes of the ops model
+    goal_state = charm.model._backend._run("goal-state", return_output=True, use_json=True)
+    rel_info = goal_state["relations"][relation_name]
+
+    # one boolean per listed entry: is it dying?
+    dying_units = [
+        unit_data["status"] == "dying"
+        for unit, unit_data in rel_info.items()
+        if unit != relation_name
+    ]
+
+    # `all([])` is True: without the emptiness guard, a relation with no listed
+    # entry would wrongly be reported as an app removal
+    if dying_units and all(dying_units):
+        return RelDepartureReason.APP_REMOVAL
+
+    if any(dying_units):
+        return RelDepartureReason.SCALE_DOWN
+
+    return RelDepartureReason.REL_BROKEN
+
+
+def trigger_leader_peer_rel_changed(charm: CharmBase) -> None:
+    """Emit a peer relation-changed event from the leader unit.
+
+    A fresh timestamp is first stored under the "triggered" key of the app-scoped
+    peer data; non-leader units do nothing.
+    """
+    if charm.unit.is_leader():
+        now = datetime.now().timestamp()
+        charm.peers_data.put(Scope.APP, "triggered", now)
+
+        peer_rel_changed = charm.on[PeerRelationName].relation_changed
+        peer_rel_changed.emit(charm.model.get_relation(PeerRelationName))
diff --git a/lib/charms/opensearch/v0/models.py b/lib/charms/opensearch/v0/models.py
@@ -3,7 +3,8 @@
 
 """Cluster-related data structures / model classes."""
 from abc import ABC
-from typing import Any, Dict, List, Optional
+from datetime import datetime
+from typing import Any, Dict, List, Literal, Optional
 
 from charms.opensearch.v0.helper_enums import BaseStrEnum
 from pydantic import BaseModel, Field, root_validator, validator
@@ -31,8 +32,10 @@ def to_dict(self) -> Dict[str, Any]:
         return self.dict()
 
     @classmethod
-    def from_dict(cls, input_dict: Dict[str, Any]):
+    def from_dict(cls, input_dict: Optional[Dict[str, Any]]):
         """Create a new instance of this class from a json/dict repr."""
+        # None or empty input: build the instance without arguments, relying on
+        # the defaults declared by the class
+        if not input_dict:
+            return cls()
         return cls(**input_dict)
 
     @classmethod
@@ -91,8 +94,8 @@ def is_data(self):
 class DeploymentType(BaseStrEnum):
     """Nature of a sub cluster deployment."""
 
-    MAIN_CLUSTER_MANAGER = "main-cluster-manager"
-    CLUSTER_MANAGER_FAILOVER = "cluster-manager-failover"
+    # NOTE(review): both the member names and their string values are renamed here;
+    # presumably any data stored with the previous values would no longer map to a
+    # member - confirm nothing persisted needs migrating
+    MAIN_ORCHESTRATOR = "main-orchestrator"
+    FAILOVER_ORCHESTRATOR = "failover-orchestrator"
     OTHER = "other"
 
 
@@ -141,22 +144,6 @@ def prevent_none(cls, values):  # noqa: N805
         return values
 
 
-class PeerClusterRelDataCredentials(Model):
-    """Model class for credentials passed on the PCluster relation."""
-
-    admin_username: str
-    admin_password: str
-
-
-class PeerClusterRelData(Model):
-    """Model class for the PCluster relation data."""
-
-    cluster_name: Optional[str]
-    cm_nodes: List[str]
-    credentials: PeerClusterRelDataCredentials
-    tls_ca: str
-
-
 class PeerClusterConfig(Model):
     """Model class for the multi-clusters related config set by the user."""
 
@@ -201,4 +188,68 @@ class DeploymentDescription(Model):
     start: StartMode
     pending_directives: List[Directive]
     typ: DeploymentType
+    app: str
     state: DeploymentState = DeploymentState(value=State.ACTIVE)
+    promotion_time: Optional[float]
+
+    @root_validator
+    def set_promotion_time(cls, values):  # noqa: N805
+        """Set the promotion time of a main orchestrator, unless one is already provided.
+
+        Root validators run on every instantiation, including when an instance is
+        rebuilt from previously stored data. A `promotion_time` that is already
+        provided is therefore kept, otherwise it would be reset to "now" on each load.
+        """
+        # `.get`: a field which failed its own validation is absent from `values`
+        is_main = values.get("typ") == DeploymentType.MAIN_ORCHESTRATOR
+        if is_main and values.get("promotion_time") is None:
+            values["promotion_time"] = datetime.now().timestamp()
+
+        return values
+
+
+class PeerClusterRelDataCredentials(Model):
+    """Model class for credentials passed on the PCluster relation."""
+
+    # NOTE(review): the password is carried as a plain string field; the PR description
+    # lists "use secrets to pass credentials" as a next step
+    admin_username: str
+    admin_password: str
+    admin_password_hash: str
+    # string keys mapped to optional string values; the expected keys are not visible here
+    admin_tls: Dict[str, Optional[str]]
+
+
+class PeerClusterRelData(Model):
+    """Model class for the PCluster relation data."""
+
+    cluster_name: str
+    # presumably the cluster-manager nodes of the orchestrator - confirm against the writer
+    cm_nodes: List[Node]
+    credentials: PeerClusterRelDataCredentials
+    # optional deployment description; may be left unset
+    deployment_desc: Optional[DeploymentDescription]
+
+
+class PeerClusterRelErrorData(Model):
+    """Model class for the error data of the PCluster relation."""
+
+    cluster_name: Optional[str]
+    # NOTE(review): presumably tells the receiving side to remove the relation - confirm
+    should_sever_relation: bool
+    # NOTE(review): presumably tells the receiving side to wait rather than fail - confirm
+    should_wait: bool
+    # NOTE(review): presumably the message to show in a blocked status - confirm
+    blocked_message: str
+    # optional deployment description; may be left unset
+    deployment_desc: Optional[DeploymentDescription]
+
+
+class PeerClusterOrchestrators(Model):
+    """Pair of main / failover orchestrators registered for the PClusters.
+
+    A relation id of -1 together with an app set to None denotes an empty slot.
+    """
+
+    _TYPES = Literal["main", "failover"]
+
+    main_rel_id: int = -1
+    main_app: Optional[str]
+    failover_rel_id: int = -1
+    failover_app: Optional[str]
+
+    def delete(self, typ: _TYPES) -> None:
+        """Reset the slot of the given orchestrator type."""
+        # anything other than "main" clears the failover slot
+        if typ != "main":
+            self.failover_rel_id, self.failover_app = -1, None
+            return
+
+        self.main_rel_id, self.main_app = -1, None
+
+    def promote_failover(self) -> None:
+        """Overwrite the main slot with the failover one, then clear the failover slot."""
+        self.main_app, self.main_rel_id = self.failover_app, self.failover_rel_id
+        self.delete("failover")