Skip to content

Commit

Permalink
[DPE-2564] - Large deployments relations (#190)
Browse files Browse the repository at this point in the history
## Issue
This PR addresses
[DPE-2564](https://warthogs.atlassian.net/browse/DPE-2564). Specifically,
it covers:
- Implementation of Peer Cluster Relations 
- Orchestrators:
    - implementation of the `main` orchestrator logic
    - implementation of the `failover` orchestrator logic
    - implementation of the demotion / promotion of either
    - validation of relations
- Management of `Main / Failover`-orchestrators vs regular clusters
- propagation of errors from orchestrators to related clusters 
- various changes to make it work with the charm  
- changed the previous terminology from `main/failover-cluster-manager`
to `main/failover-orchestrator`
- fixes unit tests

## Implemented UX:
```
juju deploy tls-certificates-operator --channel stable --show-log --verbose
juju config tls-certificates-operator generate-self-signed-certificates=true ca-common-name="CN_CA"

# deploy main-orchestrator cluster 
juju deploy -n 3 ./opensearch.charm \
    main \
    --config cluster_name="log-app" --config init_hold=false --config roles="cluster_manager"

# deploy failover-orchestrator cluster
juju deploy -n 2 ./opensearch.charm \
    failover \
    --config cluster_name="log-app" --config init_hold=true --config roles="cluster_manager"

# deploy data-hot cluster
juju deploy -n 2 ./opensearch.charm \
    data-hot \
    --config cluster_name="log-app" --config init_hold=true --config roles="data.hot"

# integrate TLS
juju integrate tls-certificates-operator main
juju integrate tls-certificates-operator failover
juju integrate tls-certificates-operator data-hot

# integrate the "main"-orchestrator with all clusters:
juju integrate main:peer-cluster-orchestrator failover:peer-cluster
juju integrate main:peer-cluster-orchestrator data-hot:peer-cluster

# integrate the "failover"-orchestrator with rest of clusters:
juju integrate failover:peer-cluster-orchestrator data-hot:peer-cluster

# trigger the promotion of the "failover" to "main" orchestrator
juju remove-relation main:peer-cluster-orchestrator failover:peer-cluster
juju remove-relation main:peer-cluster-orchestrator data-hot:peer-cluster

# have the "old" main orchestrator rejoin the relation and become a "failover"
juju integrate failover:peer-cluster-orchestrator main:peer-cluster
juju integrate main:peer-cluster-orchestrator data-hot:peer-cluster
``` 

### Next steps:
- Integration tests + unit tests in a separate PR
- Use secrets to pass credentials between the main orchestrator and the
rest of the clusters
- Externalize error messages in `constants_charm.py`
- delay the initialization of the security index until the first data
node joins the cluster

[DPE-2564]:
https://warthogs.atlassian.net/browse/DPE-2564?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
  • Loading branch information
Mehdi-Bendriss authored Apr 3, 2024
1 parent c022520 commit 6e4cbee
Show file tree
Hide file tree
Showing 15 changed files with 1,454 additions and 153 deletions.
3 changes: 3 additions & 0 deletions lib/charms/opensearch/v0/constants_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
# Human-readable messages about the OpenSearch service, the admin user and the TLS setup.
# NOTE(review): presumably surfaced as charm status messages — confirm against callers.
ServiceStopped = "The OpenSearch service stopped."
ServiceStopFailed = "An error occurred while attempting to stop the OpenSearch service."
ServiceIsStopping = "The OpenSearch service is stopping."
AdminUserNotConfigured = "Waiting for the admin user to be fully configured..."
TLSNotFullyConfigured = "Waiting for TLS to be fully configured..."
TLSRelationMissing = "Missing TLS relation with this cluster."
# Wrapped in parentheses only to keep the long literal on its own line.
TLSRelationBrokenError = (
"Relation broken with the TLS Operator while TLS not fully configured. Stopping OpenSearch."
)
Expand Down Expand Up @@ -82,6 +84,7 @@
# Relation Interfaces
ClientRelationName = "opensearch-client"
PeerRelationName = "opensearch-peers"
# Endpoint names of the peer-cluster integration: the orchestrator side, then the side of
# the cluster being related to it.
PeerClusterOrchestratorRelationName = "peer-cluster-orchestrator"
PeerClusterRelationName = "peer-cluster"
# NOTE(review): "monitor" is a user name, not a relation name — presumably the user
# used with the COS relation below; confirm against callers.
COSUser = "monitor"
COSRelationName = "cos-agent"
Expand Down
44 changes: 44 additions & 0 deletions lib/charms/opensearch/v0/helper_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@

"""Utility functions for charms related operations."""
import re
from datetime import datetime

from charms.data_platform_libs.v0.data_interfaces import Scope
from charms.opensearch.v0.constants_charm import PeerRelationName
from charms.opensearch.v0.helper_enums import BaseStrEnum
from ops import CharmBase
from ops.model import ActiveStatus, StatusBase

# The unique Charmhub library identifier, never change it
Expand Down Expand Up @@ -68,3 +72,43 @@ def set(self, status: StatusBase, app: bool = False):
return

context.status = status


class RelDepartureReason(BaseStrEnum):
    """The three possible causes of a relation-departed event."""

    # every inspected unit of the relation is dying
    APP_REMOVAL = "app-removal"
    # only some of the inspected units are dying
    SCALE_DOWN = "scale-down"
    # no inspected unit is dying
    REL_BROKEN = "rel-broken"


def relation_departure_reason(charm: CharmBase, relation_name: str) -> RelDepartureReason:
    """Work out why a relation-departed event was emitted.

    Runs the ``goal-state`` hook tool through the charm's model backend and looks at the
    status of every entry listed under ``relation_name``.

    Args:
        charm: charm whose model backend is queried.
        relation_name: key of the relation inside the goal-state ``relations`` mapping.

    Returns:
        ``APP_REMOVAL`` when all inspected entries are "dying" (also the result when
        there is no entry to inspect), ``SCALE_DOWN`` when only some of them are,
        ``REL_BROKEN`` when none is.

    Raises:
        KeyError: if the goal state has no "relations" / ``relation_name`` entry, or an
            inspected entry has no "status".
    """
    state = charm.model._backend._run("goal-state", return_output=True, use_json=True)
    entries = state["relations"][relation_name]

    # one flag per entry, leaving out the entry keyed by the relation name itself
    is_dying = []
    for name, data in entries.items():
        if name == relation_name:
            continue
        is_dying.append(data["status"] == "dying")

    if all(is_dying):
        return RelDepartureReason.APP_REMOVAL

    if any(is_dying):
        return RelDepartureReason.SCALE_DOWN

    return RelDepartureReason.REL_BROKEN


def trigger_leader_peer_rel_changed(charm: CharmBase) -> None:
    """Make the leader unit emit a relation-changed event on the peer relation.

    Non-leader units do nothing. The leader stores the current timestamp under the
    "triggered" key of the app-scoped peer data, then emits the event by hand.
    """
    if charm.unit.is_leader():
        triggered_at = datetime.now().timestamp()
        charm.peers_data.put(Scope.APP, "triggered", triggered_at)

        emit_changed = charm.on[PeerRelationName].relation_changed.emit
        emit_changed(charm.model.get_relation(PeerRelationName))
91 changes: 71 additions & 20 deletions lib/charms/opensearch/v0/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

"""Cluster-related data structures / model classes."""
from abc import ABC
from typing import Any, Dict, List, Optional
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional

from charms.opensearch.v0.helper_enums import BaseStrEnum
from pydantic import BaseModel, Field, root_validator, validator
Expand Down Expand Up @@ -31,8 +32,10 @@ def to_dict(self) -> Dict[str, Any]:
return self.dict()

@classmethod
def from_dict(cls, input_dict: Optional[Dict[str, Any]]):
    """Create a new instance of this class from a json/dict repr.

    Args:
        input_dict: keyword arguments for the class constructor; may be ``None`` or empty.

    Returns:
        ``cls()`` when ``input_dict`` is ``None`` or empty, so that classes declaring
        defaults for all their fields can still be built; ``cls(**input_dict)`` otherwise.
    """
    return cls(**input_dict) if input_dict else cls()

@classmethod
Expand Down Expand Up @@ -91,8 +94,8 @@ def is_data(self):
class DeploymentType(BaseStrEnum):
    """Kind of sub-cluster a deployment represents."""

    # "cluster-manager" spellings of the main / failover roles
    MAIN_CLUSTER_MANAGER = "main-cluster-manager"
    CLUSTER_MANAGER_FAILOVER = "cluster-manager-failover"

    # "orchestrator" spellings of the main / failover roles
    MAIN_ORCHESTRATOR = "main-orchestrator"
    FAILOVER_ORCHESTRATOR = "failover-orchestrator"

    # NOTE(review): presumably every regular (non-orchestrator) cluster — confirm
    OTHER = "other"


Expand Down Expand Up @@ -141,22 +144,6 @@ def prevent_none(cls, values): # noqa: N805
return values


class PeerClusterRelDataCredentials(Model):
    """Admin credentials carried by the PCluster relation."""

    # NOTE: a class with this same name is declared again further down in this file;
    # that later declaration is the one the name ends up bound to.
    admin_username: str
    admin_password: str


class PeerClusterRelData(Model):
    """Payload exchanged over the PCluster relation."""

    # NOTE: a class with this same name is declared again further down in this file;
    # that later declaration is the one the name ends up bound to.
    cluster_name: Optional[str]
    cm_nodes: List[str]
    credentials: PeerClusterRelDataCredentials
    tls_ca: str


class PeerClusterConfig(Model):
"""Model class for the multi-clusters related config set by the user."""

Expand Down Expand Up @@ -201,4 +188,68 @@ class DeploymentDescription(Model):
start: StartMode
pending_directives: List[Directive]
typ: DeploymentType
app: str
state: DeploymentState = DeploymentState(value=State.ACTIVE)
promotion_time: Optional[float]

@root_validator
def set_promotion_time(cls, values):  # noqa: N805
    """Set promotion time of a failover to a main CM.

    Args:
        values: field values gathered by pydantic for the model being validated.

    Returns:
        The same mapping, with "promotion_time" set to the current timestamp when
        "typ" is the main orchestrator.
    """
    # "typ" is missing from `values` when its own validation failed; a plain lookup
    # would raise KeyError here and mask the validation error pydantic reports.
    if values.get("typ") == DeploymentType.MAIN_ORCHESTRATOR:
        values["promotion_time"] = datetime.now().timestamp()

    return values


class PeerClusterRelDataCredentials(Model):
    """Admin credentials carried by the PCluster relation."""

    admin_username: str
    admin_password: str
    admin_password_hash: str
    # string keys mapped to optional string values
    admin_tls: Dict[str, Optional[str]]


class PeerClusterRelData(Model):
    """Payload exchanged over the PCluster relation."""

    cluster_name: str
    cm_nodes: List[Node]
    credentials: PeerClusterRelDataCredentials
    # optional: may be left unset
    deployment_desc: Optional[DeploymentDescription]


class PeerClusterRelErrorData(Model):
    """Error payload exchanged over the PCluster relation."""

    cluster_name: Optional[str]
    # flags and message describing the error
    should_sever_relation: bool
    should_wait: bool
    blocked_message: str
    deployment_desc: Optional[DeploymentDescription]


class PeerClusterOrchestrators(Model):
    """Main / failover orchestrator pair registered for the PClusters."""

    _TYPES = Literal["main", "failover"]

    # a relation id of -1 together with a `None` app is what `delete` leaves behind,
    # i.e. the representation of an empty slot
    main_rel_id: int = -1
    main_app: Optional[str]
    failover_rel_id: int = -1
    failover_app: Optional[str]

    def delete(self, typ: _TYPES) -> None:
        """Empty the slot of one orchestrator; any value but "main" empties failover."""
        if typ != "main":
            self.failover_rel_id = -1
            self.failover_app = None
            return

        self.main_rel_id = -1
        self.main_app = None

    def promote_failover(self) -> None:
        """Overwrite the main slot with the failover one, then empty the failover slot."""
        self.main_app, self.main_rel_id = self.failover_app, self.failover_rel_id
        self.delete("failover")
Loading

0 comments on commit 6e4cbee

Please sign in to comment.