Fix for non-nitro AWS ephemeral disks. Refactor for multiple clouds. (sky-uk#87)

+ Update blockdevmap.py to fix a problem with non-nitro AWS ephemeral disks.
+ Refactor for multiple clouds:
  + Remove inline aws and gcp code from generic yml files (e.g. `clean_vms.yml` contains blocks for `when: cluster_vars.type == "aws"` and `when: cluster_vars.type == "gcp"`). Replace with cloud-specific files, e.g. `clean/tasks/aws.yml` and `clean/tasks/gcp.yml` (see the sketch after this list).
  + Allows others to more easily add new cloud technologies (e.g. Azure or VMware) without changing existing cloud files.
  + These changes don't introduce any functional differences, just a reorganisation.
+ Fix: test for `clean is defined`.
+ Reuse the `get_cluster_hosts_state_{{cluster_vars.type}}.yml` code to build the dynamic inventory.
+ Add disk info to `cluster_hosts_state` to reduce code duplication and to save on cloud calls during rollback with `_scheme_rmvm_keepdisk_rollback`.
+ Add automatic checking of correct device mapping to `disks_auto_aws_gcp.yml`.
+ Add `dynamic_inventory` as a dependency of redeployment.
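
For illustration, a minimal sketch of the per-cloud dispatch pattern described above: a generic task file that simply hands off to a file named after `cluster_vars.type`. The wrapper task below is an assumed example, not code taken from this commit; only the `clean/tasks/aws.yml` / `clean/tasks/gcp.yml` filenames come from the bullet points.

```yaml
# clean/tasks/main.yml (illustrative sketch only)
# Instead of one generic file containing 'when: cluster_vars.type == "aws"' / '== "gcp"' blocks,
# delegate to a cloud-specific task file, e.g. clean/tasks/aws.yml or clean/tasks/gcp.yml.
- name: clean | Include the cloud-specific cleanup tasks
  include_tasks: "{{ cluster_vars.type }}.yml"
```

With this layout, supporting a new cloud (e.g. Azure) means adding a new `<cloud>.yml` task file rather than editing the shared playbooks.
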
dseeley-sky authored Apr 15, 2021
1 parent 18c48f6 commit ec23be6
Showing 61 changed files with 863 additions and 749 deletions.
24 changes: 12 additions & 12 deletions EXAMPLE/README.md
@@ -17,17 +17,17 @@ The `cluster.yml` sub-role immutably deploys a cluster from the config defined a

### AWS:
```
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e cloud_type=aws -e region=eu-west-1 -e clusterid=test [email protected]
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e cloud_type=aws -e region=eu-west-1 -e clusterid=test [email protected] --tags=clusterverse_clean -e clean=_all_
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected]
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] --tags=clusterverse_clean -e clean=_all_
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=testid -e cloud_type=aws -e region=eu-west-1 [email protected]
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=testid -e cloud_type=aws -e region=eu-west-1 [email protected] --tags=clusterverse_clean -e clean=_all_
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected]
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] --tags=clusterverse_clean -e clean=_all_
```
### GCP:
```
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test -e cloud_type=gcp -e region=europe-west1 [email protected]
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test -e cloud_type=gcp -e region=europe-west1 [email protected] --tags=clusterverse_clean -e clean=_all_
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test_gcp_euw1 [email protected]
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> cluster.yml -e buildenv=sandbox -e clusterid=test_gcp_euw1 [email protected] --tags=clusterverse_clean -e clean=_all_
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=testid -e cloud_type=gcp -e region=europe-west1 [email protected]
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=testid -e cloud_type=gcp -e region=europe-west1 [email protected] --tags=clusterverse_clean -e clean=_all_
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=test_gcp_euw1 [email protected]
+ansible-playbook cluster.yml -e buildenv=sandbox -e clusterid=test_gcp_euw1 [email protected] --tags=clusterverse_clean -e clean=_all_
```

### Mandatory command-line variables:
@@ -62,13 +62,13 @@ The `redeploy.yml` sub-role will completely redeploy the cluster; this is useful

### AWS:
```
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> redeploy.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] -e canary=none
-ansible-playbook -u ubuntu --private-key=/home/<user>/.ssh/<rsa key> redeploy.yml -e buildenv=sandbox -e cloud_type=aws -e region=eu-west-1 -e clusterid=test [email protected] -e canary=none
+ansible-playbook redeploy.yml -e buildenv=sandbox -e cloud_type=aws -e region=eu-west-1 -e clusterid=test [email protected] -e canary=none
+ansible-playbook redeploy.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] -e canary=none
```
### GCP:
```
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> redeploy.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] -e canary=none
-ansible-playbook -u <username> --private-key=/home/<user>/.ssh/<rsa key> redeploy.yml -e buildenv=sandbox -e clusterid=test -e cloud_type=gcp -e region=europe-west1 [email protected] -e canary=none
+ansible-playbook redeploy.yml -e buildenv=sandbox -e clusterid=test -e cloud_type=gcp -e region=europe-west1 [email protected] -e canary=none
+ansible-playbook redeploy.yml -e buildenv=sandbox -e clusterid=test_aws_euw1 [email protected] -e canary=none
```

### Mandatory command-line variables:
@@ -18,7 +18,6 @@ beats_config:
cluster_vars:
dns_nameserver_zone: &dns_nameserver_zone "" # The zone that dns_server will operate on. gcloud dns needs a trailing '.'. Leave blank if no external DNS (use IPs only)
dns_user_domain: "{%- if _dns_nameserver_zone -%}{{cloud_type}}-{{region}}.{{app_class}}.{{buildenv}}.{{_dns_nameserver_zone}}{%- endif -%}" # A user-defined _domain_ part of the FDQN, (if more prefixes are required before the dns_nameserver_zone)
dns_server: "" # Specify DNS server. nsupdate, route53 or clouddns. If empty string is specified, no DNS will be added.
instance_profile_name: ""
custom_tagslabels:
inv_resident_id: "myresident"
@@ -1,5 +1,5 @@
---

cluster_vars:
image: "ami-04ffbabc7935ec0e9" # eu-west-1, ubuntu, 20.04, amd64, hvm-ssd, 20210108. Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
image: "ami-0dd0f5f97a21a8fe9" # eu-west-1, ubuntu, 20.04, amd64, hvm-ssd, 20210315. Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
# image: "ami-0b850cf02cc00fdc8" # eu-west-1, CentOS7
@@ -50,6 +50,13 @@ cluster_vars:
version: "{{sysdisks_version | default('')}}"
vms_by_az: { a: 1, b: 1, c: 0 }

+hostnvme-notnitro:
+  auto_volumes:
+    - { device_name: "/dev/sdb", mountpoint: "/media/mysvc", fstype: "ext4", volume_type: "ephemeral", ephemeral: ephemeral0 }
+  flavor: i3.large
+  version: "{{sys_version | default('')}}"
+  vms_by_az: { a: 1, b: 1, c: 0 }

hostnvme-multi:
auto_volumes:
- { device_name: "/dev/sdb", mountpoint: "/media/mysvc", fstype: "ext4", volume_type: "ephemeral", ephemeral: ephemeral0 }
5 changes: 3 additions & 2 deletions EXAMPLE/cluster_defs/cluster_vars.yml
@@ -8,9 +8,10 @@ redeploy_schemes_supported: ['_scheme_addallnew_rmdisk_rollback', '_scheme_addne
#redeploy_scheme: _scheme_rmvm_keepdisk_rollback

skip_dynamic_inventory_sshwait: true
+test_touch_disks: true

-app_name: "{{lookup('pipe', 'whoami')}}-test"          # The name of the application cluster (e.g. 'couchbase', 'nginx'); becomes part of cluster_name. Provided is a default to ensure no accidental overwriting.
-app_class: "test"                                      # The class of application (e.g. 'database', 'webserver'); becomes part of the fqdn
+app_name: "{{lookup('pipe', 'whoami') | lower}}-test"  # The name of the application cluster (e.g. 'couchbase', 'nginx'); becomes part of cluster_name. Provided is a default to ensure no accidental overwriting.
+app_class: "test"                                      # The class of application (e.g. 'database', 'webserver'); becomes part of the fqdn

beats_config:
filebeat:
2 changes: 1 addition & 1 deletion EXAMPLE/cluster_defs/gcp/cluster_vars__cloud.yml
@@ -1,7 +1,7 @@
---

cluster_vars:
image: "projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20210112" # Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
image: "projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20210315" # Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
# image: "projects/ubuntu-os-cloud/global/images/centos-7-v20201216
dns_cloud_internal_domain: "c.{{ (_gcp_service_account_rawtext | string | from_json).project_id }}.internal" # The cloud-internal zone as defined by the cloud provider (e.g. GCP, AWS)
dns_server: "clouddns" # Specify DNS server. nsupdate, route53 or clouddns. If empty string is specified, no DNS will be added.
2 changes: 1 addition & 1 deletion EXAMPLE/cluster_defs/test_aws_euw1/cluster_vars.yml
@@ -50,7 +50,7 @@ cluster_name: "{{app_name}}-{{buildenv}}" # Identifies the cluster within

cluster_vars:
type: &cloud_type "aws"
image: "ami-04ffbabc7935ec0e9" # eu-west-1, ubuntu, 20.04, amd64, hvm-ssd, 20210108. Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
image: "ami-0dd0f5f97a21a8fe9" # eu-west-1, ubuntu, 20.04, amd64, hvm-ssd, 20210315. Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
# image: "ami-0b850cf02cc00fdc8" # eu-west-1, CentOS7
region: &region "eu-west-1"
dns_cloud_internal_domain: "{{_region}}.compute.internal" # The cloud-internal zone as defined by the cloud provider (e.g. GCP, AWS)
2 changes: 1 addition & 1 deletion EXAMPLE/cluster_defs/test_gcp_euw1/cluster_vars.yml
@@ -50,7 +50,7 @@ cluster_name: "{{app_name}}-{{buildenv}}" # Identifies the cluster within

cluster_vars:
type: &cloud_type "gcp"
image: "projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20210112" # Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
image: "projects/ubuntu-os-cloud/global/images/ubuntu-2004-focal-v20210315" # Ubuntu images can be located at https://cloud-images.ubuntu.com/locator/
# image: "projects/ubuntu-os-cloud/global/images/centos-7-v20201216
region: &region "europe-west1"
dns_cloud_internal_domain: "c.{{ (_gcp_service_account_rawtext | string | from_json).project_id }}.internal" # The cloud-internal zone as defined by the cloud provider (e.g. GCP, AWS)
6 changes: 3 additions & 3 deletions EXAMPLE/clusterverse_label_upgrade_v1-v2.yml
@@ -5,12 +5,12 @@
connection: local
gather_facts: true
tasks:
-    - import_role:
+    - include_role:
        name: 'clusterverse/_dependencies'

-    - import_role:
+    - include_role:
        name: 'clusterverse/cluster_hosts'
-        tasks_from: get_cluster_hosts_state.yml
+        tasks_from: "get_cluster_hosts_state_{{cluster_vars.type}}.yml"

- block:
- name: clusterverse_label_upgrade_v1-v2 | Add lifecycle_state and cluster_suffix label to AWS EC2 VM
21 changes: 10 additions & 11 deletions README.md
@@ -3,7 +3,7 @@ A full-lifecycle, immutable cloud infrastructure cluster management **role**, us
+ **Multi-cloud:** clusterverse can manage cluster lifecycle in AWS and GCP
+ **Deploy:** You define your infrastructure as code (in Ansible yaml), and clusterverse will deploy it
+ **Scale-up:** If you change the cluster definitions and rerun the deploy, new nodes will be added.
-+ **Redeploy (e.g. up-version):** If you need to up-version, the `redeploy.yml` playbook will replace each node in turn, (with optional callbacks), and rollback if any failures occur.
++ **Redeploy (e.g. up-version):** If you need to up-version, or replace the underlying OS, (i.e. to achieve fully immutable, zero-patching redeploys), the `redeploy.yml` playbook will replace each node in the cluster (via various redeploy schemes), and rollback if any failures occur.

**clusterverse** is designed to manage base-vm infrastructure that underpins cluster-based infrastructure, for example, Couchbase, Kafka, Elasticsearch, or Cassandra.

@@ -22,19 +22,19 @@ To active the pipenv:

### AWS
+ AWS account with IAM rights to create EC2 VMs and security groups in the chosen VPCs/subnets. Place the credentials in:
-+ `cluster_vars/<buildenv>/aws_access_key:`
-+ `cluster_vars/<buildenv>/aws_secret_key:`
++ `cluster_vars[buildenv].aws_access_key:`
++ `cluster_vars[buildenv].aws_secret_key:`
+ Preexisting VPCs:
-+ `cluster_vars/<buildenv>/vpc_name: my-vpc-{{buildenv}}`
++ `cluster_vars[buildenv].vpc_name: my-vpc-{{buildenv}}`
+ Preexisting subnets. This is a prefix - the cloud availability zone will be appended to the end (e.g. `a`, `b`, `c`).
-+ `cluster_vars/<buildenv>/vpc_subnet_name_prefix: my-subnet-{{region}}`
++ `cluster_vars[buildenv].vpc_subnet_name_prefix: my-subnet-{{region}}`
+ Preexisting keys (in AWS IAM):
-+ `cluster_vars/<buildenv>/key_name: my_key__id_rsa`
++ `cluster_vars[buildenv].key_name: my_key__id_rsa`

### GCP
+ Create a gcloud account.
+ Create a service account in `IAM & Admin` / `Service Accounts`. Download the json file locally.
-+ Store the contents within the `cluster_vars/gcp_service_account_rawtext` variable.
++ Store the contents within the `cluster_vars[buildenv].gcp_service_account_rawtext` variable.
+ During execution, the json file will be copied locally because the Ansible GCP modules often require the file as input.
+ Google Cloud SDK needs to be installed to run gcloud command-line (e.g. to disable delete protection) - this is handled by `pipenv install`

@@ -183,14 +183,13 @@ The role is designed to run in two modes:
+ The `redeploy.yml` sub-role will completely redeploy the cluster; this is useful for example to upgrade the underlying operating system version.
+ It supports `canary` deploys. The `canary` extra variable must be defined on the command line set to one of: `start`, `finish`, `none` or `tidy`.
+ It contains callback hooks:
-+ `mainclusteryml`: This is the name of the deployment playbook. It is called to rollback a failed deployment. It should be set to the value of the primary _deploy_ playbook yml (e.g. `cluster.yml`)
++ `mainclusteryml`: This is the name of the deployment playbook. It is called to deploy nodes for the new cluster, or to rollback a failed deployment. It should be set to the value of the primary _deploy_ playbook yml (e.g. `cluster.yml`)
+ `predeleterole`: This is the name of a role that should be called prior to deleting VMs; it is used for example to eject nodes from a Couchbase cluster. It takes a list of `hosts_to_remove` VMs.
+ It supports pluggable redeployment schemes. The following are provided:
+ **_scheme_rmvm_rmdisk_only**
+ This is a very basic rolling redeployment of the cluster.
+ Canary **is not** supported.
-+ _Supports redploying to bigger, but not smaller clusters_
-+ **It assumes a resilient deployment (it can tolerate one node being deleted from the cluster). There is no rollback in case of failure.**
++ **It assumes a resilient deployment (it can tolerate one node being deleted from the cluster). There is _no rollback_ in case of failure.**
+ For each node in the cluster:
+ Run `predeleterole`
+ Delete/ terminate the node (note, this is _irreversible_).
@@ -217,7 +216,7 @@ The role is designed to run in two modes:
+ To delete the old VMs, either set '-e canary_tidy_on_success=true', or call redeploy.yml with '-e canary=tidy'
+ **_scheme_rmvm_keepdisk_rollback**
+ Redeploys the nodes one by one, and moves the secondary (non-root) disks from the old to the new (note, only non-ephemeral disks can be moved).
-+ _Cluster topology must remain identical. More disks may be added, but none may change or be removed._
++ _Cluster node topology must remain identical. More disks may be added, but none may change or be removed._
+ **It assumes a resilient deployment (it can tolerate one node being removed from the cluster).**
+ For each node in the cluster:
+ Run `predeleterole`
22 changes: 14 additions & 8 deletions _dependencies/filter_plugins/custom.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python

from ansible.utils.display import Display
+from ansible import constants as C
+from ansible.module_utils._text import to_native, to_text

display = Display()
# display.v(u"json_loads_loose - input type: %s" % type(inStr))
@@ -35,24 +37,28 @@ def iplookup(fqdn):
         return fqdn
     else:
         import dns.resolver
-        return str(dns.resolver.query(fqdn, 'A')[0])
+        return to_text(dns.resolver.query(fqdn, 'A')[0])

 # Returns a json object from a loosely defined string (e.g. encoded using single quotes instead of double), or an object containing "AnsibleUnsafeText"
 def json_loads_loose(inStr):
-    import re, json
+    import re, json, sys

-    display.vvv(u"json_loads_loose - input type: %s" % type(inStr))
+    display.vv(u"json_loads_loose - input type: %s; value %s" % (type(inStr), inStr))
     if type(inStr) is dict or type(inStr) is list:
-        json_object = json.loads((str(json.dumps(inStr))).encode('utf-8'))
+        json_object = json.loads((to_text(json.dumps(inStr))).encode('utf-8'))
     else:
         try:
             json_object = json.loads(inStr)
-        except (ValueError, AttributeError) as e:
+        except (ValueError, AttributeError, TypeError) as e:
             try:
-                json_object = json.loads(str(re.sub(r'\'(.*?)\'([,:}])', r'"\1"\2', inStr).replace(': True', ': "True"').replace(': False', ': "False"')).encode('utf-8'))
-            except (ValueError, AttributeError) as e:
-                display.v(u"json_loads_loose - WARNING: could not parse attribute string as json: %s" % inStr)
+                json_object = json.loads(to_text(re.sub(r'\'(.*?)\'([,:}])', r'"\1"\2', inStr).replace(': True', ': "True"').replace(': False', ': "False"')).encode('utf-8'))
+            except (ValueError, AttributeError, TypeError) as e:
+                display.warning(u"json_loads_loose - WARNING: could not parse attribute string (%s) as json: %s" % (to_native(inStr), to_native(e)))
+                return inStr
+            except:
+                e = sys.exc_info()[0]
+                display.warning(u"json_loads_loose - WARNING: could not parse attribute string (%s) as json: %s" % (to_native(inStr), to_native(e)))
                 return inStr
     return json_object


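To show what the loose parsing above tolerates, a hedged usage sketch (it assumes the plugin's `FilterModule` registration, not shown in this hunk, exposes `json_loads_loose` as a Jinja2 filter under that name):

```yaml
# Illustrative only: a single-quoted, not-quite-JSON string coerced into a dict.
- name: Parse a loosely-quoted string into a dict
  debug:
    msg: "{{ loose_str | json_loads_loose }}"
  vars:
    loose_str: "{'device_name': '/dev/sdb', 'volume_type': 'ephemeral'}"
```

The first `json.loads` attempt fails on the single quotes, so the filter falls back to the regex substitution and returns a proper dict.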
