enos: default undo-logs to cluster behavior (#18771)

* enos: default undo-logs to cluster behavior

* change a step dependency

* rearrange steps, wait a bit longer for undo logs
This commit is contained in:
Hamid Ghaf 2023-01-20 10:25:14 -05:00 committed by GitHub
parent 74b591c2c3
commit 4b4e0437e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 301 additions and 71 deletions

View File

@ -61,6 +61,10 @@ module "shutdown_node" {
source = "./modules/shutdown_node"
}
# Registers the module in ./modules/shutdown_multiple_nodes under the name
# scenarios reference as module.shutdown_multiple_nodes.
module "shutdown_multiple_nodes" {
  source = "./modules/shutdown_multiple_nodes"
}
module "vault_agent" {
source = "./modules/vault_agent"
@ -188,3 +192,8 @@ module "vault_verify_write_data" {
vault_install_dir = var.vault_install_dir
vault_instance_count = var.vault_instance_count
}
# Registers the module in ./modules/vault_raft_remove_peer. Only the install
# directory is fixed here; any other inputs must be supplied by the step that
# uses the module.
module "vault_raft_remove_peer" {
  source = "./modules/vault_raft_remove_peer"

  vault_install_dir = var.vault_install_dir
}

View File

@ -1,12 +1,11 @@
scenario "autopilot" {
matrix {
arch = ["amd64", "arm64"]
artifact_source = ["local", "crt", "artifactory"]
artifact_type = ["bundle", "package"]
distro = ["ubuntu", "rhel"]
edition = ["ent", "ent.fips1402", "ent.hsm", "ent.hsm.fips1402"]
seal = ["awskms", "shamir"]
undo_logs_status = ["0", "1"]
arch = ["amd64", "arm64"]
artifact_source = ["local", "crt", "artifactory"]
artifact_type = ["bundle", "package"]
distro = ["ubuntu", "rhel"]
edition = ["ent", "ent.fips1402", "ent.hsm", "ent.hsm.fips1402"]
seal = ["awskms", "shamir"]
# Packages are not offered for the oss, ent.fips1402, and ent.hsm.fips1402 editions
exclude {
@ -46,8 +45,6 @@ scenario "autopilot" {
arm64 = "t4g.small"
}
enable_undo_logs = matrix.undo_logs_status == "1" && semverconstraint(var.vault_product_version, ">=1.13.0-0") ? true : false
vault_instance_type = coalesce(var.vault_instance_type, local.vault_instance_types[matrix.arch])
vault_license_path = abspath(var.vault_license_path != null ? var.vault_license_path : joinpath(path.root, "./support/vault.hclic"))
vault_install_dir_packages = {
@ -221,28 +218,9 @@ scenario "autopilot" {
vault_unseal_when_no_init = matrix.seal == "shamir"
vault_unseal_keys = matrix.seal == "shamir" ? step.create_vault_cluster.vault_unseal_keys_hex : null
vpc_id = step.create_vpc.vpc_id
vault_environment = { "VAULT_REPLICATION_USE_UNDO_LOGS" : local.enable_undo_logs }
}
}
step "get_updated_vault_cluster_ips" {
module = module.vault_get_cluster_ips
depends_on = [
step.create_vault_cluster,
step.get_vault_cluster_ips,
step.upgrade_vault_cluster_with_autopilot
]
providers = {
enos = local.enos_provider[matrix.distro]
}
variables {
vault_instances = step.create_vault_cluster.vault_instances
vault_install_dir = local.vault_install_dir
added_vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances
vault_root_token = step.create_vault_cluster.vault_root_token
node_public_ip = step.get_vault_cluster_ips.leader_public_ip
vault_environment = {
"VAULT_LOG_LEVEL" : "debug"
}
}
}
@ -281,6 +259,47 @@ scenario "autopilot" {
}
}
# Runs the vault_verify_autopilot module against the upgraded nodes with the
# expected upgrade status set to "await-server-removal".
# NOTE(review): how the status is checked lives in the module, not here.
step "verify_autopilot_await_server_removal_state" {
  module = module.vault_verify_autopilot

  # Ordered after the autopilot upgrade step and the raft auto-join voter check.
  depends_on = [
    step.upgrade_vault_cluster_with_autopilot,
    step.verify_raft_auto_join_voter
  ]

  providers = {
    enos = local.enos_provider[matrix.distro]
  }

  variables {
    vault_install_dir = local.vault_install_dir
    vault_instances   = step.upgrade_vault_cluster_with_autopilot.vault_instances
    vault_root_token  = step.create_vault_cluster.vault_root_token

    # Locally built artifacts take their version from the local metadata step;
    # every other artifact source uses the configured product version.
    vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? step.get_local_metadata.version : var.vault_product_version
    vault_autopilot_upgrade_status  = "await-server-removal"
  }
}
# Re-runs the vault_get_cluster_ips module after the autopilot upgrade, passing
# both the original instances and the instances added by the upgrade step.
step "get_updated_vault_cluster_ips" {
  module = module.vault_get_cluster_ips

  depends_on = [
    step.create_vault_cluster,
    step.get_vault_cluster_ips,
    step.upgrade_vault_cluster_with_autopilot
  ]

  providers = {
    enos = local.enos_provider[matrix.distro]
  }

  variables {
    vault_install_dir     = local.vault_install_dir
    vault_root_token      = step.create_vault_cluster.vault_root_token
    vault_instances       = step.create_vault_cluster.vault_instances
    added_vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances

    # The leader address found by the earlier get_vault_cluster_ips step.
    node_public_ip = step.get_vault_cluster_ips.leader_public_ip
  }
}
step "verify_read_test_data" {
module = module.vault_verify_read_data
depends_on = [
@ -301,32 +320,12 @@ scenario "autopilot" {
}
}
step "verify_autopilot_upgraded_vault_cluster" {
module = module.vault_verify_autopilot
step "raft_remove_peers" {
module = module.vault_raft_remove_peer
depends_on = [
step.get_updated_vault_cluster_ips,
step.upgrade_vault_cluster_with_autopilot,
step.verify_raft_auto_join_voter
]
providers = {
enos = local.enos_provider[matrix.distro]
}
variables {
vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? step.get_local_metadata.version : var.vault_product_version
vault_autopilot_upgrade_status = "await-server-removal"
vault_install_dir = local.vault_install_dir
vault_instances = step.create_vault_cluster.vault_instances
vault_root_token = step.create_vault_cluster.vault_root_token
}
}
step "verify_undo_logs_status" {
skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0")
module = module.vault_verify_undo_logs
depends_on = [
step.upgrade_vault_cluster_with_autopilot,
step.verify_autopilot_upgraded_vault_cluster
step.verify_autopilot_await_server_removal_state
]
providers = {
@ -335,12 +334,71 @@ scenario "autopilot" {
variables {
vault_install_dir = local.vault_install_dir
vault_undo_logs_status = matrix.undo_logs_status
vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances
operator_instance = step.get_updated_vault_cluster_ips.leader_public_ip
remove_vault_instances = step.create_vault_cluster.vault_instances
vault_instance_count = 3
vault_root_token = step.create_vault_cluster.vault_root_token
}
}
# Hands the instances created by create_vault_cluster to the
# shutdown_multiple_nodes module once the raft_remove_peers step has run.
step "remove_old_nodes" {
  module = module.shutdown_multiple_nodes

  depends_on = [
    step.create_vault_cluster,
    step.raft_remove_peers
  ]

  providers = {
    enos = local.enos_provider[matrix.distro]
  }

  variables {
    # Hard-coded count of instances to process from old_vault_instances.
    vault_instance_count = 3
    old_vault_instances  = step.create_vault_cluster.vault_instances
  }
}
# Runs the vault_verify_autopilot module against the upgraded nodes with the
# expected upgrade status set to "idle", after the old nodes have been handled
# by remove_old_nodes.
step "verify_autopilot_idle_state" {
  module = module.vault_verify_autopilot

  depends_on = [
    step.upgrade_vault_cluster_with_autopilot,
    step.verify_raft_auto_join_voter,
    step.remove_old_nodes
  ]

  providers = {
    enos = local.enos_provider[matrix.distro]
  }

  variables {
    vault_install_dir = local.vault_install_dir
    vault_instances   = step.upgrade_vault_cluster_with_autopilot.vault_instances
    vault_root_token  = step.create_vault_cluster.vault_root_token

    # Locally built artifacts take their version from the local metadata step;
    # every other artifact source uses the configured product version.
    vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? step.get_local_metadata.version : var.vault_product_version
    vault_autopilot_upgrade_status  = "idle"
  }
}
# Runs the vault_verify_undo_logs module against the upgraded nodes. The step
# is ordered after old-node removal and the autopilot "idle" verification.
step "verify_undo_logs_status" {
  module = module.vault_verify_undo_logs

  # Skipped when the configured product version is below 1.13.0.
  skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0")

  depends_on = [
    step.remove_old_nodes,
    step.upgrade_vault_cluster_with_autopilot,
    step.verify_autopilot_idle_state
  ]

  providers = {
    enos = local.enos_provider[matrix.distro]
  }

  variables {
    vault_install_dir = local.vault_install_dir
    vault_root_token  = step.create_vault_cluster.vault_root_token
    vault_instances   = step.upgrade_vault_cluster_with_autopilot.vault_instances
  }
}
output "vault_cluster_instance_ids" {
description = "The Vault cluster instance IDs"
value = step.create_vault_cluster.instance_ids

View File

@ -165,3 +165,17 @@ variable "vault_upgrade_initial_release" {
version = "1.10.4"
}
}
# Address of the node that raft operator commands are issued from.
# NOTE(review): an identical declaration exists inside the
# vault_raft_remove_peer module; confirm it is also needed at this level.
variable "operator_instance" {
  description = "The ip address of the operator (Voter) node"
  type        = string
}
# Map of instances to remove; each entry carries both of the instance's
# addresses.
# NOTE(review): an identical declaration exists inside the
# vault_raft_remove_peer module; confirm it is also needed at this level.
variable "remove_vault_instances" {
  description = "The old vault nodes to be removed"
  type = map(object({
    private_ip = string
    public_ip  = string
  }))
}

View File

@ -0,0 +1,40 @@
# Module: shuts down a fixed number of Vault instances over SSH.

terraform {
  required_providers {
    enos = {
      source = "app.terraform.io/hashicorp-qti/enos"
    }
  }
}

variable "vault_instance_count" {
  description = "How many vault instances are in the cluster"
  type        = number
}

variable "old_vault_instances" {
  description = "The vault cluster instances to be shutdown"
  type = map(object({
    private_ip = string
    public_ip  = string
  }))
}

locals {
  # Re-key the first vault_instance_count entries of old_vault_instances by
  # position (0, 1, ...) so they can drive for_each below. Despite the name,
  # each entry keeps both the public and the private address.
  public_ips = {
    for idx in range(var.vault_instance_count) : idx => {
      public_ip  = values(var.old_vault_instances)[idx].public_ip
      private_ip = values(var.old_vault_instances)[idx].private_ip
    }
  }
}

# One remote execution per selected instance, connecting to its public IP.
resource "enos_remote_exec" "shutdown_multiple_nodes" {
  for_each = local.public_ips

  # Halt (-H) without a wall broadcast; the trailing `exit 0` makes the remote
  # command report success regardless of what `shutdown` itself returned.
  inline = ["sudo shutdown -H --no-wall; exit 0"]

  transport = {
    ssh = {
      host = each.value.public_ip
    }
  }
}

View File

@ -0,0 +1,71 @@
# Module: removes a fixed number of nodes from the raft cluster by running the
# raft-remove-peer.sh template on a single operator node, once per node.

terraform {
  required_providers {
    enos = {
      source = "app.terraform.io/hashicorp-qti/enos"
    }
  }
}

variable "vault_cluster_addr_port" {
  description = "The Raft cluster address port"
  type        = string
  default     = "8201"
}

variable "vault_install_dir" {
  description = "The directory where the Vault binary will be installed"
  type        = string
}

variable "vault_instance_count" {
  description = "How many vault instances are in the cluster"
  type        = number
}

variable "operator_instance" {
  description = "The ip address of the operator (Voter) node"
  type        = string
}

variable "remove_vault_instances" {
  description = "The old vault nodes to be removed"
  type = map(object({
    private_ip = string
    public_ip  = string
  }))
}

variable "vault_root_token" {
  description = "The vault root token"
  type        = string
}

locals {
  # Re-key the first vault_instance_count entries of remove_vault_instances by
  # position (0, 1, ...) so they can drive for_each below.
  instances = {
    for idx in range(var.vault_instance_count) : idx => {
      public_ip  = values(var.remove_vault_instances)[idx].public_ip
      private_ip = values(var.remove_vault_instances)[idx].private_ip
    }
  }
}

# One script execution per node to remove. Every execution connects to the
# same operator host; only the rendered peer address differs.
resource "enos_remote_exec" "vault_raft_remove_peer" {
  for_each = local.instances

  environment = {
    VAULT_ADDR  = "http://localhost:8200"
    VAULT_TOKEN = var.vault_root_token
  }

  # The peer is identified to the script by its private IP plus the raft
  # cluster port.
  # NOTE(review): confirm whether the template still needs
  # vault_local_binary_path in addition to vault_install_dir.
  content = templatefile("${path.module}/templates/raft-remove-peer.sh", {
    remove_vault_cluster_addr = "${each.value.private_ip}:${var.vault_cluster_addr_port}"
    vault_install_dir         = var.vault_install_dir
    vault_local_binary_path   = "${var.vault_install_dir}/vault"
  })

  transport = {
    ssh = {
      host = var.operator_instance
    }
  }
}

View File

@ -0,0 +1,46 @@
#!/usr/bin/env bash

# This file is rendered by Terraform's templatefile() before it runs:
# vault_install_dir and remove_vault_cluster_addr below are template
# variables substituted at render time, not shell variables.
set -e

# Quote the rendered values so that a path containing whitespace stays a single
# word instead of being parsed as an assignment followed by a command.
binpath="${vault_install_dir}/vault"
node_addr="${remove_vault_cluster_addr}"
# Print an error message to stderr and return a non-zero status.
#   $1 - the message to print
# Returns 1 (it does not exit), so the caller decides whether to abort.
fail() {
  # 1>&2 routes the message to stderr; the previous 2>&1 left it on stdout.
  echo "$1" 1>&2
  return 1
}
# Run a command until it succeeds, with exponential backoff between attempts.
#   $1   - maximum number of attempts
#   $2.. - the command (and its arguments) to run
# Returns 0 as soon as the command succeeds, otherwise the exit status of the
# last failed attempt.
retry() {
  local max_attempts=$1
  shift
  local attempt=0

  until "$@"; do
    # Capture the failing status first, before any other command overwrites $?.
    exit=$?
    # Backoff doubles on each failure: 1s, 2s, 4s, ...
    wait=$((2 ** attempt))
    attempt=$((attempt + 1))

    # Out of attempts: surface the command's own exit status.
    if ! [ "$attempt" -lt "$max_attempts" ]; then
      return "$exit"
    fi

    sleep "$wait"
    echo "retry $attempt"
  done

  return 0
}
# Look up the raft node id of the non-voter peer whose address is $node_addr
# and remove that peer from the raft cluster.
# Reads the globals binpath and node_addr and reports errors through fail().
# Returns non-zero if the peer list cannot be read, if no matching non-voter
# peer exists, or if the removal itself fails, so a caller can retry.
remove_peer() {
  local peers node_id

  # Run vault on its own rather than at the head of a pipe, so that its exit
  # status is actually checked.
  if ! peers=$("$binpath" operator raft list-peers -format json); then
    fail "failed to list raft peers"
    return 1
  fi

  # Pass the address to jq as data (--arg) instead of splicing it into the
  # filter through shell quoting.
  if ! node_id=$(jq -Mr --arg addr "$node_addr" \
    '.data.config.servers[] | select(.address == $addr) | select(.voter == false) | .node_id' \
    <<< "$peers"); then
    fail "failed to get node id of a non-voter node"
    return 1
  fi

  # jq exits 0 when nothing matches; do not call remove-peer with an empty id.
  if [ -z "$node_id" ]; then
    fail "no non-voter node found with address $node_addr"
    return 1
  fi

  "$binpath" operator raft remove-peer "$node_id"
}
# Abort early when the vault binary is missing or not executable.
if ! [ -x "$binpath" ]; then
  fail "unable to locate vault binary at $binpath"
fi

# The peer may not be removable right after an autopilot upgrade, so allow
# several attempts while the cluster settles.
retry 5 remove_peer

View File

@ -29,12 +29,6 @@ variable "vault_root_token" {
description = "The vault root token"
}
variable "vault_undo_logs_status" {
type = string
description = "An integer either 0 or 1 which indicates whether undo_logs are disabled or enabled"
default = null
}
locals {
public_ips = {
for idx in range(var.vault_instance_count) : idx => {
@ -48,9 +42,8 @@ resource "enos_remote_exec" "smoke-verify-undo-logs" {
for_each = local.public_ips
environment = {
VAULT_TOKEN = var.vault_root_token
VAULT_ADDR = "http://localhost:8200"
VAULT_UNDO_LOGS_STATUS = var.vault_undo_logs_status
VAULT_TOKEN = var.vault_root_token
VAULT_ADDR = "http://localhost:8200"
}
scripts = [abspath("${path.module}/scripts/smoke-verify-undo-logs.sh")]

View File

@ -1,19 +1,18 @@
#!/bin/bash
undo_logs_status="${VAULT_UNDO_LOGS_STATUS}"
function fail() {
echo "$1" 1>&2
exit 1
}
count=0
retries=7
retries=20
while :; do
state=$(curl --header "X-Vault-Token: $VAULT_TOKEN" "$VAULT_ADDR/v1/sys/metrics" | jq -r '.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")')
leader_address=$(curl -H "X-Vault-Request: true" -H "X-Vault-Token: $VAULT_TOKEN" "$VAULT_ADDR/v1/sys/leader" | jq '.leader_address' | sed 's/\"//g')
state=$(curl --header "X-Vault-Token: $VAULT_TOKEN" "$leader_address/v1/sys/metrics" | jq -r '.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")')
target_undo_logs_status="$(jq -r '.Value' <<< "$state")"
if [ "$undo_logs_status" = "$target_undo_logs_status" ]; then
if [ "$target_undo_logs_status" == "1" ]; then
exit 0
fi