open-vault/.github/workflows/test-ci-cleanup.yml
Josh Brand c2ae1f1654
Add automated CI account cleanup & monitoring (#18659)
This uses aws-nuke and awslimitchecker to monitor the new vault CI account to clean up and prevent resource quota exhaustion.  AWS-nuke will scan all regions of the accounts for lingering resources enos/terraform didn't clean up, and if they don't match exclusion criteria, delete them every night.  By default, we exclude corp-sec created resources, our own CI resources, and when possible, anything created within the past 72 hours. Because this account is dedicated to CI, users should not expect resources to persist beyond this without additional configuration.
2023-01-11 17:24:08 -05:00

89 lines
3.3 KiB
YAML

# Scheduled maintenance workflow for the CI AWS account: the jobs below
# remove abandoned resources and then check service quotas.
name: test-ci-cleanup
on:
  schedule:
    # Fires once a day at 02:05 (GitHub evaluates schedule crons in UTC).
    # The expression must stay quoted because `*` is a YAML indicator.
    - cron: '05 02 * * *'
jobs:
# Lists the AWS regions visible to the CI credentials and publishes them as
# the `regions` job output (a compact JSON array) for the per-region matrix
# in check-quotas.
setup:
  runs-on: ubuntu-latest
  outputs:
    regions: ${{ steps.setup.outputs.regions }}
  steps:
    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v1-node16
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID_CI }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY_CI }}
        aws-region: us-east-1
        role-to-assume: ${{ secrets.AWS_ROLE_ARN_CI }}
        role-skip-session-tagging: true
        role-duration-seconds: 3600
    - name: Get all regions
      id: setup
      shell: bash
      run: |
        # Fail this step if the AWS call fails. Without pipefail, and with the
        # pipeline buried inside an `echo "$(...)"`, an error would be
        # swallowed and an empty `regions=` output written, which only
        # surfaces later as a fromJSON failure in a downstream job.
        set -euo pipefail
        # `tr` strips newlines and spaces so the JSON array fits on the single
        # line that the GITHUB_OUTPUT `name=value` format expects.
        regions="$(aws ec2 describe-regions --region us-east-1 --output json --query 'Regions[].RegionName' | tr -d '\n ')"
        echo "regions=${regions}" >> "$GITHUB_OUTPUT"
# Runs aws-nuke inside its container image to delete resources left behind in
# the CI account, using the repository's enos/ci/aws-nuke.yml as the config
# template.
aws-nuke:
  needs: setup
  runs-on: ubuntu-latest
  container:
    # NOTE(review): no tag is given, so the registry's default tag is pulled;
    # consider pinning a version for a job that deletes resources.
    image: rebuy/aws-nuke
    options: --user root -t
    env:
      # NOTE(review): no workflow-level `env` is visible in this file, so
      # these two expressions look like they evaluate to empty strings; the
      # credentials presumably reach later steps via the
      # configure-aws-credentials step below - confirm.
      AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY }}
      # Substituted for the TIME_LIMIT placeholder in aws-nuke.yml below.
      TIME_LIMIT: "72h"
  timeout-minutes: 60
  steps:
    - name: Configure AWS credentials
      id: aws-configure
      uses: aws-actions/configure-aws-credentials@v1-node16
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID_CI }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY_CI }}
        aws-region: us-east-1
        role-to-assume: ${{ secrets.AWS_ROLE_ARN_CI }}
        role-skip-session-tagging: true
        role-duration-seconds: 3600
        # Presumably left unmasked so the aws-account-id output can be
        # written into aws-nuke.yml by the Configure step - confirm.
        mask-aws-account-id: false
    - uses: actions/checkout@v3
    - name: Configure
      # Copy the config template into the workspace and fill in its
      # ACCOUNT_NUM and TIME_LIMIT placeholders.
      run: |
        cp enos/ci/aws-nuke.yml .
        sed -i "s/ACCOUNT_NUM/${{ steps.aws-configure.outputs.aws-account-id }}/g" aws-nuke.yml
        sed -i "s/TIME_LIMIT/${TIME_LIMIT}/g" aws-nuke.yml
    # Cleanup is best-effort: deletions can fail because of dependencies
    # between resources, so `|| true` keeps this step from failing the job.
    # Actionable problems are caught by the quota check that runs afterwards.
    - name: Clean up abandoned resources
      # STDERR is redirected to a file because it is very noisy about things
      # this account has no access to.
      run: |
        aws-nuke -c aws-nuke.yml -q --no-dry-run --force 2>/tmp/aws-nuke-error.log || true
# Runs awslimitchecker once per region (the matrix comes from the `regions`
# output of the setup job) after the aws-nuke cleanup has finished.
check-quotas:
  needs: [setup, aws-nuke]
  runs-on: ubuntu-latest
  container:
    image: jantman/awslimitchecker
    env:
      # NOTE(review): these read `env.*_CI`, whereas the aws-nuke job reads
      # `env.AWS_ACCESS_KEY_ID` / `env.AWS_SECRET_ACCESS_KEY`. No
      # workflow-level `env` is visible in this file, so both forms look like
      # they evaluate to empty strings - confirm and align.
      AWS_ACCESS_KEY_ID: ${{ env.AWS_ACCESS_KEY_ID_CI }}
      AWS_SECRET_ACCESS_KEY: ${{ env.AWS_SECRET_ACCESS_KEY_CI }}
  strategy:
    # Let every region finish its check. With the default (fail-fast: true)
    # the first region that fails cancels the remaining matrix jobs and
    # hides their quota results. The job as a whole still fails if any
    # region fails.
    fail-fast: false
    matrix:
      region: ${{ fromJSON(needs.setup.outputs.regions) }}
  steps:
    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@v1-node16
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID_CI }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY_CI }}
        aws-region: us-east-1
        role-to-assume: ${{ secrets.AWS_ROLE_ARN_CI }}
        role-skip-session-tagging: true
        role-duration-seconds: 3600
    # Currently only VPC limits are checked in each region; more services
    # can be added here in the future.
    - name: Check AWS Quotas
      run: awslimitchecker -S "VPC" -r ${{matrix.region}}