Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow customising parameters related to disk cleanup #623

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions docker-compose.unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ version: '3'

services:
unit-tests:
image: lucor/bats
build: ./unit-tests
volumes:
- .:/src:ro
working_dir: /src
command: bats /src/unit-tests/
command: bats /src/unit-tests/
57 changes: 41 additions & 16 deletions packer/linux/conf/bin/bk-check-disk-space.sh
Original file line number Diff line number Diff line change
@@ -1,25 +1,50 @@
#!/bin/bash
set -euo pipefail

DISK_MIN_AVAILABLE=${DISK_MIN_AVAILABLE:-5242880} # 5GB
DISK_MIN_INODES=${DISK_MIN_INODES:-250000} # docker needs lots

DOCKER_DIR="/var/lib/docker/"

disk_avail=$(df -k --output=avail "$DOCKER_DIR" | tail -n1)

echo "Disk space free: $(df -k -h --output=avail "$DOCKER_DIR" | tail -n1 | sed -e 's/^[[:space:]]//')"

if [[ $disk_avail -lt $DISK_MIN_AVAILABLE ]]; then
echo "Not enough disk space free, cutoff is ${DISK_MIN_AVAILABLE} 🚨" >&2
exit 1
# Usage:
# bk-check-disk-space.sh (min disk required) (min inodes required)
# min disk required can be either a number of bytes, a pattern like 10G
# or 500M, or a percentage like 5%; defaults to 5G
# min inodes must be a number; defaults to 250,000

min_available=${1:-5G}
lox marked this conversation as resolved.
Show resolved Hide resolved
docker_dir="/var/lib/docker/"

# First check the disk available

disk_avail=$(df -k --output=avail "$docker_dir" | tail -n1)
disk_avail_human=$(df -k -h --output=avail "$docker_dir" | tail -n1 | tr -d '[:space:]')
disk_used_pct=$(df -k --output=pcent "$docker_dir" | tail -n1 | tr -d '[:space:]' | tr -d '%')
disk_free_pct=$((100-disk_used_pct))

printf "Disk space free: %s (%s%%)\\n" "$disk_avail_human" "$disk_free_pct"

# Check if the min_available is a percentage
if [[ $min_available =~ \%$ ]] ; then
if [[ $(echo "${disk_free_pct}<${min_available}" | sed 's/%//g' | bc) -gt 0 ]] ; then
echo "Not enough disk space free, cutoff percentage is ${min_available} 🚨" >&2
exit 1
fi
else
disk_avail_bytes="$((disk_avail*1024))"
min_available_bytes="$(/usr/local/bin/bk-parse-byte-units.sh "$min_available")"
if [[ $disk_avail_bytes -lt $min_available_bytes ]]; then
echo "Not enough disk space free, cutoff is ${min_available} 🚨" >&2
exit 1
fi
fi

inodes_avail=$(df -k --output=iavail "$DOCKER_DIR" | tail -n1)
# Next check inodes, these can be exhausted by docker build operations

inodes_min_available=${2:-250000}
inodes_avail=$(df -k --output=iavail "$docker_dir" | tail -n1 | tr -d '[:space:]')
inodes_avail_human=$(df -k -h --output=iavail "$docker_dir" | tail -n1 | tr -d '[:space:]')
inodes_used_pct=$(df -k --output=ipcent "$docker_dir" | tail -n1 | tr -d '[:space:]' | tr -d '%')
inodes_free_pct=$((100-inodes_used_pct))

echo "Inodes free: $(df -k -h --output=iavail "$DOCKER_DIR" | tail -n1 | sed -e 's/^[[:space:]]//')"
printf "Inodes free: %s (%s%%)\\n" "$inodes_avail_human" "$inodes_free_pct"

if [[ $inodes_avail -lt $DISK_MIN_INODES ]]; then
echo "Not enough inodes free, cutoff is ${DISK_MIN_INODES} 🚨" >&2
if [[ $inodes_avail -lt $inodes_min_available ]]; then
echo "Not enough inodes free, cutoff is ${inodes_min_available} 🚨" >&2
exit 1
fi
6 changes: 6 additions & 0 deletions packer/linux/conf/bin/bk-install-elastic-stack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,12 @@ export PLUGINS_ENABLED="${PLUGINS_ENABLED[*]-}"
export BUILDKITE_ECR_POLICY=${BUILDKITE_ECR_POLICY:-none}
EOF

# cron-env is sourced by crontab entries and low disk scripts.
# The values come from stack parameters, so write them shell-quoted (%q):
# an unquoted value containing whitespace or shell metacharacters would
# otherwise break (or be executed by) whichever script sources this file.
{
  printf 'export DISK_MIN_AVAILABLE=%q\n' "$DISK_MIN_AVAILABLE"
  printf 'export DOCKER_PRUNE_UNTIL=%q\n' "$DOCKER_PRUNE_UNTIL"
} > /var/lib/buildkite-agent/cron-env

if [[ "${BUILDKITE_AGENT_RELEASE}" == "edge" ]] ; then
echo "Downloading buildkite-agent edge..."
curl -Lsf -o /usr/bin/buildkite-agent-edge \
Expand Down
14 changes: 14 additions & 0 deletions packer/linux/conf/bin/bk-parse-byte-units.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Converts human-readable units like 1.43K and 120.3M to bytes
#
# Usage:
#   bk-parse-byte-units.sh <size>
#
# <size> is a number optionally followed by a unit suffix: B, K/KB, M/MB,
# G/GB or T/TB (case-insensitive, multiples of 1024). A bare number is
# treated as bytes. The result is printed to stdout as a whole number.
#
# Exits 1 with a message on stderr when <size> does not start with a digit
# or ends in a suffix that is not one of the units above.
set -euo pipefail

if [[ ! "${1:-}" =~ ^[0-9] ]] ; then
  printf "Invalid input, must start with a number: %q\n" "${1:-}" >&2
  exit 1
fi

# awk reads the leading numeric part of $1 and ignores the suffix, so each
# rule only has to pick the multiplier. "recognised" records whether any rule
# fired: without it an unknown suffix (e.g. "5X") printed nothing and still
# exited 0, which callers then compared as an empty (zero) threshold.
if ! /usr/bin/awk '
  /[0-9][bB]?$/ {printf "%u\n", $1*1; recognised=1}
  /[tT][bB]?$/  {printf "%u\n", $1*(1024*1024*1024*1024); recognised=1}
  /[gG][bB]?$/  {printf "%u\n", $1*(1024*1024*1024); recognised=1}
  /[mM][bB]?$/  {printf "%u\n", $1*(1024*1024); recognised=1}
  /[kK][bB]?$/  {printf "%u\n", $1*1024; recognised=1}
  END           {exit (recognised ? 0 : 1)}' <<< "$1" ; then
  printf "Invalid input, unrecognised unit: %q\n" "$1" >&2
  exit 1
fi
9 changes: 7 additions & 2 deletions packer/linux/conf/buildkite-agent/hooks/environment
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ source ~/cfn-env
echo "~~~ :llama: Setting up elastic stack environment ($BUILDKITE_STACK_VERSION)"
cat ~/cfn-env

if [[ -f ~/cron-env ]] ; then
# shellcheck source=/dev/null
source ~/cron-env
fi

echo "Checking docker"
if ! docker ps ; then
echo "^^^ +++"
Expand All @@ -17,13 +22,13 @@ if ! docker ps ; then
fi

echo "Checking disk space"
if ! /usr/local/bin/bk-check-disk-space.sh ; then
if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then

echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL:-4h}"
docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL:-4h}"

echo "Checking disk space again"
if ! /usr/local/bin/bk-check-disk-space.sh ; then
if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}"; then
echo "Disk health checks failed" >&2
exit 1
fi
Expand Down
12 changes: 9 additions & 3 deletions packer/linux/conf/docker/cron.hourly/docker-gc
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@ if [[ $EUID -eq 0 ]]; then
exec >> /var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log
fi

DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-4h}
# Load config from file if it exists
if [[ -f /var/lib/buildkite-agent/cron-env ]] ; then
# shellcheck source=/dev/null
source /var/lib/buildkite-agent/cron-env
else
DOCKER_PRUNE_UNTIL=4h
fi

## ------------------------------------------
## Prune stuff that doesn't affect cache hits

docker network prune --force --filter "until=${DOCKER_PRUNE_UNTIL}"
docker container prune --force --filter "until=${DOCKER_PRUNE_UNTIL}"
# Use plain expansion here. "${!VAR}" is *indirect* expansion: it would look
# up a variable whose name is the value of DOCKER_PRUNE_UNTIL (e.g. "4h"),
# which is not a valid variable name, instead of using the value itself.
# The :-4h default covers a cron-env file that leaves the variable unset/empty.
docker network prune --force --filter "until=${DOCKER_PRUNE_UNTIL:-4h}"
docker container prune --force --filter "until=${DOCKER_PRUNE_UNTIL:-4h}"
16 changes: 10 additions & 6 deletions packer/linux/conf/docker/cron.hourly/docker-low-disk-gc
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ if [[ $EUID -eq 0 ]]; then
exec >> /var/log/elastic-stack.log 2>&1 # Logs to elastic-stack.log
fi

DOCKER_PRUNE_UNTIL=${DOCKER_PRUNE_UNTIL:-1h}

mark_instance_unhealthy() {
# cancel any running buildkite builds
killall -QUIT buildkite-agent || true
Expand All @@ -19,14 +17,20 @@ mark_instance_unhealthy() {

trap mark_instance_unhealthy ERR

# Load config from file if it exists
if [[ -f /var/lib/buildkite-agent/cron-env ]] ; then
# shellcheck source=/dev/null
source /var/lib/buildkite-agent/cron-env
fi

## -----------------------------------------------------------------
## Check disk, we only want to prune images/containers if we have to

if ! /usr/local/bin/bk-check-disk-space.sh ; then
echo "Cleaning up docker resources older than ${DOCKER_PRUNE_UNTIL}"
docker image prune --all --force --filter "until=${DOCKER_PRUNE_UNTIL}"
if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then
echo "Cleaning up docker resources older than 1h"
docker image prune --all --force --filter "until=1h"

if ! /usr/local/bin/bk-check-disk-space.sh ; then
if ! /usr/local/bin/bk-check-disk-space.sh "${DISK_MIN_AVAILABLE:-}" ; then
echo "Disk health checks failed" >&2
exit 1
fi
Expand Down
12 changes: 12 additions & 0 deletions templates/aws-stack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,16 @@ Parameters:
- "false"
Default: "false"

MinimumDiskAvailableBeforeCleanup:
Type: String
Description: Either a percentage (%) or absolute unit (B, MB, GB) of disk below which disk cleanup is run
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What an absolute unit.

Default: "2GB"

DockerPruneUntil:
Type: String
Description: How far back to prune docker networks images and containers on hourly cleanup
Default: "4h"

Outputs:
VpcId:
Value:
Expand Down Expand Up @@ -857,6 +867,8 @@ Resources:
BUILDKITE_ECR_POLICY=${ECRAccessPolicy} \
BUILDKITE_TERMINATE_INSTANCE_AFTER_JOB=${BuildkiteTerminateInstanceAfterJob} \
BUILDKITE_ADDITIONAL_SUDO_PERMISSIONS=${BuildkiteAdditionalSudoPermissions} \
DISK_MIN_AVAILABLE="${MinimumDiskAvailableBeforeCleanup}" \
DOCKER_PRUNE_UNTIL="${DockerPruneUntil}" \
AWS_DEFAULT_REGION=${AWS::Region} \
SECRETS_PLUGIN_ENABLED=${EnableSecretsPlugin} \
ECR_PLUGIN_ENABLED=${EnableECRPlugin} \
Expand Down
10 changes: 10 additions & 0 deletions unit-tests/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Test-runner image: Amazon Linux 2 with bats-core v1.1.0 installed under
# /usr/local. The test files are read from /src/unit-tests/ at run time.
FROM amazonlinux:2

# git is only needed to fetch bats-core below
RUN yum install -y git

RUN git clone https://github.com/bats-core/bats-core.git \
  && cd bats-core \
  && git checkout v1.1.0 \
  && ./install.sh /usr/local

# Exec-form CMD is not processed by a shell, so a "*.bats" glob would reach
# bats as a literal file name. Pass the directory instead; bats runs every
# *.bats file it finds there.
CMD [ "bash", "/usr/local/bin/bats", "/src/unit-tests/" ]
45 changes: 0 additions & 45 deletions unit-tests/fix-buildkite-agent-builds-permissions.bats
Original file line number Diff line number Diff line change
Expand Up @@ -5,89 +5,44 @@ FIX_PERMISSIONS_SCRIPT="/src/packer/linux/conf/buildkite-agent/scripts/fix-build
@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "/" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc/" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "/abc" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc/def" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc/def/ghi" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the agent arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "/abc/" "abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the org arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "/" "abc"
[ "$status" -eq 1 ]
}

# A trailing slash in the org (second) argument must be rejected.
# The slash was previously in the first argument, which only repeated the
# agent-arg test and left this org case uncovered.
@test "Slashes in the org arg cause an exit 1" {
  run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc/" "abc"
  [ "$status" -eq 1 ]
}

@test "Slashes in the org arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "/abc" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the org arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc/def" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the org arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc/def/ghi" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the org arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "/abc/" "abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "/"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "abc/"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "/abc"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "abc/def"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "abc/def/ghi"
[ "$status" -eq 1 ]
}

@test "Slashes in the pipeline arg cause an exit 1" {
run "$FIX_PERMISSIONS_SCRIPT" "abc" "abc" "/abc/"
[ "$status" -eq 1 ]
}
Expand Down
Loading