Skip to content

Commit

Permalink
Merge pull request #171 from asmorodskyi/volumes
Browse files Browse the repository at this point in the history
Redo volumes,disks,images cleanup
  • Loading branch information
asmorodskyi authored Dec 6, 2022
2 parents b0dafb1 + db13069 commit 4bd675f
Show file tree
Hide file tree
Showing 15 changed files with 385 additions and 825 deletions.
1 change: 1 addition & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
omit =
*/lib/python*
*/migrations/*
*/tests/*
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ test:
flake8 webui
flake8 ocw
flake8 manage.py
pytest --cov=./
pytest --cov

.PHONY: codecov
codecov:
pytest -v --cov=./ --cov-report=html && xdg-open htmlcov/index.html
pytest -v --cov --cov-report=html && xdg-open htmlcov/index.html

# Build containers
docker-container:
Expand Down
126 changes: 24 additions & 102 deletions ocw/lib/EC2.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig, ConfigFile
from dateutil.parser import parse
import boto3
from botocore.exceptions import ClientError
import re
from datetime import date, datetime, timedelta, timezone
from ocw.lib.emailnotify import send_mail
import traceback
Expand Down Expand Up @@ -89,47 +88,35 @@ def all_clusters(self):
return clusters

@staticmethod
def needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days) -> bool:
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_snapshot_age_days)
if datetime.date(snapshot['StartTime']) < delete_older_than:
regexes = [
re.compile(r'''^OpenQA upload image$'''),
re.compile(r'''^Created by CreateImage\([\w-]+\) for ami-\w+ from vol-\w+$''')
]
for regex in regexes:
m = re.match(regex, snapshot['Description'].strip())
if m:
return True
return False
def is_outdated(creation_time, valid_period_days) -> bool:
    """Return True when creation_time lies more than valid_period_days days in the past."""
    cutoff_date = date.today() - timedelta(days=valid_period_days)
    return creation_time.date() < cutoff_date

def cleanup_snapshots(self, cleanup_ec2_max_snapshot_age_days):
def cleanup_snapshots(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_snapshots(OwnerIds=['self'])
response['Snapshots'].sort(key=lambda snapshot: snapshot['StartTime'].timestamp())
for snapshot in response['Snapshots']:
if EC2.needs_to_delete_snapshot(snapshot, cleanup_ec2_max_snapshot_age_days):
self.log_info("Deleting snapshot {} in region {} with StartTime={}", snapshot['SnapshotId'],
region, snapshot['StartTime'])
if EC2.is_outdated(snapshot['StartTime'], valid_period_days):
try:
if self.dry_run:
self.log_info("Snapshot deletion of {} skipped due to dry run mode",
snapshot['SnapshotId'])
else:
self.log_info("Deleting snapshot {} in region {} with StartTime={}",
snapshot['SnapshotId'], region, snapshot['StartTime'])
self.ec2_client(region).delete_snapshot(SnapshotId=snapshot['SnapshotId'])
except ClientError as ex:
if ex.response['Error']['Code'] == 'InvalidSnapshot.InUse':
self.log_info(ex.response['Error']['Message'])
else:
raise ex

def cleanup_volumes(self, cleanup_ec2_max_volumes_age_days):
delete_older_than = date.today() - timedelta(days=cleanup_ec2_max_volumes_age_days)
def cleanup_volumes(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_volumes()
for volume in response['Volumes']:
if datetime.date(volume['CreateTime']) < delete_older_than:
if EC2.is_outdated(volume['CreateTime'], valid_period_days):
if self.volume_protected(volume):
self.log_info('Volume {} has tag DO_NOT_DELETE so protected from deletion',
self.log_info('Volume {} has tag pcw_ignore so protected from deletion',
volume['VolumeId'])
elif self.dry_run:
self.log_info("Volume deletion of {} skipped due to dry run mode", volume['VolumeId'])
Expand All @@ -147,7 +134,7 @@ def cleanup_volumes(self, cleanup_ec2_max_volumes_age_days):
def volume_protected(self, volume):
if 'Tags' in volume:
for tag in volume['Tags']:
if tag['Key'] == 'DO_NOT_DELETE':
if tag['Key'] == 'pcw_ignore':
return True
return False

Expand Down Expand Up @@ -209,66 +196,13 @@ def delete_all_clusters(self):
self.log_info("Finally deleting {} cluster", cluster)
self.eks_client(region).delete_cluster(name=cluster)

def parse_image_name(self, img_name):
regexes = [
# openqa-SLES12-SP5-EC2.x86_64-0.9.1-BYOS-Build1.55.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-(?P<flavor>EC2)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<type>(BYOS|On-Demand))
-Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES15-SP2.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2-BYOS.x86_64-0.9.3-EC2-HVM-Build1.10.raw.xz'
# openqa-SLES15-SP2.aarch64-0.9.3-EC2-HVM-Build1.49.raw.xz'
# openqa-SLES15-SP4-SAP-BYOS.x86_64-0.9.3-EC2-Build150400.1.31.raw.xz
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>EC2[-\w]*)
-Build(\d+\.)?
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X),
# openqa-SLES12-SP4-EC2-HVM-BYOS.x86_64-0.9.2-Build2.56.raw.xz'
re.compile(r'''^openqa-SLES
(?P<version>\d+(-SP\d+)?)
-
(?P<flavor>EC2[^\.]+)
\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
Build
(?P<build>\d+\.\d+)
\.raw\.xz
''', re.RegexFlag.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_all(self):
cleanup_ec2_max_snapshot_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-snapshot-age-days',
self._namespace)
cleanup_ec2_max_volumes_age_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-volumes-age-days',
self._namespace)
self.cleanup_images()
if cleanup_ec2_max_snapshot_age_days >= 0:
self.cleanup_snapshots(cleanup_ec2_max_snapshot_age_days)
if cleanup_ec2_max_volumes_age_days >= 0:
self.cleanup_volumes(cleanup_ec2_max_volumes_age_days)
valid_period_days = PCWConfig.get_feature_property('cleanup', 'ec2-max-age-days', self._namespace)

if valid_period_days > 0:
self.cleanup_images(valid_period_days)
self.cleanup_snapshots(valid_period_days)
self.cleanup_volumes(valid_period_days)
if PCWConfig.getBoolean('cleanup/vpc_cleanup', self._namespace):
self.cleanup_uploader_vpcs()

Expand Down Expand Up @@ -389,25 +323,13 @@ def cleanup_uploader_vpcs(self):
region)
send_mail('VPC deletion locked by running VMs', body)

def cleanup_images(self):
def cleanup_images(self, valid_period_days):
for region in self.all_regions:
response = self.ec2_client(region).describe_images(Owners=['self'])
images = list()
for img in response['Images']:
# img is in the format described here:
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ec2.html#EC2.Client.describe_images
m = self.parse_image_name(img['Name'])
if m:
self.log_dbg("Image {} is candidate for deletion with build {}", img['Name'], m['build'])
images.append(
Image(img['Name'], flavor=m['key'], build=m['build'], date=parse(img['CreationDate']),
img_id=img['ImageId']))
else:
self.log_err(" Unable to parse image name '{}'", img['Name'])
keep_images = self.get_keeping_image_names(images)
for img in [i for i in images if i.name not in keep_images]:
self.log_dbg("Delete image '{}' (ami:{})".format(img.name, img.id))
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img.id)
else:
self.ec2_client(region).deregister_image(ImageId=img.id, DryRun=False)
if EC2.is_outdated(parse(img['CreationDate']), valid_period_days):
if self.dry_run:
self.log_info("Image deletion {} skipped due to dry run mode", img['ImageId'])
else:
self.log_dbg("Delete image '{}' (ami:{})".format(img['Name'], img['ImageId']))
self.ec2_client(region).deregister_image(ImageId=img['ImageId'], DryRun=False)
164 changes: 50 additions & 114 deletions ocw/lib/azure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .provider import Provider, Image
from .provider import Provider
from webui.settings import PCWConfig
from azure.identity import ClientSecretCredential
from azure.mgmt.resource import ResourceManagementClient
Expand Down Expand Up @@ -100,126 +100,62 @@ def list_disks_by_resource_group(self, resource_group):

def list_by_resource_group(self, resource_group, filters=None):
return [item for item in self.resource_mgmt_client().resources.list_by_resource_group(
resource_group, filter=filters)]

def get_keeping_image_names(self):
images = list()
for item in self.container_client('sle-images').list_blobs():
m = self.parse_image_name(item.name)
if m:
images.append(Image(item.name, flavor=m['key'], build=m['build'], date=item.last_modified))
else:
self.log_err("Unable to parse image name '{}'", item.name)

return super().get_keeping_image_names(images)
resource_group, filter=filters, expand="changedTime")]

def cleanup_all(self):
''' Cleanup all auto-generated data which might be created during automated tests.'''
self.cleanup_bootdiagnostics()

keep_images = self.get_keeping_image_names()
self.cleanup_sle_images_container(keep_images)
self.cleanup_disks_from_rg(keep_images)
self.cleanup_images_from_rg(keep_images)
for i in keep_images:
self.log_info("Keep image {} ", i)

def cleanup_bootdiagnostics(self):
containers = self.bs_client().list_containers()
self.cleanup_images_from_rg()
self.cleanup_disks_from_rg()
self.cleanup_blob_containers()

@staticmethod
def container_valid_for_cleanup(container):
    '''
    Decide whether an Azure Blob Storage container is a valid cleanup candidate.

    Here "container" means an Azure Blob Storage container.
    See https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction
    for more details.

    A container is valid for cleanup if it meets 2 conditions:
      1. the container "metadata" does not contain the special tag (pcw_ignore)
      2. the container name starts with "bootdiagnostics-" or is exactly "sle-images"
    '''
    # Containers without metadata come back with metadata=None from the SDK;
    # treat that the same as "no pcw_ignore tag" instead of raising TypeError.
    metadata = container['metadata'] or {}
    if 'pcw_ignore' in metadata:
        return False
    # startswith is equivalent to the anchored regex match('^bootdiagnostics-', ...)
    return container.name.startswith('bootdiagnostics-') or container.name == 'sle-images'

def cleanup_blob_containers(self):
containers = self.bs_client().list_containers(include_metadata=True)
for c in containers:
self.log_dbg('Found container {}', c.name)
if (re.match('^bootdiagnostics-', c.name)):
self.cleanup_bootdiagnostics_container(c)

def cleanup_bootdiagnostics_container(self, container):
latest_modification = container.last_modified
container_blobs = self.container_client(container.name).list_blobs()
for blob in container_blobs:
if (latest_modification > blob.last_modified):
latest_modification = blob.last_modified
if (self.older_than_min_age(latest_modification)):
self.log_info("Mark container for deletion {}", container.name)
if self.dry_run:
self.log_info("Deletion of boot diagnostic container {} skipped due to dry run mode", container.name)
else:
self.bs_client().delete_container(container.name)

def parse_image_name(self, img_name):
regexes = [
# SLES12-SP5-Azure.x86_64-0.9.1-SAP-BYOS-Build3.3.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
-Azure\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
-
(?P<flavor>[-\w]+)
-
Build(?P<build>\d+\.\d+)
\.vhd
""",
re.X),

# SLES15-SP2-BYOS.x86_64-0.9.3-Azure-Build1.10.vhd
# SLES15-SP2.x86_64-0.9.3-Azure-Basic-Build1.11.vhd
# SLES15-SP2-SAP-BYOS.x86_64-0.9.2-Azure-Build1.9.vhd
# SLES15-SP4-BYOS.x86_64-0.9.1-Azure-Build150400.2.103.vhd
re.compile(r"""
SLES
(?P<version>\d+(-SP\d+)?)
(-(?P<type>[^\.]+))?\.
(?P<arch>[^-]+)
-
(?P<kiwi>\d+\.\d+\.\d+)
(-(?P<flavor>Azure[-\w]*))?
-
Build(\d+\.)?(?P<build>\d+\.\d+)
\.vhd
""",
re.X)
]
return self.parse_image_name_helper(img_name, regexes)

def cleanup_sle_images_container(self, keep_images):
container_client = self.container_client('sle-images')
for img in container_client.list_blobs():
m = self.parse_image_name(img.name)
if m:
self.log_dbg('Blob {} is candidate for deletion with build {} ', img.name, m['build'])

if img.name not in keep_images:
self.log_info("Delete blob '{}'", img.name)
if self.dry_run:
self.log_info("Deletion of blob image {} skipped due to dry run mode", img.name)
else:
container_client.delete_blob(img.name, delete_snapshots="include")
if Azure.container_valid_for_cleanup(c):
self.log_dbg('Found container {}', c.name)
container_blobs = self.container_client(c.name).list_blobs()
for blob in container_blobs:
if (self.is_outdated(blob.last_modified)):
if self.dry_run:
self.log_info("Deletion of blob {} skipped due to dry run mode", blob.name)
else:
self.log_info("Deleting blob {}", blob.name)
self.container_client(c.name).delete_blob(blob.name, delete_snapshots="include")

def cleanup_images_from_rg(self, keep_images):
def cleanup_images_from_rg(self):
for item in self.list_images_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Image {} is candidate for deletion with build {} ', item.name, m['build'])
if item.name not in keep_images:
if self.is_outdated(item.changed_time):
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete image '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().images.begin_delete(self.__resource_group, item.name)

def cleanup_disks_from_rg(self, keep_images):
def cleanup_disks_from_rg(self):
for item in self.list_disks_by_resource_group(self.__resource_group):
m = self.parse_image_name(item.name)
if m:
self.log_dbg('Disk {} is candidate for deletion with build {} ', item.name, m['build'])

if item.name not in keep_images:
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
if self.is_outdated(item.changed_time):
if self.compute_mgmt_client().disks.get(self.__resource_group, item.name).managed_by:
self.log_warn("Disk is in use - unable delete {}", item.name)
else:
if self.dry_run:
self.log_info("Deletion of disk {} skipped due to dry run mode", item.name)
else:
self.log_info("Delete disk '{}'", item.name)
if self.dry_run:
self.log_info("Deletion of image {} skipped due to dry run mode", item.name)
else:
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
self.compute_mgmt_client().disks.begin_delete(self.__resource_group, item.name)
Loading

0 comments on commit 4bd675f

Please sign in to comment.