Skip to content

Commit

Permalink
[ganglia] maintenance release (#1289)
Browse files Browse the repository at this point in the history
* Fixing ganglia.sh

Fix the script to fix two scenarios:
1. No source list file needs to be updated
2. Multiple source list files need to be updated

* ganglia/BUILD:
* using perl to verify http server instead of python

ganglia/ganglia.sh:
* export DEBIAN_FRONTEND=noninteractive
* use latest implementation of repair_old_backports
* include some utility functions to help check for running OS, and
  versions of spark, dataproc and OS versions
* refactor ganglia worker setup into function
* install worker on master when using single node configuration
* reduce noise from apt and wait for lock release

ganglia/test_ganglia.py:
* increase test instance size to n1-standard-8
* replaced complicated python test with simple perl
* run test with conda python
* corrected version nit about where ganglia ui is supported

ganglia/verify_ganglia_running.py,
ganglia/verify_ganglia_running.pl:
* replaced complicated python with simple perl

---------

Co-authored-by: C.J. Collier <[email protected]>
  • Loading branch information
xby-G and cjac authored Jan 9, 2025
1 parent 4a477e3 commit 45a0a24
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 130 deletions.
7 changes: 1 addition & 6 deletions ganglia/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,12 @@ py_test(
srcs = ["test_ganglia.py"],
data = [
"ganglia.sh",
"verify_ganglia_running.pl",
],
local = True,
shard_count = 4,
deps = [
":verify_ganglia_running",
"//integration_tests:dataproc_test_case",
"@io_abseil_py//absl/testing:parameterized",
],
)

py_library(
name = "verify_ganglia_running",
srcs = ["verify_ganglia_running.py"],
)
117 changes: 93 additions & 24 deletions ganglia/ganglia.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,51 @@

set -euxo pipefail

function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; )
function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; )
function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; )

# For version (or real number) comparison
# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second
# ( version_ge 2.0 2.1 ) evaluates to false
# ( version_ge 2.2 2.1 ) evaluates to true
function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; )
function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; )
function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )

function define_os_comparison_functions() {

readonly -A supported_os=(
['debian']="10 11 12"
['rocky']="8 9"
['ubuntu']="18.04 20.04 22.04"
)

# dynamically define OS version test utility functions
if [[ "$(os_id)" == "rocky" ]];
then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
else _os_version="$(os_version)"; fi
for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"

for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
done
done
eval "function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )"
}

function err() {
echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2
return 1
}

function update_apt_get() {
for ((i = 0; i < 10; i++)); do
if apt-get update; then
if apt-get update > /dev/null ; then
return 0
fi
sleep 5
Expand All @@ -34,61 +71,93 @@ function update_apt_get() {

function setup_ganglia_host() {
# Install dependencies needed for Ganglia host
DEBIAN_FRONTEND=noninteractive apt-get install -y \
apt-get install -qq -y -o DPkg::Lock::Timeout=60 \
rrdtool \
gmetad \
ganglia-webfrontend || err 'Unable to install packages'
ganglia-webfrontend >/dev/null || err 'Unable to install packages'

ln -s /etc/ganglia-webfrontend/apache.conf /etc/apache2/sites-enabled/ganglia.conf
sed -i "s/my cluster/${master_hostname}/" /etc/ganglia/gmetad.conf
ln -sf /etc/ganglia-webfrontend/apache.conf /etc/apache2/sites-enabled/ganglia.conf
perl -pi -e "s:^data_source.*:data_source \"${master_hostname}\" localhost:g" /etc/ganglia/gmetad.conf
sed -i '26s/ \$context_metrics \= \"\"\;/ \$context_metrics \= array\(\)\;/g' /usr/share/ganglia-webfrontend/cluster_view.php
systemctl restart ganglia-monitor gmetad apache2
}

function remove_old_backports {
function setup_ganglia_worker() {
# on single node instances, also configure ganglia-monitor
sed -e "/deaf = no /s/no/yes/" -i /etc/ganglia/gmond.conf
sed -i '/udp_recv_channel {/,/}/d' /etc/ganglia/gmond.conf
systemctl restart ganglia-monitor
}

function repair_old_backports {
if ! is_debuntu ; then return ; fi
# This script uses 'apt-get update' and is therefore potentially dependent on
# backports repositories which have been archived. In order to mitigate this
# problem, we will remove any reference to backports repos older than oldstable
# problem, we will use archive.debian.org for the oldoldstable repo

# https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}');
stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}');

matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)"
if [[ -n "$matched_files" ]]; then
for filename in "$matched_files"; do
grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \
sed -i -e 's/^.*-backports.*$//' "$filename"
done
fi
debdists="https://deb.debian.org/debian/dists"
oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}');
stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}');

matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )

for filename in "${matched_files[@]}"; do
# Fetch from archive.debian.org for ${oldoldstable}-backports
perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
{\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
done
}

function main() {
local master_hostname=$(/usr/share/google/get_metadata_value attributes/dataproc-master)
local cluster_name=$(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name)

export DEBIAN_FRONTEND=noninteractive

OS=$(. /etc/os-release && echo "${ID}")

define_os_comparison_functions

# Detect dataproc image version
SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
readonly SPARK_VERSION

if (! test -v DATAPROC_IMAGE_VERSION) ; then
if test -v DATAPROC_VERSION ; then
DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
else
if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2"
else echo "Unknown dataproc image version" ; exit 1 ; fi
fi
fi

if [[ ${OS} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then
remove_old_backports
repair_old_backports
fi

update_apt_get || err 'Unable to update apt-get'
apt-get install -y ganglia-monitor
update_apt_get > /dev/null || err 'Unable to update apt-get'
apt-get install -qq -y -o DPkg::Lock::Timeout=60 ganglia-monitor > /dev/null

sed -e "/send_metadata_interval = 0 /s/0/5/" -i /etc/ganglia/gmond.conf
sed -e "/name = \"unspecified\" /s/unspecified/${cluster_name}/" -i /etc/ganglia/gmond.conf
sed -e '/mcast_join /s/^ / #/' -i /etc/ganglia/gmond.conf
sed -e '/bind /s/^ / #/' -i /etc/ganglia/gmond.conf
sed -e "/udp_send_channel {/a\ host = ${master_hostname}" -i /etc/ganglia/gmond.conf

local worker_count=$(/usr/share/google/get_metadata_value dataproc-worker-count)
if [[ "$(hostname -s)" == "${master_hostname}" ]]; then
# Setup Ganglia host only on the master node ("0"-master in HA mode)
setup_ganglia_host || err 'Setting up Ganglia host failed'
if [[ "${worker_count}" == "0" ]] ; then
# on single node instances, also configure ganglia-monitor
setup_ganglia_worker
fi
else
# Configure non-host Ganglia nodes
sed -e "/deaf = no /s/no/yes/" -i /etc/ganglia/gmond.conf
sed -i '/udp_recv_channel {/,/}/d' /etc/ganglia/gmond.conf
systemctl restart ganglia-monitor
setup_ganglia_worker
fi
}

Expand Down
20 changes: 8 additions & 12 deletions ganglia/test_ganglia.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,17 @@

from integration_tests.dataproc_test_case import DataprocTestCase


class GangliaTestCase(DataprocTestCase):
COMPONENT = 'ganglia'
INIT_ACTIONS = ['ganglia/ganglia.sh']
TEST_SCRIPT_FILE_NAME = 'verify_ganglia_running.py'
TEST_SCRIPT_FILE_NAME = 'verify_ganglia_running.pl'

def verify_instance(self, name):
test_script_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)),
self.TEST_SCRIPT_FILE_NAME)
self.upload_test_file(test_script_path, name)
self.assert_instance_command(name,
"yes | sudo apt-get install python3-pip libxml2-dev libxslt-dev")
self.assert_instance_command(name, "sudo -H pip3 install --upgrade pip")
self.assert_instance_command(name, "sudo pip3 install requests-html")
self.assert_instance_command(name, "sudo pip install -U urllib3 requests")
self.assert_instance_command(name, "pip install lxml[html_clean]")
self.assert_instance_command(
name, "python3 {}".format(self.TEST_SCRIPT_FILE_NAME))
self.assert_instance_command(name,"/usr/bin/perl {}".format(self.TEST_SCRIPT_FILE_NAME))
self.remove_test_script(self.TEST_SCRIPT_FILE_NAME, name)

@parameterized.parameters(
Expand All @@ -38,9 +30,13 @@ def test_ganglia(self, configuration, machine_suffixes):
self.skipTest("Not supported in Rocky Linux-based images")

if self.getImageVersion() > pkg_resources.parse_version("2.0"):
self.skipTest("Ganglia UI is not supported for 2.0+ versions")
self.skipTest("Ganglia UI is not supported for 2.1+ versions")

self.createCluster(configuration, self.INIT_ACTIONS)
self.createCluster(
configuration,
self.INIT_ACTIONS,
machine_type="n1-standard-8",
)
for machine_suffix in machine_suffixes:
self.verify_instance("{}-{}".format(self.getClusterName(),
machine_suffix))
Expand Down
32 changes: 32 additions & 0 deletions ganglia/verify_ganglia_running.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/perl -w
# -*-CPerl-*-

# verify_ganglia_running.py: Script for ganglia initialization action test.

use strict;
use LWP::UserAgent;

my $hostname = qx(hostname -s); chomp $hostname;
my $role = qx(/usr/share/google/get_metadata_value attributes/dataproc-role);
my $primary_master = qx(/usr/share/google/get_metadata_value attributes/dataproc-master);
my $cluster_name = qx(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name);

if ( $hostname eq $primary_master ){
my $hostname = 'localhost';
my $port = '80';

my $ua = LWP::UserAgent->new;

my $response = $ua->get("http://${hostname}:${port}/ganglia/");

die $response->status_line unless $response->is_success;
my( $page_title ) = ( $response->decoded_content =~ m:<b id="page_title">([^>]+)</b>: );
die 'Ganglia UI is not found on master node' unless( $page_title =~ /^${cluster_name}/ );
print("Ganglia UI is running on this node.",$/);
}else{
if ( $hostname =~ /-w-/ ){
print("Ganglia UI should not run on worker node",$/);
}elsif( $hostname =~ /-m-/ ){
print("Ganglia UI should not run on additional master",$/);
}
}
88 changes: 0 additions & 88 deletions ganglia/verify_ganglia_running.py

This file was deleted.

0 comments on commit 45a0a24

Please sign in to comment.