diff --git a/ganglia/BUILD b/ganglia/BUILD index c25124265..7bc3dae90 100644 --- a/ganglia/BUILD +++ b/ganglia/BUILD @@ -6,17 +6,12 @@ py_test( srcs = ["test_ganglia.py"], data = [ "ganglia.sh", + "verify_ganglia_running.pl", ], local = True, shard_count = 4, deps = [ - ":verify_ganglia_running", "//integration_tests:dataproc_test_case", "@io_abseil_py//absl/testing:parameterized", ], ) - -py_library( - name = "verify_ganglia_running", - srcs = ["verify_ganglia_running.py"], -) diff --git a/ganglia/ganglia.sh b/ganglia/ganglia.sh index 28ad3f643..84fc48946 100755 --- a/ganglia/ganglia.sh +++ b/ganglia/ganglia.sh @@ -17,6 +17,43 @@ set -euxo pipefail +function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) +function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) + +# For version (or real number) comparison +# if first argument is greater than or equal to, greater than, less than or equal to, or less than the second +# ( version_ge 2.0 2.1 ) evaluates to false +# ( version_ge 2.2 2.1 ) evaluates to true +function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ] ; ) +function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) +function version_le() ( set +x ; [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ] ; ) +function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) + +function define_os_comparison_functions() { + + readonly -A supported_os=( + ['debian']="10 11 12" + ['rocky']="8 9" + ['ubuntu']="18.04 20.04 22.04" + ) + + # dynamically define OS version test utility functions + if [[ "$(os_id)" == "rocky" ]]; + then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') + else _os_version="$(os_version)"; fi + for os_id_val in 'rocky' 'ubuntu' 'debian' ; do + eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" + + for osver in $(echo "${supported_os["${os_id_val}"]}") ; do + eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" + eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" + eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" + done + done + eval "function is_debuntu() ( set +x ; is_debian || is_ubuntu ; )" +} + function err() { echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $*" >&2 return 1 @@ -24,7 +61,7 @@ function err() { function update_apt_get() { for ((i = 0; i < 10; i++)); do - if apt-get update; then + if apt-get update > /dev/null ; then return 0 fi sleep 5 @@ -34,46 +71,76 @@ function update_apt_get() { function setup_ganglia_host() { # Install dependencies needed for Ganglia host - DEBIAN_FRONTEND=noninteractive apt-get install -y \ + apt-get install -qq -y -o DPkg::Lock::Timeout=60 \ rrdtool \ gmetad \ - ganglia-webfrontend || err 'Unable to install packages' + ganglia-webfrontend >/dev/null || err 'Unable to install packages' - ln -s /etc/ganglia-webfrontend/apache.conf /etc/apache2/sites-enabled/ganglia.conf - sed -i "s/my cluster/${master_hostname}/" /etc/ganglia/gmetad.conf + ln -sf /etc/ganglia-webfrontend/apache.conf /etc/apache2/sites-enabled/ganglia.conf + perl -pi -e "s:^data_source.*:data_source \"${master_hostname}\" localhost:g" /etc/ganglia/gmetad.conf sed -i '26s/ \$context_metrics \= \"\"\;/ \$context_metrics \= array\(\)\;/g' /usr/share/ganglia-webfrontend/cluster_view.php systemctl restart ganglia-monitor gmetad apache2 } -function remove_old_backports { +function setup_ganglia_worker() { + # on single node instances, also configure ganglia-monitor + sed -e "/deaf = no /s/no/yes/" -i /etc/ganglia/gmond.conf + sed -i '/udp_recv_channel {/,/}/d' /etc/ganglia/gmond.conf + systemctl restart ganglia-monitor +} + +function repair_old_backports { + if ! is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this - # problem, we will remove any reference to backports repos older than oldstable + # problem, we will use archive.debian.org for the oldoldstable repo # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 - oldstable=$(curl -s https://deb.debian.org/debian/dists/oldstable/Release | awk '/^Codename/ {print $2}'); - stable=$(curl -s https://deb.debian.org/debian/dists/stable/Release | awk '/^Codename/ {print $2}'); - - matched_files="$(grep -rsil '\-backports' /etc/apt/sources.list*)" - if [[ -n "$matched_files" ]]; then - for filename in "$matched_files"; do - grep -e "$oldstable-backports" -e "$stable-backports" "$filename" || \ - sed -i -e 's/^.*-backports.*$//' "$filename" - done - fi + debdists="https://deb.debian.org/debian/dists" + oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl -s "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl -s "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + + matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) + + for filename in "${matched_files[@]}"; do + # Fetch from archive.debian.org for ${oldoldstable}-backports + perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } + {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" + done } function main() { local master_hostname=$(/usr/share/google/get_metadata_value attributes/dataproc-master) local cluster_name=$(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) + export DEBIAN_FRONTEND=noninteractive + OS=$(. /etc/os-release && echo "${ID}") + + define_os_comparison_functions + + # Detect dataproc image version + SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" + readonly SPARK_VERSION + + if (! test -v DATAPROC_IMAGE_VERSION) ; then + if test -v DATAPROC_VERSION ; then + DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}" + else + if version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" + elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" + elif version_lt "${SPARK_VERSION}" "3.6" ; then DATAPROC_IMAGE_VERSION="2.2" + else echo "Unknown dataproc image version" ; exit 1 ; fi + fi + fi + if [[ ${OS} == debian ]] && [[ $(echo "${DATAPROC_IMAGE_VERSION} <= 2.1" | bc -l) == 1 ]]; then - remove_old_backports + repair_old_backports fi - update_apt_get || err 'Unable to update apt-get' - apt-get install -y ganglia-monitor + update_apt_get > /dev/null || err 'Unable to update apt-get' + apt-get install -qq -y -o DPkg::Lock::Timeout=60 ganglia-monitor > /dev/null sed -e "/send_metadata_interval = 0 /s/0/5/" -i /etc/ganglia/gmond.conf sed -e "/name = \"unspecified\" /s/unspecified/${cluster_name}/" -i /etc/ganglia/gmond.conf @@ -81,14 +148,16 @@ function main() { sed -e '/bind /s/^ / #/' -i /etc/ganglia/gmond.conf sed -e "/udp_send_channel {/a\ host = ${master_hostname}" -i /etc/ganglia/gmond.conf + local worker_count=$(/usr/share/google/get_metadata_value dataproc-worker-count) if [[ "$(hostname -s)" == "${master_hostname}" ]]; then # Setup Ganglia host only on the master node ("0"-master in HA mode) setup_ganglia_host || err 'Setting up Ganglia host failed' + if [[ "${worker_count}" == "0" ]] ; then + # on single node instances, also configure ganglia-monitor + setup_ganglia_worker + fi else - # Configure non-host Ganglia nodes - sed -e "/deaf = no /s/no/yes/" -i /etc/ganglia/gmond.conf - sed -i '/udp_recv_channel {/,/}/d' /etc/ganglia/gmond.conf - systemctl restart ganglia-monitor + setup_ganglia_worker fi } diff --git a/ganglia/test_ganglia.py b/ganglia/test_ganglia.py index ce29380d6..17ca72bfd 100644 --- a/ganglia/test_ganglia.py +++ b/ganglia/test_ganglia.py @@ -6,25 +6,17 @@ from integration_tests.dataproc_test_case import DataprocTestCase - class GangliaTestCase(DataprocTestCase): COMPONENT = 'ganglia' INIT_ACTIONS = ['ganglia/ganglia.sh'] - TEST_SCRIPT_FILE_NAME = 'verify_ganglia_running.py' + TEST_SCRIPT_FILE_NAME = 'verify_ganglia_running.pl' def verify_instance(self, name): test_script_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), self.TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_script_path, name) - self.assert_instance_command(name, - "yes | sudo apt-get install python3-pip libxml2-dev libxslt-dev") - self.assert_instance_command(name, "sudo -H pip3 install --upgrade pip") - self.assert_instance_command(name, "sudo pip3 install requests-html") - self.assert_instance_command(name, "sudo pip install -U urllib3 requests") - self.assert_instance_command(name, "pip install lxml[html_clean]") - self.assert_instance_command( - name, "python3 {}".format(self.TEST_SCRIPT_FILE_NAME)) + self.assert_instance_command(name,"/usr/bin/perl {}".format(self.TEST_SCRIPT_FILE_NAME)) self.remove_test_script(self.TEST_SCRIPT_FILE_NAME, name) @parameterized.parameters( @@ -38,9 +30,13 @@ def test_ganglia(self, configuration, machine_suffixes): self.skipTest("Not supported in Rocky Linux-based images") if self.getImageVersion() > pkg_resources.parse_version("2.0"): - self.skipTest("Ganglia UI is not supported for 2.0+ versions") + self.skipTest("Ganglia UI is not supported for 2.1+ versions") - self.createCluster(configuration, self.INIT_ACTIONS) + self.createCluster( + configuration, + self.INIT_ACTIONS, + machine_type="n1-standard-8", + ) for machine_suffix in machine_suffixes: self.verify_instance("{}-{}".format(self.getClusterName(), machine_suffix)) diff --git a/ganglia/verify_ganglia_running.pl b/ganglia/verify_ganglia_running.pl new file mode 100644 index 000000000..8b98f509c --- /dev/null +++ b/ganglia/verify_ganglia_running.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl -w +# -*-CPerl-*- + +# verify_ganglia_running.py: Script for ganglia initialization action test. + +use strict; +use LWP::UserAgent; + +my $hostname = qx(hostname -s); chomp $hostname; +my $role = qx(/usr/share/google/get_metadata_value attributes/dataproc-role); +my $primary_master = qx(/usr/share/google/get_metadata_value attributes/dataproc-master); +my $cluster_name = qx(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name); + +if ( $hostname eq $primary_master ){ + my $hostname = 'localhost'; + my $port = '80'; + + my $ua = LWP::UserAgent->new; + + my $response = $ua->get("http://${hostname}:${port}/ganglia/"); + + die $response->status_line unless $response->is_success; + my( $page_title ) = ( $response->decoded_content =~ m:([^>]+): ); + die 'Ganglia UI is not found on master node' unless( $page_title =~ /^${cluster_name}/ ); + print("Ganglia UI is running on this node.",$/); +}else{ + if ( $hostname =~ /-w-/ ){ + print("Ganglia UI should not run on worker node",$/); + }elsif( $hostname =~ /-m-/ ){ + print("Ganglia UI should not run on additional master",$/); + } +} diff --git a/ganglia/verify_ganglia_running.py b/ganglia/verify_ganglia_running.py deleted file mode 100644 index 9418aa21e..000000000 --- a/ganglia/verify_ganglia_running.py +++ /dev/null @@ -1,88 +0,0 @@ -"""verify_ganglia_running.py: Script for ganglia initialization action test. -""" -import socket -import subprocess - -from requests_html import HTMLSession - -BASE = 'localhost' -PORT = 80 - - -class Ganglia(object): - def __init__(self, base, port): - self.path = 'http://{}:{}/ganglia/'.format(base, port) - self.host = socket.gethostname() - self.cluster_name = self.get_cluster_name() - if self.host in self.get_main_master(): - self.is_main_master = True - else: - self.is_main_master = False - - def get_homepage_title(self): - session = HTMLSession() - r = session.get(self.path) - try: - return r.html.find('#page_title', first=True).text - except Exception: - return None - - def detect_role(self): - if self.host == self.get_main_master(): - self.is_main_master = True - else: - self.is_main_master = False - - def get_main_master(self): - cmd = '/usr/share/google/get_metadata_value attributes/dataproc-master' - p = subprocess.Popen( - cmd, - shell=True, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = p.communicate() - return stdout.decode("utf-8") - - def get_cluster_name(self): - cmd = '/usr/share/google/get_metadata_value attributes/dataproc-cluster-name' - p = subprocess.Popen( - cmd, - shell=True, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = p.communicate() - return stdout.decode("utf-8") - - -def validate_homepage(ganglia): - if ganglia.is_main_master: - if ganglia.cluster_name in ganglia.get_homepage_title(): - print('Ganglia UI is running on master node') - else: - raise Exception('Ganglia UI is not found on master node') - else: - if '-w-' in ganglia.host: - print("Ganglia UI should not run on worker node") - elif '-m-' in ganglia.host: - print("Ganglia UI should not run on additional master") - - -def main(): - """Drives the script. - - Returns: - None - - Raises: - Exception: If a response does not contain the expected value - """ - ganglia = Ganglia(BASE, PORT) - validate_homepage(ganglia) - - -if __name__ == '__main__': - main()