diff --git a/beam/beam.sh b/beam/beam.sh index 2cad639d8..2ce1640bf 100755 --- a/beam/beam.sh +++ b/beam/beam.sh @@ -18,14 +18,13 @@ readonly BEAM_IMAGE_VERSION_DEFAULT="master" readonly BEAM_IMAGE_REPOSITORY_KEY="beam-image-repository" readonly BEAM_IMAGE_REPOSITORY_DEFAULT="apache.bintray.io/beam" - readonly START_FLINK_YARN_SESSION_METADATA_KEY='flink-start-yarn-session' # Set this to true to start a flink yarn session at initialization time. readonly START_FLINK_YARN_SESSION_DEFAULT=true function is_master() { local role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" - if [[ "$role" == 'Master' ]] ; then + if [[ "$role" == 'Master' ]]; then true else false @@ -33,8 +32,8 @@ function is_master() { } function get_artifacts_dir() { - /usr/share/google/get_metadata_value "attributes/${ARTIFACTS_GCS_PATH_METADATA_KEY}" \ - || echo "gs://$(/usr/share/google/get_metadata_value "attributes/dataproc-bucket")/beam-artifacts" + /usr/share/google/get_metadata_value "attributes/${ARTIFACTS_GCS_PATH_METADATA_KEY}" || + echo "gs://$(/usr/share/google/get_metadata_value "attributes/dataproc-bucket")/beam-artifacts" } function download_snapshot() { @@ -49,17 +48,17 @@ function download_snapshot() { function flink_master_url() { local start_flink_yarn_session="$(/usr/share/google/get_metadata_value \ - "attributes/${START_FLINK_YARN_SESSION_METADATA_KEY}" \ - || echo "${START_FLINK_YARN_SESSION_DEFAULT}")" - # TODO: delete this workaround when the beam job service is able to understand + "attributes/${START_FLINK_YARN_SESSION_METADATA_KEY}" || + echo "${START_FLINK_YARN_SESSION_DEFAULT}")" + # TODO: delete this workaround when the beam job service is able to understand # flink in yarn mode. - if ${start_flink_yarn_session} ; then + if ${start_flink_yarn_session}; then # grab final field from the first yarn application that contains 'flink' - yarn application -list \ - | grep -i 'flink' \ - | head -n1 \ - | awk -F $'\t' '{print $9}' \ - | cut -c8- + yarn application -list | + grep -i 'flink' | + head -n1 | + awk -F $'\t' '{print $9}' | + cut -c8- else echo "localhost:8081" fi @@ -69,8 +68,8 @@ function install_job_service() { local master_url="$(/usr/share/google/get_metadata_value attributes/dataproc-master)" local artifacts_dir="$(get_artifacts_dir)" local release_snapshot_url="$(/usr/share/google/get_metadata_value \ - "attributes/${RELEASE_SNAPSHOT_URL_METADATA_KEY}" \ - || echo "${RELEASE_SNAPSHOT_URL_DEFAULT}")" + "attributes/${RELEASE_SNAPSHOT_URL_METADATA_KEY}" || + echo "${RELEASE_SNAPSHOT_URL_DEFAULT}")" echo "Retrieving Beam Job Service snapshot from ${release_snapshot_url}" @@ -84,7 +83,7 @@ function install_job_service() { mkdir -p "${SERVICE_WORKING_DIR}" chown -R "${SERVICE_WORKING_USER}" "${SERVICE_WORKING_DIR}" - cat > "/etc/systemd/system/beam-job-service.service" <"/etc/systemd/system/beam-job-service.service" < [ []]" >&2 exit 1 fi @@ -20,10 +20,10 @@ function build_job_service() { function build_container() { ./gradlew docker - local images=($(docker images \ - | grep '.*-docker-apache' \ - | awk '{print $1}')) - for image in ${images} ; do + local images=($(docker images | + grep '.*-docker-apache' | + awk '{print $1}')) + for image in ${images}; do local image_destination="${BEAM_CONTAINER_IMAGE_DESTINATION}/$(basename ${image}):${BEAM_SOURCE_VERSION}" docker tag $image:latest ${image_destination} docker push ${image_destination} @@ -31,7 +31,7 @@ function build_container() { } function main() { - if [[ $# -eq 4 ]] ; then + if [[ $# -eq 4 ]]; then # if there is 
a 4th argument, use it as the beam source directory pushd "$4" else diff --git a/bigdl/bigdl.sh b/bigdl/bigdl.sh index 30fd485de..e600aa314 100755 --- a/bigdl/bigdl.sh +++ b/bigdl/bigdl.sh @@ -17,10 +17,10 @@ cd /opt/intel-bigdl wget -nv --timeout=30 --tries=5 --retry-connrefused "${BIGDL_DOWNLOAD_URL}" unzip *.zip -JAR=`realpath lib/*.jar` -PYTHON_ZIP=`realpath lib/*.zip` +JAR=$(realpath lib/*.jar) +PYTHON_ZIP=$(realpath lib/*.zip) -cat << EOF >> /etc/spark/conf/spark-env.sh +cat <>/etc/spark/conf/spark-env.sh SPARK_DIST_CLASSPATH="\$SPARK_DIST_CLASSPATH:$JAR" PYTHONPATH="\$PYTHONPATH:$PYTHON_ZIP" EOF @@ -30,20 +30,20 @@ EOF if [[ "${ROLE}" == "Master" ]]; then NUM_NODEMANAGERS_TARGET="${WORKER_COUNT}" - if (( "${WORKER_COUNT}" == 0 )); then + if (("${WORKER_COUNT}" == 0)); then # Single node clusters have one node manager NUM_NODEMANAGERS_TARGET=1 fi # Wait for 5 minutes for Node Managers to register and run. # Break early if the expected number of node managers have registered. - for (( i=0; i < 5*60; i++ )); do + for ((i = 0; i < 5 * 60; i++)); do CURRENTLY_RUNNING_NODEMANAGERS=$(yarn node -list | grep RUNNING | wc -l) - if (( CURRENTLY_RUNNING_NODEMANAGERS == NUM_NODEMANAGERS_TARGET )); then + if ((CURRENTLY_RUNNING_NODEMANAGERS == NUM_NODEMANAGERS_TARGET)); then break fi sleep 1 done - if (( CURRENTLY_RUNNING_NODEMANAGERS == 0 )); then + if ((CURRENTLY_RUNNING_NODEMANAGERS == 0)); then echo "No node managers running. Cluster creation likely failed" exit 1 fi @@ -73,14 +73,14 @@ if [[ "${ROLE}" == "Master" ]]; then # Check if it BigDL conf or Zoo if [ -f conf/spark-bigdl.conf ]; then - cat conf/spark-bigdl.conf >> /etc/spark/conf/spark-defaults.conf + cat conf/spark-bigdl.conf >>/etc/spark/conf/spark-defaults.conf elif [ -f conf/spark-analytics-zoo.conf ]; then - cat conf/spark-analytics-zoo.conf >> /etc/spark/conf/spark-defaults.conf + cat conf/spark-analytics-zoo.conf >>/etc/spark/conf/spark-defaults.conf else err "Can't find any suitable spark config for Intel BigDL/Zoo" fi - cat << EOF >> /etc/spark/conf/spark-defaults.conf + cat <>/etc/spark/conf/spark-defaults.conf spark.dynamicAllocation.enabled=false spark.executor.instances=${SPARK_NUM_EXECUTORS} diff --git a/cloud-sql-proxy/cloud-sql-proxy.sh b/cloud-sql-proxy/cloud-sql-proxy.sh index 787de9c38..e38605948 100755 --- a/cloud-sql-proxy/cloud-sql-proxy.sh +++ b/cloud-sql-proxy/cloud-sql-proxy.sh @@ -42,36 +42,36 @@ readonly kms_key_uri="$(/usr/share/google/get_metadata_value attributes/kms-key- readonly db_admin_password_uri="$(/usr/share/google/get_metadata_value attributes/db-admin-password-uri)" if [[ -n "${db_admin_password_uri}" ]]; then # Decrypt password - readonly db_admin_password="$(gsutil cat $db_admin_password_uri | \ + readonly db_admin_password="$(gsutil cat $db_admin_password_uri | gcloud kms decrypt \ - --ciphertext-file - \ - --plaintext-file - \ - --key $kms_key_uri)" + --ciphertext-file - \ + --plaintext-file - \ + --key $kms_key_uri)" else readonly db_admin_password='' fi if [ "${db_admin_password}" == "" ]; then - readonly db_admin_password_parameter="" + readonly db_admin_password_parameter="" else - readonly db_admin_password_parameter="-p${db_admin_password}" + readonly db_admin_password_parameter="-p${db_admin_password}" fi # Database password to use to access metastore. 
readonly db_hive_password_uri="$(/usr/share/google/get_metadata_value attributes/db-hive-password-uri)" if [[ -n "${db_hive_password_uri}" ]]; then # Decrypt password - readonly db_hive_password="$(gsutil cat $db_hive_password_uri | \ + readonly db_hive_password="$(gsutil cat $db_hive_password_uri | gcloud kms decrypt \ - --ciphertext-file - \ - --plaintext-file - \ - --key $kms_key_uri)" + --ciphertext-file - \ + --plaintext-file - \ + --key $kms_key_uri)" else readonly db_hive_password='hive-password' fi if [ "${db_hive_password}" == "" ]; then - readonly db_hive_password_parameter="" + readonly db_hive_password_parameter="" else - readonly db_hive_password_parameter="-p${db_hive_password}" + readonly db_hive_password_parameter="-p${db_hive_password}" fi readonly PROXY_DIR='/var/run/cloud_sql_proxy' @@ -85,8 +85,8 @@ readonly DATAPROC_MASTER=$(/usr/share/google/get_metadata_value attributes/datap function get_java_property() { local property_file=$1 local property_name=$2 - local property_value=$(grep "^${property_name}=" "${property_file}" | \ - tail -n 1 | cut -d '=' -f 2- | sed -r 's/\\([#!=:])/\1/g') + local property_value=$(grep "^${property_name}=" "${property_file}" | + tail -n 1 | cut -d '=' -f 2- | sed -r 's/\\([#!=:])/\1/g') echo "${property_value}" } @@ -102,7 +102,7 @@ function is_component_selected() { local component=$1 local activated_components=$(get_dataproc_property \ - dataproc.components.activate) + dataproc.components.activate) if [[ ${activated_components} == *${component}* ]]; then return 0 @@ -123,7 +123,7 @@ function get_hive_principal() { function get_hiveserver_uri() { local base_connect_string="jdbc:hive2://localhost:10000" - if [[ "${KERBEROS_ENABLED}" == 'true' ]] ; then + if [[ "${KERBEROS_ENABLED}" == 'true' ]]; then local hive_principal=$(get_hive_principal) echo "${base_connect_string}/;principal=${hive_principal}" else @@ -151,12 +151,12 @@ function run_with_retries() { break else local sleep_time=${retry_backoff[$i]} - echo "'${cmd[*]}' attempt $(( $i + 1 )) failed! Sleeping ${sleep_time}." >&2 + echo "'${cmd[*]}' attempt $(($i + 1)) failed! Sleeping ${sleep_time}." >&2 sleep ${sleep_time} fi done - if ! (( ${update_succeeded} )); then + if ! ((${update_succeeded})); then echo "Final attempt of '${cmd[*]}'..." # Let any final error propagate all the way out to any error traps. "${cmd[@]}" @@ -165,10 +165,10 @@ function run_with_retries() { function configure_proxy_flags() { # If a cloud sql instance has both public and private IP, use private IP. - if [[ $use_cloud_sql_private_ip = "true" ]]; then + if [[ $use_cloud_sql_private_ip == "true" ]]; then proxy_instances_flags+=" --ip_address_types=PRIVATE" fi - if [[ $enable_cloud_sql_metastore = "true" ]]; then + if [[ $enable_cloud_sql_metastore == "true" ]]; then if [[ -z "${metastore_instance}" ]]; then err 'Must specify hive-metastore-instance VM metadata' elif ! [[ "${metastore_instance}" =~ .+:.+:.+ ]]; then @@ -197,7 +197,7 @@ function install_cloud_sql_proxy() { mkdir -p ${PROXY_DIR} # Install proxy as systemd service for reboot tolerance. 
- cat << EOF > ${INIT_SCRIPT} + cat <${INIT_SCRIPT} [Unit] Description=Google Cloud SQL Proxy After=local-fs.target network-online.target @@ -215,18 +215,18 @@ WantedBy=multi-user.target EOF chmod a+rw ${INIT_SCRIPT} systemctl enable cloud-sql-proxy - systemctl start cloud-sql-proxy \ - || err 'Unable to start cloud-sql-proxy service' - - if [[ $enable_cloud_sql_metastore = "true" ]]; then + systemctl start cloud-sql-proxy || + err 'Unable to start cloud-sql-proxy service' + + if [[ $enable_cloud_sql_metastore == "true" ]]; then run_with_retries nc -zv localhost ${metastore_proxy_port} fi echo 'Cloud SQL Proxy installation succeeded' >&2 - if [[ $enable_cloud_sql_metastore = "true" ]]; then + if [[ $enable_cloud_sql_metastore == "true" ]]; then # Update hive-site.xml - cat << EOF > hive-template.xml + cat <hive-template.xml @@ -245,17 +245,16 @@ EOF EOF - bdconfig merge_configurations \ - --configuration_file /etc/hive/conf/hive-site.xml \ - --source_configuration_file hive-template.xml \ - --clobber -fi + bdconfig merge_configurations \ + --configuration_file /etc/hive/conf/hive-site.xml \ + --source_configuration_file hive-template.xml \ + --clobber + fi } - -function configure_sql_client(){ +function configure_sql_client() { # Configure mysql client to talk to metastore - cat << EOF > /etc/mysql/conf.d/cloud-sql-proxy.cnf + cat </etc/mysql/conf.d/cloud-sql-proxy.cnf [client] protocol = tcp port = ${metastore_proxy_port} @@ -280,29 +279,29 @@ EOF mysql -u "${db_admin_user}" "${db_admin_password_parameter}" -e \ "CREATE DATABASE ${metastore_db}; \ GRANT ALL PRIVILEGES ON ${metastore_db}.* TO '${db_hive_user}';" - /usr/lib/hive/bin/schematool -dbType mysql -initSchema \ - || err 'Failed to set mysql schema.' + /usr/lib/hive/bin/schematool -dbType mysql -initSchema || + err 'Failed to set mysql schema.' fi run_with_retries run_validation } function run_validation() { - if ( systemctl is-enabled --quiet hive-metastore ); then + if (systemctl is-enabled --quiet hive-metastore); then # Start metastore back up. - systemctl restart hive-metastore \ - || err 'Unable to start hive-metastore service' + systemctl restart hive-metastore || + err 'Unable to start hive-metastore service' else echo "Service hive-metastore is not loaded" fi # Check that metastore schema is compatible. - /usr/lib/hive/bin/schematool -dbType mysql -info || \ - err 'Run /usr/lib/hive/bin/schematool -dbType mysql -upgradeSchemaFrom to upgrade the schema. Note that this may break Hive metastores that depend on the old schema' + /usr/lib/hive/bin/schematool -dbType mysql -info || + err 'Run /usr/lib/hive/bin/schematool -dbType mysql -upgradeSchemaFrom to upgrade the schema. Note that this may break Hive metastores that depend on the old schema' # Validate it's functioning. local hiveserver_uri=$(get_hiveserver_uri) - if ! timeout 60s beeline -u ${hiveserver_uri} -e 'SHOW TABLES;' >& /dev/null; then + if ! timeout 60s beeline -u ${hiveserver_uri} -e 'SHOW TABLES;' >&/dev/null; then err 'Failed to bring up Cloud SQL Metastore' else echo 'Cloud SQL Hive Metastore initialization succeeded' >&2 @@ -310,15 +309,14 @@ function run_validation() { } - -function configure_hive_warehouse_dir(){ +function configure_hive_warehouse_dir() { # Wait for master 0 to create the metastore db if necessary. 
run_with_retries run_validation local hiveserver_uri=$(get_hiveserver_uri) HIVE_WAREHOURSE_URI=$(beeline -u ${hiveserver_uri} \ - -e "describe database default;" \ - | sed '4q;d' | cut -d "|" -f4 | tr -d '[:space:]') + -e "describe database default;" | + sed '4q;d' | cut -d "|" -f4 | tr -d '[:space:]') echo "Hive warehouse uri: $HIVE_WAREHOURSE_URI" @@ -368,14 +366,14 @@ function main() { if [[ "${role}" == 'Master' ]]; then # Disable Hive Metastore and MySql Server. - if [[ $enable_cloud_sql_metastore = "true" ]]; then - if ( systemctl is-enabled --quiet hive-metastore ); then + if [[ $enable_cloud_sql_metastore == "true" ]]; then + if (systemctl is-enabled --quiet hive-metastore); then # Stop hive-metastore if it is enabled systemctl stop hive-metastore else echo "Service hive-metastore is not enabled" fi - if ( systemctl is-enabled --quiet mysql ); then + if (systemctl is-enabled --quiet mysql); then systemctl stop mysql systemctl disable mysql else @@ -383,7 +381,7 @@ function main() { fi fi install_cloud_sql_proxy - if [[ $enable_cloud_sql_metastore = "true" ]]; then + if [[ $enable_cloud_sql_metastore == "true" ]]; then if [[ "${HOSTNAME}" == "${DATAPROC_MASTER}" ]]; then # Initialize metastore db instance and set hive.metastore.warehouse.dir # on master 0. @@ -396,7 +394,7 @@ function main() { else # This part runs on workers. # Run installation on workers when enable_proxy_on_workers is set. - if [[ $enable_proxy_on_workers = "true" ]]; then + if [[ $enable_proxy_on_workers == "true" ]]; then install_cloud_sql_proxy fi fi diff --git a/conda/bootstrap-conda.sh b/conda/bootstrap-conda.sh index 63391ae69..06f501b1b 100755 --- a/conda/bootstrap-conda.sh +++ b/conda/bootstrap-conda.sh @@ -17,105 +17,105 @@ NEW_MINICONDA_VERSION="4.7.10" MIN_SPARK_VERSION_FOR_NEWER_MINICONDA="2.2.0" if [[ -f /etc/profile.d/effective-python.sh ]]; then - PROFILE_SCRIPT_PATH=/etc/profile.d/effective-python.sh + PROFILE_SCRIPT_PATH=/etc/profile.d/effective-python.sh else - PROFILE_SCRIPT_PATH=/etc/profile.d/conda.sh + PROFILE_SCRIPT_PATH=/etc/profile.d/conda.sh fi if [[ ! -v CONDA_INSTALL_PATH ]]; then - echo "CONDA_INSTALL_PATH not set, setting ..." - CONDA_INSTALL_PATH="/opt/conda" - echo "Set CONDA_INSTALL_PATH to $CONDA_INSTALL_PATH" + echo "CONDA_INSTALL_PATH not set, setting ..." + CONDA_INSTALL_PATH="/opt/conda" + echo "Set CONDA_INSTALL_PATH to $CONDA_INSTALL_PATH" fi # Check if Conda is already installed at the expected location. This will allow # this init action to override the default Miniconda in 1.4+ or Anaconda # optional component in 1.3+ which is installed at /opt/conda/default. if [[ -f "${CONDA_INSTALL_PATH}/bin/conda" ]]; then - echo "Dataproc has installed Conda previously at ${CONDA_INSTALL_PATH}. Skipping install!" - exit 0 + echo "Dataproc has installed Conda previously at ${CONDA_INSTALL_PATH}. Skipping install!" + exit 0 else - # 0. Specify Miniconda version - ## 0.1 A few parameters - ## specify base operating system - if [[ ! -v OS_TYPE ]]; then - echo "OS_TYPE not set, setting ..." - OS_TYPE="Linux-x86_64.sh" - echo "Set OS_TYPE to $OS_TYPE" - fi - ## Python 2 or 3 based miniconda? - if [[ -z "${MINICONDA_VARIANT}" ]]; then - echo "MINICONDA_VARIANT not set, setting ... 
" - MINICONDA_VARIANT="3" #for Python 3.5.x - echo "Set MINICONDA_VARIANT to $MINICONDA_VARIANT" - fi - ## specify Miniconda release (e.g., MINICONDA_VERSION='4.0.5') - if [[ -z "${MINICONDA_VERSION}" ]]; then - # Pin to 4.2.12 by default until Spark default is 2.2.0, then use newer - # one (https://issues.apache.org/jira/browse/SPARK-19019) - SPARK_VERSION=`spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\.[0-9]\+\+\).*/\1/p' | head -n1` - if dpkg --compare-versions ${SPARK_VERSION} ge ${MIN_SPARK_VERSION_FOR_NEWER_MINICONDA}; then - MINICONDA_VERSION="${NEW_MINICONDA_VERSION}" - else - MINICONDA_VERSION="${OLD_MINICONDA_VERSION}" - fi - fi - - ## 0.2 Compute Miniconda version - MINICONDA_FULL_NAME="Miniconda$MINICONDA_VARIANT-$MINICONDA_VERSION-$OS_TYPE" - echo "Complete Miniconda version resolved to: $MINICONDA_FULL_NAME" - ## 0.3 Set MD5 hash for check (if desired) - #expectedHash="b1b15a3436bb7de1da3ccc6e08c7a5df" - - # 1. Setup Miniconda Install - ## 1.1 Define Miniconda install directory - echo "Working directory: $PWD" - if [[ ! -v $PROJ_DIR ]]; then - echo "No path argument specified, setting install directory as working directory: $PWD." - PROJ_DIR=$PWD - fi - - ## 1.2 Setup Miniconda - cd $PROJ_DIR - MINICONDA_SCRIPT_PATH="$PROJ_DIR/$MINICONDA_FULL_NAME" - echo "Defined Miniconda script path: $MINICONDA_SCRIPT_PATH" - - if [[ -f "$MINICONDA_SCRIPT_PATH" ]]; then - echo "Found existing Miniconda script at: $MINICONDA_SCRIPT_PATH" + # 0. Specify Miniconda version + ## 0.1 A few parameters + ## specify base operating system + if [[ ! -v OS_TYPE ]]; then + echo "OS_TYPE not set, setting ..." + OS_TYPE="Linux-x86_64.sh" + echo "Set OS_TYPE to $OS_TYPE" + fi + ## Python 2 or 3 based miniconda? + if [[ -z "${MINICONDA_VARIANT}" ]]; then + echo "MINICONDA_VARIANT not set, setting ... " + MINICONDA_VARIANT="3" #for Python 3.5.x + echo "Set MINICONDA_VARIANT to $MINICONDA_VARIANT" + fi + ## specify Miniconda release (e.g., MINICONDA_VERSION='4.0.5') + if [[ -z "${MINICONDA_VERSION}" ]]; then + # Pin to 4.2.12 by default until Spark default is 2.2.0, then use newer + # one (https://issues.apache.org/jira/browse/SPARK-19019) + SPARK_VERSION=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\.[0-9]\+\+\).*/\1/p' | head -n1) + if dpkg --compare-versions ${SPARK_VERSION} ge ${MIN_SPARK_VERSION_FOR_NEWER_MINICONDA}; then + MINICONDA_VERSION="${NEW_MINICONDA_VERSION}" else - echo "Downloading Miniconda script to: $MINICONDA_SCRIPT_PATH ..." - wget -nv --timeout=30 --tries=5 --retry-connrefused \ - https://repo.continuum.io/miniconda/$MINICONDA_FULL_NAME -P "$PROJ_DIR" - echo "Downloaded $MINICONDA_FULL_NAME!" - ls -al $MINICONDA_SCRIPT_PATH - chmod 755 $MINICONDA_SCRIPT_PATH - fi - - ## 1.3 #md5sum hash check of miniconda installer - if [[ -v expectedHash ]]; then - md5Output=$(md5sum $MINICONDA_SCRIPT_PATH | awk '{print $1}') - if [ "$expectedHash" != "$md5Output" ]; then - echo "Unexpected md5sum $md5Output for $MINICONDA_FULL_NAME" - exit 1 - fi + MINICONDA_VERSION="${OLD_MINICONDA_VERSION}" fi - - # 2. Install conda - ## 2.1 Via bootstrap - LOCAL_CONDA_PATH="$PROJ_DIR/miniconda" - if [[ ! -d $LOCAL_CONDA_PATH ]]; then - #blow away old symlink / default Miniconda install - rm -rf "$PROJ_DIR/miniconda" - # Install Miniconda - echo "Installing $MINICONDA_FULL_NAME to $CONDA_INSTALL_PATH..." 
- bash $MINICONDA_SCRIPT_PATH -b -p $CONDA_INSTALL_PATH -f - chmod 755 $CONDA_INSTALL_PATH - #create symlink - ln -sf $CONDA_INSTALL_PATH "$PROJ_DIR/miniconda" - chmod 755 "$PROJ_DIR/miniconda" - else - echo "Existing directory at path: $LOCAL_CONDA_PATH, skipping install!" + fi + + ## 0.2 Compute Miniconda version + MINICONDA_FULL_NAME="Miniconda$MINICONDA_VARIANT-$MINICONDA_VERSION-$OS_TYPE" + echo "Complete Miniconda version resolved to: $MINICONDA_FULL_NAME" + ## 0.3 Set MD5 hash for check (if desired) + #expectedHash="b1b15a3436bb7de1da3ccc6e08c7a5df" + + # 1. Setup Miniconda Install + ## 1.1 Define Miniconda install directory + echo "Working directory: $PWD" + if [[ ! -v $PROJ_DIR ]]; then + echo "No path argument specified, setting install directory as working directory: $PWD." + PROJ_DIR=$PWD + fi + + ## 1.2 Setup Miniconda + cd $PROJ_DIR + MINICONDA_SCRIPT_PATH="$PROJ_DIR/$MINICONDA_FULL_NAME" + echo "Defined Miniconda script path: $MINICONDA_SCRIPT_PATH" + + if [[ -f "$MINICONDA_SCRIPT_PATH" ]]; then + echo "Found existing Miniconda script at: $MINICONDA_SCRIPT_PATH" + else + echo "Downloading Miniconda script to: $MINICONDA_SCRIPT_PATH ..." + wget -nv --timeout=30 --tries=5 --retry-connrefused \ + https://repo.continuum.io/miniconda/$MINICONDA_FULL_NAME -P "$PROJ_DIR" + echo "Downloaded $MINICONDA_FULL_NAME!" + ls -al $MINICONDA_SCRIPT_PATH + chmod 755 $MINICONDA_SCRIPT_PATH + fi + + ## 1.3 #md5sum hash check of miniconda installer + if [[ -v expectedHash ]]; then + md5Output=$(md5sum $MINICONDA_SCRIPT_PATH | awk '{print $1}') + if [ "$expectedHash" != "$md5Output" ]; then + echo "Unexpected md5sum $md5Output for $MINICONDA_FULL_NAME" + exit 1 fi + fi + + # 2. Install conda + ## 2.1 Via bootstrap + LOCAL_CONDA_PATH="$PROJ_DIR/miniconda" + if [[ ! -d $LOCAL_CONDA_PATH ]]; then + #blow away old symlink / default Miniconda install + rm -rf "$PROJ_DIR/miniconda" + # Install Miniconda + echo "Installing $MINICONDA_FULL_NAME to $CONDA_INSTALL_PATH..." + bash $MINICONDA_SCRIPT_PATH -b -p $CONDA_INSTALL_PATH -f + chmod 755 $CONDA_INSTALL_PATH + #create symlink + ln -sf $CONDA_INSTALL_PATH "$PROJ_DIR/miniconda" + chmod 755 "$PROJ_DIR/miniconda" + else + echo "Existing directory at path: $LOCAL_CONDA_PATH, skipping install!" + fi fi ## 2.2 Update PATH and conda... @@ -138,8 +138,8 @@ conda info -a echo "Updating global profiles to export miniconda bin location to PATH..." echo "Adding path definition to profiles..." echo "# Environment varaibles set by Conda init action." | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile -echo "export CONDA_BIN_PATH=$CONDA_BIN_PATH" | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile -echo 'export PATH=$CONDA_BIN_PATH:$PATH' | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile +echo "export CONDA_BIN_PATH=$CONDA_BIN_PATH" | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile +echo 'export PATH=$CONDA_BIN_PATH:$PATH' | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile # 2.3 Update global profiles to add the miniconda location to PATH echo "Updating global profiles to export miniconda bin location to PATH and set PYTHONHASHSEED ..." @@ -147,16 +147,16 @@ echo "Updating global profiles to export miniconda bin location to PATH and set # Issue here: https://issues.apache.org/jira/browse/SPARK-13330 (fixed in Spark 2.2.0 release) # Fix here: http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed/ echo "Adding PYTHONHASHSEED=0 to profiles and spark-defaults.conf..." 
-echo "export PYTHONHASHSEED=0" | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /usr/lib/spark/conf/spark-env.sh -echo "spark.executorEnv.PYTHONHASHSEED=0" >> /etc/spark/conf/spark-defaults.conf +echo "export PYTHONHASHSEED=0" | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /usr/lib/spark/conf/spark-env.sh +echo "spark.executorEnv.PYTHONHASHSEED=0" >>/etc/spark/conf/spark-defaults.conf ## 3. Ensure that Anaconda Python and PySpark play nice ### http://blog.cloudera.com/blog/2015/09/how-to-prepare-your-apache-hadoop-cluster-for-pyspark-jobs/ echo "Ensure that Anaconda Python and PySpark play nice by all pointing to same Python distro..." -echo "export PYSPARK_PYTHON=$CONDA_BIN_PATH/python" | tee -a "${PROFILE_SCRIPT_PATH}" /etc/environment /usr/lib/spark/conf/spark-env.sh +echo "export PYSPARK_PYTHON=$CONDA_BIN_PATH/python" | tee -a "${PROFILE_SCRIPT_PATH}" /etc/environment /usr/lib/spark/conf/spark-env.sh # CloudSDK libraries are installed in system python -echo 'export CLOUDSDK_PYTHON=/usr/bin/python' | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile +echo 'export CLOUDSDK_PYTHON=/usr/bin/python' | tee -a "${PROFILE_SCRIPT_PATH}" #/etc/*bashrc /etc/profile echo "Finished bootstrapping via Miniconda, sourcing /etc/profile ..." source /etc/profile diff --git a/conda/install-conda-env.sh b/conda/install-conda-env.sh index ab67037ef..da81cb89e 100755 --- a/conda/install-conda-env.sh +++ b/conda/install-conda-env.sh @@ -2,14 +2,14 @@ set -exo pipefail if [[ -f /etc/profile.d/effective-python.sh ]]; then - PROFILE_SCRIPT_PATH=/etc/profile.d/effective-python.sh + PROFILE_SCRIPT_PATH=/etc/profile.d/effective-python.sh elif [[ -f /etc/profile.d/conda.sh ]]; then - PROFILE_SCRIPT_PATH=/etc/profile.d/conda.sh + PROFILE_SCRIPT_PATH=/etc/profile.d/conda.sh fi # 0.1 Ensure we have conda installed and available on the PATH if [[ -f "${PROFILE_SCRIPT_PATH}" ]]; then - source "${PROFILE_SCRIPT_PATH}" + source "${PROFILE_SCRIPT_PATH}" fi echo "echo \$USER: $USER" @@ -27,18 +27,17 @@ fi # 0.2. Specify conda environment name (recommend leaving as root) if [[ ! -v CONDA_ENV_NAME ]]; then - echo "No conda environment name specified, setting to 'root' env..." - CONDA_ENV_NAME='root' + echo "No conda environment name specified, setting to 'root' env..." + CONDA_ENV_NAME='root' # Force conda env name to be set to root for now, until a braver soul manages the complexity of environment activation # across the cluster. else - echo "conda environment name is set to $CONDA_ENV_NAME" - if [[ ! $CONDA_ENV_NAME == 'root' ]] - then - echo "Custom conda environment names not supported at this time." - echo "Force setting conda env to 'root'..." - fi - CONDA_ENV_NAME='root' + echo "conda environment name is set to $CONDA_ENV_NAME" + if [[ ! $CONDA_ENV_NAME == 'root' ]]; then + echo "Custom conda environment names not supported at this time." + echo "Force setting conda env to 'root'..." + fi + CONDA_ENV_NAME='root' fi #conda update --all @@ -46,35 +45,32 @@ fi # 1.1 Update conda env from conda environment.yml (if specified) # For Dataproc provisioning, we should install to root conda env. if [[ -v CONDA_ENV_YAML ]]; then - #CONDA_ENV_NAME=$(grep 'name: ' $CONDA_ENV_YAML | awk '{print $2}') - # if conda environment name is root, we *update* the root environment with env yaml - if [[ $CONDA_ENV_NAME == 'root' ]] - then - echo "Updating root environment with file $CONDA_ENV_YAML" - conda env update --name=$CONDA_ENV_NAME --file=$CONDA_ENV_YAML - echo "Root environment updated..." 
- # otherwise, perform a typical environment creation via install - else - echo "Creating $CONDA_ENV_NAME environment with file $CONDA_ENV_YAML" - conda env create --name=$CONDA_ENV_NAME --file=$CONDA_ENV_YAML - echo "conda environment $CONDA_ENV_NAME created..." - fi + #CONDA_ENV_NAME=$(grep 'name: ' $CONDA_ENV_YAML | awk '{print $2}') + # if conda environment name is root, we *update* the root environment with env yaml + if [[ $CONDA_ENV_NAME == 'root' ]]; then + echo "Updating root environment with file $CONDA_ENV_YAML" + conda env update --name=$CONDA_ENV_NAME --file=$CONDA_ENV_YAML + echo "Root environment updated..." + # otherwise, perform a typical environment creation via install + else + echo "Creating $CONDA_ENV_NAME environment with file $CONDA_ENV_YAML" + conda env create --name=$CONDA_ENV_NAME --file=$CONDA_ENV_YAML + echo "conda environment $CONDA_ENV_NAME created..." + fi fi # 1. Or create conda env manually. echo "Attempting to create conda environment: $CONDA_ENV_NAME" -if conda info --envs | grep -q $CONDA_ENV_NAME - then - echo "conda environment $CONDA_ENV_NAME detected, skipping env creation..." +if conda info --envs | grep -q $CONDA_ENV_NAME; then + echo "conda environment $CONDA_ENV_NAME detected, skipping env creation..." else - echo "Creating conda environment directly..." - conda create --quiet --yes --name=$CONDA_ENV_NAME python || true - echo "conda environment $CONDA_ENV_NAME created..." + echo "Creating conda environment directly..." + conda create --quiet --yes --name=$CONDA_ENV_NAME python || true + echo "conda environment $CONDA_ENV_NAME created..." fi -if [[ ! $CONDA_ENV_NAME == 'root' ]] - then - echo "Activating $CONDA_ENV_NAME environment..." - source activate $CONDA_ENV_NAME +if [[ ! $CONDA_ENV_NAME == 'root' ]]; then + echo "Activating $CONDA_ENV_NAME environment..." + source activate $CONDA_ENV_NAME fi # Pin base conda and Python versions to minor version to prevent unexpected upgrades @@ -89,43 +85,40 @@ CONDA_PINNED_FILE="${CONDA_BASE_PATH}/conda-meta/pinned" function pin_component_version() { local component=$1 - version=$(conda list "${component}" \ - | grep -E "^${component}\s+" | sed -E "s/[ ]+/ /g" \ - | cut -f2 -d' ' | cut -f1,2 -d'.') - echo "${component} ${version}.*" >> "${CONDA_PINNED_FILE}" + version=$(conda list "${component}" | + grep -E "^${component}\s+" | sed -E "s/[ ]+/ /g" | + cut -f2 -d' ' | cut -f1,2 -d'.') + echo "${component} ${version}.*" >>"${CONDA_PINNED_FILE}" } pin_component_version conda pin_component_version python # 3. Install conda and pip packages (if specified) if [[ ! -z "${CONDA_PACKAGES}" ]]; then - echo "Installing conda packages for $CONDA_ENV_NAME..." - echo "conda packages requested: $CONDA_PACKAGES" - conda install $CONDA_PACKAGES + echo "Installing conda packages for $CONDA_ENV_NAME..." + echo "conda packages requested: $CONDA_PACKAGES" + conda install $CONDA_PACKAGES fi if [[ ! -z "${PIP_PACKAGES}" ]]; then - echo "Installing pip packages for $CONDA_ENV_NAME..." - echo "conda packages requested: $PIP_PACKAGES" - pip install $PIP_PACKAGES + echo "Installing pip packages for $CONDA_ENV_NAME..." + echo "conda packages requested: $PIP_PACKAGES" + pip install $PIP_PACKAGES fi # 2. Append profiles with conda env source activate echo "Attempting to append ${PROFILE_SCRIPT_PATH} to activate conda env at login..." -if [[ -f "${PROFILE_SCRIPT_PATH}" ]] && [[ ! 
$CONDA_ENV_NAME == 'root' ]] - then - if grep -ir "source activate $CONDA_ENV_NAME" "${PROFILE_SCRIPT_PATH}" - then - echo "conda env activation found in ${PROFILE_SCRIPT_PATH}, skipping..." - else - echo "Appending ${PROFILE_SCRIPT_PATH} to activate conda env $CONDA_ENV_NAME for shell..." - sudo echo "source activate $CONDA_ENV_NAME" | tee -a "${PROFILE_SCRIPT_PATH}" - echo "${PROFILE_SCRIPT_PATH} successfully appended!" - fi -elif [[ $CONDA_ENV_NAME == 'root' ]] - then - echo "The conda env specified is 'root', the default environment, no need to activate, skipping..." +if [[ -f "${PROFILE_SCRIPT_PATH}" ]] && [[ ! $CONDA_ENV_NAME == 'root' ]]; then + if grep -ir "source activate $CONDA_ENV_NAME" "${PROFILE_SCRIPT_PATH}"; then + echo "conda env activation found in ${PROFILE_SCRIPT_PATH}, skipping..." + else + echo "Appending ${PROFILE_SCRIPT_PATH} to activate conda env $CONDA_ENV_NAME for shell..." + sudo echo "source activate $CONDA_ENV_NAME" | tee -a "${PROFILE_SCRIPT_PATH}" + echo "${PROFILE_SCRIPT_PATH} successfully appended!" + fi +elif [[ $CONDA_ENV_NAME == 'root' ]]; then + echo "The conda env specified is 'root', the default environment, no need to activate, skipping..." else - echo "No file detected at ${PROFILE_SCRIPT_PATH}..." - echo "Are you sure you installed conda?" - exit 1 + echo "No file detected at ${PROFILE_SCRIPT_PATH}..." + echo "Are you sure you installed conda?" + exit 1 fi diff --git a/docker/docker.sh b/docker/docker.sh index 0afbbf7a1..e3e6b4668 100755 --- a/docker/docker.sh +++ b/docker/docker.sh @@ -8,10 +8,9 @@ readonly OS_CODE=$(lsb_release -cs) readonly DOCKER_VERSION="18.06.0~ce~3-0~${OS_ID}" readonly CREDENTIAL_HELPER_VERSION='1.5.0' - function is_master() { local role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" - if [[ "$role" == 'Master' ]] ; then + if [[ "$role" == 'Master' ]]; then true else false @@ -23,7 +22,7 @@ function get_docker_gpg() { } function update_apt_get() { - for ((i = 0; i < 10; i++)) ; do + for ((i = 0; i < 10; i++)); do if apt-get update; then return 0 fi @@ -44,9 +43,9 @@ function install_docker() { function configure_gcr() { # this standalone method is recommended here: # https://cloud.google.com/container-registry/docs/advanced-authentication#standalone_docker_credential_helper - curl -fsSL "https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v${CREDENTIAL_HELPER_VERSION}/docker-credential-gcr_linux_amd64-${CREDENTIAL_HELPER_VERSION}.tar.gz" \ - | tar xz --to-stdout ./docker-credential-gcr \ - > /usr/local/bin/docker-credential-gcr && chmod +x /usr/local/bin/docker-credential-gcr + curl -fsSL "https://github.com/GoogleCloudPlatform/docker-credential-gcr/releases/download/v${CREDENTIAL_HELPER_VERSION}/docker-credential-gcr_linux_amd64-${CREDENTIAL_HELPER_VERSION}.tar.gz" | + tar xz --to-stdout ./docker-credential-gcr \ + >/usr/local/bin/docker-credential-gcr && chmod +x /usr/local/bin/docker-credential-gcr # this command configures docker on a per-user basis. Therefore we configure # the root user, as well as the yarn user which is part of the docker group. @@ -64,7 +63,7 @@ function configure_docker() { systemctl enable docker # Restart YARN daemons to pick up new group without restarting nodes. 
- if is_master ; then + if is_master; then systemctl restart hadoop-yarn-resourcemanager else systemctl restart hadoop-yarn-nodemanager diff --git a/dr-elephant/dr-elephant.sh b/dr-elephant/dr-elephant.sh index 4efd23a44..989f8cca7 100755 --- a/dr-elephant/dr-elephant.sh +++ b/dr-elephant/dr-elephant.sh @@ -130,9 +130,9 @@ the License. EOF bdconfig set_property \ - --configuration_file "/opt/dr-elephant/app-conf/GeneralConf.xml" \ - --name 'drelephant.analysis.backfill.enabled' --value 'true' \ - --clobber + --configuration_file "/opt/dr-elephant/app-conf/GeneralConf.xml" \ + --name 'drelephant.analysis.backfill.enabled' --value 'true' \ + --clobber # Enable compression to make metrics accessible by Dr. Elephant echo "spark.eventLog.compress = true" >>"/usr/lib/spark/conf/spark-defaults.conf" diff --git a/drill/drill.sh b/drill/drill.sh index f0599f8ea..38c825137 100755 --- a/drill/drill.sh +++ b/drill/drill.sh @@ -15,17 +15,16 @@ function err() { } function print_err_logs() { - for i in ${DRILL_LOG_DIR}/*; - do + for i in ${DRILL_LOG_DIR}/*; do echo ">>> $i" - cat "$i"; + cat "$i" done return 1 } function create_hive_storage_plugin() { # Create the hive storage plugin - cat > /tmp/hive_plugin.json </tmp/hive_plugin.json < /tmp/gcs_plugin.json </tmp/gcs_plugin.json < /tmp/hdfs_plugin.json </tmp/hdfs_plugin.json <> ${DRILL_HOME}/conf/drill-env.sh + echo DRILL_LOG_DIR=${DRILL_LOG_DIR} >>${DRILL_HOME}/conf/drill-env.sh # Link GCS connector to drill 3rdparty jars local connector_dir @@ -276,7 +275,7 @@ function main() { # Set ZK PStore to use a GCS Bucket # Using GCS makes all Drill profiles available from any drillbit, and also # persists the profiles past the lifetime of a cluster. - cat >> ${DRILL_HOME}/conf/drill-override.conf <>${DRILL_HOME}/conf/drill-override.conf < /dev/null - echo " Running flink job" - HADOOP_CONF_DIR=/etc/hadoop/conf /usr/lib/flink/bin/flink run \ - -m yarn-cluster \ - -yid ${application_id} \ - /usr/lib/flink/examples/batch/WordCount.jar + echo "Yarn test result is ${yarn_result_code}" + else + echo 'flink-start-yarn-session set to true, so skipped validation for this case' + yarn_result_code=0 + echo " Using existing YARN session" + application_id=$(yarn --loglevel ERROR application -list | grep -oP "(application_\d+_\d+)") &>/dev/null + echo " Running flink job" + HADOOP_CONF_DIR=/etc/hadoop/conf /usr/lib/flink/bin/flink run \ + -m yarn-cluster \ + -yid ${application_id} \ + /usr/lib/flink/examples/batch/WordCount.jar - flink_return_code=$? - echo " Done. Flink return code: ${flink_return_code}" -fi + flink_return_code=$? + echo " Done. Flink return code: ${flink_return_code}" + fi fi if [[ ${flink_return_code} -eq 0 && ${yarn_result_code} -eq 0 ]]; then diff --git a/gobblin/gobblin.sh b/gobblin/gobblin.sh index 868ed5136..d3e4e0dc6 100755 --- a/gobblin/gobblin.sh +++ b/gobblin/gobblin.sh @@ -27,7 +27,7 @@ readonly JAR_NAME_CANONICALIZER="s/([-a-zA-Z0-9]+?)[-]([0-9][0-9.]+?)([-.].*?)?. function maybe_symlink() { local jar=$1 - if [[ ! -f "${HADOOP_LIB}/${jar}" ]] ; then + if [[ ! -f "${HADOOP_LIB}/${jar}" ]]; then ln -s "${INSTALL_LIB}/${jar}" "${HADOOP_LIB}/${jar}" fi } @@ -37,14 +37,14 @@ function configure_env() { # Use hdfs:/// so we don't have to disambiguate between Highly Available # and regular cluster types. 
sed -E "s/(fs.uri)=(.+)$/\1=hdfs:\/\/\//" \ - -i "${INSTALL_CONF}/gobblin-mapreduce.properties" + -i "${INSTALL_CONF}/gobblin-mapreduce.properties" sed -E "s/env:GOBBLIN_WORK_DIR/fs.uri/g" \ - -i "${INSTALL_CONF}/gobblin-mapreduce.properties" + -i "${INSTALL_CONF}/gobblin-mapreduce.properties" - echo "export HADOOP_USER_CLASSPATH_FIRST=true" >> "/etc/hadoop/conf/hadoop-env.sh" + echo "export HADOOP_USER_CLASSPATH_FIRST=true" >>"/etc/hadoop/conf/hadoop-env.sh" - cat << EOF >> "${INSTALL_BIN}/gobblin-env.sh" + cat <>"${INSTALL_BIN}/gobblin-env.sh" export JAVA_HOME=${JAVA_HOME} export HADOOP_BIN_DIR=/usr/lib/hadoop/bin EOF @@ -82,7 +82,7 @@ EOF rm -f "${HADOOP_LIB}/guava"* for prefix in "${lib_prefixes[@]}"; do - for jar in `ls ${INSTALL_LIB}/${prefix}* | sed 's#.*/##'`; do + for jar in $(ls ${INSTALL_LIB}/${prefix}* | sed 's#.*/##'); do maybe_symlink "${jar}" done done @@ -108,4 +108,3 @@ function main() { } main - diff --git a/hive-hcatalog/hive-hcatalog.sh b/hive-hcatalog/hive-hcatalog.sh index c0999c863..99333d6cb 100755 --- a/hive-hcatalog/hive-hcatalog.sh +++ b/hive-hcatalog/hive-hcatalog.sh @@ -39,7 +39,6 @@ function update_apt_get() { return 1 } - update_apt_get # Install the hive-hcatalog package diff --git a/jupyter/internal/bootstrap-jupyter-ext.sh b/jupyter/internal/bootstrap-jupyter-ext.sh index 3f16a9932..0560ab068 100755 --- a/jupyter/internal/bootstrap-jupyter-ext.sh +++ b/jupyter/internal/bootstrap-jupyter-ext.sh @@ -55,4 +55,3 @@ if [[ ! -d "${RISE_PATH}" ]]; then else echo "Existing directory at path: ${RISE_PATH}, skipping install!" fi - diff --git a/jupyter/internal/launch-jupyter-kernel.sh b/jupyter/internal/launch-jupyter-kernel.sh index 7df0fa17d..63f32dada 100755 --- a/jupyter/internal/launch-jupyter-kernel.sh +++ b/jupyter/internal/launch-jupyter-kernel.sh @@ -11,7 +11,7 @@ echo "Installing Jupyter service..." readonly JUPYTER_LAUNCHER='/usr/local/bin/launch_jupyter.sh' readonly INIT_SCRIPT='/usr/lib/systemd/system/jupyter-notebook.service' -cat << EOF > "${JUPYTER_LAUNCHER}" +cat <"${JUPYTER_LAUNCHER}" #!/bin/bash source /etc/profile.d/conda.sh @@ -19,7 +19,7 @@ source /etc/profile.d/conda.sh EOF chmod 750 "${JUPYTER_LAUNCHER}" -cat << EOF > "${INIT_SCRIPT}" +cat <"${INIT_SCRIPT}" [Unit] Description=Jupyter Notebook Server diff --git a/jupyter/kernels/generate-pyspark.sh b/jupyter/kernels/generate-pyspark.sh index 68fcff185..c53bc60b1 100755 --- a/jupyter/kernels/generate-pyspark.sh +++ b/jupyter/kernels/generate-pyspark.sh @@ -6,7 +6,7 @@ set -e -SPARK_MAJOR_VERSION=$(spark-submit --version |& \ +SPARK_MAJOR_VERSION=$(spark-submit --version |& grep 'version' | head -n 1 | sed 's/.*version //' | cut -d '.' 
-f 1) echo "Determined SPARK_MAJOR_VERSION to be '${SPARK_MAJOR_VERSION}'" >&2 @@ -18,13 +18,13 @@ PY4J_ZIP=$(ls /usr/lib/spark/python/lib/py4j-*.zip) PY4J_ZIP=$(echo ${PY4J_ZIP} | cut -d ' ' -f 1) echo "Found PY4J_ZIP: '${PY4J_ZIP}'" >&2 -if (( "${SPARK_MAJOR_VERSION}" >= 2 )); then +if (("${SPARK_MAJOR_VERSION}" >= 2)); then PACKAGES_ARG='' else PACKAGES_ARG='--packages com.databricks:spark-csv_2.10:1.3.0' fi -cat << EOF +cat <&2 @@ -24,7 +24,7 @@ PY4J_ZIP=$(echo ${PY4J_ZIP} | cut -d ' ' -f 1) echo "Found PY4J_ZIP: '${PY4J_ZIP}'" >&2 COMMON_PACKAGES='org.vegas-viz:vegas_2.11:0.3.11,org.vegas-viz:vegas-spark_2.11:0.3.11' -if (( "${SPARK_MAJOR_VERSION}" >= 2 )); then +if (("${SPARK_MAJOR_VERSION}" >= 2)); then PACKAGES_ARG="--packages ${COMMON_PACKAGES}" else PACKAGES_ARG="--packages com.databricks:spark-csv_2.10:1.3.0,${COMMON_PACKAGES}" @@ -32,6 +32,6 @@ fi SPARK_OPTS="--master yarn --deploy-mode client ${PACKAGES_ARG}" /opt/conda/bin/jupyter toree install \ - --spark_opts="${SPARK_OPTS}" \ - --spark_home="/usr/lib/spark" \ - --kernel_name="Toree" + --spark_opts="${SPARK_OPTS}" \ + --spark_home="/usr/lib/spark" \ + --kernel_name="Toree" diff --git a/jupyter/launch-jupyter-interface.sh b/jupyter/launch-jupyter-interface.sh index 89b57fe5b..cfe3457f5 100755 --- a/jupyter/launch-jupyter-interface.sh +++ b/jupyter/launch-jupyter-interface.sh @@ -11,8 +11,8 @@ readonly DIR source "${DIR}/../util/utils.sh" -function usage { - cat << EOF +function usage() { + cat <> ~/.jupyter/jupyter_notebook_config.py + cat <>~/.jupyter/jupyter_notebook_config.py ## Configs generated in Dataproc init action c.Application.log_level = 'DEBUG' @@ -65,7 +65,7 @@ EOF # {connection_file} is a magic variable that Jupyter fills in for us # Note: we can only use it in argv, so cannot use env to set those # environment variables. - cat << EOF > "${KERNELSPEC_FILE}" + cat <"${KERNELSPEC_FILE}" { "argv": [ "bash", @@ -78,7 +78,7 @@ EOF # Ensure Jupyter has picked up the new kernel jupyter kernelspec list | grep pyspark || err "Failed to create kernelspec" - cat << EOF > "${INIT_SCRIPT}" + cat <"${INIT_SCRIPT}" [Unit] Description=Jupyter Notebook Server After=hadoop-yarn-resourcemanager.service @@ -116,4 +116,3 @@ function main() { } main - diff --git a/kafka/cruise-control.sh b/kafka/cruise-control.sh index 60ef0b586..e94e00306 100755 --- a/kafka/cruise-control.sh +++ b/kafka/cruise-control.sh @@ -57,7 +57,7 @@ function update_kafka_metrics_reporter() { fi cp ${CRUISE_CONTROL_HOME}/cruise-control-metrics-reporter/build/libs/cruise-control-metrics-reporter-${CRUISE_CONTROL_VERSION}.jar \ - ${KAFKA_HOME}/libs + ${KAFKA_HOME}/libs cat >>${KAFKA_CONFIG_FILE} <> "${KAFKA_PROP_FILE}" - echo -e '\ndelete.topic.enable=true' >> "${KAFKA_PROP_FILE}" + echo -e '\nreserved.broker.max.id=100000' >>"${KAFKA_PROP_FILE}" + echo -e '\ndelete.topic.enable=true' >>"${KAFKA_PROP_FILE}" if [[ "${KAFKA_ENABLE_JMX}" == "true" ]]; then sed -i '/kafka-run-class.sh/i export KAFKA_JMX_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Djava.rmi.server.hostname=localhost -Djava.net.preferIPv4Stack=true"' /usr/lib/kafka/bin/kafka-server-start.sh @@ -173,16 +173,16 @@ function main() { # Only run the installation on workers; verify zookeeper on master(s). if [[ "${ROLE}" == 'Master' ]]; then - service zookeeper-server status \ - || err 'Required zookeeper-server not running on master!' 
+ service zookeeper-server status || + err 'Required zookeeper-server not running on master!' if [[ "${RUN_ON_MASTER}" == "true" ]]; then # Run installation on masters. install_and_configure_kafka_server else # On master nodes, just install kafka command-line tools and libs but not # kafka-server. - install_apt_get kafka \ - || err 'Unable to install kafka libraries on master!' + install_apt_get kafka || + err 'Unable to install kafka libraries on master!' fi else # Run installation on workers. diff --git a/kafka/validate.sh b/kafka/validate.sh index 196e7b109..062ee65d5 100755 --- a/kafka/validate.sh +++ b/kafka/validate.sh @@ -7,25 +7,27 @@ hostname="$(hostname)" echo "---------------------------------" echo "Starting validation on ${hostname}" - # Create a test topic, just talking to the local master's zookeeper server. /usr/lib/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create \ - --replication-factor 1 --partitions 1 --topic ${hostname} + --replication-factor 1 --partitions 1 --topic ${hostname} /usr/lib/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --list # Use worker 0 as broker to publish 100 messages over 100 seconds # asynchronously. echo "Testing worker-0 as producer and worker-1 as consumer" CLUSTER_NAME=$(/usr/share/google/get_metadata_value attributes/dataproc-cluster-name) -for i in {0..50}; do echo "message${i}"; sleep 1; done | \ - /usr/lib/kafka/bin/kafka-console-producer.sh \ +for i in {0..50}; do + echo "message${i}" + sleep 1 +done | + /usr/lib/kafka/bin/kafka-console-producer.sh \ --broker-list ${CLUSTER_NAME}-w-0:9092 --topic ${hostname} & # User worker 1 as broker to consume those 100 messages as they come. # This can also be run in any other master or worker node of the cluster. /usr/lib/kafka/bin/kafka-console-consumer.sh \ - --bootstrap-server ${CLUSTER_NAME}-w-1:9092 \ - --topic ${hostname} --from-beginning > /tmp/messages & + --bootstrap-server ${CLUSTER_NAME}-w-1:9092 \ + --topic ${hostname} --from-beginning >/tmp/messages & received_messages=$(wc -l /tmp/messages | awk '{print $1}') while [[ ${received_messages} -ne 51 ]]; do @@ -35,13 +37,16 @@ while [[ ${received_messages} -ne 51 ]]; do done echo "Testing worker-0 as consumer and worker-1 as producer" -for i in {0..50}; do echo "message${i}"; sleep 1; done | \ - /usr/lib/kafka/bin/kafka-console-producer.sh \ +for i in {0..50}; do + echo "message${i}" + sleep 1 +done | + /usr/lib/kafka/bin/kafka-console-producer.sh \ --broker-list ${CLUSTER_NAME}-w-1:9092 --topic ${hostname} & /usr/lib/kafka/bin/kafka-console-consumer.sh \ - --bootstrap-server ${CLUSTER_NAME}-w-0:9092 \ - --topic ${hostname} --from-beginning > /tmp/messages1 & + --bootstrap-server ${CLUSTER_NAME}-w-0:9092 \ + --topic ${hostname} --from-beginning >/tmp/messages1 & received_messages=$(wc -l /tmp/messages1 | awk '{print $1}') while [[ ${received_messages} -ne 102 ]]; do diff --git a/oozie/oozie.sh b/oozie/oozie.sh index cb5f6fcf0..c5dd1b753 100755 --- a/oozie/oozie.sh +++ b/oozie/oozie.sh @@ -62,7 +62,7 @@ function main() { fi } -function install_oozie(){ +function install_oozie() { local master_node=$(/usr/share/google/get_metadata_value attributes/dataproc-master) local node_name=${HOSTNAME} @@ -71,7 +71,7 @@ function install_oozie(){ install_apt_get oozie oozie-client || err 'Unable to install oozie-client' # Remove Log4j 2 jar not compatible with Log4j 1 that was brought by Hive 2 # TODO: remove after upgrade to Oozie 5.1 - if compgen -G "/usr/lib/oozie/lib/log4j-1.2.*.jar" > /dev/null; then + if compgen -G 
"/usr/lib/oozie/lib/log4j-1.2.*.jar" >/dev/null; then rm -f /usr/lib/oozie/lib/log4j-1.2-api*.jar fi @@ -100,36 +100,35 @@ function install_oozie(){ # Hadoop must allow impersonation for Oozie to work properly bdconfig set_property \ - --configuration_file "/etc/hadoop/conf/core-site.xml" \ - --name 'hadoop.proxyuser.oozie.hosts' --value '*' \ - --clobber + --configuration_file "/etc/hadoop/conf/core-site.xml" \ + --name 'hadoop.proxyuser.oozie.hosts' --value '*' \ + --clobber bdconfig set_property \ --configuration_file "/etc/hadoop/conf/core-site.xml" \ --name 'hadoop.proxyuser.oozie.groups' --value '*' \ --clobber - # Detect if current node configuration is HA and then set oozie servers local additional_nodes=$(/usr/share/google/get_metadata_value attributes/dataproc-master-additional | sed 's/,/\n/g' | wc -l) if [[ ${additional_nodes} -ge 2 ]]; then echo 'Starting configuration for HA' # List of servers is used for proper zookeeper configuration. It is needed to replace original ports range with specific one - local servers=$(cat /usr/lib/zookeeper/conf/zoo.cfg \ - | grep 'server.' \ - | sed 's/server.//g' \ - | sed 's/:2888:3888//g' \ - | cut -d'=' -f2- \ - | sed 's/\n/,/g' \ - | head -n 3 \ - | sed 's/$/:2181,/g' \ - | xargs -L3 \ - | sed 's/.$//g') + local servers=$(cat /usr/lib/zookeeper/conf/zoo.cfg | + grep 'server.' | + sed 's/server.//g' | + sed 's/:2888:3888//g' | + cut -d'=' -f2- | + sed 's/\n/,/g' | + head -n 3 | + sed 's/$/:2181,/g' | + xargs -L3 | + sed 's/.$//g') bdconfig set_property \ --configuration_file "/etc/oozie/conf/oozie-site.xml" \ --name 'oozie.services.ext' --value \ - 'org.apache.oozie.service.ZKLocksService, + 'org.apache.oozie.service.ZKLocksService, org.apache.oozie.service.ZKXLogStreamingService, org.apache.oozie.service.ZKJobsConcurrencyService, org.apache.oozie.service.ZKUUIDService' \ @@ -145,7 +144,7 @@ function install_oozie(){ /usr/lib/zookeeper/bin/zkServer.sh restart # HDFS and YARN must be cycled; restart to clean things up for service in hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode hadoop-yarn-resourcemanager oozie; do - if [[ $(systemctl list-unit-files | grep ${service}) != '' ]] && \ + if [[ $(systemctl list-unit-files | grep ${service}) != '' ]] && [[ $(systemctl is-enabled ${service}) == 'enabled' ]]; then systemctl restart ${service} fi diff --git a/oozie/validate.sh b/oozie/validate.sh index fe5309082..01af40dc5 100755 --- a/oozie/validate.sh +++ b/oozie/validate.sh @@ -17,7 +17,6 @@ fi rm -f job.properties hdfs dfs -get "oozie-examples/apps/map-reduce/job.properties" job.properties - echo "---------------------------------" echo "Starting validation on ${HOSTNAME}" sudo -u hdfs hadoop dfsadmin -safemode leave &>/dev/null diff --git a/post-init/master-post-init.sh b/post-init/master-post-init.sh index 61428caf2..1ef2f7282 100755 --- a/post-init/master-post-init.sh +++ b/post-init/master-post-init.sh @@ -25,12 +25,12 @@ if [[ "${ROLE}" != 'Master' ]]; then fi CLUSTER_NAME=$(curl -f -s -H Metadata-Flavor:Google \ - ${METADATA_ROOT}/dataproc-cluster-name) + ${METADATA_ROOT}/dataproc-cluster-name) # Fetch the actual command we want to run once the cluster is healthy. # The command is specified with the 'post-init-command' key. 
POST_INIT_COMMAND=$(curl -f -s -H Metadata-Flavor:Google \ - ${METADATA_ROOT}/post-init-command) + ${METADATA_ROOT}/post-init-command) if [ -z ${POST_INIT_COMMAND} ]; then echo "Failed to find metadata key 'post-init-command'" @@ -39,7 +39,7 @@ fi # We must put the bulk of the login in a separate helper script so that we can # 'nohup' it. -cat << EOF > /usr/local/bin/await_cluster_and_run_command.sh +cat </usr/local/bin/await_cluster_and_run_command.sh #!/bin/bash # Helper to get current cluster state. @@ -73,5 +73,5 @@ chmod 750 /usr/local/bin/await_cluster_and_run_command.sh # Uncomment this following line and comment out the line after it to throw away # the stdout/stderr of the command instead of logging it. #nohup /usr/local/bin/await_cluster_and_run_command.sh &>> /dev/null & -nohup /usr/local/bin/await_cluster_and_run_command.sh &>> \ - /var/log/master-post-init.log & +nohup /usr/local/bin/await_cluster_and_run_command.sh &>>/var/log/master-post-init.log \ + ; diff --git a/prometheus/prometheus.sh b/prometheus/prometheus.sh index 04e3e9361..1938afc2a 100755 --- a/prometheus/prometheus.sh +++ b/prometheus/prometheus.sh @@ -32,7 +32,7 @@ readonly KAFKA_JMX_JAVAAGENT_URI="http://central.maven.org/maven2/io/prometheus/ readonly KAFKA_JMX_EXPORTER_CONFIG_NAME="kafka-0-8-2.yml" readonly KAFKA_JMX_EXPORTER_CONFIG_URI="https://raw.githubusercontent.com/prometheus/jmx_exporter/master/example_configs/${KAFKA_JMX_EXPORTER_CONFIG_NAME}" -function is_kafka_installed { +function is_kafka_installed() { local result="$(cat ${KAFKA_CONFIG_FILE} | grep broker.id | tail -1)" if [[ "${result}" == "broker.id=0" ]]; then return 1 @@ -41,7 +41,7 @@ function is_kafka_installed { fi } -function install_prometheus { +function install_prometheus() { mkdir -p /etc/prometheus /var/lib/prometheus wget -nv --timeout=30 --tries=5 --retry-connrefused \ "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VER}/prometheus-${PROMETHEUS_VER}.linux-amd64.tar.gz" @@ -53,7 +53,7 @@ function install_prometheus { rm -rf "prometheus-${PROMETHEUS_VER}.linux-amd64.tar.gz" "prometheus-${PROMETHEUS_VER}.linux-amd64" -cat << EOF > /etc/systemd/system/prometheus.service + cat </etc/systemd/system/prometheus.service [Unit] Description=Prometheus Wants=network-online.target @@ -75,9 +75,9 @@ WantedBy=multi-user.target EOF } -function configure_prometheus { +function configure_prometheus() { # Statsd for Hadoop and Spark. - cat << EOF > /etc/prometheus/prometheus.yml + cat </etc/prometheus/prometheus.yml global: scrape_interval: 10s evaluation_interval: 10s @@ -91,7 +91,7 @@ EOF # Kafka JMX exporter. 
if [[ "${MONITOR_KAFKA}" == "true" ]] && is_kafka_installed; then - cat << EOF >> /etc/prometheus/prometheus.yml + cat <>/etc/prometheus/prometheus.yml - job_name: 'kafka' static_configs: - targets: ['localhost:${KAFKA_JMX_EXPORTER_PORT}'] @@ -99,7 +99,7 @@ EOF fi } -function install_statsd_exporter { +function install_statsd_exporter() { mkdir -p /var/lib/statsd wget -nv --timeout=30 --tries=5 --retry-connrefused \ "https://github.com/prometheus/statsd_exporter/releases/download/v${STATSD_EXPORTER_VER}/statsd_exporter-${STATSD_EXPORTER_VER}.linux-amd64.tar.gz" @@ -108,7 +108,7 @@ function install_statsd_exporter { rm -rf "statsd_exporter-${STATSD_EXPORTER_VER}.linux-amd64.tar.gz" "statsd_exporter-${STATSD_EXPORTER_VER}.linux-amd64" -cat << EOF > /etc/systemd/system/statsd-exporter.service + cat </etc/systemd/system/statsd-exporter.service [Unit] Description=Statsd Wants=network-online.target @@ -126,23 +126,23 @@ WantedBy=multi-user.target EOF } -function install_jmx_exporter { +function install_jmx_exporter() { wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${KAFKA_JMX_JAVAAGENT_URI}" -P "${KAFKA_LIBS_DIR}" + "${KAFKA_JMX_JAVAAGENT_URI}" -P "${KAFKA_LIBS_DIR}" wget -nv --timeout=30 --tries=5 --retry-connrefused \ - "${KAFKA_JMX_EXPORTER_CONFIG_URI}" -P "${KAFKA_CONFIG_DIR}" + "${KAFKA_JMX_EXPORTER_CONFIG_URI}" -P "${KAFKA_CONFIG_DIR}" sed -i "/kafka-run-class.sh/i export KAFKA_OPTS=\"\${KAFKA_OPTS} -javaagent:${KAFKA_LIBS_DIR}/${KAFKA_JMX_JAVAAGENT_NAME}=${KAFKA_JMX_EXPORTER_PORT}:${KAFKA_CONFIG_DIR}/${KAFKA_JMX_EXPORTER_CONFIG_NAME}\"" \ - /usr/lib/kafka/bin/kafka-server-start.sh + /usr/lib/kafka/bin/kafka-server-start.sh } -function start_services { +function start_services() { systemctl daemon-reload systemctl start statsd-exporter systemctl start prometheus } -function configure_hadoop { - cat << EOF > /etc/hadoop/conf/hadoop-metrics2.properties +function configure_hadoop() { + cat </etc/hadoop/conf/hadoop-metrics2.properties resourcemanager.sink.statsd.class=org.apache.hadoop.metrics2.sink.StatsDSink resourcemanager.sink.statsd.server.host=${HOSTNAME} resourcemanager.sink.statsd.server.port=9125 @@ -163,20 +163,20 @@ EOF fi } -function configure_spark { - cat << EOF > /etc/spark/conf/metrics.properties +function configure_spark() { + cat </etc/spark/conf/metrics.properties *.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink *.sink.statsd.prefix=spark *.sink.statsd.port=9125 EOF } -function configure_kafka { +function configure_kafka() { install_jmx_exporter systemctl restart kafka-server.service } -function configure_components { +function configure_components() { if [[ "${MONITOR_HADOOP}" == "true" || "${MONITOR_SPARK}" == "true" ]]; then install_statsd_exporter @@ -194,13 +194,13 @@ function configure_components { fi } -function restart_service_gracefully { +function restart_service_gracefully() { while true; do - if systemctl status "$1"| grep -q 'Active: active (running)'; then + if systemctl status "$1" | grep -q 'Active: active (running)'; then systemctl restart "$1" - break; + break fi - sleep 5; + sleep 5 done } diff --git a/python/conda-install.sh b/python/conda-install.sh index 007a35d60..1053b1a40 100755 --- a/python/conda-install.sh +++ b/python/conda-install.sh @@ -6,11 +6,10 @@ readonly PACKAGES=$(/usr/share/google/get_metadata_value attributes/CONDA_PACKAG function main() { if [[ -z "${PACKAGES}" ]]; then - echo "ERROR: Must specify CONDA_PACKAGES metadata key" + echo "ERROR: Must specify CONDA_PACKAGES metadata key" exit 1 fi conda install 
${PACKAGES} } main - diff --git a/ranger/ranger.sh b/ranger/ranger.sh index f4dab7c6b..fb0ac76f2 100755 --- a/ranger/ranger.sh +++ b/ranger/ranger.sh @@ -60,14 +60,14 @@ function configure_admin() { sed -i 's/^audit_solr_user=/audit_solr_user=solr/' \ "${RANGER_INSTALL_DIR}/ranger-admin/install.properties" bdconfig set_property \ - --configuration_file "${RANGER_INSTALL_DIR}/ranger-admin/ews/webapp/WEB-INF/classes/conf.dist/ranger-admin-site.xml" \ - --name 'ranger.service.http.port' --value "${RANGER_ADMIN_PORT}" \ - --clobber + --configuration_file "${RANGER_INSTALL_DIR}/ranger-admin/ews/webapp/WEB-INF/classes/conf.dist/ranger-admin-site.xml" \ + --name 'ranger.service.http.port' --value "${RANGER_ADMIN_PORT}" \ + --clobber mysql -u root -proot-password -e "CREATE USER 'rangeradmin'@'localhost' IDENTIFIED BY 'rangerpass';" mysql -u root -proot-password -e "CREATE DATABASE ranger;" mysql -u root -proot-password -e "GRANT ALL PRIVILEGES ON ranger.* TO 'rangeradmin'@'localhost';" - if [[ "${MASTER_ADDITIONAL}" != "" ]] ; then + if [[ "${MASTER_ADDITIONAL}" != "" ]]; then sed -i "s/^audit_solr_zookeepers=/audit_solr_zookeepers=${CLUSTER_NAME}-m-0:2181,${CLUSTER_NAME}-m-1:2181,${CLUSTER_NAME}-m-2:2181\/solr/" \ "${RANGER_INSTALL_DIR}/ranger-admin/install.properties" sed -i 's/^audit_solr_urls=/audit_solr_urls=none/' \ @@ -88,8 +88,8 @@ function run_ranger_admin() { } function add_usersync_plugin() { - mkdir -p /var/log/ranger-usersync && chown ranger /var/log/ranger-usersync \ - && chgrp ranger /var/log/ranger-usersync + mkdir -p /var/log/ranger-usersync && chown ranger /var/log/ranger-usersync && + chgrp ranger /var/log/ranger-usersync sed -i 's/^logdir=logs/logdir=\/var\/log\/ranger-usersync/' \ "${RANGER_INSTALL_DIR}/ranger-usersync/install.properties" @@ -144,12 +144,11 @@ function add_hdfs_plugin() { systemctl start hadoop-hdfs-namenode.service # Notify cluster that plugin is installed on master. - until hadoop fs -touchz /tmp/ranger-hdfs-plugin-ready &> /dev/null - do + until hadoop fs -touchz /tmp/ranger-hdfs-plugin-ready &>/dev/null; do sleep 10 done - cat << EOF > service-hdfs.json + cat <service-hdfs.json { "configs": { "username": "admin", @@ -165,12 +164,11 @@ function add_hdfs_plugin() { "version": 1 } EOF - curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \ + curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \ -X POST -d @service-hdfs.json "http://localhost:${RANGER_ADMIN_PORT}/service/public/v2/api/service" elif [[ "${NODE_NAME}" =~ ^.*(-m-1)$ ]]; then # Waiting until hdfs plugin will be configured on m-0 - until hadoop fs -ls /tmp/ranger-hdfs-plugin-ready &> /dev/null - do + until hadoop fs -ls /tmp/ranger-hdfs-plugin-ready &>/dev/null; do sleep 10 done systemctl stop hadoop-hdfs-namenode.service @@ -180,9 +178,9 @@ EOF function add_hive_plugin() { apply_common_plugin_configuration "ranger-hive-plugin" "hive-dataproc" - mkdir -p hive \ - && ln -s /etc/hive/conf hive \ - && ln -s /usr/lib/hive/lib hive + mkdir -p hive && + ln -s /etc/hive/conf hive && + ln -s /usr/lib/hive/lib hive pushd ranger-hive-plugin && ./enable-hive-plugin.sh && popd if [[ "${NODE_NAME}" =~ ^.*(-m|-m-0)$ ]]; then @@ -190,12 +188,11 @@ function add_hive_plugin() { systemctl start hive-server2.service # Notify cluster that hive plugin is installed on master. 
-    until hadoop fs -touchz /tmp/ranger-hive-plugin-ready &> /dev/null
-    do
+    until hadoop fs -touchz /tmp/ranger-hive-plugin-ready &>/dev/null; do
       sleep 10
     done

-    cat << EOF > service-hive.json
+    cat <<EOF >service-hive.json
 {
   "configs": {
     "username": "admin",
@@ -210,12 +207,11 @@
   "version": 1
 }
 EOF
-    curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \
+    curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \
      -X POST -d @service-hive.json "http://localhost:${RANGER_ADMIN_PORT}/service/public/v2/api/service"
   elif [[ "${NODE_NAME}" =~ ^.*(-m-1|-m-2)$ ]]; then
     # Waiting until hive plugin will be configured on m-0
-    until hadoop fs -ls /tmp/ranger-hive-plugin-ready &> /dev/null
-    do
+    until hadoop fs -ls /tmp/ranger-hive-plugin-ready &>/dev/null; do
       sleep 10
     done
     systemctl stop hive-server2.service
@@ -232,12 +228,11 @@ function add_yarn_plugin() {
     systemctl start hadoop-yarn-resourcemanager.service

     # Notify cluster that yarn plugin is installed on master.
-    until hadoop fs -touchz /tmp/ranger-yarn-plugin-ready &> /dev/null
-    do
+    until hadoop fs -touchz /tmp/ranger-yarn-plugin-ready &>/dev/null; do
       sleep 10
     done

-    cat << EOF > service-yarn.json
+    cat <<EOF >service-yarn.json
 {
   "configs": {
     "username": "admin",
@@ -251,12 +246,11 @@
   "version": 1
 }
 EOF
-    curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \
+    curl --user "admin:${RANGER_ADMIN_PASS}" -H "Content-Type: application/json" \
      -X POST -d @service-yarn.json "http://localhost:${RANGER_ADMIN_PORT}/service/public/v2/api/service"
   elif [[ "${NODE_NAME}" =~ ^.*(-m-1|-m-2)$ ]]; then
     # Waiting until yarn plugin will be configured on m-0
-    until hadoop fs -ls /tmp/ranger-yarn-plugin-ready &> /dev/null
-    do
+    until hadoop fs -ls /tmp/ranger-yarn-plugin-ready &>/dev/null; do
       sleep 10
     done
     systemctl stop hadoop-yarn-resourcemanager.service
diff --git a/rapids/internal/launch-dask.sh b/rapids/internal/launch-dask.sh
index b262f547d..2f985446f 100755
--- a/rapids/internal/launch-dask.sh
+++ b/rapids/internal/launch-dask.sh
@@ -14,7 +14,7 @@ readonly INIT_SCRIPT='/usr/lib/systemd/system/dask-cluster.service'
 readonly PREFIX='/opt/conda/anaconda/envs/RAPIDS/bin'

 if [[ "${ROLE}" == 'Master' ]]; then
-  cat << EOF > "${DASK_LAUNCHER}"
+  cat <<EOF >"${DASK_LAUNCHER}"
 #!/bin/bash
 if [[ "${RUN_CUDA_WORKER_ON_MASTER}" == true ]]; then
   echo "dask-scheduler starting, logging to /var/log/dask-scheduler.log.."
@@ -28,14 +28,14 @@ else
 fi
 EOF
 else
-  cat << EOF > "${DASK_LAUNCHER}"
+  cat <<EOF >"${DASK_LAUNCHER}"
 #!/bin/bash
 $PREFIX/dask-cuda-worker --memory-limit 0 ${MASTER}:8786 > /var/log/dask-cuda-workers.log 2>&1
 EOF
 fi

 chmod 750 "${DASK_LAUNCHER}"
-cat << EOF > "${INIT_SCRIPT}"
+cat <<EOF >"${INIT_SCRIPT}"
 [Unit]
 Description=Dask Cluster Service
 [Service]
diff --git a/rstudio/rstudio.sh b/rstudio/rstudio.sh
index 25b06542e..191b2411f 100755
--- a/rstudio/rstudio.sh
+++ b/rstudio/rstudio.sh
@@ -42,12 +42,12 @@ function run_with_retries() {
       break
     else
       local sleep_time=${retry_backoff[$i]}
-      echo "'${cmd[*]}' attempt $(( $i + 1 )) failed! Sleeping ${sleep_time}." >&2
+      echo "'${cmd[*]}' attempt $(($i + 1)) failed! Sleeping ${sleep_time}." >&2
       sleep ${sleep_time}
     fi
   done

-  if ! (( ${succeeded} )); then
+  if ! ((${succeeded})); then
     echo "Final attempt of '${cmd[*]}'..."
     # Let any final error propagate all the way out to any error traps.
"${cmd[@]}" @@ -63,20 +63,20 @@ OS_ID=$(lsb_release -is | tr '[:upper:]' '[:lower:]') OS_CODE=$(lsb_release -cs) function get_apt_key_for_debian() { - apt-key adv --no-tty --keyserver keys.gnupg.net --recv-key E19F5F87128899B192B1A2C2AD5F960A256A04AF || \ - apt-key adv --no-tty --keyserver pgp.mit.edu --recv-key E19F5F87128899B192B1A2C2AD5F960A256A04AF + apt-key adv --no-tty --keyserver keys.gnupg.net --recv-key E19F5F87128899B192B1A2C2AD5F960A256A04AF || + apt-key adv --no-tty --keyserver pgp.mit.edu --recv-key E19F5F87128899B192B1A2C2AD5F960A256A04AF } if [[ "${ROLE}" == 'Master' ]]; then - if [[ -n ${USER_PASSWORD} ]] && (( ${#USER_PASSWORD} < 7 )) ; then + if [[ -n ${USER_PASSWORD} ]] && ((${#USER_PASSWORD} < 7)); then echo "You must specify a password of at least 7 characters for user '$USER_NAME' through metadata 'rstudio-password'." exit 1 fi - if [[ -z "${USER_NAME}" ]] ; then + if [[ -z "${USER_NAME}" ]]; then echo "RStudio user name must not be empty." exit 2 fi - if [[ "${USER_NAME}" == "${USER_PASSWORD}" ]] ; then + if [[ "${USER_NAME}" == "${USER_PASSWORD}" ]]; then echo "RStudio user name and password must not be the same." exit 3 fi @@ -104,11 +104,11 @@ if [[ "${ROLE}" == 'Master' ]]; then fi if ! [ $(id -u "${USER_NAME}") ]; then useradd --create-home --gid "${USER_NAME}" "${USER_NAME}" - if [[ -n "${USER_PASSWORD}" ]] ; then + if [[ -n "${USER_PASSWORD}" ]]; then echo "${USER_NAME}:${USER_PASSWORD}" | chpasswd fi fi - if [[ -z "${USER_PASSWORD}" ]] ; then + if [[ -z "${USER_PASSWORD}" ]]; then sed -i 's:ExecStart=\(.*\):Environment=USER=rstudio\nExecStart=\1 --auth-none 1:1' /etc/systemd/system/rstudio-server.service systemctl daemon-reload systemctl restart rstudio-server diff --git a/solr/solr.sh b/solr/solr.sh index 6db95be81..7585799bf 100755 --- a/solr/solr.sh +++ b/solr/solr.sh @@ -51,10 +51,10 @@ function err() { function install_and_configure_solr() { local solr_home_dir local zookeeper_nodes - zookeeper_nodes="$(grep '^server\.' /etc/zookeeper/conf/zoo.cfg \ - | uniq | cut -d '=' -f 2 | cut -d ':' -f 1 | xargs echo | sed "s/ /,/g")" + zookeeper_nodes="$(grep '^server\.' /etc/zookeeper/conf/zoo.cfg | + uniq | cut -d '=' -f 2 | cut -d ':' -f 1 | xargs echo | sed "s/ /,/g")" -# Install deb packages from GS + # Install deb packages from GS update_apt_get install_apt_get solr @@ -75,12 +75,12 @@ function install_and_configure_solr() { else solr_home_dir="hdfs://${CLUSTER_NAME}-m:8020/solr" fi - cat << EOF >> "${SOLR_CONF_FILE}" + cat <>"${SOLR_CONF_FILE}" SOLR_OPTS="\${SOLR_OPTS} -Dsolr.directoryFactory=HdfsDirectoryFactory -Dsolr.lock.type=hdfs \ -Dsolr.hdfs.home=${solr_home_dir}" EOF - cat << EOF > /etc/systemd/system/solr.service + cat </etc/systemd/system/solr.service [Unit] Description=Apache SOLR ConditionPathExists=/usr/lib/solr/bin diff --git a/tez/tez.sh b/tez/tez.sh index af2032e02..8929d05c4 100755 --- a/tez/tez.sh +++ b/tez/tez.sh @@ -57,15 +57,15 @@ function err() { function configure_master_node() { update_apt_get || err 'Unable to update packages lists.' - install_apt_get tez hadoop-yarn-timelineserver \ - || err 'Failed to install required packages.' + install_apt_get tez hadoop-yarn-timelineserver || + err 'Failed to install required packages.' # Copy to hdfs from one master only to avoid race if [[ "${HOSTNAME}" == "${master_hostname}" ]]; then # Stage Tez hadoop fs -mkdir -p ${TEZ_HDFS_PATH} - hadoop fs -copyFromLocal ${TEZ_JARS}/* ${TEZ_HDFS_PATH}/ \ - || err 'Unable to copy tez jars to hdfs destination.' 
+    hadoop fs -copyFromLocal ${TEZ_JARS}/* ${TEZ_HDFS_PATH}/ ||
+      err 'Unable to copy tez jars to hdfs destination.'
   fi

   # Update the hadoop-env.sh
@@ -73,7 +73,7 @@ function configure_master_node() {
     echo 'export TEZ_CONF_DIR=/etc/tez/'
     echo "export TEZ_JARS=${TEZ_JARS}"
     echo "HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*"
-  } >> /etc/hadoop/conf/hadoop-env.sh
+  } >>/etc/hadoop/conf/hadoop-env.sh

   # Configure YARN to enable the Application Timeline Server.
   bdconfig set_property \
@@ -140,18 +140,18 @@ function configure_master_node() {

   # Restart resource manager
   systemctl restart hadoop-yarn-resourcemanager
-  systemctl status hadoop-yarn-resourcemanager # Ensure it started successfully
+  systemctl status hadoop-yarn-resourcemanager # Ensure it started successfully

   # Enable timeline server
   systemctl enable hadoop-yarn-timelineserver
   systemctl restart hadoop-yarn-timelineserver
-  systemctl status hadoop-yarn-timelineserver # Ensure it started successfully
+  systemctl status hadoop-yarn-timelineserver # Ensure it started successfully

   # Check hive-server2 status
-  if ( systemctl is-enabled --quiet hive-server2 ); then
+  if (systemctl is-enabled --quiet hive-server2); then
     # Restart hive server2 if it is enabled
     systemctl restart hive-server2
-    systemctl status hive-server2 # Ensure it started successfully
+    systemctl status hive-server2 # Ensure it started successfully
   else
     echo "Service hive-server2 is not enabled"
   fi
diff --git a/user-environment/user-environment.sh b/user-environment/user-environment.sh
index 33b3a6c6f..16c583970 100755
--- a/user-environment/user-environment.sh
+++ b/user-environment/user-environment.sh
@@ -38,7 +38,7 @@ update-alternatives --set editor /usr/bin/vim.basic
 #apt-get install -y tmux sl

 ## The following script will get run as each user in their home directory.
-cat << 'EOF' > /tmp/customize_home_dir.sh
+cat <<'EOF' >/tmp/customize_home_dir.sh
 set -o errexit
 set -o nounset
 set -o xtrace
diff --git a/util/utils.sh b/util/utils.sh
index e27d80d40..77ba064d0 100755
--- a/util/utils.sh
+++ b/util/utils.sh
@@ -1,21 +1,20 @@
 #!/usr/bin/env bash

-function_exists () {
-    declare -f -F $1 > /dev/null
-    return $?
+function_exists() {
+  declare -f -F $1 >/dev/null
+  return $?
 }

-throw () {
-    echo "$*" >&2
-    echo
-    function_exists usage && usage
-    exit 1
+throw() {
+  echo "$*" >&2
+  echo
+  function_exists usage && usage
+  exit 1
 }

-get_metadata_property () {
-    [[ -z $1 ]] && throw "missing function param for DATAPROC_CLUSTER_NAME" || DATAPROC_CLUSTER_NAME=$1
-    [[ -z $2 ]] && throw "missing function param for METADATA_KEY" || METADATA_KEY=$2
-    # Get $DATAPROC_CLUSTER_NAME metadata value for key $METADATA_KEY...
-    gcloud dataproc clusters describe $DATAPROC_CLUSTER_NAME | python -c "import sys,yaml; cluster = yaml.load(sys.stdin); print(cluster['config']['gceClusterConfig']['metadata']['$METADATA_KEY'])"
+get_metadata_property() {
+  [[ -z $1 ]] && throw "missing function param for DATAPROC_CLUSTER_NAME" || DATAPROC_CLUSTER_NAME=$1
+  [[ -z $2 ]] && throw "missing function param for METADATA_KEY" || METADATA_KEY=$2
+  # Get $DATAPROC_CLUSTER_NAME metadata value for key $METADATA_KEY...
+  gcloud dataproc clusters describe $DATAPROC_CLUSTER_NAME | python -c "import sys,yaml; cluster = yaml.load(sys.stdin); print(cluster['config']['gceClusterConfig']['metadata']['$METADATA_KEY'])"
 }
-
diff --git a/zeppelin/zeppelin.sh b/zeppelin/zeppelin.sh
index 2b14c6c10..8ae1d1a55 100755
--- a/zeppelin/zeppelin.sh
+++ b/zeppelin/zeppelin.sh
@@ -19,7 +19,6 @@

 set -euxo pipefail
-
 readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"
 readonly INTERPRETER_FILE='/etc/zeppelin/conf/interpreter.json'
 readonly INIT_SCRIPT='/usr/lib/systemd/system/zeppelin-notebook.service'
@@ -49,7 +48,7 @@ function err() {
   return 1
 }

-function install_zeppelin(){
+function install_zeppelin() {
   # Install zeppelin. Don't mind if it fails to start the first time.
   retry_apt_command "apt-get install -t $(lsb_release -sc)-backports -y zeppelin" || dpkg -l zeppelin
   if [ $? != 0 ]; then
@@ -68,8 +67,8 @@ function install_zeppelin(){
   fi
 }

-function configure_zeppelin(){
-  local zeppelin_version;
+function configure_zeppelin() {
+  local zeppelin_version
   zeppelin_version="$(dpkg-query --showformat='${Version}' --show zeppelin)"

   # Only use sed to modify interpreter.json prior to Zeppelin 0.8.0.
@@ -101,7 +100,7 @@ function configure_zeppelin(){
     sed -i '/spark\.executor\.memory/d' "${INTERPRETER_FILE}"

     # Set BigQuery project ID if present.
-    local project_id;
+    local project_id
     project_id="$(/usr/share/google/get_metadata_value ../project/project-id)"
     sed -i "s/\(\"zeppelin.bigquery.project_id\"\)[^,}]*/\1: \"${project_id}\"/" \
       "${INTERPRETER_FILE}"
@@ -110,11 +109,11 @@ function configure_zeppelin(){
   # Link in hive configuration.
   ln -s /etc/hive/conf/hive-site.xml /etc/zeppelin/conf

-  local zeppelin_port;
+  local zeppelin_port
   zeppelin_port="$(/usr/share/google/get_metadata_value attributes/zeppelin-port || true)"
   if [[ -n "${zeppelin_port}" ]]; then
     echo "export ZEPPELIN_PORT=${zeppelin_port}" \
-      >> /etc/zeppelin/conf/zeppelin-env.sh
+      >>/etc/zeppelin/conf/zeppelin-env.sh
   fi

   # Install matplotlib. Note that this will work in Zeppelin, but not
diff --git a/zookeeper/zookeeper.sh b/zookeeper/zookeeper.sh
index c2b2092d5..503a7c1eb 100755
--- a/zookeeper/zookeeper.sh
+++ b/zookeeper/zookeeper.sh
@@ -36,7 +36,7 @@ function install_apt_get() {
 }

 function write_config() {
-  cat >> /etc/zookeeper/conf/zoo.cfg <<EOF
+  cat >>/etc/zookeeper/conf/zoo.cfg <<EOF
-  cat >> /etc/zookeeper/conf/zoo.cfg <<EOF
+  cat >>/etc/zookeeper/conf/zoo.cfg <<EOF
-if (( $NODE_NUMBER > 2 )); then
+if (($NODE_NUMBER > 2)); then
   write_config
   echo "Skip running ZooKeeper on this node."
   exit 0
@@ -86,7 +86,7 @@
 install_apt_get zookeeper-server
 # Write ZooKeeper node ID.
 mkdir -p /var/lib/zookeeper
-echo ${NODE_NUMBER} >| /var/lib/zookeeper/myid
+echo ${NODE_NUMBER} >|/var/lib/zookeeper/myid

 # Write ZooKeeper configuration file
 write_config