diff --git a/BUILD b/BUILD
index dfb599b4d..3dd5093ab 100644
--- a/BUILD
+++ b/BUILD
@@ -6,8 +6,8 @@ test_suite(
         ":test_cloud_sql_proxy",
         ":test_dr_elephant",
         ":test_hive_hcatalog",
-        ":test_starburst_presto",
         ":test_spark_rapids",
+        ":test_starburst_presto",
         "//alluxio:test_alluxio",
         "//atlas:test_atlas",
         "//bigtable:test_bigtable",
@@ -60,7 +60,10 @@ py_test(
     name = "test_cloud_sql_proxy",
     size = "enormous",
     srcs = ["cloud-sql-proxy/test_cloud_sql_proxy.py"],
-    data = ["cloud-sql-proxy/cloud-sql-proxy.sh", "cloud-sql-proxy/hivetest.hive"],
+    data = [
+        "cloud-sql-proxy/cloud-sql-proxy.sh",
+        "cloud-sql-proxy/hivetest.hive",
+    ],
     local = True,
     shard_count = 4,
     deps = [
@@ -114,10 +117,10 @@ py_test(
     size = "enormous",
     srcs = ["spark-rapids/test_spark_rapids.py"],
     data = [
+        "spark-rapids/mig.sh",
         "spark-rapids/spark-rapids.sh",
         "spark-rapids/verify_xgboost_spark_rapids.scala",
         "spark-rapids/verify_xgboost_spark_rapids_sql.scala",
-        "spark-rapids/mig.sh",
     ],
     local = True,
     shard_count = 3,
@@ -132,3 +135,19 @@ py_library(
     testonly = True,
     srcs = ["cloud-sql-proxy/pyspark_metastore_test.py"],
 )
+
+py_test(
+    name = "test_hive_lineage",
+    size = "enormous",
+    srcs = ["hive-lineage/test_hive_lineage.py"],
+    data = [
+        "hive-lineage/hive-lineage.sh",
+        "hive-lineage/hivetest.hive",
+    ],
+    local = True,
+    shard_count = 3,
+    deps = [
+        "//integration_tests:dataproc_test_case",
+        "@io_abseil_py//absl/testing:parameterized",
+    ],
+)
diff --git a/hive-lineage/README.md b/hive-lineage/README.md
new file mode 100644
index 000000000..14f94c590
--- /dev/null
+++ b/hive-lineage/README.md
@@ -0,0 +1,32 @@
+# Hive Lineage Initialization Action
+
+## Using the initialization action
+
+**:warning: NOTICE:** See
+[best practices](/README.md#how-initialization-actions-are-used) of using
+initialization actions in production.
+
+You can use this initialization action to create a new Dataproc cluster with data lineage enabled for Hive jobs.
+Note that this feature is currently in preview.
+
+```shell
+REGION=
+CLUSTER_NAME=
+gcloud dataproc clusters create ${CLUSTER_NAME} \
+    --region ${REGION} \
+    --scopes cloud-platform \
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh
+```
+
+If you want to run Hive jobs involving BigQuery tables, the Hive-BigQuery connector needs to be installed as well.
+See the [connectors](../connectors/README.md) init action.
+
+```shell
+REGION=
+CLUSTER_NAME=
+gcloud dataproc clusters create ${CLUSTER_NAME} \
+    --region ${REGION} \
+    --scopes cloud-platform \
+    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh \
+    --metadata hive-bigquery-connector-version=2.0.3
+```
\ No newline at end of file
diff --git a/hive-lineage/__init__.py b/hive-lineage/__init__.py
new file mode 100644
index 000000000..709aae858
--- /dev/null
+++ b/hive-lineage/__init__.py
@@ -0,0 +1 @@
+# required for integration tests
\ No newline at end of file
diff --git a/hive-lineage/hive-lineage.sh b/hive-lineage/hive-lineage.sh
new file mode 100644
index 000000000..586d60a3d
--- /dev/null
+++ b/hive-lineage/hive-lineage.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Copyright 2025 Google LLC and contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This initialization script installs the required
+# jars and sets the hive conf to enable lineage.
+
+set -euxo pipefail
+
+function prepare_env() {
+  export HIVE_HOME="/usr/lib/hive"
+  export HIVE_CONF_DIR="/etc/hive/conf"
+  export HIVE_CONF_FILE="$HIVE_CONF_DIR/hive-site.xml"
+  export HIVE_LIB_DIR="$HIVE_HOME/lib"
+  export INSTALLATION_SOURCE="gs://hadoop-lib/hive-lineage"
+  export HIVE_OL_HOOK_VERSION="1.0.0-preview"
+  export HIVE_OL_HOOK="io.openlineage.hive.hooks.HiveOpenLineageHook"
+}
+
+function set_hive_lineage_conf() {
+  declare -A properties=(
+    ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
+    ["hive.exec.failure.hooks"]="$HIVE_OL_HOOK"
+    ["hive.openlineage.transport.type"]="gcplineage"
+    ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
+  )
+  echo "Setting hive conf to enable lineage"
+  for key in "${!properties[@]}"; do
+    bdconfig set_property \
+      --configuration_file="$HIVE_CONF_FILE" \
+      --name "$key" \
+      --value "${properties[$key]}"
+  done
+}
+
+function install_jars() {
+  echo "Installing openlineage-hive hook"
+  gsutil cp -P "$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_OL_HOOK_VERSION.jar" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
+}
+
+function restart_hive_server2_master() {
+  ROLE=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role)
+  if [[ "${ROLE}" == 'Master' ]]; then
+    echo "Restarting hive-server2"
+    sudo systemctl restart hive-server2.service
+  fi
+}
+
+prepare_env
+install_jars
+set_hive_lineage_conf
+restart_hive_server2_master
diff --git a/hive-lineage/hivetest.hive b/hive-lineage/hivetest.hive
new file mode 100644
index 000000000..e9981994d
--- /dev/null
+++ b/hive-lineage/hivetest.hive
@@ -0,0 +1,23 @@
+DROP TABLE IF EXISTS validate_hive_tbl;
+DROP TABLE IF EXISTS grouped_tbl;
+
+CREATE EXTERNAL TABLE validate_hive_tbl (
+    shell_user STRING,
+    dummy STRING,
+    uid INT,
+    gid INT,
+    name STRING,
+    home STRING,
+    shell STRING
+)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY ':';
+
+CREATE TABLE grouped_tbl
+  ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
+AS SELECT shell, COUNT(*) shell_count
+  FROM validate_hive_tbl
+  GROUP BY shell
+  ORDER BY shell_count DESC, shell DESC;
+
+SELECT * FROM grouped_tbl;
\ No newline at end of file
diff --git a/hive-lineage/test_hive_lineage.py b/hive-lineage/test_hive_lineage.py
new file mode 100644
index 000000000..166c700c0
--- /dev/null
+++ b/hive-lineage/test_hive_lineage.py
@@ -0,0 +1,31 @@
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from integration_tests.dataproc_test_case import DataprocTestCase
+
+class HiveLineageTestCase(DataprocTestCase):
+    COMPONENT = "hive-lineage"
+    INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
+    TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"
+
+    def __submit_hive_job(self, cluster_name):
+        self.assert_dataproc_job(
+            cluster_name, 'hive', '--file={}/{}'.format(self.INIT_ACTIONS_REPO,
+                                                        self.TEST_SCRIPT_FILE))
+
+    def verify_cluster(self, name):
+        self.__submit_hive_job(name)
+
+    @parameterized.parameters(
+        'STANDARD',
+        'HA',
+    )
+    def test_hive_job_success(self, configuration):
+        self.createCluster(configuration,
+                           self.INIT_ACTIONS,
+                           scopes='cloud-platform')
+        self.verify_cluster(self.getClusterName())
+
+
+if __name__ == "__main__":
+    absltest.main()
\ No newline at end of file
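The integration test verifies the hook by submitting the bundled `hivetest.hive` script as a Dataproc Hive job via `assert_dataproc_job`. A rough manual equivalent is sketched below; it assumes the cluster was created with `hive-lineage.sh` and that `hivetest.hive` is staged in the regional init-actions bucket alongside the init action (if it is not, copy it to any GCS bucket you control and adjust the `--file` URI).

```shell
REGION=
CLUSTER_NAME=
# Submit the sample Hive script; the OpenLineage hook configured by
# hive-lineage.sh should capture lineage for the CREATE TABLE ... AS SELECT.
gcloud dataproc jobs submit hive \
    --cluster ${CLUSTER_NAME} \
    --region ${REGION} \
    --file gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hivetest.hive
```

With `hive.openlineage.transport.type=gcplineage`, the emitted events should end up in the Data Lineage service for the cluster's project; verifying that is outside the scope of the test itself.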
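The effect of `hive-lineage.sh` can also be spot-checked directly on a cluster node after creation; a minimal sketch using only the paths and property names defined in the script:

```shell
# Jar installed by install_jars()
ls -l /usr/lib/hive/lib/hive-openlineage-hook.jar
# Hook and transport properties written by set_hive_lineage_conf()
grep -A 1 -E "hive.exec.post.hooks|hive.openlineage.transport.type" /etc/hive/conf/hive-site.xml
```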