Skip to content

Commit

Permalink
Add script and readme for hive lineage (#1293)
Browse files Browse the repository at this point in the history
  • Loading branch information
codelixir authored Feb 3, 2025
1 parent 75594a7 commit 0b3165c
Show file tree
Hide file tree
Showing 6 changed files with 172 additions and 3 deletions.
25 changes: 22 additions & 3 deletions BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ test_suite(
":test_cloud_sql_proxy",
":test_dr_elephant",
":test_hive_hcatalog",
":test_starburst_presto",
":test_spark_rapids",
":test_starburst_presto",
"//alluxio:test_alluxio",
"//atlas:test_atlas",
"//bigtable:test_bigtable",
Expand Down Expand Up @@ -60,7 +60,10 @@ py_test(
name = "test_cloud_sql_proxy",
size = "enormous",
srcs = ["cloud-sql-proxy/test_cloud_sql_proxy.py"],
data = ["cloud-sql-proxy/cloud-sql-proxy.sh", "cloud-sql-proxy/hivetest.hive"],
data = [
"cloud-sql-proxy/cloud-sql-proxy.sh",
"cloud-sql-proxy/hivetest.hive",
],
local = True,
shard_count = 4,
deps = [
Expand Down Expand Up @@ -114,10 +117,10 @@ py_test(
size = "enormous",
srcs = ["spark-rapids/test_spark_rapids.py"],
data = [
"spark-rapids/mig.sh",
"spark-rapids/spark-rapids.sh",
"spark-rapids/verify_xgboost_spark_rapids.scala",
"spark-rapids/verify_xgboost_spark_rapids_sql.scala",
"spark-rapids/mig.sh",
],
local = True,
shard_count = 3,
Expand All @@ -132,3 +135,19 @@ py_library(
testonly = True,
srcs = ["cloud-sql-proxy/pyspark_metastore_test.py"],
)

# Integration test for the hive-lineage init action; runs the Hive test
# script against STANDARD and HA cluster configurations (see shard_count).
py_test(
    name = "test_hive_lineage",
    size = "enormous",
    srcs = ["hive-lineage/test_hive_lineage.py"],
    data = [
        "hive-lineage/hive-lineage.sh",
        "hive-lineage/hivetest.hive",
    ],
    local = True,
    shard_count = 3,
    deps = [
        "//integration_tests:dataproc_test_case",
        "@io_abseil_py//absl/testing:parameterized",
    ],
)
32 changes: 32 additions & 0 deletions hive-lineage/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Hive Lineage Initialization Action

## Using the initialization action

**:warning: NOTICE:** See
[best practices](/README.md#how-initialization-actions-are-used) of using
initialization actions in production.

You can use this initialization action to create a new Dataproc cluster with lineage enabled for Hive jobs.
Note that this feature is currently in preview.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
--region ${REGION} \
--scopes cloud-platform \
--initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh
```

If you want to run Hive jobs involving BigQuery tables, the Hive-BigQuery connector needs to be installed as well.
See the [connectors](../connectors/README.md) init action.

```shell
REGION=<region>
CLUSTER_NAME=<cluster_name>
gcloud dataproc clusters create ${CLUSTER_NAME} \
--region ${REGION} \
--scopes cloud-platform \
--initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/connectors/connectors.sh,gs://goog-dataproc-initialization-actions-${REGION}/hive-lineage/hive-lineage.sh \
--metadata hive-bigquery-connector-version=2.0.3
```
1 change: 1 addition & 0 deletions hive-lineage/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# required for integration tests
64 changes: 64 additions & 0 deletions hive-lineage/hive-lineage.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash
#
# Copyright 2025 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This initialization script installs the required
# jars and sets the hive conf to enable lineage.

# -e: exit on error; -u: error on unset variables; -x: trace commands
# (init-action output is captured in cluster logs); -o pipefail: a
# pipeline fails if any stage fails.
set -euxo pipefail

function prepare_env() {
  # Hive installation layout on Dataproc nodes.
  HIVE_HOME=/usr/lib/hive
  HIVE_CONF_DIR=/etc/hive/conf
  HIVE_CONF_FILE="${HIVE_CONF_DIR}/hive-site.xml"
  HIVE_LIB_DIR="${HIVE_HOME}/lib"
  # Published location/version of the OpenLineage Hive hook jar, and the
  # fully-qualified hook class registered in hive-site.xml.
  INSTALLATION_SOURCE=gs://hadoop-lib/hive-lineage
  HIVE_OL_HOOK_VERSION=1.0.0-preview
  HIVE_OL_HOOK=io.openlineage.hive.hooks.HiveOpenLineageHook
  export HIVE_HOME HIVE_CONF_DIR HIVE_CONF_FILE HIVE_LIB_DIR \
    INSTALLATION_SOURCE HIVE_OL_HOOK_VERSION HIVE_OL_HOOK
}

function set_hive_lineage_conf() {
  # Register the OpenLineage hook for both successful (post) and failed
  # queries so lineage is reported either way; the "gcplineage" transport
  # sends events to the Data Lineage API.
  local -A lineage_props=(
    ["hive.exec.post.hooks"]="$HIVE_OL_HOOK"
    ["hive.exec.failure.hooks"]="$HIVE_OL_HOOK"
    ["hive.openlineage.transport.type"]="gcplineage"
    ["hive.conf.validation"]="false" # to allow custom properties, like hive.openlineage.namespace
  )
  echo "Setting hive conf to enable lineage"
  local prop
  for prop in "${!lineage_props[@]}"; do
    bdconfig set_property \
      --configuration_file="$HIVE_CONF_FILE" \
      --name "$prop" \
      --value "${lineage_props[$prop]}"
  done
}

function install_jars() {
  echo "Installing openlineage-hive hook"
  local jar_src="$INSTALLATION_SOURCE/hive-openlineage-hook-$HIVE_OL_HOOK_VERSION.jar"
  # -P preserves POSIX file attributes (mode/timestamps) on copy; the jar is
  # installed under a version-less name so hive-site.xml needs no updates on
  # version bumps.
  gsutil cp -P "$jar_src" "$HIVE_LIB_DIR/hive-openlineage-hook.jar"
}

function restart_hive_server2_master() {
  # Determine this node's role from the Dataproc metadata server; only
  # master nodes run HiveServer2, so workers skip the restart.
  # Declaration is separate from assignment so curl's exit status is not
  # masked by 'local' (and still aborts the script under set -e).
  local role
  role=$(curl -f -s -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role)
  if [[ "${role}" == 'Master' ]]; then
    echo "Restarting hive-server2"
    # Restart so HiveServer2 picks up the new hook jar and hive-site.xml.
    sudo systemctl restart hive-server2.service
  fi
}

# Order matters: environment variables first (later steps read them), then
# the jar and conf changes, and finally a HiveServer2 restart on the master
# so the running server picks up the new hook.
prepare_env
install_jars
set_hive_lineage_conf
restart_hive_server2_master
23 changes: 23 additions & 0 deletions hive-lineage/hivetest.hive
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Lineage smoke test for the hive-lineage init action: exercises external
-- table DDL, a CTAS with aggregation (produces lineage between tables),
-- and a final read query.

-- Start from a clean slate so the script is re-runnable.
DROP TABLE IF EXISTS validate_hive_tbl;
DROP TABLE IF EXISTS grouped_tbl;

-- passwd-style input: colon-delimited fields.
CREATE EXTERNAL TABLE validate_hive_tbl (
shell_user STRING,
dummy STRING,
uid INT,
gid INT,
name STRING,
home STRING,
shell STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ':';

-- CTAS: count users per login shell; this write generates lineage from
-- validate_hive_tbl to grouped_tbl.
CREATE TABLE grouped_tbl
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
AS SELECT shell, COUNT(*) shell_count
FROM validate_hive_tbl
GROUP BY shell
ORDER BY shell_count DESC, shell DESC;

-- Read back the derived table so the job output can be validated.
SELECT * from grouped_tbl;
30 changes: 30 additions & 0 deletions hive-lineage/test_hive_lineage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from absl.testing import absltest
from absl.testing import parameterized

from integration_tests.dataproc_test_case import DataprocTestCase

class HiveLineageTestCase(DataprocTestCase):
    """Verifies that clusters created with the hive-lineage init action
    can run Hive jobs successfully."""

    COMPONENT = "hive-lineage"
    INIT_ACTIONS = ["hive-lineage/hive-lineage.sh"]
    TEST_SCRIPT_FILE = "hive-lineage/hivetest.hive"

    def __submit_hive_job(self, cluster_name):
        # Run the test Hive script straight from the init-actions repo bucket.
        job_file = '{}/{}'.format(self.INIT_ACTIONS_REPO,
                                  self.TEST_SCRIPT_FILE)
        self.assert_dataproc_job(cluster_name, 'hive',
                                 '--file={}'.format(job_file))

    def verify_cluster(self, name):
        self.__submit_hive_job(name)

    @parameterized.parameters(
        'STANDARD',
        'HA',
    )
    def test_hive_job_success(self, configuration):
        # cloud-platform scope is required for the lineage hook to publish
        # events to the Data Lineage API.
        self.createCluster(configuration,
                           self.INIT_ACTIONS,
                           scopes='cloud-platform')
        self.verify_cluster(self.getClusterName())

# Allow running this test module directly (outside the Bazel test runner).
if __name__ == "__main__":
    absltest.main()

0 comments on commit 0b3165c

Please sign in to comment.