From 7da5214289f0fa0f45f3fb5990c2622127b2a3ca Mon Sep 17 00:00:00 2001 From: kuldeepkk-dev <111517881+kuldeepkk-dev@users.noreply.github.com> Date: Mon, 15 Apr 2024 09:49:32 -0400 Subject: [PATCH] Added new init action for Datasketches (#1153) * Added new init action for Datasketches * Removed my username from the example output * Applied code review suggestions * Applied suggested code review changes --- datasketches/README.md | 109 +++++++++++++++++++++++++++++++++++ datasketches/datasketches.sh | 78 +++++++++++++++++++++++++ datasketches/pom.xml | 46 +++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 datasketches/README.md create mode 100755 datasketches/datasketches.sh create mode 100644 datasketches/pom.xml diff --git a/datasketches/README.md b/datasketches/README.md new file mode 100644 index 000000000..908751e03 --- /dev/null +++ b/datasketches/README.md @@ -0,0 +1,109 @@ +# Apache Datasketches + +**:warning: NOTICE:** This init action is supported only on Dataproc clusters 2.1 and above. + +This initialization action installs libraries required to run [Apache Datasketches](https://datasketches.apache.org/) on a +[Google Cloud Dataproc](https://cloud.google.com/dataproc) cluster. + +## Using this initialization action + +**:warning: NOTICE:** See +[best practices](/README.md#how-initialization-actions-are-used) of using +initialization actions in production. + +This initialization action installs the Datasketches libraries on the Dataproc cluster under `/usr/lib/datasketches`. The following jars are deployed: + +``` +datasketches-memory-2.0.0.jar +datasketches-java-3.1.0.jar +datasketches-pig-1.1.0.jar +datasketches-hive-1.2.0.jar +spark-java-thetasketches-1.0-SNAPSHOT.jar [ Only if Spark version < 3.5.0 ] +``` + +1. Using the `gcloud` command to create a new cluster with this initialization + action. The following command will create a new standard cluster named + `${CLUSTER_NAME}`. 
+ + ```bash + REGION=<region> + CLUSTER_NAME=<cluster_name> + gcloud dataproc clusters create ${CLUSTER_NAME} \ + --region ${REGION} \ + --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/datasketches/datasketches.sh + ``` + +## Apache Datasketches Examples: + +### Spark: + +Note: Starting with Apache Spark version 3.5.0, Datasketches libraries are already integrated; follow this [example](https://www.databricks.com/blog/apache-spark-3-apache-datasketches-new-sketch-based-approximate-distinct-counting) + +1. For older 3.x Spark versions, follow the [Thetasketches example](https://datasketches.apache.org/docs/Theta/ThetaSparkExample.html) from the Datasketches documentation. + + Note: The `spark-java-thetasketches` example jar is installed under `/usr/lib/datasketches` as a part of this init action. Run `spark-submit` with `spark-java-thetasketches-1.0-SNAPSHOT.jar` to try the Thetasketches example. + + ``` + spark-submit --jars /usr/lib/datasketches/datasketches-java-3.1.0.jar,/usr/lib/datasketches/datasketches-memory-2.0.0.jar --class Aggregate /usr/lib/datasketches/spark-java-thetasketches-1.0-SNAPSHOT.jar + ``` + + If you modify the [java code](https://datasketches.apache.org/docs/Theta/ThetaSparkExample.html), use the instructions below to build the jar. + + 1. Generate artifacts with Maven: + + ``` + mvn archetype:generate -DgroupId=org.apache.datasketches -DartifactId=spark-java-thetasketches -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false + ``` + + 1. Replace pom.xml with https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/datasketches/pom.xml + + + 1. 
Add the modified code from https://datasketches.apache.org/docs/Theta/ThetaSparkExample.html under the `$local_path/src/main/java/org/apache/datasketches` directory and remove the sample App.java file + + Example: + + ``` + root@cluster-$hostname-m:$local_path/spark-java-thetasketches/src/main/java/org/apache/datasketches# ls -lrt + total 20 + -rw-r--r-- 1 root root 1920 Feb 21 17:03 ThetaSketchJavaSerializable.java + -rw-r--r-- 1 root root 2459 Feb 21 17:03 Spark2DatasetMapPartitionsReduceJavaSerialization.java + -rw-r--r-- 1 root root 3654 Feb 21 17:03 MapPartitionsToPairReduceByKey.java + -rw-r--r-- 1 root root 3142 Feb 21 17:03 AggregateByKey2.java + -rw-r--r-- 1 root root 2123 Feb 21 17:03 Aggregate.java + ``` + + 1. Compile the code and package a jar: + + ``` + mvn package + ``` + + 1. Verify that the jar was created under `target/` + + ``` + root@cluster-$hostname-m:$local_path/spark-java-thetasketches# ls -lrt target/ + total 48 + drwxr-xr-x 3 root root 4096 Feb 29 18:36 maven-status + drwxr-xr-x 3 root root 4096 Feb 29 18:36 generated-sources + drwxr-xr-x 2 root root 4096 Feb 29 18:36 classes + drwxr-xr-x 3 root root 4096 Feb 29 18:36 generated-test-sources + drwxr-xr-x 3 root root 4096 Feb 29 18:36 test-classes + drwxr-xr-x 2 root root 4096 Feb 29 18:36 surefire-reports + drwxr-xr-x 2 root root 4096 Feb 29 18:36 maven-archiver + -rw-r--r-- 1 root root 17542 Feb 29 18:36 spark-java-thetasketches-1.0-SNAPSHOT.jar + ``` + + 1. Run `spark-submit` with the newly generated jar from the step above. + + ``` + root@cluster-$hostname-m:$local_path/spark-java-thetasketches# spark-submit --jars /usr/lib/datasketches/datasketches-java-3.1.0.jar,/usr/lib/datasketches/datasketches-memory-2.0.0.jar --class Aggregate target/spark-java-thetasketches-1.0-SNAPSHOT.jar + ``` + +### Hive: + +1. cd to `/usr/lib/datasketches` and follow [Datasketches Hive examples](https://datasketches.apache.org/docs/SystemIntegrations/ApacheHiveIntegration.html) + +### Pig: + +1. 
cd to `/usr/lib/datasketches` and follow [Datasketches Pig examples](https://datasketches.apache.org/docs/SystemIntegrations/ApachePigIntegration.html) + diff --git a/datasketches/datasketches.sh b/datasketches/datasketches.sh new file mode 100755 index 000000000..b0aee99c3 --- /dev/null +++ b/datasketches/datasketches.sh @@ -0,0 +1,107 @@
+#!/bin/bash
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script installs the following Datasketches libraries on Dataproc
+# clusters with image version 2.1 and above:
+# datasketches-java - https://github.com/apache/datasketches-java
+# datasketches-memory - https://github.com/apache/datasketches-memory
+# datasketches-hive - https://github.com/apache/datasketches-hive
+# datasketches-pig - https://github.com/apache/datasketches-pig
+# Official documentation link - https://datasketches.apache.org/
+set -euxo pipefail
+
+# Returns 0 when version $1 is strictly lower than version $2, else non-zero.
+# Uses GNU `sort -V` so multi-digit components compare numerically
+# (for example 3.10 is newer than 3.5).
+function version_lt() {
+  [[ "$1" != "$2" ]] && printf '%s\n' "$1" "$2" | sort -V -C
+}
+
+# Detect the Dataproc image version, falling back to DATAPROC_VERSION when
+# DATAPROC_IMAGE_VERSION is not set.
+if [[ ! -v DATAPROC_IMAGE_VERSION && -v DATAPROC_VERSION ]]; then
+  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
+fi
+
+# Fail with a clear message instead of tripping `set -u` further down.
+if [[ -z "${DATAPROC_IMAGE_VERSION:-}" ]]; then
+  echo "Unable to detect the Dataproc image version, exiting!" >&2
+  exit 1
+fi
+
+if version_lt "${DATAPROC_IMAGE_VERSION}" "2.1"; then
+  echo "Datasketches integration is not supported on Dataproc image versions < 2.1"
+  exit 0
+fi
+
+readonly MAVEN_CENTRAL_URI=https://maven-central.storage-download.googleapis.com/maven2
+readonly DS_LIBPATH="/usr/lib/datasketches"
+readonly SPARK_JAVA_EXAMPLE_JAR="gs://spark-lib/datasketches/spark-java-thetasketches-1.0-SNAPSHOT.jar"
+
+# Installed Spark version as major.minor (for example "3.3"). Detection
+# failures are tolerated on purpose: an empty value is treated as older than
+# 3.5 by download_example_jar, so the example jar is still installed.
+SPARK_VERSION=$(spark-submit --version 2>&1 \
+  | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\+\).*/\1/p' \
+  | head -n1) || true
+readonly SPARK_VERSION
+
+# Downloads the Datasketches jars from Maven Central into DS_LIBPATH.
+# Globals: MAVEN_CENTRAL_URI (read), DS_LIBPATH (read)
+# Exits with status 1 if any jar cannot be downloaded.
+function download_libraries() {
+  local -A all_components=([java]="3.1.0" [hive]="1.2.0" [memory]="2.0.0" [pig]="1.1.0")
+  local component version jar
+
+  mkdir -p "${DS_LIBPATH}"
+
+  for component in "${!all_components[@]}"; do
+    version="${all_components[${component}]}"
+    jar="datasketches-${component}-${version}.jar"
+    # Test wget directly: under `set -e` a separate `$?` check is never
+    # reached when the download fails.
+    if wget -P "${DS_LIBPATH}" \
+      "${MAVEN_CENTRAL_URI}/org/apache/datasketches/datasketches-${component}/${version}/${jar}"; then
+      echo "Downloaded ${jar} successfully"
+    else
+      echo "Problem downloading ${jar} from ${MAVEN_CENTRAL_URI}, exiting!" >&2
+      exit 1
+    fi
+  done
+}
+
+# Copies SPARK_JAVA_EXAMPLE_JAR from GCS into DS_LIBPATH when the detected
+# Spark version is older than 3.5; otherwise only prints a notice.
+# Globals: SPARK_VERSION (read), SPARK_JAVA_EXAMPLE_JAR (read), DS_LIBPATH (read)
+# Exits with status 1 if the copy fails.
+function download_example_jar() {
+  if version_lt "${SPARK_VERSION}" "3.5"; then
+    if gsutil cp "${SPARK_JAVA_EXAMPLE_JAR}" "${DS_LIBPATH}"; then
+      echo "Downloaded ${SPARK_JAVA_EXAMPLE_JAR} successfully"
+    else
+      echo "Problem downloading ${SPARK_JAVA_EXAMPLE_JAR} from GCS, exiting!" >&2
+      exit 1
+    fi
+  else
+    echo "Datasketches libraries are already included in Spark version 3.5.0 and onwards! Follow README for examples"
+  fi
+}
+
+# Entry point: installs the libraries, then the optional Spark example jar.
+function main() {
+  download_libraries
+  download_example_jar
+}
+
+main
diff --git a/datasketches/pom.xml b/datasketches/pom.xml new file mode 100644 index 000000000..727fa912f --- /dev/null +++ b/datasketches/pom.xml @@ -0,0 +1,46 @@ + + 4.0.0 + org.apache.datasketches + spark-java-thetasketches + jar + 1.0-SNAPSHOT + spark-java-thetasketches + http://maven.apache.org + + 11 + 11 + + + + junit + junit + 3.8.1 + test + + + org.apache.datasketches + datasketches-memory + 2.0.0 + + + + org.apache.spark + spark-core_2.13 + 3.3.0 + + + + org.apache.datasketches + datasketches-java + 3.1.0 + + + + org.apache.spark + spark-sql_2.13 + 3.3.0 + provided + + +