From afb648ad846b039b9c58863522c127cf1195efc1 Mon Sep 17 00:00:00 2001 From: Yan Feng Date: Thu, 6 Feb 2025 13:20:07 +0800 Subject: [PATCH 1/5] Expose stripe_size_rows setting Signed-off-by: Yan Feng --- .../cudf/CompressionMetadataWriterOptions.java | 12 ++++++++++++ java/src/main/java/ai/rapids/cudf/Table.java | 4 ++++ java/src/main/native/src/TableJni.cpp | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index 27eb1be565a..c9b607fa677 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -24,11 +24,13 @@ public class CompressionMetadataWriterOptions extends ColumnWriterOptions.StructColumnWriterOptions { private final CompressionType compressionType; private final Map metadata; + private int stripeSizeRows; protected CompressionMetadataWriterOptions(Builder builder) { super(builder); this.compressionType = builder.compressionType; this.metadata = builder.metadata; + this.stripeSizeRows = builder.stripeSizeRows; } @Override @@ -96,10 +98,15 @@ public int getTopLevelChildren() { return childColumnOptions.length; } + public int getStripeSizeRows() { + return stripeSizeRows; + } + public abstract static class Builder extends AbstractStructBuilder { final Map metadata = new LinkedHashMap<>(); CompressionType compressionType = CompressionType.AUTO; + int stripeSizeRows = 1000000; /** * Add a metadata key and a value @@ -124,5 +131,10 @@ public T withCompressionType(CompressionType compression) { this.compressionType = compression; return (T) this; } + + public T withStripeSizeRows(int stripeSizeRows) { + this.stripeSizeRows = stripeSizeRows; + return (T) this; + } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 298f2cff6f3..8aed23a27b8 100644 
--- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -475,6 +475,7 @@ private static native long writeORCFileBegin(String[] columnNames, int compression, int[] precisions, boolean[] isMapValues, + int stripe_size_rows, String filename) throws CudfException; /** @@ -501,6 +502,7 @@ private static native long writeORCBufferBegin(String[] columnNames, int compression, int[] precisions, boolean[] isMapValues, + int stripe_size_rows, HostBufferConsumer consumer, HostMemoryAllocator hostMemoryAllocator ) throws CudfException; @@ -1823,6 +1825,7 @@ private ORCTableWriter(ORCWriterOptions options, File outputFile) { options.getCompressionType().nativeId, options.getFlatPrecision(), options.getFlatIsMap(), + options.getStripeSizeRows(), outputFile.getAbsolutePath())); this.consumer = null; } @@ -1838,6 +1841,7 @@ private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer, options.getCompressionType().nativeId, options.getFlatPrecision(), options.getFlatIsMap(), + options.getStripeSizeRows(), consumer, hostMemoryAllocator)); this.consumer = consumer; } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 50c6ae842f4..b46d29d8e2c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1487,6 +1487,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv* env, jstring j_true_value, jstring j_false_value, jint j_quote_style, + jint j_stripe_size_rows, jstring j_output_path) { JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); @@ -2480,6 +2481,7 @@ Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env, jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, + jint j_stripe_size_rows, jobject consumer, jobject host_memory_allocator) { @@ -2535,6 +2537,7 @@ Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env, .enable_statistics(ORC_STATISTICS_ROW_GROUP) 
.key_value_metadata(kv_metadata) .compression_statistics(stats) + .stripe_size_rows(j_stripe_size_rows) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle* ret = new cudf::jni::native_orc_writer_handle( @@ -2555,6 +2558,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env, jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, + jint j_stripe_size_rows, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); @@ -2606,6 +2610,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env, .enable_statistics(ORC_STATISTICS_ROW_GROUP) .key_value_metadata(kv_metadata) .compression_statistics(stats) + .stripe_size_rows(j_stripe_size_rows) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle* ret = From 1a4b078abccd88d235b1d599b88b653f48223f8e Mon Sep 17 00:00:00 2001 From: Yan Feng Date: Thu, 6 Feb 2025 13:49:14 +0800 Subject: [PATCH 2/5] Remove incorrect stripe_size_rows Signed-off-by: Yan Feng --- java/src/main/native/src/TableJni.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b46d29d8e2c..e1b487b1f7c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1487,7 +1487,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv* env, jstring j_true_value, jstring j_false_value, jint j_quote_style, - jint j_stripe_size_rows, jstring j_output_path) { JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", ); From 8010d2ea2b86ab112c235e6b0ccedef44502f94b Mon Sep 17 00:00:00 2001 From: ustcfy <96854327+ustcfy@users.noreply.github.com> Date: Thu, 6 Feb 2025 16:39:15 +0800 Subject: [PATCH 3/5] Update java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java Co-authored-by: Liangcai Li --- 
.../java/ai/rapids/cudf/CompressionMetadataWriterOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index c9b607fa677..cade4202a4d 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -24,7 +24,7 @@ public class CompressionMetadataWriterOptions extends ColumnWriterOptions.StructColumnWriterOptions { private final CompressionType compressionType; private final Map metadata; - private int stripeSizeRows; + private final int stripeSizeRows; protected CompressionMetadataWriterOptions(Builder builder) { super(builder); From 4f7a36b1bab7bed17701f955950f687d506a3b4d Mon Sep 17 00:00:00 2001 From: ustcfy <96854327+ustcfy@users.noreply.github.com> Date: Thu, 6 Feb 2025 16:45:21 +0800 Subject: [PATCH 4/5] Update java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java --- .../java/ai/rapids/cudf/CompressionMetadataWriterOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index cade4202a4d..eb9f06e3d42 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -24,7 +24,7 @@ public class CompressionMetadataWriterOptions extends ColumnWriterOptions.StructColumnWriterOptions { private final CompressionType compressionType; private final Map metadata; - private final int stripeSizeRows; + private final int stripeSizeRows; protected CompressionMetadataWriterOptions(Builder builder) { super(builder); From f8fced5d41fa3e20b516f8db5b795c0e23cc7ffe Mon Sep 17 00:00:00 2001 From: Yan Feng Date: Fri, 7 Feb 2025 
14:19:54 +0800 Subject: [PATCH 5/5] Move stripeSizeRows to class ORCWriterOptions Signed-off-by: Yan Feng --- .../cudf/CompressionMetadataWriterOptions.java | 12 ------------ .../java/ai/rapids/cudf/ORCWriterOptions.java | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java index eb9f06e3d42..27eb1be565a 100644 --- a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -24,13 +24,11 @@ public class CompressionMetadataWriterOptions extends ColumnWriterOptions.StructColumnWriterOptions { private final CompressionType compressionType; private final Map metadata; - private final int stripeSizeRows; protected CompressionMetadataWriterOptions(Builder builder) { super(builder); this.compressionType = builder.compressionType; this.metadata = builder.metadata; - this.stripeSizeRows = builder.stripeSizeRows; } @Override @@ -98,15 +96,10 @@ public int getTopLevelChildren() { return childColumnOptions.length; } - public int getStripeSizeRows() { - return stripeSizeRows; - } - public abstract static class Builder extends AbstractStructBuilder { final Map metadata = new LinkedHashMap<>(); CompressionType compressionType = CompressionType.AUTO; - int stripeSizeRows = 1000000; /** * Add a metadata key and a value @@ -131,10 +124,5 @@ public T withCompressionType(CompressionType compression) { this.compressionType = compression; return (T) this; } - - public T withStripeSizeRows(int stripeSizeRows) { - this.stripeSizeRows = stripeSizeRows; - return (T) this; - } } } diff --git a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java index 372f919532e..793344e8dec 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java +++ 
b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java @@ -23,17 +23,34 @@ * that will be used by the ORC writer to write the file. */ public class ORCWriterOptions extends CompressionMetadataWriterOptions { + private int stripeSizeRows; private ORCWriterOptions(Builder builder) { super(builder); + this.stripeSizeRows = builder.stripeSizeRows; } public static Builder builder() { return new Builder(); } + public int getStripeSizeRows() { + return stripeSizeRows; + } + public static class Builder extends CompressionMetadataWriterOptions.Builder { + // 1M rows is the default ORC stripe size, as defined in cudf/cpp/include/cudf/io/orc.hpp + private int stripeSizeRows = 1000000; + + public Builder withStripeSizeRows(int stripeSizeRows) { + // maximum stripe size cannot be smaller than 512 + if (stripeSizeRows < 512 || stripeSizeRows > 1000000) { + throw new IllegalArgumentException("Stripe size rows must be between 512 and 1M"); + } + this.stripeSizeRows = stripeSizeRows; + return this; + } public ORCWriterOptions build() { return new ORCWriterOptions(this);