
Commit

gpt examples
Ishaan-Datta committed Oct 27, 2024
1 parent 62bb3be commit 1ba87b5
Showing 17 changed files with 713 additions and 4 deletions.
41 changes: 41 additions & 0 deletions cpp_wip/cuda_stream_manager.cpp
@@ -0,0 +1,41 @@
// cuda_stream_manager.hpp
#pragma once
#include <cuda_runtime.h>
#include <memory>

class CudaStreamManager {
public:
CudaStreamManager() {
// Create a single CUDA stream
cudaStreamCreate(&stream_);

// Create CUDA events
cudaEventCreate(&preprocess_done_);
cudaEventCreate(&inference_done_);
}

~CudaStreamManager() {
// Destroy CUDA stream and events
cudaStreamDestroy(stream_);
cudaEventDestroy(preprocess_done_);
cudaEventDestroy(inference_done_);
}

cudaStream_t getStream() const {
return stream_;
}

cudaEvent_t& getPreprocessEvent() {
return preprocess_done_;
}

cudaEvent_t& getInferenceEvent() {
return inference_done_;
}

private:
cudaStream_t stream_;
cudaEvent_t preprocess_done_, inference_done_;
};

using CudaStreamManagerPtr = std::shared_ptr<CudaStreamManager>;
23 changes: 23 additions & 0 deletions cpp_wip/preprocessing_node.cpp
@@ -0,0 +1,23 @@
// preprocessing_node.cpp
#include <rclcpp/rclcpp.hpp>
#include "cuda_stream_manager.hpp"

class PreprocessingNode : public rclcpp::Node {
public:
PreprocessingNode(const CudaStreamManagerPtr& cuda_manager)
: Node("preprocessing_node"), cuda_manager_(cuda_manager) {}

void preprocess() {
// Perform GPU preprocessing here using cuda_manager_->getStream()

// Signal that preprocessing is done
cudaEventRecord(cuda_manager_->getPreprocessEvent(), cuda_manager_->getStream());
}

private:
CudaStreamManagerPtr cuda_manager_;
};

// Register as a composable node.
// Note: RCLCPP_COMPONENTS_REGISTER_NODE expects the node's constructor to accept
// rclcpp::NodeOptions, so the CudaStreamManagerPtr-only constructor above is a sketch
// that would need an extra NodeOptions parameter before the component can be loaded.
#include "rclcpp_components/register_node_macro.hpp"
RCLCPP_COMPONENTS_REGISTER_NODE(PreprocessingNode)
64 changes: 63 additions & 1 deletion python_wip/Stream_E2E.py
@@ -158,4 +158,66 @@ def postprocess_callback(self, msg):
# Clean up the IPC memory
cuda.ipc_close_mem_handle(d_output)

# this uses unified memory...
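
# A minimal sketch (an assumption -- the note above does not show it) of one way to get
# CUDA unified/managed memory in CuPy: switch the allocator to cudaMallocManaged so that
# subsequent CuPy arrays are backed by memory the driver can migrate between host and device.
import cupy as cp
import numpy as np

cp.cuda.set_allocator(cp.cuda.malloc_managed)  # later CuPy allocations use managed memory
managed_img = cp.asarray(np.zeros((480, 640, 3), dtype=np.uint8))  # backed by managed memory
managed_img += 1  # ordinary CuPy operations work on the managed buffer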

# `image` is assumed to be a NumPy BGR frame already in host memory
stream = cv2.cuda_Stream()
gpu_image = cv2.cuda_GpuMat()  # allocation happens on upload
gpu_image.upload(image, stream=stream)

# Recolor the image on the GPU
gpu_image = cv2.cuda.cvtColor(gpu_image, cv2.COLOR_BGR2GRAY, stream=stream)

# Perform additional operations like resize
resized_image = cv2.cuda.resize(gpu_image, (640, 480), stream=stream)

# All operations happen within the GPU stream


import cupy as cp
stream = cp.cuda.Stream()

with stream:
# Create a CuPy array (on GPU)
gpu_image = cp.array(np_image) # np_image is the original CPU image

# Normalize the image in-place
cp.subtract(gpu_image, 128, out=gpu_image) # In-place operation

# Transpose the image (returns a view, not an in-place update)
gpu_image = gpu_image.transpose((1, 0, 2))  # Swap the height and width axes


# interoperability:
import cv2
import cupy as cp

# Create a CUDA stream
stream = cv2.cuda_Stream()

# Allocate a GPU Mat in OpenCV
gpu_image = cv2.cuda_GpuMat(image.shape)

# Upload image to GPU in the stream
gpu_image.upload(image, stream=stream)

# Get the raw GPU pointer and wrap it as a CuPy array (no CPU-GPU copy).
# cp.cuda.UnownedMemory wraps an existing device pointer; gpu_image is passed as the
# owner so the OpenCV allocation stays alive. This assumes the GpuMat rows are contiguous.
ptr = gpu_image.cudaPtr()
mem = cp.cuda.UnownedMemory(ptr, image.nbytes, gpu_image)
gpu_image_cupy = cp.ndarray(image.shape, dtype=cp.uint8, memptr=cp.cuda.MemoryPointer(mem, 0))

# Perform CuPy operations such as normalization (this creates a new float32 array, not an in-place update)
gpu_image_cupy = gpu_image_cupy.astype(cp.float32) / 255.0

# retrieving CUDA stream handle from OpenCV CUDA stream object:
stream = cv2.cuda.Stream() # OpenCV CUDA stream
cuda_stream = stream.cudaPtr() # Extract the CUDA stream handle

# pass it to TensorRT just like you would pass a normal CUDA stream handle.
# Note: pycuda's memcpy_dtod_async expects a pycuda.driver.Stream object rather than a raw
# handle, so the device-to-device copies below use CuPy's runtime API with the handle instead.
import cupy as cp
import numpy as np

# Assuming d_input_ptr, self.d_input, self.d_output, self.input_shape, self.output_shape,
# and self.exec_context are already defined.
nbytes_in = int(np.prod(self.input_shape)) * np.dtype(np.float32).itemsize
cp.cuda.runtime.memcpyAsync(int(self.d_input), int(d_input_ptr), nbytes_in, cp.cuda.runtime.memcpyDeviceToDevice, cuda_stream)  # Copy input data into the TensorRT input buffer
self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=cuda_stream)  # Execute inference asynchronously on the OpenCV CUDA stream
output = cp.empty(self.output_shape, dtype=cp.float32)
cp.cuda.runtime.memcpyAsync(int(output.data.ptr), int(self.d_output), output.nbytes, cp.cuda.runtime.memcpyDeviceToDevice, cuda_stream)  # Copy the result into a CuPy array
cp.cuda.runtime.streamSynchronize(cuda_stream)  # Wait for all work queued on the stream
42 changes: 42 additions & 0 deletions python_wip/cuda_download_1.py
@@ -0,0 +1,42 @@
import pyzed.sl as sl

# Create a ZED Camera object
zed = sl.Camera()

# Create InitParameters object and set configuration parameters
init_params = sl.InitParameters()
init_params.camera_resolution = sl.RESOLUTION.HD720 # Set resolution
init_params.depth_mode = sl.DEPTH_MODE.ULTRA # Set depth mode

# Open the camera
status = zed.open(init_params)
if status != sl.ERROR_CODE.SUCCESS:
print(f"Camera failed to open: {status}")
exit(1)

# Create a Mat object for the image (GPU memory type)
image_gpu = sl.Mat(zed.get_camera_information().camera_resolution.width,
zed.get_camera_information().camera_resolution.height,
sl.MAT_TYPE.U8_C4, sl.MEM.GPU)

# Capture an image frame
runtime_params = sl.RuntimeParameters()

if zed.grab(runtime_params) == sl.ERROR_CODE.SUCCESS:
# Retrieve image directly into GPU memory
zed.retrieve_image(image_gpu, sl.VIEW.LEFT, sl.MEM.GPU)

# Now `image_gpu` holds the image in GPU memory
print("Image captured and stored in CUDA memory")

# Create a CPU Mat to store the image
image_cpu = sl.Mat()

# Copy image from GPU to CPU while the camera (and its CUDA context) is still open
image_gpu.copy_to(image_cpu)

# Save the image (this is in CPU memory now)
image_cpu.write("image_from_cuda.png")

# Close the camera
zed.close()
51 changes: 51 additions & 0 deletions python_wip/cuda_stream_inference.py
@@ -0,0 +1,51 @@
# inference_node.py
import rclpy
from rclpy.node import Node
from cuda_manager import CudaStreamManager
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context (assumed necessary unless CudaStreamManager already does this)
import numpy as np

class InferenceNode(Node):
def __init__(self, cuda_manager):
super().__init__('inference_node')
self.cuda_manager = cuda_manager

def infer(self):
self.get_logger().info("Waiting for preprocessing to complete...")
self.cuda_manager.get_preprocess_event().synchronize()
self.get_logger().info("Starting inference on GPU...")

# Simulate inference on GPU
data = np.random.randn(1024, 1024).astype(np.float32)
gpu_data = cuda.mem_alloc(data.nbytes)
cuda.memcpy_htod_async(gpu_data, data, self.cuda_manager.get_stream())

# Signal inference completion
self.cuda_manager.get_inference_event().record(self.cuda_manager.get_stream())
self.get_logger().info("Inference complete.")

# post processing:
# postprocessing_node.py
import rclpy
from rclpy.node import Node
from cuda_manager import CudaStreamManager
import pycuda.driver as cuda
import numpy as np

class PostprocessingNode(Node):
def __init__(self, cuda_manager):
super().__init__('postprocessing_node')
self.cuda_manager = cuda_manager

def postprocess(self):
self.get_logger().info("Waiting for inference to complete...")
self.cuda_manager.get_inference_event().synchronize()
self.get_logger().info("Starting postprocessing on GPU...")

# Simulate postprocessing on GPU
data = np.random.randn(1024, 1024).astype(np.float32)
gpu_data = cuda.mem_alloc(data.nbytes)
cuda.memcpy_htod_async(gpu_data, data, self.cuda_manager.get_stream())

# Assume postprocessing is complete
self.get_logger().info("Postprocessing complete.")
38 changes: 38 additions & 0 deletions python_wip/function_tracing.py
@@ -0,0 +1,38 @@
import time
from functools import wraps
import rclpy
from rclpy.node import Node
from std_msgs.msg import String

def trace(func):
@wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f'Function {func.__name__} took {end_time - start_time} seconds')
return result
return wrapper

class MinimalPublisher(Node):
def __init__(self):
super().__init__('minimal_publisher')
self.publisher_ = self.create_publisher(String, 'topic', 10)
self.timer = self.create_timer(0.5, self.timer_callback)

@trace
def timer_callback(self):
msg = String()
msg.data = 'Hello World: %s' % time.time()
self.publisher_.publish(msg)
self.get_logger().info('Publishing: "%s"' % msg.data)

def main(args=None):
rclpy.init(args=args)
minimal_publisher = MinimalPublisher()
rclpy.spin(minimal_publisher)
minimal_publisher.destroy_node()
rclpy.shutdown()

if __name__ == '__main__':
main()
35 changes: 34 additions & 1 deletion python_wip/inference_comparison.py
@@ -252,4 +252,37 @@ def infer_with_torch_trt(self):
output_torch_trt, torch_trt_time = comparison.infer_with_torch_trt()
print(f"Torch2TRT Inference Time: {torch_trt_time:.6f} seconds")

## compare bounding box output to the expected from the file...

# Warmup Phase:
# Each engine runs a warmup phase with 10 inference passes using random input. This ensures the timing reflects actual performance after the engine is "warmed up."
# Buffer Allocation:
# For TensorRT-based models (trt_normal and trt_stripped), buffers for input and output tensors are allocated using CUDA. Host-pinned memory is allocated to enable efficient asynchronous execution.
# Inference Timing:
# For each inference method (normal TensorRT, stripped TensorRT, and torch2trt), the inference time is measured after warmup. The random input is transferred to the device, and then the output is copied back to the host after inference.
# Execution Context:
# Each TensorRT-based model (trt_normal, trt_stripped) uses an execution context created from the engine.
# torch2trt:
# torch2trt models are loaded using TRTModule, and inference is performed using PyTorch tensors.
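
# A minimal sketch of the buffer-allocation / warmup / timing pattern described above,
# assuming `engine` is an already-deserialized TensorRT engine and the input/output shapes
# are known. The names here are illustrative, not the repo's actual API.
import time
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context

def time_trt_inference(engine, input_shape, output_shape, warmup=10, runs=50):
    context = engine.create_execution_context()
    stream = cuda.Stream()
    # Host-pinned buffers allow genuinely asynchronous host<->device copies
    h_input = cuda.pagelocked_empty(int(np.prod(input_shape)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(int(np.prod(output_shape)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    def run_once():
        h_input[:] = np.random.randn(*input_shape).astype(np.float32).ravel()
        cuda.memcpy_htod_async(d_input, h_input, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

    for _ in range(warmup):  # warmup passes are not timed
        run_once()
    start = time.time()
    for _ in range(runs):
        run_once()
    return (time.time() - start) / runs  # average seconds per inference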

# Torch-TensorRT (torch_tensorrt) example:
import torch
import torch_tensorrt as torch_trt

# Sample PyTorch model (ResNet18 in this case)
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
model.eval().cuda()

# Example input tensor
input_data = torch.randn((1, 3, 224, 224)).cuda()

# Convert the PyTorch model to a Torch-TensorRT optimized model
trt_model = torch_trt.compile(model,
inputs=[torch_trt.Input(input_data.shape)],
enabled_precisions={torch.float, torch.half}) # Use FP32 and FP16

# Run inference
with torch.no_grad():
output = trt_model(input_data)

print(output)
38 changes: 38 additions & 0 deletions python_wip/integrated_serial.py
@@ -0,0 +1,38 @@
import rclpy
from rclpy.node import Node
import serial
import time
from std_msgs.msg import String # Example message type

class ArduinoSerialNode(Node):
def __init__(self):
super().__init__('arduino_serial_node')
self.subscription = self.create_subscription(
String,
'your_topic_name',
self.listener_callback,
10
)
self.subscription # prevent unused variable warning

# Open serial port to Arduino
self.ser = serial.Serial('/dev/ttyUSB0', 115200, timeout=1) # Adjust USB port as needed
time.sleep(2) # Wait for Arduino to reset

def listener_callback(self, msg):
# Serialize and send the message to Arduino
serialized_msg = msg.data + '\n' # Add a newline as a delimiter
self.ser.write(serialized_msg.encode())
self.get_logger().info('Sent to Arduino: "%s"' % msg.data)

def main(args=None):
rclpy.init(args=args)
arduino_serial_node = ArduinoSerialNode()
rclpy.spin(arduino_serial_node)
arduino_serial_node.destroy_node()
rclpy.shutdown()

if __name__ == '__main__':
main()

# pip3 install pyserial
43 changes: 43 additions & 0 deletions python_wip/interop-cupy-cv.py
@@ -0,0 +1,43 @@
import cupy as cp
import cv2
import numpy as np

# Create a custom CUDA stream using CuPy
cuda_stream = cp.cuda.Stream()

# Allocate a GPU array using CuPy
cupy_array = cp.random.random((224, 224, 3), dtype=cp.float32)

# Perform some CuPy operations on the custom stream
with cuda_stream:
cupy_array = cp.sqrt(cupy_array) # Example CuPy operation

# Sync the stream to ensure CuPy operations are done before OpenCV operation
cuda_stream.synchronize()

# Convert CuPy array to a NumPy array (on the CPU)
# OpenCV doesn't natively support CuPy arrays, so transfer data back to host
numpy_array = cp.asnumpy(cupy_array)

# Convert NumPy array to OpenCV's GPU Mat
gpu_mat = cv2.cuda_GpuMat()
gpu_mat.upload(numpy_array)

# Perform an OpenCV CUDA operation
# OpenCV CUDA functions take an optional cv2.cuda.Stream argument, not a CuPy stream,
# so this call runs on OpenCV's default stream
gpu_mat = cv2.cuda.resize(gpu_mat, (128, 128))

# Optionally, download the result back to the CPU
result = gpu_mat.download()

# cropping in cupy:
import cupy as cp

# Assume you have a CuPy array (image) of shape (height, width, channels)
image = cp.random.rand(224, 224, 3).astype(cp.float32) # Example image

# Define the crop region (x, y, width, height)
x, y, w, h = 50, 50, 100, 100

# Crop the image (this works similarly to NumPy slicing)
cropped_image = image[y:y+h, x:x+w, :]