
Commit

gpt examples
Ishaan-Datta committed Oct 27, 2024
1 parent 62bb3be commit 1ba87b5
Showing 17 changed files with 713 additions and 4 deletions.
41 changes: 41 additions & 0 deletions cpp_wip/cuda_stream_manager.cpp
@@ -0,0 +1,41 @@
// cuda_stream_manager.hpp
#pragma once
#include <cuda_runtime.h>
#include <memory>

class CudaStreamManager {
public:
CudaStreamManager() {
// Create a single CUDA stream
cudaStreamCreate(&stream_);

// Create CUDA events
cudaEventCreate(&preprocess_done_);
cudaEventCreate(&inference_done_);
}

~CudaStreamManager() {
// Destroy CUDA stream and events
cudaStreamDestroy(stream_);
cudaEventDestroy(preprocess_done_);
cudaEventDestroy(inference_done_);
}

cudaStream_t getStream() const {
return stream_;
}

cudaEvent_t& getPreprocessEvent() {
return preprocess_done_;
}

cudaEvent_t& getInferenceEvent() {
return inference_done_;
}

private:
cudaStream_t stream_;
cudaEvent_t preprocess_done_, inference_done_;
};

using CudaStreamManagerPtr = std::shared_ptr<CudaStreamManager>;
23 changes: 23 additions & 0 deletions cpp_wip/preprocessing_node.cpp
@@ -0,0 +1,23 @@
// preprocessing_node.cpp
#include <rclcpp/rclcpp.hpp>
#include "cuda_stream_manager.hpp"

class PreprocessingNode : public rclcpp::Node {
public:
PreprocessingNode(const CudaStreamManagerPtr& cuda_manager)
: Node("preprocessing_node"), cuda_manager_(cuda_manager) {}

void preprocess() {
// Perform GPU preprocessing here using cuda_manager_->getStream()

// Signal that preprocessing is done
cudaEventRecord(cuda_manager_->getPreprocessEvent(), cuda_manager_->getStream());
}

private:
CudaStreamManagerPtr cuda_manager_;
};

// Register as a composable node.
// Note: RCLCPP_COMPONENTS_REGISTER_NODE expects the node's constructor to accept
// rclcpp::NodeOptions, so the CudaStreamManagerPtr-only constructor above is a sketch
// that would need an extra NodeOptions parameter before the component can be loaded.
#include "rclcpp_components/register_node_macro.hpp"
RCLCPP_COMPONENTS_REGISTER_NODE(PreprocessingNode)
64 changes: 63 additions & 1 deletion python_wip/Stream_E2E.py
@@ -158,4 +158,66 @@ def postprocess_callback(self, msg):
# Clean up the IPC memory
cuda.ipc_close_mem_handle(d_output)

# this uses unified memory...
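
# A minimal sketch (an assumption -- the note above does not show it) of one way to get
# CUDA unified/managed memory in CuPy: switch the allocator to cudaMallocManaged so that
# subsequent CuPy arrays are backed by memory the driver can migrate between host and device.
import cupy as cp
import numpy as np

cp.cuda.set_allocator(cp.cuda.malloc_managed)  # later CuPy allocations use managed memory
managed_img = cp.asarray(np.zeros((480, 640, 3), dtype=np.uint8))  # backed by managed memory
managed_img += 1  # ordinary CuPy operations work on the managed buffer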

# `image` is assumed to be a NumPy BGR frame already in host memory
stream = cv2.cuda_Stream()
gpu_image = cv2.cuda_GpuMat()  # allocation happens on upload
gpu_image.upload(image, stream=stream)

# Recolor the image on the GPU
gpu_image = cv2.cuda.cvtColor(gpu_image, cv2.COLOR_BGR2GRAY, stream=stream)

# Perform additional operations like resize
resized_image = cv2.cuda.resize(gpu_image, (640, 480), stream=stream)

# All operations happen within the GPU stream


import cupy as cp
stream = cp.cuda.Stream()

with stream:
# Create a CuPy array (on GPU)
gpu_image = cp.array(np_image) # np_image is the original CPU image

# Normalize the image in-place
cp.subtract(gpu_image, 128, out=gpu_image) # In-place operation

# Transpose the image (returns a view, not an in-place update)
gpu_image = gpu_image.transpose((1, 0, 2))  # Swap the height and width axes


# interoperability:
import cv2
import cupy as cp

# Create a CUDA stream
stream = cv2.cuda_Stream()

# Allocate a GPU Mat in OpenCV
gpu_image = cv2.cuda_GpuMat(image.shape)

# Upload image to GPU in the stream
gpu_image.upload(image, stream=stream)

# Get the raw GPU pointer and wrap it as a CuPy array (no CPU-GPU copy).
# cp.cuda.UnownedMemory wraps an existing device pointer; gpu_image is passed as the
# owner so the OpenCV allocation stays alive. This assumes the GpuMat rows are contiguous.
ptr = gpu_image.cudaPtr()
mem = cp.cuda.UnownedMemory(ptr, image.nbytes, gpu_image)
gpu_image_cupy = cp.ndarray(image.shape, dtype=cp.uint8, memptr=cp.cuda.MemoryPointer(mem, 0))

# Perform CuPy operations such as normalization (this creates a new float32 array, not an in-place update)
gpu_image_cupy = gpu_image_cupy.astype(cp.float32) / 255.0

# retrieving CUDA stream handle from OpenCV CUDA stream object:
stream = cv2.cuda.Stream() # OpenCV CUDA stream
cuda_stream = stream.cudaPtr() # Extract the CUDA stream handle

# pass it to TensorRT just like you would pass a normal CUDA stream handle.
# Note: pycuda's memcpy_dtod_async expects a pycuda.driver.Stream object rather than a raw
# handle, so the device-to-device copies below use CuPy's runtime API with the handle instead.
import cupy as cp
import numpy as np

# Assuming d_input_ptr, self.d_input, self.d_output, self.input_shape, self.output_shape,
# and self.exec_context are already defined.
nbytes_in = int(np.prod(self.input_shape)) * np.dtype(np.float32).itemsize
cp.cuda.runtime.memcpyAsync(int(self.d_input), int(d_input_ptr), nbytes_in, cp.cuda.runtime.memcpyDeviceToDevice, cuda_stream)  # Copy input data into the TensorRT input buffer
self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=cuda_stream)  # Execute inference asynchronously on the OpenCV CUDA stream
output = cp.empty(self.output_shape, dtype=cp.float32)
cp.cuda.runtime.memcpyAsync(int(output.data.ptr), int(self.d_output), output.nbytes, cp.cuda.runtime.memcpyDeviceToDevice, cuda_stream)  # Copy the result into a CuPy array
cp.cuda.runtime.streamSynchronize(cuda_stream)  # Wait for all work queued on the stream
42 changes: 42 additions & 0 deletions python_wip/cuda_download_1.py
@@ -0,0 +1,42 @@
import pyzed.sl as sl

# Create a ZED Camera object
zed = sl.Camera()

# Create InitParameters object and set configuration parameters
init_params = sl.InitParameters()
init_params.camera_resolution = sl.RESOLUTION.HD720 # Set resolution
init_params.depth_mode = sl.DEPTH_MODE.ULTRA # Set depth mode

# Open the camera
status = zed.open(init_params)
if status != sl.ERROR_CODE.SUCCESS:
print(f"Camera failed to open: {status}")
exit(1)

# Create a Mat object for the image (GPU memory type)
image_gpu = sl.Mat(zed.get_camera_information().camera_resolution.width,
zed.get_camera_information().camera_resolution.height,
sl.MAT_TYPE.U8_C4, sl.MEM.GPU)

# Capture an image frame
runtime_params = sl.RuntimeParameters()

if zed.grab(runtime_params) == sl.ERROR_CODE.SUCCESS:
# Retrieve image directly into GPU memory
zed.retrieve_image(image_gpu, sl.VIEW.LEFT, sl.MEM.GPU)

# Now `image_gpu` holds the image in GPU memory
print("Image captured and stored in CUDA memory")

# Create a CPU Mat to store the image
image_cpu = sl.Mat()

# Copy image from GPU to CPU while the camera (and its CUDA context) is still open
image_gpu.copy_to(image_cpu)

# Save the image (this is in CPU memory now)
image_cpu.write("image_from_cuda.png")

# Close the camera
zed.close()
51 changes: 51 additions & 0 deletions python_wip/cuda_stream_inference.py
@@ -0,0 +1,51 @@
# inference_node.py
import rclpy
from rclpy.node import Node
from cuda_manager import CudaStreamManager
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context (assumed necessary unless CudaStreamManager already does this)
import numpy as np

class InferenceNode(Node):
def __init__(self, cuda_manager):
super().__init__('inference_node')
self.cuda_manager = cuda_manager

def infer(self):
self.get_logger().info("Waiting for preprocessing to complete...")
self.cuda_manager.get_preprocess_event().synchronize()
self.get_logger().info("Starting inference on GPU...")

# Simulate inference on GPU
data = np.random.randn(1024, 1024).astype(np.float32)
gpu_data = cuda.mem_alloc(data.nbytes)
cuda.memcpy_htod_async(gpu_data, data, self.cuda_manager.get_stream())

# Signal inference completion
self.cuda_manager.get_inference_event().record(self.cuda_manager.get_stream())
self.get_logger().info("Inference complete.")

# post processing:
# postprocessing_node.py
import rclpy
from rclpy.node import Node
from cuda_manager import CudaStreamManager
import pycuda.driver as cuda
import numpy as np

class PostprocessingNode(Node):
def __init__(self, cuda_manager):
super().__init__('postprocessing_node')
self.cuda_manager = cuda_manager

def postprocess(self):
self.get_logger().info("Waiting for inference to complete...")
self.cuda_manager.get_inference_event().synchronize()
self.get_logger().info("Starting postprocessing on GPU...")

# Simulate postprocessing on GPU
data = np.random.randn(1024, 1024).astype(np.float32)
gpu_data = cuda.mem_alloc(data.nbytes)
cuda.memcpy_htod_async(gpu_data, data, self.cuda_manager.get_stream())

# Assume postprocessing is complete
self.get_logger().info("Postprocessing complete.")
38 changes: 38 additions & 0 deletions python_wip/function_tracing.py
@@ -0,0 +1,38 @@
import time
from functools import wraps
import rclpy
from rclpy.node import Node
from std_msgs.msg import String

def trace(func):
@wraps(func)
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f'Function {func.__name__} took {end_time - start_time} seconds')
return result
return wrapper

class MinimalPublisher(Node):
def __init__(self):
super().__init__('minimal_publisher')
self.publisher_ = self.create_publisher(String, 'topic', 10)
self.timer = self.create_timer(0.5, self.timer_callback)

@trace
def timer_callback(self):
msg = String()
msg.data = 'Hello World: %s' % time.time()
self.publisher_.publish(msg)
self.get_logger().info('Publishing: "%s"' % msg.data)

def main(args=None):
rclpy.init(args=args)
minimal_publisher = MinimalPublisher()
rclpy.spin(minimal_publisher)
minimal_publisher.destroy_node()
rclpy.shutdown()

if __name__ == '__main__':
main()
35 changes: 34 additions & 1 deletion python_wip/inference_comparison.py
@@ -252,4 +252,37 @@ def infer_with_torch_trt(self):
output_torch_trt, torch_trt_time = comparison.infer_with_torch_trt()
print(f"Torch2TRT Inference Time: {torch_trt_time:.6f} seconds")

## compare bounding box output to the expected from the file...

# Warmup Phase:
# Each engine runs a warmup phase with 10 inference passes using random input. This ensures the timing reflects actual performance after the engine is "warmed up."
# Buffer Allocation:
# For TensorRT-based models (trt_normal and trt_stripped), buffers for input and output tensors are allocated using CUDA. Host-pinned memory is allocated to enable efficient asynchronous execution.
# Inference Timing:
# For each inference method (normal TensorRT, stripped TensorRT, and torch2trt), the inference time is measured after warmup. The random input is transferred to the device, and then the output is copied back to the host after inference.
# Execution Context:
# Each TensorRT-based model (trt_normal, trt_stripped) uses an execution context created from the engine.
# torch2trt:
# torch2trt models are loaded using TRTModule, and inference is performed using PyTorch tensors.
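
# A minimal sketch of the buffer-allocation / warmup / timing pattern described above,
# assuming `engine` is an already-deserialized TensorRT engine and the input/output shapes
# are known. The names here are illustrative, not the repo's actual API.
import time
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context

def time_trt_inference(engine, input_shape, output_shape, warmup=10, runs=50):
    context = engine.create_execution_context()
    stream = cuda.Stream()
    # Host-pinned buffers allow genuinely asynchronous host<->device copies
    h_input = cuda.pagelocked_empty(int(np.prod(input_shape)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(int(np.prod(output_shape)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)

    def run_once():
        h_input[:] = np.random.randn(*input_shape).astype(np.float32).ravel()
        cuda.memcpy_htod_async(d_input, h_input, stream)
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        stream.synchronize()

    for _ in range(warmup):  # warmup passes are not timed
        run_once()
    start = time.time()
    for _ in range(runs):
        run_once()
    return (time.time() - start) / runs  # average seconds per inference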

# Torch-TensorRT (torch_tensorrt) example:
import torch
import torch_tensorrt as torch_trt

# Sample PyTorch model (ResNet18 in this case)
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
model.eval().cuda()

# Example input tensor
input_data = torch.randn((1, 3, 224, 224)).cuda()

# Convert the PyTorch model to a Torch-TensorRT optimized model
trt_model = torch_trt.compile(model,
inputs=[torch_trt.Input(input_data.shape)],
enabled_precisions={torch.float, torch.half}) # Use FP32 and FP16

# Run inference
with torch.no_grad():
output = trt_model(input_data)

print(output)
38 changes: 38 additions & 0 deletions python_wip/integrated_serial.py
@@ -0,0 +1,38 @@
import rclpy
from rclpy.node import Node
import serial
import time
from std_msgs.msg import String # Example message type

class ArduinoSerialNode(Node):
def __init__(self):
super().__init__('arduino_serial_node')
self.subscription = self.create_subscription(
String,
'your_topic_name',
self.listener_callback,
10
)
self.subscription # prevent unused variable warning

# Open serial port to Arduino
self.ser = serial.Serial('/dev/ttyUSB0', 115200, timeout=1) # Adjust USB port as needed
time.sleep(2) # Wait for Arduino to reset

def listener_callback(self, msg):
# Serialize and send the message to Arduino
serialized_msg = msg.data + '\n' # Add a newline as a delimiter
self.ser.write(serialized_msg.encode())
self.get_logger().info('Sent to Arduino: "%s"' % msg.data)

def main(args=None):
rclpy.init(args=args)
arduino_serial_node = ArduinoSerialNode()
rclpy.spin(arduino_serial_node)
arduino_serial_node.destroy_node()
rclpy.shutdown()

if __name__ == '__main__':
main()

# pip3 install pyserial
43 changes: 43 additions & 0 deletions python_wip/interop-cupy-cv.py
@@ -0,0 +1,43 @@
import cupy as cp
import cv2
import numpy as np

# Create a custom CUDA stream using CuPy
cuda_stream = cp.cuda.Stream()

# Allocate a GPU array using CuPy
cupy_array = cp.random.random((224, 224, 3), dtype=cp.float32)

# Perform some CuPy operations on the custom stream
with cuda_stream:
cupy_array = cp.sqrt(cupy_array) # Example CuPy operation

# Sync the stream to ensure CuPy operations are done before OpenCV operation
cuda_stream.synchronize()

# Convert CuPy array to a NumPy array (on the CPU)
# OpenCV doesn't natively support CuPy arrays, so transfer data back to host
numpy_array = cp.asnumpy(cupy_array)

# Convert NumPy array to OpenCV's GPU Mat
gpu_mat = cv2.cuda_GpuMat()
gpu_mat.upload(numpy_array)

# Perform an OpenCV CUDA operation
# OpenCV CUDA functions take an optional cv2.cuda.Stream argument, not a CuPy stream,
# so this call runs on OpenCV's default stream
gpu_mat = cv2.cuda.resize(gpu_mat, (128, 128))

# Optionally, download the result back to the CPU
result = gpu_mat.download()

# cropping in cupy:
import cupy as cp

# Assume you have a CuPy array (image) of shape (height, width, channels)
image = cp.random.rand(224, 224, 3).astype(cp.float32) # Example image

# Define the crop region (x, y, width, height)
x, y, w, h = 50, 50, 100, 100

# Crop the image (this works similarly to NumPy slicing)
cropped_image = image[y:y+h, x:x+w, :]