Skip to content

Commit

Permalink
calibration tools repaired, future POC examples added, pointer testing
Browse files Browse the repository at this point in the history
  • Loading branch information
Ishaan-Datta committed Sep 24, 2024
1 parent aabd398 commit 134df86
Show file tree
Hide file tree
Showing 20 changed files with 964 additions and 532 deletions.
3 changes: 3 additions & 0 deletions calibration_tools/roi_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
velocity = 1
shift_constant = 1

def print_camera_information(cam):
    """Print the serial number reported by ``cam.get_camera_information()``.

    Args:
        cam: Object exposing ``get_camera_information()``, whose result has a
            ``serial_number`` attribute.
    """
    info = cam.get_camera_information()
    # The trailing "\n" in the message plus print()'s own newline leaves a
    # blank line after the serial number.
    print(f"Serial number: {info.serial_number}.\n")

def initialize_image_source(source_type="static_image", image_path='C:/Users/ishaa/Coding Projects/Applied-AI/ROS/assets/maize'):
if source_type == "static_image":
if not os.path.exists(image_path):
Expand Down
46 changes: 45 additions & 1 deletion conversion_tools/ONNX_GS.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,48 @@ def optimize_onnx(model_path="/home/user/Downloads/model.onnx"):
parser.add_argument('--model_path', type=str, default="/home/user/Downloads/model.onnx", required=False, help='Path to the ONNX model file (.onnx)')
args = parser.parse_args()

optimize_onnx(args.model_path)
optimize_onnx(args.model_path)

# random chatgpt:
import onnx
import onnx_graphsurgeon as gs
import numpy as np

# Load the ONNX model
onnx_model_path = "yolo_backbone.onnx"
model = onnx.load(onnx_model_path)

# Parse the model graph into a GraphSurgeon graph
graph = gs.import_onnx(model)

# Display the graph nodes (optional, useful for inspection)
print("Graph nodes before optimization:")
for node in graph.nodes:
    print(node)

# Drop nodes and tensors that do not contribute to the graph outputs.
graph.cleanup()

# Example: fold Add nodes whose two inputs are both constants.
# The sum is computed once here and stored in the Add's output tensor, so the
# node itself is no longer needed at inference time.
for node in graph.nodes:
    if node.op != "Add" or len(node.inputs) != 2 or not node.outputs:
        continue
    if not all(isinstance(inp, gs.Constant) for inp in node.inputs):
        continue

    folded_tensor = node.outputs[0]
    # Leave graph outputs alone: they must stay variables produced by a node.
    if any(folded_tensor is out for out in graph.outputs):
        continue

    folded_value = node.inputs[0].values + node.inputs[1].values
    # Convert the output tensor to a constant in place so every consumer sees
    # the folded value, then detach the Add so cleanup() removes it. The
    # model's real outputs (graph.outputs) are deliberately left untouched.
    folded_tensor.to_constant(folded_value)
    node.outputs.clear()

# Example: Fuse certain nodes, if applicable
# Fusing common patterns (batch normalization, activation layers) is
# model-dependent, so no specific fusion is performed here.

# Cleanup the graph to remove any orphaned nodes after the transformations
graph.cleanup()
graph.toposort()

# Export the optimized ONNX model
optimized_onnx_path = "yolo_backbone_optimized.onnx"
onnx.save(gs.export_onnx(graph), optimized_onnx_path)

print(f"Optimized model saved at {optimized_onnx_path}")
3 changes: 3 additions & 0 deletions conversion_tools/ONNX_TRT.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ def convert_onnx_to_trt(model_path="/home/user/Downloads/model.onnx", output_pat
config.set_flag(trt.BuilderFlag.FP16)
# elif INT8:
# config.set_flag(trt.BuilderFlag.INT8)
# Enable FP16 optimization if the device supports it
# if builder.platform_has_fast_fp16:
# builder.fp16_mode = True

if strip_weights:
config.set_flag(trt.BuilderFlag.STRIP_PLAN)
Expand Down
58 changes: 58 additions & 0 deletions python_wip/bbox_display.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import cv2
import os

def draw_bounding_boxes(image_path, bboxes):
    """Show the image at ``image_path`` with every box in ``bboxes`` outlined.

    Args:
        image_path: Path handed to ``cv2.imread``.
        bboxes: Iterable of 5-tuples
            ``(class_id, x_center, y_center, bbox_width, bbox_height)``.
            The four geometric values are scaled by the image width/height,
            i.e. they are treated as fractions of the image size.

    Returns:
        None. If the image cannot be read, an error is printed and nothing is
        displayed.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error loading image: {image_path}")
        return

    # Image size is printed as a quick sanity check.
    img_h, img_w, _ = image.shape
    print(img_h)
    print(img_w)

    green = (0, 255, 0)
    for _class_id, cx, cy, box_w, box_h in bboxes:
        # Scale the fractional centre/size to whole pixels first ...
        cx_px = int(cx * img_w)
        cy_px = int(cy * img_h)
        half_w = int(box_w * img_w) / 2
        half_h = int(box_h * img_h) / 2

        # ... then derive the two opposite corners of the rectangle.
        top_left = (int(cx_px - half_w), int(cy_px - half_h))
        bottom_right = (int(cx_px + half_w), int(cy_px + half_h))

        cv2.rectangle(image, top_left, bottom_right, green, 2)

    # The window closes on a key press or after the 10 s waitKey timeout.
    cv2.imshow('Bounding Boxes', image)
    cv2.waitKey(10000)
    cv2.destroyAllWindows()

def read_bounding_boxes(txt_file):
    """Parse a whitespace-separated label file into bounding-box tuples.

    Each non-blank line must start with an integer class id followed by four
    floats: x centre, y centre, width and height. Any further columns are
    ignored. Blank or whitespace-only lines are skipped.

    Args:
        txt_file: Path of the label file to read.

    Returns:
        A list of ``(class_id, x_center, y_center, bbox_width, bbox_height)``
        tuples in file order.

    Raises:
        IndexError: If a non-blank line has fewer than five columns.
        ValueError: If a column cannot be converted to ``int``/``float``.
    """
    bboxes = []
    with open(txt_file, 'r') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line in file:
            values = line.split()
            if not values:
                # A blank line (e.g. trailing newline padding) is not a box.
                continue
            class_id = int(values[0])
            x_center = float(values[1])
            y_center = float(values[2])
            bbox_width = float(values[3])
            bbox_height = float(values[4])
            bboxes.append((class_id, x_center, y_center, bbox_width, bbox_height))
    return bboxes

# Demo driver: runs at import time.
# NOTE(review): the working directory is a machine-specific absolute path;
# the relative file names below are resolved against it, so the chdir must
# stay first.
os.chdir("C:/Users/ishaa/Coding Projects/Applied-AI/ROS/assets/maize")
print(os.getcwd())
# Load the label file for one sample image and echo the parsed boxes.
boxes = read_bounding_boxes("IMG_2884_18.txt")
print(boxes)
# Overlay the boxes on the matching image and display it.
draw_bounding_boxes("IMG_2884_18.JPG", boxes)
58 changes: 58 additions & 0 deletions python_wip/inference_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import rclpy
from rclpy.node import Node
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import cv2.cuda as cv2_cuda

class InferenceNode(Node):
    """Proof-of-concept ROS 2 node that shares GPU buffers through CUDA IPC.

    The node allocates device buffers for a model input/output pair and an
    example image, exports IPC handles for the output tensor and the image,
    and publishes those handles on ``inference_done`` after each inference.

    NOTE(review): ``MemoryHandle`` is not imported in this file and nothing
    here builds the TensorRT engine; both must be supplied before the node
    can run.
    """

    def __init__(self):
        super().__init__('inference_node')

        # Initialize CUDA context
        self.cuda_driver_context = cuda.Device(0).make_context()
        self.stream = cuda.Stream()

        # TensorRT execution context. It has to be assigned by whoever loads
        # the engine; run_inference() refuses to run while it is None.
        self.exec_context = None

        # Host-side input/output buffers and their device counterparts
        self.h_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
        self.h_output = np.empty((1, 1000), dtype=np.float32)

        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
        self.d_output = cuda.mem_alloc(self.h_output.nbytes)

        # Example image. randint is used because rand() yields values in
        # [0, 1), which all truncate to 0 when cast to uint8.
        self.cv_image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
        # The OpenCV Python binding names the class cv2.cuda_GpuMat.
        self.cv_cuda_image = cv2.cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)

        # Upload image to GPU (device memory)
        self.cv_cuda_image.upload(self.cv_image)

        # Create CUDA IPC handle for output tensor and image
        # NOTE(review): the GpuMat rows may be padded (step > cols * 3); the
        # receiver needs the step to interpret the image buffer - confirm.
        self.output_ipc_handle = cuda.mem_get_ipc_handle(self.d_output)
        self.image_ipc_handle = cuda.mem_get_ipc_handle(self.cv_cuda_image.cudaPtr())

        # Publish the IPC handle to postprocessing node
        self.publisher_ = self.create_publisher(MemoryHandle, 'inference_done', 10)

    def run_inference(self):
        """Run one inference pass and publish the IPC handles of the results.

        Raises:
            RuntimeError: If ``self.exec_context`` has not been assigned.
        """
        # Local import: this module does not import ``time`` at file level.
        import time

        if self.exec_context is None:
            raise RuntimeError("exec_context is not set; load a TensorRT engine before calling run_inference()")

        tic = time.perf_counter_ns()
        self.cuda_driver_context.push()
        try:
            # Transfer data to device asynchronously
            cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream)

            # Execute inference asynchronously
            self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle)
            self.stream.synchronize()
        finally:
            # Always restore the context stack, even if inference raised.
            self.cuda_driver_context.pop()
        toc = time.perf_counter_ns()

        self.get_logger().info(f"Inference done in: {(toc-tic)/1e6} ms")

        # Publish the IPC handles to postprocessing node
        # NOTE(review): str() of the handle is its Python repr; the subscriber
        # has to parse it back into bytes before opening the handle.
        msg = MemoryHandle()
        msg.tensor_ipc_handle = str(self.output_ipc_handle)
        msg.image_ipc_handle = str(self.image_ipc_handle)
        self.publisher_.publish(msg)
9 changes: 9 additions & 0 deletions python_wip/jax_ex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import jax
import jax.numpy as jnp

# Minimal JAX smoke test: add two length-3 vectors element-wise and print
# the result.
a = jnp.array([1.0, 2.0, 3.0])
b = jnp.array([4.0, 5.0, 6.0])

c = jnp.add(a, b)

print(c)
29 changes: 29 additions & 0 deletions python_wip/numba_ex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from numba import cuda
import numpy as np

@cuda.jit
def add_kernel(a, b, c):
    """CUDA kernel: store ``a[i] + b[i]`` in ``c[i]`` for this thread's index."""
    i = cuda.grid(1)
    # Threads whose index falls past the end of ``a`` have nothing to do.
    if i >= a.size:
        return
    c[i] = a[i] + b[i]

# Host-side inputs and the buffer that receives the result
N = 1000
a = np.arange(N, dtype=np.float32)
b = np.arange(N, dtype=np.float32)
c = np.zeros_like(a)

# Device-side copies of the inputs plus an uninitialised output array
a_gpu, b_gpu = cuda.to_device(a), cuda.to_device(b)
c_gpu = cuda.device_array_like(a)

# Launch configuration: enough 128-thread blocks to cover every element
# (ceiling division of the element count by the block size).
threads_per_block = 128
blocks_per_grid = (a.size + threads_per_block - 1) // threads_per_block
add_kernel[blocks_per_grid, threads_per_block](a_gpu, b_gpu, c_gpu)

# Bring the result back into the pre-allocated host array and show it
c_gpu.copy_to_host(c)

print(c)
31 changes: 31 additions & 0 deletions python_wip/numba_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from numba import cuda
import numpy as np

# Define a CUDA kernel function
@cuda.jit
def matrix_addition_kernel(a, b, result):
    """CUDA kernel: store ``a[r, c] + b[r, c]`` in ``result[r, c]``."""
    row, col = cuda.grid(2)
    # Threads outside the bounds of ``result`` have no element to write.
    if row >= result.shape[0] or col >= result.shape[1]:
        return
    result[row, col] = a[row, col] + b[row, col]

# Initialize NumPy arrays
a = np.random.rand(32, 32).astype(np.float32)
b = np.random.rand(32, 32).astype(np.float32)
result = np.zeros_like(a)

# Allocate arrays on the GPU
a_device = cuda.to_device(a)
b_device = cuda.to_device(b)
# The kernel writes every in-bounds element of the output, so allocate it
# on the device directly instead of uploading a host array of zeros.
result_device = cuda.device_array_like(result)

# Define the grid size for the kernel execution.
# Ceiling division: "dim // threads + 1" would launch one extra, fully idle
# row and column of blocks whenever the size divides evenly (as 32 / 16 does).
threads_per_block = (16, 16)
blocks_per_grid = (
    (a.shape[0] + threads_per_block[0] - 1) // threads_per_block[0],
    (a.shape[1] + threads_per_block[1] - 1) // threads_per_block[1],
)

# Launch the CUDA kernel
matrix_addition_kernel[blocks_per_grid, threads_per_block](a_device, b_device, result_device)

# Copy the result back to the host (CPU)
result = result_device.copy_to_host()

print(result)
18 changes: 18 additions & 0 deletions python_wip/pointer_to_pt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import torch
import pycuda.driver as cuda

# Example CUDA buffer size (you should match this with your actual output size)
output_size = (1, 1000) # Example: TensorRT output of shape [1, 1000]

# Convert CUDA memory pointer (from IPC) to a PyTorch tensor
def cuda_pointer_to_torch_tensor(cuda_ptr, shape, dtype=torch.float32):
    """Wrap an existing CUDA device allocation as a PyTorch tensor.

    ``torch.from_blob`` only exists in the C++ API, so the pointer is exposed
    to PyTorch through the CUDA Array Interface, which ``torch.as_tensor``
    can consume.

    Args:
        cuda_ptr: Device pointer, given as an ``int`` or any object that
            ``int()`` converts to the device address.
        shape: Sequence of non-negative dimensions of the buffer.
        dtype: Element type of the buffer; must have a NumPy equivalent.

    Returns:
        A CUDA tensor of the requested shape and dtype describing the memory
        at ``cuda_ptr``.

    Raises:
        ValueError: If the pointer is null or a dimension is negative.

    NOTE(review): the tensor does not own the allocation; the caller must
    keep the underlying memory (and ``cuda_ptr``) alive while it is in use.
    """
    address = int(cuda_ptr)
    if address == 0:
        raise ValueError("cuda_ptr must be a non-null device pointer")

    dims = tuple(int(dim) for dim in shape)
    if any(dim < 0 for dim in dims):
        raise ValueError(f"shape must not contain negative dimensions: {dims}")

    # NumPy-style type string (e.g. "<f4") derived from the torch dtype.
    typestr = torch.empty((), dtype=dtype).numpy().dtype.str

    # Minimal object implementing the CUDA Array Interface for this buffer.
    interface = {
        "shape": dims,
        "typestr": typestr,
        "data": (address, False),  # (device address, read-only flag)
        "version": 2,
    }
    holder = type("_CudaBuffer", (), {"__cuda_array_interface__": interface})()

    return torch.as_tensor(holder, device='cuda')

# In your post-processing node, after receiving the CUDA IPC handle.
# NOTE(review): ``ipc_handle_bytes`` is not defined in this snippet; it must
# be the raw handle bytes received from the producing process, and a CUDA
# context must be current before the handle is opened.
# PyCUDA imports a shared allocation with IPCMemoryHandle; the resulting
# object stands in for the device pointer and must stay alive while the
# memory is in use.
ipc_handle = cuda.IPCMemoryHandle(ipc_handle_bytes)
d_output = ipc_handle

# Convert the CUDA device pointer to a PyTorch tensor
output_tensor = cuda_pointer_to_torch_tensor(d_output, output_size)
51 changes: 51 additions & 0 deletions python_wip/post_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import rclpy
from rclpy.node import Node
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
import cv2.cuda as cv2_cuda

class PostprocessingNode(Node):
    """Proof-of-concept ROS 2 node that consumes GPU buffers shared via CUDA IPC.

    It subscribes to ``inference_done``, opens the tensor and image IPC
    handles carried by each message, blurs the image on the GPU and logs the
    output tensor.

    NOTE(review): ``MemoryHandle`` is not imported in this file; it must be
    provided by the project's message package.
    """

    def __init__(self):
        # Local import: this module does not import numpy at file level.
        import numpy as np

        super().__init__('postprocessing_node')

        # Create CUDA context
        self.cuda_driver_context = cuda.Device(0).make_context()

        # Host buffers that receive the shared data.
        # NOTE(review): assumes a (1, 1000) float32 tensor and a 480x640
        # 3-channel uint8 image - confirm against the publishing node.
        self.h_output = np.empty((1, 1000), dtype=np.float32)
        self.cv_image = np.empty((480, 640, 3), dtype=np.uint8)

        # Subscribe to inference_done topic to get IPC handles
        self.subscription = self.create_subscription(
            MemoryHandle,
            'inference_done',
            self.postprocess_callback,
            10
        )

    def postprocess_callback(self, msg):
        """Open the shared buffers named in ``msg`` and post-process them.

        Args:
            msg: Message with ``tensor_ipc_handle`` and ``image_ipc_handle``
                string fields.
        """
        import ast

        # NOTE(review): assumes each field holds the Python repr of the
        # handle bytes (i.e. the sender used str(handle)) - confirm.
        # literal_eval only parses literals, so no code is executed.
        tensor_handle_bytes = ast.literal_eval(msg.tensor_ipc_handle)
        image_handle_bytes = ast.literal_eval(msg.image_ipc_handle)

        self.cuda_driver_context.push()
        d_output = None
        d_image = None
        try:
            # Open IPC memory handles for tensor and image
            d_output = cuda.IPCMemoryHandle(tensor_handle_bytes)
            d_image = cuda.IPCMemoryHandle(image_handle_bytes)

            # GpuMat.upload() takes a host array, so stage the shared image
            # through host memory before wrapping it in a GpuMat.
            # NOTE(review): assumes the shared image is tightly packed; a
            # row-padded source buffer would need its step transmitted too.
            cuda.memcpy_dtoh(self.cv_image, int(d_image))
            cv_cuda_image = cv2.cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)
            cv_cuda_image.upload(self.cv_image)

            # Perform OpenCV CUDA operations on the image (e.g., GaussianBlur).
            # CUDA filters are objects created once and applied to a GpuMat.
            gaussian = cv2.cuda.createGaussianFilter(cv2.CV_8UC3, cv2.CV_8UC3, (5, 5), 0)
            blurred_image = gaussian.apply(cv_cuda_image)  # result unused in this POC

            # Retrieve inference result; memcpy_dtoh is synchronous, so no
            # stream synchronisation is needed afterwards.
            cuda.memcpy_dtoh(self.h_output, int(d_output))
        finally:
            # Clean up IPC memory handles, then restore the context stack.
            for handle in (d_output, d_image):
                if handle is not None:
                    handle.close()
            self.cuda_driver_context.pop()

        output = self.h_output.copy()
        self.get_logger().info(f"Postprocessed tensor: {output}")
35 changes: 35 additions & 0 deletions python_wip/pytorch_ex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import torch
import torchvision.transforms as T

def preprocess_image_pytorch(self, image):
    """Crop, resize and normalise ``image`` on the GPU and publish its IPC handle.

    Args:
        self: Node-like object providing ``roi_dimensions``, ``velocity``,
            ``shift_constant``, ``dimensions``, ``get_logger()`` and
            ``pointer_publisher``.
        image: NumPy image indexed as ``[row, column, channel]``; only the
            first three channels are used.

    NOTE(review): ``cuda`` (PyCUDA driver) and ``String`` (ROS message) are
    not imported in this file and must be in scope for this to run.
    """
    # Local import: this module does not import ``time`` at file level.
    import time

    tic = time.perf_counter_ns()

    roi_x, roi_y, roi_w, roi_h = self.roi_dimensions
    # Slice bounds must be integers even if the velocity is a float.
    shifted_x = int(roi_x + abs(self.velocity[0]) * self.shift_constant)

    # Convert image to PyTorch tensor and move to GPU
    image_tensor = torch.from_numpy(image).cuda()

    # Crop the region of interest and drop any alpha channel (still HWC).
    cropped = image_tensor[roi_y:(roi_y + roi_h), shifted_x:(shifted_x + roi_w), :3]

    # T.ToTensor() only accepts PIL images / ndarrays, so do its work by hand:
    # HWC -> CHW, float, and scale 8-bit data to [0, 1].
    chw = cropped.permute(2, 0, 1).float()
    if cropped.dtype == torch.uint8:
        chw = chw / 255.0

    # Remaining preprocessing operates on CHW float tensors.
    transform = T.Compose([
        T.Resize(self.dimensions),  # Resize to model input size
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize
    ])

    # Add the batch dimension and make the memory contiguous for export.
    input_data = transform(chw).unsqueeze(0).contiguous()

    # Keep the tensor alive: the exported pointer would dangle as soon as a
    # local tensor was garbage-collected after this function returns.
    self._ipc_input_tensor = input_data

    d_input_ptr = input_data.data_ptr()  # Get device pointer of the tensor

    # NOTE(review): PyTorch sub-allocates from cached blocks, so this pointer
    # may not be the base of a CUDA allocation as IPC export expects - confirm.
    ipc_handle = cuda.mem_get_ipc_handle(d_input_ptr)

    toc = time.perf_counter_ns()
    self.get_logger().info(f"Preprocessing: {(toc-tic)/1e6} ms")

    # Publish the IPC handle. The handle is an opaque bytes object with no
    # ``.handle`` attribute, so its repr is sent directly.
    ipc_handle_msg = String()
    ipc_handle_msg.data = str(ipc_handle)
    self.pointer_publisher.publish(ipc_handle_msg)
Loading

0 comments on commit 134df86

Please sign in to comment.