calibration tools repaired, future POC examples added, pointer testing

UBCAgroBot · Sep 24, 2024 · 134df86 · 134df86
1 parent aabd398
commit 134df86
Show file tree

Hide file tree

Showing 20 changed files with 964 additions and 532 deletions.
diff --git a/calibration_tools/roi_calibration.py b/calibration_tools/roi_calibration.py
@@ -17,6 +17,9 @@
 velocity = 1
 shift_constant = 1
 
+def print_camera_information(cam):
+    """Print the serial number reported by ``cam.get_camera_information()``."""
+    serial = cam.get_camera_information().serial_number
+    print("Serial number: {0}.\n".format(serial))
+
 def initialize_image_source(source_type="static_image", image_path='C:/Users/ishaa/Coding Projects/Applied-AI/ROS/assets/maize'):
     if source_type == "static_image":
         if not os.path.exists(image_path):

diff --git a/conversion_tools/ONNX_GS.py b/conversion_tools/ONNX_GS.py
@@ -25,4 +25,48 @@ def optimize_onnx(model_path="/home/user/Downloads/model.onnx"):
     parser.add_argument('--model_path', type=str, default="/home/user/Downloads/model.onnx", required=False, help='Path to the ONNX model file (.onnx)')
     args = parser.parse_args()
 
-    optimize_onnx(args.model_path)
+    optimize_onnx(args.model_path)
+
+# random chatgpt:
+import onnx
+import onnx_graphsurgeon as gs
+import numpy as np
+
+# Load the ONNX model
+onnx_model_path = "yolo_backbone.onnx"
+model = onnx.load(onnx_model_path)
+
+# Parse the model graph into a GraphSurgeon graph
+graph = gs.import_onnx(model)
+
+# Display the graph nodes (optional, useful for inspection)
+print("Graph nodes before optimization:")
+for node in graph.nodes:
+    print(node)
+
+# Drop nodes and tensors that do not contribute to the graph outputs, then
+# sort topologically, since constant folding expects a sorted graph.
+graph.cleanup().toposort()
+
+# Fold constant sub-expressions in place.
+# The earlier hand-written loop only inspected "Add" nodes and, on a match,
+# overwrote graph.outputs with the folded constant, which discards the model's
+# real outputs instead of rewiring the Add node's consumers.
+# NOTE(review): fold_constants() evaluates constant sub-graphs with ONNX
+# Runtime; confirm onnxruntime is installed in this environment.
+graph.fold_constants()
+
+# Example: Fuse certain nodes, if applicable
+# In this case, you can fuse common patterns (like batch normalization, activation layers) if it's supported.
+# This is model-dependent, so it's an optional step. For simplicity, we omit specific fusion here.
+
+# Remove the nodes orphaned by folding and restore topological order
+graph.cleanup().toposort()
+
+# Export the optimized ONNX model
+optimized_onnx_path = "yolo_backbone_optimized.onnx"
+onnx.save(gs.export_onnx(graph), optimized_onnx_path)
+
+print(f"Optimized model saved at {optimized_onnx_path}")
diff --git a/conversion_tools/ONNX_TRT.py b/conversion_tools/ONNX_TRT.py
@@ -49,6 +49,9 @@ def convert_onnx_to_trt(model_path="/home/user/Downloads/model.onnx", output_pat
         config.set_flag(trt.BuilderFlag.FP16)
     # elif INT8:
     #     config.set_flag(trt.BuilderFlag.INT8)
+    # Enable FP16 optimization if the device supports it
+    # if builder.platform_has_fast_fp16:
+    #     builder.fp16_mode = True
 
     if strip_weights:
         config.set_flag(trt.BuilderFlag.STRIP_PLAN)

diff --git a/python_wip/bbox_display.py b/python_wip/bbox_display.py
@@ -0,0 +1,58 @@
+import cv2
+import os
+
+def draw_bounding_boxes(image_path, bboxes):
+    """Draw normalised centre-format boxes on an image and display the result once.
+
+    Args:
+        image_path: Path of the image file, read with ``cv2.imread``.
+        bboxes: Iterable of ``(class_id, x_center, y_center, width, height)``
+            tuples. The four geometry values are fractions of the image
+            width/height and are scaled to pixels here.
+
+    Returns:
+        None. Prints an error message and returns early when the image
+        cannot be loaded.
+    """
+    img = cv2.imread(image_path)
+
+    if img is None:
+        print(f"Error loading image: {image_path}")
+        return
+
+    # Image dimensions in pixels (printed for debugging)
+    height, width = img.shape[:2]
+    print(height)
+    print(width)
+
+    for _class_id, x_center, y_center, bbox_width, bbox_height in bboxes:
+        # Convert normalized values to absolute pixel values
+        x_center_pixel = int(x_center * width)
+        y_center_pixel = int(y_center * height)
+        bbox_width_pixel = int(bbox_width * width)
+        bbox_height_pixel = int(bbox_height * height)
+
+        # Corners of the box derived from its centre and size
+        top_left_x = int(x_center_pixel - bbox_width_pixel / 2)
+        top_left_y = int(y_center_pixel - bbox_height_pixel / 2)
+        bottom_right_x = int(x_center_pixel + bbox_width_pixel / 2)
+        bottom_right_y = int(y_center_pixel + bbox_height_pixel / 2)
+
+        # Green rectangle, 2 px thick
+        cv2.rectangle(img, (top_left_x, top_left_y), (bottom_right_x, bottom_right_y), (0, 255, 0), 2)
+
+    # Show the finished image once, after every box has been drawn. Doing this
+    # inside the loop opened a 10 s window per box and showed nothing at all
+    # for an empty box list. The window closes on a key press or after 10 s.
+    cv2.imshow('Bounding Boxes', img)
+    cv2.waitKey(10000)
+    cv2.destroyAllWindows()
+
+def read_bounding_boxes(txt_file):
+    """Parse a label file with one ``class x_center y_center width height`` row per line.
+
+    Args:
+        txt_file: Path of the whitespace-separated label file.
+
+    Returns:
+        List of ``(class_id, x_center, y_center, bbox_width, bbox_height)``
+        tuples, with ``class_id`` as ``int`` and the rest as ``float``.
+
+    Raises:
+        IndexError: If a non-blank line has fewer than five fields.
+        ValueError: If a field cannot be converted to a number.
+    """
+    bboxes = []
+    with open(txt_file, 'r') as file:
+        for line in file:
+            values = line.split()
+            # Blank or whitespace-only lines (e.g. a trailing newline) carry
+            # no box; skip them instead of failing on values[0].
+            if not values:
+                continue
+            class_id = int(values[0])
+            x_center = float(values[1])
+            y_center = float(values[2])
+            bbox_width = float(values[3])
+            bbox_height = float(values[4])
+            bboxes.append((class_id, x_center, y_center, bbox_width, bbox_height))
+    return bboxes
+
+# Demo run, executed whenever this module is run or imported: switch to the
+# maize asset folder so the relative file names below resolve, then load one
+# label file and display its boxes on the matching image.
+# NOTE(review): the directory is a hard-coded path on one developer's machine.
+os.chdir("C:/Users/ishaa/Coding Projects/Applied-AI/ROS/assets/maize")
+print(os.getcwd())
+boxes = read_bounding_boxes("IMG_2884_18.txt")
+print(boxes)
+draw_bounding_boxes("IMG_2884_18.JPG", boxes)
diff --git a/python_wip/inference_stream.py b/python_wip/inference_stream.py
@@ -0,0 +1,58 @@
+import rclpy
+from rclpy.node import Node
+import pycuda.driver as cuda
+import pycuda.autoinit
+import numpy as np
+import cv2
+import cv2.cuda as cv2_cuda
+
+class InferenceNode(Node):
+    """ROS 2 node sketch that runs GPU inference and publishes CUDA IPC handles.
+
+    The constructor allocates host/device buffers, uploads a placeholder image
+    to a ``GpuMat`` and creates IPC handles for the output tensor and the
+    image. ``run_inference`` executes one inference pass and publishes the
+    handles on the ``inference_done`` topic.
+
+    NOTE(review): ``MemoryHandle`` is not imported in this file and
+    ``self.exec_context`` is never assigned here; both must be provided before
+    this node can run.
+    """
+
+    def __init__(self):
+        super().__init__('inference_node')
+
+        # Initialize CUDA context
+        self.cuda_driver_context = cuda.Device(0).make_context()
+        self.stream = cuda.Stream()
+
+        # Host-side buffers: random input and an uninitialised output
+        self.h_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
+        self.h_output = np.empty((1, 1000), dtype=np.float32)
+
+        # Matching device allocations
+        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
+        self.d_output = cuda.mem_alloc(self.h_output.nbytes)
+
+        # Example image. randint is used because casting rand()'s [0, 1) floats
+        # to uint8 produces an all-zero image.
+        self.cv_image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
+        # cv2.cuda_GpuMat is the OpenCV Python binding for GpuMat; the bare name
+        # cv2_cuda_GpuMat used before is not defined anywhere in this file.
+        self.cv_cuda_image = cv2.cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)
+
+        # Upload image to GPU (device memory)
+        self.cv_cuda_image.upload(self.cv_image)
+
+        # Create CUDA IPC handle for output tensor and image
+        self.output_ipc_handle = cuda.mem_get_ipc_handle(self.d_output)
+        self.image_ipc_handle = cuda.mem_get_ipc_handle(self.cv_cuda_image.cudaPtr())
+
+        # Publish the IPC handle to postprocessing node
+        self.publisher_ = self.create_publisher(MemoryHandle, 'inference_done', 10)
+
+    def run_inference(self):
+        """Run one asynchronous inference pass, log its duration and publish the IPC handles."""
+        # Imported locally because this file has no module-level `import time`.
+        import time
+
+        tic = time.perf_counter_ns()
+        self.cuda_driver_context.push()
+
+        # Transfer data to device asynchronously
+        cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream)
+
+        # Execute inference asynchronously, then wait for the stream to drain
+        self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle)
+        self.stream.synchronize()
+
+        self.cuda_driver_context.pop()
+        toc = time.perf_counter_ns()
+
+        self.get_logger().info(f"Inference done in: {(toc-tic)/1e6} ms")
+
+        # Publish the IPC handles to postprocessing node
+        # NOTE(review): str() of a handle object is its printable representation;
+        # confirm the subscriber can reconstruct the raw handle from it.
+        msg = MemoryHandle()
+        msg.tensor_ipc_handle = str(self.output_ipc_handle)
+        msg.image_ipc_handle = str(self.image_ipc_handle)
+        self.publisher_.publish(msg)
diff --git a/python_wip/jax_ex.py b/python_wip/jax_ex.py
@@ -0,0 +1,9 @@
+import jax
+import jax.numpy as jnp
+
+# Two small example vectors
+a = jnp.asarray([1.0, 2.0, 3.0])
+b = jnp.asarray([4.0, 5.0, 6.0])
+
+# Element-wise sum of the two vectors
+c = jnp.add(a, b)
+
+print(c)
diff --git a/python_wip/numba_ex.py b/python_wip/numba_ex.py
@@ -0,0 +1,29 @@
+from numba import cuda
+import numpy as np
+
+@cuda.jit
+def add_kernel(a, b, c):
+    """Write ``a[k] + b[k]`` into ``c[k]`` for this thread's 1-D grid index ``k``."""
+    tid = cuda.grid(1)
+    # Threads whose index falls past the end of the input do nothing.
+    if tid >= a.size:
+        return
+    c[tid] = a[tid] + b[tid]
+
+# Host-side inputs and the buffer that will receive the result
+N = 1000
+a = np.arange(N, dtype=np.float32)
+b = np.arange(N, dtype=np.float32)
+c = np.zeros_like(a)
+
+# Device-side copies of the inputs plus an output array shaped like `a`
+a_gpu, b_gpu = cuda.to_device(a), cuda.to_device(b)
+c_gpu = cuda.device_array_like(a)
+
+# Launch with enough blocks to cover every element (ceiling division)
+threads_per_block = 128
+blocks_per_grid = -(-a.size // threads_per_block)
+add_kernel[blocks_per_grid, threads_per_block](a_gpu, b_gpu, c_gpu)
+
+# Copy the device result into the preallocated host array
+c_gpu.copy_to_host(c)
+
+print(c)
diff --git a/python_wip/numba_example.py b/python_wip/numba_example.py
@@ -0,0 +1,31 @@
+from numba import cuda
+import numpy as np
+
+# CUDA kernel: element-wise sum of two 2-D arrays
+@cuda.jit
+def matrix_addition_kernel(a, b, result):
+    """Store ``a[r, c] + b[r, c]`` in ``result[r, c]`` for this thread's 2-D grid index."""
+    row, col = cuda.grid(2)
+    # Threads outside the bounds of `result` do nothing.
+    if row >= result.shape[0] or col >= result.shape[1]:
+        return
+    result[row, col] = a[row, col] + b[row, col]
+
+# Initialize NumPy arrays
+a = np.random.rand(32, 32).astype(np.float32)
+b = np.random.rand(32, 32).astype(np.float32)
+result = np.zeros_like(a)
+
+# Allocate arrays on the GPU. The output is only written by the kernel, so it
+# is allocated on the device directly instead of uploading a host array of zeros.
+a_device = cuda.to_device(a)
+b_device = cuda.to_device(b)
+result_device = cuda.device_array_like(result)
+
+# Define the grid size for the kernel execution.
+# Ceiling division per axis: `size // tpb + 1` launched one surplus block per
+# axis whenever the size was an exact multiple of the block size (32 / 16 here).
+threads_per_block = (16, 16)
+blocks_per_grid = tuple(
+    (size + tpb - 1) // tpb for size, tpb in zip(a.shape, threads_per_block)
+)
+
+# Launch the CUDA kernel
+matrix_addition_kernel[blocks_per_grid, threads_per_block](a_device, b_device, result_device)
+
+# Copy the result back to the host (CPU)
+result = result_device.copy_to_host()
+
+print(result)
diff --git a/python_wip/pointer_to_pt.py b/python_wip/pointer_to_pt.py
@@ -0,0 +1,18 @@
+import torch
+import pycuda.driver as cuda
+
+# Example CUDA buffer size (you should match this with your actual output size)
+output_size = (1, 1000)  # Example: TensorRT output of shape [1, 1000]
+
+# Convert CUDA memory pointer (from IPC) to a PyTorch tensor
+def cuda_pointer_to_torch_tensor(cuda_ptr, shape, dtype=torch.float32):
+    """Wrap an existing CUDA device pointer as a PyTorch tensor.
+
+    ``torch.from_blob`` is only available in the C++ API, so the pointer is
+    described through the CUDA Array Interface and handed to
+    ``torch.as_tensor``.
+
+    Args:
+        cuda_ptr: Device pointer; anything ``int()`` can convert to the raw
+            address.
+        shape: Shape of the buffer in elements.
+        dtype: Element type of the buffer. Must be a dtype NumPy can
+            represent, since its type string is derived via NumPy.
+
+    Returns:
+        A CUDA tensor of the given shape and dtype.
+
+    NOTE(review): the tensor is expected to alias the memory rather than copy
+    it, and does not own the allocation; the caller must keep the buffer alive
+    while the tensor is in use. Confirm on the target torch version.
+    """
+    # Array-interface type string for the dtype, e.g. '<f4' for float32
+    typestr = torch.empty(0, dtype=dtype).numpy().dtype.str
+
+    class _DeviceBuffer:
+        # Minimal CUDA Array Interface description of the raw buffer;
+        # the second element of "data" is the read-only flag.
+        __cuda_array_interface__ = {
+            "shape": tuple(shape),
+            "typestr": typestr,
+            "data": (int(cuda_ptr), False),
+            "version": 2,
+        }
+
+    tensor = torch.as_tensor(_DeviceBuffer(), device='cuda')
+    return tensor
+
+# In your post-processing node, after receiving the CUDA IPC handle.
+# `ipc_handle_bytes` is a placeholder for the handle bytes received from the
+# producer process; it is not defined in this file.
+# PyCUDA opens an exported handle by constructing IPCMemoryHandle; the resulting
+# object is then used like a device allocation (there is no IPCHandle.open()).
+# NOTE(review): a CUDA context must be current when the handle is opened, and
+# ipc_handle.close() should be called once the memory is no longer needed.
+ipc_handle = cuda.IPCMemoryHandle(ipc_handle_bytes)
+d_output = ipc_handle
+
+# Convert the CUDA device pointer to a PyTorch tensor
+output_tensor = cuda_pointer_to_torch_tensor(d_output, output_size)
diff --git a/python_wip/post_stream.py b/python_wip/post_stream.py
@@ -0,0 +1,51 @@
+import rclpy
+from rclpy.node import Node
+import pycuda.driver as cuda
+import pycuda.autoinit
+import cv2
+import cv2.cuda as cv2_cuda
+
+class PostprocessingNode(Node):
+    """ROS 2 node sketch that subscribes to 'inference_done' and reads the shared GPU buffers.
+
+    NOTE(review): this class uses several names that are not defined in this
+    file: ``MemoryHandle``, ``np``, ``cv2_cuda_GpuMat``, ``cv2_cuda_image``,
+    and the attributes ``self.h_output``, ``self.cv_image`` and
+    ``self.stream`` (never assigned in ``__init__``). They must be supplied
+    before the callback can run.
+    """
+    def __init__(self):
+        super().__init__('postprocessing_node')
+
+        # Create CUDA context
+        self.cuda_driver_context = cuda.Device(0).make_context()
+
+        # Subscribe to inference_done topic to get IPC handles
+        self.subscription = self.create_subscription(
+            MemoryHandle,
+            'inference_done',
+            self.postprocess_callback,
+            10
+        )
+
+    def postprocess_callback(self, msg):
+        """Open the IPC handles carried by ``msg``, blur the image and log the output tensor."""
+        # Get the IPC handles for tensor and image
+        tensor_ipc_handle_str = msg.tensor_ipc_handle
+        image_ipc_handle_str = msg.image_ipc_handle
+
+        # Open IPC memory handles for tensor and image
+        # NOTE(review): PyCUDA documents IPCMemoryHandle(handle) with a .close()
+        # method; confirm ipc_open_mem_handle / ipc_close_mem_handle exist in
+        # the installed version, and that the string fields hold the raw handle.
+        tensor_ipc_handle = cuda.IPCMemoryHandle(tensor_ipc_handle_str)
+        image_ipc_handle = cuda.IPCMemoryHandle(image_ipc_handle_str)
+
+        d_output = cuda.ipc_open_mem_handle(tensor_ipc_handle, self.h_output.nbytes)
+        d_image = cuda.ipc_open_mem_handle(image_ipc_handle, self.cv_image.nbytes)
+
+        # Wrap the image GPU pointer into a GpuMat object for OpenCV CUDA operations
+        # NOTE(review): `cv2_cuda_GpuMat` is undefined here (the import alias is
+        # `cv2_cuda`), and upload() is given a device handle; verify the intent.
+        cv_cuda_image = cv2_cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)
+        cv_cuda_image.upload(d_image)
+
+        # Perform OpenCV CUDA operations on the image (e.g., GaussianBlur)
+        # NOTE(review): `cv2_cuda_image` is not defined; presumably
+        # `cv_cuda_image` was meant. The result is not used afterwards.
+        blurred_image = cv2_cuda_image.gaussianBlur((5, 5), 0)
+
+        # Retrieve inference result and postprocess
+        cuda.memcpy_dtoh(self.h_output, d_output)
+        self.stream.synchronize()
+
+        # Copy so the logged array is independent of the reusable host buffer
+        output = np.copy(self.h_output)
+        self.get_logger().info(f"Postprocessed tensor: {output}")
+
+        # Clean up IPC memory handles
+        cuda.ipc_close_mem_handle(d_output)
+        cuda.ipc_close_mem_handle(d_image)
diff --git a/python_wip/pytorch_ex.py b/python_wip/pytorch_ex.py
@@ -0,0 +1,35 @@
+import torch
+import torchvision.transforms as T
+
+def preprocess_image_pytorch(self, image):
+    """Crop, resize and normalise ``image`` on the GPU, then publish an IPC handle for it.
+
+    Args:
+        self: Node-like object providing ``roi_dimensions``, ``velocity``,
+            ``shift_constant``, ``dimensions``, ``get_logger()`` and
+            ``pointer_publisher``.
+        image: NumPy image array indexed as ``[row, column, channel]``.
+            Assumes uint8 pixel values (scaled by 1/255 below) — TODO confirm.
+
+    NOTE(review): ``cuda`` and ``String`` are not imported in this file, and
+    the ``.handle`` attribute of the IPC handle is unverified; these must be
+    resolved before the function can run end to end.
+    """
+    # Imported locally because this file has no module-level `import time`.
+    import time
+
+    tic = time.perf_counter_ns()
+
+    roi_x, roi_y, roi_w, roi_h = self.roi_dimensions
+    # Slice bounds must be integers even if velocity/shift_constant are floats.
+    shifted_x = int(roi_x + abs(self.velocity[0]) * self.shift_constant)
+
+    # Convert image to PyTorch tensor and move to GPU
+    image_tensor = torch.from_numpy(image).cuda()
+
+    # Crop the shifted ROI and drop any alpha channel while still in HWC layout
+    roi = image_tensor[roi_y:(roi_y + roi_h), shifted_x:(shifted_x + roi_w), :3]
+
+    # T.ToTensor() only accepts PIL images / ndarrays, so do its work by hand:
+    # HWC -> CHW, float, scaled to [0, 1]. T.Resize expects [..., H, W] tensors.
+    chw = roi.permute(2, 0, 1).float().div(255.0)
+
+    # Define preprocessing transformations
+    transform = T.Compose([
+        T.Resize(self.dimensions),  # Resize to model input size
+        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize
+    ])
+
+    # Add the batch dimension; contiguous() so the data pointer addresses a dense NCHW buffer
+    input_data = transform(chw).unsqueeze(0).contiguous()
+
+    d_input_ptr = input_data.data_ptr()  # Get device pointer of the tensor
+
+    # Publish the IPC handle or pointer
+    ipc_handle = cuda.mem_get_ipc_handle(d_input_ptr)
+
+    toc = time.perf_counter_ns()
+    self.get_logger().info(f"Preprocessing: {(toc-tic)/1e6} ms")
+
+    # Publish the IPC handle
+    ipc_handle_msg = String()
+    ipc_handle_msg.data = str(ipc_handle.handle)
+    self.pointer_publisher.publish(ipc_handle_msg)