diff --git a/python_wip/Stream_E2E.py b/python_wip/Stream_E2E.py
new file mode 100644
index 0000000..bfb6dec
--- /dev/null
+++ b/python_wip/Stream_E2E.py
@@ -0,0 +1,161 @@
+# passing streams/sync events across w/ stream manager class? modifying in place..
+
+import time
+
+import rclpy
+from rclpy.node import Node
+import pycuda.driver as cuda
+import pycuda.autoinit
+import numpy as np
+import cv2
+import cv2.cuda as cv2_cuda
+
+# MemoryHandle is a custom ROS 2 message carrying the serialized IPC handles;
+# the package name below is a placeholder -- point it at wherever the msg lives.
+from custom_interfaces.msg import MemoryHandle
+
+class InferenceNode(Node):
+    def __init__(self):
+        super().__init__('inference_node')
+
+        # Initialize CUDA context
+        self.cuda_driver_context = cuda.Device(0).make_context()
+        self.stream = cuda.Stream()
+
+        # Allocate GPU memory for input and output tensors using cudaMalloc
+        self.h_input = np.random.randn(1, 3, 224, 224).astype(np.float32)
+        self.h_output = np.empty((1, 1000), dtype=np.float32)
+
+        self.d_input = cuda.mem_alloc(self.h_input.nbytes)
+        self.d_output = cuda.mem_alloc(self.h_output.nbytes)
+
+        # Example image (allocate on GPU)
+        self.cv_image = np.random.rand(480, 640, 3).astype(np.uint8)
+        self.cv_cuda_image = cv2_cuda.GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)
+
+        # Upload image to GPU (device memory)
+        self.cv_cuda_image.upload(self.cv_image)
+
+        # Create CUDA IPC handles for the output tensor and the image
+        self.output_ipc_handle = cuda.mem_get_ipc_handle(self.d_output)
+        self.image_ipc_handle = cuda.mem_get_ipc_handle(self.cv_cuda_image.cudaPtr())
+
+        # Publish the IPC handles to the postprocessing node
+        self.publisher_ = self.create_publisher(MemoryHandle, 'inference_done', 10)
+
+    def run_inference(self):
+        tic = time.perf_counter_ns()
+        self.cuda_driver_context.push()
+
+        # Transfer data to device asynchronously
+        cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream)
+
+        # Execute inference asynchronously (self.exec_context is the TensorRT
+        # execution context; engine loading is omitted in this WIP sketch)
+        self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle)
+        self.stream.synchronize()
+
+        self.cuda_driver_context.pop()
+        toc = time.perf_counter_ns()
+
+        self.get_logger().info(f"Inference done in: {(toc-tic)/1e6} ms")
+
+        # Publish the IPC handles to the postprocessing node
+        msg = MemoryHandle()
+        msg.tensor_ipc_handle = str(self.output_ipc_handle)
+        msg.image_ipc_handle = str(self.image_ipc_handle)
+        self.publisher_.publish(msg)
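Aside: the handle hand-off above is the crux of this design. Here is a minimal, self-contained sketch of the same CUDA IPC round trip with PyCUDA and no ROS, assuming `mem_get_ipc_handle`'s return value pickles across processes (worth verifying on your PyCUDA version); CUDA IPC also requires Linux and both processes on the same device:

```python
import multiprocessing as mp
import numpy as np

def consumer(handle, n, q):
    import pycuda.autoinit                # fresh CUDA context in this process
    import pycuda.driver as cuda
    d_ptr = cuda.IPCMemoryHandle(handle)  # maps the producer's allocation
    host = np.empty(n, dtype=np.float32)
    cuda.memcpy_dtoh(host, d_ptr)
    q.put(host[:4].tolist())

if __name__ == '__main__':
    mp.set_start_method('spawn')          # CUDA contexts do not survive fork()
    import pycuda.autoinit
    import pycuda.driver as cuda
    data = np.arange(16, dtype=np.float32)
    d_buf = cuda.mem_alloc(data.nbytes)
    cuda.memcpy_htod(d_buf, data)
    handle = cuda.mem_get_ipc_handle(d_buf)  # small blob, sent to the peer
    q = mp.Queue()
    p = mp.Process(target=consumer, args=(handle, data.size, q))
    p.start()
    print(q.get())                        # -> [0.0, 1.0, 2.0, 3.0]
    p.join()
```

Note that the node above publishes `str(handle)`; reconstructing a handle from its string repr is lossy, so in practice the message should carry the raw handle bytes.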
+
+
+import rclpy
+from rclpy.node import Node
+import pycuda.driver as cuda
+import pycuda.autoinit
+import numpy as np
+import cv2
+import cv2.cuda as cv2_cuda
+
+class PostprocessingNode(Node):
+    def __init__(self):
+        super().__init__('postprocessing_node')
+
+        # Create CUDA context
+        self.cuda_driver_context = cuda.Device(0).make_context()
+        self.stream = cuda.Stream()
+
+        # Host-side buffers; shapes must match what the inference node allocated
+        self.h_output = np.empty((1, 1000), dtype=np.float32)
+        self.cv_image = np.empty((480, 640, 3), dtype=np.uint8)
+
+        # Subscribe to inference_done topic to get IPC handles
+        self.subscription = self.create_subscription(
+            MemoryHandle,
+            'inference_done',
+            self.postprocess_callback,
+            10
+        )
+
+    def postprocess_callback(self, msg):
+        # Open the IPC memory handles for tensor and image. PyCUDA maps the
+        # remote allocation when IPCMemoryHandle is constructed; note that a
+        # str()-round-tripped handle is lossy -- raw bytes belong in the msg.
+        d_output = cuda.IPCMemoryHandle(msg.tensor_ipc_handle)
+        d_image = cuda.IPCMemoryHandle(msg.image_ipc_handle)
+
+        # Wrap the image GPU pointer into a GpuMat for OpenCV CUDA operations.
+        # createGpuMatFromCudaMemory (recent cv2 builds) wraps the device
+        # pointer in place; upload() would copy from *host* memory instead.
+        cv_cuda_image = cv2.cuda.createGpuMatFromCudaMemory(
+            self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3, int(d_image))
+
+        # Perform OpenCV CUDA operations on the image (e.g., GaussianBlur);
+        # GpuMat has no gaussianBlur method, CUDA filters are created explicitly
+        gauss = cv2.cuda.createGaussianFilter(cv2.CV_8UC3, cv2.CV_8UC3, (5, 5), 0)
+        blurred_image = gauss.apply(cv_cuda_image)
+
+        # Retrieve inference result and postprocess
+        cuda.memcpy_dtoh(self.h_output, d_output)
+        self.stream.synchronize()
+
+        output = np.copy(self.h_output)
+        self.get_logger().info(f"Postprocessed tensor: {output}")
+
+        # Clean up the IPC mappings (PyCUDA closes them via .close())
+        d_output.close()
+        d_image.close()
+
+
+import rclpy
+from rclpy.node import Node
+import pycuda.driver as cuda
+import pycuda.autoinit
+import numpy as np
+import cv2
+import cv2.cuda as cv2_cuda
+
+class PostprocessingNode(Node):
+    def __init__(self):
+        super().__init__('postprocessing_node')
+        self.subscription = self.create_subscription(
+            MemoryHandle,
+            'inference_done',
+            self.postprocess_callback,
+            10
+        )
+
+        # Ensure same CUDA context or initialize a new one if needed
+        self.cuda_driver_context = cuda.Device(0).make_context()
+
+        # Host buffer / stream for readback (shape must match the producer)
+        self.h_output = np.empty((1, 1000), dtype=np.float32)
+        self.stream = cuda.Stream()
+
+    def postprocess_callback(self, msg):
+        # Get the IPC handle and shared image address; PyCUDA maps the remote
+        # allocation when IPCMemoryHandle is constructed
+        d_output = cuda.IPCMemoryHandle(msg.ipc_handle)
+        shared_image_address = msg.shared_image_address
+
+        # Access the shared image directly from unified memory (no download);
+        # wrap the raw pointer rather than upload()-ing, which copies from host
+        cv_cuda_image = cv2.cuda.createGpuMatFromCudaMemory(
+            480, 640, cv2.CV_8UC3, int(shared_image_address))
+
+        # Example OpenCV CUDA operation: GaussianBlur
+        gauss = cv2.cuda.createGaussianFilter(cv2.CV_8UC3, cv2.CV_8UC3, (5, 5), 0)
+        blurred_image = gauss.apply(cv_cuda_image)
+
+        # Postprocess the inference output and the blurred image
+        cuda.memcpy_dtoh(self.h_output, d_output)
+        self.stream.synchronize()
+
+        output = np.copy(self.h_output)
+        self.get_logger().info(f"Postprocessed tensor: {output}")
+
+        # Clean up the IPC mapping
+        d_output.close()
+
+# this uses unified memory...
\ No newline at end of file diff --git a/python_wip/async_cuda.py b/python_wip/async_cuda.py deleted file mode 100644 index a91f366..0000000 --- a/python_wip/async_cuda.py +++ /dev/null @@ -1,100 +0,0 @@ -import rclpy -from rclpy.callback_groups import ReentrantCallbackGroup -from rclpy.executors import MultiThreadedExecutor -from rclpy.node import Node -from sensor_msgs.msg import Image -import cv2 -import numpy as np -import asyncio # Import asyncio for async handling -from trt_inference import TRTInference # TensorRT inference class -import threading # For concurrency control -import pycuda.driver as cuda # Assuming CUDA is required for inference - -class BoundingBoxNode(Node): - def __init__(self): - super().__init__('bounding_box_node') - - # Create reentrant callback groups for left and right image callbacks - self.callback_group_left = ReentrantCallbackGroup() - self.callback_group_right = ReentrantCallbackGroup() - - # Subscribers for left and right camera image topics, each with a different callback group - self.left_image_subscriber = self.create_subscription( - Image, '/camera/left/image_raw', self.left_image_callback, 10, callback_group=self.callback_group_left) - - self.right_image_subscriber = self.create_subscription( - Image, '/camera/right/image_raw', self.right_image_callback, 10, callback_group=self.callback_group_right) - - # Initialize TensorRT inference engine - self.trt_inference = TRTInference('model.engine') - self.cuda_stream = cuda.Stream() - - # Buffer for left and right images (shared resource) - self.left_image = None - self.right_image = None - - # Add a lock to avoid race conditions on image access - self.lock = threading.Lock() - - def left_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - with self.lock: - self.left_image = frame - - # If both left and right images are available, start async inference - if self.right_image is not None: - asyncio.create_task(self.run_inference()) - - def right_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - with self.lock: - self.right_image = frame - - # If both left and right images are available, start async inference - if self.left_image is not None: - asyncio.create_task(self.run_inference()) - - async def run_inference(self): - # Lock the shared resource (images) - with self.lock: - # Batch the images for inference - left_img = self.left_image - right_img = self.right_image - - # Reset images to avoid reprocessing the same images - self.left_image = None - self.right_image = None - - # Run inference asynchronously - if left_img is not None and right_img is not None: - await self.async_inference(left_img, right_img) - - async def async_inference(self, left_img, right_img): - # This method would use the CUDA stream to run TensorRT inference asynchronously. - # Actual TensorRT inference code goes here... - # Ensure that TensorRT uses the self.cuda_stream for asynchronous execution. - ... 
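Aside: `async_inference` is stubbed out above with `...`. A hedged sketch of one way to fill it in, assuming the TRTInference wrapper exposes a blocking `infer(batch, stream)` call (that name is an assumption): push the blocking work onto a thread so the event loop stays responsive. Note the callbacks call `asyncio.create_task`, which only works while an asyncio event loop is running alongside the ROS executor; that wiring is not shown in this file.

```python
import asyncio
import numpy as np

async def async_inference(self, left_img, right_img):
    # Batch the two views; shape/layout depends on the engine's input binding
    batch = np.stack([left_img, right_img])
    loop = asyncio.get_running_loop()
    # run_in_executor keeps stream.synchronize() off the event-loop thread
    output = await loop.run_in_executor(
        None, lambda: self.trt_inference.infer(batch, self.cuda_stream))
    return output
```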
- - def ros2_image_to_numpy(self, msg): - img = np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, 3) - return img - -def main(args=None): - rclpy.init(args=args) - - node = BoundingBoxNode() - - # Use a MultiThreadedExecutor with 2 threads - executor = MultiThreadedExecutor(num_threads=2) - executor.add_node(node) - - try: - executor.spin() - except KeyboardInterrupt: - pass - finally: - node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() diff --git a/python_wip/cuda_nonblocking.py b/python_wip/cuda_nonblocking.py deleted file mode 100644 index f3264ed..0000000 --- a/python_wip/cuda_nonblocking.py +++ /dev/null @@ -1,48 +0,0 @@ -import rclpy -from rclpy.node import Node -from sensor_msgs.msg import Image -import pycuda.driver as cuda -import numpy as np -import cv2 -import threading - -class ImagePreprocessingNode(Node): - def __init__(self, sync_event): - super().__init__('image_preprocessing_node') - self.sync_event = sync_event # Synchronization event to signal inference node - self.width = 224 - self.height = 224 - self.channels = 3 - - # Allocate CUDA memory for the image and create CUDA stream for non-blocking processing - self.cuda_mem = cuda.mem_alloc(self.width * self.height * self.channels * np.dtype(np.float32).itemsize) - self.stream = cuda.Stream() # Create CUDA stream for asynchronous execution - - # Create a subscription to image topic (e.g., from a camera) - self.subscription = self.create_subscription( - Image, - '/camera/image', - self.image_callback, - 10 - ) - - def image_callback(self, msg): - # Convert ROS Image message to a numpy array (assuming the image is encoded in RGB8) - image = np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, self.channels) - image = cv2.resize(image, (self.width, self.height)).astype(np.float32) / 255.0 - - # Copy image from host (CPU) to device (GPU) asynchronously using CUDA stream - cuda.memcpy_htod_async(self.cuda_mem, image, self.stream) - - # After copying, the image is preprocessed asynchronously, use CUDA stream to synchronize - self.stream.synchronize() # Wait for the preprocessing task to complete - - # Signal the inference node that preprocessing is done - self.sync_event.set() - self.get_logger().info("Image preprocessed and copied to CUDA memory") - - def get_cuda_memory(self): - return self.cuda_mem - - def get_cuda_stream(self): - return self.stream # Expose the CUDA stream for the inference node to use diff --git a/python_wip/cuda_nonblocking_infer.py b/python_wip/cuda_nonblocking_infer.py deleted file mode 100644 index 274368b..0000000 --- a/python_wip/cuda_nonblocking_infer.py +++ /dev/null @@ -1,55 +0,0 @@ -import rclpy -from rclpy.node import Node -import tensorrt as trt -import pycuda.driver as cuda -import numpy as np -import threading - -class InferenceNode(Node): - def __init__(self, preprocess_node, sync_event): - super().__init__('inference_node') - self.preprocess_node = preprocess_node - self.sync_event = sync_event # Synchronization event - self.create_timer(0.1, self.run_inference) # Periodically check for new images - - # Load the TensorRT engine - self.engine = self.load_engine('model.trt') - self.context = self.engine.create_execution_context() - - # Allocate device memory for input/output bindings - self.input_shape = (1, 3, 224, 224) - self.input_size = np.prod(self.input_shape) * np.dtype(np.float32).itemsize - self.d_output = cuda.mem_alloc(self.input_size) # assuming output size is the same as input - - # Create a CUDA stream for 
asynchronous inference execution - self.inference_stream = cuda.Stream() - - def load_engine(self, engine_path): - with open(engine_path, "rb") as f: - runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) - return runtime.deserialize_cuda_engine(f.read()) - - def run_inference(self): - # Wait until the preprocessing node signals that a new image is ready - if not self.sync_event.is_set(): - return # No image ready for inference yet - - # Reset the event so that the preprocessing node can signal for the next image - self.sync_event.clear() - - # Get the CUDA memory pointer and stream from the preprocessing node - d_preprocessed_image = self.preprocess_node.get_cuda_memory() - preprocess_stream = self.preprocess_node.get_cuda_stream() - - # Run inference using TensorRT asynchronously in its own stream - bindings = [int(d_preprocessed_image), int(self.d_output)] - self.context.execute_async_v2(bindings, self.inference_stream.handle) - - # Synchronize the inference stream to ensure inference is complete before proceeding - self.inference_stream.synchronize() - - # Assuming the result is in d_output, we could copy it back to host if needed - result = np.empty(self.input_shape, dtype=np.float32) - cuda.memcpy_dtoh_async(result, self.d_output, self.inference_stream) - self.inference_stream.synchronize() # Ensure the copy operation is done - self.get_logger().info(f"Inference complete. Output shape: {result.shape}") diff --git a/python_wip/cuda_sync.py b/python_wip/cuda_sync.py deleted file mode 100644 index c093c3e..0000000 --- a/python_wip/cuda_sync.py +++ /dev/null @@ -1,41 +0,0 @@ -import rclpy -from rclpy.node import Node -from sensor_msgs.msg import Image -import pycuda.driver as cuda -import numpy as np -import cv2 -import threading - -class ImagePreprocessingNode(Node): - def __init__(self, sync_event): - super().__init__('image_preprocessing_node') - self.sync_event = sync_event # Synchronization event to signal inference node - self.width = 224 - self.height = 224 - self.channels = 3 - - # Allocate CUDA memory for the image - self.cuda_mem = cuda.mem_alloc(self.width * self.height * self.channels * np.dtype(np.float32).itemsize) - - # Create a subscription to image topic (e.g., from a camera) - self.subscription = self.create_subscription( - Image, - '/camera/image', - self.image_callback, - 10 - ) - - def image_callback(self, msg): - # Convert ROS Image message to a numpy array (assuming the image is encoded in RGB8) - image = np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, self.channels) - image = cv2.resize(image, (self.width, self.height)).astype(np.float32) / 255.0 - - # Copy image from host (CPU) to device (GPU) - cuda.memcpy_htod(self.cuda_mem, image) - - # Signal the inference node that preprocessing is done - self.sync_event.set() - self.get_logger().info("Image preprocessed and copied to CUDA memory") - - def get_cuda_memory(self): - return self.cuda_mem diff --git a/python_wip/cuda_sync_infer.py b/python_wip/cuda_sync_infer.py deleted file mode 100644 index 6f0fa70..0000000 --- a/python_wip/cuda_sync_infer.py +++ /dev/null @@ -1,47 +0,0 @@ -import rclpy -from rclpy.node import Node -import tensorrt as trt -import pycuda.driver as cuda -import numpy as np -import threading - -class InferenceNode(Node): - def __init__(self, preprocess_node, sync_event): - super().__init__('inference_node') - self.preprocess_node = preprocess_node - self.sync_event = sync_event # Synchronization event - self.create_timer(0.1, self.run_inference) # Periodically check for 
new images - - # Load the TensorRT engine - self.engine = self.load_engine('model.trt') - self.context = self.engine.create_execution_context() - - # Allocate device memory for input/output bindings - self.input_shape = (1, 3, 224, 224) - self.input_size = np.prod(self.input_shape) * np.dtype(np.float32).itemsize - self.d_output = cuda.mem_alloc(self.input_size) # assuming output size is the same as input - - def load_engine(self, engine_path): - with open(engine_path, "rb") as f: - runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING)) - return runtime.deserialize_cuda_engine(f.read()) - - def run_inference(self): - # Wait until the preprocessing node signals that a new image is ready - if not self.sync_event.is_set(): - return # No image ready for inference yet - - # Reset the event so that the preprocessing node can signal for the next image - self.sync_event.clear() - - # Get the CUDA memory pointer from the preprocessing node - d_preprocessed_image = self.preprocess_node.get_cuda_memory() - - # Run inference using TensorRT on the preprocessed image - bindings = [int(d_preprocessed_image), int(self.d_output)] - self.context.execute_v2(bindings) - - # Assuming the result is in d_output, we could copy it back to host if needed - result = np.empty(self.input_shape, dtype=np.float32) - cuda.memcpy_dtoh(result, self.d_output) - self.get_logger().info(f"Inference complete. Output shape: {result.shape}") diff --git a/python_wip/cuda_sync_launch.py b/python_wip/cuda_sync_launch.py deleted file mode 100644 index 89cd29f..0000000 --- a/python_wip/cuda_sync_launch.py +++ /dev/null @@ -1,41 +0,0 @@ -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode -from launch import LaunchDescription -import threading - -sync_event = threading.Event() - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='gpu_composable_container', - namespace='', - package='rclpy_components', - executable='component_container_mt', # Multi-threaded container for composable nodes - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_package.ImagePreprocessingNode', - name='image_preprocessing_node', - parameters=[ - {"sync_event": sync_event} - ] - ), - ComposableNode( - package='my_package', - plugin='my_package.InferenceNode', - name='inference_node', - parameters=[ - {"preprocess_node": "image_preprocessing_node", "sync_event": sync_event} - ] - ), - ], - output='screen', - ), - ]) - -# Threading Event: The synchronization between the preprocessing and inference nodes is handled by a threading event (sync_event). The event is set by the image preprocessing node once the image is copied to CUDA memory and is ready for inference. The inference node waits for this event to be set, indicating that the preprocessing is done, and clears the event after starting inference. This ensures that each image is processed in order and no data is overwritten or lost. - -# Callback-Based Image Processing: The image preprocessing node processes each incoming image in the callback from the ROS2 image subscription. After processing the image, it signals the inference node to begin the inference process. - -# Periodic Inference Check: The inference node runs on a timer that periodically checks whether a new image is ready by checking the threading event. Once the event is set (indicating that an image is ready), it retrieves the CUDA memory pointer and performs inference. 
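Aside: the Event handshake these notes describe is plain Python and easy to test outside ROS. One caveat on the launch file above: ROS 2 parameters only carry serializable primitives, so passing a `threading.Event` through `parameters` will not work as written; composed nodes would need to share the event through in-process plumbing instead. A minimal sketch of the handshake itself:

```python
import threading
import time

sync_event = threading.Event()
shared = {"frame": None}             # stands in for the CUDA buffer

def preprocess():
    for i in range(3):
        shared["frame"] = i          # "copy image to CUDA memory"
        sync_event.set()             # signal: image ready
        time.sleep(0.05)

def infer():
    for _ in range(3):
        sync_event.wait()            # block until an image is ready
        sync_event.clear()           # re-arm for the next image
        print("inference on frame", shared["frame"])

t_pre = threading.Thread(target=preprocess)
t_inf = threading.Thread(target=infer)
t_inf.start(); t_pre.start()
t_pre.join(); t_inf.join()
```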
\ No newline at end of file
diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/benchmarking.py b/python_wip/depth_sensing.py
similarity index 100%
rename from workspace_python/ros2_ws/src/python_workspace/launch/benchmarking.py
rename to python_wip/depth_sensing.py
diff --git a/python_wip/inference_comparison.py b/python_wip/inference_comparison.py
new file mode 100644
index 0000000..17ba994
--- /dev/null
+++ b/python_wip/inference_comparison.py
@@ -0,0 +1,253 @@
+import os
+import time
+
+import numpy as np
+import tensorrt as trt
+import pycuda.driver as cuda
+import cupy as cp
+import cv2
+
+from std_msgs.msg import String  # assumed: used by the pointer pub/sub below
+
+# warmup -- scratch lifted from a Node's __init__, so the module-level `self`
+# and `return` below assume that context
+self.engine_path = self.get_parameter('engine_path').get_parameter_value().string_value
+self.strip_weights = self.get_parameter('strip_weights').get_parameter_value().bool_value
+self.precision = self.get_parameter('precision').get_parameter_value().string_value
+
+self.engine, self.context = self.load_normal_engine()
+
+# could replace trt.volume with cp.prod # d_input = cuda.mem_alloc(input_size * np.float32().nbytes)
+self.stream = cuda.Stream()
+self.input_shape = self.engine.get_binding_shape(0)
+self.output_shape = self.engine.get_binding_shape(1)
+self.d_input = cuda.mem_alloc(trt.volume(self.input_shape) * cp.dtype(cp.float32).itemsize)  # change to fp16, etc.
+self.d_output = cuda.mem_alloc(trt.volume(self.output_shape) * cp.dtype(cp.float32).itemsize)
+
+# was create_publisher; a subscription matches the name and the callback arg
+self.pointer_subscriber = self.create_subscription(String, 'preprocessing_done', self.pointer_callback, 10)
+self.pointer_publisher = self.create_publisher(String, 'inference_done', 10)
+self.arrival_time, self.type = 0, None
+self.warmup()
+
+def load_normal_engine(self):
+    if not os.path.exists(self.engine_path):
+        self.get_logger().error(f"Engine file not found at {self.engine_path}")
+        return None
+
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+    with open(self.engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
+        engine = runtime.deserialize_cuda_engine(f.read())
+    context = engine.create_execution_context()
+    self.get_logger().info(f"Successfully loaded engine from {self.engine_path}")
+    return engine, context
+
+def warmup(self):
+    input_shape = self.input_shape
+
+    for _ in range(20):
+        random_input = np.random.randn(*input_shape).astype(np.float32)
+        cuda.memcpy_htod(self.d_input, random_input)
+        self.context.execute(bindings=[int(self.d_input), int(self.d_output)])
+
+    self.get_logger().info("Engine warmed up with 20 inference passes.")
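Aside: the timing in this file (and in the harness further down) reads `time.time()` around asynchronous GPU work. CUDA events give device-side timestamps instead; a minimal PyCUDA sketch, using an async copy as a stand-in for the inference call:

```python
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
start, end = cuda.Event(), cuda.Event()

h_buf = cuda.pagelocked_empty(1 << 20, dtype=np.float32)  # pinned, so async copy is legal
d_buf = cuda.mem_alloc(h_buf.nbytes)

start.record(stream)
cuda.memcpy_htod_async(d_buf, h_buf, stream)   # stand-in for execute_async_v2
end.record(stream)
end.synchronize()                              # wait for the recorded point
print(f"elapsed: {start.time_till(end):.3f} ms")
```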
+# Load TensorRT engine and create execution context (example)
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+
+def load_engine(engine_file_path):
+    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
+        return runtime.deserialize_cuda_engine(f.read())
+
+engine = load_engine("model.trt")
+context = engine.create_execution_context()
+
+if self.engine_path.endswith('.trt') or self.engine_path.endswith('.engine'):
+    if self.strip_weights:
+        self.engine = self.load_stripped_engine_and_refit()
+        self.type = "trt_stripped"
+    else:
+        self.engine = self.load_normal_engine()
+        self.type = "trt_normal"
+elif self.engine_path.endswith('.pth'):
+    from torch2trt import TRTModule
+    import torch
+    self.engine = TRTModule()
+    self.engine.load_state_dict(torch.load(self.engine_path))
+    self.type = "torch_trt"
+else:
+    self.get_logger().error("Invalid engine file format. Please provide a .trt, .engine, or .pth file")
+    return None
+
+if self.type == "trt_stripped" or self.type == "trt_normal":
+    self.allocate_buffers()
+    self.exec_context = self.engine.create_execution_context()
+else:
+    self.inference_type = "torch_trt"
+
+def load_stripped_engine_and_refit(self):
+    if not os.path.exists(self.engine_path):
+        self.get_logger().error(f"Engine file not found at {self.engine_path}")
+        return None
+
+    if not os.path.exists(self.model_path):
+        self.get_logger().error(f"Model file not found at {self.model_path}")
+        return None
+
+    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+    with open(self.engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
+        engine = runtime.deserialize_cuda_engine(f.read())
+        refitter = trt.Refitter(engine, TRT_LOGGER)
+        parser_refitter = trt.OnnxParserRefitter(refitter, TRT_LOGGER)
+        assert parser_refitter.refit_from_file(self.model_path)
+        assert refitter.refit_cuda_engine()
+        return engine
+
+# fixed allocation: does not account for multiple bindings/batch sizes (single input -> output tensor)
+def allocate_buffers(self):
+    engine = self.engine
+    # Create a CUDA stream for async execution
+    self.stream = cuda.Stream()
+
+    self.input_shape = engine.get_binding_shape(0)
+    self.output_shape = engine.get_binding_shape(1)
+
+    # Allocate device memory for input/output
+    self.d_input = cuda.mem_alloc(trt.volume(self.input_shape) * np.dtype(np.float32).itemsize)
+    self.d_output = cuda.mem_alloc(trt.volume(self.output_shape) * np.dtype(np.float32).itemsize)
+
+    # Allocate host pinned memory for input/output (pinned memory for input/output buffers)
+    self.h_input = cuda.pagelocked_empty(trt.volume(self.input_shape), dtype=np.float32)
+    self.h_output = cuda.pagelocked_empty(trt.volume(self.output_shape), dtype=np.float32)
+
+    # Example image (allocate on GPU)
+    self.cv_image = np.random.rand(480, 640, 3).astype(np.uint8)
+    self.cv_cuda_image = cv2.cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3)
+
+    # Upload image to GPU (device memory)
+    self.cv_cuda_image.upload(self.cv_image)
+
+import time
+import tensorrt as trt
+import pycuda.driver as cuda
+import pycuda.autoinit
+import torch
+import numpy as np
+from torch2trt import TRTModule
+
+# Define paths and settings
+engine_path = "model.trt"  # Replace with actual paths for your engines
+strip_weights = False  # Set True if using stripped weights for TensorRT
+precision = "fp32"  # Set precision (fp32, fp16, int8)
+
+class InferenceComparison:
+
+    def __init__(self, engine_path, strip_weights, precision):
+        self.engine_path = engine_path
+        self.strip_weights = strip_weights
+        self.precision = precision
+        self.stream = cuda.Stream()
+        self.engine = None
+        self.context = None
+        self.h_input = None
+        self.h_output = None
+        self.d_input = None
+        self.d_output = None
+
+    def allocate_buffers(self, engine):
+        self.input_shape = engine.get_binding_shape(0)
+        self.output_shape = engine.get_binding_shape(1)
+
+        # Allocate device memory for input/output
+        self.d_input = cuda.mem_alloc(trt.volume(self.input_shape) * np.dtype(np.float32).itemsize)
+        self.d_output = cuda.mem_alloc(trt.volume(self.output_shape) * np.dtype(np.float32).itemsize)
+
+        # Allocate host pinned memory for input/output (for asynchronous execution)
+        self.h_input = cuda.pagelocked_empty(trt.volume(self.input_shape), dtype=np.float32)
+        self.h_output = cuda.pagelocked_empty(trt.volume(self.output_shape), dtype=np.float32)
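Aside: `allocate_buffers` uses `cuda.pagelocked_empty` for the host side. Pinned (page-locked) memory is what makes `memcpy_htod_async` truly asynchronous, and it is also faster for plain copies; a small micro-benchmark sketch:

```python
import time
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

n = 1 << 24                                   # ~64 MB of float32
pageable = np.zeros(n, dtype=np.float32)
pinned = cuda.pagelocked_empty(n, dtype=np.float32)
d_buf = cuda.mem_alloc(pageable.nbytes)

for name, host in (("pageable", pageable), ("pinned", pinned)):
    cuda.memcpy_htod(d_buf, host)             # warm-up
    tic = time.perf_counter()
    for _ in range(10):
        cuda.memcpy_htod(d_buf, host)
    cuda.Context.synchronize()
    print(f"{name}: {(time.perf_counter() - tic) / 10 * 1e3:.2f} ms/copy")
```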
+    def load_normal_engine(self):
+        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        with open(self.engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
+            engine = runtime.deserialize_cuda_engine(f.read())
+        context = engine.create_execution_context()
+        return engine, context
+
+    def load_stripped_engine_and_refit(self):
+        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+        with open(self.engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
+            engine = runtime.deserialize_cuda_engine(f.read())
+            refitter = trt.Refitter(engine, TRT_LOGGER)
+            # Refit weights here if necessary
+            assert refitter.refit_cuda_engine()
+            return engine
+
+    def load_torch_trt(self):
+        model = TRTModule()
+        model.load_state_dict(torch.load(self.engine_path))
+        return model
+
+    def warmup(self, engine_type):
+        for _ in range(10):  # Warmup with 10 inference passes
+            random_input = np.random.randn(*self.input_shape).astype(np.float32)
+            if engine_type == "torch_trt":
+                # torch2trt modules expect a CUDA tensor, not a numpy array
+                self.engine(torch.from_numpy(random_input).cuda())
+            else:
+                cuda.memcpy_htod(self.d_input, random_input)
+                self.context.execute(bindings=[int(self.d_input), int(self.d_output)])
+
+    def infer_with_trt_normal(self):
+        self.engine, self.context = self.load_normal_engine()
+        self.allocate_buffers(self.engine)
+
+        # Warmup phase
+        self.warmup("trt_normal")
+
+        # Run actual inference and measure time
+        start_time = time.time()
+        cuda.memcpy_htod(self.d_input, np.random.randn(*self.input_shape).astype(np.float32))
+        self.context.execute(bindings=[int(self.d_input), int(self.d_output)])
+        cuda.memcpy_dtoh(self.h_output, self.d_output)
+        trt_normal_time = time.time() - start_time
+
+        return self.h_output, trt_normal_time
+
+    def infer_with_trt_stripped(self):
+        self.engine = self.load_stripped_engine_and_refit()
+        self.context = self.engine.create_execution_context()
+        self.allocate_buffers(self.engine)
+
+        # Warmup phase
+        self.warmup("trt_stripped")
+
+        # Run actual inference and measure time
+        start_time = time.time()
+        cuda.memcpy_htod(self.d_input, np.random.randn(*self.input_shape).astype(np.float32))
+        self.context.execute(bindings=[int(self.d_input), int(self.d_output)])
+        cuda.memcpy_dtoh(self.h_output, self.d_output)
+        trt_stripped_time = time.time() - start_time
+
+        return self.h_output, trt_stripped_time
+
+    def infer_with_torch_trt(self):
+        self.engine = self.load_torch_trt()
+
+        # Create random input tensor
+        input_tensor = torch.randn(1, 3, 224, 224).cuda()
+
+        # Warmup phase
+        for _ in range(10):
+            _ = self.engine(input_tensor)
+
+        # Run actual inference and measure time (synchronize so the
+        # asynchronous launch is actually included in the measurement)
+        torch.cuda.synchronize()
+        start_time = time.time()
+        output = self.engine(input_tensor)
+        torch.cuda.synchronize()
+        torch_trt_time = time.time() - start_time
+
+        return output, torch_trt_time
+
+
+# Instantiate the comparison class
+comparison = InferenceComparison(engine_path, strip_weights, precision)
+
+# Run inference for each engine and compare times
+output_normal, normal_time = comparison.infer_with_trt_normal()
+print(f"Normal TensorRT Inference Time: {normal_time:.6f} seconds")
+
+output_stripped, stripped_time = comparison.infer_with_trt_stripped()
+print(f"Stripped TensorRT Inference Time: {stripped_time:.6f} seconds")
+
+output_torch_trt, torch_trt_time = comparison.infer_with_torch_trt()
+print(f"Torch2TRT Inference Time: {torch_trt_time:.6f} seconds")
\ No newline at end of file
diff --git a/python_wip/inference_stream.py b/python_wip/inference_stream.py deleted file mode 100644 index b1978d6..0000000 --- a/python_wip/inference_stream.py +++ /dev/null @@ -1,58 +0,0 @@ -import rclpy -from rclpy.node import Node -import pycuda.driver as cuda -import pycuda.autoinit -import numpy as np -import cv2 -import cv2.cuda as
cv2_cuda - -class InferenceNode(Node): - def __init__(self): - super().__init__('inference_node') - - # Initialize CUDA context - self.cuda_driver_context = cuda.Device(0).make_context() - self.stream = cuda.Stream() - - # Allocate GPU memory for input and output tensors using cudaMalloc - self.h_input = np.random.randn(1, 3, 224, 224).astype(np.float32) - self.h_output = np.empty((1, 1000), dtype=np.float32) - - self.d_input = cuda.mem_alloc(self.h_input.nbytes) - self.d_output = cuda.mem_alloc(self.h_output.nbytes) - - # Example image (allocate on GPU) - self.cv_image = np.random.rand(480, 640, 3).astype(np.uint8) - self.cv_cuda_image = cv2_cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3) - - # Upload image to GPU (device memory) - self.cv_cuda_image.upload(self.cv_image) - - # Create CUDA IPC handle for output tensor and image - self.output_ipc_handle = cuda.mem_get_ipc_handle(self.d_output) - self.image_ipc_handle = cuda.mem_get_ipc_handle(self.cv_cuda_image.cudaPtr()) - - # Publish the IPC handle to postprocessing node - self.publisher_ = self.create_publisher(MemoryHandle, 'inference_done', 10) - - def run_inference(self): - tic = time.perf_counter_ns() - self.cuda_driver_context.push() - - # Transfer data to device asynchronously - cuda.memcpy_htod_async(self.d_input, self.h_input, self.stream) - - # Execute inference asynchronously - self.exec_context.execute_async_v2(bindings=[int(self.d_input), int(self.d_output)], stream_handle=self.stream.handle) - self.stream.synchronize() - - self.cuda_driver_context.pop() - toc = time.perf_counter_ns() - - self.get_logger().info(f"Inference done in: {(toc-tic)/1e6} ms") - - # Publish the IPC handles to postprocessing node - msg = MemoryHandle() - msg.tensor_ipc_handle = str(self.output_ipc_handle) - msg.image_ipc_handle = str(self.image_ipc_handle) - self.publisher_.publish(msg) diff --git a/python_wip/jax_ex.py b/python_wip/jax_ex.py deleted file mode 100644 index f8069ed..0000000 --- a/python_wip/jax_ex.py +++ /dev/null @@ -1,9 +0,0 @@ -import jax -import jax.numpy as jnp - -a = jnp.array([1.0, 2.0, 3.0]) -b = jnp.array([4.0, 5.0, 6.0]) - -c = a + b - -print(c) \ No newline at end of file diff --git a/python_wip/lifecycle_architecture.py b/python_wip/lifecycle_architecture.py deleted file mode 100644 index d023499..0000000 --- a/python_wip/lifecycle_architecture.py +++ /dev/null @@ -1,183 +0,0 @@ -# lifecycle_node.py -import rclpy -from rclpy.lifecycle import LifecycleNode -from rclpy.lifecycle import State -from rclpy.lifecycle import TransitionCallbackReturn -from rclpy.executors import MultiThreadedExecutor - -class LifecycleManagerNode(LifecycleNode): - def __init__(self): - super().__init__('lifecycle_manager_node') - - # Declare parameters for the lifecycle node - self.declare_parameter('lifecycle_action', 'activate') - self.get_logger().info('LifecycleManagerNode has been created.') - - def on_configure(self, state: State) -> TransitionCallbackReturn: - self.get_logger().info('on_configure() called') - return TransitionCallbackReturn.SUCCESS - - def on_activate(self, state: State) -> TransitionCallbackReturn: - self.get_logger().info('on_activate() called') - return TransitionCallbackReturn.SUCCESS - - def on_deactivate(self, state: State) -> TransitionCallbackReturn: - self.get_logger().info('on_deactivate() called') - return TransitionCallbackReturn.SUCCESS - - def on_cleanup(self, state: State) -> TransitionCallbackReturn: - self.get_logger().info('on_cleanup() called') - return 
TransitionCallbackReturn.SUCCESS - - def on_shutdown(self, state: State) -> TransitionCallbackReturn: - self.get_logger().info('on_shutdown() called') - return TransitionCallbackReturn.SUCCESS - -def main(args=None): - rclpy.init(args=args) - - lifecycle_node = LifecycleManagerNode() - - executor = MultiThreadedExecutor() - executor.add_node(lifecycle_node) - - try: - rclpy.spin(lifecycle_node, executor=executor) - except KeyboardInterrupt: - pass - - lifecycle_node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() - -import rclpy -from rclpy.lifecycle import Node as LifecycleNode -from rclpy.lifecycle import State -from rclpy.lifecycle import TransitionCallbackReturn -import pycuda.driver as cuda -import pycuda.autoinit -import tensorrt as trt -import numpy as np -import time - -# TensorRT logger -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) - -class TRTLifecycleNode(LifecycleNode): - def __init__(self): - super().__init__('trt_lifecycle_node') - self.engine = None - self.context = None - self.inputs = None - self.outputs = None - self.bindings = None - self.stream = None - - def on_configure(self, state: State) -> TransitionCallbackReturn: - """Load and deserialize TensorRT engine and allocate buffers.""" - self.get_logger().info("Configuring... Loading TensorRT engine.") - - try: - # Load TensorRT engine - engine_path = '/path/to/your/model.trt' - self.engine = self.load_trt_engine(engine_path) - self.context = self.engine.create_execution_context() - - # Allocate buffers - self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers() - - self.get_logger().info("TensorRT engine loaded and buffers allocated.") - return TransitionCallbackReturn.SUCCESS - - except Exception as e: - self.get_logger().error(f"Failed to load TensorRT engine: {e}") - return TransitionCallbackReturn.FAILURE - - def on_activate(self, state: State) -> TransitionCallbackReturn: - """Start inference process or any task in this state.""" - self.get_logger().info("Activating... ready for inference.") - return TransitionCallbackReturn.SUCCESS - - def on_deactivate(self, state: State) -> TransitionCallbackReturn: - """Handle node deactivation (optional).""" - self.get_logger().info("Deactivating...") - return TransitionCallbackReturn.SUCCESS - - def on_cleanup(self, state: State) -> TransitionCallbackReturn: - """Clean up and release resources.""" - self.get_logger().info("Cleaning up... 
releasing resources.") - self.context = None - self.engine = None - self.inputs = None - self.outputs = None - self.bindings = None - self.stream = None - return TransitionCallbackReturn.SUCCESS - - def on_shutdown(self, state: State) -> TransitionCallbackReturn: - """Shutdown node gracefully.""" - self.get_logger().info("Shutting down...") - return TransitionCallbackReturn.SUCCESS - - def load_trt_engine(self, engine_path): - """Load and deserialize the TensorRT engine.""" - with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime: - engine_data = f.read() - return runtime.deserialize_cuda_engine(engine_data) - - def allocate_buffers(self): - """Allocate input/output buffers for TensorRT engine.""" - inputs = [] - outputs = [] - bindings = [] - stream = cuda.Stream() - - for binding in self.engine: - binding_shape = self.engine.get_binding_shape(binding) - size = trt.volume(binding_shape) * self.engine.max_batch_size - dtype = trt.nptype(self.engine.get_binding_dtype(binding)) - - # Allocate host and device memory for inputs/outputs - host_mem = cuda.pagelocked_empty(size, dtype) - device_mem = cuda.mem_alloc(host_mem.nbytes) - - bindings.append(int(device_mem)) - if self.engine.binding_is_input(binding): - inputs.append({'host': host_mem, 'device': device_mem}) - else: - outputs.append({'host': host_mem, 'device': device_mem}) - - return inputs, outputs, bindings, stream - - def run_inference(self, input_data): - """Run inference using the loaded TensorRT engine.""" - np.copyto(self.inputs[0]['host'], input_data.ravel()) - - # Transfer input to device - cuda.memcpy_htod_async(self.inputs[0]['device'], self.inputs[0]['host'], self.stream) - - # Run inference - self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) - - # Transfer predictions back from device - cuda.memcpy_dtoh_async(self.outputs[0]['host'], self.outputs[0]['device'], self.stream) - - # Synchronize stream - self.stream.synchronize() - - return self.outputs[0]['host'] - - -def main(args=None): - rclpy.init(args=args) - node = TRTLifecycleNode() - - # Spin the node until shutdown - rclpy.spin(node) - node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() diff --git a/python_wip/numba_ex.py b/python_wip/numba_ex.py deleted file mode 100644 index a539aff..0000000 --- a/python_wip/numba_ex.py +++ /dev/null @@ -1,29 +0,0 @@ -from numba import cuda -import numpy as np - -@cuda.jit -def add_kernel(a, b, c): - idx = cuda.grid(1) - if idx < a.size: - c[idx] = a[idx] + b[idx] - -# Allocate data on the host -N = 1000 -a = np.arange(N, dtype=np.float32) -b = np.arange(N, dtype=np.float32) -c = np.zeros_like(a) - -# Allocate data on the device -a_gpu = cuda.to_device(a) -b_gpu = cuda.to_device(b) -c_gpu = cuda.device_array_like(a) - -# Launch kernel -threads_per_block = 128 -blocks_per_grid = (a.size + (threads_per_block - 1)) // threads_per_block -add_kernel[blocks_per_grid, threads_per_block](a_gpu, b_gpu, c_gpu) - -# Copy result back to host -c_gpu.copy_to_host(c) - -print(c) \ No newline at end of file diff --git a/python_wip/numba_example.py b/python_wip/numba_example.py deleted file mode 100644 index ff8ad74..0000000 --- a/python_wip/numba_example.py +++ /dev/null @@ -1,31 +0,0 @@ -from numba import cuda -import numpy as np - -# Define a CUDA kernel function -@cuda.jit -def matrix_addition_kernel(a, b, result): - i, j = cuda.grid(2) - if i < result.shape[0] and j < result.shape[1]: - result[i, j] = a[i, j] + b[i, j] - -# Initialize NumPy arrays -a = 
np.random.rand(32, 32).astype(np.float32) -b = np.random.rand(32, 32).astype(np.float32) -result = np.zeros_like(a) - -# Allocate arrays on the GPU -a_device = cuda.to_device(a) -b_device = cuda.to_device(b) -result_device = cuda.to_device(result) - -# Define the grid size for the kernel execution -threads_per_block = (16, 16) -blocks_per_grid = (a.shape[0] // threads_per_block[0] + 1, a.shape[1] // threads_per_block[1] + 1) - -# Launch the CUDA kernel -matrix_addition_kernel[blocks_per_grid, threads_per_block](a_device, b_device, result_device) - -# Copy the result back to the host (CPU) -result = result_device.copy_to_host() - -print(result) diff --git a/python_wip/pinned_multithread.py b/python_wip/pinned_multithread.py deleted file mode 100644 index 962ea53..0000000 --- a/python_wip/pinned_multithread.py +++ /dev/null @@ -1,129 +0,0 @@ -import rclpy -from rclpy.node import Node -from sensor_msgs.msg import Image -import cv2 -import numpy as np -import pycuda.driver as cuda -import pycuda.autoinit -from threading import Thread -from trt_inference import TRTInference # Custom TensorRT wrapper - -class BoundingBoxNode(Node): - def __init__(self): - super().__init__('bounding_box_node') - - # Subscribers for left and right camera image topics - self.left_image_subscriber = self.create_subscription(Image, '/camera/left/image_raw', self.left_image_callback, 10) - self.right_image_subscriber = self.create_subscription(Image, '/camera/right/image_raw', self.right_image_callback, 10) - - # Initialize TensorRT inference engine - self.trt_inference = TRTInference('model.engine') # Custom class for TensorRT inference - self.cuda_stream = cuda.Stream() # Use an asynchronous CUDA stream - - # Buffer for left and right images - self.left_image = None - self.right_image = None - - # Allocate pinned memory (page-locked) for faster memory transfers - self.host_input = cuda.pagelocked_empty(shape=(2, self.trt_inference.input_height, self.trt_inference.input_width, 3), dtype=np.float32) # Batch of 2 images - - # Allocate memory for output - self.host_output = cuda.pagelocked_empty(shape=(self.trt_inference.output_size,), dtype=np.float32) - - # Initialize threading - self.inference_thread = None - - def left_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - self.left_image = frame - - # If both left and right images are available, start inference - if self.right_image is not None: - if self.inference_thread is None or not self.inference_thread.is_alive(): - self.inference_thread = Thread(target=self.run_inference) - self.inference_thread.start() - - def right_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - self.right_image = frame - - # If both left and right images are available, start inference - if self.left_image is not None: - if self.inference_thread is None or not self.inference_thread.is_alive(): - self.inference_thread = Thread(target=self.run_inference) - self.inference_thread.start() - - def run_inference(self): - # Batch left and right images - self.preprocess_image(self.left_image, 0) # Preprocess left image into batch slot 0 - self.preprocess_image(self.right_image, 1) # Preprocess right image into batch slot 1 - - # Asynchronously copy input batch to GPU - self.trt_inference.set_input_async(self.host_input, stream=self.cuda_stream) - - # Run asynchronous inference - self.trt_inference.infer_async(stream=self.cuda_stream) - - # Asynchronously copy output from GPU to host - output = self.trt_inference.get_output_async(self.host_output, 
stream=self.cuda_stream) - - # Post-process bounding boxes and other outputs (e.g., NMS) - bboxes = self.post_process(output) - - # Optionally publish bounding boxes or visualize results - self.publish_bounding_boxes(bboxes) - - # Reset left and right images for the next batch - self.left_image = None - self.right_image = None - - def preprocess_image(self, frame, index): - """Perform preprocessing steps (resize, normalize, etc.) and copy to the pinned memory.""" - resized_frame = cv2.resize(frame, (self.trt_inference.input_width, self.trt_inference.input_height)) - normalized_frame = resized_frame.astype(np.float32) / 255.0 # Simple normalization - - # Copy preprocessed image to the pinned memory buffer at the correct batch index - self.host_input[index] = normalized_frame - - def post_process(self, output): - """Post-process the TensorRT output to extract bounding boxes.""" - # Implement bounding box extraction, confidence thresholding, and optional NMS - bboxes = self.extract_bounding_boxes(output) - return bboxes - - def publish_bounding_boxes(self, bboxes): - """Convert bounding boxes to a suitable ROS2 message and publish (optional).""" - # You can add the code to publish the bounding boxes to a ROS2 topic here - pass - - def ros2_image_to_numpy(self, msg): - """Convert ROS2 Image message to a NumPy array (assumed to be in BGR format).""" - img = np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, 3) - return img - -def main(args=None): - rclpy.init(args=args) - node = BoundingBoxNode() - try: - rclpy.spin(node) - except KeyboardInterrupt: - pass - finally: - node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() - -# stream synchronization: -# Run asynchronous inference -self.trt_inference.infer_async(stream=self.cuda_stream) - -# Asynchronously copy output from GPU to host -output = self.trt_inference.get_output_async(self.host_output, stream=self.cuda_stream) - -# Synchronize stream to ensure all GPU operations are complete before proceeding -self.cuda_stream.synchronize() - -# Now it's safe to process the results -bboxes = self.post_process(output) diff --git a/python_wip/pinned_multithread_executor.py b/python_wip/pinned_multithread_executor.py deleted file mode 100644 index 1e15b02..0000000 --- a/python_wip/pinned_multithread_executor.py +++ /dev/null @@ -1,214 +0,0 @@ -import rclpy -from rclpy.callback_groups import ReentrantCallbackGroup -from rclpy.executors import MultiThreadedExecutor -from rclpy.node import Node -from sensor_msgs.msg import Image -import cv2 -import numpy as np -from trt_inference import TRTInference # TensorRT inference class - -class BoundingBoxNode(Node): - def __init__(self): - super().__init__('bounding_box_node') - - # Create reentrant callback groups for left and right image callbacks - self.callback_group_left = ReentrantCallbackGroup() - self.callback_group_right = ReentrantCallbackGroup() - - # Subscribers for left and right camera image topics, each with a different callback group - self.left_image_subscriber = self.create_subscription( - Image, '/camera/left/image_raw', self.left_image_callback, 10, callback_group=self.callback_group_left) - - self.right_image_subscriber = self.create_subscription( - Image, '/camera/right/image_raw', self.right_image_callback, 10, callback_group=self.callback_group_right) - - # Initialize TensorRT inference engine - self.trt_inference = TRTInference('model.engine') - self.cuda_stream = cuda.Stream() - - # Buffer for left and right images - self.left_image = None - 
self.right_image = None - - def left_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - self.left_image = frame - - # If both left and right images are available, start inference - if self.right_image is not None: - self.run_inference() - - def right_image_callback(self, msg): - frame = self.ros2_image_to_numpy(msg) - self.right_image = frame - - # If both left and right images are available, start inference - if self.left_image is not None: - self.run_inference() - - def run_inference(self): - # Batch left and right images and run inference here - ... - - def ros2_image_to_numpy(self, msg): - img = np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, 3) - return img - -def main(args=None): - rclpy.init(args=args) - - node = BoundingBoxNode() - - # Use a MultiThreadedExecutor with 2 threads - executor = MultiThreadedExecutor(num_threads=2) - executor.add_node(node) - - try: - executor.spin() - except KeyboardInterrupt: - pass - finally: - node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() - -# ReentrantCallbackGroup: This callback group allows the same callback to be executed in parallel across multiple threads. Each subscription (left and right image) is assigned a different ReentrantCallbackGroup to ensure they can be processed concurrently. - -# MultiThreadedExecutor: The MultiThreadedExecutor is created with a thread pool size of 2, allowing two callbacks (left and right image) to be processed in parallel. - -# parallelize preprocessing, inference, postprocessing, and publishing operations while ensuring proper GPU context and memory management, we can leverage ROS2's multithreaded executor with callback groups for different stages of the pipeline. - -import rclpy -from rclpy.callback_groups import ReentrantCallbackGroup -from rclpy.executors import MultiThreadedExecutor -from rclpy.node import Node -from sensor_msgs.msg import Image -import cv2 -import numpy as np -import pycuda.driver as cuda - -from trt_inference import TRTInference # TensorRT inference class - -class BoundingBoxNode(Node): - def __init__(self): - super().__init__('bounding_box_node') - - # Initialize callback groups for parallel processing - self.preprocess_group = ReentrantCallbackGroup() - self.inference_group = ReentrantCallbackGroup() - self.postprocess_group = ReentrantCallbackGroup() - self.publish_group = ReentrantCallbackGroup() - - # Subscribers for left and right camera image topics, each in the preprocess group - self.left_image_subscriber = self.create_subscription( - Image, '/camera/left/image_raw', self.left_image_callback, 10, callback_group=self.preprocess_group) - - self.right_image_subscriber = self.create_subscription( - Image, '/camera/right/image_raw', self.right_image_callback, 10, callback_group=self.preprocess_group) - - # Initialize TensorRT inference engine - self.trt_inference = TRTInference('model.engine') - - # CUDA stream for asynchronous operations - self.cuda_stream = cuda.Stream() - - # Buffers for left and right images - self.left_image = None - self.right_image = None - - # Create publishers to publish bounding box results - self.bounding_boxes_publisher = self.create_publisher( - Image, '/bounding_boxes', 10, callback_group=self.publish_group) - - def left_image_callback(self, msg): - # Preprocess left image - self.get_logger().info('Preprocessing left image...') - frame = self.ros2_image_to_numpy(msg) - self.left_image = self.preprocess_image(frame) - - # If both left and right images are ready, run inference - if 
self.right_image is not None: - self.create_task(self.run_inference, self.inference_group) - - def right_image_callback(self, msg): - # Preprocess right image - self.get_logger().info('Preprocessing right image...') - frame = self.ros2_image_to_numpy(msg) - self.right_image = self.preprocess_image(frame) - - # If both left and right images are ready, run inference - if self.left_image is not None: - self.create_task(self.run_inference, self.inference_group) - - def preprocess_image(self, image): - # Example preprocessing: resize, normalize, etc. - # Perform any necessary transformations for the model - return cv2.resize(image, (224, 224)) - - def run_inference(self): - # Ensure both images are preprocessed before inference - if self.left_image is not None and self.right_image is not None: - self.get_logger().info('Running inference...') - - # Batch left and right images together - batched_images = np.stack([self.left_image, self.right_image]) - - # Run asynchronous inference on GPU using the CUDA stream - self.trt_inference.infer_async(batched_images, stream=self.cuda_stream) - - # Asynchronously copy the inference results from GPU to host - self.create_task(self.postprocess_results, self.postprocess_group) - - def postprocess_results(self): - self.get_logger().info('Postprocessing results...') - # Wait for inference and data transfers to complete - self.cuda_stream.synchronize() - - # Retrieve and postprocess the results - output = self.trt_inference.get_output() - bboxes = self.post_process(output) - - # Publish the results - self.create_task(lambda: self.publish_results(bboxes), self.publish_group) - - def post_process(self, output): - # Convert raw output into bounding boxes or detection results - return output # Example post-processing step - - def publish_results(self, bboxes): - self.get_logger().info('Publishing results...') - # Publish bounding boxes or any other output format - # Placeholder for publishing logic; replace with actual ROS message type - msg = Image() # Use the appropriate message type for bounding boxes - self.bounding_boxes_publisher.publish(msg) - - def ros2_image_to_numpy(self, msg): - # Convert ROS2 Image message to a NumPy array - return np.frombuffer(msg.data, dtype=np.uint8).reshape(msg.height, msg.width, 3) - - def create_task(self, target_function, callback_group): - # Helper function to execute tasks within the appropriate callback group - self.get_logger().info('Creating async task...') - self.executor.submit(target_function) - -def main(args=None): - rclpy.init(args=args) - - node = BoundingBoxNode() - - # Use a MultiThreadedExecutor with multiple threads for parallel callbacks - executor = MultiThreadedExecutor(num_threads=4) - executor.add_node(node) - - try: - executor.spin() - except KeyboardInterrupt: - pass - finally: - node.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() diff --git a/python_wip/pointer_to_pt.py b/python_wip/pointer_to_pt.py deleted file mode 100644 index 1217208..0000000 --- a/python_wip/pointer_to_pt.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch -import pycuda.driver as cuda - -# Example CUDA buffer size (you should match this with your actual output size) -output_size = (1, 1000) # Example: TensorRT output of shape [1, 1000] - -# Convert CUDA memory pointer (from IPC) to a PyTorch tensor -def cuda_pointer_to_torch_tensor(cuda_ptr, shape, dtype=torch.float32): - # Convert the raw pointer to PyTorch tensor (in GPU memory) - tensor = torch.from_blob(cuda_ptr, shape, dtype=dtype, device='cuda') - return 
tensor - -# In your post-processing node, after receiving the CUDA IPC handle -ipc_handle = cuda.IPCHandle(ipc_handle_bytes) -d_output = ipc_handle.open(cuda.Context.get_current()) - -# Convert the CUDA device pointer to a PyTorch tensor -output_tensor = cuda_pointer_to_torch_tensor(d_output, output_size) diff --git a/python_wip/post_stream.py b/python_wip/post_stream.py deleted file mode 100644 index 60b7500..0000000 --- a/python_wip/post_stream.py +++ /dev/null @@ -1,51 +0,0 @@ -import rclpy -from rclpy.node import Node -import pycuda.driver as cuda -import pycuda.autoinit -import cv2 -import cv2.cuda as cv2_cuda - -class PostprocessingNode(Node): - def __init__(self): - super().__init__('postprocessing_node') - - # Create CUDA context - self.cuda_driver_context = cuda.Device(0).make_context() - - # Subscribe to inference_done topic to get IPC handles - self.subscription = self.create_subscription( - MemoryHandle, - 'inference_done', - self.postprocess_callback, - 10 - ) - - def postprocess_callback(self, msg): - # Get the IPC handles for tensor and image - tensor_ipc_handle_str = msg.tensor_ipc_handle - image_ipc_handle_str = msg.image_ipc_handle - - # Open IPC memory handles for tensor and image - tensor_ipc_handle = cuda.IPCMemoryHandle(tensor_ipc_handle_str) - image_ipc_handle = cuda.IPCMemoryHandle(image_ipc_handle_str) - - d_output = cuda.ipc_open_mem_handle(tensor_ipc_handle, self.h_output.nbytes) - d_image = cuda.ipc_open_mem_handle(image_ipc_handle, self.cv_image.nbytes) - - # Wrap the image GPU pointer into a GpuMat object for OpenCV CUDA operations - cv_cuda_image = cv2_cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3) - cv_cuda_image.upload(d_image) - - # Perform OpenCV CUDA operations on the image (e.g., GaussianBlur) - blurred_image = cv2_cuda_image.gaussianBlur((5, 5), 0) - - # Retrieve inference result and postprocess - cuda.memcpy_dtoh(self.h_output, d_output) - self.stream.synchronize() - - output = np.copy(self.h_output) - self.get_logger().info(f"Postprocessed tensor: {output}") - - # Clean up IPC memory handles - cuda.ipc_close_mem_handle(d_output) - cuda.ipc_close_mem_handle(d_image) diff --git a/python_wip/rand_post.py b/python_wip/postprocess_comparison.py similarity index 67% rename from python_wip/rand_post.py rename to python_wip/postprocess_comparison.py index 1dfe240..76c9b17 100644 --- a/python_wip/rand_post.py +++ b/python_wip/postprocess_comparison.py @@ -1,3 +1,62 @@ +import cv2 +import os + +def draw_bounding_boxes(image_path, bboxes): + # Read the image using OpenCV + img = cv2.imread(image_path) + + if img is None: + print(f"Error loading image: {image_path}") + return + + # Get the dimensions of the image + height, width, _ = img.shape + print(height) + print(width) + + # Draw each bounding box on the image + for bbox in bboxes: + class_id, x_center, y_center, bbox_width, bbox_height = bbox + + # Convert normalized values to absolute pixel values + x_center_pixel = int(x_center * width) + y_center_pixel = int(y_center * height) + bbox_width_pixel = int(bbox_width * width) + bbox_height_pixel = int(bbox_height * height) + + # Calculate the top-left and bottom-right corners of the bounding box + top_left_x = int(x_center_pixel - bbox_width_pixel / 2) + top_left_y = int(y_center_pixel - bbox_height_pixel / 2) + bottom_right_x = int(x_center_pixel + bbox_width_pixel / 2) + bottom_right_y = int(y_center_pixel + bbox_height_pixel / 2) + + # Draw the bounding box (using green color and thickness of 2) + cv2.rectangle(img, (top_left_x, 
top_left_y), (bottom_right_x, bottom_right_y), (0, 255, 0), 2)
+
+    # Show the image with bounding boxes (press any key to close)
+    cv2.imshow('Bounding Boxes', img)
+    cv2.waitKey(10000)
+    cv2.destroyAllWindows()
+
+def read_bounding_boxes(txt_file):
+    bboxes = []
+    with open(txt_file, 'r') as file:
+        for line in file.readlines():
+            values = line.strip().split()
+            class_id = int(values[0])
+            x_center = float(values[1])
+            y_center = float(values[2])
+            bbox_width = float(values[3])
+            bbox_height = float(values[4])
+            bboxes.append((class_id, x_center, y_center, bbox_width, bbox_height))
+    return bboxes
+
+os.chdir("C:/Users/ishaa/Coding Projects/Applied-AI/ROS/assets/maize")
+print(os.getcwd())
+boxes = read_bounding_boxes("IMG_2884_18.txt")
+print(boxes)
+draw_bounding_boxes("IMG_2884_18.JPG", boxes)
+
 def box_iou_batch(self, boxes_a, boxes_b):
     # Helper function to calculate the area of the boxes
     def box_area(box):
diff --git a/python_wip/preprocess_comparison.py b/python_wip/preprocess_comparison.py
new file mode 100644
index 0000000..30a0fc5
--- /dev/null
+++ b/python_wip/preprocess_comparison.py
@@ -0,0 +1,97 @@
+import time
+import torch
+import torchvision.transforms as T
+import cv2
+import numpy as np
+import cupy as cp
+
+# Load image (this will be used for all methods)
+image_path = 'image.png'
+image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
+
+# ROI coordinates for cropping (just placeholders, you need to adjust)
+roi_x, roi_y, roi_w, roi_h = 50, 50, 200, 200
+
+### CuPy method
+def preprocess_with_cupy():
+    # Upload image to GPU with CuPy
+    gpu_image = cp.asarray(image)
+
+    # Remove alpha channel (RGBA -> RGB)
+    gpu_image = gpu_image[:, :, :3]
+
+    # Crop the image based on ROI
+    gpu_image = gpu_image[roi_y:roi_y + roi_h, roi_x:roi_x + roi_w, :]
+
+    # Resize to (224, 224); note this round-trips through the CPU for cv2.resize
+    gpu_image = cp.array(cv2.resize(cp.asnumpy(gpu_image), (224, 224)))
+
+    # Normalize pixel values to [0, 1] and transpose
+    gpu_image = gpu_image.astype(cp.float32) / 255.0
+    gpu_image = cp.transpose(gpu_image, (2, 0, 1))  # Transpose to (C, H, W)
+
+    return gpu_image
+
+### PyTorch method
+def preprocess_with_pytorch():
+    # Move the image to the GPU and do crop/normalize on tensors directly;
+    # Resize/ToTensor composed over a raw HWC CUDA tensor would not work
+    image_tensor = torch.from_numpy(image).cuda()
+
+    image_tensor = image_tensor[:, :, :3]  # Remove alpha channel
+    image_tensor = image_tensor[roi_y:roi_y + roi_h, roi_x:roi_x + roi_w, :]  # Crop
+    image_tensor = image_tensor.permute(2, 0, 1).float() / 255.0  # HWC -> CHW, [0, 1]
+    image_tensor = T.Resize((224, 224))(image_tensor)
+
+    return image_tensor
+
+### OpenCV with CUDA method
+def preprocess_with_opencv_cuda():
+    # Remove the alpha channel on the CPU, then upload once
+    gpu_image = cv2.cuda_GpuMat()
+    gpu_image.upload(image[:, :, :3])
+
+    # Crop via a GpuMat ROI view (GpuMat does not support numpy-style
+    # slicing; the (mat, rect) overload depends on your cv2 build)
+    gpu_roi = cv2.cuda_GpuMat(gpu_image, (roi_x, roi_y, roi_w, roi_h))
+
+    # Resize the image to (224, 224) on the GPU
+    resized = cv2.cuda.resize(gpu_roi, (224, 224))
+
+    # Download back to CPU for normalization
+    cpu_image = resized.download()
+
+    # Normalize and transpose
+    cpu_image = cpu_image.astype(np.float32) / 255.0
+    cpu_image = np.transpose(cpu_image, (2, 0, 1))  # Transpose to (C, H, W)
+
+    return cpu_image
+
+# Measure execution time for CuPy
+start_time = time.time()
+gpu_image_cupy = preprocess_with_cupy()
+cupy_time = time.time() - start_time
+
+# Measure execution time for PyTorch
+start_time = time.time()
+gpu_image_pytorch = preprocess_with_pytorch()
+pytorch_time = time.time() - start_time
+
+# Measure execution time for OpenCV with CUDA
+start_time = time.time()
+gpu_image_opencv_cuda = preprocess_with_opencv_cuda()
+opencv_cuda_time = time.time() - start_time
+
+# Output results
+print(f"CuPy Preprocessing Time: {cupy_time:.4f} seconds")
+print(f"PyTorch Preprocessing Time: {pytorch_time:.4f} seconds")
+print(f"OpenCV with CUDA Preprocessing Time: {opencv_cuda_time:.4f} seconds")
\ No newline at end of file
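Aside: the harness above times a single cold call of each method, so it mostly measures allocation and first-touch overhead, and GPU launches are asynchronous besides. A hedged helper for a fairer comparison (warm-up, repeats, explicit sync per backend):

```python
import time
import torch
import cupy as cp

def timed(fn, sync, repeats=20):
    fn()                              # warm-up: allocations, JIT, autotuning
    sync()
    tic = time.perf_counter()
    for _ in range(repeats):
        fn()
    sync()                            # flush pending GPU work before stopping
    return (time.perf_counter() - tic) / repeats

# e.g.:
# timed(preprocess_with_pytorch, torch.cuda.synchronize)
# timed(preprocess_with_cupy, cp.cuda.Stream.null.synchronize)
```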
= time.time() +gpu_image_pytorch = preprocess_with_pytorch() +pytorch_time = time.time() - start_time + +# Measure execution time for OpenCV with CUDA +start_time = time.time() +gpu_image_opencv_cuda = preprocess_with_opencv_cuda() +opencv_cuda_time = time.time() - start_time + +# Output results +print(f"CuPy Preprocessing Time: {cupy_time:.4f} seconds") +print(f"PyTorch Preprocessing Time: {pytorch_time:.4f} seconds") +print(f"OpenCV with CUDA Preprocessing Time: {opencv_cuda_time:.4f} seconds") diff --git a/python_wip/pytorch_ex.py b/python_wip/pytorch_ex.py deleted file mode 100644 index 4601ec7..0000000 --- a/python_wip/pytorch_ex.py +++ /dev/null @@ -1,35 +0,0 @@ -import torch -import torchvision.transforms as T - -def preprocess_image_pytorch(self, image): - tic = time.perf_counter_ns() - - roi_x, roi_y, roi_w, roi_h = self.roi_dimensions - shifted_x = roi_x + abs(self.velocity[0]) * self.shift_constant - - # Convert image to PyTorch tensor and move to GPU - image_tensor = torch.from_numpy(image).cuda() - - # Define preprocessing transformations - transform = T.Compose([ - T.Lambda(lambda img: img[roi_y:(roi_y+roi_h), shifted_x:(shifted_x+roi_w), :3]), # Crop and remove alpha - T.Resize(self.dimensions), # Resize to model input size - T.ToTensor(), # Convert to Tensor - T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # Normalize - ]) - - # Apply transformations (automatically handles CHW format for TensorRT) - input_data = transform(image_tensor).unsqueeze(0).float().cuda() - - d_input_ptr = input_data.data_ptr() # Get device pointer of the tensor - - # Publish the IPC handle or pointer - ipc_handle = cuda.mem_get_ipc_handle(d_input_ptr) - - toc = time.perf_counter_ns() - self.get_logger().info(f"Preprocessing: {(toc-tic)/1e6} ms") - - # Publish the IPC handle - ipc_handle_msg = String() - ipc_handle_msg.data = str(ipc_handle.handle) - self.pointer_publisher.publish(ipc_handle_msg) diff --git a/python_wip/rand_infer.py b/python_wip/rand_infer.py deleted file mode 100644 index e696705..0000000 --- a/python_wip/rand_infer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Load TensorRT engine and create execution context (example) -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) - -def load_engine(engine_file_path): - with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: - return runtime.deserialize_cuda_engine(f.read()) - -engine = load_engine("model.trt") -context = engine.create_execution_context() - -if self.engine_path.endswith('.trt') or self.engine_path.endswith('.engine'): - if self.strip_weights: - self.engine = self.load_stripped_engine_and_refit() - self.type = "trt_stripped" - else: - self.engine = self.load_normal_engine() - self.type = "trt_normal" -elif self.engine_path.endswith('.pth'): - from torch2trt import TRTModule - import torch - self.engine = TRTModule() - (self.engine).load_state_dict(torch.load(self.engine_path)) - self.type = "torch_trt" -else: - self.get_logger().error("Invalid engine file format. 
Please provide a .trt, .engine, or .pth file") - return None - -if self.type == "trt_stripped" or self.type == "trt_normal": - self.allocate_buffers() - self.exec_context = (self.engine).create_execution_context() -else: - self.inference_type = "torch_trt" - -def load_stripped_engine_and_refit(self): - if not os.path.exists(self.engine_path): - self.get_logger().error(f"Engine file not found at {self.engine_path}") - return None - - if not os.path.exists(self.model_path): - self.get_logger().error(f"Model file not found at {self.model_path}") - return None - - TRT_LOGGER = trt.Logger(trt.Logger.WARNING) - with open(self.engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime: - engine = runtime.deserialize_cuda_engine(f.read()) - refitter = trt.Refitter(engine, TRT_LOGGER) - parser_refitter = trt.OnnxParserRefitter(refitter, TRT_LOGGER) - assert parser_refitter.refit_from_file(self.model_path) - assert refitter.refit_cuda_engine() - return engine - -# fixed allocation: does not account for multiple bindings/batch sizes (single input -> output tensor) -def allocate_buffers(self): - engine = self.engine - # Create a CUDA stream for async execution - self.stream = cuda.Stream() - - self.input_shape = engine.get_binding_shape(0) - self.output_shape = engine.get_binding_shape(1) - - # Allocate device memory for input/output - self.d_input = cuda.mem_alloc(trt.volume(self.input_shape) * np.dtype(np.float32).itemsize) - self.d_output = cuda.mem_alloc(trt.volume(self.output_shape) * np.dtype(np.float32).itemsize) - - # Allocate host pinned memory for input/output (pinned memory for input/output buffers) - self.h_input = cuda.pagelocked_empty(trt.volume(self.input_shape), dtype=np.float32) - self.h_output = cuda.pagelocked_empty(trt.volume(self.output_shape), dtype=np.float32) - - # Example image (allocate on GPU) - self.cv_image = np.random.rand(480, 640, 3).astype(np.uint8) - self.cv_cuda_image = cv2_cuda_GpuMat(self.cv_image.shape[0], self.cv_image.shape[1], cv2.CV_8UC3) - - # Upload image to GPU (device memory) - self.cv_cuda_image.upload(self.cv_image) - - # # Create CUDA IPC handle for output tensor and image - # self.output_ipc_handle = cuda.mem_get_ipc_handle(self.d_output) - # self.image_ipc_handle = cuda.mem_get_ipc_handle(self.cv_cuda_image.cudaPtr()) \ No newline at end of file diff --git a/python_wip/stream_cv2.py b/python_wip/stream_cv2.py deleted file mode 100644 index c6ce0e6..0000000 --- a/python_wip/stream_cv2.py +++ /dev/null @@ -1,47 +0,0 @@ -import rclpy -from rclpy.node import Node -import pycuda.driver as cuda -import pycuda.autoinit -import cv2 -import cv2.cuda as cv2_cuda - -class PostprocessingNode(Node): - def __init__(self): - super().__init__('postprocessing_node') - self.subscription = self.create_subscription( - MemoryHandle, - 'inference_done', - self.postprocess_callback, - 10 - ) - - # Ensure same CUDA context or initialize a new one if needed - self.cuda_driver_context = cuda.Device(0).make_context() - - def postprocess_callback(self, msg): - # Get the IPC handle and shared image address - ipc_handle_str = msg.ipc_handle - ipc_handle = cuda.IPCMemoryHandle(ipc_handle_str) - shared_image_address = msg.shared_image_address - - # Map the shared output tensor via CUDA IPC - d_output = cuda.ipc_open_mem_handle(ipc_handle, pycuda.driver.mem_alloc(self.h_output.nbytes)) - - # Access shared image directly from unified memory (no need to download) - cv_cuda_image = cv2_cuda_GpuMat(480, 640, cv2.CV_8UC3) - cv_cuda_image.upload(shared_image_address) - - # Example OpenCV 
CUDA operation: GaussianBlur - blurred_image = cv2_cuda_image.gaussianBlur((5, 5), 0) - - # Postprocess the inference output and the blurred image - cuda.memcpy_dtoh(self.h_output, d_output) - self.stream.synchronize() - - output = np.copy(self.h_output) - self.get_logger().info(f"Postprocessed tensor: {output}") - - # Clean up the IPC memory - cuda.ipc_close_mem_handle(d_output) - -# this uses unified memory... \ No newline at end of file diff --git a/python_wip/stream_manager.py b/python_wip/stream_manager.py deleted file mode 100644 index b3ad76e..0000000 --- a/python_wip/stream_manager.py +++ /dev/null @@ -1,27 +0,0 @@ -import pycuda.driver as cuda - -class CudaStreamManager: - def __init__(self): - # Create separate streams for different stages - self.preprocess_stream = cuda.Stream() - self.inference_stream = cuda.Stream() - self.postprocess_stream = cuda.Stream() - - # Create CUDA events for synchronization - self.preprocess_done = cuda.Event() - self.inference_done = cuda.Event() - - def get_preprocess_stream(self): - return self.preprocess_stream - - def get_inference_stream(self): - return self.inference_stream - - def get_postprocess_stream(self): - return self.postprocess_stream - - def get_preprocess_event(self): - return self.preprocess_done - - def get_inference_event(self): - return self.inference_done \ No newline at end of file diff --git a/python_wip/trt_modify.py b/python_wip/trt_modify.py deleted file mode 100644 index 8b88f6e..0000000 --- a/python_wip/trt_modify.py +++ /dev/null @@ -1,26 +0,0 @@ -import tensorrt as trt - -# Create a builder and network definition for TensorRT -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) -builder = trt.Builder(TRT_LOGGER) -network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) - -# Add an input layer with data type trt.float32 -input_tensor = network.add_input(name="input", dtype=trt.float32, shape=(1, 3, 224, 224)) - -# Define a layer (for example, a convolution layer) with trt.float32 precision -conv_layer = network.add_convolution(input=input_tensor, num_output_maps=32, kernel_shape=(3, 3), kernel=np.random.rand(32, 3, 3, 3).astype(np.float32)) -conv_layer.precision = trt.float32 - -# Mark output and build engine -network.mark_output(conv_layer.get_output(0)) - -# Set builder configurations, including the data precision -config = builder.create_builder_config() -config.set_flag(trt.BuilderFlag.FP16) # Enable FP16 if you want to support mixed precision -config.max_workspace_size = 1 << 30 # Set max workspace size - -# Build engine with specified precisions (trt.float32 by default if FP16 is not enabled) -engine = builder.build_engine(network, config) - -# Now the engine will use trt.float32 during inference diff --git a/workspace_python/ros2_ws/src/python_workspace/README.md b/workspace_python/ros2_ws/src/python_workspace/README.md index 9527dcf..47174b5 100644 --- a/workspace_python/ros2_ws/src/python_workspace/README.md +++ b/workspace_python/ros2_ws/src/python_workspace/README.md @@ -1,2 +1,2 @@ Running the camera node: -ros2 run python_workspace camera_node --ros-args -p source_type:=static_image -p static_image_path:='/home/user/Desktop/ROS/Models/Maize Model/sample_maize_images' -p loop:=-1 -p frame_rate:=10 -p model_dimensions:=[640,640] \ No newline at end of file +ros2 run python_workspace camera_node --ros-args -p static_image_path:='/home/user/Desktop/ROS/Models/Maize Model/sample_maize_images' -p loop:=-1 -p frame_rate:=10 -p model_dimensions:=[640,640] \ No newline at end of file 
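Note on the README change above: after this diff, the camera_node executable is the ZED camera node, which no longer declares static_image_path or loop; static images are handled by the new picture node. A hypothetical equivalent of the documented command, assuming the picture_node entry point is registered in setup.py (the setup.py hunk is truncated at the end of this patch, so the exact executable name is an assumption):

ros2 run python_workspace picture_node --ros-args -p static_image_path:='/home/user/Desktop/ROS/Models/Maize Model/sample_maize_images' -p loop:=-1 -p frame_rate:=10 -p model_dimensions:=[640,640]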
diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/composable.py b/workspace_python/ros2_ws/src/python_workspace/launch/composable.py deleted file mode 100644 index ac54874..0000000 --- a/workspace_python/ros2_ws/src/python_workspace/launch/composable.py +++ /dev/null @@ -1,218 +0,0 @@ -## for composable: -from launch import LaunchDescription -from launch_ros.actions import LifecycleNode -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -def generate_launch_description(): - container = ComposableNodeContainer( - name='my_container', - namespace='', - package='rclcpp_components', - executable='component_container_mt', - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='mypackage.ComposablePublisherNode', - name='composable_publisher_node' - ), - ], - output='screen', - ) - - lifecycle_manager = LifecycleNode( - package='my_package', - executable='lifecycle_node', - name='lifecycle_manager_node', - namespace='', - output='screen', - ) - - return LaunchDescription([ - container, - lifecycle_manager - ]) - -# new basic: -from launch import LaunchDescription -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='my_container', - namespace='', - package='rclcpp_components', - executable='component_container', - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='MyNode', - name='my_node' - ) - ], - output='screen', - ), - ]) - -# This launch file will load MyNode into the container at runtime. - -## basic pt 2 -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='my_composable_container', - namespace='', - package='rclcpp_components', - executable='component_container_mt', # Multi-threaded container - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_package.IndoorNavNode', - name='indoor_nav' - ), - ComposableNode( - package='my_package', - plugin='my_package.LiDARNode', - name='lidar_node' - ), - ], - output='screen', - ), - ]) - -## cuda composable: -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode -from launch import LaunchDescription - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='gpu_composable_container', - namespace='', - package='rclpy_components', - executable='component_container_mt', # Multi-threaded container for composable nodes - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_package.image_preprocessing_node', - name='image_preprocessing_node' - ), - ComposableNode( - package='my_package', - plugin='my_package.inference_node', - name='inference_node', - parameters=[ - {"preprocess_node": "image_preprocessing_node"} - ] - ), - ], - output='screen', - ), - ]) - - - -from launch import LaunchDescription -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='my_python_container', - namespace='', - package='rclpy_components', - executable='component_container_mt', # Multi-threaded container - 
composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_python_node.MyPythonNode', - name='my_python_node' - ) - ], - output='screen', - ), - ]) - -### ll - - -from launch import LaunchDescription -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='camera_container', - namespace='', - package='rclcpp_components', - executable='component_container', - composable_node_descriptions=[ - ComposableNode( - package='your_package_name', - plugin='your_package_name.CameraNode', - name='camera_node', - parameters=[{ - 'source_type': 'zed', - 'static_image_path': '.../assets/', - 'video_path': '.../assets/video.mp4', - 'loop': 0, - 'frame_rate': 30, - 'model_type': 'maize', - }], - ), - ], - output='screen', - ), - ]) - -# cuda concurrent -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode -from launch import LaunchDescription -import threading - -sync_event = threading.Event() - -def generate_launch_description(): - return LaunchDescription([ - ComposableNodeContainer( - name='gpu_composable_container', - namespace='', - package='rclpy_components', - executable='component_container_mt', # Multi-threaded container for composable nodes - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_package.ImagePreprocessingNode', - name='image_preprocessing_node', - parameters=[ - {"sync_event": sync_event} - ] - ), - ComposableNode( - package='my_package', - plugin='my_package.InferenceNode', - name='inference_node', - parameters=[ - {"preprocess_node": "image_preprocessing_node", "sync_event": sync_event} - ] - ), - ], - output='screen', - ), - ]) - -# CUDA Streams for Asynchronous Execution: Both the preprocessing and inference nodes now use CUDA streams to allow for non-blocking GPU operations. - -# Image Preprocessing Node: Executes memcpy_htod_async to copy the image to GPU memory without blocking the CPU. The preprocessing stream is synchronized with stream.synchronize() to ensure the operation completes. -# Inference Node: Executes inference in its own stream using execute_async_v2. The stream is synchronized with stream.synchronize() to ensure that inference completes before retrieving the results. - -# Concurrency: By using a multi-threaded composable node container (component_container_mt), both nodes can process different images in parallel. This means the preprocessing node can start working on the next image while the inference node is still running on the previous one. 
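The notes above describe the intended overlap between the preprocessing and inference stages. A minimal sketch of that stream/event handoff, assuming PyCUDA; the buffer names and the commented-out inference call are illustrative, not from this repo:

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

preprocess_stream = cuda.Stream()
inference_stream = cuda.Stream()
preprocess_done = cuda.Event()

h_image = cuda.pagelocked_empty((3, 224, 224), dtype=np.float32)  # pinned host buffer
d_image = cuda.mem_alloc(h_image.nbytes)

# Stage 1 (preprocessing): async host-to-device copy, then record the event
cuda.memcpy_htod_async(d_image, h_image, preprocess_stream)
preprocess_done.record(preprocess_stream)

# Stage 2 (inference): the inference stream waits on the event instead of blocking the CPU,
# so the preprocessing stream can already start on the next image
inference_stream.wait_for_event(preprocess_done)
# exec_context.execute_async_v2(bindings=[int(d_image), ...], stream_handle=inference_stream.handle)
inference_stream.synchronize()

The same pattern extends to a postprocessing stream by recording a second event after the inference call.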
\ No newline at end of file
diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/debugging.py b/workspace_python/ros2_ws/src/python_workspace/launch/debugging.py
index e69de29..ca78390 100644
--- a/workspace_python/ros2_ws/src/python_workspace/launch/debugging.py
+++ b/workspace_python/ros2_ws/src/python_workspace/launch/debugging.py
@@ -0,0 +1,127 @@
+from launch_ros.actions import PushRosNamespace
+from launch import LaunchDescription
+from launch_ros.actions import Node
+from launch.actions import DeclareLaunchArgument
+from launch.substitutions import LaunchConfiguration
+
+def generate_launch_description():
+    return LaunchDescription([
+        # Declare launch arguments
+        DeclareLaunchArgument(
+            'engine_path', default_value='/home/user/Downloads/model.engine',
+            description='Path to the TensorRT engine file'),
+
+        DeclareLaunchArgument(
+            'strip_weights', default_value='False',
+            description='Whether to strip weights from the model'),
+
+        DeclareLaunchArgument(
+            'precision', default_value='fp32',
+            description='Inference precision (fp32, fp16)'),
+
+        DeclareLaunchArgument(
+            'use_display_node', default_value='True',
+            description='Whether to use the display node in ExterminationNode'),
+
+        DeclareLaunchArgument(
+            'lower_range', default_value='[78, 158, 124]',
+            description='Lower HSV range for color segmentation'),
+
+        DeclareLaunchArgument(
+            'upper_range', default_value='[60, 255, 255]',
+            description='Upper HSV range for color segmentation'),
+
+        DeclareLaunchArgument(
+            'min_area', default_value='100',
+            description='Minimum area for object detection'),
+
+        DeclareLaunchArgument(
+            'min_confidence', default_value='0.5',
+            description='Minimum confidence for object detection'),
+
+        DeclareLaunchArgument(
+            'roi_list', default_value='[0, 0, 100, 100]',
+            description='Region of interest for detection'),
+
+        DeclareLaunchArgument(
+            'publish_rate', default_value='10',
+            description='Publishing rate for the extermination node'),
+
+        # 'side' is referenced by ExterminationNode below, so it must be declared here
+        DeclareLaunchArgument(
+            'side', default_value='left',
+            description='Side of the camera (left or right)'),
+
+        DeclareLaunchArgument(
+            'static_image_path', default_value='/home/usr/Desktop/ROS/assets/IMG_1822_14.JPG',
+            description='The path to the static image for CameraNode'),
+
+        DeclareLaunchArgument(
+            'loop', default_value='-1',
+            description='0 = do not loop, >0 = # of loops, -1 = loop forever'),
+
+        DeclareLaunchArgument(
+            'frame_rate', default_value='30',
+            description='Frame rate for CameraNode publishing'),
+
+        DeclareLaunchArgument(
+            'model_dimensions', default_value='[640, 640]',
+            description='Model dimensions for CameraNode preprocessing'),
+
+        DeclareLaunchArgument(
+            'shift_constant', default_value='1',
+            description='Shift constant for camera adjustments'),
+
+        DeclareLaunchArgument(
+            'roi_dimensions', default_value='[0, 0, 100, 100]',
+            description='Region of interest dimensions for CameraNode'),
+
+        PushRosNamespace('camera_jetson_namespace'),  # Optional: Organize nodes under a namespace
+
+        # Launch JetsonNode
+        Node(
+            package='python_workspace',
+            executable='jetson_node',
+            name='jetson_node',
+            output='screen',
+            parameters=[
+                {'engine_path': LaunchConfiguration('engine_path')},
+                {'strip_weights': LaunchConfiguration('strip_weights')},
+                {'precision': LaunchConfiguration('precision')}
+            ]
+        ),
+
+        # Launch ExterminationNode
+        Node(
+            package='python_workspace',
+            executable='extermination_node',
+            name='extermination_node',
+            output='screen',
+            parameters=[
+                {'use_display_node':
LaunchConfiguration('use_display_node')}, + {'lower_range': LaunchConfiguration('lower_range')}, + {'upper_range': LaunchConfiguration('upper_range')}, + {'min_area': LaunchConfiguration('min_area')}, + {'min_confidence': LaunchConfiguration('min_confidence')}, + {'roi_list': LaunchConfiguration('roi_list')}, + {'publish_rate': LaunchConfiguration('publish_rate')}, + {'side': LaunchConfiguration('side')} + ] + ), + + # Launch CameraNode + Node( + package='python_workspace', + executable='picture_node', + name='picture_node', + output='screen', + parameters=[ + {'static_image_path': LaunchConfiguration('static_image_path')}, + {'loop': LaunchConfiguration('loop')}, + {'frame_rate': LaunchConfiguration('frame_rate')}, + {'model_dimensions': LaunchConfiguration('model_dimensions')}, + {'shift_constant': LaunchConfiguration('shift_constant')}, + {'roi_dimensions': LaunchConfiguration('roi_dimensions')}, + {'precision': LaunchConfiguration('precision')} + ] + ), + ]) diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/launch.py b/workspace_python/ros2_ws/src/python_workspace/launch/launch.py index a9b9b13..c56cf60 100644 --- a/workspace_python/ros2_ws/src/python_workspace/launch/launch.py +++ b/workspace_python/ros2_ws/src/python_workspace/launch/launch.py @@ -1,118 +1,101 @@ -# document launch parameters - -import os +from launch_ros.actions import PushRosNamespace from launch import LaunchDescription from launch_ros.actions import Node -from launch.substitutions import LaunchConfiguration from launch.actions import DeclareLaunchArgument -from launch.conditions import IfCondition +from launch.substitutions import LaunchConfiguration -# should be two composition containers for left/right def generate_launch_description(): return LaunchDescription([ + # Declare launch arguments DeclareLaunchArgument( - 'model_path', - default_value='/default/path/to/model.trt', - description='Path to the TensorRT engine file' - ), + 'engine_path', default_value='/home/user/Downloads/model.engine', + description='Path to the TensorRT engine file'), + DeclareLaunchArgument( - 'source_type', - default_value='static_image', - description='Type of source: static_image, video, zed' - ), + 'strip_weights', default_value='False', + description='Whether to strip weights from the model'), + DeclareLaunchArgument( - 'static_image_path', - default_value='/path/to/static/image.jpg', - description='Path to the static image file' - ), + 'precision', default_value='fp32', + description='Inference precision (fp32, fp16)'), + DeclareLaunchArgument( - 'video_path', - default_value='/path/to/video.mp4', - description='Path to the video file' - ), + 'use_display_node', default_value='True', + description='Whether to use the display node in ExterminationNode'), + DeclareLaunchArgument( - 'loop', - default_value='-1', - description='Number of times to loop the video (-1 for infinite, 0 for 1 loop, >0 for # of loops )' - ), + 'lower_range', default_value='[78, 158, 124]', + description='Lower HSV range for color segmentation'), + DeclareLaunchArgument( - 'frame_rate', - default_value='30', - description='Desired frame rate for publishing' - ), + 'upper_range', default_value='[60, 255, 255]', + description='Upper HSV range for color segmentation'), + DeclareLaunchArgument( - 'model_type', - default_value='maize', - description='The model architecture being used' - ), + 'min_area', default_value='100', + description='Minimum area for object detection'), + DeclareLaunchArgument( - 'use_display_node', - default_value='True', 
- description='Whether to launch cv2 display screens' - ), + 'min_confidence', default_value='0.5', + description='Minimum confidence for object detection'), + DeclareLaunchArgument( - 'lower_range', - default_value='[78, 158, 124]', - description='Lower HSV range for color filtering' - ), + 'roi_list', default_value='[0, 0, 100, 100]', + description='Region of interest for detection'), + DeclareLaunchArgument( - 'upper_range', - default_value='[60, 255, 255]', - description='The model architecture being used' - ), + 'publish_rate', default_value='10', + description='Publishing rate for the extermination node'), + DeclareLaunchArgument( - 'min_area', - default_value='100', - description='The minimum area for a contour to be considered' - ), + 'side', default_value='left', + description='Side of the camera (left or right)'), + DeclareLaunchArgument( - 'min_confidence', - default_value='0.5', - description='The minimum confidence for a detection to be considered' - ), + 'frame_rate', default_value='30', + description='Frame rate for CameraNode publishing'), + DeclareLaunchArgument( - 'roi_list', - default_value='[0,0,100,100]', - description='The region of interest for the camera' - ), + 'model_dimensions', default_value='[640, 640]', + description='Model dimensions for CameraNode preprocessing'), + DeclareLaunchArgument( - 'publish_rate', - default_value='10', - description='The rate at which to publish the bounding boxes' - ), - # example to toggle nodes on/off - # DeclareLaunchArgument( - # 'use_display_node', - # default_value='true', - # description='Whether to launch the display node' - # ), - Node( - package='python_workspace', - executable='camera_node', # not sure if should match setup.py - name='camera_node', - output='screen', # idk what this does - parameters=[ - {'source_type': LaunchConfiguration('source_type')}, - {'static_image_path': LaunchConfiguration('static_image_path')}, - {'video_path': LaunchConfiguration('video_path')}, - {'loop': LaunchConfiguration('loop')}, - {'frame_rate': LaunchConfiguration('frame_rate')}, - {'model_type': LaunchConfiguration('model_type')}, - ] - ), + 'camera_side', default_value='left', + description='Camera side (left or right)'), + + DeclareLaunchArgument( + 'shift_constant', default_value='1', + description='Shift constant for camera adjustments'), + + DeclareLaunchArgument( + 'roi_dimensions', default_value='[0, 0, 100, 100]', + description='Region of interest dimensions for CameraNode'), + + DeclareLaunchArgument( + 'precision', default_value='fp32', + description='Precision for image processing (fp32 or fp16)'), + + PushRosNamespace('camera_jetson_namespace'), # Optional: Organize nodes under a namespace + + # Launch JetsonNode Node( package='python_workspace', executable='jetson_node', name='jetson_node', output='screen', - parameters=[{'model_path': LaunchConfiguration('model_path')}], + parameters=[ + {'engine_path': LaunchConfiguration('engine_path')}, + {'strip_weights': LaunchConfiguration('strip_weights')}, + {'precision': LaunchConfiguration('precision')} + ] ), + + # Launch ExterminationNode Node( package='python_workspace', - executable='extermination', + executable='extermination_node', name='extermination_node', output='screen', - # condition=IfCondition(LaunchConfiguration('use_display_node')) parameters=[ {'use_display_node': LaunchConfiguration('use_display_node')}, {'lower_range': LaunchConfiguration('lower_range')}, @@ -121,81 +104,23 @@ def generate_launch_description(): {'min_confidence': 
LaunchConfiguration('min_confidence')}, {'roi_list': LaunchConfiguration('roi_list')}, {'publish_rate': LaunchConfiguration('publish_rate')}, + {'side': LaunchConfiguration('side')} ] ), - ]) - -import launch -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode - -# from launch import LaunchDescription -# from launch_ros.descriptions import ComposableNodeContainer, ComposableNode -def generate_launch_description(): - container = ComposableNodeContainer( - name='my_container', - namespace='', - package='rclcpp_components', - executable='component_container_mt', # Multi-threaded container - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='my_composable_node.MyComposableNode', - name='my_composable_node', - parameters=[{'use_display_node': True},] - ), - ], - output='screen', - argumens=['--ros-args', '--log-level', 'info'] - ) - - return launch.LaunchDescription([container]) - -import os -from launch import LaunchDescription -from launch_ros.actions import PushRosNamespace -from launch_ros.actions import Node -from launch.substitutions import LaunchConfiguration - -def generate_launch_description(): - return LaunchDescription([ - PushRosNamespace('camera_jetson_namespace'), # Optional: Organize nodes under a namespace - + # Launch CameraNode Node( - package='your_camera_package', # Replace with the actual package name for CameraNode - namespace='camera', + package='python_workspace', executable='camera_node', name='camera_node', output='screen', parameters=[ - { - 'source_type': 'zed', # Example parameter, change as necessary - 'static_image_path': '/home/usr/Desktop/ROS/assets/IMG_1822_14.JPG', - 'video_path': '/home/usr/Desktop/ROS/assets/video.mp4', - 'loop': 0, # Example value, adjust accordingly - 'frame_rate': 30, - 'model_dimensions': (448, 1024), - 'camera_side': 'left', - 'shift_constant': 1, - 'roi_dimensions': [0, 0, 100, 100], - 'precision': 'fp32', # Example precision - } - ], - ), - - Node( - package='your_jetson_package', # Replace with the actual package name for JetsonNode - namespace='jetson', - executable='jetson_node', - name='jetson_node', - output='screen', - parameters=[ - { - 'engine_path': '/home/user/Downloads/model.engine', # Replace with the actual engine path - 'strip_weights': False, - 'precision': 'fp32', # Example precision - } - ], + {'frame_rate': LaunchConfiguration('frame_rate')}, + {'model_dimensions': LaunchConfiguration('model_dimensions')}, + {'camera_side': LaunchConfiguration('camera_side')}, + {'shift_constant': LaunchConfiguration('shift_constant')}, + {'roi_dimensions': LaunchConfiguration('roi_dimensions')}, + {'precision': LaunchConfiguration('precision')} + ] ), ]) diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/lifecycle.py b/workspace_python/ros2_ws/src/python_workspace/launch/lifecycle.py deleted file mode 100644 index 1b928c9..0000000 --- a/workspace_python/ros2_ws/src/python_workspace/launch/lifecycle.py +++ /dev/null @@ -1,100 +0,0 @@ -import rclpy -from rclpy.node import Node -from rclpy.lifecycle import LifecycleNode -from rclpy.lifecycle import State - -class CameraNode(LifecycleNode): - def __init__(self): - super().__init__('camera_node') - - def on_configure(self, state: State): - self.get_logger().info('Configuring the camera...') - # Initialize camera - return self.configure_success() - - def on_activate(self, state: State): - self.get_logger().info('Activating the camera...') - # Start the camera stream - return 
self.activate_success() - - def on_deactivate(self, state: State): - self.get_logger().info('Deactivating the camera...') - # Stop the camera stream - return self.deactivate_success() - - def on_cleanup(self, state: State): - self.get_logger().info('Cleaning up resources...') - # Clean up resources - return self.cleanup_success() - - def on_shutdown(self, state: State): - self.get_logger().info('Shutting down the camera...') - # Shut down resources - return self.shutdown_success() - -from launch import LaunchDescription -from launch_ros.actions import LifecycleNode -from launch.actions import EmitEvent -from launch.events import matches_action -from launch_ros.events.lifecycle import ChangeState - -from lifecycle_msgs.msg import Transition - -def generate_launch_description(): - return LaunchDescription([ - LifecycleNode( - package='your_package', - executable='your_lifecycle_node', - name='lifecycle_node_1', - namespace='', - output='screen' - ), - LifecycleNode( - package='another_package', - executable='another_lifecycle_node', - name='lifecycle_node_2', - namespace='', - output='screen' - ), - ]) - -## automatic lifecycle transition launch: -from launch import LaunchDescription -from launch_ros.actions import LifecycleNode -from launch.actions import EmitEvent -from launch.events import matches_action -from launch_ros.events.lifecycle import ChangeState -from lifecycle_msgs.msg import Transition - -def generate_launch_description(): - lifecycle_node_1 = LifecycleNode( - package='your_package', - executable='your_lifecycle_node', - name='lifecycle_node_1', - namespace='', - output='screen' - ) - - return LaunchDescription([ - # Launch the lifecycle node - lifecycle_node_1, - - # Emit an event to automatically configure the node - EmitEvent( - event=ChangeState( - lifecycle_node_matcher=matches_action(lifecycle_node_1), - transition_id=Transition.TRANSITION_CONFIGURE - ) - ), - - # Emit an event to automatically activate the node after configuring - EmitEvent( - event=ChangeState( - lifecycle_node_matcher=matches_action(lifecycle_node_1), - transition_id=Transition.TRANSITION_ACTIVATE - ) - ), - ]) - -# ComposableNodeContainer: This is the container where composable nodes are dynamically loaded. It's similar to a "node manager" that manages composable nodes. -# ComposableNode: This describes your node (in this case, the CameraNode) which will be loaded into the container. 
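The deleted sketch above returns self.configure_success() / self.activate_success(), which are not part of rclpy; lifecycle callbacks return a TransitionCallbackReturn value instead. A minimal corrected sketch against the rclpy lifecycle API (assuming ROS 2 Humble or later), kept here for reference:

import rclpy
from rclpy.lifecycle import Node, State, TransitionCallbackReturn

class CameraNode(Node):
    def __init__(self):
        super().__init__('camera_node')

    def on_configure(self, state: State) -> TransitionCallbackReturn:
        self.get_logger().info('Configuring the camera...')
        return TransitionCallbackReturn.SUCCESS

    def on_activate(self, state: State) -> TransitionCallbackReturn:
        self.get_logger().info('Activating the camera...')
        return super().on_activate(state)  # also activates lifecycle publishers

    def on_deactivate(self, state: State) -> TransitionCallbackReturn:
        self.get_logger().info('Deactivating the camera...')
        return super().on_deactivate(state)

    def on_cleanup(self, state: State) -> TransitionCallbackReturn:
        self.get_logger().info('Cleaning up resources...')
        return TransitionCallbackReturn.SUCCESS

    def on_shutdown(self, state: State) -> TransitionCallbackReturn:
        self.get_logger().info('Shutting down the camera...')
        return TransitionCallbackReturn.SUCCESS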
\ No newline at end of file diff --git a/workspace_python/ros2_ws/src/python_workspace/launch/parameters.py b/workspace_python/ros2_ws/src/python_workspace/launch/parameters.py deleted file mode 100644 index 5dbf21a..0000000 --- a/workspace_python/ros2_ws/src/python_workspace/launch/parameters.py +++ /dev/null @@ -1,89 +0,0 @@ -## parameter handling: -from launch import LaunchDescription -from launch_ros.actions import LifecycleNode -from launch_ros.actions import ComposableNodeContainer -from launch_ros.descriptions import ComposableNode -from launch.actions import LogInfo, EmitEvent -from launch.events.lifecycle import ChangeState -from lifecycle_msgs.msg import Transition - -def generate_launch_description(): - # Define parameters to pass to the composable node - composable_params = { - 'message': 'This is a custom message from launch!', - 'publish_frequency': 1.0 - } - - container = ComposableNodeContainer( - name='my_container', - namespace='', - package='rclcpp_components', - executable='component_container_mt', - composable_node_descriptions=[ - ComposableNode( - package='my_package', - plugin='mypackage.ComposablePublisherNode', - name='composable_publisher_node', - parameters=[composable_params] - ), - ComposableNode( - package='my_package', - plugin='mypackage.ComposablePublisherNode', - name='additional_publisher_node', - parameters=[{'message': 'This is the second node!', 'publish_frequency': 2.5}] - ) - ], - output='screen', - ) - - lifecycle_manager = LifecycleNode( - package='my_package', - executable='lifecycle_node', - name='lifecycle_manager_node', - namespace='', - output='screen', - parameters=[{'lifecycle_action': 'configure'}] - ) - - # Manually trigger lifecycle transitions after nodes are up - configure_event = EmitEvent( - event=ChangeState( - lifecycle_node_matcher=launch.events.matches_action(lifecycle_manager), - transition_id=Transition.TRANSITION_CONFIGURE - ) - ) - - activate_event = EmitEvent( - event=ChangeState( - lifecycle_node_matcher=launch.events.matches_action(lifecycle_manager), - transition_id=Transition.TRANSITION_ACTIVATE - ) - ) - - deactivate_event = EmitEvent( - event=ChangeState( - lifecycle_node_matcher=launch.events.matches_action(lifecycle_manager), - transition_id=Transition.TRANSITION_DEACTIVATE - ) - ) - - shutdown_event = EmitEvent( - event=ChangeState( - lifecycle_node_matcher=launch.events.matches_action(lifecycle_manager), - transition_id=Transition.TRANSITION_SHUTDOWN - ) - ) - - return LaunchDescription([ - LogInfo(msg="Starting the container and lifecycle manager..."), - container, - lifecycle_manager, - LogInfo(msg="Configuring the lifecycle node..."), - configure_event, - LogInfo(msg="Activating the lifecycle node..."), - activate_event, - LogInfo(msg="Deactivating the lifecycle node..."), - deactivate_event, - LogInfo(msg="Shutting down the lifecycle node..."), - shutdown_event, - ]) \ No newline at end of file diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/extermination_node.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/extermination_node.py index 723473f..d1ffd54 100644 --- a/workspace_python/ros2_ws/src/python_workspace/python_workspace/extermination_node.py +++ b/workspace_python/ros2_ws/src/python_workspace/python_workspace/extermination_node.py @@ -8,7 +8,6 @@ # from tracker import * # depth point cloud here... 
# add object counter -# log end to end latency by checking time.time_now() and unpacking time from header of initial image after the extermination output is decided import rclpy from rclpy.time import Time @@ -148,7 +147,7 @@ def verify_object(self, disp_image, bboxes): if x1 >= roi_x1 and x2 <= roi_x2 and y1 >= roi_y1 and y2 <= roi_y2: self.on = 1 if self.display: - + pass if self.display: self.display(disp_image, adjusted_bboxes) diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/jetson_node.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/jetson_node.py index 27e3612..e5ecd27 100644 --- a/workspace_python/ros2_ws/src/python_workspace/python_workspace/jetson_node.py +++ b/workspace_python/ros2_ws/src/python_workspace/python_workspace/jetson_node.py @@ -76,18 +76,6 @@ def pointer_callback(self, ipc_handle_msg): self.cuda_driver_context.push() self.get_logger().info(f"Received IPC handle: {ipc_handle_str}") - # # Convert the string back to bytes for the IPC handle - # ipc_handle_str = msg.data - # ipc_handle_bytes = bytes(int(ipc_handle_str[i:i+2], 16) for i in range(0, len(ipc_handle_str), 2)) - - # # Recreate the IPC handle using PyCUDA - # ipc_handle = cuda.IPCHandle(ipc_handle_bytes) - - # # Map the IPC memory into the current process - # d_input = cuda.IPCMemoryHandle(ipc_handle) - - # # Map the memory to the current context - # self.d_input = d_input.open(cuda.Context.get_current()) # Re-create the CuPy array from IPC handle ipc_handle_str = ipc_handle_msg.data diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/main.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/main.py deleted file mode 100644 index bf5bf61..0000000 --- a/workspace_python/ros2_ws/src/python_workspace/python_workspace/main.py +++ /dev/null @@ -1,28 +0,0 @@ -import rclpy -from rclpy.executors import MultiThreadedExecutor -from rclpy.components import ComponentManager -from camera_node import generate_node as generate_camera_node -from jetson_node import generate_node as generate_jetson_node - -def main(args=None): - rclpy.init(args=args) - - # Create a component manager (container) - component_manager = ComponentManager() - - # Load the composable nodes into the component manager - component_manager.add_node(generate_camera_node()) - component_manager.add_node(generate_jetson_node()) - - executor = MultiThreadedExecutor() - executor.add_node(component_manager) - - try: - executor.spin() - finally: - executor.shutdown() - component_manager.destroy_node() - rclpy.shutdown() - -if __name__ == '__main__': - main() diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/picture_node.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/picture_node.py new file mode 100644 index 0000000..f5c8fa3 --- /dev/null +++ b/workspace_python/ros2_ws/src/python_workspace/python_workspace/picture_node.py @@ -0,0 +1,154 @@ +import time, os +import cv2 +import pycuda.driver as cuda +import cupy as cp +import queue + +import rclpy +from rclpy.time import Time +from rclpy.node import Node +from rclpy.executors import MultiThreadedExecutor +from sensor_msgs.msg import Image +from std_msgs.msg import Header, String +from cv_bridge import CvBridge + +class PictureNode(Node): + def __init__(self): + super().__init__('picture_node') + + cuda.init() + device = cuda.Device(0) + self.cuda_driver_context = device.make_context() + self.stream = cuda.Stream() + + self.declare_parameter('static_image_path', 
'/home/usr/Desktop/ROS/assets/IMG_1822_14.JPG')
+        self.declare_parameter('loop', 0)  # 0 = don't loop, >0 = # of loops, -1 = loop forever
+        self.declare_parameter('frame_rate', 30)  # Desired frame rate for publishing
+        self.declare_parameter('model_dimensions', (448, 1024))
+        self.declare_parameter('camera_side', 'left')  # left, right (read below, so it must be declared)
+        self.declare_parameter('shift_constant', 1)
+        self.declare_parameter('roi_dimensions', [0, 0, 100, 100])
+        self.declare_parameter('precision', 'fp32')
+
+        self.static_image_path = self.get_parameter('static_image_path').get_parameter_value().string_value
+        self.loop = self.get_parameter('loop').get_parameter_value().integer_value
+        self.frame_rate = self.get_parameter('frame_rate').get_parameter_value().integer_value
+        self.dimensions = tuple(self.get_parameter('model_dimensions').get_parameter_value().integer_array_value)
+        self.camera_side = self.get_parameter('camera_side').get_parameter_value().string_value
+        self.shift_constant = self.get_parameter('shift_constant').get_parameter_value().integer_value
+        self.roi_dimensions = self.get_parameter('roi_dimensions').get_parameter_value().integer_array_value
+        self.precision = self.get_parameter('precision').get_parameter_value().string_value
+
+        self.pointer_publisher = self.create_publisher(String, 'preprocessing_done', 10)
+        # NOTE: sensor_msgs/Image is a message, not a service type; create_service needs a
+        # custom .srv (e.g. one with an Image field in its response) for this to run
+        self.image_service = self.create_service(Image, 'image_data', self.image_callback)
+
+        self.velocity = [0, 0, 0]
+        self.image_queue = queue.Queue()
+        self.bridge = CvBridge()
+        self.publish_static_image()
+        # propagate fp16 option (.fp16 datatype for cupy)
+        if self.precision == 'fp32':
+            pass
+        elif self.precision == 'fp16':
+            pass
+        else:
+            self.get_logger().error(f"Invalid precision: {self.precision}")
+
+    def image_callback(self, request, response):  # rclpy service callbacks take (request, response)
+        self.get_logger().info("Received image request")
+        if not self.image_queue.empty():
+            image_data = self.image_queue.get()  # Get the image from the queue
+            cv_image, velocity = image_data  # unpack tuple (image, velocity)
+
+            # Convert OpenCV image to ROS2 Image message using cv_bridge
+            ros_image = self.bridge.cv2_to_imgmsg(cv_image, encoding='rgb8')
+            # Create a new header and include velocity in the stamp fields
+            header = Header()
+            current_time = self.get_clock().now().to_msg()
+            header.stamp = current_time  # Set timestamp to current time
+            header.frame_id = str(velocity)  # Set frame ID to velocity
+
+            ros_image.header = header  # Attach the header to the ROS image message
+            response.image = ros_image  # Set the response's image
+            return response
+
+        else:
+            self.get_logger().error("Image queue is empty")
+            return response
+
+    def publish_static_image(self):
+        if not os.path.exists(self.static_image_path):
+            self.get_logger().error(f"Static image not found at {self.static_image_path}")
+            # raise FileNotFoundError(f"Static image not found at {self.static_image_path}")
+            return
+
+        images = []
+        if os.path.isdir(self.static_image_path):
+            for filename in os.listdir(self.static_image_path):
+                if filename.endswith('.JPG') or filename.endswith('.png'):
+                    images.append(os.path.join(self.static_image_path, filename))
+        elif os.path.isfile(self.static_image_path):
+            images.append(self.static_image_path)
+
+        if len(images) == 0:
+            self.get_logger().error(f"No images found at {self.static_image_path}")
+            return
+
+        loops = 0
+        # max(self.loop, 1) so loop == 0 still publishes the image set once
+        while rclpy.ok() and (self.loop == -1 or loops < max(self.loop, 1)):
+            for filename in images:
+                image = cv2.imread(filename, cv2.IMREAD_COLOR)
+                if image is not None:
+                    self.image_queue.put((image, self.velocity[0]))
+                    self.preprocess_image(image)
+                    time.sleep(1.0 / self.frame_rate)
+                else:
+                    self.get_logger().error(f"Failed to read image: {filename}")
+                    raise FileNotFoundError(f"Failed to read image: {filename}")
+            loops += 1  # count every pass so loop == 0 (play once) terminates
+
+    def preprocess_image(self, image):
+        tic = time.perf_counter_ns()
+
+        roi_x, roi_y, roi_w, roi_h = self.roi_dimensions
+        shifted_x = int(roi_x + abs(self.velocity[0]) * self.shift_constant)
+
+        # crop to ROI on the CPU first: cv2.cuda_GpuMat does not support NumPy-style slicing,
+        # and images read with IMREAD_COLOR are already 3-channel BGR (no alpha to remove)
+        image = image[roi_y:(roi_y+roi_h), shifted_x:(shifted_x+roi_w)]
+
+        gpu_image = cv2.cuda_GpuMat()
+        gpu_image.upload(image)
+        gpu_image = cv2.cuda.resize(gpu_image, self.dimensions)  # resize to model dimensions
+
+        # GpuMat uses pitched memory that CuPy cannot consume directly; download and re-upload
+        # (a zero-copy handoff would need gpu_image.cudaPtr() plus the row step)
+        input_data = cp.asarray(gpu_image.download())
+        input_data = input_data.astype(cp.float32) / 255.0  # normalize to [0, 1]
+        input_data = cp.transpose(input_data, (2, 0, 1))  # Transpose from HWC (height, width, channels) to CHW (channels, height, width)
+        input_data = cp.ravel(input_data)  # flatten the array
+
+        d_input_ptr = input_data.data.ptr  # Get device pointer of the CuPy array
+        ipc_handle = cuda.mem_get_ipc_handle(d_input_ptr)  # Create the IPC handle
+
+        toc = time.perf_counter_ns()
+        self.get_logger().info(f"Preprocessing: {(toc-tic)/1e6} ms")
+
+        # Publish the IPC handle as a string (sending the handle reference as a string)
+        ipc_handle_msg = String()
+        ipc_handle_msg.data = str(ipc_handle.handle)
+        self.pointer_publisher.publish(ipc_handle_msg)
+
+def main(args=None):
+    rclpy.init(args=args)
+    picture_node = PictureNode()
+    executor = MultiThreadedExecutor(num_threads=1)
+    executor.add_node(picture_node)
+
+    try:
+        executor.spin()
+    finally:
+        executor.shutdown()
+        picture_node.destroy_node()
+        rclpy.shutdown()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/video_node.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/video_node.py
new file mode 100644
index 0000000..b34ac8f
--- /dev/null
+++ b/workspace_python/ros2_ws/src/python_workspace/python_workspace/video_node.py
@@ -0,0 +1,148 @@
+import time, os
+import cv2
+import pycuda.driver as cuda
+import cupy as cp
+import queue
+
+import rclpy
+from rclpy.time import Time
+from rclpy.node import Node
+from rclpy.executors import MultiThreadedExecutor
+from sensor_msgs.msg import Image
+from std_msgs.msg import Header, String
+from cv_bridge import CvBridge
+
+class VideoNode(Node):
+    def __init__(self):
+        super().__init__('video_node')
+
+        cuda.init()
+        device = cuda.Device(0)
+        self.cuda_driver_context = device.make_context()
+        self.stream = cuda.Stream()
+
+        self.declare_parameter('video_path', '/home/usr/Desktop/ROS/assets/video.mp4')
+        self.declare_parameter('loop', 0)  # 0 = don't loop, >0 = # of loops, -1 = loop forever
+        self.declare_parameter('frame_rate', 30)  # Desired frame rate for publishing
+        self.declare_parameter('model_dimensions', (448, 1024))
+        self.declare_parameter('shift_constant', 1)
+        self.declare_parameter('roi_dimensions', [0, 0, 100, 100])
+        self.declare_parameter('precision', 'fp32')
+
+        self.video_path = self.get_parameter('video_path').get_parameter_value().string_value
+        self.loop = self.get_parameter('loop').get_parameter_value().integer_value
+        self.frame_rate = self.get_parameter('frame_rate').get_parameter_value().integer_value
+        self.dimensions = tuple(self.get_parameter('model_dimensions').get_parameter_value().integer_array_value)
+        self.shift_constant = self.get_parameter('shift_constant').get_parameter_value().integer_value
+        self.roi_dimensions = self.get_parameter('roi_dimensions').get_parameter_value().integer_array_value
+        self.precision = self.get_parameter('precision').get_parameter_value().string_value
+
+        self.pointer_publisher = self.create_publisher(String, 'preprocessing_done', 10)
+        # NOTE: as in picture_node, a custom .srv type is needed here; Image is a message type
+        self.image_service = self.create_service(Image, 'image_data', self.image_callback)
+
+        self.velocity = [0, 0, 0]
+        self.image_queue = queue.Queue()
+        self.bridge = CvBridge()
+        self.publish_video_frames()
+        # propagate fp16 option (.fp16 datatype for cupy)
+        if self.precision == 'fp32':
+            pass
+        elif self.precision == 'fp16':
+            pass
+        else:
+            self.get_logger().error(f"Invalid precision: {self.precision}")
+
+    def image_callback(self, request, response):  # rclpy service callbacks take (request, response)
+        self.get_logger().info("Received image request")
+        if not self.image_queue.empty():
+            image_data = self.image_queue.get()  # Get the image from the queue
+            cv_image, velocity = image_data  # unpack tuple (image, velocity)
+
+            # Convert OpenCV image to ROS2 Image message using cv_bridge
+            ros_image = self.bridge.cv2_to_imgmsg(cv_image, encoding='rgb8')
+            # Create a new header and include velocity in the stamp fields
+            header = Header()
+            current_time = self.get_clock().now().to_msg()
+            header.stamp = current_time  # Set timestamp to current time
+            header.frame_id = str(velocity)  # Set frame ID to velocity
+
+            ros_image.header = header  # Attach the header to the ROS image message
+            response.image = ros_image  # Set the response's image
+            return response
+
+        else:
+            self.get_logger().error("Image queue is empty")
+            return response
+
+    def publish_video_frames(self):
+        if not os.path.exists(self.video_path):
+            self.get_logger().error(f"Video file not found at {self.video_path}")
+            return
+
+        cap = cv2.VideoCapture(self.video_path)
+        if not cap.isOpened():
+            self.get_logger().error(f"Failed to open video: {self.video_path}")
+            return
+
+        loops = 0
+        # max(self.loop, 1) so loop == 0 still plays the video once
+        while rclpy.ok() and (self.loop == -1 or loops < max(self.loop, 1)):
+            while cap.isOpened() and rclpy.ok():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                self.image_queue.put((frame, self.velocity[0]))
+                self.preprocess_image(frame)
+                time.sleep(1.0 / self.frame_rate)
+
+            loops += 1  # count every pass so loop == 0 terminates
+            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # rewind so the next pass (including loop == -1) replays the video
+        cap.release()
+
+    def preprocess_image(self, image):
+        tic = time.perf_counter_ns()
+
+        roi_x, roi_y, roi_w, roi_h = self.roi_dimensions
+        shifted_x = int(roi_x + abs(self.velocity[0]) * self.shift_constant)
+
+        # crop to ROI on the CPU first: cv2.cuda_GpuMat does not support NumPy-style slicing,
+        # and VideoCapture frames are already 3-channel BGR (no alpha to remove)
+        image = image[roi_y:(roi_y+roi_h), shifted_x:(shifted_x+roi_w)]
+
+        gpu_image = cv2.cuda_GpuMat()
+        gpu_image.upload(image)
+        gpu_image = cv2.cuda.resize(gpu_image, self.dimensions)  # resize to model dimensions
+
+        # GpuMat uses pitched memory that CuPy cannot consume directly; download and re-upload
+        # (a zero-copy handoff would need gpu_image.cudaPtr() plus the row step)
+        input_data = cp.asarray(gpu_image.download())
+        input_data = input_data.astype(cp.float32) / 255.0  # normalize to [0, 1]
+        input_data = cp.transpose(input_data, (2, 0, 1))  # Transpose from HWC (height, width, channels) to CHW (channels, height, width)
+        input_data = cp.ravel(input_data)  # flatten the array
+
+        d_input_ptr = input_data.data.ptr  # Get device pointer of the CuPy array
+        ipc_handle = cuda.mem_get_ipc_handle(d_input_ptr)  # Create the IPC handle
+
+        toc = time.perf_counter_ns()
+        self.get_logger().info(f"Preprocessing: {(toc-tic)/1e6} ms")
+
+        # Publish the IPC handle as a string (sending the handle reference as a string)
+        ipc_handle_msg = String()
+        ipc_handle_msg.data = str(ipc_handle.handle)
+        self.pointer_publisher.publish(ipc_handle_msg)
+
+def main(args=None):
+    rclpy.init(args=args)
+    video_node = VideoNode()
+    executor = MultiThreadedExecutor(num_threads=1)
+    executor.add_node(video_node)
+
+    try:
+        executor.spin()
+    finally:
+        executor.shutdown()
+        video_node.destroy_node()
+        rclpy.shutdown()
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/workspace_python/ros2_ws/src/python_workspace/python_workspace/camera_node.py b/workspace_python/ros2_ws/src/python_workspace/python_workspace/zed_camera_node.py
similarity index 51%
rename from workspace_python/ros2_ws/src/python_workspace/python_workspace/camera_node.py
rename to workspace_python/ros2_ws/src/python_workspace/python_workspace/zed_camera_node.py
index 91ffe73..26ad78a 100644
--- a/workspace_python/ros2_ws/src/python_workspace/python_workspace/camera_node.py
+++ b/workspace_python/ros2_ws/src/python_workspace/python_workspace/zed_camera_node.py
@@ -3,17 +3,15 @@
 import pyzed.sl as sl
 import pycuda.driver as cuda
 import cupy as cp
-import numpy as np
 import queue
 
 import rclpy
 from rclpy.time import Time
 from rclpy.node import Node
-from rclpy.components import NodeComponent
 from rclpy.executors import MultiThreadedExecutor
 from sensor_msgs.msg import Image
 from std_msgs.msg import Header, String
-from cv2 import cvbridge # check if this is necessary
+from cv_bridge import CvBridge
 
 class CameraNode(Node):
     def __init__(self):
@@ -24,10 +22,6 @@ def __init__(self):
         self.cuda_driver_context = device.make_context()
         self.stream = cuda.Stream()
 
-        self.declare_parameter('source_type', 'zed') # static_image, video, zed
-        self.declare_parameter('static_image_path', '/home/usr/Desktop/ROS/assets/IMG_1822_14.JPG')
-        self.declare_parameter('video_path', '/home/usr/Desktop/ROS/assets/video.mp4')
-        self.declare_parameter('loop', 0) # 0 = don't loop, >0 = # of loops, -1 = loop forever
         self.declare_parameter('frame_rate', 30) # Desired frame rate for publishing
         self.declare_parameter('model_dimensions', (448, 1024))
         self.declare_parameter('camera_side', 'left') # left, right
@@ -35,10 +29,6 @@ def __init__(self):
         self.declare_parameter('roi_dimensions', [0, 0, 100, 100])
         self.declare_parameter('precision', 'fp32')
 
-        self.source_type = self.get_parameter('source_type').get_parameter_value().string_value
-        self.static_image_path = self.get_parameter('static_image_path').get_parameter_value().string_value
-        self.video_path = self.get_parameter('video_path').get_parameter_value().string_value
-        self.loop = self.get_parameter('loop').get_parameter_value().integer_value
         self.frame_rate = self.get_parameter('frame_rate').get_parameter_value().integer_value
         self.dimensions = tuple(self.get_parameter('model_dimensions').get_parameter_value().integer_array_value)
         self.camera_side = self.get_parameter('camera_side').get_parameter_value().string_value
@@ -47,20 +37,12 @@ def __init__(self):
         self.precision = self.get_parameter('precision').get_parameter_value().string_value
 
         self.pointer_publisher = self.create_publisher(String, 'preprocessing_done', 10)
-        # self.image_service = self.create_service(Image, 'image_data', self.image_callback)
-        self.velocity = 0
+        self.image_service = self.create_service(Image, 'image_data', self.image_callback)
+
+        self.velocity = [0, 0, 0]
         self.image_queue = queue.Queue()
-
-        if self.source_type == 'static_image':
-            self.publish_static_image()
-        elif self.source_type == 'video':
-            self.publish_video_frames()
-        elif self.source_type == 'zed':
-            self.publish_zed_frames()
-        else:
-            self.get_logger().error(f"Invalid source_type: {self.source_type}")
-            return
-
+        self.bridge = CvBridge()
+        self.publish_zed_frames()
         # propagate fp16 option (.fp16 datatype for cupy)
         if self.precision == 'fp32':
             pass
@@ -69,74 +51,29 @@ def __init__(self):
         else:
             self.get_logger().error(f"Invalid precision: {self.precision}")
 
-    # save copy of image to queue
-    def image_callback(self, request, response):
+    def image_callback(self, request, response):  # rclpy service callbacks take (request, response)
         self.get_logger().info("Received image request")
         if not self.image_queue.empty():
-            image = self.image_queue.get()
-            response.image = image
+            image_data = self.image_queue.get()  # Get the image from the queue
+            cv_image, velocity = image_data  # unpack tuple (image, velocity)
+
+            # Convert OpenCV image to ROS2 Image message using cv_bridge
+            ros_image = self.bridge.cv2_to_imgmsg(cv_image, encoding='rgb8')
+            # Create a new header and include velocity in the stamp fields
+            header = Header()
+            current_time = self.get_clock().now().to_msg()
+            header.stamp = current_time  # Set timestamp to current time
+            header.frame_id = str(velocity)  # Set frame ID to velocity
+
+            ros_image.header = header  # Attach the header to the ROS image message
+            response.image = ros_image  # Set the response's image
             return response
+
         else:
             self.get_logger().error("Image queue is empty")
             return response
 
-    def publish_static_image(self):
-        if not os.path.exists(self.static_image_path):
-            self.get_logger().error(f"Static image not found at {self.static_image_path}")
-            # raise FileNotFoundError(f"Static image not found at {self.static_image_path}")
-            return
-
-        images = []
-        if os.path.isdir(self.static_image_path):
-            for filename in os.listdir(self.static_image_path):
-                if filename.endswith('.JPG') or filename.endswith('.png'):
-                    images.append(os.path.join(self.static_image_path, filename))
-        elif os.path.isfile(self.static_image_path):
-            images.append(self.static_image_path)
-
-        if len(images) == 0:
-            self.get_logger().error(f"No images found at {self.static_image_path}")
-            return
-
-        loops = 0
-        while rclpy.ok() and (self.loop == -1 or loops < self.loop):
-            for filename in images:
-                image = cv2.imread(filename, cv2.IMREAD_COLOR)
-                if image is not None:
-                    self.preprocess_image(image)
-                    time.sleep(1.0 / self.frame_rate)
-                else:
-                    self.get_logger().error(f"Failed to read image: {filename}")
-                    raise FileNotFoundError(f"Failed to read image: {filename}")
-            if self.loop > 0:
-                loops += 1
-
-    def publish_video_frames(self):
-        if not os.path.exists(self.video_path):
-            self.get_logger().error(f"Video file not found at {self.video_path}")
-            return
-
-        cap = cv2.VideoCapture(self.video_path)
-        if not cap.isOpened():
-            self.get_logger().error(f"Failed to open video: {self.video_path}")
-            return
-
-        loops = 0
-        while rclpy.ok() and (self.loop == -1 or loops < self.loop):
-            while cap.isOpened() and rclpy.ok():
-                ret, frame = cap.read()
-                if not ret:
-                    break
-                self.preprocess_image(frame)
-                time.sleep(1.0 / self.frame_rate)
-
-            if self.loop > 0:
-                loops += 1
-
-            if self.loop != -1:
-                cap.set(cv2.CAP_PROP_POS_FRAMES, 0) # restart video
-        cap.release()
-
     def publish_zed_frames(self):
         zed = sl.Camera()
         init = sl.InitParameters()
@@ -159,29 +96,29 @@ def publish_zed_frames(self):
         runtime = sl.RuntimeParameters()
sl.RuntimeParameters() image_zed = sl.Mat() sensors_data = sl.SensorsData() - previous_velocity = np.array([0.0, 0.0, 0.0]) + previous_velocity = cp.array([0.0, 0.0, 0.0]) previous_time = time.time() while rclpy.ok(): err = zed.grab(runtime) if err == sl.ERROR_CODE.SUCCESS: if self.side == 'left': - zed.retrieve_image(image_zed, sl.VIEW.LEFT_UNRECTIFIED) + zed.retrieve_image(image_zed, sl.VIEW.LEFT_RECTIFIED) else: - zed.retrieve_image(image_zed, sl.VIEW.RIGHT_UNRECTIFIED) + zed.retrieve_image(image_zed, sl.VIEW.RIGHT_RECTIFIED) zed.get_sensors_data(sensors_data, sl.TIME_REFERENCE.IMAGE) accel_data = sensors_data.get_imu_data().get_linear_acceleration() # Calculate velocity (v = u + at) current_time = time.time() delta_time = current_time - previous_time - acceleration = np.array([accel_data[0], accel_data[1], accel_data[2]]) + acceleration = cp.array([accel_data[0], accel_data[1], accel_data[2]]) velocity = previous_velocity + acceleration * delta_time self.velocity = velocity previous_velocity = velocity previous_time = current_time - - cv_image = image_zed.get_data() + cv_image = image_zed.get_data() + self.image_queue.put((cv_image, velocity[0])) self.preprocess_image(cv_image) time.sleep(1.0 / self.frame_rate) @@ -220,12 +157,6 @@ def preprocess_image(self, image): ipc_handle_msg.data = str(ipc_handle.handle) self.pointer_publisher.publish(ipc_handle_msg) - def enqueue_image(self, image): - self.image_queue.put(image) # append as tuple (image, velocity) - - def dequeue_image(self): - self.image_queue.get() - def main(args=None): rclpy.init(args=args) camera_node = CameraNode() @@ -239,53 +170,5 @@ def main(args=None): camera_node.destroy_node() rclpy.shutdown() -def generate_node(): - return NodeComponent(CameraNode, "camera_node") - if __name__ == '__main__': - main() - -# cupy alternative: -# import cupy as cp -# import cv2 - -# # Load image to CPU -# image = cv2.imread('image.png', cv2.IMREAD_UNCHANGED) - -# # Upload image to GPU with CuPy -# gpu_image = cp.asarray(image) - -# # Convert RGBA to RGB -# gpu_image = gpu_image[:, :, :3] # Assuming last channel is alpha - -# # Crop the image -# gpu_image = gpu_image[roi_y:roi_y+roi_h, roi_x:roi_x+roi_w, :] - -# # Resize the image (using CuPy's array resize, or via integration with other libraries) -# gpu_image = cp.array(cv2.resize(cp.asnumpy(gpu_image), (224, 224))) - -# # Normalize and transpose the image -# gpu_image = gpu_image.astype(cp.float32) / 255.0 -# gpu_image = cp.transpose(gpu_image, (2, 0, 1)) - -# pytorch alternative: -# import torch -# import torchvision.transforms as T -# import cv2 - -# # Load image to CPU -# image = cv2.imread('image.png', cv2.IMREAD_UNCHANGED) - -# # Transfer to PyTorch tensor -# image_tensor = torch.from_numpy(image).cuda() - -# # Remove alpha and preprocess -# transform = T.Compose([ -# T.Lambda(lambda x: x[:, :, :3]), # Remove alpha -# T.Lambda(lambda x: x[roi_y:roi_y+roi_h, roi_x:roi_x+roi_w, :]), # Crop -# T.Resize((224, 224)), -# T.ToTensor(), # Convert to tensor and normalize -# ]) - -# # Apply transformation -# image_tensor = transform(image_tensor) \ No newline at end of file + main() \ No newline at end of file diff --git a/workspace_python/ros2_ws/src/python_workspace/setup.py b/workspace_python/ros2_ws/src/python_workspace/setup.py index 18c0927..8ba6f10 100644 --- a/workspace_python/ros2_ws/src/python_workspace/setup.py +++ b/workspace_python/ros2_ws/src/python_workspace/setup.py @@ -11,9 +11,7 @@ data_files=[ ('share/ament_index/resource_index/packages', ['resource/' + package_name]), - 
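# the image_data service response above packs the capture-time velocity into
# header.frame_id as a string. A minimal consumer-side sketch of the unpack,
# assuming the encoding used in image_callback (the client plumbing itself is
# hypothetical; imgmsg_to_cv2 is the real cv_bridge inverse of cv2_to_imgmsg):
#
# from cv_bridge import CvBridge
#
# bridge = CvBridge()
#
# def handle_image_response(response):
#     cv_image = bridge.imgmsg_to_cv2(response.image, desired_encoding='rgb8')
#     velocity = float(response.image.header.frame_id) # frame_id carries velocity
#     stamp = response.image.header.stamp              # time the response was built
#     return cv_image, velocity, stamp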
diff --git a/workspace_python/ros2_ws/src/python_workspace/setup.py b/workspace_python/ros2_ws/src/python_workspace/setup.py
index 18c0927..8ba6f10 100644
--- a/workspace_python/ros2_ws/src/python_workspace/setup.py
+++ b/workspace_python/ros2_ws/src/python_workspace/setup.py
@@ -11,9 +11,7 @@
     data_files=[
         ('share/ament_index/resource_index/packages', ['resource/' + package_name]),
-        # Include our package.xml file
        (os.path.join('share', package_name), ['package.xml']),
-        # Include all launch files.
        (os.path.join('share', package_name, 'launch'), glob(os.path.join('launch', '*.launch.py'))),
     ],
     zip_safe=True,
@@ -23,7 +21,9 @@
     license='Apache License 2.0',
     entry_points={
         'console_scripts': [
-            'camera_node = python_workspace.camera_node:main',
+            'video_node = python_workspace.video_node:main',
+            'camera_node = python_workspace.zed_camera_node:main',
+            'picture_node = python_workspace.picture_node:main',
             'jetson_node = python_workspace.jetson_node:main',
             'extermination_node = python_workspace.extermination_node:main'
         ],
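# with these entry points registered, each node can be run on its own after a
# rebuild -- a usage sketch (workspace layout assumed from the paths above):
#
#   colcon build --packages-select python_workspace
#   source install/setup.bash
#   ros2 run python_workspace camera_node
#   ros2 run python_workspace video_node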