diff --git a/GEMstack/onboard/perception/IdTracker.py b/GEMstack/onboard/perception/IdTracker.py
new file mode 100644
index 000000000..15641fa00
--- /dev/null
+++ b/GEMstack/onboard/perception/IdTracker.py
@@ -0,0 +1,12 @@
+class IdTracker():
+    """Abstracts out id tracking
+    """
+    def __init__(self):
+        self.__id = 0
+
+    def get_new_id(self) -> int:
+        """Returns a unique agent id
+        """
+        assigned_id = self.__id
+        self.__id += 1 # id will intentionally overflow to get back to 0
+        return assigned_id
diff --git a/GEMstack/onboard/perception/calibration/camera_intrinsics.json b/GEMstack/onboard/perception/calibration/camera_intrinsics.json
new file mode 100644
index 000000000..74318680c
--- /dev/null
+++ b/GEMstack/onboard/perception/calibration/camera_intrinsics.json
@@ -0,0 +1,7 @@
+{
+    "fx": 684.8333129882812,
+    "fy": 684.6096801757812,
+    "cx": 573.37109375,
+    "cy": 363.700927734375
+  }
+  
\ No newline at end of file
diff --git a/GEMstack/onboard/perception/calibration/extrinsics/R.npy b/GEMstack/onboard/perception/calibration/extrinsics/R.npy
new file mode 100644
index 000000000..7d44aa76d
Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/R.npy differ
diff --git a/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy b/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy
new file mode 100644
index 000000000..62d1809f1
Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy differ
diff --git a/GEMstack/onboard/perception/calibration/extrinsics/t.npy b/GEMstack/onboard/perception/calibration/extrinsics/t.npy
new file mode 100644
index 000000000..b133fa55f
Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/t.npy differ
diff --git a/GEMstack/onboard/perception/pedestrian_detection.py b/GEMstack/onboard/perception/pedestrian_detection.py
index f91145274..2c24f3bef 100644
--- a/GEMstack/onboard/perception/pedestrian_detection.py
+++ b/GEMstack/onboard/perception/pedestrian_detection.py
@@ -1,13 +1,56 @@
-from ...state import AllState,VehicleState,ObjectPose,ObjectFrameEnum,AgentState,AgentEnum,AgentActivityEnum
+"""
+Top ouster lidar + Oak front camera fusion, object detection
+"""
+"""
+Terminal 1:
+---------------
+source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash
+roscore
+
+Terminal 2:
+---------------
+source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash
+python3 GEMstack/onboard/perception/transform.py
+
+Terminal 3:
+---------------
+source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash
+rosbag play -l ~/rosbags/vehicle.bag
+
+Terminal 4:
+---------------
+source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash
+cd GEMStack
+python3 main.py --variant=detector_only launch/pedestrian_detection.yaml
+
+Terminal 5:
+---------------
+source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash
+rviz
+"""
+
+# Python 
+import os
+from typing import List, Dict
+from collections import defaultdict
+from datetime import datetime
+# ROS, CV
+import rospy
+import message_filters
+import tf
+from cv_bridge import CvBridge
+from sensor_msgs.msg import Image, PointCloud2
+from visualization_msgs.msg import MarkerArray
+# YOLO
+from ultralytics import YOLO
+from ultralytics.engine.results import Results, Boxes
+# GEMStack
+from ...state import AllState,VehicleState,ObjectPose,ObjectFrameEnum,AgentState,AgentEnum,AgentActivityEnum,ObjectFrameEnum
+from .pedestrian_detection_utils import *
 from ..interface.gem import GEMInterface
 from ..component import Component
-from ultralytics import YOLO
-import cv2
-from typing import Dict
-from cv_bridge import CvBridge
-from sensor_msgs.msg import Image
-import rospy
-import os
+from .IdTracker import IdTracker
+
 
 
 def box_to_fake_agent(box):
@@ -22,158 +65,408 @@ def box_to_fake_agent(box):
 
 
 class PedestrianDetector2D(Component):
-    """Detects pedestrians.
-    
-    self.vehicle -> GEM car info
-    self.detector -> pedestrian detection model
-    self.last_person_boxes -> used to store boxes detected in every frame of the video
-    self.pedestrians -> stores the agentstate (pedestrian) found during the video in a dict, not sure if required can be removed
-    self.visualization -> enable this to view pedestrian detected
-    self.confidence -> min model confidence level
-    self.classes_to_detect -> Classes for the model to detect
-    
-    """
-    def __init__(self,vehicle_interface : GEMInterface):
+    # TODO: Pull params into a JSON/yaml
+    # TODO: Convert some lists into np.arrays, vectorize calculations
+    # TODO: Implement logging instead of print, cleanup comments
+    # TODO: Cleanup funcs + split into separate classes
+    # TODO: Decide if we want to name dets "peds" or "objs"/"agents"
+    #       Maybe peds for now and Agents in agent_detection.py?
+    def __init__(self, vehicle_interface : GEMInterface) -> None:
         self.vehicle_interface = vehicle_interface
-        self.detector = YOLO(os.getcwd()+'/GEMstack/knowledge/detection/yolov8n.pt') # change to get model value from sys arg
-        self.last_person_boxes = [] 
-        self.pedestrians = {}
-        self.visualization = True # Set this to true for visualization, later change to get value from sys arg
+        # Publish debug/viz rostopics if true
+        self.debug = True
+        # Setup variables
+        self.bridge = CvBridge()
+        # TODO: Wrap detector into GEMDetector?
+        self.detector = YOLO(os.getcwd()+'/GEMstack/knowledge/detection/yolov8n.pt')
+        # track_id: AgentState
+        self.prev_agents = dict()         
+        self.current_agents = dict()
+        # TODO Implement time
         self.confidence = 0.7
         self.classes_to_detect = 0
+        self.ground_threshold = -2.0
+        self.max_human_depth = 0.9
+        self.vehicle_frame = True # Indicate whether pedestrians centroids and point clouds are in the vehicle frame
 
-    def rate(self):
-        return 4.0
+        # Load calibration data
+        # TODO: Maybe lets add one word or link what R t K are?
+        self.R_lidar_to_oak = load_extrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/extrinsics/R.npy')
+        self.t_lidar_to_oak = load_extrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/extrinsics/t.npy')
+        self.K = load_intrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/camera_intrinsics.json')
+
+        self.R_lidar_to_vehicle = np.array([[0.9995121, 0.02132941, 0.02281911], 
+            [-0.02124771, 0.99976695, -0.00381707], 
+            [-0.02289521, 0.00333035, 0.9997323]])
+        self.t_lidar_to_vehicle = np.array([[0.0], [0.0], [0.35]])
+
+        # Subscribers and sychronizers
+        self.rgb_rosbag = message_filters.Subscriber('/oak/rgb/image_raw', Image)
+        self.top_lidar_rosbag = message_filters.Subscriber('/ouster/points', PointCloud2)
+        self.sync = message_filters.ApproximateTimeSynchronizer([self.rgb_rosbag, self.top_lidar_rosbag], queue_size=10, slop=0.1)
+        self.sync.registerCallback(self.ouster_oak_callback)
+
+        # TF listener to get transformation from LiDAR to Camera
+        self.tf_listener = tf.TransformListener()
+
+        if self.debug: self.init_debug()
+
+        self.prev_time = None # Time in seconds since last scan for basic velocity calculation
+        self.curr_time = None # Time in seconds of current scan for basic velocity calculation
+        self.id_tracker = IdTracker()
     
-    def state_inputs(self):
-        return ['vehicle']
+    def init_debug(self,) -> None:
+         # Debug Publishers
+        self.pub_pedestrians_pc2 = rospy.Publisher("/point_cloud/pedestrians", PointCloud2, queue_size=10)
+        self.pub_obj_centers_pc2 = rospy.Publisher("/point_cloud/obj_centers", PointCloud2, queue_size=10)
+        self.pub_bboxes_markers = rospy.Publisher("/markers/bboxes", MarkerArray, queue_size=10)
+        self.pub_image = rospy.Publisher("/camera/image_detection", Image, queue_size=1)
+
+    def update(self, vehicle : VehicleState) -> Dict[str,AgentState]:
+        return self.current_agents
+
+    # TODO: Improve Algo Knn, ransac, etc.
+    def find_centers(self, clusters: List[List[np.ndarray]]) -> List[np.ndarray]:
+        clusters = [np.array(clust) for clust in clusters]
+        centers = [np.array(()) if clust.size == 0 else np.mean(clust, axis=0) for clust in clusters]
+        return centers
     
-    def state_outputs(self):
-        return ['agents']
+    # Beware: AgentState(PhysicalObject) builds bbox from 
+    # dims [-l/2,l/2] x [-w/2,w/2] x [0,h], not
+    # [-l/2,l/2] x [-w/2,w/2] x [-h/2,h/2]
+    def find_dims(self, clusters: List[List[np.ndarray]]) -> List[np.ndarray]:
+        # Add some small constant to height to compensate for
+        # objects distance to ground we filtered from lidar, 
+        # other heuristics to imrpove stability for find_ funcs ?
+        clusters = [np.array(clust) for clust in clusters]
+        dims = [np.array(()) if clust.size == 0 else np.max(clust, axis= 0) - np.min(clust, axis= 0) for clust in clusters]
+        return dims
+
+    # TODO: Slower but cleaner to input self.current_agents dict
+    # TODO: Moving Average across last N iterations pos/vel? Less spurious vals
+    # TODO Akul: Fix velocity calculation to calculate in ObjectFrameEnum.START
+    #            Work towards own tracking class instead of simple YOLO track?
+    #            Fix division by time
+    # ret: Dict[track_id: vel[x, y, z]]
+    def find_vels(self, track_ids: List[int], obj_centers: List[np.ndarray], obj_dims: List[np.ndarray]) -> Dict[int, np.ndarray]:
+        # Object not seen -> velocity = None
+        track_id_center_map = dict(zip(track_ids, obj_centers))
+        vels = defaultdict(lambda: np.array(())) # None is faster, np.array matches other find_ methods.
+
+        for prev_track_id, prev_agent in self.prev_agents.items():
+            if prev_track_id in track_ids:
+                # TODO: Add prev_agents to memory to avoid None velocity
+                # We should only be missing prev pose on first sight of track_id Agent.
+                # print("shape 1: ", track_id_center_map[prev_agent.track_id])
+                # print("shape 2: ", np.array([prev_agent.pose.x, prev_agent.pose.y, prev_agent.pose.z]))
+                # prev can be 3 separate Nones, current is just empty array... make this symmetrical
+                if prev_agent.pose.x and prev_agent.pose.y and prev_agent.pose.z and track_id_center_map[prev_agent.track_id].shape == 3:
+                    vels[prev_track_id] = (track_id_center_map[prev_track_id] - np.array([prev_agent.pose.x, prev_agent.pose.y, prev_agent.pose.z])) / (self.curr_time - self.prev_time)
+        return vels
     
-    def initialize(self):
+    # TODO: Separate debug/viz class, bbox and 2d 3d points funcs 
+    def viz_object_states(self, cv_image, boxes, extracted_pts_all):
+        # Extract 3D pedestrians points in lidar frame
+        # ** These are camera frame after transform_lidar_points, right?
+        # ** It was in camera frame before. I fixed it. Now they are in lidar frame!
+        pedestrians_3d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, -3:]) for extracted_pts in extracted_pts_all] 
 
-        """Initializes subscribers and publishers
+        # Object center viz
+        obj_3d_obj_centers = list()
+        obj_3d_obj_dims = list()
+        for track_id, agent in self.current_agents.items():
+            if agent.pose.x != None and agent.pose.y != None and agent.pose.z != None:
+                obj_3d_obj_centers.append((agent.pose.x, agent.pose.y, agent.pose.z)) # **
+            if agent.dimensions != None and agent.dimensions[0] != None and agent.dimensions[1] != None and agent.dimensions[2] != None:
+                obj_3d_obj_dims.append(agent.dimensions)
         
-        self.vehicle_interface.subscribe_sensor -> GEM Car camera subscription
-        self.rosbag_test -> ROS Bag topic subscription
-        self.pub_image -> Publishes Image with detection for visualization
+        # Extract 2D pedestrians points and bbox in camera frame
+        extracted_2d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, :2].astype(int)) for extracted_pts in extracted_pts_all]
+        flattened_pedestrians_2d_pts = list()
+        for pts in extracted_2d_pts:    flattened_pedestrians_2d_pts.extend(pts)
 
-        """
-        #tell the vehicle to use image_callback whenever 'front_camera' gets a reading, and it expects images of type cv2.Mat
+        for ind, bbox in enumerate(boxes):
+            xywh = bbox.xywh[0].tolist()
+            cv_image = vis_2d_bbox(cv_image, xywh, bbox)
+        
+        flattened_pedestrians_3d_pts = list() 
+        for pts in pedestrians_3d_pts: flattened_pedestrians_3d_pts.extend(pts)
+	
+        if len(flattened_pedestrians_3d_pts) > 0:
+            # Draw projected 2D LiDAR points on the image.
+            for pt in flattened_pedestrians_2d_pts:
+                cv2.circle(cv_image, pt, 2, (0, 0, 255), -1)
+            ros_img = self.bridge.cv2_to_imgmsg(cv_image, 'bgr8')
+            self.pub_image.publish(ros_img)  
 
-        # GEM Car subacriber
-        # self.vehicle_interface.subscribe_sensor('front_camera',self.image_callback,cv2.Mat)
+            # Draw 3D pedestrian pointclouds
+            if self.vehicle_frame:
+                # If in vehicle frame, tranform 3D pedestrians points to vehicle frame for better visualization.
+                flattened_pedestrians_3d_pts_vehicle = transform_lidar_points(np.array(flattened_pedestrians_3d_pts), self.R_lidar_to_vehicle, self.t_lidar_to_vehicle)
+                flattened_pedestrians_3d_pts = flattened_pedestrians_3d_pts_vehicle
 
-        # Webcam
-        # rospy.Subscriber('/webcam', Image, self.image_callback)
+            # Create point cloud from extracted 3D points
+            ros_extracted_pedestrian_pc2 = create_point_cloud(flattened_pedestrians_3d_pts)
+            self.pub_pedestrians_pc2.publish(ros_extracted_pedestrian_pc2)
 
-        # Testing with rosbag
-        self.rosbag_test = rospy.Subscriber('/oak/rgb/image_raw',Image, self.image_callback,queue_size=1)
-        if(self.visualization):
-            self.pub_image = rospy.Publisher("/camera/image_detection", Image, queue_size=1)
-        pass
-    
-    # Use cv2.Mat for GEM Car, Image for RosBag
-    def image_callback(self, image : Image): #image : cv2.Mat):
+        # Draw 3D pedestrian centers and dimensions
+        if len(obj_3d_obj_centers) > 0 and len(obj_3d_obj_dims) > 0:
+            # Draw 3D pedestrian center pointclouds
+            ros_obj_3d_obj_centers_pc2 = create_point_cloud(obj_3d_obj_centers, color=(0, 128, 0))
+            self.pub_obj_centers_pc2.publish(ros_obj_3d_obj_centers_pc2)
+
+            # Draw 3D pedestrian bboxes markers
+            # Create bbox marker from pedestrain dimensions
+            ros_delete_bboxes_markers = delete_bbox_marker()
+            self.pub_bboxes_markers.publish(ros_delete_bboxes_markers)
+            ros_pedestrians_bboxes_markers = create_bbox_marker(obj_3d_obj_centers, obj_3d_obj_dims)
+            self.pub_bboxes_markers.publish(ros_pedestrians_bboxes_markers)
+        
+
+    def update_object_states(self, track_result: List[Results], extracted_pts_all: List[np.ndarray]) -> None:
+        # self.prev_agents = self.current_agents.copy()
+        self.current_agents.clear()
+
+        # Return if no track results available
+        if track_result[0].boxes.id == None:
+            return
+
+        # Change pedestrians_3d_pts to dicts matching track_ids
+        track_ids = track_result[0].boxes.id.int().cpu().tolist()
+        num_objs = len(track_ids)
+        boxes = track_result[0].boxes
 
-        """Detects pedestrians using the model provided when new image is passed.
+        # Extract 3D pedestrians points in lidar frame
+        # ** These are camera frame after transform_lidar_points, right?
+        # ** It was in camera frame before. I fixed it. Now they are in lidar frame!
+        pedestrians_3d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, -3:]) for extracted_pts in extracted_pts_all] 
+        if len(pedestrians_3d_pts) != num_objs:
+            raise Exception('Perception - Camera detections, points clusters num. mismatch')
+        
+        # TODO: Slower but cleaner to pass dicts of AgentState
+        #       or at least {track_ids: centers/pts/etc}
+        # TODO: Combine funcs for efficiency in C.
+        #       Separate numpy prob still faster for now
+        obj_centers = self.find_centers(pedestrians_3d_pts) # Centers are calculated in lidar frame here
+        obj_dims = self.find_dims(pedestrians_3d_pts)
 
-        Converts Image.msg to cv2 format (Might need to change this to use cv2.Mat) and uses the model to detect pedestrian
-        IF visualization is true, will publish an image with pedestrians detected.
+        # If in vehicle frame, transform centers from top_lidar frame to vehicle frame
+        # Need to transform the center point one by one since matrix op can't deal with empty points
+        if self.vehicle_frame:
+            obj_centers_vehicle = []
+            for obj_center in obj_centers:
+                if len(obj_center) > 0:
+                    obj_center = np.array([obj_center])
+                    obj_center_vehicle = transform_lidar_points(obj_center, self.R_lidar_to_vehicle, self.t_lidar_to_vehicle)[0]
+                    obj_centers_vehicle.append(obj_center_vehicle)
+                else:
+                    obj_centers_vehicle.append(np.array(()))
+            obj_centers = obj_centers_vehicle
 
-        Hardcoded values for now:
-            Detected only pedestrians -> Class = 0
-            Confidence level -> 0.7
+        self.find_vels_and_ids(obj_centers, obj_dims)
+            
+    # TODO: Refactor to make more efficient
+    # TODO: Moving Average across last N iterations pos/vel? Less spurious vals
+    # TODO Fix velocity calculation to calculate in ObjectFrameEnum.START
+    def find_vels_and_ids(self, obj_centers: List[np.ndarray], obj_dims: List[np.ndarray]):
+        # Gate to check whether dims and centers are empty (will happen if no pedestrians are scanned):
+        if ((len(obj_dims) == 1 or len(obj_centers) == 1) and (obj_centers[0].size == 0 or obj_dims[0].size == 0)):
+            self.prev_agents = self.current_agents.copy()
+            self.current_agents = {} # There weren't any pedestrians detected
+            return
+        elif (len(obj_centers) != len(obj_dims)):
+            for i in range(25):
+                print("ERROR NUM PARINGS AND OBJ DIMS ARENT THE SAME")
+            self.prev_agents = self.current_agents.copy()
+            self.current_agents = {} # There weren't any pedestrians detected
+            return
         
-        """
+        new_prev_agents = {} # Stores current agents in START frame for next time through (since 
+                             # planning wants us to send them agents in CURRENT frame)
+
+        assigned = False
+        num_pairings = len(obj_centers)
+
+        # TODO: NEED TO STORE AND INCORPORATE TRANSFORMS SOMEHOW TO DEAL WITH MOVING CAR CASE
+        converted_centers = obj_centers # TODO: Add this in when finished self.convert_vehicle_frame_to_start_frame(obj_centers)
+
+        # Loop through the indexes of the obj_center and obj_dim pairings
+        for idx in range(num_pairings):
+            assigned = False
+
+            # Loop through previous agents backwards
+            for prev_id, prev_state in reversed(self.prev_agents.items()):
+                # If an obj_center and obj_dim pair overlaps with a previous agent, assume that they're the same agent
+                if self.agents_overlap(converted_centers[idx], obj_dims[idx], prev_state):
+                    assigned = True
+
+                    if self.prev_time == None:
+                        # This will be triggered if the very first message has pedestrians in it
+                        vel = np.array([0, 0, 0])
+                    else:
+                        delta_t = self.curr_time - self.prev_time
+                        vel = (converted_centers[idx] - np.array([prev_state.pose.x, prev_state.pose.y, prev_state.pose.z])) / delta_t.total_seconds()
+                    print("VELOCITY:")
+                    print(vel)
+
+                    self.current_agents[prev_id] = (
+                        AgentState(
+                            track_id = prev_id,
+                            pose=ObjectPose(t=0, x=obj_centers[idx][0], y=obj_centers[idx][1], z=obj_centers[idx][2], yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT),
+                            # (l, w, h)
+                            # TODO: confirm (z -> l, x -> w, y -> h)
+                            dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]),  
+                            outline=None,
+                            type=AgentEnum.PEDESTRIAN,
+                            activity=AgentActivityEnum.STOPPED if np.all(vel == 0) else AgentActivityEnum.MOVING,
+                            velocity=None if vel.size == 0 else tuple(vel),
+                            yaw_rate=0
+                        ))
+                    new_prev_agents[prev_id] = (
+                        AgentState(
+                            track_id = prev_id,
+                            pose=ObjectPose(t=0, x=converted_centers[idx][0], y=converted_centers[idx][1], z=converted_centers[idx][2], yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT),
+                            # (l, w, h)
+                            # TODO: confirm (z -> l, x -> w, y -> h)
+                            dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]),  
+                            outline=None,
+                            type=AgentEnum.PEDESTRIAN,
+                            activity=AgentActivityEnum.STOPPED if np.all(vel == 0) else AgentActivityEnum.MOVING,
+                            velocity=None if vel.size == 0 else tuple(vel),
+                            yaw_rate=0
+                        ))
+                    del self.prev_agents[prev_id] # Remove previous agent from previous agents
+                    break
+
+            # If not assigned:
+            if not assigned:
+                # Set velocity to 0 and assign the new agent a new id with IdTracker
+                id = self.id_tracker.get_new_id()
+                self.current_agents[id] = (
+                    AgentState(
+                        track_id = id,
+                        pose=ObjectPose(t=0, x=obj_centers[idx][0], y=obj_centers[idx][1], z=obj_centers[idx][2] ,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT),
+                        # (l, w, h)
+                        # TODO: confirm (z -> l, x -> w, y -> h)
+                        dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]),  
+                        outline=None,
+                        type=AgentEnum.PEDESTRIAN,
+                        activity=AgentActivityEnum.UNDETERMINED,
+                        velocity=None,
+                        yaw_rate=0
+                    ))
+                new_prev_agents[id] = (
+                    AgentState(
+                        track_id = id,
+                        pose=ObjectPose(t=0, x=converted_centers[idx][0], y=converted_centers[idx][1], z=converted_centers[idx][2] ,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT),
+                        # (l, w, h)
+                        # TODO: confirm (z -> l, x -> w, y -> h)
+                        dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]),  
+                        outline=None,
+                        type=AgentEnum.PEDESTRIAN,
+                        activity=AgentActivityEnum.UNDETERMINED,
+                        velocity=None,
+                        yaw_rate=0
+                    ))
+        self.prev_agents = new_prev_agents
+
+    # Calculates whether 2 agents overlap in START frame. True if they do, false if not
+    def agents_overlap(self, obj_center: np.ndarray, obj_dim: np.ndarray, prev_agent: AgentState) -> bool:
+        # Calculate corners of obj_center and obj_dim pairing
+        x1_min, x1_max = obj_center[0] - obj_dim[0] / 2.0, obj_center[0] + obj_dim[0] / 2.0
+        y1_min, y1_max = obj_center[1] - obj_dim[1] / 2.0, obj_center[1] + obj_dim[1] / 2.0 # CENTER CALCULATION
+        z1_min, z1_max = obj_center[2] - obj_dim[2] / 2.0, obj_center[2] + obj_dim[2] / 2.0
+
+        # Calculate corners of AgentState
+        # Beware: AgentState(PhysicalObject) builds bbox from 
+        # dims [-l/2,l/2] x [-w/2,w/2] x [0,h], not
+        # [-l/2,l/2] x [-w/2,w/2] x [-h/2,h/2]
+        # TODO: confirm (z -> l, x -> w, y -> h)
+        x2_min, x2_max = prev_agent.pose.x - prev_agent.dimensions[0] / 2.0, prev_agent.pose.x + prev_agent.dimensions[0] / 2.0
+        y2_min, y2_max = prev_agent.pose.y, prev_agent.pose.y + prev_agent.dimensions[1] # AGENT STATE CALCULATION
+        z2_min, z2_max = prev_agent.pose.z - prev_agent.dimensions[2] / 2.0, prev_agent.pose.z + prev_agent.dimensions[2] / 2.0
+        
+        # True if they overlap, false if not
+        return (
+            ( (x1_min <= x2_min and x2_min <= x1_max) or (x2_min <= x1_min and x1_min <= x2_max) ) and
+            ( (y1_min <= y2_min and y2_min <= y1_max) or (y2_min <= y1_min and y1_min <= y2_max) ) and
+            ( (z1_min <= z2_min and z2_min <= z1_max) or (z2_min <= z1_min and z1_min <= z2_max) )
+        )
+
+    def convert_vehicle_frame_to_start_frame(self, vehicle_states: List[AgentState]) -> List[AgentState]:
+        # num_states = len(vehicle_states)
+        # for idx in num_states:
+        #     num_states[idx].to_frame()
+        pass
+
 
-        # Use Image directly for GEM Car
-        # track_result = self.detector.track(source=image, classes=self.classes_to_detect, persist=True, conf=self.confidence)
+    def ouster_oak_callback(self, rgb_image_msg: Image, lidar_pc2_msg: PointCloud2):
+        # Update times for basic velocity calculation
+        self.prev_time = self.curr_time
+        self.curr_time = datetime.now()
 
-        # Convert to CV2 format for RosBag
-        bridge = CvBridge() 
-        image = bridge.imgmsg_to_cv2(image, "bgr8") 
-        track_result = self.detector.track(source=image, classes=self.classes_to_detect, persist=True, conf=self.confidence)
+        # Convert to cv2 image and run detector
+        cv_image = self.bridge.imgmsg_to_cv2(rgb_image_msg, "bgr8") 
+        track_result = self.detector.track(source=cv_image, classes=self.classes_to_detect, persist=True, conf=self.confidence)
 
-        self.last_person_boxes = []
+        # Convert 1D PointCloud2 data to x, y, z coords
+        lidar_points = convert_pointcloud2_to_xyz(lidar_pc2_msg)
+        # print("len lidar_points", len(lidar_points))
+
+        # Downsample xyz point clouds
+        downsampled_points = downsample_points(lidar_points)
+        # print("len downsampled_points", len(downsampled_points))
+        
+        # Transform LiDAR points into the camera coordinate frame.
+        lidar_in_camera = transform_lidar_points(downsampled_points, self.R_lidar_to_oak, self.t_lidar_to_oak)
+        # print("len lidar_in_camera", len(lidar_in_camera))
+
+        # Project the transformed points into the image plane.
+        projected_pts = project_points(lidar_in_camera, self.K, downsampled_points)
+        # print("projected_pts", len(projected_pts))
+
+        # Process bboxes
         boxes = track_result[0].boxes
 
-        # Used for visualization
-        if(self.visualization):
-            label_text = "Pedestrian "
-            font = cv2.FONT_HERSHEY_SIMPLEX
-            font_scale = 0.5
-            font_color = (255, 255, 255)  # White text
-            outline_color = (0, 0, 0)  # Black outline
-            line_type = 1
-            text_thickness = 2 # Text thickness
-            outline_thickness = 1  # Thickness of the text outline
-
-        # Unpacking box dimentions detected into x,y,w,h
-        for box in boxes:
-
-            xywh = box.xywh[0].tolist()
-            self.last_person_boxes.append(xywh)
+        extracted_pts_all = list()
+
+        for ind, bbox in enumerate(boxes):
+            xywh = bbox.xywh[0].tolist()
+
+            # Extracting projected pts
             x, y, w, h = xywh
-            id = box.id.item()
-
-            # Stores AgentState in a dict, can be removed if not required
-            pose = ObjectPose(t=0,x=x,y=y,z=0,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT)
-            dims = (w,h,0)
-            if(id not in self.pedestrians.keys()):
-                self.pedestrians[id] = AgentState(pose=pose,dimensions=dims,outline=None,type=AgentEnum.PEDESTRIAN,activity=AgentActivityEnum.MOVING,velocity=(0,0,0),yaw_rate=0)
-            else:
-                self.pedestrians[id].pose = pose
-                self.pedestrians[id].dims = dims
-
-            # Used for visualization
-            if(self.visualization):
-                # Draw bounding box
-                cv2.rectangle(image, (int(x - w / 2), int(y - h / 2)), (int(x + w / 2), int(y + h / 2)), (255, 0, 255), 3)
-
-                # Define text label
-                x = int(x - w / 2)
-                y = int(y - h / 2)
-                label = label_text + str(id) + " : " + str(round(box.conf.item(), 2))
-
-                # Get text size
-                text_size, baseline = cv2.getTextSize(label, font, font_scale, line_type)
-                text_w, text_h = text_size
-
-                # Position text above the bounding box
-                text_x = x
-                text_y = y - 10 if y - 10 > 10 else y + h + text_h
-
-                # Draw text outline for better visibility, uncomment for outline
-                # for dx, dy in [(-1, -1), (-1, 1), (1, -1), (1, 1)]:  
-                #     cv2.putText(image, label, (text_x + dx, text_y - baseline + dy), font, font_scale, outline_color, outline_thickness)
-
-                # Draw main text on top of the outline
-                cv2.putText(image, label, (text_x, text_y - baseline), font, font_scale, font_color, text_thickness)
+            left_bound = int(x - w / 2)
+            right_bound = int(x + w / 2)
+            top_bound = int(y - h / 2)
+            bottom_bound = int(y + h / 2)
+
+            pts = np.array(projected_pts)
+            # Checking projected_pts length is very important
+            if len(projected_pts) > 0:
+                extracted_pts = pts[(pts[:, 0] > left_bound) &
+                                    (pts[:, 0] < right_bound) &
+                                    (pts[:, 1] > top_bound) &
+                                    (pts[:, 1] < bottom_bound)
+                                    ]
 
+                # Apply ground and max distance filter to the extracted 5D points
+                extracted_pts = filter_ground_points(extracted_pts, self.ground_threshold)
+                extracted_pts = filter_depth_points(extracted_pts, self.max_human_depth)
+                extracted_pts_all.append(extracted_pts)
+            else: extracted_pts_all.append(np.array(()))
         
-        # Used for visualization
-        if(self.visualization):
-            ros_img = bridge.cv2_to_imgmsg(image, 'bgr8')
-            self.pub_image.publish(ros_img)
-
-        #uncomment if you want to debug the detector...
-        # print(self.last_person_boxes)
-        # print(self.pedestrians.keys())
-        #for bb in self.last_person_boxes:
-        #    x,y,w,h = bb
-        #    cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3)
-        #cv2.imwrite("pedestrian_detections.png",image)
+        self.update_object_states(track_result, extracted_pts_all)
+        if self.debug: self.viz_object_states(cv_image, boxes, extracted_pts_all)
+
+
+    def rate(self):
+        return 4.0
     
-    def update(self, vehicle : VehicleState) -> Dict[str,AgentState]:
-        res = {}
-        for i,b in enumerate(self.last_person_boxes):
-            x,y,w,h = b
-            res['pedestrian'+str(i)] = box_to_fake_agent(b)
-        if len(res) > 0:
-            print("Detected",len(res),"pedestrians")
-        return res
+    def state_inputs(self):
+        return ['vehicle']
+    
+    def state_outputs(self):
+        return ['agents']
 
 
 class FakePedestrianDetector2D(Component):
diff --git a/GEMstack/onboard/perception/pedestrian_detection_utils.py b/GEMstack/onboard/perception/pedestrian_detection_utils.py
new file mode 100644
index 000000000..788ee2e54
--- /dev/null
+++ b/GEMstack/onboard/perception/pedestrian_detection_utils.py
@@ -0,0 +1,244 @@
+from sensor_msgs.msg import PointCloud2, PointField
+from visualization_msgs.msg import Marker, MarkerArray
+import numpy as np
+import sensor_msgs.point_cloud2 as pc2
+import open3d as o3d
+import cv2
+import json
+import rospy
+import numpy as np
+import struct
+
+
+def convert_pointcloud2_to_xyz(lidar_pc2_msg: PointCloud2):
+    """ Convert 1D PointCloud2 data to x, y, z coords """
+    return np.array(list(pc2.read_points(lidar_pc2_msg, skip_nans=True)), dtype=np.float32)[:, :3]
+
+
+def downsample_points(lidar_points):
+    """ Downsample point clouds """
+    # Convert numpy array to Open3D point cloud
+    pcd = o3d.geometry.PointCloud()
+    pcd.points = o3d.utility.Vector3dVector(lidar_points)
+
+    # Apply voxel grid downsampling
+    voxel_size = 0.1  # Adjust for desired resolution
+    downsampled_pcd = pcd.voxel_down_sample(voxel_size=voxel_size)
+    
+    # Convert back to numpy array
+    transformed_points = np.asarray(downsampled_pcd.points)
+    return transformed_points
+
+
+def filter_ground_points(lidar_points, ground_threshold = 0):
+    """ Filter points given an elevation of ground threshold """
+    filtered_array = lidar_points[lidar_points[:, 4] > ground_threshold]
+    return filtered_array
+
+
+def filter_depth_points(lidar_points, max_human_depth=0.9):
+    """ Filter points beyond a max possible human depth in a point cluster """
+    if lidar_points.shape[0] == 0: return lidar_points
+    lidar_points_dist = lidar_points[:, 2]
+    min_dist = np.min(lidar_points_dist)
+    max_dist = np.max(lidar_points_dist)
+    max_possible_dist = min_dist + max_human_depth
+    actual_dist = min(max_dist, max_possible_dist)
+    filtered_array = lidar_points[lidar_points_dist < actual_dist]
+    return filtered_array
+
+
+# Credits: The following lines of codes (from 33 to 92) are adapted from the Calibration Team B
+def load_extrinsics(extrinsics_file):
+    """
+    Load calibrated extrinsics from a .npz file.
+    Assumes the file contains keys 'R' and 't'.
+    """
+    data = np.load(extrinsics_file)
+    return data
+
+
+def load_intrinsics(intrinsics_file):
+    """
+    Load camera intrinsics from a JSON file.
+    Expects keys: 'fx', 'fy', 'cx', and 'cy'.
+    """
+    with open(intrinsics_file, 'r') as f:
+        intrinsics = json.load(f)
+    fx = intrinsics["fx"]
+    fy = intrinsics["fy"]
+    cx = intrinsics["cx"]
+    cy = intrinsics["cy"]
+    K = np.array([[fx, 0, cx],
+                  [0, fy, cy],
+                  [0,  0,  1]], dtype=np.float32)
+    return K
+
+
+def load_lidar_scan(lidar_file):
+    """
+    Load a LiDAR scan from a file.
+    This function handles both npy (returns a NumPy array) and npz files.
+    """
+    data = np.load(lidar_file, allow_pickle=True)
+    if isinstance(data, np.ndarray):
+        return data.astype(np.float32)
+    else:
+        key = list(data.keys())[0]
+        return data[key].astype(np.float32)
+
+
+def transform_lidar_points(lidar_points, R, t):
+    """
+    Transform LiDAR points from the LiDAR frame into the camera frame.
+    p_cam = R * p_lidar + t.
+    """
+    P_cam = (R @ lidar_points.T + t.reshape(3,1)).T
+    return P_cam
+
+
+def project_points(points_3d, K, lidar_points):
+    """
+    Project 3D points (in the camera frame) into 2D image coordinates using the camera matrix K.
+    Only projects points with z > 0.
+    """
+    proj_points = []
+    for pt, l_pt in zip(points_3d, lidar_points):
+        if pt[2] > 0:  # only project points in front of the camera
+            u = K[0, 0] * (pt[0] / pt[2]) + K[0, 2]
+            v = K[1, 1] * (pt[1] / pt[2]) + K[1, 2]
+            # 5D data point
+            proj_points.append((int(u), int(v), l_pt[0], l_pt[1], l_pt[2]))
+    return np.array(proj_points)
+
+
+def vis_2d_bbox(image, xywh, box):
+    # Setup
+    label_text = "Pedestrian "
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.5
+    font_color = (255, 255, 255)
+    line_type = 1
+    text_thickness = 2
+
+    x, y, w, h = xywh
+
+    if box.id is not None:
+        id = box.id.item()
+    else:
+        id = 0
+
+    # Draw bounding box
+    cv2.rectangle(image, (int(x - w / 2), int(y - h / 2)), (int(x + w / 2), int(y + h / 2)), (255, 0, 255), 3)
+
+    # Define text label
+    x = int(x - w / 2)
+    y = int(y - h / 2)
+    label = label_text + str(id) + " : " + str(round(box.conf.item(), 2))
+
+    # Get text size
+    text_size, baseline = cv2.getTextSize(label, font, font_scale, line_type)
+    text_w, text_h = text_size
+
+    # Position text above the bounding box
+    text_x = x
+    text_y = y - 10 if y - 10 > 10 else y + h + text_h
+
+    # Draw main text on top of the outline
+    cv2.putText(image, label, (text_x, text_y - baseline), font, font_scale, font_color, text_thickness)
+    return image
+
+
+def create_point_cloud(points, color=(255, 0, 0)):
+    """
+    Converts a list of (x, y, z) points into a PointCloud2 message.
+    """
+    header = rospy.Header()
+    header.stamp = rospy.Time.now()
+    header.frame_id = "os_sensor"  # Change to your TF frame
+
+    fields = [
+        PointField(name="x", offset=0, datatype=PointField.FLOAT32, count=1),
+        PointField(name="y", offset=4, datatype=PointField.FLOAT32, count=1),
+        PointField(name="z", offset=8, datatype=PointField.FLOAT32, count=1),
+        PointField(name="rgb", offset=12, datatype=PointField.FLOAT32, count=1),
+    ]
+
+    # Convert RGB color to packed float32
+    r, g, b = color
+    packed_color = struct.unpack('f', struct.pack('I', (r << 16) | (g << 8) | b))[0]
+
+    point_cloud_data = [(x, y, z, packed_color) for x, y, z in points]
+
+    return pc2.create_cloud(header, fields, point_cloud_data)
+
+
+def create_bbox_marker(centroids, dimensions):
+    """
+    Create 3D bbox markers from centroids and dimensions
+    """
+    marker_array = MarkerArray()
+
+    for i, (centroid, dimension) in enumerate(zip(centroids, dimensions)):
+        # Skip if no centroid or dimension
+        if (centroid == None) or (dimension == None):
+            continue
+
+        marker = Marker()
+        marker.header.frame_id = "os_sensor"  # Reference frame
+        marker.header.stamp = rospy.Time.now()
+        marker.ns = "bounding_boxes"
+        marker.id = i  # Unique ID for each marker
+        marker.type = Marker.CUBE  # Cube for bounding box
+        marker.action = Marker.ADD
+
+        # Position (center of the bounding box)
+        c_x, c_y, c_z = centroid
+        if (not isinstance(c_x, float)) or (not isinstance(c_y, float)) or (not isinstance(c_z, float)):
+            continue
+
+        marker.pose.position.x = c_x
+        marker.pose.position.y = c_y
+        marker.pose.position.z = c_z
+
+        # Orientation (default, no rotation)
+        marker.pose.orientation.x = 0.0
+        marker.pose.orientation.y = 0.0
+        marker.pose.orientation.z = 0.0
+        marker.pose.orientation.w = 1.0
+
+        # Bounding box dimensions
+        d_x, d_y, d_z = dimension
+        if (not isinstance(d_x, float)) or (not isinstance(d_y, float)) or (not isinstance(d_z, float)):
+            continue
+
+        marker.scale.x = d_x
+        marker.scale.y = d_y
+        marker.scale.z = d_z
+
+        # Random colors for each bounding box
+        marker.color.r = 0.0  # Varying colors
+        marker.color.g = 1.0
+        marker.color.b = 1.5
+        marker.color.a = 0.2  # Transparency
+
+        marker.lifetime = rospy.Duration()  # Persistent
+        marker_array.markers.append(marker)
+    return marker_array
+
+
+def delete_bbox_marker():
+    """
+    Delete 3D bbox markers given ID ranges
+    """
+    marker_array = MarkerArray()
+    for i in range(6):
+        marker = Marker()
+        marker.ns = "bounding_boxes"
+        marker.id = i
+        marker.action = Marker.DELETE
+        marker_array.markers.append(marker)
+    return marker_array
+
+
+    
diff --git a/GEMstack/onboard/perception/transform.py b/GEMstack/onboard/perception/transform.py
new file mode 100644
index 000000000..8032752b1
--- /dev/null
+++ b/GEMstack/onboard/perception/transform.py
@@ -0,0 +1,21 @@
+import rospy
+import tf
+import numpy as np
+
+def publish_tf():
+    rospy.init_node('pointcloud_tf_broadcaster')
+    br = tf.TransformBroadcaster()
+    rate = rospy.Rate(10)  # 10 Hz
+
+    while not rospy.is_shutdown():
+        br.sendTransform(
+            (0, 0, 0),  # (x, y, z) translation
+            tf.transformations.quaternion_from_euler(0, 0, 0),  # (roll, pitch, yaw)
+            rospy.Time.now(),
+            "os_sensor",  # Child frame (sensor)
+            "map"  # Parent frame (world)
+        )
+        rate.sleep()
+
+if __name__ == "__main__":
+    publish_tf()
\ No newline at end of file
diff --git a/GEMstack/onboard/planning/longitudinal_planning.py b/GEMstack/onboard/planning/longitudinal_planning.py
index c3f3bf737..e862239fe 100644
--- a/GEMstack/onboard/planning/longitudinal_planning.py
+++ b/GEMstack/onboard/planning/longitudinal_planning.py
@@ -77,7 +77,7 @@ def update(self, state : AllState):
         #parse the relations indicated
         should_brake = False
         for r in state.relations:
-            if r.type == EntityRelationEnum.YIELD and r.obj1 == '':
+            if r.type == EntityRelationEnum.YIELDING and r.obj1 == '':
                 #yielding to something, brake
                 should_brake = True
         should_accelerate = (not should_brake and curr_v < self.desired_speed)
diff --git a/GEMstack/state/agent.py b/GEMstack/state/agent.py
index 757805837..b0ad7aaad 100644
--- a/GEMstack/state/agent.py
+++ b/GEMstack/state/agent.py
@@ -27,6 +27,10 @@ class AgentState(PhysicalObject):
     activity : AgentActivityEnum
     velocity : Tuple[float,float,float]     #estimated velocity in x,y,z, m/s and in agent's local frame
     yaw_rate : float                        #estimated yaw rate, in radians/s
+    # Added by Perception for persistent objs across frames
+    # Check track_id to verify this is the same Agent you've
+    # seen before
+    track_id : int      
 
     def velocity_local(self) -> Tuple[float,float,float]:
         """Returns velocity in m/s in the agent's local frame."""
diff --git a/homework/person_detector.py b/homework/person_detector.py
deleted file mode 100644
index 4003d0823..000000000
--- a/homework/person_detector.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import cv2
-import sys
-from ultralytics import YOLO
-
-def person_detector(img : cv2.Mat):
-    #TODO: implement me to produce a list of (x,y,w,h) bounding boxes of people in the image
-    model = YOLO("yolo11n.pt")
-    results = model(img)
-    cls = results[0].boxes.cls.cpu().numpy()
-    bboxes = results[0].boxes.xywh.cpu().numpy()
-    detection = []
-    for k in range(len(cls)):
-        category = cls[k]
-        b = bboxes[k]
-        bb = (b[0], b[1], b[2], b[3])
-        if category == 0:
-            detection.append(bb)
-    return detection
-
-def main(fn):
-    image = cv2.imread(fn)
-    bboxes = person_detector(image)
-    print("Detected",len(bboxes),"people")
-    for bb in bboxes:
-        x,y,w,h = bb
-        if not isinstance(x,(int,float)) or not isinstance(y,(int,float)) or not isinstance(w,(int,float)) or not isinstance(h,(int,float)):
-            print("WARNING: make sure to return Python numbers rather than PyTorch Tensors")
-        print("Corner",(x,y),"size",(w,h))
-        cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3)
-    cv2.imshow('Results', image)
-    cv2.waitKey(0)
-
-def main_webcam(): 
-    cap = cv2.VideoCapture(0)
-    cap.set(3, 640)
-    cap.set(4, 480)
-
-    print("Press space to exit")
-    while True:
-        _, image = cap.read()
-        
-        bboxes = person_detector(image)
-        for bb in bboxes:
-            x,y,w,h = bb
-            cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3)
-              
-        cv2.imshow('Person detection', image)     
-        if cv2.waitKey(1) & 0xFF == ord(' '):
-            break
-
-    cap.release()
-
-
-if __name__ == '__main__':
-    fn = sys.argv[1]
-    if fn != 'webcam':
-        main(fn)
-    else:
-        main_webcam()
\ No newline at end of file
diff --git a/launch/pedestrian_detection.yaml b/launch/pedestrian_detection.yaml
index 0fffa89fc..1a00fa797 100644
--- a/launch/pedestrian_detection.yaml
+++ b/launch/pedestrian_detection.yaml
@@ -94,4 +94,6 @@ variants:
             visualization: !include "klampt_visualization.yaml"
             drive: 
                 perception:
+                    agent_detection : pedestrian_detection.FakePedestrianDetector2D
+                    #agent_detection : OmniscientAgentDetector  #this option reads agents from the simulator
                     state_estimation : OmniscientStateEstimator
diff --git a/run_docker_container.sh b/run_docker_container.sh
index c987a9016..ac0f998b3 100644
--- a/run_docker_container.sh
+++ b/run_docker_container.sh
@@ -3,6 +3,6 @@
 if [ "$(docker ps -q -f name=gem_stack-container)" ]; then
     docker exec -it gem_stack-container bash
 else
-    docker compose -f setup/docker-compose.yaml up -d
+    UID=$(id -u) GID=$(id -g) docker compose -f setup/docker-compose.yaml up -d
     docker exec -it gem_stack-container bash
 fi
\ No newline at end of file
diff --git a/setup/Dockerfile.cuda12 b/setup/Dockerfile.cuda12
index ffafaaa3f..32005202f 100644
--- a/setup/Dockerfile.cuda12
+++ b/setup/Dockerfile.cuda12
@@ -18,7 +18,7 @@ RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \
     && apt-get update && apt-get install -y tzdata \
     && rm -rf /var/lib/apt/lists/*
 
-# install Zed SDK
+# install Zed SDK. If you want to install a different version, change to this https://download.stereolabs.com/zedsdk/4.2/cu12/ubuntu20
 RUN wget https://stereolabs.sfo2.cdn.digitaloceanspaces.com/zedsdk/4.2/ZED_SDK_Ubuntu20_cuda12.1_v4.2.4.zstd.run -O zed_sdk.run
 RUN chmod +x zed_sdk.run
 RUN ./zed_sdk.run -- silent
diff --git a/setup/docker-compose.yaml b/setup/docker-compose.yaml
index 8c43cfcaf..f0c7703b8 100644
--- a/setup/docker-compose.yaml
+++ b/setup/docker-compose.yaml
@@ -22,6 +22,11 @@ services:
       - "/tmp/.X11-unix:/tmp/.X11-unix:rw"
     environment:
       - DISPLAY=${DISPLAY}
+      - XDG_RUNTIME_DIR=/tmp/runtime-${USER}
+      - DBUS_SYSTEM_BUS_ADDRESS=unix:path=/var/run/dbus/system_bus_socket
+      - DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/${UID}/bus
+      - NVIDIA_DRIVER_CAPABILITIES=all
+      - NVIDIA_VISIBLE_DEVICES=all
       # - LIBGL_ALWAYS_SOFTWARE=1 # Uncomment if you want to use software rendering (No GPU)
     network_mode: host
     ipc: host
diff --git a/setup/setup_this_machine.sh b/setup/setup_this_machine.sh
index 5c9918866..36690f1d4 100755
--- a/setup/setup_this_machine.sh
+++ b/setup/setup_this_machine.sh
@@ -16,17 +16,25 @@ fi
 source /opt/ros/noetic/setup.bash
 
 #install Zed SDK
-echo "Select CUDA version:"
+echo "To install the ZED SDK, select the CUDA version:"
 echo "1) CUDA 11.8"
 echo "2) CUDA 12+"
-read -p "Enter choice [1-2]: " choice
+echo "3) No GPU (Skip ZED SDK installation)"
+read -p "Enter choice [1-3]: " choice
 
 case $choice in
     1)
         wget https://download.stereolabs.com/zedsdk/4.0/cu118/ubuntu20 -O zed_sdk.run
+        chmod +x zed_sdk.run
+        ./zed_sdk.run -- silent
         ;;
     2)
         wget https://stereolabs.sfo2.cdn.digitaloceanspaces.com/zedsdk/4.2/ZED_SDK_Ubuntu20_cuda12.1_v4.2.4.zstd.run -O zed_sdk.run
+        chmod +x zed_sdk.run
+        ./zed_sdk.run -- silent
+        ;;
+    3)
+        echo "Skipping ZED SDK installation..."
         ;;
     *)
         echo "Invalid choice"
@@ -34,14 +42,12 @@ case $choice in
         ;;
 esac
 
-chmod +x zed_sdk.run
-./zed_sdk.run -- silent
-
 #create ROS Catkin workspace
 mkdir -p ~/catkin_ws/src
 
 # Store current working directory
 CURRENT_DIR=$(pwd)
+echo "CURRENT_DIR: $CURRENT_DIR"
 
 #install ROS dependencies and packages
 cd ~/catkin_ws/src
@@ -57,9 +63,26 @@ catkin_make -DCMAKE_BUILD_TYPE=Release
 source devel/setup.bash
 
 cd $CURRENT_DIR
-cd ..
 #install GEMstack Python dependencies
-python3 -m pip install -r requirements.txt
+
+# Ask the user if they want to install ultralytics
+read -p "Do you want to install ultralytics? (y/n): " install_ultralytics
+
+# Create a temporary requirements file
+temp_requirements="temp_requirements.txt"
+
+# Copy all lines except ultralytics if the user chooses to skip it
+if [ "$install_ultralytics" == "y" ]; then
+    cp requirements.txt $temp_requirements
+else
+    grep -v "ultralytics" requirements.txt > $temp_requirements
+fi
+
+# Install the packages from the temporary requirements file
+pip3 install -r $temp_requirements
+
+# Clean up the temporary file
+rm $temp_requirements
 
 #install other dependencies
 sudo apt-get install -y ros-noetic-septentrio-gnss-driver
\ No newline at end of file
diff --git a/webcam.py b/webcam.py
index 10c3452d4..52f1b36a8 100644
--- a/webcam.py
+++ b/webcam.py
@@ -9,7 +9,7 @@ def webcam_publisher():
     
     image_pub = rospy.Publisher('/webcam', Image, queue_size=10)
     
-    cap = cv2.VideoCapture(0)  # 0 表示默认摄像头
+    cap = cv2.VideoCapture(0)
     if not cap.isOpened():
         rospy.logerr("cannot open camera")
         return