diff --git a/GEMstack/onboard/perception/IdTracker.py b/GEMstack/onboard/perception/IdTracker.py new file mode 100644 index 000000000..15641fa00 --- /dev/null +++ b/GEMstack/onboard/perception/IdTracker.py @@ -0,0 +1,12 @@ +class IdTracker(): + """Abstracts out id tracking + """ + def __init__(self): + self.__id = 0 + + def get_new_id(self) -> int: + """Returns a unique agent id + """ + assigned_id = self.__id + self.__id += 1 # id will intentionally overflow to get back to 0 + return assigned_id diff --git a/GEMstack/onboard/perception/calibration/camera_intrinsics.json b/GEMstack/onboard/perception/calibration/camera_intrinsics.json new file mode 100644 index 000000000..74318680c --- /dev/null +++ b/GEMstack/onboard/perception/calibration/camera_intrinsics.json @@ -0,0 +1,7 @@ +{ + "fx": 684.8333129882812, + "fy": 684.6096801757812, + "cx": 573.37109375, + "cy": 363.700927734375 + } + \ No newline at end of file diff --git a/GEMstack/onboard/perception/calibration/extrinsics/R.npy b/GEMstack/onboard/perception/calibration/extrinsics/R.npy new file mode 100644 index 000000000..7d44aa76d Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/R.npy differ diff --git a/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy b/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy new file mode 100644 index 000000000..62d1809f1 Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/rvec.npy differ diff --git a/GEMstack/onboard/perception/calibration/extrinsics/t.npy b/GEMstack/onboard/perception/calibration/extrinsics/t.npy new file mode 100644 index 000000000..b133fa55f Binary files /dev/null and b/GEMstack/onboard/perception/calibration/extrinsics/t.npy differ diff --git a/GEMstack/onboard/perception/pedestrian_detection.py b/GEMstack/onboard/perception/pedestrian_detection.py index f91145274..2c24f3bef 100644 --- a/GEMstack/onboard/perception/pedestrian_detection.py +++ b/GEMstack/onboard/perception/pedestrian_detection.py @@ -1,13 +1,56 @@ -from ...state import AllState,VehicleState,ObjectPose,ObjectFrameEnum,AgentState,AgentEnum,AgentActivityEnum +""" +Top ouster lidar + Oak front camera fusion, object detection +""" +""" +Terminal 1: +--------------- +source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash +roscore + +Terminal 2: +--------------- +source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash +python3 GEMstack/onboard/perception/transform.py + +Terminal 3: +--------------- +source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash +rosbag play -l ~/rosbags/vehicle.bag + +Terminal 4: +--------------- +source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash +cd GEMStack +python3 main.py --variant=detector_only launch/pedestrian_detection.yaml + +Terminal 5: +--------------- +source /opt/ros/noetic/setup.bash && source /catkin_ws/devel/setup.bash +rviz +""" + +# Python +import os +from typing import List, Dict +from collections import defaultdict +from datetime import datetime +# ROS, CV +import rospy +import message_filters +import tf +from cv_bridge import CvBridge +from sensor_msgs.msg import Image, PointCloud2 +from visualization_msgs.msg import MarkerArray +# YOLO +from ultralytics import YOLO +from ultralytics.engine.results import Results, Boxes +# GEMStack +from ...state import AllState,VehicleState,ObjectPose,ObjectFrameEnum,AgentState,AgentEnum,AgentActivityEnum,ObjectFrameEnum +from .pedestrian_detection_utils import * from ..interface.gem import GEMInterface from ..component import Component -from ultralytics import YOLO -import cv2 -from typing import Dict -from cv_bridge import CvBridge -from sensor_msgs.msg import Image -import rospy -import os +from .IdTracker import IdTracker + def box_to_fake_agent(box): @@ -22,158 +65,408 @@ def box_to_fake_agent(box): class PedestrianDetector2D(Component): - """Detects pedestrians. - - self.vehicle -> GEM car info - self.detector -> pedestrian detection model - self.last_person_boxes -> used to store boxes detected in every frame of the video - self.pedestrians -> stores the agentstate (pedestrian) found during the video in a dict, not sure if required can be removed - self.visualization -> enable this to view pedestrian detected - self.confidence -> min model confidence level - self.classes_to_detect -> Classes for the model to detect - - """ - def __init__(self,vehicle_interface : GEMInterface): + # TODO: Pull params into a JSON/yaml + # TODO: Convert some lists into np.arrays, vectorize calculations + # TODO: Implement logging instead of print, cleanup comments + # TODO: Cleanup funcs + split into separate classes + # TODO: Decide if we want to name dets "peds" or "objs"/"agents" + # Maybe peds for now and Agents in agent_detection.py? + def __init__(self, vehicle_interface : GEMInterface) -> None: self.vehicle_interface = vehicle_interface - self.detector = YOLO(os.getcwd()+'/GEMstack/knowledge/detection/yolov8n.pt') # change to get model value from sys arg - self.last_person_boxes = [] - self.pedestrians = {} - self.visualization = True # Set this to true for visualization, later change to get value from sys arg + # Publish debug/viz rostopics if true + self.debug = True + # Setup variables + self.bridge = CvBridge() + # TODO: Wrap detector into GEMDetector? + self.detector = YOLO(os.getcwd()+'/GEMstack/knowledge/detection/yolov8n.pt') + # track_id: AgentState + self.prev_agents = dict() + self.current_agents = dict() + # TODO Implement time self.confidence = 0.7 self.classes_to_detect = 0 + self.ground_threshold = -2.0 + self.max_human_depth = 0.9 + self.vehicle_frame = True # Indicate whether pedestrians centroids and point clouds are in the vehicle frame - def rate(self): - return 4.0 + # Load calibration data + # TODO: Maybe lets add one word or link what R t K are? + self.R_lidar_to_oak = load_extrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/extrinsics/R.npy') + self.t_lidar_to_oak = load_extrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/extrinsics/t.npy') + self.K = load_intrinsics(os.getcwd() + '/GEMstack/onboard/perception/calibration/camera_intrinsics.json') + + self.R_lidar_to_vehicle = np.array([[0.9995121, 0.02132941, 0.02281911], + [-0.02124771, 0.99976695, -0.00381707], + [-0.02289521, 0.00333035, 0.9997323]]) + self.t_lidar_to_vehicle = np.array([[0.0], [0.0], [0.35]]) + + # Subscribers and sychronizers + self.rgb_rosbag = message_filters.Subscriber('/oak/rgb/image_raw', Image) + self.top_lidar_rosbag = message_filters.Subscriber('/ouster/points', PointCloud2) + self.sync = message_filters.ApproximateTimeSynchronizer([self.rgb_rosbag, self.top_lidar_rosbag], queue_size=10, slop=0.1) + self.sync.registerCallback(self.ouster_oak_callback) + + # TF listener to get transformation from LiDAR to Camera + self.tf_listener = tf.TransformListener() + + if self.debug: self.init_debug() + + self.prev_time = None # Time in seconds since last scan for basic velocity calculation + self.curr_time = None # Time in seconds of current scan for basic velocity calculation + self.id_tracker = IdTracker() - def state_inputs(self): - return ['vehicle'] + def init_debug(self,) -> None: + # Debug Publishers + self.pub_pedestrians_pc2 = rospy.Publisher("/point_cloud/pedestrians", PointCloud2, queue_size=10) + self.pub_obj_centers_pc2 = rospy.Publisher("/point_cloud/obj_centers", PointCloud2, queue_size=10) + self.pub_bboxes_markers = rospy.Publisher("/markers/bboxes", MarkerArray, queue_size=10) + self.pub_image = rospy.Publisher("/camera/image_detection", Image, queue_size=1) + + def update(self, vehicle : VehicleState) -> Dict[str,AgentState]: + return self.current_agents + + # TODO: Improve Algo Knn, ransac, etc. + def find_centers(self, clusters: List[List[np.ndarray]]) -> List[np.ndarray]: + clusters = [np.array(clust) for clust in clusters] + centers = [np.array(()) if clust.size == 0 else np.mean(clust, axis=0) for clust in clusters] + return centers - def state_outputs(self): - return ['agents'] + # Beware: AgentState(PhysicalObject) builds bbox from + # dims [-l/2,l/2] x [-w/2,w/2] x [0,h], not + # [-l/2,l/2] x [-w/2,w/2] x [-h/2,h/2] + def find_dims(self, clusters: List[List[np.ndarray]]) -> List[np.ndarray]: + # Add some small constant to height to compensate for + # objects distance to ground we filtered from lidar, + # other heuristics to imrpove stability for find_ funcs ? + clusters = [np.array(clust) for clust in clusters] + dims = [np.array(()) if clust.size == 0 else np.max(clust, axis= 0) - np.min(clust, axis= 0) for clust in clusters] + return dims + + # TODO: Slower but cleaner to input self.current_agents dict + # TODO: Moving Average across last N iterations pos/vel? Less spurious vals + # TODO Akul: Fix velocity calculation to calculate in ObjectFrameEnum.START + # Work towards own tracking class instead of simple YOLO track? + # Fix division by time + # ret: Dict[track_id: vel[x, y, z]] + def find_vels(self, track_ids: List[int], obj_centers: List[np.ndarray], obj_dims: List[np.ndarray]) -> Dict[int, np.ndarray]: + # Object not seen -> velocity = None + track_id_center_map = dict(zip(track_ids, obj_centers)) + vels = defaultdict(lambda: np.array(())) # None is faster, np.array matches other find_ methods. + + for prev_track_id, prev_agent in self.prev_agents.items(): + if prev_track_id in track_ids: + # TODO: Add prev_agents to memory to avoid None velocity + # We should only be missing prev pose on first sight of track_id Agent. + # print("shape 1: ", track_id_center_map[prev_agent.track_id]) + # print("shape 2: ", np.array([prev_agent.pose.x, prev_agent.pose.y, prev_agent.pose.z])) + # prev can be 3 separate Nones, current is just empty array... make this symmetrical + if prev_agent.pose.x and prev_agent.pose.y and prev_agent.pose.z and track_id_center_map[prev_agent.track_id].shape == 3: + vels[prev_track_id] = (track_id_center_map[prev_track_id] - np.array([prev_agent.pose.x, prev_agent.pose.y, prev_agent.pose.z])) / (self.curr_time - self.prev_time) + return vels - def initialize(self): + # TODO: Separate debug/viz class, bbox and 2d 3d points funcs + def viz_object_states(self, cv_image, boxes, extracted_pts_all): + # Extract 3D pedestrians points in lidar frame + # ** These are camera frame after transform_lidar_points, right? + # ** It was in camera frame before. I fixed it. Now they are in lidar frame! + pedestrians_3d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, -3:]) for extracted_pts in extracted_pts_all] - """Initializes subscribers and publishers + # Object center viz + obj_3d_obj_centers = list() + obj_3d_obj_dims = list() + for track_id, agent in self.current_agents.items(): + if agent.pose.x != None and agent.pose.y != None and agent.pose.z != None: + obj_3d_obj_centers.append((agent.pose.x, agent.pose.y, agent.pose.z)) # ** + if agent.dimensions != None and agent.dimensions[0] != None and agent.dimensions[1] != None and agent.dimensions[2] != None: + obj_3d_obj_dims.append(agent.dimensions) - self.vehicle_interface.subscribe_sensor -> GEM Car camera subscription - self.rosbag_test -> ROS Bag topic subscription - self.pub_image -> Publishes Image with detection for visualization + # Extract 2D pedestrians points and bbox in camera frame + extracted_2d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, :2].astype(int)) for extracted_pts in extracted_pts_all] + flattened_pedestrians_2d_pts = list() + for pts in extracted_2d_pts: flattened_pedestrians_2d_pts.extend(pts) - """ - #tell the vehicle to use image_callback whenever 'front_camera' gets a reading, and it expects images of type cv2.Mat + for ind, bbox in enumerate(boxes): + xywh = bbox.xywh[0].tolist() + cv_image = vis_2d_bbox(cv_image, xywh, bbox) + + flattened_pedestrians_3d_pts = list() + for pts in pedestrians_3d_pts: flattened_pedestrians_3d_pts.extend(pts) + + if len(flattened_pedestrians_3d_pts) > 0: + # Draw projected 2D LiDAR points on the image. + for pt in flattened_pedestrians_2d_pts: + cv2.circle(cv_image, pt, 2, (0, 0, 255), -1) + ros_img = self.bridge.cv2_to_imgmsg(cv_image, 'bgr8') + self.pub_image.publish(ros_img) - # GEM Car subacriber - # self.vehicle_interface.subscribe_sensor('front_camera',self.image_callback,cv2.Mat) + # Draw 3D pedestrian pointclouds + if self.vehicle_frame: + # If in vehicle frame, tranform 3D pedestrians points to vehicle frame for better visualization. + flattened_pedestrians_3d_pts_vehicle = transform_lidar_points(np.array(flattened_pedestrians_3d_pts), self.R_lidar_to_vehicle, self.t_lidar_to_vehicle) + flattened_pedestrians_3d_pts = flattened_pedestrians_3d_pts_vehicle - # Webcam - # rospy.Subscriber('/webcam', Image, self.image_callback) + # Create point cloud from extracted 3D points + ros_extracted_pedestrian_pc2 = create_point_cloud(flattened_pedestrians_3d_pts) + self.pub_pedestrians_pc2.publish(ros_extracted_pedestrian_pc2) - # Testing with rosbag - self.rosbag_test = rospy.Subscriber('/oak/rgb/image_raw',Image, self.image_callback,queue_size=1) - if(self.visualization): - self.pub_image = rospy.Publisher("/camera/image_detection", Image, queue_size=1) - pass - - # Use cv2.Mat for GEM Car, Image for RosBag - def image_callback(self, image : Image): #image : cv2.Mat): + # Draw 3D pedestrian centers and dimensions + if len(obj_3d_obj_centers) > 0 and len(obj_3d_obj_dims) > 0: + # Draw 3D pedestrian center pointclouds + ros_obj_3d_obj_centers_pc2 = create_point_cloud(obj_3d_obj_centers, color=(0, 128, 0)) + self.pub_obj_centers_pc2.publish(ros_obj_3d_obj_centers_pc2) + + # Draw 3D pedestrian bboxes markers + # Create bbox marker from pedestrain dimensions + ros_delete_bboxes_markers = delete_bbox_marker() + self.pub_bboxes_markers.publish(ros_delete_bboxes_markers) + ros_pedestrians_bboxes_markers = create_bbox_marker(obj_3d_obj_centers, obj_3d_obj_dims) + self.pub_bboxes_markers.publish(ros_pedestrians_bboxes_markers) + + + def update_object_states(self, track_result: List[Results], extracted_pts_all: List[np.ndarray]) -> None: + # self.prev_agents = self.current_agents.copy() + self.current_agents.clear() + + # Return if no track results available + if track_result[0].boxes.id == None: + return + + # Change pedestrians_3d_pts to dicts matching track_ids + track_ids = track_result[0].boxes.id.int().cpu().tolist() + num_objs = len(track_ids) + boxes = track_result[0].boxes - """Detects pedestrians using the model provided when new image is passed. + # Extract 3D pedestrians points in lidar frame + # ** These are camera frame after transform_lidar_points, right? + # ** It was in camera frame before. I fixed it. Now they are in lidar frame! + pedestrians_3d_pts = [[] if len(extracted_pts) == 0 else list(extracted_pts[:, -3:]) for extracted_pts in extracted_pts_all] + if len(pedestrians_3d_pts) != num_objs: + raise Exception('Perception - Camera detections, points clusters num. mismatch') + + # TODO: Slower but cleaner to pass dicts of AgentState + # or at least {track_ids: centers/pts/etc} + # TODO: Combine funcs for efficiency in C. + # Separate numpy prob still faster for now + obj_centers = self.find_centers(pedestrians_3d_pts) # Centers are calculated in lidar frame here + obj_dims = self.find_dims(pedestrians_3d_pts) - Converts Image.msg to cv2 format (Might need to change this to use cv2.Mat) and uses the model to detect pedestrian - IF visualization is true, will publish an image with pedestrians detected. + # If in vehicle frame, transform centers from top_lidar frame to vehicle frame + # Need to transform the center point one by one since matrix op can't deal with empty points + if self.vehicle_frame: + obj_centers_vehicle = [] + for obj_center in obj_centers: + if len(obj_center) > 0: + obj_center = np.array([obj_center]) + obj_center_vehicle = transform_lidar_points(obj_center, self.R_lidar_to_vehicle, self.t_lidar_to_vehicle)[0] + obj_centers_vehicle.append(obj_center_vehicle) + else: + obj_centers_vehicle.append(np.array(())) + obj_centers = obj_centers_vehicle - Hardcoded values for now: - Detected only pedestrians -> Class = 0 - Confidence level -> 0.7 + self.find_vels_and_ids(obj_centers, obj_dims) + + # TODO: Refactor to make more efficient + # TODO: Moving Average across last N iterations pos/vel? Less spurious vals + # TODO Fix velocity calculation to calculate in ObjectFrameEnum.START + def find_vels_and_ids(self, obj_centers: List[np.ndarray], obj_dims: List[np.ndarray]): + # Gate to check whether dims and centers are empty (will happen if no pedestrians are scanned): + if ((len(obj_dims) == 1 or len(obj_centers) == 1) and (obj_centers[0].size == 0 or obj_dims[0].size == 0)): + self.prev_agents = self.current_agents.copy() + self.current_agents = {} # There weren't any pedestrians detected + return + elif (len(obj_centers) != len(obj_dims)): + for i in range(25): + print("ERROR NUM PARINGS AND OBJ DIMS ARENT THE SAME") + self.prev_agents = self.current_agents.copy() + self.current_agents = {} # There weren't any pedestrians detected + return - """ + new_prev_agents = {} # Stores current agents in START frame for next time through (since + # planning wants us to send them agents in CURRENT frame) + + assigned = False + num_pairings = len(obj_centers) + + # TODO: NEED TO STORE AND INCORPORATE TRANSFORMS SOMEHOW TO DEAL WITH MOVING CAR CASE + converted_centers = obj_centers # TODO: Add this in when finished self.convert_vehicle_frame_to_start_frame(obj_centers) + + # Loop through the indexes of the obj_center and obj_dim pairings + for idx in range(num_pairings): + assigned = False + + # Loop through previous agents backwards + for prev_id, prev_state in reversed(self.prev_agents.items()): + # If an obj_center and obj_dim pair overlaps with a previous agent, assume that they're the same agent + if self.agents_overlap(converted_centers[idx], obj_dims[idx], prev_state): + assigned = True + + if self.prev_time == None: + # This will be triggered if the very first message has pedestrians in it + vel = np.array([0, 0, 0]) + else: + delta_t = self.curr_time - self.prev_time + vel = (converted_centers[idx] - np.array([prev_state.pose.x, prev_state.pose.y, prev_state.pose.z])) / delta_t.total_seconds() + print("VELOCITY:") + print(vel) + + self.current_agents[prev_id] = ( + AgentState( + track_id = prev_id, + pose=ObjectPose(t=0, x=obj_centers[idx][0], y=obj_centers[idx][1], z=obj_centers[idx][2], yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT), + # (l, w, h) + # TODO: confirm (z -> l, x -> w, y -> h) + dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]), + outline=None, + type=AgentEnum.PEDESTRIAN, + activity=AgentActivityEnum.STOPPED if np.all(vel == 0) else AgentActivityEnum.MOVING, + velocity=None if vel.size == 0 else tuple(vel), + yaw_rate=0 + )) + new_prev_agents[prev_id] = ( + AgentState( + track_id = prev_id, + pose=ObjectPose(t=0, x=converted_centers[idx][0], y=converted_centers[idx][1], z=converted_centers[idx][2], yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT), + # (l, w, h) + # TODO: confirm (z -> l, x -> w, y -> h) + dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]), + outline=None, + type=AgentEnum.PEDESTRIAN, + activity=AgentActivityEnum.STOPPED if np.all(vel == 0) else AgentActivityEnum.MOVING, + velocity=None if vel.size == 0 else tuple(vel), + yaw_rate=0 + )) + del self.prev_agents[prev_id] # Remove previous agent from previous agents + break + + # If not assigned: + if not assigned: + # Set velocity to 0 and assign the new agent a new id with IdTracker + id = self.id_tracker.get_new_id() + self.current_agents[id] = ( + AgentState( + track_id = id, + pose=ObjectPose(t=0, x=obj_centers[idx][0], y=obj_centers[idx][1], z=obj_centers[idx][2] ,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT), + # (l, w, h) + # TODO: confirm (z -> l, x -> w, y -> h) + dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]), + outline=None, + type=AgentEnum.PEDESTRIAN, + activity=AgentActivityEnum.UNDETERMINED, + velocity=None, + yaw_rate=0 + )) + new_prev_agents[id] = ( + AgentState( + track_id = id, + pose=ObjectPose(t=0, x=converted_centers[idx][0], y=converted_centers[idx][1], z=converted_centers[idx][2] ,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT), + # (l, w, h) + # TODO: confirm (z -> l, x -> w, y -> h) + dimensions=(obj_dims[idx][0], obj_dims[idx][1], obj_dims[idx][2]), + outline=None, + type=AgentEnum.PEDESTRIAN, + activity=AgentActivityEnum.UNDETERMINED, + velocity=None, + yaw_rate=0 + )) + self.prev_agents = new_prev_agents + + # Calculates whether 2 agents overlap in START frame. True if they do, false if not + def agents_overlap(self, obj_center: np.ndarray, obj_dim: np.ndarray, prev_agent: AgentState) -> bool: + # Calculate corners of obj_center and obj_dim pairing + x1_min, x1_max = obj_center[0] - obj_dim[0] / 2.0, obj_center[0] + obj_dim[0] / 2.0 + y1_min, y1_max = obj_center[1] - obj_dim[1] / 2.0, obj_center[1] + obj_dim[1] / 2.0 # CENTER CALCULATION + z1_min, z1_max = obj_center[2] - obj_dim[2] / 2.0, obj_center[2] + obj_dim[2] / 2.0 + + # Calculate corners of AgentState + # Beware: AgentState(PhysicalObject) builds bbox from + # dims [-l/2,l/2] x [-w/2,w/2] x [0,h], not + # [-l/2,l/2] x [-w/2,w/2] x [-h/2,h/2] + # TODO: confirm (z -> l, x -> w, y -> h) + x2_min, x2_max = prev_agent.pose.x - prev_agent.dimensions[0] / 2.0, prev_agent.pose.x + prev_agent.dimensions[0] / 2.0 + y2_min, y2_max = prev_agent.pose.y, prev_agent.pose.y + prev_agent.dimensions[1] # AGENT STATE CALCULATION + z2_min, z2_max = prev_agent.pose.z - prev_agent.dimensions[2] / 2.0, prev_agent.pose.z + prev_agent.dimensions[2] / 2.0 + + # True if they overlap, false if not + return ( + ( (x1_min <= x2_min and x2_min <= x1_max) or (x2_min <= x1_min and x1_min <= x2_max) ) and + ( (y1_min <= y2_min and y2_min <= y1_max) or (y2_min <= y1_min and y1_min <= y2_max) ) and + ( (z1_min <= z2_min and z2_min <= z1_max) or (z2_min <= z1_min and z1_min <= z2_max) ) + ) + + def convert_vehicle_frame_to_start_frame(self, vehicle_states: List[AgentState]) -> List[AgentState]: + # num_states = len(vehicle_states) + # for idx in num_states: + # num_states[idx].to_frame() + pass + - # Use Image directly for GEM Car - # track_result = self.detector.track(source=image, classes=self.classes_to_detect, persist=True, conf=self.confidence) + def ouster_oak_callback(self, rgb_image_msg: Image, lidar_pc2_msg: PointCloud2): + # Update times for basic velocity calculation + self.prev_time = self.curr_time + self.curr_time = datetime.now() - # Convert to CV2 format for RosBag - bridge = CvBridge() - image = bridge.imgmsg_to_cv2(image, "bgr8") - track_result = self.detector.track(source=image, classes=self.classes_to_detect, persist=True, conf=self.confidence) + # Convert to cv2 image and run detector + cv_image = self.bridge.imgmsg_to_cv2(rgb_image_msg, "bgr8") + track_result = self.detector.track(source=cv_image, classes=self.classes_to_detect, persist=True, conf=self.confidence) - self.last_person_boxes = [] + # Convert 1D PointCloud2 data to x, y, z coords + lidar_points = convert_pointcloud2_to_xyz(lidar_pc2_msg) + # print("len lidar_points", len(lidar_points)) + + # Downsample xyz point clouds + downsampled_points = downsample_points(lidar_points) + # print("len downsampled_points", len(downsampled_points)) + + # Transform LiDAR points into the camera coordinate frame. + lidar_in_camera = transform_lidar_points(downsampled_points, self.R_lidar_to_oak, self.t_lidar_to_oak) + # print("len lidar_in_camera", len(lidar_in_camera)) + + # Project the transformed points into the image plane. + projected_pts = project_points(lidar_in_camera, self.K, downsampled_points) + # print("projected_pts", len(projected_pts)) + + # Process bboxes boxes = track_result[0].boxes - # Used for visualization - if(self.visualization): - label_text = "Pedestrian " - font = cv2.FONT_HERSHEY_SIMPLEX - font_scale = 0.5 - font_color = (255, 255, 255) # White text - outline_color = (0, 0, 0) # Black outline - line_type = 1 - text_thickness = 2 # Text thickness - outline_thickness = 1 # Thickness of the text outline - - # Unpacking box dimentions detected into x,y,w,h - for box in boxes: - - xywh = box.xywh[0].tolist() - self.last_person_boxes.append(xywh) + extracted_pts_all = list() + + for ind, bbox in enumerate(boxes): + xywh = bbox.xywh[0].tolist() + + # Extracting projected pts x, y, w, h = xywh - id = box.id.item() - - # Stores AgentState in a dict, can be removed if not required - pose = ObjectPose(t=0,x=x,y=y,z=0,yaw=0,pitch=0,roll=0,frame=ObjectFrameEnum.CURRENT) - dims = (w,h,0) - if(id not in self.pedestrians.keys()): - self.pedestrians[id] = AgentState(pose=pose,dimensions=dims,outline=None,type=AgentEnum.PEDESTRIAN,activity=AgentActivityEnum.MOVING,velocity=(0,0,0),yaw_rate=0) - else: - self.pedestrians[id].pose = pose - self.pedestrians[id].dims = dims - - # Used for visualization - if(self.visualization): - # Draw bounding box - cv2.rectangle(image, (int(x - w / 2), int(y - h / 2)), (int(x + w / 2), int(y + h / 2)), (255, 0, 255), 3) - - # Define text label - x = int(x - w / 2) - y = int(y - h / 2) - label = label_text + str(id) + " : " + str(round(box.conf.item(), 2)) - - # Get text size - text_size, baseline = cv2.getTextSize(label, font, font_scale, line_type) - text_w, text_h = text_size - - # Position text above the bounding box - text_x = x - text_y = y - 10 if y - 10 > 10 else y + h + text_h - - # Draw text outline for better visibility, uncomment for outline - # for dx, dy in [(-1, -1), (-1, 1), (1, -1), (1, 1)]: - # cv2.putText(image, label, (text_x + dx, text_y - baseline + dy), font, font_scale, outline_color, outline_thickness) - - # Draw main text on top of the outline - cv2.putText(image, label, (text_x, text_y - baseline), font, font_scale, font_color, text_thickness) + left_bound = int(x - w / 2) + right_bound = int(x + w / 2) + top_bound = int(y - h / 2) + bottom_bound = int(y + h / 2) + + pts = np.array(projected_pts) + # Checking projected_pts length is very important + if len(projected_pts) > 0: + extracted_pts = pts[(pts[:, 0] > left_bound) & + (pts[:, 0] < right_bound) & + (pts[:, 1] > top_bound) & + (pts[:, 1] < bottom_bound) + ] + # Apply ground and max distance filter to the extracted 5D points + extracted_pts = filter_ground_points(extracted_pts, self.ground_threshold) + extracted_pts = filter_depth_points(extracted_pts, self.max_human_depth) + extracted_pts_all.append(extracted_pts) + else: extracted_pts_all.append(np.array(())) - # Used for visualization - if(self.visualization): - ros_img = bridge.cv2_to_imgmsg(image, 'bgr8') - self.pub_image.publish(ros_img) - - #uncomment if you want to debug the detector... - # print(self.last_person_boxes) - # print(self.pedestrians.keys()) - #for bb in self.last_person_boxes: - # x,y,w,h = bb - # cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3) - #cv2.imwrite("pedestrian_detections.png",image) + self.update_object_states(track_result, extracted_pts_all) + if self.debug: self.viz_object_states(cv_image, boxes, extracted_pts_all) + + + def rate(self): + return 4.0 - def update(self, vehicle : VehicleState) -> Dict[str,AgentState]: - res = {} - for i,b in enumerate(self.last_person_boxes): - x,y,w,h = b - res['pedestrian'+str(i)] = box_to_fake_agent(b) - if len(res) > 0: - print("Detected",len(res),"pedestrians") - return res + def state_inputs(self): + return ['vehicle'] + + def state_outputs(self): + return ['agents'] class FakePedestrianDetector2D(Component): diff --git a/GEMstack/onboard/perception/pedestrian_detection_utils.py b/GEMstack/onboard/perception/pedestrian_detection_utils.py new file mode 100644 index 000000000..788ee2e54 --- /dev/null +++ b/GEMstack/onboard/perception/pedestrian_detection_utils.py @@ -0,0 +1,244 @@ +from sensor_msgs.msg import PointCloud2, PointField +from visualization_msgs.msg import Marker, MarkerArray +import numpy as np +import sensor_msgs.point_cloud2 as pc2 +import open3d as o3d +import cv2 +import json +import rospy +import numpy as np +import struct + + +def convert_pointcloud2_to_xyz(lidar_pc2_msg: PointCloud2): + """ Convert 1D PointCloud2 data to x, y, z coords """ + return np.array(list(pc2.read_points(lidar_pc2_msg, skip_nans=True)), dtype=np.float32)[:, :3] + + +def downsample_points(lidar_points): + """ Downsample point clouds """ + # Convert numpy array to Open3D point cloud + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(lidar_points) + + # Apply voxel grid downsampling + voxel_size = 0.1 # Adjust for desired resolution + downsampled_pcd = pcd.voxel_down_sample(voxel_size=voxel_size) + + # Convert back to numpy array + transformed_points = np.asarray(downsampled_pcd.points) + return transformed_points + + +def filter_ground_points(lidar_points, ground_threshold = 0): + """ Filter points given an elevation of ground threshold """ + filtered_array = lidar_points[lidar_points[:, 4] > ground_threshold] + return filtered_array + + +def filter_depth_points(lidar_points, max_human_depth=0.9): + """ Filter points beyond a max possible human depth in a point cluster """ + if lidar_points.shape[0] == 0: return lidar_points + lidar_points_dist = lidar_points[:, 2] + min_dist = np.min(lidar_points_dist) + max_dist = np.max(lidar_points_dist) + max_possible_dist = min_dist + max_human_depth + actual_dist = min(max_dist, max_possible_dist) + filtered_array = lidar_points[lidar_points_dist < actual_dist] + return filtered_array + + +# Credits: The following lines of codes (from 33 to 92) are adapted from the Calibration Team B +def load_extrinsics(extrinsics_file): + """ + Load calibrated extrinsics from a .npz file. + Assumes the file contains keys 'R' and 't'. + """ + data = np.load(extrinsics_file) + return data + + +def load_intrinsics(intrinsics_file): + """ + Load camera intrinsics from a JSON file. + Expects keys: 'fx', 'fy', 'cx', and 'cy'. + """ + with open(intrinsics_file, 'r') as f: + intrinsics = json.load(f) + fx = intrinsics["fx"] + fy = intrinsics["fy"] + cx = intrinsics["cx"] + cy = intrinsics["cy"] + K = np.array([[fx, 0, cx], + [0, fy, cy], + [0, 0, 1]], dtype=np.float32) + return K + + +def load_lidar_scan(lidar_file): + """ + Load a LiDAR scan from a file. + This function handles both npy (returns a NumPy array) and npz files. + """ + data = np.load(lidar_file, allow_pickle=True) + if isinstance(data, np.ndarray): + return data.astype(np.float32) + else: + key = list(data.keys())[0] + return data[key].astype(np.float32) + + +def transform_lidar_points(lidar_points, R, t): + """ + Transform LiDAR points from the LiDAR frame into the camera frame. + p_cam = R * p_lidar + t. + """ + P_cam = (R @ lidar_points.T + t.reshape(3,1)).T + return P_cam + + +def project_points(points_3d, K, lidar_points): + """ + Project 3D points (in the camera frame) into 2D image coordinates using the camera matrix K. + Only projects points with z > 0. + """ + proj_points = [] + for pt, l_pt in zip(points_3d, lidar_points): + if pt[2] > 0: # only project points in front of the camera + u = K[0, 0] * (pt[0] / pt[2]) + K[0, 2] + v = K[1, 1] * (pt[1] / pt[2]) + K[1, 2] + # 5D data point + proj_points.append((int(u), int(v), l_pt[0], l_pt[1], l_pt[2])) + return np.array(proj_points) + + +def vis_2d_bbox(image, xywh, box): + # Setup + label_text = "Pedestrian " + font = cv2.FONT_HERSHEY_SIMPLEX + font_scale = 0.5 + font_color = (255, 255, 255) + line_type = 1 + text_thickness = 2 + + x, y, w, h = xywh + + if box.id is not None: + id = box.id.item() + else: + id = 0 + + # Draw bounding box + cv2.rectangle(image, (int(x - w / 2), int(y - h / 2)), (int(x + w / 2), int(y + h / 2)), (255, 0, 255), 3) + + # Define text label + x = int(x - w / 2) + y = int(y - h / 2) + label = label_text + str(id) + " : " + str(round(box.conf.item(), 2)) + + # Get text size + text_size, baseline = cv2.getTextSize(label, font, font_scale, line_type) + text_w, text_h = text_size + + # Position text above the bounding box + text_x = x + text_y = y - 10 if y - 10 > 10 else y + h + text_h + + # Draw main text on top of the outline + cv2.putText(image, label, (text_x, text_y - baseline), font, font_scale, font_color, text_thickness) + return image + + +def create_point_cloud(points, color=(255, 0, 0)): + """ + Converts a list of (x, y, z) points into a PointCloud2 message. + """ + header = rospy.Header() + header.stamp = rospy.Time.now() + header.frame_id = "os_sensor" # Change to your TF frame + + fields = [ + PointField(name="x", offset=0, datatype=PointField.FLOAT32, count=1), + PointField(name="y", offset=4, datatype=PointField.FLOAT32, count=1), + PointField(name="z", offset=8, datatype=PointField.FLOAT32, count=1), + PointField(name="rgb", offset=12, datatype=PointField.FLOAT32, count=1), + ] + + # Convert RGB color to packed float32 + r, g, b = color + packed_color = struct.unpack('f', struct.pack('I', (r << 16) | (g << 8) | b))[0] + + point_cloud_data = [(x, y, z, packed_color) for x, y, z in points] + + return pc2.create_cloud(header, fields, point_cloud_data) + + +def create_bbox_marker(centroids, dimensions): + """ + Create 3D bbox markers from centroids and dimensions + """ + marker_array = MarkerArray() + + for i, (centroid, dimension) in enumerate(zip(centroids, dimensions)): + # Skip if no centroid or dimension + if (centroid == None) or (dimension == None): + continue + + marker = Marker() + marker.header.frame_id = "os_sensor" # Reference frame + marker.header.stamp = rospy.Time.now() + marker.ns = "bounding_boxes" + marker.id = i # Unique ID for each marker + marker.type = Marker.CUBE # Cube for bounding box + marker.action = Marker.ADD + + # Position (center of the bounding box) + c_x, c_y, c_z = centroid + if (not isinstance(c_x, float)) or (not isinstance(c_y, float)) or (not isinstance(c_z, float)): + continue + + marker.pose.position.x = c_x + marker.pose.position.y = c_y + marker.pose.position.z = c_z + + # Orientation (default, no rotation) + marker.pose.orientation.x = 0.0 + marker.pose.orientation.y = 0.0 + marker.pose.orientation.z = 0.0 + marker.pose.orientation.w = 1.0 + + # Bounding box dimensions + d_x, d_y, d_z = dimension + if (not isinstance(d_x, float)) or (not isinstance(d_y, float)) or (not isinstance(d_z, float)): + continue + + marker.scale.x = d_x + marker.scale.y = d_y + marker.scale.z = d_z + + # Random colors for each bounding box + marker.color.r = 0.0 # Varying colors + marker.color.g = 1.0 + marker.color.b = 1.5 + marker.color.a = 0.2 # Transparency + + marker.lifetime = rospy.Duration() # Persistent + marker_array.markers.append(marker) + return marker_array + + +def delete_bbox_marker(): + """ + Delete 3D bbox markers given ID ranges + """ + marker_array = MarkerArray() + for i in range(6): + marker = Marker() + marker.ns = "bounding_boxes" + marker.id = i + marker.action = Marker.DELETE + marker_array.markers.append(marker) + return marker_array + + + diff --git a/GEMstack/onboard/perception/transform.py b/GEMstack/onboard/perception/transform.py new file mode 100644 index 000000000..8032752b1 --- /dev/null +++ b/GEMstack/onboard/perception/transform.py @@ -0,0 +1,21 @@ +import rospy +import tf +import numpy as np + +def publish_tf(): + rospy.init_node('pointcloud_tf_broadcaster') + br = tf.TransformBroadcaster() + rate = rospy.Rate(10) # 10 Hz + + while not rospy.is_shutdown(): + br.sendTransform( + (0, 0, 0), # (x, y, z) translation + tf.transformations.quaternion_from_euler(0, 0, 0), # (roll, pitch, yaw) + rospy.Time.now(), + "os_sensor", # Child frame (sensor) + "map" # Parent frame (world) + ) + rate.sleep() + +if __name__ == "__main__": + publish_tf() \ No newline at end of file diff --git a/GEMstack/onboard/planning/longitudinal_planning.py b/GEMstack/onboard/planning/longitudinal_planning.py index c3f3bf737..e862239fe 100644 --- a/GEMstack/onboard/planning/longitudinal_planning.py +++ b/GEMstack/onboard/planning/longitudinal_planning.py @@ -77,7 +77,7 @@ def update(self, state : AllState): #parse the relations indicated should_brake = False for r in state.relations: - if r.type == EntityRelationEnum.YIELD and r.obj1 == '': + if r.type == EntityRelationEnum.YIELDING and r.obj1 == '': #yielding to something, brake should_brake = True should_accelerate = (not should_brake and curr_v < self.desired_speed) diff --git a/GEMstack/state/agent.py b/GEMstack/state/agent.py index 757805837..b0ad7aaad 100644 --- a/GEMstack/state/agent.py +++ b/GEMstack/state/agent.py @@ -27,6 +27,10 @@ class AgentState(PhysicalObject): activity : AgentActivityEnum velocity : Tuple[float,float,float] #estimated velocity in x,y,z, m/s and in agent's local frame yaw_rate : float #estimated yaw rate, in radians/s + # Added by Perception for persistent objs across frames + # Check track_id to verify this is the same Agent you've + # seen before + track_id : int def velocity_local(self) -> Tuple[float,float,float]: """Returns velocity in m/s in the agent's local frame.""" diff --git a/homework/person_detector.py b/homework/person_detector.py deleted file mode 100644 index 4003d0823..000000000 --- a/homework/person_detector.py +++ /dev/null @@ -1,59 +0,0 @@ -import cv2 -import sys -from ultralytics import YOLO - -def person_detector(img : cv2.Mat): - #TODO: implement me to produce a list of (x,y,w,h) bounding boxes of people in the image - model = YOLO("yolo11n.pt") - results = model(img) - cls = results[0].boxes.cls.cpu().numpy() - bboxes = results[0].boxes.xywh.cpu().numpy() - detection = [] - for k in range(len(cls)): - category = cls[k] - b = bboxes[k] - bb = (b[0], b[1], b[2], b[3]) - if category == 0: - detection.append(bb) - return detection - -def main(fn): - image = cv2.imread(fn) - bboxes = person_detector(image) - print("Detected",len(bboxes),"people") - for bb in bboxes: - x,y,w,h = bb - if not isinstance(x,(int,float)) or not isinstance(y,(int,float)) or not isinstance(w,(int,float)) or not isinstance(h,(int,float)): - print("WARNING: make sure to return Python numbers rather than PyTorch Tensors") - print("Corner",(x,y),"size",(w,h)) - cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3) - cv2.imshow('Results', image) - cv2.waitKey(0) - -def main_webcam(): - cap = cv2.VideoCapture(0) - cap.set(3, 640) - cap.set(4, 480) - - print("Press space to exit") - while True: - _, image = cap.read() - - bboxes = person_detector(image) - for bb in bboxes: - x,y,w,h = bb - cv2.rectangle(image, (int(x-w/2), int(y-h/2)), (int(x+w/2), int(y+h/2)), (255, 0, 255), 3) - - cv2.imshow('Person detection', image) - if cv2.waitKey(1) & 0xFF == ord(' '): - break - - cap.release() - - -if __name__ == '__main__': - fn = sys.argv[1] - if fn != 'webcam': - main(fn) - else: - main_webcam() \ No newline at end of file diff --git a/launch/pedestrian_detection.yaml b/launch/pedestrian_detection.yaml index 0fffa89fc..1a00fa797 100644 --- a/launch/pedestrian_detection.yaml +++ b/launch/pedestrian_detection.yaml @@ -94,4 +94,6 @@ variants: visualization: !include "klampt_visualization.yaml" drive: perception: + agent_detection : pedestrian_detection.FakePedestrianDetector2D + #agent_detection : OmniscientAgentDetector #this option reads agents from the simulator state_estimation : OmniscientStateEstimator diff --git a/run_docker_container.sh b/run_docker_container.sh index c987a9016..ac0f998b3 100644 --- a/run_docker_container.sh +++ b/run_docker_container.sh @@ -3,6 +3,6 @@ if [ "$(docker ps -q -f name=gem_stack-container)" ]; then docker exec -it gem_stack-container bash else - docker compose -f setup/docker-compose.yaml up -d + UID=$(id -u) GID=$(id -g) docker compose -f setup/docker-compose.yaml up -d docker exec -it gem_stack-container bash fi \ No newline at end of file diff --git a/setup/Dockerfile.cuda12 b/setup/Dockerfile.cuda12 index ffafaaa3f..32005202f 100644 --- a/setup/Dockerfile.cuda12 +++ b/setup/Dockerfile.cuda12 @@ -18,7 +18,7 @@ RUN ln -fs /usr/share/zoneinfo/$TZ /etc/localtime \ && apt-get update && apt-get install -y tzdata \ && rm -rf /var/lib/apt/lists/* -# install Zed SDK +# install Zed SDK. If you want to install a different version, change to this https://download.stereolabs.com/zedsdk/4.2/cu12/ubuntu20 RUN wget https://stereolabs.sfo2.cdn.digitaloceanspaces.com/zedsdk/4.2/ZED_SDK_Ubuntu20_cuda12.1_v4.2.4.zstd.run -O zed_sdk.run RUN chmod +x zed_sdk.run RUN ./zed_sdk.run -- silent diff --git a/setup/docker-compose.yaml b/setup/docker-compose.yaml index 8c43cfcaf..f0c7703b8 100644 --- a/setup/docker-compose.yaml +++ b/setup/docker-compose.yaml @@ -22,6 +22,11 @@ services: - "/tmp/.X11-unix:/tmp/.X11-unix:rw" environment: - DISPLAY=${DISPLAY} + - XDG_RUNTIME_DIR=/tmp/runtime-${USER} + - DBUS_SYSTEM_BUS_ADDRESS=unix:path=/var/run/dbus/system_bus_socket + - DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/${UID}/bus + - NVIDIA_DRIVER_CAPABILITIES=all + - NVIDIA_VISIBLE_DEVICES=all # - LIBGL_ALWAYS_SOFTWARE=1 # Uncomment if you want to use software rendering (No GPU) network_mode: host ipc: host diff --git a/setup/setup_this_machine.sh b/setup/setup_this_machine.sh index 5c9918866..36690f1d4 100755 --- a/setup/setup_this_machine.sh +++ b/setup/setup_this_machine.sh @@ -16,17 +16,25 @@ fi source /opt/ros/noetic/setup.bash #install Zed SDK -echo "Select CUDA version:" +echo "To install the ZED SDK, select the CUDA version:" echo "1) CUDA 11.8" echo "2) CUDA 12+" -read -p "Enter choice [1-2]: " choice +echo "3) No GPU (Skip ZED SDK installation)" +read -p "Enter choice [1-3]: " choice case $choice in 1) wget https://download.stereolabs.com/zedsdk/4.0/cu118/ubuntu20 -O zed_sdk.run + chmod +x zed_sdk.run + ./zed_sdk.run -- silent ;; 2) wget https://stereolabs.sfo2.cdn.digitaloceanspaces.com/zedsdk/4.2/ZED_SDK_Ubuntu20_cuda12.1_v4.2.4.zstd.run -O zed_sdk.run + chmod +x zed_sdk.run + ./zed_sdk.run -- silent + ;; + 3) + echo "Skipping ZED SDK installation..." ;; *) echo "Invalid choice" @@ -34,14 +42,12 @@ case $choice in ;; esac -chmod +x zed_sdk.run -./zed_sdk.run -- silent - #create ROS Catkin workspace mkdir -p ~/catkin_ws/src # Store current working directory CURRENT_DIR=$(pwd) +echo "CURRENT_DIR: $CURRENT_DIR" #install ROS dependencies and packages cd ~/catkin_ws/src @@ -57,9 +63,26 @@ catkin_make -DCMAKE_BUILD_TYPE=Release source devel/setup.bash cd $CURRENT_DIR -cd .. #install GEMstack Python dependencies -python3 -m pip install -r requirements.txt + +# Ask the user if they want to install ultralytics +read -p "Do you want to install ultralytics? (y/n): " install_ultralytics + +# Create a temporary requirements file +temp_requirements="temp_requirements.txt" + +# Copy all lines except ultralytics if the user chooses to skip it +if [ "$install_ultralytics" == "y" ]; then + cp requirements.txt $temp_requirements +else + grep -v "ultralytics" requirements.txt > $temp_requirements +fi + +# Install the packages from the temporary requirements file +pip3 install -r $temp_requirements + +# Clean up the temporary file +rm $temp_requirements #install other dependencies sudo apt-get install -y ros-noetic-septentrio-gnss-driver \ No newline at end of file diff --git a/webcam.py b/webcam.py index 10c3452d4..52f1b36a8 100644 --- a/webcam.py +++ b/webcam.py @@ -9,7 +9,7 @@ def webcam_publisher(): image_pub = rospy.Publisher('/webcam', Image, queue_size=10) - cap = cv2.VideoCapture(0) # 0 表示默认摄像头 + cap = cv2.VideoCapture(0) if not cap.isOpened(): rospy.logerr("cannot open camera") return