visual-object-tracker/object_detection_label.py at main · aarshon/visual-object-tracker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# main_object_detection.py
import cv2
import numpy as np

# --- Configuration ---
# The detector needs three external files, none of which ship with this script:
#   * network definition  (e.g. 'MobileNetSSD_deploy.prototxt.txt')
#   * trained weights     (e.g. 'MobileNetSSD_deploy.caffemodel')
#   * class label list    (e.g. 'coco_labels.txt', one label per line)
#
# The names below are bare relative paths, so they are looked up relative to
# the current working directory: keep the files next to this script and launch
# the script from that directory.

# Model files.
MODEL_WEIGHTS = "MobileNetSSD_deploy.caffemodel"
MODEL_CONFIG = "MobileNetSSD_deploy.prototxt.txt"
CLASS_LABELS_FILE = "coco_labels.txt"  # a built-in fallback list is used if this file is missing

# Network input size in pixels (MobileNet-SSD typically uses 300x300).
INPUT_WIDTH, INPUT_HEIGHT = 300, 300

# Detections whose confidence is not above this value are ignored.
CONFIDENCE_THRESHOLD = 0.5

# --- Load Class Labels ---
# class_labels maps the integer class id reported by the network to a
# human-readable name: entry i is the name shown for class id i. The list must
# therefore match the label set the loaded model was trained with.
try:
    # Explicit encoding so decoding does not depend on the platform's locale.
    with open(CLASS_LABELS_FILE, "r", encoding="utf-8") as f:
        # One label per line with surrounding whitespace removed. Blank lines
        # are kept so that a label's line position still equals its class id.
        class_labels = [line.strip() for line in f]
    print(f"[INFO] Successfully loaded class labels from '{CLASS_LABELS_FILE}'.")
except FileNotFoundError:
    print(f"[WARNING] Class labels file '{CLASS_LABELS_FILE}' not found. Using generic fallback labels.")
    # Fallback: COCO-style label list with "background" at index 0.
    # NOTE(review): confirm this list matches the classes of the model that is
    # actually loaded; a mismatch shows wrong names rather than raising an error.
    class_labels = [
        "background", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
        "train", "truck", "boat", "traffic light", "fire hydrant", "street sign",
        "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
        "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "hat", "backpack",
        "umbrella", "shoe", "eye glasses", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
        "skateboard", "surfboard", "tennis racket", "bottle", "plate", "wine glass",
        "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
        "chair", "couch", "potted plant", "bed", "mirror", "dining table", "window",
        "desk", "toilet", "door", "tv", "laptop", "mouse", "remote", "keyboard",
        "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "blender",
        "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
        "hair brush"
    ]


# --- Load the Deep Learning Model ---
# cv2.dnn provides one reader per framework; this script uses the Caffe reader.
# For other model formats swap in the matching call, for example:
#   net = cv2.dnn.readNetFromTensorflow(MODEL_WEIGHTS, MODEL_CONFIG)
#   net = cv2.dnn.readNetFromDarknet(MODEL_CONFIG, MODEL_WEIGHTS)
print("[INFO] Loading model...")
net = None  # stays None when loading fails; detect_and_display() checks for this
try:
    net = cv2.dnn.readNetFromCaffe(MODEL_CONFIG, MODEL_WEIGHTS)
except cv2.error as e:
    # Report the failure but keep running so the video window can show a hint.
    print(f"[ERROR] Could not load model. Check paths and model files: {e}")
    print(f"Attempted to load: Config='{MODEL_CONFIG}', Weights='{MODEL_WEIGHTS}'")
    print("[INFO] Ensure that 'MobileNetSSD_deploy.prototxt.txt' and 'MobileNetSSD_deploy.caffemodel'")
    print("[INFO] are in the same directory as this script.")
else:
    print("[INFO] Model loaded successfully.")

# --- Initialize Webcam ---
print("[INFO] Starting video stream...")
cap = cv2.VideoCapture(0)  # device index 0 is usually the default webcam

if not cap.isOpened():
    print("[ERROR] Cannot open webcam. Ensure it is connected and not in use by another application.")
    # Stop with a non-zero status so callers (shell scripts, CI) can see the
    # failure. SystemExit is raised directly because the exit() helper is
    # injected by the site module for interactive use and is not guaranteed
    # to exist in every runtime.
    raise SystemExit(1)

def detect_and_display(frame):
    """Detect objects in ``frame`` and draw their boxes and labels onto it.

    The frame is annotated in place and also returned. When the module-level
    ``net`` is ``None`` (model failed to load) an error hint is drawn instead
    and no detection is attempted.

    Args:
        frame: Image array as delivered by ``cap.read()``; its first two
            dimensions are read as (height, width).

    Returns:
        The same ``frame`` object, with annotations drawn on it.
    """
    if net is None:
        cv2.putText(frame, "Model not loaded. Check console for errors.", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
        return frame

    frame_h, frame_w = frame.shape[:2]  # original frame dimensions

    # Build the network input. blobFromImage itself resizes to `size` (plain
    # resize, since crop=False), so no separate cv2.resize call is needed.
    # Mean 127.5 and scale 1/127.5 are the values commonly used for
    # MobileNet-SSD and may need adjusting for other models.
    # NOTE(review): swapRB=True converts OpenCV's BGR frame to RGB; confirm
    # this matches the channel order the loaded model was trained with.
    blob = cv2.dnn.blobFromImage(frame,
                                 scalefactor=1.0 / 127.5,  # or 0.007843
                                 size=(INPUT_WIDTH, INPUT_HEIGHT),
                                 mean=(127.5, 127.5, 127.5),
                                 swapRB=True, crop=False)

    net.setInput(blob)
    detections = net.forward()  # this is where the detection happens

    # Loop invariants: factors turning the box values (treated as fractions of
    # the frame size) into pixels, and the largest valid pixel coordinates.
    to_pixels = np.array([frame_w, frame_h, frame_w, frame_h])
    max_coords = np.array([frame_w - 1, frame_h - 1, frame_w - 1, frame_h - 1])

    for i in range(detections.shape[2]):
        confidence = detections[0, 0, i, 2]
        # Written as "not >" so that a NaN confidence is skipped as well.
        if not confidence > CONFIDENCE_THRESHOLD:
            continue

        class_id = int(detections[0, 0, i, 1])

        # Reject ids outside the label list. Negative ids are rejected too,
        # because Python would otherwise index the list from the end and show
        # an unrelated label.
        if not 0 <= class_id < len(class_labels):
            print(f"[WARNING] Detected class_id {class_id} is out of bounds for loaded class_labels (max index {len(class_labels)-1}).")
            continue

        # Scale to pixels and clamp to the frame so the box and its label stay
        # visible when the network predicts coordinates slightly outside the
        # image. tolist() yields plain Python ints for the drawing calls.
        box = np.clip(detections[0, 0, i, 3:7] * to_pixels, 0, max_coords)
        startX, startY, endX, endY = box.astype("int").tolist()

        label_text = f"{class_labels[class_id]}: {confidence:.2f}"
        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
        # Put the label above the box, or just inside it when the box is too
        # close to the top edge of the frame.
        y = startY - 15 if startY - 15 > 15 else startY + 15
        cv2.putText(frame, label_text, (startX, y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return frame

# --- Main Loop ---
# The loop runs inside try/finally so the camera and the display window are
# released even if detection raises or the user interrupts with Ctrl+C.
try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            print("[ERROR] Can't receive frame (stream end?). Exiting ...")
            break

        # Perform detection and display
        processed_frame = detect_and_display(frame)

        # Display the resulting frame
        cv2.imshow('Object Detection - Press Q to Quit', processed_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # --- Cleanup ---
    print("[INFO] Cleaning up...")
    cap.release()
    cv2.destroyAllWindows()