Ahmednull · ma4096 · Aug 7, 2025 · Aug 7, 2025 · Aug 11, 2025 · Aug 14, 2025
diff --git a/README.md b/README.md
@@ -41,19 +41,27 @@ Detect face and predict gaze from webcam
 ```python
 from l2cs import Pipeline, render
 import cv2
+import pathlib
+import torch
+
+CWD = pathlib.Path.cwd()
 
 gaze_pipeline = Pipeline(
     weights=CWD / 'models' / 'L2CSNet_gaze360.pkl',
     arch='ResNet50',
-    device=torch.device('cpu') # or 'gpu'
+    device=torch.device('cpu') # or 'cuda', 'opengl', ...
 )
 
-cap = cv2.VideoCapture(cam)
+cap = cv2.VideoCapture(0)
 _, frame = cap.read()    
 
 # Process frame and visualize
 results = gaze_pipeline.step(frame)
 frame = render(frame, results)
+
+cv2.imshow("Detected face", frame)
+cv2.waitKey(0)
+cv2.destroyAllWindows()
 ```
 
 ## Demo

diff --git a/l2cs/pipeline.py b/l2cs/pipeline.py
@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from face_detection import RetinaFace
 
-from .utils import prep_input_numpy, getArch
+from .utils import prep_input_numpy, getArch, stackSave
 from .results import GazeResultContainer
 
 
@@ -47,7 +47,7 @@ def __init__(
             self.idx_tensor = [idx for idx in range(90)]
             self.idx_tensor = torch.FloatTensor(self.idx_tensor).to(self.device)
 
-    def step(self, frame: np.ndarray) -> GazeResultContainer:
+    def step(self, frame: np.ndarray, single_face: bool = False) -> GazeResultContainer:
 
         # Creating containers
         face_imgs = []
@@ -58,13 +58,15 @@ def step(self, frame: np.ndarray) -> GazeResultContainer:
         if self.include_detector:
             faces = self.detector(frame)
 
-            if faces is not None: 
+            if faces is not None:
                 for box, landmark, score in faces:
 
                     # Apply threshold
                     if score < self.confidence_threshold:
                         continue
 
+                    accepted_scores.append(score)
+
                     # Extract safe min and max of x,y
                     x_min=int(box[0])
                     if x_min < 0:
@@ -86,8 +88,17 @@ def step(self, frame: np.ndarray) -> GazeResultContainer:
                     landmarks.append(landmark)
                     scores.append(score)
 
+                # if single_face, only take the face with the highest score
+                if single_face and len(face_imgs) > 1:
+                    max_score_index = accepted_scores.index(max(accepted_scores))
+                    face_imgs = [face_imgs[max_score_index]]
+
                 # Predict gaze
-                pitch, yaw = self.predict_gaze(np.stack(face_imgs))
+                if len(face_imgs) != 0:
+                    pitch, yaw = self.predict_gaze(np.stack(face_imgs))
+                else:
+                    pitch = np.empty((0,1)) # empty placeholder (no faces accepted); check GazeResultContainer.detection before using pitch/yaw
+                    yaw = np.empty((0,1))
 
             else:
 
@@ -97,13 +108,15 @@ def step(self, frame: np.ndarray) -> GazeResultContainer:
         else:
             pitch, yaw = self.predict_gaze(frame)
 
+        detection = len(face_imgs) > 0 # True when at least one face image was collected
         # Save data
         results = GazeResultContainer(
             pitch=pitch,
             yaw=yaw,
-            bboxes=np.stack(bboxes),
-            landmarks=np.stack(landmarks),
-            scores=np.stack(scores)
+            bboxes=stackSave(bboxes),
+            landmarks=stackSave(landmarks),
+            scores=stackSave(scores),
+            detection=detection
         )
 
         return results

diff --git a/l2cs/results.py b/l2cs/results.py
@@ -6,6 +6,7 @@ class GazeResultContainer:
 
     pitch: np.ndarray
     yaw: np.ndarray
-    bboxes: np.ndarray
-    landmarks: np.ndarray
-    scores: np.ndarray
+    bboxes: np.ndarray|None
+    landmarks: np.ndarray|None
+    scores: np.ndarray|None
+    detection: bool
diff --git a/l2cs/utils.py b/l2cs/utils.py
@@ -68,7 +68,6 @@ def angular(gaze, label):
 
 def select_device(device='', batch_size=None):
     # device = 'cpu' or '0' or '0,1,2,3'
-    s = f'YOLOv3 🚀 {git_describe() or date_modified()} torch {torch.__version__} '  # string
     cpu = device.lower() == 'cpu'
     if cpu:
         os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # force torch.cuda.is_available() = False
@@ -85,9 +84,8 @@ def select_device(device='', batch_size=None):
         space = ' ' * len(s)
         for i, d in enumerate(devices):
             p = torch.cuda.get_device_properties(i)
-            s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / 1024 ** 2}MB)\n"  # bytes to MB
     else:
-        s += 'CPU\n'
+        pass
 
     return torch.device('cuda:0' if cuda else 'cpu')
 
@@ -143,3 +141,8 @@ def getArch(arch,bins):
                 'The default value of ResNet50 will be used instead!')
         model = L2CS( torchvision.models.resnet.Bottleneck, [3, 4, 6,  3], bins)
     return model
+
+def stackSave(ar:list):
+    """ Wrapper around np.stack that handles empty lists: if len(ar) == 0 it returns None, otherwise it returns np.stack(ar) as usual.
+    """
+    return np.stack(ar) if len(ar) > 0 else None
diff --git a/l2cs/vis.py b/l2cs/vis.py
@@ -33,7 +33,12 @@ def draw_bbox(frame: np.ndarray, bbox: np.ndarray):
     return frame
 
 def render(frame: np.ndarray, results: GazeResultContainer):
-
+
+    # Check if there is a detection in the frame/results object. If not, return an image with annotation "No detection".
+    if not results.detection:
+        frame = cv2.putText(frame, "No detection", (10,40), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0,255,0), 2, cv2.LINE_AA)
+        return frame
+
     # Draw bounding boxes
     for bbox in results.bboxes:
         frame = draw_bbox(frame, bbox)