from pkg.agent.tasks.lib.scenedetection.base import SceneDetectionAlgorithm

import math
import json
import os

import numpy as np
import pytesseract
import cv2
from time import perf_counter
from skimage.metrics import structural_similarity as ssim
from sklearn import svm
from collections import Counter

from mtcnn_cv2 import MTCNN
from multiprocessing import Queue, Process, Semaphore

# Sampling rate (frames per second) used when scanning the video for scene changes
TARGET_FPS = float(os.getenv('SCENE_DETECT_FPS', 0.5))
SCENE_DETECT_USE_FACE = os.getenv('SCENE_DETECT_USE_FACE', 'true') == 'true'
SCENE_DETECT_USE_OCR = os.getenv('SCENE_DETECT_USE_OCR', 'true') == 'true'
SCENE_DETECT_USE_EARLY_DROP = os.getenv('SCENE_DETECT_USE_EARLY_DROP', 'true') == 'true'

# Threshold for max number of samples for scene candidate selection.
# sample_rate is determined by FPS, samples = frames / sample_rate;
# if samples exceeds our threshold, we artificially lower the sampling rate.
# BUGFIX: os.getenv returns a *string* when the variable is set, which made the
# later `num_samples > MAX_SAMPLES` comparisons raise TypeError -- coerce to int.
MAX_SAMPLES = int(os.getenv('SCENE_DETECT_MAX_SAMPLES', 3000))  # default ~ 100 minutes at 0.5 fps
MIN_SCENE_LENGTH = 1  # Minimum scene length in seconds
# File path of the SVM model to be used
MODEL_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'models', 'poly2.json')

detector = MTCNN()


def require_ssim_with_face_detection(curr_frame, curr_result, last_frame, last_result):
    """
    Given two frames with their face & upper body bounding boxes,
    find SSIM between them after removing face & upper body.

    Parameters:
        curr_frame (image): Image of the first frame (grayscale; callers pass 320x240 -- TODO confirm)
        curr_result (tuple): Face & upper body detection result of the first frame
        last_frame (image): Image of the second frame
        last_result (tuple): Face & upper body detection result of the second frame

    Returns:
        float: SSIM after removing face & upper body
    """

    curr_frame_with_face_removed = curr_frame.copy()
    last_frame_with_face_removed = last_frame.copy()

    # Zero out every detected box in *both* frames so the comparison ignores
    # the presenter region entirely, whichever frame it was detected in.
    # BUGFIX: boxes are stored as [x1, x2, y1, y2] in image coordinates
    # (x = column, y = row), so numpy slicing must be arr[y1:y2, x1:x2];
    # the previous code masked a transposed region.
    for result in (curr_result, last_result):
        if result[0]:
            for x1, x2, y1, y2 in result[1]:
                curr_frame_with_face_removed[y1:y2, x1:x2] = 0
                last_frame_with_face_removed[y1:y2, x1:x2] = 0

    return ssim(last_frame_with_face_removed, curr_frame_with_face_removed)
def require_face_result(curr_frame):
    """
    Find all the bounding boxes of face & upper body appearing in a given frame.

    Parameters:
        curr_frame (image): Frame image (callers pass a 320x240 grayscale frame)

    Returns:
        tuple:
            First element: a boolean indicating if there is any face & upper body found inside the frame
            Second element: a list of bounding boxes [x1, x2, y1, y2] of face & upper body
    """

    resized = cv2.resize(curr_frame, (320, 240))

    # MTCNN expects a 3-channel RGB image.
    # BUGFIX: callers pass a single-channel grayscale frame, and
    # cv2.COLOR_BGR2RGB raises cv2.error on 1-channel input -- convert
    # according to the actual channel count.
    if len(resized.shape) == 2:
        rgb_frame = cv2.cvtColor(resized, cv2.COLOR_GRAY2RGB)
    else:
        rgb_frame = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

    # Run the face detection
    faces = detector.detect_faces(rgb_frame)

    curr_frame_boxes = []  # each entry is [x1, x2, y1, y2]
    has_body = False

    # Iterate through all the bounding boxes for one frame
    for face in faces:
        x, y, width, height = face['box']
        curr_frame_boxes.append([x, x + width, y, y + height])

        # Move x to the center of the face bounding box
        x = x + width / 2

        # Check if the face is at the center (middle 60% of the image width)
        if x > 0.2 * rgb_frame.shape[1] and x < 0.8 * rgb_frame.shape[1]:

            # Check if the face is large enough (>10% of either image dimension)
            if width / rgb_frame.shape[1] > 0.1 or height / rgb_frame.shape[0] > 0.1:
                has_body = True

                # Estimate an upper-body box below and around the face
                body_x = int(x - 2 * width)
                if body_x < 0:
                    body_x = 0

                body_y = y + height
                body_width = width * 4
                body_height = height * 3

                curr_frame_boxes.append(
                    [body_x, body_x + body_width, body_y, body_y + body_height])

    return (has_body, curr_frame_boxes)
def compare_ocr_difference(word_dict_a, word_dict_b):
    """
    Calculate the sim_OCR between two frames.

    Parameters:
        word_dict_a (dict): Key is the words that appeared in the OCR output for frame A
                            Value is the sum of confidence of each word
        word_dict_b (dict): Key is the words that appeared in the OCR output for frame B
                            Value is the sum of confidence of each word

    Returns:
        float: Relative OCR similarity between the two frames
               (1.0 when both frames have no recognized text)
    """

    total_amount = sum(word_dict_a.values()) + sum(word_dict_b.values())

    # Two frames with no recognized text are considered identical
    if total_amount == 0:
        return 1.0

    # Words present in both frames contribute their combined confidence.
    # BUGFIX (dead code removed): the original also iterated the words unique
    # to B (set(b) - set(a)) and re-checked `key in word_dict_a`, a condition
    # that can never be true -- the loop contributed nothing.
    score = sum(word_dict_a[w] + word_dict_b[w]
                for w in word_dict_a.keys() & word_dict_b.keys())

    return score / total_amount
def map_to_poly_kernel(features):
    """
    Map data from feature space into the explicit degree-2 polynomial kernel space.

    Parameters:
        features (numpy.ndarray): N x 3 array of feature rows (a, b, c)

    Returns:
        numpy.ndarray: N x 10 array in kernel space
    """

    a = features[:, 0]
    b = features[:, 1]
    c = features[:, 2]
    root2 = np.sqrt(2)

    # Columns: the three squares, the sqrt(2)-weighted cross terms, the
    # sqrt(2)-weighted linear terms, and a constant column of ones.
    return np.column_stack((
        a * a,            # a * a
        b * b,            # b * b
        c * c,            # c * c
        a * b * root2,    # a * b
        a * c * root2,    # a * c
        b * c * root2,    # b * c
        a * root2,        # a
        b * root2,        # b
        c * root2,        # c
        np.ones(len(features)),  # Constant
    ))
def calculate_score(sim_structural, sim_ocr, sim_structural_no_face):
    """
    Calculate the final similarity score between two frames.
    1 refers to a scene change and 0 refers to not a scene change.

    Parameters:
        sim_structural (list of float): List of similarities (SSIMs) between frames
        sim_ocr (list of float): List of OCR similarities
        sim_structural_no_face (list of float): List of similarities (SSIMs) between frames when face is removed

    Returns:
        numpy.ndarray: Predicted label per frame pair (1 = scene change, 0 = not)
    """

    # Stack the three per-sample similarity signals into an N x 3 feature matrix
    sim_combined = []
    for i in range(len(sim_structural)):
        sim_combined.append([sim_structural[i], sim_ocr[i], sim_structural_no_face[i]])
    sim_combined = np.array(sim_combined)

    # Map the feature into kernel space
    sim_combined = map_to_poly_kernel(sim_combined)

    # Load the SVM Model from JSON file
    with open(MODEL_PATH, 'r') as f:
        loaded_model_params = json.load(f)
        print(f"{MODEL_PATH}: SVM Loaded!")

    # Initialize the SVM Model and pass in the parameters.
    # NOTE(review): this rehydrates a fitted SVC by assigning sklearn *private*
    # attributes (_dual_coef_, _n_support, _probA, ...) from JSON; it is tightly
    # coupled to the sklearn version the model was exported from -- verify on
    # any sklearn upgrade.
    loaded_clf = svm.SVC(kernel='linear')

    loaded_clf._dual_coef_ = np.array(loaded_model_params['_dual_coef_'])
    loaded_clf.support_vectors_ = np.array(loaded_model_params['support_vectors_'])
    loaded_clf._sparse = loaded_model_params['_sparse']
    loaded_clf._n_support = np.array(loaded_model_params['_n_support'], dtype = np.int32)
    loaded_clf.support_ = np.array(loaded_model_params['support_'], dtype = np.int32)
    loaded_clf._intercept_ = np.array(loaded_model_params['_intercept_'])
    loaded_clf._probA = np.array(loaded_model_params['_probA'])
    loaded_clf._probB = np.array(loaded_model_params['_probB'])
    loaded_clf._gamma = loaded_model_params['_gamma']
    loaded_clf.classes_ = np.array(loaded_model_params['classes_'], dtype = np.int32)
    loaded_clf.gamma = loaded_model_params['gamma']

    # Predict
    predicted_labels = loaded_clf.predict(sim_combined)

    return predicted_labels
def generate_frame_similarity(video_path, num_samples, everyN, start_time):
    """
    Generate similarity values for each sampled frame.

    Parameters:
        video_path (string): Video path
        num_samples (int): Number of samples to read
        everyN (int): Number of frames skipped between samples
        start_time (float): perf_counter() start time of the whole process

    Returns:
        numpy.ndarray: Timestamps (seconds) of each sampled frame
        numpy.ndarray: sim_structural of each sampled frame
        numpy.ndarray: sim_structural_no_face of each sampled frame
        numpy.ndarray: sim_ocr of each sampled frame
    """

    SIM_OCR_CONFIDENCE = 55  # OCR confidence used to generate sim_ocr
    DROP_THRESHOLD = 0.95  # Minimum sim_structural confidence to conclude no scene changes

    # Stores the OCR output of the last frame read
    last_ocr = dict()

    # List of similarities (SSIMs) between frames
    sim_structural = np.zeros(num_samples)

    # List of OCR outputs and OCR similarities
    ocr_output = []
    sim_ocr = np.zeros(num_samples)

    # List of similarities (SSIMs) between frames when face is removed
    sim_structural_no_face = np.zeros(num_samples)

    timestamps = np.zeros(num_samples)

    # Opencv Reader
    cap = cv2.VideoCapture(video_path)

    last_log_time = 0
    # For this loop only we are not using real frame numbers; we are skipping frames to improve processing speed

    # Pre-declare so the unconditional `del` cleanup below never hits an
    # undefined name (avoids memory leak; see note at end of function).
    # BUGFIX: curr_frame was not pre-declared (unlike the batched variant),
    # so `del curr_frame` raised NameError when num_samples == 0.
    curr_face_detection_result = None
    last_face_detection_result = None
    frame = None
    last_frame = None
    curr_frame = None
    ocr_frame = None
    str_text = None

    for i in range(0, num_samples):

        t = perf_counter()
        if t >= last_log_time + 30:
            print(
                f"find_scenes({video_path}): {i}/{num_samples}. Elapsed {int(t - start_time)} s")
            last_log_time = t

        # Read a frame through opencv
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * everyN)
        ret, frame = cap.read()

        # BUGFIX: reading past the end of the stream yields a null frame; the
        # batched variant truncates and stops, but this variant crashed inside
        # cv2.cvtColor. Truncate the result arrays to the samples actually
        # read and stop, matching _generate_frame_similarity_batch.
        if type(frame) != np.ndarray:
            timestamps = timestamps[0:i]
            sim_structural = sim_structural[0:i]
            sim_structural_no_face = sim_structural_no_face[0:i]
            sim_ocr = sim_ocr[0:i]
            break

        timestamps[i] = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

        curr_frame = cv2.cvtColor(cv2.resize(
            frame, (320, 240)), cv2.COLOR_BGR2GRAY)

        # Calculate the SSIM between the current frame and last frame
        if i >= 1:
            sim_structural[i] = ssim(last_frame, curr_frame)

        # Check the sim_structural score to ignore Face Detection and OCR
        is_early_drop = (i >= 1 and sim_structural[i] >= DROP_THRESHOLD and SCENE_DETECT_USE_EARLY_DROP)

        # Drop Face Detection and OCR
        if is_early_drop:
            sim_structural[i] = 1  # By setting all of these to 1 we declare that there is no change in frame here.
            sim_structural_no_face[i] = 1
            sim_ocr[i] = 1

        # Continue Face Detection and OCR
        else:
            if SCENE_DETECT_USE_FACE:
                # Run Face Detection upon the current frame
                curr_face_detection_result = require_face_result(curr_frame)

                # Calculate the SSIM between the current frame and last frame when face & upper body are removed
                if i >= 1:
                    sim_structural_no_face[i] = require_ssim_with_face_detection(
                        curr_frame, curr_face_detection_result, last_frame, last_face_detection_result)

                # Save the current face detection result for the next iteration
                del last_face_detection_result
                last_face_detection_result = curr_face_detection_result
            else:
                sim_structural_no_face[i] = sim_structural[i]

            if SCENE_DETECT_USE_OCR:
                # Calculate the OCR difference between the current frame and last frame
                ocr_frame = cv2.cvtColor(cv2.resize(
                    frame, (480, 360)), cv2.COLOR_BGR2GRAY)
                str_text = pytesseract.image_to_data(
                    ocr_frame, output_type='dict')

                # Keep only confidently-recognized, non-blank words; accumulate
                # each word's confidence scaled to [0, 1].
                phrases = Counter()
                for j in range(len(str_text['conf'])):
                    if int(float(str_text['conf'][j])) >= SIM_OCR_CONFIDENCE and len(str_text['text'][j].strip()) > 0:
                        phrases[str_text['text'][j]] += (float(str_text['conf'][j]) / 100)

                del str_text
                curr_ocr = dict(phrases)

                if i >= 1:
                    sim_ocr[i] = compare_ocr_difference(last_ocr, curr_ocr)

                ocr_output.append(phrases)

                # Save the current OCR output for the next iteration
                if last_ocr:
                    del last_ocr
                last_ocr = curr_ocr
            else:
                sim_ocr[i] = 1 if i >= 1 else 0

        # Save the current frame for the next iteration
        if last_frame is not None:
            del last_frame
        last_frame = curr_frame

    # One or more of these prevents a memory leak. (16GB over 10,000 samples)
    if SCENE_DETECT_USE_OCR:
        del curr_face_detection_result
        del last_ocr

    del last_frame  # May prevent a memory leak
    del frame
    del curr_frame

    return timestamps, sim_structural, sim_structural_no_face, sim_ocr
def _enumerate_scene_candidates(result_queue, args):
    """
    Given a video path, parse the video file and look for possible location where scenes could be cut.
    Intended to be run inside a subprocess: the result tuple is also pushed onto result_queue.

    Parameters:
        result_queue (multiprocessing.Queue): queue onto which the result tuple is pushed
        args (tuple): (video_path, start_time)
            video_path (string): Video path
            start_time (float): the time at which the task started (for reporting incremental performance or progress)

    Returns:
        tuple: (min_samples_between_cut, num_samples, num_frames, everyN,
                timestamps, sim_structural, sim_structural_no_face, sim_ocr)

    NOTE(review): when the video file is missing this returns json.dumps([])
    and puts *nothing* on result_queue -- confirm the parent waiting on the
    queue handles that case.
    """
    (video_path, start_time) = args

    # Extract frames s1,e1,s2,e2,....
    # e1 != s2 but s1 is roughly equal to m1
    # s1 (m1) e1 s2 (m2) e2

    print(f"find_scenes({video_path}) starting...")
    print(
        f"SCENE_DETECT_USE_FACE={SCENE_DETECT_USE_FACE}, SCENE_DETECT_USE_OCR={SCENE_DETECT_USE_OCR}, TARGET_FPS={TARGET_FPS}")

    # Check if the video file exists
    if os.path.exists(video_path):
        print(f"{video_path}: Found file!")
    else:
        print(f"{video_path}: File not found -returning empty scene cuts ")
        return json.dumps([])

    # Get the video capture and number of frames and fps
    cap = cv2.VideoCapture(video_path)
    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    fps = float(cap.get(cv2.CAP_PROP_FPS))

    # Input FPS could be < targetFPS
    everyN = max(1, int(fps / TARGET_FPS))

    num_samples = num_frames // everyN

    print(
        f"find_scenes({video_path}): frames={num_frames}. fps={fps}. everyN={everyN}. samples={num_samples}.")

    # examine num_samples < 3000 (tbd)? if so, lower sampling rate (TARGET_FPS?)
    # probably ~3000 will be maximum in practice
    if num_samples > MAX_SAMPLES:
        print(
            f" >>> WARNING: Sampling every {everyN} frame with {num_frames} frames would "
            f"exceed maximum number of samples {MAX_SAMPLES}.")
        everyN = int(math.ceil(num_frames / MAX_SAMPLES))
        num_samples = num_frames // everyN
        print(f" >>> WARNING: Using alternative sampling rate. everyN={everyN}. samples={num_samples}.")

    # Mininum number of frames per scene
    min_samples_between_cut = max(0, int(MIN_SCENE_LENGTH * TARGET_FPS))

    # Scene Analysis
    timestamps, sim_structural, sim_structural_no_face, sim_ocr = generate_frame_similarity(video_path, num_samples,
                                                                                            everyN, start_time)

    t = perf_counter()
    print(
        f"find_scenes('{video_path}',...) Scene Analysis Complete. Time so far {int(t - start_time)} seconds. Defining Scene Cut points next")

    result = (min_samples_between_cut, num_samples, num_frames, everyN, timestamps, sim_structural, sim_structural_no_face, sim_ocr)
    result_queue.put(result)

    return result
def find_match(curr, ref, width, height, index):
    """
    Decide whether `curr` appears to be a scrolled version of `ref`.

    The central 2/3 of the reference frame is located inside the current frame
    via SIFT feature matching; if a partial affine transform can be estimated
    and shows a translation or vertical scale change, the frames are treated
    as scrolled versions of each other.

    Parameters:
        curr (image): Current frame
        ref (image): Reference frame
        width (int): Frame width in pixels
        height (int): Frame height in pixels
        index (int): Index of the frame cut (used for logging only)

    Returns:
        bool: True when the frames appear to differ only by scrolling
    """
    # Search for the middle of reference frame in the current frame.
    # Ignore edges because they can cloud motion estimation.
    img_x = int(width/6)
    img_y = int(height/6)
    img_w = int(2 * width/3)
    img_h = int(2 * height/3)

    # Create target
    crop_image = ref[img_y:img_y+img_h, img_x:img_x+img_w]

    # Initiate scale-invariant feature detector (Scale Invariant Feature
    # Transform) from Lowe's paper
    # https://doi.org/10.1023/B:VISI.0000029664.99615.94
    sift = cv2.SIFT_create()

    # Find the keypoints and descriptors with SIFT
    kp1, des1 = sift.detectAndCompute(crop_image, None)
    kp2, des2 = sift.detectAndCompute(curr, None)

    # If no features were found in either image, skip
    if des1 is None or des2 is None or len(kp1) < 2 or len(kp2) < 2:
        print(f"No descriptors found (des1={des1 is None}, des2={des2 is None}) at index {index}")
        return False

    # FLANN parameters
    FLANN_INDEX_KDTREE = 1  # Use k-d trees for nearest neighbor search
    NUM_TREES = 5  # Default; more trees improves accuracy but uses more memory and is slower
    NUM_CHECKS = 50  # Number of neighbors to check
    index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = NUM_TREES)
    search_params = dict(checks = NUM_CHECKS)

    flann = cv2.FlannBasedMatcher(index_params, search_params)

    matches = flann.knnMatch(des1, des2, k=2)

    # Ratio test as per Lowe's paper (7.1 Keypoint matching).
    # BUGFIX: knnMatch can return fewer than 2 neighbors for a query, which
    # made the (m, n) tuple unpacking raise ValueError -- skip short pairs.
    DISTANCE_RATIO = 0.7
    good = []
    for pair in matches:
        if len(pair) == 2 and pair[0].distance < DISTANCE_RATIO * pair[1].distance:
            good.append(pair[0])

    print("Matches: ", len(matches))
    print("Good: ", len(good))
    print("Good ratio: ", len(good)/len(matches))

    # If fewer than MIN_GOOD found between frames, then it's unlikely
    # they are scrolled versions of each other
    MIN_GOOD_RATIO = 0.2  # Increase to require more matches for two frames to be considered related
    if len(good)/len(matches) < MIN_GOOD_RATIO:
        print("Not enough good matches to estimate motion")
        return False

    # Calculating distances for matched pairs (crop offset added back so both
    # point sets share the full-frame coordinate system)
    src_pts = np.float32([(kp1[m.queryIdx].pt[0] + img_x, kp1[m.queryIdx].pt[1] + img_y) for m in good])
    dst_pts = np.float32([kp2[m.trainIdx].pt for m in good])

    M, inliers = cv2.estimateAffinePartial2D(src_pts, dst_pts, method=cv2.RANSAC)

    # BUGFIX: estimateAffinePartial2D returns (None, None) when no transform
    # can be estimated; the previous code crashed on np.sum(None).
    if M is None or inliers is None:
        print("Reject: could not estimate motion")
        return False

    # Fraction of matched features that agree with the estimated motion
    inlier_ratio = np.sum(inliers) / len(inliers)

    # If not enough agree, it's unlikely the frames are scrolled versions of each other
    MIN_INLIER_RATIO = 0.3  # Increase to require more consistent motion between frames
    if inlier_ratio < MIN_INLIER_RATIO:
        print("Reject: not enough inliers")
        return False

    tx = M[0,2]  # translation in x
    ty = M[1,2]  # translation in Y
    # sx = np.sqrt(M[0,0]**2 + M[1,0]**2)  # scale in x
    sy = np.sqrt(M[0,1]**2 + M[1,1]**2)  # scale in y

    print("x-shift: ", tx)
    print("y-shift: ", ty)
    print("y-scale: ", sy)

    # Any noticeable vertical scale change or shift counts as scrolling
    if abs(sy - 1) > 0.01 or abs(ty) > 1 or abs(tx) > 1:
        return True

    return False
def filter_scrolling(video_path, frame_cuts):
    """
    Filters out scrolling frames from frame cuts.

    Args:
        video_path (str): Path to the video file.
        frame_cuts (list): Non-empty list of frame numbers where cuts are predicted.

    Returns:
        list: Frame cuts without frames detected as differing only by scrolling.
    """
    # Reverse frame_cuts to enumerate backwards
    frame_cuts = list(reversed(frame_cuts))

    filtered_frame_cuts = [frame_cuts[0]]

    # Load video reader
    cap = cv2.VideoCapture(video_path)

    # Enumerate backwards through frame cuts, comparing against most recently added filtered_frame_cuts element, and check for scrolling
    # If true (difference below threshold), do not add to filtered_frame_cuts; else add
    cap.set(cv2.CAP_PROP_POS_FRAMES, filtered_frame_cuts[0])
    ret, frame = cap.read()

    # BUGFIX: a failed read left `frame` as None and crashed on frame.shape;
    # fall back to returning the cuts unchanged.
    if not ret or frame is None:
        cap.release()
        return list(reversed(frame_cuts))

    height, width, channels = frame.shape
    reference_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    for i in range(1, len(frame_cuts)):
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_cuts[i])
        ret, frame = cap.read()

        # Unreadable frame: keep the cut (conservative) but do not update the
        # reference frame, since there is no image to compare against later.
        if not ret or frame is None:
            filtered_frame_cuts.append(frame_cuts[i])
            continue

        curr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # If reference frame is reasonably found within current frame, skip
        if find_match(curr_frame, reference_frame, width, height, i):
            continue

        # Else, add current frame and update reference frame
        filtered_frame_cuts.append(frame_cuts[i])
        reference_frame = curr_frame

    # BUGFIX: the capture handle was never released (resource leak)
    cap.release()

    return list(reversed(filtered_frame_cuts))
+ This function should be run in a subprocess + + Parameters: + video_path (string): Video path + num_samples (list of float): Amount of samples + everyN (list of float): Number of frames that is ignored each iteration + start_time (list of float): Start time of the whole process + + Returns: + List of string: Timestamps array of each sample frame + List of float: sim_structural array of each sample frame + List of float: sim_structural_no_face array of each sample frame + List of float: sim_ocr array of each sample frame + """ + (video_path, start_idx, end_idx, everyN, start_time) = args + num_samples = end_idx - start_idx + + SIM_OCR_CONFIDENCE = 55 # OCR confidnece used to generate sim_ocr + DROP_THRESHOLD = 0.95 # Minimum sim_structural confidnece to conclude no scene changes + + # Stores the last frame read + last_frame = 0 + + # Stores the last face detetion result + last_face_detection_result = 0 + + # Stores the OCR output of last frame read + last_ocr = dict() + + # List of similarities (SSIMs) between frames + sim_structural = np.zeros(num_samples) + + # List of OCR outputs and OCR similarities + ocr_output = [] + sim_ocr = np.zeros(num_samples) + + # List of similarities (SSIMs) between frames when face is removed + sim_structural_no_face = np.zeros(num_samples) + + timestamps = np.zeros(num_samples) + + # Opencv Reader + cap = cv2.VideoCapture(video_path) + + last_log_time = 0 + # For this loop only we are not using real frame numbers; we are skipping frames to improve processing speed + + # Avoid memory leak by using del + curr_face_detection_result = None + last_face_detection_result = None + frame = None + last_frame = None + curr_frame = None + ocr_frame = None + str_text = None + + for i in range(start_idx, end_idx): + + t = perf_counter() + if t >= last_log_time + 30: + print( + f"find_scenes({video_path}): {i - start_idx}/{num_samples}. 
Elapsed {int(t - start_time)} s") + last_log_time = t + + # Read a frame through opencv + requested_frame_number = i * everyN + cap.set(cv2.CAP_PROP_POS_FRAMES, requested_frame_number) + ret, frame = cap.read() + + # Read a null frame, truncate the result arrays + if type(frame) != np.ndarray: + timestamps = timestamps[0:i - start_idx] + sim_structural = sim_structural[0:i - start_idx] + sim_structural_no_face = sim_structural_no_face[0:i - start_idx] + sim_ocr = sim_ocr[0:i - start_idx] + break + + timestamps[i - start_idx] = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000 + + curr_frame = cv2.cvtColor(cv2.resize( + frame, (320, 240)), cv2.COLOR_BGR2GRAY) + + # Calculate the SSIM between the current frame and last frame + if i - start_idx >= 1: + sim_structural[i - start_idx] = ssim(last_frame, curr_frame) + + # Check the sim_structural score to ignore Face Detection and OCR + is_early_drop = (i - start_idx >= 1 and sim_structural[i - start_idx] >= DROP_THRESHOLD and SCENE_DETECT_USE_EARLY_DROP) + + # Drop Face Detection and OCR + if is_early_drop: + sim_structural[i - start_idx] = 1 # By setting all of these to 1 we declare that there is no change in frame here. 
+ sim_structural_no_face[i - start_idx] = 1 + sim_ocr[i - start_idx] = 1 + + # Continue Face Detection and OCR + else: + if SCENE_DETECT_USE_FACE: + # Run Face Detection upon the current frame + curr_face_detection_result = require_face_result(curr_frame) + + # Calculate the SSIM between the current frame and last frame when face & upper body are removed + if i - start_idx >= 1: + sim_structural_no_face[i - start_idx] = require_ssim_with_face_detection( + curr_frame, curr_face_detection_result, last_frame, last_face_detection_result) + + # Save the current face detection result for the next iteration + del last_face_detection_result + last_face_detection_result = curr_face_detection_result + else: + sim_structural_no_face[i - start_idx] = sim_structural[i - start_idx] + + if SCENE_DETECT_USE_OCR: + # Calculate the OCR difference between the current frame and last frame + ocr_frame = cv2.cvtColor(cv2.resize( + frame, (480, 360)), cv2.COLOR_BGR2GRAY) + str_text = pytesseract.image_to_data( + ocr_frame, output_type='dict') + + phrases = Counter() + for j in range(len(str_text['conf'])): + if int(float(str_text['conf'][j])) >= SIM_OCR_CONFIDENCE and len(str_text['text'][j].strip()) > 0: + phrases[str_text['text'][j] + ] += (float(str_text['conf'][j]) / 100) + + del str_text + curr_ocr = dict(phrases) + + if i >= 1: + sim_ocr[i - start_idx] = compare_ocr_difference(last_ocr, curr_ocr) + + ocr_output.append(phrases) + + # Save the current OCR output for the next iteration + if last_ocr: + del last_ocr + last_ocr = curr_ocr + else: + sim_ocr[i - start_idx] = 1 if i >= 1 else 0 + + # Save the current frame for the next iteration + if last_frame is not None: + del last_frame + last_frame = curr_frame + + # One or more these prevents a memory leak. 
(16GB over 10,000 samples) + if SCENE_DETECT_USE_OCR: + del curr_face_detection_result + del last_ocr + + del last_frame # May prevent a memory leak + del frame + del curr_frame + + results = (timestamps, sim_structural, sim_structural_no_face, sim_ocr) + result_queue.put(results) + + return results + + def enumerate_scene_candidates_batch(self, video_path, start_time): + """ + Given a video path, parse the video file and look for possible location where scenes could be cut as a sequence of subprocess. + Each subprocess will process a batch of frames. + + Parameters: + video_path (string): Video path + start_time (datetime): the time at which the task started (for reporting incremental performance or progress) + + Returns: + string: Features of detected scenes + """ + CONCURRENCY = 1 + FRAME_PER_PROCESS = 300 # Maximum concurrent processes allowed + + print(f"find_scenes({video_path}) starting...") + print( + f"SCENE_DETECT_USE_FACE={SCENE_DETECT_USE_FACE}, SCENE_DETECT_USE_OCR={SCENE_DETECT_USE_OCR}, TARGET_FPS={TARGET_FPS}") + + # Check if the video file exists + if os.path.exists(video_path): + print(f"{video_path}: Found file!") + else: + print(f"{video_path}: File not found -returning empty scene cuts ") + return json.dumps([]) + + # Get the video capture and number of frames and fps + cap = cv2.VideoCapture(video_path) + num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + fps = float(cap.get(cv2.CAP_PROP_FPS)) + + # Input FPS could be < targetFPS + everyN = max(1, int(fps / TARGET_FPS)) + + num_samples = num_frames // everyN + + print( + f"find_scenes({video_path}): frames={num_frames}. fps={fps}. everyN={everyN}. samples={num_samples}.") + + # examine num_samples < 3000 (tbd)? if so, lower sampling rate (TARGET_FPS?) 
+ # probably ~3000 will be maximum in practice + if num_samples > MAX_SAMPLES: + print( + f" >>> WARNING: Sampling every {everyN} frame with {num_frames} frames would " + f"exceed maximum number of samples {MAX_SAMPLES}.") + everyN = int(math.ceil(num_frames / MAX_SAMPLES)) + num_samples = num_frames // everyN + print(f" >>> WARNING: Using alternative sampling rate. everyN={everyN}. samples={num_samples}.") + + # Mininum number of frames per scene + min_samples_between_cut = max(0, int(MIN_SCENE_LENGTH * TARGET_FPS)) + + # Scene Analysis + sema = Semaphore(CONCURRENCY) + num_batches = math.floor(num_samples / FRAME_PER_PROCESS) + 1 + + timestamps = [] + sim_structural = [] + sim_structural_no_face = [] + sim_ocr = [] + for i in range(num_batches): + start_idx = i * FRAME_PER_PROCESS + end_idx = min(start_idx + FRAME_PER_PROCESS, num_samples) + print("Scene Analysis - Processing from " + str(start_idx) + " to " + str(end_idx)) + + sema.acquire() + try: + args = (video_path, start_idx, end_idx, everyN, start_time) + local_result = self.run_as_subprocess(target=self._generate_frame_similarity_batch, args=args) + + timestamps.extend(local_result[0]) + sim_structural.extend(local_result[1]) + sim_structural_no_face.extend(local_result[2]) + sim_ocr.extend(local_result[3]) + + except Exception as e: + print(f"_generate_frame_similarity_batch throwing Exception:" + str(e)) + + sema.release() + + # Correct num_samples according to the real number of frame read + num_samples = len(timestamps) + print("Scene Analysis - " + str(num_samples) + " extracted!") + + t = perf_counter() + print( + f"find_scenes('{video_path}',...) Scene Analysis Complete. Time so far {int(t - start_time)} seconds. 
Defining Scene Cut points next") + + return (min_samples_between_cut, num_samples, num_frames, everyN, timestamps, sim_structural, sim_structural_no_face, sim_ocr) + + def enumerate_scene_candidates(self, video_path, start_time): + return self.run_as_subprocess(target=_enumerate_scene_candidates, args=(video_path, start_time)) + + def find_scenes(self, video_path): + """ + The main method of the SceneDetectionAlgorithm. Override this in your subclass. + + Parameters: + video_path (string): Video path + + Returns: + string: Features of detected scenes + """ + print("video_path",video_path) + + start_time = perf_counter() + + # 1. Enumerate candidates as subprocess and block until it completes + print(' >>>>> USING NEW SceneDetection Running Step 1/3 (subprocess): ' + video_path) + (min_samples_between_cut, num_samples, num_frames, everyN, timestamps, + sim_structural, sim_structural_no_face, sim_ocr) = self.enumerate_scene_candidates_batch(video_path, start_time) + + # 2. Calculate the combined similarities score + print(' >>>>> SceneDetection Running Step 2/3 (main process): ' + video_path) + combined_similarities = calculate_score( + sim_structural, sim_ocr, sim_structural_no_face) + + # actual pixels/color differences, text/object differences, face/mouth differences + + # Calculate the combined similarities score + predicted_labels = calculate_score( + sim_structural, sim_ocr, sim_structural_no_face) + + # Find cuts by finding where combined predicted_labels == 1 + samples_cut_candidates = np.argwhere(predicted_labels == 1).flatten() + + print(f"{video_path}: {len(samples_cut_candidates)} candidates identified") + if len(samples_cut_candidates) == 0: + print(f"{video_path}:Returning early - no scene cuts found") + return json.dumps([]) + + # Get real scene cuts by filtering out those that happen within min_frames of the last cut + sample_cuts = [samples_cut_candidates[0]] + for i in range(1, len(samples_cut_candidates)): + if samples_cut_candidates[i] >= 
samples_cut_candidates[i - 1] + min_samples_between_cut: + sample_cuts += [samples_cut_candidates[i]] + + if num_samples > 1: + sample_cuts += [num_samples - 1] + + # Now work in frames again. Make sure we are using regular ints (not numpy ints) other json serialization will fail + frame_cuts = [int(s * everyN) for s in sample_cuts] + + # Filter out frames differing only by scrolling + filtered_frame_cuts = filter_scrolling(video_path, frame_cuts) + + filtered_frame_cuts = [int(x) for x in filtered_frame_cuts] + + # Finish up by calling helper method to cut scenes and run OCR + print(' >>>>> SceneDetection Running Step 3/3 (mutiple subprocess): ' + video_path) + return self.extract_scene_information_batch(video_path, timestamps, filtered_frame_cuts, everyN, start_time) +