import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image, ImageSequence
import cv2
import numpy as np
import time
import random
import pygame
import os

# ==========================
# 🧠 Model Loading & Inference (Enhanced ResNet152)
# ==========================
# Configuration for enhanced model
IMG_SIZE = 224 # Enhanced image size for better feature extraction
MODEL_ARCH = "ResNet152" # Using ResNet152 instead of ResNet18
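# Note (assumption): IMG_SIZE and the 7-class label order in class_names
# (defined in live_emotion_detection below) are assumed to match what the
# ResNet152 checkpoint was trained with; changing either without retraining
# would silently misalign predictions.
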
# Enhanced model setup with better optimization (matching training)
def create_model(num_classes, use_imagenet_weights=True, dropout_rate=0.5):
    weights = models.ResNet152_Weights.DEFAULT if use_imagenet_weights else None
    model = models.resnet152(weights=weights)

    # Add dropout before final layer for regularization
    in_feats = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(dropout_rate),
        nn.Linear(in_feats, num_classes)
    )
    return model

def load_model(model_path, num_classes=7):
    """Load the trained emotion model"""
    print(f"📥 Loading model from: {model_path}")

    # Create enhanced model architecture (same as ResNet152 training)
    model = create_model(num_classes=num_classes, use_imagenet_weights=False, dropout_rate=0.3)

    # Load trained weights
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    # Set to evaluation mode
    model.eval()

    print("✅ Model loaded successfully")
    return model

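# Note (assumption): load_model() expects the checkpoint to be a dict with a
# 'model_state_dict' key, as presumably saved by the training script; a file
# containing only the raw state_dict would need model.load_state_dict(checkpoint)
# instead.
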
def detect_and_crop_face(frame, face_cascade):
    """Detect face in frame and return cropped face region"""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces
    faces = face_cascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )

    if len(faces) > 0:
        # Get the largest face
        face = max(faces, key=lambda x: x[2] * x[3])
        x, y, w, h = face

        # Crop face region
        face_crop = frame[y:y+h, x:x+w]
        return face_crop, (x, y, w, h)

    return None, None

def preprocess_face(face_crop, img_size=224):
    """Preprocess face crop for emotion prediction (enhanced ResNet152 model)"""
    # Convert to grayscale first (to match training data processing)
    gray_face = cv2.cvtColor(face_crop, cv2.COLOR_BGR2GRAY)

    # Step 1: Resize to 48x48 first (matching original training data size)
    face_48x48 = cv2.resize(gray_face, (48, 48))

    # Step 2: Resize from 48x48 to 224x224 for the enhanced model input
    resized_gray = cv2.resize(face_48x48, (img_size, img_size))

    # Convert grayscale to RGB format (3 channels) for the model
    # This creates RGB channels with identical grayscale values
    resized_face = cv2.cvtColor(resized_gray, cv2.COLOR_GRAY2RGB)

    # Same transforms as enhanced training data
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Convert to PIL Image
    image = Image.fromarray(resized_face)

    # Apply transforms
    image_tensor = transform(image).unsqueeze(0)  # Add batch dimension

    return image_tensor

def predict_emotion(model, image_tensor, class_names):
    """Predict emotion from image tensor"""
    with torch.no_grad():
        outputs = model(image_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(outputs, dim=1).item()
        confidence = probabilities[0][predicted_class].item()

    return predicted_class, confidence, probabilities[0]

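# Minimal usage sketch (hypothetical variable names; assumes `model` comes from
# load_model() and `face_crop` from detect_and_crop_face()):
#   image_tensor = preprocess_face(face_crop)
#   idx, conf, probs = predict_emotion(model, image_tensor, class_names)
#   print(f"{class_names[idx]}: {conf:.1%}")
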
def load_emote_assets():
    """Load emote assets and initialize pygame for sound"""
    assets_dir = "assets"
    emotes_dir = os.path.join(assets_dir, "emotes")
    sounds_dir = os.path.join(assets_dir, "sounds")

    # Initialize pygame mixer for sound
    pygame.mixer.init()

    # Load emote assets
    emote_assets = {
        'angry': {
            'image': os.path.join(emotes_dir, "angry.png"),
            'sound': os.path.join(sounds_dir, "angry.mp3")
        },
        'happy': {
            'image': os.path.join(emotes_dir, "happy.gif"),
            'sound': os.path.join(sounds_dir, "happy.mp3")
        },
        'sad': {
            'images': [
                os.path.join(emotes_dir, "sad1.png"),
                os.path.join(emotes_dir, "sad2.png")
            ],
            'sounds': [
                os.path.join(sounds_dir, "sad1.mp3"),
                os.path.join(sounds_dir, "sad2.mp3")
            ]
        },
        'fear': {
            'image': os.path.join(emotes_dir, "fear.gif"),
            'sound': os.path.join(sounds_dir, "fear.mp3")
        },
        'surprise': {
            'image': os.path.join(emotes_dir, "surprise.gif"),
            'sound': os.path.join(sounds_dir, "surprise.mp3")
        },
        'pig_twerking': {
            'image': os.path.join(emotes_dir, "pig_twerk.gif"),
            'sound': os.path.join(sounds_dir, "pig_twerk.mp3")
        }
    }

    return emote_assets

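# Note (assumption): this expects an assets/ directory next to app.py with
# emotes/ and sounds/ subfolders containing the files named above. Missing
# files are not checked here; they only surface later as load/play errors.
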
def get_sound_duration(sound_path):
    """Get sound duration in seconds"""
    try:
        sound = pygame.mixer.Sound(sound_path)
        duration = sound.get_length()  # Duration in seconds
        return duration
    except Exception as e:
        print(f"Error getting sound duration for {sound_path}: {e}")
        return 3.0  # Default 3 seconds if error

def play_sound(sound_path):
    """Play sound effect and return its duration"""
    if sound_path is None:
        print("No sound file for this emotion")
        return 2.0  # Default 2 seconds for silent emotions (like surprise)

    try:
        pygame.mixer.music.load(sound_path)
        pygame.mixer.music.play()
        return get_sound_duration(sound_path)
    except Exception as e:
        print(f"Sound error: {e}")
        return 3.0  # Default duration

def load_emote(emote_path, face_size=None):
    """Load emote and resize to 1.5x face size while maintaining aspect ratio"""
    try:
        if emote_path.endswith('.gif'):
            # Handle GIF files
            gif = Image.open(emote_path)
            frames = []
            for frame in ImageSequence.Iterator(gif):
                frame = frame.convert('RGBA')
                # Resize if face_size is provided
                if face_size:
                    frame = resize_emote_to_face(frame, face_size)
                frames.append(frame)
            return frames
        else:
            # Handle static images
            img = Image.open(emote_path).convert('RGBA')
            # Resize if face_size is provided
            if face_size:
                img = resize_emote_to_face(img, face_size)
            return [img]
    except Exception as e:
        print(f"Error loading emote {emote_path}: {e}")
        return None

def resize_emote_to_face(emote_img, face_size):
    """Resize emote to 1.5x face size while maintaining emote's aspect ratio"""
    face_w, face_h = face_size
    emote_w, emote_h = emote_img.size

    # Calculate target size (1.5x face size)
    target_w = int(face_w * 1.5)
    target_h = int(face_h * 1.5)

    # Calculate scale to fit target size while maintaining aspect ratio
    scale_w = target_w / emote_w
    scale_h = target_h / emote_h

    # Use the smaller scale to ensure emote fits within 1.5x face bounds
    scale = min(scale_w, scale_h)

    # Calculate final dimensions
    final_w = int(emote_w * scale)
    final_h = int(emote_h * scale)

    # Resize the emote
    resized_emote = emote_img.resize((final_w, final_h), Image.Resampling.LANCZOS)
    return resized_emote

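# Worked example (illustrative numbers only): for a 100x100 face and a 200x100
# emote, the target box is 150x150, so scale = min(150/200, 150/100) = 0.75 and
# the emote is resized to 150x75, keeping it inside 1.5x the face size.
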
def display_emote_on_frame(frame, emote_frames, position, frame_idx=0):
    """Display emote on frame at specified position"""
    if emote_frames is None or len(emote_frames) == 0:
        return frame

    # Get current frame (for GIFs) or first frame (for static images)
    current_emote = emote_frames[frame_idx % len(emote_frames)]

    # Convert PIL to OpenCV format
    emote_cv = cv2.cvtColor(np.array(current_emote), cv2.COLOR_RGBA2BGRA)

    # Get dimensions
    h, w = frame.shape[:2]
    emote_h, emote_w = emote_cv.shape[:2]

    # Calculate position
    x, y = position

    # Ensure emote fits within frame
    if x + emote_w > w:
        x = w - emote_w
    if y + emote_h > h:
        y = h - emote_h
    if x < 0:
        x = 0
    if y < 0:
        y = 0

    # Create overlay
    overlay = frame.copy()

    # Extract alpha channel
    alpha = emote_cv[:, :, 3] / 255.0
    alpha = np.stack([alpha] * 3, axis=2)

    # Blend emote onto frame
    for c in range(3):
        overlay[y:y+emote_h, x:x+emote_w, c] = (
            alpha[:, :, c] * emote_cv[:, :, c] +
            (1 - alpha[:, :, c]) * frame[y:y+emote_h, x:x+emote_w, c]
        )

    return overlay

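# Note: the alpha blend above relies on the emote frames carrying an alpha
# channel (load_emote converts everything to RGBA) and on the emote being no
# larger than the camera frame; an oversized emote would make the slice shapes
# mismatch and raise an error.
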
def get_random_border_position(frame_shape, emote_size, exclude_top_left=True):
    """Get a random position closer to the border of the frame, avoiding top-left area"""
    frame_h, frame_w = frame_shape[:2]
    emote_w, emote_h = emote_size

    # Define border regions (closer to edges with smaller spacing)
    positions = []
    border_offset = 5  # Closer to border (was implicit 0)

    if not exclude_top_left:
        # Top edge - very close to top
        for x in range(border_offset, frame_w - emote_w - border_offset, 15):
            positions.append((x, border_offset))
    else:
        # Top edge (excluding left 200px for text) - very close to top
        for x in range(200, frame_w - emote_w - border_offset, 15):
            positions.append((x, border_offset))

    # Right edge - very close to right side
    for y in range(border_offset, frame_h - emote_h - border_offset, 15):
        positions.append((frame_w - emote_w - border_offset, y))

    # Bottom edge - very close to bottom
    for x in range(border_offset, frame_w - emote_w - border_offset, 15):
        positions.append((x, frame_h - emote_h - border_offset))

    # Left edge (excluding top 150px for text) - very close to left side
    for y in range(150, frame_h - emote_h - border_offset, 15):
        positions.append((border_offset, y))

    return random.choice(positions) if positions else (frame_w - emote_w, frame_h - emote_h)

def play_pig_sound(sound_path, volume=0.05):
    """Play pig twerk sound using a different pygame channel at reduced volume"""
    try:
        # Use a separate sound channel for pig sounds to avoid interfering with emotion sounds
        pig_sound = pygame.mixer.Sound(sound_path)
        pig_sound.set_volume(volume)  # Set pig sound to 5% volume (quieter than emotion sounds)
        pig_channel = pygame.mixer.Channel(1)  # Use channel 1 for pig sounds
        pig_channel.play(pig_sound)
        return pig_sound.get_length()
    except Exception as e:
        print(f"Pig sound error: {e}")
        return 2.0  # Default duration

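# Design note: emotion sounds go through the pygame.mixer.music stream (see
# play_sound above) while the pig sound uses mixer Channel(1), so the two can
# overlap without cutting each other off.
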
def live_emotion_detection():
    """Live camera feed with emotion detection"""
    # Configuration
    model_path = "best_model.pth"  # Updated to use ResNet152 model
    class_names = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    print("🎯 Live Emotion Detection with Emotes & Sounds")
    print("=" * 50)

    # Load model
    try:
        model = load_model(model_path)
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    # Load emote assets
    try:
        emote_assets = load_emote_assets()
        print("✅ Emote assets loaded successfully")
    except Exception as e:
        print(f"❌ Error loading emote assets: {e}")
        return

    # Load face cascade
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Initialize camera
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FPS, 30)  # Set to 30 FPS
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Reduce buffer size for lower latency

    if not cap.isOpened():
        print("❌ Could not open camera")
        return

    print("📹 Starting live camera feed with continuous emotion detection...")
    print("Press 'q' to quit, 's' to save current frame")
    print("=" * 50)

    frame_count = 0
    last_prediction_time = 0
    prediction_interval = 0.5  # Predict every 0.5 seconds
    current_emotion = None
    current_confidence = 0.0

    # Emote and sound tracking
    emotion_start_time = None
    current_emote_frames = None
    emote_frame_idx = 0
    last_sound_time = 0
    sound_interval = 2.0  # Play sound every 2 seconds

    # Sound-based emote duration tracking
    emote_start_time = None
    emote_duration = 0  # Duration based on sound length
    current_sound_duration = 0
    current_emote_emotion = None  # Store which emotion is currently being displayed

    # Pig twerking animation tracking
    pig_frames = None
    pig_frame_idx = 0
    pig_start_time = None
    pig_duration = 0
    pig_position = None
    last_pig_time = 0
    pig_interval = 10.0  # Show pig every 10 seconds
    while True:
        ret, frame = cap.read()
        if not ret:
            print("❌ Failed to capture frame")
            break

        # Flip frame horizontally for mirror effect
        frame = cv2.flip(frame, 1)

        # Detect faces in current frame
        face_crop, face_coords = detect_and_crop_face(frame, face_cascade)

        # Draw face rectangle if detected
        if face_coords is not None:
            x, y, w, h = face_coords
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, "Face Detected", (x, y-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Continuous emotion detection
        current_time = time.time()
        if (face_crop is not None and
                current_time - last_prediction_time > prediction_interval):
            try:
                # Brighten face for better analysis
                brightened_face = cv2.convertScaleAbs(face_crop, alpha=1.5, beta=30)

                # Preprocess face (same as get_pics.py)
                image_tensor = preprocess_face(brightened_face)

                # Make prediction
                predicted_class, confidence, all_probabilities = predict_emotion(model, image_tensor, class_names)

                # Update current emotion
                current_emotion = predicted_class
                current_confidence = confidence
                last_prediction_time = current_time

                # Track emotion duration
                if emotion_start_time is None:
                    emotion_start_time = current_time

                # Print result to console (every 10 frames to avoid spam)
                if frame_count % 10 == 0:
                    print(f"Frame {frame_count}: {class_names[predicted_class].upper()} (Confidence: {confidence:.1%})")

            except Exception as e:
                print(f"Prediction error: {e}")

        # Handle emote display and sound effects (for angry, happy, sad, fear, surprise with 50%+ confidence)
        if (current_emotion is not None and
                current_confidence >= 0.5 and
                class_names[current_emotion] in ['angry', 'happy', 'sad', 'fear', 'surprise']):

            emotion_name = class_names[current_emotion]
            emotion_duration = current_time - emotion_start_time if emotion_start_time else 0

            # Check if emotion has lasted for 1 second or more
            if emotion_duration >= 1.0:
                # Load and display emote
                if emotion_name in emote_assets:
                    if emotion_name == 'sad':
                        # Randomly choose between sad1 and sad2
                        emote_idx = random.randint(0, 1)
                        emote_path = emote_assets[emotion_name]['images'][emote_idx]
                        sound_path = emote_assets[emotion_name]['sounds'][emote_idx]
                    else:
                        emote_path = emote_assets[emotion_name]['image']
                        sound_path = emote_assets[emotion_name]['sound']

                    # Load emote if not already loaded (resize to 1.5x face size)
                    if current_emote_frames is None and face_coords:
                        x, y, w, h = face_coords
                        face_size = (w, h)
                        current_emote_frames = load_emote(emote_path, face_size)

                    # Play sound and start emote display when emotion first detected
                    if emote_start_time is None:
                        current_sound_duration = play_sound(sound_path)
                        emote_start_time = current_time
                        emote_duration = current_sound_duration
                        # Store the current emotion type for later display
                        current_emote_emotion = emotion_name
                        print(f"🎵 Playing {emotion_name} sound for {emote_duration:.1f}s")
                    else:
                        # Don't reset emote here - let it play for full duration
                        pass
        # Display emote if it's currently playing (regardless of current emotion detection)
        if current_emote_frames is not None and face_coords:
            x, y, w, h = face_coords
            # Position emotes to the side of face based on the original emotion type
            emote_w, emote_h = current_emote_frames[0].size

            if current_emote_emotion is not None:
                if current_emote_emotion == 'angry':
                    # Angry to the left of face
                    emote_x = max(0, x - emote_w - 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                elif current_emote_emotion == 'sad':
                    # Sad to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                elif current_emote_emotion == 'fear':
                    # Fear above the face
                    emote_x = x + (w - emote_w) // 2  # Center horizontally with face
                    emote_y = max(0, y - emote_h - 15)  # 15px gap above face
                elif current_emote_emotion == 'surprise':
                    # Surprise to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                else:  # happy (default)
                    # Happy to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
            else:
                # Default position if emotion type not stored
                emote_x = min(frame.shape[1] - emote_w, x + w + 10)
                emote_y = y + (h - emote_h) // 2

            frame = display_emote_on_frame(frame, current_emote_frames, (emote_x, emote_y), emote_frame_idx)
            emote_frame_idx += 1

        # Check if emote display duration (based on sound) has ended - INDEPENDENT of emotion detection
        if emote_start_time is not None and current_time - emote_start_time >= emote_duration:
            # Reset emote after sound duration has passed
            current_emote_frames = None
            emote_frame_idx = 0
            emote_start_time = None
            current_emote_emotion = None  # Reset stored emotion type
            print(f"🔇 Emote display ended after {emote_duration:.1f}s")

        # Reset emotion tracking if no supported emotion detected or confidence too low
        # BUT keep emote playing if it's already started
        if (current_emotion is None or
                current_confidence < 0.5 or
                class_names[current_emotion] not in ['angry', 'happy', 'sad', 'fear', 'surprise']):
            emotion_start_time = None
            # Don't reset emote here - let it finish playing for full duration
        # Handle pig twerking animation (every 10 seconds, per pig_interval)
        if current_time - last_pig_time >= pig_interval:
            # Load pig animation if not already loaded (resize relative to a 50x50 reference size)
            if pig_frames is None:
                pig_frames = load_emote(emote_assets['pig_twerking']['image'], face_size=(50, 50))

            if pig_frames:
                # Play pig sound and start animation
                pig_sound_path = emote_assets['pig_twerking']['sound']
                pig_duration = play_pig_sound(pig_sound_path)
                pig_start_time = current_time
                pig_frame_idx = 0

                # Get random border position (excluding top-left for text)
                pig_size = pig_frames[0].size
                pig_position = get_random_border_position(frame.shape, pig_size, exclude_top_left=True)

                last_pig_time = current_time
                print(f"🐷 Pig twerking at position {pig_position} for {pig_duration:.1f}s!")

        # Display pig animation if it's currently playing
        if (pig_frames is not None and pig_start_time is not None and
                current_time - pig_start_time < pig_duration and pig_position is not None):
            frame = display_emote_on_frame(frame, pig_frames, pig_position, pig_frame_idx)
            pig_frame_idx += 1
        elif pig_start_time is not None and current_time - pig_start_time >= pig_duration:
            # Reset pig animation after duration ends
            pig_start_time = None
            pig_position = None
            pig_frame_idx = 0
        # Display current emotion on frame
        if current_emotion is not None and current_confidence > 0.5:
            emotion_text = f"{class_names[current_emotion].upper()}"
            confidence_text = f"Confidence: {current_confidence:.1%}"

            # Position text
            cv2.putText(frame, emotion_text, (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)
            cv2.putText(frame, confidence_text, (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show live camera feed
        cv2.imshow('Live Emotion Detection', frame)

        # Handle key presses
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("👋 Quitting...")
            break
        elif key == ord('s'):
            # Save current frame
            filename = f"captured_frame_{frame_count}.jpg"
            cv2.imwrite(filename, frame)
            print(f"💾 Frame saved as {filename}")

        frame_count += 1

    # Cleanup
    cap.release()
    cv2.destroyAllWindows()
    print("✅ Camera inference stopped")

if __name__ == "__main__":
    live_emotion_detection()