import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image, ImageSequence
import cv2
import numpy as np
import time
import random
import pygame
import os

# ==========================
# 🧠 Model Loading & Inference (Enhanced ResNet152)
# ==========================
# Configuration for enhanced model
IMG_SIZE = 224 # Enhanced image size for better feature extraction
MODEL_ARCH = "ResNet152" # Using ResNet152 instead of ResNet18
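# Note (assumption): IMG_SIZE and the 7-class label order in class_names
# (defined in live_emotion_detection below) are assumed to match what the
# ResNet152 checkpoint was trained with; changing either without retraining
# would silently misalign predictions.
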
# Enhanced model setup with better optimization (matching training)
def create_model(num_classes, use_imagenet_weights=True, dropout_rate=0.5):
    weights = models.ResNet152_Weights.DEFAULT if use_imagenet_weights else None
    model = models.resnet152(weights=weights)

    # Add dropout before final layer for regularization
    in_feats = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(dropout_rate),
        nn.Linear(in_feats, num_classes)
    )
    return model

def load_model(model_path, num_classes=7):
    """Load the trained emotion model"""
    print(f"📥 Loading model from: {model_path}")

    # Create enhanced model architecture (same as ResNet152 training)
    model = create_model(num_classes=num_classes, use_imagenet_weights=False, dropout_rate=0.3)

    # Load trained weights
    checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])

    # Set to evaluation mode
    model.eval()

    print("✅ Model loaded successfully")
    return model

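# Note (assumption): load_model() expects the checkpoint to be a dict with a
# 'model_state_dict' key, as presumably saved by the training script; a file
# containing only the raw state_dict would need model.load_state_dict(checkpoint)
# instead.
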
def detect_and_crop_face(frame, face_cascade):
    """Detect face in frame and return cropped face region"""
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces
    faces = face_cascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )

    if len(faces) > 0:
        # Get the largest face
        face = max(faces, key=lambda x: x[2] * x[3])
        x, y, w, h = face

        # Crop face region
        face_crop = frame[y:y+h, x:x+w]
        return face_crop, (x, y, w, h)

    return None, None

def preprocess_face(face_crop, img_size=224):
    """Preprocess face crop for emotion prediction (enhanced ResNet152 model)"""
    # Convert to grayscale first (to match training data processing)
    gray_face = cv2.cvtColor(face_crop, cv2.COLOR_BGR2GRAY)

    # Step 1: Resize to 48x48 first (matching original training data size)
    face_48x48 = cv2.resize(gray_face, (48, 48))

    # Step 2: Resize from 48x48 to 224x224 for the enhanced model input
    resized_gray = cv2.resize(face_48x48, (img_size, img_size))

    # Convert grayscale to RGB format (3 channels) for the model
    # This creates RGB channels with identical grayscale values
    resized_face = cv2.cvtColor(resized_gray, cv2.COLOR_GRAY2RGB)

    # Same transforms as enhanced training data
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Convert to PIL Image
    image = Image.fromarray(resized_face)

    # Apply transforms
    image_tensor = transform(image).unsqueeze(0)  # Add batch dimension

    return image_tensor

def predict_emotion(model, image_tensor, class_names):
    """Predict emotion from image tensor"""
    with torch.no_grad():
        outputs = model(image_tensor)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(outputs, dim=1).item()
        confidence = probabilities[0][predicted_class].item()

    return predicted_class, confidence, probabilities[0]

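# Minimal usage sketch (hypothetical variable names; assumes `model` comes from
# load_model() and `face_crop` from detect_and_crop_face()):
#   image_tensor = preprocess_face(face_crop)
#   idx, conf, probs = predict_emotion(model, image_tensor, class_names)
#   print(f"{class_names[idx]}: {conf:.1%}")
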
def load_emote_assets():
    """Load emote assets and initialize pygame for sound"""
    assets_dir = "assets"
    emotes_dir = os.path.join(assets_dir, "emotes")
    sounds_dir = os.path.join(assets_dir, "sounds")

    # Initialize pygame mixer for sound
    pygame.mixer.init()

    # Load emote assets
    emote_assets = {
        'angry': {
            'image': os.path.join(emotes_dir, "angry.png"),
            'sound': os.path.join(sounds_dir, "angry.mp3")
        },
        'happy': {
            'image': os.path.join(emotes_dir, "happy.gif"),
            'sound': os.path.join(sounds_dir, "happy.mp3")
        },
        'sad': {
            'images': [
                os.path.join(emotes_dir, "sad1.png"),
                os.path.join(emotes_dir, "sad2.png")
            ],
            'sounds': [
                os.path.join(sounds_dir, "sad1.mp3"),
                os.path.join(sounds_dir, "sad2.mp3")
            ]
        },
        'fear': {
            'image': os.path.join(emotes_dir, "fear.gif"),
            'sound': os.path.join(sounds_dir, "fear.mp3")
        },
        'surprise': {
            'image': os.path.join(emotes_dir, "surprise.gif"),
            'sound': os.path.join(sounds_dir, "surprise.mp3")
        },
        'pig_twerking': {
            'image': os.path.join(emotes_dir, "pig_twerk.gif"),
            'sound': os.path.join(sounds_dir, "pig_twerk.mp3")
        }
    }

    return emote_assets

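# Note (assumption): this expects an assets/ directory next to app.py with
# emotes/ and sounds/ subfolders containing the files named above. Missing
# files are not checked here; they only surface later as load/play errors.
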
def get_sound_duration(sound_path):
    """Get sound duration in seconds"""
    try:
        sound = pygame.mixer.Sound(sound_path)
        duration = sound.get_length()  # Duration in seconds
        return duration
    except Exception as e:
        print(f"Error getting sound duration for {sound_path}: {e}")
        return 3.0  # Default 3 seconds if error

def play_sound(sound_path):
    """Play sound effect and return its duration"""
    if sound_path is None:
        print("No sound file for this emotion")
        return 2.0  # Default 2 seconds for silent emotions (like surprise)

    try:
        pygame.mixer.music.load(sound_path)
        pygame.mixer.music.play()
        return get_sound_duration(sound_path)
    except Exception as e:
        print(f"Sound error: {e}")
        return 3.0  # Default duration

def load_emote(emote_path, face_size=None):
    """Load emote and resize to 1.5x face size while maintaining aspect ratio"""
    try:
        if emote_path.endswith('.gif'):
            # Handle GIF files
            gif = Image.open(emote_path)
            frames = []
            for frame in ImageSequence.Iterator(gif):
                frame = frame.convert('RGBA')
                # Resize if face_size is provided
                if face_size:
                    frame = resize_emote_to_face(frame, face_size)
                frames.append(frame)
            return frames
        else:
            # Handle static images
            img = Image.open(emote_path).convert('RGBA')
            # Resize if face_size is provided
            if face_size:
                img = resize_emote_to_face(img, face_size)
            return [img]
    except Exception as e:
        print(f"Error loading emote {emote_path}: {e}")
        return None

def resize_emote_to_face(emote_img, face_size):
    """Resize emote to 1.5x face size while maintaining emote's aspect ratio"""
    face_w, face_h = face_size
    emote_w, emote_h = emote_img.size

    # Calculate target size (1.5x face size)
    target_w = int(face_w * 1.5)
    target_h = int(face_h * 1.5)

    # Calculate scale to fit target size while maintaining aspect ratio
    scale_w = target_w / emote_w
    scale_h = target_h / emote_h

    # Use the smaller scale to ensure emote fits within 1.5x face bounds
    scale = min(scale_w, scale_h)

    # Calculate final dimensions
    final_w = int(emote_w * scale)
    final_h = int(emote_h * scale)

    # Resize the emote
    resized_emote = emote_img.resize((final_w, final_h), Image.Resampling.LANCZOS)
    return resized_emote

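# Worked example (illustrative numbers only): for a 100x100 face and a 200x100
# emote, the target box is 150x150, so scale = min(150/200, 150/100) = 0.75 and
# the emote is resized to 150x75, keeping it inside 1.5x the face size.
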
def display_emote_on_frame(frame, emote_frames, position, frame_idx=0):
    """Display emote on frame at specified position"""
    if emote_frames is None or len(emote_frames) == 0:
        return frame

    # Get current frame (for GIFs) or first frame (for static images)
    current_emote = emote_frames[frame_idx % len(emote_frames)]

    # Convert PIL to OpenCV format
    emote_cv = cv2.cvtColor(np.array(current_emote), cv2.COLOR_RGBA2BGRA)

    # Get dimensions
    h, w = frame.shape[:2]
    emote_h, emote_w = emote_cv.shape[:2]

    # Calculate position
    x, y = position

    # Ensure emote fits within frame
    if x + emote_w > w:
        x = w - emote_w
    if y + emote_h > h:
        y = h - emote_h
    if x < 0:
        x = 0
    if y < 0:
        y = 0

    # Create overlay
    overlay = frame.copy()

    # Extract alpha channel
    alpha = emote_cv[:, :, 3] / 255.0
    alpha = np.stack([alpha] * 3, axis=2)

    # Blend emote onto frame
    for c in range(3):
        overlay[y:y+emote_h, x:x+emote_w, c] = (
            alpha[:, :, c] * emote_cv[:, :, c] +
            (1 - alpha[:, :, c]) * frame[y:y+emote_h, x:x+emote_w, c]
        )

    return overlay

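# Note: the alpha blend above relies on the emote frames carrying an alpha
# channel (load_emote converts everything to RGBA) and on the emote being no
# larger than the camera frame; an oversized emote would make the slice shapes
# mismatch and raise an error.
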
def get_random_border_position(frame_shape, emote_size, exclude_top_left=True):
    """Get a random position closer to the border of the frame, avoiding top-left area"""
    frame_h, frame_w = frame_shape[:2]
    emote_w, emote_h = emote_size

    # Define border regions (closer to edges with smaller spacing)
    positions = []
    border_offset = 5  # Closer to border (was implicit 0)

    if not exclude_top_left:
        # Top edge - very close to top
        for x in range(border_offset, frame_w - emote_w - border_offset, 15):
            positions.append((x, border_offset))
    else:
        # Top edge (excluding left 200px for text) - very close to top
        for x in range(200, frame_w - emote_w - border_offset, 15):
            positions.append((x, border_offset))

    # Right edge - very close to right side
    for y in range(border_offset, frame_h - emote_h - border_offset, 15):
        positions.append((frame_w - emote_w - border_offset, y))

    # Bottom edge - very close to bottom
    for x in range(border_offset, frame_w - emote_w - border_offset, 15):
        positions.append((x, frame_h - emote_h - border_offset))

    # Left edge (excluding top 150px for text) - very close to left side
    for y in range(150, frame_h - emote_h - border_offset, 15):
        positions.append((border_offset, y))

    return random.choice(positions) if positions else (frame_w - emote_w, frame_h - emote_h)

def play_pig_sound(sound_path, volume=0.05):
    """Play pig twerk sound using a different pygame channel at reduced volume"""
    try:
        # Use a separate sound channel for pig sounds to avoid interfering with emotion sounds
        pig_sound = pygame.mixer.Sound(sound_path)
        pig_sound.set_volume(volume)  # Set pig sound to 5% volume (quieter than emotion sounds)
        pig_channel = pygame.mixer.Channel(1)  # Use channel 1 for pig sounds
        pig_channel.play(pig_sound)
        return pig_sound.get_length()
    except Exception as e:
        print(f"Pig sound error: {e}")
        return 2.0  # Default duration

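# Design note: emotion sounds go through the pygame.mixer.music stream (see
# play_sound above) while the pig sound uses mixer Channel(1), so the two can
# overlap without cutting each other off.
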
def live_emotion_detection():
    """Live camera feed with emotion detection"""
    # Configuration
    model_path = "best_model.pth"  # Updated to use ResNet152 model
    class_names = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    print("🎯 Live Emotion Detection with Emotes & Sounds")
    print("=" * 50)

    # Load model
    try:
        model = load_model(model_path)
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    # Load emote assets
    try:
        emote_assets = load_emote_assets()
        print("✅ Emote assets loaded successfully")
    except Exception as e:
        print(f"❌ Error loading emote assets: {e}")
        return

    # Load face cascade
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Initialize camera
    cap = cv2.VideoCapture(0)
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    cap.set(cv2.CAP_PROP_FPS, 30)  # Set to 30 FPS
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # Reduce buffer size for lower latency

    if not cap.isOpened():
        print("❌ Could not open camera")
        return

    print("📹 Starting live camera feed with continuous emotion detection...")
    print("Press 'q' to quit, 's' to save current frame")
    print("=" * 50)

    frame_count = 0
    last_prediction_time = 0
    prediction_interval = 0.5  # Predict every 0.5 seconds
    current_emotion = None
    current_confidence = 0.0

    # Emote and sound tracking
    emotion_start_time = None
    current_emote_frames = None
    emote_frame_idx = 0
    last_sound_time = 0
    sound_interval = 2.0  # Play sound every 2 seconds

    # Sound-based emote duration tracking
    emote_start_time = None
    emote_duration = 0  # Duration based on sound length
    current_sound_duration = 0
    current_emote_emotion = None  # Store which emotion is currently being displayed

    # Pig twerking animation tracking
    pig_frames = None
    pig_frame_idx = 0
    pig_start_time = None
    pig_duration = 0
    pig_position = None
    last_pig_time = 0
    pig_interval = 10.0  # Show pig every 10 seconds
    while True:
        ret, frame = cap.read()
        if not ret:
            print("❌ Failed to capture frame")
            break

        # Flip frame horizontally for mirror effect
        frame = cv2.flip(frame, 1)

        # Detect faces in current frame
        face_crop, face_coords = detect_and_crop_face(frame, face_cascade)

        # Draw face rectangle if detected
        if face_coords is not None:
            x, y, w, h = face_coords
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            cv2.putText(frame, "Face Detected", (x, y-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Continuous emotion detection
        current_time = time.time()
        if (face_crop is not None and
                current_time - last_prediction_time > prediction_interval):
            try:
                # Brighten face for better analysis
                brightened_face = cv2.convertScaleAbs(face_crop, alpha=1.5, beta=30)

                # Preprocess face (same as get_pics.py)
                image_tensor = preprocess_face(brightened_face)

                # Make prediction
                predicted_class, confidence, all_probabilities = predict_emotion(model, image_tensor, class_names)

                # Update current emotion
                current_emotion = predicted_class
                current_confidence = confidence
                last_prediction_time = current_time

                # Track emotion duration
                if emotion_start_time is None:
                    emotion_start_time = current_time

                # Print result to console (every 10 frames to avoid spam)
                if frame_count % 10 == 0:
                    print(f"Frame {frame_count}: {class_names[predicted_class].upper()} (Confidence: {confidence:.1%})")

            except Exception as e:
                print(f"Prediction error: {e}")

        # Handle emote display and sound effects (for angry, happy, sad, fear, surprise with 50%+ confidence)
        if (current_emotion is not None and
                current_confidence >= 0.5 and
                class_names[current_emotion] in ['angry', 'happy', 'sad', 'fear', 'surprise']):

            emotion_name = class_names[current_emotion]
            emotion_duration = current_time - emotion_start_time if emotion_start_time else 0

            # Check if emotion has lasted for 1 second or more
            if emotion_duration >= 1.0:
                # Load and display emote
                if emotion_name in emote_assets:
                    if emotion_name == 'sad':
                        # Randomly choose between sad1 and sad2
                        emote_idx = random.randint(0, 1)
                        emote_path = emote_assets[emotion_name]['images'][emote_idx]
                        sound_path = emote_assets[emotion_name]['sounds'][emote_idx]
                    else:
                        emote_path = emote_assets[emotion_name]['image']
                        sound_path = emote_assets[emotion_name]['sound']

                    # Load emote if not already loaded (resize to 1.5x face size)
                    if current_emote_frames is None and face_coords:
                        x, y, w, h = face_coords
                        face_size = (w, h)
                        current_emote_frames = load_emote(emote_path, face_size)

                    # Play sound and start emote display when emotion first detected
                    if emote_start_time is None:
                        current_sound_duration = play_sound(sound_path)
                        emote_start_time = current_time
                        emote_duration = current_sound_duration
                        # Store the current emotion type for later display
                        current_emote_emotion = emotion_name
                        print(f"🎵 Playing {emotion_name} sound for {emote_duration:.1f}s")
                    else:
                        # Don't reset emote here - let it play for full duration
                        pass
        # Display emote if it's currently playing (regardless of current emotion detection)
        if current_emote_frames is not None and face_coords:
            x, y, w, h = face_coords
            # Position emotes to the side of face based on the original emotion type
            emote_w, emote_h = current_emote_frames[0].size

            if current_emote_emotion is not None:
                if current_emote_emotion == 'angry':
                    # Angry to the left of face
                    emote_x = max(0, x - emote_w - 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                elif current_emote_emotion == 'sad':
                    # Sad to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                elif current_emote_emotion == 'fear':
                    # Fear above the face
                    emote_x = x + (w - emote_w) // 2  # Center horizontally with face
                    emote_y = max(0, y - emote_h - 15)  # 15px gap above face
                elif current_emote_emotion == 'surprise':
                    # Surprise to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
                else:  # happy (default)
                    # Happy to the right of face
                    emote_x = min(frame.shape[1] - emote_w, x + w + 10)  # 10px gap from face
                    emote_y = y + (h - emote_h) // 2  # Center vertically with face
            else:
                # Default position if emotion type not stored
                emote_x = min(frame.shape[1] - emote_w, x + w + 10)
                emote_y = y + (h - emote_h) // 2

            frame = display_emote_on_frame(frame, current_emote_frames, (emote_x, emote_y), emote_frame_idx)
            emote_frame_idx += 1

        # Check if emote display duration (based on sound) has ended - INDEPENDENT of emotion detection
        if emote_start_time is not None and current_time - emote_start_time >= emote_duration:
            # Reset emote after sound duration has passed
            current_emote_frames = None
            emote_frame_idx = 0
            emote_start_time = None
            current_emote_emotion = None  # Reset stored emotion type
            print(f"🔇 Emote display ended after {emote_duration:.1f}s")

        # Reset emotion tracking if no supported emotion detected or confidence too low
        # BUT keep emote playing if it's already started
        if (current_emotion is None or
                current_confidence < 0.5 or
                class_names[current_emotion] not in ['angry', 'happy', 'sad', 'fear', 'surprise']):
            emotion_start_time = None
            # Don't reset emote here - let it finish playing for full duration
        # Handle pig twerking animation (every 10 seconds, per pig_interval)
        if current_time - last_pig_time >= pig_interval:
            # Load pig animation if not already loaded (resize relative to a 50x50 reference size)
            if pig_frames is None:
                pig_frames = load_emote(emote_assets['pig_twerking']['image'], face_size=(50, 50))

            if pig_frames:
                # Play pig sound and start animation
                pig_sound_path = emote_assets['pig_twerking']['sound']
                pig_duration = play_pig_sound(pig_sound_path)
                pig_start_time = current_time
                pig_frame_idx = 0

                # Get random border position (excluding top-left for text)
                pig_size = pig_frames[0].size
                pig_position = get_random_border_position(frame.shape, pig_size, exclude_top_left=True)

                last_pig_time = current_time
                print(f"🐷 Pig twerking at position {pig_position} for {pig_duration:.1f}s!")

        # Display pig animation if it's currently playing
        if (pig_frames is not None and pig_start_time is not None and
                current_time - pig_start_time < pig_duration and pig_position is not None):
            frame = display_emote_on_frame(frame, pig_frames, pig_position, pig_frame_idx)
            pig_frame_idx += 1
        elif pig_start_time is not None and current_time - pig_start_time >= pig_duration:
            # Reset pig animation after duration ends
            pig_start_time = None
            pig_position = None
            pig_frame_idx = 0
        # Display current emotion on frame
        if current_emotion is not None and current_confidence > 0.5:
            emotion_text = f"{class_names[current_emotion].upper()}"
            confidence_text = f"Confidence: {current_confidence:.1%}"

            # Position text
            cv2.putText(frame, emotion_text, (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 0), 3)
            cv2.putText(frame, confidence_text, (50, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show live camera feed
        cv2.imshow('Live Emotion Detection', frame)

        # Handle key presses
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            print("👋 Quitting...")
            break
        elif key == ord('s'):
            # Save current frame
            filename = f"captured_frame_{frame_count}.jpg"
            cv2.imwrite(filename, frame)
            print(f"💾 Frame saved as {filename}")

        frame_count += 1

    # Cleanup
    cap.release()
    cv2.destroyAllWindows()
    print("✅ Camera inference stopped")

if __name__ == "__main__":
    live_emotion_detection()