Skip to content

Commit e76ec8a

Browse files
Add audio matching and sync scripts for Second Room videos
- match-audio.py: Match a single video with audio files using cross-correlation
- sync-second-room-audio.py: Batch process all Second Room videos from the tags file

Uses FFT-based cross-correlation to find the best audio match and time offset, then replaces the low-quality camera audio with the high-quality external recording.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 9048065 commit e76ec8a

2 files changed

Lines changed: 489 additions & 0 deletions

File tree

scripts/match-audio.py

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Match and sync external audio files to videos.
4+
5+
This script:
6+
1. Extracts audio from video files
7+
2. Compares with external audio files using cross-correlation
8+
3. Finds the best match and time offset
9+
4. Replaces the video audio with the synced high-quality audio
10+
11+
Usage: ./match-audio.py <video_file> <audio_dir> [--output <output_file>]
12+
13+
Example:
14+
./match-audio.py "video.mp4" "Second Room Recordings/Audio/" --output "video_synced.mp4"
15+
"""
16+
17+
import argparse
18+
import subprocess
19+
import sys
20+
import tempfile
21+
from pathlib import Path
22+
23+
import numpy as np
24+
25+
# Seconds of audio taken from the start of the video for matching
# (find_best_match extracts twice this much from each candidate audio file).
ANALYSIS_DURATION = 60
# Sample rate in Hz that audio is resampled to for analysis
# (lower = faster but less accurate)
ANALYSIS_SAMPLE_RATE = 8000
29+
30+
31+
def extract_audio(input_file: str, output_file: str, duration: int = None, sample_rate: int = 44100) -> bool:
    """Decode the audio of a media file to a mono 16-bit PCM WAV via ffmpeg.

    Args:
        input_file: Path to a video or audio file that ffmpeg can read.
        output_file: Destination WAV path; overwritten if it exists (``-y``).
        duration: Maximum number of seconds to extract, counted from the
            start of the input. A falsy value (``None`` or 0) extracts the
            whole stream.
        sample_rate: Output sample rate in Hz.

    Returns:
        True if ffmpeg exited with status 0. False if ffmpeg reported an
        error or the ffmpeg executable could not be launched at all.
    """
    # -nostdin: keep ffmpeg from reading the terminal, so it cannot swallow
    # input meant for this script's own prompts.
    cmd = ["ffmpeg", "-y", "-nostdin", "-i", input_file]
    if duration:
        cmd.extend(["-t", str(duration)])
    cmd.extend([
        "-vn",                  # audio only
        "-ac", "1",             # mono
        "-ar", str(sample_rate),
        "-c:a", "pcm_s16le",    # explicit 16-bit PCM; the loader reads int16
        "-f", "wav",
        output_file,
    ])

    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as exc:
        # ffmpeg missing or not executable: report failure through the
        # boolean contract instead of an unhandled traceback.
        print(f"Could not run ffmpeg: {exc}", file=sys.stderr)
        return False
    return result.returncode == 0
48+
49+
50+
def load_audio_samples(wav_file: str) -> np.ndarray:
    """Load a 16-bit PCM WAV file as mono float32 samples scaled to [-1.0, 1.0).

    Multi-channel files are downmixed by averaging the channels of each
    frame, so the result is always one-dimensional.

    Args:
        wav_file: Path to the WAV file to read.

    Returns:
        1-D float32 array with one value per frame.

    Raises:
        ValueError: If the file does not store 16-bit samples.
    """
    import wave

    with wave.open(wav_file, 'rb') as wf:
        sample_width = wf.getsampwidth()
        n_channels = wf.getnchannels()
        audio_data = wf.readframes(wf.getnframes())

    if sample_width != 2:
        raise ValueError(
            f"Unsupported sample width ({sample_width * 8}-bit) in {wav_file}; "
            "expected 16-bit PCM"
        )

    # WAV PCM data is little-endian regardless of the host byte order.
    samples = np.frombuffer(audio_data, dtype="<i2").astype(np.float32)

    if n_channels > 1:
        # Frames are interleaved (ch0, ch1, ch0, ch1, ...). Drop a trailing
        # partial frame, then average the channels down to mono.
        usable = len(samples) - (len(samples) % n_channels)
        samples = samples[:usable].reshape(-1, n_channels).mean(axis=1)

    # Normalize int16 range to [-1.0, 1.0)
    return samples / 32768.0
60+
61+
62+
def cross_correlate(signal1: np.ndarray, signal2: np.ndarray) -> tuple[float, float]:
    """Find the best alignment between two signals using cross-correlation.

    Both signals are standardized (zero mean, unit variance) and correlated
    in 'full' mode; the lag with the largest absolute correlation wins, so a
    polarity-inverted copy still counts as a match.

    Args:
        signal1: Reference signal (1-D).
        signal2: Signal to align against the reference (1-D).

    Returns:
        ``(offset_samples, correlation_score)``. A positive offset means
        signal2 starts after signal1; a negative one means signal2 starts
        earlier. The score is the peak magnitude divided by ``len(signal1)``.
        If either signal is empty there is nothing to align and
        ``(0, 0.0)`` is returned.
    """
    # scipy picks FFT-based correlation automatically for large inputs
    from scipy import signal as sig

    if len(signal1) == 0 or len(signal2) == 0:
        return 0, 0.0

    # Standardize; the epsilon keeps an all-constant (silent) signal from
    # dividing by zero.
    s1 = (signal1 - np.mean(signal1)) / (np.std(signal1) + 1e-10)
    s2 = (signal2 - np.mean(signal2)) / (np.std(signal2) + 1e-10)

    correlation = sig.correlate(s1, s2, mode='full')

    # Find peak (by magnitude)
    peak_idx = int(np.argmax(np.abs(correlation)))
    peak_value = float(np.abs(correlation[peak_idx]))

    # In 'full' mode index k corresponds to lag k - (len(s2) - 1)
    offset = peak_idx - (len(s2) - 1)

    # Normalize correlation score by the reference length
    score = peak_value / len(s1)

    return offset, score
90+
91+
92+
# Extensions (lower-case, with dot) treated as candidate audio files.
_AUDIO_SUFFIXES = frozenset({'.wav', '.mp3', '.m4a', '.aac', '.flac', '.ogg'})


def _find_audio_files(audio_dir: str) -> list:
    """Recursively list audio files under audio_dir, ignoring extension case."""
    return sorted(
        path for path in Path(audio_dir).rglob("*")
        if path.is_file() and path.suffix.lower() in _AUDIO_SUFFIXES
    )


def find_best_match(video_file: str, audio_dir: str) -> tuple[str, float, float]:
    """Find the audio file under audio_dir that best matches the video's audio.

    The first ANALYSIS_DURATION seconds of the video's audio are
    cross-correlated against the first 2 * ANALYSIS_DURATION seconds of every
    candidate file, all resampled to ANALYSIS_SAMPLE_RATE. Candidates are
    visited in sorted path order; on equal scores the earlier one is kept.

    Args:
        video_file: Path to the video whose audio track is the reference.
        audio_dir: Directory searched recursively for candidate audio files.
            Extensions are matched case-insensitively, so ``.WAV`` is found.

    Returns:
        ``(audio_file, offset_seconds, score)`` for the best candidate. A
        positive offset means the external audio starts after the video. If
        no candidate is found, the video's audio cannot be extracted or is
        empty, or no candidate scores above zero, returns ``(None, 0, 0)``.
    """
    audio_files = _find_audio_files(audio_dir)

    if not audio_files:
        print(f"No audio files found in {audio_dir}")
        return None, 0, 0

    print(f"Found {len(audio_files)} audio file(s) to compare")
    print()

    with tempfile.TemporaryDirectory() as tmpdir:
        # Extract audio from video
        video_audio = f"{tmpdir}/video_audio.wav"
        print("Extracting audio from video...")
        if not extract_audio(video_file, video_audio, ANALYSIS_DURATION, ANALYSIS_SAMPLE_RATE):
            print("Failed to extract audio from video")
            return None, 0, 0

        video_samples = load_audio_samples(video_audio)
        print(f"Video audio: {len(video_samples)} samples ({len(video_samples)/ANALYSIS_SAMPLE_RATE:.1f}s)")
        print()

        if len(video_samples) == 0:
            # Nothing to correlate against; bail out instead of crashing.
            print("Video audio is empty")
            return None, 0, 0

        best_match = None
        best_score = 0
        best_offset = 0

        for i, audio_file in enumerate(audio_files, 1):
            print(f"[{i}/{len(audio_files)}] Comparing: {audio_file.name}...", end=" ", flush=True)

            # Extract/convert audio file. Twice the video window is taken so
            # the video clip can still be located when the external recording
            # started earlier.
            ext_audio = f"{tmpdir}/ext_audio_{i}.wav"
            if not extract_audio(str(audio_file), ext_audio, ANALYSIS_DURATION * 2, ANALYSIS_SAMPLE_RATE):
                print("SKIP (extraction failed)")
                continue

            ext_samples = load_audio_samples(ext_audio)
            if len(ext_samples) == 0:
                print("SKIP (no audio samples)")
                continue

            # Cross-correlate
            offset_samples, score = cross_correlate(video_samples, ext_samples)
            offset_seconds = offset_samples / ANALYSIS_SAMPLE_RATE

            print(f"score={score:.4f}, offset={offset_seconds:+.2f}s")

            if score > best_score:
                best_score = score
                best_match = str(audio_file)
                best_offset = offset_seconds

    return best_match, best_offset, best_score
150+
151+
152+
def replace_audio(video_file: str, audio_file: str, offset_seconds: float, output_file: str) -> bool:
    """Mux the video stream with the external audio, shifted by offset_seconds.

    The video stream is copied without re-encoding; the first audio stream of
    audio_file replaces the video's own audio. The output ends with the
    shorter of the two streams (``-shortest``).

    Args:
        video_file: Source video; its first video stream is kept.
        audio_file: External recording; its first audio stream is used.
        offset_seconds: Where the external audio starts relative to the
            video. Positive: the audio starts later, so it is delayed.
            Negative: the audio starts earlier, so its head is trimmed.
        output_file: Destination path; overwritten if it exists (``-y``).

    Returns:
        True if ffmpeg exited with status 0, False if it failed or could not
        be launched.
    """
    import shlex

    if offset_seconds >= 0:
        # Delay the external audio (one delay value per channel, in ms)
        delay_ms = int(round(float(offset_seconds) * 1000))
        audio_filter = f"adelay={delay_ms}|{delay_ms}"
    else:
        # Trim the start of the external audio. atrim keeps the original
        # timestamps, so they must be reset to zero or the trimmed audio
        # would still play at its old position and stay out of sync.
        audio_filter = f"atrim=start={-float(offset_seconds):.6f},asetpts=PTS-STARTPTS"

    cmd = [
        "ffmpeg", "-y", "-nostdin",
        "-i", video_file,
        "-i", audio_file,
        "-c:v", "copy",
        "-af", audio_filter,
        "-map", "0:v:0",
        "-map", "1:a:0",
        "-shortest",
        output_file,
    ]

    # shlex.join quotes paths with spaces so the line can be pasted in a shell
    print(f"Running: {shlex.join(cmd)}")
    try:
        # errors="replace": ffmpeg may echo file names that are not valid UTF-8
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except OSError as exc:
        # ffmpeg missing or not executable
        print(f"FFmpeg error: {exc}")
        return False

    if result.returncode != 0:
        print(f"FFmpeg error: {result.stderr[-500:]}")
        return False

    return True
185+
186+
187+
def main():
    """Command-line entry point: match a video to its best audio file and remux.

    Parses the arguments, validates the input paths, finds the best-matching
    audio file and offset, asks for confirmation when the score is below the
    threshold, and writes the video with the replaced audio. Exits with
    status 1 on any failure or when the user declines to continue.
    """
    parser = argparse.ArgumentParser(description="Match and sync external audio to video")
    parser.add_argument("video_file", help="Video file to process")
    parser.add_argument("audio_dir", help="Directory containing audio files to match")
    parser.add_argument("--output", "-o", help="Output video file (default: video_synced.mp4)")
    parser.add_argument("--threshold", "-t", type=float, default=0.1,
                        help="Minimum correlation score to accept match (default: 0.1)")

    args = parser.parse_args()

    video_path = Path(args.video_file)
    if not video_path.is_file():
        print(f"Video file not found: {args.video_file}")
        sys.exit(1)

    if not Path(args.audio_dir).is_dir():
        print(f"Audio directory not found: {args.audio_dir}")
        sys.exit(1)

    output_file = args.output or video_path.stem + "_synced.mp4"

    # ffmpeg runs with -y; writing onto the input would destroy the source
    # video while it is still being read.
    if Path(output_file).resolve() == video_path.resolve():
        print(f"Output file must differ from the input video: {output_file}")
        sys.exit(1)

    print("=" * 50)
    print(f"Video: {args.video_file}")
    print(f"Audio dir: {args.audio_dir}")
    print(f"Output: {output_file}")
    print("=" * 50)
    print()

    # Find best match
    best_match, offset, score = find_best_match(args.video_file, args.audio_dir)

    if not best_match:
        print("No matching audio found!")
        sys.exit(1)

    print()
    print("=" * 50)
    print(f"Best match: {best_match}")
    print(f"Score: {score:.4f}")
    print(f"Offset: {offset:+.3f}s")
    print("=" * 50)

    if score < args.threshold:
        print(f"\nWarning: Score {score:.4f} is below threshold {args.threshold}")
        print("Match may not be reliable!")
        try:
            response = input("Continue anyway? [y/N] ")
        except EOFError:
            # No interactive stdin (pipe, cron, ...): take the default "No".
            response = ""
        if response.strip().lower() != 'y':
            sys.exit(1)

    print()
    print("Replacing audio...")

    if replace_audio(args.video_file, best_match, offset, output_file):
        print(f"\nSuccess! Output: {output_file}")
    else:
        print("\nFailed to replace audio")
        sys.exit(1)
243+
244+
245+
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)