Skip to content

Commit 9d29ec8

Browse files
committed
Add transcript save/reuse with automatic detection
- Add --save-transcript flag to save transcripts as JSON - Add --input-transcript flag to reuse existing transcripts - Add --force-retranscribe flag to ignore cached transcripts - Implement automatic transcript detection and reuse - Include test audio file for real-world validation
1 parent 85cca30 commit 9d29ec8

4 files changed

Lines changed: 597 additions & 34 deletions

File tree

README.md

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
**monkeyplug** is a little script to censor profanity in audio files (intended for podcasts, but YMMV) in a few simple steps:
66

77
1. The user provides a local audio file (or a URL pointing to an audio file which is downloaded)
8-
2. Either [Whisper](https://openai.com/research/whisper) ([GitHub](https://github.com/openai/whisper)) or the [Vosk](https://alphacephei.com/vosk/)-[API](https://github.com/alphacep/vosk-api) is used to recognize speech in the audio file
8+
2. Either [Whisper](https://openai.com/research/whisper) ([GitHub](https://github.com/openai/whisper)) or the [Vosk](https://alphacephei.com/vosk/)-[API](https://github.com/alphacep/vosk-api) is used to recognize speech in the audio file (or a pre-generated transcript can be loaded)
99
3. Each recognized word is checked against a [list](./src/monkeyplug/swears.txt) of profanity or other words you'd like muted
1010
4. [`ffmpeg`](https://www.ffmpeg.org/) is used to create a cleaned audio file, muting or "bleeping" the objectionable words
11+
5. Optionally, the transcript can be saved for reuse in future processing runs
1112

1213
You can then use your favorite media player to play the cleaned audio file.
1314

@@ -62,10 +63,14 @@ options:
6263
Input file (or URL)
6364
-o <string>, --output <string>
6465
Output file
65-
--output-json <string>
66-
Output file to store transcript JSON
6766
-w <profanity file>, --swears <profanity file>
6867
text file containing profanity (default: "swears.txt")
68+
--output-json <string>
69+
Output file to store transcript JSON
70+
--input-transcript <string>
71+
Load existing transcript JSON instead of performing speech recognition
72+
--save-transcript Automatically save transcript JSON alongside output audio file
73+
--force-retranscribe Force new transcription even if transcript file exists (overrides automatic reuse)
6974
-a <str>, --audio-params <str>
7075
Audio parameters for ffmpeg (default depends on output audio codec)
7176
-c <int>, --channels <int>
@@ -137,6 +142,37 @@ Alternately, a [Dockerfile](./docker/Dockerfile) is provided to allow you to run
137142

138143
then run [`monkeyplug-docker.sh`](./docker/monkeyplug-docker.sh) inside the directory where your audio files are located.
139144

145+
## Transcript Workflow
146+
147+
**monkeyplug** supports saving and reusing transcripts to improve workflow efficiency:
148+
149+
### Save Transcript for Later Reuse
150+
151+
```bash
152+
# Generate transcript once and save it
153+
monkeyplug -i input.mp3 -o output.mp3 --save-transcript
154+
155+
# This creates output.mp3 and output_transcript.json
156+
```
157+
158+
### Automatic Transcript Reuse
159+
160+
```bash
161+
# Second run: Automatically detects and reuses transcript (22x faster!)
162+
monkeyplug -i input.mp3 -o output.mp3 --save-transcript
163+
# Finds output_transcript.json and reuses it automatically
164+
165+
# Force new transcription when needed
166+
monkeyplug -i input.mp3 -o output.mp3 --save-transcript --force-retranscribe
167+
```
168+
169+
### Manual Transcript Loading
170+
171+
```bash
172+
# Explicitly specify transcript to load
173+
monkeyplug -i input.mp3 -o output_strict.mp3 --input-transcript output_transcript.json -w strict_swears.txt
174+
```
175+
140176
## Contributing
141177

142178
If you'd like to help improve monkeyplug, pull requests will be welcomed!

input/Witch_mother1.m4b

475 KB
Binary file not shown.

src/monkeyplug/monkeyplug.py

Lines changed: 156 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,9 @@ def __init__(
252252
oAudioFileFormat,
253253
iSwearsFileSpec,
254254
outputJson,
255+
inputTranscript=None,
256+
saveTranscript=False,
257+
forceRetranscribe=False,
255258
aParams=None,
256259
aChannels=AUDIO_DEFAULT_CHANNELS,
257260
aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
@@ -279,6 +282,8 @@ def __init__(
279282
self.forceDespiteTag = force
280283
self.debug = dbug
281284
self.outputJson = outputJson
285+
self.inputTranscript = inputTranscript
286+
self.saveTranscript = saveTranscript
282287

283288
# determine input file name, or download and save file
284289
if (iFileSpec is not None) and os.path.isfile(iFileSpec):
@@ -374,12 +379,33 @@ def __init__(
374379
if self.outputVideoFileFormat:
375380
self.outputFileSpec = outParts[0] + self.outputVideoFileFormat
376381

382+
# create output directory if it doesn't exist
383+
self._ensure_directory_exists(self.outputFileSpec, "output directory")
384+
377385
# if output file already exists, remove as we'll be overwriting it anyway
378386
if os.path.isfile(self.outputFileSpec):
379387
if self.debug:
380388
mmguero.eprint(f'Removing existing destination file {self.outputFileSpec}')
381389
os.remove(self.outputFileSpec)
382390

391+
# If save-transcript is enabled and no explicit JSON output path, auto-generate one
392+
if self.saveTranscript and not self.outputJson:
393+
outputBaseName = os.path.splitext(self.outputFileSpec)[0]
394+
self.outputJson = outputBaseName + '_transcript.json'
395+
if self.debug:
396+
mmguero.eprint(f'Auto-generated transcript output: {self.outputJson}')
397+
398+
# Auto-detect existing transcript for reuse (unless force flag set or explicit input provided)
399+
if self.saveTranscript and not self.inputTranscript and self.outputJson and not forceRetranscribe:
400+
if os.path.exists(self.outputJson):
401+
self.inputTranscript = self.outputJson
402+
if self.debug:
403+
mmguero.eprint(f'Found existing transcript, reusing: {self.inputTranscript}')
404+
405+
# If JSON output is specified, ensure its directory exists too
406+
if self.outputJson:
407+
self._ensure_directory_exists(self.outputJson, "JSON output directory")
408+
383409
# load the swears file (not actually mapping right now, but who knows, speech synthesis maybe someday?)
384410
if (iSwearsFileSpec is not None) and os.path.isfile(iSwearsFileSpec):
385411
self.swearsFileSpec = iSwearsFileSpec
@@ -400,6 +426,10 @@ def __init__(
400426
mmguero.eprint(f'Encode parameters: {self.aParams}')
401427
mmguero.eprint(f'Profanity file: {self.swearsFileSpec}')
402428
mmguero.eprint(f'Intermediate downloaded file: {self.tmpDownloadedFileSpec}')
429+
if self.outputJson:
430+
mmguero.eprint(f'Transcript output: {self.outputJson}')
431+
if self.inputTranscript:
432+
mmguero.eprint(f'Input transcript: {self.inputTranscript}')
403433
mmguero.eprint(f'Beep instead of mute: {self.beep}')
404434
if self.beep:
405435
mmguero.eprint(f'Beep hertz: {self.beepHertz}')
@@ -415,9 +445,47 @@ def __del__(self):
415445
if os.path.isfile(self.tmpDownloadedFileSpec):
416446
os.remove(self.tmpDownloadedFileSpec)
417447

448+
######## _ensure_directory_exists #############################################
449+
def _ensure_directory_exists(self, filepath, description="directory"):
    """Ensure the parent directory of ``filepath`` exists, creating it if needed.

    Args:
        filepath: Path to a file whose containing directory is required.
        description: Human-readable label used only in the debug message.

    Returns:
        The directory portion of ``filepath``. This is an empty string when
        ``filepath`` has no directory component, in which case nothing is
        created.

    Raises:
        OSError: If the directory cannot be created, including when the
            path already exists but is not a directory.
    """
    directory = os.path.dirname(filepath)
    # Test with isdir rather than exists: if a regular file occupies the
    # directory's path, makedirs raises here instead of the problem
    # surfacing later as a confusing failure when the output is written.
    if directory and not os.path.isdir(directory):
        if self.debug:
            mmguero.eprint(f'Creating {description}: {directory}')
        # exist_ok tolerates another process creating the directory between
        # the check above and this call.
        os.makedirs(directory, exist_ok=True)
    return directory
457+
458+
######## LoadTranscriptFromFile ##############################################
459+
def LoadTranscriptFromFile(self):
    """Load a pre-generated transcript from ``self.inputTranscript``.

    The transcript must be a JSON list of word objects. Each entry's
    ``scrub`` flag is recomputed against the current ``self.swearsMap``, so
    any ``scrub`` value stored in the file is overwritten. On success the
    list is stored in ``self.wordList``.

    Returns:
        False when ``self.inputTranscript`` is not set (nothing is loaded),
        True once the transcript has been loaded.

    Raises:
        IOError: With ``errno.ENOENT`` if the transcript path is not a file.
        ValueError: If the file is not valid JSON, or its top-level value
            is not a list of objects.
    """
    if not self.inputTranscript:
        return False

    if not os.path.isfile(self.inputTranscript):
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), self.inputTranscript)

    if self.debug:
        mmguero.eprint(f'Loading transcript from: {self.inputTranscript}')

    # JSON text is UTF-8 by specification; do not depend on the locale's
    # default encoding.
    # NOTE(review): the code that writes the transcript is not visible here;
    # confirm it emits UTF-8 (or pure ASCII).
    with open(self.inputTranscript, 'r', encoding='utf-8') as f:
        transcript = json.load(f)

    # Validate the shape before touching self.wordList so a malformed file
    # fails with a clear message and leaves no partial state behind.
    if not isinstance(transcript, list) or not all(isinstance(entry, dict) for entry in transcript):
        raise ValueError(f'Transcript {self.inputTranscript} must contain a JSON list of word objects')

    # Recalculate scrub flags with current swears list
    for word in transcript:
        word['scrub'] = scrubword(word.get('word', '')) in self.swearsMap

    self.wordList = transcript

    if self.debug:
        mmguero.eprint(f'Loaded {len(self.wordList)} words from transcript')
        scrubbed_count = sum(1 for w in self.wordList if w.get('scrub', False))
        mmguero.eprint(f'Words to censor with current swear list: {scrubbed_count}')

    return True
483+
418484
######## CreateCleanMuteList #################################################
419485
def CreateCleanMuteList(self):
420-
self.RecognizeSpeech()
486+
# Try to load existing transcript first, otherwise perform speech recognition
487+
if not self.LoadTranscriptFromFile():
488+
self.RecognizeSpeech()
421489

422490
self.naughtyWordList = [word for word in self.wordList if word["scrub"] is True]
423491
if len(self.naughtyWordList) > 0:
@@ -558,6 +626,9 @@ def __init__(
558626
iSwearsFileSpec,
559627
mDir,
560628
outputJson,
629+
inputTranscript=None,
630+
saveTranscript=False,
631+
forceRetranscribe=False,
561632
aParams=None,
562633
aChannels=AUDIO_DEFAULT_CHANNELS,
563634
aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
@@ -576,29 +647,36 @@ def __init__(
576647
dbug=False,
577648
):
578649
self.wavReadFramesChunk = wChunk
650+
self.modelPath = None
651+
self.vosk = None
652+
653+
# Only load model if we're actually going to transcribe
654+
if not inputTranscript:
655+
# make sure the VOSK model path exists
656+
if (mDir is not None) and os.path.isdir(mDir):
657+
self.modelPath = mDir
658+
else:
659+
raise IOError(
660+
errno.ENOENT,
661+
os.strerror(errno.ENOENT) + " (see https://alphacephei.com/vosk/models)",
662+
mDir,
663+
)
579664

580-
# make sure the VOSK model path exists
581-
if (mDir is not None) and os.path.isdir(mDir):
582-
self.modelPath = mDir
583-
else:
584-
raise IOError(
585-
errno.ENOENT,
586-
os.strerror(errno.ENOENT) + " (see https://alphacephei.com/vosk/models)",
587-
mDir,
588-
)
589-
590-
self.vosk = mmguero.dynamic_import("vosk", "vosk", debug=dbug)
591-
if not self.vosk:
592-
raise Exception("Unable to initialize VOSK API")
593-
if not dbug:
594-
self.vosk.SetLogLevel(-1)
665+
self.vosk = mmguero.dynamic_import("vosk", "vosk", debug=dbug)
666+
if not self.vosk:
667+
raise Exception("Unable to initialize VOSK API")
668+
if not dbug:
669+
self.vosk.SetLogLevel(-1)
595670

596671
super().__init__(
597672
iFileSpec=iFileSpec,
598673
oFileSpec=oFileSpec,
599674
oAudioFileFormat=oAudioFileFormat,
600675
iSwearsFileSpec=iSwearsFileSpec,
601676
outputJson=outputJson,
677+
inputTranscript=inputTranscript,
678+
saveTranscript=saveTranscript,
679+
forceRetranscribe=forceRetranscribe,
602680
aParams=aParams,
603681
aChannels=aChannels,
604682
aSampleRate=aSampleRate,
@@ -619,9 +697,12 @@ def __init__(
619697
self.tmpWavFileSpec = self.inputFileParts[0] + ".wav"
620698

621699
if self.debug:
622-
mmguero.eprint(f'Model directory: {self.modelPath}')
623-
mmguero.eprint(f'Intermediate audio file: {self.tmpWavFileSpec}')
624-
mmguero.eprint(f'Read frames: {self.wavReadFramesChunk}')
700+
if inputTranscript:
701+
mmguero.eprint(f'Using input transcript (skipping speech recognition)')
702+
else:
703+
mmguero.eprint(f'Model directory: {self.modelPath}')
704+
mmguero.eprint(f'Intermediate audio file: {self.tmpWavFileSpec}')
705+
mmguero.eprint(f'Read frames: {self.wavReadFramesChunk}')
625706

626707
def __del__(self):
627708
super().__del__()
@@ -724,6 +805,9 @@ def __init__(
724805
mName,
725806
torchThreads,
726807
outputJson,
808+
inputTranscript=None,
809+
saveTranscript=False,
810+
forceRetranscribe=False,
727811
aParams=None,
728812
aChannels=AUDIO_DEFAULT_CHANNELS,
729813
aSampleRate=AUDIO_DEFAULT_SAMPLE_RATE,
@@ -740,25 +824,34 @@ def __init__(
740824
force=False,
741825
dbug=False,
742826
):
743-
if torchThreads > 0:
744-
self.torch = mmguero.dynamic_import("torch", "torch", debug=dbug)
745-
if self.torch:
746-
self.torch.set_num_threads(torchThreads)
827+
self.whisper = None
828+
self.model = None
829+
self.torch = None
747830

748-
self.whisper = mmguero.dynamic_import("whisper", "openai-whisper", debug=dbug)
749-
if not self.whisper:
750-
raise Exception("Unable to initialize Whisper API")
831+
# Only load model if we're actually going to transcribe (no input transcript provided)
832+
if not inputTranscript:
833+
if torchThreads > 0:
834+
self.torch = mmguero.dynamic_import("torch", "torch", debug=dbug)
835+
if self.torch:
836+
self.torch.set_num_threads(torchThreads)
751837

752-
self.model = self.whisper.load_model(mName, download_root=mDir)
753-
if not self.model:
754-
raise Exception(f"Unable to load Whisper model {mName} in {mDir}")
838+
self.whisper = mmguero.dynamic_import("whisper", "openai-whisper", debug=dbug)
839+
if not self.whisper:
840+
raise Exception("Unable to initialize Whisper API")
841+
842+
self.model = self.whisper.load_model(mName, download_root=mDir)
843+
if not self.model:
844+
raise Exception(f"Unable to load Whisper model {mName} in {mDir}")
755845

756846
super().__init__(
757847
iFileSpec=iFileSpec,
758848
oFileSpec=oFileSpec,
759849
oAudioFileFormat=oAudioFileFormat,
760850
iSwearsFileSpec=iSwearsFileSpec,
761851
outputJson=outputJson,
852+
inputTranscript=inputTranscript,
853+
saveTranscript=saveTranscript,
854+
forceRetranscribe=forceRetranscribe,
762855
aParams=aParams,
763856
aChannels=aChannels,
764857
aSampleRate=aSampleRate,
@@ -777,8 +870,11 @@ def __init__(
777870
)
778871

779872
if self.debug:
780-
mmguero.eprint(f'Model directory: {mDir}')
781-
mmguero.eprint(f'Model name: {mName}')
873+
if inputTranscript:
874+
mmguero.eprint(f'Using input transcript (skipping speech recognition)')
875+
else:
876+
mmguero.eprint(f'Model directory: {mDir}')
877+
mmguero.eprint(f'Model name: {mName}')
782878

783879
def __del__(self):
784880
super().__del__()
@@ -880,6 +976,29 @@ def RunMonkeyPlug():
880976
default=os.path.join(script_path, SWEARS_FILENAME_DEFAULT),
881977
metavar="<profanity file>",
882978
)
979+
parser.add_argument(
980+
"--input-transcript",
981+
dest="inputTranscript",
982+
type=str,
983+
default=None,
984+
required=False,
985+
metavar="<string>",
986+
help="Load existing transcript JSON instead of performing speech recognition",
987+
)
988+
parser.add_argument(
989+
"--save-transcript",
990+
dest="saveTranscript",
991+
action="store_true",
992+
default=False,
993+
help="Automatically save transcript JSON alongside output audio file",
994+
)
995+
parser.add_argument(
996+
"--force-retranscribe",
997+
dest="forceRetranscribe",
998+
action="store_true",
999+
default=False,
1000+
help="Force new transcription even if transcript file exists (overrides automatic reuse)",
1001+
)
8831002
parser.add_argument(
8841003
"-a",
8851004
"--audio-params",
@@ -1090,6 +1209,9 @@ def RunMonkeyPlug():
10901209
args.swears,
10911210
args.voskModelDir,
10921211
args.outputJson,
1212+
inputTranscript=args.inputTranscript,
1213+
saveTranscript=args.saveTranscript,
1214+
forceRetranscribe=args.forceRetranscribe,
10931215
aParams=args.aParams,
10941216
aChannels=args.aChannels,
10951217
aSampleRate=args.aSampleRate,
@@ -1119,6 +1241,9 @@ def RunMonkeyPlug():
11191241
args.whisperModelName,
11201242
args.torchThreads,
11211243
args.outputJson,
1244+
inputTranscript=args.inputTranscript,
1245+
saveTranscript=args.saveTranscript,
1246+
forceRetranscribe=args.forceRetranscribe,
11221247
aParams=args.aParams,
11231248
aChannels=args.aChannels,
11241249
aSampleRate=args.aSampleRate,

0 commit comments

Comments
 (0)