forked from OpenSHAPA/OpenSHAPA-Example-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathinsert_whisper_transcript.rb
More file actions
304 lines (255 loc) · 9.31 KB
/
insert_whisper_transcript.rb
File metadata and controls
304 lines (255 loc) · 9.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# Purpose: This script processes and imports transcript files into Datavyu.
#
# Supported Formats:
# - WebVTT (.vtt)
# - SubRip (.srt) [planned]
# - Plain Text (.txt) [planned]
#
# Features:
# - File format validation
# - Timestamp conversion
# - Quality assurance workflow
# - Support for multiple subtitle formats [planned]
# - Optimized performance for large files
# - Dynamic chunk sizing based on file size
#
# Usage:
# 1. Run script
# 2. Select subtitle/transcript file when prompted
# 3. Script will create necessary columns in Datavyu:
# - transcript_original: Contains original transcription content
# - transcript_QA: For marking various types of errors
# - transcript_clean: For adding speaker labels to transcription
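# - transcript_initials: For coder identification (optional; only created when its `required` flag in COLUMN_CONFIGS is set to true)
# - transcript_notes: For additional observations (optional; only created when its `required` flag in COLUMN_CONFIGS is set to true)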
# Authors: Aaron G. Beckner & Trinity Wang
# 02-27-2025: Added flexible timestamp detection
# 01-14-2025: Added comments and made more generic
# Revised by Van T. Pham for SPACE 2024 play coding
# Last edited: 01-21-2025
# Optimized version: 02-27-2025
require 'Datavyu_API.rb'
# Configuration constants
#
# Maps each accepted file extension (without the leading dot) to the
# description shown for it in the file chooser's filter list.
# Frozen so the table cannot be altered by accident while the script runs.
SUPPORTED_FORMATS = {
  'vtt' => 'WebVTT Subtitles'
  # Add additional formats here as needed:
  # 'srt' => 'SubRip Subtitles',
  # 'txt' => 'Plain Text Transcripts'
}.freeze
# Column configurations
#
# One entry per Datavyu column the script knows about:
#   name:     column name passed to new_column
#   codes:    code (argument) names passed to new_column
#   required: columns are only created and populated when this is true
# The table itself is frozen so entries cannot be added or removed at runtime.
COLUMN_CONFIGS = {
  transcript_original: {
    name: 'transcript_original',
    codes: ['content'],
    required: true
  },
  qa: {
    name: 'transcript_QA',
    # quality assurance error codes
    codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'],
    required: true
  },
  transcript_clean: {
    name: 'transcript_clean',
    codes: ['speaker', 'content'], # Codes for speaker labeling and transcription content
    required: true
  },
  initials: {
    name: 'transcript_initials',
    codes: ['coder_initials'], # Optional Coder initials column
    required: false
  },
  notes: {
    name: 'transcript_notes',
    codes: ['notes'],
    required: false
  }
}.freeze
# Divisor used by calculate_chunk_size (adjust as needed): the chunk size is
# the number of entries divided by this ratio, rounded up, before the limits
# below are applied (e.g. 250 entries -> chunk size 25).
BASE_RATIO = 10.0
# Lower and upper limits that calculate_chunk_size applies to the computed
# chunk size
MIN_CHUNK_SIZE = 10
MAX_CHUNK_SIZE = 100
begin
# Import Java classes for GUI file selection
java_import javax.swing.JFileChooser
java_import javax.swing.filechooser.FileNameExtensionFilter
java_import javax.swing.JFrame
java_import javax.swing.JOptionPane
# Builds the Swing widgets used to pick the transcript file.
#
# Returns a two-element array: a small parent JFrame and a JFileChooser
# (single selection, no "All files" filter) carrying one extension filter
# per entry in SUPPORTED_FORMATS.
def setup_file_chooser
  parent = JFrame.new("Import Transcript")
  parent.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE)
  parent.setSize(200, 200)
  parent.setLocationRelativeTo(nil)

  chooser = JFileChooser.new
  chooser.setAcceptAllFileFilterUsed(false)
  chooser.setMultiSelectionEnabled(false)
  chooser.setDialogTitle('Select transcript file to import')

  # One filter per supported extension, labelled with its description
  SUPPORTED_FORMATS.each_pair do |ext, label|
    java_extensions = [ext].to_java(:String)
    chooser.addChoosableFileFilter(FileNameExtensionFilter.new(label, java_extensions))
  end

  [parent, chooser]
end
# Validates that the selected file has a supported format.
#
# The extension is compared case-insensitively against the keys of
# SUPPORTED_FORMATS, matching the file chooser's FileNameExtensionFilter,
# which also ignores case (so "clip.VTT" can be selected and must be accepted).
#
# file_path - path of the chosen file (String)
#
# Returns true when the extension is supported.
# Raises RuntimeError ("Unsupported file format: .<ext>") otherwise, including
# when the file has no extension at all.
def validate_file_format(file_path)
  # File.extname returns "" for extension-less names; strip only the leading dot
  extension = File.extname(file_path).sub(/\A\./, '')
  raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension.downcase)
  true
end
# Converts a WebVTT-style timestamp into milliseconds.
#
# Accepted forms (surrounding whitespace is ignored):
#   HH:MM:SS.mmm  - hours may have two or more digits
#   MM:SS.mmm     - hours omitted
#
# time_str - the timestamp text
#
# Returns the time as an Integer number of milliseconds.
# Raises RuntimeError when the text is not a single well-formed timestamp
# (nil, empty and multi-line input are all rejected with the same error).
def parse_timestamp(time_str)
  text = time_str.to_s.strip
  # \A / \z anchor the whole string; ^ / $ would accept a valid line buried
  # inside multi-line input. The hours group is optional.
  match = /\A(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})\z/.match(text)
  unless match
    raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm"
  end

  # A missing hours capture is nil, and nil.to_i is 0
  hours, minutes, seconds, milliseconds = match.captures.map(&:to_i)
  (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds
end
# Extracts transcript lines and their cue times from the lines of a VTT file.
#
# content - Array of Strings, one per line of the file. It is not modified.
#
# Returns [words, timestamps]: two parallel arrays where words[i] is a text
# line and timestamps[i] is { onset: ms, offset: ms } from the most recent cue
# timing line above it. A cue with several text lines yields several entries
# sharing the same timestamp hash.
#
# Skipped lines: a leading "WEBVTT" header, blank lines, lines made only of
# digits (cue numbers), text appearing before the first timing line, and text
# lines that do not start with an ASCII letter.
#
# Raises whatever parse_timestamp raises for a malformed timing line.
def process_content(content)
  # Skip the WEBVTT header if present, without mutating the caller's array
  header = content.first
  lines = if header && header.strip == 'WEBVTT'
            content.drop(1)
          else
            content
          end

  timestamps = []
  words = []
  current_timestamp = nil

  lines.each do |raw_line|
    line = raw_line.strip
    next if line.empty?

    if line.include?("-->")
      onset_str, offset_str = line.split('-->', 2).map(&:strip)
      # Cue settings (e.g. "align:start position:0%") may follow the end time
      # on the same line; keep only the timestamp token itself.
      offset_str = offset_str.to_s.split(/\s+/).first.to_s
      current_timestamp = {
        onset: parse_timestamp(onset_str.to_s),
        offset: parse_timestamp(offset_str)
      }
    elsif line.match(/^\d+$/)
      # Skip cue numbers (commonly found in VTT files)
      next
    elsif current_timestamp && line.match(/^[a-zA-Z]/)
      # Only add lines that start with letters and have a valid timestamp
      timestamps << current_timestamp
      words << line
    end
  end

  [words, timestamps]
end
# Derives the batch size used when filling columns.
#
# total_cells - number of entries to be inserted
#
# The size is total_cells / BASE_RATIO rounded up, then limited to the
# MIN_CHUNK_SIZE..MAX_CHUNK_SIZE range. The result is also printed.
#
# Returns the chunk size as an Integer.
def calculate_chunk_size(total_cells)
  proportional = (total_cells / BASE_RATIO).ceil
  # Raise to the minimum first, then cap at the maximum
  bounded = [[proportional, MIN_CHUNK_SIZE].max, MAX_CHUNK_SIZE].min
  puts "Calculated optimal chunk size: #{bounded} for #{total_cells} cells"
  bounded
end
# Creates the required Datavyu columns and fills them from the transcript.
#
# words      - Array of transcript text lines
# timestamps - Array of { onset:, offset: } hashes, parallel to words
#
# A column is created (via new_column) for every COLUMN_CONFIGS entry whose
# :required flag is true. Entries are then handled in chunks sized by
# calculate_chunk_size; within each chunk, every required column gets one new
# cell per entry with onset/offset set and its codes initialised. Finally each
# column is handed to set_column.
# NOTE(review): new_column / set_column are not defined in this file -
# presumably provided by Datavyu_API.rb.
#
# Returns the hash of created columns keyed by config type.
def create_columns(words, timestamps)
  # Instantiate only the columns flagged as required
  columns = {}
  COLUMN_CONFIGS.each do |type, config|
    columns[type] = new_column(config[:name], *config[:codes]) if config[:required]
  end

  # Work out how many batches are needed for the dynamic chunk size
  total_entries = words.size
  chunk_size = calculate_chunk_size(total_entries)
  total_chunks = (total_entries.to_f / chunk_size).ceil
  puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..."

  total_chunks.times do |chunk_idx|
    first = chunk_idx * chunk_size
    stop = [first + chunk_size, total_entries].min
    batch_words = words[first...stop]
    batch_stamps = timestamps[first...stop]
    puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{first + 1}-#{stop})..."

    # Fill this batch into every required column, one column at a time
    COLUMN_CONFIGS.each do |type, config|
      next unless config[:required]

      target = columns[type]
      batch_words.each_with_index do |text, pos|
        cell = target.make_new_cell
        stamp = batch_stamps[pos]

        # Timing first, then the column-specific codes
        cell.change_code('onset', stamp[:onset])
        cell.change_code('offset', stamp[:offset])

        if type == :transcript_original
          cell.change_code('content', text)
        elsif type == :transcript_clean
          cell.change_code('content', text)
          cell.change_code('speaker', '') # speaker starts out empty
        elsif type == :qa
          # every QA code starts out empty
          config[:codes].each { |qa_code| cell.change_code(qa_code, '') }
        elsif type == :initials
          cell.change_code('coder_initials', '')
        elsif type == :notes
          cell.change_code('notes', '')
        end
      end
    end
  end

  puts "Setting columns in Datavyu..."
  columns.each_value { |created| set_column(created) }
end
# Shows `message` in an information dialog titled "Progress".
# NOTE(review): JOptionPane.showMessageDialog is a modal dialog, so this
# presumably blocks until the user dismisses it - confirm that is intended
# for "please wait" style messages.
def show_progress_dialog(message)
  dialog_title = "Progress"
  dialog_kind = JOptionPane::INFORMATION_MESSAGE
  JOptionPane.showMessageDialog(nil, message, dialog_title, dialog_kind)
end
# Main execution flow: pick a file, validate it, parse it, fill the columns
puts "Starting transcript import..."

# Let the user choose the transcript; the frame only serves as dialog parent
chooser_frame, chooser = setup_file_chooser
chooser_frame.setVisible(true)
choice = chooser.showOpenDialog(chooser_frame)
chooser_frame.dispose

unless choice == JFileChooser::APPROVE_OPTION
  puts "No file selected. Aborting."
  return
end

selected_path = chooser.getSelectedFile.getPath
validate_file_format(selected_path)

# Load the whole file as an array of lines
puts "Reading file: #{selected_path}"
show_progress_dialog("Reading file. Please wait...")
raw_lines = File.readlines(selected_path)

# Split the lines into transcript text and matching cue times
puts "Processing content..."
show_progress_dialog("Processing transcript. This may take a moment for large files...")
words, timestamps = process_content(raw_lines)

if words.empty? || timestamps.empty?
  puts "No valid transcript entries found. Check file format."
  show_progress_dialog("No valid transcript entries found. Check file format.")
  return
end

# Build and populate the Datavyu columns in dynamically sized batches
entry_count = words.size
puts "Creating Datavyu columns with #{entry_count} entries..."
show_progress_dialog("Creating Datavyu columns with #{entry_count} entries. Please wait...")
create_columns(words, timestamps)

puts "Import completed successfully!"
show_progress_dialog("Import completed successfully!")
# Top-level error handler for the script's enclosing begin block: any
# StandardError raised above (unsupported extension, malformed timestamp, ...)
# is reported on the console and in an error dialog.
rescue => e
puts "Error: #{e.message}"
# The backtrace is printed only when the DEBUG environment variable is set
puts e.backtrace if ENV['DEBUG']
JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE)
end