# Source: Datavyu-Example-Scripts/Insert/insert_whisper_transcript.rb at commit 54ad9a65ef9034ff4052c15d9b40595a3192596f (databrary/Datavyu-Example-Scripts on GitHub)
#
# Purpose: This script processes and imports transcript files into Datavyu.
#
# Supported Formats:
#   - WebVTT (.vtt)
#   - SubRip (.srt) [planned]
#   - Plain Text (.txt) [planned]
#
# Features:
#   - File format validation
#   - Timestamp conversion
#   - Quality assurance workflow
#   - Support for multiple subtitle formats [planned]
#   - Optimized performance for large files
#   - Dynamic chunk sizing based on the number of transcript entries
#
# Usage:
#   1. Run script
#   2. Select subtitle/transcript file when prompted
#   3. Script will create the columns marked `required: true` in COLUMN_CONFIGS:
#      - transcript_original: Contains original transcription content
#      - transcript_QA: For marking various types of errors
#      - transcript_clean: For adding speaker labels to transcription
#      Optional columns (created only if their `required` flag is set to true):
#      - transcript_initials: For coder identification
#      - transcript_notes: For additional observations
# Authors: Aaron G. Beckner & Trinity Wang
# 02-27-2025: Added flexible timestamp detection
# 01-14-2025: Added comments and made more generic

# Revised by Van T. Pham for SPACE 2024 play coding
# Last edited: 01-21-2025
# Optimized version: 02-27-2025

require 'Datavyu_API.rb'

# Configuration constants
#
# Maps each accepted file extension (lowercase, without the leading dot) to the
# description shown for it in the file chooser. The hash is frozen so the
# script cannot modify it by accident while running.
SUPPORTED_FORMATS = {
  'vtt' => 'WebVTT Subtitles'
  # Add additional formats here as needed:
  # 'srt' => 'SubRip Subtitles',
  # 'txt' => 'Plain Text Transcripts'
}.freeze

# Column configurations
#
# One entry per Datavyu column the script knows about:
#   name     - column name passed to new_column
#   codes    - code names passed to new_column for that column
#   required - when true the column is created and filled by create_columns;
#              entries with false are skipped there
# The outer hash and each per-column hash are frozen so the configuration
# cannot be modified by accident while the script runs.
COLUMN_CONFIGS = {
  transcript_original: {
    name: 'transcript_original',
    codes: ['content'],
    required: true
  },
  qa: {
    name: 'transcript_QA',
    codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes
    required: true
  },
  transcript_clean: {
    name: 'transcript_clean',
    codes: ['speaker', 'content'],  # Codes for speaker labeling and transcription content
    required: true
  },
  initials: {
    name: 'transcript_initials',
    codes: ['coder_initials'], # Optional coder initials column
    required: false
  },
  notes: {
    name: 'transcript_notes',
    codes: ['notes'], # Optional free-form notes column
    required: false
  }
}.each_value(&:freeze).freeze

# Base ratio for determining chunk size (adjust as needed).
# calculate_chunk_size divides the number of entries by this value and rounds
# up, e.g. 250 entries give a chunk size of 25.
BASE_RATIO = 10.0
# Lower and upper bounds that calculate_chunk_size applies to the computed
# chunk size, so it always ends up between these two values.
MIN_CHUNK_SIZE = 10
MAX_CHUNK_SIZE = 100

begin
  # Import Java classes for GUI file selection
  java_import javax.swing.JFileChooser
  java_import javax.swing.filechooser.FileNameExtensionFilter
  java_import javax.swing.JFrame
  java_import javax.swing.JOptionPane

  # Builds the Swing widgets used to pick a transcript file.
  #
  # Creates a small 200x200 parent JFrame (dispose-on-close, positioned with
  # setLocationRelativeTo(nil)) and a single-selection JFileChooser. The
  # "accept all files" filter is switched off and one extension filter is
  # added per SUPPORTED_FORMATS entry. Nothing is shown here; the caller
  # decides when to display the frame and the dialog.
  #
  # @return [Array] two elements: the JFrame, then the JFileChooser
  def setup_file_chooser
    parent_window = JFrame.new("Import Transcript")
    parent_window.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE)
    parent_window.setSize(200, 200)
    parent_window.setLocationRelativeTo(nil)

    chooser = JFileChooser.new
    chooser.setAcceptAllFileFilterUsed(false)
    chooser.setMultiSelectionEnabled(false)
    chooser.setDialogTitle('Select transcript file to import')

    # One filter per supported extension, labelled with its description.
    SUPPORTED_FORMATS.each_pair do |ext, label|
      chooser.addChoosableFileFilter(
        FileNameExtensionFilter.new(label, [ext].to_java(:String))
      )
    end

    return parent_window, chooser
  end

  # Validates that the selected file has a supported format.
  #
  # The extension is compared against the keys of SUPPORTED_FORMATS without
  # regard to letter case, so "clip.VTT" is accepted just like "clip.vtt".
  # NOTE(review): Swing's FileNameExtensionFilter is documented as
  # case-insensitive, so the chooser presumably offers such files - confirm.
  #
  # @param file_path [String] path of the file to check
  # @return [true] when the extension is supported
  # @raise [RuntimeError] "Unsupported file format: .<ext>" otherwise; the
  #   extension is reported exactly as it appears in file_path (empty when the
  #   path has no extension)
  def validate_file_format(file_path)
    extension = File.extname(file_path)[1..-1]
    unless SUPPORTED_FORMATS.key?(extension.to_s.downcase)
      raise "Unsupported file format: .#{extension}"
    end
    true
  end

  # Converts a WebVTT-style timestamp string into a millisecond count.
  #
  # Accepted shapes (the entire string must match, so callers should strip
  # surrounding whitespace first):
  #   HH:MM:SS.mmm - hours may have two or more digits
  #   MM:SS.mmm    - hours omitted
  #
  # @param time_str [String] timestamp text, e.g. "00:01:02.345" or "01:02.345"
  # @return [Integer] the timestamp expressed in milliseconds
  # @raise [RuntimeError] "Invalid timestamp format: ..." when time_str (or
  #   nil) does not have one of the shapes above
  def parse_timestamp(time_str)
    # \A and \z anchor to the whole string; ^ and $ would accept any single
    # matching line inside multi-line text. The hours group is optional.
    parts = /\A(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})\z/.match(time_str)
    unless parts
      raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm"
    end

    hours = parts[1].to_i # nil.to_i is 0 when the hours group is absent
    minutes = parts[2].to_i
    seconds = parts[3].to_i
    milliseconds = parts[4].to_i

    (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds
  end

  # Extracts transcript text lines and their cue timings from the lines of a
  # WebVTT file.
  #
  # Rules applied to each stripped, non-empty line:
  #   - a line containing "-->" is a cue timing line; its start and end times
  #     are parsed with parse_timestamp and become the current timing
  #   - a line made only of digits is treated as a cue number and skipped
  #   - a line starting with an ASCII letter is kept as transcript text, paired
  #     with the current timing (lines seen before any timing are dropped)
  #   - anything else is ignored
  # NOTE(review): the letter check also drops text that starts with a digit,
  # bracket, quote or non-ASCII letter - confirm that this is intended.
  #
  # The input array is not modified.
  #
  # @param content [Array<String>] lines of the file (e.g. from File.readlines)
  # @return [Array] two parallel arrays: kept text lines, then a timing hash
  #   ({ onset: ms, offset: ms }) per kept line; lines of the same cue share
  #   one hash object
  # @raise [RuntimeError] from parse_timestamp when a timing line is malformed
  def process_content(content)
    # Skip a leading WEBVTT header without shifting it off the caller's array.
    has_header = content.first && content.first.strip == 'WEBVTT'
    lines = has_header ? content.drop(1) : content

    timestamps = []
    words = []
    current_timestamp = nil

    lines.each do |raw_line|
      line = raw_line.strip
      next if line.empty?

      if line.include?('-->')
        # With a limit of 2 the split always yields two strings here, so a
        # missing side reaches parse_timestamp as "" and gets its format error.
        onset_str, remainder = line.split('-->', 2).map(&:strip)
        # Text after the end time (WebVTT cue settings such as "align:start")
        # is not part of the timestamp, so only the first token is parsed.
        offset_str = remainder.split(/\s+/).first.to_s
        current_timestamp = {
          onset: parse_timestamp(onset_str),
          offset: parse_timestamp(offset_str)
        }
      elsif line =~ /^\d+$/
        # Skip cue numbers (commonly found in VTT files)
        next
      elsif current_timestamp && line =~ /^[a-zA-Z]/
        # Only add lines that start with letters and have a valid timestamp
        timestamps << current_timestamp
        words << line
      end
    end

    [words, timestamps]
  end

  # Derives the chunk size used when filling columns.
  #
  # The cell count is divided by BASE_RATIO and rounded up, then raised to
  # MIN_CHUNK_SIZE if smaller and capped at MAX_CHUNK_SIZE if larger. The
  # chosen value is also printed to standard output.
  #
  # @param total_cells [Integer] number of cells that will be created
  # @return [Integer] the chunk size after bounds were applied
  def calculate_chunk_size(total_cells)
    proposed = (total_cells / BASE_RATIO).ceil

    # Lower bound first, then the upper bound.
    bounded = proposed < MIN_CHUNK_SIZE ? MIN_CHUNK_SIZE : proposed
    bounded = MAX_CHUNK_SIZE if bounded > MAX_CHUNK_SIZE

    puts "Calculated optimal chunk size: #{bounded} for #{total_cells} cells"
    bounded
  end

  # Creates the required Datavyu columns and fills each with one cell per
  # transcript entry.
  #
  # Every COLUMN_CONFIGS entry flagged `required` gets a column via new_column.
  # Entries are then processed in chunks (size from calculate_chunk_size); for
  # each chunk, every required column receives one new cell per entry with the
  # entry's onset and offset, plus type-specific codes:
  #   :transcript_original - 'content' set to the text
  #   :transcript_clean    - 'content' set to the text, 'speaker' left empty
  #   :qa                  - every configured code set to ''
  #   :initials / :notes   - their single code set to ''
  # Finally each column is handed to set_column. Progress goes to stdout.
  #
  # @param words [Array<String>] transcript text, one element per cell
  # @param timestamps [Array<Hash>] { onset:, offset: } hashes parallel to words
  # @return [Hash] the created columns keyed by their COLUMN_CONFIGS key
  def create_columns(words, timestamps)
    # Only the column definitions flagged as required take part.
    active_configs = COLUMN_CONFIGS.select { |_kind, settings| settings[:required] }

    columns = {}
    active_configs.each do |kind, settings|
      columns[kind] = new_column(settings[:name], *settings[:codes])
    end

    entry_count = words.size
    batch_size = calculate_chunk_size(entry_count)
    batch_count = (entry_count.to_f / batch_size).ceil

    puts "Processing #{entry_count} entries in #{batch_count} chunks with chunk size of #{batch_size}..."

    batch_count.times do |batch_no|
      first = batch_no * batch_size
      last = [first + batch_size, entry_count].min # exclusive end of this chunk

      batch_words = words[first...last]
      batch_times = timestamps[first...last]

      puts "Processing chunk #{batch_no + 1}/#{batch_count} (entries #{first + 1}-#{last})..."

      active_configs.each do |kind, settings|
        column = columns[kind]

        batch_words.each_index do |pos|
          text = batch_words[pos]
          cell = column.make_new_cell
          span = batch_times[pos]

          cell.change_code('onset', span[:onset])
          cell.change_code('offset', span[:offset])

          # Code/value pairs to write for this column type, in order.
          assignments =
            case kind
            when :transcript_original then [['content', text]]
            when :transcript_clean then [['content', text], ['speaker', '']]
            when :qa then settings[:codes].map { |code| [code, ''] }
            when :initials then [['coder_initials', '']]
            when :notes then [['notes', '']]
            else []
            end

          assignments.each { |code, value| cell.change_code(code, value) }
        end
      end
    end

    puts "Setting columns in Datavyu..."
    columns.each_value { |column| set_column(column) }
  end

  # Pops up an informational Swing message box titled "Progress".
  #
  # NOTE(review): JOptionPane.showMessageDialog is documented as a modal
  # dialog, so this presumably blocks until the user dismisses it - confirm
  # that this is wanted for "please wait" messages.
  #
  # @param message [String] text shown in the dialog
  def show_progress_dialog(message)
    dialog_title = "Progress"
    message_kind = JOptionPane::INFORMATION_MESSAGE
    JOptionPane.showMessageDialog(nil, message, dialog_title, message_kind)
  end

  # Main execution flow
  #
  # Steps: pick a file, validate its extension, read it, extract text and
  # timings, then create the Datavyu columns. Early exits (dialog cancelled, no
  # usable entries) are written as if/else branches rather than `return`:
  # Ruby versions before 2.4 reject `return` outside a method with a
  # LocalJumpError, which the surrounding rescue would report as an import
  # error. NOTE(review): which Ruby level Datavyu's JRuby provides is not
  # visible here; the branch form behaves the same on every version.
  puts "Starting transcript import..."

  # Setup and show file chooser
  frame, jfc = setup_file_chooser
  frame.setVisible(true)

  begin
    result = jfc.showOpenDialog(frame)
  ensure
    # Release the helper frame even if showing the dialog raised.
    frame.dispose
  end

  if result != JFileChooser::APPROVE_OPTION
    puts "No file selected. Aborting."
  else
    file_path = jfc.getSelectedFile.getPath
    validate_file_format(file_path)

    puts "Reading file: #{file_path}"
    show_progress_dialog("Reading file. Please wait...")

    # Read file content
    content = File.readlines(file_path)

    puts "Processing content..."
    show_progress_dialog("Processing transcript. This may take a moment for large files...")

    # Process file content
    words, timestamps = process_content(content)

    if words.empty? || timestamps.empty?
      puts "No valid transcript entries found. Check file format."
      show_progress_dialog("No valid transcript entries found. Check file format.")
    else
      puts "Creating Datavyu columns with #{words.size} entries..."
      show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...")

      # Create columns with dynamically sized batched processing
      create_columns(words, timestamps)

      puts "Import completed successfully!"
      show_progress_dialog("Import completed successfully!")
    end
  end

rescue => e
  # A bare `rescue => e` catches StandardError and its subclasses raised
  # anywhere in the begin block above. The message is printed and also shown
  # in an error dialog; the exception is not re-raised.
  puts "Error: #{e.message}"
  # The backtrace is printed only when the DEBUG environment variable is set.
  puts e.backtrace if ENV['DEBUG']
  JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE)
end