faster-whisper-api/server.js at main · moothz/faster-whisper-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
// Importação dos módulos necessários
require('dotenv').config();
const express = require('express');
const logger = require('./logger');
const { exec } = require('child_process');
const { v4: uuidv4 } = require('uuid');
const fs = require('fs');
const path = require('path');
const fluent_ffmpeg = require('fluent-ffmpeg');
const axios = require('axios');
const swaggerUi = require('swagger-ui-express');
const swaggerJsdoc = require('swagger-jsdoc');

// Swagger / OpenAPI configuration.
const swaggerOptions = {
    definition: {
        openapi: '3.0.0',
        info: {
            title: 'Faster Whisper API',
            version: '1.0.0',
            description: 'API de transcrição de áudio assíncrona utilizando Faster Whisper XXL',
        },
        servers: [
            {
                url: `http://localhost:${process.env.PORT || 3378}`,
                description: 'Servidor Local',
            },
        ],
    },
    // Files scanned for @swagger annotations. A relative entry such as
    // './server.js' is resolved against the process working directory, so the
    // spec would come out empty whenever the server is launched from another
    // folder. Point at this very file instead; backslashes are normalised
    // because the entry is treated as a glob pattern.
    apis: [__filename.replace(/\\/g, '/')],
};

// OpenAPI document generated from the annotations found in the files above.
const swaggerDocs = swaggerJsdoc(swaggerOptions);

// Express application setup.
const app = express();

// Body limit raised to 50mb so base64-encoded audio fits in a JSON payload.
const jsonBodyParser = express.json({ limit: '50mb' });
app.use(jsonBodyParser);

// Swagger UI documentation route.
app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerDocs));

// Logs the method and URL of every request that reaches this middleware
// (it is registered after the body parser and the Swagger UI route).
function logRequest(req, res, next) {
    logger.info(`${req.method} ${req.originalUrl}`);
    next();
}

app.use(logRequest);

// --- SETTINGS ---
// Every value comes from the environment; an unset or empty variable falls
// back to the default shown next to it.
const env = process.env;

const PORT = env.PORT || 3378;
// Divisor applied to the audio duration when estimating transcription time.
const CONVERSION_TIME_RATE = Number(env.CONVERSION_TIME_RATE) || 5;

// Faster Whisper settings (WHISPER_EXECUTABLE has no default).
const WHISPER_EXECUTABLE = env.WHISPER_EXECUTABLE;
const WHISPER_MODEL = env.WHISPER_MODEL || 'large-v3-turbo';
const WHISPER_COMPUTE_TYPE = env.WHISPER_COMPUTE_TYPE || 'float16';
const WHISPER_RUN_ON = env.WHISPER_RUN_ON || '';
const WHISPER_LANGUAGE = env.WHISPER_LANGUAGE || 'pt';

// In-memory store of transcription task states, keyed by execution id.
const tasks = {};

// Directory for temporary files, created at startup when it does not exist.
const TEMP_DIR = path.join(__dirname, 'temp');
const tempDirMissing = !fs.existsSync(TEMP_DIR);
if (tempDirMissing) {
    fs.mkdirSync(TEMP_DIR);
}

/**
 * Converts an audio source to a 16 kHz, mono, PCM s16le WAV file via FFmpeg.
 * @param {string} inputPath - Path (or URL) of the input audio
 * @param {string} outputPath - Path where the converted WAV file is saved
 * @returns {Promise<void>} Resolves when FFmpeg emits 'end'; rejects with an
 *   Error wrapping the FFmpeg message when it emits 'error'.
 */
function processAudio(inputPath, outputPath) {
    const runConversion = (onDone, onFail) => {
        const handleError = (ffmpegError) => {
            onFail(new Error(`Erro no FFmpeg: ${ffmpegError.message}`));
        };
        const handleEnd = () => {
            onDone();
        };

        fluent_ffmpeg(inputPath)
            .toFormat('wav')
            .audioFrequency(16000)
            .audioChannels(1)
            .audioCodec('pcm_s16le')
            .on('error', handleError)
            .on('end', handleEnd)
            .save(outputPath);
    };

    return new Promise(runConversion);
}

/**
 * Runs the Faster Whisper executable on a WAV file and returns the transcript.
 *
 * The CLI is told to write a `.txt` file into TEMP_DIR. That file is read,
 * passed through cleanupString and then deleted, so transcripts do not pile up
 * in the temp directory.
 * @param {string} audioPath - Path of the WAV file to transcribe
 * @returns {Promise<string>} The transcribed text. Rejects with an Error when
 *   the executable is not configured, the command fails, or its output file
 *   cannot be read.
 */
function runWhisper(audioPath) {
    return new Promise((resolve, reject) => {
        // Fail with a clear message instead of trying to run "undefined".
        if (!WHISPER_EXECUTABLE) {
            return reject(new Error('Erro ao executar o Faster Whisper: WHISPER_EXECUTABLE não está configurado.'));
        }

        // Assemble the command line piece by piece.
        const modelArg = `-m ${WHISPER_MODEL}`;
        const computeArg = `--compute_type ${WHISPER_COMPUTE_TYPE}`;
        const languageArg = `--language ${WHISPER_LANGUAGE}`;
        const outputArg = `--output_dir "${TEMP_DIR}" --output_format txt`;

        const command = `"${WHISPER_EXECUTABLE}" "${audioPath}" ${WHISPER_RUN_ON} ${modelArg} ${languageArg} ${computeArg} ${outputArg}`;
        // Expected transcript location: the audio path with its extension swapped for .txt.
        const whisperOutputPath = audioPath.replace(/\.[^/.]+$/, '') + '.txt';

        // Best-effort removal of the transcript file; a failure here is logged
        // and must never mask the transcription result.
        const removeOutputFile = () => {
            try {
                if (fs.existsSync(whisperOutputPath)) {
                    fs.unlinkSync(whisperOutputPath);
                }
            } catch (cleanupError) {
                logger.error(`Erro ao remover a saída do Whisper: ${cleanupError.message}`);
            }
        };

        logger.info(`Executando comando: ${command}`);

        // exec() kills the child once stdout/stderr exceed maxBuffer (1 MiB by
        // default); give long recordings plenty of head-room.
        const execOptions = { maxBuffer: 64 * 1024 * 1024 };

        exec(command, execOptions, (error, stdout, stderr) => {
            if (error) {
                removeOutputFile();
                const failureMessage = `Erro ao executar o Faster Whisper: ${stderr || error.message}`;
                logger.error(failureMessage);
                return reject(new Error(failureMessage));
            }

            try {
                const transcribedText = fs.readFileSync(whisperOutputPath, 'utf8').trim();
                resolve(cleanupString(transcribedText));
            } catch (e) {
                reject(new Error(`Erro ao processar a saída do Whisper: ${e.message}`));
            } finally {
                removeOutputFile();
            }
        });
    });
}

/**
 * Strips the leading Whisper timestamp from every line of a transcript.
 *
 * Each line loses its first leading `[...]` group and surrounding whitespace;
 * lines left with two characters or fewer are dropped.
 * @param {string} text - e.g. '[00:00.000 --> 00:01.460]  blah blah blah'
 * @returns {string} The remaining lines joined with '\n'
 */
function cleanupString(text) {
  const keptLines = [];

  for (const rawLine of text.split('\n')) {
    const withoutTimestamp = rawLine.replace(/^\s*\[.*?\]\s*/, '').trim();
    if (withoutTimestamp.length > 2) {
      keptLines.push(withoutTimestamp);
    }
  }

  return keptLines.join('\n');
}

// --- ENDPOINTS DA API ---

/**
 * @swagger
 * /transcribe:
 *   post:
 *     summary: Inicia o processo de transcrição de um arquivo de áudio.
 *     description: Recebe um áudio via base64 ou URL e retorna um ID de execução para consulta posterior.
 *     requestBody:
 *       required: true
 *       content:
 *         application/json:
 *           schema:
 *             type: object
 *             properties:
 *               audioData:
 *                 type: string
 *                 description: Áudio codificado em base64.
 *               audioUrl:
 *                 type: string
 *                 description: URL para um arquivo de áudio.
 *     responses:
 *       202:
 *         description: Transcrição iniciada com sucesso.
 *         content:
 *           application/json:
 *             schema:
 *               type: object
 *               properties:
 *                 executionId:
 *                   type: string
 *                 audioDuration:
 *                   type: number
 *                 estimatedTranscriptionTime:
 *                   type: number
 *       400:
 *         description: Requisição inválida (falta audioData ou audioUrl).
 *       500:
 *         description: Falha ao obter ou converter o áudio antes do início da transcrição.
 */
app.post('/transcribe', async (req, res) => {
    const executionId = uuidv4();
    // req.body can be undefined when no JSON body was parsed; fall back to an
    // empty object so such requests get the 400 below instead of throwing.
    const { audioData, audioUrl } = req.body || {};

    if (!audioData && !audioUrl) {
        return res.status(400).json({ error: 'É necessário fornecer "audioData" (base64) ou "audioUrl".' });
    }

    // Register the task with an initial status.
    tasks[executionId] = { status: 'running', startTime: Date.now() };

    const tempInputPath = path.join(TEMP_DIR, `${executionId}.tmp`);
    const tempWavPath = path.join(TEMP_DIR, `${executionId}.wav`);

    // The handler keeps running after the 202 below has been sent; Express
    // does not wait on the returned promise, so the transcription itself
    // effectively happens in the background.
    try {
        // Step 1: obtain the audio file (from base64 or from a URL).
        let originalFileSize = 0;
        if (audioData) {
            const buffer = Buffer.from(audioData, 'base64');
            originalFileSize = buffer.length;
            fs.writeFileSync(tempInputPath, buffer);
        } else {
            // NOTE(review): audioUrl is caller-supplied and fetched server-side
            // without any restriction (SSRF risk) — consider an allow-list.
            const response = await axios.get(audioUrl, { responseType: 'arraybuffer' });
            originalFileSize = response.data.length;
            fs.writeFileSync(tempInputPath, response.data);
        }

        // Step 2: convert the audio to the WAV format handed to Whisper.
        await processAudio(tempInputPath, tempWavPath);

        // Step 3: read the audio duration (ffprobe is callback based).
        const metadata = await new Promise((resolve, reject) => {
            fluent_ffmpeg.ffprobe(tempWavPath, (err, data) => {
                if (err) return reject(err);
                resolve(data);
            });
        });
        const audioDuration = Math.ceil(metadata.format.duration);

        // Estimated transcription time, with one extra second of buffer.
        const estimatedTranscriptionTime = Math.ceil((audioDuration / CONVERSION_TIME_RATE) + 1);

        // Answer the client now; the result is fetched later through /status.
        const acceptedPayload = { executionId, audioDuration, estimatedTranscriptionTime };
        logger.info(`[${executionId}] Transcrição inciada: ${JSON.stringify(acceptedPayload)}`);

        res.status(202).json(acceptedPayload);

        // Step 4: run the transcription.
        const transcribedText = await runWhisper(tempWavPath);

        // Step 5: store the final result.
        tasks[executionId] = {
            status: 'complete',
            fileSize: originalFileSize,
            duration: audioDuration,
            text: transcribedText,
        };

        logger.info(`[${executionId}] Transcrição concluída: ${JSON.stringify(tasks[executionId], null, '\t')}`);
    } catch (error) {
        const message = error.message || 'Ocorreu um erro desconhecido.';
        tasks[executionId] = {
            status: 'error',
            message,
        };
        logger.error(`[${executionId}] Erro no processamento:`, error);

        // A failure in steps 1-3 happens before the 202 is sent; without this
        // the request would never be answered and the client would hang.
        if (!res.headersSent) {
            res.status(500).json({ executionId, error: message });
        }
    } finally {
        // Step 6: remove temporary files. Best effort: a cleanup failure is
        // logged rather than thrown, so it cannot become an unhandled rejection.
        for (const tempPath of [tempInputPath, tempWavPath]) {
            try {
                if (fs.existsSync(tempPath)) {
                    fs.unlinkSync(tempPath);
                }
            } catch (cleanupError) {
                logger.error(`[${executionId}] Erro ao remover arquivo temporário:`, cleanupError);
            }
        }
    }
});

/**
 * @swagger
 * /status/{executionId}:
 *   get:
 *     summary: Consulta o status de um processo de transcrição.
 *     description: Retorna o status atual (running, complete, error) e o resultado final caso concluído.
 *     parameters:
 *       - in: path
 *         name: executionId
 *         required: true
 *         schema:
 *           type: string
 *         description: ID da execução retornado pelo endpoint /transcribe.
 *     responses:
 *       200:
 *         description: Status retornado com sucesso.
 *       404:
 *         description: ID de execução não encontrado.
 */
app.get('/status/:executionId', (req, res) => {
    const { executionId } = req.params;

    // `tasks` is a plain object, so an id such as "constructor", "toString" or
    // "__proto__" would resolve to an inherited member and be answered with
    // 200. Only the object's own keys are real tasks.
    const isKnownTask = Object.prototype.hasOwnProperty.call(tasks, executionId);
    if (!isKnownTask) {
        return res.status(404).json({ error: 'ID de execução não encontrado.' });
    }

    res.json(tasks[executionId]);
});

// --- SERVER STARTUP ---
// Logs the startup banner once the HTTP server is listening on PORT.
function announceServerReady() {
    const startupMessages = [
        `🎙️  Servidor da API de Transcrição rodando na porta ${PORT}`,
        `📝 Documentação Swagger disponível em http://localhost:${PORT}/api-docs`,
        'Aguardando requisições...',
    ];

    for (const message of startupMessages) {
        logger.info(message);
    }
}

app.listen(PORT, announceServerReady);