383 changes: 281 additions & 102 deletions examples/zcr_main/main.cc

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sense-voice/csrc/common.h
@@ -300,7 +300,7 @@ struct sense_voice_segment {
size_t t1; // right endpoint of the time interval
// std::string text; // text corresponding to the tokens
std::vector<int> tokens; // tokens produced by recognition
std::vector<double> samples; // raw audio samples
std::vector<float> samples; // raw audio samples
// std::vector<float>
// bool speaker_turn_next;
};
19 changes: 2 additions & 17 deletions sense-voice/csrc/sense-voice-encoder.cc
@@ -192,22 +192,12 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
a = ggml_repeat(ctx0, ggml_cast(ctx0, a, GGML_TYPE_F32), ggml_new_tensor_4d(ctx0, GGML_TYPE_F16, a->ne[0], a->ne[1], a->ne[2], n_batch));
struct ggml_tensor * result = ggml_mul_mat(ctx0, a, im2col);
fsmn_memory = ggml_reshape_3d(ctx0, result, im2col->ne[1], im2col->ne[2], im2col->ne[3]);
// if(n_batch > 1){
// printf("n_batch: %d\n", n_batch);
// printf("a: %ld %ld %ld %ld\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]);
// printf("b: %ld %ld %ld %ld\n", b->ne[0], b->ne[1], b->ne[2], b->ne[3]);
// printf("im2col: %ld %ld %ld %ld\n", im2col->ne[0], im2col->ne[1], im2col->ne[2], im2col->ne[3]);
// printf("result: %ld %ld %ld %ld\n", result->ne[0], result->ne[1], result->ne[2], result->ne[3]);
// printf("fsmn_memory: %ld %ld %ld %ld\n", fsmn_memory->ne[0], fsmn_memory->ne[1], fsmn_memory->ne[2], fsmn_memory->ne[3]);
// printf("V: %ld %ld %ld %ld\n", V->ne[0], V->ne[1], V->ne[2], V->ne[3]);
// }
}
fsmn_memory = ggml_cont(ctx0, ggml_transpose(ctx0, fsmn_memory));
fsmn_memory = ggml_add(ctx0, fsmn_memory, V);
ggml_set_name(fsmn_memory, "fsmn_memory");
}

struct ggml_tensor *KQV;
float KQscale = 1.0f / sqrtf(float(n_state) / n_head);

if(user_flash_attn){
@@ -232,7 +222,7 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
ggml_element_size(state->kv_pad.v)*n_state_head,
ggml_element_size(state->kv_pad.v)*n_state*n_ctx_pad,
0);
KQV = ggml_flash_attn_ext(ctx0, Q_h, K, V, nullptr, KQscale, 0.0f, 0.0f);
ggml_tensor *KQV = ggml_flash_attn_ext(ctx0, Q_h, K, V, nullptr, KQscale, 0.0f, 0.0f);
cur = ggml_reshape_3d(ctx0, KQV, n_state, n_ctx, n_batch);
} else{
// K * Q
@@ -241,19 +231,14 @@ static struct ggml_tensor *encoder_layer_sanm_forward(const sense_voice_hparams
struct ggml_tensor *KQ_soft_max = ggml_soft_max_ext(ctx0, KQ, nullptr, KQscale, 0.0f);


KQV = ggml_mul_mat(
ggml_tensor *KQV = ggml_mul_mat(
ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, V_h)), KQ_soft_max);
struct ggml_tensor *KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state, n_ctx, n_batch));
}



cur = ggml_cpy(ctx0, cur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state, n_ctx, n_batch));

cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.e_attn_ln_out_w, cur),
layer.e_attn_ln_out_b);
ggml_set_name(cur, "attention_out");
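For reference, the KQscale passed to ggml_flash_attn_ext and ggml_soft_max_ext in both branches above is the usual scaled dot-product attention factor for a per-head dimension of n_state / n_head, so the two paths compute the same attention and differ only in whether the fused flash-attention kernel is used:

\[
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\text{head}}}}\right) V,
\qquad \text{KQscale} = \frac{1}{\sqrt{d_{\text{head}}}}, \quad d_{\text{head}} = \frac{n_{\text{state}}}{n_{\text{head}}}.
\]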
187 changes: 187 additions & 0 deletions sense-voice/csrc/sense-voice-frontend.cc
@@ -287,3 +287,190 @@ bool load_wav_file(const char *filename, int32_t *sampling_rate,

}


// Float version of fbank_feature_worker_thread
static void fbank_feature_worker_thread_float(int ith,
const std::vector<double> &hamming,
const std::vector<float> &samples,
int n_samples, int frame_size,
int frame_step, int n_threads,
sense_voice_feature &mel) {
// make sure n_fft == 1 + (sense_voice_N_FFT / 2), bin_0 to bin_nyquist
int i = ith;

std::vector<double> window;
const int padded_window_size = round_to_nearest_power_two(frame_size);
window.resize(padded_window_size);

// only process frames that actually contain input samples (skip all-zero padding frames)
int n_fft = std::min(n_samples / frame_step + 1, mel.n_len);
for (; i < n_fft; i += n_threads) {
const int offset = i * frame_step;

// Convert float to double for processing
for (int j = 0; j < frame_size; j++) {
window[j] = static_cast<double>(samples[offset + j]);
}

{
// zero the padded tail of the window; uninitialized values can produce NaN on ARM CPUs.
for (int k = frame_size; k < window.size(); k++) {
window[k] = 0;
}
}
// remove dc offset
{
double sum = 0;
for (int32_t k = 0; k < frame_size; ++k) {
sum += window[k];
}
double mean = sum / frame_size;
for (int32_t k = 0; k < frame_size; ++k) {
window[k] -= mean;
}
}
// pre-emphasis
{
for (int32_t k = frame_size - 1; k > 0; --k) {
window[k] -= PREEMPH_COEFF * window[k - 1];
}
window[0] -= PREEMPH_COEFF * window[0];
}

// apply Hamming window
{
for (int j = 0; j < frame_size; j++) {
window[j] *= hamming[j];
}
}

// FFT
// window is input and output
rfft(window);

// Compute the squared modulus of the complex FFT output (the power spectrum).
// Note: writing this as pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2)
// appears to cause inference quality problems, interestingly.
for (int j = 0; j < padded_window_size / 2; j++) {
window[j] = (window[2 * j + 0] * window[2 * j + 0] +
window[2 * j + 1] * window[2 * j + 1]);
}

// log-Mel filter bank energies aka: "fbank"
{
auto num_fft_bins = padded_window_size / 2;
int n_mel = mel.n_mel;
for (int j = 0; j < n_mel; j++) {
double sum = 0.0;
for (int k = 0; k < num_fft_bins; k++) {
sum += window[k] * LogMelFilterMelArray[j * num_fft_bins + k];
}

sum = log(sum > 1.19e-7 ? sum : 1.19e-7);

mel.data[i * n_mel + j] = static_cast<float>(sum);
}
}
}
}

// Float version of fbank_lfr_cmvn_feature
bool fbank_lfr_cmvn_feature(const std::vector<float> &samples,
const int n_samples, const int frame_size,
const int frame_step, const int n_feats,
const int n_threads, const bool debug,
sense_voice_cmvn &cmvn, sense_voice_feature &feats) {
// const int64_t t_start_us = ggml_time_us();

const int32_t n_frames_per_ms = SENSE_VOICE_SAMPLE_RATE * 0.001f;
feats.n_mel = n_feats;
feats.n_len = 1 + ((n_samples - frame_size * n_frames_per_ms) /
(frame_step * n_frames_per_ms));
feats.data.resize(feats.n_mel * feats.n_len);

std::vector<double> hamming;
hamming_window(frame_size * n_frames_per_ms, true, hamming);

{
if (n_threads > 1) {
ThreadPool pool(n_threads);
for (int iw = 0; iw < n_threads - 1; ++iw) {
pool.enqueue(fbank_feature_worker_thread_float, iw + 1, std::cref(hamming),
samples, n_samples, frame_size * n_frames_per_ms,
frame_step * n_frames_per_ms, n_threads, std::ref(feats));
}
}

// main thread
fbank_feature_worker_thread_float(0, hamming, samples, n_samples,
frame_size * n_frames_per_ms,
frame_step * n_frames_per_ms, n_threads, feats);
}

if (debug) {
auto &mel = feats.data;
std::ofstream outFile("fbank_lfr_cmvn_feature_float.json");
outFile << "[";
for (uint64_t i = 0; i < mel.size() - 1; i++) {
outFile << mel[i] << ", ";
}
outFile << mel[mel.size() - 1] << "]";
outFile.close();
}

std::vector<std::vector<float>> out_feats;

// apply LFR: merge lfr_m frames into one output frame, advancing lfr_n frames per window
// ref:
// https://github.com/alibaba-damo-academy/FunASR/blob/main/runtime/onnxruntime/src/paraformer.cpp#L409-L440
int T = feats.n_len;
int lfr_m = feats.lfr_m; // 7
int lfr_n = feats.lfr_n; // 6
int T_lrf = ceil(1.0 * T / feats.lfr_n);
int left_pad = (feats.lfr_m - 1) / 2;
int left_pad_offset = (lfr_m - left_pad) * feats.n_mel;
// Merge lfr_m frames into one, lfr_n frames per window
T = T + (lfr_m - 1) / 2;
std::vector<float> p;
for (int i = 0; i < T_lrf; i++) {
// the first output window needs left padding
if (i == 0) {
// left padding
for (int j = 0; j < left_pad; j++) {
p.insert(p.end(), feats.data.begin(), feats.data.begin() + feats.n_mel);
}
p.insert(p.end(), feats.data.begin(), feats.data.begin() + left_pad_offset);
out_feats.push_back(p);
p.clear();
} else {
if (lfr_m <= T - i * lfr_n) {
p.insert(p.end(), feats.data.begin() + (i * lfr_n - left_pad) * feats.n_mel,
feats.data.begin() + (i * lfr_n - left_pad + lfr_m) * feats.n_mel);
out_feats.push_back(p);
p.clear();
} else {
// Pad the last window to lfr_m frames if fewer than lfr_m frames remain (copy the last frame)
int num_padding = lfr_m - (T - i * lfr_n);
// copy the remaining T - i * lfr_n frames once
p.insert(p.end(),
         feats.data.begin() + (i * lfr_n - left_pad) * feats.n_mel,
         feats.data.end());
for (int j = 0; j < num_padding; j++) {
p.insert(p.end(), feats.data.end() - feats.n_mel, feats.data.end());
}
out_feats.push_back(p);
p.clear();
}
}
}
feats.data.resize(T_lrf * feats.lfr_m * feats.n_mel);
// apply CMVN
for (int i = 0; i < T_lrf; i++) {
for (int j = 0; j < feats.lfr_m * feats.n_mel; j++) {
feats.data[i * feats.lfr_m * feats.n_mel + j] = (out_feats[i][j] + cmvn.cmvn_means[j]) * cmvn.cmvn_vars[j];
}
}
return true;
}
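The flattened LFR indexing above can be hard to read, so here is a minimal standalone sketch of the frame-stacking pattern it implements; T_in = 10 is a hypothetical input length chosen only for illustration, and lfr_m = 7 / lfr_n = 6 follow the defaults noted in the comments. It prints which source frames each output window stacks, with the first frame repeated for left padding and the last frame repeated at the tail.

// Standalone sketch of the LFR frame-stacking pattern used above; the input
// length T_in = 10 is hypothetical, lfr_m = 7 / lfr_n = 6 follow the defaults
// noted in the comments.
#include <cmath>
#include <cstdio>

int main() {
    const int T_in = 10;                  // number of fbank frames (illustrative)
    const int lfr_m = 7, lfr_n = 6;       // stack 7 frames per output, hop by 6
    const int left_pad = (lfr_m - 1) / 2; // 3 leading copies of the first frame
    const int T_lrf = (int) std::ceil(1.0 * T_in / lfr_n);

    for (int i = 0; i < T_lrf; i++) {
        std::printf("window %d stacks frames:", i);
        for (int j = 0; j < lfr_m; j++) {
            int src = i * lfr_n + j - left_pad; // index into the original frames
            if (src < 0)     src = 0;           // left padding repeats the first frame
            if (src >= T_in) src = T_in - 1;    // tail padding repeats the last frame
            std::printf(" %d", src);
        }
        std::printf("\n");
    }
    return 0;
}
// prints:
//   window 0 stacks frames: 0 0 0 0 1 2 3
//   window 1 stacks frames: 3 4 5 6 7 8 9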
5 changes: 5 additions & 0 deletions sense-voice/csrc/sense-voice-frontend.h
@@ -122,6 +122,11 @@ bool fbank_lfr_cmvn_feature(const std::vector<double> &samples,
const int frame_step, const int n_feats,
const int n_threads, const bool debug,
sense_voice_cmvn &cmvn, sense_voice_feature &feats);
bool fbank_lfr_cmvn_feature(const std::vector<float> &samples,
const int n_samples, const int frame_size,
const int frame_step, const int n_feats,
const int n_threads, const bool debug,
sense_voice_cmvn &cmvn, sense_voice_feature &feats);
bool load_wav_file(const char *filename, int32_t *sampling_rate,
std::vector<double> &data);
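A minimal usage sketch of the new float overload declared above follows. The 16 kHz input, 25 ms window, 10 ms hop, 80 mel bins, thread count, and identity CMVN stats are illustrative assumptions rather than values taken from this change, and it assumes sense-voice-frontend.h brings in the sense_voice_feature and sense_voice_cmvn definitions with std::vector members.

// Hypothetical usage sketch of the float overload of fbank_lfr_cmvn_feature.
#include <vector>
#include "sense-voice-frontend.h"

int main() {
    // one second of silence at an assumed 16 kHz sample rate (illustration only)
    std::vector<float> samples(16000, 0.0f);

    sense_voice_feature feats;   // lfr_m / lfr_n would normally come from the model config
    feats.lfr_m = 7;
    feats.lfr_n = 6;

    // identity CMVN stats (zero means, unit scales) sized to lfr_m * n_feats,
    // just so the sketch runs end to end; real stats come from the model
    sense_voice_cmvn cmvn;
    cmvn.cmvn_means.assign(feats.lfr_m * 80, 0.0f);
    cmvn.cmvn_vars.assign(feats.lfr_m * 80, 1.0f);

    // assumed parameters: 25 ms frames, 10 ms hop, 80 mel bins, 4 threads
    fbank_lfr_cmvn_feature(samples, (int) samples.size(),
                           /*frame_size=*/25, /*frame_step=*/10,
                           /*n_feats=*/80, /*n_threads=*/4,
                           /*debug=*/false, cmvn, feats);
    return 0;
}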

4 changes: 2 additions & 2 deletions sense-voice/csrc/sense-voice.cc
@@ -799,7 +799,7 @@ int sense_voice_batch_pcm_to_feature_with_state(struct sense_voice_context *ctx,
max_len = std::max(max_len, state->result_all[segmentID].samples.size());
for (size_t segmentID: state->segmentIDs)
{
std::vector<double>& pcmf32 = state->result_all[segmentID].samples;
std::vector<float>& pcmf32 = state->result_all[segmentID].samples;
if(pcmf32.size() < max_len) {
pcmf32.insert(pcmf32.end(), max_len - pcmf32.size(), 0);
}
@@ -892,7 +892,7 @@ int sense_voice_batch_full(struct sense_voice_context *ctx, const sense_voice_fu
return 0;
}

int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params &params, std::vector<std::vector<double>> &pcmf32,
int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params &params, std::vector<std::vector<float>> &pcmf32,
size_t max_batch_len, size_t max_batch_cnt,
bool use_prefix, bool use_itn)
{
2 changes: 1 addition & 1 deletion sense-voice/csrc/sense-voice.h
@@ -25,7 +25,7 @@ int sense_voice_full_parallel(struct sense_voice_context * ctx,
void sense_voice_print_output(struct sense_voice_context * ctx, bool need_prefix, bool use_itn, bool refresh_self=false);
void sense_voice_free_state(struct sense_voice_state * state);
int sense_voice_batch_full(struct sense_voice_context * ctx, const sense_voice_full_params &params);
int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params &params, std::vector<std::vector<double>> &pcmf32,
int sense_voice_batch_pcmf(struct sense_voice_context *ctx, const sense_voice_full_params &params, std::vector<std::vector<float>> &pcmf32,
size_t max_batch_len=90000, size_t max_batch_cnt=1,
bool use_prefix=true, bool use_itn=true);
void sense_voice_batch_print_output(struct sense_voice_context * ctx, bool need_prefix, bool use_itn, bool refresh_self=false);