Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ We include x mark if the metric is auto-installed in versa.
| 43 | | Qwen2 Recording Environment - Background | qwen2_speech_background_environment_metric | qwen2_speech_background_environment_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 44 | | Qwen2 Recording Environment - Quality | qwen2_recording_quality_metric | qwen2_recording_quality_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 45 | | Qwen2 Recording Environment - Channel Type | qwen2_channel_type_metric | qwen2_channel_type_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |

| 46 | | OpenBEATs - Embedding extraction | openbeats_embedding_extraction | openbeats_embedding_extraction | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |
| 48 | | OpenBEATs - Similarity | openbeats_embedding_similarity | openbeats_embedding_similarity | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |
| 49 | | OpenBEATs - Class prediction | openbeats_class_prediction | openbeats_class_prediction | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |

### Dependent Metrics
|Number| Auto-Install | Metric Name (Auto-Install) | Key in config | Key in report | Code Source | References |
Expand Down
16 changes: 16 additions & 0 deletions egs/separate_metrics/openbeats.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Metrics with OpenBEATs
# Inference pipeline is released via VERSA!
# NOTE(review): the model_path values below point to a specific cluster
# filesystem (/work/nvme/...) — presumably placeholders; replace with
# locally available checkpoint paths before running.

# 1. Class prediction
# TODO(shikhar): Add other checkpoints for fine-tuned models.
- name: openbeats_class_prediction
  model_path: /work/nvme/bbjs/sbharadwaj/OpenBEATs/audioset20k/cls_earlarge3/ckpt_w_cfg.ckpt

# 2. Embedding extraction
# Writes the extracted embeddings to embedding_output_file (a .npy file).
- name: openbeats_embedding_extraction
  model_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
  embedding_output_file: test/test_samples/test2/embeddings/test_embeddings.npy

# 3. Embedding similarity
- name: openbeats_embedding_similarity
  model_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
94 changes: 94 additions & 0 deletions test/test_pipeline/test_openbeats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
import os

import yaml
import numpy as np

from versa.scorer_shared import (
find_files,
list_scoring,
load_score_modules,
)

# Reference values the checks below compare against — presumably captured
# from a known-good run of the OpenBEATs scorers (TODO confirm):
#  - "openbeats_embedding_extraction": first three values of the first
#    embedding frame (see test_openbeats_embedding_extraction)
#  - "openbeats_embedding_similarity": expected self-similarity score
TEST_INFO = {
    "openbeats_embedding_extraction": np.array([-0.42187455, -0.6287595, 0.1792216]),
    "openbeats_embedding_similarity": 1.0,
}


def test_openbeats_embedding_extraction(embedding_result):
    """Load the embedding written by the scorer and compare it to reference values.

    Expects ``embedding_result`` to carry an ``embedding_file`` path pointing
    at a ``.npy`` array of shape (1, 48, D); the loaded array is cached back
    into the result dict under ``embedding``.
    """
    # The scorer must have recorded where it wrote the embedding.
    assert (
        "embedding_file" in embedding_result
    ), "Embedding result does not contain 'embedding_file'"
    with open(embedding_result["embedding_file"], "rb") as fobj:
        embedding_result["embedding"] = np.load(fobj)

    # Check the (batch, frames) part of the shape; the feature dim is free.
    assert embedding_result["embedding"].shape[:-1] == (
        1,
        48,
    ), f'The frame size is off. Expected (1,48) but got {embedding_result["embedding"].shape[:-1]}'

    # Compare the first three values of the first frame against the reference.
    summary_value = embedding_result["embedding"][0, :3, 0]
    deviation = np.abs(TEST_INFO["openbeats_embedding_extraction"] - summary_value)
    if (deviation > 1e-3).any():
        raise ValueError(
            "Value issue in the test case, might be some issue in scorer {}".format(
                "openbeats_embedding_extraction"
            )
        )


def test_openbeats_embedding_similarity(embedding_result):
    """Check the OpenBEATs self-similarity score against the reference value."""
    assert (
        "similarity_score" in embedding_result
    ), "Embedding result does not contain 'similarity_score'"
    similarity_score = embedding_result["similarity_score"]
    # Identical reference/generated audio should score (near) 1.0.
    gap = np.abs(TEST_INFO["openbeats_embedding_similarity"] - similarity_score)
    assert gap < 1e-3, "Similarity score should be 1.0, got {}".format(similarity_score)


def test_openbeats_class_prediction(class_prediction_result):
"""Test OpenBEATs class prediction."""
assert (
"class_probabilities" in class_prediction_result
), "Class prediction result does not contain 'class_probabilities'"
class_probabilities = class_prediction_result["class_probabilities"]
print("Multi-class log probabilities: {}".format(class_probabilities), flush=True)


def info_update():
    """Run the OpenBEATs metrics on the bundled test samples and validate them.

    Loads the OpenBEATs metric config, scores the sample files against
    themselves (generated == ground truth), then runs the three check
    functions on the first result.

    Raises:
        FileNotFoundError: if the expected test-sample directory is missing.
        AssertionError: if the config is empty or a check fails.
    """
    # find files
    sample_dir = "test/test_samples/test2"
    if not os.path.isdir(sample_dir):
        # BUGFIX: the original only assigned gen_files inside this check and
        # then used it unconditionally, crashing with UnboundLocalError when
        # the directory is absent. Fail with a clear error instead.
        raise FileNotFoundError(
            "Test sample directory not found: {}".format(sample_dir)
        )
    gen_files = find_files(sample_dir)

    # Lazy %-style args avoid formatting when the log level filters this out.
    logging.info("The number of utterances = %d", len(gen_files))

    with open("egs/separate_metrics/openbeats.yaml", "r", encoding="utf-8") as f:
        score_config = yaml.full_load(f)

    # Validate the config before spending time loading models.
    assert len(score_config) > 0, "no scoring function is provided"

    score_modules = load_score_modules(
        score_config,
        use_gt=True,
        use_gpu=False,
    )

    # Score the samples against themselves so similarity should be ~1.0.
    score_info = list_scoring(
        gen_files, score_modules, gt_files=gen_files, output_file=None, io="soundfile"
    )

    test_openbeats_embedding_extraction(score_info[0])
    test_openbeats_embedding_similarity(score_info[0])
    test_openbeats_class_prediction(score_info[0])

    print("check successful", flush=True)


if __name__ == "__main__":
    info_update()
7 changes: 7 additions & 0 deletions versa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,10 @@
)
from versa.utterance_metrics.squim import squim_metric, squim_metric_no_ref
from versa.utterance_metrics.srmr import srmr_metric
from versa.utterance_metrics.openbeats import (
openbeats_setup,
openbeats_class_prediction,
openbeats_embedding_extraction,
openbeats_embedding_similarity,
)
from versa import models
2 changes: 2 additions & 0 deletions versa/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
"espnet_hyp_text",
"owsm_hyp_text",
"whisper_hyp_text",
"openbeats_class_prediction",
"openbeats_embedding_extraction", # HACK: using STR_METRIC to bypass summarization
]

NUM_METRIC = [
Expand Down
Empty file added versa/models/__init__.py
Empty file.
Empty file.
93 changes: 93 additions & 0 deletions versa/models/openbeats/decoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""A simple linear layer decoder.

This can be used for classification tasks from sequence input.
"""

from typing import Tuple
import torch
from typeguard import typechecked
from versa.models.openbeats.utils import make_pad_mask


class LinearDecoder(torch.nn.Module):
    """Pool a padded feature sequence over time and classify with one linear layer."""

    @typechecked
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        pooling: str = "mean",
        dropout: float = 0.0,
        pre_layer_norm: bool = False,
    ):
        """Initialize the module.

        Args:
            vocab_size: number of output classes (no special symbols).
            encoder_output_size: feature dimension D of the encoder output.
            pooling: time-axis reduction; one of "mean", "max", or "CLS".
            dropout: dropout probability applied before pooling (0.0 disables).
            pre_layer_norm: if True, apply LayerNorm to features before
                dropout and pooling.
        """
        super().__init__()

        self.input_dim = encoder_output_size
        self.output_dim = vocab_size  # No special symbols
        self.dropout = None
        if dropout != 0.0:
            self.dropout = torch.nn.Dropout(p=dropout)
        self.linear_out = torch.nn.Linear(self.input_dim, self.output_dim)
        assert pooling in [
            "mean",
            "max",
            "CLS",
        ], f"Invalid pooling: {pooling}. Should be 'mean', 'max' or 'CLS'."
        self.pooling = pooling
        self.layer_norm = torch.nn.LayerNorm(self.input_dim) if pre_layer_norm else None

    def forward(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
    ) -> torch.Tensor:
        """Pool the padded sequence and compute class logits.

        Args:
            hs_pad: padded features, shape (B, Tmax, D).
            hlens: valid lengths per batch entry, shape (B,).
        Returns:
            output: class logits, shape (B, n_classes).
        """
        # BUGFIX: the return annotation previously claimed
        # Tuple[torch.Tensor, torch.Tensor], but a single tensor is returned
        # (and `score` relies on that).

        mask = make_pad_mask(lengths=hlens, xs=hs_pad, length_dim=1).to(hs_pad.device)
        if self.layer_norm is not None:
            hs_pad = self.layer_norm(hs_pad)
        if self.dropout is not None:
            hs_pad = self.dropout(hs_pad)
        if self.pooling == "mean":
            # Average only over unpadded positions.
            unmasked_entries = (~mask).to(dtype=hs_pad.dtype)
            input_feature = (hs_pad * unmasked_entries).sum(dim=1)
            input_feature = input_feature / unmasked_entries.sum(dim=1)
        elif self.pooling == "max":
            # Exclude padded positions from the max by filling them with -inf.
            input_feature = hs_pad.masked_fill(mask, float("-inf"))
            input_feature, _ = torch.max(input_feature, dim=1)
        elif self.pooling == "CLS":
            # Use the first (CLS) token as the sequence summary.
            input_feature = hs_pad[:, 0, :]

        output = self.linear_out(input_feature)
        return output

    def score(self, ys, state, x):
        """Classify a single unpadded sequence.

        Args:
            ys: unused (kept for scorer-interface compatibility).
            state: unused.
            x: features of shape (T, D) for one sample — batch size 1,
                no padding.
        Returns:
            ret1: logits over (n_classes,)
            state: None
        """
        assert len(x.shape) == 2, x.shape
        hs_len = torch.tensor([x.shape[0]], dtype=torch.long).to(x.device)
        logits = self.forward(
            x.unsqueeze(0),
            hs_len,
        )
        return logits.squeeze(0), None

    def output_size(self) -> int:
        """Get the output size (number of classes)."""
        return self.output_dim
Loading