Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docs/supported_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ We include x mark if the metric is auto-installed in versa.
| 43 | | Qwen2 Recording Environment - Background | qwen2_speech_background_environment_metric | qwen2_speech_background_environment_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 44 | | Qwen2 Recording Environment - Quality | qwen2_recording_quality_metric | qwen2_recording_quality_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |
| 45 | | Qwen2 Recording Environment - Channel Type | qwen2_channel_type_metric | qwen2_channel_type_metric | [Qwen2 Audio](https://github.com/QwenLM/Qwen2-Audio) | [paper](https://arxiv.org/abs/2407.10759) |

| 46 | | OpenBEATs - Embedding extraction | openbeats_embedding_extraction | openbeats_embedding_extraction | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |
| 48 | | OpenBEATs - Similarity | openbeats_embedding_similarity | openbeats_embedding_similarity | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |
| 49 | | OpenBEATs - Class prediction | openbeats_class_prediction | openbeats_class_prediction | Released via VERSA | [Challenge report/OpenBEATs arxiv](todo) |

### Dependent Metrics
|Number| Auto-Install | Metric Name (Auto-Install) | Key in config | Key in report | Code Source | References |
Expand Down
16 changes: 16 additions & 0 deletions egs/separate_metrics/openbeats.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Metrics with OpenBEATs
# Inference pipeline is released via VERSA!
# NOTE(review): the model_path values below point to a specific cluster
# filesystem (/work/nvme/...) — presumably placeholders; replace with
# locally available checkpoint paths before running.

# 1. Class prediction
# TODO(shikhar): Add other checkpoints for fine-tuned models.
- name: openbeats_class_prediction
  model_path: /work/nvme/bbjs/sbharadwaj/OpenBEATs/audioset20k/cls_earlarge3/ckpt_w_cfg.ckpt

# 2. Embedding extraction
# Writes the extracted embeddings to embedding_output_file (a .npy file).
- name: openbeats_embedding_extraction
  model_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
  embedding_output_file: test/test_samples/test2/embeddings/test_embeddings.npy

# 3. Embedding similarity
- name: openbeats_embedding_similarity
  model_path: /work/nvme/bbjs/sbharadwaj/7Msounds/exp/beats_iter1_large1.tune_lr1.0e-4_warmup40000_bins1600000_totalsteps400000/epoch_latest.pt
94 changes: 94 additions & 0 deletions test/test_pipeline/test_openbeats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
import os

import yaml
import numpy as np

from versa.scorer_shared import (
find_files,
list_scoring,
load_score_modules,
)

# Reference values the checks below compare against — presumably captured
# from a known-good run of the OpenBEATs scorers (TODO confirm):
#  - "openbeats_embedding_extraction": first three values of the first
#    embedding frame (see test_openbeats_embedding_extraction)
#  - "openbeats_embedding_similarity": expected self-similarity score
TEST_INFO = {
    "openbeats_embedding_extraction": np.array([-0.42187455, -0.6287595, 0.1792216]),
    "openbeats_embedding_similarity": 1.0,
}


def test_openbeats_embedding_extraction(embedding_result):
    """Load the embedding written by the scorer and compare it to reference values.

    Expects ``embedding_result`` to carry an ``embedding_file`` path pointing
    at a ``.npy`` array of shape (1, 48, D); the loaded array is cached back
    into the result dict under ``embedding``.
    """
    # The scorer must have recorded where it wrote the embedding.
    assert (
        "embedding_file" in embedding_result
    ), "Embedding result does not contain 'embedding_file'"
    with open(embedding_result["embedding_file"], "rb") as fobj:
        embedding_result["embedding"] = np.load(fobj)

    # Check the (batch, frames) part of the shape; the feature dim is free.
    assert embedding_result["embedding"].shape[:-1] == (
        1,
        48,
    ), f'The frame size is off. Expected (1,48) but got {embedding_result["embedding"].shape[:-1]}'

    # Compare the first three values of the first frame against the reference.
    summary_value = embedding_result["embedding"][0, :3, 0]
    deviation = np.abs(TEST_INFO["openbeats_embedding_extraction"] - summary_value)
    if (deviation > 1e-3).any():
        raise ValueError(
            "Value issue in the test case, might be some issue in scorer {}".format(
                "openbeats_embedding_extraction"
            )
        )


def test_openbeats_embedding_similarity(embedding_result):
    """Check the OpenBEATs self-similarity score against the reference value."""
    assert (
        "similarity_score" in embedding_result
    ), "Embedding result does not contain 'similarity_score'"
    similarity_score = embedding_result["similarity_score"]
    # Identical reference/generated audio should score (near) 1.0.
    gap = np.abs(TEST_INFO["openbeats_embedding_similarity"] - similarity_score)
    assert gap < 1e-3, "Similarity score should be 1.0, got {}".format(similarity_score)


def test_openbeats_class_prediction(class_prediction_result):
"""Test OpenBEATs class prediction."""
assert (
"class_probabilities" in class_prediction_result
), "Class prediction result does not contain 'class_probabilities'"
class_probabilities = class_prediction_result["class_probabilities"]
print("Multi-class log probabilities: {}".format(class_probabilities), flush=True)


def info_update():
    """Run the OpenBEATs metrics on the bundled test samples and validate them.

    Loads the OpenBEATs metric config, scores the sample files against
    themselves (generated == ground truth), then runs the three check
    functions on the first result.

    Raises:
        FileNotFoundError: if the expected test-sample directory is missing.
        AssertionError: if the config is empty or a check fails.
    """
    # find files
    sample_dir = "test/test_samples/test2"
    if not os.path.isdir(sample_dir):
        # BUGFIX: the original only assigned gen_files inside this check and
        # then used it unconditionally, crashing with UnboundLocalError when
        # the directory is absent. Fail with a clear error instead.
        raise FileNotFoundError(
            "Test sample directory not found: {}".format(sample_dir)
        )
    gen_files = find_files(sample_dir)

    # Lazy %-style args avoid formatting when the log level filters this out.
    logging.info("The number of utterances = %d", len(gen_files))

    with open("egs/separate_metrics/openbeats.yaml", "r", encoding="utf-8") as f:
        score_config = yaml.full_load(f)

    # Validate the config before spending time loading models.
    assert len(score_config) > 0, "no scoring function is provided"

    score_modules = load_score_modules(
        score_config,
        use_gt=True,
        use_gpu=False,
    )

    # Score the samples against themselves so similarity should be ~1.0.
    score_info = list_scoring(
        gen_files, score_modules, gt_files=gen_files, output_file=None, io="soundfile"
    )

    test_openbeats_embedding_extraction(score_info[0])
    test_openbeats_embedding_similarity(score_info[0])
    test_openbeats_class_prediction(score_info[0])

    print("check successful", flush=True)


if __name__ == "__main__":
    info_update()
7 changes: 7 additions & 0 deletions versa/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,10 @@
)
from versa.utterance_metrics.squim import squim_metric, squim_metric_no_ref
from versa.utterance_metrics.srmr import srmr_metric
from versa.utterance_metrics.openbeats import (
openbeats_setup,
openbeats_class_prediction,
openbeats_embedding_extraction,
openbeats_embedding_similarity,
)
from versa import models
2 changes: 2 additions & 0 deletions versa/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
"espnet_hyp_text",
"owsm_hyp_text",
"whisper_hyp_text",
"openbeats_class_prediction",
"openbeats_embedding_extraction", # HACK: using STR_METRIC to bypass summarization
]

NUM_METRIC = [
Expand Down
Empty file added versa/models/__init__.py
Empty file.
Empty file.
93 changes: 93 additions & 0 deletions versa/models/openbeats/decoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""A simple linear layer decoder.

This can be used for classification tasks from sequence input.
"""

from typing import Tuple
import torch
from typeguard import typechecked
from versa.models.openbeats.utils import make_pad_mask


class LinearDecoder(torch.nn.Module):
    """Pool a padded feature sequence over time and classify with one linear layer."""

    @typechecked
    def __init__(
        self,
        vocab_size: int,
        encoder_output_size: int,
        pooling: str = "mean",
        dropout: float = 0.0,
        pre_layer_norm: bool = False,
    ):
        """Initialize the module.

        Args:
            vocab_size: number of output classes (no special symbols).
            encoder_output_size: feature dimension D of the encoder output.
            pooling: time-axis reduction; one of "mean", "max", or "CLS".
            dropout: dropout probability applied before pooling (0.0 disables).
            pre_layer_norm: if True, apply LayerNorm to features before
                dropout and pooling.
        """
        super().__init__()

        self.input_dim = encoder_output_size
        self.output_dim = vocab_size  # No special symbols
        self.dropout = None
        if dropout != 0.0:
            self.dropout = torch.nn.Dropout(p=dropout)
        self.linear_out = torch.nn.Linear(self.input_dim, self.output_dim)
        assert pooling in [
            "mean",
            "max",
            "CLS",
        ], f"Invalid pooling: {pooling}. Should be 'mean', 'max' or 'CLS'."
        self.pooling = pooling
        self.layer_norm = torch.nn.LayerNorm(self.input_dim) if pre_layer_norm else None

    def forward(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
    ) -> torch.Tensor:
        """Pool the padded sequence and compute class logits.

        Args:
            hs_pad: padded features, shape (B, Tmax, D).
            hlens: valid lengths per batch entry, shape (B,).
        Returns:
            output: class logits, shape (B, n_classes).
        """
        # BUGFIX: the return annotation previously claimed
        # Tuple[torch.Tensor, torch.Tensor], but a single tensor is returned
        # (and `score` relies on that).

        mask = make_pad_mask(lengths=hlens, xs=hs_pad, length_dim=1).to(hs_pad.device)
        if self.layer_norm is not None:
            hs_pad = self.layer_norm(hs_pad)
        if self.dropout is not None:
            hs_pad = self.dropout(hs_pad)
        if self.pooling == "mean":
            # Average only over unpadded positions.
            unmasked_entries = (~mask).to(dtype=hs_pad.dtype)
            input_feature = (hs_pad * unmasked_entries).sum(dim=1)
            input_feature = input_feature / unmasked_entries.sum(dim=1)
        elif self.pooling == "max":
            # Exclude padded positions from the max by filling them with -inf.
            input_feature = hs_pad.masked_fill(mask, float("-inf"))
            input_feature, _ = torch.max(input_feature, dim=1)
        elif self.pooling == "CLS":
            # Use the first (CLS) token as the sequence summary.
            input_feature = hs_pad[:, 0, :]

        output = self.linear_out(input_feature)
        return output

    def score(self, ys, state, x):
        """Classify a single unpadded sequence.

        Args:
            ys: unused (kept for scorer-interface compatibility).
            state: unused.
            x: features of shape (T, D) for one sample — batch size 1,
                no padding.
        Returns:
            ret1: logits over (n_classes,)
            state: None
        """
        assert len(x.shape) == 2, x.shape
        hs_len = torch.tensor([x.shape[0]], dtype=torch.long).to(x.device)
        logits = self.forward(
            x.unsqueeze(0),
            hs_len,
        )
        return logits.squeeze(0), None

    def output_size(self) -> int:
        """Get the output size (number of classes)."""
        return self.output_dim
Loading