Skip to content

Commit 53bf71a

Browse files
committed
Rename data functions, add function for padding MSAs with dummy sequences
1 parent a4cc6f5 commit 53bf71a

8 files changed

Lines changed: 288 additions & 64 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# DiffPaSS
1+
# DiffPaSS – Differentiable Pairing using Soft Scores
22

33
<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->
44

diffpass/_modidx.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,12 @@
5454
'diffpass/data_utils.py'),
5555
'diffpass.data_utils.create_groupwise_seq_records': ( 'data_utils.html#create_groupwise_seq_records',
5656
'diffpass/data_utils.py'),
57-
'diffpass.data_utils.fetch_seq_records_from_group_names': ( 'data_utils.html#fetch_seq_records_from_group_names',
58-
'diffpass/data_utils.py'),
5957
'diffpass.data_utils.get_single_and_paired_seqs': ( 'data_utils.html#get_single_and_paired_seqs',
6058
'diffpass/data_utils.py'),
61-
'diffpass.data_utils.seq_records_tokenizer': ( 'data_utils.html#seq_records_tokenizer',
62-
'diffpass/data_utils.py')},
59+
'diffpass.data_utils.one_hot_encode_msa': ( 'data_utils.html#one_hot_encode_msa',
60+
'diffpass/data_utils.py'),
61+
'diffpass.data_utils.pad_msas_with_dummy_sequences': ( 'data_utils.html#pad_msas_with_dummy_sequences',
62+
'diffpass/data_utils.py')},
6363
'diffpass.entropy_ops': { 'diffpass.entropy_ops.pointwise_shannon': ( 'entropy_ops.html#pointwise_shannon',
6464
'diffpass/entropy_ops.py'),
6565
'diffpass.entropy_ops.smooth_mean_one_body_entropy': ( 'entropy_ops.html#smooth_mean_one_body_entropy',

diffpass/data_utils.py

Lines changed: 65 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/data_utils.ipynb.
22

33
# %% auto 0
4-
__all__ = ['SeqRecord', 'SeqRecords', 'GroupwiseSeqRecords', 'create_groupwise_seq_records', 'fetch_seq_records_from_group_names',
5-
'get_single_and_paired_seqs', 'seq_records_tokenizer', 'compute_num_correct_pairings',
4+
__all__ = ['SeqRecord', 'SeqRecords', 'GroupwiseSeqRecords', 'create_groupwise_seq_records', 'pad_msas_with_dummy_sequences',
5+
'get_single_and_paired_seqs', 'one_hot_encode_msa', 'compute_num_correct_pairings',
66
'compute_comparable_group_idxs']
77

8-
# %% ../nbs/data_utils.ipynb 3
8+
# %% ../nbs/data_utils.ipynb 4
99
from collections import defaultdict
1010
from collections.abc import Sequence
1111
from typing import Optional, Union
12+
from copy import deepcopy
1213

1314
import numpy as np
1415

@@ -21,7 +22,7 @@
2122
SeqRecords = list[SeqRecord]
2223
GroupwiseSeqRecords = dict[str, SeqRecords]
2324

24-
# %% ../nbs/data_utils.ipynb 4
25+
# %% ../nbs/data_utils.ipynb 5
2526
def create_groupwise_seq_records(
2627
seq_records: dict[str, SeqRecords],
2728
group_name_func: callable,
@@ -41,18 +42,61 @@ def create_groupwise_seq_records(
4142
return data_group_by_group
4243

4344

44-
def fetch_seq_records_from_group_names(
45-
data_group_by_group: GroupwiseSeqRecords,
46-
group_names: Sequence[str],
47-
) -> dict:
48-
seq_records = []
49-
group_sizes = []
45+
def pad_msas_with_dummy_sequences(
46+
data_group_by_group_x: GroupwiseSeqRecords,
47+
data_group_by_group_y: GroupwiseSeqRecords,
48+
*,
49+
dummy_symbol: str = "-",
50+
) -> tuple[GroupwiseSeqRecords, GroupwiseSeqRecords]:
51+
"""Pad MSAs with dummy sequences so that all groups/species contain the same
52+
number of sequences."""
53+
# Check that all sequences in the x and y MSAs have the same length
54+
lengths_x = set(
55+
[
56+
len(seq)
57+
for data_x_this_group in data_group_by_group_x.values()
58+
for _, seq in data_x_this_group
59+
]
60+
)
61+
lengths_y = set(
62+
[
63+
len(seq)
64+
for data_y_this_group in data_group_by_group_y.values()
65+
for _, seq in data_y_this_group
66+
]
67+
)
68+
if len(lengths_x) != 1:
69+
raise ValueError(
70+
"Sequences in the first input collection must have the same lengths for padding with dummy gap sequences."
71+
)
72+
if len(lengths_y) != 1:
73+
raise ValueError(
74+
"Sequences in the second input collection must have the same lengths for padding with dummy gap sequences."
75+
)
76+
len_x = next(iter(lengths_x))
77+
len_y = next(iter(lengths_y))
78+
79+
group_names = set(data_group_by_group_x.keys()) | set(data_group_by_group_y.keys())
80+
81+
data_group_by_group_x_padded = defaultdict(SeqRecords)
82+
data_group_by_group_y_padded = defaultdict(SeqRecords)
83+
data_group_by_group_x_padded.update(deepcopy(data_group_by_group_x))
84+
data_group_by_group_y_padded.update(deepcopy(data_group_by_group_y))
5085
for group_name in group_names:
51-
recs_this_group_name = data_group_by_group[group_name]
52-
seq_records.extend(recs_this_group_name)
53-
group_sizes.append(len(recs_this_group_name))
86+
max_depth = max(
87+
len(data_group_by_group_x[group_name]),
88+
len(data_group_by_group_y[group_name]),
89+
)
90+
data_group_by_group_x_padded[group_name] += [
91+
(f"dummy_{i}", dummy_symbol * len_x)
92+
for i in range(max_depth - len(data_group_by_group_x[group_name]))
93+
]
94+
data_group_by_group_y_padded[group_name] += [
95+
(f"dummy_{i}", dummy_symbol * len_y)
96+
for i in range(max_depth - len(data_group_by_group_y[group_name]))
97+
]
5498

55-
return {"seq_records": seq_records, "group_sizes": group_sizes}
99+
return data_group_by_group_x_padded, data_group_by_group_y_padded
56100

57101

58102
def get_single_and_paired_seqs(
@@ -96,15 +140,16 @@ def get_single_and_paired_seqs(
96140
"xy_seqs_to_counts_by_group": xy_seqs_to_counts_by_group,
97141
}
98142

99-
# %% ../nbs/data_utils.ipynb 5
100-
def seq_records_tokenizer(
143+
# %% ../nbs/data_utils.ipynb 6
144+
def one_hot_encode_msa(
101145
seq_records: SeqRecords,
102146
aa_to_int: Optional[dict[str, int]] = None,
103147
device: Optional[torch.device] = None,
104148
) -> torch.Tensor:
105149
"""
106-
Given a list of records of the form (header, sequence), tokenize each
107-
sequence and one-hot encode each token.
150+
Given a list of records of the form (header, sequence), assumed to be a parsed MSA,
151+
tokenize each sequence and one-hot encode each token. Return a 3D tensor representing the
152+
one-hot encoded MSA.
108153
"""
109154
if aa_to_int is None:
110155
aa_to_int = DEFAULT_AA_TO_INT
@@ -120,7 +165,7 @@ def seq_records_tokenizer(
120165

121166
return tokenized_records_oh
122167

123-
# %% ../nbs/data_utils.ipynb 6
168+
# %% ../nbs/data_utils.ipynb 8
124169
def compute_num_correct_pairings(
125170
hard_perms_by_group: list[np.ndarray],
126171
*,
@@ -168,7 +213,7 @@ def compute_num_correct_pairings(
168213

169214
return correct
170215

171-
# %% ../nbs/data_utils.ipynb 7
216+
# %% ../nbs/data_utils.ipynb 10
172217
def compute_comparable_group_idxs(
173218
group_sizes_arr: np.ndarray, *, max_size_ratio: int, max_group_size: int
174219
) -> np.ndarray:

diffpass/msa_parsing.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,24 @@
11
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/msa_parsing.ipynb.
22

33
# %% auto 0
4-
__all__ = ['deletekeys', 'translation', 'read_sequence', 'remove_insertions', 'read_msa']
4+
__all__ = ['SeqRecord', 'SeqRecords', 'deletekeys', 'translation', 'read_sequence', 'remove_insertions', 'read_msa']
55

6-
# %% ../nbs/msa_parsing.ipynb 2
7-
from typing import List, Tuple
6+
# %% ../nbs/msa_parsing.ipynb 3
87
import string
98
import itertools
109

1110
from Bio import SeqIO
1211

12+
SeqRecord = tuple[str, str]
13+
SeqRecords = list[SeqRecord]
1314

1415
deletekeys = dict.fromkeys(string.ascii_lowercase)
1516
deletekeys["."] = None
1617
deletekeys["*"] = None
1718
translation = str.maketrans(deletekeys)
1819

1920

20-
def read_sequence(filename: str) -> Tuple[str, str]:
21+
def read_sequence(filename: str) -> SeqRecord:
2122
"""Reads the first (reference) sequences from a fasta or MSA file."""
2223
record = next(SeqIO.parse(filename, "fasta"))
2324
return record.description, str(record.seq)
@@ -28,7 +29,7 @@ def remove_insertions(sequence: str) -> str:
2829
return sequence.translate(translation)
2930

3031

31-
def read_msa(filename: str, nseq: int) -> List[Tuple[str, str]]:
32+
def read_msa(filename: str, nseq: int) -> SeqRecords:
3233
"""Reads the first nseq sequences from an MSA file, automatically removes insertions."""
3334
if nseq == -1:
3435
nseq = len([elem.id for elem in SeqIO.parse(filename, "fasta")])

mutual_information_msa_pairing.ipynb

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -82,31 +82,31 @@
8282
},
8383
{
8484
"cell_type": "code",
85+
"execution_count": 3,
8586
"metadata": {},
87+
"outputs": [],
8688
"source": [
8789
"# DiffPaSS parsing and preprocessing utilities\n",
8890
"from diffpass.msa_parsing import read_msa\n",
89-
"from diffpass.data_utils import create_groupwise_seq_records, fetch_seq_records_from_group_names, seq_records_tokenizer, compute_num_correct_pairings\n",
91+
"from diffpass.data_utils import create_groupwise_seq_records, one_hot_encode_msa, compute_num_correct_pairings\n",
9092
"\n",
9193
"\n",
9294
"# Load prokaryotic datasets\n",
9395
"\n",
9496
"# HK-RR datasets\n",
9597
"msa_data = [\n",
96-
" read_msa(\"../data/HK-RR/HK_in_Concat_nnn.fasta\", -1),\n",
97-
" read_msa(\"../data/HK-RR/RR_in_Concat_nnn.fasta\", -1)\n",
98+
" read_msa(\"data/HK-RR/HK_in_Concat_nnn.fasta\", -1),\n",
99+
" read_msa(\"data/HK-RR/RR_in_Concat_nnn.fasta\", -1)\n",
98100
"]\n",
99101
"species_name_func = lambda header: header.split(\"|\")[1]\n",
100102
"\n",
101103
"## MALG-MALK datasets\n",
102104
"# msa_data = [\n",
103-
"# read_msa(\"../data/MALG-MALK/MALG_cov75_hmmsearch_extr5000_withLast_b.fasta\", -1),\n",
104-
"# read_msa(\"../data/MALG-MALK/MALK_cov75_hmmsearch_extr5000_withLast_b.fasta\", -1)\n",
105+
"# read_msa(\"data/MALG-MALK/MALG_cov75_hmmsearch_extr5000_withLast_b.fasta\", -1),\n",
106+
"# read_msa(\"data/MALG-MALK/MALK_cov75_hmmsearch_extr5000_withLast_b.fasta\", -1)\n",
105107
"# ]\n",
106108
"# species_name_func = lambda header: header.split(\"_\")[-1]"
107-
],
108-
"outputs": [],
109-
"execution_count": 3
109+
]
110110
},
111111
{
112112
"cell_type": "markdown",
@@ -123,15 +123,13 @@
123123
"metadata": {},
124124
"outputs": [],
125125
"source": [
126+
"# Organize the MSAs by species (\"groupwise\")\n",
126127
"msa_data_species_by_species = [\n",
127128
" create_groupwise_seq_records(msa, species_name_func, remove_groups_with_one_seq=True) \n",
128129
" for msa in msa_data\n",
129130
"]\n",
130131
"all_species = list(msa_data_species_by_species[0])\n",
131-
"assert all_species == list(msa_data_species_by_species[1])\n",
132-
"\n",
133-
"n_species_to_sample = 50\n",
134-
"species = np.random.choice(all_species, n_species_to_sample, replace=False)"
132+
"assert all_species == list(msa_data_species_by_species[1])"
135133
]
136134
},
137135
{
@@ -149,12 +147,15 @@
149147
}
150148
],
151149
"source": [
152-
"msa_data_and_species_sizes = [\n",
153-
" fetch_seq_records_from_group_names(msa_species_by_species, species)\n",
150+
"# Sample a few species to work with, and filter the MSAs to only include these species\n",
151+
"n_species_to_sample = 50\n",
152+
"species = np.random.choice(all_species, n_species_to_sample, replace=False)\n",
153+
"msa_data_species_by_species = [\n",
154+
" {sp: msa_species_by_species[sp] for sp in species}\n",
154155
" for msa_species_by_species in msa_data_species_by_species\n",
155156
"]\n",
156157
"\n",
157-
"species_sizes = msa_data_and_species_sizes[0][\"group_sizes\"]\n",
158+
"species_sizes = [len(records) for records in msa_data_species_by_species[0].values()]\n",
158159
"print(f\"Species sizes: {species_sizes}\")\n",
159160
"\n",
160161
"n_seqs = sum(species_sizes)\n",
@@ -167,8 +168,14 @@
167168
"metadata": {},
168169
"outputs": [],
169170
"source": [
170-
"x = seq_records_tokenizer(msa_data_and_species_sizes[0][\"seq_records\"], device=DEVICE)\n",
171-
"y = seq_records_tokenizer(msa_data_and_species_sizes[1][\"seq_records\"], device=DEVICE)"
171+
"# Bring data back into the original form (list of records)\n",
172+
"msa_data = [\n",
173+
" [record for records_this_species in msa_species_by_species.values() for record in records_this_species]\n",
174+
" for msa_species_by_species in msa_data_species_by_species\n",
175+
"]\n",
176+
"\n",
177+
"x = one_hot_encode_msa(msa_data[0], device=DEVICE)\n",
178+
"y = one_hot_encode_msa(msa_data[1], device=DEVICE)"
172179
]
173180
},
174181
{

0 commit comments

Comments (0)