microsoft
diff --git a/‎CODE_OF_CONDUCT.md‎
Lines changed: 9 additions & 0 deletions b/‎CODE_OF_CONDUCT.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Dockerfile‎
Lines changed: 40 additions & 0 deletions b/‎Dockerfile‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎LICENSE‎
Lines changed: 21 additions & 0 deletions b/‎LICENSE‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 275 additions & 0 deletions b/‎README.md‎
Lines changed: 275 additions & 0 deletions
diff --git a/‎SECURITY.md‎
Lines changed: 41 additions & 0 deletions b/‎SECURITY.md‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎SUPPORT.md‎
Lines changed: 13 additions & 0 deletions b/‎SUPPORT.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎analysis/ccmgen/Snakefile‎
Lines changed: 62 additions & 0 deletions b/‎analysis/ccmgen/Snakefile‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎analysis/clusters.py‎
Lines changed: 94 additions & 0 deletions b/‎analysis/clusters.py‎
Lines changed: 94 additions & 0 deletions
diff --git a/‎analysis/compile_cas9_fidelity.py‎
Lines changed: 70 additions & 0 deletions b/‎analysis/compile_cas9_fidelity.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎analysis/compile_dayhoffref.py‎
Lines changed: 18 additions & 0 deletions b/‎analysis/compile_dayhoffref.py‎
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,9 @@
+# Microsoft Open Source Code of Conduct
+
+This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+
+Resources:
+
+- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
+- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
+- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
@@ -0,0 +1,40 @@
+FROM pytorch/pytorch:2.7.0-cuda12.8-cudnn9-devel
+
+# Image for Dayhoff: builds causal-conv1d and mamba from pinned tags, installs
+# the dayhoff package from PyPI and clones the Dayhoff repository into /dayhoff.
+# The final clone uses an SSH mount, so the image must be built with BuildKit
+# and a forwarded key/agent, e.g. `docker build --ssh default -t dayhoff .`
+
+# Set environment to non-interactive for clean installs
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install git and other system dependencies.
+# openssh-client is listed explicitly: ssh-keyscan and the git-over-SSH clone
+# below need it, and relying on it arriving as a "recommended" package of git
+# is fragile.
+RUN apt-get update && apt-get install -y \
+    git \
+    openssh-client \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone and install causal-conv1d (pinned to v1.4.0, forced source build).
+# Each RUN starts in the image's working directory, so no trailing `cd ..`
+# is needed.
+RUN git clone https://github.com/Dao-AILab/causal-conv1d.git && \
+    cd causal-conv1d && \
+    git checkout v1.4.0 && \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install .
+
+# Clone and install mamba (pinned to v2.2.4) against the already-installed
+# torch (--no-build-isolation).
+# NOTE(review): the CAUSAL_CONV1D_* variables below look like build switches
+# for causal-conv1d; mamba's own setup.py appears to read MAMBA_* equivalents,
+# so these may have no effect here. Left unchanged - confirm before renaming,
+# since MAMBA_SKIP_CUDA_BUILD would change what gets built.
+RUN git clone https://github.com/state-spaces/mamba.git && \
+    cd mamba && \
+    git checkout v2.2.4 && \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE \
+    CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \
+    CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE \
+    pip install --no-build-isolation .
+
+# Install dayhoff package from PyPI
+RUN pip install dayhoff
+
+# Add GitHub to known_hosts to avoid host verification error
+RUN mkdir -p /root/.ssh && \
+    ssh-keyscan github.com >> /root/.ssh/known_hosts
+
+# Clone the private or public Dayhoff repo (uses the SSH agent forwarded at build time)
+RUN --mount=type=ssh git clone git@github.com:microsoft/dayhoff.git /dayhoff
+
+# Set working directory to inside the cloned repo
+WORKDIR /dayhoff
@@ -0,0 +1,21 @@
+    MIT License
+
+    Copyright (c) Microsoft Corporation.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
@@ -0,0 +1,41 @@
+<!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
+
+## Security
+
+Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
+
+If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
+
+## Reporting Security Issues
+
+**Please do not report security vulnerabilities through public GitHub issues.**
+
+Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
+
+If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
+
+You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
+
+Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
+
+  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
+  * Full paths of source file(s) related to the manifestation of the issue
+  * The location of the affected source code (tag/branch/commit or direct URL)
+  * Any special configuration required to reproduce the issue
+  * Step-by-step instructions to reproduce the issue
+  * Proof-of-concept or exploit code (if possible)
+  * Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
+
+If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
+
+## Preferred Languages
+
+We prefer all communications to be in English.
+
+## Policy
+
+Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
+
+<!-- END MICROSOFT SECURITY.MD BLOCK -->
@@ -0,0 +1,13 @@
+# Support
+
+## How to file issues and get help  
+
+This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
+issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
+feature request as a new Issue.
+
+For help and questions about using this project, please contact the authors.
+
+## Microsoft Support Policy  
+
+Support for this project is limited to the resources listed above.
@@ -0,0 +1,62 @@
+import os
+
+# Alternative (disabled): take names from selected_alignments/, keeping only
+# alignment files that contain more than two lines.
+# alignments = os.listdir('selected_alignments')
+# names = []
+# for alignment in alignments:
+#     with open(os.path.join("selected_alignments", alignment)) as f:
+#         lines = f.readlines()
+#         if len(lines) > 2:
+#             names.append(alignment[:-6])
+
+# Target names are the model files found in ccmgen_models/ with their
+# ".braw.gz" suffix removed. Entries without that suffix are ignored instead of
+# being truncated by eight characters, and the list is sorted so the order of
+# records written by rule cat is reproducible.
+# NOTE(review): the rules below read and write ccmgen_models_short/, not
+# ccmgen_models/ - confirm this is the intended directory to list.
+MODEL_SUFFIX = ".braw.gz"
+alignments = sorted(os.listdir('ccmgen_models'))
+names = [a[:-len(MODEL_SUFFIX)] for a in alignments if a.endswith(MODEL_SUFFIX)]
+
+
+
+# Concatenate every per-name CCMgen output into a single FASTA file. Each input
+# contributes one record: its own header line is discarded, all remaining
+# lines are joined into one sequence line, and the record is named after the
+# input file.
+rule cat:
+    input: ["ccmgen_outputs_short/" + name + ".fasta" for name in names]
+    output: "ccmgen_short.fasta"
+    run:
+        with open(output[0], "w") as out_file:
+            for in_file in input:
+                with open(in_file) as f:
+                    # Skip the header line written by ccmgen.
+                    _ = f.readline()
+                    # rstrip("\n") instead of s[:-1]: a final line without a
+                    # trailing newline must not lose its last residue.
+                    seq = "".join(s.rstrip("\n") for s in f)
+                # "ccmgen_outputs_short/<name>.fasta" -> "<name>"
+                name = in_file.split("/")[1][:-6]
+                out_file.write(">" + name + "\n" + seq + "\n")
+
+
+# Run ccmgen for one {name}: reads the model ccmgen_models_short/{name}.braw.gz
+# and the alignment single_sequences/{name}.fasta, and writes
+# ccmgen_outputs_short/{name}.fasta. The command requests a burn-in of 500 and
+# a single output sequence.
+rule ccmgen:
+    input:
+        "single_sequences/{name}.fasta",
+        "ccmgen_models_short/{name}.braw.gz"
+    output:
+        "ccmgen_outputs_short/{name}.fasta"
+    conda:
+        "ccmgen"
+    shell:
+        "ccmgen ccmgen_models_short/{wildcards.name}.braw.gz {output} "
+        "--mcmc-sampling "
+        "--alnfile single_sequences/{wildcards.name}.fasta "
+        "--mcmc-sample-random-gapped "
+        "--mcmc-burn-in 500 "
+        "--num-sequences 1"
+
+
+# Copy at most the first 57 * 2 = 114 lines of an alignment into
+# conditioning_sequences/.
+# NOTE(review): presumably 57 two-line FASTA records (the first sequence plus
+# 56 others, matching the rule name) - confirm the inputs are not line-wrapped.
+rule get_56:
+    input: "selected_alignments/{name}.fasta"
+    output: "conditioning_sequences/{name}.fasta"
+    run:
+        max_lines = 57 * 2
+        with open(input[0]) as src, open(output[0], 'w') as dst:
+            # zip stops once the range is exhausted, so nothing past
+            # max_lines is written.
+            for _, line in zip(range(max_lines), src):
+                dst.write(line)
+
+
+# Write the first two lines of an alignment to single_sequences/{name}.fasta.
+# NOTE(review): presumably the header and sequence of the first record; this
+# only holds if sequences are not wrapped over several lines - confirm.
+rule get_first:
+    input: "selected_alignments/{name}.fasta"
+    output: "single_sequences/{name}.fasta"
+    shell:
+        "head -n 2 {input} > {output}"
+
+
+# Run ccmpred on conditioning_sequences/{name}.fasta with 2 threads in the
+# "ccmgen" conda environment, passing the output path via -b. The resulting
+# ccmgen_models_short/{name}.braw.gz is the model input of rule ccmgen.
+# NOTE(review): -b presumably selects the binary raw model output - confirm
+# against the ccmpred CLI help.
+rule ccmpred:
+    input: "conditioning_sequences/{name}.fasta"
+    output: "ccmgen_models_short/{name}.braw.gz"
+    threads: 2
+    conda: "ccmgen"
+    shell:
+        "ccmpred {input} --no-logo --num-threads {threads} -b {output}"
@@ -0,0 +1,94 @@
+import os
+import jsonlines
+from tqdm import tqdm
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+_ = sns.set_style("white")
+input_dir = "/home/kevyan/generations/dayhoffref/"
+
+df_50 = pd.read_csv(os.path.join(input_dir, 'c50.tsv'), sep='\t', header=None)
+c50 = [None] * len(df_50)
+current_pos = 0
+current_cluster_rep = df_50.loc[0, 0]
+current_cluster = []
+for row in tqdm(df_50.itertuples()):
+    if row._1 != current_cluster_rep:
+        c50[current_pos] = {"rep_id": current_cluster_rep, "ids": current_cluster, "n": len(current_cluster)}
+        current_pos += 1
+        current_cluster = []
+        current_cluster_rep = row._1
+    current_cluster.append(row._2)
+c50[current_pos] = {"rep_id": current_cluster_rep, "ids": current_cluster, "n": len(current_cluster)}
+c50 = c50[:current_pos + 1]
+with jsonlines.open(os.path.join(input_dir, 'c50.jsonl'), 'w') as f:
+    for key in c50:
+        f.write(key)
+
+counts = {}
+total = 0
+for key in c50:
+    c = key["n"]
+    if c not in counts:
+        counts[c] = 1
+    else:
+        counts[c] += 1
+    total += c
+
+x = np.array(list(counts.keys()))
+x = np.sort(x)
+y = np.array([counts[xx] * xx for xx in x])
+y = np.cumsum(y)
+fig, ax = plt.subplots(1, 1)
+_ = ax.plot(x, y, '.-')
+_ = fig.savefig(os.path.join(input_dir, 'c50_cumsum.pdf'), dpi=300, bbox_inches='tight')
+
+df = pd.DataFrame(columns=["model", "temp", "direction", "cluster_size"])
+model = [None] * total
+temp = [None] * total
+direction = [None] * total
+cluster_size = [None] * total
+current_row = 0
+for c in tqdm(c50):
+    for name in c["ids"]:
+        broken = name.split('_')
+        model[current_row] = broken[0]
+        temp[current_row] = float(broken[2][1:])
+        direction[current_row] = broken[1]
+        cluster_size[current_row] = c["n"]
+        current_row += 1
+df["model"] = model
+df["temp"] = temp
+df["direction"] = direction
+df["cluster_size"] = cluster_size
+df.to_csv(os.path.join(input_dir, 'c50_sizes.csv'), index=False)
+
+
+df = pd.read_csv(os.path.join(input_dir, 'c50_sizes.csv'))
+model_names = list(set(model))
+grouped = df.groupby(["model", "temp", "direction"])
+grouped["cluster_size"].mean()
+
+def f(sizes):
+    return (sizes == 1).mean()
+
+agged = grouped.agg(
+    cluster_size_mean=('cluster_size', np.mean),
+    cluster_size_std=('cluster_size', np.std),
+    frac_singleton=("cluster_size", f),
+    n=("cluster_size", "count"),
+)
+pd.set_option('display.max_columns', None)
+pd.set_option('display.max_rows', None)
+pd.set_option('display.expand_frame_repr', False)
+print(agged.reset_index())
+
+df.groupby("model").agg(
+cluster_size_mean=('cluster_size', np.mean),
+    cluster_size_std=('cluster_size', np.std),
+    frac_singleton=("cluster_size", f),
+    n=("cluster_size", "count"),
+).reset_index()
@@ -0,0 +1,70 @@
+import os
+
+import pandas as pd
+from Bio.Align import PairwiseAligner, substitution_matrices
+from sequence_models.utils import parse_fasta
+from tqdm import tqdm
+
+
+base_path = "/home/kevyan/generations/cas9-no-order/"
+
+model = "short_cas9s_1.0_minp0.00_new"
+folding_df = pd.read_csv(os.path.join(base_path, 'esmfold_proteinmpnn_merge_data.csv'))
+seqs, names = parse_fasta(os.path.join(base_path, "%s.fasta" % model), return_names=True)
+df = folding_df[folding_df['if_temp'] == 1.0]
+name_df = pd.DataFrame()
+name_df['sequence'] = seqs
+name_df['file'] = names
+df = pd.merge(name_df, df, how='left', on='file')
+# for m in models:
+#     pdb_paths, mpnn_paths = get_all_paths(os.path.join(base_path, "%s_structures/pdb/esmfold/" %m), os.path.join(base_path, "%s_structures/esmfoldmpnn_iftemp_1" %m))
+#     fold_df, mpnn_df, df = results_to_pandas(pdb_paths, mpnn_paths, name="")
+#     df['model'] = m
+
+aligner = PairwiseAligner()
+aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
+aligner.open_gap_score = -10
+aligner.extend_gap_score = -0.5
+aligner.target_end_gap_score = 0.0
+aligner.query_end_gap_score = 0.0
+with tqdm(total=len(df)) as pbar:
+    homologs, homolog_names = parse_fasta(os.path.join('/home/kevyan/data/characterized_cas9s', "naturals.fasta"), return_names=True)
+    for idx, row in df.iterrows():
+        s = row['sequence']
+        s = s.replace("-", "")
+        s = s.replace("<mask2>", "")
+        s = s.replace("<mask1>", "")
+        s = s.replace("<mask3>", "")
+        s = s.replace("<eos>", "")
+        best_matches = -1
+        best_homolog_sequence = None
+        best_homolog_name = None
+        best_cterm_gaps = None
+        for hs, hn in zip(homologs, homolog_names):
+            alignment = aligner.align(s, hs)
+            if alignment.score > best_matches:
+                best_matches = alignment.score
+                best_homolog_sequence = hs
+                best_homolog_name = hn
+                best_cterm_gaps = len(hs) - alignment[0].aligned[1, -1, 1]
+        df.loc[idx, 'gen_length'] = len(s)
+        df.loc[idx, 'best_matches'] = best_matches
+        df.loc[idx, 'match_length'] = len(best_homolog_sequence)
+        df.loc[idx, 'homolog_name'] = best_homolog_name
+        df.loc[idx, 'homolog_sequence'] = best_homolog_sequence
+        df.loc[idx, 'cterm_gaps'] = best_cterm_gaps
+        pbar.update(1)
+
+df['plddt'] = df['esmfoldplddt']
+df['scperplexity'] = df['proteinmpnnperplexity']
+df['seq_id'] = df['best_matches'] / df['gen_length']
+df = df.sort_values(['cterm_gaps', 'plddt'], ascending=[True, False])
+df['name'] = [f.split('_')[-1] for f in df['file']]
+df.to_csv(os.path.join(base_path, "%s_fidelity.csv" %model), index=False)
+
+df = pd.read_csv(os.path.join(base_path, "%s_fidelity.csv" %model))
+
+df[df['plddt'] > .70].head(10)[['name', 'match_length', 'gen_length', 'plddt', 'cterm_gaps', 'best_matches']]
+# 52, 8, and 50 have the most domain hits
+df[df['plddt'] > 0.7].shape
+df.loc[[0, 1, 2, 18, 19, 21], ['name', 'sequence']].values
+df.loc[[0, 1, 2, 18, 19, 21], ['name', 'homolog_name', 'homolog_sequence']].values
@@ -0,0 +1,18 @@
+import os
+
+individual_dir = "/home/kevyan/generations/dayhoffref/dayhoff_generations/"
+individual_files = os.listdir(individual_dir)
+
+out_file = "/home/kevyan/generations/dayhoffref/dayhoffref.fasta"
+with open(out_file, 'w') as out:
+    for individual_file in individual_files:
+        name = individual_file.replace(".fasta", "")
+        name = name.replace("jamba-", "")
+        name = name.replace("10mbothfilter", "bbr-novel-sc")
+        print(name)
+        with open(os.path.join(individual_dir, individual_file), 'r') as infile:
+            for line in infile:
+                if line.startswith(">"):
+                    out.write(">" + name + "_" + line[1:])
+                else:
+                    out.write(line)