Commit 3409d91 ("clean")
Parent: 2cab4b8

3 files changed: 57 additions & 75 deletions

README.md

Lines changed: 20 additions & 20 deletions
@@ -82,7 +82,7 @@ docker run -it samirchar/dayhoff:latest
 
 ## Data and model availability
 
-All Dayhoff models are available on [AzureAIFoundry](https://ai.azure.com/labs)
+All Dayhoff models are available on [Azure AI Foundry](https://aka.ms/dayhoff/foundry)
 
 Additionally, all Dayhoff models are also hosted on [Hugging Face](https://huggingface.co/collections/microsoft/dayhoff-atlas-6866d679465a2685b06ee969) 🤗. All datasets used in the paper, with the exception of OpenProteinSet are available on Hugging Face in three formats: FASTA, Arrow, and JSONL.
 
@@ -92,34 +92,34 @@ GigaRef, BackboneRef, and DayhoffRef are available under [CC BY License](https:/
 ### Training datasets
 The Dayhoff models were trained on the Dayhoff Atlas, with varying data mixes which include:
 
-**[UniRef50](https://www.uniprot.org/)** (**UR50**) - dataset from UniProt, clustered at 50% sequence identity, contains only cluster representatives.
-* _Splits: train (25 GB), test (26 MB), valid (26 MB)_
+* **[UniRef50](https://www.uniprot.org/)** (**UR50**) - dataset from UniProt, clustered at 50% sequence identity, contains only cluster representatives.
+  * _Splits: train (25 GB), test (26 MB), valid (26 MB)_
 
-**[UniRef90](https://www.uniprot.org/)** (**UR90**) - dataset from UniProt, clustered at 90% sequence identity, contains cluster representatives and members.
-* _Splits: train (83 GB), test (90 MB), valid (87 MB)_
+* **[UniRef90](https://www.uniprot.org/)** (**UR90**) - dataset from UniProt, clustered at 90% sequence identity, contains cluster representatives and members.
+  * _Splits: train (83 GB), test (90 MB), valid (87 MB)_
 
 
-**GigaRef** (**GR**) – 3.43B protein sequences across 1.7B clusters of metagenomic and natural protein sequences. There are two subsets of GigaRef:
-* **GigaRef-clusters** (**GR**) - Only includes cluster representatives and members, no singletons
-  * _Splits: train (433 GB), test (22 MB)_
-* **GigaRef-singletons** (**GR-s**) - Only includes singletons
-  * _Splits: train (282 GB)_
+* **GigaRef** (**GR**) – 3.43B protein sequences across 1.7B clusters of metagenomic and natural protein sequences. There are two subsets of GigaRef:
+  * **GigaRef-clusters** (**GR**) - Only includes cluster representatives and members, no singletons
+    * _Splits: train (433 GB), test (22 MB)_
+  * **GigaRef-singletons** (**GR-s**) - Only includes singletons
+    * _Splits: train (282 GB)_
 
-**BackboneRef** (**BR**) – 46M structure-derived synthetic sequences from ca. 240,000 de novo backbones, with three subsets containing 10M sequences each:
-* **BackboneRef unfiltered** (**BRu**) – 10M sequences randomly sampled from all 46M designs.
-  * _Splits: train (3 GB)_
-* **BackboneRef quality** (**BRq**) – 10M sequences sampled from 127,633 backbones whose average self-consistency RMSD ≤ 2 Å.
-  * _Splits: train (3 GB)_
-* **BackboneRef novelty** (**BRn**) – 10M sequences from 138,044 backbones with a max TM-score < 0.5 to any natural structure.
-  * _Splits: train (3 GB)_
+* **BackboneRef** (**BR**) – 46M structure-derived synthetic sequences from ca. 240,000 de novo backbones, with three subsets containing 10M sequences each:
+  * **BackboneRef unfiltered** (**BRu**) – 10M sequences randomly sampled from all 46M designs.
+    * _Splits: train (3 GB)_
+  * **BackboneRef quality** (**BRq**) – 10M sequences sampled from 127,633 backbones whose average self-consistency RMSD ≤ 2 Å.
+    * _Splits: train (3 GB)_
+  * **BackboneRef novelty** (**BRn**) – 10M sequences from 138,044 backbones with a max TM-score < 0.5 to any natural structure.
+    * _Splits: train (3 GB)_
 
-**[OpenProteinSet](https://arxiv.org/abs/2308.05326)** (**HM**) – 16 million precomputed MSAs from 16M sequences in UniClust30 and 140,000 PDB chains.
+* **[OpenProteinSet](https://arxiv.org/abs/2308.05326)** (**HM**) – 16 million precomputed MSAs from 16M sequences in UniClust30 and 140,000 PDB chains.
 
 ### DayhoffRef
 Given the potential for generative models to expand the space of proteins and their functions, we used the Dayhoff models to generate DayhoffRef, a PLM-generated database of synthetic protein sequences
 
-**DayhoffRef**: dataset of 16 million synthetic protein sequences generated by the Dayhoff models: Dayhoff-3b-UR90, Dayhoff-3b-GR-HM, Dayhoff-3b-GR-HM-c, and Dayhoff-170m-UR50-BRn.
-* _Splits: train (5 GB)_
+* **DayhoffRef**: dataset of 16 million synthetic protein sequences generated by the Dayhoff models: Dayhoff-3b-UR90, Dayhoff-3b-GR-HM, Dayhoff-3b-GR-HM-c, and Dayhoff-170m-UR50-BRn.
+  * _Splits: train (5 GB)_
 
 ### Loading datasets in HuggingFace

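The README hunk above notes that the datasets ship in three formats: FASTA, Arrow, and JSONL. As a rough sketch of what consuming the two text formats involves (the record layout and field names here are hypothetical, not taken from the actual Dayhoff files):

```python
import io
import json

def parse_fasta_records(handle):
    """Yield (header, sequence) pairs from a FASTA text stream."""
    header, chunks = None, []
    for line in handle:
        line = line.strip()
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(chunks)
            header, chunks = line[1:], []
        elif line:
            chunks.append(line)
    if header is not None:
        yield header, "".join(chunks)

# Hypothetical records; real Dayhoff Atlas headers/fields may differ.
fasta_text = ">seq1 example\nMKV\nLLT\n>seq2\nGAS\n"
records = list(parse_fasta_records(io.StringIO(fasta_text)))

# JSONL: one JSON object per line.
jsonl_text = '{"id": "seq1", "sequence": "MKVLLT"}\n{"id": "seq2", "sequence": "GAS"}\n'
rows = [json.loads(line) for line in jsonl_text.splitlines()]

print(records)  # [('seq1 example', 'MKVLLT'), ('seq2', 'GAS')]
```

Arrow files, by contrast, are typically read through `pyarrow` or the Hugging Face `datasets` library rather than parsed by hand.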
datasets/openproteinset/utils.py

Lines changed: 4 additions & 4 deletions
@@ -1,12 +1,12 @@
 import glob
-from tqdm import tqdm
 import os
-import pandas as pd
-import numpy as np
-from sequence_models.utils import parse_fasta
 import subprocess
 from multiprocessing.pool import ThreadPool
 
+import pandas as pd
+from sequence_models.utils import parse_fasta
+from tqdm import tqdm
+
 
 def make_uniref_fasta_splits(alignments):
     for x in alignments:

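The change above regroups the imports in PEP 8 order (standard library first, a blank line, then third-party packages, each group sorted by module) and drops the unused `numpy` import. A minimal sketch of that grouping rule; the stdlib set is hard-coded here for illustration, whereas a tool like isort derives it automatically:

```python
# Hard-coded stdlib names for illustration only.
STDLIB = {"glob", "os", "subprocess", "multiprocessing"}

def top_level_module(import_line):
    """'from a.b import c' or 'import a.b as x' -> 'a'."""
    parts = import_line.split()
    name = parts[1] if parts[0] in ("import", "from") else parts[0]
    return name.split(".")[0]

def regroup(lines):
    """Return imports as a stdlib group, a blank line, then third-party."""
    std = sorted((l for l in lines if top_level_module(l) in STDLIB),
                 key=top_level_module)
    third = sorted((l for l in lines if top_level_module(l) not in STDLIB),
                   key=top_level_module)
    return std + [""] + third

lines = [
    "import glob",
    "from tqdm import tqdm",
    "import os",
    "import pandas as pd",
    "from sequence_models.utils import parse_fasta",
]
print("\n".join(regroup(lines)))
```

Run on the file's original imports, this reproduces the ordering the commit arrives at: `pandas`, `sequence_models`, `tqdm` after the stdlib block.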
pyproject.toml

Lines changed: 33 additions & 51 deletions
@@ -33,64 +33,46 @@ classifiers = [
     "Operating System :: OS Independent"
 ]
 
-dependencies = [ #TODO: can add optional dependencies
-    # Basics
-    'pandas~=2.2.3',
-    'numpy~=1.26.4',
-    # 'lmdb~=1.4.1',
-    'mdanalysis~=2.7.0',
-    'python-dotenv~=1.0.1', # to load env variables
-    'matplotlib~=3.10.1',
-    'seaborn~=0.13.2',
-    'h5py~=3.13.0',
-    'scikit-learn~=1.5.0',
-    'scipy~=1.15.2',
-
-    # Pytorch
-    'torch-geometric~=2.5.2',
-    'torch-scatter~=2.1.2',
-    'transformers~=4.42.4',
-    'datasets~=3.2.0', # for HF datasets
-    # 'causal-conv1d>=1.4.0', # For jamba/mamba
-    # 'mamba-ssm~=2.2.4', # For jamba/mamba
-    # 'flash-attn~=2.7.4.post1', # Flash Attention
-
-    # Bio
-    'biopython~=1.83',
-    # 'biotite~=0.40.0',
-    'blosum~=2.0.3',
-    'fair-esm~=2.0.0',
-    'evodiff~=1.1.0',
-    'sequence-models~=1.8.0',
-
-
-    # logging / debugging
-    # 'mlflow',
-    'pdb-tools~=2.5.0',
-    'wandb~=0.16.6',
-    'tqdm~=4.67.1',
-
-
-    # Huggingface Hub
-    'ijson~=3.3.0',
-    'pyfastx~=2.2.0',
-    'huggingface_hub~=0.27.1',
-
-    # Azure
-    'azure-identity~=1.21.0'
+# only the packages imported by dayhoff/:
+dependencies = [
+    "numpy>=1.26",
+    "torch>=2.7",
+    "transformers>=4.49",
+    "pandas>=2.3",
+    "biopython>=1.85",
+    "sequence-models>=1.8",
+    "scipy>=1.13",
 ]
 
-# [project.optional-dependencies]
-
+[project.optional-dependencies]
+# everything else in your monorepo lives here
+full = [
+    "mdanalysis>=2.7",
+    "python-dotenv>=1.0",
+    "matplotlib>=3.10",
+    "seaborn>=0.13",
+    "h5py>=3.13",
+    "scikit-learn>=1.5",
+    "torch-geometric>=2.5",
+    "torch-scatter>=2.1",
+    "datasets>=3.2",
+    "blosum>=2.0",
+    "fair-esm>=2.0",
+    "evodiff>=1.1",
+    "pdb-tools>=2.5",
+    "wandb>=0.16",
+    "tqdm>=4.67",
+    "ijson>=3.3",
+    "pyfastx>=2.2",
+    "huggingface_hub>=0.27",
+    "azure-identity>=1.21",
+]
 
 [project.urls]
 Homepage = "https://github.com/microsoft/dayhoff"
 Repository = "https://github.com/microsoft/dayhoff"
 Issues = "https://github.com/microsoft/dayhoff/issues"
-
-# May need to remove this
-HuggingFaceDatasets = "https://huggingface.co/datasets/microsoft/DayhoffDataset"
-HuggingFaceModels = "https://huggingface.co/microsoft/Dayhoff"
+HuggingFace = "https://huggingface.co/collections/microsoft/dayhoff-atlas-6866d679465a2685b06ee969"
 
 
 [tool.setuptools.packages.find]
