freddolino-lab · vivianra · Jan 30, 2026 · Mar 10, 2026 · Mar 16, 2026
diff --git a/README.md b/README.md
@@ -11,7 +11,9 @@ space, ShapeME instead identifies motifs in local DNA structure space using
 sequence-based predictions of DNA structure parameters including minor groove
 width, electrostatic potential, helical twist, propeller twist, and roll. DNA
 structural parameters are predicted from input sequences using `DNAshapeR`
-([Chiu et al. 2016](https://doi.org/10.1093/bioinformatics/btv735)).
+([Chiu et al. 2016](https://doi.org/10.1093/bioinformatics/btv735)) or `Deep DNAshape` 
+([Li et al. 2024](https://www-nature-com.proxy.lib.umich.edu/articles/s41467-024-45191-5)),
+which can be selected as the backend shape prediction.
 
 ShapeME excels at discovering DNA structural motifs that explain
 binary data such as bound and unbound regions on a chromosome for a given
@@ -476,6 +478,7 @@ description of each argument.
                             [--stepsize STEPSIZE] [--opt_niter OPT_NITER]
                             [--alpha ALPHA] [--batch_size BATCH_SIZE]
                             [--find_seq_motifs] [--no_shape_motifs]
+                            [--shape_tool {dnashaper,deepdnashape}]
                             [--seq_fasta SEQ_FASTA]
                             [--seq_motif_positive_cats SEQ_MOTIF_POSITIVE_CATS]
                             [--streme_thresh STREME_THRESH]
@@ -564,6 +567,9 @@ description of each argument.
       --no_shape_motifs     Add this flag to turn off shape motif inference. This
                             is useful if you basically want to use this script as
                             a wrapper for streme to just find sequence motifs.
+      --shape_tool {dnashaper,deepdnashape}
+                            Backend tool used to compute DNA shape tracks. 
+                            'dnashaper' uses DNAshapeR'deepdnashape' uses DeepDNAshape.
       --seq_fasta SEQ_FASTA
                             Name of fasta file (located within data_dir, do not
                             include the directory, just the file name) containing

diff --git a/singularity/deepdnashape_env.yaml b/singularity/deepdnashape_env.yaml
@@ -0,0 +1,13 @@
+name: deepdnashape
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.10
+  - pip
+  - numpy<1.24
+  - tensorflow=2.15.0
+  - h5py
+  - protobuf<5
+  - pip:
+      - wheel
diff --git a/singularity/env_spec.yaml b/singularity/env_spec.yaml
@@ -61,6 +61,7 @@ dependencies:
   - gxx_impl_linux-64=13.2.0=h338b0a0_2
   - harfbuzz=6.0.0=h8e241bc_0
   - icu=70.1=h27087fc_0
+  - ipdb
   - jack=1.9.22=h11f4161_0
   - jinja2=3.1.2=pyhd8ed1ab_1
   - joblib=1.3.2=pyhd8ed1ab_0

diff --git a/singularity/shapeme.def b/singularity/shapeme.def
@@ -19,6 +19,7 @@ Stage: build
 
 %files
     env_spec.yaml
+    deepdnashape_env.yaml
     shapeme.bashrc /shapeme.bashrc
 
 %runscript
@@ -29,10 +30,15 @@ Stage: build
     && apt update -y \
     && apt upgrade -y \
     && apt install -y libffi-dev \
+    && apt install -y libffi-dev git ca-certificates \
     && conda update -y -n base conda \
     && conda install -c conda-forge mamba \
     && mamba env create -f env_spec.yaml \
-    && cd 
+    && mamba env create -f deepdnashape_env.yaml \
+    && cd /src \
+    && git clone https://github.com/JinsenLi/deepDNAshape.git \
+    && /opt/conda/envs/deepdnashape/bin/python -m pip install --no-deps ./deepDNAshape \
+    && rm -rf /src/deepDNAshape
 
     CUSTOM_ENV=/.singularity.d/env/99-zz_custom_env.sh
     cat >$CUSTOM_ENV <<EOF
@@ -46,3 +52,16 @@ EOF
     source /opt/conda/etc/profile.d/conda.sh
     conda activate motifer
     python -c "import numpy; print(numpy.__file__)"
+
+    echo "motifer python:"
+    /opt/conda/envs/motifer/bin/python -c "import sys; print(sys.executable)"
+
+    echo "motifer numpy:"
+    /opt/conda/envs/motifer/bin/python -c "import numpy as np; print('motifer numpy', np.__version__); print(np.__file__)"
+
+    echo "deepdnashape python:"
+    /opt/conda/envs/deepdnashape/bin/python -c "import sys; print(sys.executable)"
+
+    echo "deepdnashape numpy + tensorflow:"
+    /opt/conda/envs/deepdnashape/bin/python -c "import numpy as np; import tensorflow as tf; print('deepdnashape numpy', np.__version__); print('tf', tf.__version__)"
+
diff --git a/src/python3/ShapeME.py b/src/python3/ShapeME.py
@@ -406,6 +406,9 @@ def parse_args():
         help=f"Add this flag to turn off shape motif inference. "\
             f"This is useful if you basically want to use this script "\
             f"as a wrapper for streme to just find sequence motifs.")
+    infer.add_argument("--shape_tool", choices=["dnashaper", "deepdnashape"], default="dnashaper",
+        help=f"Backend tool used to compute DNA shape tracks. 'dnashaper' uses DNAshapeR"
+            f"'deepdnashape' uses DeepDNAshape.")
     infer.add_argument("--seq_fasta", type=str, default=None,
         help=f"Name of fasta file (located within data_dir, do not include the "\
             f"directory, just the file name) containing sequences in which to "\
@@ -479,6 +482,42 @@ def set_outdir_pref(no_shape_motifs, find_seq_motifs):
             sys.exit(1)
     return outdir_pre
 
+# functions newly added to account for the shape tool to be specified
+def shape_file_list_str(seq_fasta: str, shape_names):
+    return "".join([f"{seq_fasta}.{s} " for s in shape_names])
+
+def run_shape_prediction(seq_fasta: str, shape_tool: str):
+    """
+    Generate per-shape files alongside seq_fasta.
+    """
+
+    if shape_tool.lower() == "dnashaper":
+        convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
+    elif shape_tool.lower() == "deepdnashape":
+        convert = f"/opt/conda/envs/deepdnashape/bin/python {this_path}/utils/calc_deepdnashape.py {seq_fasta}"
+    else:
+        logging.error(f"ERROR: Unrecognized --shape_tool: {shape_tool}")
+        sys.exit(1)
+
+    convert_result = subprocess.run(
+        convert,
+        shell=True,
+        capture_output=True,
+    )
+
+    if convert_result.returncode != 0:
+        msg = (
+            f"ERROR: running the following command:\n\n"
+            f"{convert}\n\n"
+            f"resulted in the following stderr:\n\n"
+            f"{convert_result.stderr.decode()}\n\n"
+            f"and the following stdout:\n\n"
+            f"{convert_result.stdout.decode()}")
+        logging.error(msg)
+        sys.exit(1)
+    else:
+        logging.info("Converting sequences to shapes ran without error")
+
 
 def infer(args):
     data_dir = args.data_dir
@@ -582,27 +621,30 @@ def infer(args):
     ## in future versions, use this conversion in fold splitting.
     #############################################################
     #############################################################
-    convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
-    convert_result = subprocess.run(
-        convert,
-        shell=True,
-        capture_output=True,
-        #check=True,
-    )
-    if convert_result.returncode != 0:
-        msg = f"ERROR: running the following command:\n\n"\
-            f"{convert}\n\n"\
-            f"resulted in the following stderr:\n\n"\
-            f"{convert_result.stderr.decode()}\n\n"\
-            f"and the following stdout:\n\n"\
-            f"{convert_result.stdout.decode()}"
-        logging.error(msg)
-        sys.exit(1)
-    else:
-        logging.info("Converting training sequences to shapes ran without error")
-        full_shape_fnames = ""
-        for shape_name in shape_names:
-            full_shape_fnames += f"{seq_fasta}.{shape_name} "
+    #convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
+    #convert_result = subprocess.run(
+    #    convert,
+    #    shell=True,
+    #    capture_output=True,
+    #    #check=True,
+    #)
+    #if convert_result.returncode != 0:
+    #    msg = f"ERROR: running the following command:\n\n"\
+    #        f"{convert}\n\n"\
+    #        f"resulted in the following stderr:\n\n"\
+    #        f"{convert_result.stderr.decode()}\n\n"\
+    #        f"and the following stdout:\n\n"\
+    #        f"{convert_result.stdout.decode()}"
+    #    logging.error(msg)
+    #    sys.exit(1)
+    #else:
+    #    logging.info("Converting training sequences to shapes ran without error")
+    #    full_shape_fnames = ""
+    #    for shape_name in shape_names:
+    #        full_shape_fnames += f"{seq_fasta}.{shape_name} "
+
+    run_shape_prediction(seq_fasta, args.shape_tool)
+    full_shape_fnames = shape_file_list_str(seq_fasta, shape_names)
 
     out_dir = os.path.join(data_dir, f"{outdir_pre}_main_output")
     tmpdir = os.path.join(out_dir, "tmp")
@@ -920,47 +962,49 @@ def infer(args):
                 test_score_file.write(f"\n{name}\t{yval}")
             test_score_file.close()
 
-            convert = f"Rscript {this_path}/utils/calc_shape.R {train_seq_fasta}"
-            #convert = shlex.quote(convert)
-            convert_result = subprocess.run(
-                convert,
-                shell=True,
-                capture_output=True,
-                #check=True,
-            )
-            if convert_result.returncode != 0:
-                logging.error(
-                    f"ERROR: running the following command:\n\n"\
-                    f"{convert}\n\n"\
-                    f"resulted in the following stderr:\n\n"\
-                    f"{convert_result.stderr.decode()}\n\n"
-                    f"and the following stdout:\n\n"\
-                    f"{convert_result.stdout.decode()}"
-                )
-                sys.exit(1)
-            else:
-                logging.info("Converting training sequences to shapes ran without error")
-
-            convert = f"Rscript {this_path}/utils/calc_shape.R {test_seq_fasta}"
-            #convert = shlex.quote(convert)
-            convert_result = subprocess.run(
-                convert,
-                shell=True,
-                capture_output=True,
-                #check=True,
-            )
-            if convert_result.returncode != 0:
-                logging.error(
-                    f"ERROR: running the following command:\n\n"\
-                    f"{convert}\n\n"\
-                    f"resulted in the following stderr:\n\n"\
-                    f"{convert_result.stderr.decode()}\n\n"
-                    f"and the following stdout:\n\n"\
-                    f"{convert_result.stdout.decode()}"
-                )
-                sys.exit(1)
-            else:
-                logging.info("Converting testing sequences to shapes ran without error")
+            run_shape_prediction(train_seq_fasta, args.shape_tool)
+            #convert = f"Rscript {this_path}/utils/calc_shape.R {train_seq_fasta}"
+            ##convert = shlex.quote(convert)
+            #convert_result = subprocess.run(
+            #    convert,
+            #    shell=True,
+            #    capture_output=True,
+            #    #check=True,
+            #)
+            #if convert_result.returncode != 0:
+            #    logging.error(
+            #        f"ERROR: running the following command:\n\n"\
+            #        f"{convert}\n\n"\
+            #        f"resulted in the following stderr:\n\n"\
+            #        f"{convert_result.stderr.decode()}\n\n"
+            #        f"and the following stdout:\n\n"\
+            #        f"{convert_result.stdout.decode()}"
+            #    )
+            #    sys.exit(1)
+            #else:
+            #    logging.info("Converting training sequences to shapes ran without error")
+
+            run_shape_prediction(test_seq_fasta, args.shape_tool)
+            #convert = f"Rscript {this_path}/utils/calc_shape.R {test_seq_fasta}"
+            ##convert = shlex.quote(convert)
+            #convert_result = subprocess.run(
+            #    convert,
+            #    shell=True,
+            #    capture_output=True,
+            #    #check=True,
+            #)
+            #if convert_result.returncode != 0:
+            #    logging.error(
+            #        f"ERROR: running the following command:\n\n"\
+            #        f"{convert}\n\n"\
+            #        f"resulted in the following stderr:\n\n"\
+            #        f"{convert_result.stderr.decode()}\n\n"
+            #        f"and the following stdout:\n\n"\
+            #        f"{convert_result.stdout.decode()}"
+            #    )
+            #    sys.exit(1)
+            #else:
+            #    logging.info("Converting testing sequences to shapes ran without error")
 
             INFER_EXE = f"python {this_path}/infer_motifs.py "\
                 f"--score_file {train_score_fname} "\

diff --git a/src/python3/create_synthetic_data.py b/src/python3/create_synthetic_data.py
@@ -199,14 +199,22 @@ def main():
                         help="The type of data to be output.")
     parser.add_argument('--ncats', action='store', type=int, default=2, help="The number of categories containing motifs (see --pivot-category if you want to add one extra category without any motif).")
     parser.add_argument('--pivot-category', action='store_true', default=False, help="If this flag is set, an extra category will be present in the synthetic data in which no motif is expected to be enriched. Ignored if ncats=1, since in that case, cat 1 will have the motif and cat 0 will not.")
+    parser.add_argument('--cat_weights', action='store', type=float, nargs='+', default=None, 
+                        help="Class label sampling weights (class prevalence). For ncats=2, defaults to 0.80 class 0 and 0.20 class 1. example: --cat-weights 0.80 0.20")
     parser.add_argument('--folds', action='store', type=int, default=5, help="The number of folds for k-fold CV.")
+    parser.add_argument('--seed', action='store', type=int, default=None,
+                    help="Random seed for reproducibility")
     parser.add_argument('--noshapes', action='store_true', help="include at command line if you only want sequences produced.")
 
     args = parser.parse_args()
+
+    if args.seed is not None:
+        np.random.seed(args.seed)
 
     folds = args.folds
     dtype = args.dtype
     ncats = args.ncats
+    cat_weights=args.cat_weights
     out_dir = args.outdir
     seq_len = args.seqlen
     rec_num = args.recnum
@@ -229,9 +237,23 @@ def main():
 
     if dtype == 'categorical':
 
+        # determine how many categories we actually generate (pivot adds one extra)
+        tmp_cats = ncats + 1 if args.pivot_category else ncats
+
+        if cat_weights is not None:
+            if len(cat_weights) != tmp_cats:
+                raise ValueError(f"--cat-weights must have length {tmp_cats} (got {len(cat_weights)})")
+            cat_weights = np.array(cat_weights, dtype=float)
+            if np.any(cat_weights < 0):
+                raise ValueError("--cat-weights must be non-negative")
+            s = cat_weights.sum()
+            if s <= 0:
+                raise ValueError("--cat-weights must sum to a positive value")
+            cat_weights = (cat_weights / s).tolist()
+
         if ncats == 2:
             # weights as [0.75, 0.25] will give about 3x as many 0's as 1's
-            y_vals = make_categorical_y_vals(rec_num, weights=[0.8, 0.2], n_cats=ncats)
+            y_vals = make_categorical_y_vals(rec_num, weights=cat_weights if cat_weights is not None else [0.8, 0.2], n_cats=tmp_cats)
             # fa_seqs modified in-place here to include the motif at a 
             #  randomly chosen site in each record where y_val is cat
             substitute_motif_into_records(
@@ -250,7 +272,7 @@ def main():
             if args.pivot_category:
                 ncats += 1
             # approximately even number of records per category
-            y_vals = make_categorical_y_vals(rec_num, n_cats=ncats)
+            y_vals = make_categorical_y_vals(rec_num, weights=cat_weights, n_cats=tmp_cats)
             distinct_cats = np.unique(y_vals)
             if args.pivot_category:
                 distinct_cats = distinct_cats[:-1]

diff --git a/src/python3/inout.py b/src/python3/inout.py
@@ -226,7 +226,7 @@ def parse_kmac_files(list_fname):
 
     return motif_list,fnames
 
-def parse_meme_file(fname, evalue_thresh=np.Inf):
+def parse_meme_file(fname, evalue_thresh=np.inf):
 
     motif_list = []
 
@@ -1296,7 +1296,7 @@ def get_enrichments(self, categories, cat_inds):
                 (enr["test_stats"][i,j], enr["pvals"][i,j]) = result.tuple
 
         enr["log2_ratio"] = np.log2(
-            np.clip(enr["ratio"], EPSILON, np.Inf)
+            np.clip(enr["ratio"], EPSILON, np.inf)
         )
 
     def create_data_header_line(self):
@@ -1445,7 +1445,7 @@ def parse_robustness_output(output):
     robustness = (passes, attempts)
     zscore = float(zscore_pat.search(output).group())
     if zscore == "inf":
-        zscore = np.Inf
+        zscore = np.inf
 
     return (mi, robustness, zscore)
 
@@ -1454,7 +1454,7 @@ class Motifs:
 
     def __init__(
             self, fname=None, motif_type=None, shape_lut=None,
-            max_count=None, alt_name_base=None, evalue_thresh=np.Inf,
+            max_count=None, alt_name_base=None, evalue_thresh=np.inf,
     ):
 
         self.X = None