Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ space, ShapeME instead identifies motifs in local DNA structure space using
sequence-based predictions of DNA structure parameters including minor groove
width, electrostatic potential, helical twist, propeller twist, and roll. DNA
structural parameters are predicted from input sequences using `DNAshapeR`
([Chiu et al. 2016](https://doi.org/10.1093/bioinformatics/btv735)).
([Chiu et al. 2016](https://doi.org/10.1093/bioinformatics/btv735)) or `Deep DNAshape`
([Li et al. 2024](https://www-nature-com.proxy.lib.umich.edu/articles/s41467-024-45191-5)),
which can be selected as the backend shape prediction.

ShapeME excels at discovering DNA structural motifs that explain
binary data such as bound and unbound regions on a chromosome for a given
Expand Down Expand Up @@ -476,6 +478,7 @@ description of each argument.
[--stepsize STEPSIZE] [--opt_niter OPT_NITER]
[--alpha ALPHA] [--batch_size BATCH_SIZE]
[--find_seq_motifs] [--no_shape_motifs]
[--shape_tool {dnashaper,deepdnashape}]
[--seq_fasta SEQ_FASTA]
[--seq_motif_positive_cats SEQ_MOTIF_POSITIVE_CATS]
[--streme_thresh STREME_THRESH]
Expand Down Expand Up @@ -564,6 +567,9 @@ description of each argument.
--no_shape_motifs Add this flag to turn off shape motif inference. This
is useful if you basically want to use this script as
a wrapper for streme to just find sequence motifs.
--shape_tool {dnashaper,deepdnashape}
Backend tool used to compute DNA shape tracks.
'dnashaper' uses DNAshapeR'deepdnashape' uses DeepDNAshape.
--seq_fasta SEQ_FASTA
Name of fasta file (located within data_dir, do not
include the directory, just the file name) containing
Expand Down
13 changes: 13 additions & 0 deletions singularity/deepdnashape_env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
name: deepdnashape
channels:
- conda-forge
- defaults
dependencies:
- python=3.10
- pip
- numpy<1.24
- tensorflow=2.15.0
- h5py
- protobuf<5
- pip:
- wheel
1 change: 1 addition & 0 deletions singularity/env_spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ dependencies:
- gxx_impl_linux-64=13.2.0=h338b0a0_2
- harfbuzz=6.0.0=h8e241bc_0
- icu=70.1=h27087fc_0
- ipdb
- jack=1.9.22=h11f4161_0
- jinja2=3.1.2=pyhd8ed1ab_1
- joblib=1.3.2=pyhd8ed1ab_0
Expand Down
21 changes: 20 additions & 1 deletion singularity/shapeme.def
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Stage: build

%files
env_spec.yaml
deepdnashape_env.yaml
shapeme.bashrc /shapeme.bashrc

%runscript
Expand All @@ -29,10 +30,15 @@ Stage: build
&& apt update -y \
&& apt upgrade -y \
&& apt install -y libffi-dev \
&& apt install -y libffi-dev git ca-certificates \
&& conda update -y -n base conda \
&& conda install -c conda-forge mamba \
&& mamba env create -f env_spec.yaml \
&& cd
&& mamba env create -f deepdnashape_env.yaml \
&& cd /src \
&& git clone https://github.com/JinsenLi/deepDNAshape.git \
&& /opt/conda/envs/deepdnashape/bin/python -m pip install --no-deps ./deepDNAshape \
&& rm -rf /src/deepDNAshape

CUSTOM_ENV=/.singularity.d/env/99-zz_custom_env.sh
cat >$CUSTOM_ENV <<EOF
Expand All @@ -46,3 +52,16 @@ EOF
source /opt/conda/etc/profile.d/conda.sh
conda activate motifer
python -c "import numpy; print(numpy.__file__)"

echo "motifer python:"
/opt/conda/envs/motifer/bin/python -c "import sys; print(sys.executable)"

echo "motifer numpy:"
/opt/conda/envs/motifer/bin/python -c "import numpy as np; print('motifer numpy', np.__version__); print(np.__file__)"

echo "deepdnashape python:"
/opt/conda/envs/deepdnashape/bin/python -c "import sys; print(sys.executable)"

echo "deepdnashape numpy + tensorflow:"
/opt/conda/envs/deepdnashape/bin/python -c "import numpy as np; import tensorflow as tf; print('deepdnashape numpy', np.__version__); print('tf', tf.__version__)"

168 changes: 106 additions & 62 deletions src/python3/ShapeME.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,9 @@ def parse_args():
help=f"Add this flag to turn off shape motif inference. "\
f"This is useful if you basically want to use this script "\
f"as a wrapper for streme to just find sequence motifs.")
infer.add_argument("--shape_tool", choices=["dnashaper", "deepdnashape"], default="dnashaper",
help=f"Backend tool used to compute DNA shape tracks. 'dnashaper' uses DNAshapeR"
f"'deepdnashape' uses DeepDNAshape.")
infer.add_argument("--seq_fasta", type=str, default=None,
help=f"Name of fasta file (located within data_dir, do not include the "\
f"directory, just the file name) containing sequences in which to "\
Expand Down Expand Up @@ -479,6 +482,42 @@ def set_outdir_pref(no_shape_motifs, find_seq_motifs):
sys.exit(1)
return outdir_pre

# functions newly added to account for the shape tool to be specified
def shape_file_list_str(seq_fasta: str, shape_names):
return "".join([f"{seq_fasta}.{s} " for s in shape_names])

def run_shape_prediction(seq_fasta: str, shape_tool: str):
"""
Generate per-shape files alongside seq_fasta.
"""

if shape_tool.lower() == "dnashaper":
convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
elif shape_tool.lower() == "deepdnashape":
convert = f"/opt/conda/envs/deepdnashape/bin/python {this_path}/utils/calc_deepdnashape.py {seq_fasta}"
else:
logging.error(f"ERROR: Unrecognized --shape_tool: {shape_tool}")
sys.exit(1)

convert_result = subprocess.run(
convert,
shell=True,
capture_output=True,
)

if convert_result.returncode != 0:
msg = (
f"ERROR: running the following command:\n\n"
f"{convert}\n\n"
f"resulted in the following stderr:\n\n"
f"{convert_result.stderr.decode()}\n\n"
f"and the following stdout:\n\n"
f"{convert_result.stdout.decode()}")
logging.error(msg)
sys.exit(1)
else:
logging.info("Converting sequences to shapes ran without error")


def infer(args):
data_dir = args.data_dir
Expand Down Expand Up @@ -582,27 +621,30 @@ def infer(args):
## in future versions, use this conversion in fold splitting.
#############################################################
#############################################################
convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
convert_result = subprocess.run(
convert,
shell=True,
capture_output=True,
#check=True,
)
if convert_result.returncode != 0:
msg = f"ERROR: running the following command:\n\n"\
f"{convert}\n\n"\
f"resulted in the following stderr:\n\n"\
f"{convert_result.stderr.decode()}\n\n"\
f"and the following stdout:\n\n"\
f"{convert_result.stdout.decode()}"
logging.error(msg)
sys.exit(1)
else:
logging.info("Converting training sequences to shapes ran without error")
full_shape_fnames = ""
for shape_name in shape_names:
full_shape_fnames += f"{seq_fasta}.{shape_name} "
#convert = f"Rscript {this_path}/utils/calc_shape.R {seq_fasta}"
#convert_result = subprocess.run(
# convert,
# shell=True,
# capture_output=True,
# #check=True,
#)
#if convert_result.returncode != 0:
# msg = f"ERROR: running the following command:\n\n"\
# f"{convert}\n\n"\
# f"resulted in the following stderr:\n\n"\
# f"{convert_result.stderr.decode()}\n\n"\
# f"and the following stdout:\n\n"\
# f"{convert_result.stdout.decode()}"
# logging.error(msg)
# sys.exit(1)
#else:
# logging.info("Converting training sequences to shapes ran without error")
# full_shape_fnames = ""
# for shape_name in shape_names:
# full_shape_fnames += f"{seq_fasta}.{shape_name} "

run_shape_prediction(seq_fasta, args.shape_tool)
full_shape_fnames = shape_file_list_str(seq_fasta, shape_names)

out_dir = os.path.join(data_dir, f"{outdir_pre}_main_output")
tmpdir = os.path.join(out_dir, "tmp")
Expand Down Expand Up @@ -920,47 +962,49 @@ def infer(args):
test_score_file.write(f"\n{name}\t{yval}")
test_score_file.close()

convert = f"Rscript {this_path}/utils/calc_shape.R {train_seq_fasta}"
#convert = shlex.quote(convert)
convert_result = subprocess.run(
convert,
shell=True,
capture_output=True,
#check=True,
)
if convert_result.returncode != 0:
logging.error(
f"ERROR: running the following command:\n\n"\
f"{convert}\n\n"\
f"resulted in the following stderr:\n\n"\
f"{convert_result.stderr.decode()}\n\n"
f"and the following stdout:\n\n"\
f"{convert_result.stdout.decode()}"
)
sys.exit(1)
else:
logging.info("Converting training sequences to shapes ran without error")

convert = f"Rscript {this_path}/utils/calc_shape.R {test_seq_fasta}"
#convert = shlex.quote(convert)
convert_result = subprocess.run(
convert,
shell=True,
capture_output=True,
#check=True,
)
if convert_result.returncode != 0:
logging.error(
f"ERROR: running the following command:\n\n"\
f"{convert}\n\n"\
f"resulted in the following stderr:\n\n"\
f"{convert_result.stderr.decode()}\n\n"
f"and the following stdout:\n\n"\
f"{convert_result.stdout.decode()}"
)
sys.exit(1)
else:
logging.info("Converting testing sequences to shapes ran without error")
run_shape_prediction(train_seq_fasta, args.shape_tool)
#convert = f"Rscript {this_path}/utils/calc_shape.R {train_seq_fasta}"
##convert = shlex.quote(convert)
#convert_result = subprocess.run(
# convert,
# shell=True,
# capture_output=True,
# #check=True,
#)
#if convert_result.returncode != 0:
# logging.error(
# f"ERROR: running the following command:\n\n"\
# f"{convert}\n\n"\
# f"resulted in the following stderr:\n\n"\
# f"{convert_result.stderr.decode()}\n\n"
# f"and the following stdout:\n\n"\
# f"{convert_result.stdout.decode()}"
# )
# sys.exit(1)
#else:
# logging.info("Converting training sequences to shapes ran without error")

run_shape_prediction(test_seq_fasta, args.shape_tool)
#convert = f"Rscript {this_path}/utils/calc_shape.R {test_seq_fasta}"
##convert = shlex.quote(convert)
#convert_result = subprocess.run(
# convert,
# shell=True,
# capture_output=True,
# #check=True,
#)
#if convert_result.returncode != 0:
# logging.error(
# f"ERROR: running the following command:\n\n"\
# f"{convert}\n\n"\
# f"resulted in the following stderr:\n\n"\
# f"{convert_result.stderr.decode()}\n\n"
# f"and the following stdout:\n\n"\
# f"{convert_result.stdout.decode()}"
# )
# sys.exit(1)
#else:
# logging.info("Converting testing sequences to shapes ran without error")

INFER_EXE = f"python {this_path}/infer_motifs.py "\
f"--score_file {train_score_fname} "\
Expand Down
26 changes: 24 additions & 2 deletions src/python3/create_synthetic_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,14 +199,22 @@ def main():
help="The type of data to be output.")
parser.add_argument('--ncats', action='store', type=int, default=2, help="The number of categories containing motifs (see --pivot-category if you want to add one extra category without any motif).")
parser.add_argument('--pivot-category', action='store_true', default=False, help="If this flag is set, an extra category will be present in the synthetic data in which no motif is expected to be enriched. Ignored if ncats=1, since in that case, cat 1 will have the motif and cat 0 will not.")
parser.add_argument('--cat_weights', action='store', type=float, nargs='+', default=None,
help="Class label sampling weights (class prevalence). For ncats=2, defaults to 0.80 class 0 and 0.20 class 1. example: --cat-weights 0.80 0.20")
parser.add_argument('--folds', action='store', type=int, default=5, help="The number of folds for k-fold CV.")
parser.add_argument('--seed', action='store', type=int, default=None,
help="Random seed for reproducibility")
parser.add_argument('--noshapes', action='store_true', help="include at command line if you only want sequences produced.")

args = parser.parse_args()

if args.seed is not None:
np.random.seed(args.seed)

folds = args.folds
dtype = args.dtype
ncats = args.ncats
cat_weights=args.cat_weights
out_dir = args.outdir
seq_len = args.seqlen
rec_num = args.recnum
Expand All @@ -229,9 +237,23 @@ def main():

if dtype == 'categorical':

# determine how many categories we actually generate (pivot adds one extra)
tmp_cats = ncats + 1 if args.pivot_category else ncats

if cat_weights is not None:
if len(cat_weights) != tmp_cats:
raise ValueError(f"--cat-weights must have length {tmp_cats} (got {len(cat_weights)})")
cat_weights = np.array(cat_weights, dtype=float)
if np.any(cat_weights < 0):
raise ValueError("--cat-weights must be non-negative")
s = cat_weights.sum()
if s <= 0:
raise ValueError("--cat-weights must sum to a positive value")
cat_weights = (cat_weights / s).tolist()

if ncats == 2:
# weights as [0.75, 0.25] will give about 3x as many 0's as 1's
y_vals = make_categorical_y_vals(rec_num, weights=[0.8, 0.2], n_cats=ncats)
y_vals = make_categorical_y_vals(rec_num, weights=cat_weights if cat_weights is not None else [0.8, 0.2], n_cats=tmp_cats)
# fa_seqs modified in-place here to include the motif at a
# randomly chosen site in each record where y_val is cat
substitute_motif_into_records(
Expand All @@ -250,7 +272,7 @@ def main():
if args.pivot_category:
ncats += 1
# approximately even number of records per category
y_vals = make_categorical_y_vals(rec_num, n_cats=ncats)
y_vals = make_categorical_y_vals(rec_num, weights=cat_weights, n_cats=tmp_cats)
distinct_cats = np.unique(y_vals)
if args.pivot_category:
distinct_cats = distinct_cats[:-1]
Expand Down
8 changes: 4 additions & 4 deletions src/python3/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def parse_kmac_files(list_fname):

return motif_list,fnames

def parse_meme_file(fname, evalue_thresh=np.Inf):
def parse_meme_file(fname, evalue_thresh=np.inf):

motif_list = []

Expand Down Expand Up @@ -1296,7 +1296,7 @@ def get_enrichments(self, categories, cat_inds):
(enr["test_stats"][i,j], enr["pvals"][i,j]) = result.tuple

enr["log2_ratio"] = np.log2(
np.clip(enr["ratio"], EPSILON, np.Inf)
np.clip(enr["ratio"], EPSILON, np.inf)
)

def create_data_header_line(self):
Expand Down Expand Up @@ -1445,7 +1445,7 @@ def parse_robustness_output(output):
robustness = (passes, attempts)
zscore = float(zscore_pat.search(output).group())
if zscore == "inf":
zscore = np.Inf
zscore = np.inf

return (mi, robustness, zscore)

Expand All @@ -1454,7 +1454,7 @@ class Motifs:

def __init__(
self, fname=None, motif_type=None, shape_lut=None,
max_count=None, alt_name_base=None, evalue_thresh=np.Inf,
max_count=None, alt_name_base=None, evalue_thresh=np.inf,
):

self.X = None
Expand Down
Loading