Skip to content

Commit 3ce089d

Browse files
authored
Merge pull request #19 from PennChopMicrobiomeProgram/codex/add-option-to-run-backup-despite-checks
Add --allow-check-failures CLI flag to continue backup on validation failures
2 parents f78a792 + 60d38db commit 3ce089d

2 files changed

Lines changed: 91 additions & 15 deletions

File tree

seqBackupLib/backup.py

Lines changed: 38 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def backup_fastq(
4747
sample_sheet_fp: Path,
4848
has_index: bool,
4949
min_file_size: int,
50+
allow_check_failures: bool = False,
5051
):
5152

5253
R1 = IlluminaFastq(gzip.open(forward_reads, mode="rt"))
@@ -58,25 +59,47 @@ def backup_fastq(
5859
illumina_fastqs = [IlluminaFastq(gzip.open(fp, mode="rt")) for fp in RI_fps]
5960
r1 = illumina_fastqs[0]
6061

61-
if not all([ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]):
62+
fp_vs_content_results = [ifq.check_fp_vs_content()[0] for ifq in illumina_fastqs]
63+
if not all(fp_vs_content_results):
6264
[ifq.check_fp_vs_content(verbose=True) for ifq in illumina_fastqs]
63-
raise ValueError(
65+
message = (
6466
"The file path and header information don't match",
65-
[str(ifq) for ifq in illumina_fastqs if not ifq.check_fp_vs_content()[0]],
67+
[
68+
str(ifq)
69+
for ifq, ok in zip(illumina_fastqs, fp_vs_content_results)
70+
if not ok
71+
],
6672
)
67-
if not all([ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]):
68-
raise ValueError(
69-
"File seems suspiciously small. Please check if you have the correct file or lower the minimum file size threshold",
70-
[ifq.check_file_size(min_file_size) for ifq in illumina_fastqs],
73+
if allow_check_failures:
74+
warnings.warn(f"{message[0]}: {message[1]}")
75+
else:
76+
raise ValueError(*message)
77+
file_size_results = [ifq.check_file_size(min_file_size) for ifq in illumina_fastqs]
78+
if not all(file_size_results):
79+
message = (
80+
"File seems suspiciously small. Please check if you have the correct file or"
81+
" lower the minimum file size threshold",
82+
file_size_results,
7183
)
84+
if allow_check_failures:
85+
warnings.warn(f"{message[0]}: {message[1]}")
86+
else:
87+
raise ValueError(*message)
7288
if not all([ifq.check_index_read_exists() for ifq in illumina_fastqs]):
7389
warnings.warn(
7490
"No barcodes in headers. Were the fastq files generated properly?"
7591
)
7692

7793
# parse the info from the headers in EACH file and check they are consistent with each other
78-
if not all([fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs]):
79-
raise ValueError("The files are not from the same run.")
94+
same_run_results = [
95+
fastq.is_same_run(illumina_fastqs[0]) for fastq in illumina_fastqs
96+
]
97+
if not all(same_run_results):
98+
message = "The files are not from the same run."
99+
if allow_check_failures:
100+
warnings.warn(message)
101+
else:
102+
raise ValueError(message)
80103

81104
## Archiving steps
82105

@@ -144,13 +167,19 @@ def main(argv=None):
144167
default=DEFAULT_MIN_FILE_SIZE,
145168
help="Minimum file size to register in bytes",
146169
)
170+
parser.add_argument(
171+
"--allow-check-failures",
172+
action="store_true",
173+
help="Continue archiving even if validation checks fail",
174+
)
147175
args = parser.parse_args(argv)
148176
return backup_fastq(
149177
args.forward_reads,
150178
args.destination_dir,
151179
args.sample_sheet,
152180
not args.no_index,
153181
args.min_file_size,
182+
args.allow_check_failures,
154183
)
155184

156185
# maybe also ask for single or double reads

test/test_backup.py

Lines changed: 53 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,14 @@
11
import pytest
22
from pathlib import Path
3-
from seqBackupLib.backup import (
4-
backup_fastq,
5-
build_fp_to_archive,
6-
return_md5,
7-
main,
8-
)
3+
import gzip
4+
from seqBackupLib.backup import backup_fastq, build_fp_to_archive, return_md5, main
5+
6+
7+
def _write_fastq(fp: Path, header: str) -> None:
8+
sequence = "N" * 10
9+
content = f"{header}\n{sequence}\n+\n{'#' * len(sequence)}\n"
10+
with gzip.open(fp, "wt") as handle:
11+
handle.write(content)
912

1013

1114
def test_build_fp_to_archive():
@@ -132,3 +135,47 @@ def test_main_returns_archive_path(tmp_path, full_miseq_dir):
132135
expected_dir = raw / "250407_M03543_0443_000000000-DTHBL_L001"
133136
assert out_dir == expected_dir
134137
assert expected_dir.is_dir()
138+
139+
140+
def test_allow_check_failures_continues_archive(tmp_path):
141+
run_dir = tmp_path / "240101_M01234_0001_ABCDEFGX"
142+
run_dir.mkdir(parents=True, exist_ok=True)
143+
sample_sheet_fp = run_dir / "sample_sheet.csv"
144+
sample_sheet_fp.write_text(
145+
"[Header]\nIEMFileVersion,4\n[Data]\nSample_ID,Sample_Name\nS1,S1\n"
146+
)
147+
148+
header = "@M01234:1:ZZZZZZ:1:1101:10000:10000 1:N:0:ATCACG"
149+
for name in [
150+
"Undetermined_S0_L001_R1_001.fastq.gz",
151+
"Undetermined_S0_L001_R2_001.fastq.gz",
152+
"Undetermined_S0_L001_I1_001.fastq.gz",
153+
"Undetermined_S0_L001_I2_001.fastq.gz",
154+
]:
155+
_write_fastq(run_dir / name, header)
156+
157+
raw = tmp_path / "raw_reads"
158+
raw.mkdir(parents=True, exist_ok=True)
159+
160+
with pytest.raises(ValueError, match="header information don't match"):
161+
backup_fastq(
162+
run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
163+
raw,
164+
sample_sheet_fp,
165+
True,
166+
1,
167+
)
168+
169+
with pytest.warns(UserWarning, match="header information don't match"):
170+
out_dir = backup_fastq(
171+
run_dir / "Undetermined_S0_L001_R1_001.fastq.gz",
172+
raw,
173+
sample_sheet_fp,
174+
True,
175+
1,
176+
allow_check_failures=True,
177+
)
178+
179+
assert out_dir.is_dir()
180+
md5_fp = out_dir / f"{out_dir.name}.md5"
181+
assert md5_fp.is_file()

0 commit comments

Comments
 (0)