From bf8f9f2c810e267c4f636db5650bfae6be278706 Mon Sep 17 00:00:00 2001 From: ishinder Date: Sun, 7 Jun 2026 08:05:16 -0400 Subject: [PATCH 1/2] Fix CLI/output/packaging issues Adds --version flag, makes input-type detection content-based instead of extension-based, and makes the junction/BAM output options accept a file path for single inputs. Bumps to 2.1.0. - Add --version (CLI11 set_version_flag) sourced from a configured version.hpp generated from the CMake project VERSION, so the git tag, CMake version, --version output and conda package can't drift. A CI job fails on any CMakeLists.txt vs conda/meta.yaml version mismatch. - Detect input type by content, not extension (Galaxy names every dataset *.dat): --bam by magic bytes (BGZF/BAM, CRAM, SAM), --bed by content (BED record vs list of existing paths). Add explicit --bam_list/--bed_list; auto-detected path lists still work but warn. (Issue #2) - Fix is_file_path npos bug so bare output filenames (out.bed, out.bam) are treated as files, not directories. Move it to a shared path_utils helper and use it in OutputWriter. (Issues #1, #3, #4) - --out_original_junctions now accepts a file path for a single input and is implemented for GTF/BED (previously directory-only for BAM and a silent no-op for GTF/BED). (Issues #1, #3) - --out_filtered_bam accepts a file path for a single BAM input; fix generate_bam_output_paths multi-input parent-dir fallback. (Issue #4) - Add GitHub Actions CI (x86_64, linux-aarch64, osx-arm64) + tests for path generation and content detection (incl. a *.dat input case). - Update README output-option docs (file vs directory, the --removed_alignments_bam suffix) and add CHANGELOG. Closes #1 Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yml | 56 +++++++++++++++ CHANGELOG.md | 49 +++++++++++++ CMakeLists.txt | 13 +++- README.md | 33 ++++++--- conda/meta.yaml | 2 +- include/eastr/input_detect.hpp | 21 ++++++ include/eastr/path_utils.hpp | 21 ++++++ include/eastr/types.hpp | 6 +- include/eastr/version.hpp.in | 8 +++ src/input_detect.cpp | 97 +++++++++++++++++++++++++ src/main.cpp | 128 +++++++++++++++++++++++---------- src/output_writer.cpp | 51 +++---------- src/path_utils.cpp | 57 +++++++++++++++ tests/CMakeLists.txt | 6 ++ tests/test_input_detect.cpp | 79 ++++++++++++++++++++ tests/test_path_utils.cpp | 94 ++++++++++++++++++++++++ 16 files changed, 630 insertions(+), 91 deletions(-) create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md create mode 100644 include/eastr/input_detect.hpp create mode 100644 include/eastr/path_utils.hpp create mode 100644 include/eastr/version.hpp.in create mode 100644 src/input_detect.cpp create mode 100644 src/path_utils.cpp create mode 100644 tests/test_input_detect.cpp create mode 100644 tests/test_path_utils.cpp diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..547d0ba --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,56 @@ +name: CI + +on: + push: + branches: [main, master] + pull_request: + +jobs: + build-and-test: + name: ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest # x86_64 + - ubuntu-24.04-arm # linux-aarch64 (AWS Graviton / Galaxy compute) + - macos-latest # osx-arm64 (Apple Silicon) + steps: + - uses: actions/checkout@v4 + with: + submodules: recursive # minimap2 (+ its sse2neon dir) is a submodule + + - name: Install dependencies (Linux) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y cmake g++ zlib1g-dev libhts-dev + + - name: Install dependencies (macOS) + if: runner.os == 'macOS' + run: brew install cmake htslib + + - name: Configure + run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DEASTR_BUILD_TESTS=ON + + - name: Build + run: cmake --build build -j + + - name: Test + run: ctest --test-dir build --output-on-failure + + version-consistency: + name: version consistency + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Check CMake and conda versions match + run: | + cmake_ver=$(sed -n 's/^project(eastr VERSION \([0-9.]*\).*/\1/p' CMakeLists.txt) + conda_ver=$(sed -n 's/.*set version = "\([0-9.]*\)".*/\1/p' conda/meta.yaml) + echo "CMake: $cmake_ver conda: $conda_ver" + if [ "$cmake_ver" != "$conda_ver" ]; then + echo "::error::Version drift: CMakeLists.txt=$cmake_ver conda/meta.yaml=$conda_ver" + exit 1 + fi diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c0a4afa --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,49 @@ +# Changelog + +All notable changes to this project are documented here. This project adheres to +[Semantic Versioning](https://semver.org/). + +## [2.1.0] - unreleased + +### Added +- `--version` flag that prints the version and exits 0. The version is generated + from the CMake `project(... VERSION ...)` (a single source of truth) via a + configured `eastr/version.hpp`, so the git tag, CMake version, `--version` + output, and conda package can no longer drift. A CI check fails on any drift + between `CMakeLists.txt` and `conda/meta.yaml`. +- `--bed_list` / `--bam_list` flags to explicitly declare that the `--bed` / `--bam` + argument is a text file listing input paths (one per line). +- GitHub Actions CI building and testing on `ubuntu-latest` (x86_64), + `ubuntu-24.04-arm` (linux-aarch64), and `macos-latest` (osx-arm64). +- Unit tests for output path generation and content-based input detection, + including a Galaxy-style `*.dat`-named input case and a single-file + `--out_original_junctions` case. + +### Changed +- Input-type detection no longer depends on the file extension. `--bam` is detected + by magic bytes (BGZF/BAM, CRAM, SAM); `--bed` is detected by content (a BED record + vs. a list of existing paths). This fixes inputs named `*.dat` (e.g. every Galaxy + dataset). Auto-detecting a path list still works but now emits a deprecation + warning recommending `--bed_list` / `--bam_list`. `--gtf` already accepted any + filename. (Issue #2) +- `--out_original_junctions` now accepts a **file path** for a single input (BAM, + GTF, or BED) and writes one file, consistent with `--out_removed_junctions` / + `--out_kept_junctions`. It previously behaved as a directory for BAM input and was + a silent no-op for GTF/BED. Directory mode is still used for multi-input runs. + (Issues #1, #3) +- `--out_filtered_bam` now accepts a **file path** for a single BAM input (writes the + filtered BAM directly), keeping directory mode for BAM lists. (Issue #4) + +### Fixed +- Bare output filenames without a directory prefix (e.g. `out.bed`, `out.bam`) were + misinterpreted as directories, causing "Cannot open output file" errors. Both bare + filenames and `dir/out.bed` now work. (Issues #1, #3, #4) +- `--out_filtered_bam` with a file-looking path and multiple BAM inputs previously + produced paths like `out.bam/_EASTR_filtered.bam`; it now falls back to the + parent directory. + +### Notes +- `conda/meta.yaml` `sha256` for the source tarball must be updated when the `v2.1.0` + tag is cut. +- Adding `additional_platforms: [linux-aarch64]` to the Bioconda recipe is a separate + PR in `bioconda-recipes`. diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e4f1ec..e2b9405 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,19 @@ cmake_minimum_required(VERSION 3.16) -project(eastr VERSION 2.0.0 LANGUAGES CXX C) +project(eastr VERSION 2.1.0 LANGUAGES CXX C) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# Generate the version header from the single source of truth (project VERSION), +# so the git tag, CMake version, --version output and conda package can't drift. +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/include/eastr/version.hpp.in + ${CMAKE_CURRENT_BINARY_DIR}/generated/eastr/version.hpp + @ONLY +) + # Options option(EASTR_BUILD_TESTS "Build tests" OFF) option(EASTR_USE_OPENMP "Use OpenMP for parallelization" OFF) @@ -135,6 +143,8 @@ add_library(eastr_lib STATIC src/junction.cpp src/bed_parser.cpp src/gtf_parser.cpp + src/path_utils.cpp + src/input_detect.cpp src/fasta_index.cpp src/junction_extractor.cpp src/self_aligner.cpp @@ -151,6 +161,7 @@ add_library(eastr_lib STATIC target_include_directories(eastr_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR}/generated ${HTSLIB_INCLUDE_DIRS} ) diff --git a/README.md b/README.md index 3492a78..926e23a 100644 --- a/README.md +++ b/README.md @@ -131,17 +131,23 @@ eastr --bam input.bam \ |--------|-------------| | `--bam FILE` | BAM file, or text file listing BAM paths (one per line) | | `--gtf FILE` | GTF annotation file | -| `--bed FILE` | BED file with junction coordinates | +| `--bed FILE` | BED file with junction coordinates, or text file listing BED paths | | `-r, --reference FILE` | Reference genome FASTA (required for all input types) | +> Input type is detected by file **content**, not by extension, so any filename +> works (including Galaxy's `*.dat` datasets). A `--bam`/`--bed` argument that is a +> plain-text list of paths is still accepted, but the explicit `--bam_list` / +> `--bed_list` flags are preferred (see below). + ### Common options | Option | Default | Description | |--------|---------|-------------| | `-i, --bowtie2_index PATH` | auto-built | Bowtie2 index prefix (built automatically if not provided) | | `-p INT` | 1 | Number of threads | -| `--out_filtered_bam PATH` | — | Output filtered BAM file or directory | -| `--out_removed_junctions PATH` | stdout | Output spurious junctions (BED format) | +| `--out_filtered_bam PATH` | — | Output filtered BAM. **File** for a single BAM input; **directory** for a BAM list (writes `.bam` per input) | +| `--out_removed_junctions FILE` | stdout | Output spurious junctions (BED format) | +| `--version` | — | Print the version and exit | | `--verbose` | off | Show progress information |
@@ -168,14 +174,25 @@ eastr --bam input.bam \ | `-w INT` | 2 | Minimizer window size | | `-m INT` | 25 | Minimum chain score | -### Additional output options +### Input list options | Option | Description | |--------|-------------| -| `--out_original_junctions PATH` | Write all junctions before filtering | -| `--out_kept_junctions PATH` | Write non-spurious junctions | -| `--removed_alignments_bam` | Write removed alignments to separate BAM | -| `--filtered_bam_suffix STR` | Suffix for output BAMs (default: `_EASTR_filtered`) | +| `--bam_list` | Treat the `--bam` argument as a text file listing BAM paths (one per line) | +| `--bed_list` | Treat the `--bed` argument as a text file listing BED paths (one per line) | + +### Additional output options + +All `--out_*_junctions` options take a **file path** for a single input and write +one BED file. For multi-input runs (BAM list / multiple BED files) you may pass a +**directory**, and one file per input is written as `.bed`. + +| Option | File / Dir | Description | +|--------|------------|-------------| +| `--out_original_junctions PATH` | file (single) / dir (multi) | All junctions before filtering. Works for BAM, GTF, and BED input | +| `--out_kept_junctions FILE` | file | Non-spurious (kept) junctions | +| `--removed_alignments_bam` | flag | Also write the *removed* alignments alongside each filtered BAM. The output is named by replacing `.bam` with `_removed_alignments.bam` (e.g. `filtered.bam` → `filtered_removed_alignments.bam`) | +| `--filtered_bam_suffix STR` | — | Suffix for per-input filtered BAMs in directory mode (default: `_EASTR_filtered`) |
diff --git a/conda/meta.yaml b/conda/meta.yaml index ce23acc..06eecfe 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = "eastr-cpp" %} -{% set version = "2.0.2" %} +{% set version = "2.1.0" %} package: name: {{ name|lower }} diff --git a/include/eastr/input_detect.hpp b/include/eastr/input_detect.hpp new file mode 100644 index 0000000..1320d9d --- /dev/null +++ b/include/eastr/input_detect.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace eastr { + +// Content-based input detection so input handling does not depend on a file's +// extension (Galaxy, for example, names every dataset "*.dat"). + +// True if `path` is a single alignment file (BAM/CRAM/SAM), detected by magic +// bytes: BGZF/gzip (1f 8b) -> BAM, "CRAM" -> CRAM, leading '@' -> SAM header. +// False means it should be treated as a text file listing alignment paths. +bool is_alignment_file(const std::string& path); + +// True if `path` is a single BED file rather than a text file listing BED paths. +// A bgzipped file (1f 8b), or a first data line that parses as a BED record +// (>=3 whitespace columns with integer start/end), is a single BED file. If the +// first non-comment line is instead an existing filesystem path, it is a list. +bool is_bed_file(const std::string& path); + +} // namespace eastr diff --git a/include/eastr/path_utils.hpp b/include/eastr/path_utils.hpp new file mode 100644 index 0000000..122c62b --- /dev/null +++ b/include/eastr/path_utils.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace eastr { + +// Decide whether `path` denotes a file (vs a directory) for output purposes. +// An existing directory, or a path with a trailing separator, is a directory. +// Otherwise it is treated as a file when it has an extension after the last +// path separator (e.g. "out.bed", "dir/out.bed"); a bare name without an +// extension (e.g. "results", "out_dir") is treated as a directory. +bool is_file_path(const std::string& path); + +// Filename component of `path` with its directory and extension removed +// (e.g. "/a/b/sample.bam" -> "sample"). +std::string path_basename(const std::string& path); + +// Directory component of `path`, or "." if `path` has no separator. +std::string path_parent_dir(const std::string& path); + +} // namespace eastr diff --git a/include/eastr/types.hpp b/include/eastr/types.hpp index 42746b3..5961552 100644 --- a/include/eastr/types.hpp +++ b/include/eastr/types.hpp @@ -54,8 +54,10 @@ struct AlgorithmParams { struct Config { // Input paths (mutually exclusive) std::string gtf_path; - std::string bed_path; // Single file or list file - std::string bam_path; // Single file or list file + std::string bed_path; // Single file or list file (detected by content) + std::string bam_path; // Single file or list file (detected by content) + bool bed_list = false; // Treat --bed argument as a list of BED paths + bool bam_list = false; // Treat --bam argument as a list of BAM paths // Required paths std::string reference_fasta; diff --git a/include/eastr/version.hpp.in b/include/eastr/version.hpp.in new file mode 100644 index 0000000..226bba3 --- /dev/null +++ b/include/eastr/version.hpp.in @@ -0,0 +1,8 @@ +#pragma once + +namespace eastr { + +// Single source of truth: configured from the CMake project() VERSION. +inline constexpr const char* kVersion = "@PROJECT_VERSION@"; + +} // namespace eastr diff --git a/src/input_detect.cpp b/src/input_detect.cpp new file mode 100644 index 0000000..f77570d --- /dev/null +++ b/src/input_detect.cpp @@ -0,0 +1,97 @@ +#include "eastr/input_detect.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace eastr { + +namespace { + +// Read up to `n` leading bytes of a file (fewer if the file is shorter). +std::string read_magic(const std::string& path, size_t n) { + std::ifstream f(path, std::ios::binary); + std::string buf(n, '\0'); + f.read(&buf[0], static_cast(n)); + buf.resize(static_cast(f.gcount())); + return buf; +} + +bool has_gzip_magic(const std::string& magic) { + return magic.size() >= 2 && + static_cast(magic[0]) == 0x1f && + static_cast(magic[1]) == 0x8b; +} + +bool is_integer(const std::string& s) { + if (s.empty()) return false; + for (char c : s) { + if (!std::isdigit(static_cast(c))) return false; + } + return true; +} + +} // namespace + +bool is_alignment_file(const std::string& path) { + std::string magic = read_magic(path, 4); + + // BAM is BGZF-compressed, which begins with the gzip magic bytes. + if (has_gzip_magic(magic)) return true; + // CRAM files begin with the literal "CRAM". + if (magic.rfind("CRAM", 0) == 0) return true; + // Uncompressed SAM begins with an '@' header line. + if (!magic.empty() && magic[0] == '@') return true; + + // Otherwise assume a plain-text list of alignment paths. + return false; +} + +bool is_bed_file(const std::string& path) { + // A bgzipped BED file is a single data file, not a list. + if (has_gzip_magic(read_magic(path, 2))) return true; + + std::ifstream f(path); + if (!f.is_open()) { + // Let downstream parsing surface a clear error; treat as single file. + return true; + } + + std::string line; + while (std::getline(f, line)) { + // Trim surrounding whitespace. + line.erase(0, line.find_first_not_of(" \t\r\n")); + line.erase(line.find_last_not_of(" \t\r\n") + 1); + + if (line.empty()) continue; + if (line[0] == '#' || line.rfind("track", 0) == 0) continue; + + // First meaningful line found. A BED record has >=3 columns with + // integer start/end (cols 2 and 3). + std::istringstream iss(line); + std::string chrom, start, end; + if ((iss >> chrom >> start >> end) && is_integer(start) && is_integer(end)) { + return true; + } + + // Not a BED record: if it names an existing path, treat as a list file. + if (fs::exists(line)) { + return false; + } + + // Neither a BED record nor an existing path: treat as a (malformed) BED + // file so the BED parser reports a precise format error. + return true; + } + + // Empty / comment-only file: treat as an empty BED file. + return true; +} + +} // namespace eastr diff --git a/src/main.cpp b/src/main.cpp index 675ba73..650d9dc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,6 +11,9 @@ #include "eastr/junction_accumulator.hpp" #include "eastr/streaming_spurious_detector.hpp" #include "eastr/parallel_bam_extractor.hpp" +#include "eastr/path_utils.hpp" +#include "eastr/input_detect.hpp" +#include "eastr/version.hpp" #include @@ -179,63 +182,82 @@ std::tuple build_bowtie2_index( } } -// Check if path is a file (has extension) or directory -bool is_file_path(const std::string& path) { - size_t last_dot = path.find_last_of('.'); - size_t last_slash = path.find_last_of("/\\"); - return last_dot != std::string::npos && - (last_slash == std::string::npos || last_dot > last_slash); -} +enum class ListKind { Bam, Bed }; -// Expand file list (if .txt file, read paths from it) -std::vector expand_file_list(const std::string& path) { +// Read a text file listing one input path per line. +std::vector read_path_list(const std::string& path) { std::vector files; + std::ifstream list_file(path); + if (!list_file.is_open()) { + throw std::runtime_error("Cannot open file list: " + path); + } - // Check extension - size_t dot_pos = path.find_last_of('.'); - std::string ext = (dot_pos != std::string::npos) ? path.substr(dot_pos) : ""; - - if (ext == ".bam" || ext == ".cram" || ext == ".sam" || - ext == ".bed" || ext == ".gtf" || ext == ".gff") { - // Single file - files.push_back(path); - } else { - // Assume it's a list file - std::ifstream list_file(path); - if (!list_file.is_open()) { - throw std::runtime_error("Cannot open file list: " + path); - } - - std::string line; - while (std::getline(list_file, line)) { - // Trim whitespace - line.erase(0, line.find_first_not_of(" \t\r\n")); - line.erase(line.find_last_not_of(" \t\r\n") + 1); + std::string line; + while (std::getline(list_file, line)) { + // Trim whitespace + line.erase(0, line.find_first_not_of(" \t\r\n")); + line.erase(line.find_last_not_of(" \t\r\n") + 1); - if (!line.empty()) { - if (!fs::exists(line)) { - throw std::runtime_error("File not found: " + line); - } - files.push_back(line); + if (!line.empty()) { + if (!fs::exists(line)) { + throw std::runtime_error("File not found: " + line); } + files.push_back(line); } } return files; } +// Resolve a --bam/--bed argument into a list of input files. The "single file vs +// list of paths" distinction is made by file *content* (not extension), so inputs +// work regardless of name (e.g. Galaxy's "*.dat"). `force_list` (from +// --bam_list/--bed_list) treats the argument as a list unconditionally. +std::vector resolve_input_files(const std::string& path, ListKind kind, + bool force_list) { + if (!fs::exists(path)) { + throw std::runtime_error("Input file not found: " + path); + } + + if (force_list) { + return read_path_list(path); + } + + bool single = (kind == ListKind::Bam) ? eastr::is_alignment_file(path) + : eastr::is_bed_file(path); + if (single) { + return {path}; + } + + // Auto-detected a list of paths. Supported for backward compatibility, but + // the explicit flag is preferred. + std::cerr << "Warning: '" << path << "' was auto-detected as a list of file paths. " + << "This is deprecated; pass --" + << (kind == ListKind::Bam ? "bam_list" : "bed_list") + << " to declare a list file explicitly.\n"; + return read_path_list(path); +} + int main(int argc, char* argv[]) { CLI::App app{"eastr: Emending alignments of spuriously spliced transcript reads"}; + // --version prints the version and exits 0 (before --reference is validated). + app.set_version_flag("--version", std::string("eastr ") + eastr::kVersion); + eastr::Config config; - // Input options (mutually exclusive) + // Input options (mutually exclusive). Input type is detected by file content, + // not by extension, so any filename works (e.g. Galaxy's "*.dat"). auto* gtf_opt = app.add_option("--gtf", config.gtf_path, "Input GTF file containing transcript annotations"); auto* bed_opt = app.add_option("--bed", config.bed_path, - "Input BED file with intron coordinates"); + "Input BED file with intron coordinates (or a list of BED paths; see --bed_list)"); auto* bam_opt = app.add_option("--bam", config.bam_path, - "Input BAM file or TXT file containing list of BAM files"); + "Input BAM file (or a list of BAM paths; see --bam_list)"); + app.add_flag("--bed_list", config.bed_list, + "Treat the --bed argument as a text file listing BED paths (one per line)"); + app.add_flag("--bam_list", config.bam_list, + "Treat the --bam argument as a text file listing BAM paths (one per line)"); gtf_opt->excludes(bed_opt)->excludes(bam_opt); bed_opt->excludes(gtf_opt)->excludes(bam_opt); @@ -339,11 +361,12 @@ int main(int argc, char* argv[]) { eastr::JunctionMap spurious; // Declare early for streaming path bool is_bam_input = false; std::vector bam_files; + std::vector bed_files; // Populated for BED input (output reuse) std::vector sample_names; // For consistent output ordering if (!config.bam_path.empty()) { is_bam_input = true; - bam_files = expand_file_list(config.bam_path); + bam_files = resolve_input_files(config.bam_path, ListKind::Bam, config.bam_list); // Get sample names (for consistent output ordering) for (size_t i = 0; i < bam_files.size(); ++i) { @@ -367,6 +390,12 @@ int main(int argc, char* argv[]) { eastr::OutputWriter::generate_junction_output_paths( bam_files, config.out_original_junctions, "_original_junctions"); + // Create parent directory if needed + fs::path dir = fs::path(orig_junction_paths[0]).parent_path(); + if (!dir.empty()) { + fs::create_directories(dir); + } + eastr::OutputWriter::write_spurious_bed( junctions, config.scoring, orig_junction_paths[0], eastr::OutputWriter::InputType::BAM, sample_names); @@ -398,7 +427,7 @@ int main(int argc, char* argv[]) { } } else if (!config.bed_path.empty()) { - auto bed_files = expand_file_list(config.bed_path); + bed_files = resolve_input_files(config.bed_path, ListKind::Bed, config.bed_list); if (config.verbose) { std::cerr << "Parsing BED files...\n"; @@ -452,6 +481,27 @@ int main(int argc, char* argv[]) { output_type = eastr::OutputWriter::InputType::BAM; } + // Write original (pre-filtering) junctions for GTF/BED inputs if requested. + // (BAM inputs handle this above, during parallel extraction.) + if (!is_bam_input && !config.out_original_junctions.empty()) { + std::vector input_files = + !config.bed_path.empty() ? bed_files + : std::vector{config.gtf_path}; + std::vector orig_junction_paths = + eastr::OutputWriter::generate_junction_output_paths( + input_files, config.out_original_junctions, "_original_junctions"); + + // Create parent directory if needed + fs::path dir = fs::path(orig_junction_paths[0]).parent_path(); + if (!dir.empty()) { + fs::create_directories(dir); + } + + eastr::OutputWriter::write_spurious_bed( + junctions, config.scoring, orig_junction_paths[0], + output_type, sample_names); + } + // Output results if (is_bam_input && !config.out_filtered_bam.empty()) { // Write spurious junctions to a single BED file (used for all BAM filtering) diff --git a/src/output_writer.cpp b/src/output_writer.cpp index 7a185b1..fb37eb1 100644 --- a/src/output_writer.cpp +++ b/src/output_writer.cpp @@ -1,4 +1,5 @@ #include "eastr/output_writer.hpp" +#include "eastr/path_utils.hpp" #include #include @@ -201,39 +202,18 @@ std::vector OutputWriter::generate_junction_output_paths( std::vector output_paths; - auto get_basename = [](const std::string& path) { - size_t last_slash = path.find_last_of("/\\"); - size_t last_dot = path.find_last_of('.'); - if (last_slash == std::string::npos) last_slash = 0; - else last_slash++; - if (last_dot == std::string::npos || last_dot < last_slash) { - return path.substr(last_slash); - } - return path.substr(last_slash, last_dot - last_slash); - }; - - auto get_parent_dir = [](const std::string& path) { - size_t last_slash = path.find_last_of("/\\"); - if (last_slash == std::string::npos) { - return std::string("."); - } - return path.substr(0, last_slash); - }; - - // Check if output is a file or directory - bool is_file = output_dir_or_file.find('.') != std::string::npos && - output_dir_or_file.find_last_of('.') > output_dir_or_file.find_last_of("/\\"); + // Check if output is a file or directory (shared, npos-safe heuristic). + bool is_file = is_file_path(output_dir_or_file); if (input_files.size() == 1 && is_file) { output_paths.push_back(output_dir_or_file); } else { // Determine the output directory // If output looks like a file but we have multiple inputs, use parent directory - std::string out_dir = is_file ? get_parent_dir(output_dir_or_file) : output_dir_or_file; + std::string out_dir = is_file ? path_parent_dir(output_dir_or_file) : output_dir_or_file; for (const auto& input : input_files) { - std::string basename = get_basename(input); - output_paths.push_back(out_dir + "/" + basename + suffix + ".bed"); + output_paths.push_back(out_dir + "/" + path_basename(input) + suffix + ".bed"); } } @@ -247,26 +227,17 @@ std::vector OutputWriter::generate_bam_output_paths( std::vector output_paths; - auto get_basename = [](const std::string& path) { - size_t last_slash = path.find_last_of("/\\"); - size_t last_dot = path.find_last_of('.'); - if (last_slash == std::string::npos) last_slash = 0; - else last_slash++; - if (last_dot == std::string::npos || last_dot < last_slash) { - return path.substr(last_slash); - } - return path.substr(last_slash, last_dot - last_slash); - }; - - bool is_file = output_dir_or_file.find('.') != std::string::npos && - output_dir_or_file.find_last_of('.') > output_dir_or_file.find_last_of("/\\"); + // Check if output is a file or directory (shared, npos-safe heuristic). + bool is_file = is_file_path(output_dir_or_file); if (input_bams.size() == 1 && is_file) { output_paths.push_back(output_dir_or_file); } else { + // If output looks like a file but we have multiple inputs, use parent directory + std::string out_dir = is_file ? path_parent_dir(output_dir_or_file) : output_dir_or_file; + for (const auto& input : input_bams) { - std::string basename = get_basename(input); - output_paths.push_back(output_dir_or_file + "/" + basename + suffix + ".bam"); + output_paths.push_back(out_dir + "/" + path_basename(input) + suffix + ".bam"); } } diff --git a/src/path_utils.cpp b/src/path_utils.cpp new file mode 100644 index 0000000..5ba2ce7 --- /dev/null +++ b/src/path_utils.cpp @@ -0,0 +1,57 @@ +#include "eastr/path_utils.hpp" + +#include + +namespace fs = std::filesystem; + +namespace eastr { + +bool is_file_path(const std::string& path) { + if (path.empty()) { + return false; + } + + // An existing directory is unambiguously a directory. + std::error_code ec; + if (fs::is_directory(path, ec)) { + return false; + } + + // A trailing path separator means a directory was intended. + char last = path.back(); + if (last == '/' || last == '\\') { + return false; + } + + // Otherwise: treat as a file if there is an extension after the last + // separator. Guarding on last_slash avoids the npos pitfall where a bare + // "out.bed" (no separator) would otherwise be misread as a directory. + size_t last_dot = path.find_last_of('.'); + size_t last_slash = path.find_last_of("/\\"); + return last_dot != std::string::npos && + (last_slash == std::string::npos || last_dot > last_slash); +} + +std::string path_basename(const std::string& path) { + size_t last_slash = path.find_last_of("/\\"); + size_t last_dot = path.find_last_of('.'); + if (last_slash == std::string::npos) { + last_slash = 0; + } else { + last_slash++; + } + if (last_dot == std::string::npos || last_dot < last_slash) { + return path.substr(last_slash); + } + return path.substr(last_slash, last_dot - last_slash); +} + +std::string path_parent_dir(const std::string& path) { + size_t last_slash = path.find_last_of("/\\"); + if (last_slash == std::string::npos) { + return std::string("."); + } + return path.substr(0, last_slash); +} + +} // namespace eastr diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d863444..90d18c3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,6 +5,8 @@ add_executable(eastr_tests test_junction.cpp test_bed_parser.cpp test_strand_handling.cpp + test_path_utils.cpp + test_input_detect.cpp ) target_link_libraries(eastr_tests PRIVATE @@ -16,6 +18,10 @@ include(CTest) include(Catch) catch_discover_tests(eastr_tests) +# CLI smoke tests: the binary wiring (incl. --version exiting 0 without --reference). +add_test(NAME cli_version COMMAND eastr --version) +add_test(NAME cli_help COMMAND eastr --help) + # Standalone alignment score test (for comparison with Python) add_executable(test_alignment_score test_alignment_score.cpp) target_link_libraries(test_alignment_score PRIVATE eastr_lib) diff --git a/tests/test_input_detect.cpp b/tests/test_input_detect.cpp new file mode 100644 index 0000000..9cd5255 --- /dev/null +++ b/tests/test_input_detect.cpp @@ -0,0 +1,79 @@ +#include + +#include "eastr/input_detect.hpp" + +#include +#include +#include + +namespace fs = std::filesystem; +using namespace eastr; + +namespace { + +// Temp file that can hold arbitrary (binary) content, with any extension. +class TempFile { +public: + TempFile(const std::string& content, const std::string& ext = ".dat") { + path_ = fs::temp_directory_path() / + ("eastr_detect_" + std::to_string(counter_++) + ext); + std::ofstream f(path_, std::ios::binary); + f.write(content.data(), static_cast(content.size())); + } + ~TempFile() { fs::remove(path_); } + std::string str() const { return path_.string(); } + +private: + fs::path path_; + static int counter_; +}; +int TempFile::counter_ = 0; + +// Minimal BGZF/gzip header followed by the BAM magic. +const std::string kBamMagic = std::string("\x1f\x8b\x08\x04", 4) + "BAM\1"; + +} // namespace + +TEST_CASE("is_alignment_file detects BAM by magic bytes regardless of extension", + "[input_detect]") { + // Galaxy names every dataset "*.dat" (issue #2): detection must not use the name. + TempFile bam(kBamMagic, ".dat"); + REQUIRE(is_alignment_file(bam.str())); +} + +TEST_CASE("is_alignment_file detects CRAM and SAM", "[input_detect]") { + TempFile cram("CRAM\x03", ".dat"); + REQUIRE(is_alignment_file(cram.str())); + + TempFile sam("@HD\tVN:1.6\n@SQ\tSN:chr1\tLN:100\n", ".dat"); + REQUIRE(is_alignment_file(sam.str())); +} + +TEST_CASE("is_alignment_file treats a path list as not-a-single-file", "[input_detect]") { + TempFile bam(kBamMagic, ".bam"); + TempFile list(bam.str() + "\n", ".txt"); + REQUIRE_FALSE(is_alignment_file(list.str())); +} + +TEST_CASE("is_bed_file detects a BED record regardless of extension", "[input_detect]") { + TempFile bed("chr1\t100\t200\tJUNC1\t5\t+\n", ".dat"); + REQUIRE(is_bed_file(bed.str())); + // A real BED record is text, not an alignment. + REQUIRE_FALSE(is_alignment_file(bed.str())); +} + +TEST_CASE("is_bed_file skips comment and track lines", "[input_detect]") { + TempFile bed("# comment\ntrack name=foo\nchr1\t10\t20\tj\t1\t-\n", ".dat"); + REQUIRE(is_bed_file(bed.str())); +} + +TEST_CASE("is_bed_file detects a list of existing paths", "[input_detect]") { + TempFile bed("chr1\t100\t200\tj\t1\t+\n", ".bed"); + TempFile list(bed.str() + "\n", ".txt"); + REQUIRE_FALSE(is_bed_file(list.str())); +} + +TEST_CASE("is_bed_file treats a bgzipped file as a single BED", "[input_detect]") { + TempFile gz(kBamMagic, ".dat"); // gzip magic prefix + REQUIRE(is_bed_file(gz.str())); +} diff --git a/tests/test_path_utils.cpp b/tests/test_path_utils.cpp new file mode 100644 index 0000000..bda6bf1 --- /dev/null +++ b/tests/test_path_utils.cpp @@ -0,0 +1,94 @@ +#include + +#include "eastr/path_utils.hpp" +#include "eastr/output_writer.hpp" +#include "eastr/version.hpp" + +using namespace eastr; + +TEST_CASE("is_file_path distinguishes files from directories", "[path_utils]") { + // Bare filename with extension and no separator -> file (regression: this used + // to be misclassified as a directory because of the npos comparison bug). + REQUIRE(is_file_path("out.bed")); + REQUIRE(is_file_path("out.bam")); + + // File with a directory prefix -> file. + REQUIRE(is_file_path("dir/out.bed")); + REQUIRE(is_file_path("a/b/c/out.original.bed")); + + // No extension -> directory. + REQUIRE_FALSE(is_file_path("results")); + REQUIRE_FALSE(is_file_path("out/results")); + + // Trailing separator -> directory. + REQUIRE_FALSE(is_file_path("results/")); + + // Empty -> not a file. + REQUIRE_FALSE(is_file_path("")); +} + +TEST_CASE("path_basename strips directory and extension", "[path_utils]") { + REQUIRE(path_basename("/a/b/sample.bam") == "sample"); + REQUIRE(path_basename("sample.bam") == "sample"); + REQUIRE(path_basename("sample") == "sample"); + REQUIRE(path_basename("dir/sample") == "sample"); +} + +TEST_CASE("path_parent_dir returns the directory component", "[path_utils]") { + REQUIRE(path_parent_dir("/a/b/sample.bam") == "/a/b"); + REQUIRE(path_parent_dir("sample.bam") == "."); +} + +TEST_CASE("generate_junction_output_paths: single input + file path", "[output_writer]") { + // Single input, bare filename -> use the path as-is (issue #3 core). + auto paths = OutputWriter::generate_junction_output_paths( + {"sample.bam"}, "out.bed", "_original_junctions"); + REQUIRE(paths.size() == 1); + REQUIRE(paths[0] == "out.bed"); +} + +TEST_CASE("generate_junction_output_paths: single input + directory", "[output_writer]") { + auto paths = OutputWriter::generate_junction_output_paths( + {"/data/sample.bam"}, "outdir", "_original_junctions"); + REQUIRE(paths.size() == 1); + REQUIRE(paths[0] == "outdir/sample_original_junctions.bed"); +} + +TEST_CASE("generate_junction_output_paths: multi input falls back to parent dir", + "[output_writer]") { + auto paths = OutputWriter::generate_junction_output_paths( + {"a.bam", "b.bam"}, "results/out.bed", "_original_junctions"); + REQUIRE(paths.size() == 2); + REQUIRE(paths[0] == "results/a_original_junctions.bed"); + REQUIRE(paths[1] == "results/b_original_junctions.bed"); +} + +TEST_CASE("generate_bam_output_paths: single input + file path", "[output_writer]") { + // Single BAM, bare filename -> write that file directly (issue #4). + auto paths = OutputWriter::generate_bam_output_paths( + {"sample.bam"}, "out.bam", "_EASTR_filtered"); + REQUIRE(paths.size() == 1); + REQUIRE(paths[0] == "out.bam"); +} + +TEST_CASE("generate_bam_output_paths: multi input uses directory", "[output_writer]") { + auto paths = OutputWriter::generate_bam_output_paths( + {"a.bam", "b.bam"}, "outdir", "_EASTR_filtered"); + REQUIRE(paths.size() == 2); + REQUIRE(paths[0] == "outdir/a_EASTR_filtered.bam"); + REQUIRE(paths[1] == "outdir/b_EASTR_filtered.bam"); +} + +TEST_CASE("generate_bam_output_paths: multi input + file path falls back to parent dir", + "[output_writer]") { + // Regression: previously produced "out.bam/a_..." for multiple inputs. + auto paths = OutputWriter::generate_bam_output_paths( + {"a.bam", "b.bam"}, "results/out.bam", "_EASTR_filtered"); + REQUIRE(paths.size() == 2); + REQUIRE(paths[0] == "results/a_EASTR_filtered.bam"); + REQUIRE(paths[1] == "results/b_EASTR_filtered.bam"); +} + +TEST_CASE("version string is populated", "[version]") { + REQUIRE(std::string(kVersion).size() > 0); +} From 25c08218344687ca6a401fab772fecfedc18d79a Mon Sep 17 00:00:00 2001 From: ishinder Date: Sun, 7 Jun 2026 08:13:19 -0400 Subject: [PATCH 2/2] Fix htslib link path on macOS/Homebrew when found via pkg-config pkg-config sets HTSLIB_LIBRARIES to the bare "hts" and the directory in HTSLIB_LIBRARY_DIRS, which is not on the default linker search path on macOS (Homebrew's /opt/homebrew/lib), causing "ld: library 'hts' not found". Add the dir via target_link_directories. Surfaced by the new macos-latest CI job. Co-Authored-By: Claude Opus 4.8 --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2b9405..19d2bca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,14 @@ target_include_directories(eastr_lib PUBLIC ${HTSLIB_INCLUDE_DIRS} ) +# When htslib is found via pkg-config, HTSLIB_LIBRARIES holds the bare name +# ("hts") and the directory is in HTSLIB_LIBRARY_DIRS. That dir is on the default +# linker path on Linux but not for Homebrew on macOS (/opt/homebrew/lib), so add +# it explicitly to avoid "ld: library 'hts' not found". +if(HTSLIB_LIBRARY_DIRS) + target_link_directories(eastr_lib PUBLIC ${HTSLIB_LIBRARY_DIRS}) +endif() + target_link_libraries(eastr_lib PUBLIC ${HTSLIB_LIBRARIES} ZLIB::ZLIB