Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Continuous integration: build + test on every supported platform, and verify
# that the CMake project version and the conda recipe version agree.
name: CI

on:
  push:
    branches: [main, master]
  pull_request:

jobs:
  build-and-test:
    name: ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest     # x86_64
          - ubuntu-24.04-arm  # linux-aarch64 (AWS Graviton / Galaxy compute)
          - macos-latest      # osx-arm64 (Apple Silicon)
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: recursive  # minimap2 (+ its sse2neon dir) is a submodule

      - name: Install dependencies (Linux)
        if: runner.os == 'Linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y cmake g++ zlib1g-dev libhts-dev

      - name: Install dependencies (macOS)
        if: runner.os == 'macOS'
        run: brew install cmake htslib

      - name: Configure
        run: cmake -B build -DCMAKE_BUILD_TYPE=Release -DEASTR_BUILD_TESTS=ON

      - name: Build
        # An explicit job count is required: a bare `-j` is forwarded to make as
        # `-j` with no limit, which spawns one compiler per translation unit.
        run: cmake --build build --parallel 4

      - name: Test
        run: ctest --test-dir build --output-on-failure

  version-consistency:
    name: version consistency
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Check CMake and conda versions match
        run: |
          cmake_ver=$(sed -n 's/^project(eastr VERSION \([0-9.]*\).*/\1/p' CMakeLists.txt)
          conda_ver=$(sed -n 's/.*set version = "\([0-9.]*\)".*/\1/p' conda/meta.yaml)
          echo "CMake: $cmake_ver conda: $conda_ver"
          # Two failed extractions would compare equal ("" == ""), so an empty
          # result must fail the check instead of passing it silently.
          if [ -z "$cmake_ver" ] || [ -z "$conda_ver" ]; then
            echo "::error::Could not extract a version: CMakeLists.txt='$cmake_ver' conda/meta.yaml='$conda_ver'"
            exit 1
          fi
          if [ "$cmake_ver" != "$conda_ver" ]; then
            echo "::error::Version drift: CMakeLists.txt=$cmake_ver conda/meta.yaml=$conda_ver"
            exit 1
          fi
49 changes: 49 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Changelog

All notable changes to this project are documented here. This project adheres to
[Semantic Versioning](https://semver.org/).

## [2.1.0] - unreleased

### Added
- `--version` flag that prints the version and exits 0. The version is generated
from the CMake `project(... VERSION ...)` (a single source of truth) via a
configured `eastr/version.hpp`, so the CMake version and the `--version` output
can no longer drift. The conda package version is still set separately in
`conda/meta.yaml`; a CI check fails on any drift between `CMakeLists.txt` and
`conda/meta.yaml`. The git tag is not checked automatically.
- `--bed_list` / `--bam_list` flags to explicitly declare that the `--bed` / `--bam`
argument is a text file listing input paths (one per line).
- GitHub Actions CI building and testing on `ubuntu-latest` (x86_64),
`ubuntu-24.04-arm` (linux-aarch64), and `macos-latest` (osx-arm64).
- Unit tests for output path generation and content-based input detection,
including a Galaxy-style `*.dat`-named input case and a single-file
`--out_original_junctions` case.

### Changed
- Input-type detection no longer depends on the file extension. `--bam` is detected
by magic bytes (BGZF/BAM, CRAM, SAM); `--bed` is detected by content (a BED record
vs. a list of existing paths). This fixes inputs named `*.dat` (e.g. every Galaxy
dataset). Auto-detecting a path list still works but now emits a deprecation
warning recommending `--bed_list` / `--bam_list`. `--gtf` already accepted any
filename. (Issue #2)
- `--out_original_junctions` now accepts a **file path** for a single input (BAM,
GTF, or BED) and writes one file, consistent with `--out_removed_junctions` /
`--out_kept_junctions`. It previously behaved as a directory for BAM input and was
a silent no-op for GTF/BED. Directory mode is still used for multi-input runs.
(Issues #1, #3)
- `--out_filtered_bam` now accepts a **file path** for a single BAM input (writes the
filtered BAM directly), keeping directory mode for BAM lists. (Issue #4)

### Fixed
- Bare output filenames without a directory prefix (e.g. `out.bed`, `out.bam`) were
misinterpreted as directories, causing "Cannot open output file" errors. Both bare
filenames and `dir/out.bed` now work. (Issues #1, #3, #4)
- `--out_filtered_bam` with a file-looking path and multiple BAM inputs previously
produced paths like `out.bam/<sample>_EASTR_filtered.bam`; it now falls back to the
parent directory.

### Notes
- `conda/meta.yaml` `sha256` for the source tarball must be updated when the `v2.1.0`
tag is cut.
- Adding `additional_platforms: [linux-aarch64]` to the Bioconda recipe is a separate
PR in `bioconda-recipes`.
21 changes: 20 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
cmake_minimum_required(VERSION 3.16)
# NOTE: the CI "version consistency" job extracts the version with a sed pattern
# anchored on `project(eastr VERSION ` at the start of a line, so keep the
# project() call in exactly this single-line form.
project(eastr VERSION 2.0.0 LANGUAGES CXX C)
project(eastr VERSION 2.1.0 LANGUAGES CXX C)

# Project-wide C++ defaults: require C++17 (no fallback to an older standard)
# and disable compiler-specific language extensions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# Ask CMake to write compile_commands.json into the build tree for tooling.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Generate eastr/version.hpp from the project() VERSION so the --version output
# always matches the CMake version. The template is expanded with @ONLY, so only
# @VAR@ references are substituted. The output goes to the build tree; the
# "generated" directory must be on the include path of any target that includes
# <eastr/version.hpp>.
# Note: conda/meta.yaml carries its own copy of the version; it is kept in sync
# by a CI check, not by this step.
configure_file(
  "${CMAKE_CURRENT_SOURCE_DIR}/include/eastr/version.hpp.in"
  "${CMAKE_CURRENT_BINARY_DIR}/generated/eastr/version.hpp"
  @ONLY
)

# Options
option(EASTR_BUILD_TESTS "Build tests" OFF)
option(EASTR_USE_OPENMP "Use OpenMP for parallelization" OFF)
Expand Down Expand Up @@ -135,6 +143,8 @@ add_library(eastr_lib STATIC
src/junction.cpp
src/bed_parser.cpp
src/gtf_parser.cpp
src/path_utils.cpp
src/input_detect.cpp
src/fasta_index.cpp
src/junction_extractor.cpp
src/self_aligner.cpp
Expand All @@ -151,9 +161,18 @@ add_library(eastr_lib STATIC

# Public include path for eastr_lib and everything that links it:
#   - the source-tree headers,
#   - the build-tree "generated" directory, which holds the configured
#     eastr/version.hpp,
#   - the htslib include directories.
target_include_directories(eastr_lib PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/generated
${HTSLIB_INCLUDE_DIRS}
)

# When htslib is found via pkg-config, HTSLIB_LIBRARIES holds the bare name
# ("hts") and the directory is in HTSLIB_LIBRARY_DIRS. That dir is on the default
# linker path on Linux but not for Homebrew on macOS (/opt/homebrew/lib), so add
# it explicitly to avoid "ld: library 'hts' not found".
# The directory is added as PUBLIC so that targets linking eastr_lib inherit it.
# NOTE(review): if htslib is located with pkg_check_modules(), its
# IMPORTED_TARGET option (PkgConfig::HTSLIB) would carry the library directory
# automatically -- confirm against the find logic, which is not shown here.
if(HTSLIB_LIBRARY_DIRS)
target_link_directories(eastr_lib PUBLIC ${HTSLIB_LIBRARY_DIRS})
endif()

target_link_libraries(eastr_lib PUBLIC
${HTSLIB_LIBRARIES}
ZLIB::ZLIB
Expand Down
33 changes: 25 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,17 +131,23 @@ eastr --bam input.bam \
|--------|-------------|
| `--bam FILE` | BAM file, or text file listing BAM paths (one per line) |
| `--gtf FILE` | GTF annotation file |
| `--bed FILE` | BED file with junction coordinates |
| `--bed FILE` | BED file with junction coordinates, or text file listing BED paths |
| `-r, --reference FILE` | Reference genome FASTA (required for all input types) |

> Input type is detected by file **content**, not by extension, so any filename
> works (including Galaxy's `*.dat` datasets). A `--bam`/`--bed` argument that is a
> plain-text list of paths is still accepted, but the explicit `--bam_list` /
> `--bed_list` flags are preferred (see below).

### Common options

| Option | Default | Description |
|--------|---------|-------------|
| `-i, --bowtie2_index PATH` | auto-built | Bowtie2 index prefix (built automatically if not provided) |
| `-p INT` | 1 | Number of threads |
| `--out_filtered_bam PATH` | — | Output filtered BAM file or directory |
| `--out_removed_junctions PATH` | stdout | Output spurious junctions (BED format) |
| `--out_filtered_bam PATH` | — | Output filtered BAM. **File** for a single BAM input; **directory** for a BAM list (writes `<sample><suffix>.bam` per input) |
| `--out_removed_junctions FILE` | stdout | Output spurious junctions (BED format) |
| `--version` | — | Print the version and exit |
| `--verbose` | off | Show progress information |

<details>
Expand All @@ -168,14 +174,25 @@ eastr --bam input.bam \
| `-w INT` | 2 | Minimizer window size |
| `-m INT` | 25 | Minimum chain score |

### Additional output options
### Input list options

| Option | Description |
|--------|-------------|
| `--out_original_junctions PATH` | Write all junctions before filtering |
| `--out_kept_junctions PATH` | Write non-spurious junctions |
| `--removed_alignments_bam` | Write removed alignments to separate BAM |
| `--filtered_bam_suffix STR` | Suffix for output BAMs (default: `_EASTR_filtered`) |
| `--bam_list` | Treat the `--bam` argument as a text file listing BAM paths (one per line) |
| `--bed_list` | Treat the `--bed` argument as a text file listing BED paths (one per line) |

### Additional output options

All `--out_*_junctions` options take a **file path** for a single input and write
one BED file. For multi-input runs (BAM list / multiple BED files) you may pass a
**directory**, and one file per input is written as `<basename><suffix>.bed`.

| Option | File / Dir | Description |
|--------|------------|-------------|
| `--out_original_junctions PATH` | file (single) / dir (multi) | All junctions before filtering. Works for BAM, GTF, and BED input |
| `--out_kept_junctions FILE` | file | Non-spurious (kept) junctions |
| `--removed_alignments_bam` | flag | Also write the *removed* alignments alongside each filtered BAM. The output is named by replacing `.bam` with `_removed_alignments.bam` (e.g. `filtered.bam` → `filtered_removed_alignments.bam`) |
| `--filtered_bam_suffix STR` | — | Suffix for per-input filtered BAMs in directory mode (default: `_EASTR_filtered`) |

</details>

Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set name = "eastr-cpp" %}
{% set version = "2.0.2" %}
{% set version = "2.1.0" %}

package:
name: {{ name|lower }}
Expand Down
21 changes: 21 additions & 0 deletions include/eastr/input_detect.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <string>

namespace eastr {

// Content-based input detection so input handling does not depend on a file's
// extension (Galaxy, for example, names every dataset "*.dat").

// True if `path` is a single alignment file (BAM/CRAM/SAM), detected from its
// leading bytes:
//   - 1f 8b (gzip/BGZF magic) -> treated as BAM. Only these two bytes are
//     checked, so any gzip-compressed file matches.
//   - "CRAM"                  -> CRAM
//   - leading '@'             -> SAM header
// False means it should be treated as a text file listing alignment paths. That
// is also the result for an empty or unreadable file, and for SAM text that does
// not begin with an '@' header line.
bool is_alignment_file(const std::string& path);

// True if `path` is a single BED file rather than a text file listing BED paths.
// Decision order:
//   1. gzip magic (1f 8b) -> single (bgzipped) BED file.
//   2. The file cannot be opened -> true, so downstream parsing reports the error.
//   3. The first line that is not blank, a '#' comment or a "track" line decides:
//        - >=3 whitespace-separated columns with all-digit start/end -> BED file;
//        - otherwise, if the whole trimmed line names an existing filesystem
//          path -> list (false);
//        - otherwise -> true, leaving the format error to the BED parser.
//   4. No such line (empty or comment-only file) -> true.
bool is_bed_file(const std::string& path);

} // namespace eastr
21 changes: 21 additions & 0 deletions include/eastr/path_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <string>

namespace eastr {

// Decide whether `path` denotes a file (vs a directory) for output purposes.
// An existing directory, or a path with a trailing separator, is a directory.
// Otherwise it is treated as a file when it has an extension after the last
// path separator (e.g. "out.bed", "dir/out.bed"); a bare name without an
// extension (e.g. "results", "out_dir") is treated as a directory.
// NOTE(review): by this rule a not-yet-existing directory whose name contains a
// dot (e.g. "run.v2") would presumably be classified as a file -- confirm
// against the implementation, which is not shown here.
bool is_file_path(const std::string& path);

// Filename component of `path` with its directory and extension removed
// (e.g. "/a/b/sample.bam" -> "sample").
// NOTE(review): the result for multi-extension names such as "x.tar.gz" is not
// specified here -- verify against the implementation.
std::string path_basename(const std::string& path);

// Directory component of `path`, or "." if `path` has no separator.
std::string path_parent_dir(const std::string& path);

} // namespace eastr
6 changes: 4 additions & 2 deletions include/eastr/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ struct AlgorithmParams {
struct Config {
// Input paths (mutually exclusive)
std::string gtf_path;
std::string bed_path; // Single file or list file
std::string bam_path; // Single file or list file
std::string bed_path; // Single file or list file (detected by content)
std::string bam_path; // Single file or list file (detected by content)
bool bed_list = false; // Treat --bed argument as a list of BED paths
bool bam_list = false; // Treat --bam argument as a list of BAM paths

// Required paths
std::string reference_fasta;
Expand Down
8 changes: 8 additions & 0 deletions include/eastr/version.hpp.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#pragma once

namespace eastr {

// Single source of truth: configured from the CMake project() VERSION.
// This file is a template: CMake's configure_file() replaces the
// @PROJECT_VERSION@ placeholder below and writes the result to
// <build dir>/generated/eastr/version.hpp. Do not hard-code a version here.
inline constexpr const char* kVersion = "@PROJECT_VERSION@";

} // namespace eastr
97 changes: 97 additions & 0 deletions src/input_detect.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#include "eastr/input_detect.hpp"

#include <array>
#include <cctype>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

namespace eastr {

namespace {

// Return the first `n` bytes of the file at `path`, read in binary mode. The
// result is shorter than `n` when the file is shorter, and empty when the file
// cannot be opened.
std::string read_magic(const std::string& path, size_t n) {
    std::string head(n, '\0');
    std::ifstream in(path, std::ios::binary);
    in.read(head.data(), static_cast<std::streamsize>(n));
    // gcount() is the number of bytes the read actually delivered.
    const auto delivered = static_cast<size_t>(in.gcount());
    head.resize(delivered);
    return head;
}

// True when `magic` starts with the two-byte gzip signature 0x1f 0x8b.
bool has_gzip_magic(const std::string& magic) {
    if (magic.size() < 2) {
        return false;
    }
    // Compare as unsigned: 0x8b does not fit in a signed char.
    const auto first = static_cast<unsigned char>(magic[0]);
    const auto second = static_cast<unsigned char>(magic[1]);
    return first == 0x1f && second == 0x8b;
}

// True when `s` is non-empty and consists solely of decimal digits (no sign,
// no whitespace, no decimal point).
bool is_integer(const std::string& s) {
    auto it = s.begin();
    // Advance past the leading run of digits.
    while (it != s.end() && std::isdigit(static_cast<unsigned char>(*it))) {
        ++it;
    }
    return !s.empty() && it == s.end();
}

} // namespace

// Classify `path` as a single alignment file (true) or a plain-text list of
// alignment paths (false) from its first four bytes.
bool is_alignment_file(const std::string& path) {
    const std::string head = read_magic(path, 4);

    // BAM is BGZF-compressed, and BGZF starts with the gzip magic bytes.
    const bool looks_bgzf = has_gzip_magic(head);
    // CRAM files start with the literal "CRAM".
    const bool looks_cram = head.compare(0, 4, "CRAM") == 0;
    // Uncompressed SAM starts with an '@' header line.
    const bool looks_sam = !head.empty() && head.front() == '@';

    // Anything else is assumed to be a plain-text list of alignment paths.
    return looks_bgzf || looks_cram || looks_sam;
}

// Decide whether `path` is a single BED file (true) or a text file listing BED
// paths (false), using file content only.
//
// Returns true when the file starts with the gzip magic, cannot be opened, has
// no meaningful line, or its first meaningful line is a BED record or is
// neither a BED record nor an existing path. Returns false only when the first
// meaningful line is not a BED record and names an existing filesystem path.
// Never throws on a filesystem error while probing that line.
bool is_bed_file(const std::string& path) {
    // A bgzipped BED file is a single data file, not a list.
    if (has_gzip_magic(read_magic(path, 2))) return true;

    std::ifstream f(path);
    if (!f.is_open()) {
        // Let downstream parsing surface a clear error; treat as single file.
        return true;
    }

    const char* const kSpace = " \t\r\n";
    std::string line;
    while (std::getline(f, line)) {
        // Trim surrounding whitespace (an all-whitespace line becomes empty).
        line.erase(0, line.find_first_not_of(kSpace));
        line.erase(line.find_last_not_of(kSpace) + 1);

        if (line.empty() || line[0] == '#') continue;

        // Skip a track header line: the keyword "track" on its own or followed
        // by whitespace. A mere "track" prefix is not enough, otherwise a list
        // entry such as "tracks/sample.bed" would be discarded as a header.
        const bool track_line =
            line.compare(0, 5, "track") == 0 &&
            (line.size() == 5 || line[5] == ' ' || line[5] == '\t');
        if (track_line) continue;

        // First meaningful line found. A BED record has >=3 columns with
        // integer start/end (cols 2 and 3).
        std::istringstream iss(line);
        std::string chrom, start, end;
        if ((iss >> chrom >> start >> end) && is_integer(start) && is_integer(end)) {
            return true;
        }

        // Not a BED record: if it names an existing path, treat as a list file.
        // The error_code overload is used so that an unusable "path" (e.g. an
        // over-long line of a malformed file) yields false instead of throwing.
        std::error_code ec;
        if (fs::exists(line, ec)) {
            return false;
        }

        // Neither a BED record nor an existing path: treat as a (malformed) BED
        // file so the BED parser reports a precise format error.
        return true;
    }

    // Empty / comment-only file: treat as an empty BED file.
    return true;
}

} // namespace eastr
Loading
Loading