diff --git a/NAMESPACE b/NAMESPACE index d2f491d..e404110 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -130,4 +130,5 @@ importFrom(stats,uniroot) importFrom(stats,var) importFrom(stats,wilcox.test) importFrom(tidyr,pivot_longer) +importFrom(utils,combn) importFrom(utils,head) diff --git a/R/brunner_munzel.R b/R/brunner_munzel.R index bef3f68..9f53f62 100644 --- a/R/brunner_munzel.R +++ b/R/brunner_munzel.R @@ -181,6 +181,7 @@ #' Phipson, B., & Smyth, G. K. (2010). Permutation P-values should never be zero: calculating exact P-values when permutations are randomly drawn. Statistical Applications in Genetics and Molecular Biology, 9(1), Article 39. #' @name brunner_munzel #' @importFrom stats var quantile +#' @importFrom utils combn #' @family Robust tests #' @export brunner_munzel NULL @@ -188,28 +189,43 @@ NULL #' @noRd bm_perm_indices <- function(N, n.x, R) { # Internal helper function to generate permutation indices without replacement - # Returns a matrix where each column contains indices for group x - # Sample permutations without replacement to avoid duplicates - perms <- matrix(NA, nrow = R, ncol = N) - for (i in 1:R) { - perms[i, ] <- sample(1:N, N) + # Returns a matrix where each column contains a full permutation of 1:N + # We ensure unique ALLOCATIONS (which n.x elements go to group x), not just unique permutations + # + # For small sample sizes where R >= choose(N, n.x), enumerate all combinations + max_combs <- choose(N, n.x) + if (R >= max_combs) { + # Enumerate all unique combinations + combs <- combn(N, n.x) + # Convert each combination to a full permutation [combo, remaining] + perms <- matrix(NA, nrow = max_combs, ncol = N) + for (i in 1:max_combs) { + x_idx <- combs[, i] + y_idx <- setdiff(1:N, x_idx) + perms[i, ] <- c(x_idx, y_idx) + } + return(t(perms)) # Return as N x R matrix } - # Remove duplicate permutations - perms <- unique(perms) + + # For larger sample sizes, sample unique allocations + seen_allocs <- character(0) + perms_list <- list() 
iter <- 0 - max_iter <- 10 - while (nrow(perms) < R && iter < max_iter) { - needed <- R - nrow(perms) - new_perms <- matrix(NA, nrow = needed * 2, ncol = N) - for (i in 1:(needed * 2)) { - new_perms[i, ] <- sample(1:N, N) + max_iter <- R * 10 # Allow more iterations to find unique allocations + + while (length(perms_list) < R && iter < max_iter) { + perm <- sample(1:N, N) + # Create a canonical representation of the allocation (sorted first n.x elements) + alloc_key <- paste(sort(perm[1:n.x]), collapse = ",") + + if (!(alloc_key %in% seen_allocs)) { + seen_allocs <- c(seen_allocs, alloc_key) + perms_list[[length(perms_list) + 1]] <- perm } - perms <- unique(rbind(perms, new_perms)) iter <- iter + 1 } - if (nrow(perms) > R) { - perms <- perms[1:R, , drop = FALSE] - } + + perms <- do.call(rbind, perms_list) return(t(perms)) # Return as N x R matrix for compatibility with original code } @@ -328,6 +344,12 @@ brunner_munzel.default = function(x, if(!is.numeric(y)) stop("'y' must be numeric") DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y))) + # Capture group names for estimate label (X and Y in P(X>Y)) + # Use simple "X" and "Y" for default method since variable names + # may be expressions (e.g., 1:10, c(7:20, 200)) which look messy + # Formula method will overwrite these with actual factor level names + XNAME <- "X" + YNAME <- "Y" if(paired) { if(length(x) != length(y)) stop("'x' and 'y' must have the same length") @@ -361,6 +383,9 @@ brunner_munzel.default = function(x, conf.level <- 1 - alpha } + # Initialize n_perm_actual (will be set if test_method == "perm") + n_perm_actual <- NULL + # Paired ----- if(paired){ @@ -671,7 +696,8 @@ brunner_munzel.default = function(x, V <- N*(s1+s2) singular.bf <- (V == 0) - V[singular.bf] <- N/(2 * n.x * n.y) + # Use same zero-variance handling as perm_loop: vP = 0.5/(n.x*n.y)^2, so V = N*vP + V[singular.bf] <- N * 0.5/(n.x * n.y)^2 std_err = sqrt(V/N) df.sw <- (s1 + s2)^2/(s1^2/(n.x - 1) + s2^2/(n.y - 1)) @@ 
-689,6 +715,7 @@ brunner_munzel.default = function(x, # Use sampling without replacement to avoid duplicate permutations P <- bm_perm_indices(N, n.x, R) R_actual <- ncol(P) # Actual number of unique permutations obtained + n_perm_actual <- R_actual # Store for output parameter Px<-matrix(c(x,y)[P],ncol=R_actual) # perm_loop already centers at 0.5 (see res1[1,]<-(pdP-1/2)/sqrt(vP)) @@ -748,8 +775,12 @@ brunner_munzel.default = function(x, test_stat <- sqrt(N) * (pd - mu) / sqrt(V) # Count extreme values for p-value calculation - b_less <- sum(test_stat <= Tperm[1,]) - b_greater <- sum(test_stat >= Tperm[1,]) + b_less <- sum(Tperm[1,] <= test_stat) + b_greater <- sum(Tperm[1,] >= test_stat) + + # For two-sided tests, use absolute value method: mean(|T_perm| >= |T_obs|) + # This matches the brunnermunzel package's approach + b_two_sided <- sum(abs(Tperm[1,]) >= abs(test_stat)) if(alternative == "two.sided"){ c1<-0.5*(Tperm[1,floor((1-alpha/2)*R_actual)]+Tperm[1,ceiling((1-alpha/2)*R_actual)]) @@ -765,9 +796,10 @@ brunner_munzel.default = function(x, # Compute p-values using selected method p_less <- bm_compute_perm_pval(b_less, R_actual, p_method) p_greater <- bm_compute_perm_pval(b_greater, R_actual, p_method) + p_two_sided <- bm_compute_perm_pval(b_two_sided, R_actual, p_method) p.value = switch(alternative, - "two.sided" = min(2 * p_less, 2 * p_greater), + "two.sided" = p_two_sided, "less" = p_less, "greater" = p_greater) @@ -932,18 +964,24 @@ brunner_munzel.default = function(x, if(test_method == "perm"){ names(test_stat) = "t-observed" + # For permutation tests, report number of permutations instead of df + param <- n_perm_actual + names(param) <- "N-permutations" } else { names(test_stat) = "t" + param <- df.sw + names(param) <- "df" } - names(df.sw) = "df" cint = c(pd.lower, pd.upper) attr(cint, "conf.level") = conf.level estimate = pd - names(estimate) = "P(X>Y) + .5*P(X=Y)" + # Use actual group names in estimate label + # XNAME and YNAME are set from input 
variable names or overwritten by formula method + names(estimate) = paste0("P(", XNAME, ">", YNAME, ") + .5*P(", XNAME, "=", YNAME, ")") rval <- list(statistic = test_stat, - parameter = df.sw, + parameter = param, p.value = as.numeric(p.value), estimate = estimate, stderr = std_err, @@ -970,7 +1008,7 @@ brunner_munzel.formula = function(formula, || (length(formula) != 3L) || (length(attr(terms(formula[-2L]), "term.labels")) != 1L)) stop("'formula' missing or incorrect") - + # Check for paired argument in ... and warn user dots <- list(...) if("paired" %in% names(dots)){ @@ -995,6 +1033,11 @@ DATA <- setNames(split(mf[[response]], g), c("x", "y")) y <- do.call("brunner_munzel", c(DATA, list(...))) y$data.name <- DNAME + # Update estimate label with actual factor level names + # First level becomes "x" (XNAME), second level becomes "y" (YNAME) + XNAME <- levels(g)[1] + YNAME <- levels(g)[2] + names(y$estimate) <- paste0("P(", XNAME, ">", YNAME, ") + .5*P(", XNAME, "=", YNAME, ")") y } @@ -1005,17 +1048,21 @@ perm_loop <-function(x,y,n.x,n.y,R){ pl1P<-matrix(0,nrow=n.x,ncol=R) pl2P<-matrix(0,nrow=n.y,ncol=R) + # Compute P(Y < X_h1) for each h1 (used for variance calculation) for(h1 in 1:n.x){ help1<-matrix(t(x[h1,]), ncol=R, nrow=n.y,byrow=TRUE) pl1P[h1,]<-1/n.y*(colSums((y<help1)+1/2*(y==help1))) } + # Compute P(X > Y_h2) for each h2 (used for pd calculation) + # Note: Changed from (x<help2) to (x>help2) to match main function convention + # Main function uses pd = P(X > Y), so perm_loop must use same convention for(h2 in 1:n.y){ help2<-matrix(t(y[h2,]), ncol=R, nrow=n.x,byrow=TRUE) - pl2P[h2,]<-1/n.x*(colSums((x<help2)+1/2*(x==help2))) + pl2P[h2,]<-1/n.x*(colSums((x>help2)+1/2*(x==help2))) } pdP<-colMeans(pl2P) diff --git a/R/simple_htest.R b/R/simple_htest.R index df7804e..b38d636 100644 --- a/R/simple_htest.R +++ b/R/simple_htest.R @@ -4,7 +4,7 @@ #' `r lifecycle::badge("maturing")` #' #' Performs statistical hypothesis tests with extended functionality beyond standard implementations. 
-#' Supports t-tests, Wilcoxon-Mann-Whitney tests, and Brunner-Munzel tests with additional +#' Supports t-tests and Wilcoxon-Mann-Whitney tests with additional #' alternatives such as equivalence and minimal effect testing. #' #' @section Purpose: @@ -16,11 +16,9 @@ #' #' @inheritParams t_TOST #' @inheritParams z_cor_test -#' @inheritParams brunner_munzel #' @param test a character string specifying the type of hypothesis test to use: #' - "t.test": Student's t-test (parametric, default) #' - "wilcox.test": Wilcoxon-Mann-Whitney test (non-parametric) -#' - "brunner_munzel": Brunner-Munzel test (non-parametric) #' #' You can specify just the initial letter (e.g., "t" for "t.test"). #' @param alternative the alternative hypothesis: @@ -30,7 +28,7 @@ #' - "equivalence": between specified bounds #' - "minimal.effect": outside specified bounds #' @param mu a number or vector specifying the null hypothesis value(s): -#' - For standard alternatives (two.sided, less, greater): a single value (default: 0 for t-test/wilcox.test, 0.5 for brunner_munzel) +#' - For standard alternatives (two.sided, less, greater): a single value (default: 0) #' - For equivalence/minimal.effect: either a single value (symmetric bounds will be created) or a vector of two values representing the lower and upper bounds #' @param ... further arguments to be passed to or from the underlying test functions. #' @@ -52,14 +50,11 @@ #' the underlying test functions with some additional standardization in the output format. 
#' #' The interpretation of `mu` depends on the test used: -#' - For t-test and wilcox.test: mu represents the difference in means/medians (default: 0) -#' - For brunner_munzel: mu represents the probability that a randomly selected value from the first sample exceeds a randomly selected value from the second sample (default: 0.5) -#' +#' - For t-test: mu represents the difference in means (default: 0) +#' - For wilcox.test: mu represents the location shift (default: 0) #' #' If `mu` is a single value for equivalence or minimal effect alternatives, symmetric bounds -#' will be created automatically: -#' - For t-test and wilcox.test: bounds become c(mu, -mu) -#' - For brunner_munzel: bounds become c(mu, abs(mu-1)) +#' will be created automatically as c(mu, -mu). #' #' #' @return A list with class `"htest"` containing the following components: @@ -68,7 +63,7 @@ #' - parameter: the parameter(s) for the test statistic (e.g., degrees of freedom for t-tests). #' - p.value: the p-value for the test. #' - conf.int: a confidence interval appropriate to the specified alternative hypothesis. -#' - estimate: the estimated effect (e.g., mean difference for t-tests, probability estimate for Brunner-Munzel). +#' - estimate: the estimated effect (e.g., mean difference for t-tests). #' - null.value: the specified hypothesized value(s). For equivalence and minimal effect tests, this will be two values. #' - stderr: the standard error of the estimate (for t-tests). #' - alternative: a character string describing the alternative hypothesis. 
@@ -98,15 +93,6 @@ #' paired = TRUE, #' alternative = "greater")) #' -#' # Example 4: Brunner-Munzel test -#' # Testing if values in one group tend to exceed values in another -#' set.seed(123) -#' group1 <- rnorm(20, mean = 5, sd = 1) -#' group2 <- rnorm(20, mean = 6, sd = 2) -#' simple_htest(x = group1, y = group2, -#' test = "brunner_munzel", -#' alternative = "less") -#' #' @family TOST #' @family htest #' @name simple_htest @@ -145,6 +131,17 @@ simple_htest.default = function(x, ...) { alternative = match.arg(alternative) test = match.arg(test) + + # Deprecate brunner_munzel - direct users to use brunner_munzel() directly + if(test == "brunner_munzel") { + lifecycle::deprecate_warn( + when = "0.9.0", + what = "simple_htest(test = 'brunner_munzel')", + with = "brunner_munzel()", + details = "The Brunner-Munzel test has been removed from simple_htest(). Please use brunner_munzel() directly for full functionality including permutation tests and improved output labeling." + ) + } + if(is.null(mu)){ if(test == "brunner_munzel"){ mu = 0.5 diff --git a/junk/validate_brunner_munzel_vs_nparcomp.R b/junk/validate_brunner_munzel_vs_nparcomp.R new file mode 100644 index 0000000..58c5085 --- /dev/null +++ b/junk/validate_brunner_munzel_vs_nparcomp.R @@ -0,0 +1,404 @@ +# ============================================================================= +# Validation Script: TOSTER brunner_munzel vs nparcomp +# ============================================================================= +# +# This script validates that TOSTER's brunner_munzel() function produces +# results consistent with nparcomp::npar.t.test() and nparcomp:::npar.t.test.paired() +# when using test_method = "perm" and p_method = "original". 
+# + # Key differences between packages: + # - TOSTER estimate: P(X > Y) + 0.5*P(X = Y) + # - nparcomp estimate: P(X < Y) + 0.5*P(X = Y) = 1 - TOSTER estimate + # - Alternative mappings (p-values match when using SAME alternative name): + # - TOSTER "greater" (H1: P(X>Y) > 0.5) ↔ nparcomp "greater" (H1: P(X<Y) < 0.5, i.e., P(X>Y) > 0.5) + # Both give same p-value because they test "effect in same tail direction" + # - TOSTER "less" (H1: P(X>Y) < 0.5) ↔ nparcomp "less" (H1: P(X<Y) > 0.5) +# +# NOTE(review): a span of this script was lost in transit here (the remainder of the header comments, package loading, and the roxygen header/signature of the unpaired comparison helper). It has been reconstructed minimally from the paired counterpart below -- verify against the original file. +compare_unpaired <- function(x, y, alternative_toster = "two.sided", + tol = 0.02, nperm = 10000, seed = 123) { + + # Alternative mapping: TOSTER tests P(X > Y), nparcomp tests P(X < Y) + # Despite different parameterization, the p-values match when using + # the SAME alternative name because: + # - TOSTER "greater": H1: P(X>Y) > 0.5 → one-tailed test in upper direction + # - nparcomp "greater": H1: P(X<Y) < 0.5, i.e., P(X>Y) > 0.5 → same test, just different effect scale + # The permutation p-values are computed the same way + alternative_nparcomp <- alternative_toster + + # Create data frame for nparcomp + dat <- data.frame( + value = c(x, y), + group = factor(rep(c("x", "y"), c(length(x), length(y)))) + ) + + # Run TOSTER + set.seed(seed) + res_toster <- brunner_munzel(x, y, + alternative = alternative_toster, + test_method = "perm", + R = nperm, + p_method = "original") + + # Run nparcomp + set.seed(seed) + res_nparcomp <- suppressWarnings( + npar.t.test(value ~ group, data = dat, + alternative = alternative_nparcomp, + method = "permu", + nperm = nperm, + info = FALSE) + ) + + # Extract results + # nparcomp returns 3 rows: id (studentized), logit, probit + # We use "id" row (first row) which corresponds to TOSTER's approach + toster_est <- as.numeric(res_toster$estimate) + nparcomp_est <- res_nparcomp$Analysis$Estimator[1] # First row (id) + + toster_p <- res_toster$p.value + nparcomp_p <- res_nparcomp$Analysis$p.value[1] # First row (id) + + # Check estimate relationship: TOSTER = 1 - nparcomp + # Note: nparcomp rounds to 3 decimal places, so we use a tolerance of 1e-3 + est_match <- abs(toster_est - (1 - nparcomp_est)) < 1e-3 + + # Check p-values are close (allowing for MC variance) + p_match <- 
abs(toster_p - nparcomp_p) < tol + + list( + alternative_toster = alternative_toster, + alternative_nparcomp = alternative_nparcomp, + toster_estimate = toster_est, + nparcomp_estimate = nparcomp_est, + estimate_match = est_match, + toster_p = toster_p, + nparcomp_p = nparcomp_p, + p_diff = abs(toster_p - nparcomp_p), + p_match = p_match, + passed = est_match && p_match + ) +} + +#' Compare TOSTER and nparcomp results for paired test +#' @param x First sample (paired with y) +#' @param y Second sample (paired with x) +#' @param alternative_toster TOSTER alternative ("two.sided", "greater", "less") +#' @param tol Tolerance for numeric comparison +#' @param nperm Number of permutations +#' @param seed Random seed for reproducibility +compare_paired <- function(x, y, alternative_toster = "two.sided", + tol = 0.02, nperm = 10000, seed = 123) { + + # For PAIRED tests, the alternative mapping IS reversed because: + # - TOSTER tests P(X > Y) + # - nparcomp's npar.t.test.paired tests P(X < Y) + # So TOSTER "greater" (H1: P(X>Y) > 0.5) corresponds to nparcomp "less" (H1: P(XY)=0, P(XY) + # 3. Two-sided p-value formula: changed from min(2*p_less, 2*p_greater) to mean(|T_perm| >= |T_obs|) + # 4. Zero-variance handling: made consistent between observed and permuted statistics + # 5. Unique allocation sampling: fixed bm_perm_indices to ensure unique allocations + # + # With these fixes, TOSTER now matches the brunnermunzel package's permutation test results. 
+ + # Test case: completely separated data x = 1:3, y = 4:6 + # All x < all y, so P(X>Y) = 0 (estimate should be very small) + # With 20 unique allocations (choose(6,3)), exactly 2 have |T| >= |T_obs| + # So the two-sided p-value should be: + # - With p_method="original": 2/20 = 0.10 + # - With p_method="plusone": 3/21 ≈ 0.143 + + set.seed(42) + result <- brunner_munzel(1:3, 4:6, + test_method = "perm", + R = 1000, + p_method = "plusone") + + # Two-sided p-value with plusone should be (b+1)/(R+1) = 3/21 ≈ 0.143 + expect_true(result$p.value > 0.12 && result$p.value < 0.18, + info = paste("Two-sided p-value was", result$p.value, + "but should be approximately 0.143 with plusone method")) + + # P(X>Y) estimate should be very small (close to 0) + expect_true(result$estimate < 0.01, + info = paste("P(X>Y) estimate was", result$estimate, + "but should be close to 0")) + + # Test the opposite direction: y < x + set.seed(789) + result_rev <- brunner_munzel(4:6, 1:3, + test_method = "perm", + R = 1000, + p_method = "plusone") + + # Two-sided p-value with plusone should also be approximately 0.143 + expect_true(result_rev$p.value > 0.12 && result_rev$p.value < 0.18, + info = paste("reversed p-value was", result_rev$p.value, + "but should be approximately 0.143 with plusone method")) + + # P(X>Y) should be close to 1 in this case + expect_true(result_rev$estimate > 0.99, + info = paste("P(X>Y) estimate was", result_rev$estimate, + "but should be close to 1")) + + # Test with p_method="original" to verify exact match with brunnermunzel package + set.seed(42) + result_orig <- brunner_munzel(1:3, 4:6, + test_method = "perm", + R = 1000, + p_method = "original") + + # With p_method="original", p-value should be exactly 2/20 = 0.10 + # (matching brunnermunzel package's formula: count / total) + expect_equal(result_orig$p.value, 0.10, tolerance = 1e-10, + info = "p-value with original method should be exactly 0.10") + + # Test one-sided alternatives work correctly + # This is the 
critical test that catches the comparison operator bug! + # The bug was that b_less and b_greater were computed incorrectly, + # causing p_less and p_greater to be swapped for one-sided tests. + + set.seed(456) + result_greater <- brunner_munzel(1:3, 4:6, + test_method = "perm", + alternative = "greater", + R = 1000, + p_method = "plusone") + + # For "greater" alternative (H1: P(X>Y) > 0.5), with observed P(X>Y) ≈ 0, + # the p-value should be very large (close to 1) + expect_true(result_greater$p.value > 0.85, + info = paste("one-sided greater p-value was", result_greater$p.value, + "but should be close to 1")) + + set.seed(456) + result_less <- brunner_munzel(1:3, 4:6, + test_method = "perm", + alternative = "less", + R = 1000, + p_method = "plusone") + + # For "less" alternative (H1: P(X>Y) < 0.5), with observed P(X>Y) ≈ 0, + # the p-value should be very small + expect_true(result_less$p.value < 0.15, + info = paste("one-sided less p-value was", result_less$p.value, + "but should be small")) + + # CRITICAL: Verify p-values are NOT swapped (regression test for the bug) + # With the bug, "less" would give ~0.9 and "greater" would give ~0.1 + # The correct behavior is the opposite + expect_true(result_less$p.value < result_greater$p.value, + info = paste("p_less should be < p_greater, but got:", + "p_less =", result_less$p.value, + ", p_greater =", result_greater$p.value)) + + # Also test the reverse direction: x > y + set.seed(789) + result_greater_rev <- brunner_munzel(4:6, 1:3, + test_method = "perm", + alternative = "greater", + R = 1000, + p_method = "plusone") + + # For "greater" alternative with x > y, P(X>Y) ≈ 1, + # so we should reject in favor of P(X>Y) > 0.5 + expect_true(result_greater_rev$p.value < 0.15, + info = paste("reversed greater p-value was", result_greater_rev$p.value, + "but should be small")) + + set.seed(789) + result_less_rev <- brunner_munzel(4:6, 1:3, + test_method = "perm", + alternative = "less", + R = 1000, + p_method = "plusone") + + 
# For "less" alternative with x > y, P(X>Y) ≈ 1, + # so we should NOT reject in favor of P(X>Y) < 0.5 + expect_true(result_less_rev$p.value > 0.85, + info = paste("reversed less p-value was", result_less_rev$p.value, + "but should be close to 1")) + + # CRITICAL: Verify p-values are NOT swapped for reverse direction + expect_true(result_less_rev$p.value > result_greater_rev$p.value, + info = paste("reversed: p_less should be > p_greater, but got:", + "p_less =", result_less_rev$p.value, + ", p_greater =", result_greater_rev$p.value)) + + # Validation against brunnermunzel R package reference values ---- + # These tests verify that TOSTER's permutation test gives results consistent + + # with the brunnermunzel package's brunnermunzel.permutation.test function. + # + # Key differences between packages: + # - TOSTER estimate: P(X>Y) + 0.5*P(X=Y) + # - brunnermunzel estimate: P(XY)+.5*P(X=Y) = 1 - 0.2839506 = 0.7160494 + + set.seed(123) + result_hw <- brunner_munzel(x_hw, y_hw, test_method = "perm", R = 10000) + + # Estimate should match exactly (no Monte Carlo involved in estimate) + expect_equal(as.numeric(result_hw$estimate), 0.7160494, tolerance = 1e-6, + info = "Estimate should match brunnermunzel package") + + # P-value should be close to brunnermunzel's 0.158 (some MC variance expected) + expect_true(result_hw$p.value > 0.10 && result_hw$p.value < 0.25, + info = paste("p-value was", result_hw$p.value, + "but should be close to brunnermunzel's 0.158")) + + # Example from Brunner & Munzel (2000) / Neubert & Brunner (2007): + # Pain scores after surgery + Y_pain <- c(1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1) + N_pain <- c(3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4) + + # brunnermunzel.permutation.test reports: + # p-value = 0.008038 + # P(XY)+.5*P(X=Y) = 1 - 0.788961 = 0.211039 + + set.seed(456) + result_pain <- brunner_munzel(Y_pain, N_pain, test_method = "perm", R = 10000) + + # Estimate should match exactly + expect_equal(as.numeric(result_pain$estimate), 1 - 0.788961, 
tolerance = 1e-4, + info = "Estimate should match brunnermunzel package") + + # P-value should be close to brunnermunzel's 0.008 (highly significant) + expect_true(result_pain$p.value < 0.02, + info = paste("p-value was", result_pain$p.value, + "but should be close to brunnermunzel's 0.008")) })