ce/testing.qmd at main · lmaowisc/ce · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
# Hypothesis Testing

## Slides {.unnumbered}

Chapter slides [here](chap2.html){target="_blank"}. (To convert html to pdf, press E $\to$ Print $\to$ Destination: Save to pdf)

## R code {.unnumbered}

```{r}
#| code-fold: true
#| code-summary: "Show the code"
#| eval: false
##################################################################
# This code generates the numerical results in chapter 2         #
##################################################################

# load the survival package
library(survival)

# install and load the WR package for
# 1. dataset hfaction_cpx9;
# 2. function WRrec() for win ratio test (of recurrent events and death)
# 3. functions base() and WRSS() for sample size calculation
# install.packages("WR")
library(WR)
library(tidyverse) # for data wrangling (dplyr, ggplot2, etc.)

##### Read in HF-ACTION DATA ########
# same as rmt::hfaction used in chap 1
#  (except for status coding)
data(hfaction_cpx9)
hfaction <- hfaction_cpx9
head(hfaction)
#> Shows the first few rows of hfaction_cpx9 dataset

# count unique patients in each arm:
# keep one row per (arm, patient) pair, then count rows per arm.
# Passing both columns to distinct() avoids leaving a grouped tibble behind.
hfaction |>
  distinct(trt_ab, patid) |>
  count(trt_ab)
#> This gives the number of unique patients (patid) by treatment arm (trt_ab)

#### demo ############
# Win ratio analysis of death and recurrent hospitalizations, stratified
# by age60. With naive = TRUE the fit also carries the naive (NWR) and
# first-event (FWR) results extracted below, next to the last-event WR (LWR).
obj <- WRrec(
  ID = hfaction$patid,
  time = hfaction$time,
  status = hfaction$status,
  trt = hfaction$trt_ab,
  strata = hfaction$age60,
  naive = TRUE
)
# summary results
obj
#> Displays the main results, including win ratio estimates for each method.

# Two-sided Wald tests of log(WR) = 0 follow. Each p-value is computed as
# 2 * pnorm(-|z|) instead of 2 * (1 - pnorm(|z|)): the latter cancels to
# exactly 0 once |z| is large, while the lower-tail form keeps its precision.

# LWR
beta <- obj$log.WR  # log-win ratio for last-event WR (LWR)
se <- obj$se        # standard error for log(LWR)
# test
pval <- 2 * pnorm(-abs(beta / se))
pval
#> Two-sided p-value for LWR

# NWR
beta.naive <- obj$log.WR.naive  # log-win ratio for naive WR (NWR)
se.naive <- obj$se.naive        # its standard error
# test
pval.naive <- 2 * pnorm(-abs(beta.naive / se.naive))
pval.naive
#> Two-sided p-value for NWR

# FWR
beta.FI <- obj$log.WR.FI  # log-win ratio for first-event WR (FWR)
se.FI <- obj$se.FI        # standard error for log(FWR)
# test
pval.FI <- 2 * pnorm(-abs(beta.FI / se.FI))
pval.FI
#> Two-sided p-value for FWR

#################################
# Win ratio analyses: tabulate  #
#################################

data <- hfaction

### Build data.H1: for each patient keep only the earliest hospitalization
### record (status == 2) together with all non-hospitalization records

# hospitalization records, sorted by patient and time so that the first
# row of each patient is that patient's earliest hospitalization
hosp <- data[data$status == 2, ]
hosp <- hosp[order(hosp$patid, hosp$time), ]
first_hosp <- hosp[!duplicated(hosp$patid), ]

# append the remaining (status != 2) records and restore patient/time order
data.H1 <- rbind(first_hosp, data[data$status != 2, ])
data.H1 <- data.H1[order(data.H1$patid, data.H1$time), ]

# Build a summary table of the four win ratio analyses
# (PWR, NWR, FWR, and LWR) with their 95% CI and p-values.
#
# Arguments:
#   ind:  index (logical) for rows in the global data frame 'data'
#         (all events; used for NWR, FWR, and LWR)
#   ind1: index (logical) for rows in the global data frame 'data.H1'
#         (first hospitalization only; used for PWR)
#   r:    number of decimals for rounding the win ratio and its CI limits
#
# Returns a character matrix with rows "PWR", "NWR", "FWR", "LWR" and four
# unnamed columns: win percentage, loss percentage,
# "win ratio (lower, upper)" with 95% limits, and the p-value.
gwr.fun <- function(ind, ind1, r = 2) {

  # fit NWR, FWR, and LWR to original data (multiple events)
  obj <- WRrec(
    ID = data$patid[ind],
    time = data$time[ind],
    status = data$status[ind],
    trt = data$trt_ab[ind],
    strata = data$age60[ind],
    naive = TRUE
  )

  # fit the PWR row on the dataset with first hospitalization only
  obj1 <- WRrec(
    ID = data.H1$patid[ind1],
    time = data.H1$time[ind1],
    status = data.H1$status[ind1],
    trt = data.H1$trt_ab[ind1],
    strata = data.H1$age60[ind1],
    naive = FALSE
  )

  # critical value for a 95% confidence interval
  za <- qnorm(0.975)

  # Format one table row from a log-win ratio (beta), its standard error (se),
  # and the win/loss proportions (theta[1] = win, theta[2] = loss).
  # The p-value is the 1-df chi-square test of (beta / se)^2.
  wr_row <- function(beta, se, theta) {
    c(
      paste0(round(100 * theta[1], 1), "%"),  # Win
      paste0(round(100 * theta[2], 1), "%"),  # Loss
      paste0(
        round(exp(beta), r), " (",
        round(exp(beta - za * se), r), ", ",
        round(exp(beta + za * se), r), ")"
      ),
      round(1 - pchisq((beta / se)^2, 1), 3)
    )
  }

  # Combine rows into a single table (same order as the row labels)
  result <- rbind(
    wr_row(obj1$log.WR, obj1$se, obj1$theta),                 # PWR
    wr_row(obj$log.WR.naive, obj$se.naive, obj$theta.naive),  # NWR
    wr_row(obj$log.WR.FI, obj$se.FI, obj$theta.FI),           # FWR
    wr_row(obj$log.WR, obj$se, obj$theta)                     # LWR
  )
  rownames(result) <- c("PWR", "NWR", "FWR", "LWR")

  result
}

# Create table: one gwr.fun() call per age stratum plus one for all patients;
# the row selectors for 'data' and 'data.H1' are passed directly
## Age <= 60 years
result.lt60 <- gwr.fun(data$age60 == 0, data.H1$age60 == 0, r = 2)

## Age > 60 years
result.ge60 <- gwr.fun(data$age60 == 1, data.H1$age60 == 1, r = 2)

## overall
result.all <- gwr.fun(rep(TRUE, nrow(data)), rep(TRUE, nrow(data.H1)), r = 2)

# stack the three blocks and label the columns
results <- rbind(result.lt60, result.ge60, result.all)
colnames(results) <- c("Win", "Loss", "Win ratio (95% CI)", "p-value")
noquote(results)
#> Final table of all 4 measures (PWR, NWR, FWR, LWR) across strata.

############################################################################
#               Sample size calculation                                    #
############################################################################

# training arm records (trt_ab == 1) serve as the pilot data
pilot <- filter(hfaction, trt_ab == 1)
# number of subjects
count(distinct(pilot, patid))
#> This indicates how many unique subjects were in the training arm

############## estimate parameters ##############
# Estimate the baseline parameters lambda_D, lambda_H, and kappa
# from the pilot data. Time is divided by 12 before fitting
# (presumably months -> years, so that the rates are per year -- confirm units).
outcome_base <- gumbel.est(pilot$patid, pilot$time / 12, pilot$status)

lambda_D <- outcome_base$lambda_D
lambda_H <- outcome_base$lambda_H
kappa <- outcome_base$kappa

lambda_D
lambda_H
kappa
#> Baseline hazards for death/hospitalization and the gumbel 'kappa' parameter

## Kendall's rank correlation
1 - 1 / kappa
#> [1] 0.360812
#> Kendall's rank correlation between death and hospitalization implied by kappa

### demo ###################
# set design parameters (named as in the chapter text)
tau_b <- 3   # accrual duration (presumably in years, like tau -- confirm)
tau <- 4     # total follow-up (in years)
lambda_L <- 0.01  # random loss-to-follow-up rate

# use base() function to compute zeta2 and delta
## may take up to 30s
bparam <- base(lambda_D, lambda_H, kappa, tau_b, tau, lambda_L)
#> bparam carries the design-specific quantities that WRSS() needs
#> for sample size calculations

# compute sample size under HRs 0.9 and 0.8
# for death and nonfatal event, respectively
# (xi is given death first, matching the thetaD/thetaH grid further down)
obj <- WRSS(
  xi = log(c(0.9, 0.8)),
  bparam = bparam,
  q = 0.5,
  alpha = 0.05,
  power = 0.8
)
obj$n
#> The required sample size for the given HRs at 80% power

## effect sizes: a grid of hazard ratios for each component
thetaD <- seq(0.6, 0.95, by = 0.05) # hazard ratio for death
thetaH <- seq(0.6, 0.95, by = 0.05) # hazard ratio for hospitalization

## "SS08" holds the sample size powered at 80%:
## rows are labelled by thetaD, columns by thetaH
mD <- length(thetaD)
mH <- length(thetaH)
SS08 <- matrix(NA, mD, mH)
rownames(SS08) <- thetaD
colnames(SS08) <- thetaH

## one WRSS() call per (death HR, hospitalization HR) combination
for (d in seq_len(mD)) {
  for (h in seq_len(mH)) {
    ## log-hazard ratios, death first and hospitalization second
    log_hr <- log(c(thetaD[d], thetaH[h]))
    ss <- WRSS(
      xi = log_hr,
      bparam = bparam,
      q = 0.5,
      alpha = 0.05,
      power = 0.8
    )
    SS08[d, h] <- ss$n
  }
}
## print the calculated sample sizes
print(SS08)
#> Shows how sample size changes under different hazard ratios for death/hosp

## repeating the same calculation for power = 90%
SS09 <- matrix(NA, mD, mH)
rownames(SS09) <- thetaD
# Label the columns with the hospitalization hazard ratios.
# This must be the assignment `<-`: written as `< -thetaH` it is parsed as a
# comparison with -thetaH and leaves SS09 without column names.
colnames(SS09) <- thetaH

## fill in the computed sample size values
for (i in seq_len(mD)) {
  for (j in seq_len(mH)) {
    ## sample size under hazard ratios thetaD[i] for death
    ## and thetaH[j] for hospitalization
    SS09[i, j] <- WRSS(
      xi = log(c(thetaD[i], thetaH[j])),
      bparam = bparam,
      q = 0.5,
      alpha = 0.05,
      power = 0.9
    )$n
  }
}
## print the calculated sample sizes
print(SS09)
#> Sample sizes under 90% power requirements

# Draw the two sample-size surfaces side by side.
# par() returns the previous value of the settings it changes, so the old
# layout is captured here and restored once both panels are drawn.
oldpar <- par(mfrow = c(1, 2))

# Sample sizes are divided by 1000 for plotting, hence the "10^3" axis label
# ("10e3" would denote 10,000).
persp(
  thetaD, thetaH, SS08 / 1000,
  theta = 50, phi = 15, expand = 0.8, col = "gray",
  ltheta = 180, lphi = 180, shade = 0.75,
  ticktype = "detailed",
  xlab = "\n HR on Death", ylab = "\n HR on Hospitalization",
  zlab = "\n Sample Size (10^3)",
  main = "Power = 80%",
  zlim = c(0, 26)
)
#> 3D perspective plot of sample size (in thousands) for power=80%
#> over various hazard ratios for death/hosp

persp(
  thetaD, thetaH, SS09 / 1000,
  theta = 50, phi = 15, expand = 0.8, col = "gray",
  ltheta = 180, lphi = 180, shade = 0.75,
  ticktype = "detailed",
  xlab = "\nHR on Death", ylab = "\nHR on Hospitalization",
  zlab = "\n Sample Size (10^3)",
  main = "Power = 90%",
  zlim = c(0, 26)
)
#> Similar 3D perspective for power=90%

# restore the plotting layout that was in effect before the two panels
par(oldpar)

```

$$\newcommand{\d}{{\rm d}}
\newcommand{\T}{{\rm T}}
\newcommand{\dd}{{\rm d}}
\newcommand{\cc}{{\rm c}}
\newcommand{\pr}{{\rm pr}}
\newcommand{\var}{{\rm var}}
\newcommand{\se}{{\rm se}}
\newcommand{\indep}{\perp \!\!\! \perp}
\newcommand{\Pn}{n^{-1}\sum_{i=1}^n}
\newcommand\mymathop[1]{\mathop{\operatorname{#1}}}
\newcommand{\Ut}{{n \choose 2}^{-1}\sum_{i<j}\sum}
\def\a{{(a)}}
\def\b{{(1-a)}}
\def\t{{(1)}}
\def\c{{(0)}}
\def\d{{\rm d}}
\def\T{{\rm T}}
\def\bs{\boldsymbol}
$$

## Win Ratio: Definitions and Properties

The win ratio (WR) is a pairwise, nonparametric method for comparing composite outcomes in a prioritized manner. Each subject in the treatment arm is compared to each subject in the control arm over a shared follow-up window. A win is declared if the treated subject experiences a better outcome—typically defined as longer survival or, if tied on survival, fewer or later nonfatal events.

Formally, let $D_i^{(a)}, T_i^{(a)}, C_i^{(a)}$ denote the survival, hospitalization, and censoring times of the $i$th subject in group $a$ $(i=1,\ldots, N_a; a= 1, 0)$.
Then the win fraction for group $a$ versus $1-a$ is
$$
\hat w^{(a, 1-a)} = \frac{1}{N_1N_0} \sum_{i=1}^{N_a} \sum_{j=1}^{N_{1-a}} \hat w^{(a, 1-a)}_{ij},
$$
where
\begin{align}
            \hat w^{(a, 1-a)}_{ij}&= \underbrace{I(D_j^{(1-a)}< D_i^{(a)}\wedge C_i^{(a)}\wedge C_j^{(1-a)})}_{\mbox{win on survival}}\\
            & + \underbrace{I(\min(D_i^{(a)}, D_j^{(1-a)}) > C_i^{(a)}\wedge C_j^{(1-a)}, T_j^{(1-a)}< T_i^{(a)}\wedge C_i^{(a)}\wedge C_j^{(1-a)})}_{\mbox{tie on survival, win on hospitalization}}
    \end{align}


The win ratio statistic is defined as
$$
\text{WR} = \frac{\hat w^{(1, 0)}}{\hat w^{(0, 1)}}.
$$

Alternative summaries include the **net benefit** (proportion in favor) and the **win odds**, which account for ties and interpret the WR on a log-odds scale. In the special case of binary outcomes, the WR reduces to the odds ratio, and the net benefit corresponds to the risk difference.

## Hypothesis Testing and Interpretation

Statistical inference for the WR is based on the asymptotic normality of the log win ratio:
$$
S_n = \frac{\sqrt{n} \log(\hat w^{(1, 0)} / \hat w^{(0, 1)})}{\hat{\rm SE}} \sim N(0, 1),
$$
under the null hypothesis that treatment and control are equally effective. The standard error $\hat{\rm SE}$ is calculated via $U$-statistics, and the null corresponds to equal bivariate survival functions $H^{(1)}(s, t) = H^{(0)}(s, t)$ for all $t \le s$.

The alternative hypothesis assumes that the probability of a treatment win exceeds that of a control win at all times. A sufficient condition is that the joint distribution of death and nonfatal events under treatment is stochastically larger than under control.

## Extensions and Variants

Several extensions of the WR improve its flexibility and robustness. Weighted comparisons allow later follow-up times to contribute more to the test statistic, leading to log-rank-type weights. Stratified WR methods compare subjects within strata to adjust for confounding and increase efficiency.

## Generalization to Recurrent Events

When subjects experience multiple nonfatal events, a generalized WR can be defined using a win function $\mathcal{W}(\cdot, \cdot)(t)$ that compares event histories up to time $t$. The generalized win ratio statistic becomes
$$
\hat{\mathcal E}_n(\mathcal W) = \frac{\sum \mathcal W(\mathcal H^{*(1)}_i, \mathcal H^{*(0)}_j)}
     {\sum \mathcal W(\mathcal H^{*(0)}_j, \mathcal H^{*(1)}_i)},
$$
where comparisons are made up to the minimum of each pair’s observed follow-up time.

Three versions of recurrent-event win functions are commonly used:

- **Naive WR**: prioritize death, then total number of events
- **First-event WR**: further prioritize time to first event
- **Last-event WR**: further prioritize time to last event

All versions enforce a win/loss/tie rule based on observable history, and guarantee that win status is unchanged after death. Among them, the last-event WR (LWR) has shown superior power in simulation studies due to fewer ties and better separation between subjects.

## HF-ACTION Example

In the HF-ACTION trial, recurrent-event WR methods were applied to a high-risk subgroup of 426 patients. Compared to usual care, exercise training reduced both mortality and hospitalizations. The last-event WR was estimated as 1.32 (95% CI: 1.05–1.66, *p* = 0.019), with similar results from first-event and naive WRs.

![](images/test_hfaction_wr.png){fig-align="center" width="70%"}

These results illustrate that recurrent-event WRs detect benefits more effectively than traditional time-to-first-event WRs, especially when the number and timing of nonfatal events vary widely across subjects.

## Sample Size Calculation

Sample size calculation for WR is based on a joint model for death and nonfatal events. Under a Gumbel–Hougaard copula model, the joint survival function is
$$
        \pr(D^\a>s, T_1^\a>t) = \exp\left(-\left[\{\exp(a\xi_D)\lambda_Ds\}^\kappa + \{\exp(a\xi_H)\lambda_Ht\}^\kappa \right]^{1/\kappa}\right)
$$
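where $\lambda_D$ and $\lambda_H$ are the baseline hazard rates for death and hospitalization, $\xi_D$ and $\xi_H$ are the corresponding log-hazard ratios for treatment ($a = 1$) versus control ($a = 0$), and $\kappa$ controls the dependence between outcomes.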

The required sample size is
$$
n = \frac{\zeta_0^2(z_{1-\alpha/2} + z_\gamma)^2}{q(1-q)(\delta^\top \xi)^2},
$$
where:

- $\zeta_0^2$ is the variance of the test statistic under the null,
- $\delta$ is the sensitivity of the log-WR to changes in log-HRs $\xi$,
- $q$ is the treatment allocation proportion.

Design parameters include the accrual duration $\tau_b$, total follow-up time $\tau$, and a random loss-to-follow-up rate $\lambda_L$.

## HF-ACTION: Planning a New Trial

Using HF-ACTION training arm data as historical reference, the estimated baseline rates were:

- $\lambda_D = 0.073$ deaths/year
- $\lambda_H = 0.56$ hospitalizations/year
- Kendall’s correlation = 36.1%

Assuming HRs of 0.9 (death) and 0.8 (hospitalization), and minimal loss to follow-up, the sample size required for 80% power is approximately $n = 1241$.

![](images/test_hfaction_ss.png){fig-align="center" width="90%"}

## Example R code

The following example uses a data frame `df` in long format with columns:

-   `id`: subject identifier\
-   `time`: event or censoring time\
-   `status`: event type (0 = censoring, 1 = death, 2 = nonfatal)\
-   `trt`: treatment group (0 = control, 1 = treatment)\
-   `strata`: optional stratification variable

```{r}
#| eval: false
########################################
# 1. WR test for recurrent events
########################################
library(WR)

# Run the win ratio analysis on the long-format data frame `df`
obj <- WRrec(
  ID = df$id,
  time = df$time,
  status = df$status,
  trt = df$trt,
  strata = df$strata,
  naive = TRUE
)

# Log win ratio and its standard error, then the printed summary
obj$log.WR
obj$se
print(obj)


########################################
# 2. Sample size calculation
########################################
# Step 1: Estimate baseline outcome parameters
base_est <- gumbel.est(id = df$id, time = df$time, status = df$status)

# Step 2: Design-specific noise and signal
# The design parameters are passed to base() by position, in the same order
# as the chapter code above (accrual duration, total follow-up,
# loss-to-follow-up rate), so the call does not depend on whether the
# accrual argument is labelled tau_b or tau_c.
# NOTE(review): presumably all three must use the same time unit as df$time.
tau_b <- 3         # accrual duration
tau <- 4           # total follow-up time
lambda_L <- 0.001  # random loss-to-follow-up rate
bparam <- base(base_est$lambda_D, base_est$lambda_H, base_est$kappa,
               tau_b, tau, lambda_L)

# Step 3: Required sample size for given effect sizes
# (hazard ratios 0.9 for death and 0.8 for the nonfatal event)
WRSS(xi = log(c(0.9, 0.8)), bparam = bparam, q = 0.5, power = 0.8)$n

```

## Conclusion

The win ratio test provides a clinically meaningful and statistically valid method for analyzing prioritized composite outcomes. By extending to recurrent events and general win functions, the WR accommodates complex patient trajectories while preserving interpretability. Among several extensions, the last-event WR offers strong separation and improved power in trials with repeated nonfatal events.

Sample size formulas grounded in copula models enable principled trial design. The WR package supports both estimation and power analysis, facilitating the broader adoption of win-based strategies in modern clinical research.