CytofWorkflow/workflow.Rmd at master · BlishLab/CytofWorkflow · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# CyTOF data analysis

```{r setup, echo = FALSE}
library(knitr)
opts_chunk$set(dpi = 100, fig.width = 7, fig.height = 4, cache = TRUE,
               cache.path = "cytoffinal/")
```

## Dimensionality reduction

In this section, we will show an example of how to use R to create an
interpretable representation of the samples in a CyTOF dataset. The
idea is to "featurize" each sample in a useful way: in this analysis,
we will partition the receptor space into 200 elements and use as
features the fraction of cells in each sample which fall into each
element of the partition. This gives us a count matrix with the number
of cells in each sample falling into each element of the partition,
and we use correspondence analysis to get a low-dimensional
representation of this matrix.

To get some intuition into the procedure, note that if the elements of
the partition were all of the same size, the Euclidean distance
between the vectors containing the fraction of each sample falling in
each element of the partition would be a discrete approximation of the
L2 divergence between distributions. We will use k-means to create the
partition, which means that the elements of the partition will not be
of the same size, but we can think of it as a variation on the L2
divergence which upweights regions of receptor space which have higher
average density.


First we load the required packages and read in our data.
```{r libraries}
set.seed(0)
library(flowCore)
library(flexclust)
library(ade4)
library(stringr)
library(viridis)
library(ggrepel)
library(ggplot2)
fs = readRDS("data/flow_list_asinh.rds")
expdata = readRDS("data/expdata.rds")
```

We do not want to use the results from donors 1 and 4, so we exclude
them from the dataset.
```{r excludesamples}
# exclude experiments 1 and 4
exp14 = which(expdata$Exp %in% c("Exp1", "Exp4"))
fs =  fs[-exp14]
expdata = expdata[-exp14,]
```

Next we rename some of the variables (this is just for so the plots
have the right labels).
```{r rename}
expdata$Stim = as.character(expdata$Stim)
expdata$Stim[expdata$Stim == "uns"] = "unstimulated"
expdata$Stim[expdata$Stim == "magic"] = "IL12/IL15/IL18"
expdata$Donor = sapply(expdata$Exp, function(x) str_replace(x, "Exp", "Donor "))
```

Finally, we exclude CD4 and KIR2DL5 for technical reasons, and we
exclude the functional markers for the purpose of this analysis. The
same analysis can also be performed with the functional markers
included.
```{r excludemarkers}
excludedMarkers = c("CD4", "KIR2DL5")
functionalMarkers = c("INFg", "TRAIL", "MIP1b", "GranzymeB", "CD107a", "GM.CSF", "TNFa", "Perforin")
fs = lapply(fs, function(x) x[,!(colnames(x) %in% excludedMarkers)])
fs = lapply(fs, function(x) x[,!(colnames(x) %in% functionalMarkers)])
```


Next we down-sample the cells so that when we perform k-means we are
not over-weighting samples which have larger numbers of cells. We
chose to downsample to 500 purely to save time, but increasing this
number has almost no effect on the outcome.
```{r downsample}
sampledCells = lapply(fs, function(x) x[sample(nrow(x), 500), ])
sampledCells = Reduce(rbind, sampledCells)
```

Next we perform k-means on the downsampled cells.
```{r kmeans}
nclust = 200
out.km = kmeans(sampledCells, centers = nclust, iter.max = 50)
out.fc = as.kcca(out.km, sampledCells)
```

Finally, we assign each of the cells in the full data set to one of
the clusters and count how many cells correspond to each cluster. This
gives us a count matrix called cluster.sizes.
```{r fullclusters}
full.clusters = lapply(fs, function(x) predict(out.fc, newdata = x))
cluster.sizes = t(sapply(full.clusters, function(x) {
    sizes = table(x)
    sizesLong = sizes[as.character(1:nclust)]
    sizesLong[is.na(sizesLong)] = 0
    return(sizesLong)
}))
rownames(cluster.sizes)= rownames(expdata)
```

Finally, we can perform correspondence analysis and make some
plots. The first thing to look at is the scree plot which shows us the
fraction of the inertia explained by each axis.
```{r scree, fig.width = 8, fig.height = 5}
out.coa = dudi.coa(cluster.sizes, scannf = FALSE, nf = 4)
plot(out.coa$eig, type = 'h')
```

Next we can look at the projections of the sample points onto the
principal axes.
```{r coasamples}
axis1percent = round(out.coa$eig[1] / sum(out.coa$eig) * 100, digits = 1)
axis2percent = round(out.coa$eig[2] / sum(out.coa$eig) * 100, digits = 1)
axis3percent = round(out.coa$eig[3] / sum(out.coa$eig) * 100, digits = 1)
(p1 = ggplot(data.frame(out.coa$li, expdata)) +
    geom_point(aes(x = Axis1, y = Axis2, color = Stim)) +
        xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
        ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
ggplot(data.frame(out.coa$li, expdata)) +
    geom_point(aes(x = Axis2, y = Axis3, color = Donor)) +
    xlab(paste("Axis 2: ", axis2percent, "%", sep = "")) +
    ylab(paste("Axis 3: ", axis3percent, "%", sep = ""))
```

To get an idea of whether the axes are easily interpretable in terms
of a single receptor, we can look at the correlation between the
cluster centers and the axes.
```{r coacors}
cors = cor(out.fc@centers, out.coa$co)
cors = data.frame(cors)
cors$name = rownames(cors)
(p2 = ggplot(cors, aes(x = Comp1, y = Comp2, label = name)) +
    geom_text_repel( size = 2) + geom_point(size = 1) +
        ylim(c(-.7, .5)) + xlim(c(-.95, .3)) +
            xlab("Correlation with axis 1") + ylab("Correlation with\naxis 2"))
```


We see from the plot above that CD16 and CD69 are fairly strongly
associated with the first and second axes, respectively. To validate
this, we plot the projections of the clusters onto the principal
plane, coloring the clusters by their average intensity of CD16 or
CD69.
```{r coavariables}
(p3 = ggplot(data.frame(out.coa$co, out.fc@centers)) +
    geom_point(aes(x = Comp1, y = Comp2, color = CD16)) + scale_color_viridis() +
    xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
    ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
(p4 = ggplot(data.frame(out.coa$co, out.fc@centers)) +
    geom_point(aes(x = Comp1, y = Comp2, color = CD69)) + scale_color_viridis() +
    xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
    ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
```


```{r print_figures, echo = FALSE, eval = FALSE}
no.axis.labels = theme(axis.text.y = element_blank(), axis.ticks.y = element_blank(),
    axis.text.x = element_blank(), axis.ticks.x = element_blank(),
    strip.text.x = element_text(size = 6))
## add the eigenvalues to this plot
cairo_pdf("cluster_coa_nofunctional.pdf", family = "arial", width = 9, height = 5)
multiplot(p1, p3, p2, p4, cols = 2)
dev.off()
cairo_pdf("cluster_coa_nofunctional_eig.pdf", family = "arial", width = 4, height = 2.5)
qplot(x = 1:length(out.coa$eig), xend = 1:length(out.coa$eig),
      y = out.coa$eig / sum(out.coa$eig), yend = 0, geom = "segment") +
          ylab("Fraction of inertia") +  xlab("Component")
dev.off()
```


Next we can look at the results of the same analysis with the
functional markers included.

We re-read in the data:
```{r dataimportfunctional}
set.seed(0)
fs = readRDS("../cytof_stims/flow_list_asinh.rds")
expdata = readRDS("../cytof_stims/expdata.rds")
# exclude experiments 1 and 4
exp14 = which(expdata$Exp %in% c("Exp1", "Exp4"))
fs =  fs[-exp14]
expdata = expdata[-exp14,]
expdata$Stim = as.character(expdata$Stim)
expdata$Stim[expdata$Stim == "magic"] = "IL12/IL15/IL18"
expdata$Stim[expdata$Stim == "uns"] = "unstimulated"
expdata$Donor = sapply(expdata$Exp, function(x) str_replace(x, "Exp", "Donor "))

excludedMarkers = c("CD4", "KIR2DL5")
fs = lapply(fs, function(x) x[,!(colnames(x) %in% excludedMarkers)])
```

Then we downsample, perform k means, and make the cluster size
matrix.
```{r clusteringfunctional}
sampledCells = lapply(fs, function(x) x[sample(nrow(x), 500), ])
sampledCells = Reduce(rbind, sampledCells)
nclust = 200
out.km = kmeans(sampledCells, centers = nclust, iter.max = 50)
out.fc = as.kcca(out.km, sampledCells)
full.clusters = lapply(fs, function(x) predict(out.fc, newdata = x))
cluster.sizes = t(sapply(full.clusters, function(x) {
    sizes = table(x)
    sizesLong = sizes[as.character(1:nclust)]
    sizesLong[is.na(sizesLong)] = 0
    return(sizesLong)
}))
rownames(cluster.sizes)= rownames(expdata)
```


And finally, we again perform correspondence analysis and make some
plots.
```{r coafunctional}
out.coa = dudi.coa(cluster.sizes, scannf = FALSE, nf = 6)
cors = cor(out.fc@centers, out.coa$co)
cors = data.frame(cors)
cors$name = rownames(cors)
axis1percent = round(out.coa$eig[1] / sum(out.coa$eig) * 100, digits = 1)
axis2percent = round(out.coa$eig[2] / sum(out.coa$eig) * 100, digits = 1)
plot(out.coa$eig, type = 'h')

(p1 = ggplot(data.frame(out.coa$li, expdata)) +
    geom_point( aes(x = Axis1, y = Axis2, color = Stim)) +
        xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
            ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
(p2 = ggplot(cors, aes(x = Comp1, y = Comp2, label = name)) +
    geom_text_repel(size = 2) + geom_point(size = 1) +
        xlim(c(-.9,1)) + ylim(c(-.55, .5)) +
            xlab("Correlation with axis 1") + ylab("Correlation with\naxis 2"))

(p3 = ggplot(data.frame(out.coa$co, out.fc@centers)) +
    geom_point(aes(x = Comp1, y = Comp2, color = INFg)) +
        scale_color_viridis(guide = guide_legend(title = "IFN-g"))  +
            xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
                ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
(p4 = ggplot(data.frame(out.coa$co, out.fc@centers)) +
    geom_point(aes(x = Comp1, y = Comp2, color = MIP1b)) +
        scale_color_viridis(guide = guide_legend(title = "MIP-1b")) +
            xlab(paste("Axis 1: ", axis1percent, "%", sep = "")) +
                ylab(paste("Axis 2: ", axis2percent, "%", sep = "")))
```


## Testing

We can use the features from the previous section (the number or
fraction of cells in each sample falling in each section of the
partitioned receptor space) to do testing. Here we apply the
Friedman-Rafsky test, which is a two-sample multivariate test based on
distances. For each sample, we have a vector containing the proportion
of cells falling in each section of the space, and we use as a
distance on the samples the distance between those vectors. After
computing these distances, we compute a minimum spanning tree over the
samples using the distances between the proportion vectors. The
Friedman-Rafsky test statistic is the number of pure edges in this
tree (the number of edges connecting two samples in the same class),
and the null distribution is approximated by permutation.

First we get the cluster proportions from the cluster sizes and define
the function that will do our test for us.
```{r frsetup}
library(igraph)
cluster.proportions = t(apply(cluster.sizes, 1, function(x) x / sum(x)))


fr_test = function(types) {
    d = dist(cluster.proportions[expdata$Stim %in% types,])
    g = graph.adjacency(as.matrix(d), weighted = TRUE, mode = "undirected", diag = FALSE)
    mst = minimum.spanning.tree(g)
    el = get.edgelist(mst)

    typelist = el
    typelist[,1] = expdata[el[,1], "Stim"]
    typelist[,2] = expdata[el[,2], "Stim"]
    npure = sum(apply(typelist, 1, function(x) x[1] == x[2]))

    niter = 1999
    npureNull = numeric(niter)
    permexpdata = subset(expdata, expdata$Stim %in% types)
    for(i in 1:niter) {
        for(ex in unique(permexpdata$Exp)) {
            idx = which(permexpdata$Exp == ex)
            permexpdata[idx,"Stim"] = sample(permexpdata[idx, "Stim"], size = length(idx))
        }
        typelistPerm = el
        typelistPerm[,1] = permexpdata[el[,1], "Stim"]
        typelistPerm[,2] = permexpdata[el[,2], "Stim"]
        npurePerm = sum(apply(typelistPerm, 1, function(x) x[1] == x[2]))
        npureNull[i] = npurePerm
    }


    qplot(npureNull, geom = "histogram")
    mean(c(npureNull, npure) >= npure)
    return(list(pval = mean(c(npureNull, npure) >= npure),
                null = npureNull, npure = npure))
}
```


Then we can use this function to test every pair of conditions.
```{r frtest}
fr_test(c("unstimulated", "IL12/IL15/IL18"))$pval
fr_test(c("unstimulated", "IL15"))$pval
fr_test(c("unstimulated", "IL2"))$pval
fr_test(c("IL12/IL15/IL18", "IL15"))$pval
fr_test(c("IL12/IL15/IL18", "IL2"))$pval
fr_test(c("IL15", "IL2"))$pval
```