-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnextflow.config
More file actions
executable file
·420 lines (330 loc) · 16.3 KB
/
nextflow.config
File metadata and controls
executable file
·420 lines (330 loc) · 16.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// ---------------------------------------------------------------------------------- //
// Nextflow / executor settings
// ---------------------------------------------------------------------------------- //
conda.enabled = true
singularity.enabled = false
manifest.mainScript = "main.nf"
// Execution profiles, selected at runtime with: nextflow run ... -profile <name>
profiles {
local { includeConfig 'conf/local.config'}
lsf { includeConfig 'conf/lsf.config'}
lsf_ignore_errors { includeConfig 'conf/lsf_ignore_errors.config'}
}
// Allow the execution report and trace files to be overwritten on re-runs
report.overwrite=true
trace.overwrite=true
// ---------------------------------------------------------------------------------- //
// Process labels
// ---------------------------------------------------------------------------------- //
includeConfig 'conf/processes.config'
// ---------------------------------------------------------------------------------- //
// Plugins & Schema
// ---------------------------------------------------------------------------------- //
plugins {
id 'nf-schema@2.4.1'
}
// nf-schema parameter validation settings
validation {
help {
enabled = true
}
// Abort the run when parameters not declared in the schema are supplied
failUnrecognisedParams=true
}
// ---------------------------------------------------------------------------------- //
// Parameters
// ---------------------------------------------------------------------------------- //
params {
// -------------------------------------------------------------------------
// Global run options
// Path to the manifest
rn_manifest=null
// The publish folder for the results
rn_publish_dir=null
// Use node scratch storage instead of workdir (generally ok, set to false when debugging)
rn_scratch=false
// Run name, defaults to a random 5-character lowercase hash
rn_runname=(UUID.randomUUID().toString().replaceAll('-', '')[0..4]).toLowerCase()
// Used to fetch ensembl ids to gene names and for generating magma geneloc file
rn_ensembl_version="115"
// Container and conda settings
// container is currently not in use
rn_container = ""
rn_conda = "/software/conda/users/ob7/sc-blipper"
// -------------------------------------------------------------------------
// Parameters for the conversion processes
convert {
label="medium"
// Output namespace, one of 'gene_name' or 'ensembl_id'
output_namespace='gene_name'
// Ensembl file to use for gene name conversion and biotype filtering, if null this is automatically downloaded from ensembl biomart
// If this is set, id_linker must also be set
// Should be a .tsv with columns
// ensembl_gene_id hgnc_symbol gene_biotype chromosome_name start_position end_position strand external_gene_name source final_gene_name
ensembl_file=null
// TSV file for gene ID conversion, if null this is automatically downloaded from ensembl biomart
// Should be a 2 column tsv file with old new
// 'new' should be ensembl ids if you are doing gene set enrichments downstream
// if null, need an internet connection
// For 10x could use features.tsv.gz, to convert: zcat features.tsv.gz | awk '{print $2"\t"$1}' > id_linker.tsv
id_linker=null
// File with gene names, one per row, need to be gene names in the original id type
// I.e. if gene names are converted, will convert these as well, prior to subsetting
// This overrides convert.biotype_filter if set
subset_genes=null
// An egrep compatible pattern to select biotypes to include from ensembl
// e.g. "protein_coding|lncRNA"
// For cNMF should generally be "protein_coding" or left null to include all
// This is overridden by convert.subset_genes if that is set
// To list biotypes run this on the ensembl file in the results folder
// cat v114_ensembl.tsv | awk -F '\t' '{print $3}' | sort | uniq -c
biotype_filter=null
}
// --------------------------------------------------------------------------
// Parameters for batch correction
preprocess {
// Batch correction
// Options: null, 'harmony', 'scvi'
batch_correction=null
// Number of variable genes to run scVI / harmony / cNMF with
n_variable=2000
harmony {
// Correct counts for these variables using the cnmf version of harmony.
// To treat each input file as a batch to correct, use variable 'orig_h5ad'
// Leave at null to not apply the correction
// Comma separated string e.g. "Donor,Batch,orig_h5ad"
harmony_vars=null
}
scvi {
label="gpu_medium"
container=""
conda="/software/conda/users/ob7/sc-blipper-scvi"
// Seed for reproducibility, set to null for random
seed=314
// Batch variable. Must be set if preprocess.batch_correction is 'scvi'
// If you want to use the file of origin h5ads as batches, set to 'orig_h5ad'
// If you have multiple variables determining a batch supply the column names
// as a string separated by space: 'orig_h5ad donor'
// Corrected counts are normalized to the first batch observed.
batch_key=null
// Categorical and continuous covariates to include in the scVI model, comma separated string of column names in adata.obs
// Can be left null to not include any. Note these are things removed in the latent space, but not in the corrected data.
cat_covariates=null
cont_covariates=null
// Number of latent dimensions for scVI
n_latent=10
// Max number of training epochs for scVI
epochs=400
// Denoise tp10k
// This will generate a denoised version of the data, which is then scaled to a fixed library size of 1e4, which is used for inferring the cNMF usages.
// cNMF by default infers usages on the raw tp10k data, and only the spectra are fit on the batch corrected data.
// This option saves the scVI denoised tp10k instead, but results will only be available for the HVGs fit in the scVI model, and not the full gene set.
denoise_tp10k=false
// Pre-trained scVI model path, if provided skips training. Must have been trained on the same h5ad
// Can be useful to re-use models across multiple runs
model_path=null
// Skip saving the unscaled scVI h5ad file (this is the file you want to use for anything other than cnmf)
// It's not needed for cNMF, but if you want to save space, set to true.
skip_scvi_h5ad=false
// Skip early stopping during training
skip_early_stopping=false
// Skip generating UMAPs and saving plots
skip_plot=false
}
}
// -------------------------------------------------------------------------
// Parameters for the merge h5ad processes
merge {
label="normal_plus"
// Should genes be overlapped (intersected) prior to merging the h5ads
// Otherwise, outer join is used, with non-overlaps becoming NA's
overlap_genes=true
}
// -------------------------------------------------------------------------
// Options for cNMF
cnmf {
// Label for cnmf processes
label="normal_plus"
// Label for cnmf pre-process process (label for processes that require more memory)
label_high="normal_plus"
// Run preprocessing or not (makes a h5ad with variable genes, optionally harmony/scvi corrected)
preprocess=true
// Seed for reproducibility, set to null for random
seed=42
// If the input objects contain CITEseq, this should be set to the column in var that identifies them.
// Only works when harmony is used, scVI doesn't currently consider CITEseq
feature_type_col=null
// Save the cnmf as an H5ad per k value
// X = usages
// var = spectra score
// obs = obs from merged h5ad
save_h5ad=true
// Number of iterations {100}
n_iter=100
// Number of workers to run in parallel for factorize process (a.k.a jobs)
// Increase this if your jobs are slow and running out of time
// Set either to:
// - Fraction of total jobs, e.g. 0.5 = half, 0.25 = quarter etc. >0 <=1
// - Absolute number of workers < total jobs, e.g. 10, 20 etc. > 2
// - If <= 0, 1 or null, will use total number of jobs minus 1
// In total cnmf will run k * n_iter jobs, so that forms the max
// Number of running parallel jobs is limited by nextflow run configuration, this controls the total jobs only!
// By default nextflow will run only 40 concurrent jobs. Set executor.queueSize to change this
n_workers=0.5
// Number of K to try, CANNOT contain 1
// {comma separated list of k values}
k="2,4,6,12,16,18,20,22,24,36,48,60"
// Loss function for the optimization
// {frobenius,kullback-leibler,itakura-saito}
beta_loss="frobenius"
// Initialize, how to initialize the initial state of the decomposition
// {random,nndsvd}
initialize="random"
// Local density for consensus process {0.01}
// This is used for cutting the heatmap into clusters
// Sometimes this might need to be increased from default {0.01} to avoid crashes
// It controls the distance to determine outliers to remove, basically how "clean" the cnmf will be
local_density=0.1
//----------------------------------------------------------
// cnmf postprocessing options
// Ignore these K values in post-processing of cNMF factors. This is useful to reduce jobs you are not going to use anyway.
// {comma separated list of k values}
k_ignore=null
// Should the enrichment process be run after cnmf
run_enrichment=true
// Specific flags for enrichment processes
// [Ignored in enrich workflow]
run_gsea=true
run_ora=true
run_decoupler=true
run_magma=true
// Should the k-selection tree be plotted
ktree_plot=true
// Thresholding for the k-selection tree edges
ktree_threshold="auto"
// Mode for calculating the ktree edge weights
// cor: Individual correlation of k-k+1 pairs
// lm: One multiple regression model per k-k+1
// nnls: NOT IMPLEMENTED (for use with spectra, not spectra scores)
ktree_mode="cor"
// Calculate the per-gep variance explained by leaving out one gep, and calculating the loss in reconstruction error.
// This is a sanity check to see if the geps are actually contributing to the reconstruction of the data,
// and can be used to flag geps that don't contribute anything.
qc_skip_calc_varexp=false
// Generate a summary table for values where enrichment is run
run_summary=true
summarize {
// TSV file with two columns: <gene name> <group>
// Group can be set to gene name, or to a group string over which to average genes
// This is used in generating the cNMF summary table for the usages
// By default assets/markers/CD4_markers are used, set to null to ignore
marker_file="DEFAULT"
// TSV file with one column with gene names
// This is used to highlight genes in the spectra scores output
// By default assets/markers/cytokines and assets/markers/lambert_2018_tfs are used, set to null to ignore
tf_file="DEFAULT"
cyto_file="DEFAULT"
// Pvalue threshold to consider reporting in the table. Would set to something relatively strict
// Set to 1 to report anything FDR<0.05
threshold=5e-5
// Which databases to include. Comma-separated list of databases (null = all)
databases=null
// Which tests to include. Comma-separated list of tests (null = all)
tests=null
// How many of the top results (enrichments, genes etc) to include in the summary
topn=10
// Should spectra scores be scaled
scale_spectra=true
}
// --------------------------------------------------------------------------
// ADVANCED PARAMETERS, ONLY USE IF YOU KNOW WHAT THIS MEANS
// --------------------------------------------------------------------------
// Parameters to manually re-use previous pre-process output.
// Must be specified together, if provided, preprocess and convert steps are skipped
// h5ad = *.Corrected.HVG.Varnorm.h5ad
// tpm = "*.TP10K.h5ad"
// hvg = "*.Corrected.HVGs.txt"
input {
h5ad=null
tpm=null
hvg=null
}
// --------------------------------------------------------------------------
// END OF ADVANCED PARAMETERS
// --------------------------------------------------------------------------
}
// --------------------------------------------------------------------------
// Settings for enrichment processes
enrich {
label="small"
// Input matrix to run enrichment on, genes x conditions (if conditions x genes set transpose=true)
input_matrix=null
// Define the input namespace for the gene ids in the file. If it doesn't match params.convert.output_namespace
// it will be converted
input_namespace="gene_name"
// Pathway information as gmt files, see assets folder
// ID type should match the id type of your (converted) h5ad files or input matrix
// Comma separated list of input files Or 'DEFAULT' to use all gmt files from assets/gene_sets/{symbols|ensembl}
// Also used for magma --set-annot
// To use specific files from the assets folder set ${projectDir}/gene_sets/<symbols|ensembl>/<file>
// Set to null to skip enrichment
gmt_files='DEFAULT'
// Maximum size of the pathway to consider for ORA and GSEA
max_pathway_size = 2000
// Should the input matrix be transposed, columns should be conditions, rows should be genes
// transpose: false [ genes x conditions ]
// transpose: true [ conditions x genes ]
// [Ignored in cnmf workflow]
transpose=false
// A txt file with gene ids specifying the universe. If null automatically determined
// based on all the genes in the input matrix. Id's must match convert.output_namespace
universe=null
// A tsv file with a mandatory column called 'condition' that has the colnames (or rownames if transpose=true)
// and additional annotations to add to the output matrix. Each row must correspond to a column of the input (or row if transpose=true)
annotate=null
// [Ignored in cnmf workflow]
run_gsea = true
run_ora = true
run_decoupler = true
run_magma = true
//---------------------------
// Settings for ORA
// Threshold value to binarize matrix.
// If matrix is binary, use threshold=0, threshold_invert=false
// If matrix is pvalues/fdr, use threshold=0, threshold_invert=true
// [Ignored in cnmf workflow, is always 0]
threshold = 0
// Instead of using > threshold, invert the logic to < threshold
threshold_invert = false
// Top n genes to use for enrichment, can be a list
// Set to null to skip
use_top = [50,100,250,500]
// Use the absolute of the score prior to threshold or use_top
// [Ignored in cnmf workflow]
absolute = true
// This works differently for the python version, so doesn't seem to be needed anymore
// Cache folder for decoupler databases, defaults to its default somewhere in the home folder
//omnipath_cache_dir=null
}
// --------------------------------------------------------------------------
// Params for MAGMA process
magma {
label="small"
// Do not include genes in the extended HLA region (defined chr6:25726063-33400644)
// https://www.nature.com/articles/nrg1489
// HLA genes are removed from the initial magma gene.loc file, if you want more control
// use the universe option to remove the genes you want and set this to false
remove_hla_genes=true
// A manifest with summary statistics
// <name> <N> <snp_id_col> <pval_col> <path>
manifest_sumstats=null
// Use previous magma results instead of computing them
// <trait name> </path/to/<trait>.genes.raw>
manifest_magma=null
// Plink bed/bim/fam prefix for LD reference panel to be used
// Recommend 1000G panel of matching population
// Specify the prefix WITHOUT .bed/.bim/.fam
ld_reference=null
// Up and downstream window (kb) for variants to include, used in magma annotate
annotate_window="50,10"
// Number of batches to split magma jobs into, should not go over 25 (see magma docs)
// We also don't want too many, as this makes many < 5 minute jobs which will be slower in practice
n_batch = 5
}
}