-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathnextflow.config
More file actions
executable file
·420 lines (330 loc) · 16.3 KB
/
nextflow.config
File metadata and controls
executable file
·420 lines (330 loc) · 16.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
// ---------------------------------------------------------------------------------- //
// Nextflow / executor settings
// ---------------------------------------------------------------------------------- //
conda.enabled = true
singularity.enabled = false
manifest.mainScript = "main.nf"
// Execution profiles, selected at runtime with: nextflow run ... -profile <name>
profiles {
local { includeConfig 'conf/local.config'}
lsf { includeConfig 'conf/lsf.config'}
lsf_ignore_errors { includeConfig 'conf/lsf_ignore_errors.config'}
}
// Allow the execution report and trace files to be overwritten on re-runs
report.overwrite=true
trace.overwrite=true
// ---------------------------------------------------------------------------------- //
// Process labels
// ---------------------------------------------------------------------------------- //
includeConfig 'conf/processes.config'
// ---------------------------------------------------------------------------------- //
// Plugins & Schema
// ---------------------------------------------------------------------------------- //
plugins {
id 'nf-schema@2.4.1'
}
// nf-schema parameter validation settings
validation {
help {
enabled = true
}
// Abort the run when parameters not declared in the schema are supplied
failUnrecognisedParams=true
}
// ---------------------------------------------------------------------------------- //
// Parameters
// ---------------------------------------------------------------------------------- //
params {
// -------------------------------------------------------------------------
// Global run options
// Path to the manifest
rn_manifest=null
// The publish folder for the results
rn_publish_dir=null
// Use node scratch storage instead of workdir (generally ok, set to false when debugging)
rn_scratch=false
// Run name, defaults to a random 5-character lowercase hash
rn_runname=(UUID.randomUUID().toString().replaceAll('-', '')[0..4]).toLowerCase()
// Used to fetch ensembl ids to gene names and for generating magma geneloc file
rn_ensembl_version="115"
// Container and conda settings
// container is currently not in use
rn_container = ""
rn_conda = "/software/conda/users/ob7/sc-blipper"
// -------------------------------------------------------------------------
// Parameters for the conversion processes
convert {
label="medium"
// Output namespace, one of 'gene_name' or 'ensembl_id'
output_namespace='gene_name'
// Ensembl file to use for gene name conversion and biotype filtering, if null this is automatically downloaded from ensembl biomart
// If this is set, id_linker must also be set
// Should be a .tsv with columns
// ensembl_gene_id hgnc_symbol gene_biotype chromosome_name start_position end_position strand external_gene_name source final_gene_name
ensembl_file=null
// TSV file for gene ID conversion, if null this is automatically downloaded from ensembl biomart
// Should be a 2 column tsv file with old new
// 'new' should be ensembl ids if you are doing gene set enrichments downstream
// if null, need an internet connection
// For 10x could use features.tsv.gz, to convert: zcat features.tsv.gz | awk '{print $2"\t"$1}' > id_linker.tsv
id_linker=null
// File with gene names, one per row, need to be gene names in the original id type
// I.e. if gene names are converted, will convert these as well, prior to subsetting
// This overrides convert.biotype_filter if set
subset_genes=null
// An egrep compatible pattern to select biotypes to include from ensembl
// e.g. "protein_coding|lncRNA"
// For cNMF should generally be "protein_coding" or left null to include all
// This is overridden by convert.subset_genes if that is set
// To list biotypes run this on the ensembl file in the results folder
// cat v114_ensembl.tsv | awk -F '\t' '{print $3}' | sort | uniq -c
biotype_filter=null
}
// --------------------------------------------------------------------------
// Parameters for batch correction
preprocess {
// Batch correction
// Options: null, 'harmony', 'scvi'
batch_correction=null
// Number of variable genes to run scVI / harmony / cNMF with
n_variable=2000
harmony {
// Correct counts for these variables using the cnmf version of harmony.
// To treat each input file as a batch to correct, use variable 'orig_h5ad'
// Leave at null to not apply the correction
// Comma separated string e.g. "Donor,Batch,orig_h5ad"
harmony_vars=null
}
scvi {
label="gpu_medium"
container=""
conda="/software/conda/users/ob7/sc-blipper-scvi"
// Seed for reproducibility, set to null for random
seed=314
// Batch variable. Must be set if preprocess.batch_correction is 'scvi'
// If you want to use the file of origin h5ads as batches, set to 'orig_h5ad'
// If you have multiple variables determining a batch supply the column names
// as a string separated by space: 'orig_h5ad donor'
// Corrected counts are normalized to the first batch observed.
batch_key=null
// Categorical and continuous covariates to include in the scVI model, comma separated string of column names in adata.obs
// Can be left null to not include any. Note these are things removed in the latent space, but not in the corrected data.
cat_covariates=null
cont_covariates=null
// Number of latent dimensions for scVI
n_latent=10
// Max number of training epochs for scVI
epochs=400
// Denoise tp10k
// This will generate a denoised version of the data, which is then scaled to a fixed library size of 1e4, which is used for inferring the cNMF usages.
// cNMF by default infers usages on the raw tp10k data, and only the spectra are fit on the batch corrected data.
// This option saves the scVI denoised tp10k instead, but results will only be available for the HVGs fit in the scVI model, and not the full gene set.
denoise_tp10k=false
// Pre-trained scVI model path, if provided skips training. Must have been trained on the same h5ad
// Can be useful to re-use models across multiple runs
model_path=null
// Skip saving the unscaled scVI h5ad file (this is the file you want to use for anything other than cnmf)
// It's not needed for cNMF, but if you want to save space, set to true.
skip_scvi_h5ad=false
// Skip early stopping during training
skip_early_stopping=false
// Skip generating UMAPs and saving plots
skip_plot=false
}
}
// -------------------------------------------------------------------------
// Parameters for the merge h5ad processes
merge {
label="normal_plus"
// Should genes be overlapped (intersected) prior to merging the h5ads
// Otherwise, outer join is used, with non-overlaps becoming NA's
overlap_genes=true
}
// -------------------------------------------------------------------------
// Options for cNMF
cnmf {
// Label for cnmf processes
label="normal_plus"
// Label for cnmf pre-process process (label for processes that require more memory)
label_high="normal_plus"
// Run preprocessing or not (makes a h5ad with variable genes, optionally harmony/scvi corrected)
preprocess=true
// Seed for reproducibility, set to null for random
seed=42
// If the input objects contain CITEseq, this should be set to the column in var that identifies them.
// Only works when harmony is used, scVI doesn't currently consider CITEseq
feature_type_col=null
// Save the cnmf as an H5ad per k value
// X = usages
// var = spectra score
// obs = obs from merged h5ad
save_h5ad=true
// Number of iterations {100}
n_iter=100
// Number of workers to run in parallel for factorize process (a.k.a jobs)
// Increase this if your jobs are slow and running out of time
// Set either to:
// - Fraction of total jobs, e.g. 0.5 = half, 0.25 = quarter etc. >0 <=1
// - Absolute number of workers < total jobs, e.g. 10, 20 etc. > 2
// - If <= 0, 1 or null, will use total number of jobs minus 1
// In total cnmf will run k * n_iter jobs, so that forms the max
// Number of running parallel jobs is limited by nextflow run configuration, this controls the total jobs only!
// By default nextflow will run only 40 concurrent jobs. Set executor.queueSize to change this
n_workers=0.5
// Number of K to try, CANNOT contain 1
// {comma separated list of k values}
k="2,4,6,12,16,18,20,22,24,36,48,60"
// Loss function for the optimization
// {frobenius,kullback-leibler,itakura-saito}
beta_loss="frobenius"
// Initialize, how to initialize the initial state of the decomposition
// {random,nndsvd}
initialize="random"
// Local density for consensus process {0.01}
// This is used for cutting the heatmap into clusters
// Sometimes this might need to be increased from default {0.01} to avoid crashes
// It controls the distance to determine outliers to remove, basically how "clean" the cnmf will be
local_density=0.1
//----------------------------------------------------------
// cnmf postprocessing options
// Ignore these K values in post-processing of cNMF factors. This is useful to reduce jobs you are not going to use anyway.
// {comma separated list of k values}
k_ignore=null
// Should the enrichment process be run after cnmf
run_enrichment=true
// Specific flags for enrichment processes
// [Ignored in enrich workflow]
run_gsea=true
run_ora=true
run_decoupler=true
run_magma=true
// Should the k-selection tree be plotted
ktree_plot=true
// Thresholding for the k-selection tree edges
ktree_threshold="auto"
// Mode for calculating the ktree edge weights
// cor: Individual correlation of k-k+1 pairs
// lm: One multiple regression model per k-k+1
// nnls: NOT IMPLEMENTED (for use with spectra, not spectra scores)
ktree_mode="cor"
// Calculate the per-gep variance explained by leaving out one gep, and calculating the loss in reconstruction error.
// This is a sanity check to see if the geps are actually contributing to the reconstruction of the data,
// and can be used to flag geps that don't contribute anything.
qc_skip_calc_varexp=false
// Generate a summary table for values where enrichment is run
run_summary=true
summarize {
// TSV file with two columns: <gene name> <group>
// Group can be set to gene name, or to a group string over which to average genes
// This is used in generating the cNMF summary table for the usages
// By default assets/markers/CD4_markers are used, set to null to ignore
marker_file="DEFAULT"
// TSV file with one column with gene names
// This is used to highlight genes in the spectra scores output
// By default assets/markers/cytokines and assets/markers/lambert_2018_tfs are used, set to null to ignore
tf_file="DEFAULT"
cyto_file="DEFAULT"
// Pvalue threshold to consider reporting in the table. Would set to something relatively strict
// Set to 1 to report anything FDR<0.05
threshold=5e-5
// Which databases to include. Comma-separated list of databases (null = all)
databases=null
// Which tests to include. Comma-separated list of tests (null = all)
tests=null
// How many of the top results (enrichments, genes etc) to include in the summary
topn=10
// Should spectra scores be scaled
scale_spectra=true
}
// --------------------------------------------------------------------------
// ADVANCED PARAMETERS, ONLY USE IF YOU KNOW WHAT THIS MEANS
// --------------------------------------------------------------------------
// Parameters to manually re-use previous pre-process output.
// Must be specified together, if provided, preprocess and convert steps are skipped
// h5ad = *.Corrected.HVG.Varnorm.h5ad
// tpm = "*.TP10K.h5ad"
// hvg = "*.Corrected.HVGs.txt"
input {
h5ad=null
tpm=null
hvg=null
}
// --------------------------------------------------------------------------
// END OF ADVANCED PARAMETERS
// --------------------------------------------------------------------------
}
// --------------------------------------------------------------------------
// Settings for enrichment processes
enrich {
label="small"
// Input matrix to run enrichment on, genes x conditions (if conditions x genes set transpose=true)
input_matrix=null
// Define the input namespace for the gene ids in the file. If it doesn't match params.convert.output_namespace
// it will be converted
input_namespace="gene_name"
// Pathway information as gmt files, see assets folder
// ID type should match the id type of your (converted) h5ad files or input matrix
// Comma separated list of input files Or 'DEFAULT' to use all gmt files from assets/gene_sets/{symbols|ensembl}
// Also used for magma --set-annot
// To use specific files from the assets folder set ${projectDir}/gene_sets/<symbols|ensembl>/<file>
// Set to null to skip enrichment
gmt_files='DEFAULT'
// Maximum size of the pathway to consider for ORA and GSEA
max_pathway_size = 2000
// Should the input matrix be transposed, columns should be conditions, rows should be genes
// transpose: false [ genes x conditions ]
// transpose: true [ conditions x genes ]
// [Ignored in cnmf workflow]
transpose=false
// A txt file with gene ids specifying the universe. If null automatically determined
// based on all the genes in the input matrix. Id's must match convert.output_namespace
universe=null
// A tsv file with a mandatory column called 'condition' that has the colnames (or rownames if transpose=true)
// and additional annotations to add to the output matrix. Each row must correspond to a column of the input (or row if transpose=true)
annotate=null
// [Ignored in cnmf workflow]
run_gsea = true
run_ora = true
run_decoupler = true
run_magma = true
//---------------------------
// Settings for ORA
// Threshold value to binarize matrix.
// If matrix is binary, use threshold=0, threshold_invert=false
// If matrix is pvalues/fdr, use threshold=0, threshold_invert=true
// [Ignored in cnmf workflow, is always 0]
threshold = 0
// Instead of using > threshold, invert the logic to < threshold
threshold_invert = false
// Top n genes to use for enrichment, can be a list
// Set to null to skip
use_top = [50,100,250,500]
// Use the absolute of the score prior to threshold or use_top
// [Ignored in cnmf workflow]
absolute = true
// This works differently for the python version, so doesn't seem to be needed anymore
// Cache folder for decoupler databases, defaults to its default somewhere in the home folder
//omnipath_cache_dir=null
}
// --------------------------------------------------------------------------
// Params for MAGMA process
magma {
label="small"
// Do not include genes in the extended HLA region (defined chr6:25726063-33400644)
// https://www.nature.com/articles/nrg1489
// HLA genes are removed from the initial magma gene.loc file, if you want more control
// use the universe option to remove the genes you want and set this to false
remove_hla_genes=true
// A manifest with summary statistics
// <name> <N> <snp_id_col> <pval_col> <path>
manifest_sumstats=null
// Use previous magma results instead of computing them
// <trait name> </path/to/<trait>.genes.raw>
manifest_magma=null
// Plink bed/bim/fam prefix for LD reference panel to be used
// Recommend 1000G panel of matching population
// Specify the prefix WITHOUT .bed/.bim/.fam
ld_reference=null
// Up and downstream window (kb) for variants to include, used in magma annotate
annotate_window="50,10"
// Number of batches to split magma jobs into, should not go over 25 (see magma docs)
// We also don't want too many, as this makes many < 5 minute jobs which will be slower in practice
n_batch = 5
}
}