-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgem_workflow.wdl
More file actions
executable file
·228 lines (203 loc) · 11.7 KB
/
gem_workflow.wdl
File metadata and controls
executable file
·228 lines (203 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
version 1.0
workflow run_GEM {
    # Runs the run_tests task once per genotype file (scattered over file index),
    # then concatenates the per-file result tables with cat_results.

    input {
        # BGEN genotype files and optional per-file sample files.
        Array[File]? bgenfiles
        Array[File]? samplefiles
        # PLINK2 genotype, variant and sample files.
        Array[File]? pgenfiles
        Array[File]? pvarfiles
        Array[File]? psamfiles
        # PLINK1 genotype, variant and sample files.
        Array[File]? bedfiles
        Array[File]? bimfiles
        Array[File]? famfiles
        # Single-File inputs are for convenience, as often multiple chrs will share the same sample file.
        File? samplefile
        File? psamfile
        File? famfile
        String? unused # WDL 1.0 has no "None" value yet. Workaround: using a never-defined optional value.
    }

    # Number of scatter shards: length of the first defined genotype array, checked in the
    # order bgenfiles, pgenfiles, bedfiles. If none of the three is supplied, the final
    # select_first([bedfiles]) has no defined value to return and evaluation fails here.
    Int n_files = if (defined(bgenfiles)) then length(select_first([bgenfiles])) else if (defined(pgenfiles)) then length(select_first([pgenfiles])) else length(select_first([bedfiles]))

    scatter (i in range(n_files)) {
        call run_tests {
            input:
                # Each input takes element i of its array when the array is defined and long
                # enough. Otherwise it falls back to the never-defined `unused` placeholder
                # (genotype/variant files) or to the shared single-file input (sample files).
                bgenfile = if defined(bgenfiles) && i < length(select_first([bgenfiles])) && defined(select_first([bgenfiles])[i]) then select_first([bgenfiles])[i] else unused,
                samplefile = if defined(samplefiles) && i < length(select_first([samplefiles])) && defined(select_first([samplefiles])[i]) then select_first([samplefiles])[i] else samplefile,
                pgenfile = if defined(pgenfiles) && i < length(select_first([pgenfiles])) && defined(select_first([pgenfiles])[i]) then select_first([pgenfiles])[i] else unused,
                pvarfile = if defined(pvarfiles) && i < length(select_first([pvarfiles])) && defined(select_first([pvarfiles])[i]) then select_first([pvarfiles])[i] else unused,
                psamfile = if defined(psamfiles) && i < length(select_first([psamfiles])) && defined(select_first([psamfiles])[i]) then select_first([psamfiles])[i] else psamfile,
                bedfile = if defined(bedfiles) && i < length(select_first([bedfiles])) && defined(select_first([bedfiles])[i]) then select_first([bedfiles])[i] else unused,
                bimfile = if defined(bimfiles) && i < length(select_first([bimfiles])) && defined(select_first([bimfiles])[i]) then select_first([bimfiles])[i] else unused,
                famfile = if defined(famfiles) && i < length(select_first([famfiles])) && defined(select_first([famfiles])[i]) then select_first([famfiles])[i] else famfile
        }
    }

    # Gather the scattered task outputs (one element per shard).
    Array[File] results_array = run_tests.out
    Array[File]? sru = run_tests.system_resource_usage
    Array[File]? pru = run_tests.process_resource_usage

    call cat_results { input: results_array = results_array }

    output {
        File gem_results = cat_results.all_results
        Array[File] gem_log = run_tests.log
        Array[File]? system_resource_usage = sru
        Array[File]? process_resource_usage = pru
    }

    # NOTE(review): many keys below describe inputs of the run_tests task (e.g. maf,
    # pheno_file) rather than inputs declared in this workflow's input section.
    parameter_meta {
        bgenfiles: "Array of genotype filepaths in .bgen format. Optional, but at least one genotype dataset must be specified as an input."
        samplefiles: "Optional .sample files accompanying the .bgen files. Required for proper function if .bgen does not store sample identifiers."
        samplefile: "Convenient alternative to the samplefiles input, if all bgenfiles share the same sample information."
        pgenfiles: "Array of genotype filepaths in .pgen (PLINK2) format. Optional, but at least one genotype dataset must be specified as an input."
        pvarfiles: "Array of variant descriptor filepaths in .pvar (PLINK2) format. Optional, but must be included if using the pgenfiles input."
        psamfiles: "Sample descriptor files in .psam (PLINK2) format. Optional, but must be included if using the pgenfiles input."
        psamfile: "Convenient alternative to the psamfiles input, if all pgenfiles share the same sample information."
        bedfiles: "Array of genotype filepaths in .bed (PLINK1) format. Optional, but at least one genotype dataset must be specified as an input."
        bimfiles: "Array of variant descriptor filepaths in .bim (PLINK1) format. Optional, but must be included if using the bedfiles input."
        famfiles: "Sample descriptor files in .fam (PLINK1) format. Optional, but must be included if using the bedfiles input."
        famfile: "Convenient alternative to the famfiles input, if all bedfiles share the same sample information."
        maf: "Minor allele frequency threshold for pre-filtering variants as a fraction (default is 0.001)."
        miss_geno_cutoff: "Maximum threshold value [0, 1.0] to filter variants based on the missing genotype rate. Default is 0.05."
        pheno_file: "Phenotype filepath."
        sampleid_name: "Column header name of sample ID in phenotype file."
        pheno_name: "Column header name of phenotype data in phenotype file."
        exposure_names: "Column header name(s) of the exposures for genotype interaction testing (space-delimited)."
        int_covar_names: "Column header name(s) of any covariates for which genotype interactions should be included for adjustment in regression (space-delimited). These terms will not be included in any multi-exposure interaction tests. This set should not overlap with exposures or covar_names."
        covar_names: "Column header name(s) of any covariates in the phenotype file for which only main effects should be included (space-delimited). This set should not overlap with exposures or int_covar_names."
        categorical_names: "Names of the exposure or interaction covariate that should be treated as categorical."
        cat_threshold: "A cut-off to determine which exposure or interaction covariate not specified using --categorical-names should be automatically treated as categorical based on the number of levels (unique observations)."
        include_snp_file: "Optional path to file containing a subset of variants in the specified genotype file to be used for analysis. The first line in this file is the header that specifies which variant identifier in the genotype file is used for ID matching. This must be 'snpid' (PLINK or BGEN) or 'rsid' (BGEN only). There should be one variant identifier per line after the header."
        center: "Should exposures and interaction covariates be centered prior to analysis? 0: no, 1: yes, 2: interaction covariates only. Default is 2."
        delimiter: "Delimiter used in the phenotype file."
        kin_delim: "Delimiter used in the kinship file."
        kin_file: "Path to the kinship file."
        kin_diag: "Diagonal value of kinship matrix not accounting for inbreeding."
        missing: "Missing value key of phenotype file. Default is 'NA'."
        robust: "Boolean: should robust (a.k.a. sandwich/Huber-White) standard errors be used?"
        scale: "Boolean: should ALL exposures and covariates be scaled by the standard deviation?"
        output_style: "Optional string specifying the output columns to include: minimum (marginal and GxE estimates), meta (minimum plus main G and GxCovariate terms), or full (meta plus additional fields necessary for re-analysis based on summary statistics alone). Default is 'minimum'."
        stream_snps: "SNP numbers for each GWAS analysis."
        tol: "Convergence tolerance for logistic regression."
        memory_gb: "Requested memory (in GB)."
        n_cpu: "Minimum number of requested cores."
        disk_gb: "Requested disk space (in GB)."
        preemptible: "Optional number of attempts using a preemptible machine from Google Cloud prior to falling back to a standard machine (default = 0, i.e., don't use preemptible)."
        threads: "Number of threads GEM should use for parallelization over variants."
        monitoring_freq: "Delay between each output for process monitoring (in seconds). Default is 1 second."
        random_slope: "Column name in the phenotype file that contains random slope."
    }

    meta {
        author: "Kenny Westerman"
        email: "kewesterman@mgh.harvard.edu"
        description: "Run gene-environment interaction tests using GEM and return a file of summary statistics."
    }
}
task run_tests {
    # Runs the /GEM binary on one genotype file set and phenotype file, writing the
    # results to `gem_res` and a copy of GEM's stdout to `gem_log.log`. Two background
    # monitors write resource-usage logs alongside.

    input {
        # Scattered inputs
        File? bgenfile
        File? pgenfile
        File? pvarfile
        File? bedfile
        File? bimfile
        File? psamfile
        File? samplefile
        File? famfile
        # Additional required inputs
        File pheno_file
        String pheno_name
        String sampleid_name
        # Options
        File? kin_file
        File? include_snp_file
        String? exposure_names
        String? covar_names
        String? int_covar_names
        String? categorical_names
        String? random_slope
        String kin_delim = ","
        String delimiter = ","
        String missing = "NA"
        String output_style = "minimum"
        Boolean scale = false
        Boolean robust = false
        Float kin_diag = 1.0
        Float miss_geno_cutoff = 0.05
        Float maf = 0.001
        Float tol = 0.0000001
        Int center = 2
        Int cat_threshold = 2
        # Compute resources
        Int threads = 3
        Int n_cpu = 4
        Int disk_gb = 50
        Int memory_gb = 10
        Int preemptible = 0
        Int maxRetries = 0
        Int stream_snps = 1
        Int monitoring_freq = 1
    }

    # Booleans are rendered as "1"/"0" for the --robust and --scale options.
    String robust01 = if robust then "1" else "0"
    String scale01 = if scale then "1" else "0"

    command <<<
        # Without pipefail the exit status of `GEM | tee` is tee's, so a GEM crash
        # would be reported as success as long as the output files exist.
        set -o pipefail

        # Background resource monitors, sampled every monitoring_freq seconds.
        dstat -c -d -m --nocolor ~{monitoring_freq} > system_resource_usage.log &
        atop -x -P PRM ~{monitoring_freq} | grep '(GEM)' > process_resource_usage.log &

        # Optional arguments use the ~{"flag " + value} form, which expands to an
        # empty string when the value is undefined.
        /GEM \
            ~{"--bgen " + bgenfile} \
            ~{"--sample " + samplefile} \
            ~{"--pgen " + pgenfile} \
            ~{"--pvar " + pvarfile} \
            ~{"--psam " + psamfile} \
            ~{"--bed " + bedfile} \
            ~{"--bim " + bimfile} \
            ~{"--fam " + famfile} \
            ~{"--kin-file " + kin_file} \
            ~{"--include-snp-file " + include_snp_file} \
            ~{"--exposure-names " + exposure_names} \
            ~{"--covar-names " + covar_names} \
            ~{"--int-covar-names " + int_covar_names} \
            ~{"--categorical-names " + categorical_names} \
            ~{"--random-slope " + random_slope} \
            --kin-delim ~{kin_delim} \
            --kin-diag ~{kin_diag} \
            --maf ~{maf} \
            --miss-geno-cutoff ~{miss_geno_cutoff} \
            --pheno-file ~{pheno_file} \
            --sampleid-name ~{sampleid_name} \
            --pheno-name ~{pheno_name} \
            --cat-threshold ~{cat_threshold} \
            --center ~{center} \
            --scale ~{scale01} \
            --delim ~{delimiter} \
            --missing-value ~{missing} \
            --robust ~{robust01} \
            --output-style ~{output_style} \
            --tol ~{tol} \
            --threads ~{threads} \
            --stream-snps ~{stream_snps} \
            --out gem_res | tee gem_log.log
    >>>

    runtime {
        # Image is pinned by digest.
        docker: "quay.io/large-scale-gxe-methods/gem-workflow@sha256:ab7c82aaa2e77e265b597a4a204bc30c83a94a4cba3d4d10935dfa0d74ecfec3"
        memory: "~{memory_gb} GB"
        cpu: "~{n_cpu}"
        disks: "local-disk ~{disk_gb} HDD"
        preemptible: "~{preemptible}"
        maxRetries: "~{maxRetries}"
        gpu: false
        # NOTE(review): presumably a DNAnexus-specific timeout hint - confirm.
        dx_timeout: "7D0H00M"
    }

    output {
        File out = "gem_res"
        File log = "gem_log.log"
        File system_resource_usage = "system_resource_usage.log"
        File process_resource_usage = "process_resource_usage.log"
    }
}
task cat_results {
    # Concatenates result tables into all_results.txt: the first line of the first
    # file (used as the header) followed by every file's contents from line 2 onward.

    input {
        Array[File] results_array
    }

    command <<<
        # Paths are quoted so that whitespace or glob characters in a localized
        # path cannot split or expand a filename.
        head -n 1 '~{results_array[0]}' > all_results.txt && \
        for res in '~{sep="' '" results_array}'; do
            tail -n +2 "$res" >> all_results.txt
        done
    >>>

    runtime {
        docker: "quay.io/large-scale-gxe-methods/ubuntu:focal-20210325"
        disks: "local-disk 10 HDD"
    }

    output {
        File all_results = "all_results.txt"
    }
}