long-gwas-pipeline/conf/params.config at main · GP2code/long-gwas-pipeline · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/*
 * Default parameter values
 * Can be overridden by command line (--param value) or params file (-params-file file.yml)
 */

params {
    //# Input/Output paths

    // Path to the input genotype files (required). The cache-key logic after
    // this block treats a path ending in .bed or .pgen as that format and
    // anything else as VCF.
    input = null
    // Path to covariates file (required)
    covarfile = null
    // Path to phenotype file (required)
    phenofile = null

    //# Variable names in covariate/phenotype files

    // Phenotype column name(s); separate multiple names with spaces
    pheno_name = 'y'
    // Numeric covariate names, space-separated. PCs calculated later will be added.
    covar_numeric = 'SEX age_at_baseline PC1 PC2'
    // Categorical covariate names, space-separated (mutually exclusive with covar_numeric).
    // NOTE(review): presumably this means a name must not appear in both lists - confirm.
    covar_categorical = ''
    // Numeric covariate for SNP interaction testing
    // (survival analysis only, must be in covar_numeric list)
    covar_interact = ''
    // Required column defining study groups that are analyzed separately.
    // If there is only one arm (e.g. case-only study), set all samples to the same value.
    study_arm_col = 'study_arm'
    // Time column for longitudinal/survival analysis.
    // For survival analysis: if tstart and tend columns exist in the phenotype file,
    // they will be used directly. Otherwise, tstart will be set to 0 and tend will be
    // set to the value of time_col.
    time_col = 'study_days'

    //# Analysis model selection

    // Longitudinal analysis (GALLOP)
    longitudinal_flag = false
    // Survival analysis (Cox PH)
    survival_flag = false
    // Cross-sectional analysis (GLM)
    linear_flag = true

    //# Processing parameters

    // Enable chunking of variants
    chunk_flag = true
    // Number of variants per chunk for GWAS analysis.
    // Note: SPLIT_VCF process uses 3x this value (90000 with the default below) for
    // initial splitting to optimize parallelization, then chunks are processed at
    // this size for GWAS.
    chunk_size = 30000

    //# Genetic QC parameters

    // R2 threshold for imputation quality (-9 = disabled)
    r2thres = -9
    // Minor allele frequency threshold
    minor_allele_freq = '0.05'
    // Minor allele count threshold
    minor_allele_ct = '20'
    // Kinship threshold for relatedness filtering
    kinship = '0.177'
    // Target ancestry (EUR, AFR, EAS, SAS, AMR)
    ancestry = 'EUR'
    // Genome assembly (hg19 or hg38)
    assembly = 'hg19'

    //# Workflow mode

    // Skip population splitting: use when data is already ancestry-specific.
    // Enables efficient LD pruning per chromosome before merge.
    skip_pop_split = false

    //# Analysis name identifier

    // Identifier for this analysis run
    analysis_name = 'TEST'
    // Optional identifier for the genetic data cache. When set, it is used as
    // genetic_cache_key as-is; when null, the key is derived after this block.
    // Use the same ID across analyses to reuse genetic QC outputs.
    genetic_data_id = null

    //# Output options

    // Generate Manhattan plots
    mh_plot = true

    //# Storage configuration

    // Storage root - can be local path or cloud bucket.
    // Resolved after this block in this order:
    //   1) value supplied as a parameter, 2) STORE_ROOT environment variable,
    //   3) the launch directory (launchDir)
    STORE_ROOT = null
    // Resolved after this block: parameter value, then PROJECT_NAME environment
    // variable, then 'unnamed_project'
    PROJECT_NAME = null

    // Reference genomes directory.
    // Resolved after this block: parameter value, then REFERENCE_DIR environment
    // variable, then "<STORE_ROOT>/References"
    reference_dir = null

    // Derived values (assigned by the statements after this block)

    // Set to "<STORE_ROOT>/<PROJECT_NAME>"
    project_dir = null
    // Cache key for genetic data processing: genetic_data_id when given, otherwise
    // built from input format, ancestry, assembly, minor_allele_freq, kinship and
    // skip_pop_split
    genetic_cache_key = null
}

// Resolve storage settings. For each one an already-set parameter value wins,
// then the matching environment variable, then the built-in default.
// Order matters: reference_dir and project_dir read the resolved STORE_ROOT.
params.STORE_ROOT = params.STORE_ROOT ?: (System.getenv('STORE_ROOT') ?: "${launchDir}")
params.PROJECT_NAME = params.PROJECT_NAME ?: (System.getenv('PROJECT_NAME') ?: 'unnamed_project')
params.reference_dir = params.reference_dir ?: (System.getenv('REFERENCE_DIR') ?: "${params.STORE_ROOT}/References")
params.project_dir = "${params.STORE_ROOT}/${params.PROJECT_NAME}"

// Build the genetic cache key from the genetic input and QC settings rather than
// from phenotype/analysis-mode settings, so that genetic QC outputs can be
// reused across different analyses.
// The leading component names the input type: bed, pgen, or vcf (the fallback).
// NOTE(review): the generated key does not include r2thres or minor_allele_ct;
// if either affects the cached QC outputs, runs that differ only in those
// settings would share a key - confirm against the QC processes.
def pop_split_tag = params.skip_pop_split ? "_skip" : ""
def input_is_bed = (params.input =~ /\.bed$/).find()
def input_is_pgen = (params.input =~ /\.pgen$/).find()
def input_format = input_is_bed ? "bed" : (input_is_pgen ? "pgen" : "vcf")
def qc_signature = "maf${params.minor_allele_freq}_kin${params.kinship}"
def derived_cache_key = "${input_format}_${params.ancestry}_${params.assembly}_${qc_signature}${pop_split_tag}"
// An explicit genetic_data_id replaces the derived key entirely.
params.genetic_cache_key = params.genetic_data_id ?: derived_cache_key