Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions bash_script/0.munge_sumstats.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=bash_script/slurm_munge_GWAS_%j.out
#SBATCH --error=bash_script/slurm_munge_GWAS_%j.err

source /opt/software/anaconda3/2023.03-py3.10/etc/profile.d/conda.sh
conda activate /projects/cbmr_shared/people/wkq953/non-GDPR/.conda/envs/ldsc

ALLELE_FILE="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc/w_hm3.snplist"
OUTPUT_DIR="/projects/cbmr_shared/data/common_resources/SEGMENT/GWAS/processed/ldsc_munge"
INPUT_DIR="/projects/cbmr_shared/data/common_resources/SEGMENT/GWAS/processed/SSF_format"
DEFAULT_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/ldsc"

mkdir -p "$OUTPUT_DIR"
cd "$DEFAULT_DIR"


for file in "$INPUT_DIR"/*.tsv; do

# Skip finemapped files
if [[ "$file" == *finemapped* ]]; then
continue
fi

base=$(basename "$file" .tsv)
outname="$OUTPUT_DIR/${base}"
sumstats_clean="${OUTPUT_DIR}/${base}.clean.tsv"

# munge_sumstats will filter out all rows with NA or empty values
# Check if it's a binary trait by looking at column 17 (n_cases)
is_binary=$(awk -F'\t' 'NR>1 && $17 != "NA" && $17 != "" && $17+0 > 0 { print 1; exit } END { print 0 }' "$file"| head -n1)

echo "Processing: $file"
echo "Is binary trait: $is_binary"

if [ "$is_binary" -eq 1 ]; then
echo "binary trait"
cut -f3,4,5,7,8,11,14,17,18 "$file" > "$sumstats_clean"
else
cut -f3,4,5,7,8,11,14 "$file" > "$sumstats_clean"
fi

python munge_sumstats.py \
--sumstats "$sumstats_clean" \
--out "$outname" \
--merge-alleles "$ALLELE_FILE" \
--chunksize 500000

rm -f "$sumstats_clean"

done
55 changes: 55 additions & 0 deletions bash_script/1.annotation_peak.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=bash_script/slurm_annotation_%j.out
#SBATCH --error=bash_script/slurm_annotation_%j.err
#SBATCH --array=1-22

# === Load environment ===
source /opt/software/anaconda3/2023.03-py3.10/etc/profile.d/conda.sh
conda activate /projects/cbmr_shared/people/wkq953/non-GDPR/.conda/envs/ldsc

# === Set project specific paths ===
PEAK_DIR="/projects/qtl_families-AUDIT/data/multiome/processed/pipeline_output_freezes/muscle/2.11.0/objects/real/"
ANNOT_DIR="/projects/qtl_families-AUDIT/scratch/ldsc_annotation" # store annotation peak files
mkdir -p "$ANNOT_DIR"

# === Set general paths ===
CHROM=${SLURM_ARRAY_TASK_ID} # Auto-index from --array
PLINK_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc/1000G_EUR_Phase3_plink"
#PLINK_PREFIX="$PLINK_DIR/1000G.EUR.QC.CHR_1_22"
HM3_SNPLIST="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc/w_hm3.snplist"
LDSC_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/ldsc"

cd "$LDSC_DIR" || exit 1

# === Process each BED file ===

#for PEAK_FILE in "$PEAK_DIR"/*.BED; do
for PEAK_FILE in "$PEAK_DIR"/*.narrowPeak; do

BASE=$(basename "$PEAK_FILE" .narrowPeak)
ANNOT_FILE="$ANNOT_DIR/${BASE}.chr${CHROM}.annot.gz"
OUT_PREFIX="$ANNOT_DIR/${BASE}.chr${CHROM}"

echo "[chr$CHROM] Generating annotation for $BASE"
python make_annot.py \
--bed-file "$PEAK_FILE" \
--bimfile "$PLINK_DIR/1000G.EUR.QC.${CHROM}.bim" \
--annot-file "$ANNOT_FILE"

echo "[chr$CHROM] Computing LD score for $BASE"

python ldsc.py \
--l2 \
--ld-wind-cm 1 \
--bfile "$PLINK_DIR/1000G.EUR.QC.${CHROM}" \
--annot "$ANNOT_FILE" \
--thin-annot \
--out "$OUT_PREFIX" \
--print-snps "$HM3_SNPLIST"

done
84 changes: 84 additions & 0 deletions bash_script/2.ldsc_h2_preprocess.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=bash_script/slurm_ldsc_h2_preprocess_input_%j.out
#SBATCH --error=bash_script/slurm_ldsc_h2_preprocess_input_%j.err
#SBATCH --array=1-22


# === Set project specific paths ===
PROJECT_DIR="/projects/qtl_families-AUDIT/scratch/ldsc_annotation"
OUTPUT_DIR="/projects/qtl_families-AUDIT/scratch/ldsc_annotation_combine_celltypes"

# === Set general paths ===
CHROM=${SLURM_ARRAY_TASK_ID} # Auto-index from --array
DEFAULT_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/ldsc"
BASELINE_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc/1000G_Phase3_baselineLD_v2.2_ldscores"

mkdir -p "$OUTPUT_DIR"
cd "$DEFAULT_DIR"

echo "--- Starting LDSC Data Preparation for Chromosome ${CHROM} ---"
echo "Project Directory: ${PROJECT_DIR}"
echo "Output Directory: ${OUTPUT_DIR}"
echo "Baseline Directory: ${BASELINE_DIR}"


# Step 1 extract only Base from Baseline model

echo "Step 1: Extracting Baseline Model BASE Data for Chr ${CHROM}"

zcat "$BASELINE_DIR/baselineLD.${CHROM}.l2.ldscore.gz" | cut -f1-4 > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore"
zcat "$BASELINE_DIR/baselineLD.${CHROM}.annot.gz" | cut -f1-5 > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot"
cut -f1 "$BASELINE_DIR/baselineLD.${CHROM}.l2.M" > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M"
cut -f1 "$BASELINE_DIR/baselineLD.${CHROM}.l2.M_5_50" > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M_5_50"


# Step 2 combine celltype annotations for each chromosome with baseline model

echo "Step 2: Integrating Cell Type Specific Data for Chr ${CHROM}"

for annot_file in "$PROJECT_DIR"/*.chr${CHROM}.annot.gz; do # e.g. ATAC_peaks_per_cluster_narrowPeaks.real.muscle_9_peaks.chr9.annot.gz

cluster_full=$(basename "$annot_file" .chr${CHROM}.annot.gz)
cluster=${cluster_full##*.} # e.g. muscle_9_peaks

ldscore_file="$PROJECT_DIR/${cluster_full}.chr${CHROM}.l2.ldscore.gz"
M_file="$PROJECT_DIR/${cluster_full}.chr${CHROM}.l2.M"
M_5_50_file="$PROJECT_DIR/${cluster_full}.chr${CHROM}.l2.M_5_50"

tmp_col=$(mktemp)

# === LD score ===
{
echo "${cluster}L2"
zcat "$ldscore_file" | tail -n +2 | cut -f4
} > "$tmp_col"
paste "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore" "$tmp_col" > "${tmp_col}_new"
mv "${tmp_col}_new" "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore"

# === Annotation ===
{
echo "$cluster"
zcat "$annot_file" | tail -n +2 | cut -f5
} > "$tmp_col"
paste "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot" "$tmp_col" > "${tmp_col}_new"
mv "${tmp_col}_new" "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot"

# === M ===
paste "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M" "$M_file" > "${tmp_col}_new"
mv "${tmp_col}_new" "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M"

# === M_5_50 ===
paste "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M_5_50" "$M_5_50_file" > "${tmp_col}_new"
mv "${tmp_col}_new" "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.M_5_50"

done

cat "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore" | gzip -c > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore.gz"
cat "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot" | gzip -c > "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot.gz"
rm "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.l2.ldscore"
rm "$OUTPUT_DIR/baseline_w_celltype_LD.${CHROM}.annot"
48 changes: 48 additions & 0 deletions bash_script/3.ldsc_h2_base_w_celltypes.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=slurm_sldsc_h2_celltype_%j.out
#SBATCH --error=slurm_sldsc_h2_celltype_%j.err

source /opt/software/anaconda3/2023.03-py3.10/etc/profile.d/conda.sh
conda activate /projects/cbmr_shared/people/wkq953/non-GDPR/.conda/envs/ldsc

# Only Base in Baseline model + all cell-type annotations, captures relative enrichment, adjusting for collinearity.

# === Set project specific paths ===
PROJECT_DIR="/projects/qtl_families-AUDIT/scratch"
CLUSTER=${SLURM_ARRAY_TASK_ID}
ANNOT_DIR="$PROJECT_DIR/ldsc_annotation_base_w_celltypes/baseline_w_celltype_LD." # ATAC_peaks_per_cluster_narrowPeaks.real.muscle_9_peaks.chr9.l2.M
OUT_DIR="$PROJECT_DIR/sldsc_result/baselineLD_v2.2_h2_base_w_celltypes"
mkdir -p "$OUT_DIR"


# === Set general paths ===
RESOURCE_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc"
GWAS_DIR="/projects/cbmr_shared/data/common_resources/SEGMENT/GWAS/processed/ldsc_munge"
LDSC_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/ldsc"
BASELINE_DIR="$RESOURCE_DIR/1000G_Phase3_baselineLD_v2.2_ldscores/baselineLD."
weights_file="$RESOURCE_DIR/1000G_Phase3_weights_hm3_no_MHC/weights.hm3_noMHC."
frq_file="$RESOURCE_DIR/1000G_Phase3_frq/1000G.EUR.QC."

cd "$LDSC_DIR" || exit 1

# === Loop through all GWAS sumstats ===
for trait in "$GWAS_DIR"/*.sumstats.gz; do

trait_name=$(basename "$trait" .sumstats.gz)
echo "Processed $trait_name"

python ldsc.py \
--h2 "$trait" \
--ref-ld-chr "$ANNOT_DIR" \
--w-ld-chr "$weights_file" \
--frqfile-chr "$frq_file" \
--overlap-annot \
--out "$OUT_DIR/${trait_name}"
# --overlap-annot --print-cov --print-coefficients --print-delete-vals \

done
47 changes: 47 additions & 0 deletions bash_script/3.ldsc_h2_baseline_full_w_celltype.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=slurm_sldsc_h2_%j.out
#SBATCH --error=slurm_sldsc_h2_%j.err
#SBATCH --array=1-15

source /opt/software/anaconda3/2023.03-py3.10/etc/profile.d/conda.sh
conda activate /projects/cbmr_shared/people/wkq953/non-GDPR/.conda/envs/ldsc

# Baseline + one cell-type annotation at a time, captures marginal enrichment, more sensitive but less specific.

# === Set project specific paths ===
PROJECT_DIR="/projects/qtl_families-AUDIT/scratch"
CLUSTER=${SLURM_ARRAY_TASK_ID}
ANNOT_DIR="$PROJECT_DIR/ldsc_annotation/ATAC_peaks_per_cluster_narrowPeaks.real.muscle_${CLUSTER}_peaks.chr" # ATAC_peaks_per_cluster_narrowPeaks.real.muscle_9_peaks.chr9.l2.M
OUT_DIR="$PROJECT_DIR/sldsc_result/baselineLD_v2.2_h2_baseline_full_w_celltypes"
mkdir -p "$OUT_DIR"

# === Set general paths ===
RESOURCE_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/resources/ldsc"
GWAS_DIR="/projects/cbmr_shared/data/common_resources/SEGMENT/GWAS/processed/ldsc_munge"
LDSC_DIR="/projects/cbmr_shared/people/wkq953/non-GDPR/segment/pipeline/ldsc"
BASELINE_DIR="$RESOURCE_DIR/1000G_Phase3_baselineLD_v2.2_ldscores/baselineLD."
weights_file="$RESOURCE_DIR/1000G_Phase3_weights_hm3_no_MHC/weights.hm3_noMHC."
frq_file="$RESOURCE_DIR/1000G_Phase3_frq/1000G.EUR.QC."

cd "$LDSC_DIR" || exit 1

# === Loop through all GWAS sumstats ===
for trait in "$GWAS_DIR"/*.sumstats.gz; do

trait_name=$(basename "$trait" .sumstats.gz)
echo "Processed $trait_name"

python ldsc.py \
--h2 "$trait" \
--ref-ld-chr "$ANNOT_DIR","$BASELINE_DIR" \
--w-ld-chr "$weights_file" \
--frqfile-chr "$frq_file" \
--print-cov --print-coefficients --print-delete-vals \
--out "$OUT_DIR/${trait_name}_cluster${CLUSTER}"

done
36 changes: 36 additions & 0 deletions bash_script/3.ldsc_h2_baseline_full_w_celltype_combine_result.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

#SBATCH --ntasks=1
#SBATCH --cpus-per-task=30
#SBATCH --mem=300G
#SBATCH --time=24:00:00
#SBATCH --output=slurm_sldsc_h2_full_w_celltype_combine%j.out
#SBATCH --error=slurm_sldsc_h2_full_w_celltype_combine%j.err


# Baseline + one cell-type annotation at a time, captures marginal enrichment, more sensitive but less specific.
# combine all cell-type annotations in one run, which is more efficient than running each cell-type separately.


# === Set project specific paths ===
PROJECT_DIR="/projects/qtl_families-AUDIT/scratch"
GWAS_DIR="/projects/cbmr_shared/data/common_resources/SEGMENT/GWAS/processed/ldsc_munge"
OUT_DIR="$PROJECT_DIR/sldsc_result/baselineLD_v2.2_h2_baseline_full_w_celltypes"


# === Loop through all GWAS sumstats ===
for sumstat_file in "$GWAS_DIR"/*.sumstats.gz; do

trait_name=$(basename "$sumstat_file" .sumstats.gz)
output_file="${OUT_DIR}/${trait_name}.results"
echo -e "Category\tProp._SNPs\tProp._h2\tProp._h2_std_error\tEnrichment\tEnrichment_SE\tEnrichment_p\tCoefficient\tCoefficient_std_error\tCoefficient_z-score" > "$output_file"

for file in ${OUT_DIR}/${trait_name}_cluster*.results; do

[[ -f "$file" ]] || continue
cluster=$(basename "$file" .results | sed -E 's/.*_cluster([0-9]+)/cluster\1/')
result_line=$(awk 'NR==2' "$file")
echo -e "${cluster}\t${result_line#*L2_0}" >> "$output_file"

done
done
Loading