Profound_Autism_Paper_Scripts/convertExpressionToBED_rand_diff.R at main · QuackenbushLab/Profound_Autism_Paper_Scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# ---- user inputs ----
expr_csv   <- NULL  # genes x samples
gtf_path   <- NULL
mergedFile <- NULL
map <- read.table(mergedFile, sep = "\t", header = TRUE)
suppressPackageStartupMessages({
  library(rtracklayer)  # to read GTF
})
expr <- read.csv(expr_csv, row.names = 1, check.names = FALSE)
# ensure numeric
expr[] <- lapply(expr, function(x) suppressWarnings(as.numeric(as.character(x))))
expr <- as.data.frame(expr)
print("Read")

iid <- unlist(lapply(colnames(expr), function(samp){
  return(paste0(samp, ".p1"))
}))
aid <- unlist(lapply(iid, function(id){
  retval <- NA
  which_iid <- which(map$individual_id == id)
  aid <- map[which_iid, "array_id"]
  if(length(which_iid) == 1){
    retval <-  aid
  }
  return(retval)
}))
str(expr)
expr <- expr[,which(!is.na(aid))]
str(expr)
aid <- aid[which(!is.na(aid))]
colnames(expr) <- aid
str(expr)

# drop genes with all-NA
expr <- expr[rowSums(!is.na(expr)) > 0, , drop = FALSE]
print("Dropped")

# ---- 2) gene coordinates from GTF ----
gr <- import(gtf_path)
gr_gene <- gr[gr$type == "gene"]
# gene_id and (optionally) gene_name
gid <- if (!is.null(mcols(gr_gene)$gene_id)) mcols(gr_gene)$gene_id else mcols(gr_gene)$ID
gname <- mcols(gr_gene)$gene_name
print("coords")

# Retain only the identifiers that have matching IDs.
expr <- expr[which(rownames(expr) %in% gid),]

# choose which gene identifier to match by:
# If your expr rownames are Ensembl IDs -> use gene_id
# If your expr rownames are gene symbols -> use gene_name
if (all(rownames(expr) %in% gid)) {
  key <- gid
} else if (!is.null(gname) && all(rownames(expr) %in% gname)) {
  key <- gname
} else {
  stop("Row names of expr do not match GTF gene_id or gene_name. ",
       "Use the matching identifier or provide your own gene coord table.")
}
print("Chose ID")


df_anno <- data.frame(
  gene_key = key,
  chr  = as.character(seqnames(gr_gene)),
  start = start(gr_gene),
  end   = end(gr_gene),
  strand= as.character(strand(gr_gene)),
  stringsAsFactors = FALSE
)
print("Data frame")

# keep only genes present in expression
df_anno <- df_anno[!duplicated(df_anno$gene_key), ]
common <- intersect(rownames(expr), df_anno$gene_key)
df_anno <- df_anno[match(common, df_anno$gene_key), ]
expr    <- expr[common, , drop = FALSE]
print("Kept")

# region to use
bed_start <- pmax(0L, df_anno$start - 1L)
bed_end   <- df_anno$end
print("Region")

# phenotype_id = the same IDs you matched by
phenotype_id <- df_anno$gene_key
geno_prefix_dir <- NULL
out_tsv_dir <- NULL
for(i in 1:10){
  for(j in 1:2){                          # e.g., GENCODE/GTF
    geno_prefix <- paste0(geno_prefix_dir, "_rand_", i, "_", j)
    out_tsv <- paste0(out_tsv_dir, "_", i, "_", j, "_diff.bed")

    # ---- 3) reorder expression columns to match genotype sample order ----
    psam_path <- paste0(geno_prefix, ".psam")
    fam_path  <- paste0(geno_prefix, ".fam")
    if (file.exists(psam_path)) {
      psam <- read.table(psam_path, sep = "\t", check.names = FALSE,
                         stringsAsFactors = FALSE, header = FALSE)
      str("Found PSAM")
      sample_ids <- psam[,2]
    } else if (file.exists(fam_path)) {
      fam <- read.table(fam_path, header = FALSE, sep = "", stringsAsFactors = FALSE)
      sample_ids <- fam[[2]]  # IID is column 2
    } else {
      stop("Could not find .psam or .fam for 'geno_prefix' = ", geno_prefix)
    }
    print("Read FAM")

    # ensure all samples exist
    shared <- intersect(sample_ids, colnames(expr))
    exprTmp <- expr[, shared, drop = FALSE]
    print("Exists")
    # ---- 4) assemble phenotype BED and sort ----
    bed <- data.frame(
      `#chr` = df_anno$chr,
      start = as.integer(bed_start),
      end   = as.integer(bed_end),
      phenotype_id = phenotype_id,
      check.names = FALSE
    )
    #str(exprTmp)
    bed <- cbind(bed, exprTmp)

    # sort by chrom and start (natural human order if possible)
    chr_order <- function(x) {
      # try to place 1..22, X, Y, MT at the end if present
      x2 <- toupper(x)
      map <- setNames(seq_along(x2), x2)
      # numeric part
      num <- suppressWarnings(as.integer(x2))
      is_num <- !is.na(num)
      ord <- order(
        ifelse(is_num, 0L, ifelse(x2 %in% c("X","Y","MT","M"), 1L, 2L)),
        ifelse(is_num, num,
               match(x2, c("X","Y","MT","M"), nomatch = 999L)),
        bed$start
      )
      ord
    }
    bed <- bed[chr_order(bed$`#chr`), ]
    print("sorted by chrom")
    str(bed)
    # ---- 5) write, bgzip, tabix ----
    write.table(bed, file = out_tsv, sep = "\t", quote = FALSE,
                row.names = FALSE, col.names = TRUE)
    print("wrote")
  }
}