LncRNA_CLIP_CLASH/BEDtoCSV.R at master · RenneLab/LncRNA_CLIP_CLASH · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#! usr/bin/env Rscript

rm(list=ls())

setwd("/ufrc/renne/sunantha.s/research/CLIP_lncRNAs/MetaAnalysis/Scripts")

library(data.table)
library(splitstackshape)

###############################################################################################################################
### Add annotations to the cluster BED files

INDIR <- "/ufrc/renne/sunantha.s/research/CLIP_lncRNAs/MetaAnalysis/PIPECLIP/"
OUTDIR <- "/ufrc/renne/sunantha.s/research/CLIP_lncRNAs/MetaAnalysis/PIPECLIP/Annotated/"

SeqFiles <- list.files(path = INDIR ,pattern = "\\_seqs.bed$")
AnnFiles <- list.files(path = INDIR ,pattern = "\\_ann.bed$")

sample_names <- sub("*_ann.bed", "", AnnFiles)

names <- c("ClusterID", "Sequence", "Chr", "Start", "Stop", "Read_Count", "Strand", "p_value", "FDR", "Lnc_Chr", "Lnc_Start",
           "Lnc_End", "Lnc_Name", "Lnc_Strand", "Source", "Type", "transcript_class", "Overlap", "gene_id","transcript_id", "gene_type",
           "gene_status", "gene_name", "transcript_type", "transcript_status", "transcript_name")

for (i in 1: length(SeqFiles)) {
  seq <- read.delim(paste0(INDIR,SeqFiles[i]), stringsAsFactors = F, header = FALSE)
  colnames(seq) <- c("ClusterID","Sequence")
  seq$Sequence <- toupper(seq$Sequence)
  ann <- read.delim(paste0(INDIR,AnnFiles[i]), stringsAsFactors = F, header = FALSE)
  names(ann)[names(ann) == "V4"] <- "ClusterID"
  ann_seq <- merge(seq,ann, by="ClusterID")
  ann_seq <- cSplit(ann_seq, 19, ";")
  ann_seq <- subset.data.frame(ann_seq, select = -c(14,18,29:ncol(ann_seq)), use.names =F, fill=F)
  ann_seq$V18_01 <- gsub("gene_id ", "", ann_seq$V18_01)
  ann_seq$V18_02 <- gsub("transcript_id ", "", ann_seq$V18_02)
  ann_seq$V18_03 <- gsub("gene_type ", "", ann_seq$V18_03)
  ann_seq$V18_04 <- gsub("gene_status ", "", ann_seq$V18_04)
  ann_seq$V18_05 <- gsub("gene_name ", "", ann_seq$V18_05)
  ann_seq$V18_06 <- gsub("transcript_type ", "", ann_seq$V18_06)
  ann_seq$V18_07 <- gsub("transcript_status ", "", ann_seq$V18_07)
  ann_seq$V18_08 <- gsub("transcript_name ", "", ann_seq$V18_08)
  colnames(ann_seq) <- names
  ann_seq$Read_Length <- nchar(ann_seq$Sequence)
  ann_seq <- ann_seq[ann_seq$Read_Count>4,]
  write.csv(ann_seq, paste0(OUTDIR,sample_names[i],".csv"), row.names = FALSE)
}

###############################################################################################################################
### Count the number of clusters in different RNA categories

files <- list.files(OUTDIR, pattern = "\\.csv$")

counting <- data.frame(matrix(NA, nrow=length(files), ncol= 5))

colnames(counting) <- c("SampleName", "Unannotated", "mRNA", "smallRNA", "lncRNA")

count =1
for(file in files) {
  temp <- read.csv(paste0(OUTDIR,file), stringsAsFactors = F)
  counting[count,1] <- file
  counting[count,2] <- sum(temp$transcript_class == ".")
  counting[count,3] <- sum(temp$transcript_class == "mRNA")
  counting[count,4] <- sum(temp$transcript_class == "smallRNA")
  counting[count,5] <- sum(temp$transcript_class == "lncRNA")
  count= count+1
}
write.csv(counting, paste0(OUTDIR,"Counts/","Classes.csv"), row.names=F)

###############################################################################################################################