-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgenerate_DNAshapes_csv.R
More file actions
129 lines (114 loc) · 3.72 KB
/
generate_DNAshapes_csv.R
File metadata and controls
129 lines (114 loc) · 3.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env Rscript
### generate_DNAhapes_csv.R
### Zian Liu
### 6/21/2021
## This script generates the DNA shapes table for k-mers given any odd number k.
# Library
library(DNAshapeR) # Note: install from the author's github
# Read inputs
args = commandArgs(trailingOnly=TRUE)
#defaults=c(5, "fasta_dict_5mer.fa", "ref_5mers_structure.csv"))
k = as.integer(args[1])
FaName = args[2]
OutputFile = args[3]
FlagCPG = as.logical(args[4])
print("Length of sequences k: ")
print(k)
print("Name of output fasta file: ")
print(FaName)
print("Name of final output csv file: ")
print(OutputFile)
print("Whether to consider CpG status (logical): ")
print(FlagCPG)
# Build the basic letters dictionary
dict_4 <- c('A','C','G','T')
# For each value in k, make the iteration
TotalIt = 4^(k-1)
Seqs = matrix(nrow=4^k, ncol=k)
for (i in 1:k) {
Div = 4^(i-1)
TempIt = TotalIt/Div
Seqs[, i] <- rep(
c(rep('A', TempIt), rep('C', TempIt), rep('G', TempIt), rep('T', TempIt)), Div)
}
# Collapse the matrix to form individual sequences
dict_fa_1 <- apply(format(Seqs), 1, paste, collapse="")
# Add titles
dict_fa_title <- paste(rep('>seq', 4^k), 1:(4^k), sep = '_')
# Interweave the previous two vectors into one vector & write it into text file
write.table(as.vector(rbind(dict_fa_title, dict_fa_1)),
file = FaName, row.names = FALSE, col.names = FALSE, quote = FALSE)
# Notification
print("Fasta sequences generated and saved.")
# If CpG is toggled, save a CpG-specific fasta
if (FlagCPG) {
# Create dict_fa_2 which replaces all occurrences of "CG" to "Mg"
dict_fa_2 <- gsub("CG", "Mg", dict_fa_1)
# Interweave the previous two vectors into one vector & write it into text file
write.table(as.vector(rbind(dict_fa_title, dict_fa_2)),
file = paste(FaName, "cpg", sep="."),
row.names = FALSE, col.names = FALSE, quote = FALSE)
}
# Define all shapes used
ShapeTypeList = c('HelT', 'Rise', 'Roll', 'Shift', 'Slide', 'Tilt', 'Buckle',
'Opening', 'ProT', 'Shear', 'Stagger', 'Stretch', 'MGW', 'EP')
# If CpG is toggled, define CpG specific shapes
if (FlagCPG) {
ShapeTypeListM = c("ProT", "HelT", "Roll", "MGW")
}
# Import the fastas and run shapes
ShapeTF = data.frame()
All_colnames = c()
for (ShType in ShapeTypeList){
# Get DNA shape
TempDF <- getShape(
FaName, shapeType = ShType
)[[ShType]]
# Remove all NA columns
TempDF <- TempDF[, colSums(is.na(TempDF)) < 1]
# Change column names
if (length(TempDF) / length(dict_fa_1) < 2) {
All_colnames <- c(All_colnames, paste(ShType, 1, sep="_"))
} else {
All_colnames <- c(All_colnames, paste(ShType, 1:ncol(TempDF), sep="_"))
}
# Add data
if (ncol(ShapeTF) == 0) {
ShapeTF <- TempDF
} else {
ShapeTF = cbind(ShapeTF, TempDF)
}
}
print("DNA shapes generated.")
# If CpG is toggled, run CpG-specific shapes:
if (FlagCPG) {
for (ShType in ShapeTypeListM){
ShTypeM = paste(ShType, "m", sep="_")
# Get DNA shape
TempDF <- getShape(
paste(FaName, "cpg", sep="."), shapeType = ShType, methylate=TRUE
)[[ShType]]
# Remove all NA columns
TempDF <- TempDF[, colSums(is.na(TempDF)) < 1]
# Change column names
if (length(TempDF) / length(dict_fa_1) < 2) {
All_colnames <- c(All_colnames, paste(ShType, "m", 1, sep="_"))
} else {
All_colnames <- c(All_colnames, paste(ShType, "m", 1:ncol(TempDF), sep="_"))
}
# Add data
if (ncol(ShapeTF) == 0) {
ShapeTF <- TempDF
} else {
ShapeTF = cbind(ShapeTF, TempDF)
}
}
print("CpG-specific DNA shapes generated.")
}
# Change names
rownames(ShapeTF) <- dict_fa_1
colnames(ShapeTF) <- All_colnames
# Make an output
write.csv(ShapeTF, file = OutputFile, row.names = TRUE)
# Notification
print("CSV file generated.")