Skip to content

Commit b6c7ac5

Browse files
authored
Merge pull request #1 from iibadshah/main
Adding ks datasets
2 parents 786feb9 + f100764 commit b6c7ac5

6 files changed

Lines changed: 23161 additions & 2 deletions

File tree

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: protools2
22
Type: Package
33
Title: A set of tools for proteomics and phosphoproteomics data analysis
4-
Version: 0.1.0
4+
Version: 0.2.1
55
Date: 2023-01-07
66
Author: Pedro Rodriguez Cutillas
77
Maintainer: Pedro Rodriguez Cutillas <p.cutillas@qmul.ac.uk>
@@ -22,4 +22,4 @@ Imports:
2222
limma,
2323
readxl,
2424
stringr
25-
RoxygenNote: 7.2.1
25+
RoxygenNote: 7.2.3

data-raw/Kinase_Substrate_Dataset_psite_2023_05.csv

Lines changed: 22792 additions & 0 deletions
Large diffs are not rendered by default.

data-raw/process files.R

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
# Load packages ----
2+
library(devtools)
3+
library(dplyr)
4+
5+
6+
# Create species-specific PhosphoSitePlus data ----
7+
8+
# Load PhosphoSitePlus data
9+
# Available from: https://www.phosphosite.org/staticDownloads
10+
psite_Kinase_Substrate_Dataset <- read.csv("Kinase_Substrate_Dataset_psite_2023_05.csv")
11+
12+
# Get unique species in dataset
13+
species_kin <- unique(psite_Kinase_Substrate_Dataset$KIN_ORGANISM)
14+
species_sub <- unique(psite_Kinase_Substrate_Dataset$SUB_ORGANISM)
15+
species_list <- dplyr::intersect(species_kin, species_sub)
16+
17+
# Create dataframe in `protools2` required format
18+
psite_ks_species <- vector("list", length = length(species_list))
19+
names(psite_ks_species) <- paste0("psite_", species_list)
20+
for (species in species_list) {
21+
psite_ks_species[[paste0("psite_", species)]] <- psite_Kinase_Substrate_Dataset %>%
22+
dplyr::filter(KIN_ORGANISM == species & SUB_ORGANISM == species) %>% # Filters matching species
23+
dplyr::group_by(GENE) %>%
24+
dplyr::summarise(
25+
m = dplyr::n(),
26+
subs = paste0(SUB_GENE, "(", SUB_MOD_RSD, ")", collapse = ";")
27+
) %>%
28+
dplyr::mutate(
29+
organism = species,
30+
gene.db = paste0(GENE, ".pSite")
31+
) %>%
32+
dplyr::rename(gene = GENE)
33+
}
34+
35+
# Save the processed data (saves to 'data/psite_ks_species.rda')
36+
# overwrite = TRUE lets this script be re-run after the .rda already exists,
# matching the other use_data() calls in this script.
usethis::use_data(psite_ks_species, overwrite = TRUE)
37+
38+
39+
#####
40+
41+
jj <- 1
42+
43+
if (jj == 1) {
44+
edges <- "https://www.dropbox.com/s/ttmzd40mnjgh1iu/edges.csv?dl=1"
45+
pdts <- "https://www.dropbox.com/s/86jfnayv0qa1n2q/pdts.csv?dl=1"
46+
psite <- "https://www.dropbox.com/s/eb1qoofz793f4tq/psite.csv?dl=1"
47+
signor <- "https://www.dropbox.com/s/alpbq880emz1z2t/signor.csv?dl=1"
48+
reactome <- "https://www.dropbox.com/s/jdcc1355cz73mmi/reactome.csv?dl=1"
49+
50+
process <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/process_ont.csv"
51+
myfunction <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/function_ont.csv"
52+
location <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/location_ont.csv"
53+
54+
process_mouse <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/process_ont_mouse.csv"
55+
myfunction_mouse <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/function_ont_mouse.csv"
56+
location_mouse <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/location_ont_mouse.csv"
57+
58+
process_rat <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/process_ont_rat.csv"
59+
myfunction_rat <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/function_ont_rat.csv"
60+
location_rat <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/location_ont_rat.csv"
61+
62+
process_pig <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/process_ont_pig.csv"
63+
myfunction_pig <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/function_ont_pig.csv"
64+
location_pig <- "C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/location_ont_pig.csv"
65+
66+
nci <- "https://www.dropbox.com/s/fe8t4nyhbljsn5y/nci.csv?dl=1"
67+
human.cell.markers.full <- "https://www.dropbox.com/s/a2tvvuh8kgi340v/human_cell_markers_full_lineage.csv?dl=1"
68+
human.cell.markers.short <- "https://www.dropbox.com/s/4njofwfs5uu3wya/human_cell_markers_short_lineage.csv?dl=1"
69+
mouse.cell.markers <- "https://www.dropbox.com/s/223bpbc5p24j919/mouse_cell_markers_full_lineage.csv?dl=1"
70+
blood.cell.markers.human1 <- "https://www.dropbox.com/s/kl2apjfnbqdw7gj/blood_cell_markers.csv?dl=1"
71+
blood.cell.markers.human2 <- "https://www.dropbox.com/s/o28o4fbmmuwtgs3/blood_cell_markers2.csv?dl=1"
72+
bone.marrow.cell.markers <- "https://www.dropbox.com/s/gl0vim65cz9804a/bone_marrow_cell_markers.csv?dl=1"
73+
tf.targets <- "https://www.dropbox.com/s/cuitt6gzipxaaec/TF%20target%20genes%20gtrd%20v71.csv?dl=1"
74+
75+
pdt.sig <- "https://www.dropbox.com/s/wr6j8i5gosm959x/pdts_signor.csv?dl=1"
76+
77+
tf.targets.omnipath <- "https://www.dropbox.com/s/hg3slk150l7zd0x/TF%20all%20targets%20omnipath.csv?dl=1"
78+
chromatin <- "https://www.dropbox.com/s/y2gkjn45oxy72nw/chromatin.csv?dl=1"
79+
selected <- "https://www.dropbox.com/s/nrraoj8hlvzap87/selected.csv?dl=1"
80+
ctams <- "https://www.dropbox.com/s/ev95hm4zz4c535c/ctams.csv?dl=1"
81+
kegg <- "https://www.dropbox.com/s/gm2821cmxarv7sx/kegg%20pathways.csv?dl=1"
82+
pp.markers.corr.with.cd <- "https://www.dropbox.com/s/aixtdiwhuym9vk6/PP%20Markers%20corr%20with%20CDs.csv?dl=1"
83+
84+
hallmark.genes <- "https://www.dropbox.com/s/wqvnoalg2v6ufm8/hallmark%20genes%20v71.csv?dl=1"
85+
86+
cd.phospho.markers <- "https://www.dropbox.com/s/aixtdiwhuym9vk6/PP%20Markers%20corr%20with%20CDs.csv?dl=1"
87+
pdts.reactome <- "https://www.dropbox.com/s/rvqjmzg9cq9yjpu/pdts_reactome.csv?dl=1"
88+
pdts.process <- "https://www.dropbox.com/s/1vqm9kky5ctzj2g/pdts_process.csv?dl=1"
89+
pdts.location <- "https://www.dropbox.com/s/jbroun1ojqu9rnc/pdts_location.csv?dl=1"
90+
pdts.nci <- "https://www.dropbox.com/s/qjvmxgp6fd5cv8x/pdts_nci.csv?dl=1"
91+
92+
ks.omnipath <- "https://www.dropbox.com/s/zdnuxe8anwboks1/Kinase%20substrates%20omnipath.csv?dl=1"
93+
94+
circuitries <- "https://www.dropbox.com/s/9vykdbcvy0jzh0f/Results%20circuitry%20anal.csv?dl=1"
95+
96+
ctams.hijazi <- "https://www.dropbox.com/s/zksj57u60q6sg7q/CTAMS_hijazi.csv?dl=1"
97+
98+
99+
dataset.names <- c(
100+
"edges", # 1
101+
"pdts",
102+
"psite",
103+
"signor",
104+
"ks.omnipath", # 5
105+
"reactome",
106+
"process",
107+
"function",
108+
"location",
109+
"process_mouse", # 10
110+
"function_mouse",
111+
"location_mouse",
112+
"process_rat",
113+
"function_rat",
114+
"location_rat", # 15
115+
"process_pig",
116+
"function_pig",
117+
"location_pig",
118+
"nci", # 19
119+
"human.cell.markers.full", # 20
120+
"human.cell.markers.short",
121+
"mouse.cell.markers",
122+
"blood.cell.markers.human1",
123+
"blood.cell.markers.human2",
124+
"bone.marrow.cell.markers", # 25
125+
"tf.targets",
126+
"tf.targets.omnipath",
127+
"chromatin",
128+
"selected",
129+
"ctams", # 30
130+
"kegg",
131+
"cd.phospho.markers",
132+
"pdts.reactome",
133+
"pdts.process",
134+
"pdts.location", # 35
135+
"pdts.nci",
136+
"markers.corr.with.cd",
137+
"hallmark.genes",
138+
"pdt.sig",
139+
"circuitries", # 40
140+
"ctams.hijazi",
141+
"psite_mouse", # Incorporated new species-specific PhosphoSitePlus data
142+
"psite_rat",
143+
"psite_human",
144+
"psite_rabbit", # 45
145+
"psite_chicken",
146+
"psite_cow",
147+
"psite_pig",
148+
"psite_frog",
149+
"psite_dog", # 50
150+
"psite_hamster"
151+
)
152+
153+
datasets <- list(
154+
edges, # 1
155+
pdts, # 2
156+
psite,
157+
signor,
158+
ks.omnipath, # 5
159+
reactome,
160+
process,
161+
myfunction,
162+
location,
163+
process_mouse, # 10
164+
myfunction_mouse,
165+
location_mouse,
166+
process_rat,
167+
myfunction_rat,
168+
location_rat, # 15
169+
process_pig,
170+
myfunction_pig,
171+
location_pig,
172+
nci, # 19
173+
human.cell.markers.full, # 20
174+
human.cell.markers.short,
175+
mouse.cell.markers,
176+
blood.cell.markers.human1, # 23
177+
blood.cell.markers.human2,
178+
bone.marrow.cell.markers, # 25
179+
tf.targets,
180+
tf.targets.omnipath,
181+
chromatin, # 28
182+
selected,
183+
ctams, # 30
184+
kegg,
185+
cd.phospho.markers,
186+
pdts.reactome,
187+
pdts.process,
188+
pdts.location, # 35
189+
pdts.nci,
190+
pp.markers.corr.with.cd,
191+
hallmark.genes,
192+
pdt.sig,
193+
circuitries, # 40
194+
ctams.hijazi,
195+
psite_ks_species$psite_mouse, # Incorporated new species-specific PhosphoSitePlus data
196+
psite_ks_species$psite_rat,
197+
psite_ks_species$psite_human,
198+
psite_ks_species$psite_rabbit, # 45
199+
psite_ks_species$psite_chicken,
200+
psite_ks_species$psite_cow,
201+
psite_ks_species$psite_pig,
202+
psite_ks_species$psite_frog,
203+
psite_ks_species$psite_dog, # 50
204+
psite_ks_species$psite_hamster
205+
)
206+
207+
208+
# df.datasets <- list(datasets)
209+
names(datasets) <- dataset.names
210+
dataset.list <- list()
211+
i <- 1
212+
for (d in datasets) {
213+
names(datasets[1])
214+
# Entries of `datasets` are either a CSV path/URL (character) or an in-memory
# data frame (the psite_ks_species tables). read.csv() only accepts a
# path/URL or connection, so data frames are stored as-is.
dataset.list[[i]] <- if (is.data.frame(datasets[[i]])) {
  datasets[[i]]
} else {
  read.csv(datasets[[i]])
}
215+
names(dataset.list)[i] <- names(datasets[i])
216+
i <- i + 1
217+
}
218+
protein_and_ks_sets <- dataset.list
219+
k <- protein_and_ks_sets$process
220+
usethis::use_data_raw(name = "protein_and_ks_sets")
221+
usethis::use_data(protein_and_ks_sets, overwrite = TRUE)
222+
}
223+
224+
225+
226+
uniprot.names <- read.csv("C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/uniprot_reviewed_20230102.csv")
227+
228+
usethis::use_data_raw(name = "uniprot.names")
229+
usethis::use_data(uniprot.names, overwrite = TRUE)
230+
231+
uniprot.names.mouse <- read.csv("C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/uniprot/uniprot_mouse_reviewed_20230102.csv")
232+
233+
usethis::use_data_raw(name = "uniprot.names.mouse")
234+
usethis::use_data(uniprot.names.mouse, overwrite = TRUE)
235+
236+
####
237+
238+
239+
datasets <- protools::protein_and_ks_sets
240+
241+
d <- datasets$selected
242+
243+
244+
j <- 10
245+
for (d in datasets) {
246+
dd <- datasets[j]
247+
nn <- names(dd[[1]])
248+
ddd <- dd[[1]]
249+
names(dd[1])
250+
ddd <- subset(ddd, ddd[, 2] > 2)
251+
# if (!"genes" %in% nn){
252+
i <- 1
253+
# For each remaining row, split the ";"-separated values in column 3,
# look up the matching gene names and store them ";"-joined in `genes`.
# seq_len() keeps the loop from running when the subset above left no rows
# (1:nrow(ddd) would iterate over c(1, 0) in that case).
for (i in seq_len(nrow(ddd))) {
  prots <- unique(unlist(strsplit(ddd[i, 3], ";")))
  genes <- protools::gene.names.from.accessions(prots)
  ddd$genes[i] <- paste0(genes, collapse = ";")
}
258+
datasets[[j]] <- ddd
259+
# }
260+
j <- j + 1
261+
}
262+
protein_and_ks_sets <- datasets
263+
264+
usethis::use_data_raw(name = "protein_and_ks_sets")
265+
usethis::use_data(protein_and_ks_sets, overwrite = TRUE)
266+
267+
268+
df.kegg <- read.csv("C:/Users/cutill01/Dropbox/01pedro work/01_data/databases/gene sets/kegg pathways_2.csv")
269+
i <- 1
270+
accs <- accessions.from.gene.names(df.kegg$gene)
271+
df.kegg$proteins <- accs
272+
273+
pathways <- unique(df.kegg$pathway)
274+
nn <- length(pathways)
275+
276+
pathway <- character(nn)
277+
genes <- character(nn)
278+
proteins <- character(nn)
279+
m <- numeric(nn)
280+
i <- 1
281+
for (p in pathways) {
282+
xx <- df.kegg[df.kegg$pathway == p, ]
283+
genes[i] <- paste0(xx$gene, collapse = ";")
284+
proteins[i] <- paste0(xx$proteins, collapse = ";")
285+
m[i] <- nrow(xx)
286+
i <- i + 1
287+
}
288+
289+
df.kegg.b <- data.frame(pathways, m, proteins, genes)
290+
291+
df.kegg.b <- df.kegg.b[order(-df.kegg.b$m), ]
292+
293+
protein_and_ks_sets[["kegg"]] <- df.kegg.b
294+
295+
usethis::use_data_raw(name = "protein_and_ks_sets")
296+
usethis::use_data(protein_and_ks_sets, overwrite = TRUE)
297+
298+
k <- protein_and_ks_sets$kegg$genes[1:10]
299+
300+
301+
302+
use_package("foreach")
303+
use_package("doParallel")
304+
use_package("readxl")
305+
use_package("dplyr")
306+
use_package("ggrepel")
307+
use_package("stringr")
308+
use_package("ggplot2")
309+
use_package("ggpubr")
310+
use_package("ggrepel")
311+
use_package("igraph")
312+
use_package("limma")
313+
document()
314+
315+
# Reload the package: CTRL-L or
316+
load_all()

data-raw/process_files.R

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Process species-specific data files
2+
3+
4+
# Load packages ----
5+
library(devtools)
6+
library(dplyr)
7+
8+
9+
# Create species-specific PhosphoSitePlus data ----
10+
11+
# Load PhosphoSitePlus data
12+
# Available from: https://www.phosphosite.org/staticDownloads
13+
psite_Kinase_Substrate_Dataset <- read.csv("Kinase_Substrate_Dataset_psite_2023_05.csv")
14+
15+
# Get unique species in dataset
16+
species_kin <- unique(psite_Kinase_Substrate_Dataset$KIN_ORGANISM)
17+
species_sub <- unique(psite_Kinase_Substrate_Dataset$SUB_ORGANISM)
18+
species_list <- dplyr::intersect(species_kin, species_sub)
19+
20+
# Create dataframe in `protools2` required format
21+
psite_ks_species <- vector("list", length = length(species_list))
22+
names(psite_ks_species) <- paste0("psite_", species_list)
23+
for (species in species_list) {
24+
psite_ks_species[[paste0("psite_", species)]] <- psite_Kinase_Substrate_Dataset %>%
25+
dplyr::filter(KIN_ORGANISM == species & SUB_ORGANISM == species) %>% # Filters matching species
26+
dplyr::group_by(GENE) %>%
27+
dplyr::summarise(
28+
m = dplyr::n(),
29+
subs = paste0(SUB_GENE, "(", SUB_MOD_RSD, ")", collapse = ";")
30+
) %>%
31+
dplyr::mutate(
32+
organism = species,
33+
gene.db = paste0(GENE, ".pSite")
34+
) %>%
35+
dplyr::rename(gene = GENE)
36+
}
37+
38+
# Save the processed data in package (saves to '../data/psite_ks_species.rda')
39+
# overwrite = TRUE lets this script be re-run after the .rda already exists,
# matching the use_data() call for protein_and_ks_sets below.
usethis::use_data(psite_ks_species, overwrite = TRUE)
40+
41+
42+
# Combine with previous protein and kinase-substrate sets ----
43+
44+
# Load list object
45+
load("../data/protein_and_ks_sets.rda")
46+
47+
# Combine lists
48+
# Assign by name rather than c(): new psite_* entries are appended, while
# entries already present (e.g. when this script is re-run against the saved
# .rda) are replaced instead of being duplicated.
protein_and_ks_sets[names(psite_ks_species)] <- psite_ks_species
49+
50+
# Save data in package (saves to '../data/protein_and_ks_sets.rda')
51+
usethis::use_data(protein_and_ks_sets, overwrite = TRUE)

data/protein_and_ks_sets.rda

65.7 KB
Binary file not shown.

data/psite_ks_species.rda

66.5 KB
Binary file not shown.

0 commit comments

Comments
 (0)