Skip to content

Commit af4d6c0

Browse files
committed
add datasets
1 parent 008c0da commit af4d6c0

10 files changed

Lines changed: 826 additions & 0 deletions

File tree

datasets/gigaref/Snakefile

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
# Snakemake pipeline for building the gigaref sequence set:
#   1. cluster the MGnify set and the MERC+SRC set independently
#      (linclust, 70% identity / 80% coverage) and extract representatives,
#   2. concatenate both representative sets with the remainder set,
#   3. re-cluster the union at 90% identity, then cluster those
#      representatives again at 50% identity,
#   4. flatten the final clustering into /data/final/final_seqs.fasta.
#
# FIX: `rule mgnify_rep_db` and `rule merc_src_rep_db` were missing the
# trailing colon on the rule line — a syntax error that prevented Snakemake
# from parsing this file at all.

# Default target: request the final flattened FASTA.
rule parent:
    input:
        "/data/final/final_seqs.fasta"
    shell:
        """
        echo "Parent rule"
        """

# Convert the raw MGnify FASTA into an mmseqs2 sequence database.
rule mgnify_db:
    input:
        "/data/mgnify/mgnify.fasta"
    output:
        "/data/mgnify/db/mgnify_db"
    shell:
        """
        mmseqs createdb {input} {output}
        """

# Linear-time clustering of MGnify at 70% identity, 80% target coverage.
rule mgnify_clu:
    input:
        "/data/mgnify/db/mgnify_db"
    output:
        "/data/mgnify/clu/mgnify_clu"
    shell:
        """
        mmseqs linclust {input} {output} /data/mgnify/tmp --cluster-mode 2 --cov-mode 2 -c 0.8 --min-seq-id 0.7
        """

# Extract the cluster representatives into a sub-database.
rule mgnify_rep_db:
    input:
        "/data/mgnify/db/mgnify_db",
        "/data/mgnify/clu/mgnify_clu"
    output:
        "/data/mgnify/clu/mgnify_reps"
    shell:
        """
        mmseqs createsubdb {input[1]} {input[0]} {output}
        """

# Export the MGnify representatives back to FASTA.
rule mgnify_reps:
    input:
        "/data/mgnify/clu/mgnify_reps"
    output:
        "/data/mgnify/clu/mgnify_reps.fasta"
    shell:
        """
        mmseqs convert2fasta {input} {output}
        """

# Convert the MERC+SRC FASTA into an mmseqs2 sequence database.
rule merc_src_db:
    input:
        "/data/merc_and_src/merc_src.fasta"
    output:
        "/data/merc_and_src/db/merc_src_db"
    shell:
        """
        mmseqs createdb {input} {output}
        """

# Linear-time clustering of MERC+SRC with the same thresholds as MGnify.
rule merc_src_clu:
    input:
        "/data/merc_and_src/db/merc_src_db"
    output:
        "/data/merc_and_src/clu/merc_src_clu"
    shell:
        """
        mmseqs linclust {input} {output} /data/merc_and_src/tmp --cluster-mode 2 --cov-mode 2 -c 0.8 --min-seq-id 0.7
        """

# Extract the MERC+SRC cluster representatives into a sub-database.
rule merc_src_rep_db:
    input:
        "/data/merc_and_src/db/merc_src_db",
        "/data/merc_and_src/clu/merc_src_clu"
    output:
        "/data/merc_and_src/clu/merc_src_reps"
    shell:
        """
        mmseqs createsubdb {input[1]} {input[0]} {output}
        """

# Export the MERC+SRC representatives back to FASTA.
rule merc_src_reps:
    input:
        "/data/merc_and_src/clu/merc_src_reps"
    output:
        "/data/merc_and_src/clu/merc_src_reps.fasta"
    shell:
        """
        mmseqs convert2fasta {input} {output}
        """

# Merge both representative sets with the remainder sequences.
rule concat:
    input:
        "/data/merc_and_src/clu/merc_src_reps.fasta",
        "/data/mgnify/clu/mgnify_reps.fasta",
        "/data/remainder/remainder.fasta"
    output:
        "/data/all/all.fasta"
    shell:
        """
        cat {input[0]} {input[1]} {input[2]} > {output}
        """

# Build an mmseqs2 database over the merged set.
rule all_db:
    input:
        "/data/all/all.fasta"
    output:
        "/data/all/db/all_db"
    shell:
        """
        mmseqs createdb {input} {output}
        """

# Tighter clustering of the merged set at 90% identity.
rule all_clu:
    input:
        "/data/all/db/all_db"
    output:
        "/data/all/clu/all_clu"
    shell:
        """
        mmseqs linclust {input} {output} /data/all/tmp --cluster-mode 2 --cov-mode 2 -c 0.8 --min-seq-id 0.9
        """

# Representatives of the 90%-identity clustering.
rule all_rep_db:
    input:
        "/data/all/db/all_db",
        "/data/all/clu/all_clu"
    output:
        "/data/all/clu/all_reps"
    shell:
        """
        mmseqs createsubdb {input[1]} {input[0]} {output}
        """

# Loose final clustering of the 90% representatives at 50% identity.
rule final_clu:
    input:
        "/data/all/clu/all_reps"
    output:
        "/data/final/clu/final_clu"
    shell:
        """
        mmseqs linclust {input} {output} /data/final/tmp --cluster-mode 2 --cov-mode 2 -c 0.8 --min-seq-id 0.5
        """

# Per-cluster sequence file database for the final clustering.
rule final_seq_db:
    input:
        "/data/all/clu/all_reps",
        "/data/final/clu/final_clu"
    output:
        "/data/final/final_seqs"
    shell:
        """
        mmseqs createseqfiledb {input[0]} {input[1]} {output}
        """

# Flatten to FASTA; result2flat repeats each cluster representative's header
# line, which downstream scripts rely on as the cluster-boundary marker.
rule final_seqs:
    input:
        "/data/all/clu/all_reps",
        "/data/final/final_seqs"
    output:
        "/data/final/final_seqs.fasta"
    shell:
        """
        mmseqs result2flat {input[0]} {input[0]} {input[1]} {output}
        """

datasets/gigaref/dedup.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import json
2+
import numpy as np
3+
from datasets import load_dataset, Dataset
4+
5+
DATA = ['rtest', 'valid']
6+
seqs = set()
7+
for data in DATA:
8+
with open('/data/intermediate/new_'+data+'.fasta', 'r') as f:
9+
for line in f:
10+
if not line.startswith('>'):
11+
seqs.add(line)
12+
13+
# def fasta_generator(filepath):
14+
# with open(filepath, 'r') as f:
15+
# seq_id = None
16+
# sequence = None
17+
# cluster = []
18+
# prev_line = None
19+
# for line in f:
20+
# if prev_line and prev_line == line:
21+
# if cluster:
22+
# yield {"representative": cluster[0], "members": cluster}
23+
# cluster = []
24+
# if line.startswith('>'):
25+
# if seq_id and sequence:
26+
# cluster.append({"id": seq_id, "sequence": sequence})
27+
# seq_id = line.strip()
28+
# sequence = None
29+
# else:
30+
# sequence = line.strip()
31+
# prev_line = line
32+
# if seq_id and sequence:
33+
# cluster.append({"id": seq_id, "sequence": sequence})
34+
# if cluster:
35+
# yield {"representative": cluster[0], "members": cluster}
36+
37+
# dataset = Dataset.from_generator(fasta_generator, num_proc=128, gen_kwargs={"filepath": "/data/all_new/db/inner_db/final_seqs.fasta"})
38+
39+
# Filter out clusters with any IDs in the ids list
40+
# def filter_clusters(cluster):
41+
# for member in cluster['members']:
42+
# if member['id'] in ids:
43+
# return False
44+
# return True
45+
46+
# filtered_dataset = dataset.filter(filter_clusters, num_proc=128)
47+
48+
with open("/data/pre_dedup/final_clusters.fasta", 'r') as f, open('/data/post_dedup/dedup_clusters.fasta', 'w') as outfile:
49+
cluster = []
50+
prev_line = None
51+
valid = True
52+
sequence = None
53+
seq_id = None
54+
for line in f:
55+
if prev_line and prev_line == line:
56+
if cluster and valid:
57+
outfile.write(cluster[0])
58+
outfile.writelines(cluster)
59+
cluster = []
60+
valid = True
61+
if line.startswith('>'):
62+
if seq_id and sequence:
63+
cluster.append(seq_id)
64+
cluster.append(sequence)
65+
seq_id = line
66+
sequence = None
67+
else:
68+
sequence = line
69+
if sequence in seqs:
70+
seqs.remove(sequence)
71+
print("match")
72+
valid = False
73+
prev_line = line
74+
75+
if seq_id and sequence:
76+
cluster.append(seq_id)
77+
cluster.append(sequence)
78+
if cluster and valid:
79+
outfile.write(cluster[0])
80+
outfile.writelines(cluster)
81+
82+
# Write the filtered sequences to the output file in cluster format
83+
# with open('/data/final/dedup.fasta', 'w') as outfile:
84+
# for cluster in filtered_dataset:
85+
# representative = cluster['representative']
86+
# outfile.write(f"{representative['id']}\n{representative['sequence']}\n")
87+
88+
89+
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
import random
2+
import json
3+
4+
with open("/data/post_dedup/dedup_clusters.fasta", 'r') as f, open("/data/gigaref/private/consensus.fasta", 'w') as private, open("/data/gigaref/consensus.fasta", 'w') as consensus:
5+
current_cluster = []
6+
prev = None
7+
id = None
8+
seq = None
9+
size = 0
10+
indices = []
11+
index = 0
12+
private_index = 0
13+
public_index = 0
14+
private_clu_json = {'test': []}
15+
private_rep_json = {'test': []}
16+
clu_json = {'train': [], 'test': []}
17+
clu_no_singles_json = {'train': [], 'test': []}
18+
rep_json = {'train': [], 'test': []}
19+
rep_no_singles_json = {'train': [], 'test': []}
20+
21+
for line in f:
22+
if prev == line:
23+
if current_cluster:
24+
if size > 1:
25+
rand = random.random()
26+
if (rand < 4.1e-5):
27+
private.writelines(current_cluster)
28+
private_clu_json['test'].append([i + private_index for i in range(len(indices))])
29+
private_rep_json['test'].append(private_index)
30+
private_index += len(indices)
31+
elif(rand < 8.2e-5):
32+
consensus.writelines(current_cluster)
33+
clu_json['test'].append([i + public_index for i in range(len(indices))])
34+
clu_no_singles_json['test'].append([i + public_index for i in range(len(indices))])
35+
rep_json['test'].append(public_index)
36+
rep_no_singles_json['test'].append(public_index)
37+
public_index += len(indices)
38+
else:
39+
consensus.writelines(current_cluster)
40+
clu_json['train'].append([i + public_index for i in range(len(indices))])
41+
clu_no_singles_json['train'].append([i + public_index for i in range(len(indices))])
42+
rep_json['train'].append(public_index)
43+
rep_no_singles_json['train'].append(public_index)
44+
public_index += len(indices)
45+
else:
46+
consensus.writelines(current_cluster)
47+
clu_json['train'].append([i + public_index for i in range(len(indices))])
48+
rep_json['train'].append(public_index)
49+
public_index += len(indices)
50+
current_cluster = []
51+
size = 0
52+
indices = []
53+
if line.startswith('>'):
54+
if id and seq:
55+
current_cluster.append(id)
56+
current_cluster.append(seq)
57+
size += 1
58+
id = line
59+
seq = None
60+
else:
61+
indices.append(index)
62+
seq = line
63+
index+=1
64+
prev = line
65+
66+
print("Done reading file")
67+
68+
if id and seq:
69+
current_cluster.append(id)
70+
current_cluster.append(seq)
71+
size += 1
72+
if current_cluster:
73+
if size > 1:
74+
rand = random.random()
75+
if (rand < 4.1e-5):
76+
private.writelines(current_cluster)
77+
private_clu_json['test'].append([i + private_index for i in range(len(indices))])
78+
private_rep_json['test'].append(private_index)
79+
private_index += len(indices)
80+
elif(rand < 8.2e-5):
81+
consensus.writelines(current_cluster)
82+
clu_json['test'].append([i + public_index for i in range(len(indices))])
83+
clu_no_singles_json['test'].append([i + public_index for i in range(len(indices))])
84+
rep_json['test'].append(public_index)
85+
rep_no_singles_json['test'].append(public_index)
86+
public_index += len(indices)
87+
else:
88+
consensus.writelines(current_cluster)
89+
clu_json['train'].append([i + public_index for i in range(len(indices))])
90+
clu_no_singles_json['train'].append([i + public_index for i in range(len(indices))])
91+
rep_json['train'].append(public_index)
92+
rep_no_singles_json['train'].append(public_index)
93+
public_index += len(indices)
94+
else:
95+
consensus.writelines(current_cluster)
96+
clu_json['train'].append([i + public_index for i in range(len(indices))])
97+
rep_json['train'].append(public_index)
98+
public_index += len(indices)
99+
100+
print("Finished last cluster")
101+
102+
consensus.flush()
103+
consensus.close()
104+
private.flush()
105+
private.close()
106+
107+
print("Closed files")
108+
109+
with open("/data/gigaref/private/clustered_splits.json", 'w') as f:
110+
json.dump(private_clu_json, f)
111+
112+
with open("/data/gigaref/private/splits.json", 'w') as f:
113+
json.dump(private_rep_json, f)
114+
115+
with open("/data/gigaref/with_singletons/clustered_splits.json", 'w') as f:
116+
json.dump(clu_json, f)
117+
118+
with open("/data/gigaref/with_singletons/splits.json", 'w') as f:
119+
json.dump(rep_json, f)
120+
121+
with open("/data/gigaref/no_singletons/clustered_splits.json", 'w') as f:
122+
json.dump(clu_no_singles_json, f)
123+
124+
with open("/data/gigaref/no_singletons/splits.json", 'w') as f:
125+
json.dump(rep_no_singles_json, f)
126+
127+
print("Done writing json files")

0 commit comments

Comments
 (0)