-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcorePHASE.py
More file actions
1630 lines (1305 loc) · 74.6 KB
/
corePHASE.py
File metadata and controls
1630 lines (1305 loc) · 74.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import itertools
import numpy as np
import gzip, sys
import numba as nb
from numba import jit, njit, types
import random
from graph_tool.all import *
from line_profiler import LineProfiler, profile
import copy
import json
from pysam import VariantFile
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
import argparse
from hap_utils import open_bcf_to_read
def worker(queue, result_queue, free_fixed_found, core_components, draw, cores_to_process, basis_genotype):
while queue:
core_to_process = queue.get()
if core_to_process is None: # Check for sentinel value
return result_queue
start_time = time.time()
new_cores = process_core(core_to_process, free_fixed_found, core_components, draw, cores_to_process, basis_genotype)
end_time = time.time()
for new_core in new_cores:
print("finished core with size ",len(new_core[0].get_vertices())," and ",len(genotypes)," genotypes"," in ",end_time-start_time," seconds", file=sys.stderr)
result_queue.put(new_core)
@njit(types.uint8(types.uint8,types.uint8))
def pconsistent_val(g1,g2):
if g1==2 and g2==2:
return 2
elif g1==2:
return g2
elif g2==2:
return g1
elif g1==g2:
return g1
else:
return 255
def complement_genotype(genotype1,genotype2):
complement = np.zeros((M),dtype=np.uint8)
for i in range(len(genotype1)):
comp = complement_allele(genotype1[i],genotype2[i])
if comp == -1:
return None
else:
complement[i]=comp
return complement
@njit(types.uint8(types.uint8,types.uint8))
def complement_allele(genotype_allele1,genotype_allele2):
if genotype_allele1==2 and genotype_allele2==2:
return 2
elif genotype_allele1==2:
return np.mod(genotype_allele2+1,2)
elif genotype_allele2==2:
return np.mod(genotype_allele1+1,2)
elif genotype_allele1==genotype_allele2:
return genotype_allele1
else:
return -1
@njit(types.uint8[:](types.Array(types.uint8, 1, 'C', readonly=False),types.Array(types.uint8, 1, 'C', readonly=True),types.int64))
def fast_complement_genotype(genotype1,genotype2,M):
complement = np.zeros((M),dtype=np.uint8)
for i in range(len(genotype1)):
complement[i] = complement_allele(genotype1[i],genotype2[i])
return complement
def check_intersection(template1,template2):
for i in range(len(template1)):
inter_val = pconsistent_val(template1[i],template2[i])
if inter_val == 255:
return False
return True
@njit(types.boolean(types.Array(types.uint8, 1, 'C', readonly=False),types.Array(types.uint8, 1, 'C', readonly=True)))
def fast_check_intersection(template1,template2):
for i in range(len(template1)):
inter_val = pconsistent_val(template1[i],template2[i])
if inter_val == 255:
return False
return True
@jit(types.uint8[:](types.Array(types.uint8, 1, 'C'),types.Array(types.uint8, 1, 'C', readonly=True)))
def fast_intersect(template1,template2):
# assumes that the two templates are compatible
intersection = np.zeros((len(template1)),dtype=np.uint8)
for i in range(len(template1)):
intersection[i]=pconsistent_val(template1[i],template2[i])
return intersection
def intersect(template1,template2):
intersection = np.zeros((len(template1)),dtype=np.uint8)
for i in range(len(template1)):
inter_val = pconsistent_val(template1[i],template2[i])
if inter_val == -1:
return None
else:
intersection[i]=inter_val
return intersection
def consistent(template1,template2):
for i in range(len(template1)):
if not pconsistent(template1[i],template2[i]):
return False
return True
def gconsistent(h,h2,g):
if g==0 and h==0 and h2==0:
return 0
elif g==1 and h==1 and h2==1:
return 1
elif g==2:
if (h==1 and h2==1) or (h==0 and h2==0):
return None
else:
return min(h,h2)
else:
return None
def pconsistent(g1,g2):
if g1==2 or g2==2 or g1==g2:
return True
else:
return False
def get_template(clique,P):
itemplate = np.zeros((M),dtype=np.uint8)
for i in range(M):
consensus = 2
for vertex in clique:
if P[vertex][i]!=2:
consensus = P[vertex][i]
itemplate[i]=consensus
return itemplate
def generate_haps_from_genotype(original_genotype, node_genotype, haplotype_to_freq_map, gen_to_haplotypes):
if 2 not in node_genotype:
haplotype_to_freq_map[node_genotype]=0
gen_to_haplotypes[original_genotype].append(node_genotype)
else:
unique, counts = np.unique(np.frombuffer(node_genotype, dtype=np.uint8), return_counts=True)
num_twos = dict(zip(unique, counts))[2]
if num_twos<30:
haplotypes = np.zeros((np.power(2,num_twos), M), dtype=np.uint8)
truth_table = np.array(list(itertools.product([0, 1], repeat=num_twos)), dtype=np.uint8)
idx = 0
for i in range(len(node_genotype)):
if node_genotype[i]==1:
haplotypes[:,i]=1
elif node_genotype[i]==2:
haplotypes[:,i]=truth_table[:,idx]
idx+=1
for haplotype in haplotypes:
haplotype_to_freq_map[haplotype.data.tobytes()]=0
gen_to_haplotypes[original_genotype].append(haplotype.data.tobytes())
else:
print("skipping genotype with too many twos:",''.join(map(str, np.frombuffer(node_genotype, dtype=np.uint8))), file=sys.stderr)
# head is the template for the starting set of the path explored so far
# tail is the parity of the number of genotype twos traversed at each position
def find_paths_r(G, head, tail, used, last):
call_stack = []
call_stack.append((G, head, tail, used, last, 0,True, None))
it = 0
while len(call_stack)>0:
it+=1
if it%10000==0: print(it,len(call_stack),start_idx, file=sys.stderr)
(G, head, tail, used, last, start_idx,self_loop,lastNTail)=call_stack.pop()
yield head
if not self_loop:
used.remove(lastNTail.data.tobytes())
for idx in range(start_idx,len(G)):
g = G[idx]
if arrays_equal(g,last): # avoid backtracking
continue
self_loop = True
nhead = head.copy()
ntail = tail.copy()
self_loop,nhead,ntail = getnHeadnTail(g,head,tail,nhead,ntail)
cyclic = (ntail is not None) and (ntail.data.tobytes() in used) and not self_loop # if this is a self-loop then tail=ntail
if ntail is not None and not cyclic:
used.add(ntail.data.tobytes())
call_stack.append((G, head, tail, used, last, idx+1,self_loop,ntail))
call_stack.append((G, nhead, ntail, used, g, 0, True,None))
break
@nb.jit(nopython=True)
def getnHeadnTail(g,head,tail,nhead,ntail):
for i in range(M):
if g[i] == 2: # g has a two, so flip the tail value
self_loop = False # this isn't a self-loop if g has any twos
ntail[i] = 1 - ntail[i]
elif head[i] == 2: # g doesn't have a two, but the template does
if tail[i] == 0: # head and tail should be the same
nhead[i] = g[i]
else: # head and tail should be different
nhead[i] = 1 - g[i]
elif (tail[i] == g[i]) != (head[i] == 0): # Mismatch!
ntail = None
break
return self_loop,nhead,ntail
@nb.jit(nopython=True)
def arrays_equal(a, b):
if a is None or b is None:
return False
for ai, bi in zip(a.flat, b.flat):
if ai != bi:
return False
return True
def compatible(h,h2,g):
comp_gen = np.zeros((M),dtype=np.uint8)
for i in range(len(h)):
con = gconsistent(h[i],h2[i],g[i])
if con is None:
return None
else:
comp_gen[i]=con
return comp_gen
def get_basis_allele(base,index):
return base[index]
def get_non_basis_allele(base,index):
return np.mod(base[index]+1,2)
def annotate_template(core_component,genotypes,free_fixed_found, basis_genotype, root_vertex=0):
# theorem 4.1 with a template graph that only has edge labels. Fill in the free-fixed positions and templates
# does not extend any graph, simply annotates the vertices
# get an edge adjacent to the root vertex
random_genotype_index = core_component.ep["genotype"][core_component.edge(root_vertex,core_component.get_all_neighbors(root_vertex)[0])]
random_genotype = genotypes[random_genotype_index]
fingerprint = core_component.new_vp("vector<int8_t>") # creates a VertexPropertyMap of type string
root_distance = core_component.new_vp("int8_t") # creates a VertexPropertyMap of type string
node_updated = core_component.new_vp("bool") # creates a VertexPropertyMap of type string
# contains a vector of integers that is twice the length of a genotype
# the first half of the vector is the free and fixed positions, the second is the fingerprint
core_component.vp["fingerprint"] = fingerprint
core_component.vp["root_distance"] = root_distance
core_component.vp["updated"] = node_updated
# add fingerprint for the root
core_component.vp["fingerprint"][core_component.vertex(root_vertex)]=np.zeros((M),dtype=np.uint8)
core_component.vp["root_distance"][core_component.vertex(root_vertex)]=0
seen_edge = set()
bfs_queue = []
bfs_queue.append(core_component.vertex(root_vertex))
finger_to_vertex = {}
finger_to_vertex[core_component.vp["fingerprint"][root_vertex].get_array().tobytes()]=root_vertex
while len(bfs_queue)>0:
vertex = bfs_queue.pop(0)
# for each vertex adjacent to vertex
verts = np.unique(core_component.get_all_neighbours(vertex))
skip_edge_removed=set()
for v in verts:
if v in skip_edge_removed or (v,vertex) in seen_edge:
continue
genotype_idx = core_component.ep["genotype"][core_component.edge(vertex,v)]
genotype = genotypes[genotype_idx]
# get free_fixed positions from the vertex we are coming from
free_fixed = np.zeros((M),dtype=np.uint8)
free_fixed = fast_compute_genotype(core_component.vp["fingerprint"][vertex].get_array(), basis_genotype, free_fixed) # gets free fixed for target node
# check if the free_fixed positions of the vertex we are coming from are compatible with the newly added genotype
is_compat = fast_check_intersection(free_fixed,genotype)
seen_edge.add((v,vertex))
if not is_compat:
# kill this branch
core_component.remove_edge(core_component.edge(vertex,v))
continue
# if we do not have a self loop, then we must do additional processing
if not (v == vertex and vertex in core_component.get_all_neighbours(v)):
# not a self-loop
if core_component.edge(vertex,v) is None:
# don't go backwards and do not process an edge we've removed
continue
# there already are edge labels, add the appropriate vertex labels
seen_edge.add((vertex,v))
compat_g = fast_intersect(free_fixed,genotype) # this gets the free-fixed haplotype for the node we are coming from
basis_genotype = update_basis(basis_genotype, core_component.vp["fingerprint"][core_component.vertex(vertex)].get_array(), compat_g)
# now compute fingerprint
distance_from_root = core_component.vp["root_distance"][vertex]+1
free_fixed = np.zeros((M),dtype=np.uint8)
free_fixed = fast_compute_genotype(core_component.vp["fingerprint"][core_component.vertex(root_vertex)].get_array(), basis_genotype, free_fixed) # gets free fixed for target node
comp_gen = fast_complement_genotype(compat_g,genotype,M) # this gets the complement free-fixed haplotype given the shared genotype
if free_fixed.tobytes() in free_fixed_found or comp_gen.tobytes() in free_fixed_found:
return True
fingerprint = fast_get_fingerprint(comp_gen, free_fixed,distance_from_root, M)
if fingerprint.tobytes() in finger_to_vertex:
# if we are pointing to the vertex that already exists, then this edge just reinforces what's already there, keep it
# otherwise, clear the edge
if finger_to_vertex[fingerprint.tobytes()]!=v:
core_component.remove_edge(core_component.edge(vertex,v)) #vertex->v
# erase v in case it is connected through another path
core_component.vp["fingerprint"][v].clear()
core_component.vp["root_distance"][v] = 255
finger_vertex = finger_to_vertex[fingerprint.tobytes()]
if core_component.edge(vertex,finger_vertex) is None:
core_component.add_edge(core_component.vertex(vertex), finger_vertex)
seen_edge.add((finger_vertex,vertex))
seen_edge.add((vertex,finger_vertex))
label_edge(core_component, vertex, finger_vertex, genotype_idx)
else:
finger_to_vertex[fingerprint.tobytes()]=v
label_vertex_full(core_component, v, fingerprint,distance_from_root)
basis_genotype = update_basis(basis_genotype, fingerprint, comp_gen)
bfs_queue.append(v)
return False
def find_initial_connected_component_core(genotypes, seed_node_ff, genotype_index): # theorem 4.1
# select basis genotype
core_component = Graph(directed=False)
core_component.set_fast_edge_removal(fast=True)
edge_labels = core_component.new_ep("int32_t") # creates an EdgePropertyMap of type double
# stores the genotype index, the genotype can be retrieves from "genotypes" object
fingerprint = core_component.new_vp("vector<int8_t>") # creates a VertexPropertyMap of type string
root_distance = core_component.new_vp("int8_t") # creates a VertexPropertyMap of type string
# contains a vector of integers that is twice the length of a genotype
# the first half of the vector is the free and fixed positions, the second is the fingerprint
core_component.vp["fingerprint"] = fingerprint
core_component.vp["root_distance"] = root_distance
# contains a vector of integers that is twice the length of a genotype
# the first half of the vector is the free and fixed positions, the second is the fingerprint
node_updated = core_component.new_vp("bool")
core_component.vp["updated"] = node_updated
vprop = core_component.new_vertex_property("int")
core_component.vp["root"] = vprop
basis_genotype = np.zeros((M),dtype=np.uint8)
np.copyto(basis_genotype,seed_node_ff) # 2 if unset, 0 if basis is 0, 1 if basis is 1
core_component.ep["genotype"] = edge_labels
# select root as one of its adjacent vertices
root_vertex = 0
if 2 in seed_node_ff:
core_component.add_vertex(2) # root is vertex at index 0
other_vertex = 1
else:
# self loop
core_component.add_vertex(1) # root is vertex at index 0
other_vertex = 0
core_component.vp["root"][other_vertex]=0
core_component.vp["root"][root_vertex]=1
core_component.add_edge(core_component.vertex(root_vertex), core_component.vertex(other_vertex))
core_component.ep["genotype"][core_component.edge(root_vertex,other_vertex)]=genotype_index
# this is the basis genotype
core_component.vp["fingerprint"][core_component.vertex(root_vertex)]=np.zeros((M),dtype=np.uint8)
core_component.vp["root_distance"][core_component.vertex(root_vertex)]=0
core_component.vp["fingerprint"][core_component.vertex(other_vertex)]=np.zeros((M),dtype=np.uint8)
core_component.vp["root_distance"][core_component.vertex(other_vertex)]=1
free_fixed = compute_genotype(core_component.vp["fingerprint"][core_component.vertex(other_vertex)].get_array(), basis_genotype)
free_fixed_root = compute_genotype(core_component.vp["fingerprint"][core_component.vertex(root_vertex)].get_array(), basis_genotype)
fingerprint = fast_get_fingerprint(free_fixed,free_fixed_root,1,M)
core_component.vp["fingerprint"][core_component.vertex(other_vertex)]=fingerprint
core_component.vp["root_distance"][core_component.vertex(other_vertex)]=1
(root_vertex, core_component), basis_genotype = find_connected_component_core(genotypes, core_component, 0, set(), basis_genotype)
return core_component, basis_genotype
def find_connected_component_core(genotypes, core_component, root_vertex, free_fixed_found, basis_genotype):
# theorem 4.1 which maximizes a partially resolved core
genotype_idx_to_adj_vertex = {}
for gidx in range(len(genotypes)):
genotype_idx_to_adj_vertex[gidx]=set()
added_vertex = True
vertices_to_process = set()
vertices_to_process.update(core_component.get_vertices())
finger_to_vertex = {}
# map fingerprints to vertices
for v in core_component.get_vertices():
for ajv in core_component.get_all_neighbours(v):
genotype_idx_to_adj_vertex[core_component.ep["genotype"][core_component.edge(v,ajv)]].add(v)
genotype_idx_to_adj_vertex[core_component.ep["genotype"][core_component.edge(v,ajv)]].add(ajv)
if len(core_component.vp["fingerprint"][v])>0:
finger_to_vertex[core_component.vp["fingerprint"][v].get_array().tobytes()]=v
while vertices_to_process:
vertices_to_add_next_round = set()
v = vertices_to_process.pop()
for idx,genotype in enumerate(genotypes):
if v not in genotype_idx_to_adj_vertex[idx]:
free_fixed = np.zeros((M),dtype=np.uint8)
free_fixed = fast_compute_genotype(core_component.vp["fingerprint"][v].get_array(), basis_genotype, free_fixed) # gets free fixed for target node
is_compat = fast_check_intersection(free_fixed,genotype) # check to see if we are able to add this genotype to the node
if is_compat:
compat_g = fast_intersect(free_fixed,genotype) # this gets the free-fixed haplotype for the node we are coming from
comp_gen = fast_complement_genotype(compat_g,genotype,M) # this gets the complement free fixed positions
# check if compat=comp-gen, then self loop!
if np.array_equal(compat_g, comp_gen):
core_component.add_edge(core_component.vertex(v), core_component.vertex(v))
# core_component.vp["adj_genos"][v].append(idx)
label_edge(core_component, v,v, idx)
added_vertex_idx = v
genotype_idx_to_adj_vertex[idx].add(v)
else:
# otherwise, we need to add a new vertex
# update graph first, then fingerprint!
# update vertex we're coming from
added_vertex_idx = add_vertex_and_edge_to_component(core_component,v,idx)
genotype_idx_to_adj_vertex[idx].add(v)
genotype_idx_to_adj_vertex[idx].add(added_vertex_idx)
# and we're we are going to
label_edge(core_component, v,added_vertex_idx, idx)
# now compute fingerprint
distance_from_root = core_component.vp["root_distance"][v]+1
# must update basis genotype first
basis_genotype = update_basis_root(basis_genotype, comp_gen, distance_from_root)
# root free_fixed must be updated!
free_fixed_root = compute_genotype(core_component.vp["fingerprint"][core_component.vertex(root_vertex)].get_array(), basis_genotype)
fingerprint = fast_get_fingerprint(comp_gen,free_fixed_root,distance_from_root,M)
if fingerprint.tobytes() in finger_to_vertex:
genotype_idx_to_adj_vertex[core_component.ep["genotype"][core_component.edge(v,added_vertex_idx)]].remove(v)
genotype_idx_to_adj_vertex[core_component.ep["genotype"][core_component.edge(v,added_vertex_idx)]].remove(added_vertex_idx)
core_component.remove_edge(core_component.edge(v,added_vertex_idx))
if core_component.edge(v,finger_to_vertex[fingerprint.tobytes()]) is None:
# if the edge is already in the graph, don't add a duplicate edge
core_component.add_edge(core_component.vertex(v), finger_to_vertex[fingerprint.tobytes()])
genotype_idx_to_adj_vertex[idx].add(v)
genotype_idx_to_adj_vertex[idx].add(finger_to_vertex[fingerprint.tobytes()])
label_edge(core_component, v,finger_to_vertex[fingerprint.tobytes()], idx)
else:
label_vertex_full(core_component, added_vertex_idx, fingerprint,distance_from_root)
vertices_to_add_next_round.add(added_vertex_idx)
finger_to_vertex[fingerprint.tobytes()]=added_vertex_idx
# a new vertex added or changed and fingerprint added, so we might have to update the basis
basis_genotype = update_basis(basis_genotype, fingerprint, comp_gen)
vertices_to_process.update(vertices_to_add_next_round)
for v in core_component.get_vertices():
free_fixed = compute_genotype(core_component.vp["fingerprint"][v].get_array(), basis_genotype)
if free_fixed.tobytes() in free_fixed_found:
return (None, None), basis_genotype
return clean_component(core_component, root_vertex), basis_genotype
def update_basis(basis_genotype, fingerprint, comp_gen):
# fingerprint: 0 when node matches root. 1 when opposite. If free, it's the length of walk mod 2
for i in range(len(basis_genotype)):
if basis_genotype[i] == 2: # if we still need to update the basis
if comp_gen[i] != 2: # if the genotype is not a two, then it fixes a position
if fingerprint[i] == 0:
basis_genotype[i] = comp_gen[i]
elif fingerprint[i] == 1:
basis_genotype[i] = np.mod(comp_gen[i]+1,2)
return basis_genotype
def update_basis_root(basis_genotype, comp_gen, distance):
# fingerprint: 0 when node matches root. 1 when opposite. If free, it's the length of walk mod 2
for i in range(len(basis_genotype)):
if basis_genotype[i] == 2: # if we still need to update the basis
if comp_gen[i] != 2: # if the genotype is not a two, then it fixes a position
number_of_2s_from_root = distance-1 # because we are just now fixing the root and all nodes between the root and here must have been 2s
if np.mod(number_of_2s_from_root,2) == 0:
basis_genotype[i] = comp_gen[i]
else:
basis_genotype[i] = np.mod(comp_gen[i]+1,2)
return basis_genotype
def update_graph(added_vertex_idx, adj_vertex_idx, graph, genotypes, root_vertex, free_fixed_found,basis_genotype):
# search through vertices we've updated and update all free/fixed positions for adjacent vertices recursively
updated = set()
updated.add(adj_vertex_idx)
updated.add(added_vertex_idx)
to_update_neighbors = set()
to_update_neighbors.add(added_vertex_idx)
to_update_neighbors.add(adj_vertex_idx)
counter=0
while len(to_update_neighbors)>0:
edges_to_remove = set()
adjv = to_update_neighbors.pop()
for adjv2 in graph.get_all_neighbours(adjv):
if len(graph.vp["fingerprint"][graph.vertex(adjv2)].get_array()) == 0:
continue
free_fixed_adjv2 = compute_genotype(graph.vp["fingerprint"][graph.vertex(adjv2)].get_array(), basis_genotype)
if len(free_fixed_adjv2) == 0 or adjv2 in updated:
continue
# adjv was updated but adjv2 has not been updated, so see if we need to make updates to adjv2
# find complement genotype to adjv and the genotype of the edge between adjv and adjv2
free_fixed_adjv = compute_genotype(graph.vp["fingerprint"][graph.vertex(adjv)].get_array(), basis_genotype)
is_compat = fast_check_intersection(free_fixed_adjv,genotypes[graph.ep["genotype"][graph.edge(adjv,adjv2)]])
if not is_compat:
# kill this edge
edges_to_remove.add(graph.edge(adjv,adjv2))
else:
comp_gen = fast_complement_genotype(free_fixed_adjv,genotypes[graph.ep["genotype"][graph.edge(adjv,adjv2)]],M)
if comp_gen.tobytes() in free_fixed_found:
return None
if np.array_equal(comp_gen,free_fixed_adjv2):
# no changes in free/fixed positions, so no need to continue down this branch
continue
else:
if len(graph.vp["fingerprint"][graph.vertex(adjv2)].get_array()) == 0:
# no fingerprint existed before, instantiate
distance_from_root = graph.vp["root_distance"][graph.vertex(adjv)]+1
free_fixed_root = compute_genotype(graph.vp["fingerprint"][graph.vertex(root_vertex)].get_array(), basis_genotype)
fingerprint = fast_get_fingerprint(comp_gen,free_fixed_root,distance_from_root, M)
basis_genotype = update_basis(basis_genotype, fingerprint, comp_gen)
print("old label",end=" ")
print(comp_gen, end=" ")
print("new label",end=" ")
print(compute_genotype(fingerprint, basis_genotype))
label_vertex_full(graph, adjv2, fingerprint,distance_from_root)
else:
basis_genotype = update_basis(basis_genotype, graph.vp["fingerprint"][graph.vertex(adjv2)].get_array(), comp_gen)
# a fingerprint existed, don't touch it
print("old label",end=" ")
print(comp_gen, end=" ")
print("new label",end=" ")
print(compute_genotype(graph.vp["fingerprint"][graph.vertex(adjv2)].get_array(), basis_genotype))
label_vertex_full(graph, adjv2, graph.vp["fingerprint"][graph.vertex(adjv2)].get_array(),graph.vp["root_distance"][graph.vertex(adjv2)])
updated.add(adjv2)
to_update_neighbors.add(adjv2)
return edges_to_remove
def compute_genotype(fingerprint, basis_genotype):
# compute the current free_fixed (partially resolved) genotype from the fingerprint and basis_genotype
genotype = np.zeros((M),dtype=np.uint8)
for i in range(len(fingerprint)):
if basis_genotype[i] == 2: # the position is still free
genotype[i] = 2
elif fingerprint[i] == 0: # the position is fixed to the basis genotype
genotype[i] = basis_genotype[i]
elif fingerprint[i] == 1: # the position is fixed to the opposite of the basis genotype
genotype[i] = np.mod(basis_genotype[i]+1,2)
return genotype
@njit(types.uint8[:](types.Array(types.uint8, 1, 'C', readonly=False),types.Array(types.uint8, 1, 'C', readonly=True),types.Array(types.uint8, 1, 'C', readonly=False)))
def fast_compute_genotype(fingerprint, basis_genotype, genotype):
# compute the current free_fixed (partially resolved) genotype from the fingerprint and basis_genotype
for i in range(len(fingerprint)):
if basis_genotype[i] == 2: # the position is still free
genotype[i] = 2
elif fingerprint[i] == 0: # the position is fixed to the basis genotype
genotype[i] = basis_genotype[i]
elif fingerprint[i] == 1: # the position is fixed to the opposite of the basis genotype
genotype[i] = np.mod(basis_genotype[i]+1,2)
return genotype
def label_vertex_full(graph,v_idx, fingerprint, root_distance):
graph.vp["fingerprint"][graph.vertex(v_idx)]=fingerprint
graph.vp["root_distance"][graph.vertex(v_idx)]=root_distance
def label_edge(graph,v1_idx,v2_idx,label):
graph.ep["genotype"][graph.edge(v1_idx,v2_idx)]=label
def add_vertex_and_edge_to_component(graph,v_idx,g_idx):
added_v = graph.add_vertex()
graph.vp["root"][added_v]=0
graph.add_edge(graph.vertex(v_idx),
added_v)
return graph.vertex_index[added_v]
def is_g_edge_adjacent(genotype,vertex,graph,genotypes):
# check if the genotype is adjacent to the vertex
for v in graph.get_all_neighbours(vertex):
if np.array_equal(genotype,genotypes[graph.ep["genotype"][graph.edge(vertex,v)]]):
return True
return False
def faster_is_g_edge_adjacent(genotype_idx,adjacent_genotypes):
# check if the genotype is adjacent to the vertex
for adj_geno in adjacent_genotypes:
if genotype_idx==adj_geno:
return True
return False
def fastest_is_g_edge_adjacent(genotype_idx,vertex,graph):
# check if the genotype is adjacent to the vertex
edges = find_edge(graph,graph.ep["genotype"],genotype_idx)
for edge in edges:
if edge.source()==vertex or edge.target()==vertex:
return True
return False
def fast_is_g_edge_adjacent(genotype_idx,vertex,graph):
# check if the genotype is adjacent to the vertex
for v in graph.get_all_neighbours(vertex):
if genotype_idx==graph.ep["genotype"][graph.edge(vertex,v)]:
return True
return False
def get_node_label(free_fixed, fingerprint):
return ''.join(map(str, free_fixed)) + '\n' + ''.join(map(str, fingerprint))
# free = 2, fixed = 0 or 1
def get_fingerprint(target_vertex_label,root_vertex_label,distance_from_root):
fingerprint = np.zeros((M),dtype=np.uint8)
for i in range(len(target_vertex_label)):
if target_vertex_label[i]==2:
# vertex is still free
fingerprint[i]= np.mod(distance_from_root,2)
elif target_vertex_label[i] in [0,1]:
# vertex is fixed
fingerprint[i]=np.mod(root_vertex_label[i]+target_vertex_label[i],2)
else:
# infinite sites not assumed...
fingerprint[i]=target_vertex_label[i]
return fingerprint
@njit(types.uint8[:](types.Array(types.uint8, 1, 'C', readonly=True),types.Array(types.uint8, 1, 'C', readonly=True),types.int64,types.int64))
def fast_get_fingerprint(target_vertex_label,root_vertex_label,distance_from_root,M):
fingerprint = np.zeros((M),dtype=np.uint8)
for i in range(len(target_vertex_label)):
if target_vertex_label[i]==2:
# vertex is still free
fingerprint[i]= np.mod(distance_from_root,2)
elif target_vertex_label[i] in [0,1]:
# vertex is fixed
fingerprint[i]=np.mod(root_vertex_label[i]+target_vertex_label[i],2)
else:
# infinite sites not assumed...
fingerprint[i]=target_vertex_label[i]
return fingerprint
def get_free_fixed_vector(genotype):
free_fixed_vector = np.zeros((M),dtype=np.uint8)
for i in range(len(genotype)):
if genotype[i]==2:
free_fixed_vector[i]=1
else:
free_fixed_vector[i]=0
return free_fixed_vector
def has_gedge_or_conflict(vertex,genotype_idx,graph,genotypes,basis_genotype):
has_gedge = False
for v in graph.get_all_neighbours(vertex):
if graph.ep["genotype"][graph.edge(v,vertex)]==genotype_idx:
has_gedge = True
free_fixed = compute_genotype(graph.vp["fingerprint"][vertex].get_array(), basis_genotype)
is_compat = fast_check_intersection(genotypes[genotype_idx],free_fixed)
return has_gedge or not is_compat
def homomorphic(existing_core_component,new_core_component,root_vertex,basis_genotype, old_basis_genotype):
# check if the graph is homomorphic to the template
found_a_good_vertex = False
# first, find a vertex that is compatible with the root
for v in existing_core_component.get_vertices():
free_fixed_v = compute_genotype(existing_core_component.vp["fingerprint"][v].get_array(), old_basis_genotype)
free_fixed_root = compute_genotype(new_core_component.vp["fingerprint"][root_vertex].get_array(), basis_genotype)
if fast_check_intersection(free_fixed_v,free_fixed_root):
# found a compatible vertex
# check if the graph is homomorphic
# if existing_core_component has a g-edge and new_core_component does not, then this is OK
# if new_core_component has a g-edge and existing_core_component does not, then this we are not homomorphic
stack = [(root_vertex.__int__(),v.__int__())]
visited = set()
vertices_processed = 0
while stack:
new_core_vertex,existing_vertex = stack.pop()
if new_core_vertex in visited:
continue
vertices_processed+=1
visited.add(new_core_vertex)
# Check edges between existing_core_component and new_core_component
for adj_vertex in new_core_component.get_all_neighbours(new_core_vertex):
# get g-edge for new core component
new_core_genotype_idx = new_core_component.ep["genotype"][new_core_component.edge(new_core_vertex,adj_vertex)]
# then, the existing core component should have a g-edge with the same genotype
g_edge_adj_vertex = None
for v in existing_core_component.get_all_neighbours(existing_vertex):
if existing_core_component.ep["genotype"][existing_core_component.edge(v,existing_vertex)]==new_core_genotype_idx:
g_edge_adj_vertex = v
if g_edge_adj_vertex is None:
break
if adj_vertex not in visited:
stack.append((adj_vertex,g_edge_adj_vertex))
if vertices_processed == len(new_core_component.get_vertices()):
found_a_good_vertex = True
return found_a_good_vertex
def draw_graph(graph, genotypes, basis_genotype, output="graph.svg", node_labels=True, edges_labels=True, shorten=False):
if graph is None:
return
edge_labels_print = graph.new_ep("string")
node_labels_print = graph.new_vp("string")
if node_labels:
# for each vertex in graph
for v in graph.get_vertices():
free_fixed_v = compute_genotype(graph.vp["fingerprint"][v].get_array(), basis_genotype)
if len(graph.vp["fingerprint"][v].get_array())==0:
if shorten:
node_labels_print[v] = (''.join(map(str, free_fixed_v)))[0:5]+"..."
else:
node_labels_print[v] = ''.join(map(str, free_fixed_v))
else:
if shorten:
node_labels_print[v] = get_node_label(free_fixed_v[0:5],graph.vp["fingerprint"][v].get_array()[0:5])
else:
node_labels_print[v] = get_node_label(free_fixed_v,graph.vp["fingerprint"][v].get_array())
if graph.ep["genotype"] is not None and edges_labels:
# for each edge in the graph
for e in graph.edges():
if shorten:
edge_labels_print[e] = (''.join(map(str, genotypes[graph.ep["genotype"][e]])))[0:5]+"..."
else:
edge_labels_print[e] = ''.join(map(str, genotypes[graph.ep["genotype"][e]]))
graphviz_draw(graph,output=output,vprops={"label":node_labels_print},eprops={"label":edge_labels_print},size=(30,30),ratio = "expand", sep=10, overlap=False)
def clean_component(new_core_component, v):
# only retain the connected component
comp, hist = label_components(new_core_component)
component_id = comp[v]
# NEW TRYING THIS OUT
# filter the graph
new_core_component = GraphView(new_core_component, vfilt = comp.a == component_id)
root_vertex = np.where(new_core_component.vp["root"].get_array()==1)[0][0]
return root_vertex, new_core_component
def process_core(core_to_process, free_fixed_found, core_components, draw, cores_to_process, basis_genotype):
# make a new array with the same size as basis_genotype
old_basis = np.zeros_like(basis_genotype)
np.copyto(old_basis,core_to_process[1])
core_to_process = core_to_process[0]
new_cores = []
templates = []
saved_basis_genotype = np.zeros_like(basis_genotype)
for v in core_to_process.get_vertices():
# try adding every edge to v such that v does not already have an edge with that genotype
# like in Figure 5: try adding genotype C to root -> it breaks the B edge
for genotype_idx in range(len(genotypes)):
if not fast_is_g_edge_adjacent(genotype_idx,v,core_to_process):
np.copyto(saved_basis_genotype,basis_genotype)
new_core_component = Graph(core_to_process, directed=False)
vprop = new_core_component.new_vertex_property("int")
new_core_component.vp["root"] = vprop
# update the basis genotype for the new component
np.copyto(saved_basis_genotype,genotypes[genotype_idx])
# add genotype_idx edge to v
genotype = genotypes[genotype_idx]
added_vertex_idx = add_vertex_and_edge_to_component(new_core_component,v,genotype_idx)
label_edge(new_core_component, v,added_vertex_idx, genotype_idx)
new_core_component.vp["root"][added_vertex_idx]=1
root_vertex = added_vertex_idx
# remove all edges from v that now conflict based on this new edge
edges_to_remove = set()
for adjv in new_core_component.get_all_neighbours(v):
if not consistent(genotypes[genotype_idx],genotypes[new_core_component.ep["genotype"][new_core_component.edge(v,adjv)]]):
edges_to_remove.add(new_core_component.edge(v,adjv))
if len(edges_to_remove)>0:
for redge in edges_to_remove:
new_core_component.remove_edge(redge)
homo = annotate_template(new_core_component,genotypes,free_fixed_found,saved_basis_genotype,root_vertex) # This should correct the template based on the new edge.
if homo:
continue
# now, maximize it!
root_vertex, new_core_component = clean_component(new_core_component, root_vertex)
(root_vertex, new_core_component), saved_basis_genotype = find_connected_component_core(genotypes, new_core_component, root_vertex, free_fixed_found, saved_basis_genotype)
if root_vertex is None and new_core_component is None:
homomorphic_template = True
continue
# use Lemma 4.3 to make sure we are not homomorphic to something in the core
# template_num+=1
homomorphic_template = False
for idx2,existing_core_component in enumerate(core_components):
old_basis = existing_core_component[1]
existing_core_component = existing_core_component[0]
if homomorphic(existing_core_component,new_core_component, root_vertex,basis_genotype,old_basis):
homomorphic_template = True
if not homomorphic_template:
for v_iterator in new_core_component.get_vertices():
free_fixed_v = compute_genotype(new_core_component.vp["fingerprint"][v_iterator].get_array(), saved_basis_genotype)
free_fixed_found.add(free_fixed_v.tobytes())
if draw:
draw_graph(new_core_component, genotypes,saved_basis_genotype, "core"+str(random.randint(0, 10000))+".svg") # CORE PRINTING
new_cores.append((new_core_component, saved_basis_genotype))
return new_cores
def core_phase(genotypes, output_dir, genotype_multiplicities=None, threads=1, draw=False, skip_prop=100, em_init="proportional", num_it=1, greedy_min_expl=None, greedy_genmult_min_expl=None):
# contains all found free-fixed positions
free_fixed_found = set()
genotypes_found = set()
# core_count = 0
core_components = []
cores_to_process = []
# find additional core components such that each genotype is in at least 1 core component
# take a core piece, take all genotypes not in core, expand out a core from that piece, iterate until all genotypes are part of some core
while len(genotypes_found) < len(genotypes):
# find a genotype not in the core
g_not_in_core = None
g_idx_not_in_core = None
for idx,g in enumerate(genotypes):
if g.data.tobytes() not in genotypes_found:
g_idx_not_in_core = idx
g_not_in_core = g
break
if g_not_in_core is not None:
# find a connected component with this genotype
core_component, basis_genotype = find_initial_connected_component_core(genotypes, g_not_in_core, g_idx_not_in_core)
# add the genotype to the core
for v in core_component.get_vertices():
free_fixed_v = compute_genotype(core_component.vp["fingerprint"][v].get_array(), basis_genotype)
free_fixed_found.add(free_fixed_v.tobytes())
for edge in core_component.get_all_edges(v):
edge_genotype = genotypes[core_component.ep["genotype"][edge]]
genotypes_found.add(edge_genotype.data.tobytes())
if draw:
draw_graph(core_component, genotypes,basis_genotype, "core"+str(random.randint(0, 10000))+".svg") # CORE PRINTING
core_components.append((core_component,basis_genotype))
processed_core = set()
homomorph_num = 0
template_num = 0
em_init = "proportional"
found_new_core_component = True
cores_to_process.extend(core_components)
queue = multiprocessing.Queue()
while cores_to_process:
print("cores left to process: ",len(cores_to_process), file=sys.stderr)
if threads > 1:
# Enqueue initial cores to process
for core in cores_to_process:
print("found core component with " + str(len(core[0].get_vertices())) + " vertices and " + str(len(core[0].get_edges())) + " edges", file=sys.stderr)
# cutting off the expansion of large core components
if len(core[0].get_vertices()) > skip_prop * len(genotypes):
print("skipping core component with " + str(len(core[0].get_vertices())) + " vertices and " + str(len(genotypes)) + " genotypes", file=sys.stderr)
continue
queue.put(core)
result_queue = multiprocessing.Queue()
# Create worker tasks
workers = [multiprocessing.Process(target=worker, args=(queue, result_queue, free_fixed_found, core_components, draw, cores_to_process, basis_genotype)) for _ in range(threads)]
# Start worker processes
for w in workers:
w.start()
# Stop workers
for _ in range(threads):
queue.put(None)
# Wait for all workers to finish
for w in workers:
w.join()
result_queue.put(None)
cores_to_process.clear()
while result_queue:
new_core = result_queue.get()
if new_core is None: # Check for sentinel value
break
cores_to_process.append(new_core)
core_components.append(new_core)
else:
while cores_to_process:
core_to_process = cores_to_process.pop(0)
print("found core component with " + str(len(core_to_process[0].get_vertices())) + " vertices and " + str(len(core_to_process[0].get_edges())) + " edges", file=sys.stderr)
if len(core_to_process[0].get_vertices()) > skip_prop * len(genotypes):
print("skipping core component with " + str(len(core_to_process[0].get_vertices())) + " vertices and " + str(len(genotypes)) + " genotypes", file=sys.stderr)
continue
new_cores = process_core(core_to_process, free_fixed_found, core_components, draw, cores_to_process, basis_genotype)
cores_to_process.extend(new_cores)