forked from ChrisCreevey/clann
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtreecompare2.c
More file actions
11720 lines (10359 loc) · 436 KB
/
treecompare2.c
File metadata and controls
11720 lines (10359 loc) · 436 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* treecompare2.c — Clann v5.0.0
* Investigating phylogenetic information through supertree analyses
*
* Created by Chris Creevey, 2003.
* Copyright (C) 2003-2026 Chris Creevey <chris.creevey@gmail.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "clann.h"
#include "utils.h"
#include "topology.h"
#include "viz.h"
#include "tree_io.h"
#include "tree_ops.h"
#include "scoring.h"
#include "consensus.h"
#include "reconcile.h"
#include "prune.h"
#include "main.h"
#include "spr_tree.h"
#include "treecluster.h"
BipartSet *fund_bipart_sets = NULL; /* [Total_fund_trees], precomputed once per analysis */
/* LandscapeMap functions: declared in topology.h */
/*********** Function prototypes (forward declarations) ********************/
void find(struct taxon * position);
int run_main(int argc, char *argv[]);
/* moved to tree_io.h */ /* input_file_summary */
void clean_exit(int error);
void totext(int c, char *array);
/* moved to tree_io.h */ /* assign_taxa_name */
void execute_command(char *filename, int do_all);
int seperate_commands(char *command);
int parse_command(char *command);
void print_commands(int num);
void cal_fund_scores(int printfundscores);
void pathmetric(char *string, int **scores);
void weighted_pathmetric(char *string, float **scores, int fund_num);
int unroottree(char * tree);
void alltrees_search(int user);
int * apply_singlecopy_filter(void);
void restore_singlecopy_filter(int *saved);
#ifdef _OPENMP
static void hs_alloc_thread_state(void);
static void hs_free_thread_state(void);
static void hs_merge_results(char ***par_retained, float **par_scores, int *par_n, float *par_best, int *par_NUMSWAPS, VisitedSet *par_visited);
static int hs_same_topology(char *t1, char *t2);
#endif
/* vs_*, tree_topo_hash: declared in topology.h */
float compare_trees(int spr);
void rf_precompute_fund_biparts(void);
float compare_trees_rf(int spr);
float compare_trees_ml(int spr);
static float ml_display_score(float s); /* total score → lnL */
static float ml_display_source_score(float raw, int tidx); /* per-tree score → lnL */
static const char *ml_score_label(void); /* "lnL" or "score" */
struct taxon * make_taxon(void);
void intTotree(int tree_num, char *array, int num_taxa);
int tree_build (int c, char *treestring, struct taxon *parent, int fromfile, int fund_num, int *taxaorder);
void prune_tree(struct taxon * super_pos, int fund_num);
int treeToInt(char *array);
int shrink_tree (struct taxon * position);
int print_pruned_tree(struct taxon * position, int count, char *pruned_tree, int fullname, int treenum);
void reset_tree(struct taxon * position);
int count_taxa(struct taxon * position, int count);
void check_tree(struct taxon * position, int tag_id, FILE *reconstructionfile);
int check_taxa(struct taxon * position);
int find_taxa(struct taxon * position, char *query);
int number_tree(struct taxon * position, int num);
void dismantle_tree(struct taxon * position);
void bootstrap_search(void);
void mlscores(void);
void memory_error(int error_num);
void print_named_tree(struct taxon * position, char *tree);
void print_fullnamed_tree(struct taxon * position, char *tree, int fundtreenum);
void print_tree(struct taxon * position, char *tree);
void reallocate_retained_supers(void);
void usertrees_search(void);
void heuristic_search(int user, int print, int sample, int nreps);
int average_consensus(int nrep, int missing_method, char * useroutfile, FILE *paupfile);
int do_search(char *tree, int user, int print, int maxswaps, FILE *outfile, int numspectries, int numgenetries);
int branchswap(int number_of_swaps, float score, int numspectries, int numgenetries);
static void fix_parent_pointers(struct taxon *pos, struct taxon *parent);
int find_swaps(float * number, struct taxon * position, int number_of_swaps, int numspectries,int numgenetries);
void do_swap(struct taxon * first, struct taxon * second);
int swapper(struct taxon * position,struct taxon * prev_pos, int stepstaken, struct taxon * first_swap, struct taxon * second_swap, float * number, int * swaps, int number_of_swaps, int numspectries, int numgenetries);
void yaptp_search(void);
void randomise_tree(char *tree);
void randomise_taxa(char *tree);
void random_star_decom(char *tree);
int check_if_diff_tree(char *tree);
int coding(int nrep, int scoring, int ptpreps);
int MRP_matrix(char **trees, int num_trees, int consensus);
void set_parameters(void);
float MRC(char *supertree);
float quartet_compatibility(char *supertree);
void condense_coding(void);
void reset_spr (struct taxon *position);
int remaining_spr (struct taxon *position);
int spr(struct taxon * position, int maxswaps, int numspectries, int numgenetries);
int regraft(struct taxon * position, struct taxon * newbie, struct taxon * last, int steps, int maxswaps,int numspectries,int numgenetries);
void get_lengths(struct taxon *position);
/* xposition1/2, middle_number, yposition0/1/2, print_coordinates,
* tree_coordinates: declared in viz.h */
void generatetrees(void);
/* draw_histogram: declared in viz.h */
void consensus(int num_trees, char **trees, int num_reps, float percentage, FILE *outfile, FILE *guidetreefile);
/* moved to tree_io.h */ /* input_fund_tree */
/* moved to tree_io.h */ /* nexusparser */
void do_consensus(void);
/* moved to tree_io.h */ /* comment */
/* moved to tree_io.h */ /* showtrees */
/* moved to tree_io.h */ /* exclude */
/* moved to tree_io.h */ /* returntree */
/* moved to tree_io.h */ /* returntree_fullnames */
/* moved to tree_io.h */ /* quick (now static in tree_io.c) */
/* moved to tree_io.h */ /* qs (now static in tree_io.c) */
/* moved to tree_io.h */ /* include */
/* moved to tree_io.h */ /* exclude_taxa */
void sourcetree_dists();
/* moved to tree_io.h */ /* prune_taxa_for_exclude (now static in tree_io.c) */
void spr_dist(void);
int string_SPR(char * string); /* carries out random SPR operations on the tree */
void neighbor_joining( int brlens, char *tree, int names);
void nj(void);
void identify_taxa(struct taxon * position, int *name_array);
void reroot_tree(struct taxon *outgroup);
void clean_pointer_taxa(struct taxon *position);
struct taxon * get_branch(struct taxon *position, int name);
float tree_map(struct taxon * gene_top, struct taxon * species_top, int print);
int number_tree1(struct taxon * position, int num);
int number_tree2(struct taxon * position, int num);
void label_gene_tree(struct taxon * gene_position, struct taxon * species_top, int *presence, int xnum);
int get_min_node(struct taxon * position, int *presence, int num);
void subtree_id(struct taxon * position, int *tmp);
void descend(struct taxon * position, int *presence);
int reconstruct_map(struct taxon *position, struct taxon *species_top);
void add_losses(struct taxon * position, struct taxon *species_top);
int join_losses(struct taxon * position);
int count_losses(struct taxon * position);
struct taxon * construct_tree(struct taxon * spec_pos, struct taxon *gene_pos, int *presence, struct taxon *extra_gene);
int compress_tree (struct taxon * position);
int compress_tree1 (struct taxon * position);
struct taxon * get_taxon(struct taxon *position, int name);
void duplicate_tree(struct taxon * orig_pos, struct taxon * prev_dup_pos);
void find_tagged(struct taxon * position, int *presence);
void up_tree(struct taxon * position, int *presence);
void down_tree(struct taxon * position, struct taxon *prev, int *presence);
void mapunknowns();
void reconstruct(int print_settings);
void put_in_scores(struct taxon * position, float * total);
void assign_ances_desc(struct taxon *position, int ** allowed_species, int * previous);
void isittagged(struct taxon * position);
void hgt_reconstruction();
void assign_hgtdonors(struct taxon * position, int num, int part_num);
void reset_tag2(struct taxon * position);
int assign_tag2(struct taxon * position, int num);
void assign_before_after(struct taxon *position, int *previous, int *before, int *after, int num, int found);
struct taxon * find_remaining(struct taxon * position);
void exhaustive_SPR(char * string);
void print_tree_labels(struct taxon *position, int **results, int treenum, struct taxon *species_tree);
int count_internal_branches(struct taxon *position, int count);
float get_recon_score(char *giventree, int numspectries, int numgenetries);
void print_descendents(struct taxon *position, FILE *outfile);
void do_descendents(struct taxon *position, FILE *outfile);
void resolve_tricotomies(struct taxon *position, struct taxon *species_tree);
void gene_content_parsimony(struct taxon * position, int * array);
struct taxon * do_resolve_tricotomies(struct taxon * gene_tree, struct taxon * species_tree, int basescore);
int presence_of_trichotomies(struct taxon * position);
int are_siblings(struct taxon *position, int first, int second);
int isit_onetoone(struct taxon *position, int onetoone);
void print_onetoone_names(struct taxon *position, int onetoone);
int get_best_node(struct taxon * position, int *presence, int num);
void random_prune(char *fund_tree);
void collapse_clades(struct taxon * position, float user_limit, int * to_delete, FILE *rp_outfile);
int get_brlens(struct taxon * position, float *total, int *count);
float return_length(char *string);
int untag_taxa(struct taxon *position, int * to_delete, int keep, int count, FILE *rp_outfile);
int print_keep(struct taxon *position, int keep, int count, FILE *rp_outfile);
void resolve_tricotomies_dist (struct taxon * gene_tree, struct taxon *species_tree, int ** scores);
void get_taxa(struct taxon *position, int *presence);
void check_treeisok(struct taxon *position);
void pathmetric_internals(char *string, struct taxon * species_tree, int **scores);
void calculate_withins(struct taxon *position, int **within, int *presence);
long extract_length(char * fullname);
long list_taxa_in_clade(struct taxon * position, int * foundtaxa, struct taxon * longest, long seqlength); /* descend through the tree finding what taxa are there (and putting result into an array) and also identifying the longest sequence (the first number in the <<full>> name of the sequence, after the first "." and before the first "|") */
long list_taxa_above(struct taxon * position, int * foundtaxa, struct taxon * longest, long seqlength);
int identify_species_specific_clades(struct taxon * position, int numt, int *taxa_fate, int clannID);
void prune_monophylies();
void untag_nodes_below(struct taxon * position, int * taxa_fate, int clannID);
void untag_nodes_above(struct taxon * position, int * taxa_fate, int clannID);
void tips(int num);
void get_taxa_details(struct taxon *position);
void get_taxa_names(struct taxon *position, char **taxa_fate_names);
int basic_tree_build (int c, char *treestring, struct taxon *parent, int fullnames);
int sort_tree(struct taxon *position);
int spr_new(struct taxon * master, int maxswaps, int numspectries, int numgenetries);
static int spr_new2(struct taxon *master, int maxswaps, int numspectries, int numgenetries);
static int tbr_new(struct taxon *master, int maxswaps, int numspectries, int numgenetries);
static int evaluate_candidate(const char *candidate_nwk, float *tmp_fund_scores, int numspectries, int numgenetries);
static int spr_new3(struct taxon *master, int maxswaps, int numspectries, int numgenetries);
static int tbr_new2(struct taxon *master, int maxswaps, int numspectries, int numgenetries);
void do_log(void);
void print_splash(void);
void controlc1(int signal);
void controlc2(int signal);
void controlc3(int signal);
void controlc4(int signal);
void controlc5(int signal);
/****************** Global variable definitions ****************/
/* Open file handles. alltrees_search() opens "supertree.ps" into psfile; the
 * roles of the other handles are not visible in this part of the file. */
FILE * infile = NULL, *BR_file = NULL, *commands_file=NULL, *psfile = NULL, *logfile = NULL, *distributionreconfile = NULL, *onetoonefile = NULL, *strictonetoonefile = NULL, *tempoutfile = NULL;
/* String tables. parsed_command[] holds the tokens of the current command
 * (alltrees_search() scans indices 0..num_commands-1 for option keywords and
 * reads each option's value from the following token). retained_supers[]
 * holds the best trees found by a search, paired with scores_retained_supers[].
 * Under _OPENMP, fundamentals, retained_supers and best_topology are listed in
 * the threadprivate pragma further down. */
char **taxa_names = NULL, *commands_filename = NULL, ***fulltaxanames = NULL, **parsed_command = NULL, **fundamentals = NULL, **stored_funds = NULL, **retained_supers = NULL, **stored_commands = NULL, *tempsuper = NULL, **best_topology = NULL, **tree_names = NULL;
char **original_fundamentals = NULL; /* originals preserved for reconstruct when autoprunemono is active */
int autoprunemono_active = 0; /* set to TRUE when autoprunemono=yes was used at load time */
/* Counters and bookkeeping. number_of_taxa is used by alltrees_search() as the
 * dimension of super_scores and the length of presenceof_SPRtaxa.
 * NUMSWAPS is threadprivate under _OPENMP. */
int *numtaxaintrees = NULL, fullnamesnum = 0, fullnamesassignments = 1, fundamental_assignments = 0, tree_length_assignments = 1, parsed_command_assignments = 1, name_assignments = 0, *taxa_incidence = NULL, number_of_taxa = 0, Total_fund_trees = 0, *same_tree = NULL, **Cooccurrance = NULL, NUMSWAPS = 0;
/* Score and presence matrices. super_scores is a number_of_taxa x number_of_taxa
 * matrix holding the path-metric distances computed on the supertree (allocated
 * on demand in alltrees_search()). presenceof_SPRtaxa has number_of_taxa entries
 * and is reset to -1 by alltrees_search(). fund_scores, super_scores,
 * number_of_comparisons, presence_of_taxa and presenceof_SPRtaxa are
 * threadprivate under _OPENMP. */
int ***fund_scores = NULL, ***stored_fund_scores = NULL, **super_scores = NULL, *number_of_comparisons = NULL, *stored_num_comparisons = NULL, **presence_of_taxa = NULL, **stored_presence_of_taxa = NULL, *presenceof_SPRtaxa = NULL;
/* Search settings. Values as used by alltrees_search():
 *   criterion            0 = DFIT, 2 = SFIT, 3 = QFIT, 6 = scored with compare_trees_rf(),
 *                        7 = scored with compare_trees_ml()
 *   dweight              0 = equal, 1 = comparisons          (criterion 0)
 *   splits_weight        1 = equal, 2 = splits               (criterion 2)
 *   quartet_normalising  1 = equal, 2 = taxa, 3 = quartets   (criterion 3)
 *   number_retained_supers  number of slots in retained_supers[] / scores_retained_supers[]
 *   calculated_fund_scores  set to TRUE once cal_fund_scores() has been run
 * number_retained_supers and tried_regrafts are threadprivate under _OPENMP. */
int seed, num_commands = 0, number_retained_supers = 10, number_of_steps = 99999, largest_tree = 0, smallest_tree = 1000000, criterion = 0, parts = 0, **total_coding = NULL, *coding_from_tree = NULL, total_nodes = 0, quartet_normalising = 3, splits_weight = 2, dweight =1, *from_tree = NULL, method = 3, tried_regrafts = 0, hsprint = TRUE, max_name_length = NAME_LENGTH, got_weights = FALSE, num_excluded_trees = 0, num_excluded_taxa = 0, calculated_fund_scores = FALSE, select_longest=FALSE;
/* Tree roots. Callers in this file clear temp_top, call tree_build(), then move
 * temp_top into tree_top (see alltrees_search()). tree_top, temp_top, temp_top2
 * and branchpointer are threadprivate under _OPENMP. */
struct taxon *tree_top = NULL, *temp_top = NULL, *temp_top2 = NULL, *branchpointer = NULL, *longestseq = NULL;
/* Score arrays. scores_retained_supers[k] is the score of retained_supers[k];
 * a value of -1 marks an unused slot. scores_retained_supers, sprscore,
 * best_topology_scores and sourcetree_scores are threadprivate under _OPENMP. */
float *scores_retained_supers = NULL, *partition_number = NULL, num_partitions = 0, total_partitions = 0, sprscore = -1, *best_topology_scores = NULL, **weighted_scores = NULL, *sourcetree_scores = NULL, *tree_weights = NULL;
/* BESTSCORE is threadprivate under _OPENMP. */
float *score_of_bootstraps = NULL, *yaptp_results = NULL, largest_length = 0, dup_weight = 1, loss_weight = 1, hgt_weight = 1, BESTSCORE = -1;
float ml_beta = 1.0f; /* L.U.st exponential slope parameter (default 1.0) */
int ml_scale = 2; /* ML score scale: 0=paper (raw sum), 1=lust (log10), 2=lnl (default) */
double ml_eta = 0.0; /* [experimental] tree-size scaling exponent: 0=Steel 2008, 1=normalised, >1=downweight large trees */
double *ml_norm_logZ = NULL; /* log Z_{T_i|X_i} per source tree; populated by compare_trees_ml when ml_do_normcorr=1 */
int ml_do_normcorr = 0; /* 0=off, 1=apply Bryant & Steel (2008) normalising constant correction in usertrees */
int bsweight = 0; /* use per-split BS support as weights in sfit/qfit (0=off, 1=on) */
/* Timestamps used to rate-limit progress output (alltrees_search() compares
 * them with difftime()). */
time_t interval1, interval2;
/* Upper bound of the default tree-number range in alltrees_search()
 * (presumably the number of possible supertrees -- confirm where it is set). */
double sup=1;
char saved_supertree[TREE_LENGTH], *test_array, inputfilename[10000], delimiter_char = '.', logfile_name[10000], system_call[100000];
/* Polled inside long-running loops (e.g. alltrees_search()) to stop early;
 * presumably set by the controlc* SIGINT handlers -- confirm. */
volatile int user_break = FALSE;
/* trees_in_memory: number of trees currently held in retained_supers[]
 * (alltrees_search() sets it to the count of best trees found). */
int trees_in_memory = 0, *sourcetreetag = NULL, remainingtrees = 0, GC, delimiter = TRUE, print_log = FALSE, num_gene_nodes, testarraypos = 0;
int malloc_check =0, count_now = FALSE, another_check =0;
unsigned int thread_seed = 0; /* per-thread random seed for rand_r(); see threadprivate block below */
uint64_t *taxon_hash_vals = NULL; /* shared read-only: splitmix64 weight per taxon; set at taxa load time */
VisitedSet *visited_set = NULL; /* threadprivate: per-replicate visited topology hash set */
VisitedSet *thread_visited_acc = NULL; /* threadprivate: accumulates all unique topos across reps in one thread */
LandscapeMap *landscape_map = NULL; /* threadprivate: per-thread visited-topology landscape accumulator */
/* Shared landscape globals (not threadprivate) */
static char g_landscape_file[4096] = ""; /* filename; empty = feature disabled */
static LandscapeMap *g_landscape_map = NULL; /* global accumulator across all threads */
/* Landscape clustering options (set by hs option parsing) */
static int g_cluster_enabled = 0; /* 0=off, 1=on (requires visitedtrees=) */
static char g_cluster_output[4096] = "treeclusters.tsv"; /* output TSV filename */
static float g_cluster_threshold = 0.2f; /* max normalized RF distance [0,1] */
static int g_cluster_orderby = 0; /* 0=score ascending, 1=visits descending */
time_t rep_start_time = 0; /* threadprivate: wall-clock time when current do_search() began */
int hs_do_print = 0; /* threadprivate: mirrors the 'print' param of do_search() */
float last_status_score = -1.0f;/* threadprivate: sprscore at last periodic status line (for improvement marker) */
/* Shared (non-threadprivate): parallel-mode global progress state, written under omp critical */
float par_progress_best = -1.0f; /* best score seen so far across ALL parallel threads */
float par_last_print_score= -1.0f; /* par_progress_best at last status line (thread-0 only) */
time_t par_search_start = 0; /* wall-clock time when the parallel region began */
int skip_streak = 0; /* threadprivate: consecutive already-visited skips since last new topology */
int nni_swaps = 0; /* threadprivate: NNI refinement swaps performed after SPR/TBR in last do_search() rep */
int hs_maxskips = -1; /* shared: stop replicate when skip_streak reaches this (0=disabled, -1=auto: N³) */
int hs_maxskips_is_auto = 1; /* 1=auto-scale to N³ at search start; 0=user has set an explicit value */
int hs_strategy = 0; /* 0=first-improvement (depth-first); 1=best-improvement (breadth-first) */
int hs_progress_interval= 5; /* shared: parallel progress print interval in seconds (0=every improvement) */
time_t par_last_progress_time = 0; /* shared: wall-clock time of last parallel progress line printed */
float hs_droprep = 0.0f; /* shared: abandon rep if its score is >droprep fraction above par_progress_best; 0=disabled */
int rep_abandon = 0; /* threadprivate: set when this rep should be abandoned due to droprep */
int hs_par_rep = 0; /* threadprivate: 1-based rep number for the rep currently running on this thread */
int hs_thread_report_interval = 0; /* shared: per-thread status report interval in seconds (0=disabled) */
time_t thread_report_last = 0; /* threadprivate: wall-clock time of last per-thread status print */
/****** OpenMP thread-private state: one independent copy per thread in parallel regions ******/
#ifdef _OPENMP
#pragma omp threadprivate( \
tree_top, temp_top, temp_top2, branchpointer, \
super_scores, sourcetree_scores, presenceof_SPRtaxa, \
sprscore, tried_regrafts, \
retained_supers, scores_retained_supers, \
best_topology, best_topology_scores, number_retained_supers, \
BESTSCORE, NUMSWAPS, \
thread_seed, \
visited_set, thread_visited_acc, landscape_map, \
rep_start_time, hs_do_print, last_status_score, \
skip_streak, nni_swaps, rep_abandon, hs_par_rep, thread_report_last, \
fundamentals, presence_of_taxa, fund_scores, number_of_comparisons, \
fund_bipart_sets \
)
#endif
/* Snapshots of the master thread's input-data pointers, set just before each
* heuristic_search parallel region so that worker threads can share them. */
#ifdef _OPENMP
static char **g_hs_fundamentals_snap = NULL;
static int **g_hs_presence_snap = NULL;
static int ***g_hs_fund_scores_snap = NULL;
static int *g_hs_num_comp_snap = NULL;
static BipartSet *g_hs_fund_bipart_snap = NULL; /* snapshot of master's fund_bipart_sets for HS worker threads */
#endif
/* CLI helpers, main, seperate_commands, print_splash, print_commands,
 * parse_command, recount_from_tree, autoprunemono_apply, execute_command:
 * not defined in this part of the file (presumably moved to another
 * translation unit -- confirm). */
/* calculate the path metric for each of the fundamental trees */
/*
 * sort_tree: tag every node on one sibling level of a tree built in memory,
 * recursing into subtrees first.
 *
 * The stated goal is to reorder the components of the tree into the numerical
 * order that intTotree() would have produced. Only the tagging pass and the
 * search for the smallest tag are implemented; the sibling reordering itself
 * is still missing.
 *
 * position: first sibling of the level to process (may be NULL).
 *
 * Returns 1 in every case. Internal nodes are tagged with the value returned
 * by the recursive call, leaves with their taxon number (`name`).
 * NOTE(review): the recursive assignment suggests the return value was meant
 * to be the smallest taxon number of the subtree (mintaxon) -- confirm before
 * implementing the reordering step.
 */
int sort_tree(struct taxon *position)
{
    struct taxon *start = position;
    int mintaxon = number_of_taxa;

    /* Pass 1: tag every sibling on this level. */
    while(position != NULL)
    {
        if(position->daughter != NULL)
        {
            position->tag = sort_tree(position->daughter);
        }
        else
        {
            position->tag = position->name;
        }
        position = position->next_sibling;
    }

    /* Pass 2: find the smallest tag on this level. The walk must advance to
       the next sibling on every iteration, otherwise it never terminates. */
    for(position = start; position != NULL; position = position->next_sibling)
    {
        if(position->tag < mintaxon) mintaxon = position->tag;
    }

    /* TODO: sort the siblings on this level based on the numbers in the tags. */
    (void)mintaxon; /* computed for the sorting step that is not written yet */
    return(1);
}
/* this function builds a tree given its number and the number of taxa */
/* toint: moved to utils.c */
/* tofloat: moved to utils.c */
void alltrees_search(int user)
{
int i = 0, j=0, all = TRUE, start = 0, end = 0, error = FALSE, keep = 0;
char *tree = NULL, *best_tree = NULL, outfilename[100];
float score = 0, best_score = 0, worst = 0;
FILE *treesfile = NULL, *userfile = NULL;
int *saved_tags = NULL; /* for single-copy auto-filter */
outfilename[0] = '\0';
strcpy(outfilename, "alltrees.ph");
if(user) /*if this was called by the user and not by bootstrap or yaptp */
{
for(i=0; i<num_commands; i++)
{
if(strcmp(parsed_command[i], "all") == 0)
all = TRUE;
else
{
if(strcmp(parsed_command[i], "range") == 0)
{
start = toint(parsed_command[i+1]);
end = toint(parsed_command[i+2]);
if(end == start) error = TRUE;
}
}
if(strcmp(parsed_command[i], "keep") == 0)
{
worst = tofloat(parsed_command[i+1]);
if(worst == 0)
{
printf2("Error: '%s' is an invalid value for keep\n", parsed_command[i+1]);
error = TRUE;
}
}
if(strcmp(parsed_command[i], "nbest") == 0)
{
keep = toint(parsed_command[i+1]);
if(keep == 0)
{
printf2("Error: '%s' is an invalid values for nbest\n", parsed_command[i+1]);
error = TRUE;
}
}
if(strcmp(parsed_command[i], "savetrees") == 0 && criterion != 1)
{
if((userfile = fopen(parsed_command[i+1], "w")) == NULL)
{
printf2("Error opening file named %s\n", parsed_command[i+1]);
error = TRUE;
}
else
{
printf2("opened output file %s\n", parsed_command[i+1]);
strcpy(outfilename, parsed_command[i+1]);
}
}
if(strcmp(parsed_command[i], "create") == 0)
{
if((treesfile = fopen("alltrees.ph", "w")) == NULL)
{
printf2("Error opening file named 'alltrees.ph'\n");
error = TRUE;
}
}
if(criterion == 3)
{
if(strcmp(parsed_command[i], "weight") == 0)
{
if(strcmp(parsed_command[i+1], "equal") == 0)
quartet_normalising = 1;
else
{
if(strcmp(parsed_command[i+1], "taxa") == 0)
quartet_normalising = 2;
else
{
if(strcmp(parsed_command[i+1], "quartets") == 0)
quartet_normalising = 3;
else
{
printf2("Error: weight option '%s' is unknown\n", parsed_command[i+1]);
error = TRUE;
}
}
}
}
}
if(criterion == 0)
{
if(strcmp(parsed_command[i], "weight") == 0)
{
if(strcmp(parsed_command[i+1], "equal") == 0)
dweight = 0;
else
{
if(strcmp(parsed_command[i+1], "comparisons") == 0)
dweight = 1;
else
{
printf2("Error: weight option '%s' is unknown\n", parsed_command[i+1]);
error = TRUE;
}
}
}
}
if(criterion == 2)
{
if(strcmp(parsed_command[i], "weight") == 0)
{
if(strcmp(parsed_command[i+1], "equal") == 0)
splits_weight = 1;
else
{
if(strcmp(parsed_command[i+1], "splits") == 0)
splits_weight = 2;
else
{
printf2("Error: weight option '%s' is unknown\n", parsed_command[i+1]);
error = TRUE;
}
}
}
}
}
}
if(!error)
{
if(userfile == NULL)
{
if((userfile = fopen("top_alltrees.txt", "w")) == NULL)
{
printf2("Error opening file named 'alltrees.ph'\n");
error = TRUE;
}
}
}
if(!error)
{
if(start == 0 && end == 0)
{
start = 1;
end = sup;
}
printf2("\n\nAlltrees (exhaustive search) settings:\n\trange: tree numbers %d to %d inclusive\n\tOutput file: %s\n\tCreate all trees? ", start, end, outfilename );
if(treesfile != NULL) printf2("Yes\n");
else printf2("No\n");
printf2("\tCriterion = ");
if(criterion==0)
{
printf2("DFIT\n\tWeighting Scheme = ");
if(dweight == 0) printf2("equal\n");
if(dweight == 1) printf2("comparisons\n");
}
if(criterion == 2)
{
printf2("SFIT\n\tWeighting Scheme = ");
if(splits_weight == 1) printf2("equal\n");
if(splits_weight == 2) printf2("splits\n");
}
if(criterion == 3)
{
printf2("QFIT\n\tWeighting Scheme = ");
if(quartet_normalising == 1) printf2("equal\n");
if(quartet_normalising == 2) printf2("taxa\n");
if(quartet_normalising == 3) printf2("quartets\n");
}
printf2("\n\n");
if(!calculated_fund_scores && criterion == 0)
{
cal_fund_scores(FALSE); /* calculate the path metrics for all the fundamental trees */
calculated_fund_scores = TRUE;
}
psfile = fopen("supertree.ps", "w");
for(i=0; i<number_of_taxa; i++) presenceof_SPRtaxa[i] = -1;
/***** define the dynamic arrays **********/
tree = malloc(TREE_LENGTH*sizeof(char));
if(!tree) memory_error(25);
tree[0] = '\0';
best_tree = malloc(TREE_LENGTH*sizeof(char));
if(!best_tree) memory_error(26);
best_tree[0] = '\0';
/* this hold the distances calculated with the pathmetric on the supertree */
if(super_scores == NULL)
{
super_scores = malloc(number_of_taxa*sizeof(int *));
if(!super_scores) memory_error(27);
for(i=0; i<number_of_taxa; i++)
{
super_scores[i] = malloc(number_of_taxa*sizeof(int));
if(!super_scores[i]) memory_error(28);
for(j=0; j<number_of_taxa; j++)
{
super_scores[i][j] = 0;
}
}
}
else
{
for( i=0; i<number_of_taxa; i++)
{
for(j=i; j<number_of_taxa; j++)
{
super_scores[i][j] = 0;
super_scores[j][i] = 0;
}
}
}
if(user) printf2("Progress indicator:");
/************ End assign dynamic arrays **************/
if(criterion == 2 || criterion == 3 || criterion == 6 || criterion == 7)
rf_precompute_fund_biparts();
if(start == 0 && end == 0)
{
start = 1;
end = sup;
}
if(signal(SIGINT, controlc5) == SIG_ERR)
{
printf2("An error occurred while setting a signal handler\n");
}
saved_tags = apply_singlecopy_filter();
for(i=start; i<=end; i++)
{
tree[0] = '\0';
if(user_break)
{
printf2("%d trees sampled\n", i);
i = end+1;
}
interval2 = time(NULL);
if(difftime(interval2, interval1) > 5) /* every 10 seconds print a dot to the screen */
{
/* printf2("="); */
fflush(stdout);
interval1 = time(NULL);
}
intTotree(i, tree, number_of_taxa); /* create the supertree number i */
if(criterion == 0) /* if we are using mssa */
{
/****** We now need to build the Supertree in memory *******/
if(tree_top != NULL)
{
dismantle_tree(tree_top);
tree_top = NULL;
}
temp_top = NULL;
{ int _to = 0; tree_build(1, tree, tree_top, FALSE, -1, &_to); }
tree_top = temp_top;
temp_top = NULL;
score = compare_trees(FALSE);
}
if(criterion == 2 || criterion == 3 || criterion == 6 || criterion == 7)
{
if(tree_top != NULL) { dismantle_tree(tree_top); tree_top = NULL; }
temp_top = NULL;
{ int _to = 0; tree_build(1, tree, tree_top, FALSE, -1, &_to); }
tree_top = temp_top;
temp_top = NULL;
if(criterion == 2) score = compare_trees_sfit(FALSE);
else if(criterion == 3) score = compare_trees_qfit(FALSE);
else if(criterion == 6) score = compare_trees_rf(FALSE);
else score = compare_trees_ml(FALSE);
}
/* Always populate best_tree with named tree for retained_supers storage.
For criterion==0/6/7 tree_top is already built; for MRC/QC we convert
the integer-indexed tree string directly using returntree(). This ensures
retained_supers[] always holds actual taxon names so that
'reconstruct speciestree memory' works correctly after alltrees. */
if((criterion == 0 || criterion == 2 || criterion == 3 || criterion == 6 || criterion == 7) && tree_top != NULL)
{
strcpy(best_tree, "");
print_named_tree(tree_top, best_tree);
}
else
{
strcpy(best_tree, tree);
returntree(best_tree);
}
if(treesfile != NULL) /* if the create option was selected */
fprintf(treesfile,"%s;\t[%f]\n", best_tree, score);
if(i==1)
{
retained_supers[0] = realloc(retained_supers[0], (strlen(best_tree)+10)*sizeof(char));
strcpy(retained_supers[0], best_tree);
scores_retained_supers[0] = score;
best_score = score;
}
else
{
if(score < best_score)
{
retained_supers[0] = realloc(retained_supers[0], (strlen(best_tree)+10)*sizeof(char));
strcpy(retained_supers[0], best_tree);
scores_retained_supers[0] = score;
best_score = score;
j=1;
while(scores_retained_supers[j] != -1 && j < number_retained_supers)
{
strcpy(retained_supers[j], "");
scores_retained_supers[j] = -1;
j++;
}
}
else
{
if(score == best_score)
{
j = 1;
while(scores_retained_supers[j] != -1)
{
j++;
if(j+1 == number_retained_supers) reallocate_retained_supers();
}
retained_supers[j] = realloc(retained_supers[j], (strlen(best_tree)+10)*sizeof(char));
strcpy(retained_supers[j], best_tree);
scores_retained_supers[j] = score;
}
}
}
}
/***** Print out the best tree found ******/
if(user)
{
printf2("\n");
i=0;
/**** Print out the best trees found *******/
tree[0] = '\0';
i=0; j=0;
while(scores_retained_supers[j] != -1)
{
j++;
}
while(scores_retained_supers[i] != -1)
{
if(tree_top != NULL)
{
dismantle_tree(tree_top);
tree_top = NULL;
}
temp_top = NULL;
/* retained_supers[] now stores named trees (actual taxon names), so use TRUE */
{ int _to = 0; tree_build(1, retained_supers[i], tree_top, TRUE, -1, &_to); }
tree_top = temp_top;
temp_top = NULL;
strcpy(best_tree, "");
print_named_tree(tree_top, best_tree);
if(userfile != NULL) fprintf(userfile, "%s;\t[%f]\n", best_tree, scores_retained_supers[i] );
tree_coordinates(best_tree, FALSE, TRUE, FALSE, -1);
printf2("\nSupertree %d of %d %s = %f\n", i+1, j, ml_score_label(), ml_display_score(scores_retained_supers[i]) );
i++;
}
/* Keep retained_supers[] in memory (like hs/nj) so 'reconstruct speciestree memory'
can use the alltrees result. Set trees_in_memory to the count of best trees found. */
trees_in_memory = j;
}
restore_singlecopy_filter(saved_tags);
free(tree);
free(best_tree);
if(treesfile != NULL)
fclose(treesfile);
treesfile = NULL;
fclose(psfile);
fclose(userfile);
}
}
/* This function checks every fundamental tree against the supertree at hand and returns a float.
It also bootstraps the values for the fundamental trees; if the do_bootstrap value is greater than 0, these values are printed to bootstrap.txt */
/* Tree_build:
This function reads in a file and from it builds the tree in memory using the taxon_type definition */
/* Basic_Tree_build:
This function builds a tree in memory without incrementing the number of taxa etc.; basically this is for building a tree where we don't need all the bells and whistles. This gets used in Exclude taxa to help deal with gene names */
/* This makes the taxon structure when we need it so I don't have to keep typing the assignments all the time */
/* prune_tree_from_array:
 * Walks the sibling list that begins at super_pos and, recursively, every
 * daughter list hanging off it.
 *
 *   super_pos - first node of the sibling list to process (may be NULL,
 *               in which case nothing happens).
 *   array     - integer table indexed by a node's `name` field; an entry
 *               of 0 causes the matching node to be untagged.
 *               (presumably a per-taxon presence table -- confirm with callers)
 *
 * For each list the function first copies every node's `tag` into `tag2`,
 * so the value held before pruning remains available in `tag2`. It then
 * visits each node again: daughters are processed first, after which a node
 * with name != -1 whose array entry is 0 has its `tag` set to FALSE.
 * Nodes with name == -1 never have their `tag` changed here.
 */
void prune_tree_from_array(struct taxon * super_pos, int * array)
	{
	struct taxon *node;

	/* Pass 1: save the current tag of every sibling at this level. */
	for(node = super_pos; node != NULL; node = node->next_sibling)
		node->tag2 = node->tag;

	/* Pass 2: descend into subtrees, then switch off unlisted taxa. */
	for(node = super_pos; node != NULL; node = node->next_sibling)
		{
		if(node->daughter != NULL)
			prune_tree_from_array(node->daughter, array);

		if(node->name == -1)
			continue;	/* no taxon stored on this sibling */

		if(array[node->name] == 0)
			node->tag = FALSE;	/* taxon has a zero entry in array: untag it */
		}
	}
/* add_internals_from_array:
 * Recursively walks the sibling list starting at super_pos and restores the
 * `tag` of nodes that have a daughter list.
 *
 *   super_pos - first node of the sibling list to process (may be NULL).
 *   array     - integer table indexed by a node's `tag2` field; a value
 *               greater than 0 means the node's `tag` is reset to `tag2`.
 *
 * Only nodes with a non-NULL `daughter` are examined; for those, the tag is
 * restored when array[tag2] > 0, and the daughter list is then processed in
 * the same way. Nodes without daughters are left untouched.
 * NOTE(review): tag2 is used directly as an index into array -- callers must
 * guarantee it is within bounds; confirm against the code that fills tag2.
 */
void add_internals_from_array(struct taxon * super_pos, int *array)
	{
	struct taxon *node;

	for(node = super_pos; node != NULL; node = node->next_sibling)
		{
		if(node->daughter == NULL)
			continue;	/* nothing to restore or descend into */

		if(array[node->tag2] > 0)
			node->tag = node->tag2;	/* entry present: reinstate saved tag */

		add_internals_from_array(node->daughter, array);
		}
	}
/* Prune tree: This is a recursive function that is called for every node position of the supertree
it then checks to see if any of the siblings on this node are not contained in the fundamental tree, these siblings are then turned off.
This only turns off taxa, pointer siblings will have to be turned off using a separate program */
/* This function travels through the tree recursively untagging any pointer siblings that are not being used.
this effectively shrinks the tree to the size of the fundamental tree that it is being compared to
this recursive function is called by shrink_tree to count how many active taxa there are below any given pointer sibling */
/* This function is only used to print the pruned supertree */
/* this identifies the taxa in a subtree passed to it */
#ifdef _OPENMP
/* boot_alloc_thread_state -----------------------------------------------
* Allocate per-thread copies of the fundamental input-data arrays and
* initialise search state for a bootstrap parallel region.
* Must be called from inside the bootstrap #pragma omp parallel region.
*/
static void boot_alloc_thread_state(int ntrees, int ntaxa)
{
int i, j, is_master;
is_master = (omp_get_thread_num() == 0);
/* Allocate thread-private copies of all four input-data arrays.
* The master thread's copies replace its pre-parallel pointers;
* worker threads get fresh independent allocations. */
fundamentals = malloc(ntrees * sizeof(char *));
presence_of_taxa = malloc(ntrees * sizeof(int *));
fund_scores = malloc(ntrees * sizeof(int **));
number_of_comparisons = malloc(ntrees * sizeof(int));
if(!fundamentals || !presence_of_taxa || !fund_scores || !number_of_comparisons)
memory_error(200);
for(i = 0; i < ntrees; i++)
{
fundamentals[i] = malloc(TREE_LENGTH * sizeof(char));
fundamentals[i][0] = '\0';
presence_of_taxa[i] = malloc(ntaxa * sizeof(int));
fund_scores[i] = malloc(ntaxa * sizeof(int *));
if(!fundamentals[i] || !presence_of_taxa[i] || !fund_scores[i]) memory_error(201);
for(j = 0; j < ntaxa; j++)
{
fund_scores[i][j] = malloc(ntaxa * sizeof(int));
if(!fund_scores[i][j]) memory_error(202);
}
}
/* Allocate / reset search-result state (mirrors hs_alloc_thread_state). */
if(!is_master)
{
int k, init_n = 10;
retained_supers = malloc(init_n * sizeof(char *));
scores_retained_supers = malloc(init_n * sizeof(float));
best_topology = malloc(init_n * sizeof(char *));
best_topology_scores = malloc(init_n * sizeof(float));
for(k = 0; k < init_n; k++)
{
retained_supers[k] = malloc(TREE_LENGTH * sizeof(char));
retained_supers[k][0] = '\0';
scores_retained_supers[k] = -1;
best_topology[k] = malloc(TREE_LENGTH * sizeof(char));
best_topology[k][0] = '\0';
best_topology_scores[k] = -1;
}
number_retained_supers = init_n;
super_scores = malloc(ntaxa * sizeof(int *));
for(k = 0; k < ntaxa; k++)
super_scores[k] = malloc(ntaxa * sizeof(int));
sourcetree_scores = malloc(ntrees * sizeof(float));
presenceof_SPRtaxa = malloc(ntaxa * sizeof(int));
}
else
{
/* Master: search-state pointers are valid; just reset content. */