-
Notifications
You must be signed in to change notification settings - Fork 11
Description
At the moment, we default to match_segregating_sites=False, which runs tsdate rescaling such that we create unbiased mutation dates when there are polytomies. I have been interpreting this as estimating a polytomy date to be the date of the oldest node that contributes to that polytomy (i.e. the first split).
It is not entirely clear what match_segregating_sites=True means, in terms of the coalescence time estimates when there are polytomies.
Below is a test using the stdpopsim zigzag model with a sample size of 100 diploids over 40Mb of sequence (212951 mutations). Intervals with no mutations on them have been collapsed into polytomies before dating, using the code at tskit-dev/tskit#2926
And here is the pair coalescence rate plot (data from the tree sequence in orange, compared to the true demographic model in blue)
So in the case of the zigzag demography, match_segregating_sites doesn't do a very good job of getting the pair coalescence times accurately dated. Hmm 🤔
Click for code
import tsdate
import stdpopsim
def single_mutation_sites(ts):
mtime = np.full(ts.num_sites, np.nan)
diff_mask = np.diff(ts.mutations_site) != 0
use = np.concatenate(([True], diff_mask)) & np.concatenate((diff_mask, [True]))
mtime[ts.mutations_site[use]] = ts.mutations_time[use]
return mtime
def midpoint_sites(ts):
tm = ts.nodes_time
ep = ts.edges_parent
ec = ts.edges_child
return np.array([
(tm[ep[site.mutations[0].edge]]-tm[ec[site.mutations[0].edge]])/2 if len(site.mutations) == 1 else np.nan
for site in ts.sites()
])
species = stdpopsim.get_species("HomSap")
model = species.get_demographic_model("Zigzag_1S14")
contig = species.get_contig("chr22", left=10e6)
samples = {"generic": 100}
engine = stdpopsim.get_engine("msprime")
print("simulating")
orig_ts = engine.simulate(model, contig, samples, random_seed=123)
# unsupported_edges() and remove_edges() from https://github.com/tskit-dev/tskit/discussions/2926
del_by_interval = unsupported_edges(orig_ts, per_interval=True)
poly_ts_by_interval = remove_edges(orig_ts, del_by_interval)
dts_plain = tsdate.date(poly_ts_by_interval.simplify(), mutation_rate=1e-8, progress=True, rescaling_iterations=0)
dts = tsdate.date(poly_ts_by_interval.simplify(), mutation_rate=1e-8, progress=True)
dts_mss = tsdate.date(poly_ts_by_interval.simplify(), mutation_rate=1e-8, progress=True, match_segregating_sites=True)
from matplotlib import pyplot as plt
t_true = single_mutation_sites(orig_ts) # use midpoint_sites() for midpoint
t1 = single_mutation_sites(dts_plain)
t2 = single_mutation_sites(dts)
t3 = single_mutation_sites(dts_mss)
fig, axes = plt.subplots(2, 3, figsize=(15, 7), sharey="row", gridspec_kw={'height_ratios': [2, 1]})
plt.suptitle("True mutation time taken as exact", size=20)
for ax, t, title in zip(
axes[0],
(t1, t2, t3),
("Dated: no rescaling", "Dated: rescaling", "Dated: rescaling with match ss"),
):
ax.hexbin(np.log10(t_true), np.log10(t), bins="log")
ax.plot([0, 12], [0, 12], color='red', linestyle='-')
ax.set_xlabel("log10 true mutation time")
ax.set_ylabel("log10 inferred mutation time")
ax.set_ylim(-1, 5.5)
ax.set_xlim(-1, 5.5)
ax.set_title(title)
# Bias
for ax, t in zip(axes[1], (t1, t2, t3)):
ax.hist(np.log10(t_true)-np.log10(t), bins=100)
ax.axvline(np.nanmean(np.log10(t_true)-np.log10(t)), c="red")
ax.axvline(np.nanmedian(np.log10(t_true)-np.log10(t)), c="magenta")
ax.set_xlabel("log10(true mutation time) - log10(inferred mutation time)")
import demesdraw
fig, axes = plt.subplots(1, 5, figsize=(20, 5))
for ax, tree_seq, label in zip(
axes,
(orig_ts, poly_ts_by_interval, dts_plain, dts, dts_mss),
("Original (Pair_coalescence in orange)", "Original_polytomied", "Dated: no rescaling", "Dated: rescaling", "Dated: rescaling w/ match ss"),
):
x = np.logspace(0, np.log10(orig_ts.max_time), 50)
x[-1] = np.inf
x[0] = 0
demesdraw.size_history(model.model.to_demes(), ax)
rates = tree_seq.pair_coalescence_rates(x)
ax.plot(x[:-1], 1/rates/2, c="orange")
ax.set_xscale("log")
ax.set_xlim(10, 1e5)
ax.set_title(label)

