Right I think I figured it out, thanks to your comments and books, as well as some online resources including these:
https://cme.h-its.org/exelixis/web/teaching/seminarSlides/example2.pdf
http://www.sfu.ca/biology/courses/bisc869/869_lectures/MHP_Coalescent.pdf
https://www.ccg.unam.mx/~vinuesa/tlem/pdfs/coalescent-1.pdf
Here’s some graphs I generated:
In the first (top left), I’m simply calculating the probability of k alleles coalescing to k-1 in a constant population (Ny=500) at various times in the past t:
P = 1-\left(1-\frac{k(k-1)}{2Ny}\right)^t
In the second (top right), it’s similar but the number of alleles (k) is on the x axis, while the different colours represent different sized constant populations. Here, the bigger the population, the more time (generations) it takes to go from 20 alleles to 1 (coalescence).
The number of generations until coalescence of k to k-1 alleles is given by:
t = \frac{2Ny}{k(k-1)}
To get the third graph (bottom), I took these t values and transformed them from “fictional time” to different “real times” to simulate different population growth rates according to:
Rt = \frac{ln(1+gt)}{g}
With this, I could simulate multiple growth rates (0.001-0.1) and multiple final population sizes (100-1000). As before, larger population sizes leads to longer coalescence times, and as expected, faster growth rates leads to shorter coalescence times.
It all seems to make sense, so I’m reasonably confident I’ve done everything correctly. This was just a bit of fun, an excuse to do some maths and play with R, but of course I’m also keen to know if there are any mistakes!
The next step would be to use these results to draw simulated trees so maybe I’ll give that a try some other time. Here’s all the latest code (I was a bit less vigilant in documenting everything as I’ve been working through the night - It’s 6am now!):
###load packages
library(SciViews) #needed to use ln function
library(ggplot2) #needed to make nice graphs
library(gridExtra) #needed to show plots side-by-side
library(dplyr) #needed for some data manipulation functions
library(reshape2)
###
###define parameters
Ny0 <- 500 #population at time 0 (present)
t <- seq(0,2000,50) #range of time points in the past to be simulated
g <- 0.01 #population growth rate since time t to present
###
Nyt<-Ny0*exp(-g*t) #exponential growth between time t and 0 - gives population size at time t
plot(t, Nyt) #plot population size going back in time
###coalescence in constant population
P = 1-((1-1/Ny0)^t) #calculate coalescence probability of 2 alleles in constant population size
constant <- data.frame(t, P)
cgraph <-
ggplot(constant, aes(x=t, y=P)) +
geom_line() +
theme_classic() +
geom_point() +
ylab("P(coalescence)") +
ggtitle("constant population size")
gvalue0 <- rep("0 (constant)", length(t)) #make vector with gvalues (0)
constantdf <- data.frame(t, gvalue0, P) #format constant data into data for combination later
colnames(constantdf)[1] <- "Rt"
colnames(constantdf)[2] <- "gvalue"
###
###coalescence in growing population
#need to convert the times in constant population to "real time" to account for growing population
Rt <- (1/g)*ln(1+g*t) #treat t as fictional time then convert to real time (Rt)
growing <- data.frame(Rt, P)
ggraph <-
ggplot(growing, aes(x=Rt, y=P)) +
geom_line() +
theme_classic() +
geom_point() +
xlim(0, max(Rt)) +
ylab("P(coalescence)") +
ggtitle("growing at rate g=0.01")
###
grid.arrange(cgraph, ggraph+xlim(0,max(t)), ncol = 1) #plot both graphs at the same time
###coalescence in shrinking population
s=-0.01 #population shrinkage rate (inverse of g)
Nyt<-Ny0*exp(-s*t) #exponential shrinkage between time t and 0 - gives population size at time t
#need to convert these times to real t
Rts <- (1/s)*ln(1+s*t) #treat t as Ft then convert to Rt
shrinking <- data.frame(Rts, P)
sgraph <-
ggplot(shrinking, aes(x=Rts, y=P)) +
geom_line() +
theme_classic() +
geom_point() +
xlim(0, max(Rt)) +
ylab("P(coalescence)") +
ggtitle("shrinking at rate g=-0.01")
###
grid.arrange(ggraph+xlim(0,max(t)), cgraph, sgraph+xlim(0,max(t)), ncol = 1) #plot growing, constant, and shrinking population graphs
###plotting a range of population growth rates
grange <- c(0.001, 0.01, 0.1) #set list of g values to simulate
gtable<-data.frame(x1 = 1:length(t)) #create placeholder dataframe to store simulated real times
for(g in grange) {
new <- (1/g)*ln(1+g*t)
gtable[ , ncol(gtable) + 1] <- new
colnames(gtable)[ncol(gtable)] <- paste0(g)
} #iterate through different g values
gtable = gtable[ ,-1] #remove first placeholder column
head(gtable) #check top of table
gtable<-data.frame(x=unlist(gtable)) #flatten into single column
gtable$gvalue <- rep(grange, each=length(t)) #add g value column as labels
rownames(gtable) <- NULL #remove misleading rownames
multipleg <- data.frame(gtable, rep(P, length(grange))) #add in P values to data frame
colnames(multipleg)[1] <- "Rt"
colnames(multipleg)[3] <- "P"
head(multipleg) #check top of table
multiplegfinal <-rbind(constantdf, multipleg) #combine with constant population results
multipleggraph <-
ggplot(multiplegfinal, aes(x=Rt, y=P, color=factor(gvalue))) +
geom_line() +
theme_classic() +
geom_point(size = 1) +
xlim(0, max(multiplegfinal$Rt[is.finite(multiplegfinal$Rt)])) +
ylab("P(coalescence)") +
xlab("generations before present") +
ggtitle("P(coalescence) with growth rates g") +
theme(legend.position = c(0.7, 0.4))
multipleggraph
#note leftward shift of coalescence probability in growing population - more likely to coalescence earlier as a result of the smaller population sizes in the past
ggsave(plot=multipleggraph,filename = "P(coalescence).png", units = "px", width = 1200, height = 1300, device='png', dpi=300)
###plot population size over time
gpoptable<-data.frame(x1 = 1:length(t)) #create placeholder dataframe to store simulated real times
for(g in grange) {
new <- Ny0*exp(-g*t)
gpoptable[ , ncol(gpoptable) + 1] <- new
colnames(gpoptable)[ncol(gpoptable)] <- paste0(g)
} #iterate through different g values
gpoptable = gpoptable[,-1] #remove first placeholder column
head(gpoptable) #check top of table
constantpopN <- rep(Ny0, length(t)) #make vector with constant population size
constantpopdf <- data.frame(constantpopN, gvalue0, t) #format constant data into data for combination later
colnames(constantpopdf)[1] <- "N"
colnames(constantpopdf)[2] <- "gvalue"
colnames(constantpopdf)[3] <- "t"
gpoptable<-data.frame(x=unlist(gpoptable)) #flatten into single column
gpoptable$gvalue <- rep(grange, each=length(t)) #add g value column as labels
rownames(gpoptable) <- NULL #remove misleading rownames
colnames(gpoptable)[1] <- "N" #rename first column
head(gpoptable) #check top of table
gpoptable <- data.frame(gpoptable, t) #add t column to table
multiplegpopfinal <-rbind(constantpopdf, gpoptable) #combine with constant population N table
multiplepopgraph <-
ggplot(multiplegpopfinal, aes(x=t, y=N, color=factor(gvalue))) +
geom_line() +
theme_classic() +
geom_point(size = 1) +
ylab("Ny") +
xlab("generations before present") +
ggtitle("Population size history by growth rate g") +
theme(legend.position = "none")
#+
#theme(legend.position = c(0.8, 0.63))
###
figure <- grid.arrange(multiplepopgraph, multipleggraph, nrow = 1)
ggsave(plot=figure, filename = "coal and pop.png", units = "px", width = 3000, height = 1500, device='png', dpi=400)
###simulate with k alleles
Pk <- 1-(1-((k*(k-1))/(2*Ny0)))^t #probability of coalescence of k to k-1 lineages
krange <- seq(2, 10, 2) #set list of k values to simulate
ktable<-data.frame(x1 = 1:length(t)) #create placeholder dataframe to store simulated real times
for(k in krange) {
new <- 1-(1-((k*(k-1))/2)/Ny0)^t
ktable[ , ncol(ktable) + 1] <- new
colnames(ktable)[ncol(ktable)] <- paste0(k)
} #iterate through different k values
ktable = ktable[ ,-1] #remove first placeholder column
head(ktable) #check top of table
ktable<-data.frame(x=unlist(ktable)) #flatten into single column
ktable$kvalue <- rep(krange, each=length(t)) #add g value column as labels
rownames(ktable) <- NULL #remove misleading rownames
colnames(ktable)[1] <- "P"
head(ktable) #check top of table
ktable <- data.frame(ktable, t) #add t column to table
multiplekgraph <-
ggplot(ktable, aes(x=t, y=P, color=factor(kvalue))) +
geom_line(size = 1) +
theme_classic() +
ylab("P(coalescence)") +
xlab("generations before present") +
ggtitle("P(coalescence) with k alleles (Ny=500)") +
theme(legend.position = c(0.7, 0.4))
multiplekgraph
#mean time to first coalescence event is 2*Ny/(k*(k-1))
k <- 20 #number of sampled alleles
klist <- c(2:k)
Nylist <- c(100, 500, 1000)
ktable2<-data.frame(x1 = 1:length(klist)) #create placeholder dataframe
ktable3<-data.frame(x1 = 1:(length(klist))) #create placeholder dataframe
for(Ny in Nylist) {
for(k in klist) {
new <- 2*Ny/(k*(k-1))
ktable2[nrow(ktable2) + 1, ] <- new
}
ktable3[ ,(nrow(ktable2)/length(klist))] <- ktable2[c((nrow(ktable2)-(length(klist)-1)):nrow(ktable2)),c(1)]
colnames(ktable3)[nrow(ktable2)/length(klist)] <- paste0(Ny)
}
ktable3 = ktable3[ ,-1] #remove first placeholder column
ktable3
ktable3 <- rbind(rep(0,nrow(ktable3)), ktable3)
ktable4<-ktable3
ktable4[] <- lapply(ktable4, cumsum)
ktable4<-data.frame(x=unlist(ktable4)) #flatten into single column
ktable4$Ny <- rep(Nylist, each=(length(klist)+1)) #add Ny column as labels
rownames(ktable4) <- NULL #remove misleading rownames
colnames(ktable4)[1] <- "t"
head(ktable4) #check top of table
ktable4$kvalue <- rep(c(1:(length(klist)+1)), length(Nylist)) #add kvalue column to table
kmultipleNygraph <-
ggplot(ktable4, aes(x=kvalue, y=t, color=factor(Ny))) +
geom_line(size=1) +
theme_classic() +
ylab("Coalescence time (generations)") +
xlab("Number of alleles (k)") +
ggtitle("Coalescence time of k alleles")
kmultipleNygraph
###simulating different g rates with k tips
t<-2*Ny/(k*(k-1)) #time to k to k-1 coalescent event, calculate for 1:k
#use t column in ktable4
#simulate for a range of g values
Rt <- (1/g)*ln(1+g*t) #treat t as fictional time then convert to real time (Rt)
ktable5<-ktable4
grange <- c(0.001, 0.01, 0.1) #set list of g values to simulate
for(g in grange) {
new <- (1/g)*ln(1+g*ktable5[,1])
ktable5[ , ncol(ktable5) + 1] <- new
colnames(ktable5)[ncol(ktable5)] <- paste0(g)
}
head(ktable5)
colnames(ktable5)[1] <- "0 (constant)" #rename column
ktable6 <- melt(ktable5, id.vars = c("Ny", "kvalue"))
colnames(ktable6)[3] <- "g" #rename column
colnames(ktable6)[4] <- "Rt" #rename column
head(ktable6)
head(ktable5)
ktable7 <- melt(ktable5, id.vars = c("0 (constant)", "Ny", "kvalue"))
colnames(ktable7)[4] <- "g" #rename column
colnames(ktable7)[5] <- "Rt" #rename column
head(ktable7)
gmultipleNygraph <-
ggplot(ktable7, aes(x=kvalue, y=Rt,
color=factor(Ny),
group=interaction(g, Ny))) +
geom_line(aes(linetype=factor(g)), size=1) +
theme_classic() +
ylab("Coalescence time (generations)") +
xlab("Number of alleles (k)") +
ggtitle("Coalescence time of k alleles in different sized populations with different growth rates")
gmultipleNygraph
figure2 <- grid.arrange(multiplekgraph,
gmultipleNygraph,
kmultipleNygraph,
nrow = 2, ncol = 2,
widths = c(1, 1),
layout_matrix = rbind(c(1, 3),
c(2, 2))
)
ggsave(plot=figure2, filename = "coal with k and g.png", units = "px", width = 3200, height = 3000, device='png', dpi=400)