# Simulation parameters: these define the assembly being simulated.

B   <- 200000  # genome length
R   <- 1000    # number of reads
L   <- 200     # read length
its <- 1000    # number of simulation iterations


# Quantities derived from the parameters above.

# Number of bins in the occupancy simulation (genome length / read length).
B.occ <- B / L

# Number of bins in the expected overlap tiling discretization:
# B.occ bins plus a further B.occ - 1 bins.
B.tile <- (B.occ - 1) + B.occ


# Run the three simulations `its` times. For every iteration, record the
# longest run of covered bins (the longest contig) produced by each model:
#   lengths.norm.max     - base simulation, expressed in units of read length
#   lengths.occ.max      - occupancy simulation (B.occ bins)
#   lengths.occ.tile.max - expected overlap tiling simulation (B.tile bins)

lengths.norm.max <- rep(0,its)
lengths.occ.max <- rep(0,its)
lengths.occ.tile.max <- rep(0,its)

# seq_len() rather than 1:its so that its == 0 runs no iterations at all.
for (it in seq_len(its))  {
	print(paste('Iteration',it))


	# Base simulation: each read starts at a uniformly drawn position and
	# covers L consecutive positions, truncated at the end of the genome.
	samps <- sample(c(1:B),R,replace=TRUE)
	read.ends <- pmin(samps + L - 1, B)
	# Per-position coverage via a difference array: +1 where a read starts,
	# -1 just past where it ends, then a running sum. This gives the same
	# counts as incrementing every covered position read by read, without
	# an R-level loop over the reads.
	coverage.delta <- tabulate(samps, nbins = B + 1) - tabulate(read.ends + 1, nbins = B + 1)
	bins <- cumsum(coverage.delta[seq_len(B)])
	occ <- bins > 0
	runs <- rle(occ)
	# Keep only the runs of covered positions; spell out `lengths` instead of
	# relying on partial matching of `$`.
	lengths <- runs$lengths[runs$values]
	lengths.norm <- lengths / L
	lengths.norm.max[it] <- max(lengths.norm)


	# Occupancy simulation: drop each read into one of B.occ bins and find
	# the runs of consecutive occupied bins.
	samps <- sample(c(1:B.occ),R,replace=TRUE)
	bins <- tabulate(samps, nbins = B.occ)
	occ <- bins > 0
	runs <- rle(occ)
	lengths.occ <- runs$lengths[runs$values]
	lengths.occ.max[it] <- max(lengths.occ)


	# Expected overlap tiling simulation: drop each read into one of B.tile
	# bins, then look for runs separately among the odd-numbered bins and
	# among the even-numbered bins, and pool the run lengths of both.
	samps <- sample(c(1:B.tile),R,replace=TRUE)
	bins <- tabulate(samps, nbins = B.tile)
	occ.tile <- bins > 0
	runs.tile.neighbors <- rle(occ.tile[seq(1,B.tile,2)])
	lengths.occ.tile.neighbors <- runs.tile.neighbors$lengths[runs.tile.neighbors$values]
	runs.tile.neighbors2 <- rle(occ.tile[seq(2,B.tile,2)])
	lengths.occ.tile.neighbors2 <- runs.tile.neighbors2$lengths[runs.tile.neighbors2$values]
	lengths.occ.tile <- c(lengths.occ.tile.neighbors,lengths.occ.tile.neighbors2)
	lengths.occ.tile.max[it] <- max(lengths.occ.tile)
}


# Poisson approximation, evaluated on a grid of contig lengths.

# Grid of lengths (read-size units): from 2 up to the largest maximum observed
# in any of the three simulations.
k.seq <- seq(2, max(lengths.occ.max, lengths.occ.tile.max, lengths.norm.max))

# alpha is 2*B/L - 1, the same expression as the tiling bin count when
# B.occ = B/L.
alpha <- 2 * B / L - 1
# Chance that one particular bin out of alpha equally likely bins receives at
# least one of the R reads.
beta <- 1 - (1 - 1 / alpha)^R

# Both theta values are logarithms to base 1/beta; they differ only in the
# (B/L - 1) versus (B/L - 2) factor.
log.base <- 1 / beta
theta.w <- log((B / L - 1) * (1 - beta) + 1, base = log.base)
theta.t <- log((B / L - 2) * (1 - beta) + 1, base = log.base)

# Approximate probability curve over k.seq.
pr.poisson <- exp(-(beta^k.seq) * (beta^(-theta.w) + beta^(-theta.t)))


# Plot the empirical cumulative distribution of the maximum contig length from
# each simulation, together with the Poisson approximation.

# Heights of an empirical CDF for a sample: i/n for the i-th smallest value.
ecdf.heights <- function(x) seq_along(x) / length(x)

# Span the x-axis over every plotted series, not just the occupancy results,
# so that no curve is clipped when another simulation reaches further.
x.max <- max(lengths.occ.max, lengths.occ.tile.max, lengths.norm.max, k.seq)

# ylim is fixed to [0, 1] so the Poisson curve is not cut off below the first
# empirical step (1/n).
plot(sort(lengths.occ.max),ecdf.heights(lengths.occ.max),type='s',col='red',
     xlim=c(1,x.max),ylim=c(0,1),
     main=paste('# Reads =',R),xlab='Maximum contig length (read size units)',ylab='')
lines(sort(lengths.occ.tile.max),ecdf.heights(lengths.occ.tile.max),type='s',col='blue')
lines(sort(lengths.norm.max),ecdf.heights(lengths.norm.max),type='l',col='green')
lines(k.seq,pr.poisson,type='l',col='orange')

# Identify the four curves by colour.
legend('bottomright',
       legend=c('Occupancy','Overlap tiling','Base simulation','Poisson approximation'),
       col=c('red','blue','green','orange'),lty=1,bty='n')
