diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c1326560..4332ba0b 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,7 +43,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::rcmdcheck + extra-packages: github::LynxJinyangii/RcppTskit/RcppTskit@add-multiple-functions-on-pr-131, any::rcmdcheck needs: check - uses: r-lib/actions/check-r-package@v2 diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml index 24934b79..486e5dcf 100644 --- a/.github/workflows/document.yaml +++ b/.github/workflows/document.yaml @@ -29,7 +29,7 @@ jobs: - name: Install dependencies uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::roxygen2 + extra-packages: github::LynxJinyangii/RcppTskit/RcppTskit@add-multiple-functions-on-pr-131, any::roxygen2 needs: roxygen2 - name: Document diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 0b260216..91667d7f 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -30,7 +30,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, local::. + extra-packages: github::LynxJinyangii/RcppTskit/RcppTskit@add-multiple-functions-on-pr-131, any::pkgdown, local::. 
needs: website - name: Build site diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index e38eef6a..65c868f9 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -36,7 +36,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr, any::xml2 + extra-packages: github::LynxJinyangii/RcppTskit/RcppTskit@add-multiple-functions-on-pr-131, any::covr, any::xml2 needs: coverage - name: Test coverage diff --git a/.gitignore b/.gitignore index 22624c10..69afbedc 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,12 @@ src/Makevars src/Makevars.win vignettes/*.html vignettes/*.pdf +.idea/ +src/.idea/ +.Rlib/ +*tar.gz +__pycache__/ +testData/out_phase1_useMacsMut_FALSE/*.trees +testData/out_msprime_from_macs/*.trees +dev/testData/out_phase1_useMacsMut_FALSE/*.trees +dev/testData/out_msprime_from_macs/*.trees diff --git a/R/Class-Pop.R b/R/Class-Pop.R index ebf398c3..e6df0a89 100644 --- a/R/Class-Pop.R +++ b/R/Class-Pop.R @@ -686,7 +686,7 @@ newPop = function(rawPop,ploidy=NULL,simParam=NULL,nThreads=NULL,...){ .newPop = function(rawPop, id=NULL, mother=NULL, father=NULL, iMother=NULL, iFather=NULL, isDH=NULL, femaleParentPop=NULL, maleParentPop=NULL, - hist=NULL, simParam=NULL, nThreads=NULL,...){ + hist=NULL, histGen=NULL, simParam=NULL, nThreads=NULL,...){ if(is.null(simParam)){ simParam = get("SP",envir=.GlobalEnv) } @@ -813,7 +813,7 @@ newPop = function(rawPop,ploidy=NULL,simParam=NULL,nThreads=NULL,...){ if(simParam$isTrackPed){ if(simParam$isTrackRec){ - simParam$addToRec(lastId,id,iMother,iFather,isDH,hist,output@ploidy) + simParam$addToRec(lastId,id,iMother,iFather,isDH,hist,histGen,output@ploidy) #Jinyang modified }else{ simParam$addToPed(lastId,id,iMother,iFather,isDH) } diff --git a/R/Class-SimParam.R b/R/Class-SimParam.R index 26a661a9..0740f53f 100644 --- a/R/Class-SimParam.R +++ b/R/Class-SimParam.R @@ -111,6 +111,8 @@ SimParam = R6Class( 
private$.pedigree = matrix(NA_integer_,nrow=0,ncol=3) private$.isTrackRec = FALSE private$.recHist = list() + private$.isTrackRecGen = FALSE + private$.recHistGen = list() # Jinyang added private$.varA = numeric() private$.varG = numeric() private$.varE = numeric() @@ -187,6 +189,25 @@ SimParam = R6Class( invisible(self) }, + #' @description Sets genetic-coordinate recombination tracking for the simulation. Jinyang added. + #' By default this is turned off. When turned on, it will also turn on pedigree tracking. + #' + #' @param isTrackRecGen should genetic-coordinate recombination tracking be on. + #' @param force should the check for a running simulation be ignored. + setTrackRecGen = function(isTrackRecGen, force=FALSE){ + stopifnot(is.logical(isTrackRecGen)) + if(!force){ + private$.isRunning() + } + private$.isTrackRecGen = isTrackRecGen + if(isTrackRecGen){ + private$.isTrackPed = TRUE + private$.isTrackRec = TRUE + } + invisible(self) + }, + + #' @description Resets the internal lastId, the pedigree #' and recombination tracking (if in use) to the #' supplied lastId. 
Be careful using this function because @@ -217,6 +238,10 @@ SimParam = R6Class( if(private$.isTrackRec){ private$.recHist = private$.recHist[0:lastId] } + # Jinyang added + if(private$.isTrackRecGen){ + private$.recHistGen = private$.recHistGen[0:lastId] + } invisible(self) }, @@ -2139,9 +2164,12 @@ SimParam = R6Class( #' @param father vector of father iids #' @param isDH indicator for DH lines #' @param hist new recombination history + #' @param histGen new recombination history (genetic coordinate) #' @param ploidy ploidy level addToRec = function(lastId,id,mother,father,isDH, - hist,ploidy){ + hist, + histGen=NULL, # Jinyang added + ploidy){ nNewInd = lastId-private$.lastId stopifnot(nNewInd>0) if(length(isDH)==1) isDH = rep(isDH,nNewInd) @@ -2172,12 +2200,24 @@ SimParam = R6Class( names(newRecHist) = id private$.recHist = c(private$.recHist, newRecHist) private$.lastHaplo = tmpLastHaplo + + # Jinyang added + if(private$.isTrackRecGen){ + #newRecHistGen = vector("list", nNewInd) + #names(newRecHistGen) = id + private$.recHistGen = c(private$.recHistGen, newRecHist) + } }else{ # Add hist to recombination history private$.hasHap = c(private$.hasHap, rep(FALSE, nNewInd)) private$.isFounder = c(private$.isFounder, rep(FALSE, nNewInd)) names(hist) = id private$.recHist = c(private$.recHist, hist) + # Jinyang added + if(private$.isTrackRecGen){ + names(histGen) = id + private$.recHistGen = c(private$.recHistGen, histGen) + } } private$.pedigree = rbind(private$.pedigree, tmp) private$.lastId = lastId @@ -2292,6 +2332,8 @@ SimParam = R6Class( .pedigree="matrix", .isTrackRec="logical", .recHist="list", + .isTrackRecGen = "logical", + .recHistGen = "list", #Jinyang added .varA="numeric", .varG="numeric", .varE="numeric", @@ -2735,6 +2777,25 @@ SimParam = R6Class( } }, + #' @field isTrackRecGen is recombination being tracked. Jinyang added. 
+ isTrackRecGen = function(value){ + if(missing(value)){ + private$.isTrackRecGen + }else{ + stop("`$isTrackRecGen` is read only",call.=FALSE) + } + }, + + #' @field recHistGen list of historic recombination events. Jinyang added. + recHistGen = function(value){ + if(missing(value)){ + private$.recHistGen + }else{ + stop("`$recHistGen` is read only",call.=FALSE) + } + }, + + #' @field haplotypes list of computed IBD haplotypes haplotypes=function(value){ if(missing(value)){ diff --git a/R/RcppExports.R b/R/RcppExports.R index ed16b7c5..beef8f59 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -292,8 +292,8 @@ createIbdMat <- function(ibd, chr, nLoci, ploidy, nThreads) { .Call(`_AlphaSimR_createIbdMat`, ibd, chr, nLoci, ploidy, nThreads) } -cross <- function(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads) { - .Call(`_AlphaSimR_cross`, motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads) +cross <- function(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen) { + .Call(`_AlphaSimR_cross`, motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen) } createDH2 <- function(geno, nDH, genMap, v, p, trackRec, nThreads) { @@ -399,6 +399,10 @@ MaCS <- function(args, maxSites, inbred, ploidy, nThreads, seed) { .Call(`_AlphaSimR_MaCS`, args, maxSites, inbred, ploidy, nThreads, seed) } +MaCSTS <- function(args, nChr, inbred, ploidy, nThreads, seed, usePhysicalPositions = FALSE, useMacsMut = FALSE, Nref = NA_real_, expandInbredSamples = TRUE) { + .Call(`_AlphaSimR_MaCSTS`, args, nChr, inbred, ploidy, nThreads, seed, 
usePhysicalPositions, useMacsMut, Nref, expandInbredSamples) +} + #' @title Summarise `tskit` table collection #' @param tc an external pointer to a \code{tsk_table_collection_t} object. #' @return A list. @@ -426,3 +430,11 @@ rtsk_treeseq_get_num_individuals2 <- function(ts) { .Call(`_AlphaSimR_rtsk_treeseq_get_num_individuals2`, ts) } +tsMutateTableCollection <- function(tc, theta, seed) { + invisible(.Call(`_AlphaSimR_tsMutateTableCollection`, tc, theta, seed)) +} + +tsFinalizeInbredTableCollection <- function(tc, ploidy) { + invisible(.Call(`_AlphaSimR_tsFinalizeInbredTableCollection`, tc, ploidy)) +} + diff --git a/R/alphaSimR2Ts.R b/R/alphaSimR2Ts.R new file mode 100644 index 00000000..f0874957 --- /dev/null +++ b/R/alphaSimR2Ts.R @@ -0,0 +1,276 @@ +library(jsonlite) + +recHistMatToSegDf <- function(histMat, nLoci) { + + origin <- as.integer(histMat[, 1]) + starts <- as.integer(histMat[, 2]) + + ends <- c(starts[-1] - 1L, nLoci) + + data.frame( + origin = origin, + locusStart = starts, + locusEnd = ends, + stringsAsFactors = FALSE + ) +} + + +recHistToSegDfWithParents <- function(SP, offspringPop, nLociByChr) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHist[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + nLoci <- nLociByChr[[cc]] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistMatToSegDf(haps[[h]], nLoci = nLoci) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "locusStart","locusEnd", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + 
+bridgeCollectSegFromSimOutput <- function(SP, simOutput) { + bridgeSegDfList <<- list() + + nLociByChr <- lapply(chrKeptPosBpList, length) + + for (k in 2:length(simOutput)) { + segDf <- recHistToSegDfWithParents(SP, simOutput[[k]], nLociByChr) + bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf + } + + invisible(bridgeSegDfList) +} + + +segDfToEdgeDfUsingBridge <- function(segDf, chr_info) { + # segDF: childID, hap, chr, locusStart, locusEnd, origin + out <- segDf + out$left <- NA_real_ + out$right <- NA_real_ + + for (cc in sort(unique(out$chr))) { + #posBp <- bridgeEnv$chrKeptPosBpList[[cc]] + #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.") + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- as.numeric(ts$sequence_length) + + idx <- which(out$chr == cc) + + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + out +} + +bridgeAllSegToEdgeDf <- function(chr_info) { + allSeg <- do.call(rbind, bridgeSegDfList) + + out <- allSeg + out$left <- NA + out$right <- NA + + for (cc in sort(unique(out$chr))) { + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + tc <- tc_load(tsPath) + seqLen <- tc$sequence_length() + + idx <- which(out$chr == cc) + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + + out +} + +bridgeComputeIndTime <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTrees <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = 
"AlphaSimR_extended") { + + indTime <- bridgeComputeIndTime(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- ts_load(chr_info[[cc]]$ts_path) + tc <- ts$dump_tables() + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tc$node_table_get_row(sampNodeId[i])$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. ", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + # list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + + # add indIDs for offSpring nodes + nextInd <- as.integer(tc$num_individuals()) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tc$individual_table_add_row( + #parents = list(as.integer(mRow), as.integer(fRow)), + parents = c(as.integer(mRow), as.integer(fRow)), + metadata = charToRaw(toJSON( + list(file_id=as.integer(newId)), + auto_unbox = TRUE))) + + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for 
(childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tc$node_table_add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = as.character(toJSON( + list(alphaSimR = list(id = key)), + auto_unbox = TRUE, force = TRUE)) + ) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tc$num_nodes() - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tc$edge_table_add_row( + left = df$left[i], + right = df$right[i], + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tc$sort() + newTs <- tc$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/R/alphaSimR2TsGen.R b/R/alphaSimR2TsGen.R new file mode 100644 index 00000000..d93ee2d6 --- /dev/null +++ b/R/alphaSimR2TsGen.R @@ -0,0 +1,121 @@ +library(jsonlite) + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP 
+ M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + + + +recHistGenMatToSegDf <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + +recHistGenToSegDfWithParents <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + tc <- tc_load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(tc$sequence_length()) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDf(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + 
seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegGenFromSimOutput <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParents(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/R/crossing.R b/R/crossing.R index 32495504..00415b1e 100644 --- a/R/crossing.R +++ b/R/crossing.R @@ -96,7 +96,8 @@ makeCross = function(pop, crossPlan, nProgeny=1, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) + nThreads, + simParam$isTrackRecGen) dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo @@ -112,7 +113,12 @@ makeCross = function(pop, crossPlan, nProgeny=1, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } return(.newPop(rawPop=rPop, mother=pop@id[crossPlan[,1]], father=pop@id[crossPlan[,2]], @@ -121,6 +127,7 @@ makeCross = function(pop, crossPlan, nProgeny=1, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, # Jinyang added simParam=simParam, nThreads=nThreads)) } @@ -435,7 +442,8 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) + nThreads, + simParam$isTrackRecGen) # Jinyang added dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo @@ -451,7 +459,13 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } + 
return(.newPop(rawPop=rPop, mother=females@id[crossPlan[,1]], father=males@id[crossPlan[,2]], @@ -460,6 +474,7 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, femaleParentPop=females, maleParentPop=males, hist=hist, + histGen=histGen, # Jinyang added simParam=simParam, nThreads=nThreads)) } @@ -676,8 +691,9 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) - + nThreads, + simParam$isTrackRecGen) # Jinyang added + dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo rPop = new("RawPop", @@ -692,7 +708,12 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } if(keepParents){ return(.newPop(rawPop=rPop, mother=pop@mother[crossPlan[,1]], @@ -702,6 +723,7 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, simParam=simParam, nThreads=nThreads)) }else{ @@ -713,6 +735,7 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, simParam=simParam, nThreads=nThreads)) } diff --git a/R/makeFoundersFromTs.R b/R/makeFoundersFromTs.R new file mode 100644 index 00000000..9f5d65b2 --- /dev/null +++ b/R/makeFoundersFromTs.R @@ -0,0 +1,616 @@ +#' Sample Biallelic Variants from a Tree Sequence +#' +#' @param ts A `RcppTskit::TreeSequence` object. +#' @param segSites Integer number of biallelic variants to sample. +#' @param seed Integer random seed for reservoir sampling. +#' +#' @return A list with `H` (haplotypes matrix; rows are samples) and +#' `P` (numeric vector of site positions). +#' @keywords internal +#' @noRd +sample_segregating_variants <- function(ts, segSites, seed) { + + # Sample segregating variants from the tree sequence. 
+ # + # Parameters + # ========== + # ts: tskit.TreeSequence + # The tree sequence to sample from. + # segSites: int + # The number of segregating sites to sample. + # seed: int + # The random seed to use for sampling. + # + # Returns + # ======= + # list of int + # The positions of the sampled segregating sites. + # Set the random seed for reproducibility. + set.seed(seed) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix and P vector based on required sample size (segSites) + # We only need space for 'segSites' number of variants + H <- matrix(NA_integer_, nrow = num_samples, ncol = segSites) + P <- numeric(segSites) + + it <- ts$variants() + + # k tracks how many biallelic variants we have encountered so far + k <- 0 + # current_size tracks how many variants are currently in our reservoir + current_size <- 0 + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + k <- k + 1 + + if (current_size < segSites) { + # Case A: Reservoir is not full yet + current_size <- current_size + 1 + H[, current_size] <- g + P[current_size] <- v$position + } else { + # Case B: Reservoir is full, use Prob. entry: j/k + # sample.int(k, 1) returns a value from 1 to k + j <- sample.int(k, 1) + + if (j <= segSites) { + # Replace the existing variant at index j + H[, j] <- g + P[j] <- v$position + } + } + } + } + + # 4. Final check: if we found fewer biallelic sites than segSites, trim the output + if (k < segSites) { + if (k > 0) { + H <- H[, 1:k, drop = FALSE] + P <- P[1:k] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + } + + return(list(H = H, P = P)) +} + + +#' Extract All Biallelic Variants from a Tree Sequence +#' +#' @param ts A `RcppTskit::TreeSequence` object. +#' @param debug Logical; if `TRUE`, print diagnostics while scanning variants. 
+#' +#' @return A list with `H` (haplotypes matrix; rows are samples) and +#' `P` (numeric vector of site positions). +#' @keywords internal +#' @noRd +segregating_variants <- function(ts, debug = FALSE) { + # 1. Get dimensions for pre-allocation + max_sites <- as.integer(ts$num_sites()) + num_samples <- as.integer(ts$num_samples()) + if (debug) { + message("Expected max sites: ", max_sites) + message("Expected num samples (from ts): ", num_samples) + } + + # 2. Pre-allocate H matrix (Rows: samples, Cols: sites) + # Using integer matrix to save memory (similar to np.int8) + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + # Pre-allocate P vector for positions + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + if (debug && count == 0L) { + message("Actual length of genotype vector (g): ", length(g)) + message("Matrix H_full has ", nrow(H_full), " rows") + if (length(g) != nrow(H_full)) { + stop("DIMENSION MISMATCH: The genotype vector length does not match matrix rows!") + } + } + + # Filter for biallelic sites (exactly 2 unique alleles) + if (length(unique(g)) == 2) { + count <- count + 1 + if (debug && count > max_sites) { + stop("INDEX OVERFLOW: count (", count, ") exceeded max_sites (", max_sites, ")") + } + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results to the actual number of kept variants + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + if (debug) { + message("Success! Final count of biallelic variants: ", count) + } + + return(list(H = H, P = P)) +} + +#' Debug Wrapper for Variant Extraction +#' +#' @param ts A `RcppTskit::TreeSequence` object. +#' +#' @return A list with `H` and `P` as in [segregating_variants()]. 
+#' @keywords internal +#' @noRd +segregating_variants_debug <- function(ts) { + segregating_variants(ts, debug = TRUE) +} + +#' Convert Physical Positions to Cumulative Morgan Positions +#' +#' @param x Numeric vector of physical positions. +#' @param breaks Numeric vector of recombination map breakpoints. +#' @param rates Numeric vector of per-bp recombination rates for each interval. +#' +#' @return Numeric vector of cumulative Morgan positions. +#' @keywords internal +#' @noRd +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +#' Load Tree Sequence from Supported Sources +#' +#' @param ts_path Character path to a `.trees` file. +#' @param ts Optional `TreeSequence` object or external pointer. +#' @param ts_xptr Optional external pointer to `tsk_treeseq_t`. +#' @param tc_xptr Optional external pointer to `tsk_table_collection_t`. +#' @param table_xptr Alias for `tc_xptr`. +#' +#' @return A `RcppTskit::TreeSequence` object. 
+#' @keywords internal +#' @noRd +asMapPop_load_ts <- function(ts_path = NULL, ts = NULL, ts_xptr = NULL, + tc_xptr = NULL, table_xptr = NULL) { + if (!is.null(ts)) { + if (inherits(ts, "externalptr")) { + try_ts <- try(RcppTskit::TreeSequence$new(xptr = ts), silent = TRUE) + if (!inherits(try_ts, "try-error")) { + return(try_ts) + } + tc <- RcppTskit::TableCollection$new(xptr = ts) + return(tc$tree_sequence()) + } + return(ts) + } + if (!is.null(ts_xptr)) { + return(RcppTskit::TreeSequence$new(xptr = ts_xptr)) + } + tc_ptr <- if (!is.null(tc_xptr)) tc_xptr else table_xptr + if (!is.null(tc_ptr)) { + tc <- RcppTskit::TableCollection$new(xptr = tc_ptr) + return(tc$tree_sequence()) + } + if (!is.null(ts_path)) { + return(RcppTskit::ts_load(ts_path)) + } + stop("No tree-sequence source provided. Provide one of ts_path, ts, ts_xptr, tc_xptr, or table_xptr.") +} + +#' Convert One Chromosome Tree Sequence to Map/Haplotypes +#' +#' @param ts_path Character path to `.trees` file (optional). +#' @param breaks Numeric vector of recombination map breakpoints. +#' @param rates Numeric vector of per-bp recombination rates. +#' @param segSites Optional integer number of biallelic variants to keep. +#' @param site_sampling_seed Integer seed for site sampling. +#' @param ts Optional in-memory tree sequence object. +#' @param ts_xptr Optional tree-sequence external pointer. +#' @param tc_xptr Optional table-collection external pointer. +#' @param table_xptr Alias for `tc_xptr`. +#' +#' @return A list with `genMap`, `haplotypes`, and `keptPosBp`. 
+#' @keywords internal +#' @noRd +ts2chrData <- function(ts_path = NULL, breaks, rates, segSites, site_sampling_seed, + ts = NULL, ts_xptr = NULL, tc_xptr = NULL, table_xptr = NULL) { + ts <- asMapPop_load_ts( + ts_path = ts_path, + ts = ts, + ts_xptr = ts_xptr, + tc_xptr = tc_xptr, + table_xptr = table_xptr + ) + num_pos <- ts$num_sites() + + if (!is.null(segSites)) { + + if (num_pos < segSites) { + stop("Insufficient sites (only ", num_pos, " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- sample_segregating_variants(ts, segSites, site_sampling_seed) + + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + } + else { + out <- segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ord <- order(pos) + pos <- pos[ord] + H <- H[, ord, drop = FALSE] + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + list( + genMap = list(mpos), + haplotypes = list(H), + keptPosBp = pos + ) +} + +#' Expand Recombination Component to Per-Chromosome List +#' +#' @param x Scalar/list recombination component (`breaks` or `rates`). +#' @param nChr Integer number of chromosomes. +#' @param name Character label for error messages. +#' +#' @return A list of length `nChr`. +#' @keywords internal +#' @noRd +.asMapPop_expand_rec_component <- function(x, nChr, name) { + if (is.null(x)) { + stop("Missing `", name, "` for asMapPop input.") + } + values <- if (is.list(x)) x else list(x) + if (length(values) == 1L) { + return(rep(values, nChr)) + } + if (length(values) != nChr) { + stop("`", name, "` must have length 1 or nChr.") + } + values +} + +#' Extract First Non-NULL Alias from a Named List +#' +#' @param x Named list. +#' @param keys Character vector of alias keys to try in order. 
+#' +#' @return First non-`NULL` component found, or `NULL`. +#' @keywords internal +#' @noRd +.asMapPop_extract_component <- function(x, keys) { + for (k in keys) { + if (!is.null(x[[k]])) { + return(x[[k]]) + } + } + NULL +} + +#' Safely Get Exact Named Component +#' +#' @param x Named list. +#' @param key Character scalar key. +#' +#' @return Value for `key` or `NULL` if absent. +#' @keywords internal +#' @noRd +.asMapPop_get <- function(x, key) { + if (!is.list(x) || is.null(names(x))) { + return(NULL) + } + if (!(key %in% names(x))) { + return(NULL) + } + x[[key]] +} + +#' Expand segSites to Per-Chromosome Specification +#' +#' @param segSites Optional scalar/vector/list of site counts. +#' @param nChr Integer number of chromosomes. +#' @param defaults Optional fallback segSites specification. +#' +#' @return List of length `nChr` with integer values or `NULL`. +#' @keywords internal +#' @noRd +.asMapPop_expand_seg_sites <- function(segSites, nChr, defaults = NULL) { + if (is.null(segSites)) { + values <- defaults + } else { + values <- segSites + } + if (is.null(values)) { + return(rep(list(NULL), nChr)) + } + if (!is.list(values)) { + values <- as.list(as.integer(values)) + } + if (length(values) == 1L) { + values <- rep(values, nChr) + } else if (length(values) != nChr) { + stop("`segSites` must have length 1 or nChr.") + } + lapply(values, function(x) { + if (is.null(x) || length(x) == 0) { + return(NULL) + } + x <- as.integer(x[1]) + if (is.na(x) || x <= 0L) { + return(NULL) + } + x + }) +} + +#' Check Whether an Entry Looks Like Chromosome TS Specification +#' +#' @param x List candidate chromosome specification. +#' +#' @return Logical scalar. 
#' @keywords internal
#' @noRd
.asMapPop_is_chr_info <- function(x) {
  if (!is.list(x)) {
    return(FALSE)
  }
  # `[[` matches names exactly. `$` would partially match, so e.g. `x$ts`
  # would pick up an unrelated element named `ts_backup`.
  for (key in c("ts_path", "ts", "ts_xptr", "tc_xptr", "table_xptr")) {
    if (!is.null(x[[key]])) {
      return(TRUE)
    }
  }
  FALSE
}

#' Normalize asMapPop Inputs to Per-Chromosome Specs
#'
#' Two input layouts are recognised:
#' * explicit: every element of `chr_info` (or of `chr_info$chr_info`) is a
#'   list naming a tree-sequence source; `breaks`/`rates` missing on an entry
#'   are filled from the top-level list when it provides them;
#' * bundle: `chr_info` carries `tables` and/or `ts` lists plus `breaks`,
#'   `rates` and optionally `segSites` (several aliases are accepted).
#'
#' @param chr_info Either explicit per-chromosome list or bundle style input
#'   containing `tables`/`ts` and map metadata.
#' @param segSites Optional override for per-chromosome site counts.
#'
#' @return A normalized list of per-chromosome specs.
#' @keywords internal
#' @noRd
.asMapPop_prepare_specs <- function(chr_info, segSites = NULL) {
  if (!is.list(chr_info)) {
    stop("`chr_info` must be a list.")
  }

  # Keep the outer list: it may hold shared `breaks`/`rates` for all entries.
  root_info <- chr_info
  if (!is.null(chr_info[["chr_info"]])) {
    chr_info <- chr_info[["chr_info"]]
    if (!is.list(chr_info)) {
      stop("`chr_info$chr_info` must be a list.")
    }
  }

  is_explicit_chr_info <- length(chr_info) > 0 &&
    all(vapply(chr_info, .asMapPop_is_chr_info, logical(1)))

  if (is_explicit_chr_info) {
    nChr <- length(chr_info)
    default_seg <- lapply(chr_info, function(x) x[["segSites"]])
    seg_by_chr <- .asMapPop_expand_seg_sites(segSites, nChr, defaults = default_seg)
    root_breaks <- .asMapPop_extract_component(root_info, c("breaks", "rec_breaks", "recBreaks"))
    root_rates <- .asMapPop_extract_component(root_info, c("rates", "rec_rates", "recRates"))
    if (!is.null(root_breaks)) {
      root_breaks <- .asMapPop_expand_rec_component(root_breaks, nChr, "breaks")
    }
    if (!is.null(root_rates)) {
      root_rates <- .asMapPop_expand_rec_component(root_rates, nChr, "rates")
    }
    out <- vector("list", nChr)
    for (i in seq_len(nChr)) {
      info <- chr_info[[i]]
      # Per-entry values win; top-level values only fill gaps.
      if (is.null(info[["breaks"]]) && !is.null(root_breaks)) {
        info[["breaks"]] <- root_breaks[[i]]
      }
      if (is.null(info[["rates"]]) && !is.null(root_rates)) {
        info[["rates"]] <- root_rates[[i]]
      }
      if (is.null(info[["breaks"]]) || is.null(info[["rates"]])) {
        stop("Each chromosome entry must include `breaks` and `rates`.")
      }
      # Assigning NULL drops the element, i.e. "no cap" for this chromosome.
      info[["segSites"]] <- seg_by_chr[[i]]
      out[[i]] <- info
    }
    return(out)
  }

  tables <- .asMapPop_extract_component(chr_info, c("tables", "table_collections"))
  ts_list <- .asMapPop_extract_component(chr_info, c("ts", "tree_sequences"))
  if (is.null(tables) && is.null(ts_list)) {
    stop("Unsupported `chr_info` format. Provide a list of chromosome specs or a bundle with `tables`/`ts`.")
  }
  if (!is.null(tables) && !is.list(tables)) {
    stop("`tables` must be a list.")
  }
  if (!is.null(ts_list) && !is.list(ts_list)) {
    stop("`ts` must be a list.")
  }

  # `tables` defines the chromosome count when both sources are present.
  nChr <- if (!is.null(tables)) length(tables) else length(ts_list)
  if (nChr == 0L) {
    # Fail here rather than handing an empty spec list to the converter.
    stop("`tables`/`ts` must contain at least one chromosome.")
  }
  if (!is.null(tables) && !is.null(ts_list) && length(ts_list) != nChr) {
    stop("`tables` and `ts` must have the same length when both are supplied.")
  }

  breaks <- .asMapPop_expand_rec_component(
    .asMapPop_extract_component(chr_info, c("breaks", "rec_breaks", "recBreaks")),
    nChr, "breaks"
  )
  rates <- .asMapPop_expand_rec_component(
    .asMapPop_extract_component(chr_info, c("rates", "rec_rates", "recRates")),
    nChr, "rates"
  )
  default_seg <- .asMapPop_extract_component(chr_info, c("segSites", "seg_sites"))
  seg_by_chr <- .asMapPop_expand_seg_sites(segSites, nChr, defaults = default_seg)

  out <- vector("list", nChr)
  for (i in seq_len(nChr)) {
    out[[i]] <- list(
      ts = if (!is.null(ts_list)) ts_list[[i]] else NULL,
      tc_xptr = if (!is.null(tables)) tables[[i]] else NULL,
      breaks = breaks[[i]],
      rates = rates[[i]],
      segSites = seg_by_chr[[i]]
    )
  }
  out
}

#' Resolve and Validate Thread Count for asMapPop
#'
#' @param nThreads Optional requested thread count.
#'
#' @return Integer thread count >= 1.
#' @keywords internal
#' @noRd
.asMapPop_get_num_threads <- function(nThreads) {
  if (is.null(nThreads)) {
    # Use `getNumThreads()` when such a function is reachable, else go serial.
    if (exists("getNumThreads", mode = "function")) {
      nThreads <- getNumThreads()
    } else {
      nThreads <- 1L
    }
  }
  nThreads <- as.integer(nThreads)
  if (length(nThreads) != 1L || is.na(nThreads) || nThreads < 1L) {
    stop("`nThreads` must be a single positive integer.")
  }
  nThreads
}

#' Apply Chromosome Worker with Optional Parallelism
#'
#' Runs serially when there is at most one chromosome, when `nThreads <= 1`,
#' when any spec lacks `ts_path`, or on non-unix platforms; the last two
#' cases emit a warning. Otherwise the work is forked with
#' `parallel::mclapply()`. A worker failure in the forked path is re-raised
#' as an error rather than returned as a `try-error` element.
#'
#' @param chr_specs Normalized per-chromosome specs.
#' @param worker Function applied to each chromosome spec.
#' @param nThreads Integer requested threads.
#'
#' @return List of worker outputs.
#' @keywords internal
#' @noRd
.asMapPop_apply <- function(chr_specs, worker, nThreads) {
  if (length(chr_specs) <= 1L || nThreads <= 1L) {
    return(lapply(chr_specs, worker))
  }
  has_non_file_source <- any(vapply(chr_specs, function(x) {
    is.null(x$ts_path)
  }, logical(1)))
  if (has_non_file_source) {
    warning(
      "asMapPop: parallel conversion currently uses file-backed TS only. ",
      "Falling back to serial for in-memory TS/tables.",
      call. = FALSE
    )
    return(lapply(chr_specs, worker))
  }
  if (.Platform$OS.type == "unix") {
    results <- parallel::mclapply(chr_specs, worker, mc.cores = nThreads)
    # mclapply() does not propagate worker errors: it stores them as
    # "try-error" objects and only warns. Surface the first one so a broken
    # chromosome cannot reach the caller as if it were data.
    failed <- vapply(results, inherits, logical(1), what = "try-error")
    if (any(failed)) {
      first_bad <- which(failed)[1L]
      cond <- attr(results[[first_bad]], "condition")
      detail <- if (inherits(cond, "condition")) {
        conditionMessage(cond)
      } else {
        as.character(results[[first_bad]])
      }
      stop(
        "asMapPop: conversion failed for chromosome ", first_bad, ": ", detail,
        call. = FALSE
      )
    }
    return(results)
  }
  warning(
    "asMapPop: parallel conversion is only enabled on unix via mclapply. ",
    "Falling back to serial on this platform.",
    call. = FALSE
  )
  lapply(chr_specs, worker)
}

#' Build a MapPop from Tree Sequence Data
#'
#' @param chr_info Input tree-sequence data. Supports either:
#'   1) explicit per-chromosome list entries with `ts_path`/`ts`/`tc_xptr`
#'   plus `breaks` and `rates`; or
#'   2) bundle style list containing `tables` or `ts` plus map metadata.
#' @param ploidy Integer ploidy used to construct the resulting `MapPop`.
#' @param inbred Logical; whether resulting individuals are inbred.
#' @param segSites Optional site-count override (scalar or per chromosome).
#' @param site_sampling_seed Integer seed used when downsampling segregating sites.
#' @param nThreads Optional chromosome-level worker count.
#' @param returnMeta Logical; if `TRUE`, return list with `pop`, `keptPosBp`,
#'   and `chrData`; otherwise return `MapPop` only.
#'
#' @return A `MapPop` object, or metadata list if `returnMeta = TRUE`.
#' @keywords internal
#' @noRd
asMapPop <- function(chr_info, ploidy = 2L, inbred = FALSE, segSites = NULL,
                     site_sampling_seed = 42L, nThreads = NULL,
                     returnMeta = FALSE) {
  ploidy <- as.integer(ploidy)
  nThreads <- .asMapPop_get_num_threads(nThreads)
  chr_specs <- .asMapPop_prepare_specs(chr_info, segSites = segSites)

  # Convert one chromosome spec; fields absent from the spec arrive as NULL.
  convert_chr <- function(spec) {
    field <- function(key) .asMapPop_get(spec, key)
    ts2chrData(
      ts_path = field("ts_path"),
      ts = field("ts"),
      ts_xptr = field("ts_xptr"),
      tc_xptr = field("tc_xptr"),
      table_xptr = field("table_xptr"),
      breaks = field("breaks"),
      rates = field("rates"),
      segSites = field("segSites"),
      site_sampling_seed = site_sampling_seed
    )
  }
  per_chr <- .asMapPop_apply(chr_specs, convert_chr, nThreads = nThreads)

  # save pos in bp for tskit tables
  # NOTE(review): these two `<<-` assignments publish state outside this
  # function (kept bp positions and ploidy). Other code in this patch reads
  # `chrKeptPosBpList` and `ploidy` as free variables, so they stay as-is.
  kept_bp <- lapply(per_chr, function(d) d[["keptPosBp"]])
  chrKeptPosBpList <<- kept_bp
  ploidy <<- ploidy

  # Each chromosome contributes a length-one list; concatenate them.
  genMap <- do.call(c, lapply(per_chr, function(d) d[["genMap"]]))
  haplotypes <- do.call(c, lapply(per_chr, function(d) d[["haplotypes"]]))

  pop <- newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy)
  if (!isTRUE(returnMeta)) {
    return(pop)
  }
  list(
    pop = pop,
    keptPosBp = kept_bp,
    chrData = per_chr
  )
}

# --- patch metadata preserved from the collapsed diff -----------------------
# diff --git a/R/runMacTs.R b/R/runMacTs.R
# new file mode 100644
# index 00000000..700fa066
# --- /dev/null
# +++ b/R/runMacTs.R
# @@ -0,0 +1,445 @@

#' Parse Scaled Mutation Rate (`dTheta`) from a MaCS Argument String
#'
#' @param args Character scalar MaCS command string with sample size and
#'   sequence length as the
#'   first two tokens.
#'
#' @return Numeric scalar `dTheta` used by MaCS-style mutation placement.
#' @keywords internal
#' @noRd
.simAnc_parse_dTheta <- function(args) {
  # Tokenise on commas/whitespace and drop empty tokens.
  parts <- strsplit(as.character(args), "[,[:space:]]+", perl = TRUE)[[1L]]
  parts <- parts[nzchar(parts)]
  if (length(parts) < 2L) {
    stop("args must contain at least sample size and sequence length")
  }

  # Token 2 is the sequence length; anything non-numeric becomes NA and fails.
  seq_length <- suppressWarnings(as.numeric(parts[2L]))
  if (!is.finite(seq_length) || seq_length <= 0) {
    stop("Failed to parse sequence length from args")
  }

  # No `-t`, or `-t` as the very last token, yields 0.
  flag_at <- match("-t", parts)
  has_value <- !is.na(flag_at) && flag_at < length(parts)
  if (!has_value) {
    return(0)
  }

  theta_per_site <- suppressWarnings(as.numeric(parts[flag_at + 1L]))
  if (!is.finite(theta_per_site) || theta_per_site < 0) {
    stop("Failed to parse -t value from args")
  }
  seq_length * theta_per_site
}

#' Simulate Ancestry as TS Tables without Post-Ancestry Mutations
#'
#' @param args Character MaCS command prefix with trailing `-s`.
#' @param nChr Integer number of chromosomes.
#' @param inbred Logical.
#' @param ploidy Integer ploidy.
#' @param nThreads Integer thread count.
#' @param seed Integer vector of chromosome seeds.
#' @param usePhysicalPositions Logical; use bp positions in TS if `TRUE`.
#' @param Nref Optional numeric reference `Ne` for time scaling.
#'
#' @return List with ancestry table collections and metadata.
#' @keywords internal
#' @noRd
simAnc <- function(args, nChr, inbred, ploidy, nThreads, seed,
                   usePhysicalPositions = FALSE, Nref = NA_real_) {
  nChr <- as.integer(nChr)
  chr_count_ok <- length(nChr) == 1L && !is.na(nChr) && nChr > 0L
  if (!chr_count_ok) {
    stop("nChr must be a positive integer scalar")
  }

  # Ancestry only: MaCS-side mutations and inbred sample expansion are off.
  result <- MaCSTS(
    args = args,
    nChr = nChr,
    inbred = inbred,
    ploidy = ploidy,
    nThreads = nThreads,
    seed = seed,
    usePhysicalPositions = usePhysicalPositions,
    useMacsMut = FALSE,
    Nref = Nref,
    expandInbredSamples = FALSE
  )

  # Attach the metadata that the later stages read back from this list.
  result[["dTheta"]] <- .simAnc_parse_dTheta(args)
  result[["seed"]] <- as.integer(seed)
  result[["ploidy"]] <- as.integer(ploidy)
  result[["inbred"]] <- isTRUE(inbred)
  result[["stage"]] <- "simAnc"
  result
}

#' Add Mutations to Ancestry TS Tables
#'
#' @param x List returned by `simAnc`, or a list of table-collection pointers.
#' @param dTheta Optional scalar/vector mutation-rate parameter in MaCS units.
#' @param seed Optional scalar/vector integer seeds for mutation sampling.
#'
#' @return List with mutated table collections and metadata.
#' @keywords internal
#' @noRd
simMut <- function(x, dTheta = NULL, seed = NULL) {
  # `x` is either a metadata bundle carrying `tables`, or the tables themselves.
  is_bundle <- is.list(x) && !is.null(x$tables)
  tables <- if (is_bundle) x$tables else x
  if (!is.list(tables) || length(tables) == 0L) {
    stop("simMut requires a non-empty list of table collections")
  }
  nChr <- length(tables)

  # Mutation rate: explicit argument first, then metadata stored on `x`.
  if (is.null(dTheta)) {
    meta_theta <- if (is.list(x)) x$dTheta else NULL
    if (is.null(meta_theta)) {
      stop("dTheta is required when x has no dTheta metadata")
    }
    dTheta <- meta_theta
  }
  if (length(dTheta) == 1L) {
    dTheta <- rep(as.numeric(dTheta), nChr)
  }
  if (length(dTheta) != nChr) {
    stop("dTheta length must be 1 or number of chromosomes")
  }

  # Seeds: explicit argument, else the stored seed plus a fixed offset,
  # else fresh random draws.
  if (is.null(seed)) {
    meta_seed <- if (is.list(x)) x$seed else NULL
    seed <- if (is.null(meta_seed)) {
      sample.int(1e8, nChr)
    } else {
      as.integer(meta_seed) + 104729L
    }
  }
  if (length(seed) == 1L) {
    seed <- rep(as.integer(seed), nChr)
  }
  if (length(seed) != nChr) {
    stop("seed length must be 1 or number of chromosomes")
  }

  # The return value of the mutator is ignored here.
  # NOTE(review): presumably it updates each table collection in place - confirm.
  for (i in seq_along(tables)) {
    tsMutateTableCollection(tables[[i]], as.numeric(dTheta[[i]]), as.numeric(seed[[i]]))
  }

  result <- if (is_bundle) x else list()
  result[["tables"]] <- tables
  result[["dTheta"]] <- as.numeric(dTheta)
  result[["mutationSeed"]] <- as.integer(seed)
  result[["mutationMode"]] <- "postTs"
  result[["stage"]] <- "simMut"
  result
}

#' Finalize Inbred TS by Duplicating Sample Leaves per Individual
#'
#' @param x List returned by `simAnc`/`simMut`, or a list of table pointers.
#' @param inbred Logical.
#' @param ploidy Integer ploidy.
#'
#' @return List with finalized table collections and metadata.
#' @keywords internal
#' @noRd
finalizeInbredTs <- function(x, inbred = FALSE, ploidy = 2L) {
  # `x` is either a metadata bundle carrying `tables`, or the tables themselves.
  is_bundle <- is.list(x) && !is.null(x$tables)
  tables <- if (is_bundle) x$tables else x
  if (!is.list(tables) || length(tables) == 0L) {
    stop("finalizeInbredTs requires a non-empty list of table collections")
  }

  # Same scalar/NA/positivity check that `simAnc` applies to `nChr`, so bad
  # input gets this message instead of a failing `if` condition.
  ploidy <- as.integer(ploidy)
  if (length(ploidy) != 1L || is.na(ploidy) || ploidy <= 0L) {
    stop("ploidy must be a positive integer")
  }

  # Nothing to expand unless the samples are inbred and not haploid.
  if (isTRUE(inbred) && ploidy > 1L) {
    for (chr in seq_along(tables)) {
      tsFinalizeInbredTableCollection(tables[[chr]], ploidy)
    }
  }

  out <- if (is_bundle) x else list()
  out$tables <- tables
  out$inbred <- isTRUE(inbred)
  out$ploidy <- ploidy
  out$stage <- "finalizeInbredTs"
  out
}

#' Build runMacs-style MaCS Command for TS Workflow
#'
#' @param nInd Integer number of individuals.
#' @param inbred Logical.
#' @param species Character species preset name.
#' @param split Optional split time in generations.
#' @param ploidy Integer ploidy.
#' @param manualCommand Optional user-provided MaCS command tail.
#' @param manualGenLen Optional user-provided chromosome genetic length(s) in Morgan.
#' @param nChr Integer number of chromosomes.
#'
#' @return List with `command`, `genLen`, and `seqLen`.
#' @keywords internal
#' @noRd
.runMacTS_build_command <- function(nInd, inbred, species, split, ploidy,
                                    manualCommand, manualGenLen, nChr) {
  # First token of the command: `nInd` when inbred, otherwise `ploidy * nInd`.
  # A plain `if` (not scalar `ifelse()`) makes an NA `inbred` fail here
  # instead of writing "NA" into the command string.
  popSize <- if (inbred) nInd else ploidy * nInd

  if (!is.null(manualCommand)) {
    if (is.null(manualGenLen)) {
      stop("You must define manualGenLen when using manualCommand")
    }
    command <- paste0(popSize, " ", manualCommand, " -s ")
    genLen <- manualGenLen
  } else {
    species <- toupper(species)
    if (species == "GENERIC") {
      genLen <- 1.0
      Ne <- 100
      speciesParams <- "1E8 -t 1E-5 -r 4E-6"
      speciesHist <- "-eN 0.25 5.0 -eN 2.50 15.0 -eN 25.00 60.0 -eN 250.00 120.0 -eN 2500.00 1000.0"
    } else if (species == "CATTLE") {
      cattleChrSum <- 2.8e9
      cattleChrBp <- cattleChrSum / 30
      recRate <- 9.26e-9
      genLen <- recRate * cattleChrBp
      mutRate <- 9.4e-9
      Ne <- 90
      histNe <- c(120, 250, 350, 1000, 1500, 2000, 2500, 3500, 7000, 10000, 17000, 62000)
      histGen <- c(3, 6, 12, 18, 24, 154, 454, 654, 1754, 2354, 3354, 33154)
      speciesParams <- paste(c(round(cattleChrBp), "-t", mutRate * 4 * Ne, "-r", recRate * 4 * Ne),
                             collapse = " ")
      # Rescale: sizes relative to Ne, times in units of 4 * Ne.
      histNe <- histNe / Ne
      histGen <- histGen / (4 * Ne)
      speciesHist <- NULL
      for (i in seq_along(histNe)) {
        speciesHist <- paste(speciesHist, "-eN", histGen[i], histNe[i])
      }
    } else if (species == "WHEAT") {
      genLen <- 1.43
      Ne <- 50
      speciesParams <- "8E8 -t 4E-7 -r 3.6E-7"
      speciesHist <- "-eN 0.03 1 -eN 0.05 2 -eN 0.10 4 -eN 0.15 6 -eN 0.20 8 -eN 0.25 10 -eN 0.30 12 -eN 0.35 14 -eN 0.40 16 -eN 0.45 18 -eN 0.50 20 -eN 1.00 40 -eN 2.00 60 -eN 3.00 80 -eN 4.00 100 -eN 5.00 120 -eN 10.00 140 -eN 20.00 160 -eN 30.00 180 -eN 40.00 200 -eN 50.00 240 -eN 100.00 320 -eN 200.00 400 -eN 300.00 480 -eN 400.00 560 -eN 500.00 640"
    } else if (species == "MAIZE") {
      genLen <- 2.0
      Ne <- 100
      speciesParams <- "2E8 -t 5E-6 -r 4E-6"
      speciesHist <- "-eN 0.03 1 -eN 0.05 2 -eN 0.10 4 -eN 0.15 6 -eN 0.20 8 -eN 0.25 10 -eN 0.30 12 -eN 0.35 14 -eN 0.40 16 -eN 0.45 18 -eN 0.50 20 -eN 2.00 40 -eN 3.00 60 -eN 4.00 80 -eN 5.00 100"
    } else {
      stop("No rules for species ", species)
    }
    if (is.null(split)) {
      splitI <- ""
      splitJ <- ""
    } else {
      # The sample is divided into two equal halves, so it must be even.
      if (popSize %% 2 != 0) {
        stop("split requires an even number of sampled haplotypes, got ", popSize)
      }
      splitI <- paste(" -I 2", popSize %/% 2, popSize %/% 2)
      splitJ <- paste(" -ej", split / (4 * Ne) + 0.000001, "2 1")
    }
    command <- paste0(popSize, " ", speciesParams, splitI, " ", speciesHist, splitJ, " -s ")
  }

  # A user-supplied genetic length overrides the species preset as well.
  if (!is.null(manualGenLen)) {
    genLen <- manualGenLen
  }
  if (length(genLen) == 1L) {
    genLen <- rep(genLen, nChr)
  }
  if (length(genLen) != nChr) {
    stop("genLen must have length 1 or nChr")
  }

  # The sequence length is the second whitespace/comma separated token.
  tokens <- strsplit(command, "[,[:space:]]+", perl = TRUE)[[1L]]
  tokens <- tokens[nzchar(tokens)]
  if (length(tokens) < 2L) {
    stop("Failed to parse sequence length from command")
  }
  seqLen <- suppressWarnings(as.numeric(tokens[2L]))
  if (!is.finite(seqLen) || seqLen <= 0) {
    stop("Invalid sequence length parsed from command")
  }
  list(command = command, genLen = as.numeric(genLen), seqLen = seqLen)
}

#' High-level TS wrapper parallel to runMacs
#'
#' @param nInd Integer number of individuals to simulate.
#' @param nChr Integer number of chromosomes.
#' @param segSites Optional site-count cap per chromosome (scalar or vector).
#' @param inbred Logical.
#' @param species Species preset used by `runMacs`.
#' @param split Optional population split time in generations.
#' @param ploidy Integer ploidy.
#' @param manualCommand Optional MaCS command tail (advanced users).
#' @param manualGenLen Optional genetic length(s) in Morgan.
#' @param nThreads Optional thread count.
#' @param mutationMode One of `"postTs"`, `"macs"`, `"none"`.
#' @param usePhysicalPositions Logical; TS coordinates in bp if `TRUE`.
#' @param Nref Optional reference `Ne` for time scaling.
#' @param seed Optional integer vector (length 1 or `nChr`) for ancestry.
#' @param mutSeed Optional integer vector (length 1 or `nChr`) for post-TS mutation.
#' @param mutSeedOffset Integer offset used when deriving post-TS mutation seeds.
#' @param siteSamplingSeed Integer seed for `asMapPop` site sampling.
#' @param expandInbredTs Logical; whether to expand inbred TS sample leaves before conversion.
#' @param returnTs Logical; return TS tables and metadata alongside `MapPop`.
#'
#' @return `MapPop` by default; otherwise a list with `pop`, `tables`, and metadata.
#' @keywords internal
#' @noRd
runMacTS <- function(nInd, nChr = 1, segSites = NULL, inbred = FALSE,
                     species = "GENERIC", split = NULL, ploidy = 2L,
                     manualCommand = NULL, manualGenLen = NULL, nThreads = NULL,
                     mutationMode = c("postTs", "macs", "none"),
                     usePhysicalPositions = FALSE, Nref = NA_real_,
                     seed = NULL, mutSeed = NULL, mutSeedOffset = 104729L,
                     siteSamplingSeed = 42L, expandInbredTs = FALSE,
                     returnTs = FALSE) {
  mutationMode <- match.arg(mutationMode)
  nInd <- as.integer(nInd)
  nChr <- as.integer(nChr)
  ploidy <- as.integer(ploidy)

  # Validate the counts before `nChr` is used to clamp the thread count, so
  # NA or non-scalar input gets this message rather than a failing `if`.
  counts_ok <- length(nInd) == 1L && length(nChr) == 1L && length(ploidy) == 1L &&
    !anyNA(c(nInd, nChr, ploidy)) && nInd > 0L && nChr > 0L && ploidy > 0L
  if (!counts_ok) {
    stop("nInd, nChr, and ploidy must be positive integers")
  }
  # This combination is known from the arguments alone; reject it before
  # running the simulation instead of after.
  if (mutationMode == "none" && !isTRUE(returnTs)) {
    stop("mutationMode='none' produces ancestry-only TS with zero sites; set returnTs=TRUE or use mutationMode='postTs'/'macs'.")
  }

  if (is.null(nThreads)) {
    nThreads <- getNumThreads()
  }
  nThreads <- as.integer(nThreads)
  # Never request more threads than there are chromosomes.
  if (nChr < nThreads) {
    nThreads <- nChr
  }

  if (!is.null(segSites)) {
    segSites <- as.integer(segSites)
    if (length(segSites) == 1L) {
      segSites <- rep(segSites, nChr)
    }
    if (length(segSites) != nChr) {
      stop("segSites must have length 1 or nChr")
    }
  }

  setup <- .runMacTS_build_command(
    nInd = nInd,
    inbred = inbred,
    species = species,
    split = split,
    ploidy = ploidy,
    manualCommand = manualCommand,
    manualGenLen = manualGenLen,
    nChr = nChr
  )
  args <- setup$command
  genLen <- setup$genLen
  seqLen <- setup$seqLen

  if (is.null(seed)) {
    seed <- sample.int(n = 1e8, size = nChr)
  }
  seed <- as.integer(seed)
  if (length(seed) == 1L) {
    seed <- rep(seed, nChr)
  }
  if (length(seed) != nChr) {
    stop("seed must have length 1 or nChr")
  }

  if (mutationMode == "macs") {
    # MaCS places the mutations itself.
    runOut <- MaCSTS(
      args = args,
      nChr = nChr,
      inbred = inbred,
      ploidy = ploidy,
      nThreads = nThreads,
      seed = seed,
      usePhysicalPositions = usePhysicalPositions,
      useMacsMut = TRUE,
      Nref = Nref,
      expandInbredSamples = FALSE
    )
  } else {
    runOut <- simAnc(
      args = args,
      nChr = nChr,
      inbred = inbred,
      ploidy = ploidy,
      nThreads = nThreads,
      seed = seed,
      usePhysicalPositions = usePhysicalPositions,
      Nref = Nref
    )
    if (mutationMode == "postTs") {
      if (is.null(mutSeed)) {
        # Derive mutation seeds from the ancestry seeds by a fixed offset.
        mutSeed <- as.integer(seed + as.integer(mutSeedOffset))
      }
      mutSeed <- as.integer(mutSeed)
      if (length(mutSeed) == 1L) {
        mutSeed <- rep(mutSeed, nChr)
      }
      if (length(mutSeed) != nChr) {
        stop("mutSeed must have length 1 or nChr")
      }
      # dTheta is divided by the reported time scale (1 when none is reported).
      timeScale <- if (!is.null(runOut$timeScale)) as.numeric(runOut$timeScale) else 1
      dThetaPost <- as.numeric(runOut$dTheta) / timeScale
      runOut <- simMut(runOut, dTheta = dThetaPost, seed = mutSeed)
    }
  }

  if (isTRUE(expandInbredTs) && isTRUE(inbred) && ploidy > 1L) {
    runOut <- finalizeInbredTs(runOut, inbred = inbred, ploidy = ploidy)
  }

  # Both list-returning exits share one layout; only `pop`/`mutSeed` differ.
  ts_result <- function(pop, mut_seed) {
    list(
      pop = pop,
      tables = runOut$tables,
      args = args,
      seed = seed,
      mutationMode = mutationMode,
      mutSeed = mut_seed,
      usePhysicalPositions = usePhysicalPositions,
      timeScale = if (!is.null(runOut$timeScale)) runOut$timeScale else 1,
      Nref = if (!is.null(runOut$Nref)) runOut$Nref else NA_real_
    )
  }

  if (mutationMode == "none") {
    # Ancestry only: no sites exist, so no MapPop is built.
    return(ts_result(pop = NULL, mut_seed = NA_integer_))
  }

  siteCounts <- vapply(runOut$tables, function(tc_xptr) {
    as.integer(rtsk_table_collection_summary2(tc_xptr)$num_sites)
  }, integer(1))
  if (any(siteCounts <= 0L)) {
    badChr <- which(siteCounts <= 0L)
    stop("No segregating sites on chromosome(s): ",
         paste(badChr, collapse = ", "),
         ". Increase mutation rate or inspect TS via returnTs=TRUE.")
  }

  # One-interval rate map per chromosome: [0, seqLen] when coordinates are
  # physical, [0, 1] otherwise. The rate is chosen so the interval integrates
  # to `genLen`.
  breaks <- if (usePhysicalPositions) {
    rep(list(c(0, seqLen)), nChr)
  } else {
    rep(list(c(0, 1)), nChr)
  }
  rates <- if (usePhysicalPositions) {
    lapply(genLen, function(g) c(g / seqLen))
  } else {
    lapply(genLen, function(g) c(g))
  }

  popOut <- asMapPop(
    chr_info = list(
      tables = runOut$tables,
      breaks = breaks,
      rates = rates
    ),
    ploidy = ploidy,
    inbred = inbred,
    segSites = segSites,
    site_sampling_seed = as.integer(siteSamplingSeed),
    nThreads = nThreads,
    returnMeta = FALSE
  )

  if (!isTRUE(returnTs)) {
    return(popOut)
  }
  ts_result(
    pop = popOut,
    mut_seed = if (!is.null(runOut$mutationSeed)) runOut$mutationSeed else NA_integer_
  )
}

# --- patch metadata preserved from the collapsed diff -----------------------
# diff --git a/dev/alphaSimR2Ts.R b/dev/alphaSimR2Ts.R
# new file mode 100644
# index 00000000..e8b1f704
# --- /dev/null
# +++ b/dev/alphaSimR2Ts.R
# @@ -0,0 +1,275 @@
library(jsonlite)

# Turn one haplotype's recombination history into a segment table.
# Column 1 of `histMat` is read as the origin, column 2 as the first locus of
# each segment; a segment ends one locus before the next starts, and the last
# one ends at `nLoci`.
recHistMatToSegDf <- function(histMat, nLoci) {

  origin <- as.integer(histMat[, 1])
  starts <- as.integer(histMat[, 2])

  ends <- c(starts[-1] - 1L, nLoci)

  data.frame(
    origin = origin,
    locusStart = starts,
    locusEnd = ends,
    stringsAsFactors = FALSE
  )
}


# Collect segment tables for every child / chromosome / haplotype of
# `offspringPop`, annotated with the parent each haplotype came from.
recHistToSegDfWithParents <- function(SP, offspringPop, nLociByChr) {
  childIds <- offspringPop@id
  ped <- SP$pedigree[childIds, , drop = FALSE]

  out <- list()
  k <- 1

  for (childId in childIds) {
    x <- SP$recHist[[childId]]

    motherId <- ped[childId, "mother"]
    fatherId <- ped[childId, "father"]

    for (cc in seq_along(x)) {
      nLoci <- nLociByChr[[cc]]

      haps <- as.vector(x[[cc]])
      nHap <- length(haps)

      for (h in seq_len(nHap)) {
        seg <- recHistMatToSegDf(haps[[h]], nLoci = nLoci)

        # First half of the haplotypes is attributed to the mother, the rest
        # to the father.
        parentId <- if (h <= nHap/2) motherId else fatherId

        seg$childId <- childId
        seg$chr <- cc
        seg$hap <- h
        seg$parentId <- parentId

        seg$parentHap <- seg$origin
        seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap

        out[[k]] <- seg[, c("childId", "hap", "chr",
                            "locusStart","locusEnd",
                            "parentId","parentHap","parentGlobalHapId")]
        k <- k + 1
      }
    }
  }

  do.call(rbind, out)
}

# Build one segment table per population in `simOutput`, skipping the first
# element, and publish the list as the global `bridgeSegDfList`.
# Reads the global `chrKeptPosBpList` for the per-chromosome locus counts.
bridgeCollectSegFromSimOutput <- function(SP, simOutput) {
  bridgeSegDfList <<- list()

  nLociByChr <- lapply(chrKeptPosBpList, length)

  # `2:length(simOutput)` would count down (2, 1) when there is only one
  # element; `seq_along()[-1]` is empty in that case.
  for (k in seq_along(simOutput)[-1]) {
    segDf <- recHistToSegDfWithParents(SP, simOutput[[k]], nLociByChr)
    bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf
  }

  invisible(bridgeSegDfList)
}


# Add `left`/`right` coordinates to a segment table, taken from the kept
# site positions of each chromosome.
# NOTE(review): `tskit` is not defined in this file (only jsonlite is
# attached); it presumably comes from a reticulate import made elsewhere.
segDfToEdgeDfUsingBridge <- function(segDf, chr_info) {
  # segDF: childID, hap, chr, locusStart, locusEnd, origin
  out <- segDf
  out$left <- NA_real_
  out$right <- NA_real_

  for (cc in sort(unique(out$chr))) {
    #posBp <- bridgeEnv$chrKeptPosBpList[[cc]]
    #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.")
    posBp <- chrKeptPosBpList[[cc]]

    tsPath <- chr_info[[cc]]$ts_path
    ts <- tskit$load(tsPath)
    seqLen <- as.numeric(ts$sequence_length)

    idx <- which(out$chr == cc)

    for (i in idx) {
      s <- out$locusStart[i]
      e <- out$locusEnd[i]
      # A segment starting at locus 1 begins at 0. A segment ending at the
      # last kept locus runs to the sequence end; otherwise it runs to the
      # position of the next locus.
      out$left[i] <- if (s == 1) 0 else posBp[s]
      out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen
    }
  }
  out
}

# Same coordinate mapping as above, applied to all segment tables stored in
# the global `bridgeSegDfList`, with the sequence length read via `tc_load()`.
bridgeAllSegToEdgeDf <- function(chr_info) {
  allSeg <- do.call(rbind, bridgeSegDfList)

  out <- allSeg
  out$left <- NA
  out$right <- NA

  for (cc in sort(unique(out$chr))) {
    posBp <- chrKeptPosBpList[[cc]]

    tsPath <- chr_info[[cc]]$ts_path
    tc <- tc_load(tsPath)
    seqLen <- tc$sequence_length()

    idx <- which(out$chr == cc)
    for (i in idx) {
      s <- out$locusStart[i]
      e <- out$locusEnd[i]
      out$left[i] <- if (s == 1) 0 else posBp[s]
      out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen
    }
  }

  out
}

+bridgeComputeIndTime <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTrees <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = "AlphaSimR_extended") { + + indTime <- bridgeComputeIndTime(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- ts_load(chr_info[[cc]]$ts_path) + tc <- ts$dump_tables() + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tc$node_table_get_row(sampNodeId[i])$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. 
", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + # list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + + # add indIDs for offSpring nodes + nextInd <- as.integer(tc$num_individuals()) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tc$individual_table_add_row( + parents = list(as.integer(mRow), as.integer(fRow)), + metadata = charToRaw(toJSON( + list(file_id=as.integer(newId)), + auto_unbox = TRUE))) + + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for (childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tc$node_table_add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = as.character(toJSON( + list(alphaSimR = list(id = key)), + auto_unbox = TRUE, force = TRUE)) + ) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tc$num_nodes() - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for 
key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tc$edge_table_add_row( + left = df$left[i], + right = df$right[i], + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tc$sort() + newTs <- tc$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/dev/alphaSimR2TsGen.R b/dev/alphaSimR2TsGen.R new file mode 100644 index 00000000..d93ee2d6 --- /dev/null +++ b/dev/alphaSimR2TsGen.R @@ -0,0 +1,121 @@ +library(jsonlite) + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP + M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + + + +recHistGenMatToSegDf <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- 
numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + +recHistGenToSegDfWithParents <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + tc <- tc_load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(tc$sequence_length()) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDf(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegGenFromSimOutput <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParents(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/dev/alphaSimR2TsGenPy.R b/dev/alphaSimR2TsGenPy.R new file mode 100644 index 00000000..0a374f58 --- /dev/null +++ b/dev/alphaSimR2TsGenPy.R @@ -0,0 +1,125 @@ +library(reticulate) +library(jsonlite) + +use_virtualenv("~/r-reticulate-env", 
required = TRUE) +tskit <- import("tskit") + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP + M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + +recHistGenMatToSegDfPy <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + + +recHistGenToSegDfWithParentsPy <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + 
for (cc in seq_along(x)) { + ts <- tskit$load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(ts$sequence_length) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDfPy(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + + +bridgeCollectSegGenFromSimOutputPy <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParentsPy(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/dev/alphaSimR2TsPy.R b/dev/alphaSimR2TsPy.R new file mode 100644 index 00000000..a08d2848 --- /dev/null +++ b/dev/alphaSimR2TsPy.R @@ -0,0 +1,367 @@ +library(reticulate) +library(jsonlite) + +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") + +genLogRecord <- function(parentPop, offspringPop, simParam, genIndex) { + list( + genIndex = genIndex, + offspringIds = offspringPop@id, + ibd = pullIbdHaplo(offspringPop, simParam = simParam) + ) +} + +ibdToSegDf <- function(ibdMat) { + # chr_locus + cn <- colnames(ibdMat) + # ind_hap + rn <- rownames(ibdMat) + + chr <- as.integer(sub("_.*$", "", cn)) + locus <- as.integer(sub("^.*_", "", cn)) + + childId <- sub("_.*$", "", rn) + hap <- as.integer(sub("^.*_", "", rn)) + + out <- list() + k <- 1 + + num_hap <- length(hap) + uniqueChr <- sort(unique(chr)) + + for (r in seq_len(num_hap)) { + # each row in ibdMat = every child 
hap + v <- ibdMat[r, ] + for (cc in uniqueChr) { + # get index from chr to extract parent hap (vv) and position (ll) + idx <- which(chr == cc) + vv <- v[idx] + ll <- locus[idx] + + # find breakpoints + chg <- which(diff(vv) != 0) + starts <- c(1, chg + 1) + ends <- c(chg, length(vv)) + + out[[k]] <- data.frame( + childId = childId[r], + hap = hap[r], + chr = cc, + # extract position of ibd change + locusStart = ll[starts], + locusEnd = ll[ends], + origin = vv[starts], + stringsAsFactors = FALSE + ) + k <- k + 1 + } + } + do.call(rbind, out) +} + +recHistMatToSegDfPy <- function(histMat, nLoci) { + + origin <- as.integer(histMat[, 1]) + starts <- as.integer(histMat[, 2]) + + ends <- c(starts[-1] - 1L, nLoci) + + data.frame( + origin = origin, + locusStart = starts, + locusEnd = ends, + stringsAsFactors = FALSE + ) +} + + +recHistToSegDfWithParentsPy <- function(SP, offspringPop, nLociByChr) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHist[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + nLoci <- nLociByChr[[cc]] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistMatToSegDfPy(haps[[h]], nLoci = nLoci) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "locusStart","locusEnd", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegFromSimOutputPy <- function(SP, simOutput) { + bridgeSegDfList <<- list() + + nLociByChr <- lapply(chrKeptPosBpList, length) + + for (k in 2:length(simOutput)) { + segDf <- recHistToSegDfWithParentsPy(SP, simOutput[[k]], 
nLociByChr) + bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf + } + + invisible(bridgeSegDfList) +} + + +segDfToEdgeDfUsingBridge <- function(segDf, chr_info) { + # segDF: childID, hap, chr, locusStart, locusEnd, origin + out <- segDf + out$left <- NA_real_ + out$right <- NA_real_ + + for (cc in sort(unique(out$chr))) { + #posBp <- bridgeEnv$chrKeptPosBpList[[cc]] + #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.") + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- as.numeric(ts$sequence_length) + + idx <- which(out$chr == cc) + + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + out +} + +bridgeAllSegToEdgeDfPy <- function(chr_info) { + allSeg <- do.call(rbind, bridgeSegDfList) + + out <- allSeg + out$left <- NA + out$right <- NA + + for (cc in sort(unique(out$chr))) { + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- ts$sequence_length + + idx <- which(out$chr == cc) + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + + out +} + +bridgeComputeIndTimePy <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTreesPy <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = "AlphaSimR_extended") { + nodeSchema <- tskit$MetadataSchema(list( + codec = "json", + type = "object", + properties = list( + alphaSimR = list( + type = "object", + properties = list( + id = list(type = "string", description = 
"AlphaSimR node id (childId_hap)") + ), + required = list("id"), + additionalProperties = FALSE + ) + ), + required = list("alphaSimR"), + additionalProperties = FALSE + )) + + indTime <- bridgeComputeIndTimePy(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + #nodeIdMap <<- list() + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- tskit$load(chr_info[[cc]]$ts_path) + tables <- ts$dump_tables() + reticulate::py_set_attr(tables$nodes, "metadata_schema", nodeSchema) + + # for metadata + n <- tables$nodes$num_rows + encoded <- vector("list", n) + for (i in 0:(n - 1)) { + md_i <- tables$nodes[i]$metadata + encoded[[i + 1]] <- tables$nodes$metadata_schema$encode_row(md_i) + } + # ---- + + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tables$nodes[sampNodeId[i]]$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. 
", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + #nodeIdMap[[paste(ind, h, sep = "_")]] <<- as.integer(sampNodeId[[idx]]) + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + #tables$nodes$metadata[[nodeId + 1]] <- tskit$pack_bytes(list(alphaSimR = list(id = key))) + #tables$nodes[nodeId + 1] <- tables$nodes[nodeId + 1]$replace(metadata=list(alphaSimR = list(id = key))) + encoded[[nodeId + 1]] <- tables$nodes$metadata_schema$encode_row( + list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + tables$nodes$packset_metadata(encoded) + + # add indIDs for offSpring nodes + nextInd <- as.integer(tables$individuals$num_rows) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tables$individuals$add_row( + parents = list(as.integer(mRow), as.integer(fRow)), + metadata = list(file_id=as.integer(newId))) + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for (childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + #if (is.null(nodeIdMap[[key]])) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tables$nodes$add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = list(alphaSimR = list(id = key)) + ) + 
#nodeIdMap[[key]] <<- as.integer(tables$nodes$num_rows - 1) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tables$nodes$num_rows - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + #if (is.null(nodeIdMap[[key]])) { + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tables$edges$add_row( + left = df$left[i], + right = df$right[i], + #parent = as.integer(nodeIdMap[[parentKey]]), + #child = as.integer(nodeIdMap[[childKey]]) + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tables$sort() + newTs <- tables$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/dev/makeFoundersFromTs.R b/dev/makeFoundersFromTs.R new file mode 100644 index 00000000..34ec04b0 --- /dev/null +++ b/dev/makeFoundersFromTs.R @@ -0,0 +1,265 @@ +sample_segregating_variants <- function(ts, segSites, seed) { + + # Sample segregating variants from the tree sequence. + # + # Parameters + # ========== + # ts: tskit.TreeSequence + # The tree sequence to sample from. + # segSites: int + # The number of segregating sites to sample. + # seed: int + # The random seed to use for sampling. + # + # Returns + # ======= + # list of int + # The positions of the sampled segregating sites. + # Set the random seed for reproducibility. + set.seed(seed) + num_samples <- as.integer(ts$num_samples()) + + # 2. 
Pre-allocate H matrix and P vector based on required sample size (segSites) + # We only need space for 'segSites' number of variants + H <- matrix(NA_integer_, nrow = num_samples, ncol = segSites) + P <- numeric(segSites) + + it <- ts$variants() + + # k tracks how many biallelic variants we have encountered so far + k <- 0 + # current_size tracks how many variants are currently in our reservoir + current_size <- 0 + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + k <- k + 1 + + if (current_size < segSites) { + # Case A: Reservoir is not full yet + current_size <- current_size + 1 + H[, current_size] <- g + P[current_size] <- v$position + } else { + # Case B: Reservoir is full, use Prob. entry: j/k + # sample.int(k, 1) returns a value from 1 to k + j <- sample.int(k, 1) + + if (j <= segSites) { + # Replace the existing variant at index j + H[, j] <- g + P[j] <- v$position + } + } + } + } + + # 4. Final check: if we found fewer biallelic sites than segSites, trim the output + if (k < segSites) { + if (k > 0) { + H <- H[, 1:k, drop = FALSE] + P <- P[1:k] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + } + + return(list(H = H, P = P)) +} + + +segregating_variants <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- as.integer(ts$num_sites()) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix (Rows: samples, Cols: sites) + # Using integer matrix to save memory (similar to np.int8) + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + # Pre-allocate P vector for positions + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. 
Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites (exactly 2 unique alleles) + if (length(unique(g)) == 2) { + count <- count + 1 + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results to the actual number of kept variants + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + return(list(H = H, P = P)) +} + +segregating_variants_debug <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- ts$num_sites() + num_samples <- ts$num_samples() + + # DEBUG: Print initial metadata + message(paste("Expected max sites:", max_sites)) + message(paste("Expected num samples (from ts):", num_samples)) + + # 2. Pre-allocate H matrix + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # DEBUG: Check dimensions on the first iteration + if (count == 0) { + message(paste("Actual length of genotype vector (g):", length(g))) + message(paste("Matrix H_full has", nrow(H_full), "rows")) + + if (length(g) != nrow(H_full)) { + stop("DIMENSION MISMATCH: The genotype vector length does not match matrix rows!") + } + } + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + count <- count + 1 + + # DEBUG: Check for column overflow + if (count > max_sites) { + stop(paste("INDEX OVERFLOW: count (", count, ") exceeded max_sites (", max_sites, ")")) + } + + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. 
Trim the results + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + message(paste("Success! Final count of biallelic variants:", count)) + return(list(H = H, P = P)) +} + +# rec map used in msprime: +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +ts2chrData <- function(ts_path, breaks, rates, segSites, site_sampling_seed) { + ts = ts_load(ts_path) + num_pos <- ts$num_sites() + + if (!is.null(segSites)) { + + if (num_pos < segSites) { + stop("Insufficient sites (only ", num_pos, " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- sample_segregating_variants(ts, segSites, site_sampling_seed) + + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + } + else { + out <- segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ordPos <- order(pos) + + pos <- pos[ordPos] + + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + ordMap <- order(mpos) + mpos <- mpos[ordMap] + pos <- pos[ordMap] + H <- H[, ordMap, drop = FALSE] + + + list( + genMap = list(mpos), + haplotypes = list(H), + keptPosBp = pos + ) +} + +asMapPop <- function(chr_info, ploidy = 2, inbred = FALSE, segSites = NULL, site_sampling_seed = 42) { + ploidy <<- ploidy + chr_data <- 
lapply(chr_info, function(info) { + ts2chrData( + ts_path = info$ts_path, + breaks = info$breaks, + rates = info$rates, + segSites = info$segSites, + site_sampling_seed = site_sampling_seed + ) + }) + + # save pos in bp for tskit tables + chrKeptPosBpList <<- lapply(chr_data, `[[`, "keptPosBp") + + genMap <- do.call(c, lapply(chr_data, `[[`, "genMap")) + haplotypes <- do.call(c, lapply(chr_data, `[[`, "haplotypes")) + + newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy) +} diff --git a/dev/makeFoundersFromTsPy.R b/dev/makeFoundersFromTsPy.R new file mode 100644 index 00000000..de6f52cf --- /dev/null +++ b/dev/makeFoundersFromTsPy.R @@ -0,0 +1,152 @@ +#library(reticulate) +library(jsonlite) + +#use_virtualenv("~/r-reticulate-env", required = TRUE) +#msprime <- import("msprime") +#tskit <- import("tskit") + +reticulate::py_run_string(" +import numpy as np + +def sample_segregating_variants(ts, segSites, seed): + rng = np.random.default_rng(int(seed)) + + kept_pos = [] + kept_g = [] + k = 0 + + for v in ts.variants(): + g = v.genotypes + + if len(np.unique(g)) != 2: + continue + + # reservoir sampling + k += 1 + if len(kept_g) < segSites: + kept_g.append(g.copy()) + kept_pos.append(v.site.position) + else: + # Prob. 
entry: j/k + j = rng.integers(0, k) + if j < segSites: + kept_g[j] = g.copy() + kept_pos[j] = v.site.position + + H = np.stack(kept_g, axis=1).astype(np.int8) + P = np.array(kept_pos, dtype=float) + return H, P +") + +reticulate::py_run_string(" +import numpy as np + +def segregating_variants(ts): + + kept_pos = [] + kept_g = [] + + for v in ts.variants(): + g = v.genotypes + + if len(np.unique(g)) != 2: + continue + + kept_g.append(g.copy()) + kept_pos.append(v.site.position) + + H = np.stack(kept_g, axis=1).astype(np.int8) + P = np.array(kept_pos, dtype=float) + return H, P +") + + +# rec map used in msprime: +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +ts2chrDataPy <- function(ts_path, breaks, rates, segSites, site_sampling_seed) { + ts = tskit$load(ts_path) + + pos <- ts$tables$sites$position + + if (!is.null(segSites)) { + # stopifnot(length(pos) >= segSites) + if (length(pos) < segSites) { + stop("Insufficient sites (only ", length(pos), " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- py$sample_segregating_variants(ts, segSites, seed=site_sampling_seed) + #if (length(out[[2]]) < segSites) { + # warning("Not enough sites kept after filtering non-biallelic sites.") + #} + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + } + else { + out <- py$segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ordPos <- 
order(pos) + + pos <- pos[ordPos] + + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + ordMap <- order(mpos) + mpos <- mpos[ordMap] + pos <- pos[ordMap] + H <- H[, ordMap, drop = FALSE] + + + list( + genMap = list(mpos), + # haplotypes <- list(H) + haplotypes = list(H), + keptPosBp = pos + ) +} + +asMapPopPy <- function(chr_info, ploidy = 2, inbred = FALSE, segSites = NULL, site_sampling_seed = 42) { + ploidy <<- ploidy + chr_data <- lapply(chr_info, function(info) { + ts2chrDataPy( + ts_path = info$ts_path, + breaks = info$breaks, + rates = info$rates, + segSites = info$segSites, + site_sampling_seed = site_sampling_seed + ) + }) + + # save pos in bp for tskit tables + chrKeptPosBpList <<- lapply(chr_data, `[[`, "keptPosBp") + + genMap <- do.call(c, lapply(chr_data, `[[`, "genMap")) + haplotypes <- do.call(c, lapply(chr_data, `[[`, "haplotypes")) + + newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy) +} diff --git a/dev/msprime_from_macs_scenarios.py b/dev/msprime_from_macs_scenarios.py new file mode 100644 index 00000000..3a5a89d1 --- /dev/null +++ b/dev/msprime_from_macs_scenarios.py @@ -0,0 +1,310 @@ +""" +Generate msprime tree-sequence replicates from MaCS-style scenario strings. 
+ +This script maps a supported subset of MaCS args to msprime demography: + - sample_size sequence_length + -t, -r + -I (with optional global migration parameter) + -eN, -en, -eM, -em, -ej + -s (ignored placeholder) + +Outputs: + 1) .trees files for each scenario/replicate/chromosome + 2) manifest CSV with paths/seeds/basic TS counts +""" + + + +import argparse +import csv +from pathlib import Path + +import msprime + +def parse_macs_args(args): + tok = [x for x in args.strip().split() if x] + if len(tok) < 2: + raise ValueError("MaCS args must start with ' '.") + + macs_arg = {'sample_size': int(tok[0]), + 'sequence_length': int(tok[1]), + 'num_pops': 1, + 'pop_samples':[], + 'events':[]} + + i = 2 + n = len(tok) + while i < n: + flag = tok[i] + if flag == "-t": + macs_arg['theta'] = float(tok[i + 1]) + i += 2 + elif flag == "-r": + macs_arg['rec'] = float(tok[i + 1]) + i += 2 + elif flag == "-I": + k = int(tok[i + 1]) + pop_samples = [int(tok[i + 2 + j]) for j in range(k)] + j = i + 2 + k + mig = 0.0 + if j < n and not tok[j].startswith("-"): + mig = float(tok[j]) + j += 1 + macs_arg['num_pops'] = k + macs_arg['pop_samples'] = pop_samples + macs_arg['global_migration'] = mig + i = j + elif flag == "-eN": + macs_arg['events'].append( + {"type": "eN", + "t": float(tok[i + 1]), + "x": float(tok[i + 2])}) + i += 3 + elif flag == "-en": + macs_arg['events'].append( + { + "type": "en", + "t": float(tok[i + 1]), + "pop": int(tok[i + 2]), + "x": float(tok[i + 3]), + } + ) + i += 4 + elif flag == "-eM": + macs_arg['events'].append( + {"type": "eM", + "t": float(tok[i + 1]), + "M": float(tok[i + 2])}) + i += 3 + elif flag == "-em": + macs_arg['events'].append( + { + "type": "em", + "t": float(tok[i + 1]), + "src": int(tok[i + 2]), + "dst": int(tok[i + 3]), + "Mij": float(tok[i + 4]), + } + ) + i += 5 + elif flag == "-ej": + macs_arg['events'].append( + { + "type": "ej", + "t": float(tok[i + 1]), + "src": int(tok[i + 2]), + "dst": int(tok[i + 3]), + } + ) + i += 4 + elif 
flag == "-s": + # In AlphaSimR wrappers this is a seed placeholder. + # Ignore optional value if present. + if i + 1 < n and not tok[i + 1].startswith("-"): + i += 2 + else: + i += 1 + else: + raise ValueError(f"Unsupported MaCS token: {flag}") + + if macs_arg['num_pops'] == 1 and len(macs_arg['pop_samples'])==0: + macs_arg['pop_samples'] = [macs_arg['sample_size']] + + if sum(macs_arg['pop_samples']) != macs_arg['sample_size']: + raise ValueError( + f"-I population samples sum to {sum(macs_arg['pop_samples'])}, " + f"but sample_size is {macs_arg['sample_size']}." + ) + return macs_arg + + +def directed_pairs(pop_names): + return [(src, dst) for src in pop_names for dst in pop_names if src != dst] + + +def scaled_time_to_generations(t_coal, nref): + return t_coal * 4.0 * nref + + +def scaled_rec_to_per_bp(r_arg, nref): + return r_arg / (4.0 * nref) + + +def scaled_mut_to_per_bp(theta, nref): + return theta / (4.0 * nref) + + +def scaled_global_M_to_pairwise_m(M, k_pops, nref): + if k_pops <= 1: + return 0.0 + return (M / (k_pops - 1.0)) / (4.0 * nref) + + +def scaled_pair_Mij_to_m(Mij, nref): + return Mij / (4.0 * nref) + + +def build_demography(macs_arg, nref): + dem = msprime.Demography() + pop_names = [f"p{i + 1}" for i in range(macs_arg['num_pops'])] + for name in pop_names: + dem.add_population(name=str(name), initial_size=nref, initially_active=True) + + if macs_arg['num_pops'] > 1: + m0 = scaled_global_M_to_pairwise_m(macs_arg['global_migration'], macs_arg['num_pops'], nref) + if m0!=0: + for src, dst in directed_pairs(pop_names): + dem.set_migration_rate(str(src), str(dst), m0) + + events = sorted(macs_arg['events'], key=lambda e: e["t"]) + for ev in events: + t_gen = scaled_time_to_generations(ev["t"], nref) + typ = ev["type"] + if typ == "eN": + size = ev["x"] * nref + for name in pop_names: + dem.add_population_parameters_change(time=t_gen, population=str(name), initial_size=size) + elif typ == "en": + pop_name = f"p{ev['pop']}" + size = ev["x"] * nref + 
dem.add_population_parameters_change(time=t_gen, population=str(pop_name), initial_size=size) + elif typ == "eM": + m = scaled_global_M_to_pairwise_m(ev["M"], macs_arg['num_pops'], nref) + for src, dst in directed_pairs(pop_names): + dem.add_migration_rate_change(time=t_gen, source=str(src), dest=str(dst), rate=m) + elif typ == "em": + src = f"p{ev['src']}" + dst = f"p{ev['dst']}" + m = scaled_pair_Mij_to_m(ev["Mij"], nref) + dem.add_migration_rate_change(time=t_gen, source=str(src), dest=str(dst), rate=m) + elif typ == "ej": + src = f"p{ev['src']}" + dst = f"p{ev['dst']}" + dem.add_population_split(time=t_gen, derived=[str(src)], ancestral=str(dst)) + else: + raise ValueError(f"Unhandled event type: {typ}") + + rec_rate = scaled_rec_to_per_bp(macs_arg['rec'], nref) + mut_rate = scaled_mut_to_per_bp(macs_arg['theta'], nref) + pop_hap = [int(macs_arg["pop_samples"][i]) for i in range(macs_arg["num_pops"])] + if any(n % 2 != 0 for n in pop_hap): + raise ValueError("Cannot map MaCS haploid sample counts to ploidy=2 (odd count present).") + samples = {f"p{i + 1}": pop_hap[i] // 2 for i in range(macs_arg["num_pops"])} + # ploidy = 1 + #samples = {f"p{i + 1}": int(macs_arg['pop_samples'][i]) for i in range(macs_arg['num_pops'])} + print(dem, samples, rec_rate) + return dem, samples, rec_rate, mut_rate + + +def default_scenarios(): + return [ + { + "id": 1, + "name": "single_const", + "args": "8 100000 -t 1e-3 -r 1e-4 -s ", + }, + { + "id": 2, + "name": "single_eN", + "args": "8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ", + }, + { + "id": 3, + "name": "I2_migration", + "args": "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ", + }, + { + "id": 4, + "name": "I2_en_join", + "args": "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ", + }, + ] + + +def make_seed(base_seed, scenario_id, rep_id, chr_id): + return int(base_seed + scenario_id * 100000 + rep_id * 1000 + (chr_id - 1)) + + +def run(args): + out_dir = 
Path(args.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + scenarios = default_scenarios() + rows = [] + + for sc in scenarios: + macs_arg = parse_macs_args(sc["args"]) + dem, samples, rec_rate, mut_rate = build_demography(macs_arg, args.nref) + for rep in range(1, args.nrep + 1): + for chr_id in range(1, args.nchr + 1): + seed = make_seed(args.base_seed, int(sc["id"]), rep, chr_id) + ts = msprime.sim_ancestry( + samples=samples, + ploidy=2, + demography=dem, + sequence_length=macs_arg['sequence_length'], + recombination_rate=rec_rate, + model=args.model, + random_seed=seed, + ) + mts = msprime.sim_mutations(ts, rate=mut_rate, random_seed=seed+100) + tree_path = out_dir / f"{sc['name']}_rep{rep:02d}_chr{chr_id:02d}.trees" + mts.dump(tree_path) + + rows.append( + { + "scenario_id": sc["id"], + "scenario": sc["name"], + "rep": rep, + "chr": chr_id, + "args": sc["args"], + "nref": args.nref, + "seed_chr": seed, + "model": args.model, + "sequence_length": macs_arg['sequence_length'], + "rec_rate_bp": rec_rate, + "mut_rate_bp": mut_rate, + "num_trees": ts.num_trees, + "num_nodes": ts.num_nodes, + "num_edges": ts.num_edges, + "num_mutations": mts.num_mutations, + "max_root_time": float(ts.max_root_time), + "tree_path": str(tree_path), + } + ) + if args.verbose: + print( + f"[ok] {sc['name']} rep={rep:02d} chr={chr_id:02d} " + f"trees={ts.num_trees} nodes={ts.num_nodes} edges={ts.num_edges}" + ) + + manifest_path = out_dir / "msprime_manifest.csv" + with manifest_path.open("w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + print(f"Saved manifest: {manifest_path}") + print(f"Saved trees under: {out_dir}") + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("--nrep", type=int, default=50, help="Replicates per scenario.") + p.add_argument("--nchr", type=int, default=1, help="Independent chromosomes 
per replicate.") + p.add_argument("--nref", type=float, default=10000.0, help="Reference Ne for MaCS->msprime scaling.") + p.add_argument("--base-seed", type=int, default=700000, help="Base seed for deterministic seed schedule.") + p.add_argument("--model", type=str, default="smc_prime", help="msprime ancestry model.") + p.add_argument( + "--out-dir", + type=str, + default="testData/out_msprime_from_macs", + help="Output directory for .trees and manifest CSV.", + ) + p.add_argument("--verbose", action="store_true", help="Print progress lines.") + return p + + +if __name__ == "__main__": + parser = build_parser() + ns = parser.parse_args() + run(ns) diff --git a/dev/msprime_macs_scenarios_compare.py b/dev/msprime_macs_scenarios_compare.py new file mode 100644 index 00000000..168c004e --- /dev/null +++ b/dev/msprime_macs_scenarios_compare.py @@ -0,0 +1,198 @@ +from pathlib import Path +import csv +import statistics as st + +def print_phase1_summary( + macsts_manifest="testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv", + msprime_manifest="testData/out_msprime_from_macs/msprime_manifest.csv", + try_load_macsts_trees=True, + comparison_table_path="testData/method_comparison_long.csv", +): + macsts_manifest = Path(macsts_manifest) + msprime_manifest = Path(msprime_manifest) + + def read_csv_rows(path): + with path.open(newline="", encoding="utf-8") as f: + return list(csv.DictReader(f)) + + def mean_sd(xs): + xs = [float(x) for x in xs if x is not None] + if len(xs) == 0: + return float("nan"), float("nan") + if len(xs) == 1: + return xs[0], 0.0 + return st.mean(xs), st.stdev(xs) + + macsts_rows = read_csv_rows(macsts_manifest) + msprime_rows = read_csv_rows(msprime_manifest) + + by_scn_macsts = {} + for r in macsts_rows: + by_scn_macsts.setdefault(r["scenario"], []).append(r) + + by_scn_msprime = {} + for r in msprime_rows: + by_scn_msprime.setdefault(r["scenario"], []).append(r) + + # Optional: load MaCSTS trees for edge/node/tree/root 
stats + macsts_tree_stats = {} + if try_load_macsts_trees: + try: + import tskit + for scn, rows in by_scn_macsts.items(): + e, n, t, h, m = [], [], [], [], [] + for r in rows: + p = Path(r["tree_path"]) + if not p.exists(): + p = macsts_manifest.parent.parent / p # handle relative path in CSV + ts = tskit.load(str(p)) + e.append(ts.num_edges) + n.append(ts.num_nodes) + t.append(ts.num_trees) + h.append(float(ts.max_root_time)) + m.append(ts.num_mutations) + macsts_tree_stats[scn] = { + "num_edges": mean_sd(e), + "num_nodes": mean_sd(n), + "num_trees": mean_sd(t), + "max_root_time": mean_sd(h), + "num_mutations": mean_sd(m) + } + except Exception as ex: + print(f"[note] skipped MaCSTS tree loading: {ex}") + + scenarios = sorted(set(by_scn_macsts) | set(by_scn_msprime)) + + for scn in scenarios: + print(f"\n=== {scn} ===") + + # Mutation summary from MaCS vs MaCSTS manifest + if scn in by_scn_macsts: + d = by_scn_macsts[scn] + macs = [float(r["macs_num_mutations"]) for r in d] + macsts = [float(r["macsts_num_mutations"]) for r in d] + delta = [a - b for a, b in zip(macs, macsts)] + + macs_mean, macs_sd = mean_sd(macs) + macsts_mean, macsts_sd = mean_sd(macsts) + delta_mean, delta_sd = mean_sd(delta) + rel_diff = (delta_mean / macs_mean) if macs_mean != 0 else float("nan") + + print( + f"scenario={scn}, n={len(d)}, " + f"macs_mut_mean={macs_mean:.4f}, macs_mut_sd={macs_sd:.4f}, " + f"macsts_mut_mean={macsts_mean:.4f}, macsts_mut_sd={macsts_sd:.4f}, " + f"mut_diff_mean={delta_mean:.4f}, mut_diff_sd={delta_sd:.4f}, " + f"rel_diff={rel_diff:.6f}" + ) + + # msprime stats from manifest (already has edges/nodes/tree/root) + if scn in by_scn_msprime: + d = by_scn_msprime[scn] + edges = [float(r["num_edges"]) for r in d] + nodes = [float(r["num_nodes"]) for r in d] + trees = [float(r["num_trees"]) for r in d] + roots = [float(r["max_root_time"]) for r in d] + muts = [float(r["num_mutations"]) for r in d] if "num_mutations" in d[0] else [] + + e_m, e_sd = mean_sd(edges) + 
n_m, n_sd = mean_sd(nodes) + t_m, t_sd = mean_sd(trees) + h_m, h_sd = mean_sd(roots) + print( + f"msprime: edges_mean={e_m:.4f}, edges_sd={e_sd:.4f}, " + f"nodes_mean={n_m:.4f}, nodes_sd={n_sd:.4f}, " + f"trees_mean={t_m:.4f}, trees_sd={t_sd:.4f}, " + f"root_time_mean={h_m:.4f}, root_time_sd={h_sd:.4f}" + ) + if muts: + m_m, m_sd = mean_sd(muts) + print(f"msprime: num_mut_mean={m_m:.4f}, num_mut_sd={m_sd:.4f}") + + # Optional MaCSTS tree stats + if scn in macsts_tree_stats: + s = macsts_tree_stats[scn] + print( + f"macsts-ts: edges_mean={s['num_edges'][0]:.4f}, edges_sd={s['num_edges'][1]:.4f}, " + f"nodes_mean={s['num_nodes'][0]:.4f}, nodes_sd={s['num_nodes'][1]:.4f}, " + f"trees_mean={s['num_trees'][0]:.4f}, trees_sd={s['num_trees'][1]:.4f}, " + f"root_time_mean={s['max_root_time'][0]:.4f}, root_time_sd={s['max_root_time'][1]:.4f}," + f"num_mut_mean={s['num_mutations'][0]:.4f}, mut_sd={s['num_mutations'][1]:.4f}" + ) + + # Long-format per-replicate table for downstream comparison + long_rows = [] + macsts_tree_by_key = {} + if try_load_macsts_trees: + try: + for r in macsts_rows: + p = Path(r["tree_path"]) + if not p.exists(): + p = macsts_manifest.parent.parent / p + ts = tskit.load(str(p)) + macsts_tree_by_key[(r["scenario"], int(r["rep"]), int(r["chr"]))] = { + "num_trees": ts.num_trees, + "num_edges": ts.num_edges, + "num_nodes": ts.num_nodes, + "max_root_time": float(ts.max_root_time), + } + except Exception as ex: + print(f"[note] could not enrich macsTS tree stats in long table: {ex}") + + for r in macsts_rows: + key = (r["scenario"], int(r["rep"]), int(r["chr"])) + ts_stats = macsts_tree_by_key.get(key, {}) + long_rows.append({ + "Scenarios": r["scenario"], + "Methods": "macs", + "rep_index": int(r["rep"]), + "num_mut": int(r["macs_num_mutations"]), + "num_trees": "NA", + "num_edges": "NA", + "num_nodes": "NA", + "max_root_time": "NA", + }) + long_rows.append({ + "Scenarios": r["scenario"], + "Methods": "macsTS", + "rep_index": int(r["rep"]), + "num_mut": 
int(r["macsts_num_mutations"]), + "num_trees": ts_stats.get("num_trees", "NA"), + "num_edges": ts_stats.get("num_edges", "NA"), + "num_nodes": ts_stats.get("num_nodes", "NA"), + "max_root_time": ts_stats.get("max_root_time", "NA"), + }) + + for r in msprime_rows: + long_rows.append({ + "Scenarios": r["scenario"], + "Methods": "msprime", + "rep_index": int(r["rep"]), + "num_mut": int(float(r["num_mutations"])) if "num_mutations" in r else "NA", + "num_trees": int(float(r["num_trees"])) if "num_trees" in r else "NA", + "num_edges": int(float(r["num_edges"])) if "num_edges" in r else "NA", + "num_nodes": int(float(r["num_nodes"])) if "num_nodes" in r else "NA", + "max_root_time": float(r["max_root_time"]) if "max_root_time" in r else "NA", + }) + + comparison_table_path = Path(comparison_table_path) + comparison_table_path.parent.mkdir(parents=True, exist_ok=True) + with comparison_table_path.open("w", newline="", encoding="utf-8") as f: + w = csv.DictWriter( + f, + fieldnames=[ + "Scenarios", + "Methods", + "rep_index", + "num_mut", + "num_trees", + "num_edges", + "num_nodes", + "max_root_time", + ], + ) + w.writeheader() + w.writerows(long_rows) + print(f"\nSaved long comparison table: {comparison_table_path}") + +print_phase1_summary() diff --git a/dev/multiple_chr.py b/dev/multiple_chr.py new file mode 100644 index 00000000..e6d9d91a --- /dev/null +++ b/dev/multiple_chr.py @@ -0,0 +1,54 @@ +import io +import msprime +from pathlib import Path + +output_path = "/Users/jliang2/Projects/test_TSK2ASR/data/simulations/normal" +Path(output_path).mkdir(parents=True, exist_ok=True) + +mut_rate = 1.25e-8 +mut_random_seed = 5678 + +ped_txt = """\ +# id parent0 parent1 time is_sample +0 2 3 0.0 1 +1 4 5 0.0 1 +2 6 7 1.0 0 +3 8 9 1.0 0 +4 6 7 1.0 0 +5 10 11 1.0 0 +6 . . 2.0 0 +7 . . 2.0 0 +8 . . 2.0 0 +9 . . 2.0 0 +10 . . 2.0 0 +11 . . 
2.0 0 +""" + +Ls = [1000000, 2000000, 3000000] +#rs = [1e-8, 2e-8, 3e-8] +rate=[1e-7, 1e-8, 1e-7] + + +ts_chroms = [] +pedigree = msprime.parse_pedigree(io.StringIO(ped_txt), sequence_length=1) + +for i in range(len(Ls)): + pedigree.sequence_length = Ls[i] + rate_map = msprime.RateMap( + position=[0, round(Ls[i] / 3), 2 * round(Ls[i] / 3), Ls[i]], + rate=rate) + + ped_ts = msprime.sim_ancestry( + initial_state=pedigree, model="fixed_pedigree", + recombination_rate=rate_map, random_seed=i+1) + + ts_chroms.append( + msprime.sim_ancestry( + initial_state=ped_ts, population_size=1000, + recombination_rate=rate_map, model="dtwf", random_seed=i+100)) + +for i, ts in enumerate(ts_chroms): + print(f"chromosome {i} has length {ts.sequence_length} and {ts.num_trees} trees") + tree_seq_mut = msprime.sim_mutations(ts, rate=mut_rate, random_seed=mut_random_seed) + tree_seq_mut_tree_result = output_path + f"/msprime_chr{i}.trees" + tree_seq_mut.dump(tree_seq_mut_tree_result) diff --git a/dev/notes.md b/dev/notes.md new file mode 100644 index 00000000..17c2bb39 --- /dev/null +++ b/dev/notes.md @@ -0,0 +1,121 @@ +How I set up an R package using the tskit C API: + +1. Create a basic R package structure (in an existing R project folder) + +I removed other files and created a package directory with: +``` +AlphaSimRTmp/ + DESCRIPTION + R/ + src/ +``` + +(R will not recognise the directory as a package unless DESCRIPTION, R/, and src/ all exist) + +2. Write a minimal DESCRIPTION + +I added a minimal DESCRIPTION file. + +``` +Package: AlphaSimRTmp +Type: Package +Version: 0.0.1 +Imports: Rcpp +LinkingTo: Rcpp +``` + +3. 
Vendor tskit and kastore (only C API files) + +Tskit and kastore are from: https://github.com/tskit-dev/tskit/archive/refs/tags/1.0.0.zip +Inside src/, I created a deps/ directory and copied in only the C API parts: + +``` +src/deps/ + tskit/ + kastore/ + tskit.h +``` +tskit folder from: tskit-1.0.0/c/tskit +kastore folder from: tskit-1.0.0/c/subprojects/kastore +tskit.h from: tskit-1.0.0/c/tskit.h + +(I did not copy meson.build, examples, Python code, documentation, etc.) + +4. Create a minimal C++ test file + +I added src/minimal.cpp with: + +#include + +two exported Rcpp functions: + +one to report the tskit version + +one smoke test that loads a .trees file using +tsk_table_collection_init → load → tsk_treeseq_init + +5. Create an initial NAMESPACE so Rcpp::compileAttributes() could run + +Before anything would compile, I created a minimal NAMESPACE in R: + +``` +writeLines(c( + "useDynLib(AlphaSimRTmp, .registration=TRUE)", + "importFrom(Rcpp, evalCpp)" +), "NAMESPACE") +``` + +6. Write Makevars to compile vendored C code + +In src/Makevars, I: + +added include paths for deps, deps/tskit, and deps/kastore; + +explicitly listed all tskit and kastore .c files; + +added custom rules to compile .c files in subdirectories; + +linked the resulting .o files into the package shared library. + +7. Generate Rcpp and roxygen outputs + +From the package root, in R: +``` +Rcpp::compileAttributes() +roxygen2::roxygenise() +``` +After this, R/RcppExports.R and src/RcppExports.cpp were created, but the minimal handwritten NAMESPACE was not replaced. + +8. Add zzz.R so roxygen2 can generate a correct NAMESPACE + +Before generating documentation, I created R/zzz.R with the following contents: +``` +#' @useDynLib AlphaSimRTmp, .registration = TRUE +#' @importFrom Rcpp evalCpp +NULL +``` + +This ensures that roxygen2 writes the required useDynLib() and importFrom(Rcpp, evalCpp) entries into NAMESPACE. + +9. 
I removed the handwritten NAMESPACE and regenerated it by running `roxygen2::roxygenise()` again. + +10. Clean install the package (optional; only needed to work around lazy-load database errors caused by repeated installs) + +``` +rm -rf ~/Library/R/arm64/4.5/library/AlphaSimRTmp +rm -rf ~/Library/R/arm64/4.5/library/00LOCK-AlphaSimRTmp +R CMD INSTALL --preclean AlphaSimRTmp +``` + +11. Test + +In a fresh R session: +``` +> library(AlphaSimRTmp) +> tskit_version_test() +major minor patch + 1 3 0 +> +> tskit_smoke_load_free("...Projects/test_TSK2ASR/data/simulations/normal/msprime_chr1.trees") +[1] 1 +``` diff --git a/dev/notesRealBreakpoints.md b/dev/notesRealBreakpoints.md new file mode 100644 index 00000000..29eaa63f --- /dev/null +++ b/dev/notesRealBreakpoints.md @@ -0,0 +1,319 @@ +notesRealBreakpoints +================ +2026-02-18 + +## Test data + +Use `AlphaSimR_test/dev/multiple_chr.py` to simulate 2 chromosomes and 2 +diploid individuals with a pedigree. Or directly use `msprime_chr0.trees` and +`msprime_chr1.trees` in `AlphaSimR_test/dev/testData`. + +## Try this version + +In this version, a recHistGen object (similar to recHist) was added to +record the real recombination breakpoints. 
+ +### Load tree sequences + + library(AlphaSimR) + use_virtualenv("~/r-reticulate-env", required = TRUE) + tskit <- import("tskit") + devtools::load_all() + + # two chromosomes + L1 <- 1e6 + L2 <- 2e6 + # here, use the same recombination map as used in msprime + chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) + ) + + founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + +### Run AlphaSimR + + set.seed(42) + SP = SimParam$new(founderGenomes1) + SP$setSexes("yes_sys") + SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + + SP$setTrackPed(TRUE) + # try the new function here, it automatically set setTrackRec also. + SP$setTrackRecGen(TRUE) + basePop = newPop(founderGenomes1) + + # the 2 objects are same now: + SP$recHistGen + SP$recHist + + basePop = setPheno(basePop, + h2 = 0.5) + + #--- n generations + nCycles<-2 + + # very simple container for each cycles sim output + simOutput<-list(basePop) + cycle<-1 + for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop + } + +Now we can see the difference between recHist and recHistGen: + + RHG <- SP$recHistGen + RH <- SP$recHist + # ind 3; chr 2; hap 1. 
Maybe not the same output, please check RHG and RH to find a hap with recombination + rh <- RH[[3]][[2]][[1]] + rhg <- RHG[[3]][[2]][[1]] + gm <- SP$genMap[[2]] + +Col 1: original Hap; Col2: start from where (recHist: index of SNP; +recHistGen: positions in Morgan) + + > rh + [,1] [,2] + [1,] 2 1 + [2,] 1 112 + > rhg + [,1] [,2] + [1,] 2 0.00000000 + [2,] 1 0.09354741 + +So, if everything goes well, rh\[2,2\]-1 \< rhg\[2,2\] \< rh\[2,2\]. We +can check it with genMap (SNP index -\> SNP position in Morgan): + + > gm[[111]] + [1] 0.0923321 + > gm[[112]] + [1] 0.0939209 + +### Collect information for ts tables + + # for RecHist + # bridgeSegDfList store the indexes of SNPs after recombination events + bridgeCollectSegFromSimOutput(SP, simOutput) + # for RecHistGen + # bridgeSegDfListGen store the positions of where recombination happen + bridgeCollectSegGenFromSimOutput(SP, simOutput) + +For RecHist, the indexes of SNPs have to be turned into positions: + + edgeDf <- bridgeAllSegToEdgeDf(chr_info) + +### Write tree files and check + + bridgeWriteTrees(chr_info, edgeDf, SP) + +In python: + + import tskit + origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr0.trees') + marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + + # Statistics: + # chr1 + origin.num_trees + 298 + marker_ts.num_trees + 298 + # chr 1 is too short for new recombination events. But 40 new nodes (2 x 20 ind) added. 
+ origin.num_nodes + 260 + marker_ts.num_nodes + 300 + + # chr2 + origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr1.trees') + marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr1.trees') + # now here are new recombination events: + origin.num_trees + 620 + marker_ts.num_trees + 623 + # and still 40 new nodes: + origin.num_nodes + 491 + marker_ts.num_nodes + 531 + +We can plot the pedigree by (information in individual table): + + from matplotlib import pyplot as plt + import networkx as nx + import tskit + def draw_pedigree(ped_ts): + G = nx.DiGraph() + for ind in ped_ts.individuals(): + time = ped_ts.node(ind.nodes[0]).time + pop = ped_ts.node(ind.nodes[0]).population + G.add_node(ind.id, time=time, population=pop) + for p in ind.parents: + if p != tskit.NULL: + G.add_edge(ind.id, p) + pos = nx.multipartite_layout(G, subset_key="time", align="horizontal") + colours = plt.rcParams['axes.prop_cycle'].by_key()['color'] + node_colours = [colours[node_attr["population"]] for node_attr in G.nodes.values()] + nx.draw_networkx(G, pos, with_labels=True, node_color=node_colours) + plt.show() + + draw_pedigree(origin) + +![](../man/figures/originInd.png) + +The new individuals added: + + draw_pedigree(marker_ts) + +![](../man/figures/addInd.png) + +For RecHistGen, the positions were stored in bridgeSegDfListGen and can +be directly used: + + bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) + +### More recombinations? + +Let’s use the same msprime .tree files, but set higher recombination +rates to see the difference between recHist and recHistGen when there +are double crossing over between 2 sampled SNPs. 
+ + L1 <- 1e6 + L2 <- 2e6 + chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) + ) + + founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + set.seed(42) + SP2 = SimParam$new(founderGenomes2) + SP2$setSexes("yes_sys") + SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + + SP2$setTrackPed(TRUE) + # try the new function here, it automatically set setTrackRec also. + SP2$setTrackRecGen(TRUE) + basePop2 = newPop(founderGenomes2, simParam = SP2) + basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + + #--- n generations + nCycles<-2 + + # very simple container for each cycles sim output + simOutput2<-list(basePop2) + cycle<-1 + for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop + } + +check ind 3; chr 1; hap 1: + + RHG <- SP2$recHistGen + RH <- SP2$recHist + gm <- SP2$genMap[[1]] + + rh <- RH[[3]][[1]][[1]] + rhg <- RHG[[3]][[1]][[1]] + +Now we can see 10 more recombination events in recHistGen: + + > rh + [,1] [,2] + [1,] 2 1 + [2,] 1 8 + [3,] 2 36 + [4,] 1 44 + [5,] 2 47 + [6,] 1 50 + [7,] 2 52 + > rhg + [,1] [,2] + [1,] 2 0.000000 + [2,] 1 1.306539 + [3,] 2 2.243464 + [4,] 1 2.423857 + [5,] 2 2.608929 + [6,] 1 2.831431 + [7,] 2 4.993668 + [8,] 1 5.958832 + [9,] 2 
6.238179 + [10,] 1 7.697307 + [11,] 2 7.882583 + [12,] 1 8.537422 + [13,] 2 9.214039 + [14,] 1 9.696815 + [15,] 2 10.348671 + [16,] 1 11.282477 + [17,] 2 12.080616 + +To see where the recombination events (between which SNPs) recorded in +recHistGen: + + > x <- rhg[,2] + > + > left <- findInterval(x, gm) + > right <- pmin(left + 1, length(gm)) + > + > out <- data.frame( + + x = x, + + left_i = left, + + left_v = gm[left], + + right_i = right, + + right_v = gm[right] + + ) + > + > out + x left_i left_v right_i right_v + 1 0.000000 1 0.00000 2 0.00609 + 2 1.306539 7 1.06375 8 1.36458 + 3 2.243464 10 1.77918 11 2.51330 + 4 2.423857 10 1.77918 11 2.51330 + 5 2.608929 12 2.57886 13 2.86640 + 6 2.831431 12 2.57886 13 2.86640 + 7 4.993668 35 4.64742 36 5.39801 + 8 5.958832 38 5.73197 39 6.43739 + 9 6.238179 38 5.73197 39 6.43739 + 10 7.697307 41 7.61275 42 8.31825 + 11 7.882583 41 7.61275 42 8.31825 + 12 8.537422 43 8.51587 44 8.67657 + 13 9.214039 46 8.85817 47 9.52313 + 14 9.696815 47 9.52313 48 10.44305 + 15 10.348671 47 9.52313 48 10.44305 + 16 11.282477 49 10.92943 50 11.29281 + 17 12.080616 51 11.94669 52 12.37887 + +The double crossing overs between 2 sampled SNPs (e.g. rows 3 & 4; 5 & +6; 8 & 9; 10 & 11; 14 & 15) were ignored by recHist but kept by +recHistGen. diff --git a/dev/notesRealBreakpoints.rmd b/dev/notesRealBreakpoints.rmd new file mode 100644 index 00000000..05ed8680 --- /dev/null +++ b/dev/notesRealBreakpoints.rmd @@ -0,0 +1,321 @@ +--- +title: "notesRealBreakpoints" +output: + github_document: + output_file: ../README.md +date: "2026-02-18" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Test data + +Use `AlphaSimR_test/dev/multiple_chr.py` to simulate 2 chromosomes and 2 dip individuals with pedigree. Or directly use `msprime_chr0.trees` and `msprime_chr1.trees` in `AlphaSimR_test/dev/testData`. 
+ +## Try this version + +In this version, a recHistGen object (similar to recHist) was added to record the real recombination breakpoints. + +### Load tree sequences + +``` +library(AlphaSimR) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +devtools::load_all() + +# two chromosomes +L1 <- 1e6 +L2 <- 2e6 +# here, use the same recombination map as used in msprime +chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +``` + +### Run AlphaSimR + +``` +set.seed(42) +SP = SimParam$new(founderGenomes1) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP$setTrackRecGen(TRUE) +basePop = newPop(founderGenomes1) + +# the 2 objects are same now: +SP$recHistGen +SP$recHist + +basePop = setPheno(basePop, + h2 = 0.5) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} +``` +Now we can see the difference between recHist and recHistGen: +``` +RHG <- SP$recHistGen +RH <- SP$recHist +# ind 3; chr 2; hap 1. 
Maybe not the same output, please check RHG and RH to find a hap with recombination +rh <- RH[[3]][[2]][[1]] +rhg <- RHG[[3]][[2]][[1]] +gm <- SP$genMap[[2]] +``` +Col 1: original Hap; Col2: start from where (recHist: index of SNP; recHistGen: positions in Morgan) +``` +> rh + [,1] [,2] +[1,] 2 1 +[2,] 1 112 +> rhg + [,1] [,2] +[1,] 2 0.00000000 +[2,] 1 0.09354741 +``` +So, if everything goes well, rh[2,2]-1 < rhg[2,2] < rh[2,2]. We can check it with genMap (SNP index -> SNP position in Morgan): +``` +> gm[[111]] +[1] 0.0923321 +> gm[[112]] +[1] 0.0939209 +``` + +### Collect information for ts tables + +``` +# for RecHist +# bridgeSegDfList store the indexes of SNPs after recombination events +bridgeCollectSegFromSimOutput(SP, simOutput) +# for RecHistGen +# bridgeSegDfListGen store the positions of where recombination happen +bridgeCollectSegGenFromSimOutput(SP, simOutput) +``` + +For RecHist, the indexes of SNPs have to be turned into positions: +``` +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +``` +### Write tree files and check +``` +bridgeWriteTrees(chr_info, edgeDf, SP) +``` +In python: +``` +import tskit +origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr0.trees') +marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + +# Statistics: +# chr1 +origin.num_trees +298 +marker_ts.num_trees +298 +# chr 1 is too short for new recombination events. But 40 new nodes (2 x 20 ind) added. 
+origin.num_nodes +260 +marker_ts.num_nodes +300 + +# chr2 +origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr1.trees') +marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr1.trees') +# now here are new recombination events: +origin.num_trees +620 +marker_ts.num_trees +623 +# and still 40 new nodes: +origin.num_nodes +491 +marker_ts.num_nodes +531 +``` +We can plot the pedigree by (information in individual table): +``` +from matplotlib import pyplot as plt +import networkx as nx +import tskit +def draw_pedigree(ped_ts): + G = nx.DiGraph() + for ind in ped_ts.individuals(): + time = ped_ts.node(ind.nodes[0]).time + pop = ped_ts.node(ind.nodes[0]).population + G.add_node(ind.id, time=time, population=pop) + for p in ind.parents: + if p != tskit.NULL: + G.add_edge(ind.id, p) + pos = nx.multipartite_layout(G, subset_key="time", align="horizontal") + colours = plt.rcParams['axes.prop_cycle'].by_key()['color'] + node_colours = [colours[node_attr["population"]] for node_attr in G.nodes.values()] + nx.draw_networkx(G, pos, with_labels=True, node_color=node_colours) + plt.show() + +draw_pedigree(origin) +``` +![](../man/figures/originInd.png) + +The new individuals added: +``` +draw_pedigree(marker_ts) +``` +![](../man/figures/addInd.png) + +For RecHistGen, the positions were stored in bridgeSegDfListGen and can be directly used: +``` +bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +``` +### More recombinations? +Let's use the same msprime .tree files, but set higher recombination rates to see the difference between recHist and recHistGen when there are double crossing over between 2 sampled SNPs. 
+``` +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +set.seed(42) +SP2 = SimParam$new(founderGenomes2) +SP2$setSexes("yes_sys") +SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP2$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP2$setTrackRecGen(TRUE) +basePop2 = newPop(founderGenomes2, simParam = SP2) +basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput2<-list(basePop2) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop +} +``` +check ind 3; chr 1; hap 1: +``` +RHG <- SP2$recHistGen +RH <- SP2$recHist +gm <- SP2$genMap[[1]] + +rh <- RH[[3]][[1]][[1]] +rhg <- RHG[[3]][[1]][[1]] +``` +Now we can see 10 more recombination events in recHistGen: +``` +> rh + [,1] [,2] +[1,] 2 1 +[2,] 1 8 +[3,] 2 36 +[4,] 1 44 +[5,] 2 47 +[6,] 1 50 +[7,] 2 52 +> rhg + [,1] [,2] + [1,] 2 0.000000 + [2,] 1 1.306539 + [3,] 2 2.243464 + [4,] 1 2.423857 + [5,] 2 2.608929 + [6,] 1 2.831431 + [7,] 2 4.993668 + [8,] 1 5.958832 + [9,] 2 6.238179 +[10,] 1 
7.697307 +[11,] 2 7.882583 +[12,] 1 8.537422 +[13,] 2 9.214039 +[14,] 1 9.696815 +[15,] 2 10.348671 +[16,] 1 11.282477 +[17,] 2 12.080616 +``` +To see where the recombination events (between which SNPs) recorded in recHistGen: +``` +> x <- rhg[,2] +> +> left <- findInterval(x, gm) +> right <- pmin(left + 1, length(gm)) +> +> out <- data.frame( ++ x = x, ++ left_i = left, ++ left_v = gm[left], ++ right_i = right, ++ right_v = gm[right] ++ ) +> +> out + x left_i left_v right_i right_v +1 0.000000 1 0.00000 2 0.00609 +2 1.306539 7 1.06375 8 1.36458 +3 2.243464 10 1.77918 11 2.51330 +4 2.423857 10 1.77918 11 2.51330 +5 2.608929 12 2.57886 13 2.86640 +6 2.831431 12 2.57886 13 2.86640 +7 4.993668 35 4.64742 36 5.39801 +8 5.958832 38 5.73197 39 6.43739 +9 6.238179 38 5.73197 39 6.43739 +10 7.697307 41 7.61275 42 8.31825 +11 7.882583 41 7.61275 42 8.31825 +12 8.537422 43 8.51587 44 8.67657 +13 9.214039 46 8.85817 47 9.52313 +14 9.696815 47 9.52313 48 10.44305 +15 10.348671 47 9.52313 48 10.44305 +16 11.282477 49 10.92943 50 11.29281 +17 12.080616 51 11.94669 52 12.37887 +``` +The double crossing overs between 2 sampled SNPs (e.g. rows 3 & 4; 5 & 6; 8 & 9; 10 & 11; 14 & 15) were ignored by recHist but kept by recHistGen. diff --git a/dev/notesTestGrowTsTables.md b/dev/notesTestGrowTsTables.md new file mode 100644 index 00000000..05cd4f5d --- /dev/null +++ b/dev/notesTestGrowTsTables.md @@ -0,0 +1,191 @@ +``` +rm(list = ls()) +``` + +0. load dependencies (already in the R scripts, but if you have different setting plz just do this in your way) +``` +library(reticulate) +library(AlphaSimR) + +use_virtualenv("~/r-reticulate-env", required = TRUE) +msprime <- import("msprime") +tskit <- import("tskit") +``` +1. load functions +``` +devtools::load_all() +#--- or ---- +source("R/makeFoundersFromTs.R") +source("R/alphaSimR2Ts.R") +``` + +2. 
read the .trees files with 2 chromosomes and 2 dip individuals (from msprime) +note: chrKeptPosBpList added, so we know the position and index of sampled SNPs in the original .trees files (alphaSimR only record their index) +``` +L1 <- 1e6 +L2 <- 2e6 + +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + list(ts_path="dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) +) + +founderGenomes <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +``` + +3. run alphaSimR to set the founder genomes and parameters +``` +set.seed(42) +SP = SimParam$new(founderGenomes) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) +SP$setTrackPed(TRUE) +SP$setTrackRec(TRUE) +basePop = newPop(founderGenomes) +basePop = setPheno(basePop, + h2 = 0.5) +``` + +4. run addition 2 generations +``` +#--- n generations +nCycles<-2 + +# keep founderPop and offspringPop in SimOutput +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} +``` + +5. Link recHist (from SP, based on sampled SNPs) with parent-child hap (in tskit positions, from chrKeptPosBpList) +``` +bridgeCollectSegFromSimOutput(SP, simOutput) +``` + +6. make an edge table from bridgeSegDfList +``` +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +``` + +7. 
write tskit tables (nodes and edges) +note1: time of founder generation: 0; time of offspring: time of the youngest parent - 1 +note2: check nodeIdMapByChr for ids of alphaSimR and tskit +note3: n ploidy is from asMapPop, so variable number of ploidy along generations is not allowed +note4: be careful with the metadata in the future (different behaviors between Python and R even with Reticulate) +``` +bridgeWriteTrees(chr_info, edgeDf, SP) +``` + +8. We can see that there is no new recombination break points in chr1, let's play with chr2 + +``` +# in Python: +import tskit +ts0 = tskit.load('.../dev/testData/msprime_chr1.trees') +ts1 = tskit.load('.../dev/testData/_AlphaSimR_extended_chr1.trees') +``` + +From edgeDf, there's a breakpoint at 1549443 +``` +ts0_1549443 = ts0.at(1549442) +ts0_1549444 = ts0.at(1549443) +ts1_1549443 = ts1.at(1549442) +ts1_1549444 = ts1.at(1549443) +``` +Same tree in the original file: +``` +print(ts0_1549443.draw_text()) +``` +``` +Output: + 182 + ┏━┻━┓ + 32 ┃ + ┏┻━┓ ┃ +12 1621 +┏┻┓ ┃ ┃ +0 2 1 3 +``` + +``` +print(ts0_1549444.draw_text()) +``` +``` +Output: + 182 + ┏━┻━┓ + 32 ┃ + ┏┻━┓ ┃ +12 1621 +┏┻┓ ┃ ┃ +0 2 1 3 +``` +Different trees in the new file +``` +print(ts1_1549443.draw_text()) +``` +``` +Output: + 182 + ┏━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ 32 + ┃ ┏━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━┓ + 21 16 12 + ┃ ┃ ┏━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━┓ + 3 1 2 0 + ┏━━━┳━━━┳━━━┳┻━━━━┳━━━━━━━┓ ┏━━━┳━━━┳━┻━━━━━━━━━┓ ┏━━━┳━┻━┳━━━┓ ┏━━━┳━━━┳━━━━━━━━━┳━━━┻━━━━━━━━━┳━━━━━━━━━━━┓ +491 493 505 497 499 501 494 504 508 492 495 503 507 509 496 506 510 498 500 502 + ┃ ┏━┻━┓ ┏━┻━┓ ┏━━━┳━━━╋━━━┳━━━┓ ┏━━━┳━┻━┳━━━┓ ┏━━━╋━━━┓ ┏━━━╋━━━┓ + 511 512 520 523 529 522 524 526 528 530 513 515 517 519 514 516 518 521 525 527 +``` +``` +print(ts1_1549444.draw_text()) +``` +``` +Output: + 182 + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + 32 ┃ + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ + 12 16 21 + 
┏━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━┓ ┃ ┃ + 2 0 1 3 + ┏━━━┳━━━╋━━━┳━━━┓ ┏━━━┳━━━┳━━━━━━━━━┳━━━┻━━━━━━━━━┳━━━━━━━━━━━┓ ┏━━━┳━━━┳━┻━━━━━━━━━┓ ┏━━━┳━━━┳━━┻━━┳━━━━━━━┓ +491 495 503 507 509 496 506 510 498 500 502 494 504 508 492 493 505 497 499 501 + ┏━━━┳━┻━┳━━━┓ ┏━━━╋━━━┓ ┏━━━╋━━━┓ ┏━━━┳━━━╋━━━┳━━━┓ ┃ ┏━┻━┓ ┏━┻━┓ + 513 515 517 519 514 516 518 521 525 527 522 524 526 528 530 511 512 520 523 529 + +``` +difference: one of the parent nodes of node 491 (3_1 in alphaSimR) changed from 3 (2_2) to 2 (2_1), the same as edgeDf. + + +New nodes look like: +``` +ts1.tables.nodes[491] +``` +``` +Output: +NodeTableRow(flags=0, time=-1.0, population=-1, individual=-1, metadata={'alphaSimR': {'id': '3_1'}}) +``` +Founder nodes look like: +``` +ts1.tables.nodes[0] +``` +``` +Output: +NodeTableRow(flags=1, time=0.0, population=0, individual=0, metadata={'alphaSimR': {'id': '1_1'}}) +``` diff --git a/dev/reservoir_sampling.Rmd b/dev/reservoir_sampling.Rmd new file mode 100644 index 00000000..a0376b95 --- /dev/null +++ b/dev/reservoir_sampling.Rmd @@ -0,0 +1,51 @@ +#segSites = n +#qualifiedSites = N + +Algorithm: we keep segSites as a list of size $n$. Walking along the chromosome, the first $n$ qualifiedSites are put straight into the list; for each later site (index $i$), we generate a random number $r$ from 0 to $i$: if $r <= n$, site $i$ replaces a randomly chosen entry of the list, otherwise we discard it. + +1. Probability of a site $i$ entering the list +$$ +P(entry) = +\begin{cases} +1, & i <= n, \\ +\frac{n}{i}, & i > n . +\end{cases} +$$ +2. Probability of a site $i$ in the list being replaced at step $j$ $(i<j<=N)$ +$$ +P(being\ replaced)=P(j\ entry)*P(sample\ i\ from\ list) += \frac{n}{j}*\frac{1}{n}=\frac{1}{j} +$$ +3. Probability of a site $i$ in the list NOT being replaced at step $j$ $(i<j<=N)$ +$$ +P(not\ being\ replaced)=1-\frac{1}{j}=\frac{j-1}{j} +$$ +4. Probability of a site $i$ in the list NOT being replaced at the end $(j=N)$ +For $i<=n$ +$$ +P(not\ being\ replaced)=\prod_{j=n+1}^N \frac{j-1}{j}\\ += \frac{n}{n+1}*\frac{n+1}{n+2}*\frac{n+2}{n+3}...\frac{N-1}{N}\\ += \frac{n}{N} +$$ +For $i>n$ +$$ +P(not\ being\ replaced)=\prod_{j=i+1}^N \frac{j-1}{j}\\ += \frac{i}{i+1}*\frac{i+1}{i+2}*\frac{i+2}{i+3}...\frac{N-1}{N}\\ += \frac{i}{N} +$$ +5. Probability of a site $i$ in the list at the end $(j=N)$ +$$ +P(in\ the\ list)=P(entry) * P(not\ being\ replaced) +$$ +$$ +P(in\ the\ list) = +\begin{cases} +1*\frac{n}{N}, & i <= n, \\ +\frac{n}{i}*\frac{i}{N}, & i > n . 
+\end{cases}\\ +=\begin{cases} +\frac{n}{N}, & i <= n, \\ +\frac{n}{N}, & i > n . +\end{cases} +$$ +So, all the qualifiedSites along the chromosome have the same probability being sampled. diff --git a/dev/reservoir_sampling.html b/dev/reservoir_sampling.html new file mode 100644 index 00000000..a98e4da8 --- /dev/null +++ b/dev/reservoir_sampling.html @@ -0,0 +1,452 @@ + + + + + + + + + + + + + +reservoir_sampling.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

#segSites = n #qualifiedSites = N

+

Algorithm: if we make segSites as a list with size \(n\), for the first \(n\) qualifiedSites, along the chromosome, +we just put it in the list; for the others (e.g. index \(i\)), we generate a random number from 0 to +\(i\), if this number is +\(<= n\), the site replaces a randomly chosen entry of the list, otherwise we discard it.

+
    +
  1. Probability of a site \(i\) entering +the list \[ +P(entry) = +\begin{cases} +1, & i <= n, \\ +\frac{n}{i}, & i > n . +\end{cases} +\]
  2. +
  3. Probability of a site \(i\) in the +list being replaced at step \(j\) \((i<j<=N)\) \[ +P(being\ replaced)=P(j\ entry)*P(sample\ i\ from\ list) += \frac{n}{j}*\frac{1}{n}=\frac{1}{j} +\]
  4. +
  5. Probability of a site \(i\) in the +list NOT being replaced at step \(j\) +\((i<j<=N)\) \[ +P(not\ being\ replaced)=1-\frac{1}{j}=\frac{j-1}{j} +\]
  6. +
  7. Probability of a site \(i\) in the +list NOT being replaced at the end \((j=N)\) For \(i<=n\) \[ +P(not\ being\ replaced)=\prod_{j=n+1}^N \frac{j-1}{j}\\ += \frac{n}{n+1}*\frac{n+1}{n+2}*\frac{n+2}{n+3}...\frac{N-1}{N}\\ += \frac{n}{N} +\] For \(i>n\) \[ +P(not\ being\ replaced)=\prod_{j=i+1}^N \frac{j-1}{j}\\ += \frac{i}{i+1}*\frac{i+1}{i+2}*\frac{i+2}{i+3}...\frac{N-1}{N}\\ += \frac{i}{N} +\]
  8. +
  9. Probability of a site \(i\) in the +list at the end \((j=N)\) \[ +P(in\ the\ list)=P(entry) * P(not\ being\ replaced) +\] \[ +P(in\ the\ list) = +\begin{cases} +1*\frac{n}{N}, & i <= n, \\ +\frac{n}{i}*\frac{i}{N}, & i > n . +\end{cases}\\ +=\begin{cases} +\frac{n}{N}, & i <= n, \\ +\frac{n}{N}, & i > n . +\end{cases} +\] So, all the qualifiedSites along the chromosome have the same +probability of being sampled.
  10. +
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/smokeStep1.R b/dev/smokeStep1.R new file mode 100644 index 00000000..f9233b55 --- /dev/null +++ b/dev/smokeStep1.R @@ -0,0 +1,22 @@ +## Step 1 smoke test: read site positions via C API + +library(AlphaSimRTmp) + +tsPath <- "/Users/jliang2/Projects/test_TSK2ASR/data/simulations/normal/msprime_chr1.trees" + +pos <- tsSitesPosition(tsPath) + +cat("numSites =", length(pos), "\n") +cat("firstPositions =", paste(head(pos, 10), collapse = ", "), "\n") +cat("lastPositions =", paste(tail(pos, 3), collapse = ", "), "\n") + +library(reticulate) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +ts <- tskit$load(tsPath) +posPy <- ts$tables$sites$position +posPy <- as.numeric(py_to_r(ts$tables$sites$position)) + +cat("numSites =", length(posPy), "\n") +cat("firstPositions =", paste(head(posPy, 10), collapse = ", "), "\n") +cat("lastPositions =", paste(tail(posPy, 3), collapse = ", "), "\n") diff --git a/dev/smokeStep2.R b/dev/smokeStep2.R new file mode 100644 index 00000000..02ad0889 --- /dev/null +++ b/dev/smokeStep2.R @@ -0,0 +1,158 @@ +library(AlphaSimR) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +devtools::load_all() + +# two chromosomes +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + +set.seed(42) +SP = SimParam$new(founderGenomes1) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP$setTrackPed(TRUE) +# try 
the new function here, it automatically set setTrackRec also. +SP$setTrackRecGen(TRUE) +SP$recHistGen +basePop = newPop(founderGenomes1) +# the 2 objects are same now: +SP$recHistGen +SP$recHist +basePop = setPheno(basePop, + h2 = 0.5) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} + +# see the difference between recHist and recHistGen +RHG <- SP$recHistGen +RH <- SP$recHist +# ind 3; chr 2; hap 1. Maybe not the same output, please check RHG and RH to find a hap with recombination +rh <- RH[[3]][[2]][[1]] +rhg <- RHG[[3]][[2]][[1]] +gm <- SP$genMap[[2]] +rh +rhg +gm[[111]] +gm[[112]] + + +# for RecHist +# bridgeSegDfList store the indexes of SNPs after recombination events +bridgeCollectSegFromSimOutput(SP, simOutput) +# for RecHistGen +# bridgeSegDfListGen store the positions of where recombination happen +bridgeCollectSegGenFromSimOutput(SP, simOutput) + +# for RecHist +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +bridgeWriteTrees(chr_info, edgeDf, SP) +# load the tree in Python... 
+#origin = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees') +#marker_ts = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') +# check the number of trees, nodes, and individual + +# for RecHistGen +bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +# real_break_ts = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + + +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +set.seed(42) +SP2 = SimParam$new(founderGenomes2) +SP2$setSexes("yes_sys") +SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP2$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. 
+SP2$setTrackRecGen(TRUE) +basePop2 = newPop(founderGenomes2, simParam = SP2) +basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput2<-list(basePop2) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop +} + +RHG <- SP2$recHistGen +RH <- SP2$recHist +gm <- SP2$genMap[[1]] + +rh <- RH[[3]][[1]][[1]] +rhg <- RHG[[3]][[1]][[1]] +rh +rhg +x <- rhg[,2] + +left <- findInterval(x, gm) +right <- pmin(left + 1, length(gm)) + +out <- data.frame( + x = x, + left_i = left, + left_v = gm[left], + right_i = right, + right_v = gm[right] +) + +out + + + diff --git a/dev/testData/AlphaSimR_extended_chr0.trees b/dev/testData/AlphaSimR_extended_chr0.trees new file mode 100644 index 00000000..5ef63a20 Binary files /dev/null and b/dev/testData/AlphaSimR_extended_chr0.trees differ diff --git a/dev/testData/AlphaSimR_extended_chr1.trees b/dev/testData/AlphaSimR_extended_chr1.trees new file mode 100644 index 00000000..c7a57384 Binary files /dev/null and b/dev/testData/AlphaSimR_extended_chr1.trees differ diff --git a/dev/testData/MaCSTS.trees b/dev/testData/MaCSTS.trees new file mode 100644 index 00000000..9a3469b9 Binary files /dev/null and b/dev/testData/MaCSTS.trees differ diff --git a/dev/testData/MaCSTS_split.trees b/dev/testData/MaCSTS_split.trees new file mode 100644 index 00000000..f6d1bb80 Binary files /dev/null and b/dev/testData/MaCSTS_split.trees differ diff --git a/dev/testData/inbred_test.trees b/dev/testData/inbred_test.trees new file mode 100644 index 
00000000..ba87d7b7 Binary files /dev/null and b/dev/testData/inbred_test.trees differ diff --git a/dev/testData/method_comparison_long.csv b/dev/testData/method_comparison_long.csv new file mode 100644 index 00000000..f13ef508 --- /dev/null +++ b/dev/testData/method_comparison_long.csv @@ -0,0 +1,601 @@ +Scenarios,Methods,rep_index,num_mut,num_trees,num_edges,num_nodes,max_root_time +single_const,macs,1,292,NA,NA,NA,NA +single_const,macsTS,1,274,20,58,34,130606.67486238344 +single_const,macs,2,266,NA,NA,NA,NA +single_const,macsTS,2,259,20,79,34,66005.3804633436 +single_const,macs,3,193,NA,NA,NA,NA +single_const,macsTS,3,251,27,94,41,97189.61165763761 +single_const,macs,4,298,NA,NA,NA,NA +single_const,macsTS,4,253,25,78,39,65334.22278608841 +single_const,macs,5,330,NA,NA,NA,NA +single_const,macsTS,5,194,12,49,26,40542.349834653956 +single_const,macs,6,278,NA,NA,NA,NA +single_const,macsTS,6,236,15,58,29,99754.70327058528 +single_const,macs,7,291,NA,NA,NA,NA +single_const,macsTS,7,235,19,58,33,81351.6226623958 +single_const,macs,8,193,NA,NA,NA,NA +single_const,macsTS,8,189,20,72,34,45808.648703602405 +single_const,macs,9,111,NA,NA,NA,NA +single_const,macsTS,9,207,14,49,28,52290.36921998231 +single_const,macs,10,337,NA,NA,NA,NA +single_const,macsTS,10,330,29,90,43,108039.19374581998 +single_const,macs,11,335,NA,NA,NA,NA +single_const,macsTS,11,311,30,100,44,113649.96430071855 +single_const,macs,12,306,NA,NA,NA,NA +single_const,macsTS,12,267,21,75,35,51015.734816689255 +single_const,macs,13,417,NA,NA,NA,NA +single_const,macsTS,13,244,25,90,39,62294.86784261862 +single_const,macs,14,434,NA,NA,NA,NA +single_const,macsTS,14,355,39,141,53,85962.47399816292 +single_const,macs,15,233,NA,NA,NA,NA +single_const,macsTS,15,265,24,81,38,52379.8410795243 +single_const,macs,16,285,NA,NA,NA,NA +single_const,macsTS,16,209,16,61,30,51910.67205473233 +single_const,macs,17,205,NA,NA,NA,NA +single_const,macsTS,17,224,16,67,30,37299.09394026202 +single_const,macs,18,215,NA,NA,NA,NA 
+single_const,macsTS,18,269,22,77,36,107270.56774403263 +single_const,macs,19,227,NA,NA,NA,NA +single_const,macsTS,19,348,29,102,43,106109.72033449716 +single_const,macs,20,129,NA,NA,NA,NA +single_const,macsTS,20,242,25,82,39,180889.71939212296 +single_const,macs,21,142,NA,NA,NA,NA +single_const,macsTS,21,198,13,48,27,40733.29310353212 +single_const,macs,22,227,NA,NA,NA,NA +single_const,macsTS,22,270,18,63,32,116571.33270898793 +single_const,macs,23,143,NA,NA,NA,NA +single_const,macsTS,23,181,17,74,31,20853.662115284733 +single_const,macs,24,293,NA,NA,NA,NA +single_const,macsTS,24,312,21,84,35,44483.345315484105 +single_const,macs,25,332,NA,NA,NA,NA +single_const,macsTS,25,234,22,65,36,67424.19620455234 +single_const,macs,26,153,NA,NA,NA,NA +single_const,macsTS,26,239,16,62,30,138458.70842711534 +single_const,macs,27,151,NA,NA,NA,NA +single_const,macsTS,27,279,24,83,38,127154.58609573975 +single_const,macs,28,321,NA,NA,NA,NA +single_const,macsTS,28,174,17,61,31,75246.39881131018 +single_const,macs,29,195,NA,NA,NA,NA +single_const,macsTS,29,409,21,72,35,144267.165835303 +single_const,macs,30,150,NA,NA,NA,NA +single_const,macsTS,30,222,23,85,37,61765.568583130436 +single_const,macs,31,401,NA,NA,NA,NA +single_const,macsTS,31,214,19,68,33,73704.80979141926 +single_const,macs,32,357,NA,NA,NA,NA +single_const,macsTS,32,342,24,87,38,71921.98605015522 +single_const,macs,33,230,NA,NA,NA,NA +single_const,macsTS,33,208,19,65,33,62462.15513139269 +single_const,macs,34,139,NA,NA,NA,NA +single_const,macsTS,34,216,17,61,31,53851.52711376086 +single_const,macs,35,128,NA,NA,NA,NA +single_const,macsTS,35,198,12,46,26,78535.85514919674 +single_const,macs,36,298,NA,NA,NA,NA +single_const,macsTS,36,271,20,78,34,66206.5982829437 +single_const,macs,37,242,NA,NA,NA,NA +single_const,macsTS,37,199,15,60,29,89040.392438991 +single_const,macs,38,177,NA,NA,NA,NA +single_const,macsTS,38,350,22,77,36,75469.88806207768 +single_const,macs,39,213,NA,NA,NA,NA 
+single_const,macsTS,39,265,13,43,27,112632.1812667316 +single_const,macs,40,163,NA,NA,NA,NA +single_const,macsTS,40,272,14,46,28,144548.0409556127 +single_const,macs,41,271,NA,NA,NA,NA +single_const,macsTS,41,171,12,44,26,107499.00379442984 +single_const,macs,42,303,NA,NA,NA,NA +single_const,macsTS,42,213,19,65,33,170059.4997680087 +single_const,macs,43,254,NA,NA,NA,NA +single_const,macsTS,43,131,13,40,27,63943.069447406204 +single_const,macs,44,176,NA,NA,NA,NA +single_const,macsTS,44,305,22,75,36,182669.6538095107 +single_const,macs,45,247,NA,NA,NA,NA +single_const,macsTS,45,262,20,71,34,80752.16600785403 +single_const,macs,46,351,NA,NA,NA,NA +single_const,macsTS,46,385,25,88,39,122260.87848955863 +single_const,macs,47,298,NA,NA,NA,NA +single_const,macsTS,47,212,24,85,38,66341.7417650338 +single_const,macs,48,248,NA,NA,NA,NA +single_const,macsTS,48,272,24,81,38,135496.13544562747 +single_const,macs,49,308,NA,NA,NA,NA +single_const,macsTS,49,266,22,86,36,45448.485984450235 +single_const,macs,50,317,NA,NA,NA,NA +single_const,macsTS,50,253,18,61,32,99598.88751005831 +single_eN,macs,1,160,NA,NA,NA,NA +single_eN,macsTS,1,203,12,36,26,58148.70761784874 +single_eN,macs,2,320,NA,NA,NA,NA +single_eN,macsTS,2,379,34,119,48,73237.30318935837 +single_eN,macs,3,414,NA,NA,NA,NA +single_eN,macsTS,3,329,31,110,45,59987.83427408926 +single_eN,macs,4,340,NA,NA,NA,NA +single_eN,macsTS,4,397,35,115,49,56439.30980824561 +single_eN,macs,5,314,NA,NA,NA,NA +single_eN,macsTS,5,208,17,62,31,61655.10045304916 +single_eN,macs,6,334,NA,NA,NA,NA +single_eN,macsTS,6,400,33,110,47,90116.32855844595 +single_eN,macs,7,182,NA,NA,NA,NA +single_eN,macsTS,7,337,43,153,57,67011.4782614623 +single_eN,macs,8,284,NA,NA,NA,NA +single_eN,macsTS,8,259,14,49,28,43365.41669535897 +single_eN,macs,9,300,NA,NA,NA,NA +single_eN,macsTS,9,340,34,122,48,94502.37456357364 +single_eN,macs,10,354,NA,NA,NA,NA +single_eN,macsTS,10,324,20,63,34,105198.40350853864 +single_eN,macs,11,324,NA,NA,NA,NA 
+single_eN,macsTS,11,264,23,79,37,66312.76351695477 +single_eN,macs,12,564,NA,NA,NA,NA +single_eN,macsTS,12,396,35,121,49,64750.072254029415 +single_eN,macs,13,193,NA,NA,NA,NA +single_eN,macsTS,13,266,23,85,37,62843.40133262198 +single_eN,macs,14,62,NA,NA,NA,NA +single_eN,macsTS,14,122,12,50,26,40806.2636329889 +single_eN,macs,15,372,NA,NA,NA,NA +single_eN,macsTS,15,277,30,105,44,96234.80967745245 +single_eN,macs,16,260,NA,NA,NA,NA +single_eN,macsTS,16,250,29,99,43,56696.10425093216 +single_eN,macs,17,283,NA,NA,NA,NA +single_eN,macsTS,17,286,21,72,35,57343.15317752674 +single_eN,macs,18,241,NA,NA,NA,NA +single_eN,macsTS,18,161,19,71,33,54196.02216942763 +single_eN,macs,19,309,NA,NA,NA,NA +single_eN,macsTS,19,383,25,86,39,76826.81946095033 +single_eN,macs,20,359,NA,NA,NA,NA +single_eN,macsTS,20,327,18,66,32,88701.81044253254 +single_eN,macs,21,292,NA,NA,NA,NA +single_eN,macsTS,21,274,17,65,31,50449.55770401915 +single_eN,macs,22,274,NA,NA,NA,NA +single_eN,macsTS,22,388,31,106,45,70169.78689802613 +single_eN,macs,23,99,NA,NA,NA,NA +single_eN,macsTS,23,168,16,52,30,74425.03144068009 +single_eN,macs,24,296,NA,NA,NA,NA +single_eN,macsTS,24,384,35,128,49,77314.17173828399 +single_eN,macs,25,390,NA,NA,NA,NA +single_eN,macsTS,25,431,36,134,50,58974.239835569344 +single_eN,macs,26,462,NA,NA,NA,NA +single_eN,macsTS,26,469,34,118,48,85835.71488334516 +single_eN,macs,27,219,NA,NA,NA,NA +single_eN,macsTS,27,394,28,105,42,75219.45389647888 +single_eN,macs,28,418,NA,NA,NA,NA +single_eN,macsTS,28,285,26,94,40,66174.77320616253 +single_eN,macs,29,281,NA,NA,NA,NA +single_eN,macsTS,29,251,14,54,28,46879.72773540639 +single_eN,macs,30,286,NA,NA,NA,NA +single_eN,macsTS,30,301,22,77,36,65981.22543811104 +single_eN,macs,31,280,NA,NA,NA,NA +single_eN,macsTS,31,237,22,76,36,58123.1952900851 +single_eN,macs,32,436,NA,NA,NA,NA +single_eN,macsTS,32,284,24,76,38,57339.635819378615 +single_eN,macs,33,379,NA,NA,NA,NA +single_eN,macsTS,33,379,21,76,35,67313.65198213924 
+single_eN,macs,34,177,NA,NA,NA,NA +single_eN,macsTS,34,123,8,36,22,47949.231529550314 +single_eN,macs,35,266,NA,NA,NA,NA +single_eN,macsTS,35,220,11,38,25,62608.54938748934 +single_eN,macs,36,278,NA,NA,NA,NA +single_eN,macsTS,36,389,32,116,46,67328.62744153728 +single_eN,macs,37,304,NA,NA,NA,NA +single_eN,macsTS,37,313,27,94,41,58620.81909963436 +single_eN,macs,38,422,NA,NA,NA,NA +single_eN,macsTS,38,319,31,111,45,71086.09316638751 +single_eN,macs,39,425,NA,NA,NA,NA +single_eN,macsTS,39,285,13,49,27,53742.71646574813 +single_eN,macs,40,443,NA,NA,NA,NA +single_eN,macsTS,40,275,24,84,38,50174.49762119274 +single_eN,macs,41,362,NA,NA,NA,NA +single_eN,macsTS,41,448,27,90,41,62652.78292888323 +single_eN,macs,42,342,NA,NA,NA,NA +single_eN,macsTS,42,129,12,47,26,69844.15714169925 +single_eN,macs,43,319,NA,NA,NA,NA +single_eN,macsTS,43,329,35,124,49,56479.26214014877 +single_eN,macs,44,368,NA,NA,NA,NA +single_eN,macsTS,44,254,23,67,37,101945.72699971082 +single_eN,macs,45,355,NA,NA,NA,NA +single_eN,macsTS,45,418,42,138,56,86714.95866792956 +single_eN,macs,46,356,NA,NA,NA,NA +single_eN,macsTS,46,306,23,89,37,62318.73367420454 +single_eN,macs,47,357,NA,NA,NA,NA +single_eN,macsTS,47,375,28,90,42,57971.59387918696 +single_eN,macs,48,327,NA,NA,NA,NA +single_eN,macsTS,48,342,32,100,46,75423.99618262045 +single_eN,macs,49,438,NA,NA,NA,NA +single_eN,macsTS,49,361,26,90,40,55701.874157323844 +single_eN,macs,50,361,NA,NA,NA,NA +single_eN,macsTS,50,448,40,134,54,62089.414036778006 +I2_migration,macs,1,582,NA,NA,NA,NA +I2_migration,macsTS,1,627,33,110,47,125595.0592035079 +I2_migration,macs,2,539,NA,NA,NA,NA +I2_migration,macsTS,2,505,38,140,52,69051.04466396774 +I2_migration,macs,3,528,NA,NA,NA,NA +I2_migration,macsTS,3,605,48,166,62,139258.2963791596 +I2_migration,macs,4,491,NA,NA,NA,NA +I2_migration,macsTS,4,536,45,157,59,103649.84187421224 +I2_migration,macs,5,549,NA,NA,NA,NA +I2_migration,macsTS,5,458,44,151,58,147820.3717300555 +I2_migration,macs,6,627,NA,NA,NA,NA 
+I2_migration,macsTS,6,467,33,100,47,106171.06252651944 +I2_migration,macs,7,524,NA,NA,NA,NA +I2_migration,macsTS,7,517,41,134,55,173291.5683727674 +I2_migration,macs,8,498,NA,NA,NA,NA +I2_migration,macsTS,8,643,40,123,54,174791.06245591235 +I2_migration,macs,9,596,NA,NA,NA,NA +I2_migration,macsTS,9,551,42,141,56,90119.13525790816 +I2_migration,macs,10,492,NA,NA,NA,NA +I2_migration,macsTS,10,428,31,98,45,95075.75913438266 +I2_migration,macs,11,566,NA,NA,NA,NA +I2_migration,macsTS,11,499,35,127,49,79081.60485874035 +I2_migration,macs,12,541,NA,NA,NA,NA +I2_migration,macsTS,12,494,45,147,59,121775.70943984257 +I2_migration,macs,13,441,NA,NA,NA,NA +I2_migration,macsTS,13,471,28,100,42,78114.97528089151 +I2_migration,macs,14,473,NA,NA,NA,NA +I2_migration,macsTS,14,532,29,103,43,106166.65191412302 +I2_migration,macs,15,607,NA,NA,NA,NA +I2_migration,macsTS,15,493,31,111,45,83665.48344197539 +I2_migration,macs,16,504,NA,NA,NA,NA +I2_migration,macsTS,16,504,39,132,53,103299.11013445014 +I2_migration,macs,17,532,NA,NA,NA,NA +I2_migration,macsTS,17,540,45,156,59,88757.00554271559 +I2_migration,macs,18,622,NA,NA,NA,NA +I2_migration,macsTS,18,522,37,132,51,62917.25814639296 +I2_migration,macs,19,364,NA,NA,NA,NA +I2_migration,macsTS,19,439,36,119,50,101256.01177174292 +I2_migration,macs,20,458,NA,NA,NA,NA +I2_migration,macsTS,20,471,31,107,45,96603.20711064144 +I2_migration,macs,21,459,NA,NA,NA,NA +I2_migration,macsTS,21,489,37,113,51,158103.7816584265 +I2_migration,macs,22,471,NA,NA,NA,NA +I2_migration,macsTS,22,434,23,87,37,71859.57364798218 +I2_migration,macs,23,488,NA,NA,NA,NA +I2_migration,macsTS,23,464,30,108,44,91331.82904515398 +I2_migration,macs,24,500,NA,NA,NA,NA +I2_migration,macsTS,24,476,28,99,42,61123.035132749435 +I2_migration,macs,25,523,NA,NA,NA,NA +I2_migration,macsTS,25,500,27,89,41,124239.0158935418 +I2_migration,macs,26,581,NA,NA,NA,NA +I2_migration,macsTS,26,533,38,130,52,163712.94130790394 +I2_migration,macs,27,637,NA,NA,NA,NA 
+I2_migration,macsTS,27,521,34,115,48,111428.88359968473 +I2_migration,macs,28,500,NA,NA,NA,NA +I2_migration,macsTS,28,538,38,145,52,65652.4919728632 +I2_migration,macs,29,517,NA,NA,NA,NA +I2_migration,macsTS,29,734,56,188,70,141720.26896218708 +I2_migration,macs,30,533,NA,NA,NA,NA +I2_migration,macsTS,30,471,33,106,47,106588.62693968821 +I2_migration,macs,31,525,NA,NA,NA,NA +I2_migration,macsTS,31,565,43,149,57,95128.1296007667 +I2_migration,macs,32,478,NA,NA,NA,NA +I2_migration,macsTS,32,524,30,95,44,167372.02797098926 +I2_migration,macs,33,654,NA,NA,NA,NA +I2_migration,macsTS,33,422,24,86,38,71365.55178817052 +I2_migration,macs,34,405,NA,NA,NA,NA +I2_migration,macsTS,34,505,34,107,48,173942.95068443177 +I2_migration,macs,35,522,NA,NA,NA,NA +I2_migration,macsTS,35,489,29,103,43,84211.49213274148 +I2_migration,macs,36,500,NA,NA,NA,NA +I2_migration,macsTS,36,551,48,160,62,100764.74136591831 +I2_migration,macs,37,551,NA,NA,NA,NA +I2_migration,macsTS,37,571,40,136,54,100509.16382435102 +I2_migration,macs,38,548,NA,NA,NA,NA +I2_migration,macsTS,38,442,29,97,43,71547.32702351622 +I2_migration,macs,39,490,NA,NA,NA,NA +I2_migration,macsTS,39,476,34,113,48,132531.25966989965 +I2_migration,macs,40,607,NA,NA,NA,NA +I2_migration,macsTS,40,464,34,116,48,89430.71699339058 +I2_migration,macs,41,568,NA,NA,NA,NA +I2_migration,macsTS,41,647,33,111,47,136410.0867010556 +I2_migration,macs,42,441,NA,NA,NA,NA +I2_migration,macsTS,42,529,33,112,47,147578.25136597085 +I2_migration,macs,43,533,NA,NA,NA,NA +I2_migration,macsTS,43,591,36,116,50,132280.1214687966 +I2_migration,macs,44,503,NA,NA,NA,NA +I2_migration,macsTS,44,443,29,102,43,85604.37570948259 +I2_migration,macs,45,554,NA,NA,NA,NA +I2_migration,macsTS,45,557,38,134,52,91444.366083548 +I2_migration,macs,46,551,NA,NA,NA,NA +I2_migration,macsTS,46,562,41,137,55,114930.89187933954 +I2_migration,macs,47,412,NA,NA,NA,NA +I2_migration,macsTS,47,493,33,103,47,111626.95008240972 +I2_migration,macs,48,449,NA,NA,NA,NA 
+I2_migration,macsTS,48,391,23,83,37,100456.89821919928 +I2_migration,macs,49,444,NA,NA,NA,NA +I2_migration,macsTS,49,442,35,102,49,165026.4308093184 +I2_migration,macs,50,486,NA,NA,NA,NA +I2_migration,macsTS,50,527,41,134,55,92467.67606152399 +I2_en_join,macs,1,440,NA,NA,NA,NA +I2_en_join,macsTS,1,422,22,78,36,90455.38931129313 +I2_en_join,macs,2,584,NA,NA,NA,NA +I2_en_join,macsTS,2,451,31,102,45,107851.61578425579 +I2_en_join,macs,3,485,NA,NA,NA,NA +I2_en_join,macsTS,3,646,35,112,49,183858.70189677476 +I2_en_join,macs,4,595,NA,NA,NA,NA +I2_en_join,macsTS,4,523,37,124,51,116752.63943077975 +I2_en_join,macs,5,385,NA,NA,NA,NA +I2_en_join,macsTS,5,418,27,94,41,154141.1439168411 +I2_en_join,macs,6,425,NA,NA,NA,NA +I2_en_join,macsTS,6,386,28,98,42,82638.96955724066 +I2_en_join,macs,7,459,NA,NA,NA,NA +I2_en_join,macsTS,7,470,23,73,37,207774.83980101155 +I2_en_join,macs,8,513,NA,NA,NA,NA +I2_en_join,macsTS,8,524,33,110,47,74595.09543570883 +I2_en_join,macs,9,619,NA,NA,NA,NA +I2_en_join,macsTS,9,541,40,123,54,149456.85982834117 +I2_en_join,macs,10,514,NA,NA,NA,NA +I2_en_join,macsTS,10,442,31,96,45,121567.6442901922 +I2_en_join,macs,11,367,NA,NA,NA,NA +I2_en_join,macsTS,11,538,39,133,53,106157.48704784577 +I2_en_join,macs,12,558,NA,NA,NA,NA +I2_en_join,macsTS,12,495,33,117,47,94031.1620166754 +I2_en_join,macs,13,288,NA,NA,NA,NA +I2_en_join,macsTS,13,502,30,108,44,104611.39856162715 +I2_en_join,macs,14,511,NA,NA,NA,NA +I2_en_join,macsTS,14,484,27,81,41,130161.8159830226 +I2_en_join,macs,15,676,NA,NA,NA,NA +I2_en_join,macsTS,15,515,42,143,56,83964.44708974562 +I2_en_join,macs,16,386,NA,NA,NA,NA +I2_en_join,macsTS,16,317,21,62,35,70640.02667351977 +I2_en_join,macs,17,508,NA,NA,NA,NA +I2_en_join,macsTS,17,540,35,120,49,103361.30568765034 +I2_en_join,macs,18,383,NA,NA,NA,NA +I2_en_join,macsTS,18,451,32,115,46,72980.98320142283 +I2_en_join,macs,19,602,NA,NA,NA,NA +I2_en_join,macsTS,19,527,35,111,49,197078.8600888442 +I2_en_join,macs,20,431,NA,NA,NA,NA 
+I2_en_join,macsTS,20,508,37,121,51,90327.2967897602 +I2_en_join,macs,21,450,NA,NA,NA,NA +I2_en_join,macsTS,21,418,34,121,48,110186.92787382715 +I2_en_join,macs,22,545,NA,NA,NA,NA +I2_en_join,macsTS,22,498,37,127,51,124027.34308081327 +I2_en_join,macs,23,512,NA,NA,NA,NA +I2_en_join,macsTS,23,546,31,106,45,135894.73377142084 +I2_en_join,macs,24,528,NA,NA,NA,NA +I2_en_join,macsTS,24,494,37,126,51,82304.45301359073 +I2_en_join,macs,25,462,NA,NA,NA,NA +I2_en_join,macsTS,25,585,32,111,46,106878.15529906558 +I2_en_join,macs,26,491,NA,NA,NA,NA +I2_en_join,macsTS,26,378,18,64,32,54813.6687179415 +I2_en_join,macs,27,623,NA,NA,NA,NA +I2_en_join,macsTS,27,571,36,123,50,123821.58156083069 +I2_en_join,macs,28,608,NA,NA,NA,NA +I2_en_join,macsTS,28,422,31,101,45,114562.44899022473 +I2_en_join,macs,29,461,NA,NA,NA,NA +I2_en_join,macsTS,29,371,31,94,45,73204.11409907653 +I2_en_join,macs,30,451,NA,NA,NA,NA +I2_en_join,macsTS,30,358,17,58,31,126643.50152549303 +I2_en_join,macs,31,560,NA,NA,NA,NA +I2_en_join,macsTS,31,435,35,113,49,93812.34602557738 +I2_en_join,macs,32,406,NA,NA,NA,NA +I2_en_join,macsTS,32,435,28,97,42,173833.53023783205 +I2_en_join,macs,33,511,NA,NA,NA,NA +I2_en_join,macsTS,33,525,48,151,62,138952.43262708606 +I2_en_join,macs,34,522,NA,NA,NA,NA +I2_en_join,macsTS,34,520,31,109,45,95206.40144126478 +I2_en_join,macs,35,489,NA,NA,NA,NA +I2_en_join,macsTS,35,463,32,103,46,96966.11837531606 +I2_en_join,macs,36,401,NA,NA,NA,NA +I2_en_join,macsTS,36,587,46,138,60,132442.54970897004 +I2_en_join,macs,37,534,NA,NA,NA,NA +I2_en_join,macsTS,37,508,46,157,60,136801.611158379 +I2_en_join,macs,38,521,NA,NA,NA,NA +I2_en_join,macsTS,38,484,37,125,51,131151.69116822493 +I2_en_join,macs,39,508,NA,NA,NA,NA +I2_en_join,macsTS,39,488,35,121,49,69016.19080364131 +I2_en_join,macs,40,411,NA,NA,NA,NA +I2_en_join,macsTS,40,425,37,129,51,94834.223990139 +I2_en_join,macs,41,474,NA,NA,NA,NA +I2_en_join,macsTS,41,465,29,103,43,97902.31158913208 +I2_en_join,macs,42,521,NA,NA,NA,NA 
+I2_en_join,macsTS,42,589,39,124,53,145067.56728698316 +I2_en_join,macs,43,456,NA,NA,NA,NA +I2_en_join,macsTS,43,646,35,130,49,109362.72265163346 +I2_en_join,macs,44,584,NA,NA,NA,NA +I2_en_join,macsTS,44,490,41,138,55,77247.08875282347 +I2_en_join,macs,45,550,NA,NA,NA,NA +I2_en_join,macsTS,45,359,20,65,34,118079.92958954102 +I2_en_join,macs,46,465,NA,NA,NA,NA +I2_en_join,macsTS,46,389,19,63,33,79540.0147356808 +I2_en_join,macs,47,480,NA,NA,NA,NA +I2_en_join,macsTS,47,552,30,96,44,92562.34819935205 +I2_en_join,macs,48,505,NA,NA,NA,NA +I2_en_join,macsTS,48,451,36,114,50,110286.92669633475 +I2_en_join,macs,49,607,NA,NA,NA,NA +I2_en_join,macsTS,49,423,17,61,31,81610.0812337726 +I2_en_join,macs,50,455,NA,NA,NA,NA +I2_en_join,macsTS,50,433,35,127,49,52605.5527063386 +single_const,msprime,1,183,15,56,29,51515.624918270456 +single_const,msprime,2,182,10,39,24,38763.80929698479 +single_const,msprime,3,336,27,87,41,109851.59102262642 +single_const,msprime,4,302,19,60,33,71899.5063036042 +single_const,msprime,5,173,16,59,30,52296.832527444945 +single_const,msprime,6,385,22,72,36,107743.52694763753 +single_const,msprime,7,168,16,64,30,30982.913633311287 +single_const,msprime,8,185,15,53,29,42181.611232091986 +single_const,msprime,9,184,20,65,34,59486.900392048585 +single_const,msprime,10,368,27,100,41,133055.80730249957 +single_const,msprime,11,262,31,114,45,48101.23294676906 +single_const,msprime,12,308,17,62,31,63690.27186884565 +single_const,msprime,13,199,21,74,35,49134.08270938573 +single_const,msprime,14,247,19,62,33,145182.40082078552 +single_const,msprime,15,244,24,91,38,62951.162590399596 +single_const,msprime,16,169,20,71,34,61498.16018360143 +single_const,msprime,17,266,22,74,36,111984.27213178847 +single_const,msprime,18,212,18,62,32,62641.42329757322 +single_const,msprime,19,275,20,69,34,80908.66388577598 +single_const,msprime,20,226,18,67,32,48753.6405100759 +single_const,msprime,21,351,23,85,37,62544.20329534801 
+single_const,msprime,22,414,33,115,47,113011.0036309333 +single_const,msprime,23,276,28,110,42,63308.11472786501 +single_const,msprime,24,290,33,116,47,72418.28878141462 +single_const,msprime,25,221,23,82,37,56335.882104651864 +single_const,msprime,26,241,21,73,35,51653.59431584614 +single_const,msprime,27,281,29,106,43,102642.5271660279 +single_const,msprime,28,168,19,75,33,34588.383891724756 +single_const,msprime,29,204,17,62,31,62059.40939102222 +single_const,msprime,30,346,27,97,41,61260.760688875846 +single_const,msprime,31,291,28,91,42,120961.95623420269 +single_const,msprime,32,245,25,80,39,101173.91313550806 +single_const,msprime,33,304,30,103,44,63214.719234547876 +single_const,msprime,34,168,24,90,38,59006.04499853586 +single_const,msprime,35,344,49,164,63,98439.678484823 +single_const,msprime,36,172,14,49,28,50947.17699231931 +single_const,msprime,37,233,22,75,36,107320.9949119463 +single_const,msprime,38,289,20,71,34,79909.61109567845 +single_const,msprime,39,375,26,75,40,215760.75977841174 +single_const,msprime,40,319,17,66,31,69801.66186033799 +single_const,msprime,41,248,17,58,31,220971.26340291178 +single_const,msprime,42,80,11,34,25,109586.0356718846 +single_const,msprime,43,324,45,155,59,80630.78342095218 +single_const,msprime,44,263,27,104,41,54966.31877490516 +single_const,msprime,45,301,24,86,38,110946.77752676011 +single_const,msprime,46,197,17,58,31,67986.62845344585 +single_const,msprime,47,116,20,68,34,36060.08235496283 +single_const,msprime,48,313,19,59,33,126691.67299354449 +single_const,msprime,49,166,16,52,30,56095.740271687566 +single_const,msprime,50,312,25,82,39,88200.64578802258 +single_eN,msprime,1,278,28,92,42,65349.221239219914 +single_eN,msprime,2,291,14,47,28,60601.858488352605 +single_eN,msprime,3,415,34,123,48,57700.03535382562 +single_eN,msprime,4,440,32,110,46,76756.02982500306 +single_eN,msprime,5,315,30,91,44,70233.68216998836 +single_eN,msprime,6,300,26,84,40,77770.58646889063 
+single_eN,msprime,7,352,25,88,39,60711.685354389985 +single_eN,msprime,8,287,20,70,34,60906.76361323841 +single_eN,msprime,9,414,40,133,54,79944.65892375396 +single_eN,msprime,10,394,31,106,45,70883.21756212738 +single_eN,msprime,11,461,33,123,47,73185.65542500555 +single_eN,msprime,12,301,25,84,39,54214.5164817196 +single_eN,msprime,13,273,22,75,36,69316.99951743032 +single_eN,msprime,14,371,36,129,50,73958.80454455342 +single_eN,msprime,15,288,25,78,39,72251.77689228117 +single_eN,msprime,16,298,21,69,35,53200.65206122284 +single_eN,msprime,17,216,19,69,33,44701.715341613424 +single_eN,msprime,18,303,30,97,44,58467.65305087444 +single_eN,msprime,19,217,18,53,32,67179.14113887127 +single_eN,msprime,20,428,26,92,40,67345.61920307747 +single_eN,msprime,21,255,22,72,36,98197.94041900555 +single_eN,msprime,22,322,34,120,48,73182.20215592455 +single_eN,msprime,23,334,21,78,35,55335.493334815896 +single_eN,msprime,24,360,26,93,40,83432.89937189074 +single_eN,msprime,25,385,40,130,54,106811.08207251274 +single_eN,msprime,26,293,18,62,32,72244.85713844906 +single_eN,msprime,27,313,25,89,39,61504.11927676364 +single_eN,msprime,28,276,16,51,30,67348.08434313306 +single_eN,msprime,29,329,27,94,41,68020.2817782327 +single_eN,msprime,30,371,24,90,38,73755.84780580996 +single_eN,msprime,31,359,32,113,46,112980.87085682394 +single_eN,msprime,32,208,20,59,34,52219.14757403417 +single_eN,msprime,33,365,37,127,51,63248.632349512416 +single_eN,msprime,34,304,19,67,33,53891.48572153046 +single_eN,msprime,35,210,15,60,29,50358.69328985923 +single_eN,msprime,36,240,24,85,38,54082.961824516686 +single_eN,msprime,37,459,30,113,44,62921.43534671086 +single_eN,msprime,38,330,17,59,31,61021.66401499905 +single_eN,msprime,39,457,31,110,45,77492.89793784614 +single_eN,msprime,40,260,15,52,29,62864.54151974913 +single_eN,msprime,41,435,37,123,51,100651.98483044902 +single_eN,msprime,42,271,18,58,32,73950.93821976396 +single_eN,msprime,43,275,27,82,41,93497.79468971904 
+single_eN,msprime,44,442,28,107,42,74339.58448917311 +single_eN,msprime,45,387,33,110,47,101236.0734923974 +single_eN,msprime,46,330,18,67,32,57038.03720408608 +single_eN,msprime,47,356,26,89,40,72114.30160262527 +single_eN,msprime,48,389,27,104,41,100002.2178829616 +single_eN,msprime,49,244,17,55,31,65854.7399945304 +single_eN,msprime,50,343,24,85,38,65002.540328711504 +I2_migration,msprime,1,546,38,137,52,97564.58412883535 +I2_migration,msprime,2,382,30,90,44,117642.80370111366 +I2_migration,msprime,3,567,45,160,59,84207.66407721497 +I2_migration,msprime,4,417,33,109,47,109918.36926777275 +I2_migration,msprime,5,489,45,155,59,114973.54073907119 +I2_migration,msprime,6,483,36,129,50,114188.44189606888 +I2_migration,msprime,7,608,48,166,62,93753.24179771724 +I2_migration,msprime,8,508,34,102,48,116856.75958831569 +I2_migration,msprime,9,463,29,103,43,62770.346245957306 +I2_migration,msprime,10,518,38,124,52,135423.86365044612 +I2_migration,msprime,11,453,37,115,51,118090.57466322677 +I2_migration,msprime,12,628,38,127,52,209534.1610295494 +I2_migration,msprime,13,536,43,149,57,89912.36330586261 +I2_migration,msprime,14,483,42,141,56,128912.51711145912 +I2_migration,msprime,15,618,45,152,59,158871.44365126808 +I2_migration,msprime,16,544,36,114,50,159695.05489009136 +I2_migration,msprime,17,463,38,120,52,99170.41393187974 +I2_migration,msprime,18,499,27,96,41,106487.65319013289 +I2_migration,msprime,19,561,37,130,52,101057.78304616034 +I2_migration,msprime,20,446,30,106,44,107384.27427104083 +I2_migration,msprime,21,448,30,104,44,96818.1513162082 +I2_migration,msprime,22,465,38,130,52,95245.67309246887 +I2_migration,msprime,23,519,27,95,41,80206.69234565251 +I2_migration,msprime,24,532,55,176,69,160077.76387616264 +I2_migration,msprime,25,385,28,83,42,144681.399583353 +I2_migration,msprime,26,495,41,128,55,117771.10101605413 +I2_migration,msprime,27,489,26,81,40,110262.67331368584 +I2_migration,msprime,28,493,33,124,47,120967.20685852526 
+I2_migration,msprime,29,420,32,101,46,139142.03401837248 +I2_migration,msprime,30,607,54,167,68,164135.22989891114 +I2_migration,msprime,31,594,34,109,48,145106.40932381916 +I2_migration,msprime,32,505,38,135,52,75733.69112892961 +I2_migration,msprime,33,497,35,124,49,109977.00581672597 +I2_migration,msprime,34,650,49,159,63,121041.38130669676 +I2_migration,msprime,35,482,28,100,42,61154.63265076551 +I2_migration,msprime,36,572,53,172,67,128486.13469860726 +I2_migration,msprime,37,533,48,159,62,95052.05016785166 +I2_migration,msprime,38,555,40,126,54,129105.13464317753 +I2_migration,msprime,39,694,36,116,50,108180.6860369713 +I2_migration,msprime,40,481,29,95,43,130476.93001143142 +I2_migration,msprime,41,480,26,90,40,78868.02124460539 +I2_migration,msprime,42,556,39,124,53,119361.57872816 +I2_migration,msprime,43,628,46,150,60,155208.89518427377 +I2_migration,msprime,44,494,35,124,49,97188.69526910788 +I2_migration,msprime,45,469,32,117,46,67242.4266414875 +I2_migration,msprime,46,463,15,50,29,70366.02587526909 +I2_migration,msprime,47,472,37,116,51,81929.4748655115 +I2_migration,msprime,48,475,28,92,42,115961.5565377741 +I2_migration,msprime,49,420,30,100,44,125649.90496392331 +I2_migration,msprime,50,577,41,140,55,101972.0338190745 +I2_en_join,msprime,1,520,37,128,51,88064.37475288789 +I2_en_join,msprime,2,538,29,95,43,102022.72632240821 +I2_en_join,msprime,3,508,30,103,44,118759.6618453868 +I2_en_join,msprime,4,479,29,93,43,110921.88324171577 +I2_en_join,msprime,5,380,18,60,32,107826.72923659543 +I2_en_join,msprime,6,434,28,101,42,62612.626182878535 +I2_en_join,msprime,7,429,28,93,42,91517.83086066617 +I2_en_join,msprime,8,508,37,125,51,110710.74581906163 +I2_en_join,msprime,9,439,30,101,44,92681.35855864939 +I2_en_join,msprime,10,551,37,123,51,114305.29809637179 +I2_en_join,msprime,11,599,42,146,56,132781.0284582705 +I2_en_join,msprime,12,518,21,73,35,123530.34848851709 +I2_en_join,msprime,13,409,28,91,42,95590.32795791008 
+I2_en_join,msprime,14,477,35,116,49,88920.540199851 +I2_en_join,msprime,15,470,38,123,52,134691.8823494508 +I2_en_join,msprime,16,482,27,93,41,81440.42239981552 +I2_en_join,msprime,17,413,34,118,48,89006.92352507866 +I2_en_join,msprime,18,530,47,152,61,109201.08759579713 +I2_en_join,msprime,19,413,27,89,41,102939.78991988773 +I2_en_join,msprime,20,546,38,125,52,135238.74755217478 +I2_en_join,msprime,21,651,35,121,49,81607.07191886393 +I2_en_join,msprime,22,522,36,105,50,151949.8203850263 +I2_en_join,msprime,23,452,34,118,48,89961.69857797603 +I2_en_join,msprime,24,491,35,114,49,97904.68440797148 +I2_en_join,msprime,25,405,23,80,37,115452.8229915257 +I2_en_join,msprime,26,511,39,126,53,114677.69989790722 +I2_en_join,msprime,27,466,29,95,43,95506.38537926153 +I2_en_join,msprime,28,580,44,155,58,83579.41444430909 +I2_en_join,msprime,29,423,39,129,53,95313.99758613606 +I2_en_join,msprime,30,435,37,133,51,57249.201972139046 +I2_en_join,msprime,31,457,29,92,43,122457.13829746937 +I2_en_join,msprime,32,453,35,125,49,94070.54039875766 +I2_en_join,msprime,33,565,38,122,52,102975.86477934282 +I2_en_join,msprime,34,441,24,86,38,105387.28917250219 +I2_en_join,msprime,35,403,21,69,35,154081.72616774266 +I2_en_join,msprime,36,481,30,101,44,85368.92444472763 +I2_en_join,msprime,37,600,40,134,54,133494.6392175415 +I2_en_join,msprime,38,438,25,88,39,95210.50516400867 +I2_en_join,msprime,39,467,41,142,55,121279.25515798831 +I2_en_join,msprime,40,444,40,130,54,142588.6776950081 +I2_en_join,msprime,41,436,32,111,46,101501.03501702432 +I2_en_join,msprime,42,580,38,126,52,137147.77205237906 +I2_en_join,msprime,43,484,24,86,38,70167.56944579098 +I2_en_join,msprime,44,473,37,124,51,127339.60094870909 +I2_en_join,msprime,45,519,35,109,49,157011.9311568176 +I2_en_join,msprime,46,459,25,84,39,110942.62839244043 +I2_en_join,msprime,47,556,35,122,49,100145.88432616864 +I2_en_join,msprime,48,309,21,64,35,85136.93693449725 +I2_en_join,msprime,49,409,29,104,43,75896.71347412044 
+I2_en_join,msprime,50,504,35,112,49,112863.5251947745 diff --git a/dev/testData/msprime_chr0.trees b/dev/testData/msprime_chr0.trees new file mode 100644 index 00000000..6942f0fd Binary files /dev/null and b/dev/testData/msprime_chr0.trees differ diff --git a/dev/testData/msprime_chr1.trees b/dev/testData/msprime_chr1.trees new file mode 100644 index 00000000..b5477e21 Binary files /dev/null and b/dev/testData/msprime_chr1.trees differ diff --git a/dev/testData/out_msprime_from_macs/msprime_manifest.csv b/dev/testData/out_msprime_from_macs/msprime_manifest.csv new file mode 100644 index 00000000..33003661 --- /dev/null +++ b/dev/testData/out_msprime_from_macs/msprime_manifest.csv @@ -0,0 +1,201 @@ +scenario_id,scenario,rep,chr,args,nref,seed_chr,model,sequence_length,rec_rate_bp,mut_rate_bp,num_trees,num_nodes,num_edges,num_mutations,max_root_time,tree_path +1,single_const,1,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,801000,smc_prime,100000,2.5e-09,2.5e-08,15,29,56,183,51515.624918270456,testData/out_msprime_from_macs/single_const_rep01_chr01.trees +1,single_const,2,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,802000,smc_prime,100000,2.5e-09,2.5e-08,10,24,39,182,38763.80929698479,testData/out_msprime_from_macs/single_const_rep02_chr01.trees +1,single_const,3,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,803000,smc_prime,100000,2.5e-09,2.5e-08,27,41,87,336,109851.59102262642,testData/out_msprime_from_macs/single_const_rep03_chr01.trees +1,single_const,4,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,804000,smc_prime,100000,2.5e-09,2.5e-08,19,33,60,302,71899.5063036042,testData/out_msprime_from_macs/single_const_rep04_chr01.trees +1,single_const,5,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,805000,smc_prime,100000,2.5e-09,2.5e-08,16,30,59,173,52296.832527444945,testData/out_msprime_from_macs/single_const_rep05_chr01.trees +1,single_const,6,1,8 100000 -t 1e-3 -r 1e-4 -s 
,10000.0,806000,smc_prime,100000,2.5e-09,2.5e-08,22,36,72,385,107743.52694763753,testData/out_msprime_from_macs/single_const_rep06_chr01.trees +1,single_const,7,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,807000,smc_prime,100000,2.5e-09,2.5e-08,16,30,64,168,30982.913633311287,testData/out_msprime_from_macs/single_const_rep07_chr01.trees +1,single_const,8,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,808000,smc_prime,100000,2.5e-09,2.5e-08,15,29,53,185,42181.611232091986,testData/out_msprime_from_macs/single_const_rep08_chr01.trees +1,single_const,9,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,809000,smc_prime,100000,2.5e-09,2.5e-08,20,34,65,184,59486.900392048585,testData/out_msprime_from_macs/single_const_rep09_chr01.trees +1,single_const,10,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,810000,smc_prime,100000,2.5e-09,2.5e-08,27,41,100,368,133055.80730249957,testData/out_msprime_from_macs/single_const_rep10_chr01.trees +1,single_const,11,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,811000,smc_prime,100000,2.5e-09,2.5e-08,31,45,114,262,48101.23294676906,testData/out_msprime_from_macs/single_const_rep11_chr01.trees +1,single_const,12,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,812000,smc_prime,100000,2.5e-09,2.5e-08,17,31,62,308,63690.27186884565,testData/out_msprime_from_macs/single_const_rep12_chr01.trees +1,single_const,13,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,813000,smc_prime,100000,2.5e-09,2.5e-08,21,35,74,199,49134.08270938573,testData/out_msprime_from_macs/single_const_rep13_chr01.trees +1,single_const,14,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,814000,smc_prime,100000,2.5e-09,2.5e-08,19,33,62,247,145182.40082078552,testData/out_msprime_from_macs/single_const_rep14_chr01.trees +1,single_const,15,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,815000,smc_prime,100000,2.5e-09,2.5e-08,24,38,91,244,62951.162590399596,testData/out_msprime_from_macs/single_const_rep15_chr01.trees +1,single_const,16,1,8 100000 -t 1e-3 -r 1e-4 -s 
,10000.0,816000,smc_prime,100000,2.5e-09,2.5e-08,20,34,71,169,61498.16018360143,testData/out_msprime_from_macs/single_const_rep16_chr01.trees +1,single_const,17,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,817000,smc_prime,100000,2.5e-09,2.5e-08,22,36,74,266,111984.27213178847,testData/out_msprime_from_macs/single_const_rep17_chr01.trees +1,single_const,18,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,818000,smc_prime,100000,2.5e-09,2.5e-08,18,32,62,212,62641.42329757322,testData/out_msprime_from_macs/single_const_rep18_chr01.trees +1,single_const,19,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,819000,smc_prime,100000,2.5e-09,2.5e-08,20,34,69,275,80908.66388577598,testData/out_msprime_from_macs/single_const_rep19_chr01.trees +1,single_const,20,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,820000,smc_prime,100000,2.5e-09,2.5e-08,18,32,67,226,48753.6405100759,testData/out_msprime_from_macs/single_const_rep20_chr01.trees +1,single_const,21,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,821000,smc_prime,100000,2.5e-09,2.5e-08,23,37,85,351,62544.20329534801,testData/out_msprime_from_macs/single_const_rep21_chr01.trees +1,single_const,22,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,822000,smc_prime,100000,2.5e-09,2.5e-08,33,47,115,414,113011.0036309333,testData/out_msprime_from_macs/single_const_rep22_chr01.trees +1,single_const,23,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,823000,smc_prime,100000,2.5e-09,2.5e-08,28,42,110,276,63308.11472786501,testData/out_msprime_from_macs/single_const_rep23_chr01.trees +1,single_const,24,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,824000,smc_prime,100000,2.5e-09,2.5e-08,33,47,116,290,72418.28878141462,testData/out_msprime_from_macs/single_const_rep24_chr01.trees +1,single_const,25,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,825000,smc_prime,100000,2.5e-09,2.5e-08,23,37,82,221,56335.882104651864,testData/out_msprime_from_macs/single_const_rep25_chr01.trees +1,single_const,26,1,8 100000 -t 1e-3 -r 1e-4 -s 
,10000.0,826000,smc_prime,100000,2.5e-09,2.5e-08,21,35,73,241,51653.59431584614,testData/out_msprime_from_macs/single_const_rep26_chr01.trees +1,single_const,27,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,827000,smc_prime,100000,2.5e-09,2.5e-08,29,43,106,281,102642.5271660279,testData/out_msprime_from_macs/single_const_rep27_chr01.trees +1,single_const,28,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,828000,smc_prime,100000,2.5e-09,2.5e-08,19,33,75,168,34588.383891724756,testData/out_msprime_from_macs/single_const_rep28_chr01.trees +1,single_const,29,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,829000,smc_prime,100000,2.5e-09,2.5e-08,17,31,62,204,62059.40939102222,testData/out_msprime_from_macs/single_const_rep29_chr01.trees +1,single_const,30,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,830000,smc_prime,100000,2.5e-09,2.5e-08,27,41,97,346,61260.760688875846,testData/out_msprime_from_macs/single_const_rep30_chr01.trees +1,single_const,31,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,831000,smc_prime,100000,2.5e-09,2.5e-08,28,42,91,291,120961.95623420269,testData/out_msprime_from_macs/single_const_rep31_chr01.trees +1,single_const,32,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,832000,smc_prime,100000,2.5e-09,2.5e-08,25,39,80,245,101173.91313550806,testData/out_msprime_from_macs/single_const_rep32_chr01.trees +1,single_const,33,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,833000,smc_prime,100000,2.5e-09,2.5e-08,30,44,103,304,63214.719234547876,testData/out_msprime_from_macs/single_const_rep33_chr01.trees +1,single_const,34,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,834000,smc_prime,100000,2.5e-09,2.5e-08,24,38,90,168,59006.04499853586,testData/out_msprime_from_macs/single_const_rep34_chr01.trees +1,single_const,35,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,835000,smc_prime,100000,2.5e-09,2.5e-08,49,63,164,344,98439.678484823,testData/out_msprime_from_macs/single_const_rep35_chr01.trees +1,single_const,36,1,8 100000 -t 1e-3 -r 1e-4 -s 
,10000.0,836000,smc_prime,100000,2.5e-09,2.5e-08,14,28,49,172,50947.17699231931,testData/out_msprime_from_macs/single_const_rep36_chr01.trees +1,single_const,37,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,837000,smc_prime,100000,2.5e-09,2.5e-08,22,36,75,233,107320.9949119463,testData/out_msprime_from_macs/single_const_rep37_chr01.trees +1,single_const,38,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,838000,smc_prime,100000,2.5e-09,2.5e-08,20,34,71,289,79909.61109567845,testData/out_msprime_from_macs/single_const_rep38_chr01.trees +1,single_const,39,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,839000,smc_prime,100000,2.5e-09,2.5e-08,26,40,75,375,215760.75977841174,testData/out_msprime_from_macs/single_const_rep39_chr01.trees +1,single_const,40,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,840000,smc_prime,100000,2.5e-09,2.5e-08,17,31,66,319,69801.66186033799,testData/out_msprime_from_macs/single_const_rep40_chr01.trees +1,single_const,41,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,841000,smc_prime,100000,2.5e-09,2.5e-08,17,31,58,248,220971.26340291178,testData/out_msprime_from_macs/single_const_rep41_chr01.trees +1,single_const,42,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,842000,smc_prime,100000,2.5e-09,2.5e-08,11,25,34,80,109586.0356718846,testData/out_msprime_from_macs/single_const_rep42_chr01.trees +1,single_const,43,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,843000,smc_prime,100000,2.5e-09,2.5e-08,45,59,155,324,80630.78342095218,testData/out_msprime_from_macs/single_const_rep43_chr01.trees +1,single_const,44,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,844000,smc_prime,100000,2.5e-09,2.5e-08,27,41,104,263,54966.31877490516,testData/out_msprime_from_macs/single_const_rep44_chr01.trees +1,single_const,45,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,845000,smc_prime,100000,2.5e-09,2.5e-08,24,38,86,301,110946.77752676011,testData/out_msprime_from_macs/single_const_rep45_chr01.trees +1,single_const,46,1,8 100000 -t 1e-3 -r 1e-4 -s 
,10000.0,846000,smc_prime,100000,2.5e-09,2.5e-08,17,31,58,197,67986.62845344585,testData/out_msprime_from_macs/single_const_rep46_chr01.trees +1,single_const,47,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,847000,smc_prime,100000,2.5e-09,2.5e-08,20,34,68,116,36060.08235496283,testData/out_msprime_from_macs/single_const_rep47_chr01.trees +1,single_const,48,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,848000,smc_prime,100000,2.5e-09,2.5e-08,19,33,59,313,126691.67299354449,testData/out_msprime_from_macs/single_const_rep48_chr01.trees +1,single_const,49,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,849000,smc_prime,100000,2.5e-09,2.5e-08,16,30,52,166,56095.740271687566,testData/out_msprime_from_macs/single_const_rep49_chr01.trees +1,single_const,50,1,8 100000 -t 1e-3 -r 1e-4 -s ,10000.0,850000,smc_prime,100000,2.5e-09,2.5e-08,25,39,82,312,88200.64578802258,testData/out_msprime_from_macs/single_const_rep50_chr01.trees +2,single_eN,1,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,901000,smc_prime,100000,2.5e-09,2.5e-08,28,42,92,278,65349.221239219914,testData/out_msprime_from_macs/single_eN_rep01_chr01.trees +2,single_eN,2,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,902000,smc_prime,100000,2.5e-09,2.5e-08,14,28,47,291,60601.858488352605,testData/out_msprime_from_macs/single_eN_rep02_chr01.trees +2,single_eN,3,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,903000,smc_prime,100000,2.5e-09,2.5e-08,34,48,123,415,57700.03535382562,testData/out_msprime_from_macs/single_eN_rep03_chr01.trees +2,single_eN,4,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,904000,smc_prime,100000,2.5e-09,2.5e-08,32,46,110,440,76756.02982500306,testData/out_msprime_from_macs/single_eN_rep04_chr01.trees +2,single_eN,5,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,905000,smc_prime,100000,2.5e-09,2.5e-08,30,44,91,315,70233.68216998836,testData/out_msprime_from_macs/single_eN_rep05_chr01.trees +2,single_eN,6,1,8 100000 -t 1e-3 -r 1e-4 -eN 
0.2 2.0 -eN 1.0 0.5 -s ,10000.0,906000,smc_prime,100000,2.5e-09,2.5e-08,26,40,84,300,77770.58646889063,testData/out_msprime_from_macs/single_eN_rep06_chr01.trees +2,single_eN,7,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,907000,smc_prime,100000,2.5e-09,2.5e-08,25,39,88,352,60711.685354389985,testData/out_msprime_from_macs/single_eN_rep07_chr01.trees +2,single_eN,8,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,908000,smc_prime,100000,2.5e-09,2.5e-08,20,34,70,287,60906.76361323841,testData/out_msprime_from_macs/single_eN_rep08_chr01.trees +2,single_eN,9,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,909000,smc_prime,100000,2.5e-09,2.5e-08,40,54,133,414,79944.65892375396,testData/out_msprime_from_macs/single_eN_rep09_chr01.trees +2,single_eN,10,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,910000,smc_prime,100000,2.5e-09,2.5e-08,31,45,106,394,70883.21756212738,testData/out_msprime_from_macs/single_eN_rep10_chr01.trees +2,single_eN,11,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,911000,smc_prime,100000,2.5e-09,2.5e-08,33,47,123,461,73185.65542500555,testData/out_msprime_from_macs/single_eN_rep11_chr01.trees +2,single_eN,12,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,912000,smc_prime,100000,2.5e-09,2.5e-08,25,39,84,301,54214.5164817196,testData/out_msprime_from_macs/single_eN_rep12_chr01.trees +2,single_eN,13,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,913000,smc_prime,100000,2.5e-09,2.5e-08,22,36,75,273,69316.99951743032,testData/out_msprime_from_macs/single_eN_rep13_chr01.trees +2,single_eN,14,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,914000,smc_prime,100000,2.5e-09,2.5e-08,36,50,129,371,73958.80454455342,testData/out_msprime_from_macs/single_eN_rep14_chr01.trees +2,single_eN,15,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
,10000.0,915000,smc_prime,100000,2.5e-09,2.5e-08,25,39,78,288,72251.77689228117,testData/out_msprime_from_macs/single_eN_rep15_chr01.trees +2,single_eN,16,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,916000,smc_prime,100000,2.5e-09,2.5e-08,21,35,69,298,53200.65206122284,testData/out_msprime_from_macs/single_eN_rep16_chr01.trees +2,single_eN,17,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,917000,smc_prime,100000,2.5e-09,2.5e-08,19,33,69,216,44701.715341613424,testData/out_msprime_from_macs/single_eN_rep17_chr01.trees +2,single_eN,18,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,918000,smc_prime,100000,2.5e-09,2.5e-08,30,44,97,303,58467.65305087444,testData/out_msprime_from_macs/single_eN_rep18_chr01.trees +2,single_eN,19,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,919000,smc_prime,100000,2.5e-09,2.5e-08,18,32,53,217,67179.14113887127,testData/out_msprime_from_macs/single_eN_rep19_chr01.trees +2,single_eN,20,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,920000,smc_prime,100000,2.5e-09,2.5e-08,26,40,92,428,67345.61920307747,testData/out_msprime_from_macs/single_eN_rep20_chr01.trees +2,single_eN,21,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,921000,smc_prime,100000,2.5e-09,2.5e-08,22,36,72,255,98197.94041900555,testData/out_msprime_from_macs/single_eN_rep21_chr01.trees +2,single_eN,22,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,922000,smc_prime,100000,2.5e-09,2.5e-08,34,48,120,322,73182.20215592455,testData/out_msprime_from_macs/single_eN_rep22_chr01.trees +2,single_eN,23,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,923000,smc_prime,100000,2.5e-09,2.5e-08,21,35,78,334,55335.493334815896,testData/out_msprime_from_macs/single_eN_rep23_chr01.trees +2,single_eN,24,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
,10000.0,924000,smc_prime,100000,2.5e-09,2.5e-08,26,40,93,360,83432.89937189074,testData/out_msprime_from_macs/single_eN_rep24_chr01.trees +2,single_eN,25,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,925000,smc_prime,100000,2.5e-09,2.5e-08,40,54,130,385,106811.08207251274,testData/out_msprime_from_macs/single_eN_rep25_chr01.trees +2,single_eN,26,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,926000,smc_prime,100000,2.5e-09,2.5e-08,18,32,62,293,72244.85713844906,testData/out_msprime_from_macs/single_eN_rep26_chr01.trees +2,single_eN,27,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,927000,smc_prime,100000,2.5e-09,2.5e-08,25,39,89,313,61504.11927676364,testData/out_msprime_from_macs/single_eN_rep27_chr01.trees +2,single_eN,28,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,928000,smc_prime,100000,2.5e-09,2.5e-08,16,30,51,276,67348.08434313306,testData/out_msprime_from_macs/single_eN_rep28_chr01.trees +2,single_eN,29,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,929000,smc_prime,100000,2.5e-09,2.5e-08,27,41,94,329,68020.2817782327,testData/out_msprime_from_macs/single_eN_rep29_chr01.trees +2,single_eN,30,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,930000,smc_prime,100000,2.5e-09,2.5e-08,24,38,90,371,73755.84780580996,testData/out_msprime_from_macs/single_eN_rep30_chr01.trees +2,single_eN,31,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,931000,smc_prime,100000,2.5e-09,2.5e-08,32,46,113,359,112980.87085682394,testData/out_msprime_from_macs/single_eN_rep31_chr01.trees +2,single_eN,32,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,932000,smc_prime,100000,2.5e-09,2.5e-08,20,34,59,208,52219.14757403417,testData/out_msprime_from_macs/single_eN_rep32_chr01.trees +2,single_eN,33,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
,10000.0,933000,smc_prime,100000,2.5e-09,2.5e-08,37,51,127,365,63248.632349512416,testData/out_msprime_from_macs/single_eN_rep33_chr01.trees +2,single_eN,34,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,934000,smc_prime,100000,2.5e-09,2.5e-08,19,33,67,304,53891.48572153046,testData/out_msprime_from_macs/single_eN_rep34_chr01.trees +2,single_eN,35,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,935000,smc_prime,100000,2.5e-09,2.5e-08,15,29,60,210,50358.69328985923,testData/out_msprime_from_macs/single_eN_rep35_chr01.trees +2,single_eN,36,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,936000,smc_prime,100000,2.5e-09,2.5e-08,24,38,85,240,54082.961824516686,testData/out_msprime_from_macs/single_eN_rep36_chr01.trees +2,single_eN,37,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,937000,smc_prime,100000,2.5e-09,2.5e-08,30,44,113,459,62921.43534671086,testData/out_msprime_from_macs/single_eN_rep37_chr01.trees +2,single_eN,38,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,938000,smc_prime,100000,2.5e-09,2.5e-08,17,31,59,330,61021.66401499905,testData/out_msprime_from_macs/single_eN_rep38_chr01.trees +2,single_eN,39,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,939000,smc_prime,100000,2.5e-09,2.5e-08,31,45,110,457,77492.89793784614,testData/out_msprime_from_macs/single_eN_rep39_chr01.trees +2,single_eN,40,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,940000,smc_prime,100000,2.5e-09,2.5e-08,15,29,52,260,62864.54151974913,testData/out_msprime_from_macs/single_eN_rep40_chr01.trees +2,single_eN,41,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,941000,smc_prime,100000,2.5e-09,2.5e-08,37,51,123,435,100651.98483044902,testData/out_msprime_from_macs/single_eN_rep41_chr01.trees +2,single_eN,42,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
,10000.0,942000,smc_prime,100000,2.5e-09,2.5e-08,18,32,58,271,73950.93821976396,testData/out_msprime_from_macs/single_eN_rep42_chr01.trees +2,single_eN,43,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,943000,smc_prime,100000,2.5e-09,2.5e-08,27,41,82,275,93497.79468971904,testData/out_msprime_from_macs/single_eN_rep43_chr01.trees +2,single_eN,44,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,944000,smc_prime,100000,2.5e-09,2.5e-08,28,42,107,442,74339.58448917311,testData/out_msprime_from_macs/single_eN_rep44_chr01.trees +2,single_eN,45,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,945000,smc_prime,100000,2.5e-09,2.5e-08,33,47,110,387,101236.0734923974,testData/out_msprime_from_macs/single_eN_rep45_chr01.trees +2,single_eN,46,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,946000,smc_prime,100000,2.5e-09,2.5e-08,18,32,67,330,57038.03720408608,testData/out_msprime_from_macs/single_eN_rep46_chr01.trees +2,single_eN,47,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,947000,smc_prime,100000,2.5e-09,2.5e-08,26,40,89,356,72114.30160262527,testData/out_msprime_from_macs/single_eN_rep47_chr01.trees +2,single_eN,48,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,948000,smc_prime,100000,2.5e-09,2.5e-08,27,41,104,389,100002.2178829616,testData/out_msprime_from_macs/single_eN_rep48_chr01.trees +2,single_eN,49,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,949000,smc_prime,100000,2.5e-09,2.5e-08,17,31,55,244,65854.7399945304,testData/out_msprime_from_macs/single_eN_rep49_chr01.trees +2,single_eN,50,1,8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ,10000.0,950000,smc_prime,100000,2.5e-09,2.5e-08,24,38,85,343,65002.540328711504,testData/out_msprime_from_macs/single_eN_rep50_chr01.trees +3,I2_migration,1,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
,10000.0,1001000,smc_prime,100000,2.5e-09,2.5e-08,38,52,137,546,97564.58412883535,testData/out_msprime_from_macs/I2_migration_rep01_chr01.trees +3,I2_migration,2,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1002000,smc_prime,100000,2.5e-09,2.5e-08,30,44,90,382,117642.80370111366,testData/out_msprime_from_macs/I2_migration_rep02_chr01.trees +3,I2_migration,3,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1003000,smc_prime,100000,2.5e-09,2.5e-08,45,59,160,567,84207.66407721497,testData/out_msprime_from_macs/I2_migration_rep03_chr01.trees +3,I2_migration,4,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1004000,smc_prime,100000,2.5e-09,2.5e-08,33,47,109,417,109918.36926777275,testData/out_msprime_from_macs/I2_migration_rep04_chr01.trees +3,I2_migration,5,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1005000,smc_prime,100000,2.5e-09,2.5e-08,45,59,155,489,114973.54073907119,testData/out_msprime_from_macs/I2_migration_rep05_chr01.trees +3,I2_migration,6,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1006000,smc_prime,100000,2.5e-09,2.5e-08,36,50,129,483,114188.44189606888,testData/out_msprime_from_macs/I2_migration_rep06_chr01.trees +3,I2_migration,7,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1007000,smc_prime,100000,2.5e-09,2.5e-08,48,62,166,608,93753.24179771724,testData/out_msprime_from_macs/I2_migration_rep07_chr01.trees +3,I2_migration,8,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1008000,smc_prime,100000,2.5e-09,2.5e-08,34,48,102,508,116856.75958831569,testData/out_msprime_from_macs/I2_migration_rep08_chr01.trees +3,I2_migration,9,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1009000,smc_prime,100000,2.5e-09,2.5e-08,29,43,103,463,62770.346245957306,testData/out_msprime_from_macs/I2_migration_rep09_chr01.trees 
+3,I2_migration,10,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1010000,smc_prime,100000,2.5e-09,2.5e-08,38,52,124,518,135423.86365044612,testData/out_msprime_from_macs/I2_migration_rep10_chr01.trees +3,I2_migration,11,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1011000,smc_prime,100000,2.5e-09,2.5e-08,37,51,115,453,118090.57466322677,testData/out_msprime_from_macs/I2_migration_rep11_chr01.trees +3,I2_migration,12,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1012000,smc_prime,100000,2.5e-09,2.5e-08,38,52,127,628,209534.1610295494,testData/out_msprime_from_macs/I2_migration_rep12_chr01.trees +3,I2_migration,13,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1013000,smc_prime,100000,2.5e-09,2.5e-08,43,57,149,536,89912.36330586261,testData/out_msprime_from_macs/I2_migration_rep13_chr01.trees +3,I2_migration,14,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1014000,smc_prime,100000,2.5e-09,2.5e-08,42,56,141,483,128912.51711145912,testData/out_msprime_from_macs/I2_migration_rep14_chr01.trees +3,I2_migration,15,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1015000,smc_prime,100000,2.5e-09,2.5e-08,45,59,152,618,158871.44365126808,testData/out_msprime_from_macs/I2_migration_rep15_chr01.trees +3,I2_migration,16,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1016000,smc_prime,100000,2.5e-09,2.5e-08,36,50,114,544,159695.05489009136,testData/out_msprime_from_macs/I2_migration_rep16_chr01.trees +3,I2_migration,17,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1017000,smc_prime,100000,2.5e-09,2.5e-08,38,52,120,463,99170.41393187974,testData/out_msprime_from_macs/I2_migration_rep17_chr01.trees +3,I2_migration,18,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
,10000.0,1018000,smc_prime,100000,2.5e-09,2.5e-08,27,41,96,499,106487.65319013289,testData/out_msprime_from_macs/I2_migration_rep18_chr01.trees +3,I2_migration,19,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1019000,smc_prime,100000,2.5e-09,2.5e-08,37,52,130,561,101057.78304616034,testData/out_msprime_from_macs/I2_migration_rep19_chr01.trees +3,I2_migration,20,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1020000,smc_prime,100000,2.5e-09,2.5e-08,30,44,106,446,107384.27427104083,testData/out_msprime_from_macs/I2_migration_rep20_chr01.trees +3,I2_migration,21,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1021000,smc_prime,100000,2.5e-09,2.5e-08,30,44,104,448,96818.1513162082,testData/out_msprime_from_macs/I2_migration_rep21_chr01.trees +3,I2_migration,22,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1022000,smc_prime,100000,2.5e-09,2.5e-08,38,52,130,465,95245.67309246887,testData/out_msprime_from_macs/I2_migration_rep22_chr01.trees +3,I2_migration,23,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1023000,smc_prime,100000,2.5e-09,2.5e-08,27,41,95,519,80206.69234565251,testData/out_msprime_from_macs/I2_migration_rep23_chr01.trees +3,I2_migration,24,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1024000,smc_prime,100000,2.5e-09,2.5e-08,55,69,176,532,160077.76387616264,testData/out_msprime_from_macs/I2_migration_rep24_chr01.trees +3,I2_migration,25,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1025000,smc_prime,100000,2.5e-09,2.5e-08,28,42,83,385,144681.399583353,testData/out_msprime_from_macs/I2_migration_rep25_chr01.trees +3,I2_migration,26,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1026000,smc_prime,100000,2.5e-09,2.5e-08,41,55,128,495,117771.10101605413,testData/out_msprime_from_macs/I2_migration_rep26_chr01.trees 
+3,I2_migration,27,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1027000,smc_prime,100000,2.5e-09,2.5e-08,26,40,81,489,110262.67331368584,testData/out_msprime_from_macs/I2_migration_rep27_chr01.trees +3,I2_migration,28,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1028000,smc_prime,100000,2.5e-09,2.5e-08,33,47,124,493,120967.20685852526,testData/out_msprime_from_macs/I2_migration_rep28_chr01.trees +3,I2_migration,29,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1029000,smc_prime,100000,2.5e-09,2.5e-08,32,46,101,420,139142.03401837248,testData/out_msprime_from_macs/I2_migration_rep29_chr01.trees +3,I2_migration,30,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1030000,smc_prime,100000,2.5e-09,2.5e-08,54,68,167,607,164135.22989891114,testData/out_msprime_from_macs/I2_migration_rep30_chr01.trees +3,I2_migration,31,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1031000,smc_prime,100000,2.5e-09,2.5e-08,34,48,109,594,145106.40932381916,testData/out_msprime_from_macs/I2_migration_rep31_chr01.trees +3,I2_migration,32,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1032000,smc_prime,100000,2.5e-09,2.5e-08,38,52,135,505,75733.69112892961,testData/out_msprime_from_macs/I2_migration_rep32_chr01.trees +3,I2_migration,33,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1033000,smc_prime,100000,2.5e-09,2.5e-08,35,49,124,497,109977.00581672597,testData/out_msprime_from_macs/I2_migration_rep33_chr01.trees +3,I2_migration,34,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1034000,smc_prime,100000,2.5e-09,2.5e-08,49,63,159,650,121041.38130669676,testData/out_msprime_from_macs/I2_migration_rep34_chr01.trees +3,I2_migration,35,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
,10000.0,1035000,smc_prime,100000,2.5e-09,2.5e-08,28,42,100,482,61154.63265076551,testData/out_msprime_from_macs/I2_migration_rep35_chr01.trees +3,I2_migration,36,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1036000,smc_prime,100000,2.5e-09,2.5e-08,53,67,172,572,128486.13469860726,testData/out_msprime_from_macs/I2_migration_rep36_chr01.trees +3,I2_migration,37,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1037000,smc_prime,100000,2.5e-09,2.5e-08,48,62,159,533,95052.05016785166,testData/out_msprime_from_macs/I2_migration_rep37_chr01.trees +3,I2_migration,38,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1038000,smc_prime,100000,2.5e-09,2.5e-08,40,54,126,555,129105.13464317753,testData/out_msprime_from_macs/I2_migration_rep38_chr01.trees +3,I2_migration,39,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1039000,smc_prime,100000,2.5e-09,2.5e-08,36,50,116,694,108180.6860369713,testData/out_msprime_from_macs/I2_migration_rep39_chr01.trees +3,I2_migration,40,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1040000,smc_prime,100000,2.5e-09,2.5e-08,29,43,95,481,130476.93001143142,testData/out_msprime_from_macs/I2_migration_rep40_chr01.trees +3,I2_migration,41,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1041000,smc_prime,100000,2.5e-09,2.5e-08,26,40,90,480,78868.02124460539,testData/out_msprime_from_macs/I2_migration_rep41_chr01.trees +3,I2_migration,42,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1042000,smc_prime,100000,2.5e-09,2.5e-08,39,53,124,556,119361.57872816,testData/out_msprime_from_macs/I2_migration_rep42_chr01.trees +3,I2_migration,43,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1043000,smc_prime,100000,2.5e-09,2.5e-08,46,60,150,628,155208.89518427377,testData/out_msprime_from_macs/I2_migration_rep43_chr01.trees 
+3,I2_migration,44,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1044000,smc_prime,100000,2.5e-09,2.5e-08,35,49,124,494,97188.69526910788,testData/out_msprime_from_macs/I2_migration_rep44_chr01.trees +3,I2_migration,45,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1045000,smc_prime,100000,2.5e-09,2.5e-08,32,46,117,469,67242.4266414875,testData/out_msprime_from_macs/I2_migration_rep45_chr01.trees +3,I2_migration,46,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1046000,smc_prime,100000,2.5e-09,2.5e-08,15,29,50,463,70366.02587526909,testData/out_msprime_from_macs/I2_migration_rep46_chr01.trees +3,I2_migration,47,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1047000,smc_prime,100000,2.5e-09,2.5e-08,37,51,116,472,81929.4748655115,testData/out_msprime_from_macs/I2_migration_rep47_chr01.trees +3,I2_migration,48,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1048000,smc_prime,100000,2.5e-09,2.5e-08,28,42,92,475,115961.5565377741,testData/out_msprime_from_macs/I2_migration_rep48_chr01.trees +3,I2_migration,49,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1049000,smc_prime,100000,2.5e-09,2.5e-08,30,44,100,420,125649.90496392331,testData/out_msprime_from_macs/I2_migration_rep49_chr01.trees +3,I2_migration,50,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ,10000.0,1050000,smc_prime,100000,2.5e-09,2.5e-08,41,55,140,577,101972.0338190745,testData/out_msprime_from_macs/I2_migration_rep50_chr01.trees +4,I2_en_join,1,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1101000,smc_prime,100000,2.5e-09,2.5e-08,37,51,128,520,88064.37475288789,testData/out_msprime_from_macs/I2_en_join_rep01_chr01.trees +4,I2_en_join,2,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
,10000.0,1102000,smc_prime,100000,2.5e-09,2.5e-08,29,43,95,538,102022.72632240821,testData/out_msprime_from_macs/I2_en_join_rep02_chr01.trees +4,I2_en_join,3,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1103000,smc_prime,100000,2.5e-09,2.5e-08,30,44,103,508,118759.6618453868,testData/out_msprime_from_macs/I2_en_join_rep03_chr01.trees +4,I2_en_join,4,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1104000,smc_prime,100000,2.5e-09,2.5e-08,29,43,93,479,110921.88324171577,testData/out_msprime_from_macs/I2_en_join_rep04_chr01.trees +4,I2_en_join,5,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1105000,smc_prime,100000,2.5e-09,2.5e-08,18,32,60,380,107826.72923659543,testData/out_msprime_from_macs/I2_en_join_rep05_chr01.trees +4,I2_en_join,6,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1106000,smc_prime,100000,2.5e-09,2.5e-08,28,42,101,434,62612.626182878535,testData/out_msprime_from_macs/I2_en_join_rep06_chr01.trees +4,I2_en_join,7,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1107000,smc_prime,100000,2.5e-09,2.5e-08,28,42,93,429,91517.83086066617,testData/out_msprime_from_macs/I2_en_join_rep07_chr01.trees +4,I2_en_join,8,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1108000,smc_prime,100000,2.5e-09,2.5e-08,37,51,125,508,110710.74581906163,testData/out_msprime_from_macs/I2_en_join_rep08_chr01.trees +4,I2_en_join,9,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1109000,smc_prime,100000,2.5e-09,2.5e-08,30,44,101,439,92681.35855864939,testData/out_msprime_from_macs/I2_en_join_rep09_chr01.trees +4,I2_en_join,10,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1110000,smc_prime,100000,2.5e-09,2.5e-08,37,51,123,551,114305.29809637179,testData/out_msprime_from_macs/I2_en_join_rep10_chr01.trees +4,I2_en_join,11,1,8 100000 -t 
1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1111000,smc_prime,100000,2.5e-09,2.5e-08,42,56,146,599,132781.0284582705,testData/out_msprime_from_macs/I2_en_join_rep11_chr01.trees +4,I2_en_join,12,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1112000,smc_prime,100000,2.5e-09,2.5e-08,21,35,73,518,123530.34848851709,testData/out_msprime_from_macs/I2_en_join_rep12_chr01.trees +4,I2_en_join,13,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1113000,smc_prime,100000,2.5e-09,2.5e-08,28,42,91,409,95590.32795791008,testData/out_msprime_from_macs/I2_en_join_rep13_chr01.trees +4,I2_en_join,14,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1114000,smc_prime,100000,2.5e-09,2.5e-08,35,49,116,477,88920.540199851,testData/out_msprime_from_macs/I2_en_join_rep14_chr01.trees +4,I2_en_join,15,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1115000,smc_prime,100000,2.5e-09,2.5e-08,38,52,123,470,134691.8823494508,testData/out_msprime_from_macs/I2_en_join_rep15_chr01.trees +4,I2_en_join,16,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1116000,smc_prime,100000,2.5e-09,2.5e-08,27,41,93,482,81440.42239981552,testData/out_msprime_from_macs/I2_en_join_rep16_chr01.trees +4,I2_en_join,17,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1117000,smc_prime,100000,2.5e-09,2.5e-08,34,48,118,413,89006.92352507866,testData/out_msprime_from_macs/I2_en_join_rep17_chr01.trees +4,I2_en_join,18,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1118000,smc_prime,100000,2.5e-09,2.5e-08,47,61,152,530,109201.08759579713,testData/out_msprime_from_macs/I2_en_join_rep18_chr01.trees +4,I2_en_join,19,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
,10000.0,1119000,smc_prime,100000,2.5e-09,2.5e-08,27,41,89,413,102939.78991988773,testData/out_msprime_from_macs/I2_en_join_rep19_chr01.trees +4,I2_en_join,20,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1120000,smc_prime,100000,2.5e-09,2.5e-08,38,52,125,546,135238.74755217478,testData/out_msprime_from_macs/I2_en_join_rep20_chr01.trees +4,I2_en_join,21,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1121000,smc_prime,100000,2.5e-09,2.5e-08,35,49,121,651,81607.07191886393,testData/out_msprime_from_macs/I2_en_join_rep21_chr01.trees +4,I2_en_join,22,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1122000,smc_prime,100000,2.5e-09,2.5e-08,36,50,105,522,151949.8203850263,testData/out_msprime_from_macs/I2_en_join_rep22_chr01.trees +4,I2_en_join,23,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1123000,smc_prime,100000,2.5e-09,2.5e-08,34,48,118,452,89961.69857797603,testData/out_msprime_from_macs/I2_en_join_rep23_chr01.trees +4,I2_en_join,24,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1124000,smc_prime,100000,2.5e-09,2.5e-08,35,49,114,491,97904.68440797148,testData/out_msprime_from_macs/I2_en_join_rep24_chr01.trees +4,I2_en_join,25,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1125000,smc_prime,100000,2.5e-09,2.5e-08,23,37,80,405,115452.8229915257,testData/out_msprime_from_macs/I2_en_join_rep25_chr01.trees +4,I2_en_join,26,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1126000,smc_prime,100000,2.5e-09,2.5e-08,39,53,126,511,114677.69989790722,testData/out_msprime_from_macs/I2_en_join_rep26_chr01.trees +4,I2_en_join,27,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1127000,smc_prime,100000,2.5e-09,2.5e-08,29,43,95,466,95506.38537926153,testData/out_msprime_from_macs/I2_en_join_rep27_chr01.trees +4,I2_en_join,28,1,8 
100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1128000,smc_prime,100000,2.5e-09,2.5e-08,44,58,155,580,83579.41444430909,testData/out_msprime_from_macs/I2_en_join_rep28_chr01.trees +4,I2_en_join,29,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1129000,smc_prime,100000,2.5e-09,2.5e-08,39,53,129,423,95313.99758613606,testData/out_msprime_from_macs/I2_en_join_rep29_chr01.trees +4,I2_en_join,30,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1130000,smc_prime,100000,2.5e-09,2.5e-08,37,51,133,435,57249.201972139046,testData/out_msprime_from_macs/I2_en_join_rep30_chr01.trees +4,I2_en_join,31,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1131000,smc_prime,100000,2.5e-09,2.5e-08,29,43,92,457,122457.13829746937,testData/out_msprime_from_macs/I2_en_join_rep31_chr01.trees +4,I2_en_join,32,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1132000,smc_prime,100000,2.5e-09,2.5e-08,35,49,125,453,94070.54039875766,testData/out_msprime_from_macs/I2_en_join_rep32_chr01.trees +4,I2_en_join,33,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1133000,smc_prime,100000,2.5e-09,2.5e-08,38,52,122,565,102975.86477934282,testData/out_msprime_from_macs/I2_en_join_rep33_chr01.trees +4,I2_en_join,34,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1134000,smc_prime,100000,2.5e-09,2.5e-08,24,38,86,441,105387.28917250219,testData/out_msprime_from_macs/I2_en_join_rep34_chr01.trees +4,I2_en_join,35,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1135000,smc_prime,100000,2.5e-09,2.5e-08,21,35,69,403,154081.72616774266,testData/out_msprime_from_macs/I2_en_join_rep35_chr01.trees +4,I2_en_join,36,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
,10000.0,1136000,smc_prime,100000,2.5e-09,2.5e-08,30,44,101,481,85368.92444472763,testData/out_msprime_from_macs/I2_en_join_rep36_chr01.trees +4,I2_en_join,37,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1137000,smc_prime,100000,2.5e-09,2.5e-08,40,54,134,600,133494.6392175415,testData/out_msprime_from_macs/I2_en_join_rep37_chr01.trees +4,I2_en_join,38,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1138000,smc_prime,100000,2.5e-09,2.5e-08,25,39,88,438,95210.50516400867,testData/out_msprime_from_macs/I2_en_join_rep38_chr01.trees +4,I2_en_join,39,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1139000,smc_prime,100000,2.5e-09,2.5e-08,41,55,142,467,121279.25515798831,testData/out_msprime_from_macs/I2_en_join_rep39_chr01.trees +4,I2_en_join,40,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1140000,smc_prime,100000,2.5e-09,2.5e-08,40,54,130,444,142588.6776950081,testData/out_msprime_from_macs/I2_en_join_rep40_chr01.trees +4,I2_en_join,41,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1141000,smc_prime,100000,2.5e-09,2.5e-08,32,46,111,436,101501.03501702432,testData/out_msprime_from_macs/I2_en_join_rep41_chr01.trees +4,I2_en_join,42,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1142000,smc_prime,100000,2.5e-09,2.5e-08,38,52,126,580,137147.77205237906,testData/out_msprime_from_macs/I2_en_join_rep42_chr01.trees +4,I2_en_join,43,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1143000,smc_prime,100000,2.5e-09,2.5e-08,24,38,86,484,70167.56944579098,testData/out_msprime_from_macs/I2_en_join_rep43_chr01.trees +4,I2_en_join,44,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1144000,smc_prime,100000,2.5e-09,2.5e-08,37,51,124,473,127339.60094870909,testData/out_msprime_from_macs/I2_en_join_rep44_chr01.trees +4,I2_en_join,45,1,8 
100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1145000,smc_prime,100000,2.5e-09,2.5e-08,35,49,109,519,157011.9311568176,testData/out_msprime_from_macs/I2_en_join_rep45_chr01.trees +4,I2_en_join,46,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1146000,smc_prime,100000,2.5e-09,2.5e-08,25,39,84,459,110942.62839244043,testData/out_msprime_from_macs/I2_en_join_rep46_chr01.trees +4,I2_en_join,47,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1147000,smc_prime,100000,2.5e-09,2.5e-08,35,49,122,556,100145.88432616864,testData/out_msprime_from_macs/I2_en_join_rep47_chr01.trees +4,I2_en_join,48,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1148000,smc_prime,100000,2.5e-09,2.5e-08,21,35,64,309,85136.93693449725,testData/out_msprime_from_macs/I2_en_join_rep48_chr01.trees +4,I2_en_join,49,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1149000,smc_prime,100000,2.5e-09,2.5e-08,29,43,104,409,75896.71347412044,testData/out_msprime_from_macs/I2_en_join_rep49_chr01.trees +4,I2_en_join,50,1,8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ,10000.0,1150000,smc_prime,100000,2.5e-09,2.5e-08,35,49,112,504,112863.5251947745,testData/out_msprime_from_macs/I2_en_join_rep50_chr01.trees diff --git a/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv b/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv new file mode 100644 index 00000000..71e740c4 --- /dev/null +++ b/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv @@ -0,0 +1,201 @@ +"scenario_id","scenario","rep","chr","args","seed_chr","mut_seed_chr","dTheta","dTheta_post","usePhysicalPositions","Nref","macs_num_mutations","macsts_num_mutations","tree_path" +1,"single_const",1,1,"8 100000 -t 1e-3 -r 1e-4 -s 
",801000,801000,100,0.0025,TRUE,10000,292,274,"testData/out_phase1_useMacsMut_FALSE/single_const_rep01_chr01.trees" +1,"single_const",2,1,"8 100000 -t 1e-3 -r 1e-4 -s ",802000,802000,100,0.0025,TRUE,10000,266,259,"testData/out_phase1_useMacsMut_FALSE/single_const_rep02_chr01.trees" +1,"single_const",3,1,"8 100000 -t 1e-3 -r 1e-4 -s ",803000,803000,100,0.0025,TRUE,10000,193,251,"testData/out_phase1_useMacsMut_FALSE/single_const_rep03_chr01.trees" +1,"single_const",4,1,"8 100000 -t 1e-3 -r 1e-4 -s ",804000,804000,100,0.0025,TRUE,10000,298,253,"testData/out_phase1_useMacsMut_FALSE/single_const_rep04_chr01.trees" +1,"single_const",5,1,"8 100000 -t 1e-3 -r 1e-4 -s ",805000,805000,100,0.0025,TRUE,10000,330,194,"testData/out_phase1_useMacsMut_FALSE/single_const_rep05_chr01.trees" +1,"single_const",6,1,"8 100000 -t 1e-3 -r 1e-4 -s ",806000,806000,100,0.0025,TRUE,10000,278,236,"testData/out_phase1_useMacsMut_FALSE/single_const_rep06_chr01.trees" +1,"single_const",7,1,"8 100000 -t 1e-3 -r 1e-4 -s ",807000,807000,100,0.0025,TRUE,10000,291,235,"testData/out_phase1_useMacsMut_FALSE/single_const_rep07_chr01.trees" +1,"single_const",8,1,"8 100000 -t 1e-3 -r 1e-4 -s ",808000,808000,100,0.0025,TRUE,10000,193,189,"testData/out_phase1_useMacsMut_FALSE/single_const_rep08_chr01.trees" +1,"single_const",9,1,"8 100000 -t 1e-3 -r 1e-4 -s ",809000,809000,100,0.0025,TRUE,10000,111,207,"testData/out_phase1_useMacsMut_FALSE/single_const_rep09_chr01.trees" +1,"single_const",10,1,"8 100000 -t 1e-3 -r 1e-4 -s ",810000,810000,100,0.0025,TRUE,10000,337,330,"testData/out_phase1_useMacsMut_FALSE/single_const_rep10_chr01.trees" +1,"single_const",11,1,"8 100000 -t 1e-3 -r 1e-4 -s ",811000,811000,100,0.0025,TRUE,10000,335,311,"testData/out_phase1_useMacsMut_FALSE/single_const_rep11_chr01.trees" +1,"single_const",12,1,"8 100000 -t 1e-3 -r 1e-4 -s ",812000,812000,100,0.0025,TRUE,10000,306,267,"testData/out_phase1_useMacsMut_FALSE/single_const_rep12_chr01.trees" +1,"single_const",13,1,"8 100000 -t 1e-3 -r 
1e-4 -s ",813000,813000,100,0.0025,TRUE,10000,417,244,"testData/out_phase1_useMacsMut_FALSE/single_const_rep13_chr01.trees" +1,"single_const",14,1,"8 100000 -t 1e-3 -r 1e-4 -s ",814000,814000,100,0.0025,TRUE,10000,434,355,"testData/out_phase1_useMacsMut_FALSE/single_const_rep14_chr01.trees" +1,"single_const",15,1,"8 100000 -t 1e-3 -r 1e-4 -s ",815000,815000,100,0.0025,TRUE,10000,233,265,"testData/out_phase1_useMacsMut_FALSE/single_const_rep15_chr01.trees" +1,"single_const",16,1,"8 100000 -t 1e-3 -r 1e-4 -s ",816000,816000,100,0.0025,TRUE,10000,285,209,"testData/out_phase1_useMacsMut_FALSE/single_const_rep16_chr01.trees" +1,"single_const",17,1,"8 100000 -t 1e-3 -r 1e-4 -s ",817000,817000,100,0.0025,TRUE,10000,205,224,"testData/out_phase1_useMacsMut_FALSE/single_const_rep17_chr01.trees" +1,"single_const",18,1,"8 100000 -t 1e-3 -r 1e-4 -s ",818000,818000,100,0.0025,TRUE,10000,215,269,"testData/out_phase1_useMacsMut_FALSE/single_const_rep18_chr01.trees" +1,"single_const",19,1,"8 100000 -t 1e-3 -r 1e-4 -s ",819000,819000,100,0.0025,TRUE,10000,227,348,"testData/out_phase1_useMacsMut_FALSE/single_const_rep19_chr01.trees" +1,"single_const",20,1,"8 100000 -t 1e-3 -r 1e-4 -s ",820000,820000,100,0.0025,TRUE,10000,129,242,"testData/out_phase1_useMacsMut_FALSE/single_const_rep20_chr01.trees" +1,"single_const",21,1,"8 100000 -t 1e-3 -r 1e-4 -s ",821000,821000,100,0.0025,TRUE,10000,142,198,"testData/out_phase1_useMacsMut_FALSE/single_const_rep21_chr01.trees" +1,"single_const",22,1,"8 100000 -t 1e-3 -r 1e-4 -s ",822000,822000,100,0.0025,TRUE,10000,227,270,"testData/out_phase1_useMacsMut_FALSE/single_const_rep22_chr01.trees" +1,"single_const",23,1,"8 100000 -t 1e-3 -r 1e-4 -s ",823000,823000,100,0.0025,TRUE,10000,143,181,"testData/out_phase1_useMacsMut_FALSE/single_const_rep23_chr01.trees" +1,"single_const",24,1,"8 100000 -t 1e-3 -r 1e-4 -s ",824000,824000,100,0.0025,TRUE,10000,293,312,"testData/out_phase1_useMacsMut_FALSE/single_const_rep24_chr01.trees" +1,"single_const",25,1,"8 
100000 -t 1e-3 -r 1e-4 -s ",825000,825000,100,0.0025,TRUE,10000,332,234,"testData/out_phase1_useMacsMut_FALSE/single_const_rep25_chr01.trees" +1,"single_const",26,1,"8 100000 -t 1e-3 -r 1e-4 -s ",826000,826000,100,0.0025,TRUE,10000,153,239,"testData/out_phase1_useMacsMut_FALSE/single_const_rep26_chr01.trees" +1,"single_const",27,1,"8 100000 -t 1e-3 -r 1e-4 -s ",827000,827000,100,0.0025,TRUE,10000,151,279,"testData/out_phase1_useMacsMut_FALSE/single_const_rep27_chr01.trees" +1,"single_const",28,1,"8 100000 -t 1e-3 -r 1e-4 -s ",828000,828000,100,0.0025,TRUE,10000,321,174,"testData/out_phase1_useMacsMut_FALSE/single_const_rep28_chr01.trees" +1,"single_const",29,1,"8 100000 -t 1e-3 -r 1e-4 -s ",829000,829000,100,0.0025,TRUE,10000,195,409,"testData/out_phase1_useMacsMut_FALSE/single_const_rep29_chr01.trees" +1,"single_const",30,1,"8 100000 -t 1e-3 -r 1e-4 -s ",830000,830000,100,0.0025,TRUE,10000,150,222,"testData/out_phase1_useMacsMut_FALSE/single_const_rep30_chr01.trees" +1,"single_const",31,1,"8 100000 -t 1e-3 -r 1e-4 -s ",831000,831000,100,0.0025,TRUE,10000,401,214,"testData/out_phase1_useMacsMut_FALSE/single_const_rep31_chr01.trees" +1,"single_const",32,1,"8 100000 -t 1e-3 -r 1e-4 -s ",832000,832000,100,0.0025,TRUE,10000,357,342,"testData/out_phase1_useMacsMut_FALSE/single_const_rep32_chr01.trees" +1,"single_const",33,1,"8 100000 -t 1e-3 -r 1e-4 -s ",833000,833000,100,0.0025,TRUE,10000,230,208,"testData/out_phase1_useMacsMut_FALSE/single_const_rep33_chr01.trees" +1,"single_const",34,1,"8 100000 -t 1e-3 -r 1e-4 -s ",834000,834000,100,0.0025,TRUE,10000,139,216,"testData/out_phase1_useMacsMut_FALSE/single_const_rep34_chr01.trees" +1,"single_const",35,1,"8 100000 -t 1e-3 -r 1e-4 -s ",835000,835000,100,0.0025,TRUE,10000,128,198,"testData/out_phase1_useMacsMut_FALSE/single_const_rep35_chr01.trees" +1,"single_const",36,1,"8 100000 -t 1e-3 -r 1e-4 -s ",836000,836000,100,0.0025,TRUE,10000,298,271,"testData/out_phase1_useMacsMut_FALSE/single_const_rep36_chr01.trees" 
+1,"single_const",37,1,"8 100000 -t 1e-3 -r 1e-4 -s ",837000,837000,100,0.0025,TRUE,10000,242,199,"testData/out_phase1_useMacsMut_FALSE/single_const_rep37_chr01.trees" +1,"single_const",38,1,"8 100000 -t 1e-3 -r 1e-4 -s ",838000,838000,100,0.0025,TRUE,10000,177,350,"testData/out_phase1_useMacsMut_FALSE/single_const_rep38_chr01.trees" +1,"single_const",39,1,"8 100000 -t 1e-3 -r 1e-4 -s ",839000,839000,100,0.0025,TRUE,10000,213,265,"testData/out_phase1_useMacsMut_FALSE/single_const_rep39_chr01.trees" +1,"single_const",40,1,"8 100000 -t 1e-3 -r 1e-4 -s ",840000,840000,100,0.0025,TRUE,10000,163,272,"testData/out_phase1_useMacsMut_FALSE/single_const_rep40_chr01.trees" +1,"single_const",41,1,"8 100000 -t 1e-3 -r 1e-4 -s ",841000,841000,100,0.0025,TRUE,10000,271,171,"testData/out_phase1_useMacsMut_FALSE/single_const_rep41_chr01.trees" +1,"single_const",42,1,"8 100000 -t 1e-3 -r 1e-4 -s ",842000,842000,100,0.0025,TRUE,10000,303,213,"testData/out_phase1_useMacsMut_FALSE/single_const_rep42_chr01.trees" +1,"single_const",43,1,"8 100000 -t 1e-3 -r 1e-4 -s ",843000,843000,100,0.0025,TRUE,10000,254,131,"testData/out_phase1_useMacsMut_FALSE/single_const_rep43_chr01.trees" +1,"single_const",44,1,"8 100000 -t 1e-3 -r 1e-4 -s ",844000,844000,100,0.0025,TRUE,10000,176,305,"testData/out_phase1_useMacsMut_FALSE/single_const_rep44_chr01.trees" +1,"single_const",45,1,"8 100000 -t 1e-3 -r 1e-4 -s ",845000,845000,100,0.0025,TRUE,10000,247,262,"testData/out_phase1_useMacsMut_FALSE/single_const_rep45_chr01.trees" +1,"single_const",46,1,"8 100000 -t 1e-3 -r 1e-4 -s ",846000,846000,100,0.0025,TRUE,10000,351,385,"testData/out_phase1_useMacsMut_FALSE/single_const_rep46_chr01.trees" +1,"single_const",47,1,"8 100000 -t 1e-3 -r 1e-4 -s ",847000,847000,100,0.0025,TRUE,10000,298,212,"testData/out_phase1_useMacsMut_FALSE/single_const_rep47_chr01.trees" +1,"single_const",48,1,"8 100000 -t 1e-3 -r 1e-4 -s 
",848000,848000,100,0.0025,TRUE,10000,248,272,"testData/out_phase1_useMacsMut_FALSE/single_const_rep48_chr01.trees" +1,"single_const",49,1,"8 100000 -t 1e-3 -r 1e-4 -s ",849000,849000,100,0.0025,TRUE,10000,308,266,"testData/out_phase1_useMacsMut_FALSE/single_const_rep49_chr01.trees" +1,"single_const",50,1,"8 100000 -t 1e-3 -r 1e-4 -s ",850000,850000,100,0.0025,TRUE,10000,317,253,"testData/out_phase1_useMacsMut_FALSE/single_const_rep50_chr01.trees" +2,"single_eN",1,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",901000,901000,100,0.0025,TRUE,10000,160,203,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep01_chr01.trees" +2,"single_eN",2,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",902000,902000,100,0.0025,TRUE,10000,320,379,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep02_chr01.trees" +2,"single_eN",3,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",903000,903000,100,0.0025,TRUE,10000,414,329,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep03_chr01.trees" +2,"single_eN",4,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",904000,904000,100,0.0025,TRUE,10000,340,397,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep04_chr01.trees" +2,"single_eN",5,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",905000,905000,100,0.0025,TRUE,10000,314,208,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep05_chr01.trees" +2,"single_eN",6,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",906000,906000,100,0.0025,TRUE,10000,334,400,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep06_chr01.trees" +2,"single_eN",7,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",907000,907000,100,0.0025,TRUE,10000,182,337,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep07_chr01.trees" +2,"single_eN",8,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",908000,908000,100,0.0025,TRUE,10000,284,259,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep08_chr01.trees" +2,"single_eN",9,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 
0.5 -s ",909000,909000,100,0.0025,TRUE,10000,300,340,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep09_chr01.trees" +2,"single_eN",10,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",910000,910000,100,0.0025,TRUE,10000,354,324,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep10_chr01.trees" +2,"single_eN",11,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",911000,911000,100,0.0025,TRUE,10000,324,264,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep11_chr01.trees" +2,"single_eN",12,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",912000,912000,100,0.0025,TRUE,10000,564,396,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep12_chr01.trees" +2,"single_eN",13,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",913000,913000,100,0.0025,TRUE,10000,193,266,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep13_chr01.trees" +2,"single_eN",14,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",914000,914000,100,0.0025,TRUE,10000,62,122,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep14_chr01.trees" +2,"single_eN",15,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",915000,915000,100,0.0025,TRUE,10000,372,277,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep15_chr01.trees" +2,"single_eN",16,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",916000,916000,100,0.0025,TRUE,10000,260,250,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep16_chr01.trees" +2,"single_eN",17,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",917000,917000,100,0.0025,TRUE,10000,283,286,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep17_chr01.trees" +2,"single_eN",18,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",918000,918000,100,0.0025,TRUE,10000,241,161,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep18_chr01.trees" +2,"single_eN",19,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",919000,919000,100,0.0025,TRUE,10000,309,383,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep19_chr01.trees" 
+2,"single_eN",20,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",920000,920000,100,0.0025,TRUE,10000,359,327,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep20_chr01.trees" +2,"single_eN",21,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",921000,921000,100,0.0025,TRUE,10000,292,274,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep21_chr01.trees" +2,"single_eN",22,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",922000,922000,100,0.0025,TRUE,10000,274,388,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep22_chr01.trees" +2,"single_eN",23,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",923000,923000,100,0.0025,TRUE,10000,99,168,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep23_chr01.trees" +2,"single_eN",24,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",924000,924000,100,0.0025,TRUE,10000,296,384,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep24_chr01.trees" +2,"single_eN",25,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",925000,925000,100,0.0025,TRUE,10000,390,431,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep25_chr01.trees" +2,"single_eN",26,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",926000,926000,100,0.0025,TRUE,10000,462,469,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep26_chr01.trees" +2,"single_eN",27,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",927000,927000,100,0.0025,TRUE,10000,219,394,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep27_chr01.trees" +2,"single_eN",28,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",928000,928000,100,0.0025,TRUE,10000,418,285,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep28_chr01.trees" +2,"single_eN",29,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",929000,929000,100,0.0025,TRUE,10000,281,251,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep29_chr01.trees" +2,"single_eN",30,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
",930000,930000,100,0.0025,TRUE,10000,286,301,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep30_chr01.trees" +2,"single_eN",31,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",931000,931000,100,0.0025,TRUE,10000,280,237,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep31_chr01.trees" +2,"single_eN",32,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",932000,932000,100,0.0025,TRUE,10000,436,284,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep32_chr01.trees" +2,"single_eN",33,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",933000,933000,100,0.0025,TRUE,10000,379,379,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep33_chr01.trees" +2,"single_eN",34,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",934000,934000,100,0.0025,TRUE,10000,177,123,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep34_chr01.trees" +2,"single_eN",35,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",935000,935000,100,0.0025,TRUE,10000,266,220,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep35_chr01.trees" +2,"single_eN",36,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",936000,936000,100,0.0025,TRUE,10000,278,389,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep36_chr01.trees" +2,"single_eN",37,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",937000,937000,100,0.0025,TRUE,10000,304,313,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep37_chr01.trees" +2,"single_eN",38,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",938000,938000,100,0.0025,TRUE,10000,422,319,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep38_chr01.trees" +2,"single_eN",39,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",939000,939000,100,0.0025,TRUE,10000,425,285,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep39_chr01.trees" +2,"single_eN",40,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",940000,940000,100,0.0025,TRUE,10000,443,275,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep40_chr01.trees" +2,"single_eN",41,1,"8 
100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",941000,941000,100,0.0025,TRUE,10000,362,448,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep41_chr01.trees" +2,"single_eN",42,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",942000,942000,100,0.0025,TRUE,10000,342,129,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep42_chr01.trees" +2,"single_eN",43,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",943000,943000,100,0.0025,TRUE,10000,319,329,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep43_chr01.trees" +2,"single_eN",44,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",944000,944000,100,0.0025,TRUE,10000,368,254,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep44_chr01.trees" +2,"single_eN",45,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",945000,945000,100,0.0025,TRUE,10000,355,418,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep45_chr01.trees" +2,"single_eN",46,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",946000,946000,100,0.0025,TRUE,10000,356,306,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep46_chr01.trees" +2,"single_eN",47,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",947000,947000,100,0.0025,TRUE,10000,357,375,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep47_chr01.trees" +2,"single_eN",48,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",948000,948000,100,0.0025,TRUE,10000,327,342,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep48_chr01.trees" +2,"single_eN",49,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",949000,949000,100,0.0025,TRUE,10000,438,361,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep49_chr01.trees" +2,"single_eN",50,1,"8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s ",950000,950000,100,0.0025,TRUE,10000,361,448,"testData/out_phase1_useMacsMut_FALSE/single_eN_rep50_chr01.trees" +3,"I2_migration",1,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
",1001000,1001000,100,0.0025,TRUE,10000,582,627,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep01_chr01.trees" +3,"I2_migration",2,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1002000,1002000,100,0.0025,TRUE,10000,539,505,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep02_chr01.trees" +3,"I2_migration",3,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1003000,1003000,100,0.0025,TRUE,10000,528,605,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep03_chr01.trees" +3,"I2_migration",4,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1004000,1004000,100,0.0025,TRUE,10000,491,536,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep04_chr01.trees" +3,"I2_migration",5,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1005000,1005000,100,0.0025,TRUE,10000,549,458,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep05_chr01.trees" +3,"I2_migration",6,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1006000,1006000,100,0.0025,TRUE,10000,627,467,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep06_chr01.trees" +3,"I2_migration",7,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1007000,1007000,100,0.0025,TRUE,10000,524,517,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep07_chr01.trees" +3,"I2_migration",8,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1008000,1008000,100,0.0025,TRUE,10000,498,643,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep08_chr01.trees" +3,"I2_migration",9,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1009000,1009000,100,0.0025,TRUE,10000,596,551,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep09_chr01.trees" +3,"I2_migration",10,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1010000,1010000,100,0.0025,TRUE,10000,492,428,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep10_chr01.trees" 
+3,"I2_migration",11,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1011000,1011000,100,0.0025,TRUE,10000,566,499,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep11_chr01.trees" +3,"I2_migration",12,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1012000,1012000,100,0.0025,TRUE,10000,541,494,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep12_chr01.trees" +3,"I2_migration",13,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1013000,1013000,100,0.0025,TRUE,10000,441,471,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep13_chr01.trees" +3,"I2_migration",14,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1014000,1014000,100,0.0025,TRUE,10000,473,532,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep14_chr01.trees" +3,"I2_migration",15,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1015000,1015000,100,0.0025,TRUE,10000,607,493,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep15_chr01.trees" +3,"I2_migration",16,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1016000,1016000,100,0.0025,TRUE,10000,504,504,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep16_chr01.trees" +3,"I2_migration",17,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1017000,1017000,100,0.0025,TRUE,10000,532,540,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep17_chr01.trees" +3,"I2_migration",18,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1018000,1018000,100,0.0025,TRUE,10000,622,522,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep18_chr01.trees" +3,"I2_migration",19,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1019000,1019000,100,0.0025,TRUE,10000,364,439,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep19_chr01.trees" +3,"I2_migration",20,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
",1020000,1020000,100,0.0025,TRUE,10000,458,471,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep20_chr01.trees" +3,"I2_migration",21,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1021000,1021000,100,0.0025,TRUE,10000,459,489,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep21_chr01.trees" +3,"I2_migration",22,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1022000,1022000,100,0.0025,TRUE,10000,471,434,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep22_chr01.trees" +3,"I2_migration",23,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1023000,1023000,100,0.0025,TRUE,10000,488,464,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep23_chr01.trees" +3,"I2_migration",24,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1024000,1024000,100,0.0025,TRUE,10000,500,476,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep24_chr01.trees" +3,"I2_migration",25,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1025000,1025000,100,0.0025,TRUE,10000,523,500,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep25_chr01.trees" +3,"I2_migration",26,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1026000,1026000,100,0.0025,TRUE,10000,581,533,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep26_chr01.trees" +3,"I2_migration",27,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1027000,1027000,100,0.0025,TRUE,10000,637,521,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep27_chr01.trees" +3,"I2_migration",28,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1028000,1028000,100,0.0025,TRUE,10000,500,538,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep28_chr01.trees" +3,"I2_migration",29,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1029000,1029000,100,0.0025,TRUE,10000,517,734,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep29_chr01.trees" 
+3,"I2_migration",30,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1030000,1030000,100,0.0025,TRUE,10000,533,471,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep30_chr01.trees" +3,"I2_migration",31,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1031000,1031000,100,0.0025,TRUE,10000,525,565,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep31_chr01.trees" +3,"I2_migration",32,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1032000,1032000,100,0.0025,TRUE,10000,478,524,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep32_chr01.trees" +3,"I2_migration",33,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1033000,1033000,100,0.0025,TRUE,10000,654,422,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep33_chr01.trees" +3,"I2_migration",34,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1034000,1034000,100,0.0025,TRUE,10000,405,505,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep34_chr01.trees" +3,"I2_migration",35,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1035000,1035000,100,0.0025,TRUE,10000,522,489,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep35_chr01.trees" +3,"I2_migration",36,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1036000,1036000,100,0.0025,TRUE,10000,500,551,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep36_chr01.trees" +3,"I2_migration",37,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1037000,1037000,100,0.0025,TRUE,10000,551,571,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep37_chr01.trees" +3,"I2_migration",38,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1038000,1038000,100,0.0025,TRUE,10000,548,442,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep38_chr01.trees" +3,"I2_migration",39,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
",1039000,1039000,100,0.0025,TRUE,10000,490,476,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep39_chr01.trees" +3,"I2_migration",40,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1040000,1040000,100,0.0025,TRUE,10000,607,464,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep40_chr01.trees" +3,"I2_migration",41,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1041000,1041000,100,0.0025,TRUE,10000,568,647,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep41_chr01.trees" +3,"I2_migration",42,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1042000,1042000,100,0.0025,TRUE,10000,441,529,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep42_chr01.trees" +3,"I2_migration",43,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1043000,1043000,100,0.0025,TRUE,10000,533,591,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep43_chr01.trees" +3,"I2_migration",44,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1044000,1044000,100,0.0025,TRUE,10000,503,443,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep44_chr01.trees" +3,"I2_migration",45,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1045000,1045000,100,0.0025,TRUE,10000,554,557,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep45_chr01.trees" +3,"I2_migration",46,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1046000,1046000,100,0.0025,TRUE,10000,551,562,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep46_chr01.trees" +3,"I2_migration",47,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1047000,1047000,100,0.0025,TRUE,10000,412,493,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep47_chr01.trees" +3,"I2_migration",48,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1048000,1048000,100,0.0025,TRUE,10000,449,391,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep48_chr01.trees" 
+3,"I2_migration",49,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1049000,1049000,100,0.0025,TRUE,10000,444,442,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep49_chr01.trees" +3,"I2_migration",50,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s ",1050000,1050000,100,0.0025,TRUE,10000,486,527,"testData/out_phase1_useMacsMut_FALSE/I2_migration_rep50_chr01.trees" +4,"I2_en_join",1,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1101000,1101000,100,0.0025,TRUE,10000,440,422,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep01_chr01.trees" +4,"I2_en_join",2,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1102000,1102000,100,0.0025,TRUE,10000,584,451,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep02_chr01.trees" +4,"I2_en_join",3,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1103000,1103000,100,0.0025,TRUE,10000,485,646,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep03_chr01.trees" +4,"I2_en_join",4,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1104000,1104000,100,0.0025,TRUE,10000,595,523,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep04_chr01.trees" +4,"I2_en_join",5,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1105000,1105000,100,0.0025,TRUE,10000,385,418,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep05_chr01.trees" +4,"I2_en_join",6,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1106000,1106000,100,0.0025,TRUE,10000,425,386,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep06_chr01.trees" +4,"I2_en_join",7,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1107000,1107000,100,0.0025,TRUE,10000,459,470,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep07_chr01.trees" +4,"I2_en_join",8,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
",1108000,1108000,100,0.0025,TRUE,10000,513,524,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep08_chr01.trees" +4,"I2_en_join",9,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1109000,1109000,100,0.0025,TRUE,10000,619,541,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep09_chr01.trees" +4,"I2_en_join",10,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1110000,1110000,100,0.0025,TRUE,10000,514,442,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep10_chr01.trees" +4,"I2_en_join",11,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1111000,1111000,100,0.0025,TRUE,10000,367,538,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep11_chr01.trees" +4,"I2_en_join",12,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1112000,1112000,100,0.0025,TRUE,10000,558,495,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep12_chr01.trees" +4,"I2_en_join",13,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1113000,1113000,100,0.0025,TRUE,10000,288,502,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep13_chr01.trees" +4,"I2_en_join",14,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1114000,1114000,100,0.0025,TRUE,10000,511,484,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep14_chr01.trees" +4,"I2_en_join",15,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1115000,1115000,100,0.0025,TRUE,10000,676,515,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep15_chr01.trees" +4,"I2_en_join",16,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1116000,1116000,100,0.0025,TRUE,10000,386,317,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep16_chr01.trees" +4,"I2_en_join",17,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1117000,1117000,100,0.0025,TRUE,10000,508,540,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep17_chr01.trees" +4,"I2_en_join",18,1,"8 100000 
-t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1118000,1118000,100,0.0025,TRUE,10000,383,451,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep18_chr01.trees" +4,"I2_en_join",19,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1119000,1119000,100,0.0025,TRUE,10000,602,527,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep19_chr01.trees" +4,"I2_en_join",20,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1120000,1120000,100,0.0025,TRUE,10000,431,508,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep20_chr01.trees" +4,"I2_en_join",21,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1121000,1121000,100,0.0025,TRUE,10000,450,418,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep21_chr01.trees" +4,"I2_en_join",22,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1122000,1122000,100,0.0025,TRUE,10000,545,498,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep22_chr01.trees" +4,"I2_en_join",23,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1123000,1123000,100,0.0025,TRUE,10000,512,546,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep23_chr01.trees" +4,"I2_en_join",24,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1124000,1124000,100,0.0025,TRUE,10000,528,494,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep24_chr01.trees" +4,"I2_en_join",25,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1125000,1125000,100,0.0025,TRUE,10000,462,585,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep25_chr01.trees" +4,"I2_en_join",26,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1126000,1126000,100,0.0025,TRUE,10000,491,378,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep26_chr01.trees" +4,"I2_en_join",27,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
",1127000,1127000,100,0.0025,TRUE,10000,623,571,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep27_chr01.trees" +4,"I2_en_join",28,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1128000,1128000,100,0.0025,TRUE,10000,608,422,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep28_chr01.trees" +4,"I2_en_join",29,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1129000,1129000,100,0.0025,TRUE,10000,461,371,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep29_chr01.trees" +4,"I2_en_join",30,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1130000,1130000,100,0.0025,TRUE,10000,451,358,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep30_chr01.trees" +4,"I2_en_join",31,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1131000,1131000,100,0.0025,TRUE,10000,560,435,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep31_chr01.trees" +4,"I2_en_join",32,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1132000,1132000,100,0.0025,TRUE,10000,406,435,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep32_chr01.trees" +4,"I2_en_join",33,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1133000,1133000,100,0.0025,TRUE,10000,511,525,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep33_chr01.trees" +4,"I2_en_join",34,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1134000,1134000,100,0.0025,TRUE,10000,522,520,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep34_chr01.trees" +4,"I2_en_join",35,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1135000,1135000,100,0.0025,TRUE,10000,489,463,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep35_chr01.trees" +4,"I2_en_join",36,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1136000,1136000,100,0.0025,TRUE,10000,401,587,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep36_chr01.trees" +4,"I2_en_join",37,1,"8 
100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1137000,1137000,100,0.0025,TRUE,10000,534,508,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep37_chr01.trees" +4,"I2_en_join",38,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1138000,1138000,100,0.0025,TRUE,10000,521,484,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep38_chr01.trees" +4,"I2_en_join",39,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1139000,1139000,100,0.0025,TRUE,10000,508,488,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep39_chr01.trees" +4,"I2_en_join",40,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1140000,1140000,100,0.0025,TRUE,10000,411,425,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep40_chr01.trees" +4,"I2_en_join",41,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1141000,1141000,100,0.0025,TRUE,10000,474,465,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep41_chr01.trees" +4,"I2_en_join",42,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1142000,1142000,100,0.0025,TRUE,10000,521,589,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep42_chr01.trees" +4,"I2_en_join",43,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1143000,1143000,100,0.0025,TRUE,10000,456,646,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep43_chr01.trees" +4,"I2_en_join",44,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1144000,1144000,100,0.0025,TRUE,10000,584,490,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep44_chr01.trees" +4,"I2_en_join",45,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1145000,1145000,100,0.0025,TRUE,10000,550,359,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep45_chr01.trees" +4,"I2_en_join",46,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s 
",1146000,1146000,100,0.0025,TRUE,10000,465,389,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep46_chr01.trees" +4,"I2_en_join",47,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1147000,1147000,100,0.0025,TRUE,10000,480,552,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep47_chr01.trees" +4,"I2_en_join",48,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1148000,1148000,100,0.0025,TRUE,10000,505,451,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep48_chr01.trees" +4,"I2_en_join",49,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1149000,1149000,100,0.0025,TRUE,10000,607,423,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep49_chr01.trees" +4,"I2_en_join",50,1,"8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ",1150000,1150000,100,0.0025,TRUE,10000,455,433,"testData/out_phase1_useMacsMut_FALSE/I2_en_join_rep50_chr01.trees" diff --git a/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_run_status.csv b/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_run_status.csv new file mode 100644 index 00000000..7f71bb2d --- /dev/null +++ b/dev/testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_run_status.csv @@ -0,0 +1,201 @@ +"scenario_id","scenario","rep","success","error" +1,"single_const",1,TRUE,"" +1,"single_const",2,TRUE,"" +1,"single_const",3,TRUE,"" +1,"single_const",4,TRUE,"" +1,"single_const",5,TRUE,"" +1,"single_const",6,TRUE,"" +1,"single_const",7,TRUE,"" +1,"single_const",8,TRUE,"" +1,"single_const",9,TRUE,"" +1,"single_const",10,TRUE,"" +1,"single_const",11,TRUE,"" +1,"single_const",12,TRUE,"" +1,"single_const",13,TRUE,"" +1,"single_const",14,TRUE,"" +1,"single_const",15,TRUE,"" +1,"single_const",16,TRUE,"" +1,"single_const",17,TRUE,"" +1,"single_const",18,TRUE,"" +1,"single_const",19,TRUE,"" +1,"single_const",20,TRUE,"" +1,"single_const",21,TRUE,"" +1,"single_const",22,TRUE,"" 
+1,"single_const",23,TRUE,"" +1,"single_const",24,TRUE,"" +1,"single_const",25,TRUE,"" +1,"single_const",26,TRUE,"" +1,"single_const",27,TRUE,"" +1,"single_const",28,TRUE,"" +1,"single_const",29,TRUE,"" +1,"single_const",30,TRUE,"" +1,"single_const",31,TRUE,"" +1,"single_const",32,TRUE,"" +1,"single_const",33,TRUE,"" +1,"single_const",34,TRUE,"" +1,"single_const",35,TRUE,"" +1,"single_const",36,TRUE,"" +1,"single_const",37,TRUE,"" +1,"single_const",38,TRUE,"" +1,"single_const",39,TRUE,"" +1,"single_const",40,TRUE,"" +1,"single_const",41,TRUE,"" +1,"single_const",42,TRUE,"" +1,"single_const",43,TRUE,"" +1,"single_const",44,TRUE,"" +1,"single_const",45,TRUE,"" +1,"single_const",46,TRUE,"" +1,"single_const",47,TRUE,"" +1,"single_const",48,TRUE,"" +1,"single_const",49,TRUE,"" +1,"single_const",50,TRUE,"" +2,"single_eN",1,TRUE,"" +2,"single_eN",2,TRUE,"" +2,"single_eN",3,TRUE,"" +2,"single_eN",4,TRUE,"" +2,"single_eN",5,TRUE,"" +2,"single_eN",6,TRUE,"" +2,"single_eN",7,TRUE,"" +2,"single_eN",8,TRUE,"" +2,"single_eN",9,TRUE,"" +2,"single_eN",10,TRUE,"" +2,"single_eN",11,TRUE,"" +2,"single_eN",12,TRUE,"" +2,"single_eN",13,TRUE,"" +2,"single_eN",14,TRUE,"" +2,"single_eN",15,TRUE,"" +2,"single_eN",16,TRUE,"" +2,"single_eN",17,TRUE,"" +2,"single_eN",18,TRUE,"" +2,"single_eN",19,TRUE,"" +2,"single_eN",20,TRUE,"" +2,"single_eN",21,TRUE,"" +2,"single_eN",22,TRUE,"" +2,"single_eN",23,TRUE,"" +2,"single_eN",24,TRUE,"" +2,"single_eN",25,TRUE,"" +2,"single_eN",26,TRUE,"" +2,"single_eN",27,TRUE,"" +2,"single_eN",28,TRUE,"" +2,"single_eN",29,TRUE,"" +2,"single_eN",30,TRUE,"" +2,"single_eN",31,TRUE,"" +2,"single_eN",32,TRUE,"" +2,"single_eN",33,TRUE,"" +2,"single_eN",34,TRUE,"" +2,"single_eN",35,TRUE,"" +2,"single_eN",36,TRUE,"" +2,"single_eN",37,TRUE,"" +2,"single_eN",38,TRUE,"" +2,"single_eN",39,TRUE,"" +2,"single_eN",40,TRUE,"" +2,"single_eN",41,TRUE,"" +2,"single_eN",42,TRUE,"" +2,"single_eN",43,TRUE,"" +2,"single_eN",44,TRUE,"" +2,"single_eN",45,TRUE,"" +2,"single_eN",46,TRUE,"" 
+2,"single_eN",47,TRUE,"" +2,"single_eN",48,TRUE,"" +2,"single_eN",49,TRUE,"" +2,"single_eN",50,TRUE,"" +3,"I2_migration",1,TRUE,"" +3,"I2_migration",2,TRUE,"" +3,"I2_migration",3,TRUE,"" +3,"I2_migration",4,TRUE,"" +3,"I2_migration",5,TRUE,"" +3,"I2_migration",6,TRUE,"" +3,"I2_migration",7,TRUE,"" +3,"I2_migration",8,TRUE,"" +3,"I2_migration",9,TRUE,"" +3,"I2_migration",10,TRUE,"" +3,"I2_migration",11,TRUE,"" +3,"I2_migration",12,TRUE,"" +3,"I2_migration",13,TRUE,"" +3,"I2_migration",14,TRUE,"" +3,"I2_migration",15,TRUE,"" +3,"I2_migration",16,TRUE,"" +3,"I2_migration",17,TRUE,"" +3,"I2_migration",18,TRUE,"" +3,"I2_migration",19,TRUE,"" +3,"I2_migration",20,TRUE,"" +3,"I2_migration",21,TRUE,"" +3,"I2_migration",22,TRUE,"" +3,"I2_migration",23,TRUE,"" +3,"I2_migration",24,TRUE,"" +3,"I2_migration",25,TRUE,"" +3,"I2_migration",26,TRUE,"" +3,"I2_migration",27,TRUE,"" +3,"I2_migration",28,TRUE,"" +3,"I2_migration",29,TRUE,"" +3,"I2_migration",30,TRUE,"" +3,"I2_migration",31,TRUE,"" +3,"I2_migration",32,TRUE,"" +3,"I2_migration",33,TRUE,"" +3,"I2_migration",34,TRUE,"" +3,"I2_migration",35,TRUE,"" +3,"I2_migration",36,TRUE,"" +3,"I2_migration",37,TRUE,"" +3,"I2_migration",38,TRUE,"" +3,"I2_migration",39,TRUE,"" +3,"I2_migration",40,TRUE,"" +3,"I2_migration",41,TRUE,"" +3,"I2_migration",42,TRUE,"" +3,"I2_migration",43,TRUE,"" +3,"I2_migration",44,TRUE,"" +3,"I2_migration",45,TRUE,"" +3,"I2_migration",46,TRUE,"" +3,"I2_migration",47,TRUE,"" +3,"I2_migration",48,TRUE,"" +3,"I2_migration",49,TRUE,"" +3,"I2_migration",50,TRUE,"" +4,"I2_en_join",1,TRUE,"" +4,"I2_en_join",2,TRUE,"" +4,"I2_en_join",3,TRUE,"" +4,"I2_en_join",4,TRUE,"" +4,"I2_en_join",5,TRUE,"" +4,"I2_en_join",6,TRUE,"" +4,"I2_en_join",7,TRUE,"" +4,"I2_en_join",8,TRUE,"" +4,"I2_en_join",9,TRUE,"" +4,"I2_en_join",10,TRUE,"" +4,"I2_en_join",11,TRUE,"" +4,"I2_en_join",12,TRUE,"" +4,"I2_en_join",13,TRUE,"" +4,"I2_en_join",14,TRUE,"" +4,"I2_en_join",15,TRUE,"" +4,"I2_en_join",16,TRUE,"" +4,"I2_en_join",17,TRUE,"" 
+4,"I2_en_join",18,TRUE,"" +4,"I2_en_join",19,TRUE,"" +4,"I2_en_join",20,TRUE,"" +4,"I2_en_join",21,TRUE,"" +4,"I2_en_join",22,TRUE,"" +4,"I2_en_join",23,TRUE,"" +4,"I2_en_join",24,TRUE,"" +4,"I2_en_join",25,TRUE,"" +4,"I2_en_join",26,TRUE,"" +4,"I2_en_join",27,TRUE,"" +4,"I2_en_join",28,TRUE,"" +4,"I2_en_join",29,TRUE,"" +4,"I2_en_join",30,TRUE,"" +4,"I2_en_join",31,TRUE,"" +4,"I2_en_join",32,TRUE,"" +4,"I2_en_join",33,TRUE,"" +4,"I2_en_join",34,TRUE,"" +4,"I2_en_join",35,TRUE,"" +4,"I2_en_join",36,TRUE,"" +4,"I2_en_join",37,TRUE,"" +4,"I2_en_join",38,TRUE,"" +4,"I2_en_join",39,TRUE,"" +4,"I2_en_join",40,TRUE,"" +4,"I2_en_join",41,TRUE,"" +4,"I2_en_join",42,TRUE,"" +4,"I2_en_join",43,TRUE,"" +4,"I2_en_join",44,TRUE,"" +4,"I2_en_join",45,TRUE,"" +4,"I2_en_join",46,TRUE,"" +4,"I2_en_join",47,TRUE,"" +4,"I2_en_join",48,TRUE,"" +4,"I2_en_join",49,TRUE,"" +4,"I2_en_join",50,TRUE,"" diff --git a/dev/testMaCSTS1.Rmd b/dev/testMaCSTS1.Rmd new file mode 100644 index 00000000..12351260 --- /dev/null +++ b/dev/testMaCSTS1.Rmd @@ -0,0 +1,385 @@ +--- +title: "test MaCSTS 1" +output: html_document +date: "2026-06-03" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## General Notes + +- `MaCSTS()` now returns ts tables per chromosome: records ancestry from MaCS local trees (nodes/edges/intervals), and with `useMacsMut = TRUE` it also records MaCS-generated sites/mutations during simulation. +- TS ancestry/mutation work is split into staged helpers: `simAnc()`, `simMut()`, and `finalizeInbredTs()` (optional, when `inbred=TRUE`). +- `MaCSTS()`is designed to be a part of the high-level `runMacTS()` wrapper, which will handle: + + - 1.TS generation (simAnc, and optionally simMut depending on mode); + - 2.TS-to-founder conversion (logic in `R/makeFoundersFromTs.R`: site sampling/filtering, genMap construction); + - 3.return a MapPop object like runMacs. 
+ +- Current TS chromosome count uses `nChr` (not `maxSites`, `maxSites` will only be in high level wrapper runMacTS). +- Per-chromosome simulation runs in parallel when OpenMP is available (`nThreads`). + +## Goal Of This Notebook + +First validate `useMacsMut = TRUE`, because it should be directly comparable to legacy `MaCS` under the same args and seeds. + +Note: when `useMacsMut = FALSE`, even ancestry (trees without mutation) can differ from MaCS because of RNG state changes! Specifically, it’s one shared RNG state/stream; MaCS (and MaCSTS with useMacsMut=TRUE) consume draws for both ancestry and mutation; MaCSTS with useMacsMut=FALSE skips mutation draws, so change later RNG state. + +## Helpers to Compare MaCSTS with MaCS + +Helper overview: + +- extract_macs_chr(): get site positions + haplotypes from legacy MaCS output. +- extract_ts_chr(): get site positions + haplotypes from TS tables via variant iterator. +- compare_chr(): run strict and order-robust chromosome-level equivalence checks. +- run_case(): run MaCS and MaCSTS(useMacsMut=TRUE) with matched args/seeds and compare. + +```{r} +devtools::load_all() +library(RcppTskit) + +# Convert raw 00/01 display values to integer 0/1 matrix. +to_int01 <- function(x) { + matrix(as.integer(x), nrow = nrow(x), ncol = ncol(x), dimnames = dimnames(x)) +} + +# Read one chromosome from MaCS output (genMap + unpacked haplotypes). +extract_macs_chr <- function(macs_out, chr = 1L, nThreads = 1L) { + pos <- as.numeric(macs_out$genMap[[chr]]) + n_sites <- length(pos) + if (n_sites == 0L) { + return(list(pos = numeric(0), hap = matrix(integer(0), nrow = 0, ncol = 0))) + } + chr_geno <- macs_out$geno[chr] + hap_raw <- AlphaSimR:::getHaplo( + geno = chr_geno, + lociPerChr = as.integer(n_sites), + lociLoc = as.integer(seq_len(n_sites)), + nThreads = as.integer(nThreads) + ) + list(pos = pos, hap = to_int01(hap_raw)) +} + +# Read one chromosome from TS tables (positions + variant genotypes). 
extract_ts_chr <- function(tc_xptr) {
  # Build a tree sequence from a table-collection pointer and walk its variant
  # iterator, collecting one position and one genotype vector per variant.
  #
  # Args:
  #   tc_xptr: pointer handed to RcppTskit::TableCollection$new(xptr = ...).
  #
  # Returns a list with:
  #   pos           - numeric vector of variant positions, in iterator order
  #   hap           - integer matrix with one column per variant; rows follow
  #                   the order of v$genotypes (presumably one row per sample
  #                   haplotype - TODO confirm against RcppTskit)
  #   num_sites     - ts$num_sites() as integer
  #   num_mutations - ts$num_mutations() as integer
  tc <- RcppTskit::TableCollection$new(xptr = tc_xptr)
  ts <- tc$tree_sequence()
  n_samples <- as.integer(ts$num_samples())

  it <- ts$variants()
  # Collect into lists and flatten once after the loop; appending with c()
  # on every iteration copies the whole vector each time.
  pos_parts <- list()
  cols <- list()
  repeat {
    v <- it$next_variant()
    if (is.null(v)) break
    k <- length(cols) + 1L
    pos_parts[[k]] <- as.numeric(v$position)
    cols[[k]] <- as.integer(v$genotypes)
  }
  pos <- as.numeric(unlist(pos_parts, use.names = FALSE))

  # With no variants, still return a matrix with the right number of rows so
  # downstream ncol()/nrow() calls behave.
  hap <- if (length(cols) == 0L) {
    matrix(integer(0), nrow = n_samples, ncol = 0L)
  } else {
    do.call(cbind, cols)
  }

  list(
    pos = pos,
    hap = hap,
    num_sites = as.integer(ts$num_sites()),
    num_mutations = as.integer(ts$num_mutations())
  )
}

# Sort position/genotype columns by position for strict positional matching.
# Input is a list with `pos` (numeric) and `hap` (matrix, one column per site).
# order() is stable, so sites sharing a position keep their relative order.
# Note: an empty input is returned unchanged, while a non-empty input yields a
# list holding only `pos` and `hap` (any other elements are dropped).
sort_by_pos <- function(x) {
  if (length(x$pos) == 0L) return(x)
  o <- order(x$pos)
  list(pos = x$pos[o], hap = x$hap[, o, drop = FALSE])
}

# Build position+haplotype keys for multiset comparison.
# Each key is "<position>|<column genotypes pasted together>".
# Positions are rendered per element with 15 significant digits. A plain
# format() call would fall back to getOption("digits") (7 by default) and map
# positions that differ only beyond the 7th digit onto the same key, letting
# the multiset check report a false match.
site_hap_keys <- function(pos, hap) {
  if (length(pos) == 0L) return(character(0))
  p <- sprintf("%.15g", as.numeric(pos))
  vapply(
    seq_len(ncol(hap)),
    function(j) paste0(p[j], "|", paste(hap[, j], collapse = "")),
    character(1)
  )
}

# Compare one chromosome:
# 1) strict check after sorting by position,
# 2) multiset check to tolerate column-order differences.
+compare_chr <- function(macs_chr, ts_chr) { + A <- sort_by_pos(macs_chr) + B <- sort_by_pos(ts_chr) + + same_nsites <- ncol(A$hap) == ncol(B$hap) + same_positions_strict <- isTRUE(all.equal(A$pos, B$pos, tolerance = 0)) + same_hap_strict <- identical(A$hap, B$hap) + + # Order-robust check for duplicate-position edge cases + keysA <- site_hap_keys(macs_chr$pos, macs_chr$hap) + keysB <- site_hap_keys(ts_chr$pos, ts_chr$hap) + same_site_hap_multiset <- identical(sort(keysA), sort(keysB)) + + data.frame( + same_nsites = same_nsites, + same_positions_strict = same_positions_strict, + same_hap_strict = same_hap_strict, + same_site_hap_multiset = same_site_hap_multiset, + ts_num_sites = ts_chr$num_sites, + ts_num_mutations = ts_chr$num_mutations + ) +} + +# Run a full matched-seed comparison for nChr chromosomes. +run_case <- function(args, nChr, inbred, ploidy, seed, nThreads = 1L) { + seed_vec <- rep(as.integer(seed), as.integer(nChr)) + + macs <- AlphaSimR:::MaCS( + args = args, + maxSites = rep(0L, as.integer(nChr)), + inbred = inbred, + ploidy = as.integer(ploidy), + nThreads = as.integer(nThreads), + seed = seed_vec + ) + + ts_out <- AlphaSimR:::MaCSTS( + args = args, + nChr = as.integer(nChr), + inbred = inbred, + ploidy = as.integer(ploidy), + nThreads = as.integer(nThreads), + seed = seed_vec, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + + out <- lapply(seq_len(as.integer(nChr)), function(chr) { + m <- extract_macs_chr(macs, chr = chr, nThreads = nThreads) + t <- extract_ts_chr(ts_out$tables[[chr]]) + cbind(chr = chr, compare_chr(m, t)) + }) + do.call(rbind, out) +} +``` + +## General Case +In the first case, I will use some more detail results to show what are the objects being compared here: +```{r} +args <- "4 10000 -t 1e-3 -r 1e-2 -s " +seed <- as.integer(42) +inbred <- FALSE +ploidy <- 2L +nChr <- 1L + +macs <- AlphaSimR:::MaCS( + args = args, + maxSites = rep(0L, as.integer(nChr)), + inbred = inbred, + ploidy = ploidy, + seed = seed, + 
nThreads = 1L + ) + + ts_out <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = inbred, + ploidy = ploidy, + seed = seed, + usePhysicalPositions = FALSE, + useMacsMut = TRUE, + nThreads = 1L + ) + +chr = 1L +m <- extract_macs_chr(macs, chr = chr) +t <- extract_ts_chr(ts_out$tables[[chr]]) +``` + +Both genotypes and positions matched between MaCS and MaCSTS with `useMacsMut = TRUE`: + +``` +> print(m) +$pos + [1] 0.1589878 0.1677192 0.1945866 0.2692034 0.2851194 0.2942718 + [7] 0.3008355 0.3116991 0.3385464 0.3423128 0.3543942 0.4430878 +[13] 0.4690506 0.4930433 0.5781099 0.6904748 0.7921847 0.8604606 + +$hap + [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] +[1,] 1 0 0 0 0 0 1 0 1 0 0 0 +[2,] 1 0 1 1 1 1 0 1 0 0 0 0 +[3,] 0 1 1 0 0 0 1 0 1 1 1 1 +[4,] 0 1 1 0 0 0 1 0 0 0 0 0 + [,13] [,14] [,15] [,16] [,17] [,18] +[1,] 0 0 0 0 1 0 +[2,] 0 0 1 0 1 0 +[3,] 0 1 1 0 0 0 +[4,] 1 0 0 1 0 1 +``` +``` +> print(t) +$pos + [1] 0.1589878 0.1677192 0.1945866 0.2692034 0.2851194 0.2942718 + [7] 0.3008355 0.3116991 0.3385464 0.3423128 0.3543942 0.4430878 +[13] 0.4690506 0.4930433 0.5781099 0.6904748 0.7921847 0.8604606 + +$hap + [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] +[1,] 1 0 0 0 0 0 1 0 1 0 0 0 +[2,] 1 0 1 1 1 1 0 1 0 0 0 0 +[3,] 0 1 1 0 0 0 1 0 1 1 1 1 +[4,] 0 1 1 0 0 0 1 0 0 0 0 0 + [,13] [,14] [,15] [,16] [,17] [,18] +[1,] 0 0 0 0 1 0 +[2,] 0 0 1 0 1 0 +[3,] 0 1 1 0 0 0 +[4,] 1 0 0 1 0 1 + +$num_sites +[1] 18 + +$num_mutations +[1] 18 +``` + +Or we can run: +```{r} +z <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +``` +`same_hap_strict = TRUE` plus `same_positions_strict = TRUE` means direct compatibility matched exactly: +``` +> print(z) + chr same_nsites same_positions_strict same_hap_strict +1 1 TRUE TRUE TRUE + same_site_hap_multiset ts_num_sites ts_num_mutations +1 TRUE 18 18 +``` + +## More cases +Now I will run a scenario sweep with 5 different scenarios, and all of 
them show the same result that MaCSTS with `useMacsMut = TRUE` can reproduce the same sites and haplotypes as MaCS. + +### Scenario1: ploidy = 1L +```{r} +args <- "4 10000 -t 1e-3 -r 1e-2 -s " +seed <- as.integer(42) +inbred <- FALSE +ploidy <- 1L +nChr <- 1L +z1 <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +print(z1) +``` + +### Scenario2: inbred = TRUE +```{r} +args <- "4 10000 -t 1e-3 -r 1e-2 -s " +seed <- as.integer(42) +inbred <- TRUE +ploidy <- 2L +nChr <- 1L +z2 <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +print(z2) +``` + +### Scenario3: demography +```{r} +# eN 0.2 2.0: at time 0.2, size jumps to 2x baseline. +# eN 1.0 0.5: at time 1.0, size jumps to 0.5x baseline. +args <- "4 10000 -t 1e-3 -r 1e-2 -eN 0.2 2.0 -eN 1.0 0.5 -s " +seed <- as.integer(42) +inbred <- FALSE +ploidy <- 2L +nChr <- 1L +z3 <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +print(z3) +``` + + +### Scenario4: 2 populations after a split +```{r} +# -I: 8 sampled chromosomes from 2 pops total, with 4 from pop 1 and 4 from pop 2 +# -ej: at time 1.0, pop 2 merges into pop 1 (backward), which corresponds to a split from pop 1 to pop 2 forward in time. 
+args <- "8 10000 -t 1e-3 -r 1e-2 -I 2 4 4 -ej 1.0 2 1 -s " +seed <- as.integer(42) +inbred <- FALSE +ploidy <- 2L +nChr <- 1L + +ts_out4 <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = inbred, + ploidy = ploidy, + seed = seed, + usePhysicalPositions = FALSE, + useMacsMut = TRUE, + nThreads = 1L +) + +# read population info from node table for the 4th node (which should be in pop 2 after the split) +tc4 <- RcppTskit::TableCollection$new(xptr = ts_out4$tables[[1]]) +tc4$node_table_get_row(4)['population'] +tc4$node_table_get_row(3)['population'] + +z4 <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +print(z4) +``` + +### Scenario5: 2 populations with a migration +```{r} +# -I: 8 sampled chromosomes from 2 pops total, with 4 from pop 1 and 4 from pop 2 +# at backward time t = 0.5, set the global migration-rate parameter to 5e-3 for all populations +args <- "8 10000 -t 1e-3 -r 1e-2 -I 2 4 4 -eM 0.5 5e-3 -s " +seed <- as.integer(42) +inbred <- FALSE +ploidy <- 2L +nChr <- 1L + +z5 <- run_case( + args = args, + nChr = 1L, + inbred = inbred, + ploidy = ploidy, + seed = seed + ) +print(z5) +``` + + diff --git a/dev/testMaCSTS1.html b/dev/testMaCSTS1.html new file mode 100644 index 00000000..1330863b --- /dev/null +++ b/dev/testMaCSTS1.html @@ -0,0 +1,816 @@ + + + + + + + + + + + + + + +test MaCSTS 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

General Notes

+
    +
  • MaCSTS() now returns ts tables per chromosome: +records ancestry from MaCS local trees (nodes/edges/intervals), and with +useMacsMut = TRUE it also records MaCS-generated +sites/mutations during simulation.

  • +
  • TS ancestry/mutation work is split into staged helpers: +simAnc(), simMut(), and +finalizeInbredTs() (optional, when +inbred=TRUE).

  • +
  • MaCSTS() is designed to be part of the high-level +runMacTS() wrapper, which will handle:

    +
      +
    • 1.TS generation (simAnc, and optionally simMut depending on +mode);
    • +
    • 2.TS-to-founder conversion (logic in +R/makeFoundersFromTs.R: site sampling/filtering, genMap +construction);
    • +
    • 3.return a MapPop object like runMacs.
    • +
  • +
  • The TS chromosome count is currently taken from nChr (not +maxSites; maxSites will only be used in the high-level +wrapper runMacTS).

  • +
  • Per-chromosome simulation runs in parallel when OpenMP is +available (nThreads).

  • +
+
+
+

Goal Of This Notebook

+

First validate useMacsMut = TRUE, because it should be +directly comparable to legacy MaCS under the same args and +seeds.

+

Note: when useMacsMut = FALSE, even the ancestry (trees +without mutations) can differ from MaCS because the RNG state diverges. +Specifically, ancestry and mutation share a single RNG state/stream: MaCS (and MaCSTS with +useMacsMut=TRUE) consumes draws for both ancestry and mutation, whereas MaCSTS +with useMacsMut=FALSE skips the mutation draws, which changes every later RNG +state.

+
+
+

Helpers to Compare MaCSTS with MaCS

+

Helper overview:

+
    +
  • extract_macs_chr(): get site positions + haplotypes from legacy MaCS +output.
  • +
  • extract_ts_chr(): get site positions + haplotypes from TS tables via +variant iterator.
  • +
  • compare_chr(): run strict and order-robust chromosome-level +equivalence checks.
  • +
  • run_case(): run MaCS and MaCSTS(useMacsMut=TRUE) with matched +args/seeds and compare.
  • +
+
devtools::load_all()
+
## ℹ Loading AlphaSimR
+
library(RcppTskit)
+
+# Coerce a matrix of raw 00/01 display values to an integer 0/1 matrix,
+# keeping its shape and dimnames.
+to_int01 <- function(x) {
+  shape <- dim(x)
+  matrix(
+    as.integer(x),
+    nrow = shape[1L],
+    ncol = shape[2L],
+    dimnames = dimnames(x)
+  )
+}
+
+# Pull one chromosome out of a MaCS result: site positions come from genMap,
+# haplotypes are unpacked with AlphaSimR:::getHaplo() over all sites.
+# Returns list(pos, hap); both are empty when the chromosome has no sites.
+extract_macs_chr <- function(macs_out, chr = 1L, nThreads = 1L) {
+  site_pos <- as.numeric(macs_out$genMap[[chr]])
+  site_count <- length(site_pos)
+  # No sites: return empty containers without calling getHaplo().
+  if (site_count == 0L) {
+    empty_hap <- matrix(integer(0), nrow = 0, ncol = 0)
+    return(list(pos = numeric(0), hap = empty_hap))
+  }
+  unpacked <- AlphaSimR:::getHaplo(
+    geno = macs_out$geno[chr],
+    lociPerChr = as.integer(site_count),
+    lociLoc = as.integer(seq_len(site_count)),
+    nThreads = as.integer(nThreads)
+  )
+  list(pos = site_pos, hap = to_int01(unpacked))
+}
+
+# Read one chromosome from TS tables (positions + variant genotypes).
+#
+# tc_xptr: external pointer to a table collection, as stored in
+#   MaCSTS()$tables[[chr]].
+# Returns a list with
+#   pos            numeric site positions, in variant-iteration order,
+#   hap            integer matrix with one column per variant,
+#   num_sites      site count reported by the tree sequence,
+#   num_mutations  mutation count reported by the tree sequence.
+extract_ts_chr <- function(tc_xptr) {
+  tc <- RcppTskit::TableCollection$new(xptr = tc_xptr)
+  ts <- tc$tree_sequence()
+  n_samples <- as.integer(ts$num_samples())
+
+  # Collect positions and genotype columns in lists and combine them once at
+  # the end, instead of copying `pos` with c() on every variant.
+  it <- ts$variants()
+  pos_list <- list()
+  cols <- list()
+  repeat {
+    v <- it$next_variant()
+    if (is.null(v)) break
+    k <- length(cols) + 1L
+    pos_list[[k]] <- as.numeric(v$position)
+    cols[[k]] <- as.integer(v$genotypes)
+  }
+
+  # as.numeric() turns the NULL that unlist() gives for no variants into
+  # numeric(0).
+  pos <- as.numeric(unlist(pos_list, use.names = FALSE))
+  hap <- if (length(cols) == 0L) {
+    matrix(integer(0), nrow = n_samples, ncol = 0L)
+  } else {
+    do.call(cbind, cols)
+  }
+
+  list(
+    pos = pos,
+    hap = hap,
+    num_sites = as.integer(ts$num_sites()),
+    num_mutations = as.integer(ts$num_mutations())
+  )
+}
+
+# Reorder sites (positions and the matching haplotype columns) by increasing
+# position so two chromosomes can be compared column by column.
+# An input without sites is returned unchanged; otherwise only `pos` and
+# `hap` are kept in the result.
+sort_by_pos <- function(x) {
+  if (length(x$pos) > 0L) {
+    idx <- order(x$pos)
+    x <- list(pos = x$pos[idx], hap = x$hap[, idx, drop = FALSE])
+  }
+  x
+}
+
+# Build position+haplotype keys for multiset comparison.
+#
+# pos: numeric vector of site positions.
+# hap: matrix with one column per site, in the same order as `pos`.
+# Returns one "position|haplotype" string per column of `hap`
+# (character(0) when `pos` is empty).
+site_hap_keys <- function(pos, hap) {
+  if (length(pos) == 0L) return(character(0))
+  # Format every position on its own with 15 significant digits. format()
+  # without `digits` uses getOption("digits") (7 by default), which would give
+  # positions that differ beyond the 7th digit the same key.
+  p <- sprintf("%.15g", pos)
+  vapply(
+    seq_len(ncol(hap)),
+    function(j) paste0(p[j], "|", paste(hap[, j], collapse = "")),
+    character(1)
+  )
+}
+
+# Compare one chromosome from MaCS (macs_chr) with the same chromosome read
+# from the tree sequence (ts_chr). Returns a one-row data.frame of checks:
+#   - strict: same positions and haplotypes after sorting by position,
+#   - multiset: same "position|haplotype" keys regardless of column order.
+compare_chr <- function(macs_chr, ts_chr) {
+  macs_sorted <- sort_by_pos(macs_chr)
+  ts_sorted <- sort_by_pos(ts_chr)
+
+  # Keys are built from the unsorted inputs and sorted as strings, so sites
+  # that share a position cannot fail this check through column order alone.
+  macs_keys <- sort(site_hap_keys(macs_chr$pos, macs_chr$hap))
+  ts_keys <- sort(site_hap_keys(ts_chr$pos, ts_chr$hap))
+
+  data.frame(
+    same_nsites = ncol(macs_sorted$hap) == ncol(ts_sorted$hap),
+    same_positions_strict = isTRUE(
+      all.equal(macs_sorted$pos, ts_sorted$pos, tolerance = 0)
+    ),
+    same_hap_strict = identical(macs_sorted$hap, ts_sorted$hap),
+    same_site_hap_multiset = identical(macs_keys, ts_keys),
+    ts_num_sites = ts_chr$num_sites,
+    ts_num_mutations = ts_chr$num_mutations
+  )
+}
+
+# Run MaCS and MaCSTS(useMacsMut = TRUE) on identical args and seeds and
+# compare them chromosome by chromosome.
+# Returns the compare_chr() rows stacked into one data.frame, each prefixed
+# with its chromosome index in column `chr`.
+run_case <- function(args, nChr, inbred, ploidy, seed, nThreads = 1L) {
+  n_chr <- as.integer(nChr)
+  n_threads <- as.integer(nThreads)
+  ploidy_int <- as.integer(ploidy)
+  # The same seed value(s) are repeated for every chromosome.
+  chr_seeds <- rep(as.integer(seed), n_chr)
+
+  legacy <- AlphaSimR:::MaCS(
+    args = args,
+    maxSites = rep(0L, n_chr),
+    inbred = inbred,
+    ploidy = ploidy_int,
+    nThreads = n_threads,
+    seed = chr_seeds
+  )
+
+  recorded <- AlphaSimR:::MaCSTS(
+    args = args,
+    nChr = n_chr,
+    inbred = inbred,
+    ploidy = ploidy_int,
+    nThreads = n_threads,
+    seed = chr_seeds,
+    usePhysicalPositions = FALSE,
+    useMacsMut = TRUE
+  )
+
+  per_chr <- vector("list", n_chr)
+  for (chr in seq_len(n_chr)) {
+    macs_chr <- extract_macs_chr(legacy, chr = chr, nThreads = nThreads)
+    ts_chr <- extract_ts_chr(recorded$tables[[chr]])
+    per_chr[[chr]] <- cbind(chr = chr, compare_chr(macs_chr, ts_chr))
+  }
+  do.call(rbind, per_chr)
+}
+
+
+

General Case

+

In the first case, I show more detailed results to illustrate which +objects are being compared:

+
args <- "4 10000 -t 1e-3 -r 1e-2 -s "
+seed <- as.integer(42)
+inbred <- FALSE
+ploidy <- 2L
+nChr <- 1L
+
+macs <- AlphaSimR:::MaCS(
+    args = args,
+    maxSites = rep(0L, as.integer(nChr)),
+    inbred = inbred,
+    ploidy = ploidy,
+    seed = seed,
+    nThreads = 1L
+  )
+
+  ts_out <- AlphaSimR:::MaCSTS(
+    args = args,
+    nChr = nChr,
+    inbred = inbred,
+    ploidy = ploidy,
+    seed = seed,
+    usePhysicalPositions = FALSE,
+    useMacsMut = TRUE,
+    nThreads = 1L
+  )
+
+chr = 1L  
+m <- extract_macs_chr(macs, chr = chr)
+t <- extract_ts_chr(ts_out$tables[[chr]])
+

Both genotypes and positions matched between MaCS and MaCSTS with +useMacsMut = TRUE:

+
> print(m)
+$pos
+ [1] 0.1589878 0.1677192 0.1945866 0.2692034 0.2851194 0.2942718
+ [7] 0.3008355 0.3116991 0.3385464 0.3423128 0.3543942 0.4430878
+[13] 0.4690506 0.4930433 0.5781099 0.6904748 0.7921847 0.8604606
+
+$hap
+     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
+[1,]    1    0    0    0    0    0    1    0    1     0     0     0
+[2,]    1    0    1    1    1    1    0    1    0     0     0     0
+[3,]    0    1    1    0    0    0    1    0    1     1     1     1
+[4,]    0    1    1    0    0    0    1    0    0     0     0     0
+     [,13] [,14] [,15] [,16] [,17] [,18]
+[1,]     0     0     0     0     1     0
+[2,]     0     0     1     0     1     0
+[3,]     0     1     1     0     0     0
+[4,]     1     0     0     1     0     1
+
> print(t)
+$pos
+ [1] 0.1589878 0.1677192 0.1945866 0.2692034 0.2851194 0.2942718
+ [7] 0.3008355 0.3116991 0.3385464 0.3423128 0.3543942 0.4430878
+[13] 0.4690506 0.4930433 0.5781099 0.6904748 0.7921847 0.8604606
+
+$hap
+     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
+[1,]    1    0    0    0    0    0    1    0    1     0     0     0
+[2,]    1    0    1    1    1    1    0    1    0     0     0     0
+[3,]    0    1    1    0    0    0    1    0    1     1     1     1
+[4,]    0    1    1    0    0    0    1    0    0     0     0     0
+     [,13] [,14] [,15] [,16] [,17] [,18]
+[1,]     0     0     0     0     1     0
+[2,]     0     0     1     0     1     0
+[3,]     0     1     1     0     0     0
+[4,]     1     0     0     1     0     1
+
+$num_sites
+[1] 18
+
+$num_mutations
+[1] 18
+

Or we can run:

+
z <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+

same_hap_strict = TRUE together with +same_positions_strict = TRUE means that the MaCS and MaCSTS outputs +match exactly:

+
> print(z)
+  chr same_nsites same_positions_strict same_hap_strict
+1   1        TRUE                  TRUE            TRUE
+  same_site_hap_multiset ts_num_sites ts_num_mutations
+1                   TRUE           18               18
+
+
+

More cases

+

Next I run a sweep over 5 different scenarios; in all +of them, MaCSTS with +useMacsMut = TRUE reproduces the same sites and +haplotypes as MaCS.

+
+

Scenario1: ploidy = 1L

+
args <- "4 10000 -t 1e-3 -r 1e-2 -s "
+seed <- as.integer(42)
+inbred <- FALSE
+ploidy <- 1L
+nChr <- 1L
+z1 <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+print(z1)
+
##   chr same_nsites same_positions_strict same_hap_strict same_site_hap_multiset
+## 1   1        TRUE                  TRUE            TRUE                   TRUE
+##   ts_num_sites ts_num_mutations
+## 1           18               18
+
+
+

Scenario2: inbred = TRUE

+
args <- "4 10000 -t 1e-3 -r 1e-2 -s "
+seed <- as.integer(42)
+inbred <- TRUE
+ploidy <- 2L
+nChr <- 1L
+z2 <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+print(z2)
+
##   chr same_nsites same_positions_strict same_hap_strict same_site_hap_multiset
+## 1   1        TRUE                  TRUE            TRUE                   TRUE
+##   ts_num_sites ts_num_mutations
+## 1           18               18
+
+
+

Scenario3: demography

+
# eN 0.2 2.0: at time 0.2, size jumps to 2x baseline.
+# eN 1.0 0.5: at time 1.0, size jumps to 0.5x baseline.
+args <- "4 10000 -t 1e-3 -r 1e-2 -eN 0.2 2.0 -eN 1.0 0.5 -s "
+seed <- as.integer(42)
+inbred <- FALSE
+ploidy <- 2L
+nChr <- 1L
+z3 <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+print(z3)
+
##   chr same_nsites same_positions_strict same_hap_strict same_site_hap_multiset
+## 1   1        TRUE                  TRUE            TRUE                   TRUE
+##   ts_num_sites ts_num_mutations
+## 1           23               23
+
+
+

Scenario4: 2 populations after a split

+
# -I: 8 sampled chromosomes from 2 pops total, with 4 from pop 1 and 4 from pop 2
+# -ej: at time 1.0, pop 2 merges into pop 1 (backward), which corresponds to a split from pop 1 to pop 2 forward in time.
+args <- "8 10000 -t 1e-3 -r 1e-2 -I 2 4 4 -ej 1.0 2 1 -s "
+seed <- as.integer(42)
+inbred <- FALSE
+ploidy <- 2L
+nChr <- 1L
+
+ts_out4 <- AlphaSimR:::MaCSTS(
+  args = args,
+  nChr = nChr,
+  inbred = inbred,
+  ploidy = ploidy,
+  seed = seed,
+  usePhysicalPositions = FALSE,
+  useMacsMut = TRUE,
+  nThreads = 1L
+)
+
+# read population info from node table for the 4th node (which should be in pop 2 after the split)
+tc4 <- RcppTskit::TableCollection$new(xptr = ts_out4$tables[[1]])
+tc4$node_table_get_row(4)['population']
+
## $population
+## [1] 1
+
tc4$node_table_get_row(3)['population']
+
## $population
+## [1] 0
+
z4 <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+print(z4)
+
##   chr same_nsites same_positions_strict same_hap_strict same_site_hap_multiset
+## 1   1        TRUE                  TRUE            TRUE                   TRUE
+##   ts_num_sites ts_num_mutations
+## 1           63               63
+
+
+

Scenario5: 2 populations with a migration

+
# -I: 8 sampled chromosomes from 2 pops total, with 4 from pop 1 and 4 from pop 2
+# at backward time t = 0.5, set the global migration-rate parameter to 5e-3 for all populations
+args <- "8 10000 -t 1e-3 -r 1e-2 -I 2 4 4 -eM 0.5 5e-3 -s "
+seed <- as.integer(42)
+inbred <- FALSE
+ploidy <- 2L
+nChr <- 1L
+
+z5 <- run_case(
+      args = args,
+      nChr = 1L,
+      inbred = inbred,
+      ploidy = ploidy,
+      seed = seed
+    )
+print(z5)
+
##   chr same_nsites same_positions_strict same_hap_strict same_site_hap_multiset
+## 1   1        TRUE                  TRUE            TRUE                   TRUE
+##   ts_num_sites ts_num_mutations
+## 1         2162             2162
+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/testMaCSTS2.Rmd b/dev/testMaCSTS2.Rmd new file mode 100644 index 00000000..a3b39e71 --- /dev/null +++ b/dev/testMaCSTS2.Rmd @@ -0,0 +1,330 @@ +--- +title: "test MaCSTS 2" +output: html_document +params: + nRep: 50 + nChr: 1 + inbred: false + ploidy: 2 + nThreads: 1 + usePhysicalPositions: true + Nref: 10000 + baseSeed: 700000 + outDir: "testData/out_phase1_useMacsMut_FALSE" + writeTrees: true + applyPostMut: true + mutSeedOffset: 0 +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +``` + +## General Notes +As mentioned in `test MaCSTS 1`, when `useMacsMut = FALSE`, even ancestry (trees without mutation) from MaCSTS can differ from MaCS because of RNG state changes. So here, we can only focus on the comparison on distributions (e.g. tree height, numbers of mutations, nodes and edges). This notebook runs MaCSTS with `useMacsMut = FALSE` to generate ancestry-only tree sequences across multiple scenarios and replicates, then applies post-TS mutation separately. + +## Goal Of This Notebook + +This notebook runs `MaCSTS(..., useMacsMut = FALSE)` across multiple scenarios, +with multiple replicates per scenario. The mutation distributions and other statistics against legacy MaCS outputs will be compared, the CSV files record the seeds, paths and parameters for all runs; the .trees files will be output, enabling downstream analysis in Python/tskit to compare the distributions about trees and mutations, against msprime simulations. + +Output: + +1. `.trees` files per replicate/chromosome +2. a minimal manifest (`CSV`) listing scenario/rep/chromosome/seed/path + +When `applyPostMut = TRUE`, it also applies post-TS mutation using +`tsMutateTableCollection()` with mutation rate `dTheta = seqLen * (-t)` from args. + +## Configuration + +```{r} +library(AlphaSimR) +library(RcppTskit) + +# Safe param getter so chunks also run interactively (without rmarkdown::render). 
+get_param <- function(name, default) { + if (exists("params", inherits = TRUE) && + is.list(params) && + !is.null(params[[name]])) { + return(params[[name]]) + } + default +} + +nRep <- as.integer(get_param("nRep", 50L)) +nChr <- as.integer(get_param("nChr", 1L)) +inbred <- isTRUE(get_param("inbred", FALSE)) +ploidy <- as.integer(get_param("ploidy", 2L)) +nThreads <- as.integer(get_param("nThreads", 1L)) +usePhysicalPositions <- isTRUE(get_param("usePhysicalPositions", TRUE)) +Nref <- as.numeric(get_param("Nref", 10000)) +baseSeed <- as.integer(get_param("baseSeed", 700000L)) +outDir <- as.character(get_param("outDir", "out_phase1_useMacsMut_FALSE")) +writeTrees <- isTRUE(get_param("writeTrees", TRUE)) +applyPostMut <- isTRUE(get_param("applyPostMut", TRUE)) +mutSeedOffset <- as.integer(get_param("mutSeedOffset", 0L)) + +if (nRep <= 0L) stop("nRep must be positive.") +if (nChr <= 0L) stop("nChr must be positive.") +if (ploidy <= 0L) stop("ploidy must be positive.") + +dir.create(outDir, recursive = TRUE, showWarnings = FALSE) +``` + +## Scenario Set + +```{r} +# Keep scenarios connected in multi-pop cases (migration and/or join), +# so runs always finish. +scenarios <- list( + list( + id = 1L, + name = "single_const", + args = "8 100000 -t 1e-3 -r 1e-4 -s " + ), + list( + id = 2L, + name = "single_eN", + args = "8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s " + ), + list( + id = 3L, + name = "I2_migration", + args = "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s " + ), + # -I 2 4 4 1e-2: two populations, 4+4 sampled, with initial migration. + #-en 0.2 2 0.5: pop 2 becomes smaller at time 0.2. + #-ej 1.0 2 1: at time 1.0, pop 2 joins pop 1 backward (equivalent to a forward split). 
+ list( + id = 4L, + name = "I2_en_join", + args = "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s " + ) +) + +scenario_df <- do.call( + rbind, + lapply(scenarios, function(s) data.frame(id = s$id, name = s$name, args = s$args)) +) +scenario_df +``` + +## Helpers + +```{r} +make_seed_vec <- function(scenario_id, rep_id, nChr, baseSeed) { + # Deterministic integer seed vector, one seed per chromosome. + as.integer(baseSeed + scenario_id * 100000L + rep_id * 1000L + seq_len(nChr) - 1L) +} + +parse_dTheta <- function(args) { + tok <- strsplit(as.character(args), "[,[:space:]]+", perl = TRUE)[[1L]] + tok <- tok[nzchar(tok)] + if (length(tok) < 2L) stop("args must include sample size and sequence length.") + seqLen <- as.numeric(tok[2L]) + if (!is.finite(seqLen) || seqLen <= 0) stop("Invalid sequence length in args.") + i_t <- match("-t", tok) + if (is.na(i_t) || i_t >= length(tok)) return(0) + tval <- as.numeric(tok[i_t + 1L]) + if (!is.finite(tval) || tval < 0) stop("Invalid -t value in args.") + seqLen * tval +} + +run_one <- function(sc, rep_id) { + seed_vec <- make_seed_vec(sc$id, rep_id, nChr, baseSeed) + dTheta <- parse_dTheta(sc$args) + mut_seed_vec <- as.integer(seed_vec + mutSeedOffset) + + out <- AlphaSimR:::MaCSTS( + args = sc$args, + nChr = nChr, + inbred = inbred, + ploidy = ploidy, + nThreads = nThreads, + seed = seed_vec, + usePhysicalPositions = usePhysicalPositions, + useMacsMut = FALSE, + Nref = Nref + ) + timeScale <- as.numeric(out$timeScale) + if (!is.finite(timeScale) || timeScale <= 0) { + stop("Invalid timeScale returned by MaCSTS.") + } + # MaCS dTheta is in coalescent-time units. + # If MaCSTS times were rescaled (e.g., to generations), adjust theta accordingly. + dTheta_post <- dTheta / timeScale + # Legacy MaCS run with identical args/seeds for mutation-count reference. 
+ macs <- AlphaSimR:::MaCS( + args = sc$args, + maxSites = rep(0L, nChr), + inbred = inbred, + ploidy = ploidy, + nThreads = nThreads, + seed = seed_vec + ) + + rows <- vector("list", nChr) + for (chr in seq_len(nChr)) { + tc_xptr <- out$tables[[chr]] + if (applyPostMut && dTheta_post > 0) { + AlphaSimR:::tsMutateTableCollection(tc_xptr, dTheta_post, as.numeric(mut_seed_vec[chr])) + } + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + ts <- tc$tree_sequence() + tc_summary <- AlphaSimR:::rtsk_table_collection_summary2(tc_xptr) + + tree_path <- NA_character_ + if (writeTrees) { + tree_path <- file.path( + outDir, + sprintf("%s_rep%02d_chr%02d.trees", sc$name, rep_id, chr) + ) + ts$write(tree_path) + } + + rows[[chr]] <- data.frame( + scenario_id = sc$id, + scenario = sc$name, + rep = rep_id, + chr = chr, + args = sc$args, + seed_chr = seed_vec[chr], + mut_seed_chr = mut_seed_vec[chr], + dTheta = dTheta, + dTheta_post = dTheta_post, + usePhysicalPositions = usePhysicalPositions, + Nref = Nref, + macs_num_mutations = as.numeric(length(macs$genMap[[chr]])), + macsts_num_mutations = as.numeric(tc_summary$num_mutations), + tree_path = tree_path, + stringsAsFactors = FALSE + ) + } + + do.call(rbind, rows) +} +``` + +## Run Replicates + +```{r} +manifest_rows <- vector("list", length(scenarios) * nRep) +status_rows <- vector("list", length(scenarios) * nRep) +k <- 0L + +for (sc in scenarios) { + for (rep_id in seq_len(nRep)) { + k <- k + 1L + run_label <- sprintf("%s rep%02d", sc$name, rep_id) + cat("Running:", run_label, "\n") + + err_msg <- NA_character_ + one <- tryCatch( + run_one(sc, rep_id), + error = function(e) { + err_msg <<- conditionMessage(e) + NULL + } + ) + + manifest_rows[[k]] <- one + status_rows[[k]] <- data.frame( + scenario_id = sc$id, + scenario = sc$name, + rep = rep_id, + success = !is.null(one), + error = ifelse(is.na(err_msg), "", err_msg), + stringsAsFactors = FALSE + ) + } +} + +manifest_df <- do.call(rbind, Filter(Negate(is.null), 
manifest_rows)) +if (is.null(manifest_df)) manifest_df <- data.frame() +run_status_df <- do.call(rbind, status_rows) + +run_status_df +``` + +## Mutation Count Check (MaCS vs MaCSTS ancestry-only) + +Per-replicate mutation counts are not expected to be identical because legacy MaCS mutates during local-tree simulation, while post-TS mutates on the recorded edge tables with its own RNG stream. However, the mutation count distributions should be similar, and the mean difference should be small relative to the mean mutation count. The `mut_stats` table summarizes these comparisons per scenario. + +```{r} +if (nrow(manifest_df) == 0L) { + stop("No successful runs. Check run_status_df$error.") +} + +mut_stats <- do.call( + rbind, + lapply(split(manifest_df, manifest_df$scenario), function(d) { + delta <- d$macs_num_mutations - d$macsts_num_mutations + + sd_macs <- sd(d$macs_num_mutations, na.rm = TRUE) + sd_macsts <- sd(d$macsts_num_mutations, na.rm = TRUE) + sd_delta <- sd(delta, na.rm = TRUE) + + data.frame( + scenario = d$scenario[1], + n = nrow(d), + macs_mut_mean = mean(d$macs_num_mutations, na.rm = TRUE), + macs_mut_sd = sd_macs, + macsts_mut_mean = mean(d$macsts_num_mutations, na.rm = TRUE), + macsts_mut_sd = sd_macsts, + mut_diff_mean = mean(delta, na.rm = TRUE), + mut_diff_sd = sd_delta, + rel_diff = mean(delta, na.rm = TRUE) / + mean(d$macs_num_mutations, na.rm = TRUE), + stringsAsFactors = FALSE + ) + }) +) +mut_stats +``` + +```{r} +library(dplyr) +library(tidyr) +library(ggplot2) +plot_df <- mut_stats %>% + select(scenario, macs_mut_mean, macs_mut_sd, macsts_mut_mean, macsts_mut_sd) %>% + pivot_longer( + cols = -scenario, + names_to = c("method", ".value"), + names_pattern = "(macs|macsts)_mut_(mean|sd)" + ) %>% + mutate(method = recode(method, macs = "MaCS", macsts = "MaCSTS")) + +ggplot(plot_df, aes(x = scenario, y = mean, fill = method)) + + geom_col(position = position_dodge(width = 0.8), width = 0.7) + + geom_errorbar( + aes(ymin = mean - sd, ymax = 
mean + sd), + position = position_dodge(width = 0.8), + width = 0.2 + ) + + labs(x = "Scenario", y = "Mean # mutations", fill = "Method") + + theme_bw() +``` + + +## Save Outputs + +```{r} +manifest_csv_path <- file.path(outDir, "phase1_macsTS_useMacsMut_FALSE_manifest.csv") +status_csv_path <- file.path(outDir, "phase1_macsTS_useMacsMut_FALSE_run_status.csv") + +write.csv(manifest_df, manifest_csv_path, row.names = FALSE) +write.csv(run_status_df, status_csv_path, row.names = FALSE) + +cat("Saved manifest CSV:", manifest_csv_path, "\n") +cat("Saved run status CSV:", status_csv_path, "\n") +if (writeTrees) { + cat("Saved .trees files under:", outDir, "\n") +} +``` + +## Next Step + +Convert the parameters and use them to run simulations with msprime. +Then we can compare the distributions on number of trees, edges, nodes, mutations and tree heights. diff --git a/dev/testMaCSTS2.html b/dev/testMaCSTS2.html new file mode 100644 index 00000000..6a4e9499 --- /dev/null +++ b/dev/testMaCSTS2.html @@ -0,0 +1,1140 @@ + + + + + + + + + + + + + +test MaCSTS 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

General Notes

+

As mentioned in test MaCSTS 1, when +useMacsMut = FALSE, even the ancestry (trees without mutations) +from MaCSTS can differ from MaCS because the RNG state diverges. So here +we can only compare distributions (e.g. tree height and the +numbers of mutations, nodes and edges). This notebook runs MaCSTS with +useMacsMut = FALSE to generate ancestry-only tree sequences +across multiple scenarios and replicates, then applies post-TS mutation +separately.

+
+
+

Goal Of This Notebook

+

This notebook runs MaCSTS(..., useMacsMut = FALSE) +across multiple scenarios, with multiple replicates per scenario. The +mutation distributions and other statistics are compared against legacy MaCS outputs. +The CSV files record the seeds, paths and parameters +for all runs, and the .trees files are written out to enable downstream +analysis in Python/tskit, where the distributions of tree and +mutation statistics can be compared against msprime simulations.

+

Output:

+
    +
  1. .trees files per replicate/chromosome
  2. +
  3. a minimal manifest (CSV) listing +scenario/rep/chromosome/seed/path
  4. +
+

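When applyPostMut = TRUE, it also applies post-TS +mutation using tsMutateTableCollection() with mutation rate +dTheta = seqLen * t, where seqLen and t (the value given to the -t option) are parsed from args; dTheta is divided by the timeScale returned by MaCSTS before it is applied.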

+
+
+

Configuration

+
library(AlphaSimR)
+library(RcppTskit)
+
+# Look up `name` in the `params` list and fall back to `default` when
+# `params` does not exist, is not a list, or holds no non-NULL entry for
+# `name` (so chunks also run interactively, without rmarkdown::render).
+get_param <- function(name, default) {
+  has_params <- exists("params", inherits = TRUE) && is.list(params)
+  if (!has_params || is.null(params[[name]])) {
+    return(default)
+  }
+  params[[name]]
+}
+
+# Resolve every run setting from `params`, with the same fallbacks as the
+# YAML header so interactive runs behave like rendered ones.
+nRep <- as.integer(get_param("nRep", 50L))
+nChr <- as.integer(get_param("nChr", 1L))
+inbred <- isTRUE(get_param("inbred", FALSE))
+ploidy <- as.integer(get_param("ploidy", 2L))
+nThreads <- as.integer(get_param("nThreads", 1L))
+usePhysicalPositions <- isTRUE(get_param("usePhysicalPositions", TRUE))
+Nref <- as.numeric(get_param("Nref", 10000))
+baseSeed <- as.integer(get_param("baseSeed", 700000L))
+# Same default as the YAML `outDir` param, which is also the directory whose
+# .trees files are listed in .gitignore.
+outDir <- as.character(get_param("outDir", "testData/out_phase1_useMacsMut_FALSE"))
+writeTrees <- isTRUE(get_param("writeTrees", TRUE))
+applyPostMut <- isTRUE(get_param("applyPostMut", TRUE))
+mutSeedOffset <- as.integer(get_param("mutSeedOffset", 0L))
+
+# as.integer() yields NA for unparsable values; report those with the same
+# message instead of failing on an NA condition.
+if (is.na(nRep) || nRep <= 0L) stop("nRep must be positive.")
+if (is.na(nChr) || nChr <= 0L) stop("nChr must be positive.")
+if (is.na(ploidy) || ploidy <= 0L) stop("ploidy must be positive.")
+
+dir.create(outDir, recursive = TRUE, showWarnings = FALSE)
+
+
+

Scenario Set

+
# Keep scenarios connected in multi-pop cases (migration and/or join),
+# so runs always finish.
+scenarios <- list(  # each entry: integer id, short name (used in file names), MaCS-style argument string
+  list(
+    id = 1L,
+    name = "single_const",
+    args = "8 100000 -t 1e-3 -r 1e-4 -s "  # first two tokens are read as sample size and sequence length by parse_dTheta()
+  ),
+  list(
+    id = 2L,
+    name = "single_eN",
+    args = "8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s "
+  ),
+  list(
+    id = 3L,
+    name = "I2_migration",
+    args = "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s "
+  ),
+  # Notes for scenario 4 (I2_en_join) below. -I 2 4 4 1e-2: two populations, 4+4 sampled, with initial migration.
+  # -en 0.2 2 0.5: pop 2 becomes smaller at time 0.2.
+  # -ej 1.0 2 1: at time 1.0, pop 2 joins pop 1 backward (equivalent to a forward split).
+  list(
+    id = 4L,
+    name = "I2_en_join",
+    args = "8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s "
+  )
+)
+
+scenario_df <- do.call(  # tabular view of the scenario list, one row per scenario
+  rbind,
+  lapply(scenarios, function(s) data.frame(id = s$id, name = s$name, args = s$args))
+)
+scenario_df  # printed for the rendered report
+
##   id         name
+## 1  1 single_const
+## 2  2    single_eN
+## 3  3 I2_migration
+## 4  4   I2_en_join
+##                                                                   args
+## 1                                         8 100000 -t 1e-3 -r 1e-4 -s 
+## 2                 8 100000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 1.0 0.5 -s 
+## 3  8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -ej 1.0 2 1 -s 
+## 4 8 100000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s
+
+
+

Helpers

+
make_seed_vec <- function(scenario_id, rep_id, nChr, baseSeed) {
+  # Deterministic integer seed vector, one seed per chromosome: nChr consecutive values starting at baseSeed + scenario_id * 100000 + rep_id * 1000.
+  as.integer(baseSeed + scenario_id * 100000L + rep_id * 1000L + seq_len(nChr) - 1L)  # distinct (scenario, rep, chr) triples get distinct seeds as long as nChr <= 1000 and rep_id <= 99
+}
+
+parse_dTheta <- function(args) {  # returns seqLen * t parsed from a MaCS-style argument string (0 when no usable -t is present)
+  tok <- strsplit(as.character(args), "[,[:space:]]+", perl = TRUE)[[1L]]  # split on runs of commas and/or whitespace
+  tok <- tok[nzchar(tok)]  # drop empty tokens, e.g. from a leading separator
+  if (length(tok) < 2L) stop("args must include sample size and sequence length.")
+  seqLen <- as.numeric(tok[2L])  # second token is taken as the sequence length
+  if (!is.finite(seqLen) || seqLen <= 0) stop("Invalid sequence length in args.")  # also catches NA from a non-numeric token
+  i_t <- match("-t", tok)  # position of the first "-t" token, NA if absent
+  if (is.na(i_t) || i_t >= length(tok)) return(0)  # no -t, or -t is the last token with no value after it
+  tval <- as.numeric(tok[i_t + 1L])  # token following -t
+  if (!is.finite(tval) || tval < 0) stop("Invalid -t value in args.")
+  seqLen * tval
+}
+
+run_one <- function(sc, rep_id) {  # runs one (scenario, replicate) and returns a data.frame with one manifest row per chromosome; reads the configuration globals
+  seed_vec <- make_seed_vec(sc$id, rep_id, nChr, baseSeed)  # one ancestry seed per chromosome
+  dTheta <- parse_dTheta(sc$args)  # seqLen * t taken from the argument string
+  mut_seed_vec <- as.integer(seed_vec + mutSeedOffset)  # mutation seeds = ancestry seeds shifted by mutSeedOffset
+
+  out <- AlphaSimR:::MaCSTS(
+    args = sc$args,
+    nChr = nChr,
+    inbred = inbred,
+    ploidy = ploidy,
+    nThreads = nThreads,
+    seed = seed_vec,
+    usePhysicalPositions = usePhysicalPositions,
+    useMacsMut = FALSE,  # ancestry only; mutations are added further down when applyPostMut is TRUE
+    Nref = Nref
+  )
+  timeScale <- as.numeric(out$timeScale)  # NOTE(review): exact definition of timeScale is not visible here - confirm against MaCSTS
+  if (!is.finite(timeScale) || timeScale <= 0) {
+    stop("Invalid timeScale returned by MaCSTS.")
+  }
+  # MaCS dTheta is in coalescent-time units.
+  # If MaCSTS times were rescaled (e.g., to generations), adjust theta accordingly.
+  dTheta_post <- dTheta / timeScale
+  # Legacy MaCS run with identical args/seeds for mutation-count reference.
+  macs <- AlphaSimR:::MaCS(
+    args = sc$args,
+    maxSites = rep(0L, nChr),  # NOTE(review): presumably 0 means "no cap on sites" - confirm against MaCS
+    inbred = inbred,
+    ploidy = ploidy,
+    nThreads = nThreads,
+    seed = seed_vec
+  )
+
+  rows <- vector("list", nChr)  # preallocated, one manifest row per chromosome
+  for (chr in seq_len(nChr)) {
+    tc_xptr <- out$tables[[chr]]  # table collection of this chromosome as returned by MaCSTS
+    if (applyPostMut && dTheta_post > 0) {  # skip when disabled or when the rate is zero
+      AlphaSimR:::tsMutateTableCollection(tc_xptr, dTheta_post, as.numeric(mut_seed_vec[chr]))  # NOTE(review): return value is unused, so this presumably mutates the tables behind tc_xptr in place - confirm
+    }
+    tc <- RcppTskit::TableCollection$new(xptr = tc_xptr)
+    ts <- tc$tree_sequence()
+    tc_summary <- AlphaSimR:::rtsk_table_collection_summary2(tc_xptr)
+
+    tree_path <- NA_character_  # stays NA when writeTrees is FALSE
+    if (writeTrees) {
+      tree_path <- file.path(
+        outDir,
+        sprintf("%s_rep%02d_chr%02d.trees", sc$name, rep_id, chr)  # e.g. single_const_rep01_chr01.trees
+      )
+      ts$write(tree_path)
+    }
+
+    rows[[chr]] <- data.frame(
+      scenario_id = sc$id,
+      scenario = sc$name,
+      rep = rep_id,
+      chr = chr,
+      args = sc$args,
+      seed_chr = seed_vec[chr],
+      mut_seed_chr = mut_seed_vec[chr],
+      dTheta = dTheta,
+      dTheta_post = dTheta_post,
+      usePhysicalPositions = usePhysicalPositions,
+      Nref = Nref,
+      macs_num_mutations = as.numeric(length(macs$genMap[[chr]])),  # NOTE(review): assumes genMap[[chr]] holds one entry per MaCS site - confirm
+      macsts_num_mutations = as.numeric(tc_summary$num_mutations),  # count taken from the table-collection summary
+      tree_path = tree_path,
+      stringsAsFactors = FALSE
+    )
+  }
+
+  do.call(rbind, rows)  # stack the per-chromosome rows
+}
+
+
+

Run Replicates

+
manifest_rows <- vector("list", length(scenarios) * nRep)  # one slot per (scenario, replicate)
+status_rows <- vector("list", length(scenarios) * nRep)
+k <- 0L  # running slot index
+
+for (sc in scenarios) {
+  for (rep_id in seq_len(nRep)) {
+    k <- k + 1L
+    run_label <- sprintf("%s rep%02d", sc$name, rep_id)
+    cat("Running:", run_label, "\n")
+
+    err_msg <- NA_character_  # reset for every run; filled in by the error handler
+    one <- tryCatch(
+      run_one(sc, rep_id),
+      error = function(e) {
+        err_msg <<- conditionMessage(e)  # <<- so the message reaches err_msg outside the handler
+        NULL  # a failed run yields NULL instead of aborting the whole loop
+      }
+    )
+
+    manifest_rows[[k]] <- one  # NOTE: when `one` is NULL this deletes slot k (the list shrinks) rather than storing NULL; harmless because NULL slots are filtered out before rbind
+    status_rows[[k]] <- data.frame(
+      scenario_id = sc$id,
+      scenario = sc$name,
+      rep = rep_id,
+      success = !is.null(one),
+      error = ifelse(is.na(err_msg), "", err_msg),  # empty string for successful runs
+      stringsAsFactors = FALSE
+    )
+  }
+}
+
## Running: single_const rep01 
+## Running: single_const rep02 
+## Running: single_const rep03 
+## Running: single_const rep04 
+## Running: single_const rep05 
+## Running: single_const rep06 
+## Running: single_const rep07 
+## Running: single_const rep08 
+## Running: single_const rep09 
+## Running: single_const rep10 
+## Running: single_const rep11 
+## Running: single_const rep12 
+## Running: single_const rep13 
+## Running: single_const rep14 
+## Running: single_const rep15 
+## Running: single_const rep16 
+## Running: single_const rep17 
+## Running: single_const rep18 
+## Running: single_const rep19 
+## Running: single_const rep20 
+## Running: single_const rep21 
+## Running: single_const rep22 
+## Running: single_const rep23 
+## Running: single_const rep24 
+## Running: single_const rep25 
+## Running: single_const rep26 
+## Running: single_const rep27 
+## Running: single_const rep28 
+## Running: single_const rep29 
+## Running: single_const rep30 
+## Running: single_const rep31 
+## Running: single_const rep32 
+## Running: single_const rep33 
+## Running: single_const rep34 
+## Running: single_const rep35 
+## Running: single_const rep36 
+## Running: single_const rep37 
+## Running: single_const rep38 
+## Running: single_const rep39 
+## Running: single_const rep40 
+## Running: single_const rep41 
+## Running: single_const rep42 
+## Running: single_const rep43 
+## Running: single_const rep44 
+## Running: single_const rep45 
+## Running: single_const rep46 
+## Running: single_const rep47 
+## Running: single_const rep48 
+## Running: single_const rep49 
+## Running: single_const rep50 
+## Running: single_eN rep01 
+## Running: single_eN rep02 
+## Running: single_eN rep03 
+## Running: single_eN rep04 
+## Running: single_eN rep05 
+## Running: single_eN rep06 
+## Running: single_eN rep07 
+## Running: single_eN rep08 
+## Running: single_eN rep09 
+## Running: single_eN rep10 
+## Running: single_eN rep11 
+## Running: single_eN rep12 
+## Running: single_eN rep13 
+## Running: single_eN rep14 
+## Running: single_eN rep15 
+## Running: single_eN rep16 
+## Running: single_eN rep17 
+## Running: single_eN rep18 
+## Running: single_eN rep19 
+## Running: single_eN rep20 
+## Running: single_eN rep21 
+## Running: single_eN rep22 
+## Running: single_eN rep23 
+## Running: single_eN rep24 
+## Running: single_eN rep25 
+## Running: single_eN rep26 
+## Running: single_eN rep27 
+## Running: single_eN rep28 
+## Running: single_eN rep29 
+## Running: single_eN rep30 
+## Running: single_eN rep31 
+## Running: single_eN rep32 
+## Running: single_eN rep33 
+## Running: single_eN rep34 
+## Running: single_eN rep35 
+## Running: single_eN rep36 
+## Running: single_eN rep37 
+## Running: single_eN rep38 
+## Running: single_eN rep39 
+## Running: single_eN rep40 
+## Running: single_eN rep41 
+## Running: single_eN rep42 
+## Running: single_eN rep43 
+## Running: single_eN rep44 
+## Running: single_eN rep45 
+## Running: single_eN rep46 
+## Running: single_eN rep47 
+## Running: single_eN rep48 
+## Running: single_eN rep49 
+## Running: single_eN rep50 
+## Running: I2_migration rep01 
+## Running: I2_migration rep02 
+## Running: I2_migration rep03 
+## Running: I2_migration rep04 
+## Running: I2_migration rep05 
+## Running: I2_migration rep06 
+## Running: I2_migration rep07 
+## Running: I2_migration rep08 
+## Running: I2_migration rep09 
+## Running: I2_migration rep10 
+## Running: I2_migration rep11 
+## Running: I2_migration rep12 
+## Running: I2_migration rep13 
+## Running: I2_migration rep14 
+## Running: I2_migration rep15 
+## Running: I2_migration rep16 
+## Running: I2_migration rep17 
+## Running: I2_migration rep18 
+## Running: I2_migration rep19 
+## Running: I2_migration rep20 
+## Running: I2_migration rep21 
+## Running: I2_migration rep22 
+## Running: I2_migration rep23 
+## Running: I2_migration rep24 
+## Running: I2_migration rep25 
+## Running: I2_migration rep26 
+## Running: I2_migration rep27 
+## Running: I2_migration rep28 
+## Running: I2_migration rep29 
+## Running: I2_migration rep30 
+## Running: I2_migration rep31 
+## Running: I2_migration rep32 
+## Running: I2_migration rep33 
+## Running: I2_migration rep34 
+## Running: I2_migration rep35 
+## Running: I2_migration rep36 
+## Running: I2_migration rep37 
+## Running: I2_migration rep38 
+## Running: I2_migration rep39 
+## Running: I2_migration rep40 
+## Running: I2_migration rep41 
+## Running: I2_migration rep42 
+## Running: I2_migration rep43 
+## Running: I2_migration rep44 
+## Running: I2_migration rep45 
+## Running: I2_migration rep46 
+## Running: I2_migration rep47 
+## Running: I2_migration rep48 
+## Running: I2_migration rep49 
+## Running: I2_migration rep50 
+## Running: I2_en_join rep01 
+## Running: I2_en_join rep02 
+## Running: I2_en_join rep03 
+## Running: I2_en_join rep04 
+## Running: I2_en_join rep05 
+## Running: I2_en_join rep06 
+## Running: I2_en_join rep07 
+## Running: I2_en_join rep08 
+## Running: I2_en_join rep09 
+## Running: I2_en_join rep10 
+## Running: I2_en_join rep11 
+## Running: I2_en_join rep12 
+## Running: I2_en_join rep13 
+## Running: I2_en_join rep14 
+## Running: I2_en_join rep15 
+## Running: I2_en_join rep16 
+## Running: I2_en_join rep17 
+## Running: I2_en_join rep18 
+## Running: I2_en_join rep19 
+## Running: I2_en_join rep20 
+## Running: I2_en_join rep21 
+## Running: I2_en_join rep22 
+## Running: I2_en_join rep23 
+## Running: I2_en_join rep24 
+## Running: I2_en_join rep25 
+## Running: I2_en_join rep26 
+## Running: I2_en_join rep27 
+## Running: I2_en_join rep28 
+## Running: I2_en_join rep29 
+## Running: I2_en_join rep30 
+## Running: I2_en_join rep31 
+## Running: I2_en_join rep32 
+## Running: I2_en_join rep33 
+## Running: I2_en_join rep34 
+## Running: I2_en_join rep35 
+## Running: I2_en_join rep36 
+## Running: I2_en_join rep37 
+## Running: I2_en_join rep38 
+## Running: I2_en_join rep39 
+## Running: I2_en_join rep40 
+## Running: I2_en_join rep41 
+## Running: I2_en_join rep42 
+## Running: I2_en_join rep43 
+## Running: I2_en_join rep44 
+## Running: I2_en_join rep45 
+## Running: I2_en_join rep46 
+## Running: I2_en_join rep47 
+## Running: I2_en_join rep48 
+## Running: I2_en_join rep49 
+## Running: I2_en_join rep50
+
manifest_df <- do.call(rbind, Filter(Negate(is.null), manifest_rows))  # drop NULL slots (failed or unused) and stack the remaining rows
+if (is.null(manifest_df)) manifest_df <- data.frame()  # rbind over an empty list gives NULL; use an empty data.frame so nrow() works
+run_status_df <- do.call(rbind, status_rows)  # one row per (scenario, replicate), successful or not
+
+run_status_df  # printed for the rendered report
+
##     scenario_id     scenario rep success error
+## 1             1 single_const   1    TRUE      
+## 2             1 single_const   2    TRUE      
+## 3             1 single_const   3    TRUE      
+## 4             1 single_const   4    TRUE      
+## 5             1 single_const   5    TRUE      
+## 6             1 single_const   6    TRUE      
+## 7             1 single_const   7    TRUE      
+## 8             1 single_const   8    TRUE      
+## 9             1 single_const   9    TRUE      
+## 10            1 single_const  10    TRUE      
+## 11            1 single_const  11    TRUE      
+## 12            1 single_const  12    TRUE      
+## 13            1 single_const  13    TRUE      
+## 14            1 single_const  14    TRUE      
+## 15            1 single_const  15    TRUE      
+## 16            1 single_const  16    TRUE      
+## 17            1 single_const  17    TRUE      
+## 18            1 single_const  18    TRUE      
+## 19            1 single_const  19    TRUE      
+## 20            1 single_const  20    TRUE      
+## 21            1 single_const  21    TRUE      
+## 22            1 single_const  22    TRUE      
+## 23            1 single_const  23    TRUE      
+## 24            1 single_const  24    TRUE      
+## 25            1 single_const  25    TRUE      
+## 26            1 single_const  26    TRUE      
+## 27            1 single_const  27    TRUE      
+## 28            1 single_const  28    TRUE      
+## 29            1 single_const  29    TRUE      
+## 30            1 single_const  30    TRUE      
+## 31            1 single_const  31    TRUE      
+## 32            1 single_const  32    TRUE      
+## 33            1 single_const  33    TRUE      
+## 34            1 single_const  34    TRUE      
+## 35            1 single_const  35    TRUE      
+## 36            1 single_const  36    TRUE      
+## 37            1 single_const  37    TRUE      
+## 38            1 single_const  38    TRUE      
+## 39            1 single_const  39    TRUE      
+## 40            1 single_const  40    TRUE      
+## 41            1 single_const  41    TRUE      
+## 42            1 single_const  42    TRUE      
+## 43            1 single_const  43    TRUE      
+## 44            1 single_const  44    TRUE      
+## 45            1 single_const  45    TRUE      
+## 46            1 single_const  46    TRUE      
+## 47            1 single_const  47    TRUE      
+## 48            1 single_const  48    TRUE      
+## 49            1 single_const  49    TRUE      
+## 50            1 single_const  50    TRUE      
+## 51            2    single_eN   1    TRUE      
+## 52            2    single_eN   2    TRUE      
+## 53            2    single_eN   3    TRUE      
+## 54            2    single_eN   4    TRUE      
+## 55            2    single_eN   5    TRUE      
+## 56            2    single_eN   6    TRUE      
+## 57            2    single_eN   7    TRUE      
+## 58            2    single_eN   8    TRUE      
+## 59            2    single_eN   9    TRUE      
+## 60            2    single_eN  10    TRUE      
+## 61            2    single_eN  11    TRUE      
+## 62            2    single_eN  12    TRUE      
+## 63            2    single_eN  13    TRUE      
+## 64            2    single_eN  14    TRUE      
+## 65            2    single_eN  15    TRUE      
+## 66            2    single_eN  16    TRUE      
+## 67            2    single_eN  17    TRUE      
+## 68            2    single_eN  18    TRUE      
+## 69            2    single_eN  19    TRUE      
+## 70            2    single_eN  20    TRUE      
+## 71            2    single_eN  21    TRUE      
+## 72            2    single_eN  22    TRUE      
+## 73            2    single_eN  23    TRUE      
+## 74            2    single_eN  24    TRUE      
+## 75            2    single_eN  25    TRUE      
+## 76            2    single_eN  26    TRUE      
+## 77            2    single_eN  27    TRUE      
+## 78            2    single_eN  28    TRUE      
+## 79            2    single_eN  29    TRUE      
+## 80            2    single_eN  30    TRUE      
+## 81            2    single_eN  31    TRUE      
+## 82            2    single_eN  32    TRUE      
+## 83            2    single_eN  33    TRUE      
+## 84            2    single_eN  34    TRUE      
+## 85            2    single_eN  35    TRUE      
+## 86            2    single_eN  36    TRUE      
+## 87            2    single_eN  37    TRUE      
+## 88            2    single_eN  38    TRUE      
+## 89            2    single_eN  39    TRUE      
+## 90            2    single_eN  40    TRUE      
+## 91            2    single_eN  41    TRUE      
+## 92            2    single_eN  42    TRUE      
+## 93            2    single_eN  43    TRUE      
+## 94            2    single_eN  44    TRUE      
+## 95            2    single_eN  45    TRUE      
+## 96            2    single_eN  46    TRUE      
+## 97            2    single_eN  47    TRUE      
+## 98            2    single_eN  48    TRUE      
+## 99            2    single_eN  49    TRUE      
+## 100           2    single_eN  50    TRUE      
+## 101           3 I2_migration   1    TRUE      
+## 102           3 I2_migration   2    TRUE      
+## 103           3 I2_migration   3    TRUE      
+## 104           3 I2_migration   4    TRUE      
+## 105           3 I2_migration   5    TRUE      
+## 106           3 I2_migration   6    TRUE      
+## 107           3 I2_migration   7    TRUE      
+## 108           3 I2_migration   8    TRUE      
+## 109           3 I2_migration   9    TRUE      
+## 110           3 I2_migration  10    TRUE      
+## 111           3 I2_migration  11    TRUE      
+## 112           3 I2_migration  12    TRUE      
+## 113           3 I2_migration  13    TRUE      
+## 114           3 I2_migration  14    TRUE      
+## 115           3 I2_migration  15    TRUE      
+## 116           3 I2_migration  16    TRUE      
+## 117           3 I2_migration  17    TRUE      
+## 118           3 I2_migration  18    TRUE      
+## 119           3 I2_migration  19    TRUE      
+## 120           3 I2_migration  20    TRUE      
+## 121           3 I2_migration  21    TRUE      
+## 122           3 I2_migration  22    TRUE      
+## 123           3 I2_migration  23    TRUE      
+## 124           3 I2_migration  24    TRUE      
+## 125           3 I2_migration  25    TRUE      
+## 126           3 I2_migration  26    TRUE      
+## 127           3 I2_migration  27    TRUE      
+## 128           3 I2_migration  28    TRUE      
+## 129           3 I2_migration  29    TRUE      
+## 130           3 I2_migration  30    TRUE      
+## 131           3 I2_migration  31    TRUE      
+## 132           3 I2_migration  32    TRUE      
+## 133           3 I2_migration  33    TRUE      
+## 134           3 I2_migration  34    TRUE      
+## 135           3 I2_migration  35    TRUE      
+## 136           3 I2_migration  36    TRUE      
+## 137           3 I2_migration  37    TRUE      
+## 138           3 I2_migration  38    TRUE      
+## 139           3 I2_migration  39    TRUE      
+## 140           3 I2_migration  40    TRUE      
+## 141           3 I2_migration  41    TRUE      
+## 142           3 I2_migration  42    TRUE      
+## 143           3 I2_migration  43    TRUE      
+## 144           3 I2_migration  44    TRUE      
+## 145           3 I2_migration  45    TRUE      
+## 146           3 I2_migration  46    TRUE      
+## 147           3 I2_migration  47    TRUE      
+## 148           3 I2_migration  48    TRUE      
+## 149           3 I2_migration  49    TRUE      
+## 150           3 I2_migration  50    TRUE      
+## 151           4   I2_en_join   1    TRUE      
+## 152           4   I2_en_join   2    TRUE      
+## 153           4   I2_en_join   3    TRUE      
+## 154           4   I2_en_join   4    TRUE      
+## 155           4   I2_en_join   5    TRUE      
+## 156           4   I2_en_join   6    TRUE      
+## 157           4   I2_en_join   7    TRUE      
+## 158           4   I2_en_join   8    TRUE      
+## 159           4   I2_en_join   9    TRUE      
+## 160           4   I2_en_join  10    TRUE      
+## 161           4   I2_en_join  11    TRUE      
+## 162           4   I2_en_join  12    TRUE      
+## 163           4   I2_en_join  13    TRUE      
+## 164           4   I2_en_join  14    TRUE      
+## 165           4   I2_en_join  15    TRUE      
+## 166           4   I2_en_join  16    TRUE      
+## 167           4   I2_en_join  17    TRUE      
+## 168           4   I2_en_join  18    TRUE      
+## 169           4   I2_en_join  19    TRUE      
+## 170           4   I2_en_join  20    TRUE      
+## 171           4   I2_en_join  21    TRUE      
+## 172           4   I2_en_join  22    TRUE      
+## 173           4   I2_en_join  23    TRUE      
+## 174           4   I2_en_join  24    TRUE      
+## 175           4   I2_en_join  25    TRUE      
+## 176           4   I2_en_join  26    TRUE      
+## 177           4   I2_en_join  27    TRUE      
+## 178           4   I2_en_join  28    TRUE      
+## 179           4   I2_en_join  29    TRUE      
+## 180           4   I2_en_join  30    TRUE      
+## 181           4   I2_en_join  31    TRUE      
+## 182           4   I2_en_join  32    TRUE      
+## 183           4   I2_en_join  33    TRUE      
+## 184           4   I2_en_join  34    TRUE      
+## 185           4   I2_en_join  35    TRUE      
+## 186           4   I2_en_join  36    TRUE      
+## 187           4   I2_en_join  37    TRUE      
+## 188           4   I2_en_join  38    TRUE      
+## 189           4   I2_en_join  39    TRUE      
+## 190           4   I2_en_join  40    TRUE      
+## 191           4   I2_en_join  41    TRUE      
+## 192           4   I2_en_join  42    TRUE      
+## 193           4   I2_en_join  43    TRUE      
+## 194           4   I2_en_join  44    TRUE      
+## 195           4   I2_en_join  45    TRUE      
+## 196           4   I2_en_join  46    TRUE      
+## 197           4   I2_en_join  47    TRUE      
+## 198           4   I2_en_join  48    TRUE      
+## 199           4   I2_en_join  49    TRUE      
+## 200           4   I2_en_join  50    TRUE
+
+
+

Mutation Count Check (MaCS vs MaCSTS ancestry-only)

+

Per-replicate mutation counts are not expected to be identical +because legacy MaCS mutates during local-tree simulation, while post-TS +mutation is applied to the recorded edge tables with its own RNG stream. However, +the mutation count distributions should be similar, and the mean +difference should be small relative to the mean mutation count. The +mut_stats table summarizes these comparisons per +scenario.

+
if (nrow(manifest_df) == 0L) {  # nothing to summarise when every run failed
+  stop("No successful runs. Check run_status_df$error.")
+}
+
+mut_stats <- do.call(  # one summary row per scenario
+  rbind,
+  lapply(split(manifest_df, manifest_df$scenario), function(d) {  # split() orders the groups by scenario name
+    delta <- d$macs_num_mutations - d$macsts_num_mutations  # per-row difference, MaCS minus MaCSTS
+
+    sd_macs   <- sd(d$macs_num_mutations, na.rm = TRUE)
+    sd_macsts <- sd(d$macsts_num_mutations, na.rm = TRUE)
+    sd_delta  <- sd(delta, na.rm = TRUE)
+
+    data.frame(
+      scenario        = d$scenario[1],
+      n               = nrow(d),  # number of manifest rows (replicate x chromosome) in this scenario
+      macs_mut_mean   = mean(d$macs_num_mutations, na.rm = TRUE),
+      macs_mut_sd     = sd_macs,
+      macsts_mut_mean = mean(d$macsts_num_mutations, na.rm = TRUE),
+      macsts_mut_sd   = sd_macsts,
+      mut_diff_mean   = mean(delta, na.rm = TRUE),
+      mut_diff_sd     = sd_delta,
+      rel_diff        = mean(delta, na.rm = TRUE) /  # mean difference relative to the MaCS mean
+                          mean(d$macs_num_mutations, na.rm = TRUE),
+      stringsAsFactors = FALSE
+    )
+  })
+)
+mut_stats  # printed for the rendered report
+
##                  scenario  n macs_mut_mean macs_mut_sd macsts_mut_mean
+## I2_en_join     I2_en_join 50        496.40    77.10237          480.16
+## I2_migration I2_migration 50        519.28    61.59149          513.06
+## single_const single_const 50        252.06    80.24183          253.70
+## single_eN       single_eN 50        319.62    92.49355          309.74
+##              macsts_mut_sd mut_diff_mean mut_diff_sd     rel_diff
+## I2_en_join        72.67972         16.24    96.26485  0.032715552
+## I2_migration      64.81698          6.22    80.62214  0.011978124
+## single_const      57.57914         -1.64    88.69847 -0.006506387
+## single_eN         86.54592          9.88    88.13756  0.030911708
+
library(dplyr)
+library(tidyr)
+library(ggplot2)
+plot_df <- mut_stats %>%  # reshape to one row per (scenario, method) with columns mean and sd
+    select(scenario, macs_mut_mean, macs_mut_sd, macsts_mut_mean, macsts_mut_sd) %>%
+    pivot_longer(
+        cols = -scenario,
+        names_to = c("method", ".value"),  # first capture group -> method; ".value" turns the second group (mean/sd) into columns
+        names_pattern = "(macs|macsts)_mut_(mean|sd)"  # "macs" is tried first; the regex backtracks to "macsts" when "_mut_" does not follow
+    ) %>%
+    mutate(method = recode(method, macs = "MaCS", macsts = "MaCSTS"))  # display labels for the legend
+
+ggplot(plot_df, aes(x = scenario, y = mean, fill = method)) +
+    geom_col(position = position_dodge(width = 0.8), width = 0.7) +  # side-by-side bars per method
+    geom_errorbar(
+        aes(ymin = mean - sd, ymax = mean + sd),  # error bars span mean +/- 1 SD
+        position = position_dodge(width = 0.8),  # same dodge width as the bars so they line up
+        width = 0.2
+    ) +
+    labs(x = "Scenario", y = "Mean # mutations", fill = "Method") +
+    theme_bw()
+

+
+
+

Save Outputs

+
manifest_csv_path <- file.path(outDir, "phase1_macsTS_useMacsMut_FALSE_manifest.csv")  # per-chromosome manifest (seeds, rates, mutation counts, .trees paths)
+status_csv_path <- file.path(outDir, "phase1_macsTS_useMacsMut_FALSE_run_status.csv")  # success/error record per (scenario, replicate)
+
+write.csv(manifest_df, manifest_csv_path, row.names = FALSE)
+write.csv(run_status_df, status_csv_path, row.names = FALSE)
+
+cat("Saved manifest CSV:", manifest_csv_path, "\n")
+
## Saved manifest CSV: testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv
+
cat("Saved run status CSV:", status_csv_path, "\n")
+
## Saved run status CSV: testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_run_status.csv
+
if (writeTrees) {  # the .trees files themselves were already written inside run_one()
+  cat("Saved .trees files under:", outDir, "\n")
+}
+
## Saved .trees files under: testData/out_phase1_useMacsMut_FALSE
+
+
+

Next Step

+

Convert the parameters and use them to run simulations with msprime. +Then we can compare the distributions of the numbers of trees, edges, nodes and +mutations, and of tree heights.

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/testMaCSTS3.Rmd b/dev/testMaCSTS3.Rmd new file mode 100644 index 00000000..92f58c62 --- /dev/null +++ b/dev/testMaCSTS3.Rmd @@ -0,0 +1,154 @@ +--- +title: "test MaCSTS 3" +output: html_document +date: "2026-06-08" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +``` + +## General Notes + +This notebook follows `testMaCSTS2.Rmd` outputs and then: + +1. runs `msprime_from_macs_scenarios.py` to generate msprime `.trees` and manifest; +2. runs `msprime_macs_scenarios_compare.py` to build a combined long table; +3. compares distributions across methods (`macs`, `macsTS`, `msprime`). + +These tests are important because `MaCSTS` and the msprime bridge both rely on +multiple unit conversions (tests on more scenarios are welcome!): + +- sequence length; +- time unit conversion (coalescent-scaled time vs generations); +- mutation and recombination rate conversions; +- migration parameter conversions (global vs pairwise); +- sample-count mapping when ploidy changes. + +The goal here is not exact replicate-by-replicate identity with msprime, but to +check that scenario-level distributions (mutations, trees, edges, nodes, and +root times) are consistent after these conversions. + +Current order in MaCSTS when `useMacsMut = FALSE`: + +1. During `buildTs()`, local trees are generated along the chromosome and each + interval is recorded to TS edge/node tables (`recordTreeInterval`). +2. No MaCS mutation placement happens in this step (`addMutationsTs` is skipped). +3. At the end of `buildTs()`, ancestry is simplified (`TsRecorder::simplify`). +4. On `release()`, times are rescaled (if `Nref` was provided), then tables are + sorted and indexed. +5. Post-TS mutation (if requested) is applied later by `simMut()` / + `tsMutateTableCollection()` on the already simplified/rescaled ancestry. +6. 
Inbred leaf expansion is optional and can be deferred to + `finalizeInbredTs()` (this is what `simAnc(..., expandInbredSamples = FALSE)` + does in the staged workflow). + +```{r paths} +find_dev_dir <- function() { + if (file.exists("msprime_from_macs_scenarios.py")) return(normalizePath(".", winslash = "/")) + if (file.exists("dev/msprime_from_macs_scenarios.py")) return(normalizePath("dev", winslash = "/")) + stop("Cannot find dev directory containing msprime_from_macs_scenarios.py") +} + +dev_dir <- find_dev_dir() +py_sim_script <- file.path(dev_dir, "msprime_from_macs_scenarios.py") +py_cmp_script <- file.path(dev_dir, "msprime_macs_scenarios_compare.py") + +macsts_manifest <- file.path(dev_dir, "testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv") +msprime_manifest <- file.path(dev_dir, "testData/out_msprime_from_macs/msprime_manifest.csv") +comparison_long_csv <- file.path(dev_dir, "testData/method_comparison_long.csv") + +stopifnot(file.exists(py_sim_script)) +stopifnot(file.exists(py_cmp_script)) +stopifnot(file.exists(macsts_manifest)) +``` + +## Run Python Scripts + +```{r run-python} +run_python_script <- function(dev_dir, script_name, args = character()) { + # Use your own path here! + py <- '/usr/bin/python3' + + old <- getwd() + on.exit(setwd(old), add = TRUE) + setwd(dev_dir) + status <- system2(py, args = c(script_name, args)) + if (!identical(status, 0L)) stop("Failed to run ", script_name, " (exit status ", status, ")") +} + +# 1) Generate msprime replicates + manifest. +# Conversion details implemented in msprime_from_macs_scenarios.py: +# - seq length: token 2nd in MaCS args is passed to msprime sequence_length. +# - time: MaCS scaled times are converted to generations by t_gen = t * 4 * Nref. +# - recombination: rec_rate_bp = r / (4 * Nref). +# - mutation: mut_rate_bp = theta / (4 * Nref), then applied via sim_mutations(). 
+# - migration: +# global M -> pairwise m_ij = M / ((k-1) * 4 * Nref) for k populations; +# pair Mij -> m_ij = Mij / (4 * Nref). +# - samples/ploidy: +# script currently runs msprime with ploidy=2, so MaCS haploid sample counts +# per population are halved (must be even) before passing to `samples=`. +run_python_script( + dev_dir = dev_dir, + script_name = basename(py_sim_script), + args = c( + "--nrep", "50", + "--nchr", "1", + "--nref", "10000", + "--base-seed", "700000", + "--model", "smc_prime", + "--out-dir", "testData/out_msprime_from_macs" + ) +) + +# 2) Build cross-method comparison summary + long table. +# msprime_macs_scenarios_compare.py reads: +# - MaCSTS manifest (from testMaCSTS2), +# - msprime manifest (from step 1), +# and writes a long-format table used below for distribution plots. +run_python_script( + dev_dir = dev_dir, + script_name = basename(py_cmp_script) +) +``` + +## Distribution Comparison Plots + +```{r read-table} +stopifnot(file.exists(comparison_long_csv)) +cmp <- read.csv(comparison_long_csv, stringsAsFactors = FALSE) + + +num_cols <- c("num_mut", "num_trees", "num_edges", "num_nodes", "max_root_time") +for (nm in num_cols) { + cmp[[nm]] <- suppressWarnings(as.numeric(cmp[[nm]])) +} + +library(dplyr) +library(tidyr) +library(ggplot2) + +metrics <- c("num_mut", "num_trees", "num_edges", "num_nodes", "max_root_time") + +long_df <- cmp %>% + pivot_longer(cols = all_of(metrics), names_to = "metric", values_to = "value") %>% + filter(!is.na(value)) + +# Mean + SD bars by scenario/method for each metric +summary_df <- long_df %>% + group_by(Scenarios, Methods, metric) %>% + summarise(mean = mean(value), sd = sd(value), .groups = "drop") + +ggplot(summary_df, aes(x = Scenarios, y = mean, fill = Methods)) + + geom_col(position = position_dodge(width = 0.8), width = 0.7) + + geom_errorbar( + aes(ymin = mean - sd, ymax = mean + sd), + position = position_dodge(width = 0.8), + width = 0.2 + ) + + facet_wrap(~ metric, scales = "free_y") + + 
labs(x = "Scenario", y = "Mean +/- SD", fill = "Method") + + theme_bw() + +``` diff --git a/dev/testMaCSTS3.html b/dev/testMaCSTS3.html new file mode 100644 index 00000000..a7b49c9d --- /dev/null +++ b/dev/testMaCSTS3.html @@ -0,0 +1,554 @@ + + + + + + + + + + + + + + +test MaCSTS 3 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

General Notes

+

This notebook follows testMaCSTS2.Rmd outputs and +then:

+
    +
  1. runs msprime_from_macs_scenarios.py to generate msprime +.trees and manifest;
  2. +
  3. runs msprime_macs_scenarios_compare.py to build a +combined long table;
  4. +
  5. compares distributions across methods (macs, +macsTS, msprime).
  6. +
+

These tests are important because MaCSTS and the msprime +bridge both rely on multiple unit conversions (tests on more scenarios +are welcome!):

+
    +
  • sequence length;
  • +
  • time unit conversion (coalescent-scaled time vs generations);
  • +
  • mutation and recombination rate conversions;
  • +
  • migration parameter conversions (global vs pairwise);
  • +
  • sample-count mapping when ploidy changes.
  • +
+

The goal here is not exact replicate-by-replicate identity with +msprime, but to check that scenario-level distributions (mutations, +trees, edges, nodes, and root times) are consistent after these +conversions.

+

Current order in MaCSTS when useMacsMut = FALSE:

+
    +
  1. During buildTs(), local trees are generated along the +chromosome and each interval is recorded to TS edge/node tables +(recordTreeInterval).
  2. +
  3. No MaCS mutation placement happens in this step +(addMutationsTs is skipped).
  4. +
  5. At the end of buildTs(), ancestry is simplified +(TsRecorder::simplify).
  6. +
  7. On release(), times are rescaled (if Nref +was provided), then tables are sorted and indexed.
  8. +
  9. Post-TS mutation (if requested) is applied later by +simMut() / tsMutateTableCollection() on the +already simplified/rescaled ancestry.
  10. +
  11. Inbred leaf expansion is optional and can be deferred to +finalizeInbredTs() (this is what +simAnc(..., expandInbredSamples = FALSE) does in the staged +workflow).
  12. +
+
# Locate the directory that contains msprime_from_macs_scenarios.py.
# The current working directory is checked first, then a `dev` subdirectory
# of it. Returns the matching directory via normalizePath() with "/" as the
# Windows separator; stops with an error when the script is in neither place.
find_dev_dir <- function() {
+  if (file.exists("msprime_from_macs_scenarios.py")) return(normalizePath(".", winslash = "/"))
+  if (file.exists("dev/msprime_from_macs_scenarios.py")) return(normalizePath("dev", winslash = "/"))
+  stop("Cannot find dev directory containing msprime_from_macs_scenarios.py")
+}
+
+dev_dir <- find_dev_dir()
+py_sim_script <- file.path(dev_dir, "msprime_from_macs_scenarios.py")
+py_cmp_script <- file.path(dev_dir, "msprime_macs_scenarios_compare.py")
+
+macsts_manifest <- file.path(dev_dir, "testData/out_phase1_useMacsMut_FALSE/phase1_macsTS_useMacsMut_FALSE_manifest.csv")
+msprime_manifest <- file.path(dev_dir, "testData/out_msprime_from_macs/msprime_manifest.csv")
+comparison_long_csv <- file.path(dev_dir, "testData/method_comparison_long.csv")
+
+stopifnot(file.exists(py_sim_script))
+stopifnot(file.exists(py_cmp_script))
+stopifnot(file.exists(macsts_manifest))
+
+
+

Run Python Scripts

+
# Run a Python script with the working directory temporarily set to dev_dir.
#   dev_dir     directory to run from
#   script_name script passed as the first argument to the interpreter
#   args        further command-line arguments (character vector)
# The interpreter path is hard-coded below and must be edited per machine.
# Stops with an error unless the exit status returned by system2() is 0L.
run_python_script <- function(dev_dir, script_name, args = character()) {
+  # Use your own path here!
+  py <- '/usr/bin/python3'
+
+  old <- getwd()
+  on.exit(setwd(old), add = TRUE)  # registered before setwd() so the caller's directory is restored even on error
+  setwd(dev_dir)  # the interpreter is launched from dev_dir, so script_name is resolved relative to it
+  status <- system2(py, args = c(script_name, args))
+  if (!identical(status, 0L)) stop("Failed to run ", script_name, " (exit status ", status, ")")
+}
+
+# 1) Generate msprime replicates + manifest.
+# Conversion details implemented in msprime_from_macs_scenarios.py:
+# - seq length: the second token in the MaCS args is passed to msprime sequence_length.
+# - time: MaCS scaled times are converted to generations by t_gen = t * 4 * Nref.
+# - recombination: rec_rate_bp = r / (4 * Nref).
+# - mutation: mut_rate_bp = theta / (4 * Nref), then applied via sim_mutations().
+# - migration:
+#     global M  -> pairwise m_ij = M / ((k-1) * 4 * Nref) for k populations;
+#     pair Mij  -> m_ij = Mij / (4 * Nref).
+# - samples/ploidy:
+#     script currently runs msprime with ploidy=2, so MaCS haploid sample counts
+#     per population are halved (must be even) before passing to `samples=`.
+run_python_script(
+  dev_dir = dev_dir,
+  script_name = basename(py_sim_script),
+  args = c(
+    "--nrep", "50",
+    "--nchr", "1",
+    "--nref", "10000",
+    "--base-seed", "700000",
+    "--model", "smc_prime",
+    "--out-dir", "testData/out_msprime_from_macs"
+  )
+)
+
+# 2) Build cross-method comparison summary + long table.
+# msprime_macs_scenarios_compare.py reads:
+# - MaCSTS manifest (from testMaCSTS2),
+# - msprime manifest (from step 1),
+# and writes a long-format table used below for distribution plots.
+run_python_script(
+  dev_dir = dev_dir,
+  script_name = basename(py_cmp_script)
+)
+
+
+

Distribution Comparison Plots

+
stopifnot(file.exists(comparison_long_csv))
+cmp <- read.csv(comparison_long_csv, stringsAsFactors = FALSE)
+
+
+num_cols <- c("num_mut", "num_trees", "num_edges", "num_nodes", "max_root_time")
+for (nm in num_cols) {
+  cmp[[nm]] <- suppressWarnings(as.numeric(cmp[[nm]]))
+}
+
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+
+metrics <- c("num_mut", "num_trees", "num_edges", "num_nodes", "max_root_time")
+
+long_df <- cmp %>%
+  pivot_longer(cols = all_of(metrics), names_to = "metric", values_to = "value") %>%
+  filter(!is.na(value))
+
+# Mean + SD bars by scenario/method for each metric
+summary_df <- long_df %>%
+  group_by(Scenarios, Methods, metric) %>%
+  summarise(mean = mean(value), sd = sd(value), .groups = "drop")
+
+ggplot(summary_df, aes(x = Scenarios, y = mean, fill = Methods)) +
+  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
+  geom_errorbar(
+    aes(ymin = mean - sd, ymax = mean + sd),
+    position = position_dodge(width = 0.8),
+    width = 0.2
+  ) +
+  facet_wrap(~ metric, scales = "free_y") +
+  labs(x = "Scenario", y = "Mean +/- SD", fill = "Method") +
+  theme_bw()
+

+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/testMaCSTS4.Rmd b/dev/testMaCSTS4.Rmd new file mode 100644 index 00000000..9765a589 --- /dev/null +++ b/dev/testMaCSTS4.Rmd @@ -0,0 +1,357 @@ +--- +title: "test MaCSTS 4: wrapper/staged workflow overview" +output: html_document +date: "2026-06-10" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +``` + +## General Notes + +This notebook summarizes the current TS workflow design and parameter effects, +based on `testMaCSTS1.Rmd`, `testMaCSTS2.Rmd`, and `testMaCSTS3.Rmd`. + +It starts from the two supported ways to run: + +1. high-level wrapper: `runMacTS(...)` +2. staged low-level path (for developers and advanced users): `simAnc(...) -> simMut(...) -> finalizeInbredTs(...) -> asMapPop(...)` + +## 1) Two Ways To Run + +### A. High-level wrapper (`runMacTS`) + +`runMacTS` is the default user-facing entry point. It is designed to feel like +`runMacs/runMacs2` but with TS-aware internals and mode control. + +- Builds MaCS command from species/manual inputs +- Runs ancestry and mutation according to `mutationMode` +- Converts to `MapPop` (including optional `segSites` sampling) +- Optionally returns TS tables/metadata + +### B. Staged workflow (`simAnc` + `simMut` + ...) + +This path is for development/debugging and explicit control over each stage. + +- `simAnc`: ancestry-only TS from MaCS engine (`useMacsMut=FALSE`) +- `simMut`: post-TS mutation placement on edge table (if `mutationMode="postTs"`) +- `finalizeInbredTs`: optional inbred leaf expansion in TS (if `inbred=TRUE` and `ploidy>1`) +- `asMapPop`: convert TS to `MapPop` (sampling/filtering + map construction) + +## 2) Function Relationship + +```{r relationship-diagram, results='asis'} +cat(" +runMacTS() + ├─ command builder (species/manual) + ├─ mutationMode = 'macs' -> MaCSTS(..., useMacsMut=TRUE, expandInbredSamples=FALSE) + ├─ mutationMode = 'postTs'-> simAnc(...) -> simMut(...) 
+ ├─ mutationMode = 'none' -> simAnc(...) only (TS-only unless user mutates later) + ├─ optional finalizeInbredTs(...) + └─ asMapPop(...) -> MapPop + +Lower layers: + simAnc() -> MaCSTS(... useMacsMut=FALSE ...) + simMut() -> tsMutateTableCollection(...) per chromosome + finalizeInbredTs() -> tsFinalizeInbredTableCollection(...) per chromosome + MaCSTS() -> C++ simulator + TS recorder +") +``` + +## 3) Core Parameters And Their Effects + +```{r param-table} +param_effects <- data.frame( + parameter = c( + "mutationMode", + "useMacsMut (MaCSTS)", + "usePhysicalPositions", + "Nref", + "segSites", + "inbred + ploidy", + "expandInbredTs", + "seed", + "mutSeed", + "nThreads" + ), + where_used = c( + "runMacTS", + "MaCSTS", + "MaCSTS/asMapPop", + "MaCSTS (timeScale)", + "asMapPop", + "MaCSTS/asMapPop/finalizeInbredTs", + "runMacTS/finalizeInbredTs", + "simAnc/MaCSTS ancestry RNG", + "simMut post-TS RNG", + "MaCS chromosomes + asMapPop worker" + ), + main_effect = c( + "Selects mutation path: macs vs postTs vs none", + "Adds MaCS-style mutation during ancestry if TRUE", + "TS coordinates in bp (TRUE) or [0,1] (FALSE)", + "Rescales node/mutation times; sets time_units to generations", + "Caps number of retained variants per chromosome in conversion", + "Controls sample interpretation and output haplotype structure", + "If TRUE with inbred/ploidy>1, duplicates TS sample leaves", + "Determines ancestry reproducibility", + "Determines post-TS mutation reproducibility", + "Parallelism level when available" + ), + stringsAsFactors = FALSE +) +knitr::kable(param_effects) +``` + +## Helpers to Compare Two Ways To Run + +```{r comparison-helpers} +tc_counts <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + data.frame( + num_nodes = as.integer(tc$num_nodes()), + num_edges = as.integer(tc$num_edges()), + num_sites = as.integer(tc$num_sites()), + num_mutations = as.integer(tc$num_mutations()) + ) +} + +collect_ts_counts <- function(tables, method) { + 
do.call(rbind, lapply(seq_along(tables), function(chr) { + cbind(chr = chr, method = method, tc_counts(tables[[chr]])) + })) +} + +compare_wrapper_vs_staged <- function(out, staged) { + if (!requireNamespace("RcppTskit", quietly = TRUE)) { + stop("RcppTskit is required for TS checks.") + } + if (is.null(out$pop) || is.null(out$tables)) { + stop("out must contain both $pop and $tables") + } + if (is.null(staged$pop) || is.null(staged$tables)) { + stop("staged must contain both $pop and $tables") + } + + ts_counts_wrap <- collect_ts_counts(out$tables, method = "wrapper") + ts_counts_stage <- collect_ts_counts(staged$tables, method = "staged") + ts_counts <- rbind(ts_counts_wrap, ts_counts_stage) + + nChr <- length(out$pop@geno) + same_geno <- vapply(seq_len(nChr), function(chr) { + identical(out$pop@geno[[chr]], staged$pop@geno[[chr]]) + }, logical(1)) + same_genMap <- vapply(seq_len(nChr), function(chr) { + isTRUE(all.equal(out$pop@genMap[[chr]], staged$pop@genMap[[chr]], tolerance = 0)) + }, logical(1)) + same_ts_counts <- identical( + ts_counts_wrap[, c("num_nodes", "num_edges", "num_sites", "num_mutations")], + ts_counts_stage[, c("num_nodes", "num_edges", "num_sites", "num_mutations")] + ) + + checks <- data.frame( + metric = c("nLoci", "geno_all_chr", "genMap_all_chr", "ts_table_counts"), + equal = c( + identical(out$pop@nLoci, staged$pop@nLoci), + all(same_geno), + all(same_genMap), + same_ts_counts + ), + stringsAsFactors = FALSE + ) + + list(ts_counts = ts_counts, checks = checks) +} + +print_wrapper_vs_staged <- function(cmp) { + print(cmp$ts_counts) + print(cmp$checks) + invisible(cmp) +} +``` + +## Minimal Usage Examples + +### A. 
High-level wrapper mode + +```{r ex-runMacTS} +library(AlphaSimR) +devtools::load_all() +set.seed(1) +out <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 2, + segSites = 60, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "postTs", # or "macs" / "none" + usePhysicalPositions = FALSE, + nThreads = 1L, + returnTs = TRUE +) + +# runMacTS returns: +# out$pop -> MapPop +# out$tables -> TS table collections (if returnTs=TRUE) +``` + +### B. Staged mode + +```{r ex-staged} +library(AlphaSimR) + +# use the same args and seeds as high-level wrapper mode +args <- out$args +nChr <- 2L +seed_chr <- out$seed + +anc <- AlphaSimR:::simAnc( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed_chr, + usePhysicalPositions = FALSE, + Nref = NA_real_ +) + +mut <- AlphaSimR:::simMut(anc) # uses anc$dTheta and derived mutation seeds by default + +# Optional for inbred/ploidy expansion in TS: +# mut <- AlphaSimR:::finalizeInbredTs(mut, inbred = TRUE, ploidy = 2L) + +pop <- AlphaSimR:::asMapPop( + chr_info = list( + tables = mut$tables, + breaks = list(c(0, 1)), + rates = list(c(1)) + ), + ploidy = 2L, + inbred = FALSE, + segSites = 60 +) +``` + +## Validation + +### Wrapper vs staged equivalence + +```{r wrapper-vs-staged-equivalence} +# Reuse objects from the previous section: +# - out : runMacTS(...) result +# - mut : simMut(...) result from staged path +# - pop : asMapPop(...) 
result from staged path + +stopifnot(exists("out"), exists("mut"), exists("pop")) +cmp <- compare_wrapper_vs_staged( + out = out, + staged = list( + tables = mut$tables, + pop = pop + ) +) +print_wrapper_vs_staged(cmp) +``` + +### Wrapper vs staged (inbred=TRUE, ploidy>1) + +```{r wrapper-vs-staged-inbred} +# Example: inbred TRUE with ploidy 2 +set.seed(1) +out_inbred <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 2, + segSites = 60, + inbred = TRUE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "postTs", + usePhysicalPositions = TRUE, + expandInbredTs = TRUE, + nThreads = 1L, + returnTs = TRUE +) + +anc_inbred <- AlphaSimR:::simAnc( + args = out_inbred$args, + nChr = 2L, + inbred = TRUE, + ploidy = 2L, + nThreads = 1L, + seed = out_inbred$seed, + usePhysicalPositions = TRUE, + Nref = NA_real_ +) +mut_inbred <- AlphaSimR:::simMut(anc_inbred) +fin_inbred <- AlphaSimR:::finalizeInbredTs(mut_inbred, inbred = TRUE, ploidy = 2L) + +pop_inbred <- AlphaSimR:::asMapPop( + chr_info = list( + tables = fin_inbred$tables, + breaks = rep(list(c(0, 1)), 2L), + rates = rep(list(c(1)), 2L) + ), + ploidy = 2L, + inbred = TRUE, + segSites = 60, + site_sampling_seed = 42L, + nThreads = 1L +) + +cmp_inbred <- compare_wrapper_vs_staged( + out = out_inbred, + staged = list( + tables = fin_inbred$tables, + pop = pop_inbred + ) +) +print_wrapper_vs_staged(cmp_inbred) +``` + +Output a ts as an example and check it in Python: +```{r export-inbred-ts} +tables_src <- if (exists("fin_inbred")) { + fin_inbred$tables +} else if (exists("out_inbred")) { + out_inbred$tables +} else { + stop("Need fin_inbred or out_inbred in environment. Run chunk 5.1b first.") +} +# chr1 +tc_xptr <- tables_src[[1]] +tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) +ts <- tc$tree_sequence() +ts$write('testData/inbred_test.trees') +``` + +```{r py-setup} +library(reticulate) +# Use your own path here! 
+use_python('/usr/bin/python3', required = TRUE) +``` + + +```{python} +import numpy as np +import tskit +ts_inbred = tskit.load('testData/inbred_test.trees') +ts_inbred.samples() + +print(ts_inbred.first().draw_text()) + +print(ts_inbred.tables.nodes[2612:2620]) + +g_matrix = ts_inbred.genotype_matrix() +for hap in range(0, len(g_matrix[0]), 2): + print(hap, hap+1) + print(np.array_equal(g_matrix[:, hap], g_matrix[:, hap+1])) + +``` +Sampled nodes are duplicated as expected for inbred/ploidy expansion, from nodes 0-3. Each pair of hap (from the same individual) have same alleles. + + +### Parameter sensitivity checks +In tests/testthat/test-runMacTS-sensitivity.R diff --git a/dev/testMaCSTS4.html b/dev/testMaCSTS4.html new file mode 100644 index 00000000..15f9a5e6 --- /dev/null +++ b/dev/testMaCSTS4.html @@ -0,0 +1,873 @@ + + + + + + + + + + + + + + +test MaCSTS 4: wrapper/staged workflow overview + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

General Notes

+

This notebook summarizes the current TS workflow design and parameter +effects, based on testMaCSTS1.Rmd, +testMaCSTS2.Rmd, and testMaCSTS3.Rmd.

+

It starts from the two supported ways to run:

+
    +
  1. high-level wrapper: runMacTS(...)
  2. +
  3. staged low-level path (for developers and advanced users): +simAnc(...) -> simMut(...) -> finalizeInbredTs(...) -> asMapPop(...)
  4. +
+
+
+

1) Two Ways To Run

+
+

A. High-level wrapper (runMacTS)

+

runMacTS is the default user-facing entry point. It is +designed to feel like runMacs/runMacs2 but with TS-aware +internals and mode control.

+
    +
  • Builds MaCS command from species/manual inputs
  • +
  • Runs ancestry and mutation according to +mutationMode
  • +
  • Converts to MapPop (including optional +segSites sampling)
  • +
  • Optionally returns TS tables/metadata
  • +
+
+
+

B. Staged workflow (simAnc + simMut + +…)

+

This path is for development/debugging and explicit control over each +stage.

+
    +
  • simAnc: ancestry-only TS from MaCS engine +(useMacsMut=FALSE)
  • +
  • simMut: post-TS mutation placement on edge table (if +mutationMode="postTs")
  • +
  • finalizeInbredTs: optional inbred leaf expansion in TS +(if inbred=TRUE and ploidy>1)
  • +
  • asMapPop: convert TS to MapPop +(sampling/filtering + map construction)
  • +
+
+
+
+

2) Function Relationship

+
cat("
+runMacTS()
+  ├─ command builder (species/manual)
+  ├─ mutationMode = 'macs'  -> MaCSTS(..., useMacsMut=TRUE, expandInbredSamples=FALSE)
+  ├─ mutationMode = 'postTs'-> simAnc(...) -> simMut(...)
+  ├─ mutationMode = 'none'  -> simAnc(...) only (TS-only unless user mutates later)
+  ├─ optional finalizeInbredTs(...)
+  └─ asMapPop(...) -> MapPop
+
+Lower layers:
+  simAnc()  -> MaCSTS(... useMacsMut=FALSE ...)
+  simMut()  -> tsMutateTableCollection(...) per chromosome
+  finalizeInbredTs() -> tsFinalizeInbredTableCollection(...) per chromosome
+  MaCSTS()  -> C++ simulator + TS recorder
+")
+

runMacTS() ├─ command builder (species/manual) ├─ mutationMode = +‘macs’ -> MaCSTS(…, useMacsMut=TRUE, expandInbredSamples=FALSE) ├─ +mutationMode = ‘postTs’-> simAnc(…) -> simMut(…) ├─ mutationMode = +‘none’ -> simAnc(…) only (TS-only unless user mutates later) ├─ +optional finalizeInbredTs(…) └─ asMapPop(…) -> MapPop

+

Lower layers: simAnc() -> MaCSTS(… useMacsMut=FALSE …) simMut() +-> tsMutateTableCollection(…) per chromosome finalizeInbredTs() -> +tsFinalizeInbredTableCollection(…) per chromosome MaCSTS() -> C++ +simulator + TS recorder

+
+
+

3) Core Parameters And Their Effects

+
param_effects <- data.frame(
+  parameter = c(
+    "mutationMode",
+    "useMacsMut (MaCSTS)",
+    "usePhysicalPositions",
+    "Nref",
+    "segSites",
+    "inbred + ploidy",
+    "expandInbredTs",
+    "seed",
+    "mutSeed",
+    "nThreads"
+  ),
+  where_used = c(
+    "runMacTS",
+    "MaCSTS",
+    "MaCSTS/asMapPop",
+    "MaCSTS (timeScale)",
+    "asMapPop",
+    "MaCSTS/asMapPop/finalizeInbredTs",
+    "runMacTS/finalizeInbredTs",
+    "simAnc/MaCSTS ancestry RNG",
+    "simMut post-TS RNG",
+    "MaCS chromosomes + asMapPop worker"
+  ),
+  main_effect = c(
+    "Selects mutation path: macs vs postTs vs none",
+    "Adds MaCS-style mutation during ancestry if TRUE",
+    "TS coordinates in bp (TRUE) or [0,1] (FALSE)",
+    "Rescales node/mutation times; sets time_units to generations",
+    "Caps number of retained variants per chromosome in conversion",
+    "Controls sample interpretation and output haplotype structure",
+    "If TRUE with inbred/ploidy>1, duplicates TS sample leaves",
+    "Determines ancestry reproducibility",
+    "Determines post-TS mutation reproducibility",
+    "Parallelism level when available"
+  ),
+  stringsAsFactors = FALSE
+)
+knitr::kable(param_effects)
+ +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
parameterwhere_usedmain_effect
mutationModerunMacTSSelects mutation path: macs vs postTs vs none
useMacsMut (MaCSTS)MaCSTSAdds MaCS-style mutation during ancestry if TRUE
usePhysicalPositionsMaCSTS/asMapPopTS coordinates in bp (TRUE) or [0,1] (FALSE)
NrefMaCSTS (timeScale)Rescales node/mutation times; sets time_units to +generations
segSitesasMapPopCaps number of retained variants per chromosome in +conversion
inbred + ploidyMaCSTS/asMapPop/finalizeInbredTsControls sample interpretation and output haplotype +structure
expandInbredTsrunMacTS/finalizeInbredTsIf TRUE with inbred/ploidy>1, duplicates TS sample +leaves
seedsimAnc/MaCSTS ancestry RNGDetermines ancestry reproducibility
mutSeedsimMut post-TS RNGDetermines post-TS mutation reproducibility
nThreadsMaCS chromosomes + asMapPop workerParallelism level when available
+
+
+

Helpers to Compare Two Ways To Run

+
# Summarise the table sizes of one tskit table collection.
#   tc_xptr  handle passed as `xptr` to RcppTskit::TableCollection$new()
#            (presumably an external pointer to a table collection; confirm
#            against the RcppTskit API)
# Returns a data.frame with integer columns num_nodes, num_edges, num_sites
# and num_mutations, each taken from the like-named TableCollection method.
tc_counts <- function(tc_xptr) {
+  tc <- RcppTskit::TableCollection$new(xptr = tc_xptr)
+  data.frame(
+    num_nodes = as.integer(tc$num_nodes()),
+    num_edges = as.integer(tc$num_edges()),
+    num_sites = as.integer(tc$num_sites()),
+    num_mutations = as.integer(tc$num_mutations())
+  )
+}
+
# Build a table of tc_counts() results for every element of `tables`.
#   tables  list of table-collection handles, indexed by chromosome
#   method  label stored in the `method` column of every row
# Each element's counts are prefixed with its position in `tables` (column
# `chr`) and the method label, then all pieces are stacked with rbind().
collect_ts_counts <- function(tables, method) {
+  do.call(rbind, lapply(seq_along(tables), function(chr) {
+    cbind(chr = chr, method = method, tc_counts(tables[[chr]]))
+  }))
+}
+
# Compare a runMacTS() wrapper result against a staged-workflow result.
#   out     list with $pop and $tables from the wrapper path
#   staged  list with $pop and $tables from the staged path
# Stops if RcppTskit is not installed or if either input lacks $pop/$tables.
# Returns a list with:
#   ts_counts  per-chromosome table counts for both methods (wrapper rows first)
#   checks     data.frame of metric names and a logical `equal` flag for
#              nLoci, genotypes, genetic maps and tree-sequence table counts
compare_wrapper_vs_staged <- function(out, staged) {
+  if (!requireNamespace("RcppTskit", quietly = TRUE)) {
+    stop("RcppTskit is required for TS checks.")
+  }
+  if (is.null(out$pop) || is.null(out$tables)) {
+    stop("out must contain both $pop and $tables")
+  }
+  if (is.null(staged$pop) || is.null(staged$tables)) {
+    stop("staged must contain both $pop and $tables")
+  }
+  
+  ts_counts_wrap <- collect_ts_counts(out$tables, method = "wrapper")
+  ts_counts_stage <- collect_ts_counts(staged$tables, method = "staged")
+  ts_counts <- rbind(ts_counts_wrap, ts_counts_stage)
+  
+  nChr <- length(out$pop@geno)  # chromosome count is taken from the wrapper result only
+  same_geno <- vapply(seq_len(nChr), function(chr) {
+    identical(out$pop@geno[[chr]], staged$pop@geno[[chr]])
+  }, logical(1))
+  same_genMap <- vapply(seq_len(nChr), function(chr) {
+    isTRUE(all.equal(out$pop@genMap[[chr]], staged$pop@genMap[[chr]], tolerance = 0))  # tolerance = 0: no numeric slack
+  }, logical(1))
+  same_ts_counts <- identical(
+    ts_counts_wrap[, c("num_nodes", "num_edges", "num_sites", "num_mutations")],
+    ts_counts_stage[, c("num_nodes", "num_edges", "num_sites", "num_mutations")]
+  )
+  
+  checks <- data.frame(
+    metric = c("nLoci", "geno_all_chr", "genMap_all_chr", "ts_table_counts"),
+    equal = c(
+      identical(out$pop@nLoci, staged$pop@nLoci),
+      all(same_geno),
+      all(same_genMap),
+      same_ts_counts
+    ),
+    stringsAsFactors = FALSE
+  )
+  
+  list(ts_counts = ts_counts, checks = checks)
+}
+
# Print both parts of a compare_wrapper_vs_staged() result: first the
# per-chromosome table counts, then the equality checks.
# Returns `cmp` invisibly so the call can end a chunk without re-printing it.
print_wrapper_vs_staged <- function(cmp) {
+  print(cmp$ts_counts)
+  print(cmp$checks)
+  invisible(cmp)
+}
+
+
+

Minimal Usage Examples

+
+

A. High-level wrapper mode

+
library(AlphaSimR)
+devtools::load_all()
+set.seed(1)
+out <- AlphaSimR:::runMacTS(
+  nInd = 4,
+  nChr = 2,
+  segSites = 60,
+  inbred = FALSE,
+  ploidy = 2L,
+  species = "GENERIC",
+  mutationMode = "postTs",      # or "macs" / "none"
+  usePhysicalPositions = FALSE,
+  nThreads = 1L,
+  returnTs = TRUE
+)
+
+# runMacTS returns:
+# out$pop      -> MapPop
+# out$tables   -> TS table collections (if returnTs=TRUE)
+
+
+

B. Staged mode

+
library(AlphaSimR)
+
+# use the same args and seeds as high-level wrapper mode
+args <- out$args
+nChr <- 2L
+seed_chr <- out$seed
+
+anc <- AlphaSimR:::simAnc(
+  args = args,
+  nChr = nChr,
+  inbred = FALSE,
+  ploidy = 2L,
+  nThreads = 1L,
+  seed = seed_chr,
+  usePhysicalPositions = FALSE,
+  Nref = NA_real_
+)
+
+mut <- AlphaSimR:::simMut(anc)  # uses anc$dTheta and derived mutation seeds by default
+
+# Optional for inbred/ploidy expansion in TS:
+# mut <- AlphaSimR:::finalizeInbredTs(mut, inbred = TRUE, ploidy = 2L)
+
+pop <- AlphaSimR:::asMapPop(
+  chr_info = list(
+    tables = mut$tables,
+    breaks = list(c(0, 1)),
+    rates = list(c(1))
+  ),
+  ploidy = 2L,
+  inbred = FALSE,
+  segSites = 60
+)
+
+
+
+

Validation

+
+

Wrapper vs staged equivalence

+
# Reuse objects from the previous section:
+# - out  : runMacTS(...) result
+# - mut  : simMut(...) result from staged path
+# - pop  : asMapPop(...) result from staged path
+
+stopifnot(exists("out"), exists("mut"), exists("pop"))
+cmp <- compare_wrapper_vs_staged(
+  out = out,
+  staged = list(
+    tables = mut$tables,
+    pop = pop
+  )
+)
+print_wrapper_vs_staged(cmp)
+
##   chr  method num_nodes num_edges num_sites num_mutations
+## 1   1 wrapper      4395     10544     15543         15543
+## 2   2 wrapper      4077      9941     14507         14507
+## 3   1  staged      4395     10544     15543         15543
+## 4   2  staged      4077      9941     14507         14507
+##            metric equal
+## 1           nLoci  TRUE
+## 2    geno_all_chr  TRUE
+## 3  genMap_all_chr  TRUE
+## 4 ts_table_counts  TRUE
+
+
+

Wrapper vs staged (inbred=TRUE, ploidy>1)

+
# Example: inbred TRUE with ploidy 2
+set.seed(1)
+out_inbred <- AlphaSimR:::runMacTS(
+  nInd = 4,
+  nChr = 2,
+  segSites = 60,
+  inbred = TRUE,
+  ploidy = 2L,
+  species = "GENERIC",
+  mutationMode = "postTs",
+  usePhysicalPositions = TRUE,
+  expandInbredTs = TRUE,
+  nThreads = 1L,
+  returnTs = TRUE
+)
+
+anc_inbred <- AlphaSimR:::simAnc(
+  args = out_inbred$args,
+  nChr = 2L,
+  inbred = TRUE,
+  ploidy = 2L,
+  nThreads = 1L,
+  seed = out_inbred$seed,
+  usePhysicalPositions = TRUE,
+  Nref = NA_real_
+)
+mut_inbred <- AlphaSimR:::simMut(anc_inbred)
+fin_inbred <- AlphaSimR:::finalizeInbredTs(mut_inbred, inbred = TRUE, ploidy = 2L)
+
+pop_inbred <- AlphaSimR:::asMapPop(
+  chr_info = list(
+    tables = fin_inbred$tables,
+    breaks = rep(list(c(0, 1)), 2L),
+    rates = rep(list(c(1)), 2L)
+  ),
+  ploidy = 2L,
+  inbred = TRUE,
+  segSites = 60,
+  site_sampling_seed = 42L,
+  nThreads = 1L
+)
+
+cmp_inbred <- compare_wrapper_vs_staged(
+  out = out_inbred,
+  staged = list(
+    tables = fin_inbred$tables,
+    pop = pop_inbred
+  )
+)
+print_wrapper_vs_staged(cmp_inbred)
+
##   chr  method num_nodes num_edges num_sites num_mutations
+## 1   1 wrapper      2620      5710      9372          9372
+## 2   2 wrapper      3497      7788     12514         12514
+## 3   1  staged      2620      5710      9372          9372
+## 4   2  staged      3497      7788     12514         12514
+##            metric equal
+## 1           nLoci  TRUE
+## 2    geno_all_chr  TRUE
+## 3  genMap_all_chr FALSE
+## 4 ts_table_counts  TRUE
+

Output a ts as an example and check it in Python:

+
# Export the chromosome-1 tree sequence of the inbred example to disk so it
# can be inspected from Python.
# Prefer the staged result (fin_inbred); fall back to the wrapper result
# (out_inbred). Both objects are created by the `wrapper-vs-staged-inbred`
# chunk, so the error message points there.
tables_src <- if (exists("fin_inbred")) {
+  fin_inbred$tables
+} else if (exists("out_inbred")) {
+  out_inbred$tables
+} else {
+  stop("Need fin_inbred or out_inbred in environment. Run the 'wrapper-vs-staged-inbred' chunk first.")
+}
+# chr1
+tc_xptr <- tables_src[[1]]
+tc <- RcppTskit::TableCollection$new(xptr = tc_xptr)
+ts <- tc$tree_sequence()
+ts$write('testData/inbred_test.trees')  # relative path: resolved against the current working directory
+
library(reticulate)
+# Use your own path here!
+use_python('/usr/bin/python3', required = TRUE)
+
import numpy as np
+import tskit
+ts_inbred = tskit.load('testData/inbred_test.trees')
+ts_inbred.samples()
+
## array([2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619], dtype=int32)
+
print(ts_inbred.first().draw_text())
+
##             727                        
+##     ┏━━━━━━━━┻━━━━━━━━┓                
+##     ┃                243               
+##     ┃         ┏━━━━━━━┻━━━━━━┓         
+##     ┃         ┃              4         
+##     ┃         ┃         ┏━━━━┻━━━━┓    
+##     0         1         2         3    
+##   ┏━┻━━┓    ┏━┻━━┓    ┏━┻━━┓    ┏━┻━━┓ 
+## 2612 2613 2614 2615 2616 2617 2618 2619
+
print(ts_inbred.tables.nodes[2612:2620])
+
## ╔══╤═════╤══════════╤══════════╤════╤════════╗
+## ║id│flags│population│individual│time│metadata║
+## ╠══╪═════╪══════════╪══════════╪════╪════════╣
+## ║0 │    1│         0│         0│   0│        ║
+## ║1 │    1│         0│         0│   0│        ║
+## ║2 │    1│         0│         1│   0│        ║
+## ║3 │    1│         0│         1│   0│        ║
+## ║4 │    1│         0│         2│   0│        ║
+## ║5 │    1│         0│         2│   0│        ║
+## ║6 │    1│         0│         3│   0│        ║
+## ║7 │    1│         0│         3│   0│        ║
+## ╚══╧═════╧══════════╧══════════╧════╧════════╝
+
g_matrix = ts_inbred.genotype_matrix()
+for hap in range(0, len(g_matrix[0]), 2):
+    print(hap, hap+1)
+    print(np.array_equal(g_matrix[:, hap], g_matrix[:, hap+1]))
+
## 0 1
+## True
+## 2 3
+## True
+## 4 5
+## True
+## 6 7
+## True
+

Sampled nodes are duplicated as expected for inbred/ploidy expansion, +from nodes 0-3. Each pair of haplotypes (from the same individual) has the same +alleles.

+
+
+

Parameter sensitivity checks

+

In tests/testthat/test-runMacTS-sensitivity.R

+
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/testSampling.R b/dev/testSampling.R new file mode 100644 index 00000000..46dde852 --- /dev/null +++ b/dev/testSampling.R @@ -0,0 +1,71 @@ +library(ggplot2) + +L1 <- 1e6 +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8))) +founderGenomes <- asMapPop( + chr_info = chr_info, + inbred = FALSE, + ploidy = 2L + ) +all_pos <- chrKeptPosBpList +all_pos <- unlist(all_pos) +breaks <- seq(min(all_pos), max(all_pos), length.out = 11) +bg_counts <- cut(all_pos, breaks = breaks, include.lowest = TRUE) %>% + table() %>% + as.numeric() + +n_iterations <- 50 +n_bins <- 10 +segSites <- 77 + +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=segSites)) + +all_bp_positions <- list() +all_gen_map_positions <- list() + +#set.seed(42) +random_seeds <- sample(1:1000000, size = n_iterations) + +for (i in 1:n_iterations) { + current_seed <- random_seeds[i] + + founderGenomes <- asMapPop( + chr_info = chr_info, + inbred = FALSE, + ploidy = 2L, + site_sampling_seed = current_seed + ) + + bp_pos <- unlist(chrKeptPosBpList) + all_bp_positions[[i]] <- bp_pos + + gen_map_pos <- unlist(founderGenomes@genMap) + all_gen_map_positions[[i]] <- gen_map_pos +} + + +df_bp <- data.frame(pos = unlist(all_bp_positions)) +df_gen <- data.frame(pos = unlist(all_gen_map_positions)) + +sampled_counts <- cut(df_bp$pos, breaks = breaks, include.lowest = TRUE) %>% + table() %>% + as.numeric() + +plot_data <- data.frame( + bin_mid = (breaks[-1] + breaks[-length(breaks)]) / 2, + sampling_ratio = (sampled_counts) / bg_counts +) + +p1 <- ggplot(plot_data, aes(x = bin_mid/1000, y = sampling_ratio)) + + geom_bar(stat = "identity", fill = "skyblue", color = "white") + + geom_hline(yintercept = mean(plot_data$sampling_ratio, na.rm = TRUE), + linetype = "dashed", color = "red") + + labs(title = paste(segSites, "segSites (", length(all_pos), 
"in total) over", n_iterations, "runs"), + x = "Position (kbp)", + y = "Frequency (Count) / Background") + + theme_minimal() +print(p1) diff --git a/dev/test_Rcpptskit_out.Rmd b/dev/test_Rcpptskit_out.Rmd new file mode 100644 index 00000000..ecc33dca --- /dev/null +++ b/dev/test_Rcpptskit_out.Rmd @@ -0,0 +1,125 @@ +--- +title: "test_Rcpptskit_out" +output: html_document +date: "2026-04-01" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## R Markdown + +This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see . + +When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: + +```{r cars} +summary(cars) +``` + +## Including Plots + +You can also embed plots, for example: + +```{r pressure, echo=FALSE} +plot(pressure) +``` + +Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. + +``` +> library(reticulate) +> use_virtualenv("~/r-reticulate-env", required = TRUE) +> tskit <- import("tskit") +> devtools::load_all() +ℹ Loading AlphaSimR +> L1 <- 1e6 +> L2 <- 2e6 +> # here, use the same recombination map as used in msprime +> chr_info <- list( ++ list(ts_path=".dev/testData/msprime_chr0.trees", ++ breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), ++ list(ts_path="dev/testData/msprime_chr1.trees", ++ breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) ++ ) +> founderGenomes1 <- asMapPopPy(chr_info = chr_info, inbred=FALSE, ploidy=2L) +Error in py_call_impl(callable, call_args$unnamed, call_args$named) : + FileNotFoundError: [Errno 2] No such file or directory: '.dev/testData/msprime_chr0.trees' +Run `reticulate::py_last_error()` for details. 
+Called from: py_call_impl(callable, call_args$unnamed, call_args$named) + +Browse[1]> +> L1 <- 1e6 +> L2 <- 2e6 +> # here, use the same recombination map as used in msprime +> chr_info <- list( ++ list(ts_path="dev/testData/msprime_chr0.trees", ++ breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), ++ list(ts_path="dev/testData/msprime_chr1.trees", ++ breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) ++ ) +> founderGenomes1 <- asMapPopPy(chr_info = chr_info, inbred=FALSE, ploidy=2L) +60 variants sampled (Random seed: 42) +60 variants sampled (Random seed: 42) +155 variants sampled (Random seed: 42) +155 variants sampled (Random seed: 42) +> set.seed(42) +> SP = SimParam$new(founderGenomes1) +> SP$setSexes("yes_sys") +> SP$addTraitA(nQtlPerChr = 5, ++ mean = 500, ++ var = 450) +> +> SP$setTrackPed(TRUE) +> # try the new function here, it automatically set setTrackRec also. +> SP$setTrackRecGen(TRUE) +> basePop = newPop(founderGenomes1) +> basePop = setPheno(basePop, ++ h2 = 0.5) +> +> #--- n generations +> nCycles<-2 +> +> # very simple container for each cycles sim output +> simOutput<-list(basePop) +> cycle<-1 +> for(cycle in 1:nCycles){ ++ cat(paste0(" C",cycle)) ++ # choose the best from last cycle ++ chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") ++ # make crosses ++ offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) ++ # phenotype new offspring ++ offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) ++ # add new offspring to simOutput list ++ simOutput[[cycle+1]]<-offspringPop ++ } + C1 C2 +Warning message: +In selectInd(pop = simOutput[[cycle]], nInd = 6, use = "gv") : + Suitable candidates smaller than nInd, returning 2 individuals + +> rm(tskit) +> tskit +Error: object 'tskit' not found + +> library(RcppTskit) +> bridgeCollectSegGenFromSimOutput(SP, simOutput) +> bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +Wrote: dev/testData/AlphaSimR_extended_chr0.trees +Wrote: 
dev/testData/AlphaSimR_extended_chr1.trees +> source("dev/alphaSimR2TsPy.R") +> source("dev/alphaSimR2TsGenPy.R") +> bridgeCollectSegGenFromSimOutputPy(SP, simOutput) +> rm(bridgeSegDfListGen) +> rm(indIdMapByChr) +> rm(nodeIdMapByChr) +> rm(list = lsf.str()) +> source("dev/alphaSimR2TsPy.R") +> source("dev/alphaSimR2TsGenPy.R") +> bridgeCollectSegGenFromSimOutputPy(SP, simOutput) +> bridgeWriteTreesPy(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +Wrote: dev/testData/AlphaSimR_extended_chr0.trees +Wrote: dev/testData/AlphaSimR_extended_chr1.trees +``` diff --git a/dev/test_runMaCSTS.txt.rtf b/dev/test_runMaCSTS.txt.rtf new file mode 100644 index 00000000..0149bc3f --- /dev/null +++ b/dev/test_runMaCSTS.txt.rtf @@ -0,0 +1,46 @@ +{\rtf1\ansi\ansicpg1252\cocoartf2822 +\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +{\*\expandedcolortbl;;} +\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0 +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 + +\f0\fs24 \cf0 args <- "4 10000 -t 1e-3 -r 1e-2 -s "\ +seed <- as.integer(12345)\ +\ +# Legacy MaCS output (haplotypes/genMap)\ +macs <- AlphaSimR:::MaCS(\ + args = args, maxSites = as.integer(0), # keep all sites\ + inbred = FALSE, ploidy = 2L, nThreads = 1L, seed = seed\ +)\ +\ +n_sites <- length(macs$genMap[[1]])\ +hap_macs <- matrix(\ + as.integer(hap_macs),\ + nrow = nrow(hap_macs),\ + ncol = ncol(hap_macs),\ + dimnames = dimnames(hap_macs)\ +)\ +\ +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 +\cf0 hap_macs\ +macs$genMap[[1]]\ +\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 +\cf0 \ +# MaCSTS with MaCS mutation path\ +out_ts <- AlphaSimR:::MaCSTS(\ + args = args, nChr=1,\ + inbred = FALSE, ploidy = 2L, nThreads = 
1L, seed = seed,\ + usePhysicalPositions = FALSE,\ + useMacsMut = TRUE,\ + expandInbredSamples = FALSE\ +)\ +\ +tc <- RcppTskit::TableCollection$new(xptr = out_ts$tables[[1]])\ +ts <- tc$tree_sequence()\ +ts$write('dev/testData/MaCSTS.trees')\ +\ +ts_macsts = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/MaCSTS.trees')\ +ts_macsts.genotype_matrix()\ +for site in ts_macsts.sites():\ + print(site.position)} \ No newline at end of file diff --git a/man/HybridPop-class.Rd b/man/HybridPop-class.Rd index c1792d29..ecaac0a2 100644 --- a/man/HybridPop-class.Rd +++ b/man/HybridPop-class.Rd @@ -3,12 +3,12 @@ \docType{class} \name{HybridPop-class} \alias{HybridPop-class} -\alias{[,HybridPop-method} +\alias{[,HybridPop,ANY,ANY,ANY-method} \alias{c,HybridPop-method} \alias{isHybridPop} \title{Hybrid population} \usage{ -\S4method{[}{HybridPop}(x, i) +\S4method{[}{HybridPop,ANY,ANY,ANY}(x, i) \S4method{c}{HybridPop}(x, ...) @@ -27,7 +27,7 @@ Memory is saved by not storing genotypic data. } \section{Methods (by generic)}{ \itemize{ -\item \code{[}: Extract HybridPop using index or id +\item \code{x[i}: Extract HybridPop using index or id \item \code{c(HybridPop)}: Combine multiple HybridPops diff --git a/man/MapPop-class.Rd b/man/MapPop-class.Rd index c9b532b7..271e2607 100644 --- a/man/MapPop-class.Rd +++ b/man/MapPop-class.Rd @@ -3,12 +3,12 @@ \docType{class} \name{MapPop-class} \alias{MapPop-class} -\alias{[,MapPop-method} +\alias{[,MapPop,ANY,ANY,ANY-method} \alias{c,MapPop-method} \alias{isMapPop} \title{Raw population with genetic map} \usage{ -\S4method{[}{MapPop}(x, i) +\S4method{[}{MapPop,ANY,ANY,ANY}(x, i) \S4method{c}{MapPop}(x, ...) 
@@ -29,7 +29,7 @@ for creating initial populations and setting traits in the } \section{Methods (by generic)}{ \itemize{ -\item \code{[}: Extract MapPop by index +\item \code{x[i}: Extract MapPop by index \item \code{c(MapPop)}: Combine multiple MapPops diff --git a/man/MultiPop-class.Rd b/man/MultiPop-class.Rd index 8d8b37d4..0e6bc30a 100644 --- a/man/MultiPop-class.Rd +++ b/man/MultiPop-class.Rd @@ -4,11 +4,11 @@ \name{MultiPop-class} \alias{MultiPop-class} \alias{show,MultiPop-method} -\alias{[,MultiPop-method} +\alias{[,MultiPop,ANY,ANY,ANY-method} \alias{[[,MultiPop-method} \alias{$,MultiPop-method} \alias{names,MultiPop-method} -\alias{[<-,MultiPop-method} +\alias{[<-,MultiPop,ANY,ANY,ANY-method} \alias{[[<-,MultiPop-method} \alias{$<-,MultiPop-method} \alias{names<-,MultiPop-method} @@ -19,7 +19,7 @@ \usage{ \S4method{show}{MultiPop}(object) -\S4method{[}{MultiPop}(x, i) +\S4method{[}{MultiPop,ANY,ANY,ANY}(x, i) \S4method{[[}{MultiPop}(x, i) @@ -27,7 +27,7 @@ \S4method{names}{MultiPop}(x) -\S4method{[}{MultiPop}(x, i) <- value +\S4method{[}{MultiPop,ANY,ANY,ANY}(x, i) <- value \S4method{[[}{MultiPop}(x, i) <- value @@ -63,7 +63,7 @@ and can hence have a nested structure - see examples in \code{\link{newMultiPop} \itemize{ \item \code{show(MultiPop)}: Show MultiPop object summary -\item \code{[}: Subset MultiPop by index +\item \code{x[i}: Subset MultiPop by index \item \code{[[}: Extract a population by index @@ -71,7 +71,7 @@ and can hence have a nested structure - see examples in \code{\link{newMultiPop} \item \code{names(MultiPop)}: Access names of pops in MultiPop -\item \code{`[`(MultiPop) <- value}: Replace contents of a subset of elements in MultiPop +\item \code{`[`(x = MultiPop, i = ANY, j = ANY) <- value}: Replace contents of a subset of elements in MultiPop \item \code{`[[`(MultiPop) <- value}: Replace contents of a single element in MultiPop diff --git a/man/NamedMapPop-class.Rd b/man/NamedMapPop-class.Rd index 511dbb0b..d045d096 100644 --- 
a/man/NamedMapPop-class.Rd +++ b/man/NamedMapPop-class.Rd @@ -3,12 +3,12 @@ \docType{class} \name{NamedMapPop-class} \alias{NamedMapPop-class} -\alias{[,NamedMapPop-method} +\alias{[,NamedMapPop,ANY,ANY,ANY-method} \alias{c,NamedMapPop-method} \alias{isNamedMapPop} \title{Raw population with genetic map and id} \usage{ -\S4method{[}{NamedMapPop}(x, i) +\S4method{[}{NamedMapPop,ANY,ANY,ANY}(x, i) \S4method{c}{NamedMapPop}(x, ...) @@ -26,7 +26,7 @@ Extends \code{\link{MapPop-class}} with id, mother and father. } \section{Methods (by generic)}{ \itemize{ -\item \code{[}: Extract NamedMapPop by index +\item \code{x[i}: Extract NamedMapPop by index \item \code{c(NamedMapPop)}: Combine multiple NamedMapPops diff --git a/man/Pop-class.Rd b/man/Pop-class.Rd index 7fc8b626..97e726c6 100644 --- a/man/Pop-class.Rd +++ b/man/Pop-class.Rd @@ -3,13 +3,13 @@ \docType{class} \name{Pop-class} \alias{Pop-class} -\alias{[,Pop-method} +\alias{[,Pop,ANY,ANY,ANY-method} \alias{c,Pop-method} \alias{show,Pop-method} \alias{length,Pop-method} \title{Population} \usage{ -\S4method{[}{Pop}(x, i) +\S4method{[}{Pop,ANY,ANY,ANY}(x, i) \S4method{c}{Pop}(x, ...) @@ -32,7 +32,7 @@ phenotypes, and pedigrees. } \section{Methods (by generic)}{ \itemize{ -\item \code{[}: Extract Pop by index or id +\item \code{x[i}: Extract Pop by index or id \item \code{c(Pop)}: Combine multiple Pops diff --git a/man/RawPop-class.Rd b/man/RawPop-class.Rd index 254358d1..8aad903a 100644 --- a/man/RawPop-class.Rd +++ b/man/RawPop-class.Rd @@ -3,13 +3,13 @@ \docType{class} \name{RawPop-class} \alias{RawPop-class} -\alias{[,RawPop-method} +\alias{[,RawPop,ANY,ANY,ANY-method} \alias{c,RawPop-method} \alias{show,RawPop-method} \alias{isRawPop} \title{Raw Population} \usage{ -\S4method{[}{RawPop}(x, i) +\S4method{[}{RawPop,ANY,ANY,ANY}(x, i) \S4method{c}{RawPop}(x, ...) @@ -31,7 +31,7 @@ The raw population class contains only genotype data. 
} \section{Methods (by generic)}{ \itemize{ -\item \code{[}: Extract RawPop by index +\item \code{x[i}: Extract RawPop by index \item \code{c(RawPop)}: Combine multiple RawPops diff --git a/man/SimParam.Rd b/man/SimParam.Rd index c06b5b74..acbcc342 100644 --- a/man/SimParam.Rd +++ b/man/SimParam.Rd @@ -403,6 +403,10 @@ genetic map} \item{\code{recHist}}{list of historic recombination events} + \item{\code{isTrackRecGen}}{is recombination being tracked. Jinyang added.} + + \item{\code{recHistGen}}{list of historic recombination events. Jinyang added.} + \item{\code{haplotypes}}{list of computed IBD haplotypes} \item{\code{varA}}{additive genetic variance in founderPop} @@ -426,6 +430,7 @@ relative to all active QTL} \item \href{#method-SimParam-initialize}{\code{SimParam$new()}} \item \href{#method-SimParam-setTrackPed}{\code{SimParam$setTrackPed()}} \item \href{#method-SimParam-setTrackRec}{\code{SimParam$setTrackRec()}} + \item \href{#method-SimParam-setTrackRecGen}{\code{SimParam$setTrackRecGen()}} \item \href{#method-SimParam-resetPed}{\code{SimParam$resetPed()}} \item \href{#method-SimParam-restrSegSites}{\code{SimParam$restrSegSites()}} \item \href{#method-SimParam-setSexes}{\code{SimParam$setSexes()}} @@ -570,6 +575,27 @@ SP$setTrackRec(TRUE) } } +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SimParam-setTrackRecGen}{}}} +\subsection{\code{SimParam$setTrackRecGen()}}{ + Sets genetic-coordinate recombination tracking for the simulation. Jinyang added. +By default this is turned off. When turned on, it will also turn on pedigree tracking. + \subsection{Usage}{ + \if{html}{\out{
}} + \preformatted{SimParam$setTrackRecGen(isTrackRecGen, force = FALSE)} + \if{html}{\out{
}} + } + \subsection{Arguments}{ + \if{html}{\out{
}} + \describe{ + \item{\code{isTrackRecGen}}{should genetic-coordinate recombination tracking be on.} + \item{\code{force}}{should the check for a running simulation be ignored.} + } + \if{html}{\out{
}} + } +} + \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-SimParam-resetPed}{}}} @@ -1807,7 +1833,16 @@ be metacentric.} For internal use only. \subsection{Usage}{ \if{html}{\out{
}} - \preformatted{SimParam$addToRec(lastId, id, mother, father, isDH, hist, ploidy)} + \preformatted{SimParam$addToRec( + lastId, + id, + mother, + father, + isDH, + hist, + histGen = NULL, + ploidy +)} \if{html}{\out{
}} } \subsection{Arguments}{ @@ -1819,6 +1854,7 @@ be metacentric.} \item{\code{father}}{vector of father iids} \item{\code{isDH}}{indicator for DH lines} \item{\code{hist}}{new recombination history} + \item{\code{histGen}}{new recombination history (genetic coordinate)} \item{\code{ploidy}}{ploidy level} } \if{html}{\out{}} diff --git a/man/dot-newPop.Rd b/man/dot-newPop.Rd index 2a577f61..cdba8346 100644 --- a/man/dot-newPop.Rd +++ b/man/dot-newPop.Rd @@ -15,6 +15,7 @@ femaleParentPop = NULL, maleParentPop = NULL, hist = NULL, + histGen = NULL, simParam = NULL, nThreads = NULL, ... diff --git a/man/figures/addInd.png b/man/figures/addInd.png new file mode 100644 index 00000000..57ba0b11 Binary files /dev/null and b/man/figures/addInd.png differ diff --git a/man/figures/originInd.png b/man/figures/originInd.png new file mode 100644 index 00000000..be6c1489 Binary files /dev/null and b/man/figures/originInd.png differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 436411a2..026a45a9 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -661,8 +661,8 @@ BEGIN_RCPP END_RCPP } // cross -Rcpp::List cross(const arma::field >& motherGeno, arma::uvec mother, const arma::field >& fatherGeno, arma::uvec father, const arma::field& femaleMap, const arma::field& maleMap, bool trackRec, arma::uword motherPloidy, arma::uword fatherPloidy, double v, double p, const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, int nThreads); -RcppExport SEXP _AlphaSimR_cross(SEXP motherGenoSEXP, SEXP motherSEXP, SEXP fatherGenoSEXP, SEXP fatherSEXP, SEXP femaleMapSEXP, SEXP maleMapSEXP, SEXP trackRecSEXP, SEXP motherPloidySEXP, SEXP fatherPloidySEXP, SEXP vSEXP, SEXP pSEXP, SEXP motherCentromereSEXP, SEXP fatherCentromereSEXP, SEXP quadProbSEXP, SEXP nThreadsSEXP) { +Rcpp::List cross(const arma::field >& motherGeno, arma::uvec mother, const arma::field >& fatherGeno, arma::uvec father, const arma::field& femaleMap, const arma::field& 
maleMap, bool trackRec, arma::uword motherPloidy, arma::uword fatherPloidy, double v, double p, const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, int nThreads, /* modified by Jinyang */ bool trackRecGen); +RcppExport SEXP _AlphaSimR_cross(SEXP motherGenoSEXP, SEXP motherSEXP, SEXP fatherGenoSEXP, SEXP fatherSEXP, SEXP femaleMapSEXP, SEXP maleMapSEXP, SEXP trackRecSEXP, SEXP motherPloidySEXP, SEXP fatherPloidySEXP, SEXP vSEXP, SEXP pSEXP, SEXP motherCentromereSEXP, SEXP fatherCentromereSEXP, SEXP quadProbSEXP, SEXP nThreadsSEXP, SEXP trackRecGenSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -681,7 +681,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const arma::vec& >::type fatherCentromere(fatherCentromereSEXP); Rcpp::traits::input_parameter< double >::type quadProb(quadProbSEXP); Rcpp::traits::input_parameter< int >::type nThreads(nThreadsSEXP); - rcpp_result_gen = Rcpp::wrap(cross(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads)); + Rcpp::traits::input_parameter< /* modified by Jinyang */ bool >::type trackRecGen(trackRecGenSEXP); + rcpp_result_gen = Rcpp::wrap(cross(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen)); return rcpp_result_gen; END_RCPP } @@ -885,6 +886,26 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// MaCSTS +Rcpp::List MaCSTS(Rcpp::String args, int nChr, bool inbred, arma::uword ploidy, int nThreads, arma::uvec seed, bool usePhysicalPositions, bool useMacsMut, double Nref, bool expandInbredSamples); +RcppExport SEXP _AlphaSimR_MaCSTS(SEXP argsSEXP, SEXP nChrSEXP, SEXP inbredSEXP, SEXP ploidySEXP, SEXP nThreadsSEXP, SEXP seedSEXP, SEXP usePhysicalPositionsSEXP, SEXP useMacsMutSEXP, SEXP NrefSEXP, SEXP expandInbredSamplesSEXP) { 
+BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::String >::type args(argsSEXP); + Rcpp::traits::input_parameter< int >::type nChr(nChrSEXP); + Rcpp::traits::input_parameter< bool >::type inbred(inbredSEXP); + Rcpp::traits::input_parameter< arma::uword >::type ploidy(ploidySEXP); + Rcpp::traits::input_parameter< int >::type nThreads(nThreadsSEXP); + Rcpp::traits::input_parameter< arma::uvec >::type seed(seedSEXP); + Rcpp::traits::input_parameter< bool >::type usePhysicalPositions(usePhysicalPositionsSEXP); + Rcpp::traits::input_parameter< bool >::type useMacsMut(useMacsMutSEXP); + Rcpp::traits::input_parameter< double >::type Nref(NrefSEXP); + Rcpp::traits::input_parameter< bool >::type expandInbredSamples(expandInbredSamplesSEXP); + rcpp_result_gen = Rcpp::wrap(MaCSTS(args, nChr, inbred, ploidy, nThreads, seed, usePhysicalPositions, useMacsMut, Nref, expandInbredSamples)); + return rcpp_result_gen; +END_RCPP +} // rtsk_table_collection_summary2 Rcpp::List rtsk_table_collection_summary2(const SEXP tc); RcppExport SEXP _AlphaSimR_rtsk_table_collection_summary2(SEXP tcSEXP) { @@ -907,6 +928,29 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// tsMutateTableCollection +void tsMutateTableCollection(const SEXP tc, const double theta, const uint64_t seed); +RcppExport SEXP _AlphaSimR_tsMutateTableCollection(SEXP tcSEXP, SEXP thetaSEXP, SEXP seedSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const SEXP >::type tc(tcSEXP); + Rcpp::traits::input_parameter< const double >::type theta(thetaSEXP); + Rcpp::traits::input_parameter< const uint64_t >::type seed(seedSEXP); + tsMutateTableCollection(tc, theta, seed); + return R_NilValue; +END_RCPP +} +// tsFinalizeInbredTableCollection +void tsFinalizeInbredTableCollection(const SEXP tc, const int ploidy); +RcppExport SEXP _AlphaSimR_tsFinalizeInbredTableCollection(SEXP tcSEXP, SEXP ploidySEXP) { +BEGIN_RCPP + 
Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const SEXP >::type tc(tcSEXP); + Rcpp::traits::input_parameter< const int >::type ploidy(ploidySEXP); + tsFinalizeInbredTableCollection(tc, ploidy); + return R_NilValue; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_AlphaSimR_solveRRBLUP", (DL_FUNC) &_AlphaSimR_solveRRBLUP, 3}, @@ -950,7 +994,7 @@ static const R_CallMethodDef CallEntries[] = { {"_AlphaSimR_getNonFounderIbd", (DL_FUNC) &_AlphaSimR_getNonFounderIbd, 3}, {"_AlphaSimR_getFounderIbd", (DL_FUNC) &_AlphaSimR_getFounderIbd, 2}, {"_AlphaSimR_createIbdMat", (DL_FUNC) &_AlphaSimR_createIbdMat, 5}, - {"_AlphaSimR_cross", (DL_FUNC) &_AlphaSimR_cross, 15}, + {"_AlphaSimR_cross", (DL_FUNC) &_AlphaSimR_cross, 16}, {"_AlphaSimR_createDH2", (DL_FUNC) &_AlphaSimR_createDH2, 7}, {"_AlphaSimR_createReducedGenome", (DL_FUNC) &_AlphaSimR_createReducedGenome, 10}, {"_AlphaSimR_popVarCpp", (DL_FUNC) &_AlphaSimR_popVarCpp, 1}, @@ -966,8 +1010,11 @@ static const R_CallMethodDef CallEntries[] = { {"_AlphaSimR_rngDiagnosticsSampleInt", (DL_FUNC) &_AlphaSimR_rngDiagnosticsSampleInt, 4}, {"_AlphaSimR_rngDiagnosticsSamplePoisson", (DL_FUNC) &_AlphaSimR_rngDiagnosticsSamplePoisson, 3}, {"_AlphaSimR_MaCS", (DL_FUNC) &_AlphaSimR_MaCS, 6}, + {"_AlphaSimR_MaCSTS", (DL_FUNC) &_AlphaSimR_MaCSTS, 10}, {"_AlphaSimR_rtsk_table_collection_summary2", (DL_FUNC) &_AlphaSimR_rtsk_table_collection_summary2, 1}, {"_AlphaSimR_rtsk_treeseq_get_num_individuals2", (DL_FUNC) &_AlphaSimR_rtsk_treeseq_get_num_individuals2, 1}, + {"_AlphaSimR_tsMutateTableCollection", (DL_FUNC) &_AlphaSimR_tsMutateTableCollection, 3}, + {"_AlphaSimR_tsFinalizeInbredTableCollection", (DL_FUNC) &_AlphaSimR_tsFinalizeInbredTableCollection, 2}, {NULL, NULL, 0} }; diff --git a/src/algorithm.cpp b/src/algorithm.cpp index 42195a7f..fd75c2b9 100644 --- a/src/algorithm.cpp +++ b/src/algorithm.cpp @@ -1147,6 +1147,59 @@ void GraphBuilder::addMutations(double startPos,double endPos){ } } +void 
GraphBuilder::addMutationsTs(double startPos,double endPos){ + bool bEndMutate = false; + while(!bEndMutate){ + // find the next point on this interval + startPos+=pRandNumGenerator->expRV(dLastTreeLength* + pConfig->dTheta); + if (startPos>=endPos){ + bEndMutate = true; + }else{ + double dRandomSpot = pRandNumGenerator->unifRV() * dLastTreeLength; + double dMutationTime=-1.; + EdgePtr selectedEdge = getRandomEdgeOnTree(dMutationTime,dRandomSpot); + if (pTsRecorder) { + pTsRecorder->recordMutation(startPos, selectedEdge, dMutationTime); + } + // Keep existing behavior for now so TS and non-TS remain comparable. + mutateBelowEdge(selectedEdge); + + unique_ptr temp(new AlphaSimRReturn()); + temp->length = startPos; + unsigned int iSampleSize = pConfig->iSampleSize; + for (unsigned int iSampleIndex=0;iSampleIndex(pSampleNodeArray[iSampleIndex].get()); + sites[iSampleIndex]=sample->bAffected; + temp->haplotypes.push_back(sample->bAffected); + sample->bAffected=false; + } + mutations.push_back(*temp); + double dFreq=0.; + if (pConfig->bSNPAscertainment){ + int counts=0; + for (unsigned int i=0;ibFlipAlleles && dFreq>.5){ + for (unsigned int i=0;ipAlleleFreqBinPtrSet->find(query); + if (it!=pConfig->pAlleleFreqBinPtrSet->end()){ + AlleleFreqBinPtr bin = *it; + ++bin->iObservedCounts; + }else throw "Did not find a frequency range for freq"; + } + pMutationPtrVector->push_back(new Mutation(startPos, dFreq)); + } + } +} + bool GraphBuilder::getNextPos(double & curPos,HotSpotBinPtrList::iterator & hotSpotIt){ bool bBinCrossed = false; if (hotSpotIt==pConfig->pHotSpotBinPtrList->end()){ @@ -1304,7 +1357,107 @@ void GraphBuilder::build(){ } +void GraphBuilder::buildTs(bool usePhysicalPositions, bool useMacsMut, + bool inbred, unsigned int ploidy){ + pTsRecorder.reset(new TsRecorder( + pConfig->dSeqLength, + usePhysicalPositions ? 
TsPositionMode::PHYSICAL_BP : TsPositionMode::MACS_UNIT, + inbred, + ploidy)); + + double curPos = 0.0,lastPos = 0.0,dMaxPos = 1.0; + unsigned int iLastCumulativePos = 0; + + HotSpotBinPtrList::iterator hotSpotIt; + if (pConfig->bVariableRecomb){ + hotSpotIt=pConfig->pHotSpotBinPtrList->begin(); + } + // gene conversion stuff + GeneConversionPtr newGC; + double dLogTractRatio = log((pConfig->iGeneConvTract-1.)/pConfig->iGeneConvTract); + int iHistoryMax = 0; + do{ + if (iGraphIteration==0){ + NodePtr dummy1; + EventPtr dummy2; + this->traverseEvents(false,dummy1,dummy2); + if (pTsRecorder) { + pTsRecorder->preRegisterSamples(pSampleNodeArray, pConfig->iSampleSize); + } + }else{ + // at this point decide whether we invoke a plain x-over + // or a new gene conversion event + this->bBeginGeneConversion = false; + if (this->bEndGeneConversion){ + }else{ + this->bBeginGeneConversion = pRandNumGenerator->unifRV()< + (pConfig->dGeneConvRatio/(pConfig->dGeneConvRatio+1))?true:false; + if (bBeginGeneConversion){ + double dTractLen = (1.+log(pRandNumGenerator->unifRV())/ + dLogTractRatio)/pConfig->dSeqLength; + newGC = GeneConversionPtr(new GeneConversion( + curPos+dTractLen)); + pGeneConversionPtrSet->insert(newGC); + } + } + invokeRecombination(newGC); + // mark the graph edges as the local tree + markCurrentTree(); + if (!bIncrementHistory){ + double dBoundary = curPos - dTrailingGap; + if (dBoundary>0.){ + bIncrementHistory = true; + } + }else{ + ++iHistoryMax; + } + if (iHistoryMax>=0){ + pruneARG(iHistoryMax); + } + } + + initializeCurrentTree(); + + if (pConfig->bVariableRecomb){ + bool bBinCrossed; + do{ + bBinCrossed = getNextPos(curPos,hotSpotIt); + }while(bBinCrossed); + }else{ + curPos+=pRandNumGenerator->expRV(getRate()); + } + // check if we reached the end of the region + if (curPos>dMaxPos) curPos=dMaxPos; + if (pConfig->bNewickFormat){ + uint iSegLength = curPos*pConfig->dSeqLength-iLastCumulativePos; + iLastCumulativePos += iSegLength; + } + // check if 
there was an existing gene conversion event that needs + // to be closed. backtrack if necessary. + this->bEndGeneConversion = checkPendingGeneConversions(curPos); + if (pTsRecorder) { + pTsRecorder->recordTreeInterval(*pEdgeVectorInTree, iTotalTreeEdges, + lastPos, curPos); + } + if (useMacsMut && pConfig->dTheta>0.0){ + addMutationsTs(lastPos,curPos); + } + lastPos = curPos; + ++iGraphIteration; + }while(curPossimplify(); + } +} + vector GraphBuilder::getMutations() { return mutations; } +tsk_table_collection_t * GraphBuilder::releaseTableCollectionTs(double timeScale, + bool expandInbred) { + if (!pTsRecorder) { + return nullptr; + } + return pTsRecorder->release(timeScale, expandInbred); +} diff --git a/src/meiosis.cpp b/src/meiosis.cpp index a7f9ee30..7a36b305 100644 --- a/src/meiosis.cpp +++ b/src/meiosis.cpp @@ -72,6 +72,53 @@ arma::Mat RecHist::getHist(arma::uword ind, return hist(ind)(chr)(par); } +// Like RecHist, but store double positions (e.g., genetic coordinate) +class RecHistDbl{ +public: + arma::field< //individual + arma::field< //chromosome + arma::field< //ploidy + arma::Mat > > > hist; //(chr, posGen) + + void setSize(arma::uword nInd, + arma::uword nChr, + arma::uword ploidy); + + void addHist(arma::Mat& input, + arma::uword nInd, + arma::uword chrGroup, + arma::uword chrInd); + + arma::Mat getHist(arma::uword ind, + arma::uword chr, + arma::uword par); +}; + +void RecHistDbl::setSize(arma::uword nInd, + arma::uword nChr, + arma::uword ploidy=2){ + hist.set_size(nInd); + for(arma::uword i=0; i& input, + arma::uword nInd, + arma::uword chrGroup, + arma::uword chrInd){ + hist(nInd)(chrGroup)(chrInd) = input; +} + +arma::Mat RecHistDbl::getHist(arma::uword ind, + arma::uword chr, + arma::uword par){ + return hist(ind)(chr)(par); +} + // Samples the locations for chiasmata via a gamma process // end, the length of the interval used to sample // v, the interference parameter @@ -396,6 +443,55 @@ arma::Mat findBivalentCO(const arma::vec& genMap, 
double v, double p, return removeDoubleCO(output); } +// Continuous (genetic-coordinate) recombination history for a bivalent pair +// Output columns: (originChr, startPosGen) +// Row 0 is always (1, 0.0) +arma::Mat findBivalentCO_gen(const arma::vec& genMap, double v, double p, + alphasimrRng::rngEngine& rng){ + arma::uword readChr = 0; + double genLen = genMap(genMap.n_elem-1); + + // 1) Sample crossover positions on the genetic map + arma::vec posCO = sampleChiasmata(genLen, v, p, rng); + + // If no chiasmata were sampled: return a single record (all from chr1) + if(posCO.n_elem==0){ + arma::Mat output(1,2); + output(0,0) = 1.0; // origin chr1 + output(0,1) = 0.0; // start at beginning of chromosome (continuous) + return output; + } + + // 2)Thin crossovers + arma::vec thin = alphasimrRng::runifVec(posCO.n_elem, rng); + posCO = posCO(find(thin>0.5)); + + arma::uword nCO = posCO.n_elem; + + arma::Mat output(nCO+1, 2); + + // 3) Allocate output + output(0,0) = 1.0; + output(0,1) = 0.0; + + if(nCO==0){ + return output; + } + + posCO = sort(posCO); + + // 4) Alternate origin each crossover + for(arma::uword i=0; i& chr1, } } +// Simulates a gamete using the existing discrete (bin-based) model for geno, +// AND also returns a continuous (genetic-coordinate) recombination history. 
+// +// - hist: int matrix (originChr, startSite/bin) used for transferGeno +// - histGen: double matrix (originChr, startPosGen) keeping all breakpoints +void bivalent2(const arma::Col& chr1, + const arma::Col& chr2, + const arma::vec& genMap, + double v, + double p, + arma::Col& output, + arma::Mat& hist, + arma::Mat& histGen, + alphasimrRng::rngEngine& rng){ + + arma::uword startPos = 0; + arma::uword endPos; + arma::uword readChr = 0; + double genLen = genMap(genMap.n_elem - 1); + + // 1) Sample crossover positions once (shared) + arma::vec posCO = sampleChiasmata(genLen, v, p, rng); + + // 2) Thin crossovers (same rule as original) + if(posCO.n_elem > 0){ + arma::vec thin = alphasimrRng::runifVec(posCO.n_elem, rng); + posCO = posCO(find(thin > 0.5)); + } + + // Ensure increasing order for intervalSearch and for histGen + if(posCO.n_elem > 1){ + posCO = sort(posCO); + } + + arma::uword nCO = posCO.n_elem; + + // 3) Build continuous history (keep all breakpoints) + // Row 0 always starts from chr 1 at position 0.0 + histGen.set_size(nCO + 1, 2); + histGen(0,0) = 1.0; + histGen(0,1) = 0.0; + + readChr = 0; + for(arma::uword i = 0; i < nCO; ++i){ + readChr = (readChr + 1) % 2; + histGen(i + 1, 0) = double(readChr + 1); + histGen(i + 1, 1) = posCO(i); + } + + // 4) Build discrete history for transferGeno (may be simplified) + // Match original convention: row0 is (1,1); later start sites use endPos+2 + arma::Mat histRaw(nCO + 1, 2); + histRaw(0,0) = 1; + histRaw(0,1) = 1; + + if(nCO == 0){ + // No crossovers: single record is enough + hist = histRaw; + output = chr1; + return; + } + + readChr = 0; + startPos = 0; + for(arma::uword i = 0; i < nCO; ++i){ + readChr = (readChr + 1) % 2; + double x = posCO(i); + endPos = intervalSearch(genMap, x, startPos); + histRaw(i + 1, 0) = int(readChr + 1); + histRaw(i + 1, 1) = int(endPos + 2); + startPos = endPos; + } + + // Remove unobservable/redundant records for the discrete geno-transfer path only + hist = 
removeDoubleCO(histRaw); + + // 5) Use the discrete history to transfer genotype bits (unchanged logic) + int nBins = chr1.n_elem; + + if(hist.n_rows == 1){ + output = chr1; + return; + } + + for(arma::uword i = 0; i < (hist.n_rows - 1); ++i){ + switch(hist(i,0)){ + case 1: + transferGeno(chr1, output, hist(i,1), hist(i+1,1)); + break; + case 2: + transferGeno(chr2, output, hist(i,1), hist(i+1,1)); + break; + } + } + + switch(hist(hist.n_rows - 1, 0)){ + case 1: + transferGeno(chr1, output, hist(hist.n_rows - 1, 1), nBins*8 + 1); + break; + case 2: + transferGeno(chr2, output, hist(hist.n_rows - 1, 1), nBins*8 + 1); + break; + } +} + + +// Simulates a gamete using the existing discrete (bin-based) model for geno, +// AND also returns a continuous (genetic-coordinate) recombination history. +// +// - hist: original int matrix (originChr, startSite/bin) used for transferGeno +// - histGen: new double matrix (originChr, startPosGen) with true breakpoints +void bivalent2Old(const arma::Col& chr1, + const arma::Col& chr2, + const arma::vec& genMap, + double v, + double p, + arma::Col& output, + arma::Mat& hist, + // histGen added + arma::Mat& histGen, + alphasimrRng::rngEngine& rng){ + + hist = findBivalentCO(genMap, v, p, rng); + + // histGen added + histGen = findBivalentCO_gen(genMap, v, p, rng); + + if(hist.n_rows==1){ + output = chr1; + }else{ + int nBins = chr1.n_elem; + + for(arma::uword i=0; i<(hist.n_rows-1); ++i){ + switch(hist(i,0)){ + case 1: + transferGeno(chr1, output, hist(i,1), hist(i+1,1)); + break; + case 2: + transferGeno(chr2, output, hist(i,1), hist(i+1,1)); + } + } + + switch(hist(hist.n_rows-1,0)){ + case 1: + transferGeno(chr1, output, hist(hist.n_rows-1,1), nBins*8+1); + break; + case 2: + transferGeno(chr2, output, hist(hist.n_rows-1,1), nBins*8+1); + } + } +} + // Simulates a gamete using a count-location model for recombination // rng is the explicit dqrng stream used for crossover sampling and thinning. 
void quadrivalent(const arma::Col& chr1, @@ -1099,7 +1347,9 @@ Rcpp::List cross( const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, - int nThreads){ + int nThreads, + /* modified by Jinyang */ + bool trackRecGen){ mother -= 1; // R to C++ father -= 1; // R to C++ arma::uword ploidy = (motherPloidy+fatherPloidy)/2; @@ -1111,6 +1361,13 @@ Rcpp::List cross( if(trackRec){ hist.setSize(nInd,nChr,ploidy); } + + // modified by Jinyang + RecHistDbl histGen; + if(trackRecGen){ + histGen.setSize(nInd, nChr, ploidy); + } + if(nChr < static_cast(nThreads) ){ nThreads = nChr; } @@ -1122,6 +1379,8 @@ Rcpp::List cross( for(arma::uword chr=0; chr hist1, hist2; + // modified by Jinyang + arma::Mat histG1, histG2; arma::uvec xm(motherPloidy); // Indicator for mother chromosomes for(arma::uword i=0; i2){ if(alphasimrRng::runif(rng)>quadProb){ //Bivalent 1 - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), - motherGeno(chr).slice(mother(ind)).col(xm(x+1)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else {// ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1158,17 +1430,40 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); 
+ } + // ----modified by Jinyang ++progenyChr; //Bivalent 2 - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), - motherGeno(chr).slice(mother(ind)).col(xm(x+3)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), + motherGeno(chr).slice(mother(ind)).col(xm(x+3)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), + motherGeno(chr).slice(mother(ind)).col(xm(x+3)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1176,6 +1471,15 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+3))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x+2))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+3))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } + // ----modified by Jinyang ++progenyChr; }else{ //Quadrivalent @@ -1208,14 +1512,29 @@ Rcpp::List cross( } }else{ //Bivalent - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), - motherGeno(chr).slice(mother(ind)).col(xm(x+1)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } + tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; 
//To avoid conflicts @@ -1223,6 +1542,15 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } + // ----modified by Jinyang ++progenyChr; } } @@ -1233,14 +1561,28 @@ Rcpp::List cross( if((fatherPloidy-x)>2){ if(alphasimrRng::runif(rng)>quadProb){ //Bivalent 1 - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), - fatherGeno(chr).slice(father(ind)).col(xf(x+1)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1248,17 +1590,39 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xf(x))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; //Bivalent 2 - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), - fatherGeno(chr).slice(father(ind)).col(xf(x+3)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + 
bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), + fatherGeno(chr).slice(father(ind)).col(xf(x+3)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), + fatherGeno(chr).slice(father(ind)).col(xf(x+3)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1266,6 +1630,14 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+3))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xf(x+2))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+3))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; }else{ //Quadrivalent @@ -1298,14 +1670,28 @@ Rcpp::List cross( } }else{ //Bivalent - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), - fatherGeno(chr).slice(father(ind)).col(xf(x+1)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1313,15 +1699,31 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, 
double(int(xf(x))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; } } } //End individual loop geno(chr) = tmpGeno; } //End chromosome loop + + // modified by Jinyang ---- if(trackRec){ - return Rcpp::List::create(Rcpp::Named("geno")=geno, - Rcpp::Named("recHist")=hist.hist); + if(trackRecGen){ + return Rcpp::List::create(Rcpp::Named("geno")=geno, + Rcpp::Named("recHist")=hist.hist, + Rcpp::Named("recHistGen")=histGen.hist); + } else { + return Rcpp::List::create(Rcpp::Named("geno")=geno, + Rcpp::Named("recHist")=hist.hist); + } } return Rcpp::List::create(Rcpp::Named("geno")=geno); } diff --git a/src/postTS.cpp b/src/postTS.cpp new file mode 100644 index 00000000..916bb957 --- /dev/null +++ b/src/postTS.cpp @@ -0,0 +1,210 @@ +#include "rng.h" +#include "postTS.h" + +#include +#include +#include +#include +#include + +namespace tsPost { + +void checkTsk(const int status, const char *context) { + if (status < 0) { + throw std::runtime_error(std::string(context) + ": " + + std::string(tsk_strerror(status))); + } +} + +void expandInbredSamplesInPlace(tsk_table_collection_t *tables, + const unsigned int ploidy) { + if (tables == nullptr || ploidy <= 1) { + return; + } + const double sequenceLength = tables->sequence_length; + if (!(sequenceLength > 0.0)) { + return; + } + + std::vector originalSamples; + originalSamples.reserve(tables->nodes.num_rows); + for (tsk_id_t nodeId = 0; + nodeId < static_cast(tables->nodes.num_rows); + ++nodeId) { + if ((tables->nodes.flags[nodeId] & TSK_NODE_IS_SAMPLE) != 0) { + originalSamples.push_back(nodeId); + } + } + if (originalSamples.empty()) { + return; + } + + const double smallestPositive = std::nextafter(0.0, 1.0); + std::vector minParentTimeByNode( + tables->nodes.num_rows, + std::numeric_limits::infinity()); + for (tsk_size_t edgeId = 0; edgeId < tables->edges.num_rows; ++edgeId) { + const tsk_id_t childId = tables->edges.child[edgeId]; + const 
tsk_id_t parentId = tables->edges.parent[edgeId]; + if (childId == TSK_NULL || parentId == TSK_NULL || + childId < 0 || + static_cast(childId) >= tables->nodes.num_rows) { + continue; + } + const double parentTime = tables->nodes.time[parentId]; + if (std::isfinite(parentTime) && + parentTime < minParentTimeByNode[childId]) { + minParentTimeByNode[childId] = parentTime; + } + } + + for (const tsk_id_t sampleNodeId : originalSamples) { + const double minParentTime = minParentTimeByNode[sampleNodeId]; + double internalTime = tables->nodes.time[sampleNodeId]; + if (!(internalTime > 0.0)) { + internalTime = 1e-12; + if (std::isfinite(minParentTime) && minParentTime > 0.0) { + internalTime = std::min(internalTime, 0.5 * minParentTime); + } + if (!(internalTime > 0.0)) { + internalTime = smallestPositive; + } + if (std::isfinite(minParentTime) && !(internalTime < minParentTime)) { + internalTime = std::nextafter(minParentTime, 0.0); + } + if (!(internalTime > 0.0)) { + internalTime = smallestPositive; + } + } + + tables->nodes.time[sampleNodeId] = internalTime; + tables->nodes.flags[sampleNodeId] + &= ~static_cast(TSK_NODE_IS_SAMPLE); + const tsk_id_t population = tables->nodes.population[sampleNodeId]; + const tsk_id_t individual = tables->nodes.individual[sampleNodeId]; + tables->nodes.individual[sampleNodeId] = TSK_NULL; + + for (tsk_size_t mutationId = 0; + mutationId < tables->mutations.num_rows; + ++mutationId) { + if (tables->mutations.node[mutationId] != sampleNodeId) { + continue; + } + const double mutationTime = tables->mutations.time[mutationId]; + if (!tsk_is_unknown_time(mutationTime) && mutationTime < internalTime) { + tables->mutations.time[mutationId] = internalTime; + } + } + + for (unsigned int copy = 0; copy < ploidy; ++copy) { + const tsk_id_t childNodeId = tsk_node_table_add_row(&tables->nodes, + TSK_NODE_IS_SAMPLE, + 0.0, + population, + individual, + nullptr, + 0); + checkTsk(static_cast(childNodeId), + "Failed to add duplicated inbred sample 
node"); + const tsk_id_t edgeId = tsk_edge_table_add_row(&tables->edges, + 0.0, + sequenceLength, + sampleNodeId, + childNodeId, + nullptr, + 0); + checkTsk(static_cast(edgeId), + "Failed to add duplicated inbred sample edge"); + } + } +} + +void mutateTablesInPlace(tsk_table_collection_t *tables, + const double theta, + const uint64_t seed) { + if (!(theta > 0.0) || !std::isfinite(theta)) { + return; + } + if (tables == nullptr) { + throw std::runtime_error("Table collection pointer is null"); + } + if (!(tables->sequence_length > 0.0)) { + throw std::runtime_error("Table collection has invalid sequence_length"); + } + + dqrng::rng64_t rng = alphasimrRng::createRng(seed); + static const char ancestralState[] = "0"; + static const char derivedState[] = "1"; + const double sequenceLength = tables->sequence_length; + + for (tsk_size_t edgeId = 0; edgeId < tables->edges.num_rows; ++edgeId) { + const tsk_id_t parent = tables->edges.parent[edgeId]; + const tsk_id_t child = tables->edges.child[edgeId]; + if (parent == TSK_NULL || child == TSK_NULL || + parent < 0 || child < 0) { + continue; + } + + const double left = tables->edges.left[edgeId]; + const double right = tables->edges.right[edgeId]; + const double span = right - left; + if (!(span > 0.0)) { + continue; + } + + const double parentTime = tables->nodes.time[parent]; + const double childTime = tables->nodes.time[child]; + const double branch = parentTime - childTime; + if (!(branch > 0.0) || !std::isfinite(branch)) { + continue; + } + + const double spanFraction = span / sequenceLength; + const double lambda = theta * spanFraction * branch; + if (!(lambda > 0.0) || !std::isfinite(lambda)) { + continue; + } + + const arma::uword nMut = alphasimrRng::samplePoisson(lambda, *rng); + for (arma::uword i = 0; i < nMut; ++i) { + double position = left + alphasimrRng::runif(*rng) * span; + if (position >= sequenceLength) { + position = std::nextafter(sequenceLength, 0.0); + } + const tsk_id_t siteId = 
tsk_site_table_add_row(&tables->sites, + position, + ancestralState, + 1, + nullptr, + 0); + checkTsk(static_cast(siteId), "Failed to add site row"); + + double mutationTime = childTime + alphasimrRng::runif(*rng) * branch; + if (!(mutationTime > childTime)) { + mutationTime = std::nextafter(childTime, parentTime); + } + if (!(mutationTime < parentTime)) { + mutationTime = std::nextafter(parentTime, childTime); + } + + const tsk_id_t mutationId = tsk_mutation_table_add_row(&tables->mutations, + siteId, + child, + TSK_NULL, + mutationTime, + derivedState, + 1, + nullptr, + 0); + checkTsk(static_cast(mutationId), "Failed to add mutation row"); + } + } + + (void)tsk_table_collection_drop_index(tables, 0); + checkTsk(tsk_table_collection_sort(tables, nullptr, 0), + "Failed to sort table collection after post-TS mutation"); + checkTsk(tsk_table_collection_build_index(tables, 0), + "Failed to build index after post-TS mutation"); +} + +} // namespace tsPost diff --git a/src/postTS.h b/src/postTS.h new file mode 100644 index 00000000..eb3ecd7e --- /dev/null +++ b/src/postTS.h @@ -0,0 +1,21 @@ +#ifndef ALPHASIMR_POST_TS_H +#define ALPHASIMR_POST_TS_H + +#include + +#include "tskit.h" + +namespace tsPost { + +void checkTsk(int status, const char *context); + +void expandInbredSamplesInPlace(tsk_table_collection_t *tables, + unsigned int ploidy); + +void mutateTablesInPlace(tsk_table_collection_t *tables, + double theta, + uint64_t seed); + +} // namespace tsPost + +#endif diff --git a/src/simulator.cpp b/src/simulator.cpp index 92335f0c..37655684 100644 --- a/src/simulator.cpp +++ b/src/simulator.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include // Include for boost::split #include // Include boost::for is_any_of #include "simulator.h" +#include #include // Include for boost::split #include "misc.h" @@ -628,6 +630,34 @@ vector Simulator::beginSimulationMemory() { return toRet; } +tsk_table_collection_t * 
Simulator::beginSimulationTs(bool usePhysicalPositions, + bool useMacsMut, + double timeScale, + bool inbred, + unsigned int ploidy, + bool expandInbred) { + + if (pConfig->iIterations != 1) { + Rcpp::stop("TS mode currently supports iIterations = 1"); + } + + tsk_table_collection_t * toRet = nullptr; + try { + RandNumGenerator *rg = new RandNumGenerator(pConfig->iRandomSeed); + GraphBuilder graphBuilder = GraphBuilder(pConfig, rg); + graphBuilder.buildTs(usePhysicalPositions, useMacsMut, inbred, ploidy); + toRet = graphBuilder.releaseTableCollectionTs(timeScale, expandInbred); + delete rg; + } catch (const std::exception & e) { + Rcpp::Rcerr << "Simulator caught exception with message:" << endl + << e.what() << endl; + } catch (const char *message) { + Rcpp::Rcerr << "Simulator caught exception with message:" << endl + << message << endl; + } + return toRet; +} + void Simulator::beginSimulation() { try { @@ -689,6 +719,46 @@ vector runFromAlphaSimR(string in) { return test; } +tsk_table_collection_t * runFromAlphaSimRTs(string in, bool usePhysicalPositions, + bool useMacsMut, + double timeScale, + bool inbred, + unsigned int ploidy, + bool expandInbred) { + vector words; + Simulator simulator; + + if (in == ""){ + Rcpp::stop("Not enough args for macs call"); + } + if (in.empty()) { + Rcpp::stop("Not enough args for macs call"); + } + boost::split(words, in, boost::is_any_of(", "), boost::token_compress_on); + CommandArguments arguments; + vector subOption; + // sample size + subOption.emplace_back(words[0]); + // seq length + subOption.emplace_back(words[1]); + arguments.push_back(subOption); + subOption.clear(); + for (unsigned int i=2;i=65)){ + arguments.push_back(subOption); + subOption.clear(); + } + } + if (arguments.size() == 0) { + Rcpp::stop("Not enough args for macs call"); + } + + simulator.readInputParameters(arguments); + return simulator.beginSimulationTs(usePhysicalPositions, useMacsMut, timeScale, + inbred, ploidy, expandInbred); +} + // Runs MaCS 
once per chromosome and converts the output to AlphaSimR's packed // genotype representation. // @@ -840,3 +910,79 @@ Rcpp::List MaCS(Rcpp::String args, arma::uvec maxSites, bool inbred, return Rcpp::List::create(Rcpp::Named("geno")=geno, Rcpp::Named("genMap")=genMap); } + +// Runs MaCS once per chromosome and returns tree-sequence table collections. +// nChr is the number of chromosomes to simulate. +// usePhysicalPositions controls coordinate space for TS tables: +// FALSE (default): unit interval [0, 1], same coordinate system as runMacs internals +// TRUE: physical bp coordinates [0, dSeqLength] +// useMacsMut controls whether MaCS-style mutation sampling is performed during +// ancestry generation (TRUE), or ancestry-only tables are returned (FALSE). +// Nref optionally sets a reference effective population size for conversion +// from scaled coalescent units to generations using timeScale = 4 * Nref. +// [[Rcpp::export]] +Rcpp::List MaCSTS(Rcpp::String args, int nChr, bool inbred, + arma::uword ploidy, int nThreads, arma::uvec seed, + bool usePhysicalPositions = false, + bool useMacsMut = false, + double Nref = NA_REAL, + bool expandInbredSamples = true){ + if (args == "") { + Rcpp::stop("error passing argument string - it's empty"); + } + if (nChr <= 0) { + Rcpp::stop("nChr must be a positive integer"); + } + if (ploidy == 0) { + Rcpp::stop("ploidy must be a positive integer"); + } + + std::string argsString = args; + const arma::uword nChrU = static_cast(nChr); + if (seed.n_elem != nChrU) { + Rcpp::stop("seed length must match number of chromosomes"); + } + double timeScale = 1.0; + if (std::isfinite(Nref)) { + if (!(Nref > 0.0)) { + Rcpp::stop("Nref must be positive when provided"); + } + timeScale = 4.0 * Nref; + } + + std::vector tables(nChrU, nullptr); + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) num_threads(nThreads) +#endif + for (arma::uword chr = 0; chr < nChrU; ++chr) { + std::string seedString = + 
std::to_string(static_cast(seed[chr])); + tables[chr] = runFromAlphaSimRTs(argsString + seedString, + usePhysicalPositions, + useMacsMut, + timeScale, + inbred, + static_cast(ploidy), + expandInbredSamples); + } + + Rcpp::List tsTables(nChrU); + for (arma::uword chr = 0; chr < nChrU; ++chr) { + if (tables[chr] == nullptr) { + Rcpp::stop("TS simulation failed for chromosome %d", + static_cast(chr + 1)); + } + rtsk_table_collection_t out(tables[chr], true); + tsTables[chr] = out; + } + + return Rcpp::List::create( + Rcpp::Named("tables") = tsTables, + Rcpp::Named("usePhysicalPositions") = usePhysicalPositions, + Rcpp::Named("useMacsMut") = useMacsMut, + Rcpp::Named("expandInbredSamples") = expandInbredSamples, + Rcpp::Named("mutationMode") = useMacsMut ? "macs" : "none", + Rcpp::Named("timeScale") = timeScale, + Rcpp::Named("Nref") = std::isfinite(Nref) ? Nref : NA_REAL); +} diff --git a/src/simulator.h b/src/simulator.h index b20c1648..ca20de4a 100644 --- a/src/simulator.h +++ b/src/simulator.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include //#include #include #include @@ -10,6 +12,7 @@ #include #include #include "constants.h" +#include "tskit.h" using namespace std; @@ -478,6 +481,40 @@ class Mutation{ bool bPrintOutput; }; +enum class TsPositionMode {MACS_UNIT, PHYSICAL_BP}; + +class TsRecorder { +public: + TsRecorder(double seqLengthBp, TsPositionMode positionMode, + bool inbred, unsigned int ploidy); + ~TsRecorder(); + void preRegisterSamples(NodePtr * pSampleNodes, unsigned int nSamples); + void recordTreeInterval(const EdgePtrVector & treeEdges, + unsigned int iTotalTreeEdges, + double leftPosUnit, double rightPosUnit); + void recordMutation(double mutationPosUnit, EdgePtr & selectedEdge, + double mutationTime); + void simplify(); + tsk_table_collection_t * release(double timeScale = 1.0, + bool expandInbred = true); + +private: + tsk_table_collection_t * pTables; + TsPositionMode positionMode; + double dSequenceLengthBp; + bool 
bSamplesPreRegistered; + bool bSimplified; + bool bInbred; + unsigned int iPloidy; + std::vector sampleNodeIds; + std::unordered_map nodeIdMap; + std::unordered_map sampleNodeIndividualMap; + void expandInbredSamples(); + double toTsPosition(double posUnit) const; + tsk_id_t ensurePopulation(short int population); + tsk_id_t getOrCreateNode(NodePtr & node); +}; + // Configuration container populated by parameter reading procedure // can be used by any simulator implementation class Configuration @@ -531,9 +568,13 @@ class GraphBuilder // The entry point for building the graph while traversing the // the chromosome on the unit interval. void build(); + void buildTs(bool usePhysicalPositions = false, bool useMacsMut = false, + bool inbred = false, unsigned int ploidy = 2); // Print the haplotypes in MS format void printHaplotypes(); vector getMutations(); + tsk_table_collection_t * releaseTableCollectionTs(double timeScale = 1.0, + bool expandInbred = true); private: // The random number generator @@ -618,6 +659,7 @@ class GraphBuilder bool bBeginGeneConversion; // flag to close a pending gene conversion event bool bEndGeneConversion; + std::unique_ptr pTsRecorder; // if gene conversion is to be closed at this iteration use the following // two saved edges EdgePtr gcOldEdge,gcNewEdge; @@ -655,6 +697,7 @@ class GraphBuilder // Add a mutation uniformly to the local tree, allowing it to trickle down // to the sampled chromosomes void addMutations(double startPos,double endPos); + void addMutationsTs(double startPos,double endPos); // Uniform randomly selects an edge (and position) to insert a xover or mutation node into EdgePtr getRandomEdgeOnTree(double & dSplitPoint,double dRandSpot); // Once a coalescent height is determined, uniform randomly select an @@ -743,6 +786,12 @@ class Simulator // case, constructs a new graphbuilder and calls the build() function void beginSimulation(); vector beginSimulationMemory(); + tsk_table_collection_t * beginSimulationTs(bool 
usePhysicalPositions = false, + bool useMacsMut = false, + double timeScale = 1.0, + bool inbred = false, + unsigned int ploidy = 2, + bool expandInbred = true); Simulator(); ~Simulator(); //destructor diff --git a/src/ts.cpp b/src/ts.cpp index fcaaf473..6a0c1933 100644 --- a/src/ts.cpp +++ b/src/ts.cpp @@ -1,5 +1,6 @@ #include "alphasimr.h" #include +#include "postTS.h" // [[Rcpp::depends(RcppTskit)]] // [[Rcpp::plugins(RcppTskit)]] @@ -56,3 +57,30 @@ int rtsk_treeseq_get_num_individuals2(const SEXP ts) { rtsk_treeseq_t ts_xptr(ts); return static_cast(tsk_treeseq_get_num_individuals(ts_xptr)); } + +// [[Rcpp::export]] +void tsMutateTableCollection(const SEXP tc, const double theta, + const uint64_t seed) { + rtsk_table_collection_t tc_xptr(tc); + tsk_table_collection_t *tables = tc_xptr; + tsPost::mutateTablesInPlace(tables, theta, seed); +} + +// [[Rcpp::export]] +void tsFinalizeInbredTableCollection(const SEXP tc, const int ploidy) { + if (ploidy <= 1) { + return; + } + rtsk_table_collection_t tc_xptr(tc); + tsk_table_collection_t *tables = tc_xptr; + if (tables == nullptr) { + Rcpp::stop("Table collection pointer is null"); + } + + tsPost::expandInbredSamplesInPlace(tables, static_cast(ploidy)); + (void)tsk_table_collection_drop_index(tables, 0); + tsPost::checkTsk(tsk_table_collection_sort(tables, nullptr, 0), + "Failed to sort table collection after inbred finalization"); + tsPost::checkTsk(tsk_table_collection_build_index(tables, 0), + "Failed to build index after inbred finalization"); +} diff --git a/src/tsRecorder.cpp b/src/tsRecorder.cpp new file mode 100644 index 00000000..e37c8b3c --- /dev/null +++ b/src/tsRecorder.cpp @@ -0,0 +1,276 @@ +#include "simulator.h" +#include "postTS.h" + +#include +#include +#include +#include + +namespace { + +inline void checkTsk(int status, const char * context) { + if (status < 0) { + throw std::runtime_error(std::string(context) + ": " + + std::string(tsk_strerror(status))); + } +} + +void 
rescaleTableCollectionTimes(tsk_table_collection_t * tables, double timeScale) { + if (tables == nullptr || timeScale == 1.0) { + return; + } + if (!(timeScale > 0.0) || !std::isfinite(timeScale)) { + throw std::runtime_error("timeScale must be a finite positive value"); + } + + for (tsk_size_t i = 0; i < tables->nodes.num_rows; ++i) { + tables->nodes.time[i] *= timeScale; + } + for (tsk_size_t i = 0; i < tables->mutations.num_rows; ++i) { + const double t = tables->mutations.time[i]; + if (!tsk_is_unknown_time(t)) { + tables->mutations.time[i] = t * timeScale; + } + } + for (tsk_size_t i = 0; i < tables->migrations.num_rows; ++i) { + tables->migrations.time[i] *= timeScale; + } + + static const char generationUnits[] = "generations"; + checkTsk(tsk_table_collection_set_time_units( + tables, generationUnits, sizeof(generationUnits) - 1), + "Failed to set TS time_units"); +} + +} // namespace + +TsRecorder::TsRecorder(double seqLengthBp, TsPositionMode positionMode, + bool inbred, unsigned int ploidy): + pTables(nullptr), + positionMode(positionMode), + dSequenceLengthBp(seqLengthBp > 0.0 ? seqLengthBp : 1.0), + bSamplesPreRegistered(false), + bSimplified(false), + bInbred(inbred), + iPloidy(ploidy > 0 ? ploidy : 1) { + + pTables = new tsk_table_collection_t; + checkTsk(tsk_table_collection_init(pTables, 0), "Failed to initialise tsk tables"); + pTables->sequence_length = positionMode == TsPositionMode::PHYSICAL_BP ? 
+ dSequenceLengthBp : 1.0; +} + +TsRecorder::~TsRecorder() { + if (pTables != nullptr) { + tsk_table_collection_free(pTables); + delete pTables; + pTables = nullptr; + } +} + +void TsRecorder::preRegisterSamples(NodePtr * pSampleNodes, unsigned int nSamples) { + if (bSamplesPreRegistered) { + return; + } + if (pTables == nullptr) { + throw std::runtime_error("TS tables are not initialized"); + } + if (!bInbred && (nSamples % iPloidy != 0)) { + throw std::runtime_error("Sample count is not divisible by ploidy in outbred TS mode"); + } + + sampleNodeIds.clear(); + sampleNodeIndividualMap.clear(); + sampleNodeIds.reserve(nSamples); + + const unsigned int nIndividuals = bInbred ? nSamples : nSamples / iPloidy; + std::vector individualIds; + individualIds.reserve(nIndividuals); + for (unsigned int i = 0; i < nIndividuals; ++i) { + tsk_id_t indivId = tsk_individual_table_add_row(&pTables->individuals, + 0, + nullptr, + 0, + nullptr, + 0, + nullptr, + 0); + checkTsk(static_cast(indivId), "Failed to add individual row"); + individualIds.push_back(indivId); + } + + for (unsigned int i = 0; i < nSamples; ++i) { + NodePtr & node = pSampleNodes[i]; + if (!node) { + continue; + } + const unsigned long long nodeKey = node->getId(); + const unsigned int individualIndex = bInbred ? i : i / iPloidy; + sampleNodeIndividualMap[nodeKey] = individualIds[individualIndex]; + // Preserve runMacs sample order in TS node ids by creating sample rows first. + sampleNodeIds.push_back(getOrCreateNode(node)); + } + bSamplesPreRegistered = true; +} + +double TsRecorder::toTsPosition(double posUnit) const { + double out = positionMode == TsPositionMode::PHYSICAL_BP ? + posUnit * dSequenceLengthBp : posUnit; + const double seqLength = positionMode == TsPositionMode::PHYSICAL_BP ? 
+ dSequenceLengthBp : 1.0; + if (out < 0.0) { + out = 0.0; + } else if (out > seqLength) { + out = seqLength; + } + return out; +} + +tsk_id_t TsRecorder::ensurePopulation(short int population) { + if (population < 0) { + return TSK_NULL; + } + while (pTables->populations.num_rows <= static_cast(population)) { + tsk_id_t newPopulation = tsk_population_table_add_row(&pTables->populations, + nullptr, 0); + checkTsk(static_cast(newPopulation), "Failed to add population row"); + } + return static_cast(population); +} + +tsk_id_t TsRecorder::getOrCreateNode(NodePtr & node) { + const unsigned long long nodeKey = node->getId(); + const auto it = nodeIdMap.find(nodeKey); + if (it != nodeIdMap.end()) { + return it->second; + } + + tsk_flags_t flags = 0; + if (node->getType() == Node::SAMPLE) { + flags |= TSK_NODE_IS_SAMPLE; + } + const tsk_id_t population = ensurePopulation(node->getPopulation()); + tsk_id_t individual = TSK_NULL; + if (node->getType() == Node::SAMPLE) { + const auto sampleIt = sampleNodeIndividualMap.find(nodeKey); + if (sampleIt != sampleNodeIndividualMap.end()) { + individual = sampleIt->second; + } + } + tsk_id_t nodeId = tsk_node_table_add_row(&pTables->nodes, + flags, + node->getHeight(), + population, + individual, + nullptr, + 0); + checkTsk(static_cast(nodeId), "Failed to add node row"); + nodeIdMap.insert(std::make_pair(nodeKey, nodeId)); + return nodeId; +} + +void TsRecorder::recordTreeInterval(const EdgePtrVector & treeEdges, + unsigned int iTotalTreeEdges, + double leftPosUnit, double rightPosUnit) { + const double left = toTsPosition(leftPosUnit); + const double right = toTsPosition(rightPosUnit); + if (right <= left) { + return; + } + + for (unsigned int i = 0; i < iTotalTreeEdges; ++i) { + EdgePtr edge = treeEdges[i]; + if (edge->bDeleted) { + continue; + } + NodePtr & parentNode = edge->getTopNodeRef(); + NodePtr & childNode = edge->getBottomNodeRef(); + const tsk_id_t parent = getOrCreateNode(parentNode); + const tsk_id_t child = 
getOrCreateNode(childNode); + tsk_id_t edgeId = tsk_edge_table_add_row(&pTables->edges, + left, + right, + parent, + child, + nullptr, + 0); + checkTsk(static_cast(edgeId), "Failed to add edge row"); + } +} + +void TsRecorder::recordMutation(double mutationPosUnit, EdgePtr & selectedEdge, + double mutationTime) { + double position = toTsPosition(mutationPosUnit); + const double seqLength = positionMode == TsPositionMode::PHYSICAL_BP ? + dSequenceLengthBp : 1.0; + if (position >= seqLength) { + // Enforce [0, sequence_length) constraint used by tskit for site positions. + position = std::nextafter(seqLength, 0.0); + } + + static const char ancestralState[] = "0"; + static const char derivedState[] = "1"; + tsk_id_t siteId = tsk_site_table_add_row(&pTables->sites, + position, + ancestralState, + 1, + nullptr, + 0); + checkTsk(static_cast(siteId), "Failed to add site row"); + + NodePtr & childNode = selectedEdge->getBottomNodeRef(); + const tsk_id_t nodeId = getOrCreateNode(childNode); + tsk_id_t mutationId = tsk_mutation_table_add_row(&pTables->mutations, + siteId, + nodeId, + TSK_NULL, + mutationTime, + derivedState, + 1, + nullptr, + 0); + checkTsk(static_cast(mutationId), "Failed to add mutation row"); +} + +void TsRecorder::simplify() { + if (pTables == nullptr || bSimplified) { + return; + } + if (sampleNodeIds.empty()) { + bSimplified = true; + return; + } + checkTsk(tsk_table_collection_sort(pTables, nullptr, 0), + "Failed to sort table collection before simplify"); + checkTsk(tsk_table_collection_simplify(pTables, + sampleNodeIds.data(), + sampleNodeIds.size(), + 0, + nullptr), + "Failed to simplify table collection"); + bSimplified = true; +} + +void TsRecorder::expandInbredSamples() { + if (!bInbred || iPloidy <= 1) { + return; + } + tsPost::expandInbredSamplesInPlace(pTables, iPloidy); +} + +tsk_table_collection_t * TsRecorder::release(double timeScale, bool expandInbred) { + if (pTables == nullptr) { + return nullptr; + } + if (expandInbred) { + 
expandInbredSamples(); + } + rescaleTableCollectionTimes(pTables, timeScale); + checkTsk(tsk_table_collection_sort(pTables, nullptr, 0), + "Failed to sort table collection"); + checkTsk(tsk_table_collection_build_index(pTables, 0), + "Failed to build table collection index"); + tsk_table_collection_t * out = pTables; + pTables = nullptr; + return out; +} diff --git a/tests/testthat/test-macsts-staged.R b/tests/testthat/test-macsts-staged.R new file mode 100644 index 00000000..61f20a42 --- /dev/null +++ b/tests/testthat/test-macsts-staged.R @@ -0,0 +1,280 @@ +context("MaCSTS staged simAnc/simMut checks") + +skip_if_not_installed("RcppTskit") + +ts_get <- function(ts, name) { + value <- ts[[name]] + if (is.function(value)) { + value() + } else { + value + } +} + +ts_variants_iterator <- function(ts) { + variants <- ts[["variants"]] + if (is.function(variants)) { + variants() + } else { + variants + } +} + +ts_next_variant <- function(it) { + if (is.null(it)) { + return(NULL) + } + if (is.function(it)) { + return(it()) + } + + nxt <- it[["next_variant"]] + if (!is.null(nxt)) { + if (is.function(nxt)) { + return(nxt()) + } + return(nxt) + } + + nxt <- it[["next"]] + if (!is.null(nxt)) { + if (is.function(nxt)) { + return(nxt()) + } + return(nxt) + } + + return(NULL) +} + +ts_variant_keys <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + ts <- tc$tree_sequence() + it <- ts_variants_iterator(ts) + keys <- character(0) + repeat { + v <- ts_next_variant(it) + if (is.null(v)) { + break + } + pos <- format(signif(as.numeric(v$position), 15), + scientific = FALSE, trim = TRUE) + keys <- c(keys, paste0(pos, "|", paste(as.integer(v$genotypes), collapse = ""))) + } + sort(keys) +} + +table_counts <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + ts <- tc$tree_sequence() + list( + num_sites = as.integer(tc$num_sites()), + num_mutations = as.integer(tc$num_mutations()), + num_nodes = as.integer(tc$num_nodes()), + 
num_edges = as.integer(tc$num_edges()), + num_trees = as.integer(ts_get(ts, "num_trees")) + ) +} + +node_times <- function(tc) { + n_nodes <- as.integer(tc$num_nodes()) + vapply(seq_len(n_nodes), function(i) { + as.numeric(tc$node_table_get_row(i - 1L)$time) + }, numeric(1)) +} + +test_that("simAnc + simMut staged workflow is reproducible for fixed seeds", { + args <- "8 5000 -t 1e-3 -r 1e-4 -s " + nChr <- 2L + seed <- as.integer(c(101, 202)) + mut_seed <- as.integer(c(555, 666)) + dTheta <- c(80, 80) + + anc_a <- AlphaSimR:::simAnc( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = FALSE, + Nref = NA_real_ + ) + anc_b <- AlphaSimR:::simAnc( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = FALSE, + Nref = NA_real_ + ) + + expect_identical(anc_a$stage, "simAnc") + expect_identical(anc_a$mutationMode, "none") + expect_identical(anc_a$useMacsMut, FALSE) + expect_equal(length(anc_a$tables), nChr) + + # simAnc is ancestry-only: no sites/mutations should be present yet. 
+ for (chr in seq_len(nChr)) { + c0 <- table_counts(anc_a$tables[[chr]]) + expect_equal(c0$num_sites, 0L) + expect_equal(c0$num_mutations, 0L) + expect_gt(c0$num_trees, 0L) + expect_gt(c0$num_nodes, 0L) + expect_gt(c0$num_edges, 0L) + } + + mut_a <- AlphaSimR:::simMut(anc_a, dTheta = dTheta, seed = mut_seed) + mut_b <- AlphaSimR:::simMut(anc_b, dTheta = dTheta, seed = mut_seed) + + expect_identical(mut_a$stage, "simMut") + expect_identical(mut_a$mutationMode, "postTs") + expect_equal(as.integer(mut_a$mutationSeed), mut_seed) + + keys_a <- lapply(mut_a$tables, ts_variant_keys) + keys_b <- lapply(mut_b$tables, ts_variant_keys) + expect_identical(keys_a, keys_b) + + for (chr in seq_len(nChr)) { + c1 <- table_counts(mut_a$tables[[chr]]) + expect_gt(c1$num_mutations, 0L) + expect_equal(c1$num_sites, c1$num_mutations) + } +}) + +test_that("simMut with zero dTheta leaves ancestry tables unchanged", { + args <- "8 5000 -t 1e-3 -r 1e-4 -s " + anc <- AlphaSimR:::simAnc( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = as.integer(42), + usePhysicalPositions = FALSE, + Nref = NA_real_ + ) + + before <- table_counts(anc$tables[[1]]) + out <- AlphaSimR:::simMut(anc, dTheta = 0, seed = as.integer(99)) + after <- table_counts(out$tables[[1]]) + + expect_identical(before, after) + expect_equal(after$num_sites, 0L) + expect_equal(after$num_mutations, 0L) +}) + +test_that("post-TS mutations are placed within edge span and branch-time bounds", { + args <- "8 5000 -t 1e-3 -r 1e-4 -s " + anc <- AlphaSimR:::simAnc( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = as.integer(88), + usePhysicalPositions = TRUE, + Nref = NA_real_ + ) + + out <- AlphaSimR:::simMut(anc, dTheta = 150, seed = as.integer(77)) + tc <- RcppTskit::TableCollection$new(xptr = out$tables[[1]]) + + n_mut <- as.integer(tc$num_mutations()) + n_edge <- as.integer(tc$num_edges()) + n_node <- as.integer(tc$num_nodes()) + n_site <- 
as.integer(tc$num_sites()) + seq_len <- as.numeric(tc$sequence_length()) + + expect_gt(n_mut, 0L) + expect_equal(n_site, n_mut) + + edge_child <- integer(n_edge) + edge_parent <- integer(n_edge) + edge_left <- numeric(n_edge) + edge_right <- numeric(n_edge) + for (i in seq_len(n_edge)) { + e <- tc$edge_table_get_row(i - 1L) + edge_child[i] <- as.integer(e$child) + edge_parent[i] <- as.integer(e$parent) + edge_left[i] <- as.numeric(e$left) + edge_right[i] <- as.numeric(e$right) + } + + node_time <- numeric(n_node) + for (i in seq_len(n_node)) { + node_time[i] <- as.numeric(tc$node_table_get_row(i - 1L)$time) + } + + site_pos <- numeric(n_site) + for (i in seq_len(n_site)) { + site_pos[i] <- as.numeric(tc$site_table_get_row(i - 1L)$position) + } + + for (i in seq_len(n_mut)) { + m <- tc$mutation_table_get_row(i - 1L) + child <- as.integer(m$node) + site <- as.integer(m$site) + mut_time <- as.numeric(m$time) + pos <- site_pos[site + 1L] + + expect_true(pos >= 0) + expect_true(pos < seq_len) + + idx <- which(edge_child == child & edge_left <= pos & pos < edge_right) + expect_true(length(idx) > 0L) + + child_time <- node_time[child + 1L] + parent_time <- node_time[edge_parent[idx] + 1L] + expect_true(any(mut_time > child_time & mut_time < parent_time)) + } +}) + +test_that("Nref rescales node times and sets TS time_units to generations", { + args <- "8 5000 -t 1e-3 -r 1e-4 -s " + seed <- as.integer(12345) + nref <- 10000 + scale <- 4 * nref + + anc_unit <- AlphaSimR:::simAnc( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = FALSE, + Nref = NA_real_ + ) + anc_gen <- AlphaSimR:::simAnc( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = FALSE, + Nref = nref + ) + + tc_unit <- RcppTskit::TableCollection$new(xptr = anc_unit$tables[[1]]) + tc_gen <- RcppTskit::TableCollection$new(xptr = anc_gen$tables[[1]]) + + 
expect_equal(anc_unit$timeScale, 1) + expect_equal(anc_gen$timeScale, scale) + expect_identical(tc_unit$time_units(), "unknown") + expect_identical(tc_gen$time_units(), "generations") + + t_unit <- node_times(tc_unit) + t_gen <- node_times(tc_gen) + expect_equal(length(t_unit), length(t_gen)) + expect_true(isTRUE(all.equal(t_gen, t_unit * scale, tolerance = 1e-8))) + + c_unit <- table_counts(anc_unit$tables[[1]]) + c_gen <- table_counts(anc_gen$tables[[1]]) + expect_equal(c_unit$num_nodes, c_gen$num_nodes) + expect_equal(c_unit$num_edges, c_gen$num_edges) + expect_equal(c_unit$num_trees, c_gen$num_trees) +}) diff --git a/tests/testthat/test-macsts-useMacsMut.R b/tests/testthat/test-macsts-useMacsMut.R new file mode 100644 index 00000000..af9e15bb --- /dev/null +++ b/tests/testthat/test-macsts-useMacsMut.R @@ -0,0 +1,378 @@ +context("MaCSTS useMacsMut compatibility") + +to_int01_matrix <- function(x) { + matrix(as.integer(x), nrow = nrow(x), ncol = ncol(x), dimnames = dimnames(x)) +} + +ts_get <- function(ts, name) { + value <- ts[[name]] + if (is.function(value)) { + value() + } else { + value + } +} + +ts_variants_iterator <- function(ts) { + variants <- ts[["variants"]] + if (is.function(variants)) { + variants() + } else { + variants + } +} + +ts_next_variant <- function(it) { + if (is.null(it)) { + return(NULL) + } + if (is.function(it)) { + return(it()) + } + + nxt <- it[["next_variant"]] + if (!is.null(nxt)) { + if (is.function(nxt)) { + return(nxt()) + } + return(nxt) + } + + nxt <- it[["next"]] + if (!is.null(nxt)) { + if (is.function(nxt)) { + return(nxt()) + } + return(nxt) + } + + return(NULL) +} + +extract_macs_chr <- function(macs_out, chr = 1L, nThreads = 1L) { + pos <- as.numeric(macs_out$genMap[[chr]]) + n_sites <- length(pos) + if (n_sites == 0L) { + stop("zero sites produced; choose higher mutation settings for this test") + } + hap_raw <- AlphaSimR:::getHaplo( + geno = macs_out$geno[chr], + lociPerChr = as.integer(n_sites), + lociLoc = 
as.integer(seq_len(n_sites)), + nThreads = as.integer(nThreads) + ) + list(pos = pos, hap = to_int01_matrix(hap_raw)) +} + +extract_ts_chr <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + ts <- tc$tree_sequence() + n_samples <- as.integer(ts_get(ts, "num_samples")) + it <- ts_variants_iterator(ts) + pos <- numeric(0) + cols <- list() + repeat { + v <- ts_next_variant(it) + if (is.null(v)) { + break + } + pos <- c(pos, as.numeric(v$position)) + cols[[length(cols) + 1L]] <- as.integer(v$genotypes) + } + hap <- if (length(cols) == 0L) { + matrix(integer(0), nrow = n_samples, ncol = 0L) + } else { + do.call(cbind, cols) + } + list( + pos = pos, + hap = hap, + num_sites = as.integer(ts_get(ts, "num_sites")), + num_mutations = as.integer(ts_get(ts, "num_mutations")) + ) +} + +site_hap_keys <- function(pos, hap) { + if (length(pos) == 0L) { + return(character(0)) + } + p <- format(signif(pos, 15), scientific = FALSE, trim = TRUE) + vapply( + seq_len(ncol(hap)), + function(j) paste0(p[j], "|", paste(hap[, j], collapse = "")), + character(1) + ) +} + +compare_chr <- function(macs_chr, ts_chr) { + ord_m <- order(macs_chr$pos) + ord_t <- order(ts_chr$pos) + m_pos <- macs_chr$pos[ord_m] + t_pos <- ts_chr$pos[ord_t] + m_hap <- macs_chr$hap[, ord_m, drop = FALSE] + t_hap <- ts_chr$hap[, ord_t, drop = FALSE] + + same_nsites <- ncol(m_hap) == ncol(t_hap) + same_positions_strict <- isTRUE(all.equal(m_pos, t_pos, tolerance = 0)) + same_hap_strict <- identical(m_hap, t_hap) + + keys_m <- sort(site_hap_keys(macs_chr$pos, macs_chr$hap)) + keys_t <- sort(site_hap_keys(ts_chr$pos, ts_chr$hap)) + same_site_hap_multiset <- identical(keys_m, keys_t) + has_duplicate_positions <- any(duplicated(m_pos)) || any(duplicated(t_pos)) + + list( + same_nsites = same_nsites, + same_positions_strict = same_positions_strict, + same_hap_strict = same_hap_strict, + same_site_hap_multiset = same_site_hap_multiset, + has_duplicate_positions = has_duplicate_positions, + 
ts_num_sites = ts_chr$num_sites, + ts_num_mutations = ts_chr$num_mutations + ) +} + +run_case <- function(args, nChr, inbred, ploidy, seed, nThreads = 1L) { + seed_vec <- rep(as.integer(seed), as.integer(nChr)) + macs <- AlphaSimR:::MaCS( + args = args, + maxSites = rep(0L, as.integer(nChr)), + inbred = inbred, + ploidy = as.integer(ploidy), + nThreads = as.integer(nThreads), + seed = seed_vec + ) + ts_out <- AlphaSimR:::MaCSTS( + args = args, + nChr = as.integer(nChr), + inbred = inbred, + ploidy = as.integer(ploidy), + nThreads = as.integer(nThreads), + seed = seed_vec, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + + out <- lapply(seq_len(as.integer(nChr)), function(chr) { + m <- extract_macs_chr(macs, chr = chr, nThreads = nThreads) + t <- extract_ts_chr(ts_out$tables[[chr]]) + compare_chr(m, t) + }) + out +} + +test_that("MaCSTS(useMacsMut=TRUE) matches MaCS across representative scenarios", { + scenarios <- list( + list( + name = "base_outbred_ploidy2", + args = "8 50000 -t 1e-3 -r 1e-4 -s ", + inbred = FALSE, + ploidy = 2L + ), + list( + name = "base_outbred_ploidy1", + args = "8 50000 -t 1e-3 -r 1e-4 -s ", + inbred = FALSE, + ploidy = 1L + ), + list( + name = "base_inbred_ploidy2", + args = "8 50000 -t 1e-3 -r 1e-4 -s ", + inbred = TRUE, + ploidy = 2L + ), + list( + name = "demography_eN", + args = "8 50000 -t 1e-3 -r 1e-4 -eN 0.2 2.0 -eN 0.9 0.5 -s ", + inbred = FALSE, + ploidy = 2L + ), + list( + name = "multipop_with_migration_change", + args = "8 50000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -eM 0.5 5e-3 -s ", + inbred = FALSE, + ploidy = 2L + ), + list( + name = "multipop_en_plus_join", + args = "8 50000 -t 1e-3 -r 1e-4 -I 2 4 4 1e-2 -en 0.2 2 0.5 -ej 1.0 2 1 -s ", + inbred = FALSE, + ploidy = 2L + ) + ) + + for (sc in scenarios) { + res <- run_case( + args = sc$args, + nChr = 1L, + inbred = sc$inbred, + ploidy = sc$ploidy, + seed = 12345L, + nThreads = 1L + )[[1]] + + expect_true(res$same_nsites, info = sc$name) + 
expect_true(res$same_site_hap_multiset, info = sc$name) + expect_true(res$same_positions_strict, info = sc$name) + # With duplicate positions, column order can differ while the site/haplotype + # multiset remains identical; strict matrix identity is too strong. + if (!isTRUE(res$has_duplicate_positions)) { + expect_true(res$same_hap_strict, info = sc$name) + } + expect_equal(res$ts_num_sites, res$ts_num_mutations, info = sc$name) + } +}) + +test_that("MaCSTS(useMacsMut=TRUE) is reproducible across chromosomes for fixed seeds", { + args <- "8 50000 -t 1e-3 -r 1e-4 -s " + nChr <- 3L + seed_vec <- as.integer(c(101, 202, 303)) + + out_a <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed_vec, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + out_b <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed_vec, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + out_c <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed_vec + 1L, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + + keys_from <- function(out) { + lapply(seq_len(nChr), function(chr) { + x <- extract_ts_chr(out$tables[[chr]]) + sort(site_hap_keys(x$pos, x$hap)) + }) + } + + keys_a <- keys_from(out_a) + keys_b <- keys_from(out_b) + keys_c <- keys_from(out_c) + + expect_identical(keys_a, keys_b) + # Different seeds should usually differ, but in rare runs can coincide. 
+ if (!any(!vapply(seq_len(nChr), function(i) identical(keys_a[[i]], keys_c[[i]]), logical(1)))) { + skip("Different seed vector produced identical site/haplotype keys in this run") + } +}) + +test_that("usePhysicalPositions changes coordinate scale only", { + args <- "8 50000 -t 1e-3 -r 1e-4 -s " + nChr <- 1L + seed <- as.integer(777) + seq_len_bp <- as.numeric(strsplit(args, "[,[:space:]]+", perl = TRUE)[[1]][2]) + + out_unit <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ) + out_bp <- AlphaSimR:::MaCSTS( + args = args, + nChr = nChr, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = seed, + usePhysicalPositions = TRUE, + useMacsMut = TRUE + ) + + chr_unit <- extract_ts_chr(out_unit$tables[[1]]) + chr_bp <- extract_ts_chr(out_bp$tables[[1]]) + + expect_true(all(chr_unit$pos >= 0 & chr_unit$pos <= 1)) + expect_true(all(chr_bp$pos >= 0 & chr_bp$pos <= seq_len_bp)) + + # Haplotypes should match exactly; positions should match after rescaling. 
+ expect_identical(sort(site_hap_keys(chr_unit$pos, chr_unit$hap)), + sort(site_hap_keys(chr_bp$pos / seq_len_bp, chr_bp$hap))) + expect_true(isTRUE(all.equal(sort(chr_unit$pos), sort(chr_bp$pos / seq_len_bp), tolerance = 1e-12))) +}) + +test_that("MaCSTS validates key inputs", { + args <- "8 50000 -t 1e-3 -r 1e-4 -s " + + expect_error( + AlphaSimR:::MaCSTS( + args = args, + nChr = 0L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = as.integer(1), + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ), + "nChr must be a positive integer" + ) + + expect_error( + AlphaSimR:::MaCSTS( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 0L, + nThreads = 1L, + seed = as.integer(1), + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ), + "ploidy must be a positive integer" + ) + + expect_error( + AlphaSimR:::MaCSTS( + args = args, + nChr = 2L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = as.integer(1), + usePhysicalPositions = FALSE, + useMacsMut = TRUE + ), + "seed length must match number of chromosomes" + ) + + expect_error( + AlphaSimR:::MaCSTS( + args = args, + nChr = 1L, + inbred = FALSE, + ploidy = 2L, + nThreads = 1L, + seed = as.integer(1), + usePhysicalPositions = FALSE, + useMacsMut = TRUE, + Nref = 0 + ), + "Nref must be positive when provided" + ) +}) diff --git a/tests/testthat/test-runMacTS-sensitivity.R b/tests/testthat/test-runMacTS-sensitivity.R new file mode 100644 index 00000000..1b237545 --- /dev/null +++ b/tests/testthat/test-runMacTS-sensitivity.R @@ -0,0 +1,324 @@ +context("runMacTS wrapper and parameter sensitivity") + +skip_if_not_installed("RcppTskit") + +tc_summary <- function(tc_xptr) { + AlphaSimR::rtsk_table_collection_summary2(tc_xptr) +} + +node_times <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + n_nodes <- as.integer(tc$num_nodes()) + vapply(seq_len(n_nodes), function(i) { + as.numeric(tc$node_table_get_row(i - 1L)$time) + }, numeric(1)) +} + +sample_node_count <- 
function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + n_nodes <- as.integer(tc$num_nodes()) + flags <- vapply(seq_len(n_nodes), function(i) { + as.integer(tc$node_table_get_row(i - 1L)$flags) + }, integer(1)) + sum(bitwAnd(flags, 1L) != 0L) +} + +site_position_range <- function(tc_xptr) { + tc <- RcppTskit::TableCollection$new(xptr = tc_xptr) + n_sites <- as.integer(tc$num_sites()) + if (n_sites == 0L) { + return(c(min = NA_real_, max = NA_real_)) + } + pos <- vapply(seq_len(n_sites), function(i) { + as.numeric(tc$site_table_get_row(i - 1L)$position) + }, numeric(1)) + c(min = min(pos), max = max(pos)) +} + +parse_seq_len <- function(args) { + tokens <- strsplit(as.character(args), "[,[:space:]]+", perl = TRUE)[[1L]] + tokens <- tokens[nzchar(tokens)] + as.numeric(tokens[2L]) +} + +run_staged_from_wrapper <- function(out, inbred, ploidy, segSites, + nThreads = 1L, + usePhysicalPositions = FALSE, + expandInbredTs = FALSE, + siteSamplingSeed = 42L) { + nChr <- length(out$tables) + if (isTRUE(usePhysicalPositions)) { + stop("run_staged_from_wrapper helper currently supports usePhysicalPositions = FALSE only") + } + + anc <- AlphaSimR:::simAnc( + args = out$args, + nChr = nChr, + inbred = inbred, + ploidy = ploidy, + nThreads = nThreads, + seed = out$seed, + usePhysicalPositions = usePhysicalPositions, + Nref = NA_real_ + ) + + dThetaPost <- as.numeric(anc$dTheta) / as.numeric(anc$timeScale) + mutSeed <- if (!all(is.na(out$mutSeed))) { + as.integer(out$mutSeed) + } else { + as.integer(out$seed) + 104729L + } + runOut <- AlphaSimR:::simMut(anc, dTheta = dThetaPost, seed = mutSeed) + + if (isTRUE(expandInbredTs) && isTRUE(inbred) && ploidy > 1L) { + runOut <- AlphaSimR:::finalizeInbredTs(runOut, inbred = inbred, ploidy = ploidy) + } + + breaks <- rep(list(c(0, 1)), nChr) + rates <- rep(list(c(1)), nChr) + pop <- AlphaSimR:::asMapPop( + chr_info = list(tables = runOut$tables, breaks = breaks, rates = rates), + ploidy = ploidy, + inbred = inbred, + 
segSites = segSites, + site_sampling_seed = as.integer(siteSamplingSeed), + nThreads = as.integer(nThreads), + returnMeta = FALSE + ) + + list(tables = runOut$tables, pop = pop) +} + +expect_wrapper_staged_equal <- function(out, staged) { + expect_identical(out$pop@nLoci, staged$pop@nLoci) + for (chr in seq_along(out$tables)) { + expect_identical(out$pop@geno[[chr]], staged$pop@geno[[chr]]) + expect_true(isTRUE(all.equal(out$pop@genMap[[chr]], staged$pop@genMap[[chr]], tolerance = 0))) + + sw <- tc_summary(out$tables[[chr]]) + ss <- tc_summary(staged$tables[[chr]]) + expect_equal(sw$num_nodes, ss$num_nodes) + expect_equal(sw$num_edges, ss$num_edges) + expect_equal(sw$num_sites, ss$num_sites) + expect_equal(sw$num_mutations, ss$num_mutations) + } +} + +test_that("runMacTS(postTs) wrapper matches staged workflow (outbred)", { + out <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 2, + segSites = 60, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "postTs", + usePhysicalPositions = FALSE, + nThreads = 1L, + seed = as.integer(c(11, 22)), + mutSeed = as.integer(c(111, 222)), + siteSamplingSeed = 42L, + returnTs = TRUE + ) + + staged <- run_staged_from_wrapper( + out = out, + inbred = FALSE, + ploidy = 2L, + segSites = 60, + nThreads = 1L, + usePhysicalPositions = FALSE, + expandInbredTs = FALSE, + siteSamplingSeed = 42L + ) + + expect_wrapper_staged_equal(out, staged) +}) + +test_that("runMacTS(postTs) wrapper matches staged workflow (inbred, ploidy > 1)", { + out <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 2, + segSites = 60, + inbred = TRUE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "postTs", + usePhysicalPositions = FALSE, + expandInbredTs = TRUE, + nThreads = 1L, + seed = as.integer(c(11, 22)), + mutSeed = as.integer(c(111, 222)), + siteSamplingSeed = 42L, + returnTs = TRUE + ) + + staged <- run_staged_from_wrapper( + out = out, + inbred = TRUE, + ploidy = 2L, + segSites = 60, + nThreads = 1L, + usePhysicalPositions = FALSE, + 
expandInbredTs = TRUE, + siteSamplingSeed = 42L + ) + + expect_wrapper_staged_equal(out, staged) +}) + +test_that("mutationMode='none' returns ancestry-only TS", { + out <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 2, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "none", + usePhysicalPositions = FALSE, + nThreads = 1L, + seed = as.integer(c(101, 202)), + returnTs = TRUE + ) + + expect_null(out$pop) + expect_identical(out$mutationMode, "none") + expect_true(all(is.na(out$mutSeed))) + + for (chr in seq_along(out$tables)) { + s <- tc_summary(out$tables[[chr]]) + expect_equal(s$num_sites, 0L) + expect_equal(s$num_mutations, 0L) + expect_gt(s$num_nodes, 0L) + expect_gt(s$num_edges, 0L) + } +}) + +test_that("usePhysicalPositions changes coordinate scale while keeping sampled output", { + common <- list( + nInd = 4, + nChr = 1, + segSites = 60, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "postTs", + nThreads = 1L, + seed = as.integer(123), + mutSeed = as.integer(456), + siteSamplingSeed = 42L, + returnTs = TRUE + ) + out_unit <- do.call(AlphaSimR:::runMacTS, c(common, list(usePhysicalPositions = FALSE))) + out_bp <- do.call(AlphaSimR:::runMacTS, c(common, list(usePhysicalPositions = TRUE))) + + s_unit <- tc_summary(out_unit$tables[[1]]) + s_bp <- tc_summary(out_bp$tables[[1]]) + seq_len_bp <- parse_seq_len(out_bp$args) + + expect_equal(as.numeric(s_unit$sequence_length), 1.0, tolerance = 0) + expect_equal(as.numeric(s_bp$sequence_length), seq_len_bp, tolerance = 0) + expect_equal(s_unit$num_sites, s_bp$num_sites) + expect_equal(s_unit$num_mutations, s_bp$num_mutations) + + rng_unit <- site_position_range(out_unit$tables[[1]]) + rng_bp <- site_position_range(out_bp$tables[[1]]) + expect_true(rng_unit["min"] >= 0 && rng_unit["max"] < 1) + expect_true(rng_bp["min"] >= 0 && rng_bp["max"] < seq_len_bp) + + expect_identical(out_unit$pop@nLoci, out_bp$pop@nLoci) + expect_identical(out_unit$pop@geno[[1]], 
out_bp$pop@geno[[1]]) + expect_true(isTRUE(all.equal(out_unit$pop@genMap[[1]], out_bp$pop@genMap[[1]], tolerance = 1e-12))) +}) + +test_that("Nref rescales TS times in runMacTS ancestry-only mode", { + seed <- as.integer(42) + nref <- 10000 + scale <- 4 * nref + + out_unit <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 1, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "none", + usePhysicalPositions = FALSE, + Nref = NA_real_, + nThreads = 1L, + seed = seed, + returnTs = TRUE + ) + out_gen <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 1, + inbred = FALSE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "none", + usePhysicalPositions = FALSE, + Nref = nref, + nThreads = 1L, + seed = seed, + returnTs = TRUE + ) + + expect_equal(out_unit$timeScale, 1) + expect_equal(out_gen$timeScale, scale) + + s_unit <- tc_summary(out_unit$tables[[1]]) + s_gen <- tc_summary(out_gen$tables[[1]]) + expect_identical(s_unit$time_units, "unknown") + expect_identical(s_gen$time_units, "generations") + expect_equal(s_unit$num_nodes, s_gen$num_nodes) + expect_equal(s_unit$num_edges, s_gen$num_edges) + expect_equal(s_unit$num_trees, s_gen$num_trees) + + t_unit <- node_times(out_unit$tables[[1]]) + t_gen <- node_times(out_gen$tables[[1]]) + expect_equal(length(t_unit), length(t_gen)) + expect_true(isTRUE(all.equal(t_gen, t_unit * scale, tolerance = 1e-8))) +}) + +test_that("expandInbredTs toggles inbred leaf expansion in TS", { + out_no_expand <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 1, + inbred = TRUE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "none", + usePhysicalPositions = FALSE, + expandInbredTs = FALSE, + nThreads = 1L, + seed = as.integer(7), + returnTs = TRUE + ) + out_expand <- AlphaSimR:::runMacTS( + nInd = 4, + nChr = 1, + inbred = TRUE, + ploidy = 2L, + species = "GENERIC", + mutationMode = "none", + usePhysicalPositions = FALSE, + expandInbredTs = TRUE, + nThreads = 1L, + seed = as.integer(7), + returnTs = TRUE + ) + + s0 <- 
tc_summary(out_no_expand$tables[[1]]) + s1 <- tc_summary(out_expand$tables[[1]]) + expect_equal(s0$num_sites, 0L) + expect_equal(s1$num_sites, 0L) + expect_equal(s0$num_mutations, 0L) + expect_equal(s1$num_mutations, 0L) + expect_gt(s1$num_nodes, s0$num_nodes) + expect_gt(s1$num_edges, s0$num_edges) + + n_sample_0 <- sample_node_count(out_no_expand$tables[[1]]) + n_sample_1 <- sample_node_count(out_expand$tables[[1]]) + expect_equal(n_sample_0, 4L) + expect_equal(n_sample_1, 8L) +})