diff --git a/DESCRIPTION b/DESCRIPTION index cfde80a7..69a4f50f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BGmisc Title: An R Package for Extended Behavior Genetics Analysis -Version: 1.5.2 +Version: 1.6.0 Authors@R: c( person("S. Mason", "Garrison", , "garrissm@wfu.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4804-6003")), diff --git a/NEWS.md b/NEWS.md index 6737cbd6..ed537985 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# BGmisc NEWS + +# BGmisc 1.6 +* Optimize simulatePedigree and helpers for speed and memory usage +* Major gains in speed for deeper pedigrees +* Added more tests for simulatePedigree +* Fix error when not enough single people available + # BGmisc 1.5.2 * More flexible ID generation for simulatePedigree * Created ped2gen function to extract generation information from pedigree data.frames diff --git a/R/buildComponent.R b/R/buildComponent.R index 3c2ac464..38a96522 100644 --- a/R/buildComponent.R +++ b/R/buildComponent.R @@ -1,9 +1,7 @@ #' Take a pedigree and turn it into a relatedness matrix #' @param ped a pedigree dataset. Needs ID, momID, and dadID columns #' @param component character. Which component of the pedigree to return. See Details. -#' @param max_gen the maximum number of generations to compute -#' (e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. -#' `Inf` uses as many generations as there are in the data. +#' @param max_gen the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25. #' @param sparse logical. If TRUE, use and return sparse matrices from Matrix package #' @param verbose logical. If TRUE, print progress through stages of algorithm #' @param update_rate numeric. The rate at which to print progress @@ -100,6 +98,7 @@ ped2com <- function(ped, component, "tcrossprod", "crossprod", "star", "tcross.alt.crossprod", "tcross.alt.star" ) + if (!config$transpose_method %in% transpose_method_options) { stop(paste0( "Invalid method specified. Choose from ", @@ -107,10 +106,14 @@ ped2com <- function(ped, component, )) } - - if (!config$adjacency_method %in% - c("indexed", "loop", "direct", "beta")) { - stop("Invalid method specified. Choose from 'indexed', 'loop', 'direct', or 'beta'.") + # Validate the 'adjacency_method' argument + adjacency_method_options <- c("indexed", "loop", "direct", "beta") + if (!config$adjacency_method %in% adjacency_method_options + ) { + stop(paste0( + "Invalid method specified. Choose from ", + paste(adjacency_method_options, collapse = ", "), "." + )) } # standardize colnames @@ -215,12 +218,21 @@ ped2com <- function(ped, component, count <- 0 } maxCount <- config$max_gen + 1 + if (config$verbose == TRUE) { cat("About to do RAM path tracing\n") } # r is I + A + A^2 + ... = (I-A)^-1 from RAM # could trim, here + ## it keeps going until it explains all of the relatedness with themselves (i.e., mtSum == 0) + # some of this precision is articifuial because we literally get to the point that the condon is eaither there or not. probabiliticy + + # how much percision do we need to get unbiased estimates + + # big matrix still happens here because the network is built. just less percise on inbreeding + + # bias-precision tradeoff. how much percision do we need to get unbiased estimates? not a lot while (mtSum != 0 && count < maxCount) { r <- r + newIsPar gen <- gen + (Matrix::rowSums(newIsPar) > 0) diff --git a/R/helpPedigree.R b/R/helpPedigree.R deleted file mode 100644 index f8aa1cbf..00000000 --- a/R/helpPedigree.R +++ /dev/null @@ -1,196 +0,0 @@ -#' Create Data Frame for Generation -#' -#' This function creates a data frame for a specific generation within the simulated pedigree. -#' It initializes the data frame with default values for family ID, individual ID, generation number, -#' paternal ID, maternal ID, spouse ID, and sex. All individuals are initially set with NA for paternal, -#' maternal, spouse IDs, and sex, awaiting further assignment. -#' -#' @param sizeGens A numeric vector containing the sizes of each generation within the pedigree. -#' @param genIndex An integer representing the current generation index for which the data frame is being created. -#' @param idGen A numeric vector containing the ID numbers to be assigned to individuals in the current generation. -#' @return A data frame representing the initial structure for the individuals in the specified generation -#' before any relationships (parental, spousal) are defined. The columns include family ID (`fam`), -#' individual ID (`id`), generation number (`gen`), father's ID (`pat`), mother's ID (`mat`), -#' spouse's ID (`spID`), and sex (`sex`), with NA values for paternal, maternal, and spouse IDs, and sex. -#' @examples -#' sizeGens <- c(3, 5, 4) # Example sizes for 3 generations -#' genIndex <- 2 # Creating data frame for the 2nd generation -#' idGen <- 101:105 # Example IDs for the 2nd generation -#' df_Ngen <- createGenDataFrame(sizeGens, genIndex, idGen) -#' print(df_Ngen) -#' @export -createGenDataFrame <- function(sizeGens, genIndex, idGen) { - df_Ngen <- data.frame( - fam = rep(paste("fam", 1), sizeGens[genIndex], sep = ""), - id = idGen[1:sizeGens[genIndex]], - gen = rep(genIndex, sizeGens[genIndex]), - pat = rep(NA, sizeGens[genIndex]), # father id - mat = rep(NA, sizeGens[genIndex]), # mother id - spID = rep(NA, sizeGens[genIndex]), # spouse id - sex = rep(NA, sizeGens[genIndex]) - ) - return(df_Ngen) -} - - -#' Determine Sex of Offspring -#' -#' This internal function assigns sexes to the offspring in a generation based on the specified sex ratio. -#' -#' @param idGen Vector of IDs for the generation. -#' @param sexR Numeric value indicating the sex ratio (proportion of males). -#' @param code_male The value to use for males. Default is "M" -#' @param code_female The value to use for females. Default is "F" -#' @return Vector of sexes ("M" for male, "F" for female) for the offspring. -#' @importFrom stats runif -#' @export -determineSex <- function(idGen, sexR, code_male = "M", code_female = "F") { - if (runif(1) > .5) { - sexVec1 <- rep(code_male, floor(length(idGen) * sexR)) - sexVec2 <- rep(code_female, length(idGen) - length(sexVec1)) - } else { - sexVec1 <- rep(code_female, floor(length(idGen) * (1 - sexR))) - sexVec2 <- rep(code_male, length(idGen) - length(sexVec1)) - } - sexVec <- sample(c(sexVec1, sexVec2)) - return(sexVec) -} - -#' Assign Couple IDs -#' -#' This subfunction assigns a unique couple ID to each mated pair in the generation. -#' Unmated individuals are assigned NA for their couple ID. -#' -#' @param df_Ngen The dataframe for the current generation, including columns for individual IDs and spouse IDs. -#' @return The input dataframe augmented with a 'coupleId' column, where each mated pair has a unique identifier. -#' @keywords internal -#' -assignCoupleIDs <- function(df_Ngen) { - df_Ngen$coupleId <- NA_character_ # Initialize the coupleId column with NAs - usedCoupleIds <- character() # Initialize an empty character vector to track used IDs - - for (j in seq_len(nrow(df_Ngen))) { - if (!is.na(df_Ngen$spID[j]) && is.na(df_Ngen$coupleId[j])) { - # Construct a potential couple ID from sorted individual and spouse IDs - sortedIds <- sort(c(df_Ngen$id[j], df_Ngen$spID[j])) - potentialCoupleId <- paste(sortedIds[1], sortedIds[2], sep = "_") - - # Check if the potentialCoupleId has not already been used - if (!potentialCoupleId %in% usedCoupleIds) { - # Assign the new couple ID to both partners - df_Ngen$coupleId[j] <- potentialCoupleId - spouseIndex <- which(df_Ngen$id == df_Ngen$spID[j]) - df_Ngen$coupleId[spouseIndex] <- potentialCoupleId - - # Add the new couple ID to the list of used IDs - usedCoupleIds <- c(usedCoupleIds, potentialCoupleId) - } - } - } - - return(df_Ngen) -} - - -#' @rdname assignCoupleIDs -assignCoupleIds <- assignCoupleIDs - -#' Generate or Adjust Number of Kids per Couple Based on Mating Rate -#' -#' This function generates or adjusts the number of kids per couple in a generation -#' based on the specified average and whether the count should be randomly determined. -#' -#' @param nMates Integer, the number of mated pairs in the generation. -#' @inheritParams simulatePedigree -#' -#' @return A numeric vector with the generated or adjusted number of kids per couple. -#' @keywords internal -adjustKidsPerCouple <- function(nMates, kpc, rd_kpc = TRUE) { - if (rd_kpc == TRUE) { - diff <- nMates + 1 - while (diff > nMates) { - random_numbers <- stats::rpois(nMates, kpc) - # cat("original random numbers", random_numbers, "\n") - diff <- abs(nMates * kpc - sum(random_numbers)) - } - # make sure the sum of kids per couple is equal to the number of kids in the i th generation - if (sum(random_numbers) < nMates * kpc) { - names(random_numbers) <- seq_along(random_numbers) - random_numbers <- sort(random_numbers) - random_numbers[1:diff] <- random_numbers[1:diff] + 1 - random_numbers <- random_numbers[order(names(random_numbers))] - } else if (sum(random_numbers) > nMates * kpc) { - names(random_numbers) <- seq_along(random_numbers) - random_numbers <- sort(random_numbers, decreasing = TRUE) - random_numbers[1:diff] <- random_numbers[1:diff] - 1 - random_numbers <- random_numbers[order(names(random_numbers))] - } - } else { - random_numbers <- rep(kpc, nMates) - } - - if (min(random_numbers) < 0) { - random_numbers[random_numbers == -1] <- 0 - random_numbers[random_numbers == max(random_numbers)] <- max(random_numbers) - 1 - } - - return(random_numbers) -} - -#' Mark and Assign children -#' -#' This subfunction marks individuals in a generation as potential sons, daughters, -#' or parents based on their relationships and assigns unique couple IDs. It processes -#' the assignment of roles and relationships within and between generations in a pedigree simulation. -#' @inheritParams determineSex -#' @param df_Ngen A data frame for the current generation being processed. -#' It must include columns for individual IDs (`id`), spouse IDs (`spID`), sex (`sex`), -#' and any previously assigned roles (`ifparent`, `ifson`, `ifdau`). -#' @param i Integer, the index of the current generation being processed. -#' @param Ngen Integer, the total number of generations in the simulation. -#' @param sizeGens Numeric vector, containing the size (number of individuals) of each generation. -#' @param CoupleF Integer, IT MIGHT BE the number of couples in the current generation. -#' -#' -#' @return Modifies `df_Ngen` in place by updating or adding columns related to individual roles -#' (`ifparent`, `ifson`, `ifdau`) and couple IDs (`coupleId`). The updated data frame is -#' also returned for integration into the larger pedigree data frame (`df_Fam`). -#' - -markPotentialChildren <- function(df_Ngen, i, Ngen, sizeGens, CoupleF, code_male = "M", code_female = "F") { - # Step 2.1: mark a group of potential sons and daughters in the i th generation - - # get all couple ids - coupleID <- unique(df_Ngen$coupleId[!is.na(df_Ngen$coupleId)]) - if (i == Ngen) { - CoupleF <- 0 - } - coupleGirl <- sample(coupleID, CoupleF) - coupleBoy <- coupleID[!coupleID %in% coupleGirl] - # single person should all be sons or daus - # change the ifson and ifdau based on coupleGirl and coupleBoy - for (j in 1:sizeGens[i]) { - if (is.na(df_Ngen$spID[j])) { - if (df_Ngen$sex[j] == code_female) { - df_Ngen$ifdau[j] <- TRUE - # usedIds <- c(usedIds, df_Ngen$id[j]) - } else { - df_Ngen$ifson[j] <- TRUE - # usedIds <- c(usedIds, df_Ngen$id[j]) - } - } else { - if (df_Ngen$coupleId[j] %in% coupleBoy && df_Ngen$sex[j] == code_male) { - df_Ngen$ifson[j] <- TRUE - } else if (df_Ngen$coupleId[j] %in% coupleGirl && df_Ngen$sex[j] == code_female) { - df_Ngen$ifdau[j] <- TRUE - } else { - next - } - } - } - - df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE] - df_Ngen <- df_Ngen[, -ncol(df_Ngen)] - - return(df_Ngen) -} diff --git a/R/helpSimulatePedigree.R b/R/helpSimulatePedigree.R new file mode 100644 index 00000000..6a9722f0 --- /dev/null +++ b/R/helpSimulatePedigree.R @@ -0,0 +1,253 @@ +#' Create Data Frame for Generation +#' +#' This function creates a data frame for a specific generation within the simulated pedigree. +#' It initializes the data frame with default values for family ID, individual ID, generation number, +#' paternal ID, maternal ID, spouse ID, and sex. All individuals are initially set with NA for paternal, +#' maternal, spouse IDs, and sex, awaiting further assignment. +#' +#' @param sizeGens A numeric vector containing the sizes of each generation within the pedigree. +#' @param genIndex An integer representing the current generation index for which the data frame is being created. +#' @param idGen A numeric vector containing the ID numbers to be assigned to individuals in the current generation. +#' @param beta logical. If TRUE, use the optimized version of the algorithm. +#' @param family_id_prefix A character string to prefix the family ID. Default is "fam". +#' @return A data frame representing the initial structure for the individuals in the specified generation +#' before any relationships (parental, spousal) are defined. The columns include family ID (`fam`), +#' individual ID (`id`), generation number (`gen`), father's ID (`pat`), mother's ID (`mat`), +#' spouse's ID (`spID`), and sex (`sex`), with NA values for paternal, maternal, and spouse IDs, and sex. +#' @examples +#' sizeGens <- c(3, 5, 4) # Example sizes for 3 generations +#' genIndex <- 2 # Creating data frame for the 2nd generation +#' idGen <- 101:105 # Example IDs for the 2nd generation +#' df_Ngen <- createGenDataFrame(sizeGens, genIndex, idGen) +#' print(df_Ngen) +#' @export +createGenDataFrame <- function(sizeGens, genIndex, + idGen, + family_id_prefix = "fam", + beta = FALSE) { + if (beta) { + createGenDataFrame_beta( + sizeGens = sizeGens, + genIndex = genIndex, + family_id_prefix = family_id_prefix, + idGen = idGen + ) + } else { + n <- sizeGens[genIndex] + df_Ngen <- data.frame( + fam = rep(paste(family_id_prefix, 1), n, sep = ""), + id = idGen[seq_len(n)], + gen = rep.int(genIndex, n), + pat = rep(NA, n), # father id + mat = rep(NA, n), # mother id + spID = rep(NA, n), # spouse id + sex = rep(NA, n), + stringsAsFactors = FALSE + ) + return(df_Ngen) + } +} + +#' Determine Sex of Offspring +#' +#' This internal function assigns sexes to the offspring in a generation based on the specified sex ratio. +#' +#' @param idGen Vector of IDs for the generation. +#' @param sexR Numeric value indicating the sex ratio (proportion of males). +#' @param code_male The value to use for males. Default is "M" +#' @param code_female The value to use for females. Default is "F" +#' @return Vector of sexes ("M" for male, "F" for female) for the offspring. +#' @param beta logical. If TRUE, use the optimized version of the algorithm. +#' @importFrom stats runif +#' @export +determineSex <- function(idGen, sexR, code_male = "M", code_female = "F", + beta = FALSE) { + if (beta) { + determineSex_beta( + idGen = idGen, + sexR = sexR, + code_male = code_male, + code_female = code_female + ) + } else { + length_idGen <- length(idGen) + if (runif(1) > .5) { + sexVec1 <- rep(code_male, floor(length_idGen * sexR)) + sexVec2 <- rep(code_female, length_idGen - length(sexVec1)) + } else { + sexVec1 <- rep(code_female, floor(length_idGen * (1 - sexR))) + sexVec2 <- rep(code_male, length_idGen - length(sexVec1)) + } + sexVec <- sample(c(sexVec1, sexVec2)) + return(sexVec) + } +} + +#' Assign Couple IDs +#' +#' This subfunction assigns a unique couple ID to each mated pair in the generation. +#' Unmated individuals are assigned NA for their couple ID. +#' +#' @param df_Ngen The dataframe for the current generation, including columns for individual IDs and spouse IDs. +#' @param beta Logical, indicating whether to use the beta version of the function. +#' @return The input dataframe augmented with a 'coupleId' column, where each mated pair has a unique identifier. +#' @keywords internal +#' +assignCoupleIDs <- function(df_Ngen, beta = FALSE) { + if (beta) { + assignCoupleIDs_beta( + df_Ngen = df_Ngen + ) + } else { + df_Ngen$coupleId <- NA_character_ # Initialize the coupleId column with NAs + + sp <- df_Ngen$spID + id <- df_Ngen$id + mated <- !is.na(sp) + + if (any(mated)) { + lo <- pmin(id[mated], sp[mated]) + hi <- pmax(id[mated], sp[mated]) + key <- paste(lo, hi, sep = "_") + + # Assign coupleId for mated rows + df_Ngen$coupleId[mated] <- key + } + + return(df_Ngen) + } +} + + +#' @rdname assignCoupleIDs +assignCoupleIds <- assignCoupleIDs + +#' Generate or Adjust Number of Kids per Couple Based on Mating Rate +#' +#' This function generates or adjusts the number of kids per couple in a generation +#' based on the specified average and whether the count should be randomly determined. +#' +#' @param nMates Integer, the number of mated pairs in the generation. +#' @inheritParams simulatePedigree +#' +#' @return A numeric vector with the generated or adjusted number of kids per couple. +#' @keywords internal +adjustKidsPerCouple <- function(nMates, kpc, rd_kpc = TRUE, beta = FALSE) { + if (beta) { + adjustKidsPerCouple_beta( + nMates = nMates, + kpc = kpc, + rd_kpc = rd_kpc + ) + } else { + if (rd_kpc == TRUE) { + target <- nMates * kpc + diff <- nMates + 1 + while (diff > nMates) { + random_numbers <- stats::rpois(nMates, kpc) + # cat("original random numbers", random_numbers, "\n") + sum_random_numbers <- sum(random_numbers) + diff <- abs(target - sum_random_numbers) + } + + if (diff > 0) { + if (sum_random_numbers < target) { + # Add 1 to the smallest 'diff' entries, preserving original order afterwards + order_random_numbers <- order(random_numbers) # indices of sorted ascending + idx <- order_random_numbers[seq_len(diff)] + random_numbers[idx] <- random_numbers[idx] + 1 + } else if (sum_random_numbers > target) { + # make sure the sum of kids per couple is equal to the number of kids in the i th generation + order_random_numbers <- order(random_numbers, decreasing = TRUE) + idx <- order_random_numbers[seq_len(diff)] + random_numbers[idx] <- random_numbers[idx] - 1 + } + } + } else { + random_numbers <- rep.int(kpc, nMates) + } + + if (min(random_numbers) < 0) { + random_numbers[random_numbers == -1] <- 0 + random_numbers[random_numbers == max(random_numbers)] <- max(random_numbers) - 1 + } + + return(random_numbers) + } +} + +#' Mark and Assign children +#' +#' This subfunction marks individuals in a generation as potential sons, daughters, +#' or parents based on their relationships and assigns unique couple IDs. It processes +#' the assignment of roles and relationships within and between generations in a pedigree simulation. +#' @inheritParams determineSex +#' @param df_Ngen A data frame for the current generation being processed. +#' It must include columns for individual IDs (`id`), spouse IDs (`spID`), sex (`sex`), +#' and any previously assigned roles (`ifparent`, `ifson`, `ifdau`). +#' @param i Integer, the index of the current generation being processed. +#' @param Ngen Integer, the total number of generations in the simulation. +#' @param sizeGens Numeric vector, containing the size (number of individuals) of each generation. +#' @param CoupleF Integer scalar giving the number of distinct mating couples in the current +#' generation `i`. This is typically computed upstream from the spouse assignments +#' (e.g., as the number of unique non-missing spouse pairs in `df_Ngen`) and must satisfy +#' `0 <= CoupleF <= floor(sizeGens[i] / 2)`. +#' +#' @return Modifies `df_Ngen` in place by updating or adding columns related to individual roles +#' (`ifparent`, `ifson`, `ifdau`) and couple IDs (`coupleId`). The updated data frame is +#' also returned for integration into the larger pedigree data frame (`df_Fam`). +#' + +markPotentialChildren <- function(df_Ngen, i, Ngen, sizeGens, CoupleF, code_male = "M", code_female = "F", beta = FALSE) { + # Step 2.1: mark a group of potential sons and daughters in the i th generation + + if (beta) { + markPotentialChildren_beta( + df_Ngen = df_Ngen, + i = i, + Ngen = Ngen, + sizeGens = sizeGens, + CoupleF = CoupleF, + code_male = code_male, + code_female = code_female + ) + } else { + # get all couple ids + coupleID <- unique(df_Ngen$coupleId[!is.na(df_Ngen$coupleId)]) + if (i == Ngen) { + CoupleF <- 0 + } + + if (CoupleF > 0L && length(coupleID) > 0L) { + CoupleF <- min(CoupleF, length(coupleID)) + coupleGirl <- sample(coupleID, CoupleF) + } else { + coupleGirl <- character(0) + } + + coupleBoy <- coupleID[!coupleID %in% coupleGirl] + + + # single person should all be sons or daus + # change the ifson and ifdau based on coupleGirl and coupleBoy + is_single <- is.na(df_Ngen$spID) + + # Singles: based only on own sex + single_f <- is_single & (df_Ngen$sex == code_female) + single_m <- is_single & (df_Ngen$sex == code_male) + df_Ngen$ifdau[single_f] <- TRUE + df_Ngen$ifson[single_m] <- TRUE + + # Mated: based on couple assignment and sex restriction + is_mated <- !is_single + mated_son <- is_mated & (df_Ngen$coupleId %in% coupleBoy) & (df_Ngen$sex == code_male) + mated_dau <- is_mated & (df_Ngen$coupleId %in% coupleGirl) & (df_Ngen$sex == code_female) + df_Ngen$ifson[mated_son] <- TRUE + df_Ngen$ifdau[mated_dau] <- TRUE + + df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE] + df_Ngen$coupleId <- NULL + + return(df_Ngen) + } +} diff --git a/R/helpSimulatePedigree_beta.R b/R/helpSimulatePedigree_beta.R new file mode 100644 index 00000000..4a9e14b9 --- /dev/null +++ b/R/helpSimulatePedigree_beta.R @@ -0,0 +1,137 @@ +#' Create Data Frame for Generation +#' @rdname createGenDataFrame +createGenDataFrame_beta <- function(sizeGens, genIndex, idGen, + family_id_prefix = "fam") { + n <- sizeGens[genIndex] + df_Ngen <- data.frame( + fam = rep(paste(family_id_prefix, 1), n, sep = ""), + id = idGen[seq_len(n)], + gen = rep.int(genIndex, n), + pat = rep(NA, n), # father id + mat = rep(NA, n), # mother id + spID = rep(NA, n), # spouse id + sex = rep(NA, n), + stringsAsFactors = FALSE + ) + return(df_Ngen) +} + + +#' Determine Sex of Offspring +#' @rdname determineSex +determineSex_beta <- function(idGen, sexR, code_male = "M", code_female = "F") { + length_idGen <- length(idGen) + if (runif(1) > .5) { + sexVec1 <- rep(code_male, floor(length_idGen * sexR)) + sexVec2 <- rep(code_female, length_idGen - length(sexVec1)) + } else { + sexVec1 <- rep(code_female, floor(length_idGen * (1 - sexR))) + sexVec2 <- rep(code_male, length_idGen - length(sexVec1)) + } + sexVec <- sample(c(sexVec1, sexVec2)) + return(sexVec) +} + +#' Assign Couple IDs +#' @rdname assignCoupleIDs +assignCoupleIDs_beta <- function(df_Ngen) { + df_Ngen$coupleId <- NA_character_ # Initialize the coupleId column with NAs + + sp <- df_Ngen$spID + id <- df_Ngen$id + mated <- !is.na(sp) + + if (any(mated)) { + lo <- pmin(id[mated], sp[mated]) + hi <- pmax(id[mated], sp[mated]) + key <- paste(lo, hi, sep = "_") + + # Assign coupleId for mated rows + df_Ngen$coupleId[mated] <- key + } + + return(df_Ngen) +} + + +#' Generate or Adjust Number of Kids per Couple Based on Mating Rate +#' @rdname adjustKidsPerCouple +adjustKidsPerCouple_beta <- function(nMates, kpc, rd_kpc = TRUE) { + if (rd_kpc == TRUE) { + target <- nMates * kpc + diff <- nMates + 1 + while (diff > nMates) { + random_numbers <- stats::rpois(nMates, kpc) + # cat("original random numbers", random_numbers, "\n") + sum_random_numbers <- sum(random_numbers) + diff <- abs(target - sum_random_numbers) + } + + if (diff > 0) { + if (sum_random_numbers < target) { + # Add 1 to the smallest 'diff' entries, preserving original order afterwards + order_random_numbers <- order(random_numbers) # indices of sorted ascending + idx <- order_random_numbers[seq_len(diff)] + random_numbers[idx] <- random_numbers[idx] + 1 + } else if (sum_random_numbers > target) { + # make sure the sum of kids per couple is equal to the number of kids in the i th generation + order_random_numbers <- order(random_numbers, decreasing = TRUE) + idx <- order_random_numbers[seq_len(diff)] + random_numbers[idx] <- random_numbers[idx] - 1 + } + } + } else { + random_numbers <- rep.int(kpc, nMates) + } + + if (min(random_numbers) < 0) { + random_numbers[random_numbers == -1] <- 0 + random_numbers[random_numbers == max(random_numbers)] <- max(random_numbers) - 1 + } + + return(random_numbers) +} + +#' Mark Potential Children in a Generation +#' @rdname markPotentialChildren +markPotentialChildren_beta <- function(df_Ngen, i, Ngen, sizeGens, CoupleF, code_male = "M", code_female = "F") { + # Step 2.1: mark a group of potential sons and daughters in the i th generation + + # get all couple ids + coupleID <- unique(df_Ngen$coupleId[!is.na(df_Ngen$coupleId)]) + if (i == Ngen) { + CoupleF <- 0 + } + + if (CoupleF > 0L && length(coupleID) > 0L) { + CoupleF <- min(CoupleF, length(coupleID)) + coupleGirl <- sample(coupleID, CoupleF) + } else { + coupleGirl <- character(0) + } + + coupleBoy <- coupleID[!coupleID %in% coupleGirl] + + + # single person should all be sons or daus + # change the ifson and ifdau based on coupleGirl and coupleBoy + is_single <- is.na(df_Ngen$spID) + + # Singles: based only on own sex + single_f <- is_single & (df_Ngen$sex == code_female) + single_m <- is_single & (df_Ngen$sex == code_male) + df_Ngen$ifdau[single_f] <- TRUE + df_Ngen$ifson[single_m] <- TRUE + + # Mated: based on couple assignment and sex restriction + is_mated <- !is_single + mated_son <- is_mated & (df_Ngen$coupleId %in% coupleBoy) & (df_Ngen$sex == code_male) + mated_dau <- is_mated & (df_Ngen$coupleId %in% coupleGirl) & (df_Ngen$sex == code_female) + df_Ngen$ifson[mated_son] <- TRUE + df_Ngen$ifdau[mated_dau] <- TRUE + + df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE] + df_Ngen <- df_Ngen[, -ncol(df_Ngen)] + + return(df_Ngen) +} diff --git a/R/insertEven.R b/R/insertEven.R index 86671a21..3c35b915 100644 --- a/R/insertEven.R +++ b/R/insertEven.R @@ -13,28 +13,45 @@ #' @seealso \code{\link{SimPed}} for the main function that uses this supporting function. insertEven <- function(m, n, verbose = FALSE) { - if (length(m) > length(n)) { + lm <- length(m) + ln <- length(n) + if (lm > ln) { temp <- m m <- n n <- temp + + temp <- lm + lm <- ln + ln <- temp + temp <- NULL + + if (isTRUE(verbose)) { + message("Swapped m and n because length(m) > length(n)") + } } - # idx <- numeric() - for (i in seq_along(m)) { - names(m)[i] <- ceiling(i * length(n) / length(m)) + + if (lm == 0L) { + return(unname(n)) } - if (verbose == TRUE) { - message(m) + if (ln == 0L) { + return(unname(m)) } - names(n) <- seq_along(n) - if (verbose == TRUE) { - message(n) + + pos_m <- ceiling(seq_len(lm) * ln / lm) + + if (isTRUE(verbose)) { + message("m insertion targets: ", paste(pos_m, collapse = ", ")) + message("n indices: ", paste(seq_len(ln), collapse = ", ")) } + + vec <- c(m, n) - vec <- vec[order(as.numeric(names(vec)))] - vec <- unname(vec) + primary <- c(pos_m, seq_len(ln)) + secondary <- c(rep.int(0L, lm), rep.int(1L, ln)) + vec <- vec[order(primary, secondary)] - return(vec) + unname(vec) } #' @rdname insertEven diff --git a/R/simulatePedigree.R b/R/simulatePedigree.R index 1c3948fd..0517eaa2 100644 --- a/R/simulatePedigree.R +++ b/R/simulatePedigree.R @@ -1,130 +1,3 @@ -#' Process Generations for Pedigree Simulation -#' -#' This function iterates through generations in a pedigree simulation, assigning IDs, -#' creating data frames, determining sexes, and managing pairing within each generation. -#' -#' @inheritParams simulatePedigree -#' @inheritParams createGenDataFrame -#' @return A data frame representing the simulated pedigree, including columns for family ID (`fam`), -buildWithinGenerations <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, - personID = "ID", - momID = "momID", - dadID = "dadID", - code_male = "M", - code_female = "F", - fam_shift = 1L) { - idx_width <- nchar(max(sizeGens)) - gen_width <- max(2L, nchar(Ngen)) - fam_shift <- 1L - - for (i in 1:Ngen) { - # idGen <- as.numeric(paste(100, i, 1:sizeGens[i], sep = "")) - idGen <- fam_shift * 10^(gen_width + idx_width) + i * 10^(idx_width) + (1:sizeGens[i]) - # idGen <- ifelse(i==1, - # paste(i,"-",1:sizeGens[i]), - # paste(i,"-",sizeGens[i-1]:sizeGens[i])) - - ### For each generation, create a separate dataframe - df_Ngen <- createGenDataFrame( - sizeGens = sizeGens, - genIndex = i, - idGen = idGen - ) - - ### Let's deal with the sex in each generation first - - df_Ngen$sex <- determineSex( - idGen = idGen, sexR = sexR, - code_male = code_male, - code_female = code_female - ) - - # message(paste("tiger",i)) - # The first generation - if (i == 1) { - df_Ngen$spID[1] <- df_Ngen$id[2] - df_Ngen$spID[2] <- df_Ngen$id[1] - - df_Ngen$sex[1] <- code_female - df_Ngen$sex[2] <- code_male - } - - ## Connect male and female into couples in each generations - marR_crt <- (1 + marR) / 2 - usedFemaleIds <- numeric() - usedMaleIds <- numeric() - # reserve the single persons - if (i != 1 && i != Ngen) { - nMarriedFemale <- round(sum(df_Ngen$sex == code_female) * marR_crt) - nMarriedMale <- round(sum(df_Ngen$sex == code_male) * marR_crt) - # make sure there are same numbers of married males and females - if (nMarriedFemale >= nMarriedMale) { - nMarriedFemale <- nMarriedMale - } else { - nMarriedMale <- nMarriedFemale - } - # get the number of single males and females - nSingleFemale <- sum(df_Ngen$sex == code_female) - nMarriedFemale - nSingleMale <- sum(df_Ngen$sex == code_male) - nMarriedMale - - - # sample single ids from male ids and female ids - usedFemaleIds <- sample(df_Ngen$id[df_Ngen$sex == code_female], nSingleFemale) - ## message(c("Used F", usedFemaleIds)) - usedMaleIds <- sample(df_Ngen$id[df_Ngen$sex == code_male], nSingleMale) - ## message(c("Used M", usedMaleIds)) - - usedIds <- c(usedFemaleIds, usedMaleIds) - - # Create spouses - for (j in seq_len(nrow(df_Ngen))) { - if (df_Ngen$id[j] %in% usedIds) { - next - } else { - # idx <- j+1 - if (df_Ngen$sex[j] == code_female) { - for (k in seq_len(nrow(df_Ngen))) { - idr <- df_Ngen$id[k] - tgt <- (!(idr %in% usedIds)) & df_Ngen$sex[k] == code_male - # tgt <- ifelse(is.na(tgt),FALSE,TRUE) - if (tgt) { - df_Ngen$spID[j] <- df_Ngen$id[k] - df_Ngen$spID[k] <- df_Ngen$id[j] - usedIds <- c(usedIds, df_Ngen$id[j], df_Ngen$id[k]) - break - } else { - next - } - } - } else { - for (k in seq_len(nrow(df_Ngen))) { - idr <- df_Ngen$id[k] - tgt <- (!(idr %in% usedIds)) & df_Ngen$sex[k] == code_female - # tgt <- ifelse(is.na(tgt),FALSE,TRUE) - if (tgt) { - df_Ngen$spID[j] <- df_Ngen$id[k] - df_Ngen$spID[k] <- df_Ngen$id[j] - usedIds <- c(usedIds, df_Ngen$id[j], df_Ngen$id[k]) - break - } else { - next - } - } - } - } - # message(usedIds) - } - } - if (i == 1) { - df_Fam <- df_Ngen - } else { - df_Fam <- rbind(df_Fam, df_Ngen) - } - } - return(df_Fam) -} - - #' Process Generation Connections #' #' This function processes connections between each two generations in a pedigree simulation. @@ -153,203 +26,439 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, mar rd_kpc, personID = "ID", momID = "momID", dadID = "dadID", - code_male = "M", code_female = "F") { + code_male = "M", code_female = "F", beta = FALSE) { + # Normalize string aliases to logical values for downstream functions + use_optimized <- FALSE + + if (beta %in% c("index", "indexed")) { + stop("The 'index' or 'indexed' option for parameter 'beta' is not yet implemented.") + } else if (isTRUE(beta) || identical(beta, "optimized")) { + use_optimized <- TRUE + } else if (isFALSE(beta) || beta %in% c("base", "original") || is.null(beta)) { + use_optimized <- FALSE + } else { + stop("Invalid value for parameter 'beta'. Accepted values are TRUE, FALSE, 'optimized', 'base', 'original', or 'index'/'indexed'.") + } + + if (use_optimized) { + df_Fam <- buildBetweenGenerations_optimized( + df_Fam = df_Fam, + Ngen = Ngen, + sizeGens = sizeGens, + verbose = verbose, + marR = marR, + sexR = sexR, + kpc = kpc, + rd_kpc = rd_kpc, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + beta = TRUE + ) + } else { + df_Fam <- buildBetweenGenerations_base( + df_Fam = df_Fam, + Ngen = Ngen, + sizeGens = sizeGens, + verbose = verbose, + marR = marR, + sexR = sexR, + kpc = kpc, + rd_kpc = rd_kpc, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + beta = FALSE + ) + } + return(df_Fam) +} + + +buildBetweenGenerations_base <- function(df_Fam, + Ngen, + sizeGens, + verbose = FALSE, + marR, sexR, kpc, + rd_kpc, personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + beta = FALSE) { + # Initialize flags for the full pedigree data frame. + # These are used throughout linkage and get overwritten per-generation as needed. + df_Fam$ifparent <- FALSE df_Fam$ifson <- FALSE df_Fam$ifdau <- FALSE - for (i in 1:Ngen) { - # generation 1 doesn't need any mother and father + + # Precompute row indices per generation once. + # This avoids repeated df_Fam$gen == i scans inside loops. + gen_rows <- split(seq_len(nrow(df_Fam)), df_Fam$gen) + + # Loop across generations 1..Ngen. + + for (i in seq_len(Ngen)) { + # ------------------------------------------------------------------------- + # Generation 1: base case + # Generation 1 individuals are founders and are treated as "parents" by design. + # They do not have assigned mother/father, so we just set flags and continue. + # ------------------------------------------------------------------------- + if (i == 1) { - df_Ngen <- df_Fam[df_Fam$gen == i, ] + rows_i <- gen_rows[[as.character(i)]] + df_Ngen <- df_Fam[rows_i, , drop = FALSE] + + # Mark everyone in generation 1 as parents (founder couple logic occurs earlier). df_Ngen$ifparent <- TRUE df_Ngen$ifson <- FALSE df_Ngen$ifdau <- FALSE - df_Fam[df_Fam$gen == i, ] <- df_Ngen + df_Fam[rows_i, ] <- df_Ngen + # Write back into the main df_Fam. } else { # calculate the number of couples in the i-1 th generation - N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[df_Fam$gen == i - 1]))) * 0.5 - # calculate the number of members in the i th generation that have a link to the couples in the i-1 th generation + rows_i <- gen_rows[[as.character(i)]] + rows_prev <- gen_rows[[as.character(i - 1)]] + + # ------------------------------------------------------------------------- + # Step A: Determine how many couples exist in generation i-1 + # + # In your representation, each coupled individual has a non-NA spID, and each couple + # appears twice (one row per spouse). Therefore: + # number_of_couples = (number_of_non_single_individuals) / 2 + # where number_of_non_single_individuals = sizeGens[i-1] - count(NA spID) + # ------------------------------------------------------------------------- + + N_couples <- (sizeGens[i - 1] - sum(is.na(df_Fam$spID[rows_prev]))) * 0.5 + + # Expected number of offspring linked to those couples (before sex split). + N_LinkedMem <- N_couples * kpc - # decompose the linked members into females and males respectively + # Split linked offspring into female and male counts using sexR, + # where sexR is the proportion male, so (1 - sexR) is the proportion female. + N_LinkedFemale <- round(N_LinkedMem * (1 - sexR)) N_LinkedMale <- N_LinkedMem - N_LinkedFemale - # Create a pool for used male children and female children respectively - usedFemaleIds <- numeric() - usedMaleIds <- numeric() - usedIds <- c(usedFemaleIds, usedMaleIds) + + # ------------------------------------------------------------------------- + # Step B: Prepare generation i data, assign couple IDs, and mark potential children + # ------------------------------------------------------------------------- # get the df for the i the generation - df_Ngen <- df_Fam[df_Fam$gen == i, ] + df_Ngen <- df_Fam[rows_i, , drop = FALSE] + + + # Reset per-generation fields that will be recomputed. df_Ngen$ifparent <- FALSE df_Ngen$ifson <- FALSE df_Ngen$ifdau <- FALSE df_Ngen$coupleId <- NA_character_ - df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), ] + + # Randomly permute generation i rows so selection is not tied to row order. + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE] # Start to connect children with mother and father - # + if (verbose == TRUE) { message( "Step 2.1: mark a group of potential sons and daughters in the i th generation" ) } - # try to rewrite the code + # count the number of couples in the i th gen countCouple <- (nrow(df_Ngen) - sum(is.na(df_Ngen$spID))) * .5 - # Now, assign couple IDs for the current generation - df_Ngen <- assignCoupleIds(df_Ngen) + # Assign couple IDs within generation i. + df_Ngen <- assignCoupleIds(df_Ngen, beta = beta) + + # Identify singles in generation i (no spouse). - # get the number of linked female and male children after excluding the single children - # get a vector of single person id in the ith generation IdSingle <- df_Ngen$id[is.na(df_Ngen$spID)] + + # Count singles by sex; these affect how many "linked" children can come from couples. SingleF <- sum(df_Ngen$sex == code_female & is.na(df_Ngen$spID)) - CoupleF <- N_LinkedFemale - SingleF SingleM <- sum(df_Ngen$sex == code_male & is.na(df_Ngen$spID)) - # CoupleM <- N_LinkedMale - SingleM - df_Fam[df_Fam$gen == i, ] <- markPotentialChildren( + # Number of linked females that must come from couples after excluding single females. + # This value is passed into markPotentialChildren, which decides who becomes ifson/ifdau. + + CoupleF <- N_LinkedFemale - SingleF + + # Mark potential sons and daughters within generation i. + # This writes ifson/ifdau into the returned data frame + df_Fam[rows_i, ] <- markPotentialChildren( df_Ngen = df_Ngen, i = i, Ngen = Ngen, sizeGens = sizeGens, CoupleF = CoupleF, code_male = code_male, - code_female = code_female + code_female = code_female, + beta = beta ) + + # ------------------------------------------------------------------------- + # Step C: Mark a subset of generation i-1 couples as parents (ifparent) + # + # Goal: choose enough married couples (based on marR) to be parents. + # We walk through a randomized order of generation i-1, and whenever we select + # an individual who has a spouse, we mark both spouses as ifparent. + # ------------------------------------------------------------------------- + if (verbose == TRUE) { message( "Step 2.2: mark a group of potential parents in the i-1 th generation" ) } - df_Ngen <- df_Fam[df_Fam$gen == i - 1, ] + df_Ngen <- df_Fam[rows_prev, , drop = FALSE] + + # Reset flags within i-1 before reselecting parent couples. df_Ngen$ifparent <- FALSE df_Ngen$ifson <- FALSE df_Ngen$ifdau <- FALSE - df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), ] - # Create a pool for the used parents - usedParentIds <- numeric() - for (k in 1:sizeGens[i - 1]) { - # first check if the number of married couples surpass the marriage rate - if (sum(df_Ngen$ifparent) / nrow(df_Ngen) >= marR) { + # Randomize order so parent selection is not tied to row ordering. + df_Ngen <- df_Ngen[sample(nrow(df_Ngen)), , drop = FALSE] + + + # Boolean vector that tracks which rows in df_prev are selected as parents. + # Start all FALSE. + isUsedParent <- df_Ngen$ifparent + + # Loop over up to sizeGens[i-1] positions. + # Stop early once the parent selection proportion reaches marR. + nrow_df_Ngen <- nrow(df_Ngen) + + for (k in seq_len(sizeGens[i - 1])) { + # Proportion of individuals currently marked as parents in df_prev. + # Since we always mark spouses together, this moves in steps of 2. + if (sum(isUsedParent) / nrow_df_Ngen >= marR) { + df_Ngen$ifparent <- isUsedParent break } else { - # check if the id is used and if the member has married - if (!(df_Ngen$id[k] %in% usedParentIds) && !is.na(df_Ngen$spID[k])) { - df_Ngen$ifparent[k] <- TRUE - df_Ngen$ifparent[df_Ngen$spID == df_Ngen$id[k]] <- TRUE - usedParentIds <- c(usedParentIds, df_Ngen$id[k], df_Ngen$spID[k]) + # Only select someone as a parent if: + # 1) they are not already used as a parent, and + # 2) they have a spouse (spID not NA), because singles cannot form a parent couple. + + + if (!(isUsedParent[k]) && !is.na(df_Ngen$spID[k])) { # Mark this individual as parent. + + isUsedParent[k] <- TRUE + # Mark their spouse row as parent too. + # This works because spouse IDs are unique within a generation in this simulation. + isUsedParent[df_Ngen$spID == df_Ngen$id[k]] <- TRUE } else { next } } } + df_Ngen$ifparent <- isUsedParent + + # Restore original row order for df_prev before writing back into df_Fam. + df_Ngen <- df_Ngen[order(as.numeric(rownames(df_Ngen))), , drop = FALSE] - df_Fam[df_Fam$gen == i - 1, ] <- df_Ngen + + df_Fam[rows_prev, ] <- df_Ngen + if (verbose == TRUE) { message( "Step 2.3: connect the i and i-1 th generation" ) } + + if (i == 1) { next } else { - # get the df for i and i-1 th generations - df_Ngen <- df_Fam[df_Fam$gen %in% c(i, i - 1), ] + # Pull the two generations together. + df_Ngen <- df_Fam[df_Fam$gen %in% c(i, i - 1), , drop = FALSE] + sizeI <- sizeGens[i - 1] sizeII <- sizeGens[i] - # create a vector with ordered ids that should be connected to a parent - # message(df_Ngen) + + # Collect IDs of marked sons and daughters in generation i. IdSon <- df_Ngen$id[df_Ngen$ifson == TRUE & df_Ngen$gen == i] - # message(IdSon) IdDau <- df_Ngen$id[df_Ngen$ifdau == TRUE & df_Ngen$gen == i] - # message(IdDau) + + # Interleave sons and daughters to get an offspring list. IdOfp <- evenInsert(IdSon, IdDau) + # nMates is number of parent couples selected (ifparent rows are individuals). + nMates <- sum(df_Ngen$ifparent) / 2 + + # If no mates or no offspring were selected for linkage, skip linkage. + if (nMates <= 0 || length(IdOfp) == 0) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next + } + # generate link kids to the couples random_numbers <- adjustKidsPerCouple( nMates = sum(df_Ngen$ifparent) / 2, kpc = kpc, - rd_kpc = rd_kpc + rd_kpc = rd_kpc, + beta = beta ) - # cat("final random numbers",random_numbers, "\n") - # cat("mean",sum(random_numbers)/length(random_numbers), "\n") - # create two vectors for maId and paId; replicate the ids to match the same length as IdOfp - IdMa <- numeric() - IdPa <- numeric() - usedIds <- numeric() - idx <- 1 - - for (l in 1:sizeI) { - # check if the id is used - if (!df_Ngen$id[l] %in% usedIds) { - # check if the member can be a parent - if (df_Ngen$ifparent[l] == TRUE && df_Ngen$sex[l] == code_female) { - usedIds <- c(usedIds, df_Ngen$id[l], df_Ngen$spID[l]) - IdMa <- c(IdMa, rep(df_Ngen$id[l], random_numbers[idx])) - IdPa <- c(IdPa, rep(df_Ngen$spID[l], random_numbers[idx])) - idx <- idx + 1 - } else if (df_Ngen$ifparent[l] == TRUE && df_Ngen$sex[l] == code_male) { - usedIds <- c(usedIds, df_Ngen$id[l], df_Ngen$spID[l]) - IdPa <- c(IdPa, rep(df_Ngen$id[l], random_numbers[idx])) - IdMa <- c(IdMa, rep(df_Ngen$spID[l], random_numbers[idx])) - idx <- idx + 1 - } else { - next - } - } else { - next - } + # Guard: adjustKidsPerCouple returned nothing usable + if (length(random_numbers) == 0 || all(is.na(random_numbers))) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next } - # the length of IdMa and IdPa can be longer than the vector of offspring, so truncated it - ### making sure sampling out the single people instead of couples + # ------------------------------------------------------------------------- + # Step E: Build parent assignment vectors IdMa and IdPa + # + # The goal is to expand couples into per-child vectors of mother IDs and father IDs, + # where each couple contributes random_numbers[couple_index] children. + # + # Important: df_Ngen contains both generations. We only want parent generation rows. + # ------------------------------------------------------------------------- + + # Identify rows in df_Ngen that belong to generation i-1 (parent generation). + rows_prev_in_pair <- which(df_Ngen$gen == (i - 1)) + + # Extract parent generation into a smaller frame to make operations faster and clearer. + prev <- df_Ngen[rows_prev_in_pair, , drop = FALSE] + + # Keep only those rows that are marked ifparent and are actually paired (non-NA spID). + parent_rows <- which(prev$ifparent == TRUE & !is.na(prev$spID)) + + # If no usable parent couples remain, skip linkage. + if (length(parent_rows) == 0) { + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + next + } + # Create a symmetric couple key so we can keep only one row per couple. + a <- pmin(prev$id, prev$spID) + b <- pmax(prev$id, prev$spID) + couple_key <- paste(a, b, sep = "_") + + # Keep only the first row for each couple among the parent rows. + parent_rows <- parent_rows[!duplicated(couple_key[parent_rows])] + + # Determine whether each kept row corresponds to the female member of the couple. + # If the kept row is female: mother = id, father = spID + # If the kept row is male: father = id, mother = spID + is_female_row <- prev$sex[parent_rows] == code_female + # One mother ID per couple. + ma_ids <- ifelse(is_female_row, prev$id[parent_rows], prev$spID[parent_rows]) + + # One father ID per couple. + pa_ids <- ifelse(is_female_row, prev$spID[parent_rows], prev$id[parent_rows]) + + # Align lengths between couples and random_numbers. + # If random_numbers is longer than couples, truncate random_numbers. + # If random_numbers is shorter than couples, drop extra couples. + nCouples <- length(parent_rows) + + if (length(random_numbers) > nCouples) { + random_numbers <- random_numbers[seq_len(nCouples)] + } else if (length(random_numbers) < nCouples) { + keep <- seq_len(length(random_numbers)) + ma_ids <- ma_ids[keep] + pa_ids <- pa_ids[keep] + } + + # Expand from "one mother/father per couple" to "one mother/father per child". + # rep.int is used to avoid extra overhead. + IdMa <- rep.int(ma_ids, times = random_numbers) + IdPa <- rep.int(pa_ids, times = random_numbers) + + # ------------------------------------------------------------------------- + # Step F: Ensure IdMa/IdPa length matches the number of offspring IdOfp + # + # Two mismatch cases: + # 1) Too many parent slots relative to offspring: drop excess parent slots. + # 2) Too many offspring relative to parent slots: drop some offspring. + # + # drop singles first (IdSingle) when reducing offspring. + # ------------------------------------------------------------------------- + + if (length(IdPa) - length(IdOfp) > 0) { if (verbose == TRUE) { message("length of IdPa", length(IdPa), "\n") } - IdRm <- sample.int(length(IdPa), size = length(IdPa) - length(IdOfp)) - IdPa <- IdPa[-IdRm] - IdMa <- IdMa[-IdRm] + # Excess parent slots: randomly remove that many entries from IdPa and IdMa. + + excess <- length(IdPa) - length(IdOfp) + if (length(IdPa) > 0 && excess > 0) { + IdRm <- sample.int(length(IdPa), size = excess) + IdPa <- IdPa[-IdRm] + IdMa <- IdMa[-IdRm] + } } else if (length(IdPa) - length(IdOfp) < 0) { - # cat("length of IdOfp", length(IdOfp), "\n") - # cat("length of IdPa", length(IdPa), "\n") - # cat("length of IdSingle", length(IdMa), "\n") - IdRm <- resample(IdSingle, size = length(IdOfp) - length(IdPa)) + if (verbose == TRUE) { + message("length of IdOfp", length(IdOfp), "\n") + message("length of IdPa", length(IdPa), "\n") + message("length of IdSingle", length(IdMa), "\n") + } + + + # harden the resample call when IdSingle is empty: + # Need to drop some offspring because we do not have enough parent slots. + need_drop <- length(IdOfp) - length(IdPa) - IdOfp <- IdOfp[!(IdOfp %in% IdRm)] + if (need_drop > 0) { + if (length(IdSingle) > 0) { + # Preferentially remove offspring IDs that correspond to singles. + # resample is expected to return a vector of IDs to remove. + + IdRm <- resample(IdSingle, size = need_drop) + IdOfp <- IdOfp[!(IdOfp %in% IdRm)] + } else { + # If there are no singles to target, drop arbitrary offspring indices. + drop_idx <- sample.int(length(IdOfp), size = need_drop) + IdOfp <- IdOfp[-drop_idx] + } + } } - # if (length(IdMa)- length(IdOfp) > 0){ - # IdRm <- sample.int(length(IdMa),size =length(IdMa)-length(IdOfp)) - # IdPa <- IdPa[-IdRm] - # IdMa <- IdMa[-IdRm] - # }else if (length(IdMa)-length(IdOfp) < 0) { - # IdRm <- sample.int(length(IdOfp),size =length(IdOfp)-length(IdMa)) - # IdOfp <- IdOfp[-IdRm] - # } - # message(matrix(c(IdPa, IdMa), ncol = 2)) - - # message(IdPa) - # message(IdOfp) - - # put the IdMa and IdPa into the dfFam with correspondent OfpId - for (m in seq_along(IdOfp)) { - df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- IdPa[m] - df_Ngen[df_Ngen$id == IdOfp[m], "mat"] <- IdMa[m] + + # ------------------------------------------------------------------------- + # Step G: Assign pat/mat into df_Ngen for the selected offspring. + # + # Replaces the old loop: + # for (m in seq_along(IdOfp)) df_Ngen[df_Ngen$id == IdOfp[m], "pat"] <- ... + # Using match avoids repeated scanning over df_Ngen$id. + # ------------------------------------------------------------------------- + + # Find row positions in df_Ngen corresponding to offspring IDs. + child_rows <- match(IdOfp, df_Ngen$id) + # Only keep rows that matched successfully. + + ok <- !is.na(child_rows) + + if (any(ok)) { + # Assign father IDs and mother IDs to offspring rows. + + df_Ngen$pat[child_rows[ok]] <- IdPa[ok] + df_Ngen$mat[child_rows[ok]] <- IdMa[ok] } - # message(df_Ngen) - df_Fam[df_Fam$gen == i, ] <- df_Ngen[df_Ngen$gen == i, ] - df_Fam[df_Fam$gen == i - 1, ] <- df_Ngen[df_Ngen$gen == i - 1, ] + # ------------------------------------------------------------------------- + # Step H: Write the two generations back into df_Fam using the precomputed indices. + # ------------------------------------------------------------------------- + + df_Fam[rows_i, ] <- df_Ngen[df_Ngen$gen == i, ] + df_Fam[rows_prev, ] <- df_Ngen[df_Ngen$gen == i - 1, ] } } } return(df_Fam) } +buildBetweenGenerations_optimized <- buildBetweenGenerations_base # Placeholder for optimized version + #' Simulate Pedigrees #' This function simulates "balanced" pedigrees based on a group of parameters: @@ -383,7 +492,8 @@ buildBetweenGenerations <- function(df_Fam, Ngen, sizeGens, verbose = FALSE, mar #' @param code_male The value to use for males. Default is "M" #' @param code_female The value to use for females. Default is "F" #' @param fam_shift An integer to shift the person ID. Default is 1L. -#' +#' This is useful when simulating multiple pedigrees to avoid ID conflicts. +#' @param beta logical. If TRUE, use the optimized version of the algorithm. #' @param ... Additional arguments to be passed to other functions. #' @inheritParams ped2fam #' @param spouseID The name of the column that will contain the spouse ID in the output data frame. Default is "spID". @@ -421,7 +531,8 @@ simulatePedigree <- function(kpc = 3, spouseID = "spouseID", code_male = "M", code_female = "F", - fam_shift = 1L) { + fam_shift = 1L, + beta = FALSE) { # SexRatio: ratio of male over female in the offspring setting; used in the between generation combinations # SexRatio <- sexR / (1 - sexR) @@ -444,7 +555,8 @@ simulatePedigree <- function(kpc = 3, dadID = dadID, code_male = code_male, code_female = code_female, - fam_shift = fam_shift + fam_shift = fam_shift, + beta = beta ) if (verbose == TRUE) { message( @@ -465,7 +577,8 @@ simulatePedigree <- function(kpc = 3, momID = momID, dadID = dadID, code_male = code_male, - code_female = code_female + code_female = code_female, + beta = beta ) df_Fam <- df_Fam[, 1:7] @@ -475,14 +588,7 @@ simulatePedigree <- function(kpc = 3, # connect the detached members df_Fam[is.na(df_Fam[[momID]]) & is.na(df_Fam[[dadID]]) & df_Fam$gen > 1, ] - # if the sex rate is .5, make there is a 50% chance to change male to female and female to male - # doesn't seem to produce the expected results, sometimes leads to moms being classified as dads - # if (sexR == .5 & runif(1) > .5) { - # df_Fam$sex[df_Fam$sex == "M"] <- "F1" - # df_Fam$sex[df_Fam$sex == "F"] <- "M" - # df_Fam$sex[df_Fam$sex == "F1"] <- "F" - # } - # message(df_Fam) + return(df_Fam) } diff --git a/R/simulatePedigree_within.R b/R/simulatePedigree_within.R new file mode 100644 index 00000000..f49e3b9b --- /dev/null +++ b/R/simulatePedigree_within.R @@ -0,0 +1,222 @@ +#' Process Generations for Pedigree Simulation +#' +#' This function iterates through generations in a pedigree simulation, assigning IDs, +#' creating data frames, determining sexes, and managing pairing within each generation. +#' +#' @inheritParams simulatePedigree +#' @inheritParams createGenDataFrame +#' @return A data frame representing the simulated pedigree, including columns for family ID (`fam`), + +buildWithinGenerations <- function( + beta = FALSE, + sizeGens, marR, sexR, Ngen, verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + fam_shift = 1L +) { + # Normalize string aliases to logical values for downstream functions + use_optimized <- FALSE + + if (beta %in% c("index", "indexed")) { + stop("The 'index' or 'indexed' option for parameter 'beta' is not yet implemented.") + } else if (isTRUE(beta) || identical(beta, "optimized")) { + use_optimized <- TRUE + } else if (isFALSE(beta) || beta %in% c("base", "original") || is.null(beta)) { + use_optimized <- FALSE + } else { + stop("Invalid value for parameter 'beta'. Accepted values are TRUE, FALSE, 'optimized', 'base', 'original', or 'index'/'indexed'.") + } + + if (use_optimized) { + df_Fam <- buildWithinGenerations_optimized( + sizeGens = sizeGens, + marR = marR, + sexR = sexR, + Ngen = Ngen, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + fam_shift = fam_shift, + beta = TRUE + ) + } else { + df_Fam <- buildWithinGenerations_base( + sizeGens = sizeGens, + marR = marR, + sexR = sexR, + Ngen = Ngen, + verbose = verbose, + personID = personID, + momID = momID, + dadID = dadID, + code_male = code_male, + code_female = code_female, + fam_shift = fam_shift, + beta = FALSE + ) + } + return(df_Fam) +} + + +buildWithinGenerations_base <- function(sizeGens, marR, sexR, Ngen, verbose = FALSE, + personID = "ID", + momID = "momID", + dadID = "dadID", + code_male = "M", + code_female = "F", + fam_shift = 1L, + beta = FALSE) { + idx_width <- nchar(max(sizeGens)) + gen_width <- max(2L, nchar(Ngen)) + + + # Precompute powers once + pow_idx <- 10^idx_width + pow_gen <- 10^(gen_width + idx_width) + + ## Connect male and female into couples in each generations + marR_crt <- (1 + marR) / 2 + + # Initialize a list to store data frames for each generation + df_list <- vector("list", Ngen) + + + for (i in seq_len(Ngen)) { + # idGen <- as.numeric(paste(100, i, 1:sizeGens[i], sep = "")) + + idGen <- fam_shift * pow_gen + i * pow_idx + seq_len(sizeGens[i]) + + + ### For each generation, create a separate dataframe + df_Ngen <- createGenDataFrame( + sizeGens = sizeGens, + genIndex = i, + idGen = idGen, + beta = beta + ) + + ### Let's deal with the sex in each generation first + + df_Ngen$sex <- determineSex( + idGen = idGen, sexR = sexR, + code_male = code_male, + code_female = code_female, + beta = beta + ) + + # The first generation + if (i == 1) { + df_Ngen$spID[1] <- df_Ngen$id[2] + df_Ngen$spID[2] <- df_Ngen$id[1] + + df_Ngen$sex[1] <- code_female + df_Ngen$sex[2] <- code_male + } + + + # reserve the single persons + if (i != 1 && i != Ngen) { + # is faster + isFemale <- df_Ngen$sex == code_female + isMale <- df_Ngen$sex == code_male + + # get the number + totalFemale <- sum(isFemale) + totalMale <- sum(isMale) + + nMarriedFemale <- round(totalFemale * marR_crt) + nMarriedMale <- round(totalMale * marR_crt) + + # make sure there are same numbers of married males and females + nMarriedMale <- nMarriedFemale <- min(nMarriedFemale, nMarriedMale) + + + # + if (nMarriedFemale > totalFemale) { + nMarriedFemale <- totalFemale + } + if (nMarriedMale > totalMale) { + nMarriedMale <- totalMale + } + # get the number of single males and females + nSingleFemale <- max(totalFemale - nMarriedFemale, 0) + nSingleMale <- max(totalMale - nMarriedMale, 0) + + + # sample single ids from male ids and female ids (guard against size > available) + femaleIds <- df_Ngen$id[isFemale] + maleIds <- df_Ngen$id[isMale] + + nSingleFemale <- min(nSingleFemale, length(femaleIds)) + nSingleMale <- min(nSingleMale, length(maleIds)) + + usedFemaleIds <- if (nSingleFemale > 0) sample(femaleIds, nSingleFemale) else numeric() + usedMaleIds <- if (nSingleMale > 0) sample(maleIds, nSingleMale) else numeric() + + isUsed <- df_Ngen$id %in% c(usedFemaleIds, usedMaleIds) + + # Create spouses + nrows_df_Ngen <- nrow(df_Ngen) + availFemale <- which(isFemale & !isUsed) + availMale <- which(isMale & !isUsed) + + length_availMale <- length(availMale) + length_availFemale <- length(availFemale) + + # next unused pointer + ptrFemale <- 1L + ptrMale <- 1L + + + for (j in seq_len(nrows_df_Ngen)) { + if (isUsed[j]) { + next + } + + if (df_Ngen$sex[j] == code_female) { + # only runs when the person is not used + while (ptrMale <= length_availMale && isUsed[availMale[ptrMale]]) { + ptrMale <- ptrMale + 1L + } + # if all used males, skip + if (ptrMale > length_availMale) { + next + } + k <- availMale[ptrMale] + ptrMale <- ptrMale + 1L + } else { + while (ptrFemale <= length_availFemale && isUsed[availFemale[ptrFemale]]) { + ptrFemale <- ptrFemale + 1L + } + if (ptrFemale > length_availFemale) { + next + } + k <- availFemale[ptrFemale] + ptrFemale <- ptrFemale + 1L + } + + + df_Ngen$spID[j] <- df_Ngen$id[k] + df_Ngen$spID[k] <- df_Ngen$id[j] + + isUsed[j] <- TRUE + isUsed[k] <- TRUE + } + } + + df_list[[i]] <- df_Ngen + } + + df_Fam <- do.call(rbind, df_list) + rownames(df_Fam) <- NULL + return(df_Fam) +} + +buildWithinGenerations_optimized <- buildWithinGenerations_base diff --git a/data-raw/benchmarkng nopointers.png b/data-raw/benchmarkng nopointers.png new file mode 100644 index 00000000..ef2d96a4 Binary files /dev/null and b/data-raw/benchmarkng nopointers.png differ diff --git a/data-raw/optimizing.R b/data-raw/optimizing.R new file mode 100644 index 00000000..ab79fd03 --- /dev/null +++ b/data-raw/optimizing.R @@ -0,0 +1,83 @@ +library(profvis) +library(microbenchmark) +library(tidyverse) +set.seed(1667) +Ngen <- 3 +kpc <- 4 +sexR <- .50 # sometimes fails above .5 +marR <- .7 +reps <- 10 +if (FALSE) { + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }) + + profvis({ + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }) +} + + +benchmark_results <- microbenchmark( + beta_false_1gen = { + simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_1gen = { + simulatePedigree(kpc = kpc, Ngen = 1, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_lowgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_midgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 2, sexR = sexR, marR = marR, beta = TRUE) + }, + beta_false_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = FALSE) + }, + beta_true_highgen = { + simulatePedigree(kpc = kpc, Ngen = Ngen * 3, sexR = sexR, marR = marR, beta = TRUE) + }, + times = reps # Run each method 10 times +) + +benchmark_results <- benchmark_results %>% + mutate( + beta_factor = factor(case_when( + grepl("beta_true", expr) ~ "TRUE", + grepl("beta_false", expr) ~ "FALSE", + grepl("beta_indexed", expr) ~ "indexed" + )), + beta = ifelse(grepl("beta_false", expr), FALSE, TRUE), + gen_num = case_when( + grepl("1gen", expr) ~ 1, + grepl("lowgen", expr) ~ Ngen, + grepl("midgen", expr) ~ Ngen * 2, + grepl("highgen", expr) ~ Ngen * 3 + ), + gen_factor = factor(gen_num, levels = c(1, Ngen, Ngen * 2, Ngen * 3)) + ) + +summary(benchmark_results) +lm(benchmark_results$time ~ benchmark_results$beta * benchmark_results$gen_num) %>% + summary() + +lm(benchmark_results$time ~ benchmark_results$beta) %>% + summary() +# log transform time for better visualization + +ggplot(benchmark_results, aes(x = gen_factor, y = time / 1e6, color = beta_factor)) + + geom_boxplot() + + labs( + title = "Benchmarking simulatePedigree() with and without beta parameter", + x = "Generation Size", + y = "Execution Time (ms)", + color = "Beta Parameter" + ) + + theme_minimal() + + scale_y_log10() diff --git a/man/adjustKidsPerCouple.Rd b/man/adjustKidsPerCouple.Rd index a3019c45..bf5b8192 100644 --- a/man/adjustKidsPerCouple.Rd +++ b/man/adjustKidsPerCouple.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R, +% R/helpSimulatePedigree_beta.R \name{adjustKidsPerCouple} \alias{adjustKidsPerCouple} +\alias{adjustKidsPerCouple_beta} \title{Generate or Adjust Number of Kids per Couple Based on Mating Rate} \usage{ -adjustKidsPerCouple(nMates, kpc, rd_kpc = TRUE) +adjustKidsPerCouple(nMates, kpc, rd_kpc = TRUE, beta = FALSE) + +adjustKidsPerCouple_beta(nMates, kpc, rd_kpc = TRUE) } \arguments{ \item{nMates}{Integer, the number of mated pairs in the generation.} @@ -16,6 +20,8 @@ value is 3. Returns an error when kpc equals 1.} \item{rd_kpc}{logical. If TRUE, the number of kids per mate will be randomly generated from a poisson distribution with mean kpc. If FALSE, the number of kids per mate will be fixed at kpc.} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} } \value{ A numeric vector with the generated or adjusted number of kids per couple. diff --git a/man/assignCoupleIDs.Rd b/man/assignCoupleIDs.Rd index d1de671e..cf4cc118 100644 --- a/man/assignCoupleIDs.Rd +++ b/man/assignCoupleIDs.Rd @@ -1,16 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R, +% R/helpSimulatePedigree_beta.R \name{assignCoupleIDs} \alias{assignCoupleIDs} \alias{assignCoupleIds} +\alias{assignCoupleIDs_beta} \title{Assign Couple IDs} \usage{ -assignCoupleIDs(df_Ngen) +assignCoupleIDs(df_Ngen, beta = FALSE) -assignCoupleIds(df_Ngen) +assignCoupleIds(df_Ngen, beta = FALSE) + +assignCoupleIDs_beta(df_Ngen) } \arguments{ \item{df_Ngen}{The dataframe for the current generation, including columns for individual IDs and spouse IDs.} + +\item{beta}{Logical, indicating whether to use the beta version of the function.} } \value{ The input dataframe augmented with a 'coupleId' column, where each mated pair has a unique identifier. diff --git a/man/buildBetweenGenerations.Rd b/man/buildBetweenGenerations.Rd index 4e297579..fa87604e 100644 --- a/man/buildBetweenGenerations.Rd +++ b/man/buildBetweenGenerations.Rd @@ -17,7 +17,8 @@ buildBetweenGenerations( momID = "momID", dadID = "dadID", code_male = "M", - code_female = "F" + code_female = "F", + beta = FALSE ) } \arguments{ @@ -60,6 +61,8 @@ kids per mate will be fixed at kpc.} \item{code_male}{The value to use for males. Default is "M"} \item{code_female}{The value to use for females. Default is "F"} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} } \value{ The function updates the `df_Fam` data frame in place, adding or modifying columns related to parental and offspring status, diff --git a/man/buildWithinGenerations.Rd b/man/buildWithinGenerations.Rd index 157faed6..905e42ad 100644 --- a/man/buildWithinGenerations.Rd +++ b/man/buildWithinGenerations.Rd @@ -1,10 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/simulatePedigree.R +% Please edit documentation in R/simulatePedigree_within.R \name{buildWithinGenerations} \alias{buildWithinGenerations} \title{Process Generations for Pedigree Simulation} \usage{ buildWithinGenerations( + beta = FALSE, sizeGens, marR, sexR, @@ -19,6 +20,8 @@ buildWithinGenerations( ) } \arguments{ +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} + \item{sizeGens}{A numeric vector containing the sizes of each generation within the pedigree.} \item{marR}{Mating rate. A numeric value ranging from 0 to 1 which determines @@ -46,7 +49,8 @@ a fertilized couple. The last generation has no mated individuals.} \item{code_female}{The value to use for females. Default is "F"} -\item{fam_shift}{An integer to shift the person ID. Default is 1L.} +\item{fam_shift}{An integer to shift the person ID. Default is 1L. +This is useful when simulating multiple pedigrees to avoid ID conflicts.} } \value{ A data frame representing the simulated pedigree, including columns for family ID (`fam`), diff --git a/man/createGenDataFrame.Rd b/man/createGenDataFrame.Rd index 014ee86c..1e6fa386 100644 --- a/man/createGenDataFrame.Rd +++ b/man/createGenDataFrame.Rd @@ -1,10 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R, +% R/helpSimulatePedigree_beta.R \name{createGenDataFrame} \alias{createGenDataFrame} +\alias{createGenDataFrame_beta} \title{Create Data Frame for Generation} \usage{ -createGenDataFrame(sizeGens, genIndex, idGen) +createGenDataFrame( + sizeGens, + genIndex, + idGen, + family_id_prefix = "fam", + beta = FALSE +) + +createGenDataFrame_beta(sizeGens, genIndex, idGen, family_id_prefix = "fam") } \arguments{ \item{sizeGens}{A numeric vector containing the sizes of each generation within the pedigree.} @@ -12,6 +22,10 @@ createGenDataFrame(sizeGens, genIndex, idGen) \item{genIndex}{An integer representing the current generation index for which the data frame is being created.} \item{idGen}{A numeric vector containing the ID numbers to be assigned to individuals in the current generation.} + +\item{family_id_prefix}{A character string to prefix the family ID. Default is "fam".} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} } \value{ A data frame representing the initial structure for the individuals in the specified generation diff --git a/man/determineSex.Rd b/man/determineSex.Rd index 39711ada..12f2b178 100644 --- a/man/determineSex.Rd +++ b/man/determineSex.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R, +% R/helpSimulatePedigree_beta.R \name{determineSex} \alias{determineSex} +\alias{determineSex_beta} \title{Determine Sex of Offspring} \usage{ -determineSex(idGen, sexR, code_male = "M", code_female = "F") +determineSex(idGen, sexR, code_male = "M", code_female = "F", beta = FALSE) + +determineSex_beta(idGen, sexR, code_male = "M", code_female = "F") } \arguments{ \item{idGen}{Vector of IDs for the generation.} @@ -14,6 +18,8 @@ determineSex(idGen, sexR, code_male = "M", code_female = "F") \item{code_male}{The value to use for males. Default is "M"} \item{code_female}{The value to use for females. Default is "F"} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} } \value{ Vector of sexes ("M" for male, "F" for female) for the offspring. diff --git a/man/markPotentialChildren.Rd b/man/markPotentialChildren.Rd index a51e496c..bf22329e 100644 --- a/man/markPotentialChildren.Rd +++ b/man/markPotentialChildren.Rd @@ -1,10 +1,23 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpPedigree.R +% Please edit documentation in R/helpSimulatePedigree.R, +% R/helpSimulatePedigree_beta.R \name{markPotentialChildren} \alias{markPotentialChildren} +\alias{markPotentialChildren_beta} \title{Mark and Assign children} \usage{ markPotentialChildren( + df_Ngen, + i, + Ngen, + sizeGens, + CoupleF, + code_male = "M", + code_female = "F", + beta = FALSE +) + +markPotentialChildren_beta( df_Ngen, i, Ngen, @@ -25,11 +38,16 @@ and any previously assigned roles (`ifparent`, `ifson`, `ifdau`).} \item{sizeGens}{Numeric vector, containing the size (number of individuals) of each generation.} -\item{CoupleF}{Integer, IT MIGHT BE the number of couples in the current generation.} +\item{CoupleF}{Integer scalar giving the number of distinct mating couples in the current +generation `i`. This is typically computed upstream from the spouse assignments +(e.g., as the number of unique non-missing spouse pairs in `df_Ngen`) and must satisfy +`0 <= CoupleF <= floor(sizeGens[i] / 2)`.} \item{code_male}{The value to use for males. Default is "M"} \item{code_female}{The value to use for females. Default is "F"} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} } \value{ Modifies `df_Ngen` in place by updating or adding columns related to individual roles diff --git a/man/ped2add.Rd b/man/ped2add.Rd index e26c100e..dc2fee88 100644 --- a/man/ped2add.Rd +++ b/man/ped2add.Rd @@ -27,9 +27,7 @@ ped2add( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2cn.Rd b/man/ped2cn.Rd index 69062b07..aafa5f7e 100644 --- a/man/ped2cn.Rd +++ b/man/ped2cn.Rd @@ -27,9 +27,7 @@ ped2cn( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2com.Rd b/man/ped2com.Rd index c69574d6..f34d6022 100644 --- a/man/ped2com.Rd +++ b/man/ped2com.Rd @@ -33,9 +33,7 @@ ped2com( \item{component}{character. Which component of the pedigree to return. See Details.} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2gen.Rd b/man/ped2gen.Rd index cda9f589..d57facbf 100644 --- a/man/ped2gen.Rd +++ b/man/ped2gen.Rd @@ -27,9 +27,7 @@ ped2gen( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/ped2mit.Rd b/man/ped2mit.Rd index 0f449b00..b4992fb9 100644 --- a/man/ped2mit.Rd +++ b/man/ped2mit.Rd @@ -28,9 +28,7 @@ ped2mit( \arguments{ \item{ped}{a pedigree dataset. Needs ID, momID, and dadID columns} -\item{max_gen}{the maximum number of generations to compute -(e.g., only up to 4th degree relatives). The default is 25. However it can be set to infinity. - `Inf` uses as many generations as there are in the data.} +\item{max_gen}{the maximum number of iterations that the adjacency matrix is multiplied to get the relatedness matrix. `Inf` uses as many iterations as there are in the data. Defaults to 25.} \item{sparse}{logical. If TRUE, use and return sparse matrices from Matrix package} diff --git a/man/simulatePedigree.Rd b/man/simulatePedigree.Rd index 36555c8e..fa87632e 100644 --- a/man/simulatePedigree.Rd +++ b/man/simulatePedigree.Rd @@ -25,7 +25,8 @@ simulatePedigree( spouseID = "spouseID", code_male = "M", code_female = "F", - fam_shift = 1L + fam_shift = 1L, + beta = FALSE ) SimPed(...) @@ -72,7 +73,10 @@ current version.} \item{code_female}{The value to use for females. Default is "F"} -\item{fam_shift}{An integer to shift the person ID. Default is 1L.} +\item{fam_shift}{An integer to shift the person ID. Default is 1L. +This is useful when simulating multiple pedigrees to avoid ID conflicts.} + +\item{beta}{logical. If TRUE, use the optimized version of the algorithm.} \item{...}{Additional arguments to be passed to other functions.} } diff --git a/tests/testthat/test-checkSex.R b/tests/testthat/test-checkSex.R index 427d37b4..9b6aaa66 100644 --- a/tests/testthat/test-checkSex.R +++ b/tests/testthat/test-checkSex.R @@ -54,7 +54,6 @@ test_that("checkSex identifies potentially problematic sex coding of non-female }) - # Test Case 3: Recode sex variable test_that("recodeSex correctly recodes sex in potter dataset", { recoded_potter <- recodeSex(potter, code_male = 1, code_female = 0, recode_male = "M", recode_female = "F") @@ -70,7 +69,6 @@ test_that("recodeSex correctly recodes sex in potter dataset", { }) - # Test Case 4: Handle missing values test_that("Functions handle missing values gracefully", { ped <- data.frame( diff --git a/tests/testthat/test-simulatePedigree.R b/tests/testthat/test-simulatePedigree.R index 3f06926c..75100e4c 100644 --- a/tests/testthat/test-simulatePedigree.R +++ b/tests/testthat/test-simulatePedigree.R @@ -1,53 +1,225 @@ test_that("simulated pedigree generates expected data structure", { - set.seed(5) + seed <- 5 Ngen <- 4 kpc <- 4 sexR <- .50 marR <- .7 + beta_options <- c(F, T) + strict_tolerance <- 1e-8 + sex_tolerance <- .035 + # beta_options <- T + for (beta in beta_options) { + set.seed(seed) + message("Beta option Starting: ", beta) + results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) + # Check that dimnames are correct + expect_equal(length(results$ID), 57, tolerance = strict_tolerance) + expect_equal(length(results), 7, tolerance = strict_tolerance) - results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR) - # Check that dimnames are correct - expect_equal(length(results$ID), 57, tolerance = 1e-8) - expect_equal(length(results), 7, tolerance = 1e-8) + # check number of generations + expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) - # check number of generations - expect_equal(max(results$gen), Ngen, tolerance = 1e-8) + # check + # check number of sex ratio + sex_mean_male <- mean(results$sex == "M") + sex_mean_female <- mean(results$sex == "F") - # check number of sex ratio - expect_equal(mean(results$sex == "M"), sexR, tolerance = .05) + expect_equal(sex_mean_male, sex_mean_female, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + # check number of sex ratio + expect_equal(mean(results$sex == "M"), sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + expect_equal(mean(results$sex == "F"), 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + message("Beta option Ending: ", beta) + } }) + + +test_that("simulated pedigree generates expected data structure when sexR is imbalanced", { + seed <- 51 + Ngen <- 5 + kpc <- 4 + sexR <- .55 + marR <- .7 + beta_options <- c(F, T) + strict_tolerance <- 1e-8 + sex_tolerance <- .03 + # beta_options <- T + for (beta in beta_options) { + set.seed(seed) + message("Beta option Starting: ", beta) + results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) + # Check that dimnames are correct + expect_equal(length(results$ID), 154, tolerance = strict_tolerance) + expect_equal(length(results), 7, tolerance = strict_tolerance) + + # check number of generations + expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) + + # check marR + + + # check number of sex ratio + sex_mean_male <- mean(results$sex == "M") + sex_mean_female <- mean(results$sex == "F") + + expect_lt(sex_mean_female, sex_mean_male) + + expect_equal(sex_mean_male, sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + expect_equal(sex_mean_female, 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + + message("Beta option Ending: ", beta) + } +}) + +test_that("simulated pedigree generates expected data structure when sexR is imbalanced in opposite", { + seed <- 51 + Ngen <- 6 + kpc <- 4 + sexR <- .45 + marR <- .7 + beta_options <- c(F, T) + strict_tolerance <- 1e-8 + sex_tolerance <- .03 + + # beta_options <- T + for (beta in beta_options) { + set.seed(seed) + message("Beta option Starting: ", beta) + results <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = beta) + # Check that dimnames are correct + expect_equal(length(results$ID), 424, tolerance = strict_tolerance) + expect_equal(length(results), 7, tolerance = strict_tolerance) + + # check number of generations + expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) + + + # check number of sex ratio + sex_mean_male <- mean(results$sex == "M") + sex_mean_female <- mean(results$sex == "F") + + expect_lt(sex_mean_male, sex_mean_female) + + expect_equal(sex_mean_male, sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + expect_equal(sex_mean_female, 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + message("Beta option Ending: ", beta) + } +}) + test_that("simulated pedigree generates expected data structure but supply var names", { - set.seed(5) + seed <- 5 Ngen <- 4 kpc <- 4 - sexR <- .50 + sexR <- .45 marR <- .7 code_male <- "M" code_female <- "Fe" personID <- "Id" - results <- simulatePedigree( - kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, - code_female = code_female, personID = personID, - code_male = code_male - ) - # Check that dimnames are correct - expect_equal(length(results$Id), 57, tolerance = 1e-8) - expect_equal(length(results), 7, tolerance = 1e-8) + beta_options <- c(F, T) + strict_tolerance <- 1e-8 + sex_tolerance <- .03 + # beta_options <- T - # check number of generations - expect_equal(max(results$gen), Ngen, tolerance = 1e-8) + for (beta in beta_options) { + set.seed(seed) + message("Beta option Starting: ", beta) + results <- simulatePedigree( + kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, + code_female = code_female, personID = personID, + code_male = code_male, + beta = beta + ) + # Check that dimnames are correct + expect_equal(length(results$Id), 57, tolerance = strict_tolerance) + expect_equal(length(results), 7, tolerance = strict_tolerance) - # check number of sex ratio - expect_equal(mean(results$sex == code_male), sexR, tolerance = .05) - expect_equal(mean(results$sex == code_female), sexR, tolerance = .05) + # check number of generations + expect_equal(max(results$gen), Ngen, tolerance = strict_tolerance) + + # check number of sex ratio + + # check number of sex ratio + sex_mean_male <- mean(results$sex == code_male) + sex_mean_female <- mean(results$sex == code_female) + + expect_lt(sex_mean_male, sex_mean_female) + + + expect_equal(sex_mean_male, sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + expect_equal(sex_mean_female, 1 - sexR, tolerance = sex_tolerance, info = paste0("Beta option: ", beta)) + message("Beta option Ending: ", beta) + } }) test_that("simulatePedigree verbose prints updates", { - set.seed(5) + seed <- 5 + Ngen <- 4 + kpc <- 4 + sexR <- .50 + marR <- .7 + beta_options <- c(F, T) + # beta_options <- T + for (beta in beta_options) { + set.seed(seed) + message("Beta option Starting: ", beta) + expect_message(simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, verbose = TRUE, beta = beta), regexp = "Let's build the connection within each generation first") + message("Beta option Ending: ", beta) + } +}) + +test_that("simulatePedigree accepts string aliases for beta parameter", { + seed <- 5 Ngen <- 4 kpc <- 4 sexR <- .50 marR <- .7 - expect_message(simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, verbose = TRUE), regexp = "Let's build the connection within each generation first") + # Test that "optimized" string alias works + set.seed(seed) + result_true <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = TRUE) + + set.seed(seed) + result_optimized <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "optimized") + + # Results should be identical when using TRUE vs "optimized" + expect_equal(nrow(result_true), nrow(result_optimized)) + expect_equal(ncol(result_true), ncol(result_optimized)) + expect_equal(result_true$ID, result_optimized$ID) + + # Test that "base" string alias works + set.seed(seed) + result_false <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = FALSE) + + set.seed(seed) + result_base <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "base") + + # Results should be identical when using FALSE vs "base" + expect_equal(nrow(result_false), nrow(result_base)) + expect_equal(ncol(result_false), ncol(result_base)) + expect_equal(result_false$ID, result_base$ID) + + # Test that "original" string alias works + set.seed(seed) + result_original <- simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "original") + + # Results should be identical when using FALSE vs "original" + expect_equal(nrow(result_false), nrow(result_original)) + expect_equal(ncol(result_false), ncol(result_original)) + expect_equal(result_false$ID, result_original$ID) + + # Test that invalid beta values throw errors + expect_error( + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "invalid"), + "Invalid value for parameter" + ) + + # Test that "index" and "indexed" both throw appropriate error + expect_error( + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "index"), + "not yet implemented" + ) + + expect_error( + simulatePedigree(kpc = kpc, Ngen = Ngen, sexR = sexR, marR = marR, beta = "indexed"), + "not yet implemented" + ) }) diff --git a/vignettes/v1_modelingvariancecomponents.Rmd b/vignettes/v1_modelingvariancecomponents.Rmd index ee18b269..0cf74977 100644 --- a/vignettes/v1_modelingvariancecomponents.Rmd +++ b/vignettes/v1_modelingvariancecomponents.Rmd @@ -98,6 +98,7 @@ identifyComponentModel( As you can see the model is identified, now that we've added another group. Let us confirm by fitting a model. First we prepare the data. ```{r, include=FALSE} +library(dplyr) if (!requireNamespace("OpenMx", quietly = TRUE)) { # if OpenMx isn't available n_subjects <- 500 @@ -117,12 +118,14 @@ if (!requireNamespace("OpenMx", quietly = TRUE)) { zyg = rep(c(1, 3), each = n_subjects / 2) ) twinData$ht2 <- twinData$ht1 * ifelse(twinData$zyg == 1, 1, 0.5) + - rnorm(n_subjects, - mean = 0, sd = df_summary_data$ht2_sd) + - .1 * rnorm(n_subjects, - mean = df_summary_data$ht2_mean, - sd = df_summary_data$ht2_sd) - + rnorm(n_subjects, + mean = 0, sd = df_summary_data$ht2_sd + ) + + .1 * rnorm(n_subjects, + mean = df_summary_data$ht2_mean, + sd = df_summary_data$ht2_sd + ) + twinData$ht2[twinData$zyg == 3] <- twinData$ht2[twinData$zyg == 3] + .5 * rnorm(sum(twinData$zyg == 3), mean = df_summary_data$ht2_mean, sd = df_summary_data$ht2_sd) } else { data(twinData, package = "OpenMx") @@ -130,7 +133,7 @@ if (!requireNamespace("OpenMx", quietly = TRUE)) { ``` ```{r} -require(dplyr) +library(dplyr) selVars <- c("ht1", "ht2") diff --git a/vignettes/v1_modelingvariancecomponents.html b/vignettes/v1_modelingvariancecomponents.html index 84a077f9..ac2a09f6 100644 --- a/vignettes/v1_modelingvariancecomponents.html +++ b/vignettes/v1_modelingvariancecomponents.html @@ -343,10 +343,10 @@

Modeling variance components

Introduction

-

This vignette provides a detailed guide to specific functions within -the BGmisc package that aid in the identification and -fitting of variance component models common in behavior genetics. We -will explore key functions such as identifyComponentModel, +

This vignette provides a detailed guide to specific functions in the +BGmisc package that aid in the identification and fitting +of variance component models common in behavior genetics. We will +explore key functions such as identifyComponentModel, providing practical examples and theoretical background. Identification ensures a unique set of parameters that define the model-implied covariance matrix, preventing free parameters from trading off one @@ -421,113 +421,106 @@

Using identifyComponentModel Function

#> character(0)

As you can see the model is identified, now that we’ve added another group. Let us confirm by fitting a model. First we prepare the data.

-
require(dplyr)
-#> Loading required package: dplyr
-#> 
-#> Attaching package: 'dplyr'
-#> The following objects are masked from 'package:stats':
-#> 
-#>     filter, lag
-#> The following objects are masked from 'package:base':
-#> 
-#>     intersect, setdiff, setequal, union
-
+
library(dplyr)
+
+
+selVars <- c("ht1", "ht2")
+
+mzdzData <- subset(
+  twinData, zyg %in% c(1, 3),
+  c(selVars, "zyg")
+)
+
+mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
 
-selVars <- c("ht1", "ht2")
-
-mzdzData <- subset(
-  twinData, zyg %in% c(1, 3),
-  c(selVars, "zyg")
-)
-
-mzdzData$RCoef <- c(1, NA, .5)[mzdzData$zyg]
-
-
-mzData <- mzdzData %>% filter(zyg == 1)
+ +mzData <- mzdzData %>% filter(zyg == 1)

Let us fit the data with MZ twins by themselves.

if (!requireNamespace("EasyMx", quietly = TRUE)) {
- print("Please install EasyMx to run the model fitting examples.")
+  print("Please install EasyMx to run the model fitting examples.")
 } else {
-run1 <- emxTwinModel(
-  model = "Cholesky",
-  relatedness = "RCoef",
-  data = mzData,
-  use = selVars,
-  run = TRUE, name = "TwCh"
-)
-
-summary(run1)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.05122646           NA    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.03518629           NA       0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02325722 0.0007017955 !     0!       
-#> 4    Mht1  Means ht1   1 1.62974908 0.0027023907                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1112             -3693.148
-#>    Saturated:              5                   1111                    NA
-#> Independence:              4                   1112                    NA
-#> Number of observations/statistics: 569/1116
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -5917.148              -3685.148                -3685.078
-#> BIC:     -10747.543              -3667.773                -3680.471
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-07 10:34:22 
-#> Wall clock time: 0.0555501 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
+ library(EasyMx) + run1 <- emxTwinModel( + model = "Cholesky", + relatedness = "RCoef", + data = mzData, + use = selVars, + run = TRUE, name = "TwCh" + ) + + summary(run1) +} +#> Running TwCh with 4 parameters +#> Summary of TwCh +#> +#> free parameters: +#> name matrix row col Estimate Std.Error A lbound ubound +#> 1 sqrtA11 sqrtA 1 1 0.05122646 NA 1e-06 +#> 2 sqrtC11 sqrtC 1 1 0.03518629 NA 0! +#> 3 sqrtE11 sqrtE 1 1 0.02325722 0.0007017955 ! 0! +#> 4 Mht1 Means ht1 1 1.62974908 0.0027023907 +#> +#> Model Statistics: +#> | Parameters | Degrees of Freedom | Fit (-2lnL units) +#> Model: 4 1112 -3693.148 +#> Saturated: 5 1111 NA +#> Independence: 4 1112 NA +#> Number of observations/statistics: 569/1116 +#> +#> Information Criteria: +#> | df Penalty | Parameters Penalty | Sample-Size Adjusted +#> AIC: -5917.148 -3685.148 -3685.078 +#> BIC: -10747.543 -3667.773 -3680.471 +#> To get additional fit indices, see help(mxRefModels) +#> timestamp: 2026-01-24 22:17:23 +#> Wall clock time: 0.05765486 secs +#> optimizer: SLSQP +#> OpenMx version number: 2.22.10 +#> Need help? See help(mxSummary)

As you can see the model was unsuccessful because it was not identified. But when we add another group, so that the model is identified, the model now fits.

if (!requireNamespace("EasyMx", quietly = TRUE)) {
- print("Please install EasyMx to run the model fitting examples.")
+  print("Please install EasyMx to run the model fitting examples.")
 } else {
-run2 <- emxTwinModel(
-  model = "Cholesky",
-  relatedness = "RCoef",
-  data = mzdzData,
-  use = selVars,
-  run = TRUE, name = "TwCh"
-)
-
-summary(run2)
-}
-#> Running TwCh with 4 parameters
-#> Summary of TwCh 
-#>  
-#> free parameters:
-#>      name matrix row col   Estimate    Std.Error A lbound ubound
-#> 1 sqrtA11  sqrtA   1   1 0.06339271 0.0014377690    1e-06       
-#> 2 sqrtC11  sqrtC   1   1 0.00000100 0.0250260004 !     0!       
-#> 3 sqrtE11  sqrtE   1   1 0.02330040 0.0007015267       0!       
-#> 4    Mht1  Means ht1   1 1.63295540 0.0020511844                
-#> 
-#> Model Statistics: 
-#>                |  Parameters  |  Degrees of Freedom  |  Fit (-2lnL units)
-#>        Model:              4                   1803             -5507.092
-#>    Saturated:              5                   1802                    NA
-#> Independence:              4                   1803                    NA
-#> Number of observations/statistics: 920/1807
-#> 
-#> Information Criteria: 
-#>       |  df Penalty  |  Parameters Penalty  |  Sample-Size Adjusted
-#> AIC:      -9113.092              -5499.092                -5499.048
-#> BIC:     -17811.437              -5479.794                -5492.498
-#> To get additional fit indices, see help(mxRefModels)
-#> timestamp: 2026-01-07 10:34:22 
-#> Wall clock time: 0.04742312 secs 
-#> optimizer:  SLSQP 
-#> OpenMx version number: 2.22.10 
-#> Need help?  See help(mxSummary)
+ library(EasyMx) + run2 <- emxTwinModel( + model = "Cholesky", + relatedness = "RCoef", + data = mzdzData, + use = selVars, + run = TRUE, name = "TwCh" + ) + + summary(run2) +} +#> Running TwCh with 4 parameters +#> Summary of TwCh +#> +#> free parameters: +#> name matrix row col Estimate Std.Error A lbound ubound +#> 1 sqrtA11 sqrtA 1 1 0.06339271 0.0014377690 1e-06 +#> 2 sqrtC11 sqrtC 1 1 0.00000100 0.0250260004 ! 0! +#> 3 sqrtE11 sqrtE 1 1 0.02330040 0.0007015267 0! +#> 4 Mht1 Means ht1 1 1.63295540 0.0020511844 +#> +#> Model Statistics: +#> | Parameters | Degrees of Freedom | Fit (-2lnL units) +#> Model: 4 1803 -5507.092 +#> Saturated: 5 1802 NA +#> Independence: 4 1803 NA +#> Number of observations/statistics: 920/1807 +#> +#> Information Criteria: +#> | df Penalty | Parameters Penalty | Sample-Size Adjusted +#> AIC: -9113.092 -5499.092 -5499.048 +#> BIC: -17811.437 -5479.794 -5492.498 +#> To get additional fit indices, see help(mxRefModels) +#> timestamp: 2026-01-24 22:17:23 +#> Wall clock time: 0.04731297 secs +#> optimizer: SLSQP +#> OpenMx version number: 2.22.10 +#> Need help? See help(mxSummary) diff --git a/vignettes/v2_pedigree.Rmd b/vignettes/v2_pedigree.Rmd index 92ea4f7c..5f0f3698 100644 --- a/vignettes/v2_pedigree.Rmd +++ b/vignettes/v2_pedigree.Rmd @@ -51,6 +51,7 @@ The columns represents the individual's family ID, the individual's personal ID, ```{r} summarizeFamilies(df_ped, famID = "fam")$family_summary ``` + ## Plotting Pedigree Pedigrees are visual diagrams that represent family relationships across generations. They are commonly used in genetics to trace the inheritance of specific traits or conditions. This vignette will guide you through visualizing simulated pedigrees using the `plotPedigree` function. This function is a wrapper function for `Kinship2`'s base R plotting. The sister package ggpedigree has a much nicer plotting function. It's also available on CRAN, but it is not a dependency of BGmisc. If you want to use ggpedigree, you can install it with `install.packages("ggpedigree")` and then use `ggplot2` syntax to plot pedigrees. diff --git a/vignettes/v2_pedigree.html b/vignettes/v2_pedigree.html index b9c6dac0..b8f34803 100644 --- a/vignettes/v2_pedigree.html +++ b/vignettes/v2_pedigree.html @@ -48,8 +48,9 @@ }