Tutorial of R package gpyramid

Shoji Taniguchi

1. Set up

library(gpyramid)
library(ape)
library(dplyr)

2. Prepare data

2.1 Gene data

line_df <- data.frame(line = c("x1", "x2", "x3", "x4", "x5", "x6"),
                      gene1 = c("H", "H", "A", "B", "B", "B"),
                      gene2 = c("H", "H", "A", "B", "B", "A"),
                      gene3 = c("H", "H", "B", "A", "A", "A"),
                      gene4 = c("H", "B", "B", "B", "A", "B"),
                      gene5 = c("H", "H", "B", "A", "B", "B"),
                      gene6 = c("B", "A", "B", "A", "B", "B"),
                      gene7 = c("B", "B", "B", "B", "B", "A"))

line_df
#>   line gene1 gene2 gene3 gene4 gene5 gene6 gene7
#> 1   x1     H     H     H     H     H     B     B
#> 2   x2     H     H     H     B     H     A     B
#> 3   x3     A     A     B     B     B     B     B
#> 4   x4     B     B     A     B     A     A     B
#> 5   x5     B     B     A     A     B     B     B
#> 6   x6     B     A     A     B     B     B     A

2.2 Position data

position_df <- data.frame(Gene = c("gene1", "gene2", "gene3", "gene4", "gene5", "gene6", "gene7"),
                          Chr = c("1", "2", "3", "4", "5", "6", "7"),
                          cM = c(20, 0, 40, 20, 10, 0, 0))
position_df
#>    Gene Chr cM
#> 1 gene1   1 20
#> 2 gene2   2  0
#> 3 gene3   3 40
#> 4 gene4   4 20
#> 5 gene5   5 10
#> 6 gene6   6  0
#> 7 gene7   7  0

2.3 Preprosessing

Generate haplotype dataframe from row data

gene_dat <- util_haplo(line_df, target = "A", non_target = "B", hetero = "H", line_cul = "line")

gene_df1 <- gene_dat[[1]]
gene_df2 <- gene_dat[[2]]
line_id <- gene_dat[[3]]

colnames(gene_df1) <- line_id
colnames(gene_df2) <- line_id

gene_df1
#>      x1 x2 x3 x4 x5 x6
#> [1,]  1  1  1  0  0  0
#> [2,]  1  1  1  0  0  1
#> [3,]  1  1  0  1  1  1
#> [4,]  1  0  0  0  1  0
#> [5,]  1  1  0  1  0  0
#> [6,]  0  1  0  1  0  0
#> [7,]  0  0  0  0  0  1
gene_df2
#>      x1 x2 x3 x4 x5 x6
#> [1,]  0  0  1  0  0  0
#> [2,]  0  0  1  0  0  1
#> [3,]  0  0  0  1  1  1
#> [4,]  0  0  0  0  1  0
#> [5,]  0  0  0  1  0  0
#> [6,]  0  1  0  1  0  0
#> [7,]  0  0  0  0  0  1

Generate recombination probability matrix from raw data

recom_mat <- util_recom_mat(position_df, "cM")
recom_mat
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> [1,]  0.0  0.5  0.5  0.5  0.5  0.5  0.5
#> [2,]  0.5  0.0  0.5  0.5  0.5  0.5  0.5
#> [3,]  0.5  0.5  0.0  0.5  0.5  0.5  0.5
#> [4,]  0.5  0.5  0.5  0.0  0.5  0.5  0.5
#> [5,]  0.5  0.5  0.5  0.5  0.0  0.5  0.5
#> [6,]  0.5  0.5  0.5  0.5  0.5  0.0  0.5
#> [7,]  0.5  0.5  0.5  0.5  0.5  0.5  0.0

3. Find parent sets from candidate lines (cultivars)

From candidate lines, findPset function returns the parent sets for gene pyramidding.

In this example, there are 4 sets for gene pyramidding.

line_comb_lis <- findPset(gene_df1, gene_df2, line_id)
line_comb_lis
#> [[1]]
#> [1] "x1" "x2" "x6"
#> 
#> [[2]]
#> [1] "x1" "x4" "x6"
#> 
#> [[3]]
#> [1] "x2" "x5" "x6"
#> 
#> [[4]]
#> [1] "x3" "x4" "x5" "x6"

4. Calculate the number of necessary individuals and generations

4.1 Calculate cost of all the crossing schemes

calcCostAll function calculates the number of necessary individuals and generations as the crossing cost for all the crossing schemes.

Given parent sets for gene pyramidding, calCostAll function simulates all the crossing schemes and calculates the number of necessary individuals and generations as the cost of gene pyramidding.

calcCostAll function returns the gpyramid_all object, which contains information of all the crossing schemes.

Here, getFromAll function get one crossing scheme from gpyramid_all object.

cost_rslt <- calcCostAll(line_comb_lis, gene_df1, gene_df2, recom_mat, prob_total = 0.99)
cost_rslt
#> # Head of the summary table
#> # A tibble: 6 × 6
#>   cross_id n_generation N_total n_parent line_comb_id TR_id
#>      <dbl>        <dbl>   <dbl>    <dbl>        <dbl> <dbl>
#> 1        1            2    1521        3            1     1
#> 2        2            2    2792        3            1     2
#> 3        3            2    2728        3            1     3
#> 4        4            2     250        3            2     1
#> 5        5            2    2357        3            2     2
#> 6        6            2     250        3            2     3
#> 
#> # To extract summary table, check the list element named as cost_all.

4.2 Plot cost of each crossing scheme

The output of calCostAll function contains the cost of each crossing scheme.

Here, we generate a plot of the number of necessary plants and generations.

cost_all <- cost_rslt$cost_all
plot(x = cost_all$n_parent, y = cost_all$N_total, 
     log = "y", xlab = "Number of generations", ylab = "Number of individuals")

4.3 Select the most cost-effective crossing strategy

cost_all[cost_all$N_total < 200,]
#> # A tibble: 4 × 6
#>   cross_id n_generation N_total n_parent line_comb_id TR_id
#>      <dbl>        <dbl>   <dbl>    <dbl>        <dbl> <dbl>
#> 1        7            2     166        3            3     1
#> 2       10            3     124        4            4     1
#> 3       12            3     124        4            4     3
#> 4       13            3     124        4            4     4
rslt_one <- getFromAll(cost_rslt, cross_id = 13)
summary(rslt_one)
#> Number of necessary plants for each crossing
#> 
#>   nodeid n_plant
#> 1 node_6       1
#> 2 node_7      83
#> 3 node_5      40
#> 
#> Target haplotype of crossing
#> 
#> nodeid=5
#>   hap1 hap2
#> 1    0    1
#> 2    0    1
#> 3    1    1
#> 4    0    1
#> 5    1    0
#> 6    1    0
#> 7    0    1
#> 
#> nodeid=6
#>   hap1 hap2
#> 1    1    0
#> 2    1    0
#> 3    0    1
#> 4    0    1
#> 5    0    0
#> 6    0    0
#> 7    0    0
#> 
#> nodeid=7
#>   hap1 hap2
#> 1    1    0
#> 2    1    1
#> 3    1    1
#> 4    1    0
#> 5    0    0
#> 6    0    0
#> 7    0    1
plot(rslt_one$topolo)
nodelabels()

4.4 Another example

rslt_one <- getFromAll(cost_rslt, cross_id = 7)
summary(rslt_one)
#> Number of necessary plants for each crossing
#> 
#>   nodeid n_plant
#> 1 node_5      83
#> 2 node_4      83
#> 
#> Target haplotype of crossing
#> 
#> nodeid=4
#>   hap1 hap2
#> 1    0    1
#> 2    0    1
#> 3    1    1
#> 4    1    0
#> 5    0    1
#> 6    0    1
#> 7    0    1
#> 
#> nodeid=5
#>   hap1 hap2
#> 1    1    0
#> 2    1    1
#> 3    1    1
#> 4    0    0
#> 5    1    0
#> 6    1    0
#> 7    0    1
plot(rslt_one$topolo)
nodelabels()