<-
list.of.packages c(
"dplyr",
"ggplot2",
"gt", #package for customizing html table view
"conflicted", #Some functions are named the same way across different packages and they can be conflicts and this package helps sorting this problem
"here", #helps with the reproducibility across operating systems=
"reshape2", # reshaping the data
"plotly", # another advanced visualization tool
"pbapply", # Parallelization libraries
"parallel"
)
<-
new.packages !(list.of.packages %in% installed.packages()[, "Package"])]
list.of.packages[
#Download packages that are not already present in the library
if (length(new.packages))
install.packages(new.packages)
# Load packages
<-
packages_load lapply(list.of.packages, require, character.only = TRUE)
#Print warning if there is a problem with installing/loading some of packages
if (any(as.numeric(packages_load) == 0)) {
warning(paste("Package/s: ", paste(list.of.packages[packages_load != TRUE], sep = ", "), "not loaded!"))
else {
} print("All packages were successfully loaded.")
}rm(list.of.packages, new.packages, packages_load)
#Resolve conflicts
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
#if install is not working try changing repositories
#install.packages("ROCR", repos = c(CRAN="https://cran.r-project.org/"))
In this section, concepts about dispersal are studied using simulation. In the R code that follows, dispersal within a field is explored, beginning with the case of random positioning of initial infections.
This section’s goal is to initiate the thinking process about specific questions such as, when would a distribution be random based on the inoculum sources.
To begin, suppose you would like to generate a simulated map of individual plants and their infection status, where infections are randomly assigned to individuals in the initial map and the number of infections per individual is tallied. First generate a matrix of dimensions 10x10 containing zeroes, where zero indicates no infection.
<- matrix(0,10,10)
map1 <- 3 initial_inf
We want to make sure that the random process is reproducible on every machine.
set.seed(123)
Randomly select row and column coordinates for vector carrying
information about the number of initial infections
initial_inf
, where initial_inf
can be assigned
any value.
<- 3
initial_inf <- sample(1:10,size=initial_inf, replace=T))
(row1 <- sample(1:10, size=initial_inf, replace=T)) (col1
Assign the infections to the plants corresponding to the randomly drawn coordinates, and look at the new map.
for(j in 1:initial_inf) {
<- map1[row1[j],col1[j]] + 1}
map1[row1[j],col1[j]] map1
A handy function melt
for melting array (and other
types) data is found in reshape2
package.
<-
map1_long ::melt(map1, varnames = c("rows", "cols"))
reshape2# See first rows onf melted matrix
head(map1_long)
Finally, it is possible to visualize this data using ggplot.
<- 11
point_size %>%
map1_long ggplot(aes(factor(cols), factor(rows),
color = value,
label = as.character(value)))+
geom_point(size = point_size, shape = 15)+
geom_text(colour = "black")+
scale_color_gradient("Health status (counts of infected plants):",
low = "lightgreen", high = "#993300")+
theme_bw()+
theme(axis.title = element_blank(),
legend.position = "top",
panel.background = element_rect(terrain.colors(5)[3]),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
coord_equal()
To avoid repetition, the last two steps (melting and plotting) were joined into a function.
<- function(data,
PlotMat point_size = 11 # default value
){::melt(data, varnames = c("rows", "cols")) %>%
reshape2ggplot(aes(factor(cols), factor(rows),
color = value,
label = as.character(value)))+
geom_point(size = point_size, shape = 15)+
geom_text(colour = "black")+
scale_colour_gradient("Health status (counts of infected plants):",
low = "lightgreen", high = "#993300")+
theme_bw()+
theme(axis.title = element_blank(),
legend.position = "top",
panel.background = element_rect(terrain.colors(5)[3]),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())+
coord_equal()
}
Start with a simple model of likely infections of adjacent plants. Such pattern of dispersion is called the rook’s move. We move onto simulation of generations of reproduction and dispersal.
The matrix map1
is redefined here.
Randomly select row and column coordinates for
initial_inf
(initial infections), where initial_inf can be
assigned any value.
<- map1
map2 for(xrow in 1:10){
for(xcol in 1:10){
<- map1[xrow,xcol]
numprop if(xrow > 1){
-1,xcol] <- map2[xrow-1,xcol] + numprop}
map2[xrowif(xrow < 10){
+1,xcol] <- map2[xrow+1,xcol] + numprop}
map2[xrowif(xcol < 10){
+1] <- map2[xrow,xcol+1] + numprop}
map2[xrow,xcolif(xcol > 1){
-1] <- map2[xrow,xcol-1] + numprop}
map2[xrow,xcol<- map2[xrow,xcol] + numprop
map2[xrow,xcol]
} }
Assign the infections to the plants corresponding to the randomly drawn coordinates, and look at the new map.
lapply(list(map1,map2), PlotMat)
This process can be repeated a defined number of generations, based on pathogen life cycle.
#Assign the number of generations of dispersal to be studied
<- 6 num_gen
Set up mapn
as a list that includes num_gen maps.
<- as.list(1:num_gen) mapn
Initialize each of the num_gen maps to contain zeroes.
for(j in 1:num_gen){mapn[[j]] <- matrix(0,10,10)}
#For each of the next maps
1]] <- map1
mapn[[names(mapn)[1] <- "1"
for(j in 2:num_gen){
#Temporarily make the map the same as one generation back
<- mapn[[j-1]]
mapn[[j]] #Then add the new infections following the rooks' moves
for(xrow in 1:10){
for(xcol in 1:10){
<- mapn[[j-1]][xrow,xcol]
numprop if(xrow > 1){mapn[[j]][xrow-1,xcol] <-
-1,xcol] + numprop}
mapn[[j]][xrowif(xrow < 10){mapn[[j]] [xrow+1,xcol] <-
+1,xcol] + numprop}
mapn[[j]][xrowif(xcol < 10){mapn[[j]] [xrow,xcol+1] <-
+1] + numprop}
mapn[[j]][xrow,xcolif(xcol > 1){mapn[[j]] [xrow,xcol-1] <-
-1] + numprop}
mapn[[j]][xrow,xcol<- mapn[[j]][xrow,xcol] + numprop
mapn[[j]][xrow,xcol]
}
}names(mapn)[j] <- j
}
Make a plot for each element of the list.
<- lapply(mapn, PlotMat) plots
Note: Move cursor on lapply
and pres F1
to find out more about this funciton.
Display all plots.
plots
<- list()
mapn_plot for(i in seq(mapn)){
<-
mapn_plot[[i]] ::melt(mapn[[i]]) %>%
reshape2mutate(time = names(mapn[i]))
}
<-
gg %>%
mapn_plot bind_rows() %>%
ggplot(aes(factor(Var1), factor(Var2),
color = value,
label = as.character(value)))+
geom_point(aes(frame = time), size = point_size, shape = 15)+
scale_colour_gradient("Health status (counts of infected plants):",
low = "lightgreen", high = "#993300")+
theme_bw()+
theme(axis.title = element_blank(),
legend.position = "none",
panel.background = element_rect(terrain.colors(5)[3]),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
::ggplotly(gg) plotly
Here we create map for a defined area 1000m x 1000m.
Assign the number of initial infections, nstart
.
<- 20 nstart
Randomly draw x and y coordinates for the initial infections.
<- runif(n=nstart, min=0, max=1000)
row1 <- runif(n=nstart, min=0, max=1000) col1
Data is drawn form exponential distribution using function
rexp
.
hist(rexp(n=1000, rate=2),col="blue",xlim =range(0:13), ylim = range(0:600))
hist(rexp(n=1000, rate=1),col="blue",xlim =range(0:13), ylim = range(0:600))
hist(rexp(n=1000, rate=0.5),col="blue", xlim =range(0:13), ylim = range(0:600))
Simulation parameters are as follows: - nstart
is the
number of initial infections - nnew
is the number of new
infections produced by each pre-existing infection each generation -
ngen
is the number of generations - exppar
is
the rate parameter for the exponential distribution
<- matrix(runif(n=2*nstart, min=0, max=1000),
coord nrow=nstart, ncol=2)
<- nstart
ntot <- 10
nnew <- 3
ngen <- 0.2 exppar
Create a function to run computations.
<- function(nstart, nnew, ngen, exppar){
disperse2 <- matrix(runif(n=2*nstart, min=0, max=1000),
coord nrow=nstart, ncol=2)
<- nstart
ntot for(i in 1:ngen){
for(j in 1:ntot){
<- coord[j,1]
tempx <- coord[j,2]
tempy <- runif(n=nnew, min=-pi, max=pi)
newdir <- rexp(n=nnew, rate=exppar)
newdist <- tempx + newdist * sin(newdir)
newx <- tempy + newdist * cos(newdir)
newy <- cbind(newx,newy)
newcoord <- rbind(coord,newcoord)
coord
}<- length(coord[,1])
ntot
}
coord }
Select rates and number of generations for visualization.
Make combinations of different rate parameters and number of generations to explore.
<-
(combos expand.grid(rate = c(.05, .1, .5),
gen = c(1, 2, 3, 4, 5)))
A matrix is created for each combination of parameters and stored in
list sim_ls
.
<- list()
sim_ls
<- Sys.time())
(start_time_single
for(i in seq(1:nrow(combos))){
<-
sim_ls[[i]] disperse2(nstart=20, nnew=10,
ngen=combos[i,2],
exppar= combos[i,1]) %>%
as_tibble() %>%
mutate(rates =paste0("rate = ", combos[i,1]),
ngen = paste0("gen = ", combos[i,2]))
# Error checking
# Sometimes it is useful to know which operations in the look are finished
# to be able to determine where the problems lies if an error occurs
print(i)
}
# Check the time difference
<- Sys.time() end_time_single
More intensive computations such as the one above could take a long
time. Since R is natively using a single core for computations we have
to use specific package and specific commands to be able to use multiple
cores. There are several packages that allow this and here we present an
example of use with pbapply
package.
# Split the combos data frame into a list
<-
combos_ls split(combos, 1:nrow(combos))
#calculate the time needed for the calculations
<- Sys.time())
(start_time_multi
# Detect the number of cores
<- detectCores()
cores
# sometimes we do not want to use all cores if we are using the machine for other operations
# cores <- ifelse(detectCores() > 1, detectCores()-1, 1)
# Make a cluster of cores
<- makeCluster(cores)
cl
# Export functions used in calculations
clusterExport(cl, c("combos_ls", "sim_ls", "disperse2", "sim_ls"))
# Export libraries needed for computation
clusterEvalQ(cl, library("dplyr", quietly = TRUE, verbose = FALSE))
<-
sim_ls pblapply(combos_ls, function(x){
disperse2(nstart=20, nnew=10,
ngen=x[,2],
exppar= x[,1]) %>%
as_tibble() %>%
mutate(rates =paste0("rate = ", x[ ,1]),
ngen = paste0("gen = ", x[ ,2]))
cl = cl)
},
# Check the time difference
<- Sys.time() end_time_multi
The cluster must be stopped at the end of the operation, or else, these remain active process and will slow the computer.
stopCluster(cl)
It is a good practice to save the results of the analysis right after the analysis finishes.
saveRDS(sim_ls, here::here("simulation.rds"))
Benefits of multicore processing are obvious on the table below.
# Duration of single core default R calculation
- start_time_single
end_time_single
# Duration of multic0re calculation
- start_time_multi end_time_multi
Finally, it is possible to visualize how does development of epidemics depend from the rate of spread.
%>%
sim_ls bind_rows() %>%
ggplot(aes(newx, newy))+
ylim(0,1000)+
xlim(0,1000)+
geom_point(size = .3)+
facet_grid(rates~ngen)+
theme_bw()+
xlab("East-West")+
ylab("South-North")+
coord_equal()
::include_graphics(here::here("data/spatial_prog/map.png")) knitr