Presence-Only Example with GAMs
Below is a comlete, documented, example of using R to create a model with GAMs from presence-only data. This example uses the following steps to create the model. These steps could easily be used to create other types of models from presence-only data.
- Load the required libraries (this step only needs to be executed once)
- Require the required libraries
- Let the user select a file with the sample data and then load it into R
- Let the user select a folder with the predictor layers and then load them into a vector in R
- Plot one of the predictors with the sample data overlaid
- Randomly sample the area of the predictors for "background" points and then add them as "absences" to the sample data
- Divide the data into "training" and "test" datasets
- Run the model on the "training" dataset
- Find the "Area-Under-The-Curve" metric for the results
- Plot the final predicted surface
The Code
#################################################################################################################
# Purpose: GEO 599 Big data modeling class - GAMs example with only prescence data
# Date of most recent modification: 4.24.2013
# Date: 4.14.2013
# Authors: Jose Montero & Jim Graham
# Language: R
# Data from ?
#################################################################################################################
############################## load the data and required libraries
install.packages("MapGAM")
install.packages("mgcv")
install.packages("tcltk2")
install.packages("rgdal")
install.packages("dismo")
install.packages("raster")
install.packages("maptools")
install.packages("sp")
install.packages( "ggplot2")
install.packages("rgdal")
require(ROCR) # Library to calculate AUC
require(ggplot2) # ggplot2 library to make the map
require(tcltk2) # library to set the directory to load the predictor rasters
require(rgdal) # you should know already this one if are all spatial people
require(dismo) # the dismo packages has losts of toos for SDM
require(raster) # to handle raster. It has a lot of tools similar to ArcMap
require(maptools) # Another usefull library for spatial stuff
require(sp) # this is the spatial library
require(mgcv) # GAMs library
FilePath=file.choose()
TheRaster=raster(FilePath)
trees=read.csv(FilePath,h=T) # load the data. flile.choose() allows you to search for the data
names(trees) # Chech the names of hech attribute in the data frame you loaded
trees$pb=1 # add 1 to the data frmae to represent prescences
trees$pb
# Add the rasters
setwd(tk_choose.dir(default="", caption="Select directory")) # select directory where the .img rasters are
file.list=list.files() # allocate memory for the list of files
rasters=list() # allocate memory for the list of rasters
for(file in file.list) # loop to load the rasters
{
print(file) # print the variable in the loop to ccheck is working
rasters[[file]]=raster(file) # generate the list of raster objects
} # end of the loop
all.rasters =stack(rasters) # create a stack layer of rasters (to do this all should be mask the same way)
plot(all.rasters) # plot the stack to
plot(rasters[[1]], # plot the raster 1 in the list
xlim=c(-125,-114), ylim=c(32,42))
points(trees$x, trees$y, # over lay the tree points on it
pch=19, cex=.2, col='black')
############################# Backgrounds sampling when abscence data is not available
set.seed(100000) # generate 100000 the random points
mask=rasters[[1]] # generate a mask to limit the sampling region where we are going to predict
backG = randomPoints(mask, 10000) # sample 5000 background points within the mask
predictors=stack(rasters) # make a new stack call predictors. you can also use the one we created before
coordinates(trees) = ~Lon+Lat # get the tree cordinates
backgrp = extract(predictors, backG) # sample predictor values to background points
presv = extract(predictors, trees) # sample predictor values to prescence points
backgrp=data.frame(backgrp) # make data frame of extracted back points
pb = c(rep(1, nrow(presv)), rep(0, nrow(backgrp))) # put pres/abs values together
sdmdata = data.frame(cbind(pb, rbind(presv, backgrp))) # make final data frame for modeling
sdmdata=na.omit(sdmdata) # omit NA values
############################## dividing the data in testing and train sets
index=1:nrow(sdmdata) # get data frame index for sampling
test.index=sample(index, trunc(length(index)/3)) # 30% of index is sampled as test data frame
test.set=sdmdata[test.index,] # make test data frame
trai.nset=sdmdata[-test.index,] # make train data frame
############################## GAM model
m1 =gam(pb ~ s(AnnualMeanTemp) + # GAM model with all predictor variables on it
s(AnnualPrecip) +
s(DEM) +
s(MaxTempWarmestMonth) +
s(MinTempColdestMonth),
data=train.set,
family=binomial(link = "logit"), # binomial distribution with logit link function
gamma=1.4)
summary(m1) # summary of the GAM results
plot(m1, pages=1, scale=F, shade=T) # make partial dependence plots
############################## model validation using AUC
pred.test=predict(m1,test.set,type='response') # make prediction for test set
preds.obs=data.frame(pred.test=pred.test,test.set$pb) # data frame of preds vs obs
gam1.eval=prediction(pp$model_out,pp$test_.set) # Asses prediction for AUC
attributes(performance(gam1.eval, 'auc'))$y.values[[1]] # get AUC value
roc(preds.obs$test.set.pb,preds.obs$pred.test) # use gbm package to the AUC
write.table(preds.obs,file.choose()) # write preds-obs table to clean it in excel
pp=read.delim(file.choose(), h=T) # load the table after cleaning
############################### plot predicted raster with ggplot2 library
preds=predict(predictors,m1,type='response') # make prediction for the entire grid
Preds = rasterToPoints(preds) # raster to points
Preds = data.frame(Preds) # make a dataframe of the points to work with ggplot2
colnames(Preds) = c("X","Y","probability") # change tha names
# make the plot ussing ggplot2 (ggplot2 is an entire new language so we wont go into the details. go to: )
ggplot(NULL, aes(X, Y)) +
geom_raster(data = Preds, aes(fill = probability)) +
scale_fill_gradientn(name="Probability",colours = topo.colors(100))+
guides(fill = guide_colorbar()) +
scale_alpha(range = c(0, 0.5), guide = "none") +
scale_x_continuous(name=expression(paste("")), limits=c(-125,-114),expand=c(0,0)) +
scale_y_continuous(name=expression(paste("")), limits=c(32,42),expand=c(0,0)) +
coord_equal()
writeRaster(preds, file.choose()) # export predicted raster outside R
R for Spatial Statistics