### How to use the "imputation()" function

#########################################################################################################
### imputation <- function(dataframe,method=c("gam", "meanimp", "medianimp", "general.regression")    ###
###                                 ,binom.data=NULL,count.data=NULL,multinom.data=NULL, spn=TRUE){}  ###
###                                                                                                   ###
### For example, if you have dataframe with 4 columns where the third column is binary, just type     ###
### "imputation(dataframe, method="gam", binom.data=3)" for a GAM imputation                          ###
### Use method="general.regression" for a faster imputation based on simple GLMs                      ###
#########################################################################################################

# you need the following libraries for the imputation function: mgcv, nnet
# you need the following libraries for the example below      : Amelia, EMV, mice


##############################
# One detailled illustration #
##############################


### Consider the following data-matrix
### Create complete Datamatrix without any missing value

n<-200                                 # Sample size 

var1 <- runif(n,0,10)                  # continous data      
var2 <- rpois(n,10)                    # count data  
var3 <- rbinom(n,1,0.6)                # discrete data (binary)              
var4 <- rbinom(n,4,0.5)                # discrete data (not binary)

mu    <- -4 + 5*log(var1) +(var2)^2/mean(var2)               
sigma <- 3                          

y <- rnorm(n, mu, sigma)               # normal distributed variable that depends (non-linear) on var1 and var2

# Set some data to NA
# Mechanism: MAR, as missing values are only in the covariates
# and do only depend on y

# We need a probablility of missingness to set some values of the data-matrix missing
probab <-  1-(1/(0.0015*(y-40)^2+1))         # missing -> var1
probab2 <- 1 - ((1/(0.015*(y)^2+1)))*((sign(y)-1)/-2) - (1/(0.0005*(y)^2+1))*((sign(y)+1)/2)  # missing -> var2
probab3  <- 1-(1/(1+exp(1-0.05*(y+5))))      # missing -> var3
probab4 <- 1-(1/(0.0015*(y-50)^2+1))         # missing -> var4

probab<-probab/2
probab2<-probab2/2
probab3<-probab3/2
probab4<-probab4/2

var1mis  <- var1
var2mis  <- var2
var3mis  <- var3
var4mis  <- var4

# Matrix with missing values in a MAR-setting
for(k in 1:n){var1mis[k] <- sample(c(var1[k],NA),1,prob=c(1-probab[k],probab[k]))}
for(k in 1:n){var2mis[k] <- sample(c(var2[k],NA),1,prob=c(1-probab2[k],probab2[k]))}
for(k in 1:n){var3mis[k] <- sample(c(var3[k],NA),1,prob=c(1-probab3[k],probab3[k]))}
for(k in 1:n){var4mis[k] <- sample(c(var4[k],NA),1,prob=c(1-probab4[k],probab4[k]))}

                  
myexample <- cbind(y,var1mis,var2mis,var3mis,var4mis) # data-matrix with missing values in MAR setting
original <- data.frame(y,var1,var2,var3,var4)         # original data without missing values        


# Lets have a look at the MAR missingness process
par(mfrow=c(2,2))
plot(y, probab, ylab="Probability of missingness (Variable 1)",ylim=c(0,0.5))
plot(y, probab2, ylab="Probability of missingness (Variable 2)",ylim=c(0,0.5))
plot(y, probab3, ylab="Probability of missingness (Variable 3)",ylim=c(0,0.5))
plot(y, probab4, ylab="Probability of missingness (Variable 4)",ylim=c(0,0.5))



### these are possible imputation methods that exist in R
library(Amelia)
library(EMV)
library(mice)

imp1 <- amelia(myexample,m=1,noms=c(4,5))
imp2 <- knn(as.matrix(myexample))
imp3 <- complete(mice(myexample,m=1,imputationMethod=c("pmm","pmm","pmm","logreg","pmm")))
imp4 <- imputation(myexample, method="meanimp")
imp5 <- imputation(myexample, method="medianimp")
imp6 <- imputation(myexample, method="general.regression",count.data=c(3),binom.data=c(4),multinom.data=c(5))
imp7 <- imputation(myexample, method="gam", count.data=c(3),binom.data=c(4),multinom.data=c(5))


### check the (not-standardized) squared imputation error 
error1 <- sum((original-imp1$m1)^2)
error2 <- sum((original-imp2$data)^2)
error3 <- sum((original-imp3)^2)                              
error4 <- sum((original-imp4)^2)      
error5 <- sum((original-imp5)^2)
error6 <- sum((original-imp6)^2)
error7 <- sum((original-imp7)^2)

error1
error2
error3
error4
error5
error6
error7








