# Quantifying abdominal pigmentaiton
# Alexander W. Shingleton
# Last updated: October 26th, 2016

#This script can be used to estimate the number of repeat images that need to be taken at the begining of each session to effectivly control for session effects.
#The script uses repeated measurements of the same tergites over five sessions to estimate variance due to session effects, differences between indivduals tergites and residual measurement error.

require(lme4)
require(data.table)
dt<-Representative.Results

    
sim.loop<-function(data,trait,sample_size,overlap,no_sess){
    dt<-data
    trait <- deparse(substitute(trait))
    #First we calculate the variance due to session effects, differences between tergites and measurement error
    sds<-as.data.frame(VarCorr(lmer(dt[,trait]~(1|session)+(1|id), data=dt,REML=T)))
    #We then use these values as the parameters for our simulation.
    mean<-mean(dt[,trait])
    sd_session<-sds[2,5]
    sd_ind<-sds[1,5]
    sd_meas<-sds[3,5]
    sample_size<-sample_size
    overlap<-overlap # This is the number of specimens that are repeat-measured from one session to the next
    no_sess<-no_sess # This is the number of sessions
    # We now simulate a sample of specimens measured without measurement error or session effects.
    session1<-as.data.frame(rnorm(sample_size,mean,sd_ind))
    names(session1)<-c("trait")
    # We then add both variation due to session effects and measurement error.
    session1a<-session1+rnorm(sample_size,0,sd_meas)+rnorm(1,0,sd_session)
    session1a$session<-rep("session1",length(session1a))
    # We then select the specimens that will be repeat measured in the next session.
    repeatmeas1<-tail(session1, overlap)
    # We then generate a second session of data, again measured without measurement error or session effects.
    session2<-as.data.frame(rnorm(sample_size-overlap,mean,sd_ind))
    names(session2)<-c("trait")
    # We then add to this second session the data from the specimens that are repeat-measured, without measurement error or session effects. 
    session2<-rbind(repeatmeas1,session2)
    # We again add both variation due to session effects and measurement error.
    session2a<-session2+rnorm(sample_size,0,sd_meas)+rnorm(1,0,sd_session)
    session2a$session<-rep("session2",length(session2a))
    # This process is repeated to generate five sessions of data.
    repeatmeas2<-tail(session2, overlap)
    session3<-as.data.frame(rnorm(sample_size-overlap,mean,sd_ind))
    names(session3)<-c("trait")
    session3<-rbind(repeatmeas2,session3)
    session3a<-session3+rnorm(sample_size,0,sd_meas)+rnorm(1,0,sd_session)
    session3a$session<-rep("session3",length(session3a))
    repeatmeas3<-tail(session3, overlap)
    session4<-as.data.frame(rnorm(sample_size-overlap,mean,sd_ind))
    names(session4)<-c("trait")
    session4<-rbind(repeatmeas3,session4)
    session4a<-session4+rnorm(sample_size,0,sd_meas)+rnorm(1,0,sd_session)
    session4a$session<-rep("session4",length(session4a))
    repeatmeas4<-tail(session4, overlap)
    session5<-as.data.frame(rnorm(sample_size-overlap,mean,sd_ind))
    names(session5)<-c("trait")
    session5<-rbind(repeatmeas4,session5)
    session5a<-session5+rnorm(sample_size,0,sd_meas)+rnorm(1,0,sd_session)
    session5a$session<-rep("session5",length(session5a))
    simdata<-rbind(session1a,session2a,session3a,session4a,session5a)
    # We now have a complete simulated data set, which we can analyze.
    # First we subset the data by session.
    session1<-subset(simdata,session=="session1")
    session2<-subset(simdata,session=="session2")
    session3<-subset(simdata,session=="session3")
    session4<-subset(simdata,session=="session4")
    session5<-subset(simdata,session=="session5")
    # We then select the repeat-measured specimens from session 1 and session 2 and calculate their mean difference.
    session1tail<-tail(session1,overlap)
    session2head<-head(session2,overlap)
    mdiff<-mean(session1tail$trait-session2head$trait)
    # This mean difference is used to adjust the data from session 2.
    session2$trait<-session2$trait + mdiff
    # This process is repeated for the remaining sessions.
    session2tail<-tail(session2,overlap)
    session3head<-head(session3,overlap)
    mdiff<-mean(session2tail$trait-session3head$trait)
    session3$trait<-session3$trait + mdiff
    session3tail<-tail(session3,overlap)
    session4head<-head(session4,overlap)
    mdiff<-mean(session3tail$trait-session4head$trait)
    session4$trait<-session4$trait + mdiff
    session4tail<-tail(session4,overlap)
    session5head<-head(session5,overlap)
    mdiff<-mean(session4tail$trait-session5head$trait)
    session5$trait<-session5$trait + mdiff
    simdata.adj<-rbind(session1,session2,session3,session4,session5)
    names(simdata.adj)<-c("trait.adj")
    # We now have our corrected data, which we add back to our simulated data set.
    simdata<-cbind(simdata,simdata.adj$trait.adj)
    names(simdata)<-c("trait","session","trait.adj")
    # We now test whether session effects are eliminated after the correction.
    # Because lmer models need to have random factors, we add a random dummy variable to allow lmer comparison
    simdata$dummy <- sample(LETTERS[1:2],nrow(simdata), replace=TRUE )
    raw<-lmer(trait~(1|dummy),data=simdata,REML=F)
    raw2<-lmer(trait~(1|dummy)+(1|session), data=simdata, REML=F)
    adj<-lmer(trait.adj~(1|dummy),data=simdata,REML=F)
    adj2<-lmer(trait.adj~(1|dummy)+(1|session), data=simdata,REML=F)
    return(list(anova(raw,raw2),anova(adj,adj2)))
}

sim.loop(dt,Pmax,50,15,5)

# replicate same iteration, many times

require(plyr)
P.values<-replicate(100,sim.loop(dt,Pmax,50,20,5)[[2]]$`Pr(>Chisq)`[2])
hist(P.values)
sum(P.values<0.05)

# loop through different overlap sizes
ldply(1:50,function(x) mean(replicate(100,sim.loop(dt,100,x,5)[[2]]$`Pr(>F)`[1])))