### This script was used to generate all of the plots and tables for the manuscript.
# It contains many time-course related functions. The script can be run in the current form
# by downloading our HTseq files which are contained in a compressed TAR file from:
# https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE85595&format=file
# Place this TAR file into a folder with this R script. Then, run the script.

# The description of our data is here:
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85595

## Install Bioconductor and DESeq2 if you haven't already. To do this, remove the # characters
# from the following two lines:
#source("https://bioconductor.org/biocLite.R")
#biocLite("DESeq2")

# load the DESeq2 library:
library("DESeq2")

########################  TIME COURSE FUNCTIONS  ######################## 

# A function to input and merge a time course from files that contain raw sequencing count data, such as
# those generated by HTseq.  The files must be placed into the same folder
# and have the same base name.  Also, they must be named so that they are alphabetical by time.
# The first argument, base.name, is a string designating the base file names of the HT-SEQ files.
# The times.vector includes the ordered times as strings, to be used as column headings in the resulting data.frame.
# The function returns a table of two data.frames: the first is the number of counts for each gene/feature and the second is
# the number of counts in other categories (e.g., unmapped).
merge.time.course <- function(base.name,times.vector) {
  
  # create a list of file of names
  filenames <- list.files(pattern=base.name, full.names=F)
  cat("Obtaining and merging",length(filenames),"files\n")
  
  for (i in 1:length(filenames)) {
    
    cat(filenames[i],"\n")
    input.table <- read.table(file=filenames[i],sep="\t",stringsAsFactors=FALSE,
                              strip.white=TRUE, fill=TRUE,header=FALSE)
    colnames(input.table) <- c("gene",paste(times[i],base.name,sep="_"))
    if (i==1) {
      merged.table <- input.table
    }
    else {
      merged.table <- merge(merged.table,input.table,by="gene")
    }
  }
  table.info <- merged.table[grepl("^__",merged.table$gene),]
  merged.table <- merged.table[!grepl("^__",merged.table$gene),]
  return(list(merged.table,table.info))
}


# A function to normalize a time series. The first argument is the time course data.frame with gene names in first column.
# The second option is to remove any genes/features with a NA value at any of the times.
# The third option is to remove any genes/features that have 0 at all time points.
# The fourth option will normalize each sample (column) so that it has the same total number of reads.
# The default is to normalize to the sample (column) with the least number of total reads.
# The fifth option is to remove genes/features that are listed in the specific vector.
normalize.time.course <- function(time.course, remove.NA = TRUE, remove.zero = TRUE,
                                  normalize = "min", remove.genes = c("")) {
  genes.a <- nrow(time.course)
  genes.b <- 0
  cat("Normalizing time course. Started with",genes.a,"genes\n")
  
  if (remove.NA==TRUE) {
    time.course <- na.omit(time.course)
    genes.b <- nrow(time.course)
    genes.removed <- genes.a-genes.b
    cat("Removed",genes.removed,"genes with NAs\n")
    genes.a <- genes.b
  }
  
  if (remove.zero==TRUE) {
    gene.sum <- apply(time.course[,-1],1,sum)
    time.course <- time.course[gene.sum>0,]
    genes.b <- nrow(time.course)
    genes.removed <- genes.a-genes.b
    cat("Removed",genes.removed,"genes with all 0's\n")
    genes.a <- genes.b
  }
  if (normalize=="min") {
    total.counts <- apply(time.course[,-1],2,sum)
    time.course[,-1] <- sweep(time.course[,-1],2,total.counts,'/')
    time.course[,-1] <- time.course[,-1] * min(total.counts)
    cat("Normalized to the column with the lowest sum\n")
  }
  else if (normalize=="first") {
    total.counts <- apply(time.course[,-1],2,sum)
    time.course[,-1] <- sweep(time.course[,-1],2,total.counts,'/')
    time.course[,-1] <- time.course[,-1] * total.counts[1]
    cat("Normalized to the first column\n")
  }
  if (remove.genes[1]!="") {
    time.course <- time.course[!grepl(paste(remove.genes,collapse="|"),time.course[,1]),]
    genes.b <- nrow(time.course)
    genes.removed <- genes.a-genes.b
    cat("Removed",genes.removed,"genes in the designated list\n")
    genes.a <- genes.b
  }
  cat(genes.a,"genes were retained\n")
  return(time.course)
}


# A function to create a graph that will aid in determining a "floor", which is defined as the minimum reliable count.
# The graph shows the median count for a gene over the time course versus the normalized varibility for the gene
# across the time course.  Genes with low median counts are found to have very high variability. This
# variability decreases significantly as the median increases. We have found a floor of 10-20 has been reliable
# for our data.
# The first argument is the time course data.frame with gene names in the first column. The counts should have been
# total count normalized.
# The second argument is the upper limit on the x axis of the graph.
# The third argument is the upper limit on the y-axis of the graph.
# This function does not return any value, but produces a graph.
floor.plot <- function(time.course, upper.x=100,upper.y=10) {
  gene.median <- apply(time.course[,-1],1,median)
  gene.sd <- apply(time.course[,-1],1,sd)
  gene.norm.sd <- gene.sd/gene.median
  plot(gene.median,gene.norm.sd,xlab="Median counts",ylab="Normalized St. Dev.",
       xlim=c(0,upper.x),ylim=c(0,upper.y),cex.lab=1.2,cex.axis=1.2)
}


# A function that floors the time course. All count values below the floor are set to the floor value.
# The first argument is the time course data.frame with gene names in the first column.
# The second argument is the floor value, which is usually determined from the floor.plot function.
# This function returns the time course data.frame with all values less than the floor set to the floor value.
floor.time.course <- function(time.course, floor=10) {
  time.course[,-1][time.course[,-1]<floor] <- floor
  return(time.course)
}


# A function that only retains genes with a minimum fold change between the minimum
# and maximum count values.
# The first argument is the time course data.frame with gene names in the first column.
# The second argumnent is the minmum fold change required for a gene to be retained.
# The function returns the same data.frame including only genes with the minimum fold-change.
fold.change <- function(time.course, min.FC=2) {
  genes.a <- nrow(time.course)
  cat("Started with",genes.a,"genes\n")
  FC <- apply(time.course[,-1],1,function(x) max(x)/min(x))
  time.course <- time.course[FC>=min.FC,]
  genes.b <- nrow(time.course)
  cat("Removed",genes.a-genes.b,"genes\n")
  cat("Kept",genes.b,"genes\n")
  return(time.course)
}


# A function that normalizes the fold-changes and log transforms the data.
# The first argument is a data.frame with genes in the first column and count data in subsequent columns.
# The second argument is the log base that is used. The default is 2.
# The third argument indicates which column is used to normalize. The default is the first column of count data.
log.transform <- function(time.course,log.base=2,norm.column=1) {
  time.course[,-1] <- log(time.course[,-1]/time.course[,(norm.column+1)],base=log.base)
  return(time.course)
}


# A function that determines the largest fold change for each gene.
# The first argument is the time course data.frame with gene names in the first column.
# The expression values are log transformed with value of 0 at time 0.
# The function returns a vector of the largest log2 change, positive or negative.
max.change <- function(time.course) {
  peak.FC <- rep(0,times=nrow(time.course))
  for (i in 1:nrow(time.course)) {
    max.FC <- max(time.course[i,-1])
    min.FC <- min(time.course[i,-1])
    if (max.FC > abs(min.FC)) {
      peak.FC[i] <- max.FC
    }
    else {
      peak.FC[i] <- min.FC
    }
  }
  df <- data.frame(time.course[,1],peak.FC)
  colnames(df)[1] <- "gene"
  return(df)
}


# A function to plot the expression of one gene vs. time.  A time course can be entered for multiple samples.
# The first argument, gene, is the gene name.
# The second argument, time, is a numerical vector of times
# expression is the vector of expression values, in order by time; each time course grouped by sample
# samples is the vector of sample names
plot.gene <- function(gene,time,expression,samples) {
  
  x <- rep(time,length(samples))
  y <- as.numeric(expression)
  
  par(cex=1.3)
  plot(x,y,xlab="time",ylab="Relative Expression",main=gene,type="n")
  colors <- rainbow(length(samples))
  par(cex=1)
  if (y[length(time)]<=y[1]) {
    legend("topright",samples, col=colors, pch=20, lty=1,bty="n")
  }
  else {
    legend("bottomright",samples, col=colors, pch=20, lty=1,bty="n")
  }
  
  par(cex=1.3)
  for (i in 1:length(samples)) {
    lines(time,y[(1:length(time))+(i-1)*length(time)],type="o",col=colors[i],pch=20)
  }
}


########################  MAIN SCRIPT  ########################

# De-compress the TAR file made up of multiple HTseq files.
untar("GSE85595_RAW.tar")

# Designate the times of the time course.  These will be used as column headers for the data.frame containing
# the count values for each gene.
times <- c("min0","min5","min10","min30","min60","min120","min180","min240")

# A vector containing gene names to be removed. The 24 S. cerevisiae PAU genes are almost identical in
# sequence, so it is difficult to differentiate them with traditional sequence mapping methods,
# such as TopHat, Bowtoe, or BWA.
PAU <- c("YJL223C","YGL261C","YGR294W","YHL046C","YIL176C","YDR542W","YIR041W",
          "YKL224C","YLL025W","YLL064C","YMR325W","YEL049W","YOL161C","YOR394W",
          "YPL282C","YLR037C","YBR301W","YCR104W","YLR461W","YFL020C","YNR076W",
          "YAR020C","YAL068C","YBL108C-A")

# Import a set of HTseq files, all of which contain the same pattern in their name (e.g., SW_1005), into R
# using the merge.time.couse function defined above. The function places the count data for genes into one
# dataframe. It places the number of reads in other categories (e.g., unmapped) into a second data frame.
# The output of this function is a list containing these two dataframes.
WT1 <- merge.time.course("SW_1005",times)  # first repeat
WT1[[2]]  # display the number of reads in other categories (this particular set of files has these removed)
WT1 <- WT1[[1]]   # place the first dataframe from this list, containing the reads counts, into WT1
WT2 <- merge.time.course("NH_31",times)  # second repeat
WT2[[2]]   # display the number of reads in other categories
WT2 <- WT2[[1]]   # place the first dataframe from this list, containing the reads counts, into WT2
WT3 <- merge.time.course("NH_42",times)   # third repeat
WT3[[2]]  # display the number of reads in other categories
WT3 <- WT3[[1]]   # place the first dataframe from this list, containing the reads counts, into WT3


# Use DESeq2 to identify the oxygen-regulated genes. Here, each time point (e.g., 5 min) was compared
# to time 0 (i.e., aerobic). Thus, seven tests were carried out.  A data frame was created for all
# adjusted p values and the minimum p value for each gene.

# merge the replicates and change into matrix
WT.matrix <- Reduce(function(x, y) merge(x, y, all=TRUE), list(WT1, WT2, WT3))
rownames(WT.matrix) <- WT.matrix[,1]
WT.matrix <- WT.matrix[,-1]
WT.matrix <- as.matrix(WT.matrix)

# define conditions for DESeq2
condition <- rep(c("aerobic","hypoxic"),3)
coldata <- data.frame(condition)

#rownames(coldata) <- colnames(WT[,c(1,2,9,10,17,18)])

# create a data.frame to hold the adjusted p values
DE.padj <- data.frame(gene=row.names(WT.matrix))

for (i in 2:8) {
  dds.oxygen.regulated <- DESeqDataSetFromMatrix(countData = WT.matrix[,c(1,i,9,i+8,17,i+16)], colData = coldata, 
                                                 design = ~ condition)
  LRT.oxygen.regulated <- DESeq(dds.oxygen.regulated, test = "LRT", reduced = ~ 1)
  LRT.oxygen.regulated.res <- results(LRT.oxygen.regulated)
  LRT.oxygen.regulated.res$gene <- row.names(LRT.oxygen.regulated.res)
  DE.padj <- merge(DE.padj,as.data.frame(LRT.oxygen.regulated.res[,6:7],by="gene"))
  colnames(DE.padj)[i] <- paste(times[i],"padj",sep=".")
}

DE.padj[is.na(DE.padj)] <- 1
DE.padj$min.padj <- apply(DE.padj[,2:8],1,min)


# Normalize and log2 transform each of the replicates. Then, merge them into one data.frame,
# in order to perform PCA analysis, to determine fold changes, and to plot expression.

WT1 <- normalize.time.course(WT1,remove.NA=TRUE, remove.zero=TRUE, 
                             normalize="min", remove.genes=PAU)
WT2 <- normalize.time.course(WT2,remove.NA=TRUE, remove.zero=TRUE, 
                             normalize="min", remove.genes=PAU)
WT3 <- normalize.time.course(WT3,remove.NA=TRUE, remove.zero=TRUE, 
                             normalize="min", remove.genes=PAU)
pdf("floor_plots.pdf")
floor.plot(WT1)
text(90,9,"WT1 floor plot",col="blue")
floor.plot(WT2)
text(90,9,"WT2 floor plot",col="blue")
floor.plot(WT3)
text(90,9,"WT3 floor plot",col="blue")
dev.off()

WT1 <- floor.time.course(WT1)
WT2 <- floor.time.course(WT2)
WT3 <- floor.time.course(WT3)

WT1 <- log.transform(WT1)
WT2 <- log.transform(WT2)
WT3 <- log.transform(WT3)

WT.log <- Reduce(function(x, y) merge(x, y, all=TRUE), list(WT1, WT2, WT3))
WT.log <- na.omit(WT.log)


### PCA ###
hyp.pc <- prcomp(t(WT.log[,2:25]))
percent.variance <- 100 * hyp.pc$sdev^2 / sum(hyp.pc$sdev^2)
pdf("PCA_plots.pdf")
par(cex = 1.3)
plot(1:length(percent.variance),percent.variance,pch=20,main="Screeplot (in percentages)",
     xlab="Principal component",ylab="Percent of variance")

# plot the first replicate
plot(hyp.pc$x[1:8,1],hyp.pc$x[1:8,2],xlab="PC1",ylab="PC2", type = "b", xlim = c(-60, 50), ylim = c(-40, 35), pch = 16)
# plot the second replicate
lines(hyp.pc$x[9:16,1],hyp.pc$x[9:16,2], type = "b", col = "red", pch = 16)
# plot the third replicate
lines(hyp.pc$x[17:24,1],hyp.pc$x[17:24,2], type = "b", col = "blue", pch = 16)
dev.off()


## Average the three replicate time courses and determine the log2 max fold-change that
# occurs during the average time course
WT.log.mean <- Reduce("+",list(WT.log[,2:9],WT.log[,10:17],WT.log[,18:25]))/3
WT.log.mean$gene <- WT.log$gene
WT.log.mean <- WT.log.mean[,c(9,1:8)]
log2.max.FC <-  max.change(WT.log.mean)
colnames(log2.max.FC)[2] <- "log2.max.FC"
WT.log.mean <- merge(WT.log.mean,log2.max.FC,by="gene")

# Merge the p values and the log2 max fold-change and save Supp Table 1
all.genes <- merge(DE.padj,WT.log.mean,by="gene",all=T)
write.table(all.genes[,c(1:9,18)], file="Supp_Table1.txt",sep="\t",row.names=F,col.names=T)


# Save the genes that meet the thresholds:  genes must have at least 4-fold change up or
# down (-2 or +2 on log2 scale) and minimum adjusted p <= 0.05.
# The resulting table was used for gene expression clustering (Cluster 3.0 program) and a
# subsequent heatmap (program TreeView).
# The gene names in the table were used in GO enrichment analysis with this tool:
# http://www.yeastgenome.org/cgi-bin/GO/goSlimMapper.pl
oxygen.regulated <- all.genes[,c(1,9,18)]
oxygen.regulated <- subset(oxygen.regulated, min.padj <= 0.05 & abs(log2.max.FC) >= 2)
oxygen.regulated <- merge(WT.log,oxygen.regulated,by="gene")
oxygen.regulated <- oxygen.regulated[,c(-26,-27)]
write.table(oxygen.regulated, file="oxygen_regulated.txt",sep="\t",row.names=F,col.names=T)


# Plot gene expression for certain genes -- all replicates on one graph
time.nums <- c(0,5,10,30,60,120,180,240)
samples = c("WT replicate 1","WT replicate 2","WT replicate 3")
pdf("gene_expression_plots.pdf")
plot.gene(gene="CYC1",time=time.nums,expression=WT.log[WT.log$gene=="YJR048W",-1],samples=samples)
plot.gene(gene="ERG5",time=time.nums,expression=WT.log[WT.log$gene=="YMR015C",-1],samples=samples)
plot.gene(gene="DAN1",time=time.nums,expression=WT.log[WT.log$gene=="YJR150C",-1],samples=samples)
plot.gene(gene="NCE103",time=time.nums,expression=WT.log[WT.log$gene=="YNL036W",-1],samples=samples)
dev.off()