### This script was used to generate all of the plots and tables for the manuscript. # It contains many time-course related functions. The script can be run in the current form # by downloading our HTseq files which are contained in a compressed TAR file from: # https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE85595&format=file # Place this TAR file into a folder with this R script. Then, run the script. # The description of our data is here: # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85595 ## Install Bioconductor and DESeq2 if you haven't already. To do this, remove the # characters # from the following two lines: #source("https://bioconductor.org/biocLite.R") #biocLite("DESeq2") # load the DESeq2 library: library("DESeq2") ######################## TIME COURSE FUNCTIONS ######################## # A function to input and merge a time course from files that contain raw sequencing count data, such as # those generated by HTseq. The files must be placed into the same folder # and have the same base name. Also, they must be named so that they are alphabetical by time. # The first argument, base.name, is a string designating the base file names of the HT-SEQ files. # The times.vector includes the ordered times as strings, to be used as column headings in the resulting data.frame. # The function returns a table of two data.frames: the first is the number of counts for each gene/feature and the second is # the number of counts in other categories (e.g., unmapped). merge.time.course <- function(base.name,times.vector) { # create a list of file of names filenames <- list.files(pattern=base.name, full.names=F) cat("Obtaining and merging",length(filenames),"files\n") for (i in 1:length(filenames)) { cat(filenames[i],"\n") input.table <- read.table(file=filenames[i],sep="\t",stringsAsFactors=FALSE, strip.white=TRUE, fill=TRUE,header=FALSE) colnames(input.table) <- c("gene",paste(times[i],base.name,sep="_")) if (i==1) { merged.table <- input.table } else { merged.table <- merge(merged.table,input.table,by="gene") } } table.info <- merged.table[grepl("^__",merged.table$gene),] merged.table <- merged.table[!grepl("^__",merged.table$gene),] return(list(merged.table,table.info)) } # A function to normalize a time series. The first argument is the time course data.frame with gene names in first column. # The second option is to remove any genes/features with a NA value at any of the times. # The third option is to remove any genes/features that have 0 at all time points. # The fourth option will normalize each sample (column) so that it has the same total number of reads. # The default is to normalize to the sample (column) with the least number of total reads. # The fifth option is to remove genes/features that are listed in the specific vector. normalize.time.course <- function(time.course, remove.NA = TRUE, remove.zero = TRUE, normalize = "min", remove.genes = c("")) { genes.a <- nrow(time.course) genes.b <- 0 cat("Normalizing time course. Started with",genes.a,"genes\n") if (remove.NA==TRUE) { time.course <- na.omit(time.course) genes.b <- nrow(time.course) genes.removed <- genes.a-genes.b cat("Removed",genes.removed,"genes with NAs\n") genes.a <- genes.b } if (remove.zero==TRUE) { gene.sum <- apply(time.course[,-1],1,sum) time.course <- time.course[gene.sum>0,] genes.b <- nrow(time.course) genes.removed <- genes.a-genes.b cat("Removed",genes.removed,"genes with all 0's\n") genes.a <- genes.b } if (normalize=="min") { total.counts <- apply(time.course[,-1],2,sum) time.course[,-1] <- sweep(time.course[,-1],2,total.counts,'/') time.course[,-1] <- time.course[,-1] * min(total.counts) cat("Normalized to the column with the lowest sum\n") } else if (normalize=="first") { total.counts <- apply(time.course[,-1],2,sum) time.course[,-1] <- sweep(time.course[,-1],2,total.counts,'/') time.course[,-1] <- time.course[,-1] * total.counts[1] cat("Normalized to the first column\n") } if (remove.genes[1]!="") { time.course <- time.course[!grepl(paste(remove.genes,collapse="|"),time.course[,1]),] genes.b <- nrow(time.course) genes.removed <- genes.a-genes.b cat("Removed",genes.removed,"genes in the designated list\n") genes.a <- genes.b } cat(genes.a,"genes were retained\n") return(time.course) } # A function to create a graph that will aid in determining a "floor", which is defined as the minimum reliable count. # The graph shows the median count for a gene over the time course versus the normalized varibility for the gene # across the time course. Genes with low median counts are found to have very high variability. This # variability decreases significantly as the median increases. We have found a floor of 10-20 has been reliable # for our data. # The first argument is the time course data.frame with gene names in the first column. The counts should have been # total count normalized. # The second argument is the upper limit on the x axis of the graph. # The third argument is the upper limit on the y-axis of the graph. # This function does not return any value, but produces a graph. floor.plot <- function(time.course, upper.x=100,upper.y=10) { gene.median <- apply(time.course[,-1],1,median) gene.sd <- apply(time.course[,-1],1,sd) gene.norm.sd <- gene.sd/gene.median plot(gene.median,gene.norm.sd,xlab="Median counts",ylab="Normalized St. Dev.", xlim=c(0,upper.x),ylim=c(0,upper.y),cex.lab=1.2,cex.axis=1.2) } # A function that floors the time course. All count values below the floor are set to the floor value. # The first argument is the time course data.frame with gene names in the first column. # The second argument is the floor value, which is usually determined from the floor.plot function. # This function returns the time course data.frame with all values less than the floor set to the floor value. floor.time.course <- function(time.course, floor=10) { time.course[,-1][time.course[,-1]=min.FC,] genes.b <- nrow(time.course) cat("Removed",genes.a-genes.b,"genes\n") cat("Kept",genes.b,"genes\n") return(time.course) } # A function that normalizes the fold-changes and log transforms the data. # The first argument is a data.frame with genes in the first column and count data in subsequent columns. # The second argument is the log base that is used. The default is 2. # The third argument indicates which column is used to normalize. The default is the first column of count data. log.transform <- function(time.course,log.base=2,norm.column=1) { time.course[,-1] <- log(time.course[,-1]/time.course[,(norm.column+1)],base=log.base) return(time.course) } # A function that determines the largest fold change for each gene. # The first argument is the time course data.frame with gene names in the first column. # The expression values are log transformed with value of 0 at time 0. # The function returns a vector of the largest log2 change, positive or negative. max.change <- function(time.course) { peak.FC <- rep(0,times=nrow(time.course)) for (i in 1:nrow(time.course)) { max.FC <- max(time.course[i,-1]) min.FC <- min(time.course[i,-1]) if (max.FC > abs(min.FC)) { peak.FC[i] <- max.FC } else { peak.FC[i] <- min.FC } } df <- data.frame(time.course[,1],peak.FC) colnames(df)[1] <- "gene" return(df) } # A function to plot the expression of one gene vs. time. A time course can be entered for multiple samples. # The first argument, gene, is the gene name. # The second argument, time, is a numerical vector of times # expression is the vector of expression values, in order by time; each time course grouped by sample # samples is the vector of sample names plot.gene <- function(gene,time,expression,samples) { x <- rep(time,length(samples)) y <- as.numeric(expression) par(cex=1.3) plot(x,y,xlab="time",ylab="Relative Expression",main=gene,type="n") colors <- rainbow(length(samples)) par(cex=1) if (y[length(time)]<=y[1]) { legend("topright",samples, col=colors, pch=20, lty=1,bty="n") } else { legend("bottomright",samples, col=colors, pch=20, lty=1,bty="n") } par(cex=1.3) for (i in 1:length(samples)) { lines(time,y[(1:length(time))+(i-1)*length(time)],type="o",col=colors[i],pch=20) } } ######################## MAIN SCRIPT ######################## # De-compress the TAR file made up of multiple HTseq files. untar("GSE85595_RAW.tar") # Designate the times of the time course. These will be used as column headers for the data.frame containing # the count values for each gene. times <- c("min0","min5","min10","min30","min60","min120","min180","min240") # A vector containing gene names to be removed. The 24 S. cerevisiae PAU genes are almost identical in # sequence, so it is difficult to differentiate them with traditional sequence mapping methods, # such as TopHat, Bowtoe, or BWA. PAU <- c("YJL223C","YGL261C","YGR294W","YHL046C","YIL176C","YDR542W","YIR041W", "YKL224C","YLL025W","YLL064C","YMR325W","YEL049W","YOL161C","YOR394W", "YPL282C","YLR037C","YBR301W","YCR104W","YLR461W","YFL020C","YNR076W", "YAR020C","YAL068C","YBL108C-A") # Import a set of HTseq files, all of which contain the same pattern in their name (e.g., SW_1005), into R # using the merge.time.couse function defined above. The function places the count data for genes into one # dataframe. It places the number of reads in other categories (e.g., unmapped) into a second data frame. # The output of this function is a list containing these two dataframes. WT1 <- merge.time.course("SW_1005",times) # first repeat WT1[[2]] # display the number of reads in other categories (this particular set of files has these removed) WT1 <- WT1[[1]] # place the first dataframe from this list, containing the reads counts, into WT1 WT2 <- merge.time.course("NH_31",times) # second repeat WT2[[2]] # display the number of reads in other categories WT2 <- WT2[[1]] # place the first dataframe from this list, containing the reads counts, into WT2 WT3 <- merge.time.course("NH_42",times) # third repeat WT3[[2]] # display the number of reads in other categories WT3 <- WT3[[1]] # place the first dataframe from this list, containing the reads counts, into WT3 # Use DESeq2 to identify the oxygen-regulated genes. Here, each time point (e.g., 5 min) was compared # to time 0 (i.e., aerobic). Thus, seven tests were carried out. A data frame was created for all # adjusted p values and the minimum p value for each gene. # merge the replicates and change into matrix WT.matrix <- Reduce(function(x, y) merge(x, y, all=TRUE), list(WT1, WT2, WT3)) rownames(WT.matrix) <- WT.matrix[,1] WT.matrix <- WT.matrix[,-1] WT.matrix <- as.matrix(WT.matrix) # define conditions for DESeq2 condition <- rep(c("aerobic","hypoxic"),3) coldata <- data.frame(condition) #rownames(coldata) <- colnames(WT[,c(1,2,9,10,17,18)]) # create a data.frame to hold the adjusted p values DE.padj <- data.frame(gene=row.names(WT.matrix)) for (i in 2:8) { dds.oxygen.regulated <- DESeqDataSetFromMatrix(countData = WT.matrix[,c(1,i,9,i+8,17,i+16)], colData = coldata, design = ~ condition) LRT.oxygen.regulated <- DESeq(dds.oxygen.regulated, test = "LRT", reduced = ~ 1) LRT.oxygen.regulated.res <- results(LRT.oxygen.regulated) LRT.oxygen.regulated.res$gene <- row.names(LRT.oxygen.regulated.res) DE.padj <- merge(DE.padj,as.data.frame(LRT.oxygen.regulated.res[,6:7],by="gene")) colnames(DE.padj)[i] <- paste(times[i],"padj",sep=".") } DE.padj[is.na(DE.padj)] <- 1 DE.padj$min.padj <- apply(DE.padj[,2:8],1,min) # Normalize and log2 transform each of the replicates. Then, merge them into one data.frame, # in order to perform PCA analysis, to determine fold changes, and to plot expression. WT1 <- normalize.time.course(WT1,remove.NA=TRUE, remove.zero=TRUE, normalize="min", remove.genes=PAU) WT2 <- normalize.time.course(WT2,remove.NA=TRUE, remove.zero=TRUE, normalize="min", remove.genes=PAU) WT3 <- normalize.time.course(WT3,remove.NA=TRUE, remove.zero=TRUE, normalize="min", remove.genes=PAU) pdf("floor_plots.pdf") floor.plot(WT1) text(90,9,"WT1 floor plot",col="blue") floor.plot(WT2) text(90,9,"WT2 floor plot",col="blue") floor.plot(WT3) text(90,9,"WT3 floor plot",col="blue") dev.off() WT1 <- floor.time.course(WT1) WT2 <- floor.time.course(WT2) WT3 <- floor.time.course(WT3) WT1 <- log.transform(WT1) WT2 <- log.transform(WT2) WT3 <- log.transform(WT3) WT.log <- Reduce(function(x, y) merge(x, y, all=TRUE), list(WT1, WT2, WT3)) WT.log <- na.omit(WT.log) ### PCA ### hyp.pc <- prcomp(t(WT.log[,2:25])) percent.variance <- 100 * hyp.pc$sdev^2 / sum(hyp.pc$sdev^2) pdf("PCA_plots.pdf") par(cex = 1.3) plot(1:length(percent.variance),percent.variance,pch=20,main="Screeplot (in percentages)", xlab="Principal component",ylab="Percent of variance") # plot the first replicate plot(hyp.pc$x[1:8,1],hyp.pc$x[1:8,2],xlab="PC1",ylab="PC2", type = "b", xlim = c(-60, 50), ylim = c(-40, 35), pch = 16) # plot the second replicate lines(hyp.pc$x[9:16,1],hyp.pc$x[9:16,2], type = "b", col = "red", pch = 16) # plot the third replicate lines(hyp.pc$x[17:24,1],hyp.pc$x[17:24,2], type = "b", col = "blue", pch = 16) dev.off() ## Average the three replicate time courses and determine the log2 max fold-change that # occurs during the average time course WT.log.mean <- Reduce("+",list(WT.log[,2:9],WT.log[,10:17],WT.log[,18:25]))/3 WT.log.mean$gene <- WT.log$gene WT.log.mean <- WT.log.mean[,c(9,1:8)] log2.max.FC <- max.change(WT.log.mean) colnames(log2.max.FC)[2] <- "log2.max.FC" WT.log.mean <- merge(WT.log.mean,log2.max.FC,by="gene") # Merge the p values and the log2 max fold-change and save Supp Table 1 all.genes <- merge(DE.padj,WT.log.mean,by="gene",all=T) write.table(all.genes[,c(1:9,18)], file="Supp_Table1.txt",sep="\t",row.names=F,col.names=T) # Save the genes that meet the thresholds: genes must have at least 4-fold change up or # down (-2 or +2 on log2 scale) and minimum adjusted p <= 0.05. # The resulting table was used for gene expression clustering (Cluster 3.0 program) and a # subsequent heatmap (program TreeView). # The gene names in the table were used in GO enrichment analysis with this tool: # http://www.yeastgenome.org/cgi-bin/GO/goSlimMapper.pl oxygen.regulated <- all.genes[,c(1,9,18)] oxygen.regulated <- subset(oxygen.regulated, min.padj <= 0.05 & abs(log2.max.FC) >= 2) oxygen.regulated <- merge(WT.log,oxygen.regulated,by="gene") oxygen.regulated <- oxygen.regulated[,c(-26,-27)] write.table(oxygen.regulated, file="oxygen_regulated.txt",sep="\t",row.names=F,col.names=T) # Plot gene expression for certain genes -- all replicates on one graph time.nums <- c(0,5,10,30,60,120,180,240) samples = c("WT replicate 1","WT replicate 2","WT replicate 3") pdf("gene_expression_plots.pdf") plot.gene(gene="CYC1",time=time.nums,expression=WT.log[WT.log$gene=="YJR048W",-1],samples=samples) plot.gene(gene="ERG5",time=time.nums,expression=WT.log[WT.log$gene=="YMR015C",-1],samples=samples) plot.gene(gene="DAN1",time=time.nums,expression=WT.log[WT.log$gene=="YJR150C",-1],samples=samples) plot.gene(gene="NCE103",time=time.nums,expression=WT.log[WT.log$gene=="YNL036W",-1],samples=samples) dev.off()