Bellow is a short example of how we analyzed some of the differentiation data. If you plan on performing RNAseq analysis yourself there are some great vignettes from DESEQ2 that are very nice to follow.
First we’ll set some settings for the figure size of R-Markdown
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8)
knitr::opts_knit$set(root.dir = "/home/jsmits/RNA_seq_KC_diff/")
If you have not installed the R packages listed below, remove the “#” symbols in front of the lines and run the code below them.
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("DESeq2")
#BiocManager::install("BiocParallel")
#BiocManager::install("vsn")
#BiocManager::install("ggplot2")
#BiocManager::install("biomaRt")
Load the data, change your working directory to a folder containing a folder called “counts” where all the ReadsPerGene.out.tab files generated by STAR are located.
setwd("/home/jsmits/RNA_seq_KC_diff/fastq")
ff <- list.files( path = "./counts", pattern = "*ReadsPerGene.out.tab$", full.names = TRUE )
counts.files <- lapply( ff, read.table, skip = 4 )
counts <- as.data.frame( sapply( counts.files, function(x) x[ , 2 ] ) )
ff <- gsub( "[.]ReadsPerGene[.]out[.]tab", "", ff )
ff <- gsub( "[.]/counts/", "", ff )
colnames(counts) <- ff
row.names(counts) <- counts.files[[1]]$V1
count.files <- NULL
Use BioMart to find HGNC gene names to the ENSEMBL gene identifiers. Then write the count table to the file:"counts_hgnc.txt" Alternatively you can also skip this step and proceed with the Ensemble gene ID’s.
# load libraries
library("biomaRt")
library("dplyr")
library("stringr")
genes <- as.vector(row.names(counts))
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl",host = "uswest.ensembl.org",ensemblRedirect = FALSE)
filters = listFilters(ensembl)
# get hgnc symbol of each ensembl id
G_list<- getBM(attributes=c('ensembl_gene_id', "hgnc_symbol"), filters = "ensembl_gene_id", values=genes, mart=ensembl)
# rearrange the table to get the hgnc symbol to the count table
counts$ensembl_gene_id <- rownames(counts)
rownames(counts) <- NULL
counts$ensembl_gene_id <- str_replace(counts$ensembl_gene_id, "\\.[0-9]*", "")
counts_hgnc <- merge(counts, G_list, by="ensembl_gene_id")
counts_hgnc$ensembl_gene_id <- NULL
counts_hgnc <- distinct(counts_hgnc)
counts_hgnc <- counts_hgnc[,1:ncol(counts_hgnc)] %>% group_by(hgnc_symbol) %>% summarise_all(list(sum))
counts_hgnc <- as.data.frame(counts_hgnc)
rownames(counts_hgnc) <- counts_hgnc$hgnc_symbol
counts_hgnc$hgnc_symbol <- NULL
write.table(counts_hgnc, file="counts_hgnc.txt", sep="\t")
Load the count data (skip previous steps if ran before)
library("biomaRt")
library("dplyr")
library("stringr")
counts_hgnc <- read.table(file = "counts_hgnc.txt", sep = "\t")
Load the sample data (with columns saying if the differentiation day, the cell line, if it is diseased or wt, and if it is paired end or single end sequenced.)
setwd("/home/jsmits/RNA_seq_KC_diff/")
sample_data<- read.csv("sample_data.csv",
header = TRUE)
rownames(sample_data) <- sample_data[,1]
All count information of normal KC differentiation from figure 1 (that was sequenced single end) and normal and mutant KC differentiation (that was sequenced double end) was included in the counttable % sample_data. Here we subselect only one datatype at a time to prevent sequencing biase.
library("tidyverse")
colnames(counts_hgnc) <- row.names(sample_data)
sample_data <- sample_data %>% filter(type == "paired-end")
row.names(sample_data) <- sample_data[,1]
sample_data[,1] <- NULL
counts_hgnc <- counts_hgnc[,colnames(counts_hgnc) %in% row.names(sample_data)]
counts_hgnc <- counts_hgnc[, rownames(sample_data)]# samples in readcount in the same order as the coldata file
all(rownames(sample_data) == colnames(counts_hgnc))#check if coldata and readcounts are in the same order
## [1] TRUE
all(rownames(sample_data) %in% colnames(counts_hgnc)) #check if all samples have a match in coldata
## [1] TRUE
all(rownames(sample_data) == colnames(counts_hgnc))#check if coldata and readcounts are in the same order
## [1] TRUE
Now we load the counts and the sample_data into DESEQ2 based on the day and the disease/wt.
#Deseq2 normalisation:
library("DESeq2")
library("BiocParallel")
register(SnowParam(4))
dds <- DESeqDataSetFromMatrix(countData = counts_hgnc,
colData = sample_data,
design = ~ day + diseased)
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Next we use Deseq2 to normalize the data, we visualize the standard deviation of genes with high/low counts to see if the normalisation ahs sucessfully removed low expressed genes with high sd:
#rld <- rlog(dds, blind=FALSE) # this gives the rLog (which behaves similarly to a log2 transformation for genes with high counts, while shrinking together the values for different samples for genes with low counts)
rld <- vst(dds, blind=FALSE) # this gives the rLog (which behaves similarly to a log2 transformation for genes with high counts, while shrinking together the values for different samples for genes with low counts)
library("vsn")
library("ggplot2")
msd <- meanSdPlot(assay(rld))
#pdf("sd_after_rlog.pdf")
pdf("sd_after_vst.pdf")
myplot <- msd$gg + scale_y_continuous(limits = c(0, 7))
print(myplot)
Subsequently we generate a distance matrix to see how similar samples are to each other
library("RColorBrewer")
library("pheatmap")
library("ggplot2")
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <-make.unique(paste(rld$day,rld$cell_line))
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
annotation_column <- as.data.frame(rld$day)
annotation_column$cell_line <- rld$cell_line
row.names(annotation_column) <- rownames(sampleDistMatrix)
hc <- hclust(dist(sampleDistMatrix))
myplot <- pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors,
annotation_row = annotation_column,
clustering_method = "ward.D",
width = 10,
height = 10
)
print(myplot)
pdf("distance_samples.pdf", width = 11, height = 9)
print(myplot)
dev.off()
Now its time to generate the PCA map to visualize the variance between samples (figure 2B)
pcaData <- plotPCA(rld, intgroup = c( "cell_line","day"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
my_pca <- ggplot(pcaData, aes(x = PC1, y = PC2, color = cell_line, shape = day)) +
geom_point(size =5) + scale_color_manual(values=c("#bae6f2","#66c8ff", "#f1babb", "#ed7779","#ed393c"))+
scale_shape_manual(values=c(15,1,17,18))+
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed(2)
ggsave("PCA2.pdf",my_pca, height = 10, width = 8, device = 'pdf')
print(my_pca)
pdf("PCA_patient_mut2.pdf", width = 8, height = 10)
print(my_pca)
dev.off()
Next we look the differentially expressed genes between wt and p63 mutant over time
all_genes_found <- row.names(assay(rld))
diff_genes <- results(dds,alpha=0.05, contrast=c("diseased","diseased","wt"))
diff_genes <- subset(diff_genes, padj < 0.00001)
topdiffGenes<- row.names(assay(rld)) %in% row.names(diff_genes)
diff_genes <- row.names(assay(rld)[ topdiffGenes, ])
We’l perform kmean clustering (with 3 clusters) on the highly variable genes to map each of them to a cluster
nKm <- 3
mat <- assay(rld)[ topdiffGenes, ]
mat <- mat - rowMeans(mat)
set.seed(100)
km = kmeans(mat, (nKm ))
m2 <- cbind(mat,km$cluster)
o <- order(m2[, ncol(m2)])
m2 <- m2[o, ]
m2 = as.data.frame(m2)
write.csv(as.data.frame(m2),
file="top500_var_genes.csv")
Next we’ll fetch some annotation data, and generate a heatmap (figure 1C) based on the kmean clustered highly variable genes
library(pheatmap)
library(RColorBrewer)
col_annotations <- as.data.frame(sample_data$day)
row.names(col_annotations) <- row.names(sample_data)
day_order <- order(col_annotations)
col_annotations[day_order,]
my_gene_row <- as.data.frame(row.names(m2))
my_gene_row$cluster <- m2[,ncol(m2)]
row.names(my_gene_row) <- row.names(m2)
my_gene_row[,1] = NULL
data <- m2[,day_order]
# define the annotation
col_annotations = data.frame(
day = factor(sample_data$day),
cell_line = c(as.matrix(sample_data$cell_line)),
diseased = c(as.matrix(sample_data$diseased))
)
row.names(col_annotations) <- row.names(sample_data)
hmcol<- colorRampPalette(c("darkblue","white","red"))(256)
pheatmap(data[,1:(ncol(data)-1)] ,border_color=NA, col = hmcol, scale = 'column',width = 10, height = 10,cluster_rows = F, cluster_cols = F,row.names = F, annotation_row = my_gene_row, annotation_col = col_annotations, gaps_col = c(4,9,14), gaps_row = c(298,417), filename = "Heatmap_KC_diff.pdf")
pheatmap(data ,border_color=NA, col = hmcol, scale = 'column',width = 10, height = 10,cluster_rows = F, cluster_cols = F,row.names = F, annotation_row = my_gene_row, annotation_col = col_annotations) gaps_col = c(4,9,14), gaps_row = c(298,417))
To perform GO-enrichment we need 2 lists, one of the gene background (here we used all genes expressed at a certain point), and a list of genes of interest. As the list of interest we can use the clusters (found in top500_var_genes.csv). The gene background list we generate like this. After generating these lists you can perform GO-term enrichment easily online. We used GOrilla (http://cbl-gorilla.cs.technion.ac.il/)
max_count <- as.data.frame(rowMax(as.matrix(counts_hgnc)))
row.names(max_count) <- row.names(counts_hgnc)
colnames(max_count) <- 'counts'
max_count$genes <- row.names(counts_hgnc)
measured_genes <- max_count %>% filter(counts > 10) %>% dplyr::select(genes,counts)
measured_genes <- as.data.frame(measured_genes)
write.csv(as.data.frame(measured_genes),
file="measured_genes.csv")