Bellow is a short example of how we analyzed some of the differentiation data. If you plan on performing RNAseq analysis yourself there are some great vignettes from DESEQ2 that are very nice to follow. But hopefully this code bellow can help you start. First we’ll set some settings for the figure size of R-Markdown

Change: "“/home/jsmits/RNA_seq_KC_diff/” within the third line to the location of your RNAseq analysis.

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8) 
knitr::opts_knit$set(root.dir = "/home/jsmits/RNA_seq_KC_diff/")

If you have not installed the R packages listed below, remove the “#” symbols in front of the lines and run the code below them.

#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("DESeq2")
#BiocManager::install("BiocParallel")
#BiocManager::install("vsn")
#BiocManager::install("ggplot2")
#BiocManager::install("biomaRt")

Load the data, change your working directory to a folder containing a folder called “counts” where all the ReadsPerGene.out.tab files generated by STAR are located. If you prefer to continue with HGNC gene names instead of transcript IDs after this step you can use tools such as BioMart to convert gene identifiers.

ff <- list.files( path = "./mapping/counts", pattern = "*ReadsPerGene.out.tab$", full.names = TRUE )
counts.files <- lapply( ff, read.table, skip = 4 )
counts <- as.data.frame( sapply( counts.files, function(x) x[ , 2 ] ) )
ff <- gsub( "[.]ReadsPerGene[.]out[.]tab", "", ff )
ff <- gsub( "[.]/counts_wt/", "", ff )
colnames(counts) <- ff
row.names(counts) <- counts.files[[1]]$V1
count.files <- NULL

Optional step: Use BioMart to find HGNC gene names to the ENSEMBL transcript ids Then write the count table to the file:"counts_hgnc.txt" Alternatively you can also skip this step and proceed with the Ensemble transcript ID’s for the further analysis.

# load libraries
library("biomaRt")
library("dplyr")
library("stringr")

genes <- as.vector(row.names(counts))
ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")
filters = listFilters(ensembl)

# get hgnc symbol of each ensembl id
G_list<-  getBM(attributes=c('ensembl_gene_id', "hgnc_symbol"), filters = "ensembl_gene_id", values=genes, mart=ensembl)

# rearrange the table to get the hgnc symbol to the count table
counts$ensembl_gene_id <- rownames(counts)
rownames(counts) <- NULL
counts_hgnc <- merge(counts, G_list, by="ensembl_gene_id")

counts_hgnc$ensembl_gene_id <- NULL
counts_hgnc <- distinct(counts_hgnc)

counts_hgnc <- counts_hgnc[,1:ncol(counts_hgnc)] %>% group_by(hgnc_symbol) %>% summarise_all(list(sum))
counts_hgnc <- as.data.frame(counts_hgnc)
rownames(counts_hgnc) <- counts_hgnc$hgnc_symbol
counts_hgnc$hgnc_symbol <- NULL
write.table(counts_hgnc, file="counts_hgnc.txt", sep="\t")

Load the sample data (with columns saying the differentiation day, the cell line, if it is diseased or wt, and if it is paired end or single end sequenced.)

sample_data<- read.csv2("sample_info.csv", header = TRUE, sep = ',')
rownames(sample_data) <- make.unique(as.vector(sample_data[,1]))
counts <- counts[, rownames(sample_data)]# samples in readcount in the same order as the coldata file
all(rownames(sample_data) == colnames(counts))#check if coldata and readcounts are in the same order
## [1] TRUE
all(rownames(sample_data) %in% colnames(counts)) #check if all samples have a match in coldata
## [1] TRUE
all(rownames(sample_data) == colnames(counts))#check if coldata and readcounts are in the same order
## [1] TRUE

Now we load the counts and the sample_data into DESEQ2 based on the day and the disease/wt.

#Deseq2 normalisation:
library("DESeq2")
library("BiocParallel")
register(SnowParam(4))
dds <- DESeqDataSetFromMatrix(countData = counts_hgnc,
                              colData = sample_data,
                              design = ~ cell_line + day)

#run Deseq , later used for differential genes but not for the PCA, heatmap or variable genes.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

Next we use Deseq2 its vst function to normalize the raw count data, we visualize the standard deviation of genes with high and low counts to see if the normalisation has sucessfully adjusted low expressed genes with high standard deviation:

library("vsn")
library("ggplot2")

vst_norm <- vst(dds) 
msd <- meanSdPlot(assay(vst_norm))

#pdf("sd_after_rlog.pdf")
pdf("sd_after_vst.pdf")
myplot <- msd$gg + scale_y_continuous(limits = c(0, 7))
print(myplot)
dev.off()

Subsequently we generate a distance matrix to see how similar samples are to each other

library("RColorBrewer")
library("pheatmap")
library("ggplot2")

sampleDists <- dist(t(assay(vst_norm)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <-make.unique(paste(vst_norm$day,vst_norm$cell_line))

colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
annotation_column <- as.data.frame(vst_norm$day)
annotation_column$cell_line <- vst_norm$cell_line

row.names(annotation_column) <- rownames(sampleDistMatrix)
hc <- hclust(dist(sampleDistMatrix))
myplot <- pheatmap(sampleDistMatrix,
                   clustering_distance_rows=sampleDists,
                   clustering_distance_cols=sampleDists,
                   col=colors,
                   annotation_row = annotation_column,
                   clustering_method = "ward.D",
                   width = 10,
                   height = 10
)
print(myplot)

Now its time to generate the PCA map to visualize the variance between samples (figure 2B)

pcaData <- plotPCA(vst_norm, intgroup = c( "cell_line","day"), returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
my_pca <- ggplot(pcaData, aes(x = PC1, y = PC2, color = day, shape = cell_line)) +
  geom_point(size =5) +
  xlab(paste0("PC1: ", percentVar[1], "% variance")) +
  ylab(paste0("PC2: ", percentVar[2], "% variance")) +
  coord_fixed(2)
ggsave("PCA2.pdf",my_pca, height = 10, width = 8, device = 'pdf')

print(my_pca)

Next we look what the top 500 highly variable genes across our dataset are

all_genes_found <- row.names(assay(vst_norm))
topVarGenes <- head(order(rowVars(assay(vst_norm)),decreasing=TRUE),500)
variable_genes <- row.names(assay(vst_norm)[ topVarGenes, ])
topVarGenes <- variable_genes

We’l perform kmean clustering (with 3 clusters) on the highly variable genes to map each of them to a cluster

nKm <- 3
mat <- assay(vst_norm)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
set.seed(100)
km = kmeans(mat, (nKm ))
m2 <- cbind(mat,km$cluster)
o <- order(m2[, ncol(m2)])
m2 <- m2[o, ]
m2 = as.data.frame(m2)
write.csv(as.data.frame(m2), 
          file="top500_var_genes.csv")

Next we’ll fetch some annotation data, and generate a heatmap (figure 1C) based on the kmean clustered highly variable genes

library(pheatmap) 
library(RColorBrewer)

col_annotations <- as.data.frame(sample_data$cell_line)
row.names(col_annotations) <- row.names(sample_data)

my_gene_row <- as.data.frame(row.names(m2))
my_gene_row$cluster <- m2[,ncol(m2)]
row.names(my_gene_row) <- row.names(m2)
my_gene_row[,1] = NULL
data <- m2[,1:(ncol(m2)-1)]

# define the annotation
col_annotations = data.frame(cell_line = c(as.matrix(sample_data$cell_line)))

row.names(col_annotations) <- row.names(sample_data)

hmcol<- colorRampPalette(c("darkblue","white","red"))(256)
pheatmap(data[,1:(ncol(data)-1)] ,border_color=NA, col = hmcol, scale = 'column',width = 10, height = 10,cluster_rows = F, cluster_cols = F,row.names = F, annotation_row = my_gene_row, annotation_col = col_annotations)#, filename = "Heatmap_KC_diff.pdf")#,gaps_col = c(4,9,14), gaps_row = c(298,417))

pheatmap(data ,border_color=NA, col = hmcol, scale = 'column',width = 10, height = 10,cluster_rows = F, cluster_cols = F,row.names = F, annotation_row = my_gene_row, annotation_col = col_annotations)

To perform GO-enrichment we need 2 lists, on of the gene background (here we used all genes expressed at a certain point), and a list of genes of interest. As the list of interest we can use the clusters (found in top500_var_genes.csv). The gene background list we generate like this. After generating these lists you can perform GO-term enrichment easily online. We used GOrilla (http://cbl-gorilla.cs.technion.ac.il/)

max_count <- as.data.frame(rowMax(as.matrix(counts_hgnc)))
row.names(max_count) <- row.names(counts_hgnc)
colnames(max_count) <- 'counts'
max_count$genes <- row.names(counts_hgnc)
measured_genes <- max_count %>% filter(counts > 10) %>% dplyr::select(genes,counts)
measured_genes <- as.data.frame(measured_genes)
write.csv(as.data.frame(measured_genes), 
          file="measured_genes.csv")