BiocManager::install("topGO") install.packages("gridExtra") BiocManager::install("pathview") BiocManager::install("clusterProfiler") BiocManager::install("org.At.tair.db") BiocManager::install("org.Sc.sgd.db") BiocManager::install("AnnotationDbi") install.packages("ggplot2")
##INSTALAR LOS PAQUETES SI NO HAN SIDO INSTALADOS
library(topGO)
library(ggplot2)
library(grid)
library(gridExtra)
library(pathview)
library(clusterProfiler)
library(org.At.tair.db)
data("gene.idtype.bods")
counts<-data.frame(read.table("treatmentvscontrol.complete.txt",row.names = 1,header = T,sep = "\t"))
##el archivo treatmentvscontrol.complete.txt debe estar en la carpeta
head(counts,2)
## SRR1811524 SRR1811525 SRR1811526 SRR1811527 SRR1811528
## AT1G01010.1 22 31 17 31 26
## AT1G01020.1 2 14 11 6 8
## SRR1811529 norm.SRR1811524 norm.SRR1811525 norm.SRR1811526
## AT1G01010.1 30 21 30 17
## AT1G01020.1 0 2 13 11
## norm.SRR1811527 norm.SRR1811528 norm.SRR1811529 baseMean
## AT1G01010.1 29 26 31 25.67
## AT1G01020.1 6 8 0 6.70
## control treatment FoldChange log2FoldChange pvalue
## AT1G01010.1 23 29 1.250 0.322 0.2943173
## AT1G01020.1 9 5 0.763 -0.390 0.4649092
## padj dispGeneEst dispFit dispMAP dispersion betaConv
## AT1G01010.1 0.8719330 0.0000 0.1358 0.0429 0.0429 TRUE
## AT1G01020.1 0.9622079 1.0276 0.9291 0.9989 0.9989 TRUE
## maxCooks
## AT1G01010.1 0.4026
## AT1G01020.1 1.2798
DEGS_DW<-which(counts$padj<0.05&counts$log2FoldChange < -0.5)
DEGS_UP<-which(counts$padj<0.05&counts$log2FoldChange > 0.5)
length(DEGS_DW)
## [1] 337
length(DEGS_UP)
## [1] 534
##Para el análisis de enriquecimiento de categorías GO, necesitamos primero los terminos GO de cada gen. En este caso los obtenemos utilizando BIOCONDUCTOR
allGO2genes <- annFUN.org(whichOnto="ALL", feasibleGenes=NULL, mapping="org.At.tair.db", ID="entrez") ##El nombre de los genes que utilizamos
##Luego el primer paso es generar la lista de todo el universo de genes
allgenesnames <- rownames(counts) #Todos los genes estudiados en RNAseq
#Nuestros datos de expresión son para cada transcrito (splicing forms), pero ahora necesitamos trabajar con genes.
allgenesnames <- gsub("\\..*","",allgenesnames) #Cambiar ID de transcritos a ID de genes
non_duplicates_idx <- which(duplicated(allgenesnames) == FALSE)#Encuentra duplicados
allgenesnames <- allgenesnames[non_duplicates_idx]#Borra los duplicados
DEGS_UP<-rownames(counts)[DEGS_UP]
DEGS_UP<-gsub("\\..*","",DEGS_UP)
non_duplicates_idx <- which(duplicated(DEGS_UP) == FALSE)
DEGS_UP <- DEGS_UP[non_duplicates_idx]
DEGS_DW<-rownames(counts)[DEGS_DW]
DEGS_DW<-gsub("\\..*","",DEGS_DW)
non_duplicates_idx <- which(duplicated(DEGS_DW) == FALSE)
DEGS_DW <- DEGS_DW[non_duplicates_idx]
geneList_UP <- factor(as.integer(allgenesnames %in% DEGS_UP))
names(geneList_UP) <- allgenesnames
geneList_DW <- factor(as.integer(allgenesnames %in% DEGS_DW))
names(geneList_DW) <- allgenesnames
##Ahora si procedemos a analizar con el package TopGO
GOdata_UP_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
GOdata_UP_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
GOdata_UP_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
GOdata_DW_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
GOdata_DW_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
GOdata_DW_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)
##El test estadistico será calculado con runTest, usando fisher
resultFis_UP_BP <- runTest(GOdata_UP_BP, statistic = "fisher")
resultFis_UP_MF <- runTest(GOdata_UP_MF, statistic = "fisher")
resultFis_UP_CC <- runTest(GOdata_UP_CC, statistic = "fisher")
resultFis_DW_BP <- runTest(GOdata_DW_BP, statistic = "fisher")
resultFis_DW_MF <- runTest(GOdata_DW_MF, statistic = "fisher")
resultFis_DW_CC <- runTest(GOdata_DW_CC, statistic = "fisher")
##La siguiente función toma los resultados de runtest, y los "arregla" para los graficos siguientes
parse_tables<-function(x,y)
{
goEnrichment <- GenTable(x,weightFisher=y, topNodes=10)
sub("< ","",goEnrichment$weightFisher)
goEnrichment$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichment$Term)
goEnrichment$Term <- gsub("\\.\\.\\.$", "", goEnrichment$Term)
goEnrichment$Term <- paste(goEnrichment$GO.ID, goEnrichment$Term, sep=", ")
goEnrichment$Term <- factor(goEnrichment$Term, levels=rev(goEnrichment$Term))
goEnrichment$weightFisher <- as.numeric(sub("< ","",goEnrichment$weightFisher))
goEnrichment
}
GOres_UP_BP <- parse_tables(GOdata_UP_BP, resultFis_UP_BP)
GOres_UP_MF <- parse_tables(GOdata_UP_MF, resultFis_UP_MF)
GOres_UP_CC <- parse_tables(GOdata_UP_CC, resultFis_UP_CC)
GOres_DW_BP <- parse_tables(GOdata_DW_BP, resultFis_DW_BP)
GOres_DW_MF <- parse_tables(GOdata_DW_MF, resultFis_DW_MF)
GOres_DW_CC <- parse_tables(GOdata_DW_CC, resultFis_DW_CC)
##Function to plot the 10 most enriched GO
plot_GO<-function(x,y,Z){
ggplot(x, aes(x=Term,y=(-log10(as.numeric(x$weightFisher))))) +
geom_point(aes(size=x$Significant),colour="steelblue4")+
scale_size_area(name="Gene counts")+
xlab(y) +
ylab("Enrichment (- log10 Pvalue)") +
ggtitle(Z) +
scale_y_continuous(breaks = round(seq(0,
max(-log10(as.numeric(x$weightFisher))), by = 4), 1)) +
theme_bw(base_size=1) +
theme(
legend.position="right",
legend.background=element_rect(),
plot.title=element_text(angle=0, size=14, face="bold", vjust=1),
axis.text.x=element_text(angle=0, size=10, face="bold", hjust=1.10),
axis.text.y=element_text(angle=0, size=10, face="bold", vjust=0.5),
axis.title=element_text(size=12, face="bold"),
legend.key=element_blank(), #removes the border
legend.key.size=unit(1, "cm"), #Sets overall area/size of the legend
legend.text=element_text(size=10), #Text size
title=element_text(size=10),
aspect.ratio = 1) +
coord_flip()
}
##Now call both functions to do the plots
p1<-plot_GO(GOres_UP_BP,"Biological Proccess","TopGO Upregulated genes")
p2<-plot_GO(GOres_UP_MF,"Molecular Function","TopGO Upregulated genes")
p3<-plot_GO(GOres_UP_CC,"Cellular Component","TopGO Upregulated genes")
grid.arrange(grobs = list(p1,p2,p3))
p1<-plot_GO(GOres_DW_BP,"Biological Proccess","TopGO Downregulated genes")
p2<-plot_GO(GOres_DW_MF,"Molecular Function","TopGO Downregulated genes")
p3<-plot_GO(GOres_DW_CC,"Cellular Component","TopGO Downregulated genes")
grid.arrange(grobs = list(p1,p2,p3))
Alternative to obtain GO terms for each transcript_id (instead of gene_id)
require(bioMart) datasets<-listDatasets(useMart(biomart=“plants_mart”,host=“plants.ensembl.org”)) ensembl<-useDataset(dataset =“athaliana_eg_gene”,mart =useMart(biomart=“plants_mart”,host=“plants.ensembl.org”))
GOlist<-cbind(c(1:nrow(counts)),c(1:nrow(counts))) for (i in 1:nrow(counts)){ GO<-getBM(attributes=c(‘ensembl_transcript_id’,“goslim_goa_accession”), filters = ‘ensembl_transcript_id’, values = rownames(counts)[i], mart = ensembl) GOlist[i,2]<-paste(GO\(goslim_goa_accession,collapse = ",") GOlist[i,1]<-GO\)ensembl_transcript_id[1] GOlist }