TopGO_Athaliana

BiocManager::install("topGO") install.packages("gridExtra") BiocManager::install("pathview") BiocManager::install("clusterProfiler") BiocManager::install("org.At.tair.db") BiocManager::install("org.Sc.sgd.db") BiocManager::install("AnnotationDbi") install.packages("ggplot2")

##INSTALAR LOS PAQUETES SI NO HAN SIDO INSTALADOS
library(topGO)
library(ggplot2)
library(grid)
library(gridExtra)
library(pathview)
library(clusterProfiler)
library(org.At.tair.db)
data("gene.idtype.bods")

counts<-data.frame(read.table("treatmentvscontrol.complete.txt",row.names = 1,header = T,sep = "\t"))
##el archivo treatmentvscontrol.complete.txt debe estar en la carpeta
head(counts,2)

##             SRR1811524 SRR1811525 SRR1811526 SRR1811527 SRR1811528
## AT1G01010.1         22         31         17         31         26
## AT1G01020.1          2         14         11          6          8
##             SRR1811529 norm.SRR1811524 norm.SRR1811525 norm.SRR1811526
## AT1G01010.1         30              21              30              17
## AT1G01020.1          0               2              13              11
##             norm.SRR1811527 norm.SRR1811528 norm.SRR1811529 baseMean
## AT1G01010.1              29              26              31    25.67
## AT1G01020.1               6               8               0     6.70
##             control treatment FoldChange log2FoldChange    pvalue
## AT1G01010.1      23        29      1.250          0.322 0.2943173
## AT1G01020.1       9         5      0.763         -0.390 0.4649092
##                  padj dispGeneEst dispFit dispMAP dispersion betaConv
## AT1G01010.1 0.8719330      0.0000  0.1358  0.0429     0.0429     TRUE
## AT1G01020.1 0.9622079      1.0276  0.9291  0.9989     0.9989     TRUE
##             maxCooks
## AT1G01010.1   0.4026
## AT1G01020.1   1.2798

DEGS_DW<-which(counts$padj<0.05&counts$log2FoldChange < -0.5)
DEGS_UP<-which(counts$padj<0.05&counts$log2FoldChange > 0.5)


length(DEGS_DW)

## [1] 337

length(DEGS_UP)

## [1] 534

##Para el análisis de enriquecimiento de categorías GO, necesitamos primero los terminos GO de cada gen. En este caso los obtenemos utilizando BIOCONDUCTOR

allGO2genes <- annFUN.org(whichOnto="ALL", feasibleGenes=NULL, mapping="org.At.tair.db", ID="entrez") ##El nombre de los genes que utilizamos

##Luego el primer paso es generar la lista de todo el universo de genes

allgenesnames <- rownames(counts) #Todos los genes estudiados en RNAseq

#Nuestros datos de expresión son para cada transcrito (splicing forms), pero ahora necesitamos trabajar con genes.
allgenesnames <- gsub("\\..*","",allgenesnames) #Cambiar ID de transcritos a ID de genes
non_duplicates_idx <- which(duplicated(allgenesnames) == FALSE)#Encuentra duplicados
allgenesnames <- allgenesnames[non_duplicates_idx]#Borra los duplicados

DEGS_UP<-rownames(counts)[DEGS_UP]
DEGS_UP<-gsub("\\..*","",DEGS_UP)
non_duplicates_idx <- which(duplicated(DEGS_UP) == FALSE)
DEGS_UP <- DEGS_UP[non_duplicates_idx]

DEGS_DW<-rownames(counts)[DEGS_DW]
DEGS_DW<-gsub("\\..*","",DEGS_DW)
non_duplicates_idx <- which(duplicated(DEGS_DW) == FALSE)
DEGS_DW <- DEGS_DW[non_duplicates_idx]


geneList_UP <- factor(as.integer(allgenesnames %in% DEGS_UP))
names(geneList_UP) <- allgenesnames

geneList_DW <- factor(as.integer(allgenesnames %in% DEGS_DW))
names(geneList_DW) <- allgenesnames


##Ahora si procedemos a analizar con el package TopGO

GOdata_UP_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_UP_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_UP_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

##El test estadistico será calculado con runTest, usando fisher

resultFis_UP_BP <- runTest(GOdata_UP_BP, statistic = "fisher")
resultFis_UP_MF <- runTest(GOdata_UP_MF, statistic = "fisher")
resultFis_UP_CC <- runTest(GOdata_UP_CC, statistic = "fisher")
resultFis_DW_BP <- runTest(GOdata_DW_BP, statistic = "fisher")
resultFis_DW_MF <- runTest(GOdata_DW_MF, statistic = "fisher")
resultFis_DW_CC <- runTest(GOdata_DW_CC, statistic = "fisher")

##La siguiente función toma los resultados de runtest, y los "arregla" para los graficos siguientes

parse_tables<-function(x,y)
{
  goEnrichment <- GenTable(x,weightFisher=y, topNodes=10)
  sub("< ","",goEnrichment$weightFisher)
  goEnrichment$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichment$Term)
  goEnrichment$Term <- gsub("\\.\\.\\.$", "", goEnrichment$Term)
  goEnrichment$Term <- paste(goEnrichment$GO.ID, goEnrichment$Term, sep=", ")
  goEnrichment$Term <- factor(goEnrichment$Term, levels=rev(goEnrichment$Term))
  goEnrichment$weightFisher <- as.numeric(sub("< ","",goEnrichment$weightFisher))  
  goEnrichment
}

GOres_UP_BP <- parse_tables(GOdata_UP_BP, resultFis_UP_BP)
GOres_UP_MF <- parse_tables(GOdata_UP_MF, resultFis_UP_MF)
GOres_UP_CC <- parse_tables(GOdata_UP_CC, resultFis_UP_CC)

GOres_DW_BP <- parse_tables(GOdata_DW_BP, resultFis_DW_BP)
GOres_DW_MF <- parse_tables(GOdata_DW_MF, resultFis_DW_MF)
GOres_DW_CC <- parse_tables(GOdata_DW_CC, resultFis_DW_CC)

##Function to plot the 10 most enriched GO
plot_GO<-function(x,y,Z){  
  ggplot(x, aes(x=Term,y=(-log10(as.numeric(x$weightFisher))))) +
  geom_point(aes(size=x$Significant),colour="steelblue4")+
  scale_size_area(name="Gene counts")+
  xlab(y) +
  ylab("Enrichment (- log10 Pvalue)") +
  ggtitle(Z) +
  scale_y_continuous(breaks = round(seq(0, 
        max(-log10(as.numeric(x$weightFisher))), by = 4), 1)) +
  theme_bw(base_size=1) +
  theme(
      legend.position="right",
      legend.background=element_rect(),
      plot.title=element_text(angle=0, size=14, face="bold", vjust=1),
      axis.text.x=element_text(angle=0, size=10, face="bold", hjust=1.10),
      axis.text.y=element_text(angle=0, size=10, face="bold", vjust=0.5),
      axis.title=element_text(size=12, face="bold"),
      legend.key=element_blank(),     #removes the border
      legend.key.size=unit(1, "cm"),      #Sets overall area/size of the legend
      legend.text=element_text(size=10),  #Text size
      title=element_text(size=10),
      aspect.ratio = 1) +
  coord_flip()
}


##Now call both functions to do the plots

p1<-plot_GO(GOres_UP_BP,"Biological Proccess","TopGO Upregulated genes")
p2<-plot_GO(GOres_UP_MF,"Molecular Function","TopGO Upregulated genes")
p3<-plot_GO(GOres_UP_CC,"Cellular Component","TopGO Upregulated genes")

grid.arrange(grobs = list(p1,p2,p3))

p1<-plot_GO(GOres_DW_BP,"Biological Proccess","TopGO Downregulated genes")
p2<-plot_GO(GOres_DW_MF,"Molecular Function","TopGO Downregulated genes")
p3<-plot_GO(GOres_DW_CC,"Cellular Component","TopGO Downregulated genes")

grid.arrange(grobs = list(p1,p2,p3))

donotrun

Alternative to obtain GO terms for each transcript_id (instead of gene_id)

require(bioMart) datasets<-listDatasets(useMart(biomart=“plants_mart”,host=“plants.ensembl.org”)) ensembl<-useDataset(dataset =“athaliana_eg_gene”,mart =useMart(biomart=“plants_mart”,host=“plants.ensembl.org”))

GOlist<-cbind(c(1:nrow(counts)),c(1:nrow(counts))) for (i in 1:nrow(counts)){ GO<-getBM(attributes=c(‘ensembl_transcript_id’,“goslim_goa_accession”), filters = ‘ensembl_transcript_id’, values = rownames(counts)[i], mart = ensembl) GOlist[i,2]<-paste(GO\(goslim_goa_accession,collapse = ",") GOlist[i,1]<-GO\)ensembl_transcript_id[1] GOlist }