GO enrichment - KEGG enrichment S cerevisiae

BiocManager::install("topGO") install.packages("gridExtra") BiocManager::install("pathview") BiocManager::install("clusterProfiler") BiocManager::install("org.At.tair.db") BiocManager::install("org.Sc.sgd.db") BiocManager::install("AnnotationDbi") install.packages("ggplot2")

require(topGO)
require(ggplot2)
require(grid)
require(gridExtra)
require(org.Sc.sgd.db)
require(pathview)
require(clusterProfiler)
require(AnnotationDbi)
data("gene.idtype.bods")

counts<-data.frame(read.table("treatmentvscontrol.complete.txt",row.names = 1,header = T,sep = "\t"))

DEGS_DW<-which(counts$padj<0.05&counts$log2FoldChange < -0.5)
DEGS_UP<-which(counts$padj<0.05&counts$log2FoldChange > 0.5)


length(DEGS_DW)

## [1] 203

length(DEGS_UP)

## [1] 439

##Para el análisis de enriquecimiento de categorías GO, necesitamos primero los terminos GO de cada gen. En este caso los obtenemos utilizando BIOCONDUCTOR

allGO2genes <- annFUN.org(whichOnto="ALL", feasibleGenes=NULL, mapping="org.Sc.sgd.db", ID="ensembl") ##El nombre de los genes que utilizamos es ENSEMBL

##Luego el primer paso es generar la lista de todo el universo de genes

allgenesnames <- rownames(counts) #Todos los genes estudiados en RNAseq

##Siguiente es generar el set de genes interesantes

DEGS_UP<-rownames(counts)[DEGS_UP] #genes sobreexpresados
DEGS_DW<-rownames(counts)[DEGS_DW] #genes reprimidos

##Para topGO necesitamos la lista de todos los genes, y tambien una "named list"de todos los genes indicando si estan en el set de genes interesantes

geneList_UP <- factor(as.integer(allgenesnames %in% DEGS_UP))
names(geneList_UP) <- allgenesnames

geneList_DW <- factor(as.integer(allgenesnames %in% DEGS_DW))
names(geneList_DW) <- allgenesnames

##Ahora si procedemos a analizar con el package TopGO

GOdata_UP_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_UP_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_UP_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_UP,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_BP <- new("topGOdata", ontology = "BP", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_MF <- new("topGOdata", ontology = "MF", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

GOdata_DW_CC <- new("topGOdata", ontology = "CC", allGenes = geneList_DW,annot = annFUN.GO2genes, GO2genes=allGO2genes,nodeSize=5)

##El test estadistico será calculado con runTest, usando fisher

resultFis_UP_BP <- runTest(GOdata_UP_BP, statistic = "fisher")
resultFis_UP_MF <- runTest(GOdata_UP_MF, statistic = "fisher")
resultFis_UP_CC <- runTest(GOdata_UP_CC, statistic = "fisher")
resultFis_DW_BP <- runTest(GOdata_DW_BP, statistic = "fisher")
resultFis_DW_MF <- runTest(GOdata_DW_MF, statistic = "fisher")
resultFis_DW_CC <- runTest(GOdata_DW_CC, statistic = "fisher")

##La siguiente función toma los resultados de runtest, y los "arregla" para los graficos siguientes

parse_tables<-function(x,y)
{
  goEnrichment <- GenTable(x,weightFisher=y, topNodes=10)
  sub("< ","",goEnrichment$weightFisher)
  goEnrichment$Term <- gsub(" [a-z]*\\.\\.\\.$", "", goEnrichment$Term)
  goEnrichment$Term <- gsub("\\.\\.\\.$", "", goEnrichment$Term)
  goEnrichment$Term <- paste(goEnrichment$GO.ID, goEnrichment$Term, sep=", ")
  goEnrichment$Term <- factor(goEnrichment$Term, levels=rev(goEnrichment$Term))
  goEnrichment$weightFisher <- as.numeric(sub("< ","",goEnrichment$weightFisher))  
  goEnrichment
}

GOres_UP_BP <- parse_tables(GOdata_UP_BP, resultFis_UP_BP)
GOres_UP_MF <- parse_tables(GOdata_UP_MF, resultFis_UP_MF)
GOres_UP_CC <- parse_tables(GOdata_UP_CC, resultFis_UP_CC)

GOres_DW_BP <- parse_tables(GOdata_DW_BP, resultFis_DW_BP)
GOres_DW_MF <- parse_tables(GOdata_DW_MF, resultFis_DW_MF)
GOres_DW_CC <- parse_tables(GOdata_DW_CC, resultFis_DW_CC)

##Function to plot the 10 most enriched GO
plot_GO<-function(x,y,Z){  
  ggplot(x, aes(x=Term,y=(-log10(as.numeric(x$weightFisher))))) +
  geom_point(aes(size=x$Significant),colour="steelblue4")+
  scale_size_area(name="Gene counts")+
  xlab(y) +
  ylab("Enrichment (- log10 Pvalue)") +
  ggtitle(Z) +
  scale_y_continuous(breaks = round(seq(0, 
        max(-log10(as.numeric(x$weightFisher))), by = 4), 1)) +
  theme_bw(base_size=1) +
  theme(
      legend.position="right",
      legend.background=element_rect(),
      plot.title=element_text(angle=0, size=14, face="bold", vjust=1),
      axis.text.x=element_text(angle=0, size=10, face="bold", hjust=1.10),
      axis.text.y=element_text(angle=0, size=10, face="bold", vjust=0.5),
      axis.title=element_text(size=12, face="bold"),
      legend.key=element_blank(),     #removes the border
      legend.key.size=unit(1, "cm"),      #Sets overall area/size of the legend
      legend.text=element_text(size=10),  #Text size
      title=element_text(size=10),
      aspect.ratio = 1) +
  coord_flip()
}


##Now call both functions to do the plots

p1<-plot_GO(GOres_UP_BP,"Biological Proccess","TopGO Upregulated genes")
p2<-plot_GO(GOres_UP_MF,"Molecular Function","TopGO Upregulated genes")
p3<-plot_GO(GOres_UP_CC,"Cellular Component","TopGO Upregulated genes")

grid.arrange(grobs = list(p1,p2,p3))

p1<-plot_GO(GOres_DW_BP,"Biological Proccess","TopGO Downregulated genes")
p2<-plot_GO(GOres_DW_MF,"Molecular Function","TopGO Downregulated genes")
p3<-plot_GO(GOres_DW_CC,"Cellular Component","TopGO Downregulated genes")

grid.arrange(grobs = list(p1,p2,p3))

#Ahora realizaremos un análisis de KEGG pathways
#Primero necesitamos cambiar nuestras ENSEMBL IDs a ORF IDs (necesario para S. cerevisiae en KEGG)
ORF_IDs <- AnnotationDbi::select(org.Sc.sgd.db,
                                    keys = rownames(counts),
                                    columns = c("ORF"), 
                                    keytype = "ENSEMBL")

## 'select()' returned 1:many mapping between keys and columns

non_duplicates_idx <- which(duplicated(ORF_IDs$ENSEMBL) == FALSE)
ORF_IDs <- ORF_IDs[non_duplicates_idx, ]
ORF_IDs <-na.omit(ORF_IDs)

ORFIDs_res <-rownames(counts)%in%ORF_IDs$ENSEMBL
ORFIDs_res <-counts[ORFIDs_res,]
gene_list<-ORFIDs_res$log2FoldChange
names(gene_list) <- rownames(ORFIDs_res)
gene_list <- sort(gene_list, decreasing = TRUE)

gseaKEGG <- gseKEGG(geneList = gene_list, # ordered named vector of fold changes (Gene IDs are the associated names)
                    organism = "sce", # S cerevisiae
                    nPerm = 10000, 
                    minGSSize = 20, # minimum gene set size (# genes in set) - change to test more sets or recover sets with fewer # genes
                    pvalueCutoff = 0.1, # padj cutoff value
                    verbose = FALSE)
gseaKEGG_results <- gseaKEGG@result

#To do a Kegg graph with our results

pathview(gene.data = gene_list, gene.idtype = "ORF",
         pathway.id = gseaKEGG_results$ID[1],
         species = "sce",
         limit = list(gene = 2, # value gives the max/min limit for foldchanges
                      cpd = 1))

## 'select()' returned 1:1 mapping between keys and columns

## Info: Getting gene ID data from KEGG...

## Info: Done with data retrieval!

## Info: Working in directory C:/Users/cvillar/Dropbox/Clases/DESeq2__multifactorial_pairwise_comparisons_iBio_W2019.test6.vst.pa005-2019-08-05-04-48-33.1/Cs

## Info: Writing image file sce00500.pathview.png

pathview(gene.data = gene_list, gene.idtype = "ORF",
         pathway.id = gseaKEGG_results$ID[2],
         species = "sce",
         limit = list(gene = 2, # value gives the max/min limit for foldchanges
                      cpd = 1))

## 'select()' returned 1:1 mapping between keys and columns

## Info: Getting gene ID data from KEGG...

## Info: Done with data retrieval!

## Info: Downloading xml files for sceNA, 1/1 pathways..

## Warning: Download of sceNA xml file failed!
## This pathway may not exist!

## Info: Downloading png files for sceNA, 1/1 pathways..

## Warning: Download of sceNA png file failed!
## This pathway may not exist!

## Warning: Failed to download KEGG xml/png files, sceNA skipped!

pathview(gene.data = gene_list, gene.idtype = "ORF",
         pathway.id = gseaKEGG_results$ID[3],
         species = "sce",
         limit = list(gene = 2, # value gives the max/min limit for foldchanges
                      cpd = 1))

## 'select()' returned 1:1 mapping between keys and columns

## Info: Getting gene ID data from KEGG...

## Info: Done with data retrieval!

## Info: Downloading xml files for sceNA, 1/1 pathways..

## Warning: Download of sceNA xml file failed!
## This pathway may not exist!

## Info: Downloading png files for sceNA, 1/1 pathways..

## Warning: Download of sceNA png file failed!
## This pathway may not exist!

## Warning: Failed to download KEGG xml/png files, sceNA skipped!

##LOS RESULTADOS DE PATHVIEW SE EXPORTAN AUTOMATICAMENTE AL DIRECTORIO DE TRABAJO