# Load libraries

library("gplots")
Warning: package 'gplots' was built under R version 3.4.4

Attaching package: 'gplots'
The following object is masked from 'package:stats':

    lowess
library("ggplot2")
library("RColorBrewer")
library("scales")
Warning: package 'scales' was built under R version 3.4.4
library("edgeR")
Loading required package: limma
library("R.utils")
Warning: package 'R.utils' was built under R version 3.4.4
Loading required package: R.oo
Warning: package 'R.oo' was built under R version 3.4.4
Loading required package: R.methodsS3
R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
R.oo v1.22.0 (2018-04-21) successfully loaded. See ?R.oo for help.

Attaching package: 'R.oo'
The following objects are masked from 'package:methods':

    getClasses, getMethods
The following objects are masked from 'package:base':

    attach, detach, gc, load, save
R.utils v2.8.0 successfully loaded. See ?R.utils for help.

Attaching package: 'R.utils'
The following object is masked from 'package:utils':

    timestamp
The following objects are masked from 'package:base':

    cat, commandArgs, getOption, inherits, isOpen, parse, warnings
library("plyr")
library("limma")
library("gridExtra")
library("VennDiagram")
Warning: package 'VennDiagram' was built under R version 3.4.4
Loading required package: grid
Loading required package: futile.logger
source("functions.R")
library(ashr)
Warning: package 'ashr' was built under R version 3.4.4
library(ggplot2)
library("topGO")
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'
The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB
The following object is masked from 'package:gridExtra':

    combine
The following object is masked from 'package:limma':

    plotMA
The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':

    anyDuplicated, append, as.data.frame, cbind, colMeans,
    colnames, colSums, do.call, duplicated, eval, evalq, Filter,
    Find, get, grep, grepl, intersect, is.unsorted, lapply,
    lengths, Map, mapply, match, mget, order, paste, pmax,
    pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
    rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
    tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: graph

Attaching package: 'graph'
The following object is masked from 'package:plyr':

    join
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: GO.db
Loading required package: AnnotationDbi
Loading required package: stats4
Loading required package: IRanges
Loading required package: S4Vectors

Attaching package: 'S4Vectors'
The following object is masked from 'package:plyr':

    rename
The following object is masked from 'package:gplots':

    space
The following object is masked from 'package:base':

    expand.grid

Attaching package: 'IRanges'
The following object is masked from 'package:plyr':

    desc
The following object is masked from 'package:R.oo':

    trim
Loading required package: SparseM

Attaching package: 'SparseM'
The following object is masked from 'package:base':

    backsolve

groupGOTerms:   GOBPTerm, GOMFTerm, GOCCTerm environments built.

Attaching package: 'topGO'
The following object is masked from 'package:IRanges':

    members
The following object is masked from 'package:grid':

    depth
#library("biomaRt")
library("clusterProfiler")
Loading required package: DOSE
DOSE v3.4.0  For help: https://guangchuangyu.github.io/DOSE

If you use DOSE in published research, please cite:
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
clusterProfiler v3.6.0  For help: https://guangchuangyu.github.io/clusterProfiler

If you use clusterProfiler in published research, please cite:
Guangchuang Yu., Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
library("org.Hs.eg.db")
# Set directory to save the data

data_dir <- "../data"

# Load colors 

colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))

# Retrieve RIN score for each sample
RNA_seq_info <- read.csv("../../../Reg_Evo_Primates/data/RNA_seq_info.csv")
RIN <- as.data.frame(RNA_seq_info[,22])
RIN <- as.matrix(RIN)
colnames(RIN) <- c("RIN")

# Retrieve sample information
samples <- read.delim("../../../Reg_Evo_Primates/data/Sample_info_RNAseq_limma.txt")

# Eliminate H1H
samples <- samples[-17,]
dim(samples)
[1] 47  4
# Label species and tissues

species <- samples$Species
length(species)
[1] 47
tissue <- samples$Tissue
length(tissue)
[1] 47
labels <- paste(samples$Species, samples$Tissue, sep=" ")

Test interactions

## Make the contrast matrix and rename columns of the contrast matrix

design <- model.matrix(~ species*tissue + RIN)
colnames(design)[1] <- "Intercept"
colnames(design)[2] <- "Human"
colnames(design)[3] <- "Rhesus"
colnames(design)[4] <- "Kidney"
colnames(design)[5] <- "Liver"
colnames(design)[6] <- "Lung"
colnames(design)[8] <- "H_by_K"
colnames(design)[9] <- "R_by_K"
colnames(design)[10] <- "H_by_Li"
colnames(design)[11] <- "R_by_Li"
colnames(design)[12] <- "H_by_Lu"
colnames(design)[13] <- "R_by_Lu"

# Look at the number of samples in each column 
colSums(design)
Intercept     Human    Rhesus    Kidney     Liver      Lung       RIN 
     47.0      15.0      16.0      12.0      12.0      12.0     368.2 
   H_by_K    R_by_K   H_by_Li   R_by_Li   H_by_Lu   R_by_Lu 
      4.0       4.0       4.0       4.0       4.0       4.0 
# Load count data

counts_genes_in_cutoff <- read.delim("../../../Reg_Evo_Primates/data/counts_12184.txt")

# TMM 

dge_in_cutoff <- DGEList(counts=as.matrix(counts_genes_in_cutoff), genes=rownames(counts_genes_in_cutoff), group = as.character(t(labels)))
dge_in_cutoff <- calcNormFactors(dge_in_cutoff)

cpm_in_cutoff <- cpm(dge_in_cutoff, normalized.lib.sizes=TRUE, log=TRUE)
head(cpm_in_cutoff)
                     C1H      C1K      C1Li     C1Lu      C2H      C2K
ENSG00000000003 4.569101 6.484481  8.260731 5.481561 4.686636 6.076562
ENSG00000000419 5.842023 5.217972  5.937465 5.478545 5.681016 5.100404
ENSG00000000457 4.560130 5.214732  5.902494 4.972557 4.834031 5.289413
ENSG00000000460 1.506846 1.869887  2.080244 2.308985 1.660573 1.968249
ENSG00000000938 5.611783 3.819613  5.091152 7.550720 2.533135 4.178135
ENSG00000000971 6.877100 4.451824 11.368082 6.100181 6.135730 4.887383
                     C2Li     C2Lu      C3H      C3K      C3Li     C3Lu
ENSG00000000003  8.029471 4.564496 4.915377 6.406310  7.784365 5.875983
ENSG00000000419  5.813444 5.199855 5.675979 5.179418  6.413682 5.596709
ENSG00000000457  6.545270 4.985922 4.618657 5.204247  6.498053 5.168988
ENSG00000000460  2.324903 2.023533 1.580465 1.461635  2.344190 2.124699
ENSG00000000938  5.388459 8.083442 4.965147 4.223500  5.204433 7.160345
ENSG00000000971 11.387090 6.246512 5.606820 4.941061 11.420166 5.990777
                     C4H      C4K      C4Li     C4Lu      H1K      H1Li
ENSG00000000003 4.235754 6.503717  8.453727 5.430223 6.864660  6.576082
ENSG00000000419 5.785414 5.257938  5.881536 5.321782 5.588152  6.082997
ENSG00000000457 4.645293 5.023223  6.597499 5.263806 4.285007  4.953825
ENSG00000000460 1.456629 1.826787  2.206829 2.476664 2.766766  4.989335
ENSG00000000938 3.638952 3.621239  4.580376 7.717763 4.059344  4.479943
ENSG00000000971 6.845219 5.957838 11.330910 6.421417 6.585546 11.216641
                    H1Lu      H2H      H2K     H2Li     H2Lu         H3H
ENSG00000000003 5.099004 3.681088 7.205567 6.638944 4.181104  3.54360583
ENSG00000000419 5.810855 5.606326 5.461678 5.838444 5.313450  5.75090978
ENSG00000000457 4.502116 3.406682 4.158467 4.450840 4.201852  4.44063190
ENSG00000000460 3.318021 1.892216 1.978501 2.657920 2.553081 -0.07576077
ENSG00000000938 7.878166 5.560041 3.740312 5.990299 6.968892  4.09684184
ENSG00000000971 7.561408 6.363288 4.736443 9.409472 7.310814  6.22507411
                     H3K      H3Li     H3Lu      H4H      H4K      H4Li
ENSG00000000003 7.091569  7.735945 5.290097 4.284175 6.371782  6.590115
ENSG00000000419 5.854940  6.216818 5.009845 6.244527 5.608088  5.834153
ENSG00000000457 4.722790  4.993719 3.999170 3.312369 4.087480  5.176119
ENSG00000000460 2.990603  3.237751 2.457261 1.629959 1.983141  3.137731
ENSG00000000938 2.643134  5.741066 6.912746 4.918491 3.820852  6.899117
ENSG00000000971 5.313255 10.346500 7.124250 6.927089 6.032612 10.197598
                    H4Lu      R1H      R1K       R1Li     R1Lu      R2H
ENSG00000000003 4.463456 4.356369 6.932932  8.3343252 5.915547 4.625348
ENSG00000000419 5.350423 5.464082 5.391834  5.8259430 4.887057 5.295517
ENSG00000000457 4.173647 4.285211 5.024853  5.1435415 4.837373 4.311664
ENSG00000000460 2.604439 1.284359 1.555258 -0.1735364 2.480549 1.415808
ENSG00000000938 8.249002 1.837210 2.564210  3.8041639 6.515323 2.327061
ENSG00000000971 8.277236 4.027912 6.576019 12.1322643 7.445976 5.571685
                     R2K      R2Li     R2Lu      R3H      R3K      R3Li
ENSG00000000003 7.134183  8.640291 5.654663 4.469039 7.166047  8.202424
ENSG00000000419 5.043831  5.723120 4.881450 5.462958 5.367362  5.990303
ENSG00000000457 5.154832  5.511121 5.265617 4.241449 5.139792  5.359535
ENSG00000000460 1.320003  1.496941 2.394895 1.656614 1.874414  1.413727
ENSG00000000938 2.681748  3.672247 6.583375 2.870792 2.267214  3.636318
ENSG00000000971 6.778700 11.778253 7.231996 5.150488 6.054215 12.087241
                    R3Lu      R4H      R4K      R4Li     R4Lu
ENSG00000000003 5.453490 4.892515 7.094406  7.705335 5.361237
ENSG00000000419 4.956114 5.427374 5.215706  5.464168 5.041757
ENSG00000000457 5.165431 4.509684 5.144266  5.318210 4.828352
ENSG00000000460 2.577251 1.088179 1.509418  1.534185 2.847559
ENSG00000000938 6.845883 2.474777 2.310571  4.509304 6.834845
ENSG00000000971 7.258258 6.408881 6.469038 11.695479 7.298202
hist(cpm_in_cutoff, xlab = "Log2(CPM)", main = "Log2(CPM) values for genes meeting the filtering criteria", breaks = 100 )

# Voom with individual as a random variable

cpm.voom.cyclic <- voom(dge_in_cutoff, design, normalize.method="cyclicloess", plot=T)

#corfit <- duplicateCorrelation(cpm.voom.cyclic, design, block=samples$Individual)
corfit.consensus <- 0.2197275

# Final voom on filtered data

cpm.voom.cyclic <- voom(dge_in_cutoff, design, normalize.method="cyclicloess", plot=TRUE, block=samples$Individual, correlation=corfit.consensus)

Fit the linear model

fit.cyclic.norm <- lmFit(cpm.voom.cyclic, design, plot = TRUE, block=samples$Individual, correlation=corfit.consensus)
fit.cyclic.norm <- eBayes(fit.cyclic.norm)


## - Potential caveat: variances could be different between human, chimp and rhesus (see Gordon Smyth email, 7 June 2013).                                                               
##  We look at the standard error for each condition                                                    
hist(fit.cyclic.norm$stdev.unscaled * fit.cyclic.norm$sigma, breaks=100)

hist(log2(fit.cyclic.norm$stdev.unscaled * fit.cyclic.norm$sigma), breaks=100)

boxplot(log2(fit.cyclic.norm$stdev.unscaled * fit.cyclic.norm$sigma))

## This seems to be pretty comparable between conditions. The human heart is higher, probably because of H1H missing and H3H with a bit strange behavior                                     
stderror <- log2(fit.cyclic.norm$stdev.unscaled * fit.cyclic.norm$sigma)
boxplot(list(stderror[,1:4], stderror[,5:8], stderror[,9:12]))

## A bit higher for human, and a bit lower for rhesus                                                                                                                                    
boxplot(list(stderror[,2:4], stderror[,6:8], stderror[,8:12])) ## excluding heart samples  

# In the contrast matrix, we have many comparisons for species and tissues individually
# Note: baseline is chimp heart

cm1 <- makeContrasts(H_K_inter_CH = H_by_K, 
                     R_K_inter_CH = R_by_K, 
                     H_Li_inter_CH = H_by_Li, 
                     R_Li_inter_CH = R_by_Li,  
                     H_Lu_inter_CH = H_by_Lu, 
                     R_Lu_inter_CH = R_by_Lu,
                     levels = design)

# Implement contrasts

contrasts_each_species <- contrasts.fit(fit.cyclic.norm, cm1)
fit1 <- eBayes(contrasts_each_species)

top3 <- list(H_K_inter =topTable(fit1, coef=1, adjust="BH", number=Inf, sort.by="none"), 
             R_K_inter =topTable(fit1, coef=2, adjust="BH", number=Inf, sort.by="none"),  
             H_Li_inter =topTable(fit1, coef=3, adjust="BH", number=Inf, sort.by="none"),  
             R_Li_inter =topTable(fit1, coef=4, adjust="BH", number=Inf, sort.by="none"), 
             
             H_Lu_inter =topTable(fit1, coef=5, adjust="BH", number=Inf, sort.by="none"), 
             R_Lu_inter =topTable(fit1, coef=6, adjust="BH", number=Inf, sort.by="none") )


# Set FDR level at 1% 

FDR_level <- 0.01

## Significant interactions in Humans (baseline = chimp hearts)

mylist <- list()
mylist[["Kidney"]] <- row.names(top3[[names(top3)[1]]])[top3[[names(top3)[1]]]$adj.P.Val < FDR_level]
mylist[["Liver"]] <-  row.names(top3[[names(top3)[3]]])[top3[[names(top3)[3]]]$adj.P.Val < FDR_level]
mylist[["Lung"]] <- row.names(top3[[names(top3)[5]]])[top3[[names(top3)[5]]]$adj.P.Val < FDR_level]


# Make 
#dev.off()
Four_comp <- venn.diagram(mylist, filename= NULL, main="Significant interactions in Humans (FDR 1%)", cex=1.5 , fill = pal[1:3], lty=1, height=2000, width=3000)
grid.draw(Four_comp)

## Significant interactions in Rhesus (baseline = chimp hearts)

mylist <- list()
mylist[["Kidney"]] <- row.names(top3[[names(top3)[2]]])[top3[[names(top3)[2]]]$adj.P.Val < FDR_level]
mylist[["Liver"]] <-  row.names(top3[[names(top3)[4]]])[top3[[names(top3)[4]]]$adj.P.Val < FDR_level]
mylist[["Lung"]] <- row.names(top3[[names(top3)[6]]])[top3[[names(top3)[6]]]$adj.P.Val < FDR_level]


# Make 
dev.off()
null device 
          1 
Four_comp <- venn.diagram(mylist, filename= NULL, main="Significant interactions in Rhesus (FDR 1%)", cex=1.5 , fill = pal[1:3], lty=1, height=2000, width=3000)
grid.draw(Four_comp)
# Perform multiple testing correction with adaptive shrinkage (ASH) 
 #
 # x - object MArrayLM from eBayes output
 # coef - coefficient tested by eBayes

run_ash <- function(x, coef){
  #stopifnot(class(x) == "MArrayLM", coef %in% colnames(x$coefficients),
  #             length(unique(x$df.total) == 1))
  result <- ash(betahat = x$coefficients[, coef], sebetahat = x$stdev.unscaled[, coef] * sqrt(x$s2.post), df = x$df.total[1])
  return(result)
}

get_results <- function(x, number = nrow(x$coefficients), sort.by = "none",
                        ...) {
  # x - object MArrayLM from eBayes output
  # ... - additional arguments passed to topTable
  stopifnot(class(x) == "MArrayLM")
  results <- topTable(x, number = number, sort.by = sort.by, ...)
  return(results)
}



# Prepare the data for the ASH
tests <- colnames(fit1$coefficients)
results <- vector(length = length(tests), mode = "list")
names(results) <- tests

# Get lfsr, lfdr, s value, q value, and a beta_est value. 
for (test in tests) {
  # Extract limma results
  results[[test]] <- get_results(fit1, coef = test)
  # Add mutliple testing correction with ASH
  output_ash <- run_ash(fit1, coef = test)
  results[[test]] <- cbind(results[[test]], sebetahat = output_ash$data$s, lfsr = output_ash$result$lfsr,
                           lfdr = output_ash$result$lfdr, qvalue = output_ash$result$qvalue,
                           svalue = output_ash$result$svalue, beta_est = output_ash$result$PosteriorMean, se_est =
                             output_ash$result$PosteriorSD)
}

# Save results from analysis with limma and ash.
#saveRDS(results, file.path(data_dir, #"results-limma-voom-ash-interactions.rds"))

GO Enrichment for revisions

FDR_level <- 0.05

human_kidney_inter <- top3$H_K_inter
human_liver_inter <- top3$H_Li_inter
human_lung_inter <- top3$H_Lu_inter

rhesus_kidney_inter <- top3$R_K_inter
rhesus_liver_inter <- top3$R_Li_inter
rhesus_lung_inter <- top3$R_Lu_inter

# Find human_kidney specific
human_kidney_only <- human_kidney_inter[which(human_kidney_inter$adj.P.Val < FDR_level & human_liver_inter$adj.P.Val > FDR_level & human_lung_inter$adj.P.Val > FDR_level & rhesus_kidney_inter$adj.P.Val > FDR_level & rhesus_liver_inter$adj.P.Val > FDR_level & rhesus_lung_inter$adj.P.Val > FDR_level), 1]                                   

human_kidney_inter_only <- human_kidney_inter$genes %in% human_kidney_only

Human liver

# Find human_kidney specific
human_liver_only <- human_liver_inter[which(human_kidney_inter$adj.P.Val > FDR_level & human_liver_inter$adj.P.Val < FDR_level & human_lung_inter$adj.P.Val > FDR_level & rhesus_kidney_inter$adj.P.Val > FDR_level & rhesus_liver_inter$adj.P.Val > FDR_level & rhesus_lung_inter$adj.P.Val > FDR_level), 1]                                   

human_liver_inter_only <- human_liver_inter$genes %in% human_liver_only

# Revisions- run GO
# Merge ENSG with true/false

test_gene <- as.numeric(as.vector(human_liver_inter_only))
names(test_gene) <- human_liver_inter$genes


# Run topGO
go_data <- new("topGOdata",
                   ontology = "BP",
                   allGenes = test_gene, 
                    geneSel = function(allScore){
    return(allScore > 0)
},
                   nodeSize = 5,
                   annotationFun = annFUN.org,
                   mapping = "org.Hs.eg.db",
                   ID = "ensembl")

Building most specific GOs .....
    ( 10269 GO terms found. )

Build GO DAG topology ..........
    ( 14352 GO terms and 33417 relations. )

Annotating nodes ...............
    ( 10194 genes annotated to the GO terms. )
# Perform enrichment test
go_test <- runTest(go_data, algorithm = "weight01", statistic = "fisher")

             -- Weight01 Algorithm -- 

         the algorithm is scoring 3382 nontrivial nodes
         parameters: 
             test statistic: fisher

     Level 16:  8 nodes to be scored    (0 eliminated genes)

     Level 15:  16 nodes to be scored   (0 eliminated genes)

     Level 14:  45 nodes to be scored   (139 eliminated genes)

     Level 13:  92 nodes to be scored   (333 eliminated genes)

     Level 12:  134 nodes to be scored  (991 eliminated genes)

     Level 11:  234 nodes to be scored  (2674 eliminated genes)

     Level 10:  334 nodes to be scored  (3782 eliminated genes)

     Level 9:   421 nodes to be scored  (5340 eliminated genes)

     Level 8:   469 nodes to be scored  (6621 eliminated genes)

     Level 7:   537 nodes to be scored  (7870 eliminated genes)

     Level 6:   477 nodes to be scored  (8820 eliminated genes)

     Level 5:   331 nodes to be scored  (9344 eliminated genes)

     Level 4:   180 nodes to be scored  (9691 eliminated genes)

     Level 3:   83 nodes to be scored   (9882 eliminated genes)

     Level 2:   20 nodes to be scored   (9976 eliminated genes)

     Level 1:   1 nodes to be scored    (10067 eliminated genes)
go_table <- GenTable(go_data, weightFisher = go_test,
                         orderBy = "weightFisher", ranksOf = "weightFisher",
                         topNodes = sum(score(go_test) < .01))

go_table
        GO.ID                                        Term Annotated
1  GO:0019388                 galactose catabolic process         7
2  GO:0007281                       germ cell development       116
3  GO:0005978               glycogen biosynthetic process        30
4  GO:0000185               activation of MAPKKK activity        15
5  GO:0090382                        phagosome maturation        30
6  GO:0031659 positive regulation of cyclin-dependent ...         5
7  GO:0051896    regulation of protein kinase B signaling       130
8  GO:0072655 establishment of protein localization to...       142
9  GO:0002480 antigen processing and presentation of e...         6
10 GO:0002138          retinoic acid biosynthetic process         6
11 GO:0019919 peptidyl-arginine methylation, to asymme...         6
12 GO:0060050 positive regulation of protein glycosyla...         7
13 GO:0035646            endosome to melanosome transport         7
   Significant Expected weightFisher
1            3     0.15      0.00029
2            7     2.41      0.00081
3            4     0.62      0.00216
4            3     0.31      0.00335
5            4     0.62      0.00407
6            2     0.10      0.00413
7            5     2.70      0.00609
8            5     2.95      0.00610
9            2     0.12      0.00611
10           2     0.12      0.00611
11           2     0.12      0.00611
12           2     0.15      0.00844
13           2     0.15      0.00844
# Get names of liver genes
sig.genes <- sigGenes(go_data)
goresults <- sapply(go_table$GO.ID, function(x)
    {
      genes<-genesInTerm(go_data, x) 
      genes[[1]][genes[[1]] %in% sig.genes]
    })

goresults["GO:0005978"] 
$`GO:0005978`
[1] "ENSG00000079739" "ENSG00000114480" "ENSG00000132326" "ENSG00000142208"
# Gene to ID conversion document

gene_id <- read.table("../../../Reg_Evo_Primates/data/ENSG_GENE_HG19.csv", stringsAsFactors = FALSE, header=TRUE, sep = ",")

human_liver_only_data <- as.data.frame(c("ENSG00000079739", "ENSG00000114480", "ENSG00000132326", "ENSG00000142208"))
colnames(human_liver_only_data) <- c("ensg")

# Get gene names of the background
comb_liver <- merge(human_liver_only_data, gene_id, by = c("ensg"))

# glycogen biosynthetic proces

comb_liver
             ensg Gene
1 ENSG00000079739 PGM1
2 ENSG00000114480 GBE1
3 ENSG00000132326 PER2
4 ENSG00000142208 AKT1

Human lung

# Find human_kidney specific
human_lung_only <- human_lung_inter[which(human_lung_inter$adj.P.Val < FDR_level & human_liver_inter$adj.P.Val > FDR_level & human_kidney_inter$adj.P.Val > FDR_level & rhesus_kidney_inter$adj.P.Val > FDR_level & rhesus_liver_inter$adj.P.Val > FDR_level & rhesus_lung_inter$adj.P.Val > FDR_level), 1]                                   

human_lung_inter_only <- human_lung_inter$genes %in% human_lung_only

# Revisions- run GO
# Merge ENSG with true/false

test_gene <- as.numeric(as.vector(human_lung_inter_only))
names(test_gene) <- human_lung_inter$genes


# Run topGO
go_data <- new("topGOdata",
                   ontology = "BP",
                   allGenes = test_gene, 
                    geneSel = function(allScore){
    return(allScore > 0)
},
                   nodeSize = 5,
                   annotationFun = annFUN.org,
                   mapping = "org.Hs.eg.db",
                   ID = "ensembl")

Building most specific GOs .....
    ( 10269 GO terms found. )

Build GO DAG topology ..........
    ( 14352 GO terms and 33417 relations. )

Annotating nodes ...............
    ( 10194 genes annotated to the GO terms. )
# Perform enrichment test
go_test <- runTest(go_data, algorithm = "weight01", statistic = "fisher")

             -- Weight01 Algorithm -- 

         the algorithm is scoring 4276 nontrivial nodes
         parameters: 
             test statistic: fisher

     Level 19:  1 nodes to be scored    (0 eliminated genes)

     Level 18:  3 nodes to be scored    (0 eliminated genes)

     Level 17:  7 nodes to be scored    (17 eliminated genes)

     Level 16:  7 nodes to be scored    (28 eliminated genes)

     Level 15:  28 nodes to be scored   (54 eliminated genes)

     Level 14:  70 nodes to be scored   (90 eliminated genes)

     Level 13:  123 nodes to be scored  (472 eliminated genes)

     Level 12:  203 nodes to be scored  (1323 eliminated genes)

     Level 11:  325 nodes to be scored  (2905 eliminated genes)

     Level 10:  461 nodes to be scored  (4120 eliminated genes)

     Level 9:   561 nodes to be scored  (5501 eliminated genes)

     Level 8:   596 nodes to be scored  (6960 eliminated genes)

     Level 7:   635 nodes to be scored  (8013 eliminated genes)

     Level 6:   566 nodes to be scored  (8914 eliminated genes)

     Level 5:   370 nodes to be scored  (9398 eliminated genes)

     Level 4:   210 nodes to be scored  (9707 eliminated genes)

     Level 3:   89 nodes to be scored   (9896 eliminated genes)

     Level 2:   20 nodes to be scored   (9976 eliminated genes)

     Level 1:   1 nodes to be scored    (10068 eliminated genes)
go_table <- GenTable(go_data, weightFisher = go_test,
                         orderBy = "weightFisher", ranksOf = "weightFisher",
                         topNodes = sum(score(go_test) < .01))

go_table
        GO.ID                                        Term Annotated
1  GO:0006919 activation of cysteine-type endopeptidas...        62
2  GO:0032736 positive regulation of interleukin-13 pr...         9
3  GO:0031053                    primary miRNA processing         9
4  GO:0002726 positive regulation of T cell cytokine p...        10
5  GO:0006325                      chromatin organization       522
6  GO:0032753 positive regulation of interleukin-4 pro...        11
7  GO:0007183               SMAD protein complex assembly        11
8  GO:0001657                    ureteric bud development        61
9  GO:0034380 high-density lipoprotein particle assemb...        13
10 GO:0030878                   thyroid gland development        13
11 GO:0009615                           response to virus       206
12 GO:0035019    somatic stem cell population maintenance        43
13 GO:0044782                         cilium organization       262
14 GO:0051098                       regulation of binding       256
15 GO:0003161       cardiac conduction system development         5
16 GO:0051534 negative regulation of NFAT protein impo...         5
17 GO:0038092                     nodal signaling pathway         5
18 GO:0035745         T-helper 2 cell cytokine production         5
19 GO:0048340             paraxial mesoderm morphogenesis         5
20 GO:0045494              photoreceptor cell maintenance        16
21 GO:2000810 regulation of bicellular tight junction ...        16
22 GO:0045599 negative regulation of fat cell differen...        30
23 GO:0022008                                neurogenesis       922
   Significant Expected weightFisher
1            8     1.74      0.00072
2            3     0.25      0.00162
3            3     0.25      0.00162
4            3     0.28      0.00227
5           23    14.65      0.00287
6            3     0.31      0.00305
7            3     0.31      0.00305
8            5     1.71      0.00482
9            3     0.36      0.00507
10           3     0.36      0.00507
11          11     5.78      0.00598
12           5     1.21      0.00675
13          12     7.35      0.00727
14          11     7.18      0.00731
15           2     0.14      0.00742
16           2     0.14      0.00742
17           2     0.14      0.00742
18           2     0.14      0.00742
19           2     0.14      0.00742
20           3     0.45      0.00933
21           3     0.45      0.00933
22           4     0.84      0.00936
23          27    25.87      0.00946
sig.genes <- sigGenes(go_data)

goresults["GO:0048340"] 
$<NA>
NULL
goresults["GO:0032736"] 
$<NA>
NULL
goresults["GO:0032753"] 
$<NA>
NULL
# Gene to ID conversion document

#gene_id <- read.table("../../../Reg_Evo_Primates/data/ENSG_GENE_HG19.csv", stringsAsFactors = FALSE, header=TRUE, sep = ",")

#human_lung_only_data <- as.data.frame(c("ENSG00000105698", "ENSG00000131165"))
#colnames(human_liver_only_data) <- c("ensg")

# Get gene names of the background
#comb_lung <- merge(human_lung_only_data, gene_id, by = c("ensg"))

#positive regulation of lipoprotein lipas...

#comb_lung

Human kidney

# Find human_kidney specific
human_kidney_only <- human_kidney_inter[which(human_kidney_inter$adj.P.Val < FDR_level & human_liver_inter$adj.P.Val > FDR_level & human_lung_inter$adj.P.Val > FDR_level & rhesus_kidney_inter$adj.P.Val > FDR_level & rhesus_liver_inter$adj.P.Val > FDR_level & rhesus_lung_inter$adj.P.Val > FDR_level), 1]                                   

human_kidney_inter_only <- human_kidney_inter$genes %in% human_kidney_only

# Revisions- run GO
# Merge ENSG with true/false

test_gene <- as.numeric(as.vector(human_kidney_inter_only))
names(test_gene) <- human_kidney_inter$genes


# Run topGO
go_data <- new("topGOdata",
                   ontology = "BP",
                   allGenes = test_gene, 
                    geneSel = function(allScore){
    return(allScore > 0)
},
                   nodeSize = 5,
                   annotationFun = annFUN.org,
                   mapping = "org.Hs.eg.db",
                   ID = "ensembl")

Building most specific GOs .....
    ( 10269 GO terms found. )

Build GO DAG topology ..........
    ( 14352 GO terms and 33417 relations. )

Annotating nodes ...............
    ( 10194 genes annotated to the GO terms. )
# Perform enrichment test
go_test <- runTest(go_data, algorithm = "weight01", statistic = "fisher")

             -- Weight01 Algorithm -- 

         the algorithm is scoring 3511 nontrivial nodes
         parameters: 
             test statistic: fisher

     Level 19:  1 nodes to be scored    (0 eliminated genes)

     Level 18:  1 nodes to be scored    (0 eliminated genes)

     Level 17:  1 nodes to be scored    (17 eliminated genes)

     Level 16:  5 nodes to be scored    (21 eliminated genes)

     Level 15:  27 nodes to be scored   (21 eliminated genes)

     Level 14:  49 nodes to be scored   (80 eliminated genes)

     Level 13:  86 nodes to be scored   (448 eliminated genes)

     Level 12:  141 nodes to be scored  (1110 eliminated genes)

     Level 11:  226 nodes to be scored  (2671 eliminated genes)

     Level 10:  344 nodes to be scored  (3832 eliminated genes)

     Level 9:   435 nodes to be scored  (5359 eliminated genes)

     Level 8:   483 nodes to be scored  (6858 eliminated genes)

     Level 7:   559 nodes to be scored  (7896 eliminated genes)

     Level 6:   505 nodes to be scored  (8843 eliminated genes)

     Level 5:   349 nodes to be scored  (9391 eliminated genes)

     Level 4:   192 nodes to be scored  (9696 eliminated genes)

     Level 3:   87 nodes to be scored   (9877 eliminated genes)

     Level 2:   19 nodes to be scored   (9973 eliminated genes)

     Level 1:   1 nodes to be scored    (10068 eliminated genes)
go_table <- GenTable(go_data, weightFisher = go_test,
                         orderBy = "weightFisher", ranksOf = "weightFisher",
                         topNodes = sum(score(go_test) < .01))

go_table
        GO.ID                                        Term Annotated
1  GO:0051451                          myoblast migration         8
2  GO:0045833 negative regulation of lipid metabolic p...        57
3  GO:0001975                     response to amphetamine        12
4  GO:0098719    sodium ion import across plasma membrane         5
5  GO:0043697                      cell dedifferentiation         5
6  GO:0019323                   pentose catabolic process         5
7  GO:0030091                              protein repair         5
8  GO:2000035            regulation of stem cell division         6
9  GO:0032494                   response to peptidoglycan         6
10 GO:1900125 regulation of hyaluronan biosynthetic pr...         6
11 GO:0031115 negative regulation of microtubule polym...         6
12 GO:0060088 auditory receptor cell stereocilium orga...         7
13 GO:2000402 negative regulation of lymphocyte migrat...         7
14 GO:1901223 negative regulation of NIK/NF-kappaB sig...         7
15 GO:0051272 positive regulation of cellular componen...       325
16 GO:0007141                              male meiosis I         8
17 GO:0071447          cellular response to hydroperoxide         8
18 GO:0090280 positive regulation of calcium ion impor...         8
   Significant Expected weightFisher
1            3     0.15      0.00037
2            5     1.10      0.00108
3            3     0.23      0.00137
4            2     0.10      0.00358
5            2     0.10      0.00358
6            2     0.10      0.00358
7            2     0.10      0.00358
8            2     0.12      0.00530
9            2     0.12      0.00530
10           2     0.12      0.00530
11           2     0.12      0.00530
12           2     0.14      0.00732
13           2     0.14      0.00732
14           2     0.14      0.00732
15          10     6.28      0.00945
16           2     0.15      0.00964
17           2     0.15      0.00964
18           2     0.15      0.00964
# Get names of kidney genes
sig.genes <- sigGenes(go_data)
goresults <- sapply(go_table$GO.ID, function(x)
    {
      genes<-genesInTerm(go_data, x) 
      genes[[1]][genes[[1]] %in% sig.genes]
    })

goresults["GO:0098719"] 
$`GO:0098719`
[1] "ENSG00000066230" "ENSG00000130529"
# Gene to ID conversion document

#gene_id <- read.table("../../../Reg_Evo_Primates/data/ENSG_GENE_HG19.csv", stringsAsFactors = FALSE, header=TRUE, sep = ",")

#human_kidney_only_data <- as.data.frame(c("ENSG00000138031", "ENSG00000142875", "ENSG00000173175"))
#colnames(human_kidney_only_data) <- c("ensg")

# Get gene names of the background
#comb_kidney <- merge(human_kidney_only_data, gene_id, by = c("ensg"))

# renal water homeostasis

#comb_kidney