GC_content_normalization

Part 1: Prior to spline fitting
About the GC content of different genes across species
Part 2: Adjust for GC content
Part 3: Assess new counts

Part 1: Prior to spline fitting

# Load libraries 

library("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library("ggplot2")
library("RColorBrewer")
library("scales")
library("edgeR")

## Loading required package: limma

library("R.utils")

## Loading required package: R.oo

## Loading required package: R.methodsS3

## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.

## R.oo v1.20.0 (2016-02-17) successfully loaded. See ?R.oo for help.

## 
## Attaching package: 'R.oo'

## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods

## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save

## R.utils v2.3.0 (2016-04-13) successfully loaded. See ?R.utils for help.

## 
## Attaching package: 'R.utils'

## The following object is masked from 'package:utils':
## 
##     timestamp

## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings

library("plyr")
library("limma")
library("EDASeq")

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:limma':
## 
##     plotMA

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: ShortRead

## Loading required package: BiocParallel

## Loading required package: Biostrings

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:plyr':
## 
##     rename

## The following object is masked from 'package:gplots':
## 
##     space

## The following objects are masked from 'package:base':
## 
##     colMeans, colSums, expand.grid, rowMeans, rowSums

## Loading required package: IRanges

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:plyr':
## 
##     desc

## The following object is masked from 'package:R.oo':
## 
##     trim

## Loading required package: XVector

## 
## Attaching package: 'XVector'

## The following object is masked from 'package:plyr':
## 
##     compact

## Loading required package: Rsamtools

## Loading required package: GenomeInfoDb

## Loading required package: GenomicRanges

## Loading required package: GenomicAlignments

## Loading required package: SummarizedExperiment

## 
## Attaching package: 'ShortRead'

## The following object is masked from 'package:plyr':
## 
##     id

## The following object is masked from 'package:R.utils':
## 
##     countLines

source("~/Reg_Evo_Primates/ashlar-trial/analysis/functions.R")


# Load the raw counts

counts_genes <- read.delim("~/Reg_Evo_Primates/ashlar-trial/data/counts_genes.txt")

dim(counts_genes)

## [1] 30030    48

# Load Sample information

samples <- read.csv("~/Reg_Evo_Primates/ashlar-trial/data/Sample_info_RNAseq.csv")

labels <- paste(samples$Species, samples$Tissue, sep=" ")

# We previously determined with SNP calling (via GATK) that the sample Human 1 Heart is Human 1 but most likely a Human 1 Liver. Therefore, we will remove H1H

counts_genes <- counts_genes[,-17]
samples <- samples[-17,]
labels <- paste(samples$Species, samples$Tissue, sep=" ")

# Eliminate the lowly expressed genes (see Filtering analysis for more information)

  # Find the TMM
dge_original <- DGEList(counts=as.matrix(counts_genes), genes=rownames(counts_genes), group = as.character(t(labels)))
dge_original <- calcNormFactors(dge_original)
tmm_cpm <- cpm(dge_original, normalized.lib.sizes=TRUE, log=TRUE, prior.count = 0.25)
head(tmm_cpm)

##                      C1H       C1K      C1Li      C1Lu       C2H       C2K
## ENSG00000000003 4.577905  6.458067  8.266553  5.435440 4.6399682  6.022339
## ENSG00000000005 2.426523 -1.569756 -1.405992 -3.331480 0.6181423 -1.333290
## ENSG00000000419 5.850838  5.191558  5.943281  5.432423 5.6343469  5.046183
## ENSG00000000457 4.568934  5.188318  5.908310  4.926437 4.7873635  5.235192
## ENSG00000000460 1.515520  1.843447  2.085975  2.262878 1.6139287  1.914064
## ENSG00000000938 5.620597  3.793194  5.096963  7.504597 2.4864784  4.123918
##                      C2Li      C2Lu       C3H         C3K      C3Li
## ENSG00000000003  8.007975  4.528713  4.906720  6.35989436  7.721755
## ENSG00000000005 -2.063745 -2.063306 -1.325874 -0.07344781 -3.796207
## ENSG00000000419  5.791946  5.164072  5.667325  5.13300315  6.351074
## ENSG00000000457  6.523773  4.950139  4.609998  5.15783261  6.435445
## ENSG00000000460  2.303378  1.987747  1.571729  1.41524770  2.281625
## ENSG00000000938  5.366960  8.047660  4.956491  4.17708758  5.141829
##                      C3Lu      C4H       C4K      C4Li      C4Lu      H1K
## ENSG00000000003  5.836125 4.210680  6.456261  8.378765  5.372745 6.815098
## ENSG00000000005 -3.811821 3.822937 -1.523086 -3.673311 -2.521990 4.863477
## ENSG00000000419  5.556851 5.760344  5.210484  5.806579  5.264305 5.538591
## ENSG00000000457  5.129130 4.620220  4.975769  6.522540  5.206329 4.235449
## ENSG00000000460  2.084845 1.431520  1.779357  2.131942  2.419215 2.717219
## ENSG00000000938  7.120486 3.613875  3.573790  4.505428  7.660281 4.009787
##                      H1Li      H1Lu       H2H       H2K      H2Li
## ENSG00000000003  6.556565  5.017756  3.715237 7.1367653  6.628779
## ENSG00000000005 -6.803449 -2.236364 -3.893541 0.3983141 -6.803449
## ENSG00000000419  6.063479  5.729603  5.640512 5.3928808  5.828277
## ENSG00000000457  4.934305  4.420875  3.440821 4.0896792  4.440666
## ENSG00000000460  4.969815  3.236803  1.926242 1.9097705  2.647717
## ENSG00000000938  4.460420  7.796908  5.594227 3.6715300  5.980133
##                      H2Lu        H3H        H3K      H3Li      H3Lu
## ENSG00000000003  4.134172  3.4924818 7.02915140  7.656852  5.192738
## ENSG00000000005 -6.803449 -6.8034489 0.01948416 -6.803449 -6.803449
## ENSG00000000419  5.266516  5.6997772 5.79252484  6.137729  4.912490
## ENSG00000000457  4.154920  4.3895028 4.66037953  4.914637  3.901832
## ENSG00000000460  2.506159 -0.1267628 2.92821223  3.158698  2.359990
## ENSG00000000938  6.921957  4.0457143 2.58075164  5.661979  6.815378
##                       H4H        H4K      H4Li      H4Lu       R1H
## ENSG00000000003  4.309449  6.3278935  6.502999  4.441313  4.287983
## ENSG00000000005 -2.875477 -0.3994824 -6.803449 -4.372005 -6.803449
## ENSG00000000419  6.269823  5.5642000  5.747041  5.328283  5.395688
## ENSG00000000457  3.337615  4.0435944  5.089012  4.151503  4.216825
## ENSG00000000460  1.655079  1.9392676  3.050663  2.582280  1.216076
## ENSG00000000938  4.943776  3.7769670  6.812001  8.226866  1.768889
##                       R1K       R1Li      R1Lu       R2H       R2K
## ENSG00000000003  6.876427  8.2484842  5.873253  4.597962  7.090748
## ENSG00000000005 -6.803449 -3.5714845 -2.937362 -2.940118 -2.382257
## ENSG00000000419  5.335332  5.7401085  4.844764  5.268132  5.000397
## ENSG00000000457  4.968352  5.0577118  4.795080  4.284276  5.111398
## ENSG00000000460  1.498811 -0.2588692  2.438262  1.388391  1.276589
## ENSG00000000938  2.507734  3.7183538  6.473029  2.299659  2.638321
##                      R2Li      R2Lu       R3H       R3K      R3Li
## ENSG00000000003  8.600746  5.606713  4.457832  7.128831  8.170499
## ENSG00000000005 -1.244301 -4.910409 -4.240968 -4.509770 -3.147767
## ENSG00000000419  5.683575  4.833502  5.451756  5.330145  5.958378
## ENSG00000000457  5.471576  5.217667  4.230240  5.102576  5.327609
## ENSG00000000460  1.457402  2.346962  1.645341  1.837197  1.381784
## ENSG00000000938  3.632703  6.535424  2.859562  2.229997  3.604390
##                      R3Lu       R4H       R4K      R4Li      R4Lu
## ENSG00000000003  5.454412  4.833487  7.058675  7.690207  5.357887
## ENSG00000000005 -3.797088 -2.525255 -2.089636 -4.580657 -6.803449
## ENSG00000000419  4.957033  5.368345  5.179974  5.449035  5.038406
## ENSG00000000457  5.166351  4.450659  5.108534  5.303078  4.824999
## ENSG00000000460  2.578123  1.029239  1.473680  1.518987  2.844173
## ENSG00000000938  6.846810  2.415779  2.274836  4.494167  6.831500

# Assign all 12 tissue-species pairs
chimp_hearts <- c(1, 5, 9, 13)
chimp_kidneys <- c(2,6,10,14)
chimp_livers <- c(3,7,11,15)
chimp_lungs <- c(4,8,12,16)

human_hearts <- c(20,24,28)
human_kidneys <- c(17,21,25,29)
human_livers <- c(18,22,26,30)
human_lungs <- c(19,23,27,31)

rhesus_hearts <- c(32,36,40,44)
rhesus_kidneys <- c(33,37,41,45)
rhesus_livers <- c(34,38,42,46)
rhesus_lungs <- c(35,39,43,47)

# Save the counts
                          # Raw counts = log2(counts_genes)
                          # CPM = tmm_cpm
counts_genes_filtered_3 <- tmm_cpm

# Put the number of samples that you want to have for every tissue-species pair (count/4 samples)

count = 2
threshold = -5.5

counts_genes_filtered_3A <- counts_genes_filtered_3[rowSums(counts_genes_filtered_3[, chimp_hearts] >= threshold) >= count, ] 

#dim(counts_genes_filtered_3A)

counts_genes_filtered_3B <- counts_genes_filtered_3A[rowSums(counts_genes_filtered_3A[, chimp_kidneys] >= threshold) >= count, ] 

dim(counts_genes_filtered_3B)

## [1] 20720    47

counts_genes_filtered_3C <- counts_genes_filtered_3B[rowSums(counts_genes_filtered_3B[, chimp_livers] >= threshold) >= count, ] 

dim(counts_genes_filtered_3C)

## [1] 19678    47

counts_genes_filtered_3D <- counts_genes_filtered_3C[rowSums(counts_genes_filtered_3C[, chimp_lungs] >= threshold) >= count, ] 

dim(counts_genes_filtered_3D)

## [1] 19514    47

counts_genes_filtered_3E <- counts_genes_filtered_3D[rowSums(counts_genes_filtered_3D[, human_hearts] >= threshold) >= count, ] 

dim(counts_genes_filtered_3E)

## [1] 18006    47

counts_genes_filtered_3F <- counts_genes_filtered_3E[rowSums(counts_genes_filtered_3E[, human_kidneys] >= threshold) >= count, ] 

dim(counts_genes_filtered_3F)

## [1] 17914    47

counts_genes_filtered_3G <- counts_genes_filtered_3F[rowSums(counts_genes_filtered_3F[, human_livers] >= threshold) >= count, ] 

dim(counts_genes_filtered_3G)

## [1] 17721    47

counts_genes_filtered_3H <- counts_genes_filtered_3G[rowSums(counts_genes_filtered_3G[, human_lungs] >= threshold) >= count, ] 

dim(counts_genes_filtered_3H)

## [1] 17691    47

counts_genes_filtered_3I <- counts_genes_filtered_3H[rowSums(counts_genes_filtered_3H[, rhesus_hearts] >= threshold) >= count, ] 

dim(counts_genes_filtered_3I)

## [1] 17178    47

counts_genes_filtered_3J <- counts_genes_filtered_3I[rowSums(counts_genes_filtered_3I[, rhesus_kidneys] >= threshold) >= count, ] 

dim(counts_genes_filtered_3J)

## [1] 17073    47

counts_genes_filtered_3K <- counts_genes_filtered_3J[rowSums(counts_genes_filtered_3J[, rhesus_livers] >= threshold) >= count, ] 

dim(counts_genes_filtered_3K)

## [1] 16856    47

counts_genes_filtered_3L <- counts_genes_filtered_3K[rowSums(counts_genes_filtered_3K[, rhesus_lungs] >= threshold) >= count, ] 

dim(counts_genes_filtered_3L)

## [1] 16838    47

Find GC content for all of the filtered genes

  # Load the GC content and gene length data

gccontent.genes <- read.table("~/Reg_Evo_Primates/ashlar-trial/data/GC_content.genes.txt", sep="\t", h=T, row.names=1)
gccontent.genes <- gccontent.genes[order(row.names(gccontent.genes)), ]

lengths <- read.table("~/Reg_Evo_Primates/ashlar-trial/data/lengths.txt", sep="\t", h=T)
lengths.genes <- aggregate(lengths[,3:5], list(lengths[,1]), sum)

## Test that row names are identical to dge- we are expecting all true                                                                                                             

summary(row.names(dge_original) ==  lengths.genes[,1])

##    Mode    TRUE    NA's 
## logical   30030       0

summary(row.names(dge_original) ==  row.names(gccontent.genes))

##    Mode    TRUE    NA's 
## logical   30030       0

About the GC content of different genes across species

## include everything in single EDASeq object                                                                                                                                                        
feature <- data.frame(gcHuman = gccontent.genes$humanGC, lengthHuman = lengths.genes$lengthHuman, gcChimp = gccontent.genes$chimpGC, lengthChimp = lengths.genes$lengthChimp, gcRhesus = gccontent.genes$rhesusGC, lengthRhesus = lengths.genes$lengthRhesus)
row.names(feature) <- row.names(dge_original)
pheno =  as.data.frame(samples)
pheno = pheno[,3:4]
row.names(pheno) <- colnames(dge_original$counts)

## Make sure that the row names of phenoData and the column names of counts are the same
summary(row.names(pheno) ==  colnames(dge_original$counts))

##    Mode    TRUE    NA's 
## logical      47       0

## Make and display object
EDAdata <- newSeqExpressionSet(counts = as.matrix(dge_original$counts), featureData = feature, phenoData = pheno)
head(counts(EDAdata))

##                  C1H  C1K C1Li C1Lu  C2H  C2K C2Li  C2Lu  C3H  C3K C3Li
## ENSG00000000003  645 3217 6687 1436  539 2178 5590  1102  769 2966 3349
## ENSG00000000005  145   12    8    3   33   13    5    11   10   34    1
## ENSG00000000419 1559 1337 1336 1433 1074 1107 1203  1712 1303 1267 1295
## ENSG00000000457  641 1334 1304 1009  597 1262 1998  1476  626 1289 1373
## ENSG00000000460   77  131   92  159   66  126  107   189   76   96   77
## ENSG00000000938 1329  507  743 6027  121  584  896 12637  796  653  560
##                 C3Lu  C4H  C4K C4Li C4Lu  H1K H1Li H1Lu H2H  H2K H2Li H2Lu
## ENSG00000000003 1835  475 2590 4794 1505 4252 2100  956 225 2365 2024  690
## ENSG00000000005    2  363   10    1    6 1099    0    6   1   22    0    0
## ENSG00000000419 1512 1391 1092  806 1396 1755 1492 1566 855  706 1162 1513
## ENSG00000000457 1124  631  928 1324 1341  711  682  632 186  286  444  700
## ENSG00000000460  136   69  101   63  194  248  699  278  65   63  128  223
## ENSG00000000938 4470  314  351  327 7349  608  491 6564 828  214 1291 4767
##                 H3H  H3K H3Li H3Lu  H4H  H4K H4Li  H4Lu R1H  R1K R1Li R1Lu
## ENSG00000000003 124 4810 2918 1427  467 3002 1775  1104 270 2669 4046 1928
## ENSG00000000005   0   37    0    0    3   28    0     2   0    0    1    4
## ENSG00000000419 573 2041 1018 1175 1818 1768 1051  2042 582  917  711  945
## ENSG00000000457 231  931  436  583  238  616  666   903 257  711  443  913
## ENSG00000000460  10  280  129  200   74  143  162   304  32   64   11  178
## ENSG00000000938 182  220  732 4395  725  512 2199 15230  47  129  175 2922
##                 R2H  R2K R2Li R2Lu R3H  R3K R3Li R3Lu R4H  R4K R4Li R4Lu
## ENSG00000000003 399 3727 7517 2005 500 4005 5547 2088 519 4719 6289 1978
## ENSG00000000005   2    5    8    1   1    1    2    3   3    8    1    0
## ENSG00000000419 635  875  995 1173 996 1151 1197 1479 752 1283 1330 1585
## ENSG00000000457 321  945  859 1531 427  983  773 1710 398 1221 1202 1367
## ENSG00000000460  43   66   53  209  71  102   50  284  37   98   87  346
## ENSG00000000938  81  170  240 3817 165  134  234 5482  97  171  686 5494

## subset EDASeq object to genes with GC content available:                                                                                                                                          
EDAdata <- EDAdata[!is.na(fData(EDAdata)$gcHuman),]
EDAdata <- EDAdata[!is.na(fData(EDAdata)$gcChimp),]
EDAdata <- EDAdata[!is.na(fData(EDAdata)$gcRhesus),]

## What is the %GC difference between species. How many genes have a %GC diff > 5%                                                                                                                   
gccontent.filtered <- gccontent.genes[!is.na(gccontent.genes$humanGC) & !is.na(gccontent.genes$chimpGC) & !is.na(gccontent.genes$rhesusGC), ]
percent <- (apply(gccontent.filtered[,1:3], 1, max) - apply(gccontent.filtered[,1:3], 1, min)) / apply(gccontent.filtered[,1:3], 1, min)
hist(percent, breaks = 100)

summary(percent)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.005505 0.010990 0.017590 0.022050 0.506000

summary(percent > 0)

##    Mode   FALSE    TRUE    NA's 
## logical     650   28642       0

summary(percent > 0.05)

##    Mode   FALSE    TRUE    NA's 
## logical   27391    1901       0

## Maybe less confusing measure: what is the difference in GC-content between genes (absolute difference)?                                                                                           
percent <- apply(gccontent.filtered[,1:3], 1, max) - apply(gccontent.filtered[,1:3], 1, min)
hist(percent, breaks = 100)

summary(percent) ## max 20% difference

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000000 0.002635 0.005317 0.007943 0.010260 0.195600

summary(percent > 0.05)

##    Mode   FALSE    TRUE    NA's 
## logical   29194      98       0

The GC content of 1901 genes (6.3%) differ by more than 5% (of the less rich gene) between species.

The GC content of 98 genes have an absolute difference in GC-content of more than 5%.

# Filter the GC content list for genes we don't have GC content on

GC_content_complete <- as.data.frame(gccontent.genes[!is.na(gccontent.genes["humanGC"]),])

GC_content_complete_col <- as.data.frame(GC_content_complete[,1])
rownames(GC_content_complete_col) <- rownames(GC_content_complete)
colnames(GC_content_complete_col) <- c("HumanGC")

# Find the raw counts of the genes that have met the criteria for expression 

inshared_lists = row.names(counts_genes) %in% rownames(counts_genes_filtered_3L) 
inshared_lists_data <- as.data.frame(inshared_lists)
raw_counts_in <- cbind(counts_genes, inshared_lists_data)
raw_counts_in_2_of_4 <- as.data.frame(subset(raw_counts_in, inshared_lists_data == "TRUE"))
raw_counts_in_2_of_4_col <- as.data.frame(raw_counts_in_2_of_4[,1:47])
rownames(raw_counts_in_2_of_4_col) <- rownames(raw_counts_in_2_of_4)

dim(raw_counts_in_2_of_4_col)

## [1] 16838    47

# Find the GC content of the genes that have met the criteria for expression

inshared_lists = row.names(GC_content_complete_col) %in% rownames(raw_counts_in_2_of_4_col)
inshared_lists_data <- as.data.frame(inshared_lists)
gc_content_in <- cbind(GC_content_complete_col, inshared_lists_data)
gc_content_in_2_of_4 <- as.data.frame(subset(gc_content_in, inshared_lists_data == "TRUE"))
gc_content_in_2_of_4_col <- as.data.frame(gc_content_in_2_of_4[,1])
rownames(gc_content_in_2_of_4_col) <- rownames(gc_content_in_2_of_4)
colnames(gc_content_in_2_of_4_col) <- c("HumanGC")

dim(gc_content_in_2_of_4_col)

## [1] 16616     1

# Find the raw counts of the genes that have met the criteria for expression and that have a real-number GC count

inshared_lists = row.names(raw_counts_in_2_of_4_col) %in% rownames(gc_content_in_2_of_4_col) 
inshared_lists_data <- as.data.frame(inshared_lists)
counts_in <- cbind(raw_counts_in_2_of_4_col, inshared_lists_data)
counts_in_2_of_4 <- as.data.frame(subset(counts_in, inshared_lists_data == "TRUE"))
counts_in_2_of_4_col <- as.data.frame(counts_in_2_of_4[,1:47])
rownames(counts_in_2_of_4_col) <- rownames(counts_in_2_of_4)

dim(counts_in_2_of_4_col)

## [1] 16616    47

# Make sure that the row names for the GC content and the raw counts are the same

summary(rownames(counts_in_2_of_4_col) %in% rownames(gc_content_in_2_of_4_col))

##    Mode    TRUE    NA's 
## logical   16616       0

The problem(s):

None of the EDAseq normalizations were deemed to be sufficiently aggressive.
There is a non-linear relationship between GC content and log2(read counts). We can see this illustrated here:

# plot(data[,2],data[,3], xlim = c(0, 20e5), ylim = c(0,8e4), main = "Read counts in individual versus total", ylab = "Read counts in 1 individual", xlab = "Total reads across all individuals") #plot x=GC, y=counts in ind/counts in all inds

# plot(data[,2],data[,3], main = "Read counts in individual versus total", ylab = "Read counts in 1 individual", xlab = "Total reads across all individuals") #plot x=GC, y=counts in ind/counts in all inds


counts <- counts_in_2_of_4_col$CH1
gc_content <- gc_content_in_2_of_4_col$HumanGC

counts <- counts_in_2_of_4_col[1]
gc_content <- gc_content_in_2_of_4_col[1]

plot(t(gc_content), t(counts), ylab = "Counts", xlab = "GC content", main = "Counts versus GC content for each gene in sample (C1H)")

counts <- log2(counts_in_2_of_4_col[1])
gc_content <- gc_content_in_2_of_4_col[1]
plot(t(gc_content), t(counts), ylab = "Log2(counts)", xlab = "GC content", main = "Log2(counts) versus GC content for each gene in sample (C1H)")

counts <- counts_in_2_of_4_col$C1H
total_counts <- rowSums(counts_in_2_of_4_col)
divide_counts <- counts / total_counts

# Comparison of all data versus subsetted data (large fraction for an individual)

check <-  cbind(counts, total_counts)
check2 <- cbind(check, divide_counts)
check2 <- as.data.frame(check2)

total_counts <- check2[which(check2$divide_counts > 0.1), ]
total_counts <- as.data.frame(total_counts)

mean(check2$total_counts)

## [1] 75878.85

# 75878.85

mean(total_counts$total_counts)

## [1] 115192.9

#115192.9

median(check2$total_counts)

## [1] 25143

#23772

median(total_counts$total_counts)

## [1] 14108

# 14108

plot(t(gc_content), t(divide_counts), xlim = c(0.3, 0.7), ylim = c(0, 0.05), ylab = "Fraction of counts from one individual", xlab = "GC content", main = "Fraction of counts versus gene GC content in one individual (C1H)")

plot(t(gc_content), t(divide_counts), ylab = "Fraction of counts from one individual", xlab = "GC content", main = "Fraction of counts versus gene GC content in one individual (C1H)")

Because GC content is species-specific, we want a scheme that will allow us to incorporate species-specific information (unlike in the analysis above). At the same time, gene GC content and species are confounded. Therefore, we argue that it makes sense to make different
In RNA-seq experiments, a handful of highly expressed genes take up a large fraction of the total number of mapped reads. Different genes are highly expressed in different tissues. Also, there could be variation in the amount of highly expressed genes and the overall relationship between these and the other expressed genes. We need to adjust for total read depth.

Therefore, we will fit each spline for each tissue-species pair. Information from 4 samples (3 in the case of human hearts) will go into the information to fit the splines.

Part 2: Adjust for GC content

For more information about the method used for adjusting read depth and GC content, see pages 4-6 of the Supplementary Text and Figures from the following paper:

van de Geijn B, McVicker G, Gilad Y, Pritchard JK. WASP: allele-specific software for robust molecular quantitative trait locus discovery. Nat Methods. 2015 Sep 14. doi: 10.1038/nmeth.3582

Format GC content and raw counts so that they can go into the script to make the splines

# Get GC content for all 3 species for all of the genes that we have data for (not just the human GC content which we had before)

inshared_lists = row.names(GC_content_complete) %in% rownames(counts_in_2_of_4_col) 
inshared_lists_data <- as.data.frame(inshared_lists)
gc_counts_in <- cbind(GC_content_complete, inshared_lists_data)
gc_counts_in_2_of_4 <- as.data.frame(subset(gc_counts_in, inshared_lists_data == "TRUE"))
gc_counts_in_2_of_4_col <- as.data.frame(gc_counts_in_2_of_4[,1:3])
rownames(gc_counts_in_2_of_4_col) <- rownames(gc_counts_in_2_of_4)

# Split the samples into the 12 tissue-species pairs and then attach the GC content so that it is in the correct format for the next step. Make the first row is also the second row (because the next step will read the first row as the header). Then save this file, as it will be run in the next section. 

  # Chimp hearts 

# Subset the counts of genes for the chimp hearts

counts_chimp_hearts <- counts_in_2_of_4_col[,chimp_hearts]

# Find the total read depth for each gene

read_depth_total <- rowSums(counts_chimp_hearts)

# Make GC content the first column and read_depth_total for each gene the second column and then the raw counts for each individual 

gc_counts_chimp_hearts <- cbind(gc_counts_in_2_of_4_col[,2], read_depth_total)
gc_counts_chimp_hearts <- cbind(gc_counts_chimp_hearts, counts_chimp_hearts)

# Add an extra row so there's not an issue with the headers 

gc_counts_chimp_hearts <- rbind(gc_counts_chimp_hearts[1,], gc_counts_chimp_hearts)

# Look at the data

head(gc_counts_chimp_hearts)

##                         V1 read_depth_total  C1H  C2H  C3H  C4H
## ENSG00000000003  0.4446018             2428  645  539  769  475
## ENSG000000000031 0.4446018             2428  645  539  769  475
## ENSG00000000419  0.4003512             5327 1559 1074 1303 1391
## ENSG00000000457  0.4308564             2495  641  597  626  631
## ENSG00000000460  0.3933759              288   77   66   76   69
## ENSG00000000938  0.5656566             2560 1329  121  796  314

# Output this corrected file to the Desktop so that we can run it in Python (this command is commented out so that RMarkdown doesn't get confused when I try to load this to my personal site)

# write.table(gc_counts_chimp_hearts,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_chimp_hearts.txt",sep="\t", col.names = F, row.names = F)

  # Repeat for all of the other 11 tissue-species pairs. Here, we will repeat this process for Chimp kidneys

counts_chimp_kidneys <- counts_in_2_of_4_col[,chimp_kidneys]
read_depth_total <- rowSums(counts_chimp_kidneys)
gc_counts_chimp_kidneys <- cbind(gc_counts_in_2_of_4_col[,2], read_depth_total)
gc_counts_chimp_kidneys <- cbind(gc_counts_chimp_kidneys, counts_chimp_kidneys)
gc_counts_chimp_kidneys <- rbind(gc_counts_chimp_kidneys[1,], gc_counts_chimp_kidneys)

head(gc_counts_chimp_kidneys)

##                         V1 read_depth_total  C1K  C2K  C3K  C4K
## ENSG00000000003  0.4446018            10951 3217 2178 2966 2590
## ENSG000000000031 0.4446018            10951 3217 2178 2966 2590
## ENSG00000000419  0.4003512             4803 1337 1107 1267 1092
## ENSG00000000457  0.4308564             4813 1334 1262 1289  928
## ENSG00000000460  0.3933759              454  131  126   96  101
## ENSG00000000938  0.5656566             2095  507  584  653  351

# write.table(gc_counts_chimp_kidneys,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_chimp_kidneys.txt",sep="\t", col.names = F, row.names = F)

  # Chimp livers

counts_chimp_livers <- counts_in_2_of_4_col[,chimp_livers]
read_depth_total <- rowSums(counts_chimp_livers)
gc_counts_chimp_livers <- cbind(gc_counts_in_2_of_4_col[,2], read_depth_total)
gc_counts_chimp_livers <- cbind(gc_counts_chimp_livers, counts_chimp_livers)
gc_counts_chimp_livers <- rbind(gc_counts_chimp_livers[1,], gc_counts_chimp_livers)

head(gc_counts_chimp_livers)

##                         V1 read_depth_total C1Li C2Li C3Li C4Li
## ENSG00000000003  0.4446018            20420 6687 5590 3349 4794
## ENSG000000000031 0.4446018            20420 6687 5590 3349 4794
## ENSG00000000419  0.4003512             4640 1336 1203 1295  806
## ENSG00000000457  0.4308564             5999 1304 1998 1373 1324
## ENSG00000000460  0.3933759              339   92  107   77   63
## ENSG00000000938  0.5656566             2526  743  896  560  327

# write.table(gc_counts_chimp_livers,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_chimp_livers.txt",sep="\t", col.names = F, row.names = F)

  # Chimp lungs

counts_chimp_lungs <- counts_in_2_of_4_col[,chimp_lungs]
read_depth_total <- rowSums(counts_chimp_lungs)
gc_counts_chimp_lungs <- cbind(gc_counts_in_2_of_4_col[,2], read_depth_total) 
gc_counts_chimp_lungs <- cbind(gc_counts_chimp_lungs, counts_chimp_lungs)
gc_counts_chimp_lungs <- rbind(gc_counts_chimp_lungs[1,], gc_counts_chimp_lungs)

head(gc_counts_chimp_lungs)

##                         V1 read_depth_total C1Lu  C2Lu C3Lu C4Lu
## ENSG00000000003  0.4446018             5878 1436  1102 1835 1505
## ENSG000000000031 0.4446018             5878 1436  1102 1835 1505
## ENSG00000000419  0.4003512             6053 1433  1712 1512 1396
## ENSG00000000457  0.4308564             4950 1009  1476 1124 1341
## ENSG00000000460  0.3933759              678  159   189  136  194
## ENSG00000000938  0.5656566            30483 6027 12637 4470 7349

# write.table(gc_counts_chimp_lungs,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_chimp_lungs.txt",sep="\t", col.names = F, row.names = F)

  # Human hearts

counts_human_hearts <- counts_in_2_of_4_col[,human_hearts]
read_depth_total <- rowSums(counts_human_hearts)
gc_counts_human_hearts <- cbind(gc_counts_in_2_of_4_col[,1], read_depth_total)
gc_counts_human_hearts <- cbind(gc_counts_human_hearts, counts_human_hearts)
gc_counts_human_hearts <- rbind(gc_counts_human_hearts[1,], gc_counts_human_hearts)

head(gc_counts_human_hearts)

##                         V1 read_depth_total H2H H3H  H4H
## ENSG00000000003  0.4438938              816 225 124  467
## ENSG000000000031 0.4438938              816 225 124  467
## ENSG00000000419  0.3985953             3246 855 573 1818
## ENSG00000000457  0.4335337              655 186 231  238
## ENSG00000000460  0.3956459              149  65  10   74
## ENSG00000000938  0.5666341             1735 828 182  725

# write.table(gc_counts_human_hearts,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_human_hearts.txt",sep="\t", col.names = F, row.names = F)

  # Human kidneys

counts_human_kidneys <- counts_in_2_of_4_col[,human_kidneys]
read_depth_total <- rowSums(counts_human_kidneys)
gc_counts_human_kidneys <- cbind(gc_counts_in_2_of_4_col[,1], read_depth_total)
gc_counts_human_kidneys <- cbind(gc_counts_human_kidneys, counts_human_kidneys)
gc_counts_human_kidneys <- rbind(gc_counts_human_kidneys[1,], gc_counts_human_kidneys)

head(gc_counts_human_kidneys)

##                         V1 read_depth_total  H1K  H2K  H3K  H4K
## ENSG00000000003  0.4438938            14429 4252 2365 4810 3002
## ENSG000000000031 0.4438938            14429 4252 2365 4810 3002
## ENSG00000000419  0.3985953             6270 1755  706 2041 1768
## ENSG00000000457  0.4335337             2544  711  286  931  616
## ENSG00000000460  0.3956459              734  248   63  280  143
## ENSG00000000938  0.5666341             1554  608  214  220  512

#write.table(gc_counts_human_kidneys,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_human_kidneys.txt",sep="\t", col.names = F, row.names = F)

  # Human livers

counts_human_livers <- counts_in_2_of_4_col[,human_livers]
read_depth_total <- rowSums(counts_human_livers)
gc_counts_human_livers <- cbind(gc_counts_in_2_of_4_col[,1], read_depth_total)
gc_counts_human_livers <- cbind(gc_counts_human_livers, counts_human_livers)
gc_counts_human_livers <- rbind(gc_counts_human_livers[1,], gc_counts_human_livers)

head(gc_counts_human_livers)

##                         V1 read_depth_total H1Li H2Li H3Li H4Li
## ENSG00000000003  0.4438938             8817 2100 2024 2918 1775
## ENSG000000000031 0.4438938             8817 2100 2024 2918 1775
## ENSG00000000419  0.3985953             4723 1492 1162 1018 1051
## ENSG00000000457  0.4335337             2228  682  444  436  666
## ENSG00000000460  0.3956459             1118  699  128  129  162
## ENSG00000000938  0.5666341             4713  491 1291  732 2199

#write.table(gc_counts_human_livers,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_human_livers.txt",sep="\t", col.names = F, row.names = F)

  # Human lungs

counts_human_lungs <- counts_in_2_of_4_col[,human_lungs]
read_depth_total <- rowSums(counts_human_lungs)
gc_counts_human_lungs <- cbind(gc_counts_in_2_of_4_col[,1], read_depth_total)
gc_counts_human_lungs <- cbind(gc_counts_human_lungs, counts_human_lungs)
gc_counts_human_lungs <- rbind(gc_counts_human_lungs[1,], gc_counts_human_lungs)

head(gc_counts_human_lungs)

##                         V1 read_depth_total H1Lu H2Lu H3Lu  H4Lu
## ENSG00000000003  0.4438938             4177  956  690 1427  1104
## ENSG000000000031 0.4438938             4177  956  690 1427  1104
## ENSG00000000419  0.3985953             6296 1566 1513 1175  2042
## ENSG00000000457  0.4335337             2818  632  700  583   903
## ENSG00000000460  0.3956459             1005  278  223  200   304
## ENSG00000000938  0.5666341            30956 6564 4767 4395 15230

# write.table(gc_counts_human_lungs,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_human_lungs.txt",sep="\t", col.names = F, row.names = F)

  # Rhesus hearts

counts_rhesus_hearts <- counts_in_2_of_4_col[,rhesus_hearts]
read_depth_total <- rowSums(counts_rhesus_hearts)
gc_counts_rhesus_hearts <- cbind(gc_counts_in_2_of_4_col[,3], read_depth_total)
gc_counts_rhesus_hearts <- cbind(gc_counts_rhesus_hearts, counts_rhesus_hearts)
gc_counts_rhesus_hearts <- rbind(gc_counts_rhesus_hearts[1,], gc_counts_rhesus_hearts)

head(gc_counts_rhesus_hearts)

##                         V1 read_depth_total R1H R2H R3H R4H
## ENSG00000000003  0.4429651             1688 270 399 500 519
## ENSG000000000031 0.4429651             1688 270 399 500 519
## ENSG00000000419  0.3945993             2965 582 635 996 752
## ENSG00000000457  0.4289068             1403 257 321 427 398
## ENSG00000000460  0.3939919              183  32  43  71  37
## ENSG00000000938  0.5714286              390  47  81 165  97

# write.table(gc_counts_rhesus_hearts,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_rhesus_hearts.txt",sep="\t", col.names = F, row.names = F)

  # Rhesus kidneys

counts_rhesus_kidneys <- counts_in_2_of_4_col[,rhesus_kidneys]
read_depth_total <- rowSums(counts_rhesus_kidneys)
gc_counts_rhesus_kidneys <- cbind(gc_counts_in_2_of_4_col[,3], read_depth_total)
gc_counts_rhesus_kidneys <- cbind(gc_counts_rhesus_kidneys, counts_rhesus_kidneys)
gc_counts_rhesus_kidneys <- rbind(gc_counts_rhesus_kidneys[1,], gc_counts_rhesus_kidneys)

head(gc_counts_rhesus_kidneys)

##                         V1 read_depth_total  R1K  R2K  R3K  R4K
## ENSG00000000003  0.4429651            15120 2669 3727 4005 4719
## ENSG000000000031 0.4429651            15120 2669 3727 4005 4719
## ENSG00000000419  0.3945993             4226  917  875 1151 1283
## ENSG00000000457  0.4289068             3860  711  945  983 1221
## ENSG00000000460  0.3939919              330   64   66  102   98
## ENSG00000000938  0.5714286              604  129  170  134  171

# write.table(gc_counts_rhesus_kidneys,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_rhesus_kidneys.txt",sep="\t", col.names = F, row.names = F)

  # Rhesus livers

counts_rhesus_livers <- counts_in_2_of_4_col[,rhesus_livers]
read_depth_total <- rowSums(counts_rhesus_livers)
gc_counts_rhesus_livers <- cbind(gc_counts_in_2_of_4_col[,3], read_depth_total)
gc_counts_rhesus_livers <- cbind(gc_counts_rhesus_livers, counts_rhesus_livers)
gc_counts_rhesus_livers <- rbind(gc_counts_rhesus_livers[1,], gc_counts_rhesus_livers)

head(gc_counts_rhesus_livers)

##                         V1 read_depth_total R1Li R2Li R3Li R4Li
## ENSG00000000003  0.4429651            23399 4046 7517 5547 6289
## ENSG000000000031 0.4429651            23399 4046 7517 5547 6289
## ENSG00000000419  0.3945993             4233  711  995 1197 1330
## ENSG00000000457  0.4289068             3277  443  859  773 1202
## ENSG00000000460  0.3939919              201   11   53   50   87
## ENSG00000000938  0.5714286             1335  175  240  234  686

# write.table(gc_counts_rhesus_livers,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_rhesus_livers.txt",sep="\t", col.names = F, row.names = F)

  # Rhesus lungs

counts_rhesus_lungs <- counts_in_2_of_4_col[,rhesus_lungs]
read_depth_total <- rowSums(counts_rhesus_lungs)
gc_counts_rhesus_lungs <- cbind(gc_counts_in_2_of_4_col[,3], read_depth_total)
gc_counts_rhesus_lungs  <- cbind(gc_counts_rhesus_lungs, counts_rhesus_lungs)
gc_counts_rhesus_lungs <- rbind(gc_counts_rhesus_lungs[1,], gc_counts_rhesus_lungs)

head(gc_counts_rhesus_lungs)

##                         V1 read_depth_total R1Lu R2Lu R3Lu R4Lu
## ENSG00000000003  0.4429651             7999 1928 2005 2088 1978
## ENSG000000000031 0.4429651             7999 1928 2005 2088 1978
## ENSG00000000419  0.3945993             5182  945 1173 1479 1585
## ENSG00000000457  0.4289068             5521  913 1531 1710 1367
## ENSG00000000460  0.3939919             1017  178  209  284  346
## ENSG00000000938  0.5714286            17715 2922 3817 5482 5494

# write.table(gc_counts_rhesus_lungs,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/GC_counts_rhesus_lungs.txt",sep="\t", col.names = F, row.names = F)

Executing the file to adjust for total read depth and GC content

To execute this, run a Python script that uses parts of the Python script found here https://github.com/bmvdgeijn/WASP/blob/master/CHT/update_total_depth.py

From this, we will get coefficients for the quartic function for each individual.

Part 3:

# Here's a file with GC content 

GC_read_depth_coefficients <- read.csv("~/Reg_Evo_Primates/ashlar-trial/data/GC_read_depth_coefficients.csv", header=FALSE)

coefs <- GC_read_depth_coefficients[,2:11]

# Average gc content

avg_chimp_gc <- mean(gc_counts_chimp_hearts[,1])

# Here's a file with the GC content and all of the counts for chimps

tot <- cbind(gc_counts_chimp_hearts[-2, 1:2], gc_counts_chimp_hearts[-2, 2], gc_counts_chimp_hearts[-2, 2], gc_counts_chimp_hearts[-2, 2], gc_counts_chimp_kidneys[-2,2], gc_counts_chimp_kidneys[-2,2], gc_counts_chimp_kidneys[-2,2], gc_counts_chimp_kidneys[-2,2], gc_counts_chimp_livers[-2,2], gc_counts_chimp_livers[-2,2], gc_counts_chimp_livers[-2,2], gc_counts_chimp_livers[-2,2], gc_counts_chimp_lungs[-2,2], gc_counts_chimp_lungs[-2,2], gc_counts_chimp_lungs[-2,2], gc_counts_chimp_lungs[-2,2])

dim(tot)

## [1] 16616    17

# Find the updated gene counts for the chimps

updated_chimp_gc_avg <- array(NA, dim = c(16616, 16))
updated_chimp_gc_all <- array(NA, dim = c(16616, 16))

for (j in 1:16616){
    for (i in 2:17){

updated_chimp_gc_avg[j,i-1] <- exp(coefs[i-1,1] + avg_chimp_gc*coefs[i-1,2] + ((avg_chimp_gc)^2)*coefs[i-1,3] + ((avg_chimp_gc)^3)*coefs[i-1,4] + ((avg_chimp_gc)^4)*coefs[i-1,5])

updated_chimp_gc_all[j,i-1] <- exp(coefs[i-1,1] + tot[j,1]*coefs[i-1,2] + ((tot[j,1])^2)*coefs[i-1,3] + ((tot[j,1])^3)*coefs[i-1,4] + ((tot[j,1])^4)*coefs[i-1,5])

#updated_chimp_gene_counts[j,i-1] <- exp((coefs[i-1,1] + tot[j,1]*coefs[i-1,2] + ((tot[j,1])^2)*coefs[i-1,3] + ((tot[j,1])^3)*coefs[i-1,4] + ((tot[j,1])^4)*coefs[i-1,5]))*(0 + tot[j,i]*coefs[i-1,7] + ((tot[j, i])^2)*coefs[i-1,8] + ((tot[j, i])^3)*coefs[i-1,9] + ((tot[j, i])^4)*coefs[i-1,10])
  }
}

# Calculate the adjustment factor

gc_adj <- updated_chimp_gc_avg / updated_chimp_gc_all

# Combine the counts for the chimps

counts_chimps <- cbind(gc_counts_chimp_hearts[-2,3:6], gc_counts_chimp_kidneys[-2,3:6], gc_counts_chimp_livers[-2,3:6], gc_counts_chimp_lungs[-2,3:6])

dim(counts_chimps)

## [1] 16616    16

# Calculate the new counts

gc_counts_chimps = counts_chimps*gc_adj 

# Plots comparing raw and adjusted counts

plot(gc_counts_chimp_hearts[-2,1], gc_counts_chimp_hearts[-2,3], xlab = "GC content", ylab = "Observed raw counts")

plot(gc_counts_chimp_hearts[-2,1], gc_counts_chimps[,1], xlab = "GC content", ylab = "Adjusted raw counts")

plot(gc_counts_chimp_hearts[-2,3], gc_counts_chimps[,1], ylab = "Expected counts", xlab = "Observed raw counts")
abline(a = 0, b = 1, col = "red", lwd = 3)

# Now repeat with humans

# Average gc content

avg_human_gc <- mean(gc_counts_human_hearts[,1])

# Here's a file with the GC content and all of the counts for chimps

tot <- cbind(gc_counts_human_hearts[-2, 1:2], gc_counts_human_hearts[-2, 2], gc_counts_human_hearts[-2, 2], gc_counts_human_kidneys[-2,2], gc_counts_human_kidneys[-2,2], gc_counts_human_kidneys[-2,2], gc_counts_human_kidneys[-2,2], gc_counts_human_livers[-2,2], gc_counts_human_livers[-2,2], gc_counts_human_livers[-2,2], gc_counts_human_livers[-2,2], gc_counts_human_lungs[-2,2], gc_counts_human_lungs[-2,2], gc_counts_human_lungs[-2,2], gc_counts_human_lungs[-2,2])

dim(tot)

## [1] 16616    16

# Find the updated gene counts for the chimps

updated_human_gc_avg <- array(NA, dim = c(16616, 16))
updated_human_gc_all <- array(NA, dim = c(16616, 16))

for (j in 1:16616){
    for (i in 17:31){

updated_human_gc_avg[j,i-16] <- exp(coefs[i,1] + avg_human_gc*coefs[i,2] + ((avg_human_gc)^2)*coefs[i,3] + ((avg_human_gc)^3)*coefs[i,4] + ((avg_human_gc)^4)*coefs[i,5])

updated_human_gc_all[j,i-16] <- exp(coefs[i,1] + tot[j,1]*coefs[i,2] + ((tot[j,1])^2)*coefs[i,3] + ((tot[j,1])^3)*coefs[i,4] + ((tot[j,1])^4)*coefs[i,5])

#updated_chimp_gene_counts[j,i-1] <- exp((coefs[i-1,1] + tot[j,1]*coefs[i-1,2] + ((tot[j,1])^2)*coefs[i-1,3] + ((tot[j,1])^3)*coefs[i-1,4] + ((tot[j,1])^4)*coefs[i-1,5]))*(0 + tot[j,i]*coefs[i-1,7] + ((tot[j, i])^2)*coefs[i-1,8] + ((tot[j, i])^3)*coefs[i-1,9] + ((tot[j, i])^4)*coefs[i-1,10])
  }
}

# Calculate the adjustment factor

gc_adj <- updated_human_gc_avg / updated_human_gc_all

# Combine the counts for the humans

counts_humans <- cbind(gc_counts_human_hearts[-2,3:5], gc_counts_human_kidneys[-2,3:6], gc_counts_human_livers[-2,3:6], gc_counts_human_lungs[-2,3:6])

dim(counts_humans)

## [1] 16616    15

# Calculate the new counts

gc_counts_humans = counts_humans*gc_adj 

# Plot to see the relationship between the raw counts and the GC adjusted counts
plot(gc_counts_human_hearts[-2,1], gc_counts_human_hearts[-2,3], xlab = "GC content", ylab = "Observed raw counts")

plot(gc_counts_human_hearts[-2,1], gc_counts_humans[,1], xlab = "GC content", ylab = "Adjusted raw counts")

plot(gc_counts_human_hearts[-2,3], gc_counts_humans[,1], ylab = "Expected counts", xlab = "Observed raw counts")
abline(a = 0, b = 1, col = "red", lwd = 3)

# Now repeat for rhesus

# Average gc content

avg_rhesus_gc <- mean(gc_counts_rhesus_hearts[,1])

# Here's a file with the GC content and all of the counts for chimps

tot <- cbind(gc_counts_rhesus_hearts[-2, 1:2], gc_counts_rhesus_hearts[-2, 2], gc_counts_rhesus_hearts[-2, 2], gc_counts_rhesus_hearts[-2, 2], gc_counts_rhesus_kidneys[-2,2], gc_counts_rhesus_kidneys[-2,2], gc_counts_rhesus_kidneys[-2,2], gc_counts_rhesus_kidneys[-2,2], gc_counts_rhesus_livers[-2,2], gc_counts_rhesus_livers[-2,2], gc_counts_rhesus_livers[-2,2], gc_counts_rhesus_livers[-2,2], gc_counts_rhesus_lungs[-2,2], gc_counts_rhesus_lungs[-2,2], gc_counts_rhesus_lungs[-2,2], gc_counts_rhesus_lungs[-2,2])

dim(tot)

## [1] 16616    17

# Find the updated gene counts for the chimps

updated_rhesus_gc_avg <- array(NA, dim = c(16616, 16))
updated_rhesus_gc_all <- array(NA, dim = c(16616, 16))

for (j in 1:16616){
    for (i in 32:47){

updated_rhesus_gc_avg[j,i-31] <- exp(coefs[i-1,1] + avg_rhesus_gc*coefs[i-1,2] + ((avg_rhesus_gc)^2)*coefs[i-1,3] + ((avg_rhesus_gc)^3)*coefs[i-1,4] + ((avg_rhesus_gc)^4)*coefs[i-1,5])

updated_rhesus_gc_all[j,i-31] <- exp(coefs[i-1,1] + tot[j,1]*coefs[i-1,2] + ((tot[j,1])^2)*coefs[i-1,3] + ((tot[j,1])^3)*coefs[i-1,4] + ((tot[j,1])^4)*coefs[i-1,5])

  }
}

# Calculate the adjustment factor

gc_adj <- updated_rhesus_gc_avg / updated_rhesus_gc_all

# Combine the counts 

counts_rhesus <- cbind(gc_counts_rhesus_hearts[-2,3:6], gc_counts_rhesus_kidneys[-2,3:6], gc_counts_rhesus_livers[-2,3:6], gc_counts_rhesus_lungs[-2,3:6])

dim(counts_rhesus)

## [1] 16616    16

# Calculate the new counts

gc_counts_rhesus = counts_rhesus*gc_adj 

# Plots comparing raw and adjusted counts

plot(gc_counts_rhesus_hearts[-2,1], gc_counts_rhesus_hearts[-2,3], xlab = "GC content", ylab = "Observed raw counts")

plot(gc_counts_rhesus_hearts[-2,1], gc_counts_rhesus[,1], xlab = "GC content", ylab = "Adjusted raw counts")

plot(gc_counts_rhesus_hearts[-2,3], gc_counts_rhesus[,1], ylab = "Expected counts", xlab = "Observed raw counts")
abline(a = 0, b = 1, col = "red", lwd = 3)

# Combine the samples in the same order as before (all chimpanzee 1 samples together, etc. )

chimps <- c(1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 4, 8, 12, 16)
humans <- c(4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)

counts_genes_gc <- cbind(gc_counts_chimps[, chimps], gc_counts_humans[ ,humans], gc_counts_rhesus[, chimps])

# Write a table with the GC-normalized counts

# write.table(counts_genes_gc,file="/Users/LEB/Reg_Evo_Primates/ashlar-trial/data/gene_counts_with_gc_correction.txt",sep="\t", col.names = T, row.names = T)

To do: Show that orthologous gene length (and exon length) is the same/similar across the different species and therefore we did not correct for it in the analysis.

GC_content_normalization_CHT

Lauren Blake

July 13, 2016