Introduction

The goal of this script is to compare the gene counts from hg19 and hg38.

# Import hg19 raw counts

counts_12184 <- read.delim("~/Desktop/Reg_Evo_Primates/data/counts_12184.txt")

counts_hg19 <- counts_12184[,17:31]
counts_hg19 <- cbind(rownames(counts_12184), counts_hg19)
colnames(counts_hg19) <- c("Geneid", colnames(counts_12184[,17:31]))

# Import hg38 raw counts

hg38_gene_counts_with_mm <- read.delim("../data/hg38_gene_counts_with_mm.txt", comment.char="#")

# Subset to humans only

colnames_bjp <- c("Geneid", "stYG_H_3_indH2_repM.bam", "stYG_H_3_indH3_repM.bam", "stYG_H_3_indH4_repM.bam", "stYG_H_4_indH1_repM.bam", "stYG_H_4_indH2_repM.bam", "stYG_H_4_indH3_repM.bam", "stYG_H_4_indH4_repM.bam", "stYG_H_5_indH1_repM.bam", "stYG_H_5_indH2_repM.bam", "stYG_H_5_indH3_repM.bam", "stYG_H_5_indH4_repM.bam", "stYG_H_6_indH1_repM.bam", "stYG_H_6_indH2_repM.bam", "stYG_H_6_indH3_repM.bam", "stYG_H_6_indH4_repM.bam")

colnames(hg38_gene_counts_with_mm) %in% colnames_bjp
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hg38_gene_counts <- hg38_gene_counts_with_mm[,8:22]
rownames(hg38_gene_counts) <- hg38_gene_counts_with_mm[,1]
hg38_gene_counts <- cbind(hg38_gene_counts_with_mm[,1], hg38_gene_counts)
colnames(hg38_gene_counts) <- colnames_bjp

Overlap

gene_compare <- rownames(counts_hg19) %in% rownames(hg38_gene_counts)
summary(gene_compare)
##    Mode   FALSE    TRUE 
## logical     730   11454
gene_compare_merge <- merge(counts_hg19, hg38_gene_counts, by = "Geneid")

hg19_overlap <- gene_compare_merge[,1:16]

overlap_numbers <- c(1, 17:31)
hg38_overlap <- gene_compare_merge[,overlap_numbers]

Identify overlaps

H1K- stYG_H_4_indH1_repM.bam H1Li- stYG_H_5_indH1_repM.bam H1Lu- stYG_H_6_indH1_repM.bam

H2H- stYG_H_3_indH2_repM.bam H2K- stYG_H_4_indH2_repM.bam H2Li- stYG_H_5_indH2_repM.bam H2Lu- stYG_H_6_indH2_repM.bam

H3H- stYG_H_3_indH3_repM.bam H3K- stYG_H_4_indH3_repM.bam H3Li- stYG_H_5_indH3_repM.bam H3Lu- stYG_H_6_indH3_repM.bam

H4H- stYG_H_3_indH4_repM.bam H4K- stYG_H_4_indH4_repM.bam H4Li- stYG_H_5_indH4_repM.bam H4Lu- stYG_H_6_indH4_repM.bam

Look at correlations

cor(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
## [1] 0.6069429
cor.test(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
## 
##  Pearson's product-moment correlation
## 
## data:  hg19_overlap$H1K and hg38_overlap$stYG_H_4_indH1_repM.bam
## t = 81.726, df = 11452, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5952456 0.6183831
## sample estimates:
##       cor 
## 0.6069429
cor(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam)
## [1] 0.9633311
cor(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam)
## [1] 0.7251576
cor(hg19_overlap$H2H, hg38_overlap$stYG_H_3_indH2_repM.bam)
## [1] 0.8265143
cor(hg19_overlap$H2K, hg38_overlap$stYG_H_4_indH2_repM.bam)
## [1] 0.890096
cor(hg19_overlap$H2Li, hg38_overlap$stYG_H_5_indH2_repM.bam)
## [1] 0.8658459
cor(hg19_overlap$H2Lu, hg38_overlap$stYG_H_6_indH2_repM.bam)
## [1] 0.8154343
cor(hg19_overlap$H3H, hg38_overlap$stYG_H_3_indH3_repM.bam)
## [1] 0.6997069
cor(hg19_overlap$H3K, hg38_overlap$stYG_H_4_indH3_repM.bam)
## [1] 0.8107131
cor(hg19_overlap$H3Li, hg38_overlap$stYG_H_5_indH3_repM.bam)
## [1] 0.9868997
cor(hg19_overlap$H3Lu, hg38_overlap$stYG_H_6_indH3_repM.bam)
## [1] 0.8357508
cor(hg19_overlap$H4H, hg38_overlap$stYG_H_3_indH4_repM.bam)
## [1] 0.7781797
cor(hg19_overlap$H4K, hg38_overlap$stYG_H_4_indH4_repM.bam)
## [1] 0.8501929
cor(hg19_overlap$H4Li, hg38_overlap$stYG_H_5_indH4_repM.bam)
## [1] 0.9029145
cor(hg19_overlap$H4Lu, hg38_overlap$stYG_H_6_indH4_repM.bam)
## [1] 0.7878426
corrlation_matrix <- as.data.frame(c(0.6069429, 0.9633311, 0.7251576, 0.8265143, 0.890096, 0.8658459, 0.8154343, 0.6997069, 0.8107131, 0.9868997, 0.8357508, 0.7781797, 0.8501929, 0.9029145, 0.7878426))

summary(corrlation_matrix)
##  c(0.6069429, 0.9633311, 0.7251576, 0.8265143, 0.890096, 0.8658459, 0.8154343, 0.6997069, 0.8107131, 0.9868997, 0.8357508, 0.7781797, 0.8501929, 0.9029145, 0.7878426)
##  Min.   :0.6069                                                                                                                                                       
##  1st Qu.:0.7830                                                                                                                                                       
##  Median :0.8265                                                                                                                                                       
##  Mean   :0.8230                                                                                                                                                       
##  3rd Qu.:0.8780                                                                                                                                                       
##  Max.   :0.9869
cor(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam, method = c("spearman"))
## [1] 0.7768866
cor(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam, method = c("spearman"))
## [1] 0.845142
cor(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam, method = c("spearman"))
## [1] 0.7884129
cor(hg19_overlap$H2H, hg38_overlap$stYG_H_3_indH2_repM.bam, method = c("spearman"))
## [1] 0.7864812
cor(hg19_overlap$H2K, hg38_overlap$stYG_H_4_indH2_repM.bam, method = c("spearman"))
## [1] 0.7700773
cor(hg19_overlap$H2Li, hg38_overlap$stYG_H_5_indH2_repM.bam, method = c("spearman"))
## [1] 0.7893072
cor(hg19_overlap$H2Lu, hg38_overlap$stYG_H_6_indH2_repM.bam, method = c("spearman"))
## [1] 0.8250774
cor(hg19_overlap$H3H, hg38_overlap$stYG_H_3_indH3_repM.bam, method = c("spearman"))
## [1] 0.7945612
cor(hg19_overlap$H3K, hg38_overlap$stYG_H_4_indH3_repM.bam, method = c("spearman"))
## [1] 0.7880498
cor(hg19_overlap$H3Li, hg38_overlap$stYG_H_5_indH3_repM.bam, method = c("spearman"))
## [1] 0.7680105
cor(hg19_overlap$H3Lu, hg38_overlap$stYG_H_6_indH3_repM.bam, method = c("spearman"))
## [1] 0.8365382
cor(hg19_overlap$H4H, hg38_overlap$stYG_H_3_indH4_repM.bam, method = c("spearman"))
## [1] 0.7999359
cor(hg19_overlap$H4K, hg38_overlap$stYG_H_4_indH4_repM.bam, method = c("spearman"))
## [1] 0.8052154
cor(hg19_overlap$H4Li, hg38_overlap$stYG_H_5_indH4_repM.bam, method = c("spearman"))
## [1] 0.7556615
cor(hg19_overlap$H4Lu, hg38_overlap$stYG_H_6_indH4_repM.bam, method = c("spearman"))
## [1] 0.8259926
# Plots
plot(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
abline(0,1, col = "red")

summary(hg19_overlap$H1K > hg38_overlap$stYG_H_4_indH1_repM.bam)
##    Mode   FALSE    TRUE 
## logical    4144    7310
summary(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -508928    -213      78     -57     377   90877
plot(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam, breaks= 100)
## Warning in plot.window(...): "breaks" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "breaks" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "breaks" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "breaks" is
## not a graphical parameter
## Warning in box(...): "breaks" is not a graphical parameter
## Warning in title(...): "breaks" is not a graphical parameter

plot(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam)
abline(0,1, col = "red")

plot(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam)
abline(0,1, col = "red")

# Import hg38 raw counts

hg38_gene_counts_with_mm <- read.delim("../data/hg38_gene_counts_no_mm.txt", comment.char="#")

# Subset to humans only

colnames_bjp <- c("Geneid", "stYG_H_3_indH2_repM.bam", "stYG_H_3_indH3_repM.bam", "stYG_H_3_indH4_repM.bam", "stYG_H_4_indH1_repM.bam", "stYG_H_4_indH2_repM.bam", "stYG_H_4_indH3_repM.bam", "stYG_H_4_indH4_repM.bam", "stYG_H_5_indH1_repM.bam", "stYG_H_5_indH2_repM.bam", "stYG_H_5_indH3_repM.bam", "stYG_H_5_indH4_repM.bam", "stYG_H_6_indH1_repM.bam", "stYG_H_6_indH2_repM.bam", "stYG_H_6_indH3_repM.bam", "stYG_H_6_indH4_repM.bam")

colnames(hg38_gene_counts_with_mm) %in% colnames_bjp
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
hg38_gene_counts <- hg38_gene_counts_with_mm[,8:22]
rownames(hg38_gene_counts) <- hg38_gene_counts_with_mm[,1]
hg38_gene_counts <- cbind(hg38_gene_counts_with_mm[,1], hg38_gene_counts)
colnames(hg38_gene_counts) <- colnames_bjp

Overlap

gene_compare <- rownames(counts_hg19) %in% rownames(hg38_gene_counts)
summary(gene_compare)
##    Mode   FALSE    TRUE 
## logical     730   11454
gene_compare_merge <- merge(counts_hg19, hg38_gene_counts, by = "Geneid")

hg19_overlap <- gene_compare_merge[,1:16]

overlap_numbers <- c(1, 17:31)
hg38_overlap <- gene_compare_merge[,overlap_numbers]

Identify overlaps

H1K- stYG_H_4_indH1_repM.bam H1Li- stYG_H_5_indH1_repM.bam H1Lu- stYG_H_6_indH1_repM.bam

H2H- stYG_H_3_indH2_repM.bam H2K- stYG_H_4_indH2_repM.bam H2Li- stYG_H_5_indH2_repM.bam H2Lu- stYG_H_6_indH2_repM.bam

H3H- stYG_H_3_indH3_repM.bam H3K- stYG_H_4_indH3_repM.bam H3Li- stYG_H_5_indH3_repM.bam H3Lu- stYG_H_6_indH3_repM.bam

H4H- stYG_H_3_indH4_repM.bam H4K- stYG_H_4_indH4_repM.bam H4Li- stYG_H_5_indH4_repM.bam H4Lu- stYG_H_6_indH4_repM.bam

Look at correlations

cor(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
## [1] 0.659611
cor.test(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
## 
##  Pearson's product-moment correlation
## 
## data:  hg19_overlap$H1K and hg38_overlap$stYG_H_4_indH1_repM.bam
## t = 93.916, df = 11452, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6491388 0.6698332
## sample estimates:
##      cor 
## 0.659611
cor(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam)
## [1] 0.9667593
cor(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam)
## [1] 0.7978478
cor(hg19_overlap$H2H, hg38_overlap$stYG_H_3_indH2_repM.bam)
## [1] 0.8130906
cor(hg19_overlap$H2K, hg38_overlap$stYG_H_4_indH2_repM.bam)
## [1] 0.8981138
cor(hg19_overlap$H2Li, hg38_overlap$stYG_H_5_indH2_repM.bam)
## [1] 0.8700223
cor(hg19_overlap$H2Lu, hg38_overlap$stYG_H_6_indH2_repM.bam)
## [1] 0.8307625
cor(hg19_overlap$H3H, hg38_overlap$stYG_H_3_indH3_repM.bam)
## [1] 0.6664122
cor(hg19_overlap$H3K, hg38_overlap$stYG_H_4_indH3_repM.bam)
## [1] 0.8323257
cor(hg19_overlap$H3Li, hg38_overlap$stYG_H_5_indH3_repM.bam)
## [1] 0.9873673
cor(hg19_overlap$H3Lu, hg38_overlap$stYG_H_6_indH3_repM.bam)
## [1] 0.8522485
cor(hg19_overlap$H4H, hg38_overlap$stYG_H_3_indH4_repM.bam)
## [1] 0.7478594
cor(hg19_overlap$H4K, hg38_overlap$stYG_H_4_indH4_repM.bam)
## [1] 0.8713867
cor(hg19_overlap$H4Li, hg38_overlap$stYG_H_5_indH4_repM.bam)
## [1] 0.9034255
cor(hg19_overlap$H4Lu, hg38_overlap$stYG_H_6_indH4_repM.bam)
## [1] 0.8135062
corrlation_matrix <- as.data.frame(c(0.659611, 0.9667593, 0.7978478, 0.8130906, 0.8981138, 0.8700223, 0.8307625, 0.6664122, 0.8323257, 0.9873673, 0.8522485, 0.7478594, 0.8713867, 0.9034255, 0.8135062))

summary(corrlation_matrix)
##  c(0.659611, 0.9667593, 0.7978478, 0.8130906, 0.8981138, 0.8700223, 0.8307625, 0.6664122, 0.8323257, 0.9873673, 0.8522485, 0.7478594, 0.8713867, 0.9034255, 0.8135062)
##  Min.   :0.6596                                                                                                                                                       
##  1st Qu.:0.8055                                                                                                                                                       
##  Median :0.8323                                                                                                                                                       
##  Mean   :0.8340                                                                                                                                                       
##  3rd Qu.:0.8848                                                                                                                                                       
##  Max.   :0.9874
cor(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam, method = c("spearman"))
## [1] 0.7788484
cor(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam, method = c("spearman"))
## [1] 0.8463768
cor(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam, method = c("spearman"))
## [1] 0.7899065
cor(hg19_overlap$H2H, hg38_overlap$stYG_H_3_indH2_repM.bam, method = c("spearman"))
## [1] 0.788004
cor(hg19_overlap$H2K, hg38_overlap$stYG_H_4_indH2_repM.bam, method = c("spearman"))
## [1] 0.7717076
cor(hg19_overlap$H2Li, hg38_overlap$stYG_H_5_indH2_repM.bam, method = c("spearman"))
## [1] 0.7911631
cor(hg19_overlap$H2Lu, hg38_overlap$stYG_H_6_indH2_repM.bam, method = c("spearman"))
## [1] 0.8268586
cor(hg19_overlap$H3H, hg38_overlap$stYG_H_3_indH3_repM.bam, method = c("spearman"))
## [1] 0.7963937
cor(hg19_overlap$H3K, hg38_overlap$stYG_H_4_indH3_repM.bam, method = c("spearman"))
## [1] 0.7899628
cor(hg19_overlap$H3Li, hg38_overlap$stYG_H_5_indH3_repM.bam, method = c("spearman"))
## [1] 0.7699571
cor(hg19_overlap$H3Lu, hg38_overlap$stYG_H_6_indH3_repM.bam, method = c("spearman"))
## [1] 0.8380762
cor(hg19_overlap$H4H, hg38_overlap$stYG_H_3_indH4_repM.bam, method = c("spearman"))
## [1] 0.8006979
cor(hg19_overlap$H4K, hg38_overlap$stYG_H_4_indH4_repM.bam, method = c("spearman"))
## [1] 0.8066451
cor(hg19_overlap$H4Li, hg38_overlap$stYG_H_5_indH4_repM.bam, method = c("spearman"))
## [1] 0.757238
cor(hg19_overlap$H4Lu, hg38_overlap$stYG_H_6_indH4_repM.bam, method = c("spearman"))
## [1] 0.8270654
# Plots
plot(hg19_overlap$H1K, hg38_overlap$stYG_H_4_indH1_repM.bam)
abline(0,1, col = "red")

summary(hg19_overlap$H1K > hg38_overlap$stYG_H_4_indH1_repM.bam)
##    Mode   FALSE    TRUE 
## logical    4078    7376
summary(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -418095    -200      82     -20     385   90877
plot(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam, breaks= 100)
## Warning in plot.window(...): "breaks" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "breaks" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "breaks" is
## not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "breaks" is
## not a graphical parameter
## Warning in box(...): "breaks" is not a graphical parameter
## Warning in title(...): "breaks" is not a graphical parameter

plot(hg19_overlap$H1Li, hg38_overlap$stYG_H_5_indH1_repM.bam)
abline(0,1, col = "red")

plot(hg19_overlap$H1Lu, hg38_overlap$stYG_H_6_indH1_repM.bam)
abline(0,1, col = "red")

See if the same genes are the most

# Find top 10% of genes that are different
summary(abs(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0    107.0    320.0    958.8    858.0 418095.0
summary(abs(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam)/hg19_overlap$H1K)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.1209  0.2557     Inf  0.7681     Inf
sort((abs(hg19_overlap$H1K - hg38_overlap$stYG_H_4_indH1_repM.bam)/hg19_overlap$H1K))[11000:11454]
##   [1]   4.395939   4.403909   4.423977   4.427885   4.455128   4.457627
##   [7]   4.460613   4.490272   4.521008   4.536842   4.543750   4.545657
##  [13]   4.552941   4.554455   4.556923   4.563758   4.567742   4.569079
##  [19]   4.583333   4.593373   4.607370   4.614534   4.631491   4.632199
##  [25]   4.647053   4.650862   4.656250   4.656566   4.664234   4.664336
##  [31]   4.672727   4.675451   4.686957   4.707022   4.707986   4.714286
##  [37]   4.726923   4.727612   4.732342   4.736434   4.737589   4.749035
##  [43]   4.756906   4.774775   4.778947   4.779295   4.789189   4.802721
##  [49]   4.822511   4.832714   4.853933   4.856266   4.857143   4.870432
##  [55]   4.878431   4.879781   4.905138   4.920290   4.933071   4.934426
##  [61]   4.941799   4.947368   4.959757   4.965517   4.966002   4.981013
##  [67]   4.984055   4.985098   4.989305   4.990196   4.997010   5.000000
##  [73]   5.000000   5.015504   5.021505   5.022099   5.049628   5.054321
##  [79]   5.074169   5.077703   5.083333   5.123909   5.147253   5.157563
##  [85]   5.163265   5.192308   5.215054   5.225434   5.235127   5.248447
##  [91]   5.258929   5.259259   5.267857   5.270701   5.279570   5.280000
##  [97]   5.283505   5.317797   5.318182   5.327103   5.338235   5.343511
## [103]   5.409091   5.418136   5.423077   5.428571   5.443758   5.454106
## [109]   5.456790   5.462500   5.474453   5.484375   5.500000   5.507463
## [115]   5.516086   5.516923   5.526174   5.550459   5.557078   5.581140
## [121]   5.597561   5.624535   5.636752   5.651948   5.657609   5.670000
## [127]   5.680000   5.682500   5.686275   5.687805   5.688889   5.691304
## [133]   5.725000   5.744318   5.746914   5.791667   5.825893   5.829787
## [139]   5.881690   5.906667   5.909091   5.913333   5.915937   5.936427
## [145]   5.945205   5.958656   5.961672   5.972906   5.982343   5.985258
## [151]   6.000000   6.006452   6.029310   6.038961   6.058824   6.065041
## [157]   6.072937   6.075301   6.096317   6.129032   6.129950   6.147368
## [163]   6.148315   6.149758   6.170732   6.236364   6.240000   6.241604
## [169]   6.271111   6.290041   6.366071   6.377323   6.399083   6.403922
## [175]   6.418067   6.425532   6.426316   6.432000   6.446809   6.463542
## [181]   6.482143   6.500990   6.512605   6.515152   6.538546   6.595318
## [187]   6.629921   6.632768   6.676136   6.698225   6.705752   6.706349
## [193]   6.706667   6.712813   6.744681   6.770925   6.771429   6.771930
## [199]   6.779412   6.780362   6.781095   6.781915   6.820312   6.833333
## [205]   6.875000   6.885246   6.900000   6.905830   6.937500   6.939627
## [211]   6.947368   6.967105   6.969231   7.008621   7.014124   7.028579
## [217]   7.042105   7.069048   7.069767   7.074468   7.093897   7.129412
## [223]   7.133758   7.134058   7.268908   7.323529   7.324324   7.343891
## [229]   7.344768   7.351201   7.390000   7.408560   7.431267   7.451613
## [235]   7.470339   7.514019   7.517345   7.534653   7.535433   7.538462
## [241]   7.548246   7.583333   7.586420   7.593220   7.600000   7.604317
## [247]   7.605364   7.696970   7.748792   7.783505   7.795082   7.802198
## [253]   7.803571   7.818182   7.824176   7.858974   7.933532   7.945055
## [259]   8.000000   8.092715   8.131148   8.159004   8.195122   8.206557
## [265]   8.250000   8.360656   8.362543   8.377337   8.403727   8.403909
## [271]   8.423077   8.460377   8.540000   8.567568   8.602606   8.604701
## [277]   8.612766   8.652632   8.687500   8.694118   8.696275   8.698413
## [283]   8.802740   8.822344   8.879121   8.896104   8.919298   8.984190
## [289]   9.068027   9.079755   9.092715   9.171756   9.226190   9.284404
## [295]   9.312849   9.334129   9.361111   9.393393   9.400000   9.400000
## [301]   9.415842   9.440994   9.504239   9.505263   9.550360   9.662551
## [307]   9.697917   9.772947   9.887367   9.893443   9.901316  10.050847
## [313]  10.063636  10.110000  10.188049  10.210000  10.330435  10.405229
## [319]  10.409091  10.432665  10.436823  10.472222  10.474138  10.518519
## [325]  10.552219  10.614679  10.622356  10.672598  10.700000  10.775862
## [331]  10.800000  10.858513  11.004016  11.011050  11.100877  11.175573
## [337]  11.206147  11.258667  11.298387  11.386525  11.441048  11.650428
## [343]  11.656566  11.855769  11.876323  11.929412  11.994460  12.032258
## [349]  12.192308  12.211806  12.220930  12.267857  12.312102  12.411765
## [355]  12.493947  12.548873  12.580559  12.638655  12.701389  12.778689
## [361]  12.836923  12.907895  12.981928  13.031250  13.145455  13.448071
## [367]  13.490196  13.561111  13.578947  13.594059  13.614693  13.746988
## [373]  13.807107  14.028249  14.309278  14.641304  15.125000  15.302083
## [379]  15.342105  15.450000  15.559748  15.704348  15.770950  15.858025
## [385]  15.862857  15.864769  16.063492  16.160000  16.877419  17.208333
## [391]  17.392405  17.400000  17.857143  18.128472  18.200000  18.416058
## [397]  18.456311  18.600000  18.881517  19.200000  19.340000  19.472973
## [403]  19.699115  19.813333  19.857143  19.935685  19.971098  20.358804
## [409]  20.390152  20.569444  20.600000  20.899329  21.130435  21.453039
## [415]  21.766355  22.781818  23.250000  23.687500  24.000000  24.020000
## [421]  24.214559  24.473227  24.521429  24.704918  25.280702  25.390508
## [427]  25.400000  25.438469  25.578125  26.020000  27.105263  27.414141
## [433]  27.715596  30.119048  30.308772  31.055556  33.169355  37.572222
## [439]  38.846847  41.622200  45.514286  46.343750  49.269231  54.529412
## [445]  57.330275  63.200000  64.500000  66.206897  72.944444  76.481013
## [451] 128.982609 146.000000        Inf        Inf        Inf

Try

# Load libraries 

library("gplots")
## Warning: package 'gplots' was built under R version 3.4.4
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library("ggplot2")
library("RColorBrewer")
library("scales")
## Warning: package 'scales' was built under R version 3.4.4
library("edgeR")
## Loading required package: limma
library("limma")

# Import hg38 raw counts

hg38_gene_counts_with_mm <- read.delim("../data/hg38_gene_counts_with_mm.txt", comment.char="#")

# Load colors 

colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))

#Load the data

  #Raw counts

counts_genes <- hg38_gene_counts_with_mm[,8:22]
rownames(counts_genes) <- hg38_gene_counts_with_mm[,1] 
dim(counts_genes)
## [1] 49018    15
 #Sample information

samples <- read.csv("../../../Reg_Evo_Primates/data/Sample_info_RNAseq.csv")

samples <- samples[-17,]
samples <- samples[17:31,]
labels <- paste(samples$Species, samples$Tissue, sep=" ")

# Set expression cutoff and sample number
expr_cutoff <- 1.5
sample_number <- 8

# log2(CPM) adjusted for library sizes

dge_original <- DGEList(counts=as.matrix(counts_genes), genes=rownames(counts_genes), group = as.character(t(labels)))
dge_original <- calcNormFactors(dge_original)

cpm <- cpm(dge_original, normalized.lib.sizes=TRUE, log=TRUE, prior.count = 0.25)

cpm_filtered <- cpm[rowSums(cpm > expr_cutoff) >= sample_number, ]
dim(cpm_filtered)
## [1] 13071    15
# Find the original counts of all of the genes that fit the criteria 

inshared_lists = row.names(counts_genes) %in% rownames(cpm_filtered)
inshared_lists_data <- as.data.frame(inshared_lists)
counts_genes_in <- cbind(counts_genes, inshared_lists_data)
counts_genes_in_cutoff <- subset(counts_genes_in, inshared_lists_data == "TRUE")
counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:15]

dim(counts_genes_in_cutoff)
## [1] 13071    15
# Take the TMM of the genes that meet the criteria

dge_in_cutoff <- DGEList(counts=as.matrix(counts_genes_in_cutoff), genes=rownames(counts_genes_in_cutoff), group = as.character(t(labels)))
dge_in_cutoff <- calcNormFactors(dge_in_cutoff)

cpm_in_cutoff <- cpm(dge_in_cutoff, normalized.lib.sizes=TRUE, log=TRUE, prior.count = 0.25)
head(cpm_in_cutoff)
##                 stYG_H_3_indH2_repM.bam stYG_H_3_indH3_repM.bam
## ENSG00000188976                3.445083                3.288610
## ENSG00000187961                3.091040                2.289839
## ENSG00000188290                2.698140                5.113048
## ENSG00000188157                2.781694                4.664416
## ENSG00000078808                6.877040                6.604360
## ENSG00000176022                4.014646                4.120516
##                 stYG_H_3_indH4_repM.bam stYG_H_4_indH1_repM.bam
## ENSG00000188976                3.280032                3.166102
## ENSG00000187961                2.073925                2.413975
## ENSG00000188290                5.158115                2.111111
## ENSG00000188157                3.587279                6.913586
## ENSG00000078808                6.628762                6.546096
## ENSG00000176022                3.274499                4.110939
##                 stYG_H_4_indH2_repM.bam stYG_H_4_indH3_repM.bam
## ENSG00000188976                2.988987                2.445540
## ENSG00000187961                3.405969                2.101177
## ENSG00000188290                1.692352                2.365123
## ENSG00000188157                5.193476                5.519877
## ENSG00000078808                6.765384                6.090754
## ENSG00000176022                3.433560                3.598936
##                 stYG_H_4_indH4_repM.bam stYG_H_5_indH1_repM.bam
## ENSG00000188976                2.732881               3.7699904
## ENSG00000187961                2.399396               1.4296242
## ENSG00000188290                1.729368               0.5403596
## ENSG00000188157                6.105965               4.0934891
## ENSG00000078808                6.694457               7.5793344
## ENSG00000176022                3.150123               3.6611252
##                 stYG_H_5_indH2_repM.bam stYG_H_5_indH3_repM.bam
## ENSG00000188976                2.526011                2.841975
## ENSG00000187961                3.616051                2.753077
## ENSG00000188290                2.288226                0.834670
## ENSG00000188157                3.799966                3.486705
## ENSG00000078808                6.505643                6.694562
## ENSG00000176022                4.180932                4.950461
##                 stYG_H_5_indH4_repM.bam stYG_H_6_indH1_repM.bam
## ENSG00000188976                2.383634                2.520180
## ENSG00000187961                2.483016                1.712362
## ENSG00000188290                1.844112                3.034939
## ENSG00000188157                3.395296                4.090103
## ENSG00000078808                5.803008                6.218684
## ENSG00000176022                3.798839                3.473625
##                 stYG_H_6_indH2_repM.bam stYG_H_6_indH3_repM.bam
## ENSG00000188976                2.386100                2.361469
## ENSG00000187961                2.974863                1.544702
## ENSG00000188290                3.870048                3.855443
## ENSG00000188157                5.748594                4.977555
## ENSG00000078808                6.838017                6.188787
## ENSG00000176022                2.392943                2.412527
##                 stYG_H_6_indH4_repM.bam
## ENSG00000188976                2.447942
## ENSG00000187961                2.595070
## ENSG00000188290                2.493677
## ENSG00000188157                5.199994
## ENSG00000078808                6.614923
## ENSG00000176022                2.786884

Look at the correlation between normalized gene expression levels for hg19 and hg38

# Import hg38 raw counts

hg19_gene_counts <- read.delim("~/Desktop/Reg_Evo_Primates/data/counts_12184.txt")

# Load colors 

colors <- colorRampPalette(c(brewer.pal(9, "Blues")[1],brewer.pal(9, "Blues")[9]))(100)
pal <- c(brewer.pal(9, "Set1"), brewer.pal(8, "Set2"), brewer.pal(12, "Set3"))

#Load the data

  #Raw counts

counts_genes <- hg19_gene_counts[,17:31]
dim(counts_genes)
## [1] 12184    15
 #Sample information

samples <- read.csv("../../../Reg_Evo_Primates/data/Sample_info_RNAseq.csv")

samples <- samples[-17,]
samples <- samples[17:31,]
labels <- paste(samples$Species, samples$Tissue, sep=" ")

# Set expression cutoff and sample number
expr_cutoff <- 1.5
sample_number <- 8

# log2(CPM) adjusted for library sizes

dge_original <- DGEList(counts=as.matrix(counts_genes), genes=rownames(counts_genes), group = as.character(t(labels)))
dge_original <- calcNormFactors(dge_original)

cpm <- cpm(dge_original, normalized.lib.sizes=TRUE, log=TRUE, prior.count = 0.25)

cpm_filtered <- cpm[rowSums(cpm > expr_cutoff) >= sample_number, ]
dim(cpm_filtered)
## [1] 11925    15
# Find the original counts of all of the genes that fit the criteria 

inshared_lists = row.names(counts_genes) %in% rownames(cpm_filtered)
inshared_lists_data <- as.data.frame(inshared_lists)
counts_genes_in <- cbind(counts_genes, inshared_lists_data)
counts_genes_in_cutoff <- subset(counts_genes_in, inshared_lists_data == "TRUE")
counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:15]

dim(counts_genes_in_cutoff)
## [1] 11925    15
# Take the TMM of the genes that meet the criteria

dge_in_cutoff <- DGEList(counts=as.matrix(counts_genes_in_cutoff), genes=rownames(counts_genes_in_cutoff), group = as.character(t(labels)))
dge_in_cutoff <- calcNormFactors(dge_in_cutoff)

cpm_in_cutoff <- cpm(dge_in_cutoff, normalized.lib.sizes=TRUE, log=TRUE, prior.count = 0.25)
head(cpm_in_cutoff)
##                      H1K      H1Li     H1Lu      H2H      H2K     H2Li
## ENSG00000000003 6.984582  6.693656 5.215624 3.792256 7.322379 6.753069
## ENSG00000000419 5.708075  6.200571 5.927473 5.717481 5.578492 5.952570
## ENSG00000000457 4.404934  5.071402 4.618738 3.517853 4.275287 4.564971
## ENSG00000000460 2.886707  5.106913 3.434653 2.003426 2.095353 2.772073
## ENSG00000000938 4.179273  4.597522 7.994782 5.671196 3.857135 6.104425
## ENSG00000000971 6.705468 11.334213 7.678023 6.474441 4.853259 9.523595
##                     H2Lu        H3H      H3K      H3Li     H3Lu      H4H
## ENSG00000000003 4.295076 3.65723992 7.213281  7.852681 5.407003 4.401673
## ENSG00000000419 5.427417 5.86453069 5.976653  6.333556 5.126752 6.362019
## ENSG00000000457 4.315824 4.55425820 4.844506  5.110460 4.116082 3.429874
## ENSG00000000460 2.667075 0.03806332 3.112329  3.354505 2.574192 1.747500
## ENSG00000000938 7.082855 4.21047057 2.764864  5.857805 7.029649 5.035986
## ENSG00000000971 7.424777 6.33869400 5.434969 10.463236 7.241153 7.044580
##                      H4K      H4Li     H4Lu
## ENSG00000000003 6.491436  6.708852 4.579722
## ENSG00000000419 5.727743  5.952891 5.466685
## ENSG00000000457 4.207141  5.294858 4.289915
## ENSG00000000460 2.102830  3.256483 2.720726
## ENSG00000000938 3.940514  7.017853 8.365261
## ENSG00000000971 6.152267 10.316333 8.393495
# Normalized gene expression levels for hg19
cpm_12184 <- read.delim("~/Desktop/Reg_Evo_Primates/data/cpm_12184.txt")

cpm_hg19 <- cpm_12184[,17:31]
cpm_hg19 <- cbind(rownames(cpm_hg19), cpm_hg19)

#cpm_hg19 <- cpm_in_cutoff
#cpm_hg19 <- as.data.frame(rownames(cpm_12184), )


colnames_bjp <- c("Geneid", "H1K", "H1Li", "H1Lu", "H2H", "H2K", "H2Li", "H2Lu", "H3H", "H3K", "H3Li", "H3Lu", "H4H", "H4K", "H4Li", "H4Lu")  
colnames(cpm_hg19) <- colnames_bjp

# Normalized gene expression levels for hg38
cpm_hg38 <- cpm_in_cutoff
cpm_hg38 <- as.data.frame(cbind(rownames(cpm_hg38), cpm_hg38), stringsAsFactors = FALSE)

cpm_hg38[,2] <- as.numeric(cpm_hg38[,2])
cpm_hg38[,3] <- as.numeric(cpm_hg38[,3])
cpm_hg38[,4] <- as.numeric(cpm_hg38[,4])
cpm_hg38[,5] <- as.numeric(cpm_hg38[,5])
cpm_hg38[,6] <- as.numeric(cpm_hg38[,6])
cpm_hg38[,7] <- as.numeric(cpm_hg38[,7])
cpm_hg38[,8] <- as.numeric(cpm_hg38[,8])
cpm_hg38[,9] <- as.numeric(cpm_hg38[,9])
cpm_hg38[,10] <- as.numeric(cpm_hg38[,10])
cpm_hg38[,11] <- as.numeric(cpm_hg38[,11])
cpm_hg38[,12] <- as.numeric(cpm_hg38[,12])
cpm_hg38[,13] <- as.numeric(cpm_hg38[,13])
cpm_hg38[,14] <- as.numeric(cpm_hg38[,14])
cpm_hg38[,15] <- as.numeric(cpm_hg38[,15])
cpm_hg38[,16] <- as.numeric(cpm_hg38[,16])

colnames(cpm_hg38) <- c("Geneid", "stYG_H_3_indH2_repM.bam", "stYG_H_3_indH3_repM.bam", "stYG_H_3_indH4_repM.bam", "stYG_H_4_indH1_repM.bam", "stYG_H_4_indH2_repM.bam", "stYG_H_4_indH3_repM.bam", "stYG_H_4_indH4_repM.bam", "stYG_H_5_indH1_repM.bam", "stYG_H_5_indH2_repM.bam", "stYG_H_5_indH3_repM.bam", "stYG_H_5_indH4_repM.bam", "stYG_H_6_indH1_repM.bam", "stYG_H_6_indH2_repM.bam", "stYG_H_6_indH3_repM.bam", "stYG_H_6_indH4_repM.bam")

cpm_hg38 <- as.data.frame(cpm_hg38)

# Find overlap of genes

gene_compare <- rownames(cpm_hg19) %in% rownames(cpm_hg38)
summary(gene_compare)
##    Mode   FALSE    TRUE 
## logical     259   11925
gene_compare_merge <- merge(cpm_hg19, cpm_hg38, by = "Geneid")

hg19_overlap <- gene_compare_merge[,2:16]

hg38_overlap <- gene_compare_merge[,17:31]

cor(as.numeric(hg19_overlap$H1K), as.numeric(hg38_overlap$stYG_H_4_indH1_repM.bam))
## [1] 0.6854616
cor(as.numeric(hg19_overlap$H1Li), as.numeric(hg38_overlap$stYG_H_5_indH1_repM.bam))
## [1] 0.4678117
cor(as.numeric(hg19_overlap$H1Lu), as.numeric(hg38_overlap$stYG_H_6_indH1_repM.bam))
## [1] 0.7293269
cor(as.numeric(hg19_overlap$H2H), as.numeric(hg38_overlap$stYG_H_3_indH2_repM.bam))
## [1] 0.6854469
cor(as.numeric(hg19_overlap$H2K), as.numeric(hg38_overlap$stYG_H_4_indH2_repM.bam))
## [1] 1
cor(as.numeric(hg19_overlap$H2Li), as.numeric(hg38_overlap$stYG_H_5_indH2_repM.bam))
## [1] 0.6557819
cor(as.numeric(hg19_overlap$H2Lu), as.numeric(hg38_overlap$stYG_H_6_indH2_repM.bam))
## [1] 0.6878467
cor(as.numeric(hg19_overlap$H3H), as.numeric(hg38_overlap$stYG_H_3_indH3_repM.bam))
## [1] 0.4677697
cor(as.numeric(hg19_overlap$H3K), as.numeric(hg38_overlap$stYG_H_4_indH3_repM.bam))
## [1] 0.6557918
cor(as.numeric(hg19_overlap$H3Li), as.numeric(hg38_overlap$stYG_H_5_indH3_repM.bam))
## [1] 1
cor(as.numeric(hg19_overlap$H3Lu), as.numeric(hg38_overlap$stYG_H_6_indH3_repM.bam))
## [1] 0.6170721
cor(as.numeric(hg19_overlap$H4H), as.numeric(hg38_overlap$stYG_H_3_indH4_repM.bam))
## [1] 0.7293233
cor(as.numeric(hg19_overlap$H4K), as.numeric(hg38_overlap$stYG_H_4_indH4_repM.bam))
## [1] 0.6878504
cor(as.numeric(hg19_overlap$H4Li), as.numeric(hg38_overlap$stYG_H_5_indH4_repM.bam))
## [1] 0.6170756
cor(as.numeric(hg19_overlap$H4Lu), as.numeric(hg38_overlap$stYG_H_6_indH4_repM.bam))
## [1] 1
# Put in the same order


hg38_overlap_order <- cbind(hg38_overlap$stYG_H_4_indH1_repM.bam, hg38_overlap$stYG_H_5_indH1_repM.bam, hg38_overlap$stYG_H_6_indH1_repM.bam,
hg38_overlap$stYG_H_3_indH2_repM.bam, hg38_overlap$stYG_H_4_indH2_repM.bam, hg38_overlap$stYG_H_5_indH2_repM.bam, hg38_overlap$stYG_H_6_indH2_repM.bam, 
hg38_overlap$stYG_H_3_indH3_repM.bam, hg38_overlap$stYG_H_4_indH3_repM.bam, hg38_overlap$stYG_H_5_indH3_repM.bam, hg38_overlap$stYG_H_6_indH3_repM.bam, 
hg38_overlap$stYG_H_3_indH4_repM.bam, hg38_overlap$stYG_H_4_indH4_repM.bam, hg38_overlap$stYG_H_5_indH4_repM.bam, hg38_overlap$stYG_H_6_indH4_repM.bam)

hg38_overlap_order <- as.data.frame(hg38_overlap_order)

rank(hg19_overlap[1,])
##  H1K H1Li H1Lu  H2H  H2K H2Li H2Lu  H3H  H3K H3Li H3Lu  H4H  H4K H4Li H4Lu 
##   12    9    6    2   14   11    3    1   13   15    7    4    8   10    5
rank(hg38_overlap_order[1,])
##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 
##   2   1   4  12  14  13   8   9  11  15  10   6   3   7   5
cor(rank(hg19_overlap[1,]), rank(hg38_overlap_order[1,]))
## [1] 0.2642857
cor(rank(hg19_overlap[2,]), rank(hg38_overlap_order[2,]))
## [1] 0.3571429
rank_cor <- array(NA, dim = c(nrow(hg19_overlap),1))

for(i in 1:nrow(hg19_overlap)){
  rank_cor[i, 1] <- cor(rank(hg19_overlap[i,]), rank(hg38_overlap_order[i,]))
}

rank_level <- array(NA, dim = c(nrow(hg19_overlap),2))

for(i in 1:nrow(hg19_overlap)){
  rank_level[i, 1] <- mean(as.numeric(hg19_overlap[i,]))
  rank_level[i, 2] <- mean(as.numeric(hg38_overlap[i,]))
}

cor(rank_cor[,1], rank_level[,1])
## [1] 0.01615501
cor(rank_cor[,1], rank_level[,2])
## [1] 0.01615423