# Load gene expression levels
cpm_12184 <- read.delim("../../../Reg_Evo_Primates/data/cpm_12184.txt")
dim(cpm_12184)
## [1] 12184 47
# Take gene PCs
pca_genes <- prcomp(t(cpm_12184), scale = T)
scores <- pca_genes$x
gene_pcs <- scores
# Load libraries
# install.packages("polycor")
library("polycor")
# Load list of technical factors
RNA_seq_info <- read.csv("/Users/laurenblake/Dropbox/primate_BS-seq_project/Correlation with PCs - Lauren/RNA_seq_info_testing.csv")
RNA_seq_info <- as.data.frame(RNA_seq_info)
dim(RNA_seq_info)
## [1] 47 31
#Create plots for each of the possible confounders versus PCs 1-5
#pdf('../data/VarVsGenePCs.pdf')
#for (i in 3:length(RNA_seq_info)) {
# par(mfrow=c(1,5))
# plot(RNA_seq_info[,i], gene_pcs[,1], ylab = "PC1", xlab = " ")
# plot(RNA_seq_info[,i], gene_pcs[,2], ylab = "PC2", xlab = " ")
# plot(RNA_seq_info[,i], gene_pcs[,3], ylab = "PC3", xlab = " ")
# plot(RNA_seq_info[,i], gene_pcs[,4], ylab = "PC4", xlab = " ")
# plot(RNA_seq_info[,i], gene_pcs[,5], ylab = "PC5", xlab = " ")
# title(xlab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])), outer = TRUE, line = -2)
# mtext(substitute(paste('PCs vs. ', k), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE)
#}
#dev.off()
checkPC1 <- lm(gene_pcs[,1] ~ as.factor(RNA_seq_info$Tissue))
summary(checkPC1)
##
## Call:
## lm(formula = gene_pcs[, 1] ~ as.factor(RNA_seq_info$Tissue))
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.900 -12.990 -2.963 16.862 39.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -29.033 7.043 -4.122 0.000168
## as.factor(RNA_seq_info$Tissue)kidney 36.593 9.751 3.753 0.000519
## as.factor(RNA_seq_info$Tissue)liver 98.649 9.751 10.117 6.06e-13
## as.factor(RNA_seq_info$Tissue)lung -21.530 9.751 -2.208 0.032625
##
## (Intercept) ***
## as.factor(RNA_seq_info$Tissue)kidney ***
## as.factor(RNA_seq_info$Tissue)liver ***
## as.factor(RNA_seq_info$Tissue)lung *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.36 on 43 degrees of freedom
## Multiple R-squared: 0.8081, Adjusted R-squared: 0.7947
## F-statistic: 60.35 on 3 and 43 DF, p-value: 1.859e-15
checkPC1 <- lm(gene_pcs[,2] ~ as.factor(RNA_seq_info$Tissue))
summary(checkPC1)
##
## Call:
## lm(formula = gene_pcs[, 2] ~ as.factor(RNA_seq_info$Tissue))
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.175 -22.203 7.079 14.446 52.474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -65.964 7.823 -8.432 1.17e-10
## as.factor(RNA_seq_info$Tissue)kidney 68.287 10.831 6.305 1.32e-07
## as.factor(RNA_seq_info$Tissue)liver 90.714 10.831 8.375 1.40e-10
## as.factor(RNA_seq_info$Tissue)lung 99.357 10.831 9.173 1.11e-11
##
## (Intercept) ***
## as.factor(RNA_seq_info$Tissue)kidney ***
## as.factor(RNA_seq_info$Tissue)liver ***
## as.factor(RNA_seq_info$Tissue)lung ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.95 on 43 degrees of freedom
## Multiple R-squared: 0.7034, Adjusted R-squared: 0.6827
## F-statistic: 33.99 on 3 and 43 DF, p-value: 2.019e-11
checkPC2 <- lm(gene_pcs[,2] ~ as.factor(RNA_seq_info$Species))
summary(checkPC2)
##
## Call:
## lm(formula = gene_pcs[, 2] ~ as.factor(RNA_seq_info$Species))
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.400 -11.763 9.765 27.362 47.606
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 8.849 10.064 0.879
## as.factor(RNA_seq_info$Species)human 15.567 14.467 1.076
## as.factor(RNA_seq_info$Species)rhesus macaque -40.588 14.232 -2.852
## Pr(>|t|)
## (Intercept) 0.3840
## as.factor(RNA_seq_info$Species)human 0.2878
## as.factor(RNA_seq_info$Species)rhesus macaque 0.0066 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.25 on 44 degrees of freedom
## Multiple R-squared: 0.2696, Adjusted R-squared: 0.2364
## F-statistic: 8.119 on 2 and 44 DF, p-value: 0.0009972
#Make an empty matrix to put all of the data in
pvaluesandr2 = matrix(data = NA, nrow = 5, ncol = 28*2, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Species", "Species R^2", "Tissue", "Tissue R^2", "Individual", "Ind. R^2", "RNA extraction date", "Extract. date R^2", "Multiplexing index sequence", "Multi. index R^2", "Multiplexing mix codes", "Multi. mix R^2", "Sequencing location: at UCGF", "Location R^2", "Total number of reads sequenced", "Read Seq R^2", "Percentage of bps trimmed (adapters)", "Trimmed R^2", "Number of reads shorter than 20bp removed", "Removed R^2", "Maximum read length after trimming", "Max. length R^2", "Total number of reads processed in tophat", "Reads processed R^2", "Total number of mapped reads", "Mapped reads R^2", "Percentage of mapped reads overlapping a junction", "Perc. junction R^2", "Number of junctions", "Num. junctions R^2", "Number of reads mapped on orthologous exons", "OrthoExons R^2", "Number of orthologous exons with at least 1 mapped read", "Ortho exons read R^2", "Number of orthologous genes with at least 1 mapped read", "Ortho genes read R^2", "RNA concentration (ng/uL)", "RNA conc. R^2", "RIN score", "RIN R^2", "Library concentration (ng/uL)", "Lib. conc. R^2", "Library fragments size (bp)", "Lib. frag. R^2", "Sex", "R^2", "Age quantile", "R^2", "Hours postmortem", "R^2", "Cause of death", "R^2", "Possible conditions affecting tissues", "R^2", "Source", "R^2")))
#Check lm to see how well the variables containing ordinal data are correlated with a PC
#For PCs 1-5
j=1
for (i in 3:9){
for (j in 1:5){
checkPC1 <- lm(gene_pcs[,j] ~ as.factor(RNA_seq_info[,i]))
#Get the summary statistics from it
summary(checkPC1)
#Get the p-value of the F-statistic
summary(checkPC1)$fstatistic
fstat <- as.data.frame(summary(checkPC1)$fstatistic)
p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,])
#Fraction of the variance explained by the model
r2_value <- summary(checkPC1)$r.squared
#Put the summary statistics into the matrix w
pvaluesandr2[j, (2*i-5)] <- p_fstat
pvaluesandr2[j, (2*i-4)] <- r2_value
}
}
#Check lm to see how well the variables containing numerical data are correlated with a PC
j=1
for (j in 1:5){
for (i in 10:24){
checkPC <- lm(gene_pcs[,j] ~ RNA_seq_info[,i])
#Get the summary statistics from it
summary(checkPC)
#Get the p-value of the F-statistic
summary(checkPC)$fstatistic
fstat <- as.data.frame(summary(checkPC)$fstatistic)
p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,])
#Fraction of the variance explained by the model
r2_value <- summary(checkPC)$r.squared
#Put the summary statistics into the matrix a
pvaluesandr2[j, 2*i-5] <- p_fstat
pvaluesandr2[j, 2*i-4] <- r2_value
i = i+1
}
j=j+1
}
j=1
for (i in 25:30){
for (j in 1:5){
checkPC1 <- lm(gene_pcs[,j] ~ as.factor(RNA_seq_info[,i]))
#Get the summary statistics from it
summary(checkPC1)
#Get the p-value of the F-statistic
summary(checkPC1)$fstatistic
fstat <- as.data.frame(summary(checkPC1)$fstatistic)
p_fstat <- 1-pf(fstat[1,], fstat[2,], fstat[3,])
#Fraction of the variance explained by the model
r2_value <- summary(checkPC1)$r.squared
#Put the summary statistics into the matrix w
pvaluesandr2[j, (2*i-5)] <- p_fstat
pvaluesandr2[j, (2*i-4)] <- r2_value
}
}
#Plot the residuals to look for violations of the assumptions of lm
pdf('../data/Residuals_vs_var_GenePCs.pdf')
i = 3
for (i in 3:9){
par(mfrow=c(1,5))
checkPC1 <- lm(gene_pcs[,1] ~ as.factor(RNA_seq_info[,i]))
plot(RNA_seq_info[,i], resid(checkPC1), ylab = "Residuals (lm with PC1)", xlab = " ")
checkPC2 <- lm(gene_pcs[,2] ~ as.factor(RNA_seq_info[,i]))
plot(RNA_seq_info[,i], resid(checkPC2), ylab = "Residuals (lm with PC2)", xlab = " ")
checkPC3 <- lm(gene_pcs[,3] ~ as.factor(RNA_seq_info[,i]))
plot(RNA_seq_info[,i], resid(checkPC3), ylab = "Residuals (lm with PC3)", xlab = " ")
checkPC4 <- lm(gene_pcs[,4] ~ as.factor(RNA_seq_info[,i]))
plot(RNA_seq_info[,i], resid(checkPC4), ylab = "Residuals (lm with PC4)", xlab = " ")
checkPC5 <- lm(gene_pcs[,5] ~ as.factor(RNA_seq_info[,i]))
plot(RNA_seq_info[,i], resid(checkPC5), ylab = "Residuals (lm with PC5)", xlab = " ")
title(xlab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])), outer = TRUE, line = -2)
mtext(substitute(paste('Residuals vs. ', k), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE)
}
#Need to divide this up because #22 has an NA in it
i = 10
for (i in 10:21){
par(mfrow=c(1,5))
checkPC1 <- lm(gene_pcs[,1] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC1), ylab = "Residuals (lm with PC1)", xlab = " ")
checkPC2 <- lm(gene_pcs[,2] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC2), ylab = "Residuals (lm with PC2)", xlab = " ")
checkPC3 <- lm(gene_pcs[,3] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC3), ylab = "Residuals (lm with PC3)", xlab = " ")
checkPC4 <- lm(gene_pcs[,4] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC4), ylab = "Residuals (lm with PC4)", xlab = " ")
checkPC5 <- lm(gene_pcs[,5] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC5), ylab = "Residuals (lm with PC5)", xlab = " ")
title(xlab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])), outer = TRUE, line = -2)
mtext(substitute(paste('Residuals vs. ', k), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE)
}
#Skip #22 because there is an NA in it
for (i in 23:24){
par(mfrow=c(1,5))
checkPC1 <- lm(gene_pcs[,1] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC1), ylab = "Residuals (lm with PC1)", xlab = " ")
checkPC2 <- lm(gene_pcs[,2] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC2), ylab = "Residuals (lm with PC2)", xlab = " ")
checkPC3 <- lm(gene_pcs[,3] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC3), ylab = "Residuals (lm with PC3)", xlab = " ")
checkPC4 <- lm(gene_pcs[,4] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC4), ylab = "Residuals (lm with PC4)", xlab = " ")
checkPC5 <- lm(gene_pcs[,5] ~ RNA_seq_info[,i])
plot(RNA_seq_info[,i], resid(checkPC5), ylab = "Residuals (lm with PC5)", xlab = " ")
title(xlab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])), outer = TRUE, line = -2)
mtext(substitute(paste('Residuals vs. ', k), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE)
}
#Get rid of the R^2 values
all_col <- 1:ncol(pvaluesandr2)
matrix_pval <- pvaluesandr2[ , all_col%%2==1 ]
#Find which variables/PC combinations are p-value < 0.05
TorF_matrix <- matrix_pval <=0.05
sum(matrix_pval <= 0.05/(28*5))
## [1] 21
#Find which variables/PC combinations are p-value < 0.05 not including species or tissue (because those explain PC1 and PC2)
matrix_pval_no_tissue_or_species = matrix_pval[ , 3:28]
matrix_pval_no_tissue_or_species
## Individual RNA extraction date Multiplexing index sequence
## PC1 8.577332e-01 1.933096e-01 3.202755e-05
## PC2 3.089533e-01 8.775476e-03 2.494550e-03
## PC3 4.210981e-03 6.385467e-06 1.665273e-01
## PC4 9.971820e-01 6.167683e-01 1.541532e-09
## PC5 4.996004e-15 0.000000e+00 9.542053e-01
## Multiplexing mix codes Sequencing location: at UCGF
## PC1 3.828374e-01 0.40556453
## PC2 8.784915e-03 0.85463260
## PC3 3.806629e-05 0.51996073
## PC4 1.927352e-01 0.62874566
## PC5 3.820715e-01 0.07672979
## Total number of reads sequenced Percentage of bps trimmed (adapters)
## PC1 0.6623602 0.43484612
## PC2 0.8367218 0.03093471
## PC3 0.9498725 0.56268886
## PC4 0.1491588 0.55118204
## PC5 0.6813582 0.03486093
## Number of reads shorter than 20bp removed
## PC1 0.32598741
## PC2 0.02745845
## PC3 0.68306619
## PC4 0.47137991
## PC5 0.04640738
## Maximum read length after trimming
## PC1 0.33637965
## PC2 0.92113549
## PC3 0.75683691
## PC4 0.59778768
## PC5 0.05145475
## Total number of reads processed in tophat Total number of mapped reads
## PC1 0.7051527 0.8420494
## PC2 0.9463660 0.0369636
## PC3 0.9269870 0.3729374
## PC4 0.1578441 0.0930245
## PC5 0.5848903 0.7499138
## Percentage of mapped reads overlapping a junction Number of junctions
## PC1 1.630216e-06 0.002629987
## PC2 5.446252e-01 0.003137455
## PC3 6.527941e-01 0.243584612
## PC4 1.540735e-02 0.179619438
## PC5 6.196622e-01 0.089573389
## Number of reads mapped on orthologous exons
## PC1 0.12500946
## PC2 0.12756466
## PC3 0.02997633
## PC4 0.01379170
## PC5 0.08964825
## Number of orthologous exons with at least 1 mapped read
## PC1 0.004114919
## PC2 0.059787325
## PC3 0.000255304
## PC4 0.078055122
## PC5 0.077428146
## Number of orthologous genes with at least 1 mapped read
## PC1 0.005239234
## PC2 0.008480924
## PC3 0.005243010
## PC4 0.055612828
## PC5 0.006306295
## RNA concentration (ng/uL) RIN score Library concentration (ng/uL)
## PC1 0.0148708408 3.193556e-01 0.05256745
## PC2 0.0578165224 4.364496e-03 0.01417761
## PC3 0.0342043520 3.532540e-06 0.74147481
## PC4 0.3831733717 2.661127e-02 0.79123942
## PC5 0.0001948362 4.790354e-01 0.02505369
## Library fragments size (bp) Sex Age quantile Hours postmortem
## PC1 0.944720057 8.577332e-01 0.4981614 0.31353986
## PC2 0.060512968 3.089533e-01 0.3851076 0.05951400
## PC3 0.400971333 4.210981e-03 0.3017935 0.00477844
## PC4 0.124460815 9.971820e-01 0.7967752 0.73082727
## PC5 0.001394967 4.996004e-15 0.0619115 0.10747192
## Cause of death Possible conditions affecting tissues Source
## PC1 2.915113e-01 0.596620345 0.596620345
## PC2 1.971076e-02 0.103215509 0.103215509
## PC3 2.234941e-05 0.000418567 0.000418567
## PC4 8.075915e-01 0.968399378 0.968399378
## PC5 0.000000e+00 0.000000000 0.000000000
sum(matrix_pval_no_tissue_or_species <= 0.05/(28*5))
## [1] 15
#Distribution of p-values adjusted by FDR not including species or tissue
fdr_val = p.adjust(matrix_pval_no_tissue_or_species, method = "fdr", n = length(matrix_pval_no_tissue_or_species))
fdr_val_order = fdr_val[order(fdr_val)]
hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
matrix_fdr_val = matrix(data = fdr_val, nrow = 5, ncol = 26, dimnames = list(c("PC1", "PC2", "PC3", "PC4", "PC5"), c("Individual", "RNA extraction date", "Multiplexing index sequence", "Multiplexing mix codes", "Sequencing location: at UCGF", "Total number of reads sequenced", "Percentage of bps trimmed (adapters)", "Number of reads shorter than 20bp removed", "Maximum read length after trimming", "Total number of reads processed in tophat", "Total number of mapped reads", "Percentage of mapped reads overlapping a junction", "Number of junctions", "Number of reads mapped on orthologous exons", "Number of orthologous exons with at least 1 mapped read", "Number of orthologous genes with at least 1 mapped read", "RNA concentration (ng/uL)", "RIN score", "Library concentration (ng/uL)", "Library fragments size (bp)", "Sex", "Age quantile", "Hours postmortem", "Cause of death", "Possible conditions affecting tissues", "Source")))
matrix_fdr_val
## Individual RNA extraction date Multiplexing index sequence
## PC1 9.292110e-01 3.490312e-01 3.469651e-04
## PC2 5.216095e-01 3.568872e-02 1.706798e-02
## PC3 2.269538e-02 8.301107e-05 3.137471e-01
## PC4 9.971820e-01 7.897656e-01 2.862845e-08
## PC5 1.082467e-13 0.000000e+00 9.835306e-01
## Multiplexing mix codes Sequencing location: at UCGF
## PC1 0.5821393960 0.5991294
## PC2 0.0356887174 0.9292110
## PC3 0.0003806629 0.7268268
## PC4 0.3490312399 0.7935625
## PC5 0.5821393960 0.1780205
## Total number of reads sequenced Percentage of bps trimmed (adapters)
## PC1 0.8200650 0.63516849
## PC2 0.9292110 0.09575031
## PC3 0.9835306 0.76197450
## PC4 0.2894126 0.75424910
## PC5 0.8298935 0.10299819
## Number of reads shorter than 20bp removed
## PC1 0.52972955
## PC2 0.08923997
## PC3 0.82989350
## PC4 0.68088209
## PC5 0.13115129
## Maximum read length after trimming
## PC1 0.5398686
## PC2 0.9835306
## PC3 0.8784714
## PC4 0.7771240
## PC5 0.1423217
## Total number of reads processed in tophat Total number of mapped reads
## PC1 0.8487949 0.9292110
## PC2 0.9835306 0.1067837
## PC3 0.9835306 0.5821394
## PC4 0.3017608 0.2015531
## PC5 0.7771240 0.8782774
## Percentage of mapped reads overlapping a junction Number of junctions
## PC1 2.649101e-05 0.01709491
## PC2 7.532051e-01 0.01942234
## PC3 8.159926e-01 0.43378082
## PC4 5.563764e-02 0.33357896
## PC5 7.897656e-01 0.19753005
## Number of reads mapped on orthologous exons
## PC1 0.25001893
## PC2 0.25126373
## PC3 0.09504690
## PC4 0.05420852
## PC5 0.19753005
## Number of orthologous exons with at least 1 mapped read
## PC1 0.022695382
## PC2 0.148428035
## PC3 0.002212635
## PC4 0.178020453
## PC5 0.178020453
## Number of orthologous genes with at least 1 mapped read
## PC1 0.02434255
## PC2 0.03568872
## PC3 0.02434255
## PC4 0.14754424
## PC5 0.02826960
## RNA concentration (ng/uL) RIN score Library concentration (ng/uL)
## PC1 0.055234552 5.255219e-01 0.14237018
## PC2 0.148428035 2.269538e-02 0.05420852
## PC3 0.102998195 5.102557e-05 0.87628841
## PC4 0.582139396 8.870424e-02 0.90860325
## PC5 0.001809193 6.843363e-01 0.08570998
## Library fragments size (bp) Sex Age quantile Hours postmortem
## PC1 0.98353062 9.292110e-01 0.7039238 0.5225664
## PC2 0.14842803 5.216095e-01 0.5821394 0.1484280
## PC3 0.59912943 2.269538e-02 0.5216095 0.0238922
## PC4 0.25001893 9.971820e-01 0.9086032 0.8716289
## PC5 0.01007476 1.082467e-13 0.1490462 0.2217675
## Cause of death Possible conditions affecting tissues Source
## PC1 0.5121143899 0.777123978 0.777123978
## PC2 0.0692540265 0.216419615 0.216419615
## PC3 0.0002641294 0.003200806 0.003200806
## PC4 0.9129295702 0.983530618 0.983530618
## PC5 0.0000000000 0.000000000 0.000000000
write.csv(matrix_fdr_val, "../data/matrix_fdr_val_RNA.csv")
# Number of values significant at 10% FDR not including species or tissue
sum(matrix_fdr_val <= 0.1)
## [1] 42
#Get the coordinates of which variables/PC combinations are significant at FDR 10%
TorF_matrix_fdr <- matrix_fdr_val <=0.1
coor_to_check <- which(matrix_fdr_val <= 0.1, arr.ind=T)
coor_to_check <- as.data.frame(coor_to_check)
# Number of variables significant at 10% FDR not including species or tissue (note: off by 4 from column number in RNA_seq_info file; see notes in Part Two)
coor_to_check_col <- coor_to_check$col
unique(coor_to_check_col)
## [1] 1 2 3 4 7 8 12 13 14 15 16 17 18 19 20 21 23 24 25 26
length(unique(coor_to_check_col))
## [1] 20
** Conclusions from Part I**
The following variables are associated with one of the PCs tested and will be investigated further in Part 2.
In coor_to_check_col, row is the PC # and col is the column # -4 that is associated with the PC Want to take the coor_to_check_col column # and add four. That is the variable that we should check to see if it correlates with tissue or species.
var_numb = unique(coor_to_check_col) + 4
#Making the variable versus species and tissue graphs into a file called VarVsSpeciesTissue.pdf
var.numb <- as.data.frame(var_numb)
#pdf('../data/Var_GenePCsVsSpeciesTissue.pdf')
#for (i in var.numb[1:nrow(var.numb),]) {
#par(mfrow=c(1,2))
#plot(RNA_seq_info$Species, RNA_seq_info[,i], xlab = "Species", ylab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])))
#plot(RNA_seq_info$Tissue, RNA_seq_info[,i], xlab = "Tissue", ylab = substitute(paste(k), list(k=colnames(RNA_seq_info)[i])))
#mtext(substitute(paste(k, ' vs. Species and Tissue'), list(k=colnames(RNA_seq_info)[i])), side = 3, line = -2, outer = TRUE)
#}
#dev.off()
#Testing to see if differences across variable groups are statistically significant for species. If statistically significant, then we should investigate further whether this variable is confounded with species or tissue.
#Make a matrix to store the p-values
pvalues_species = matrix(data = NA, nrow = 1, ncol = 19, dimnames = list(c("p-value"), c("Individual", "Age quantile", "Hours postmortem", "Possible conditions affecting tissues", "Source", "RNA extraction date", "Multiplexing index sequence", "Multiplexing mix codes", "Percentage of bps trimmed (adaptors)", "Number of reads shorter than 20bp removed", "Percentage of mapped reads overlapping a junction", "Number of junctions", "Number of reads mapped on orthologous exons", "Number of orthologous exons with at least 1 mapped read", "Number of orthologous genes with at least 1 mapped read", "RNA concentration (ng/uL)", "RIN score", "Library concentration (ng/uL)", "Library fragments size (bp)")))
cat_var <- c(5, 27, 28, 29, 30, 6, 7, 8)
num_var <- c(11, 12, 16, 17, 18, 19, 20, 21, 22, 23, 24)
#Performing this test of significance for variables that are categorical data (Using Pearson's chi-squared test)
j=1
for (i in cat_var) {
pval_chi <- chisq.test(as.factor(RNA_seq_info[,i]), as.factor(RNA_seq_info$Species))$p.value
pvalues_species[,j] <- pval_chi
j=j+1
}
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Species)): Chi-squared approximation may be
## incorrect
#Performing this test of significance for variables that are numerical data (Using an ANOVA)
j = 9
for (i in num_var) {
summary_anova = summary(aov(RNA_seq_info[,i]~ as.factor(RNA_seq_info$Species)))
data_summary_anova <- as.data.frame(summary_anova[[1]]$`Pr(>F)`)
p_val_anova <- data_summary_anova[1,]
pvalues_species[,j] <- p_val_anova
j=j+1
}
#Testing to see if differences across variable groups are statistically significant for tissues. If statistically significant, then we should investigate further whether this variable is confounded with species or tissue.
pvalues_tissue = matrix(data = NA, nrow = 1, ncol = 19, dimnames = list(c("p-value"), c("Individual", "Age quantile", "Hours postmortem", "Possible conditions affecting tissues", "Source", "RNA extraction date", "Multiplexing index sequence", "Multiplexing mix codes", "Percentage of bps trimmed (adaptors)", "Number of reads shorter than 20bp removed", "Percentage of mapped reads overlapping a junction", "Number of junctions", "Number of reads mapped on orthologous exons", "Number of orthologous exons with at least 1 mapped read", "Number of orthologous genes with at least 1 mapped read", "RNA concentration (ng/uL)", "RIN score", "Library concentration (ng/uL)", "Library fragments size (bp)")))
#Performing this test of significance for variables that are categorical data (Using Pearson's chi-squared test)
j=1
for (i in cat_var) {
pval_chi <- chisq.test(as.factor(RNA_seq_info[,i]), as.factor(RNA_seq_info$Tissue))$p.value
pvalues_tissue[,j] <- pval_chi
j=j+1
}
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
## Warning in chisq.test(as.factor(RNA_seq_info[, i]),
## as.factor(RNA_seq_info$Tissue)): Chi-squared approximation may be incorrect
#Performing this test of significance for variables that are numerical data (Using an ANOVA)
j = 9
for (i in num_var) {
summary_anova = summary(aov(RNA_seq_info[,i]~ as.factor(RNA_seq_info$Tissue)))
data_summary_anova <- as.data.frame(summary_anova[[1]]$`Pr(>F)`)
p_val_anova <- data_summary_anova[1,]
pvalues_tissue[,j] <- p_val_anova
j=j+1
}
collapse_table <- rbind(pvalues_tissue, pvalues_species)
rownames(collapse_table) <- c("Tissue", "Species")
collapse_table
## Individual Age quantile Hours postmortem
## Tissue 1.000000e+00 9.999429e-01 1.000000e+00
## Species 7.084425e-11 4.663809e-05 8.591612e-16
## Possible conditions affecting tissues Source
## Tissue 1.000000e+00 1.000000e+00
## Species 4.556773e-13 4.556773e-13
## RNA extraction date Multiplexing index sequence
## Tissue 9.999624e-01 7.313637e-07
## Species 7.149979e-17 9.951634e-01
## Multiplexing mix codes Percentage of bps trimmed (adaptors)
## Tissue 8.963210e-01 0.273399046
## Species 4.166637e-07 0.007347325
## Number of reads shorter than 20bp removed
## Tissue 0.229196235
## Species 0.005859874
## Percentage of mapped reads overlapping a junction
## Tissue 0.0001309289
## Species 0.2729677718
## Number of junctions Number of reads mapped on orthologous exons
## Tissue 2.566692e-04 0.01821964
## Species 8.155021e-05 0.30290842
## Number of orthologous exons with at least 1 mapped read
## Tissue 2.551611e-08
## Species 5.853704e-01
## Number of orthologous genes with at least 1 mapped read
## Tissue 2.913679e-07
## Species 4.263982e-02
## RNA concentration (ng/uL) RIN score
## Tissue 0.0014034907 6.370173e-01
## Species 0.0008347635 2.263714e-08
## Library concentration (ng/uL) Library fragments size (bp)
## Tissue 0.283235201 0.2664092315
## Species 0.002485755 0.0008550811
#Calculate q-values (FDR = 10%)
fdr_val = p.adjust(collapse_table, method = "fdr", n = length(collapse_table)*2)
fdr_val_order = fdr_val[order(fdr_val)]
hist(fdr_val_order, ylab = "BH-adjusted p-values", main = "Distribution of Benjamini and Hochberg adjusted p-values", breaks = 10)
collapse_table_fdr_val = matrix(data = fdr_val, nrow = 2, ncol = 19, dimnames = list(c("Tissue", "Species"), c("Individual", "Age quantile", "Hours postmortem", "Possible conditions affecting tissues", "Source", "RNA extraction date", "Multiplexing index sequence", "Multiplexing mix codes", "Percentage of bps trimmed (adaptors)", "Number of reads shorter than 20bp removed", "Percentage of mapped reads overlapping a junction", "Number of junctions", "Number of reads mapped on orthologous exons", "Number of orthologous exons with at least 1 mapped read", "Number of orthologous genes with at least 1 mapped read", "RNA concentration (ng/uL)", "RIN score", "Library concentration (ng/uL)", "Library fragments size (bp)")))
collapse_table_fdr_val
## Individual Age quantile Hours postmortem
## Tissue 1.000000e+00 1.0000000000 1.000000e+00
## Species 1.076833e-09 0.0003222268 3.264812e-14
## Possible conditions affecting tissues Source
## Tissue 1.000000e+00 1.000000e+00
## Species 8.657869e-12 8.657869e-12
## RNA extraction date Multiplexing index sequence
## Tissue 1.000000e+00 5.558364e-06
## Species 5.433984e-15 1.000000e+00
## Multiplexing mix codes Percentage of bps trimmed (adaptors)
## Tissue 1.000000e+00 0.79725464
## Species 3.518494e-06 0.02791984
## Number of reads shorter than 20bp removed
## Tissue 0.7573441
## Species 0.0234395
## Percentage of mapped reads overlapping a junction
## Tissue 0.0007654306
## Species 0.7972546398
## Number of junctions Number of reads mapped on orthologous exons
## Tissue 0.0013933472 0.06593773
## Species 0.0005164846 0.82218001
## Number of orthologous exons with at least 1 mapped read
## Tissue 2.77032e-07
## Species 1.00000e+00
## Number of orthologous genes with at least 1 mapped read
## Tissue 2.767995e-06
## Species 1.473012e-01
## RNA concentration (ng/uL) RIN score
## Tissue 0.006274429 1.00000e+00
## Species 0.004061635 2.77032e-07
## Library concentration (ng/uL) Library fragments size (bp)
## Tissue 0.79725464 0.797254640
## Species 0.01049541 0.004061635
#Find number below it
sum(collapse_table_fdr_val <= 0.1)
## [1] 21
#write.csv(collapse_table_fdr_val, "../data/collapse_table_fdr_val_RNA2.csv")
Conclusions from Part 2:
The following variables are associated with species: * Individual * RNA extraction date * Multiplexing mix codes * Percentage of bps trimmed (adaptors) * Number of reads shorter than 20bp removed * Number of junctions * RNA concentration * RIN score * Library concentration * Library fragments size
The following variables are associated with tissue: * Multiplexing index sequence * Percentage of mapped reads overlapping a junction * Number of junctions * Number of reads mapped on othologous exons * Number of orthologous exons with at least 1 mapped read * Number of orthologous genes with at least 1 mapped read * RNA concentration
Check for degree of similarity between the values of the different variables (not including species or tissue) For ones that show high correlation (multi-colinearity), we will only want to incorporate one of these variables into the model Note: the first 4 variables are ordinal, the others are numerical.
There is one pair of correlated variables: the number of orthologous exons with at least 1 mapped read and number of orthologous genes with at least 1 mapped read.