Here, we want to compare the variance of GTex LCLs to other GTex tissues. In GTex, since LCLs have 114 samples, we will pick the following tissues with similar sample sizes:
# Library
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
# Function
bjp<-
theme(
axis.text.y = element_text(size = 9,face = "bold",color = "black"),
axis.title.y = element_text(size = 10,face = "bold",color = "black"),
axis.text.x = element_text(size = 9,face = "bold",color = "black"),
axis.title.x = element_text(size = 10,face = "bold",color = "black"),
plot.title = element_text(size = 13, face = "bold"))
# Import file
Cells_EBV.transformed_lymphocytes_Analysis.v6p.normalized.expression.3 <- read.delim("/Users/laurenblake/Desktop/Endoderm_TC/Cells_EBV-transformed_lymphocytes_Analysis.v6p.normalized.expression 3.R")
Heart_Left_Ventricle_Analysis.v6p.normalized.expression <- read.delim("/Users/laurenblake/Desktop/Endoderm_TC/Heart_Left_Ventricle_Analysis.v6p.normalized.expression.R")
which(grepl("ENSG00000118194", Heart_Left_Ventricle_Analysis.v6p.normalized.expression$gene_id) == TRUE)
## [1] 1948
pull_heart <- Heart_Left_Ventricle_Analysis.v6p.normalized.expression[1948,]
Pancreas_Analysis.v6p.normalized.expression <- read.delim("/Users/laurenblake/Desktop/Endoderm_TC/Pancreas_Analysis.v6p.normalized.expression.R")
Liver_Analysis.v6p.normalized.expression <- read.delim("/Users/laurenblake/Desktop/Endoderm_TC/Liver_Analysis.v6p.normalized.expression.R")
pull_liver <- Liver_Analysis.v6p.normalized.expression[1948,]
mean(as.numeric(pull_heart))
## [1] 2075763
mean(as.numeric(pull_liver))
## [1] 4022948
Lung_Analysis.v6p.normalized.expression <- read.delim("/Users/laurenblake/Desktop/Endoderm_TC/Lung_Analysis.v6p.normalized.expression.R")
intersect(intersect(intersect(intersect(colnames(Cells_EBV.transformed_lymphocytes_Analysis.v6p.normalized.expression.3), colnames(Heart_Left_Ventricle_Analysis.v6p.normalized.expression)), colnames(Pancreas_Analysis.v6p.normalized.expression)), colnames(Liver_Analysis.v6p.normalized.expression)), colnames(Lung_Analysis.v6p.normalized.expression))
## [1] "X.chr" "start" "end" "gene_id" "GTEX.WFON"
## [6] "GTEX.X3Y1" "GTEX.Y5V5" "GTEX.YEC4" "GTEX.ZPU1" "GTEX.ZTPG"
# Select 5 desired tissues
indiv_col <- c("X.chr", "start", "end", "gene_id", "GTEX.WFON", "GTEX.X3Y1", "GTEX.Y5V5", "GTEX.YEC4", "GTEX.ZPU1", "GTEX.ZTPG")
LCL <- Cells_EBV.transformed_lymphocytes_Analysis.v6p.normalized.expression.3[,indiv_col]
dim(LCL)
## [1] 21931 10
heart <- Heart_Left_Ventricle_Analysis.v6p.normalized.expression[,indiv_col]
dim(heart)
## [1] 22461 10
liver <- Liver_Analysis.v6p.normalized.expression[,indiv_col]
dim(liver)
## [1] 21848 10
lung <- Lung_Analysis.v6p.normalized.expression[,indiv_col]
dim(lung)
## [1] 27843 10
pancreas <- Pancreas_Analysis.v6p.normalized.expression[,indiv_col]
dim(pancreas)
## [1] 23175 10
# Find the common genes between these 5 tissues
common_geneid <- intersect(intersect(intersect(intersect(LCL$gene_id, heart$gene_id), liver$gene_id), lung$gene_id), pancreas$gene_id)
length(common_geneid)
## [1] 17542
# Get the row names for the common genes
LCL_17542 <- which(LCL$gene_id %in% common_geneid)
LCL_17542_df <- LCL[LCL_17542,]
dim(LCL_17542_df)
## [1] 17542 10
heart_17542 <- which(heart$gene_id %in% common_geneid)
heart_17542_df <- heart[heart_17542,]
dim(heart_17542_df)
## [1] 17542 10
liver_17542 <- which(liver$gene_id %in% common_geneid)
liver_17542_df <- liver[liver_17542,]
dim(liver_17542_df)
## [1] 17542 10
lung_17542 <- which(lung$gene_id %in% common_geneid)
lung_17542_df <- lung[lung_17542,]
dim(lung_17542_df)
## [1] 17542 10
pancreas_17542 <- which(pancreas$gene_id %in% common_geneid)
pancreas_17542_df <- pancreas[pancreas_17542,]
dim(pancreas_17542_df)
## [1] 17542 10
# Take the variances of each gene in each data frame
LCL_var <- as.data.frame(apply(as.data.frame(LCL_17542_df[,5:10]),1, var) )
colnames(LCL_var) <- c("Variance")
liver_var <- as.data.frame(apply(as.data.frame(liver_17542_df[,5:10]),1, var) )
colnames(liver_var) <- c("Variance")
lung_var <- as.data.frame(apply(as.data.frame(lung_17542_df[,5:10]),1, var) )
colnames(lung_var) <- c("Variance")
pancreas_var <- as.data.frame(apply(as.data.frame(pancreas_17542_df[,5:10]),1, var) )
colnames(pancreas_var) <- c("Variance")
heart_var <- as.data.frame(apply(as.data.frame(heart_17542_df[,5:10]),1, var) )
colnames(heart_var) <- c("Variance")
# Take log2 variances
LCL_log_var <- log2(LCL_var)
liver_log_var <- log2(liver_var)
lung_log_var <- log2(lung_var)
pancreas_log_var <- log2(pancreas_var)
heart_log_var <- log2(heart_var)
median(LCL_log_var[,1])
## [1] -0.6461825
median(liver_log_var[,1])
## [1] -0.3242026
median(lung_log_var[,1])
## [1] -0.4117142
median(pancreas_log_var[,1])
## [1] -0.07430495
median(heart_log_var[,1])
## [1] -0.3732486
t.test(LCL_log_var, liver_log_var, alternative = c("greater"))
##
## Welch Two Sample t-test
##
## data: LCL_log_var and liver_log_var
## t = -32.526, df = 34803, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.3473949 Inf
## sample estimates:
## mean of x mean of y
## -0.7582014 -0.4275290
t.test(LCL_log_var, heart_log_var, alternative = c("greater"))
##
## Welch Two Sample t-test
##
## data: LCL_log_var and heart_log_var
## t = -28.238, df = 34643, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.3006455 Inf
## sample estimates:
## mean of x mean of y
## -0.7582014 -0.4741048
# Make labels
labels1 <- array("LCL", dim = c(17542, 1))
labels2 <- array("Liver", dim = c(17542, 1))
labels3 <- array("Lung", dim = c(17542, 1))
labels4 <- array("Pancreas", dim = c(17542, 1))
labels5 <- array("Heart", dim = c(17542, 1))
# Combine labels
labels9 <- rbind(labels1, labels2, labels3, labels4, labels5)
labels <- as.numeric(as.factor(labels9))
# Combine variances
gtex_log_var <- rbind(as.data.frame(LCL_log_var), as.data.frame(liver_log_var), as.data.frame(lung_log_var), as.data.frame(pancreas_log_var), as.data.frame(heart_log_var))
# Make df for boxplot
gtex_var_labels <- cbind(gtex_log_var, labels)
p <- ggplot(gtex_var_labels, aes(x = factor(labels), y = gtex_log_var))
p <- p + geom_violin(aes(fill = factor(labels)), show.legend = FALSE) + geom_boxplot(aes(fill = factor(labels)), show.legend = FALSE, outlier.shape = NA,width=0.2) + theme_bw() + xlab("GTEx Tissue") + ylab(expression(bold('log'[2]*' variance of normalized gene expression (for each gene)'))) + ggtitle(expression(bold('log'[2]*' variance of normalized gene expression (n = 17,542 genes) for 5 GTEx tissues')))
p <- p + scale_x_discrete(labels=c("1" = "LCL", "2" = "Liver", "3" = "Lung", "4" = "Pancreas", "5" = "Heart")) + bjp
#p
# Get the row names for the common genes
LCL_17542 <- which(LCL$gene_id %in% common_geneid)
LCL_17542_df <- LCL[LCL_17542,]
dim(LCL_17542_df)
## [1] 17542 10
hist(LCL_17542_df[,5])
hist(LCL_17542_df[,6])