We want to evaluate whether or not the recorded technical factors all similar across biological conditions.
Therefore, for each technical factor, we will test the variance explained between a model with day and species as fixed effects versus a model with day, species, and day-by-species interactions.
# Load library
library("nnet")
## Warning: package 'nnet' was built under R version 3.2.3
# Load cpm data
cpm_in_cutoff <- read.delim("../data/cpm_cyclicloess.txt")
# Load sample information
After_removal_sample_info <- read.csv("../data/After_removal_sample_info.csv")
species <- After_removal_sample_info$Species
day <- as.factor(After_removal_sample_info$Day)
# Test that both day and species are factors
is.factor(species)
## [1] TRUE
is.factor(day)
## [1] TRUE
# Load technical factor information
RNA_seq_info_all <- read.csv("../data/Endo_TC_Data_Share_Sorting.csv", header = T)
dim(RNA_seq_info_all)
## [1] 130 43
RNA_seq_info <- as.data.frame(cbind(RNA_seq_info_all[1:63, 4], RNA_seq_info_all[1:63, 3], RNA_seq_info_all[1:63, 5:27], RNA_seq_info_all[1:63, 30:35], RNA_seq_info_all[1:63, 37:43]))
# Remove library well (only 1/well)
RNA_seq_info <- RNA_seq_info[,-20]
# Full data set
dim(RNA_seq_info)
## [1] 63 37
# Take out day and species (factors of interest)
RNA_seq_info_35 <- RNA_seq_info[,-(1:2)]
# Make the linear models (the null does not contain an interaction and the full model does)
# Define y
y <- RNA_seq_info_35[1:63,4]
# Define the null model
fit0 <- lm(y ~ species + day)
fit0
##
## Call:
## lm(formula = y ~ species + day)
##
## Coefficients:
## (Intercept) specieshuman day1 day2 day3
## 24.63771 -1.36653 -0.01695 -0.01695 -0.01695
# Define the alternative model
fit1 <- lm(y ~ species + day + species*day)
fit1
##
## Call:
## lm(formula = y ~ species + day + species * day)
##
## Coefficients:
## (Intercept) specieshuman day1
## 2.462e+01 -1.339e+00 -1.373e-15
## day2 day3 specieshuman:day1
## -1.454e-15 -6.558e-16 -3.571e-02
## specieshuman:day2 specieshuman:day3
## -3.571e-02 -3.571e-02
# Compare the models using an ANOVA
anova(fit0, fit1)
## Analysis of Variance Table
##
## Model 1: y ~ species + day
## Model 2: y ~ species + day + species * day
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 58 269.43
## 2 55 269.43 3 0.003632 2e-04 1
# Get the p-value associated with the comparison
anova(fit0, fit1)$"Pr(>F)"[2]
## [1] 0.9999946
## Subset to only days 0-1
day <- as.factor(day[1:31])
species <- as.factor(species[1:31])
numerical_tech_factors <- c(4, 6,8:9, 12:16,18:20,24:28,30:35)
# Note: there are not enough samples to make Max. purity estimates.
# Make a matrix
ANOVA_pvalues_01 = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("pval_from_ANOVA"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads")))
# Run for the numerical technical factors
j=1
for (i in numerical_tech_factors){
# Define y (the technical factor)
y <- RNA_seq_info_35[1:31,i]
# Define the null model
fit0 <- lm(y ~ species + day)
# Define the alternative model
fit1 <- lm(y ~ species + day + species*day)
# Compare the models using an ANOVA
anova(fit0, fit1)
# Get the p-value associated with the comparison
ANOVA_pvalues_01[1,i] <- anova(fit0, fit1)$"Pr(>F)"[2]
}
# Note: there are not enough samples to make Max. purity estimates.
ANOVA_pvalues_01[1,10] <- NA
###### FOR DIFFERENTIATION BATCH/START DATE #######
# Define y (the technical factor)
y <- as.numeric(RNA_seq_info_35[1:31,2])
# Relevel so 0 and 1
y[y == 2] <- 0
y[y == 1] <- 1
# Define the null model
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_01[1,2] <- 1-pchisq(deviance, deviance.df)
ANOVA_pvalues_01[1,5] <- 1-pchisq(deviance, deviance.df)
###### FOR SEX #######
# Define y (the technical factor)
y <- as.character(RNA_seq_info_35[1:31,3])
# Relevel so 0 and 1
y[y == "F"] <- 0
y[y == "M"] <- 1
# Define the null model
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_01[1,3] <- 1-pchisq(deviance, deviance.df)
###### FOR LIBRARY PREP BATCH #######
# Define y (the technical factor)
y <- as.numeric(RNA_seq_info_35[1:31,17])
# Relevel so 0 and 1
y[y == 1] <- 0
y[y == 2] <- 1
# Combine the variables into 1 data frame
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
# Define the null model
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_01[1,17] <- 1-pchisq(deviance, deviance.df)
categorical_tech_factors <- c(1, 7, 11, 21:23, 29 )
# Run for the numerical technical factors
j=1
for (i in categorical_tech_factors){
# Define y (the technical factor)
y <- RNA_seq_info_35[1:31,i]
# Get data into a dataframe
mydata <- data.frame(cbind(species, day, y))
mydata$y <- as.factor(mydata$y)
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- multinom(y ~ day + species, data = mydata)
# Define the alternative model
fit1 <- multinom(y ~ day + species + day*species, data = mydata)
# Compare the models using an ANOVA
# Get the p-value associated with the comparison
ANOVA_pvalues_01[1,i] <- anova(fit0, fit1)$"Pr(Chi)"[2]
}
## # weights: 64 (45 variable)
## initial value 85.950250
## iter 10 value 63.939967
## iter 20 value 63.532340
## iter 30 value 63.527997
## final value 63.527970
## converged
## # weights: 80 (60 variable)
## initial value 85.950250
## iter 10 value 63.787448
## iter 20 value 63.529905
## iter 30 value 63.527980
## final value 63.527969
## converged
## # weights: 24 (15 variable)
## initial value 55.544544
## iter 10 value 32.404269
## iter 20 value 31.985102
## iter 30 value 31.982999
## final value 31.982997
## converged
## # weights: 30 (20 variable)
## initial value 55.544544
## iter 10 value 32.426183
## iter 20 value 31.983545
## iter 30 value 31.982997
## final value 31.982996
## converged
## # weights: 24 (15 variable)
## initial value 55.544544
## iter 10 value 52.931649
## final value 52.931568
## converged
## # weights: 30 (20 variable)
## initial value 55.544544
## iter 10 value 52.500355
## iter 20 value 52.439498
## iter 30 value 52.437758
## iter 40 value 52.437614
## final value 52.437614
## converged
## # weights: 32 (21 variable)
## initial value 64.462688
## iter 10 value 41.629660
## iter 20 value 40.644671
## iter 30 value 40.640587
## final value 40.640560
## converged
## # weights: 40 (28 variable)
## initial value 64.462688
## iter 10 value 42.358140
## iter 20 value 40.672671
## iter 30 value 40.640632
## final value 40.640560
## converged
## # weights: 16 (9 variable)
## initial value 42.975125
## iter 10 value 42.810508
## iter 10 value 42.810508
## iter 10 value 42.810508
## final value 42.810508
## converged
## # weights: 20 (12 variable)
## initial value 42.975125
## iter 10 value 42.735138
## final value 42.733553
## converged
## # weights: 16 (9 variable)
## initial value 42.975125
## iter 10 value 42.810508
## iter 10 value 42.810508
## iter 10 value 42.810508
## final value 42.810508
## converged
## # weights: 20 (12 variable)
## initial value 42.975125
## iter 10 value 42.735138
## final value 42.733553
## converged
## # weights: 16 (9 variable)
## initial value 42.975125
## iter 10 value 42.810508
## iter 10 value 42.810508
## iter 10 value 42.810508
## final value 42.810508
## converged
## # weights: 20 (12 variable)
## initial value 42.975125
## iter 10 value 42.735138
## final value 42.733553
## converged
ANOVA_pvalues_01
## Cell line Batch Sex Passage at seed Start date
## pval_from_ANOVA 1 0.8419522 0.8419522 0.9824348 0.8419522
## Density at seed Harvest time Harvest density
## pval_from_ANOVA 0.9692987 1 0.6186798
## High conf purity Max purity RNA Extraction Date RNA conc
## pval_from_ANOVA 0.1362526 NA 0.963535 0.4905153
## RIN 260 280 RNA 260 230 RNA DNA concentration
## pval_from_ANOVA 0.6433036 0.1160222 0.1592037 0.5053192
## Library prep batch Library concentration uL sample
## pval_from_ANOVA 0.8419522 0.5415916 0.4832505
## uL EB Index sequence Seq pool Lane r1 Mseqs R1
## pval_from_ANOVA 0.4832505 1 0.9846624 0.9846624 0.3699091
## Total lane reads 1 Lane prop r1 Dup r1 GC r1
## pval_from_ANOVA 0.8835576 0.3668641 0.3036041 0.6749494
## Lane r2 Mseqs r2 Total lane reads r2 Lane prop r2
## pval_from_ANOVA 0.9846624 0.6291315 0.8494038 0.6078586
## Dups r2 GC r2 Total reads
## pval_from_ANOVA 0.5448301 0.6666955 0.3101537
#write.table(ANOVA_pvalues_01, "/Users/laurenblake/Desktop/pc_pvalues.txt")
ANOVA_pvalues_01[,ANOVA_pvalues_01 < 0.05]
## [1] NA
# Redefine day and species (includes all 63 samples)
species <- After_removal_sample_info$Species
day <- as.factor(After_removal_sample_info$Day)
# Make a matrix
numerical_tech_factors <- c(4, 6,8:10, 12:16,18:20,24:28,30:35)
# Note: there are enough samples to make Max. purity estimates.
# Make a matrix
ANOVA_pvalues_03 = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("pval_from_ANOVA"), c("Cell line", "Batch", "Sex", "Passage at seed", "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads")))
# Run for the numerical technical factors
j=1
for (i in numerical_tech_factors){
# Define y (the technical factor)
y <- RNA_seq_info_35[1:63,i]
# Define the null model
fit0 <- lm(y ~ species + day)
# Define the alternative model
fit1 <- lm(y ~ species + day + species*day)
# Compare the models using an ANOVA
anova(fit0, fit1)
# Get the p-value associated with the comparison
ANOVA_pvalues_03[1,i] <- anova(fit0, fit1)$"Pr(>F)"[2]
}
###### FOR DIFFERENTIATION BATCH/START DATE #######
# Define y (the technical factor)
y <- as.numeric(RNA_seq_info_35[1:63,2])
# Relevel so 0 and 1
y[y == 2] <- 0
y[y == 1] <- 1
# Define the null model
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_03[1,2] <- 1-pchisq(deviance, deviance.df)
ANOVA_pvalues_03[1,5] <- 1-pchisq(deviance, deviance.df)
###### FOR SEX #######
# Define y (the technical factor)
y <- as.character(RNA_seq_info_35[1:63,3])
# Relevel so 0 and 1
y[y == "F"] <- 0
y[y == "M"] <- 1
# Define the null model
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_03[1,3] <- 1-pchisq(deviance, deviance.df)
###### FOR LIBRARY PREP BATCH #######
# Define y (the technical factor)
y <- as.numeric(RNA_seq_info_35[1:63,17])
# Relevel so 0 and 1
y[y == 1] <- 0
y[y == 2] <- 1
# Combine the variables into 1 data frame
mydata <- data.frame(cbind(species, day, y))
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
# Define the null model
fit0 <- glm(y ~ day + species, data = mydata, family = "binomial")
# Define the alternative model
fit1 <- glm(y ~ day + species + day*species, data = mydata, family = "binomial")
# Compare the models using an ANOVA
anova.fit <- anova(fit0, fit1)
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
deviance <- anova.fit$Deviance[2]
deviance.df <- anova.fit$Df[2]
# Compute significance value of the deviance
ANOVA_pvalues_03[1,17] <- 1-pchisq(deviance, deviance.df)
categorical_tech_factors <- c(1, 7, 11, 21:23, 29 )
# Run for the numerical technical factors
j=1
for (i in categorical_tech_factors){
# Define y (the technical factor)
y <- RNA_seq_info[1:63,i]
# Get data into a dataframe
mydata <- data.frame(cbind(species, day, y))
mydata$y <- as.factor(mydata$y)
mydata$species <- as.factor(mydata$species)
mydata$day <- as.factor(mydata$day)
### GLM
fit0 <- multinom(y ~ day + species, data = mydata)
# Define the alternative model
fit1 <- multinom(y ~ day + species + day*species, data = mydata)
# Compare the models using an ANOVA
# Get the p-value associated with the comparison
ANOVA_pvalues_03[1,i] <- anova(fit0, fit1)$"Pr(Chi)"[2]
}
## # weights: 6 (5 variable)
## initial value 43.668272
## final value 0.000069
## converged
## # weights: 9 (8 variable)
## initial value 43.668272
## final value 0.000056
## converged
## # weights: 6 (5 variable)
## initial value 43.668272
## final value 43.625865
## converged
## # weights: 9 (8 variable)
## initial value 43.668272
## iter 10 value 43.596599
## iter 10 value 43.596599
## iter 10 value 43.596599
## final value 43.596599
## converged
## # weights: 180 (145 variable)
## initial value 102.035921
## iter 10 value 40.476261
## iter 20 value 39.863843
## iter 30 value 39.862740
## final value 39.862739
## converged
## # weights: 270 (232 variable)
## initial value 102.035921
## iter 10 value 40.070381
## iter 20 value 39.863132
## iter 30 value 39.862739
## final value 39.862739
## converged
## # weights: 378 (310 variable)
## initial value 261.017488
## iter 10 value 134.960765
## iter 20 value 130.082952
## iter 30 value 130.070114
## final value 130.070098
## converged
## # weights: 567 (496 variable)
## initial value 261.017488
## iter 10 value 132.711746
## iter 20 value 130.075479
## iter 30 value 130.070108
## final value 130.070099
## converged
## # weights: 378 (310 variable)
## initial value 261.017488
## iter 10 value 135.313984
## iter 20 value 130.191868
## iter 30 value 130.070248
## final value 130.070098
## converged
## # weights: 567 (496 variable)
## initial value 261.017488
## iter 10 value 134.062671
## iter 20 value 130.098029
## iter 30 value 130.070200
## final value 130.070099
## converged
## # weights: 96 (75 variable)
## initial value 174.673090
## iter 10 value 86.778036
## iter 20 value 82.988239
## iter 30 value 82.909787
## iter 40 value 82.908989
## iter 40 value 82.908988
## iter 40 value 82.908988
## final value 82.908988
## converged
## # weights: 144 (120 variable)
## initial value 174.673090
## iter 10 value 88.264763
## iter 20 value 82.972487
## iter 30 value 82.909766
## iter 40 value 82.908988
## iter 40 value 82.908988
## iter 40 value 82.908988
## final value 82.908988
## converged
## # weights: 348 (285 variable)
## initial value 255.807910
## iter 10 value 139.896630
## iter 20 value 131.294265
## iter 30 value 130.079642
## iter 40 value 130.070127
## final value 130.070099
## converged
## # weights: 522 (456 variable)
## initial value 255.807910
## iter 10 value 138.613203
## iter 20 value 130.144462
## iter 30 value 130.070365
## final value 130.070099
## converged
ANOVA_pvalues_03
## Cell line Batch Sex Passage at seed Start date
## pval_from_ANOVA 1 0.9962993 0.9962993 0.9999946 0.9962993
## Density at seed Harvest time Harvest density
## pval_from_ANOVA 0.999971 0.9962993 0.005792107
## High conf purity Max purity RNA Extraction Date
## pval_from_ANOVA 6.415024e-05 0.0008066171 1
## RNA conc RIN 260 280 RNA 260 230 RNA
## pval_from_ANOVA 0.1585667 0.5864589 0.2301219 0.132
## DNA concentration Library prep batch Library concentration
## pval_from_ANOVA 0.8565571 0.9962993 0.8715344
## uL sample uL EB Index sequence Seq pool Lane r1
## pval_from_ANOVA 0.6689692 0.6689692 1 1 1
## Mseqs R1 Total lane reads 1 Lane prop r1 Dup r1
## pval_from_ANOVA 0.2039331 0.9984402 0.2027423 0.2590957
## GC r1 Lane r2 Mseqs r2 Total lane reads r2
## pval_from_ANOVA 0.8496839 1 0.6085574 0.9966626
## Lane prop r2 Dups r2 GC r2 Total reads
## pval_from_ANOVA 0.6080504 0.3468663 0.7606478 0.3474322
ANOVA_pvalues_03[,ANOVA_pvalues_03 < 0.05]
## Harvest density High conf purity Max purity
## 5.792107e-03 6.415024e-05 8.066171e-04
Conclusions: Interactions are important for describing the relationship between our biological variables of interest (day and species) and Harvest density, as well as our purity estimates for days 0 to 3 but not day 0 to 1.