We want to evaluate whether or not the recorded technical factors all similar across biological conditions.
Therefore, for each technical factor, we will test the variance explained between a model with day and species as fixed effects versus a model with day, species, and day-by-species interactions.
# Load library
library("nnet")## Warning: package 'nnet' was built under R version 3.2.3# Load cpm data 
cpm_in_cutoff <- read.delim("../data/cpm_cyclicloess.txt")
# Load sample information
After_removal_sample_info <- read.csv("../data/After_removal_sample_info.csv")
species <- After_removal_sample_info$Species
day <- as.factor(After_removal_sample_info$Day)
# Test that both day and species are factors
is.factor(species)## [1] TRUEis.factor(day)## [1] TRUE# Load technical factor information
RNA_seq_info_all <- read.csv("../data/Endo_TC_Data_Share_Sorting.csv",  header = T)
dim(RNA_seq_info_all)## [1] 130  43RNA_seq_info <- as.data.frame(cbind(RNA_seq_info_all[1:63, 4], RNA_seq_info_all[1:63, 3], RNA_seq_info_all[1:63, 5:27], RNA_seq_info_all[1:63, 30:35], RNA_seq_info_all[1:63, 37:43]))
# Remove library well (only 1/well)
RNA_seq_info <- RNA_seq_info[,-20]
# Full data set
dim(RNA_seq_info)## [1] 63 37# Take out day and species (factors of interest)
RNA_seq_info_35 <- RNA_seq_info[,-(1:2)]# Make the linear models (the null does not contain an interaction and the full model does)
# Define y
y <- RNA_seq_info_35[1:63,4] 
# Define the null model
fit0 <- lm(y ~ species + day)
fit0## 
## Call:
## lm(formula = y ~ species + day)
## 
## Coefficients:
##  (Intercept)  specieshuman          day1          day2          day3  
##     24.63771      -1.36653      -0.01695      -0.01695      -0.01695# Define the alternative model
fit1 <- lm(y ~ species + day + species*day)
fit1## 
## Call:
## lm(formula = y ~ species + day + species * day)
## 
## Coefficients:
##       (Intercept)       specieshuman               day1  
##         2.462e+01         -1.339e+00         -1.373e-15  
##              day2               day3  specieshuman:day1  
##        -1.454e-15         -6.558e-16         -3.571e-02  
## specieshuman:day2  specieshuman:day3  
##        -3.571e-02         -3.571e-02# Compare the models using an ANOVA
anova(fit0, fit1)## Analysis of Variance Table
## 
## Model 1: y ~ species + day
## Model 2: y ~ species + day + species * day
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     58 269.43                          
## 2     55 269.43  3  0.003632 2e-04      1# Get the p-value associated with the comparison
anova(fit0, fit1)$"Pr(>F)"[2]## [1] 0.9999946## Subset to only days 0-1
day <- as.factor(day[1:31])
species <- as.factor(species[1:31])
numerical_tech_factors <- c(4, 6,8:9, 12:16,18:20,24:28,30:35)
# Note: there are not enough samples to make Max. purity estimates.
# Make a matrix
ANOVA_pvalues_01 = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("pval_from_ANOVA"), c("Cell line", "Batch", "Sex", "Passage at seed",  "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads")))
# Run for the numerical technical factors
j=1
for (i in numerical_tech_factors){
    # Define y (the technical factor)
  
    y <- RNA_seq_info_35[1:31,i]
  
    # Define the null model
  fit0 <- lm(y ~ species + day)
  # Define the alternative model
  fit1 <- lm(y ~ species + day + species*day)
  # Compare the models using an ANOVA
  anova(fit0, fit1)
  # Get the p-value associated with the comparison
  ANOVA_pvalues_01[1,i] <- anova(fit0, fit1)$"Pr(>F)"[2]
}  
# Note: there are not enough samples to make Max. purity estimates. 
ANOVA_pvalues_01[1,10] <- NA###### FOR DIFFERENTIATION BATCH/START DATE #######
    # Define y (the technical factor)
  
    y <- as.numeric(RNA_seq_info_35[1:31,2])
    
    # Relevel so 0 and 1
    
    y[y == 2] <- 0
    y[y == 1] <- 1
    
    # Define the null model
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_01[1,2] <- 1-pchisq(deviance, deviance.df) 
  ANOVA_pvalues_01[1,5] <- 1-pchisq(deviance, deviance.df) 
  ###### FOR SEX #######
    # Define y (the technical factor)
  
    y <- as.character(RNA_seq_info_35[1:31,3])
    
    # Relevel so 0 and 1
    
    y[y == "F"] <- 0
    y[y == "M"] <- 1
    
    # Define the null model
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_01[1,3] <- 1-pchisq(deviance, deviance.df) 
  
  
###### FOR LIBRARY PREP BATCH #######
  
  # Define y (the technical factor)
  
    y <- as.numeric(RNA_seq_info_35[1:31,17])
    
    # Relevel so 0 and 1
    
    y[y == 1] <- 0
    y[y == 2] <- 1
    
   # Combine the variables into 1 data frame
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  # Define the null model
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_01[1,17] <- 1-pchisq(deviance, deviance.df) categorical_tech_factors <- c(1, 7, 11, 21:23, 29 )
# Run for the numerical technical factors
j=1
for (i in categorical_tech_factors){
    # Define y (the technical factor)
  
    y <- RNA_seq_info_35[1:31,i]
    # Get data into a dataframe
    mydata <- data.frame(cbind(species, day, y))
    mydata$y <- as.factor(mydata$y)
    mydata$species <- as.factor(mydata$species)
    mydata$day <- as.factor(mydata$day)
      
### GLM
  
  fit0 <- multinom(y ~ day  + species, data = mydata)
  # Define the alternative model
  fit1 <- multinom(y ~ day  + species + day*species, data = mydata)
  # Compare the models using an ANOVA
  # Get the p-value associated with the comparison
  ANOVA_pvalues_01[1,i] <- anova(fit0, fit1)$"Pr(Chi)"[2]
}  ## # weights:  64 (45 variable)
## initial  value 85.950250 
## iter  10 value 63.939967
## iter  20 value 63.532340
## iter  30 value 63.527997
## final  value 63.527970 
## converged
## # weights:  80 (60 variable)
## initial  value 85.950250 
## iter  10 value 63.787448
## iter  20 value 63.529905
## iter  30 value 63.527980
## final  value 63.527969 
## converged
## # weights:  24 (15 variable)
## initial  value 55.544544 
## iter  10 value 32.404269
## iter  20 value 31.985102
## iter  30 value 31.982999
## final  value 31.982997 
## converged
## # weights:  30 (20 variable)
## initial  value 55.544544 
## iter  10 value 32.426183
## iter  20 value 31.983545
## iter  30 value 31.982997
## final  value 31.982996 
## converged
## # weights:  24 (15 variable)
## initial  value 55.544544 
## iter  10 value 52.931649
## final  value 52.931568 
## converged
## # weights:  30 (20 variable)
## initial  value 55.544544 
## iter  10 value 52.500355
## iter  20 value 52.439498
## iter  30 value 52.437758
## iter  40 value 52.437614
## final  value 52.437614 
## converged
## # weights:  32 (21 variable)
## initial  value 64.462688 
## iter  10 value 41.629660
## iter  20 value 40.644671
## iter  30 value 40.640587
## final  value 40.640560 
## converged
## # weights:  40 (28 variable)
## initial  value 64.462688 
## iter  10 value 42.358140
## iter  20 value 40.672671
## iter  30 value 40.640632
## final  value 40.640560 
## converged
## # weights:  16 (9 variable)
## initial  value 42.975125 
## iter  10 value 42.810508
## iter  10 value 42.810508
## iter  10 value 42.810508
## final  value 42.810508 
## converged
## # weights:  20 (12 variable)
## initial  value 42.975125 
## iter  10 value 42.735138
## final  value 42.733553 
## converged
## # weights:  16 (9 variable)
## initial  value 42.975125 
## iter  10 value 42.810508
## iter  10 value 42.810508
## iter  10 value 42.810508
## final  value 42.810508 
## converged
## # weights:  20 (12 variable)
## initial  value 42.975125 
## iter  10 value 42.735138
## final  value 42.733553 
## converged
## # weights:  16 (9 variable)
## initial  value 42.975125 
## iter  10 value 42.810508
## iter  10 value 42.810508
## iter  10 value 42.810508
## final  value 42.810508 
## converged
## # weights:  20 (12 variable)
## initial  value 42.975125 
## iter  10 value 42.735138
## final  value 42.733553 
## convergedANOVA_pvalues_01##                 Cell line     Batch       Sex Passage at seed Start date
## pval_from_ANOVA         1 0.8419522 0.8419522       0.9824348  0.8419522
##                 Density at seed Harvest time Harvest density
## pval_from_ANOVA       0.9692987            1       0.6186798
##                 High conf purity Max purity RNA Extraction Date  RNA conc
## pval_from_ANOVA        0.1362526         NA            0.963535 0.4905153
##                       RIN 260 280 RNA 260 230 RNA DNA concentration
## pval_from_ANOVA 0.6433036   0.1160222   0.1592037         0.5053192
##                 Library prep batch Library concentration uL sample
## pval_from_ANOVA          0.8419522             0.5415916 0.4832505
##                     uL EB Index sequence  Seq pool   Lane r1  Mseqs R1
## pval_from_ANOVA 0.4832505              1 0.9846624 0.9846624 0.3699091
##                 Total lane reads 1 Lane prop r1    Dup r1     GC r1
## pval_from_ANOVA          0.8835576    0.3668641 0.3036041 0.6749494
##                   Lane r2  Mseqs r2 Total lane reads r2 Lane prop r2
## pval_from_ANOVA 0.9846624 0.6291315           0.8494038    0.6078586
##                   Dups r2     GC r2 Total reads
## pval_from_ANOVA 0.5448301 0.6666955   0.3101537#write.table(ANOVA_pvalues_01, "/Users/laurenblake/Desktop/pc_pvalues.txt")ANOVA_pvalues_01[,ANOVA_pvalues_01 < 0.05]## [1] NA# Redefine day and species (includes all 63 samples)
species <- After_removal_sample_info$Species
day <- as.factor(After_removal_sample_info$Day)
# Make a matrix
numerical_tech_factors <- c(4, 6,8:10, 12:16,18:20,24:28,30:35)
# Note: there are enough samples to make Max. purity estimates.
# Make a matrix
ANOVA_pvalues_03 = matrix(data = NA, nrow = 1, ncol = 35, dimnames = list(c("pval_from_ANOVA"), c("Cell line", "Batch", "Sex", "Passage at seed",  "Start date", "Density at seed", "Harvest time", "Harvest density", "High conf purity", "Max purity", "RNA Extraction Date", "RNA conc", "RIN", "260 280 RNA", "260 230 RNA", "DNA concentration", "Library prep batch", "Library concentration", "uL sample", "uL EB", "Index sequence", "Seq pool", "Lane r1", "Mseqs R1", "Total lane reads 1", "Lane prop r1", "Dup r1", "GC r1", "Lane r2", "Mseqs r2", "Total lane reads r2", "Lane prop r2", "Dups r2", "GC r2", "Total reads")))
# Run for the numerical technical factors
j=1
for (i in numerical_tech_factors){
    # Define y (the technical factor)
  
    y <- RNA_seq_info_35[1:63,i]
  
    # Define the null model
  fit0 <- lm(y ~ species + day)
  # Define the alternative model
  fit1 <- lm(y ~ species + day + species*day)
  # Compare the models using an ANOVA
  anova(fit0, fit1)
  # Get the p-value associated with the comparison
  ANOVA_pvalues_03[1,i] <- anova(fit0, fit1)$"Pr(>F)"[2]
}  ###### FOR DIFFERENTIATION BATCH/START DATE #######
    # Define y (the technical factor)
  
    y <- as.numeric(RNA_seq_info_35[1:63,2])
    
    # Relevel so 0 and 1
    
    y[y == 2] <- 0
    y[y == 1] <- 1
    
    # Define the null model
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_03[1,2] <- 1-pchisq(deviance, deviance.df) 
  ANOVA_pvalues_03[1,5] <- 1-pchisq(deviance, deviance.df) 
  ###### FOR SEX #######
    # Define y (the technical factor)
  
    y <- as.character(RNA_seq_info_35[1:63,3])
    
    # Relevel so 0 and 1
    
    y[y == "F"] <- 0
    y[y == "M"] <- 1
    
    # Define the null model
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_03[1,3] <- 1-pchisq(deviance, deviance.df) 
  
  
###### FOR LIBRARY PREP BATCH #######
  
  # Define y (the technical factor)
  
    y <- as.numeric(RNA_seq_info_35[1:63,17])
    
    # Relevel so 0 and 1
    
    y[y == 1] <- 0
    y[y == 2] <- 1
    
   # Combine the variables into 1 data frame
  mydata <- data.frame(cbind(species, day, y))
  mydata$species <- as.factor(mydata$species)
  mydata$day <- as.factor(mydata$day)
  
### GLM
  
  # Define the null model
  
  fit0 <- glm(y ~ day  + species, data = mydata, family = "binomial")
  # Define the alternative model
  fit1 <- glm(y ~ day  + species + day*species, data = mydata, family = "binomial")
  
    # Compare the models using an ANOVA
  anova.fit <- anova(fit0, fit1)
  
# Extract deviance: a measure of goodness-of-fit of the model, equivalent to R-square in linear models
  deviance <- anova.fit$Deviance[2]
  deviance.df <- anova.fit$Df[2]
  # Compute significance value of the deviance
  ANOVA_pvalues_03[1,17] <- 1-pchisq(deviance, deviance.df) categorical_tech_factors <- c(1, 7, 11, 21:23, 29 )
# Run for the numerical technical factors
j=1
for (i in categorical_tech_factors){
    # Define y (the technical factor)
  
    y <- RNA_seq_info[1:63,i]
    # Get data into a dataframe
    mydata <- data.frame(cbind(species, day, y))
    mydata$y <- as.factor(mydata$y)
    mydata$species <- as.factor(mydata$species)
    mydata$day <- as.factor(mydata$day)
      
### GLM
  
  fit0 <- multinom(y ~ day  + species, data = mydata)
  # Define the alternative model
  fit1 <- multinom(y ~ day  + species + day*species, data = mydata)
  # Compare the models using an ANOVA
  # Get the p-value associated with the comparison
  ANOVA_pvalues_03[1,i] <- anova(fit0, fit1)$"Pr(Chi)"[2]
}  ## # weights:  6 (5 variable)
## initial  value 43.668272 
## final  value 0.000069 
## converged
## # weights:  9 (8 variable)
## initial  value 43.668272 
## final  value 0.000056 
## converged
## # weights:  6 (5 variable)
## initial  value 43.668272 
## final  value 43.625865 
## converged
## # weights:  9 (8 variable)
## initial  value 43.668272 
## iter  10 value 43.596599
## iter  10 value 43.596599
## iter  10 value 43.596599
## final  value 43.596599 
## converged
## # weights:  180 (145 variable)
## initial  value 102.035921 
## iter  10 value 40.476261
## iter  20 value 39.863843
## iter  30 value 39.862740
## final  value 39.862739 
## converged
## # weights:  270 (232 variable)
## initial  value 102.035921 
## iter  10 value 40.070381
## iter  20 value 39.863132
## iter  30 value 39.862739
## final  value 39.862739 
## converged
## # weights:  378 (310 variable)
## initial  value 261.017488 
## iter  10 value 134.960765
## iter  20 value 130.082952
## iter  30 value 130.070114
## final  value 130.070098 
## converged
## # weights:  567 (496 variable)
## initial  value 261.017488 
## iter  10 value 132.711746
## iter  20 value 130.075479
## iter  30 value 130.070108
## final  value 130.070099 
## converged
## # weights:  378 (310 variable)
## initial  value 261.017488 
## iter  10 value 135.313984
## iter  20 value 130.191868
## iter  30 value 130.070248
## final  value 130.070098 
## converged
## # weights:  567 (496 variable)
## initial  value 261.017488 
## iter  10 value 134.062671
## iter  20 value 130.098029
## iter  30 value 130.070200
## final  value 130.070099 
## converged
## # weights:  96 (75 variable)
## initial  value 174.673090 
## iter  10 value 86.778036
## iter  20 value 82.988239
## iter  30 value 82.909787
## iter  40 value 82.908989
## iter  40 value 82.908988
## iter  40 value 82.908988
## final  value 82.908988 
## converged
## # weights:  144 (120 variable)
## initial  value 174.673090 
## iter  10 value 88.264763
## iter  20 value 82.972487
## iter  30 value 82.909766
## iter  40 value 82.908988
## iter  40 value 82.908988
## iter  40 value 82.908988
## final  value 82.908988 
## converged
## # weights:  348 (285 variable)
## initial  value 255.807910 
## iter  10 value 139.896630
## iter  20 value 131.294265
## iter  30 value 130.079642
## iter  40 value 130.070127
## final  value 130.070099 
## converged
## # weights:  522 (456 variable)
## initial  value 255.807910 
## iter  10 value 138.613203
## iter  20 value 130.144462
## iter  30 value 130.070365
## final  value 130.070099 
## convergedANOVA_pvalues_03##                 Cell line     Batch       Sex Passage at seed Start date
## pval_from_ANOVA         1 0.9962993 0.9962993       0.9999946  0.9962993
##                 Density at seed Harvest time Harvest density
## pval_from_ANOVA        0.999971    0.9962993     0.005792107
##                 High conf purity   Max purity RNA Extraction Date
## pval_from_ANOVA     6.415024e-05 0.0008066171                   1
##                  RNA conc       RIN 260 280 RNA 260 230 RNA
## pval_from_ANOVA 0.1585667 0.5864589   0.2301219       0.132
##                 DNA concentration Library prep batch Library concentration
## pval_from_ANOVA         0.8565571          0.9962993             0.8715344
##                 uL sample     uL EB Index sequence Seq pool Lane r1
## pval_from_ANOVA 0.6689692 0.6689692              1        1       1
##                  Mseqs R1 Total lane reads 1 Lane prop r1    Dup r1
## pval_from_ANOVA 0.2039331          0.9984402    0.2027423 0.2590957
##                     GC r1 Lane r2  Mseqs r2 Total lane reads r2
## pval_from_ANOVA 0.8496839       1 0.6085574           0.9966626
##                 Lane prop r2   Dups r2     GC r2 Total reads
## pval_from_ANOVA    0.6080504 0.3468663 0.7606478   0.3474322ANOVA_pvalues_03[,ANOVA_pvalues_03 < 0.05]##  Harvest density High conf purity       Max purity 
##     5.792107e-03     6.415024e-05     8.066171e-04Conclusions: Interactions are important for describing the relationship between our biological variables of interest (day and species) and Harvest density, as well as our purity estimates for days 0 to 3 but not day 0 to 1.