knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
set.seed(119)
#library(conflicted)
#pacman::p_depends(microbiomeMarker, local = TRUE)
#pacman::p_depends_reverse(microbiomeMarker, local = TRUE)
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
pacman::p_load(tidyverse, patchwork, ampvis2,
agricolae, labdsv, naniar, ape,
pairwiseAdonis, microbiome, seqRFLP, microbiomeMarker,
microbiomeMarker, reactable, downloadthis, captioner,
install = FALSE, update = FALSE)
options(scipen=999)
knitr::opts_current$get(c(
"cache",
"cache.path",
"cache.rebuild",
"dependson",
"autodep"
))
Click here for setup information.
Synopsis
This workflow contains ASV differential abundance analysis for the high temperature data sets. In order to run the workflow, you either need to first run the Data Preparation workflow and the Filtering workflow or or download the files linked below. See the Data Availability page for complete details.
Workflow Input
Files needed to run this workflow can be downloaded from figshare.
In this workflow, we use the labdsv
package (Roberts and Roberts 2016) to run Dufrene-Legendre Indicator Species Analysis and the microbiomeMarker package (Cao 2020) to run linear discriminant analysis (LDA) effect size (LEfSe) (Segata et al. 2011).
16s rRNA
Indicator Analysis
Indicator Analysis calculates the indicator value (fidelity and relative abundance) of species in clusters or types.
- Choose a data set(s) & set the p-value cutoff.
Code
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt",
"ssu18_ps_perfect", "ssu18_ps_pime")
indval_pval <- 0.05
- Format Data Frame
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(otu_table(tmp_get))
tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
tmp_row_names <- row.names(tmp_df)
tmp_row_names <- tmp_row_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
assign(tmp_name, tmp_df)
rm(list = ls(pattern = "tmp_"))
}
objects()
- Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
pval = tmp_pv, freq = tmp_fr)
tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]
tmp_tax_df <- data.frame(tax_table(get(i)))
tmp_tax_df$ASV_ID <- NULL
tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
class(tmp_sum_tax$group) <- "character"
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
tmp_sum_tax$ASV_ID <- as.character(tmp_sum_tax$ASV_ID)
tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
assign(tmp_res_name, tmp_sum_tax)
rm(list = ls(pattern = "tmp_"))
}
objects()
Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
tmp_get[,1] <- NULL
tmp_df <- as.data.frame(t(tmp_get))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
#tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_indval_summary")
- Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()
Indicator Summary
LEfSe Analysis
From (Segata et al. 2011).
Linear discriminant analysis (LDA) effect size (LEfSe) is a method to support high-dimensional class comparisons with a particular focus on microbial community analyses. LEfSe determines the features (organisms, clades, operational taxonomic units, genes, or functions) most likely to explain differences between classes by coupling standard tests for statistical significance with additional tests encoding biological consistency and effect relevance. Class comparison methods typically predict biomarkers consisting of features that violate a null hypothesis of no difference between classes. The effect size provides an estimation of the magnitude of the observed phenomenon due to each characterizing feature and it is thus a valuable tool for ranking the relevance of different biological aspects and for addressing further investigations and analyses.
First, a little data tidying and subsetting only ASV data from the taxonomy table.
Code
for (i in samp_ps) {
tmp_get <- get(i)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
tax_table(tmp_get) <- cbind(tax_table(tmp_get),
rownames(tax_table(tmp_get)))
colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order",
"Family", "Genus", "Species")
tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
assign(tmp_name, tmp_get)
tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
assign(tmp_name_asv, tmp_get)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")
- Choose a data set(s) & set the LEfSe cutoff values.
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", "ssu18_ps_perfect", "ssu18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05
Code
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
for (i in samp_ps) {
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2",
lda_cutoff = lefse_lda,
kw_cutoff = lefse_kw,
wilcoxon_cutoff = lefse_wc,
bootstrap_n = 30, bootstrap_fraction = 2/3,
sample_min = 10, multicls_strat = FALSE, curv = FALSE)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
assign(tmp_name, tmp_lefse)
rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")
Code
for (i in samp_ps) {
tmp_o_tax <- data.frame(tax_table(get(i)))
tmp_o_tax$ASV_ID <- NULL
tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
tmp_mt <- data.frame(marker_table(tmp_get))
tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
tibble::remove_rownames()
tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")
tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
assign(tmp_res_name, tmp_sum)
rm(list = ls(pattern = "tmp_"))
}
Now we can save a few files and display the data.
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_df <- data.frame(t(otu_table(tmp_get)))
#tmp_get[,1] <- NULL
#tmp_df <- as.data.frame(t(tmp_otu))
tmp_col_names <- colnames(tmp_df)
tmp_col_names <- tmp_col_names %>%
stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
stringr::str_replace("[A-Z]$", "")
colnames(tmp_df) <- tmp_col_names
tmp_df$freq <- apply(tmp_df > 0, 1, sum)
tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")
tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)
tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
assign(tmp_merge_name, tmp_merge_df)
rm(list = ls(pattern = "tmp_"))
}
Save a new phyloseq object
Code
for (i in samp_ps) {
tmp_get <- get(i)
tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
tmp_list <- tmp_tab[,1]
tmp_ps <- prune_taxa(tmp_list, tmp_get)
tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
assign(tmp_name, tmp_ps)
tmp_ps@phy_tree <- NULL
tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
tip.label = taxa_names(tmp_ps))
tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
assign(tmp_name, tmp_ps)
rm(list = ls(pattern = "tmp_"))
}
objects()