7. Differentially Abundant ASVs & Taxa

Reproducible workflow to assess differentially abundant ASVs and taxa between temperature treatments using Indicator Species Analysis and linear discriminant analysis (LDA) effect size (LEfSe).

Click here for setup information.
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
set.seed(119)
#library(conflicted)
#pacman::p_depends(microbiomeMarker, local = TRUE)  
#pacman::p_depends_reverse(microbiomeMarker, local = TRUE)  
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
pacman::p_load(tidyverse, patchwork, ampvis2, 
               agricolae, labdsv, naniar, ape,
               pairwiseAdonis, microbiome, seqRFLP, microbiomeMarker,
               microbiomeMarker, reactable, downloadthis, captioner,
               install = FALSE, update = FALSE)

options(scipen=999)
knitr::opts_current$get(c(
  "cache",
  "cache.path",
  "cache.rebuild",
  "dependson",
  "autodep"
))

Synopsis

This workflow contains ASV differential abundance analysis for the high temperature data sets. In order to run the workflow, you either need to first run the Data Preparation workflow and the Filtering workflow or or download the files linked below. See the Data Availability page for complete details.

Workflow Input

Files needed to run this workflow can be downloaded from figshare.

In this workflow, we use the labdsv package (Roberts and Roberts 2016) to run Dufrene-Legendre Indicator Species Analysis and the microbiomeMarker package (Cao 2020) to run linear discriminant analysis (LDA) effect size (LEfSe) (Segata et al. 2011).

16s rRNA

Indicator Analysis

Indicator Analysis calculates the indicator value (fidelity and relative abundance) of species in clusters or types.

  1. Choose a data set(s) & set the p-value cutoff.
Code
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", 
             "ssu18_ps_perfect", "ssu18_ps_pime")
indval_pval <- 0.05
  1. Format Data Frame
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(otu_table(tmp_get))
     tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
     tmp_row_names <- row.names(tmp_df)
     tmp_row_names <- tmp_row_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}
objects()
  1. Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
     tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
     tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
     tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
     tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
     tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
                                  pval = tmp_pv, freq = tmp_fr)
     tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]

     tmp_tax_df <- data.frame(tax_table(get(i)))
     tmp_tax_df$ASV_ID <- NULL
     tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
     tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
     class(tmp_sum_tax$group) <- "character"
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
     tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
     tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
     tmp_sum_tax$ASV_ID <-  as.character(tmp_sum_tax$ASV_ID)
     tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
     assign(tmp_res_name, tmp_sum_tax)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_get[,1] <- NULL
     tmp_df <- as.data.frame(t(tmp_get))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     #tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
     tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_indval_summary")
  1. Save a new phyloseq object
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Indicator Summary

(16S rRNA) Table 1 | Significant results of Indicator Analysis for FULL data set.

(16S rRNA) Table 2 | Significant results of Indicator Analysis for Arbitrary filtered ASV data set.

(16S rRNA) Table 3 | Significant results of Indicator Analysis for PERfect filtered data set.

(16S rRNA) Table 4 | Significant results of Indicator Analysis for PIME data set.

LEfSe Analysis

Note

Linear discriminant analysis (LDA) effect size (LEfSe) is a method to support high-dimensional class comparisons with a particular focus on microbial community analyses. LEfSe determines the features (organisms, clades, operational taxonomic units, genes, or functions) most likely to explain differences between classes by coupling standard tests for statistical significance with additional tests encoding biological consistency and effect relevance. Class comparison methods typically predict biomarkers consisting of features that violate a null hypothesis of no difference between classes. The effect size provides an estimation of the magnitude of the observed phenomenon due to each characterizing feature and it is thus a valuable tool for ranking the relevance of different biological aspects and for addressing further investigations and analyses.

First, a little data tidying and subsetting only ASV data from the taxonomy table.

Code
for (i in samp_ps) {
  tmp_get <- get(i)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
  tax_table(tmp_get) <- cbind(tax_table(tmp_get),
                           rownames(tax_table(tmp_get)))
  colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order", 
                                    "Family", "Genus", "Species")
  tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
  assign(tmp_name, tmp_get)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
  tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
  assign(tmp_name_asv, tmp_get)
  rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")
  1. Choose a data set(s) & set the LEfSe cutoff values.
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", "ssu18_ps_perfect", "ssu18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05
Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, 
                        kw_cutoff = lefse_kw, 
                        wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")

for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, 
                        kw_cutoff = lefse_kw, 
                        wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")
Code
for (i in samp_ps) {
    tmp_o_tax <- data.frame(tax_table(get(i)))
    tmp_o_tax$ASV_ID <- NULL 
    tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
    
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
    tmp_mt <- data.frame(marker_table(tmp_get))
    tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
    tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
          tibble::remove_rownames()
    tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")

    tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
    
    tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
    assign(tmp_res_name, tmp_sum)
    rm(list = ls(pattern = "tmp_"))
}

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(t(otu_table(tmp_get)))
     #tmp_get[,1] <- NULL
     #tmp_df <- as.data.frame(t(tmp_otu))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     tmp_df$freq <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)

     tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}

Save a new phyloseq object

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

LEfSe Summaries (ASVs & Markers)

ASV summary data


(16S rRNA) Table 5 | Significant results of LEfSe Analysis for FULL data set.