7. Differentially Abundant ASVs & Taxa

Reproducible workflow to assess differentially abundant ASVs and taxa between temperature treatments using Indicator Species Analysis and linear discriminant analysis (LDA) effect size (LEfSe).

Click here for setup information.
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
set.seed(119)
#library(conflicted)
#pacman::p_depends(microbiomeMarker, local = TRUE)  
#pacman::p_depends_reverse(microbiomeMarker, local = TRUE)  
library(phyloseq); packageVersion("phyloseq")
library(Biostrings); packageVersion("Biostrings")
pacman::p_load(tidyverse, patchwork, ampvis2, 
               agricolae, labdsv, naniar, ape,
               pairwiseAdonis, microbiome, seqRFLP, microbiomeMarker,
               reactable, downloadthis, captioner,
               install = FALSE, update = FALSE)

options(scipen=999)
knitr::opts_current$get(c(
  "cache",
  "cache.path",
  "cache.rebuild",
  "dependson",
  "autodep"
))

Synopsis

This workflow contains the ASV differential abundance analysis for the high temperature data sets. To run the workflow, you either need to first run the Data Preparation and Filtering workflows or download the files linked below. See the Data Availability page for complete details.

Workflow Input

Files needed to run this workflow can be downloaded from figshare.

In this workflow, we use the labdsv package (Roberts and Roberts 2016) to run Dufrene-Legendre Indicator Species Analysis and the microbiomeMarker package (Cao 2020) to run linear discriminant analysis (LDA) effect size (LEfSe) (Segata et al. 2011).

16S rRNA

Indicator Analysis

Indicator Analysis calculates the indicator value (fidelity and relative abundance) of species in clusters or types.
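To make the indicator value concrete, here is a minimal toy sketch with made-up counts for two species in two groups (not project data). labdsv::indval() returns, for each species, the group in which its indicator value is maximal (maxcls), that maximum value (indcls), and a permutation p-value (pval).

Code
library(labdsv)
set.seed(119)
# two hypothetical species counted in four samples, two samples per group
toy_counts <- data.frame(sp1 = c(5, 8, 0, 1), sp2 = c(0, 1, 6, 9))
toy_groups <- c(1, 1, 2, 2)
toy_iva <- indval(toy_counts, toy_groups, numitr = 1000)
toy_iva$maxcls  # group where each species reaches its maximum indicator value
toy_iva$indcls  # that maximum value (relative abundance x relative frequency)
toy_iva$pval    # permutation p-values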

  1. Choose the data set(s) & set the p-value cutoff.
Code
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", 
             "ssu18_ps_perfect", "ssu18_ps_pime")
indval_pval <- 0.05
  2. Format Data Frame
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(otu_table(tmp_get))
     tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
     tmp_row_names <- row.names(tmp_df)
     tmp_row_names <- tmp_row_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}
objects()
  3. Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
     tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
     tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
     tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
     tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
     tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
                                  pval = tmp_pv, freq = tmp_fr)
     tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]

     tmp_tax_df <- data.frame(tax_table(get(i)))
     tmp_tax_df$ASV_ID <- NULL
     tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
     tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
     class(tmp_sum_tax$group) <- "character"
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
     tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
     tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
     tmp_sum_tax$ASV_ID <-  as.character(tmp_sum_tax$ASV_ID)
     tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
     assign(tmp_res_name, tmp_sum_tax)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_get[,1] <- NULL
     tmp_df <- as.data.frame(t(tmp_get))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     #tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
     tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_indval_summary")
  4. Save a new phyloseq object
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Indicator Summary

(16S rRNA) Table 1 | Significant results of Indicator Analysis for FULL data set.

(16S rRNA) Table 2 | Significant results of Indicator Analysis for Arbitrary filtered ASV data set.

(16S rRNA) Table 3 | Significant results of Indicator Analysis for PERfect filtered data set.

(16S rRNA) Table 4 | Significant results of Indicator Analysis for PIME data set.

LEfSe Analysis

Note

Linear discriminant analysis (LDA) effect size (LEfSe) is a method to support high-dimensional class comparisons with a particular focus on microbial community analyses. LEfSe determines the features (organisms, clades, operational taxonomic units, genes, or functions) most likely to explain differences between classes by coupling standard tests for statistical significance with additional tests encoding biological consistency and effect relevance. Class comparison methods typically predict biomarkers consisting of features that violate a null hypothesis of no difference between classes. The effect size provides an estimation of the magnitude of the observed phenomenon due to each characterizing feature and it is thus a valuable tool for ranking the relevance of different biological aspects and for addressing further investigations and analyses.
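To see how the cutoffs we set below map onto these stages, here is a hedged toy illustration of the first stage only, a per-feature Kruskal-Wallis test across the three temperature classes (the abundances are invented). Features that pass this test move on to the pairwise Wilcoxon checks (wilcoxon_cutoff) and finally to LDA, where the absolute score is compared against lda_cutoff.

Code
# made-up CPM-normalized abundances of one feature across the three classes
toy_abund <- c(12, 15, 11, 30, 28, 35, 52, 60, 58)
toy_class <- factor(rep(c("C0", "W3", "W8"), each = 3))
kruskal.test(toy_abund, toy_class)$p.value  # stage 1: compared against kw_cutoff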

First, a little data tidying and subsetting only ASV data from the taxonomy table.

Code
for (i in samp_ps) {
  tmp_get <- get(i)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
  tax_table(tmp_get) <- cbind(tax_table(tmp_get),
                           rownames(tax_table(tmp_get)))
  colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order", 
                                    "Family", "Genus", "Species")
  tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
  assign(tmp_name, tmp_get)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
  tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
  assign(tmp_name_asv, tmp_get)
  rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")
  1. Choose the data set(s) & set the LEfSe cutoff values.
samp_ps <- c("ssu18_ps_work", "ssu18_ps_filt", "ssu18_ps_perfect", "ssu18_ps_pime")
lefse_lda <- 2
lefse_kw <- 0.05
lefse_wc <- 0.05
Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, 
                        kw_cutoff = lefse_kw, 
                        wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")

for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, 
                        kw_cutoff = lefse_kw, 
                        wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")
Code
for (i in samp_ps) {
    tmp_o_tax <- data.frame(tax_table(get(i)))
    tmp_o_tax$ASV_ID <- NULL 
    tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
    
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
    tmp_mt <- data.frame(marker_table(tmp_get))
    tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
    tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
          tibble::remove_rownames()
    tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")

    tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
    
    tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
    assign(tmp_res_name, tmp_sum)
    rm(list = ls(pattern = "tmp_"))
}

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(t(otu_table(tmp_get)))
     #tmp_get[,1] <- NULL
     #tmp_df <- as.data.frame(t(tmp_otu))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     tmp_df$freq <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)

     tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}

Save a new phyloseq object

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

LEfSe Summaries (ASVs & Markers)

ASV summary data


(16S rRNA) Table 5 | Significant results of LEfSe Analysis for FULL data set.

(16S rRNA) Table 6 | Significant results of LEfSe Analysis for Arbitrary filtered ASV data set.

(16S rRNA) Table 7 | Significant results of LEfSe Analysis for PERfect filtered data set.

(16S rRNA) Table 8 | Significant results of LEfSe Analysis for PIME data set.

ASV Visualizations

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
     tmp_mt <- data.frame(marker_table(tmp_get))
     tmp_mt  <-  tmp_mt %>% filter(ef_lda >= 3)
     tmp_mt <- marker_table(tmp_mt)
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
     assign(tmp_res_name, tmp_mt)
     rm(list = ls(pattern = "tmp_"))
}

objects(pattern = "_lefse_asv_viz_trim")

(16S rRNA) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(16S rRNA) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.

Marker summary data

Code
for (i in samp_ps) {
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
    tmp_mt <- data.frame(marker_table(tmp_get))
    tmp_mt <- tmp_mt[, c(2:4,1)]
    tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
    tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
    tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature", 
                                         into = c("Kingdom", "Phylum", "Class", 
                                                  "Order", "Family", "Genus", "ASV_ID"), 
                                         sep = "\\|\\w__")
    
    tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
    tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
    assign(tmp_res_name, tmp_mt)
    rm(list = ls(pattern = "tmp_"))
}
objects(pattern =  "_lefse_marker_final")

(16S rRNA) Table 9 | Significant results of LEfSe MARKER Analysis for FULL data set.

(16S rRNA) Table 10 | Significant results of LEfSe MARKER Analysis for Arbitrary filtered ASV data set.

(16S rRNA) Table 11 | Significant results of LEfSe MARKER Analysis for PERfect filtered data set.

(16S rRNA) Table 12 | Significant results of LEfSe MARKER Analysis for PIME data set.

Marker visualizations

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
     tmp_mt <- data.frame(marker_table(tmp_get))
     tmp_mt  <-  tmp_mt %>% filter(ef_lda >= 4)
     tmp_mt <- marker_table(tmp_mt)
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
     assign(tmp_res_name, tmp_mt)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_all_viz_trim")


(16S rRNA) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 4.0.

(16S rRNA) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 4.0.

Response of select taxa

In this section of the workflow we use the microbiomeMarker package to assess the response of taxonomic lineages to soil warming. In the first step we need to fix the selected data set to make it compatible with the various functions. For this analysis we use the PERfect filtered data set.

Code
## FIX ps object
ssu_ps <- ssu18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ssu_ps))
#tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ssu_ps),
                     phy_tree(ssu_ps),
                     tax_table(ps_tax_new),
                     sample_data(ssu_ps))
ssu_ps <- tmp_ps
phyloseq::rank_names(ssu_ps)
data.frame(tax_table(ssu_ps))

Next we run a statistical test for multiple groups using the run_test_multiple_groups function.

Code
ssu_group_anova <-  run_test_multiple_groups(ssu_ps, group = "TEMP", 
                                             taxa_rank = "all", 
                                             method = "anova")
ssu_group_anova@marker_table
marker_table(ssu_group_anova)

And then we conduct post hoc pairwise comparisons for the multiple-group test using the run_posthoc_test function.

Code
ssu_default_pht <- run_posthoc_test(ssu_ps, group = "TEMP", 
                                    method = "tukey", transform = "log10")

We can filter out a selected taxon and plot the results.

filter(data.frame(ssu_default_pht@result), group_name == "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales")
        group                                                       group_name
3-0.140   141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-0.140   141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
8-3.140   141 k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales
        comparions   diff_mean      pvalue      ci_lower   ci_upper
3-0.140        3-0 0.007492377 0.645933374 -0.0145296109 0.02951437
8-0.140        8-0 0.030358682 0.008215796  0.0083366941 0.05238067
8-3.140        8-3 0.022866305 0.041743700  0.0008443168 0.04488829
plot_postHocTest(ssu_default_pht, feature = "k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales") & theme_bw()

But what we really want to do is get all of the markers that are significant from the analysis, excluding any significant ASVs, so we can look at higher taxonomic ranks.

Code
ssu_pht <- ssu_default_pht
ssu18_pht_filt <- filter(data.frame(ssu_pht@result), pvalue <= 0.05)
ssu18_pht_filt <- ssu18_pht_filt[!grepl("ASV", ssu18_pht_filt$group_name),]
ssu18_pht_filt <- ssu18_pht_filt[!grepl("[a-z]__$", ssu18_pht_filt$group_name),]
ssu18_pht_filt <- distinct(ssu18_pht_filt, group_name, .keep_all = TRUE)
nrow(ssu18_pht_filt)
save.image("page_build/mmarker_wf.rdata")

There are 47 significantly different lineage markers.

Click here to see a list of significantly different lineage markers.

Now we select a subset of the 47 markers to plot and visualize the results.

Code
ssu_select <- c(
"k__Bacteria|p__Acidobacteriota|c__Acidobacteriae|o__Subgroup_2", 
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Chitinophagales|f__Saprospiraceae", 
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Cytophagales", 
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales", 
"k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Sphingobacteriales", 
"k__Bacteria|p__Myxococcota|c__Polyangia|o__mle1-27", 
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Comamonadaceae|g__Rubrivivax", 
"k__Bacteria|p__Actinobacteriota|c__Acidimicrobiia|o__Microtrichales", 
"k__Bacteria|p__Actinobacteriota|c__Thermoleophilia|o__Gaiellales", 
"k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales", 
"k__Bacteria|p__Myxococcota|c__Myxococcia|o__Myxococcales|f__Myxococcaceae|g__Corallococcus", 
"k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Ralstonia" 
)

(16S rRNA) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles, and whiskers extend to 1.5 times the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).

ITS

Indicator Analysis

  1. Choose the data set(s) & set the p-value cutoff.
samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime") 
indval_pval <- 0.05
  2. Format Data Frame
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(otu_table(tmp_get))
     tmp_df <- tmp_df[, which(colSums(tmp_df) != 0)]
     tmp_row_names <- row.names(tmp_df)
     tmp_row_names <- tmp_row_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     tmp_df <- tmp_df %>% tibble::add_column(tmp_row_names, .before = 1)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_seq_tab"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}
objects()
  3. Run Indicator Analysis
Code
set.seed(1191)
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_iva <- indval(tmp_get[,-1], tmp_get[,1])
     tmp_gr <- tmp_iva$maxcls[tmp_iva$pval <= indval_pval]
     tmp_iv <- tmp_iva$indcls[tmp_iva$pval <= indval_pval]
     tmp_pv <- tmp_iva$pval[tmp_iva$pval <= indval_pval]
     tmp_fr <- apply(tmp_get[,-1] > 0, 2, sum)[tmp_iva$pval <= indval_pval]
     tmp_sum <- data.frame(group = tmp_gr, indval = tmp_iv,
                                  pval = tmp_pv, freq = tmp_fr)
     tmp_sum <- tmp_sum[order(tmp_sum$group, -tmp_sum$indval),]

     tmp_tax_df <- data.frame(tax_table(get(i)))
     tmp_tax_df$ASV_ID <- NULL
     tmp_sum_tax <- merge(tmp_sum, tmp_tax_df, by = "row.names", all = TRUE)
     tmp_sum_tax <- tmp_sum_tax[!(is.na(tmp_sum_tax$group)),]
     class(tmp_sum_tax$group) <- "character"
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^1$", "C0")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^2$", "W3")
     tmp_sum_tax$group <- stringr::str_replace(tmp_sum_tax$group, "^3$", "W8")
     tmp_sum_tax <- tmp_sum_tax %>% dplyr::rename("ASV_ID" = "Row.names")
     tmp_sum_tax <- tmp_sum_tax[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum_tax$ASV_ID))),]
     tmp_sum_tax$ASV_ID <-  as.character(tmp_sum_tax$ASV_ID)
     tmp_sum.prob.corrected <- p.adjust(tmp_sum$pval, "bonferroni")
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_indval_summary"))
     assign(tmp_res_name, tmp_sum_tax)
     rm(list = ls(pattern = "tmp_"))
}

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_seq_tab")))
     tmp_get[,1] <- NULL
     tmp_df <- as.data.frame(t(tmp_get))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     #tmp_df$freq_all <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", 
                                                                         "W3", 
                                                                         "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_indval, by = "ASV_ID", all = FALSE)
     tmp_merge_df <- tmp_merge_df[,c(1,9:12,2:8,13:19)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_indval_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}
  4. Save a new phyloseq object
Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_indval_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_ind"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Indicator Summary

(ITS) Table 1 | Significant results of Indicator Analysis for FULL data set.

(ITS) Table 2 | Significant results of Indicator Analysis for Arbitrary filtered ASV data set.

(ITS) Table 3 | Significant results of Indicator Analysis for PERfect filtered data set.

(ITS) Table 4 | Significant results of Indicator Analysis for PIME data set.

LEfSe Analysis

Details about LEfSe

microbiomeMarker

First, a little data tidying: remove the ASV sequences and subset only the ASV data from the taxonomy table.

Code
for (i in samp_ps) {
  tmp_get <- get(i)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(1:6)]
  tax_table(tmp_get) <- cbind(tax_table(tmp_get),
                           rownames(tax_table(tmp_get)))
  colnames(tax_table(tmp_get)) <- c("Kingdom", "Phylum", "Class", "Order", 
                                    "Family", "Genus", "Species")
  tmp_name <- purrr::map_chr(i, ~ paste0(., "_rf_all"))
  assign(tmp_name, tmp_get)
  tax_table(tmp_get) <- tax_table(tmp_get)[,c(7)]
  tmp_name_asv <- purrr::map_chr(i, ~ paste0(., "_rf_asv"))
  assign(tmp_name_asv, tmp_get)
  rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_rf")
  1. Choose the data set(s) & set the LEfSe cutoff values.
samp_ps <- c("its18_ps_work", "its18_ps_filt", "its18_ps_perfect", "its18_ps_pime") 
lefse_lda <- 2  
lefse_kw <- 0.05    
lefse_wc <- 0.05    
Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_asv")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")

for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_rf_all")))
     tmp_lefse <- run_lefse(tmp_get, norm = "CPM", class = "TEMP", correct = "2", 
                        lda_cutoff = lefse_lda, kw_cutoff = lefse_kw, wilcoxon_cutoff = lefse_wc, 
                        bootstrap_n = 30, bootstrap_fraction = 2/3, 
                        sample_min = 10, multicls_strat = FALSE, curv = FALSE)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all"))
     assign(tmp_name, tmp_lefse)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_lefse_asv")
objects(pattern = "_lefse_all")
Code
for (i in samp_ps) {
    tmp_o_tax <- data.frame(tax_table(get(i)))
    tmp_o_tax$ASV_ID <- NULL 
    tmp_o_tax <- tmp_o_tax %>% tibble::rownames_to_column("ASV_ID")
    
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
    tmp_mt <- data.frame(marker_table(tmp_get))
    tmp_mt$feature <- gsub('s__', '', tmp_mt$feature)
    tmp_mt <- tmp_mt %>% dplyr::rename(c("ASV_ID" = 1, "group" = 2, "pval" = 4)) %>%
          tibble::remove_rownames()
    tmp_sum <- dplyr::left_join(tmp_mt, tmp_o_tax, by = "ASV_ID")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^0$", "C0")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^3$", "W3")
    tmp_sum$group <- stringr::str_replace(tmp_sum$group, "^8$", "W8")

    tmp_sum <- tmp_sum[order(as.numeric(gsub("[A-Z]{3}", "", tmp_sum$ASV_ID))),]
    
    tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_summary"))
    assign(tmp_res_name, tmp_sum)
    rm(list = ls(pattern = "tmp_"))
}

Now we can save a few files and display the data.

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_df <- data.frame(t(otu_table(tmp_get)))
     #tmp_get[,1] <- NULL
     #tmp_df <- as.data.frame(t(tmp_otu))
     tmp_col_names <- colnames(tmp_df)
     tmp_col_names <- tmp_col_names %>%
              stringr::str_replace("P[0-9]{2}_D[0-9]{2}_[0-9]{3}_", "") %>%
              stringr::str_replace("[A-Z]$", "")
     colnames(tmp_df) <- tmp_col_names
     tmp_df$freq <- apply(tmp_df > 0, 1, sum)
     tmp_df$freq_C0 <- apply(tmp_df[ , (names(tmp_df) %in% "C0")] > 0, 1, sum)
     tmp_df$freq_W3 <- apply(tmp_df[ , (names(tmp_df) %in% "W3")] > 0, 1, sum)
     tmp_df$freq_W8 <- apply(tmp_df[ , (names(tmp_df) %in% "W8")] > 0, 1, sum)
     tmp_df$reads_total <- base::rowSums(tmp_df[ , (names(tmp_df) %in% c("C0", "W3", "W8"))])
     tmp_df$reads_C0 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "C0")])
     tmp_df$reads_W3 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W3")])
     tmp_df$reads_W8 <- base::rowSums(tmp_df[ , (names(tmp_df) %in% "W8")])
     tmp_df <- tmp_df[,!grepl("^[C | W]", names(tmp_df))]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("ASV_ID")

     tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_merge_df <- merge(tmp_df, tmp_get_lefse, by = "ASV_ID", all = FALSE)

     tmp_merge_df <- tmp_merge_df[,c(1,10:12,2:9,14:20)]
     tmp_merge_df <- tmp_merge_df[order(tmp_merge_df$reads_total, decreasing = TRUE), ]
     tmp_merge_name <- purrr::map_chr(i, ~ paste0(., "_lefse_final"))
     assign(tmp_merge_name, tmp_merge_df)
     rm(list = ls(pattern = "tmp_"))
}

Save a new phyloseq object

Code
for (i in samp_ps) {
     tmp_get <- get(i)
     tmp_tab <- get(purrr::map_chr(i, ~ paste0(., "_lefse_summary")))
     tmp_list <- tmp_tab[,1]
     tmp_ps <- prune_taxa(tmp_list, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_lefse"))
     assign(tmp_name, tmp_ps)
     tmp_ps@phy_tree <- NULL
     tmp_ps <- prune_samples(sample_sums(tmp_ps) > 0, tmp_ps)
     tmp_ps_tree <- rtree(ntaxa(tmp_ps), rooted = TRUE,
                            tip.label = taxa_names(tmp_ps))
     tmp_ps <- merge_phyloseq(tmp_ps, sample_data, tmp_ps_tree)
     assign(tmp_name, tmp_ps)
     rm(list = ls(pattern = "tmp_"))
}
objects()

LEfSe Summaries (ASVs & Markers)

ASV summary data


(ITS) Table 5 | Significant results of LEfSe Analysis for FULL data set.

(ITS) Table 6 | Significant results of LEfSe Analysis for Arbitrary filtered ASV data set.

(ITS) Table 7 | Significant results of LEfSe Analysis for PERfect filtered data set.

(ITS) Table 8 | Significant results of LEfSe Analysis for PIME data set.

ASV Visualizations

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_asv")))
     tmp_mt <- data.frame(marker_table(tmp_get))
     tmp_mt  <-  tmp_mt %>% filter(ef_lda >= 3)
     tmp_mt <- marker_table(tmp_mt)
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_asv_viz_trim"))
     assign(tmp_res_name, tmp_mt)
     rm(list = ls(pattern = "tmp_"))
}

objects(pattern = "_lefse_asv_viz_trim")

(ITS) Figure 1 | Differentially abundant ASVs from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 2 | Differentially abundant ASVs from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 3 | Differentially abundant ASVs from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.0.

(ITS) Figure 4 | Differentially abundant ASVs from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.0.

Marker summary data

Code
for (i in samp_ps) {
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
    tmp_mt <- data.frame(marker_table(tmp_get))
    tmp_mt <- tmp_mt[, c(2:4,1)]
    tmp_mt <- tmp_mt %>% dplyr::rename(c("group" = 1, "pval" = 3))
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^0$", "C0")
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^3$", "W3")
    tmp_mt$group <- stringr::str_replace(tmp_mt$group, "^8$", "W8")
    tmp_mt <- tmp_mt %>% tibble::rownames_to_column("marker")
    tmp_mt <- tmp_mt %>% tidyr::separate(col = "feature", 
                                         into = c("Kingdom", "Phylum", "Class", 
                                                  "Order", "Family", "Genus", "ASV_ID"), 
                                         sep = "\\|\\w__")
    
    tmp_mt$Kingdom <- gsub('k__', '', tmp_mt$Kingdom)
    tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_marker_final"))
    assign(tmp_res_name, tmp_mt)
    rm(list = ls(pattern = "tmp_"))
}


(ITS) Table 9 | Significant results of LEfSe MARKER Analysis for FULL data set.

(ITS) Table 10 | Significant results of LEfSe MARKER Analysis for Arbitrary filtered ASV data set.

(ITS) Table 11 | Significant results of LEfSe MARKER Analysis for PERfect filtered data set.

(ITS) Table 12 | Significant results of LEfSe MARKER Analysis for PIME data set.

Marker visualizations

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_lefse_all")))
     tmp_mt <- data.frame(marker_table(tmp_get))
     tmp_mt  <-  tmp_mt %>% filter(ef_lda >= 3.5)
     tmp_mt <- marker_table(tmp_mt)
     tmp_res_name <- purrr::map_chr(i, ~ paste0(., "_lefse_all_viz_trim"))
     assign(tmp_res_name, tmp_mt)
     rm(list = ls(pattern = "tmp_"))
}

objects(pattern = "_lefse_all_viz_trim")

(ITS) Figure 5 | Differentially abundant MARKER from LEfSe analysis for the FULL data set, trimmed to show those with LDA score ≥ 3.5.

(ITS) Figure 6 | Differentially abundant MARKER from LEfSe analysis for the Arbitrary filtered data set, trimmed to show those with LDA score ≥ 3.5.

(ITS) Figure 7 | Differentially abundant MARKER from LEfSe analysis for the PERfect filtered data set, trimmed to show those with LDA score ≥ 3.5.

(ITS) Figure 8 | Differentially abundant MARKER from LEfSe analysis for the PIME filtered data set, trimmed to show those with LDA score ≥ 3.5.

Response of select taxa

The workflow here is basically a carbon copy of the workflow described above. Presented here for posterity.

Click here to see the ITS version of this workflow.
## FIX ps object
ps <- its18_ps_perfect_rf_all
tmp_tax1 <- data.frame(tax_table(ps))
tmp_tax1$ASV_SEQ <- NULL
tmp_rn <- row.names(tmp_tax1)
tmp_tax <- data.frame(lapply(tmp_tax1, function(x) {gsub("\\(|)", "", x)}))
row.names(tmp_tax) <- tmp_rn
identical(row.names(tmp_tax), row.names(tmp_tax1))
#dplyr::filter(tmp_tax, Phylum == "Acidobacteriota")
ps_tax_new <- as.matrix(tmp_tax)
tmp_ps <- phyloseq(otu_table(ps),
                     tax_table(ps_tax_new),
                     sample_data(ps))
its_ps <- tmp_ps
phyloseq::rank_names(its_ps)
data.frame(tax_table(its_ps))
its_group_anova <-  run_test_multiple_groups(its_ps, group = "TEMP", 
                                             taxa_rank = "all", method = "anova")
its_group_anova@marker_table
marker_table(its_group_anova)
its_default_pht <- run_posthoc_test(its_ps, 
                                    group = "TEMP", method = "tukey", 
                                    transform = "log10")
its_pht <- its_default_pht
its18_pht_filt <- filter(data.frame(its_pht@result), pvalue <= 0.05)
its18_pht_filt <- its18_pht_filt[!grepl("ASV", its18_pht_filt$group_name),]
its18_pht_filt <- its18_pht_filt[!grepl("[a-z]__$", its18_pht_filt$group_name),]
its18_pht_filt <- unique(its18_pht_filt$group_name)
length(its18_pht_filt)

There are 32 significantly different lineage markers.

Click here to see a list of significantly different lineage markers.
 [1] "k__Fungi"                                                                                               
 [2] "k__Fungi|p__Basidiomycota"                                                                              
 [3] "k__Fungi|p__Chytridiomycota"                                                                            
 [4] "k__Fungi|p__Glomeromycota"                                                                              
 [5] "k__Fungi|p__Mortierellomycota"                                                                          
 [6] "k__Fungi|p__Basidiomycota|c__Agaricomycetes"                                                            
 [7] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes"                                                        
 [8] "k__Fungi|p__Glomeromycota|c__Glomeromycetes"                                                            
 [9] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes"                                                    
[10] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis"                                        
[11] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales"                                              
[12] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales"                                     
[13] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales"                                              
[14] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales"                                  
[15] "k__Fungi|p__Rozellomycota|c__Rozellomycotina_cls_Incertae_sedis|o__GS11"                                
[16] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae"                               
[17] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae"                                 
[18] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae"                    
[19] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae"                             
[20] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Clavariaceae"                              
[21] "k__Fungi|p__Basidiomycota|c__Agaricomycetes|o__Agaricales|f__Entolomataceae"                            
[22] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae"                 
[23] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae"                               
[24] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae"               
[25] "k__Fungi|p__Ascomycota|c__Eurotiomycetes|o__Eurotiales|f__Trichocomaceae|g__Talaromyces"                
[26] "k__Fungi|p__Ascomycota|c__Pezizomycetes|o__Pezizales|f__Pyronemataceae|g__Scutellinia"                  
[27] "k__Fungi|p__Ascomycota|c__Saccharomycetes|o__Saccharomycetales|f__Metschnikowiaceae|g__Metschnikowia"   
[28] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Hypocreales|f__Nectriaceae|g__Fusarium"                    
[29] "k__Fungi|p__Ascomycota|c__Sordariomycetes|o__Xylariales|f__Microdochiaceae|g__Idriella"                 
[30] "k__Fungi|p__Basidiomycota|c__Microbotryomycetes|o__Sporidiobolales|f__Sporidiobolaceae|g__Rhodotorula"  
[31] "k__Fungi|p__Glomeromycota|c__Glomeromycetes|o__Glomerales|f__Glomeraceae|g__Kamienskia"                 
[32] "k__Fungi|p__Mortierellomycota|c__Mortierellomycetes|o__Mortierellales|f__Mortierellaceae|g__Mortierella"

(ITS) Figure 9 | The response of select taxa to two years of warming by +3°C and +8°C. Differences assessed for multiple-group pair-wise comparisons using ANOVA followed by Tukey HSD post hoc tests. PERfect filtered read count data was log10 transformed and normalized using total sum scaling (TSS). The centre line of each box plot represents the median, the lower and upper hinges represent the first and third quartiles, and whiskers extend to 1.5 times the interquartile range. Significant differences denoted by asterisks (* p ≤ 0.05, ** p ≤ 0.01, *** p ≤ 0.001, **** p ≤ 0.0001; ns = not significant).

Visualizing DA ASVs in Anvi’o

Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.

anvi-interactive -h
MANUAL INPUTS:
  Mandatory input parameters to start the interactive interface without
  anvi'o databases.

--manual-mode           We need this flag to run anvi'o in an ad hoc
                        manner, i.e., no database.
-f FASTA, --fasta-file FASTA
                        A FASTA-formatted input file. This is sort of
                        optional
-d VIEW_DATA, --view-data VIEW_DATA
                        A TAB-delimited file for view data. This is the ASV
                        by sample matrix. We need this
-t NEWICK, --tree NEWICK
                        NEWICK formatted tree structure. How the ASVs are
                        ordered in our case.
ADDITIONAL STUFF:
  Parameters to provide additional layers, views, or layer data.

-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
                        A TAB-delimited file for an additional view to be used
                        in the interface. This file should contain all split
                        names, and values for each of them in all samples.
                        Each column in this file must correspond to a sample
                        name. Content of this file will be called 'user_view',
                        which will be available as a new item in the 'views'
                        combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
                        A TAB-delimited file for additional layer info. In
                        our case this is info about each ASV. The first column
                        of the file must be the ASV names, and
                        the remaining columns should be unique attributes.

There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.

Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out of the visualization, we need to create a few files to give anvi’o when we fire up the interactive interface.

  1. View data: in our case, a sample by ASV abundance matrix.
  2. Additional info about each ASV.
  3. Additional info about each sample.
  4. Taxa abundance data for each sample at some rank.
  5. Dendrograms ordering the ASVs and samples (based on view data).
  6. Fasta file of all ASVs in the analysis.

Main Steps

1. View data

Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we trim each data set to its most abundant ASVs (more than 100 total reads for the PIME set, more than 300 for the others), including those identified as differentially abundant by the ISA and/or LEfSe.
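For orientation, the first few rows of data.txt look something like this (the ASV IDs and counts shown are hypothetical and only illustrate the layout: the first column, here named Group, holds the item names and every remaining column is a sample):

Group            P01_D00_010_C0A  P02_D00_010_C0B  P06_D00_010_W3A
ASV1             1542             987              1210
ASV7             0                23               311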

Code
samp_ps_all <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps <- c("ssu18_ps_filt", "ssu18_ps_pime", "ssu18_ps_perfect")
samp_ps_org <- c("ssu18_ps_work")
samp_ps_pime <- c("ssu18_ps_pime")
samp_ps_other <- c("ssu18_ps_filt", "ssu18_ps_perfect")

trim_val <- 100
for (i in samp_ps_pime) {
     tmp_get <- get(i)
     tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}

trim_val <- 300
for (i in samp_ps_other) {
     tmp_get <- get(i)
     tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_df <- as.data.frame(t(otu_table(tmp_get)))
     tmp_df <- tmp_df %>% rownames_to_column("Group")
     tmp_path <- file.path("files/anvio/ssu/")
     write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}

Or export a table of transformed data.

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
     tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
     tmp_df <- tmp_df %>% rownames_to_column("Group")
     #tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
     #assign(tmp_name, tmp_df)
     tmp_path <- file.path("files/anvio/ssu/")
     write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}

2. Additional Layers for ASVs

Next, we need some additional data about the ASVs to overlay on the visual. This can be anything; however, what I specifically want are the details of the ISA analysis, total read counts, and lineage info. I warn you: this code will get ugly, and I urge you to find a better way.

Start with an ASV + lineage table for the ASVs in the new phyloseq object.

Code
for (i in samp_ps) {
     tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
     tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
                                          dplyr::rename("enrich_indval" = "group") %>%
                                          dplyr::rename("test_indval" = "indval") %>%
                                          dplyr::rename("pval_indval" = "pval")
     tmp_get_indval <- tmp_get_indval[,1:5]

     tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
     tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
                                          dplyr::rename("enrich_lefse" = "group") %>%
                                          dplyr::rename("test_lefse" = "lda") %>%
                                          dplyr::rename("pval_lefse" = "pval")
     tmp_get_lefse <- tmp_get_lefse[,1:4]
     
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
     tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
     tmp_total <- rev(tmp_total)[1]
     tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
     tmp_tax_df <- as.data.frame(tax_table(tmp_get))
     tmp_tax_df$ASV_SEQ <- NULL
     tmp_tax_df$ASV_ID <- NULL
     
     tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
     tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
                   dplyr::left_join(., tmp_get_indval, by = "Group") %>%
                   dplyr::left_join(., tmp_get_lefse, by = "Group")
     tmp_add_lay$ASV_ID <- tmp_add_lay$Group
     tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]
     tmp_path <- file.path("files/anvio/ssu/")
     write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE, na = "")
     rm(list = ls(pattern = "tmp_"))
}

3. Additional Views for Samples

Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.

Note

Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.

Code
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
                           header = TRUE)

tmp_x <- readRDS("files/alpha/rdata/ssu18_ps_pime.rds")
data.frame(sample_data(tmp_x))
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
     tmp_get <- get(i)
     tmp_df <- data.frame(sample_data(tmp_get))
     tmp_df <- tmp_df[,c(2:9)]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
     tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
     tmp_rc <- data.frame(readcount(tmp_get))
     tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
     tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
     #identical(tmp_df$id, tmp_rc$id)
     tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
     tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
     tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
     tmp_path <- file.path("files/anvio/ssu/")
     write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)

4. Taxon rank abundance by sample

It turned out this was a little tricky to figure out, but thanks to a nifty block of code written by guoyanzhao on the phyloseq Issues forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.

Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o requires a specific format where each row is a sample and each column is a taxon. Taxa names need the prefix t_<RANK>!. For example, t_class! should be added for the Class rank.
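For example, the first rows of tax_layers.txt for the Phylum rank look something like this (the counts are hypothetical; only the column-name convention matters, and the first column holds the sample names despite its taxon header):

taxon            t_phylum!Proteobacteria  t_phylum!Acidobacteriota  t_phylum!Actinobacteriota
P01_D00_010_C0A  10231                    8120                      3011
P04_D00_010_W3A  9876                     7543                      2890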

Note

Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.

Code
pick_rank <- "Phylum"
pick_rank_l <- "phylum"
for (i in samp_ps_all) {
# Make the table
    tmp_get <- get(i)
    tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
    tmp_melt <- psmelt(tmp_glom)
    tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
    tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
    colnames(tmp_abund)[2] <- "tax_rank"
    library(reshape) # cast() below comes from reshape, not reshape2
    tmp_abund <- as.data.frame(reshape::cast(tmp_abund, Sample ~ tax_rank))
    tmp_abund <- tibble::remove_rownames(tmp_abund)
    tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
    tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
    tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
    tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
    tmp_path <- file.path("files/anvio/ssu/")
    write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
    rm(list = ls(pattern = "tmp_"))
}

5. Construct Dendrograms

The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.

anvi-matrix-to-newick data.txt --distance euclidean \
                               --linkage ward \
                               -o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
                               --linkage ward \
                               -o sample.tre \
                               --transpose

The first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.

Alternatively, we can generate dendrograms using phyloseq::distance and hclust.

Code
pick_dist <- "bray"
pick_clust <- "complete"

for (i in samp_ps) {
# Make the table
    tmp_get <- get(i)
    tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "sample")
    tmp_dend <- hclust(tmp_dist, method = pick_clust)
    plot(tmp_dend, hang = -1)
    tmp_tree <- as.phylo(tmp_dend) 
    
    tmp_path <- file.path("files/anvio/ssu/")
    write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
    rm(list = ls(pattern = "tmp_"))
}
Code
pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"

for (i in samp_ps) {
# Make the table
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
    tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
    tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
    plot(tmp_dend, hang = -1)
    tmp_tree <- as.phylo(tmp_dend) 
    
    tmp_path <- file.path("files/anvio/ssu/")
    write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
    rm(list = ls(pattern = "tmp_"))
}
Note

The file asv.tre is loaded with anvi-interactive command & the --tree flag.

The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to go into a three-column, tab-delimited table. This way you can add multiple orderings to the same file and view them all in the interactive. The table needs to be in this format:

item_name data_type data_value
tree_1 newick ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,..
tree_2 newick ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,…
(…) (…) (…)

This is easy to do by hand, but I really need the practice, so I will do it in R. Anvi’o is very particular about formatting. For example, the file cannot end with a blank line, which it will because anvi’o adds a newline when it writes the initial dendrogram. We need to get rid of that or we get an error when trying to import the table.

Note

The file sample.tre is added to the profile.db with anvi-import-misc-data command & --target-data-table layer_orders flag.

Code
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/ssu/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)
      tmp_item <- c("bray_complete")
      tmp_type <- c("newick")
      tmp_df <- c(tmp_tree)
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_df)
      library(janitor)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name",   "data_type",    "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}
Code
# FOR TRANSFORMED DATA
library(janitor)
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/ssu/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_trans.tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)   # strip the newline anvi'o appended
      tmp_item <- "euclidean_ward_trans"         # distinguish the ordering built from the transformed view data
      tmp_type <- "newick"
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_tree)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}
objects()
Code
# FOR HCLUST TREE
library(janitor)
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/ssu/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)
      tmp_item <- paste(pick_dist, "_", pick_clust, "_hclust", sep = "")
      tmp_type <- "newick"
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_tree)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}

6. Make a fasta file

We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.

Note

Loaded with anvi-interactive command & --fasta-file flag.

Code
for (i in samp_ps) {
       tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
       tmp_tab <- tax_table(tmp_get)
       tmp_tab <- tmp_tab[, 7]                        # column 7 of the tax table holds the ASV sequence
       tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
       colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
       tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)  # prepend ">" so each ID becomes a fasta header
       tmp_path <- file.path("files/anvio/ssu/")

       # writing the ID and sequence columns with a newline separator interleaves fasta headers and sequences
       write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
            sep = "\n", col.names = FALSE, row.names = FALSE,
            quote = FALSE, fileEncoding = "UTF-8")
       rm(list = ls(pattern = "tmp_"))
}

Building the Profile Database

Time to put all of these pieces together. This gets a little tricky because we do not have a database to start with: some of these files can be loaded directly in the interactive interface, while others need to be added to a database first. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.

anvi-interactive --view-data data.txt \
                 --tree asv.tre \
                 --additional-layers additional_layers.txt \
                 --profile-db profile.db \
                 --manual

Now we have a new profile database to which we can add the sample metadata (additional_views.txt) and the sample dendrogram (sample.tre) using the anvi-import-misc-data command. These commands add the tables to the new profile.db. First, kill the interactive.

anvi-import-misc-data additional_views.txt \
                      --pan-or-profile-db profile.db \
                      --target-data-table layers
anvi-import-misc-data sample.tre \
                      --pan-or-profile-db profile.db \
                      --target-data-table layer_orders

One last thing is to get the table with the taxonomy totals by sample (tax_layers.txt) into the profile database. We will run the same command we just used.

anvi-import-misc-data tax_layers.txt \
                      --pan-or-profile-db profile.db \
                      --target-data-table layers

In fact, we could just as easily append the taxonomy total data onto additional_views.txt and import everything in one command. But we didn’t.

Interactive Interface

With a populated database in hand, we can now begin modifying the visual by running the interactive command again.

anvi-interactive --view-data data.txt \
                 --tree asv.tre \
                 --additional-layers additional_layers.txt \
                 --profile-db profile.db \
                 --fasta-file anvio.fasta \
                 --manual
Click here for the ITS version of this workflow.


The ITS version of the anvi’o workflow is basically a carbon copy of the workflow presented above. It is included here for posterity.

Visualizing DA ASVs in Anvi’o

Next, we will combine the results of the ISA and LEfSe analyses with the distribution of ASVs across each sample. We are going to do the analysis in anvi’o—an advanced analysis and visualization platform for ’omics data (Eren et al. 2015)—using the anvi-interactive command. Anvi’o likes databases but it also understands that sometimes you do not have a database. So it offers a manual mode. If you type this command you can have a look at the relevant pieces we need for the visualization, specifically those under the headings MANUAL INPUTS and ADDITIONAL STUFF.

anvi-interactive -h
MANUAL INPUTS:
  Mandatory input parameters to start the interactive interface without
  anvi'o databases.

--manual-mode           We need this flag to run anvi'o in an ad hoc
                        manner, i.e., no database.
-f FASTA, --fasta-file FASTA
                        A FASTA-formatted input file. This is sort of
                        optional
-d VIEW_DATA, --view-data VIEW_DATA
                        A TAB-delimited file for view data. This is the ASV
                        by sample matrix. We need this
-t NEWICK, --tree NEWICK
                        NEWICK formatted tree structure. How the ASVs are
                        ordered in our case.
ADDITIONAL STUFF:
  Parameters to provide additional layers, views, or layer data.

-V ADDITIONAL_VIEW, --additional-view ADDITIONAL_VIEW
                        A TAB-delimited file for an additional view to be used
                        in the interface. This file should contain all split
                        names, and values for each of them in all samples.
                        Each column in this file must correspond to a sample
                        name. Content of this file will be called 'user_view',
                        which will be available as a new item in the 'views'
                        combo box in the interface
-A ADDITIONAL_LAYERS, --additional-layers ADDITIONAL_LAYERS
                        A TAB-delimited file for additional layer info. In
                        our case this is info about each ASV. The first column
                        of the file must be the ASV names, and
                        the remaining columns should be unique attributes.

There are also a few files we generate that cannot be loaded directly. So, in addition to the files that can be loaded when running the interactive, we also have files that must be added to the database created by anvi’o.

Here is a nice tutorial on Working with anvi’o additional data tables. A lot of what we need is covered in this tutorial. To get the most out of the visualization, we need to create a few files to give anvi’o when we fire up the interactive interface.

  1. View data: in our case, a sample by ASV abundance matrix.
  2. Additional info about each ASV.
  3. Additional info about each sample.
  4. Taxa abundance data for each sample at some rank.
  5. Dendrograms ordering the ASVs and samples (based on view data).
  6. Fasta file of all ASVs in the analysis.

Main steps

1. View data

Let’s start with the -d or --view-data file. This file needs to be an ASV by sample matrix of read counts. To simplify the visualization, we will keep only ASVs above a minimum total read count (set by trim_val in the code below: more than 50 reads for the PIME data sets and more than 300 for the others), a set that still includes the ASVs identified as differentially abundant by the ISA and/or LEfSe.

Code
samp_ps_all <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect", 
                 "its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu", 
                 "its18_ps_work", "its18_ps_work_otu")
samp_ps <- c("its18_ps_filt", "its18_ps_pime", "its18_ps_perfect", 
             "its18_ps_filt_otu", "its18_ps_pime_otu", "its18_ps_perfect_otu")
samp_ps_org <- c("its18_ps_work", "its18_ps_work_otu")
samp_ps_pime <- c("its18_ps_pime", "its18_ps_pime_otu")
samp_ps_other <- c("its18_ps_filt", "its18_ps_perfect", 
                   "its18_ps_filt_otu", "its18_ps_perfect_otu")

trim_val <- 50
for (i in samp_ps_pime) {
     tmp_get <- get(i)
     tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}

trim_val <- 300
for (i in samp_ps_other) {
     tmp_get <- get(i)
     tmp_df <- prune_taxa(taxa_sums(tmp_get) > trim_val, tmp_get)
     tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim"))
     assign(tmp_name, tmp_df)
     rm(list = ls(pattern = "tmp_"))
}
objects(pattern = "_trim")
its18_ps_filt_trim
its18_ps_pime_trim
its18_ps_perfect_trim
its18_ps_filt_otu_trim
its18_ps_pime_otu_trim
its18_ps_perfect_otu_trim
Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_df <- as.data.frame(t(otu_table(tmp_get)))
     tmp_df <- tmp_df %>% rownames_to_column("Group")
     tmp_path <- file.path("files/anvio/its/")
     write.table(tmp_df, paste(tmp_path, i, "/", "data.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}
objects()

Or export a table of transformed data.

Code
for (i in samp_ps) {
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_trans <- transform_sample_counts(tmp_get, function(x) 1e5 * {x/sum(x)})
     tmp_df <- as.data.frame(t(otu_table(tmp_trans)))
     tmp_df <- tmp_df %>% rownames_to_column("Group")
     #tmp_name <- purrr::map_chr(i, ~ paste0(., "_trim_tab"))
     #assign(tmp_name, tmp_df)
     tmp_path <- file.path("files/anvio/its/")
     write.table(tmp_df, paste(tmp_path, i, "/", "data_trans.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}
objects()

2. Additional Layers for ASVs

Next, we need some additional data about the ASVs to overlay on the visual. This can be anything; however, what I specifically want are the details of the ISA and LEfSe analyses, total reads, and lineage info. I warn you: this code will get ugly, and I urge you to find a better way.

Start with an ASV + lineage table for the ASVs in the new phyloseq object.

Code
for (i in samp_ps) {
     tmp_get_indval <- get(purrr::map_chr(i, ~ paste0(., "_indval_final")))
     tmp_get_indval <- tmp_get_indval %>% dplyr::rename("Group" = "ASV_ID") %>%
                                          dplyr::rename("enrich_indval" = "group") %>%
                                          dplyr::rename("test_indval" = "indval") %>%
                                          dplyr::rename("pval_indval" = "pval")
     tmp_get_indval <- tmp_get_indval[,1:5]

     tmp_get_lefse <- get(purrr::map_chr(i, ~ paste0(., "_lefse_final")))
     tmp_get_lefse <- tmp_get_lefse %>% dplyr::rename("Group" = "ASV_ID") %>%
                                          dplyr::rename("enrich_lefse" = "group") %>%
                                          dplyr::rename("test_lefse" = "lda") %>%
                                          dplyr::rename("pval_lefse" = "pval")
     tmp_get_lefse <- tmp_get_lefse[,1:4]
     
     tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
     tmp_otu_df <- as.data.frame(t(otu_table(tmp_get)))
     tmp_total <- cbind(tmp_otu_df, total_reads = rowSums(tmp_otu_df))
     tmp_total <- rev(tmp_total)[1]   # keep only the last column (total_reads)
     tmp_total <- tmp_total %>% tibble::rownames_to_column("Group")
     tmp_tax_df <- as.data.frame(tax_table(tmp_get))
     tmp_tax_df$ASV_SEQ <- NULL
     tmp_tax_df$ASV_ID <- NULL
     
     tmp_tax_df <- tmp_tax_df %>% tibble::rownames_to_column("Group")
     tmp_add_lay <- dplyr::left_join(tmp_tax_df, tmp_total, by = "Group") %>%
                   dplyr::left_join(., tmp_get_indval, by = "Group") %>%
                   dplyr::left_join(., tmp_get_lefse, by = "Group")
     tmp_add_lay$ASV_ID <- tmp_add_lay$Group
     tmp_add_lay <- tmp_add_lay[, c(1,16,8,12,9:11,13:15,2:7)]  # reorder: Group, ASV_ID, total_reads, freq, ISA results, LEfSe results, taxonomy
     tmp_path <- file.path("files/anvio/its/")
     write.table(tmp_add_lay, paste(tmp_path, i, "/", "additional_layers.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE, na = "")
     rm(list = ls(pattern = "tmp_"))
}
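
Since the block above is admittedly clunky, here is one possible tidier sketch. It is not the version used in this workflow: it leans on taxa_sums() and a single purrr::reduce()/left_join() chain, assumes the *_trim, *_indval_final, and *_lefse_final objects created earlier, writes to a hypothetical alternative file name so nothing above is overwritten, and produces a slightly different column order than the table written above.

Code
# One possible alternative (a sketch, not run above): build the additional-layers
# table with taxa_sums() and a single chain of left_join() calls.
for (i in samp_ps) {
     tmp_ps <- get(paste0(i, "_trim"))
     tmp_tax <- data.frame(tax_table(tmp_ps)) %>%
          tibble::rownames_to_column("Group") %>%
          dplyr::select(-ASV_SEQ, -ASV_ID)
     tmp_tot <- data.frame(total_reads = taxa_sums(tmp_ps)) %>%
          tibble::rownames_to_column("Group")
     tmp_isa <- get(paste0(i, "_indval_final")) %>%
          dplyr::rename(Group = ASV_ID, enrich_indval = group,
                        test_indval = indval, pval_indval = pval) %>%
          dplyr::select(1:5)
     tmp_lefse <- get(paste0(i, "_lefse_final")) %>%
          dplyr::rename(Group = ASV_ID, enrich_lefse = group,
                        test_lefse = lda, pval_lefse = pval) %>%
          dplyr::select(1:4)
     # join everything on the ASV id (Group); ASVs without ISA/LEfSe hits get NA
     tmp_add_lay <- list(tmp_tot, tmp_isa, tmp_lefse, tmp_tax) %>%
          purrr::reduce(dplyr::left_join, by = "Group")
     # hypothetical alternative file name so the table written above is untouched
     write.table(tmp_add_lay, paste0("files/anvio/its/", i, "/additional_layers_alt.txt"),
          quote = FALSE, sep = "\t", row.names = FALSE, na = "")
     rm(list = ls(pattern = "tmp_"))
}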

3. Additional Views for Samples

Now we want some general data about the samples to overlay on the visual. Again, this can be anything. How about a table of alpha diversity metrics? We actually have such a table that was generated way back up the road. Just need to fix the column names.

Note

Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.

Code
metadata_tab <- read.table("files/metadata/tables/metadata.txt",
                           header = TRUE)

# Quick look at the sample data of one phyloseq object to confirm the column layout
tmp_x <- readRDS("files/alpha/rdata/its18_ps_pime.rds")
data.frame(sample_data(tmp_x))
# Drop the metadata columns we do not need for the sample layer table
metadata_tab[,c(2:5)] <- list(NULL)
for (i in samp_ps_all) {
     tmp_get <- get(i)
     tmp_df <- data.frame(sample_data(tmp_get))
     tmp_df <- tmp_df[,c(2:9)]
     tmp_df <- tmp_df %>% tibble::rownames_to_column("id")
     tmp_df <- tmp_df %>% dplyr::rename("no_asvs" = "Observed")
     tmp_rc <- data.frame(readcount(tmp_get))
     tmp_rc <- tmp_rc %>% tibble::rownames_to_column("id")
     tmp_rc <- tmp_rc %>% dplyr::rename("no_reads" = 2)
     #identical(tmp_df$id, tmp_rc$id)
     tmp_merge <- dplyr::left_join(tmp_df, tmp_rc)
     tmp_merge <- tmp_merge[, c(1:6,10,7:9)]
     tmp_final <- dplyr::left_join(tmp_merge, metadata_tab)
     tmp_path <- file.path("files/anvio/its/")
     write.table(tmp_final, paste(tmp_path, i, "/", "additional_views.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
     rm(list = ls(pattern = "tmp_"))
}
rm(metadata_tab)

4. Taxon rank abundance by sample

It turned out this was a little tricky to figure out, but thanks to a nifty block of code written by guoyanzhao on the phyloseq Issues forum, it was a piece of cake. The code can be altered to take any rank. See the post for an explanation.

Anyway, the goal is to sum each taxon at some rank and present that as a bar chart for each sample in the visualization. Anvi’o needs a specific format where each row is a sample and each column is a taxon. Taxon names need the prefix t_<RANK>!; for example, t_class! for the Class rank.

Note

Added to profile.db with anvi-import-misc-data command & --target-data-table layers flag.

Code
pick_rank <- "Order"
pick_rank_l <- "order"
for (i in samp_ps_all) {
# Make the table
    tmp_get <- get(i)
    tmp_glom <- tax_glom(tmp_get, taxrank = pick_rank)
    tmp_melt <- psmelt(tmp_glom)
    tmp_melt[[pick_rank]] <- as.character(tmp_melt[[pick_rank]])
    tmp_abund <- aggregate(Abundance ~ Sample + tmp_melt[[pick_rank]], tmp_melt, FUN = sum)
    colnames(tmp_abund)[2] <- "tax_rank"
    library(reshape2)
    tmp_abund <- as.data.frame(reshape2::dcast(tmp_abund, Sample ~ tax_rank, value.var = "Abundance"))
    tmp_abund <- tibble::remove_rownames(tmp_abund)
    tmp_abund <- tibble::column_to_rownames(tmp_abund, "Sample")
# Reorder table column by sum
    tmp_layers <- tmp_abund[,names(sort(colSums(tmp_abund), decreasing = TRUE))]
# Add the prefix
    tmp_layers <- tmp_layers %>% dplyr::rename_all(function(x) paste0("t_", pick_rank_l,"!", x))
    tmp_layers <- tibble::rownames_to_column (tmp_layers, "taxon")
# save the table
    tmp_path <- file.path("files/anvio/its/")
    write.table(tmp_layers, paste(tmp_path, i, "/", "tax_layers.txt", sep = ""),
            quote = FALSE, sep = "\t", row.names = FALSE)
    rm(list = ls(pattern = "tmp_"))
}

5. Construct Dendrograms

The last piece we need is to generate dendrograms that order the ASVs by their distribution in the samples and the samples by their ASV composition. For this task we will use anvi’o.

anvi-matrix-to-newick data.txt --distance euclidean \
                               --linkage ward \
                               -o asv.tre
anvi-matrix-to-newick data.txt --distance euclidean \
                               --linkage ward \
                               -o sample.tre \
                               --transpose

The first command reads the view data we generated above and uses Euclidean distance and Ward linkage for hierarchical clustering of the ASVs. The second command transposes the view data table and then does the same for the samples. There are several distance metrics and linkage methods available. See the help menu for the command by typing anvi-matrix-to-newick -h. Boom.

Alternatively, we can generate dendrograms using phyloseq::distance and hclust.

Code
pick_dist <- "bray"
pick_clust <- "complete"

for (i in samp_ps) {
# Compute a sample-by-sample distance matrix and cluster the samples
    tmp_get <- get(i)
    tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist, type = "samples")
    tmp_dend <- hclust(tmp_dist, method = pick_clust)
    plot(tmp_dend, hang = -1)
    tmp_tree <- as.phylo(tmp_dend) 
    
    tmp_path <- file.path("files/anvio/its/")
    write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
    rm(list = ls(pattern = "tmp_"))
}
Code
pick_dist_asv <- "euclidean"
pick_clust_asv <- "ward"

for (i in samp_ps) {
# Compute an ASV-by-ASV distance matrix and cluster the ASVs
    tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
    tmp_dist <- phyloseq::distance(physeq = tmp_get, method = pick_dist_asv, type = "taxa")
    tmp_dend <- hclust(tmp_dist, method = pick_clust_asv)
    plot(tmp_dend, hang = -1)
    tmp_tree <- as.phylo(tmp_dend) 
    
    tmp_path <- file.path("files/anvio/its/")
    write.tree(phy = tmp_tree, file = paste(tmp_path, i, "/", "asv_", pick_dist_asv, "_", pick_clust_asv, ".tre", sep = ""))
    rm(list = ls(pattern = "tmp_"))
}
Note

The file asv.tre is loaded with anvi-interactive command & the --tree flag.

The ASV tree is fine as is, but the sample tree needs a special format. Specifically, the tree needs to be in a three-column, tab-delimited table. This way you can add multiple orderings to the same file and view them all in the interactive interface. The table needs to be in this format:

item_name data_type data_value
tree_1 newick ((P01_D00_010_W8A:0.0250122,P05_D00_010_W8C:0.02,..
tree_2 newick ((((((((OTU14195:0.0712585,OTU13230:0.0712585)0:,…
(…) (…) (…)

This is easy to do by hand, but I really need the practice, so I will do it in R. Anvi’o is very particular about formatting. For example, the file cannot end with a blank line, which it will, because anvi’o appended a newline when it generated the initial dendrogram. We need to remove that trailing newline or we will get an error when trying to import the table.

Note

The file sample.tre is added to the profile.db with anvi-import-misc-data command & --target-data-table layer_orders flag.

Code
library(janitor)
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/its/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample.tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)   # strip the newline anvi'o appended
      tmp_item <- "euclidean_ward"               # label the ordering after the distance/linkage used by anvi-matrix-to-newick above
      tmp_type <- "newick"
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_tree)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample.tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}
Code
# FOR TRANSFORMED DATA
library(janitor)
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/its/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_trans.tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)   # strip the newline anvi'o appended
      tmp_item <- "euclidean_ward_trans"         # distinguish the ordering built from the transformed view data
      tmp_type <- "newick"
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_tree)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample_trans.tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}
objects()
Code
# FOR HCLUST TREE
library(janitor)
for (i in samp_ps) {
      tmp_path <- file.path("files/anvio/its/")
      tmp_tree <- read_file(paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""))
      tmp_tree <- gsub("[\r\n]", "", tmp_tree)
      tmp_item <- paste(pick_dist, "_", pick_clust, "_hclust", sep = "")
      tmp_type <- "newick"
      tmp_tab <- data.frame(tmp_item, tmp_type, tmp_tree)
      tmp_tab <- tmp_tab %>% remove_empty("rows")
      colnames(tmp_tab) <- c("item_name", "data_type", "data_value")
      write.table(tmp_tab, paste(tmp_path, i, "/", "sample_", pick_dist, "_", pick_clust, ".tre", sep = ""),
            sep = "\t", quote = FALSE, row.names = FALSE, na = "")
      rm(list = ls(pattern = "tmp_"))
}
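
Before importing, it is worth a quick check that each of these files has the expected header and no blank last line. Here is a small sketch; the helper name and the example path are only for illustration.

Code
# Optional sanity check (a sketch): confirm a layer-orders file has the expected
# three-column header and does not end with a blank line before importing it.
check_layer_orders <- function(path) {
      tmp_lines <- readLines(path)
      stopifnot(
           length(tmp_lines) >= 2,                               # header plus at least one ordering
           identical(strsplit(tmp_lines[1], "\t")[[1]],
                     c("item_name", "data_type", "data_value")), # correct header
           nzchar(tmp_lines[length(tmp_lines)])                  # no blank last line
      )
      invisible(TRUE)
}
check_layer_orders("files/anvio/its/its18_ps_pime/sample.tre")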

6. Make a fasta file

We don’t need to add a fasta file, but it is a nice way to keep everything in one place. Plus, you can do BLAST searches directly in the interface by right clicking on the ASV of interest, so it is nice to have the sequences.

Note

Loaded with anvi-interactive command & --fasta-file flag.

Code
for (i in samp_ps) {
       tmp_get <- get(purrr::map_chr(i, ~ paste0(., "_trim")))
       tmp_tab <- tax_table(tmp_get)
       tmp_tab <- tmp_tab[, 7]                        # column 7 of the tax table holds the ASV sequence
       tmp_df <- data.frame(row.names(tmp_tab), tmp_tab)
       colnames(tmp_df) <- c("ASV_ID", "ASV_SEQ")
       tmp_df$ASV_ID <- sub("^", ">", tmp_df$ASV_ID)  # prepend ">" so each ID becomes a fasta header
       tmp_path <- file.path("files/anvio/its/")

       # writing the ID and sequence columns with a newline separator interleaves fasta headers and sequences
       write.table(tmp_df, paste(tmp_path, i, "/", i, ".fasta", sep = ""),
            sep = "\n", col.names = FALSE, row.names = FALSE,
            quote = FALSE, fileEncoding = "UTF-8")
       rm(list = ls(pattern = "tmp_"))
}
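
As an optional check (not part of the original workflow), one of these files can be read back with Biostrings to confirm it parses as valid fasta. The example path follows the naming used above.

Code
# Optional check: read a fasta file back with Biostrings to confirm it parses.
library(Biostrings)
tmp_check <- readDNAStringSet("files/anvio/its/its18_ps_pime/its18_ps_pime.fasta")
length(tmp_check)      # number of ASVs written
head(names(tmp_check)) # first few ASV IDs
rm(tmp_check)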

Building the Profile Database

Time to put all of these pieces together. This gets a little tricky because we do not have a database to start with: some of these files can be loaded directly in the interactive interface, while others need to be added to a database first. When we fire up the interactive in --manual mode, we must give anvi’o the name of a database and it will create that database for us. Then we can shut down the interactive, add the necessary data files, and start back up.

anvi-interactive --view-data data.txt \
                 --tree asv.tre \
                 --additional-layers additional_layers.txt \
                 --profile-db profile.db \
                 --manual

Now we have a new profile database to which we can add the sample metadata (additional_views.txt) and the sample dendrogram (sample.tre) using the anvi-import-misc-data command. These commands add the tables to the new profile.db. First, kill the interactive.

anvi-import-misc-data additional_views.txt \
                      --pan-or-profile-db profile.db \
                      --target-data-table layers
anvi-import-misc-data sample.tre \
                      --pan-or-profile-db profile.db \
                      --target-data-table layer_orders

One last thing is to get the table with the taxonomy totals by sample (tax_layers.txt) into the profile database. We will run the same command we just used.

anvi-import-misc-data tax_layers.txt \
                      --pan-or-profile-db profile.db \
                      --target-data-table layers

In fact, we could just as easily append the taxonomy total data onto additional_views.txt and import everything in one command. But we didn’t.
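
For the record, a minimal sketch of that one-file approach might look like the following, shown for the its18_ps_pime files written above; the combined file name is made up.

Code
# Minimal sketch (not run here): merge the per-sample metadata and the taxonomy
# totals so a single anvi-import-misc-data call covers both. The combined file
# name is hypothetical.
library(dplyr)
library(readr)
tmp_views <- read_tsv("files/anvio/its/its18_ps_pime/additional_views.txt")
tmp_tax   <- read_tsv("files/anvio/its/its18_ps_pime/tax_layers.txt")
# additional_views.txt keys samples on "id"; tax_layers.txt keys them on "taxon"
tmp_comb <- left_join(tmp_views, tmp_tax, by = c("id" = "taxon"))
write_tsv(tmp_comb, "files/anvio/its/its18_ps_pime/additional_views_plus_tax.txt")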

Interactive Interface

With a populated database in hand, we can now begin modifying the visual by running the interactive command again.

anvi-interactive --view-data data.txt \
                 --tree asv.tre \
                 --additional-layers additional_layers.txt \
                 --profile-db profile.db \
                 --fasta-file anvio.fasta \
                 --manual

Workflow Output

Data products generated in this workflow can be downloaded from figshare.

Next workflow:
8. Metadata Analysis
Previous workflow:
6. Beta Diversity & Dispersion Estimates


Source Code

The source code for this page can be accessed on GitHub by clicking this link.

Data Availability

Data generated in this workflow and the Rdata needed to run the workflow can be accessed on figshare at 10.25573/data.16828249.

Last updated on 2022-06-29 07:29:05 EST

References

Cao, Yang. 2020. “microbiomeMarker: Microbiome Biomarker Analysis.” R Package Version 0.0.1.9000. https://doi.org/10.5281/zenodo.3749415.
Eren, A Murat, Özcan C Esen, Christopher Quince, Joseph H Vineis, Hilary G Morrison, Mitchell L Sogin, and Tom O Delmont. 2015. “Anvi’o: An Advanced Analysis and Visualization Platform for ‘Omics Data.” PeerJ 3: e1319. https://doi.org/10.7717/peerj.1319.
Roberts, David W. 2016. “labdsv: Ordination and Multivariate Analysis for Ecology.” R package. https://cran.r-project.org/web/packages/labdsv/index.html.
Segata, Nicola, Jacques Izard, Levi Waldron, Dirk Gevers, Larisa Miropolsky, Wendy S Garrett, and Curtis Huttenhower. 2011. “Metagenomic Biomarker Discovery and Explanation.” Genome Biology 12 (6): 1–18. https://doi.org/10.1186/gb-2011-12-6-r60.