diff --git a/DESCRIPTION b/DESCRIPTION index f8f9b1b..6b3ea82 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: metabonaut Title: Exploring and Analyzing LC-MS Data -Version: 0.0.4 +Version: 0.0.6 Authors@R: c(person(given = "Philippine", family = "Louail", email = "philippine.louail@eurac.edu", diff --git a/NEWS.md b/NEWS.md index b26e0f0..be8f663 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ # metabonaut 0.0 +## Changes in 0.0.6 + +- Moving The PercentMissing filtering at the end of + the pre-processing steps as it needs to be done + before normalization. +- Addition of collapsing code to improve readability. +- Reduction of table size + ## Changes in 0.0.5 - Require *MsIO* version 0.0.8 to allow reading of stored diff --git a/vignettes/.gitignore b/vignettes/.gitignore deleted file mode 100644 index 075b254..0000000 --- a/vignettes/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/.quarto/ diff --git a/vignettes/a-end-to-end-untargeted-metabolomics.qmd b/vignettes/a-end-to-end-untargeted-metabolomics.qmd index f8d835d..6ea4551 100644 --- a/vignettes/a-end-to-end-untargeted-metabolomics.qmd +++ b/vignettes/a-end-to-end-untargeted-metabolomics.qmd @@ -19,6 +19,12 @@ library(BiocStyle) knitr::opts_knit$set(root.dir = './') ``` +```{css, echo = FALSE} +.verysmall .table{ + font-size: 8px; +} +``` + # Introduction The present workflow describes all steps for the analysis of an LC-MS/MS @@ -120,7 +126,6 @@ if (.Platform$OS.type == "unix") { } else { register(SnowParam(2)) } - ``` # Data organisation @@ -290,11 +295,15 @@ After about 240 seconds no signal seems to be measured. Thus, we filter the data removing that part as well as the first 10 seconds measured in the LC run. ```{r filter-rt} -#| fig-cap: "Figure 6. BPC after filtering retention time." #' Filter the data based on retention time lcms1 <- filterRt(lcms1, c(10, 240)) bpc <- chromatogram(lcms1, aggregationFun = "max") +``` +```{r bpc2, echo=TRUE} +#| fig-cap: "Figure 6. BPC after filtering retention time." +#| code-fold: true +#| code-summary: "Show the code" #' Plot after filtering plot(bpc, col = paste0(col_sample, 80), main = "BPC after filtering retention time", lwd = 1.5) @@ -393,6 +402,8 @@ We below plot the EICs for isotope labeled cystine and methionine. ```{r} #| fig-cap: "Figure 9. EIC of cystine and methionine." #| fig-align: center +#| code-fold: true +#| code-summary: "Show the code" #' Extract the two IS from the chromatogram object. eic_cystine <- eic_is["cystine_13C_15N"] eic_met <- eic_is["methionine_13C_15N"] @@ -554,8 +565,11 @@ With the customized parameters, a chromatographic peak was detected in each sample. Below, we use the `plot()` function on the EICs to visualize these results. -```{r echo=FALSE} +```{r} #| fig-cap: "Figure 11. Chromatographic peak detection on EICs." +#| fig-align: center +#| code-fold: true +#| code-summary: "Show the code" #' Plot test chromatogram par(mfrow = c(1, 2)) plot(cystine_test, main = fData(cystine_test)$name, @@ -606,6 +620,8 @@ chromatographic peak detection results. ```{r plot-new-eics, echo=FALSE} #| fig-cap: "Figure 12. Chromatographic peak detection on EICs after processing." +#| code-fold: true +#| code-summary: "Show the code" #' Test if we find peaks for Cystine and Methionine again eic_cystine <- eic_is["cystine_13C_15N", ] eic_met <- eic_is["methionine_13C_15N", ] @@ -639,8 +655,10 @@ this issue, the `refineChromPeaks()` function is utilized, in conjunction with Below we show some examples of *CentWave* peak detection artifacts. These examples are pre-selected to illustrate the necessity of the next step: -```{r echo=FALSE} +```{r} #| fig-cap: "Figure 13. Examples of CentWave peak detection artifacts." +#| code-fold: true +#| code-summary: "Show the code" #' Extract m/z-rt regions for selected peaks mz_rt <- cbind(rtmin = c(155.2120, 181.71800), rtmax = c(201.0710, 228.13500), @@ -703,6 +721,8 @@ chromPeakData(lcms1)$merged |> Before proceeding with the next preprocessing step it is generally suggested to evaluate the results of the chromatographic peak detection on EICs of e.g. internal standards or other compounds/ions known to be present in the samples. +Here we update our EICs objects for the internal standards with the results of +the chromatographic peak detection and refinement. ```{r, message=FALSE} eics_is_chrompeaks <- eic_is @@ -723,10 +743,13 @@ these numbers in a table. ```{r} #| tbl-cap: "Table 5. Samples and number of identified chromatographic peaks." +#| code-fold: true +#| code-summary: "Show the code" #' Count the number of peaks per sample and summarize them in a table. data.frame(sample_name = sampleData(lcms1)$sample_name, peak_count = as.integer(table(chromPeaks(lcms1)[, "sample"]))) |> - kable(format = "pipe") + t() |> + kable(format = "pipe") ``` A similar number of chromatographic peaks was identified within the various @@ -750,6 +773,9 @@ QC samples. ```{r} #| fig-cap: "Figure 15. BPC of QC samples." +#| fig-align: center +#| code-fold: true +#| code-summary: "Show the code" #' Get QC samples QC_samples <- sampleData(lcms1)$phenotype == "QC" @@ -930,6 +956,9 @@ lcms1 <- applyAdjustedRtime(lcms1) ```{r bpc-before-and-after1} #| fig-cap: "Figure 18. BPC before and after alignment." +#| fig-align: center +#| code-fold: true +#| code-summary: "Show the code" #' Plot the BPC before and after alignment par(mfrow = c(2, 1), mar = c(2, 1, 1, 0.5)) chromatogram(lcms1_raw, aggregationFun = "max", chromPeaks = "none") |> @@ -954,14 +983,13 @@ internal standards. We thus below first extract the ion chromatograms after alignment. ```{r, message=FALSE} -#' Store the EICs before alignment -eics_is_refined <- eic_is - +#| code-fold: true +#| code-summary: "Show the code" #' Update the EICs eic_is <- chromatogram(lcms1, rt = as.matrix(intern_standard[, c("rtmin", "rtmax")]), mz = as.matrix(intern_standard[, c("mzmin", "mzmax")])) -fData(eic_is) <- fData(eics_is_refined) +fData(eic_is) <- fdata #' Extract the EICs for the test ions eic_cystine <- eic_is["cystine_13C_15N"] @@ -973,6 +1001,9 @@ before and after alignment for both the isotope labeled cystine and methionine. ```{r specific-ion-before-and-after1} #| fig-cap: "Figure 19. EICs of cystine and methionine before and after alignment." +#| fig-align: center +#| code-fold: true +#| code-summary: "Show the code" par(mfrow = c(2, 2), mar = c(4, 4.5, 2, 1)) old_eic_cystine <- eics_is_refined["cystine_13C_15N"] @@ -1094,12 +1125,16 @@ the apex position of chromatographic peaks in the different samples (y-axis), along retention time (x-axis) in the lower panel. ```{r} -#| fig-cap: "Figure 21. Initial correspondence analysis, Cystine." #' Default parameter for the grouping and apply them to the test ions BPC param <- PeakDensityParam(sampleGroups = sampleData(lcms1)$phenotype, bw = 30) param +``` +```{r} +#| fig-cap: "Figure 21. Initial correspondence analysis, Cystine." +#| code-fold: true +#| code-summary: "Show the code" plotChromPeakDensity( eic_cystine, param = param, col = paste0(col_sample, "80"), @@ -1110,6 +1145,8 @@ plotChromPeakDensity( ```{r} #| fig-cap: "Figure 22. Initial correspondence analysis, Methionine." +#| code-fold: true +#| code-summary: "Show the code" plotChromPeakDensity(eic_met, param = param, col = paste0(col_sample, "80"), peakCol = col_sample[chromPeaks(eic_met)[, "sample"]], @@ -1126,6 +1163,8 @@ this parameter to 1.8 and evaluate its impact. ```{r} #| fig-cap: "Figure 23. Correspondence analysis with optimized parameters, Cystine." +#| code-fold: true +#| code-summary: "Show the code" #' Updating parameters param <- PeakDensityParam(sampleGroups = sampleData(lcms1)$phenotype, bw = 1.8) @@ -1139,6 +1178,8 @@ plotChromPeakDensity( ```{r} #| fig-cap: "Figure 24. Correspondence analysis with optimized parameters, Methionine." +#| code-fold: true +#| code-summary: "Show the code" plotChromPeakDensity(eic_met, param = param, col = paste0(col_sample, "80"), peakCol = col_sample[chromPeaks(eic_met)[, "sample"]], @@ -1181,6 +1222,8 @@ show the actual correspondence results, we set `simulate = FALSE` for the ```{r, message=FALSE} #| fig-cap: "Figure 25. Correspondence analysis results, Methionine." +#| fig-align: center +#| code-fold: true #' Extract chromatogram for an m/z similar to the one of the labeled methionine chr_test <- chromatogram(lcms1, mz = as.matrix(intern_standard["methionine_13C_15N", @@ -1265,6 +1308,8 @@ samples: ```{r, message=FALSE} #| fig-cap: "Figure 26. Examples of chromatographic peaks with missing values." +#| code-fold: true +#| code-summary: "Show the code" ftidx <- which(is.na(rowSums(featureValues(lcms1)))) fts <- rownames(featureDefinitions(lcms1))[ftidx] farea <- featureArea(lcms1, features = fts[1:2]) @@ -1304,6 +1349,8 @@ chromatographic peak signal. Let's look at our previously missing values again: ```{r echo=FALSE, message=FALSE} #| fig-cap: "Figure 27. Examples of chromatographic peaks with missing values after gap-filling." +#| code-fold: true +#| code-summary: "Show the code" #' Extract EICs again and plot them chromatogram(lcms1[c(2, 3)], mz = farea[, c("mzmin", "mzmax")], @@ -1334,6 +1381,9 @@ Then, we calculate the row averages of both of these matrices and plot them against each other. ```{r Detected-vs-filled-signal1} +#| fig-cap: "Figure 28. Detected vs. filled-in signal." +#| code-fold: true +#| code-summary: "Show the code" #' Get only detected signal in QC samples vals_detect <- featureValues(lcms1, filled = FALSE)[, QC_samples] @@ -1375,6 +1425,40 @@ The linear regression line has a slope of `r round(coef(l)[2], 2)` and an intercept of `r round(coef(l)[1], 2)`. This indicates that the filled-in signal is on average `r round(coef(l)[2], 2)` times higher than the detected signal. +## Filtering Features: Missing values + +We now restrict the data set to features for which a chromatographic peak was +detected in at least 2/3 of samples of at least one of the study samples groups. +This ensures the statistical tests carried out later on the study samples are +being performed on *reliable* signal. Also, with this filter we remove features +that were mostly detected in QC samples, but not the study samples. Such filter +can be performed with `filterFeatures()` function from the `r Biocpkg("xcms")` +package with the `PercentMissingFilter` setting. The parameters of this filer: + +- `threshold`: defines the maximal acceptable percentage of samples with + missing value(s) in at least one of the sample groups defined by parameter + `f`. +- `f`: a factor defining the sample groups. By replacing the `"QC"` sample + group with `NA` in parameter `f` we exclude the QC samples from the + evaluation and consider only the study samples. With `threshold = 40` we + keep only features for which a peak was detected in 2 out of the 3 samples + in one of the sample groups. + +To consider detected chromatographic peaks per sample, we apply the filter on +the`"raw"` assay of our result object, that contains abundance values only for +detected chromatographic peaks (prior gap-filling). + +```{r} +#' Limit features to those with at least two detected peaks in one study group. +#' Setting the value for QC samples to NA excludes QC samples from the +#' calculation. +f <- res$phenotype +f[f == "QC"] <- NA +f <- as.factor(f) +res <- filterFeatures(res, PercentMissingFilter(f = f, threshold = 40), + assay = "raw") +``` + ## Preprocessing results The final results of the LC-MS data preprocessing are stored within the @@ -1550,10 +1634,6 @@ Below we impute missing values with this approach and add the resulting data matrix as a new *assay* to our result object. ```{r} -#' Load preprocessing results -## load("SumExp.RData") -## loadResults(RDataParam("data.RData")) - #' Impute missing values using an uniform distribution na_unidis <- function(z) { na <- is.na(z) @@ -1583,6 +1663,8 @@ remove dependency on absolute abundances. ```{r unsupervised-checks1, fig.height=8, fig.width=6} #| fig-cap: "Figure 28. PCA of the data coloured by phenotypes." +#| code-fold: true +#| code-summary: "Show the code" #' Log2 transform and scale data vals <- assay(res, "raw_filled_imputed") |> log2() |> @@ -1620,6 +1702,8 @@ distribution of the log2 transformed feature abundances. ```{r counts1} #| fig-cap: "Figure 29. Number of detected peaks and feature abundances." +#| code-fold: true +#| code-summary: "Show the code" layout(mat = matrix(1:3, ncol = 1), height = c(0.2, 0.2, 0.8)) par(mar = c(0.2, 4.5, 0.2, 3)) @@ -1666,6 +1750,8 @@ groups. ```{r rla-plot-raw-and-filled1} #| fig-cap: "Figure 30. RLA plot for the raw data and filled data." +#| code-fold: true +#| code-summary: "Show the code" par(mfrow = c(1, 1), mar = c(3.5, 4.5, 2.5, 1)) boxplot(MsCoreUtils::rowRla(assay(res, "raw_filled"), f = res$phenotype, transform = "log2"), @@ -1742,6 +1828,8 @@ behavior between groups. ```{r, fig.height=8, fig.width=6} #| fig-cap: "Figure 31. PC1 and PC2 of the data before and after normalization." +#| code-fold: true +#| code-summary: "Show the code" #' Data before normalization vals_st <- cbind(vals, phenotype = res$phenotype) pca_raw <- autoplot(pca_res, data = vals_st, @@ -1768,6 +1856,8 @@ lower after normalization (see below). ```{r, fig.height=8, fig.width=6} #| fig-cap: "Figure 32. PC3 and PC4 of the data before and after normalization." +#| code-fold: true +#| code-summary: "Show the code" pca_raw <- autoplot(pca_res, data = vals_st , colour = 'phenotype', x = 3, y = 4, scale = 0) + scale_color_manual(values = col_phenotype) @@ -1788,8 +1878,10 @@ correct for biological variance and only technical. We compare the RLA plots before and after between-sample normalization to evaluate its impact on the data. -```{r rla-plot-after-norm2} +```{r rla-plot-after-norm2, fig.height=8, fig.width=6}} #| fig-cap: "Figure 33. RLA plot before and after normalization." +#| code-fold: true +#| code-summary: "Show the code" par(mfrow = c(2, 1), mar = c(3.5, 4.5, 2.5, 1)) boxplot(rowRla(assay(res, "raw_filled"), group = res$phenotype), @@ -1828,6 +1920,8 @@ deviation instead of the standard deviation. ```{r include=TRUE, results = "asis"} #| tbl-cap: "Table 6. Distribution of CV values across samples for the raw and normalized data." +#| code-fold: true +#| code-summary: "Show the code" #' Calculate the CV values index_study <- res$phenotype %in% c("CTR", "CVD") index_QC <- res$phenotype == "QC" @@ -1902,38 +1996,6 @@ filtering is somewhat limited. For a comprehensive description and guidelines for data filtering in untargeted metabolomic studies, please refer to [@broadhurst_guidelines_2018]. -We first restrict the data set to features for which a chromatographic peak was -detected in at least 2/3 of samples of at least one of the study samples groups. -This ensures the statistical tests carried out later on the study samples are -being performed on *reliable* signal. Also, with this filter we remove features -that were mostly detected in QC samples, but not the study samples. Such filter -can be performed with `filterFeatures()` function from the `r Biocpkg("xcms")` -package with the `PercentMissingFilter` setting. The parameters of this filer: - -- `threshold`: defines the maximal acceptable percentage of samples with - missing value(s) in at least one of the sample groups defined by parameter - `f`. -- `f`: a factor defining the sample groups. By replacing the `"QC"` sample - group with `NA` in parameter `f` we exclude the QC samples from the - evaluation and consider only the study samples. With `threshold = 40` we - keep only features for which a peak was detected in 2 out of the 3 samples - in one of the sample groups. - -To consider detected chromatographic peaks per sample, we apply the filter on -the`"raw"` assay of our result object, that contains abundance values only for -detected chromatographic peaks (prior gap-filling). - -```{r} -#' Limit features to those with at least two detected peaks in one study group. -#' Setting the value for QC samples to NA excludes QC samples from the -#' calculation. -f <- res$phenotype -f[f == "QC"] <- NA -f <- as.factor(f) -res <- filterFeatures(res, PercentMissingFilter(f = f, threshold = 40), - assay = "raw") -``` - Following the guidelines stated above we decided to still use the QC samples for pre-filtering, on the basis that they represent similar bio-fluids to our study samples, and thus, we anticipate observing relatively similar metabolites @@ -2020,6 +2082,8 @@ a PCA (after excluding the QC samples to avoid them influencing the results). ```{r} #| fig-cap: "Figure 34. PCA of the data after normalization and quality control." +#| code-fold: true +#| code-summary: "Show the code" #' Define the colors for the plot col_sample <- col_phenotype[res$phenotype] @@ -2042,6 +2106,8 @@ be explained by the other available variable of our study, i.e., age: ```{r} #| fig-cap: "Figure 35. PCA colored by age of the data after normalization and quality control." +#| code-fold: true +#| code-summary: "Show the code" #' Add age to the PCA plot vals_st <- cbind(vals, age = res$age) autoplot(pca_res, data = vals_st , colour = 'age', scale = 0) @@ -2117,6 +2183,8 @@ p-values. ```{r histogram} #| fig-cap: "Figure 36. Distribution of raw (left) and adjusted p-values (right)." +#| code-fold: true +#| code-summary: "Show the code" #' Plot the distribution of p-values par(mfrow = c(1, 2)) hist(rowData(res)$pvalue.CVD, breaks = 64, xlab = "p value", @@ -2144,6 +2212,8 @@ a blue color in the plot below. ```{r volcano} #| fig-cap: "Figure 37. Volcano plot showing the analysis results." +#| code-fold: true +#| code-summary: "Show the code" #' Plot volcano plot of the statistical results par(mfrow = c(1, 1), mar = c(5, 5, 5, 1)) plot(rowData(res)$coef.CVD, -log10(rowData(res)$adjp.CVD), @@ -2170,6 +2240,8 @@ in the table below. ```{r result-table} #| tbl-cap: "Table 7. Features with significant differences in abundances." +#| code-fold: true +#| code-summary: "Show the code" # Table of significant features tab <- rowData(res)[rowData(res)$significant.CVD, c("mzmed", "rtmed", "coef.CVD", "adjp.CVD", @@ -2406,6 +2478,8 @@ lcms2 <- readMsObject(MsExperiment(), ```{r, message=FALSE} #| tbl-cap: "Table 10. Samples from the LC-MS/MS data set." +#| code-fold: true +#| code-summary: "Show the code" #adjust sampleData colnames(sampleData(lcms2)) <- c("sample_name", "derived_spectra_data_file", "metabolite_asssignment_file", @@ -2706,13 +2780,21 @@ the highest scoring MS2 spectra was recorded and create a mirror plot comparing this MS2 spectra to the reference fragment spectra for caffeine. ```{r fig.height=5, fig.width=5} +#| fig-cap: "Figure 39. Extracted ion chromatograms and MS2 spectra for the annotated feature." +#| code-fold: true +#| code-summary: "Show the code" col_sample <- col_phenotype[sampleData(lcms1)$phenotype] #' Extract and plot EIC for the annotated feature eic <- featureChromatograms(lcms1, features = ms2_mtch_res$feature_id[1]) plot(eic, col = col_sample, peakCol = col_sample[chromPeaks(eic)[, "sample"]], peakBg = paste0(col_sample[chromPeaks(eic)[, "sample"]], 20)) legend("topright", col = col_phenotype, legend = names(col_phenotype), lty = 1) +``` +```{r} +#| fig-cap: "Figure 40. MS2 spectra for the annotated feature." +#| code-fold: true +#| code-summary: "Show the code" #' Identify the best matching query-target spectra pair idx <- which.max(ms2_mtch_res$score) @@ -2811,8 +2893,6 @@ sessionInfo() Thanks to Steffen Neumann for his continuous work to develop and maintain the xcms software... -# Additional informations - ```{r eval=FALSE, include=FALSE} plot(eics_is_noprocess) diff --git a/vignettes/alignment-to-external-dataset.qmd b/vignettes/alignment-to-external-dataset.qmd index 13bb2d4..ebafb0e 100644 --- a/vignettes/alignment-to-external-dataset.qmd +++ b/vignettes/alignment-to-external-dataset.qmd @@ -16,6 +16,14 @@ library(quarto) knitr::opts_knit$set(root.dir = './') ``` + +```{css, echo = FALSE} +.verysmall .table{ + font-size: 8px; +} +``` + + # Introduction In certain experiments, aligning datasets recorded at different times is @@ -51,7 +59,6 @@ if (.Platform$OS.type == "unix") { } else { register(SnowParam(2)) } - ``` ## Load preprocessed LC-MS object @@ -65,8 +72,7 @@ framework and we below load the object using the `readMsObject()` function providing the path to the stored data. The import function also takes care of eventually retrieving missing MS data files from the MetaboLights repository. - -```{r} +```{r message=FALSE} lcms1 <- readMsObject( XcmsExperiment(), AlabasterParam(system.file("extdata", "preprocessed_lcms1", @@ -96,7 +102,7 @@ We will adjust the `sampleData()` of the LC-MS/MS object to make it easier to access: ```{r} -#| tbl-cap: "Table 10. Samples from the LC-MS/MS data set." +#| tbl-cap: "Table 1. Samples from the LC-MS/MS data set." #| code-fold: true #| code-summary: "Show the code" #adjust sampleData @@ -142,7 +148,7 @@ lcms2 <- filterRt(lcms2, range(rtime(lcms1))) To evaluate retention time shifts, we'll plot the base peak chromatogram (BPC): -```{r} +```{r, message=FALSE} idx_A <- which(sampleData(lcms1)$sample_name == "A") idx_E <- which(sampleData(lcms1)$sample_name == "E") bpc1 <-chromatogram(lcms1[c(idx_A,idx_E)], aggregationFun = "max", @@ -153,6 +159,9 @@ bpc2 <- chromatogram(lcms2, aggregationFun = "max", msLevel = 1) Compare run1 sample A with run2 sample A ```{r ini-bpc1} +#| tbl-cap: "Figure 1. BPC for sample A LC-MS and LC-MS/MS." +#| code-fold: true +#| code-summary: "Show the code" plot(bpc1[1, 1], col = "#00000080", main = "BPC sample A LC-MS vs A LC-MS/MS", lwd = 1.5, peakType = "none") grid() @@ -164,6 +173,9 @@ legend("topleft", col = c("#00000080", "#0000ff80"), Similarly, compare the BPC for sample E: ```{r, ini-bpc2} +#| tbl-cap: "Figure 2. BPC for sample E LC-MS and LC-MS/MS." +#| code-fold: true +#| code-summary: "Show the code" plot(bpc1[1, 2], col = "#00000080", main = "BPC sample E LC-MS vs E LC-MS/MS", lwd = 1.5, peakType = "none") grid() @@ -212,7 +224,7 @@ the experimental data to the lamas, fitting a model based on this match to adjust their retention times and minimize differences between the two datasets. Now we can define our parameter object `LamaParama` to prepare for the -alignment. Parameters such as `tolerance`, `toleranceRt`, and `ppm` relate to +alignment. Parameters such as `tolerance`, `toleranceRt`, and `ppm` relate to the matching between chromatographic peaks and lamas. Other parameters are related to the type of fitting generated between these data points. More details on each parameter and the overall method can be found by searching @@ -248,7 +260,6 @@ lcms2 <- applyAdjustedRtime(lcms2) We extract the base peak chromatogram (BPC) of our aligned object: ```{r, message=FALSE} -#' evaluate the results with BPC bpc2_adj <- chromatogram(lcms2, aggregationFun = "max", msLevel = 1) ``` @@ -260,6 +271,9 @@ comparing the alignment of the reference dataset (in black) with our LC-MS data before (in red) and after (in blue) the adjustment. ```{r, fig.height=6, fig.width=6} +#| tbl-cap: "Figure 3. BPC for sample A before and after alignment." +#| code-fold: true +#| code-summary: "Show the code" #' BPC of sample A par(mfrow = c(2, 1), mar = c(2.5, 2.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0)) plot(bpc1[1, 1], col = "#00000080", main = "Before Alignment", lwd = 1.5, @@ -280,6 +294,9 @@ points(rtime(bpc2_adj[1,1]), intensity(bpc2_adj[1,1]), ``` ```{r, fig.height=6, fig.width=6} +#| tbl-cap: "Figure 4. BPC for sample B before and after alignment." +#| code-fold: true +#| code-summary: "Show the code" #' BPC of sample B par(mfrow = c(2, 1), mar = c(2.5, 2.5, 2.5, 0.5), mgp = c(1.5, 0.5, 0)) plot(bpc1[1, 2], col = "#00000080", main = "Before Alignment", lwd = 1.5, @@ -307,6 +324,9 @@ anchor peaks (lamas) for Sample A. The red vertical lines represent the positions of these matched peaks. ```{r, message = FALSE, warning = FALSE} +#| tbl-cap: "Figure 5. Distribution of CP matched to Lamas for sample A." +#| code-fold: true +#| code-summary: "Show the code" #' BPC of the first sample with matches to lamas overlay par(mfrow = c(1, 1)) plot(bpc1[1, 1], col = "#00000080", main = "Distribution CP matched to Lamas", @@ -325,6 +345,7 @@ between the chromatographic peaks in our LC-MS data and the anchor peaks (Lamas) both before and after the alignment. ```{r} +#| tbl-cap: "Figure 6. Distance to Anchor Peaks: Before vs. After Alignment." # Extract data for sample 3 directly ref_obs_sample_1 <- ref_vs_obs[["1"]] @@ -371,7 +392,12 @@ summary$Matched_peaks / summary$Total_peaks * 100 #' Access the information on the model of for the first file summary$Model_summary[[1]] +``` +```{r} +#| tbl-cap: "Figure 7. Model fit for the first file." +#| code-fold: true +#| code-summary: "Show the code" #' Plot obs vs. lcms1 with fitting line plot(lp, index = 1L, main = "ChromPeaks versus Lamas for sample A", colPoint = "red") @@ -393,4 +419,4 @@ Ultimately, aligning chromatographic data ensures that subsequent analyses, such as feature extraction and statistical comparisons, are based on consistent time points, improving data quality and reliability. This tutorial outlined an end-to-end workflow that can be adapted to various LC-MS-based metabolomics -studies, helping researchers manage retention time variation effectively. \ No newline at end of file +studies, helping researchers manage retention time variation effectively. diff --git a/vignettes/dataset-investigation.qmd b/vignettes/dataset-investigation.qmd index a204473..27662cb 100644 --- a/vignettes/dataset-investigation.qmd +++ b/vignettes/dataset-investigation.qmd @@ -1,6 +1,7 @@ --- title: "Dataset investigation: What to do when you get your data" format: html +code_folding: hide editor: visual minimal: true date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' @@ -15,6 +16,7 @@ vignette: > ```{r setup, include=FALSE} library(knitr) library(quarto) +library(BiocStyle) knitr::opts_knit$set(root.dir = './') ``` @@ -71,3 +73,461 @@ separation was achieved using hydrophilic interaction liquid chromatography quality. - Compare pool lc-ms and pool lc-ms/ms and show that we have better separation on the second run. + +# Package + +```{r packages-used, message=FALSE, warning=FALSE} +## Data Import and handling +library(MsExperiment) +library(MsIO) +library(MsBackendMetaboLights) + +## Preprocessing of LC-MS data +library(xcms) +library(Spectra) +library(MetaboCoreUtils) + +## Visualisation +library(pander) +library(RColorBrewer) +library(pheatmap) +``` + +# MS1 level data + +We first load the raw data from MetaboLights: + +```{r import, message=FALSE, warning=FALSE, class.source="fold-show"} +param <- MetaboLightsParam(mtblsId = "MTBLS8735", + assayName = paste0("a_MTBLS8735_LC-MS_positive_", + "hilic_metabolite_profiling.txt"), + filePattern = ".mzML") + +lcms1 <- readMsObject(MsExperiment(), + param, + keepOntology = FALSE, + keepProtocol = FALSE, + simplify = TRUE) +``` + +we set up parallel processing: + +```{r par-process, message=FALSE, warning=FALSE} +#' Set up parallel processing using 2 cores +if (.Platform$OS.type == "unix") { + register(MulticoreParam(2)) +} else { + register(SnowParam(2)) +} +``` + +We update the metadata to make the column names easier to access: + +```{r update-phenodata} +#| tbl-cap: "Table 2. Simplified sample data." +# Let's rename the column for easier access +colnames(sampleData(lcms1)) <- c("sample_name", "derived_spectra_data_file", + "metabolite_asssignment_file", + "source_name", + "organism", + "blood_sample_type", + "sample_type", "age", "unit", "phenotype") + +# Add "QC" to the phenotype of the QC samples +sampleData(lcms1)$phenotype[sampleData(lcms1)$sample_name == "POOL"] <- "QC" +sampleData(lcms1)$sample_name[sampleData(lcms1)$sample_name == "POOL" ] <- c("POOL1", "POOL2", "POOL3", "POOL4") + +# Add injection index column +sampleData(lcms1)$injection_index <- seq_len(nrow(sampleData(lcms1))) + +#let's look at the updated sample data +sampleData(lcms1)[, c("derived_spectra_data_file", + "phenotype", "sample_name", "age", + "injection_index")] |> + kable(format = "pipe") +``` + +```{r define-colors} +#' Define colors for the different phenotypes +col_phenotype <- brewer.pal(9, name = "Set1")[c(9, 5, 4)] +names(col_phenotype) <- c("QC", # grey + "CVD", # orange + "CTR") # purple +col_sample <- col_phenotype[sampleData(lcms1)$phenotype] +``` + +## Spectra exploration + +Quick reminder that we access the spectra data as below: + +```{r} +#' Access Spectra Object +spectra(lcms1) +``` + +One of the first check that should be done is evalutating the number of spectra +per sample. Below we summarize the number of spectra and their respective MS +level (extracted with the `msLevel()` function). The `fromFile()` function +returns for each spectrum the index of its sample (data file) and can thus be +used to split the information (MS level in this case) by sample to further +summarize using the base R `table()` function and combine the result into a +matrix. + +```{r} +#' Count the number of spectra with a specific MS level per file. +spectra(lcms1) |> + msLevel() |> + split(fromFile(lcms1)) |> + lapply(table) |> + do.call(what = cbind) +``` + +The present data set thus contains only MS1 data, which is ideal for +quantification of the signal. A second (LC-MS/MS) data set also with fragment +(MS2) spectra of the same samples will be used later on in the workflow. We also +cannot see any large difference in number of spectra between the samples, which +is a good sign that the data is of good quality. If one sample had a +significantly lower number of spectra, it would be a sign of a potential issue +with the sample. + +Data obtained from LC-MS experiments are typically analyzed along the retention +time axis, while MS data is organized by spectrum, orthogonal to the retention +time axis. As another example, we below determine the retention time range for +the entire data set. + +```{r} +#' Retention time range for entire dataset +spectra(lcms1) |> + rtime() |> + range() +``` + +# Data visualization and general quality assessment + +Effective visualization is paramount for inspecting and assessing the quality of +MS data. For a general overview of our LC-MS data, we can: + +- Combine all mass peaks from all (MS1) spectra of a sample into a single + spectrum in which each mass peak then represents the maximum signal of all + mass peaks with a similar *m/z*. This spectrum might then be called Base + Peak Spectrum (BPS), providing information on the most abundant ions of a + sample. +- Aggregate mass peak intensities for each spectrum, resulting in the Base + Peak Chromatogram (BPC). The BPC shows the highest measured intensity for + each distinct retention time (hence spectrum) and is thus orthogonal to the + BPS. +- Sum the mass peak intensities for each spectrum to create a Total Ion + Chromatogram (TIC). +- Compare the BPS of all samples in an experiment to evaluate similarity of + their ion content. +- Compare the BPC of all samples in an experiment to identify samples with + similar or dissimilar chromatographic signal. + +In addition to such general data evaluation and visualization, it is also +crucial to investigate specific signal of e.g. internal standards or +compounds/ions known to be present in the samples. By providing a reliable +reference, internal standards help achieve consistent and accurate analytical +results. + +## Spectra Data Visualization: BPS + +The BPS collapses data in the retention time dimension and reveals the most +prevalent ions present in each of the samples, creation of such BPS is however +not straightforward. Mass peaks, even if representing signals from the same ion, +will never have identical *m/z* values in consecutive spectra due to the +measurement error/resolution of the instrument. + +Below we use the `combineSpectra` function to combine all spectra from one file +(defined using parameter `f = fromFile(data)`) into a single spectrum. All mass +peaks with a difference in *m/z* value smaller than 3 parts-per-million (ppm) +are combined into one mass peak, with an intensity representing the maximum of +all such grouped mass peaks. To reduce memory requirement, we in addition first +*bin* each spectrum combining all mass peaks within a spectrum, aggregating mass +peaks into bins with 0.01 *m/z* width. In case of large datasets, it is also +recommended to set the `processingChunkSize()` parameter of the `MsExperiment` +object to a finite value (default is `Inf`) causing the data to be processed +(and loaded into memory) in chunks of `processingChunkSize()` spectra. This can +reduce memory demand and speed up the process. + +```{r} +#' Setting the chunksize +chunksize <- 1000 +processingChunkSize(spectra(lcms1)) <- chunksize +``` + +We can now generate BPS for each sample and `plot()` them. + +```{r bps, message = FALSE, warning = FALSE} +#| fig-cap: "Figure 1. BPS of all samples." +#' Combining all spectra per file into a single spectrum +bps <- spectra(lcms1) |> + bin(binSize = 0.01) |> + combineSpectra(f = fromFile(lcms1), intensityFun = max, ppm = 3) + +#' Plot the base peak spectra +par(mar = c(2, 1, 1, 1)) +plotSpectra(bps, main= "") +``` + +Here, there is observable overlap in ion content between the files, particularly +around 300 *m/z* and 700 *m/z*. There are however also differences between sets +of samples. In particular, BPS 1, 4, 7 and 10 (counting row-wise from left to +right) seem different than the others. In fact, these four BPS are from QC +samples, and the remaining six from the study samples. The observed differences +might be explained by the fact that the QC samples are pools of serum samples +from a different cohort, while the study samples represent plasma samples, from +a different sample collection. + +Next to the visual inspection above, we can also calculate and express the +similarity between the BPS with a heatmap. Below we use the `compareSpectra()` +function to calculate pairwise similarities between all BPS and use then the +`pheatmap()` function from the *pheatmap* package to cluster and visualize this +result. + +```{r compare-spectra, message = FALSE, warning = FALSE} +#| fig-cap: "Figure 2. Heatmap of MS signals similarities." +#' Calculate similarities between BPS +sim_matrix <- compareSpectra(bps) + +#' Add sample names as rownames and colnames +rownames(sim_matrix) <- colnames(sim_matrix) <- sampleData(lcms1)$sample_name +ann <- data.frame(phenotype = sampleData(lcms1)[, "phenotype"]) +rownames(ann) <- rownames(sim_matrix) + +#' Plot the heatmap +pheatmap(sim_matrix, annotation_col = ann, + annotation_colors = list(phenotype = col_phenotype)) +``` + +We get a first glance at how our different samples distribute in terms of +similarity. The heatmap confirms the observations made with the BPS, showing +distinct clusters for the QCs and the study samples, owing to the different +matrices and sample collections. + +It is also strongly recommended to delve deeper into the data by exploring it in +more detail. This can be accomplished by carefully assessing our data and +extracting spectra or regions of interest for further examination. In the next +chunk, we will look at how to extract information for a specific spectrum from +distinct samples. + +```{r} +#| fig-cap: "Figure 3. Intensity and m/z values of the 125th spectrum of two CTR samples." +#| out.width: 90% +#| fig-align: center +#' Accessing a single spectrum - comparing with QC +par(mfrow = c(1,2), mar = c(2, 2, 2, 2)) +spec1 <- spectra(lcms1[1])[125] +spec2 <- spectra(lcms1[3])[125] +plotSpectra(spec1, main = "QC sample") +plotSpectra(spec2, main = "CTR sample") +``` + +The significant dissimilarities in peak distribution and intensity confirm the +difference in composition between QCs and study samples. We next compare a full +MS1 spectrum from a CVD and a CTR sample. + +```{r} +#| fig-cap: "Figure 4. Intensity and m/z values of the 125th spectrum of one CVD and one CTR sample." +#| out.width: 90% +#| fig-align: center +#' Accessing a single spectrum - comparing CVD and CTR +par(mfrow = c(1,2), mar = c(2, 2, 2, 2)) +spec1 <- spectra(lcms1[2])[125] +spec2 <- spectra(lcms1[3])[125] +plotSpectra(spec1, main = "CVD sample") +plotSpectra(spec2, main = "CTR sample") +``` + +Above, we can observe that the spectra between CVD and CTR samples are not +entirely similar, but they do exhibit similar main peaks between 200 and 600 m/z +with a general higher intensity in control samples. However the peak +distribution (or at least intensity) seems to vary the most between an *m/z* of +10 to 210 and after an *m/z* of 600. + +The CTR spectrum above exhibits significant peaks around an *m/z* of 150 - 200 +that have a much lower intensity in the CVD sample. To delve into more details +about this specific spectrum, a wide range of functions can be employed: + +```{r} +#| tbl-cap: "Table 3. Intensity and m/z values of the 125th spectrum of one CTR sample." +#| tbl-style: "border: 1px solid black; width: 50%; border-collapse: collapse;" +#' Checking its intensity +intensity(spec2) + +#' Checking its rtime +rtime(spec2) + +#' Checking its m/z +mz(spec2) + +#' Filtering for a specific m/z range and viewing in a tabular format +filt_spec <- filterMzRange(spec2,c(50,200)) + +data.frame(intensity = unlist(intensity(filt_spec)), + mz = unlist(mz(filt_spec))) |> + head() |> kable(format = "markdown") +``` + +## Chormatographic info + +```{r filter-rt} +#| fig-cap: "Figure 6. BPC after filtering retention time." +#' Filter the data based on retention time +lcms1 <- filterRt(lcms1, c(10, 240)) +bpc <- chromatogram(lcms1, aggregationFun = "max") + +#' Plot after filtering +plot(bpc, col = paste0(col_sample, 80), + main = "BPC after filtering retention time", lwd = 1.5) +grid() +legend("topright", col = col_phenotype, + legend = names(col_phenotype), lty = 1, lwd = 2, horiz = TRUE, bty = "n") +``` + +Initially, we examined the entire BPC and subsequently filtered it based on the +desired retention times. This not only results in a smaller file size but also +facilitates a more straightforward interpretation of the BPC. + +The final plot illustrates the BPC for each sample colored by phenotype, +providing insights on the signal measured along the retention times of each +sample. It reveals the points at which compounds eluted from the LC column. In +essence, a BPC condenses the three-dimensional LC-MS data (*m/z* by retention +time by intensity) into two dimensions (retention time by intensity). + +We can also here compare similarities of the BPCs in a heatmap. The retention +times will however not be identical between different samples. Thus we *bin()* +the chromatographic signal per sample along the retention time axis into bins of +two seconds resulting in data with the same number of bins/data points. We can +then calculate pairwise similarities between these data vectors using the +`cor()` function and visualize the result using `pheatmap()`. + +```{r heatmap1} +#| fig-cap: "Figure 7. Heatmap of BPC similarities." +#' Total ion chromatogram +tic <- chromatogram(lcms1, aggregationFun = "sum") |> + bin(binSize = 2) + +#' Calculate similarity (Pearson correlation) between BPCs +ticmap <- do.call(cbind, lapply(tic, intensity)) |> + cor() + +rownames(ticmap) <- colnames(ticmap) <- sampleData(lcms1)$sample_name +ann <- data.frame(phenotype = sampleData(lcms1)[, "phenotype"]) +rownames(ann) <- rownames(ticmap) + +#' Plot heatmap +pheatmap(ticmap, annotation_col = ann, + annotation_colors = list(phenotype = col_phenotype)) +``` + +The heatmap above reinforces what our exploration of spectra data showed, which +is a strong separation between the QC and study samples. This is important to +bear in mind for later analyses. + +Additionally, study samples group into two clusters, cluster *I* containing +samples *C* and *F* and cluster *II* with all other samples. Below we plot the +TIC of all samples, using a different color for each cluster. + +```{r} +#| fig-cap: "Figure 8. Example of TIC of unusual signal." +cluster_I_idx <- sampleData(lcms1)$sample_name %in% c("F", "C") +cluster_II_idx <- sampleData(lcms1)$sample_name %in% c("A", "B", "D", "E") + +temp_col <- c("grey", "red") +names(temp_col) <- c("Cluster II", "Cluster I") +col <- rep(temp_col[1], length(lcms1)) +col[cluster_I_idx] <- temp_col[2] +col[sampleData(lcms1)$phenotype == "QC"] <- NA + +lcms1 |> + chromatogram(aggregationFun = "sum") |> + plot( col = col, + main = "TIC after filtering retention time", lwd = 1.5) +grid() +legend("topright", col = temp_col, + legend = names(temp_col), lty = 1, lwd = 2, + horiz = TRUE, bty = "n") +``` + +While the TIC of all samples look similar, the samples from cluster I show a +different signal in the retention time range from about 40 to 160 seconds. +Whether, and how strong this difference will impact the following analysis +remains to be determined. + +## known compounds + +While the artificially isotope labeled compounds were spiked to the individual +samples, there should also be the signal from the endogenous compounds in serum +(or plasma) samples. Thus, we calculate next the mass and *m/z* of an *\[M+H\]+* +ion of the endogenous cystine from its chemical formula and extract also the EIC +from this ion. For calculation of the exact mass and the *m/z* of the selected +ion adduct we use the `calculateMass()` and `mass2mz()` functions from the +`r Biocpkg("MetaboCoreUtils")` package. + +```{r} +#| fig-cap: "Figure 10. EIC of endogenous cystine vs spiked." +#' extract endogenous cystine mass and EIC and plot. +cysmass <- calculateMass("C6H12N2O4S2") +cys_endo <- mass2mz(cysmass, adduct = "[M+H]+")[, 1] + +#' Plot versus spiked +par(mfrow = c(1, 2)) +chromatogram(lcms1, mz = cys_endo + c(-0.005, 0.005), + rt = unlist(fData(eic_cystine)[, c("rtmin", "rtmax")]), + aggregationFun = "max") |> + plot(col = paste0(col_sample, 80)) |> + grid() + +plot(eic_cystine, col = paste0(col_sample, 80)) +grid() +legend("topright", col = col_phenotype, legend = names(col_phenotype), lty = 1, + bty = "n") +``` + +The two cystine EICs above look highly similar (the endogenous shown left, the +isotope labeled right in the plot above), if not for the shift in m/z, which +arises from the artificial labeling. This shift allows us to discriminate +between the endogenous and non-endogenous compound. + +## preprocessed object + +```{r} +#load preprocessed xcmsExperiment +lcms1 <- readMsObject(XcmsExperiment(), + AlabasterParam(system.file("extdata", "preprocessed_lcms1"))) +``` + +## noise analysis (need to adapt code) + +```{r echo=TRUE} +# overall signal in the dataset +#' - for each file calculate the sum of intensities +background <- spectra(full) |> + split(fromFile(full)) |> + lapply(tic) |> + lapply(sum) |> + unlist() + +# Overall signal that is in the chromatographic peaks detection + # - check "into" definition first, mioght need to multiply it by something +detected <- apply(assay(res_full), 2, function(x) sum(x, na.rm = TRUE)) + +# substract and plot ? Also i'm removing blanks bc i think we don't need it +names(background) <- names(detected) <- res_full$device + +idx_bl_qc <- sampleData(full)$sample_type %in% c("blank", "QC") +col_device_only <- leg_device_only[sampleData(full)$device[!idx_bl_qc]] +#remove blanks +noise <- background[!idx_bl_qc] - detected[!idx_bl_qc] + +f <- factor(names(noise), levels = unique(names(noise))) +group <- split(noise, f) + +plot(NULL, xlim = c(1, length(group)), ylim = range(unlist(group)), + xaxt = "n", xlab = "Devices", ylab = "Noise", + main = "Noise comparison between Devices") +for (i in seq_along(group)) { + points(rep(i, length(group[[i]])), group[[i]], pch = 19, col = leg_device_only[i]) +} +axis(1, at = seq_along(group), labels = names(group)) +```