Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Newbie goes C #131

Open
Adafede opened this issue Feb 13, 2025 · 5 comments
Open

Newbie goes C #131

Adafede opened this issue Feb 13, 2025 · 5 comments

Comments

@Adafede
Copy link

Adafede commented Feb 13, 2025

Hi @sgibb and @jorainer!

I was somewhat disappointed by the speed of the current implementation of the GNPS similarity, so I explored ways to make it more efficient.

I first made different Rcpp attempts, which seemed okayish (to open a PR to MsCoreUtils), but then realized I would need a full C implementation, without all the Armadillo optimizations etc available. So, with an LLM friend, I tried my best and got the following:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## Thin R-level wrapper around the registered C routine "join_gnps".
## Matches peaks between two m/z vectors `x` and `y`, additionally
## considering the precursor m/z of each spectrum, within the given
## absolute (`tolerance`) and relative (`ppm`) mass tolerances.
## Returns whatever the C routine returns (a list of index vectors).
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## Thin R-level wrapper around the registered C routine "gnps":
## computes the GNPS similarity score from two aligned peak matrices.
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y, ...) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices.
  ## `x` and `y` are two-column matrices (m/z, intensity) with one row per
  ## peak match; unmatched peaks carry NA m/z in the other matrix.
  ## `...` is ignored (kept for call compatibility).
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' must have the same number of rows.")

  ## Scale by the summed intensity of *unique* peaks only: duplicated m/z
  ## rows come from a peak matching multiple counterparts.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)

  ## A zero sum means all-NA input: avoid division by zero.
  if (x_sum == 0 || y_sum == 0)
    return(0)

  ## Keep only rows where both spectra contributed a peak.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))
  l <- length(keep)
  if (l == 0L)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## FIX: size the assignment matrix by the number of kept matches (`l`).
  ## The original used `length(keep)` while `keep` was a *logical* vector
  ## from complete.cases(), allocating an nrow(x) x nrow(x) matrix even
  ## when only a few rows were kept. The LSAP optimum is unchanged
  ## (extra rows/columns were all zero), only the allocation shrinks.
  score_mat <- matrix(0, l, l)
  score_mat[cbind(x_idx, y_idx)] <- scores

  ## Optimal one-to-one peak assignment (Hungarian method via clue).
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)

  sum(score_mat[cbind(seq_along(best), as.integer(best))], na.rm = TRUE)
}

# Example input data (expanded for realistic benchmarking)
# Two toy spectra: columns are m/z and intensity, one row per peak.
query_spectrum <- x <- cbind(mz = c(10.0, 36.0, 63.0, 91.0, 93.0),
                             intensity = c(14.0, 15.0, 999.0, 650.0, 1.0))

target_spectrum <- y <- cbind(mz = c(10.0, 12.0, 50.0, 63.0, 105.0),
                              intensity = c(35.0, 5.0, 16.0, 999.0, 450.0))

# Precursor m/z values and mass tolerances passed to join_gnps().
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

# Pre-initialize result holders so the comparison code below can always
# reference them, even if a benchmark arm fails.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
# Each arm joins the peaks and then scores the join; the comparison code
# afterwards inspects the results left over from the *last* iteration.
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  times = 1000
)

# Compare matches across implementations
## Row indices where *both* match vectors are non-NA, i.e. where both
## spectra contributed a peak to the match.
valid_indices <- function(matches) {
  which(!(is.na(matches[[1]]) | is.na(matches[[2]])))
}

# Per-implementation indices of valid (non-NA) matches.
non_na_c <- valid_indices(matches_c)
non_na_hybrid_1 <- valid_indices(matches_hybrid_1)
non_na_hybrid_2 <- valid_indices(matches_hybrid_2)
non_na_mscore <- valid_indices(matches_mscore)

# Pairwise absolute differences between the similarity scores produced by
# each implementation; all should be ~0 if the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-10

## Compare implementations only if every one produced at least one valid
## (non-NA) match; otherwise there is nothing meaningful to align.
if (all(lengths(
  list(non_na_c, non_na_hybrid_1, non_na_hybrid_2, non_na_mscore)
) > 0)) {
  # Sort matches for easier, order-insensitive comparison.
  sorted_matches <- function(matches, indices)
    lapply(matches, function(v)
      sort(v[indices]))

  sorted_matches_c <- sorted_matches(matches_c, non_na_c)
  sorted_matches_hybrid_1 <- sorted_matches(matches_hybrid_1, non_na_hybrid_1)
  sorted_matches_hybrid_2 <- sorted_matches(matches_hybrid_2, non_na_hybrid_2)
  sorted_matches_mscore <- sorted_matches(matches_mscore, non_na_mscore)

  ## Pairwise identity of the sorted match-index lists.
  matches_equal <- list(
    c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
    c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
    c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
    mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
    mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2)
  )

  message("Matches C vs MsCoreUtils identical: ",
          matches_equal$c_vs_mscore)
  message("Matches C vs Hybrid 1 identical: ",
          matches_equal$c_vs_hybrid_1)
  message("Matches C vs Hybrid 2 identical: ",
          matches_equal$c_vs_hybrid_2)
  message("Matches MsCoreUtils vs Hybrid 1 identical: ",
          matches_equal$mscore_vs_hybrid_1)
  message("Matches MsCoreUtils vs Hybrid 2 identical: ",
          matches_equal$mscore_vs_hybrid_2)

  message("\nScore Differences:")
  print(score_diffs)

  if (all(score_diffs <= score_tolerance)) {
    message("✅ All scores are nearly identical (within tolerance).")
  } else {
    message("⚠️ Some scores differ significantly.")
  }

  # Debug mismatches
  ## FIX: the original condition was `all(unlist(matches_equal))`, which
  ## printed the "Mismatched Matches" debug section precisely when all
  ## matches were identical, and skipped it on a real mismatch. Negated.
  if (!all(unlist(matches_equal))) {
    message("\n🔍 **Mismatched Matches:**")
    check_mismatch <- function(a, b, name_a, name_b) {
      if (!identical(a, b)) {
        message("\n🔹 **",
                name_a,
                " Matches:** ",
                paste(a[[1]], "->", a[[2]], collapse = ", "))
        message("🔹 **",
                name_b,
                " Matches:** ",
                paste(b[[1]], "->", b[[2]], collapse = ", "))
      }
    }

    check_mismatch(sorted_matches_c,
                   sorted_matches_mscore,
                   "C",
                   "MsCoreUtils")
    check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")

    check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  }
}

print(benchmark_results)
Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
      1.110223e-16       1.110223e-16       1.110223e-16       0.000000e+00       0.000000e+00All scores are nearly identical (within tolerance).

🔍 **Mismatched Matches:**
> 

Unit: microseconds
          expr     min       lq       mean  median       uq      max neval  cld
        c_impl   3.977   4.6740   5.692358   4.920   5.2070  475.641  1000 a   
 hybrid_impl_1  41.164  43.6445  60.002598  44.854  46.9040 9798.303  1000  b  
 hybrid_impl_2 112.422 115.7430 125.711986 118.203 124.1480 1115.200  1000   c 
   mscore_impl 127.510 130.6875 162.307848 133.578 140.6095 8812.171  1000    d

EDIT:
I first had another R issue in my code which made me feel overly negative about the implementation but now I think I got it...would you like me to open a PR with what is in
https://github.com/taxonomicallyinformedannotation/tima/tree/main/src
?

EDIT 2:
Now even without the 1E-16 inaccuracy 🚀 :

Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
                 0                  0                  0                  0                  0All scores are nearly identical (within tolerance).
> 

Unit: microseconds
          expr     min       lq       mean  median       uq      max neval  cld
        c_impl   3.813   4.7970   6.468693   5.084   5.5350  606.472  1000 a   
 hybrid_impl_1  41.164  44.9565  58.563949  47.068  50.2045 9446.646  1000  b  
 hybrid_impl_2 113.734 120.4990 135.065152 129.273 138.1495  321.973  1000   c 
   mscore_impl 129.847 136.2225 158.079477 146.206 157.9320 6553.809  1000    d
@sgibb
Copy link
Member

sgibb commented Feb 15, 2025

@Adafede thank you for this great issue and suggestion.

I never used gnps and have no idea how often it is used and how many peaks/spectra are joined/compared with it.

I didn't look in detail at your C source, but I am reluctant to adopt such complex/extensive code because the maintenance becomes complicated. Especially the Hungarian method looks complex.
@jorainer What is your opinion?

I slightly modified the current gnps_r by replacing some complex calls (complete.cases) and loops with vectorized methods and got a speed gain of ~ 2/3 (it is still much slower than your C version)

gnps_r <- function(x, y) {
    ## GNPS (modified cosine) similarity between two aligned peak matrices
    ## (columns: m/z, intensity), one row per peak match.
    if (nrow(x) != nrow(y))
        stop("'x' and 'y' are expected to have the same number of rows.")
    ## Scale intensities; !duplicated because we can have duplicated matches.
    x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
    y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
    ## is 0 if only NAs in input - avoids division through 0
    if (x_sum == 0 || y_sum == 0)
        return(0)
    ## Keep only matches.
    keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

    l <- length(keep)
    if (!l)
        return(0)

    x <- x[keep, , drop = FALSE]
    y <- y[keep, , drop = FALSE]

    scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

    ## Map (possibly duplicated) m/z values to dense 1-based indices.
    x_idx <- match(x[, 1L], unique(x[, 1L]))
    y_idx <- match(y[, 1L], unique(y[, 1L]))

    ## Fill the l x l assignment matrix via column-major linear indices.
    score_mat <- matrix(0.0, nrow = l, ncol = l)
    score_mat[(y_idx - 1L) * l + x_idx] <- scores
    ## FIX: namespace-qualify solve_LSAP so the function also works when
    ## the clue package is not attached (original called it bare).
    best <- clue::solve_LSAP(score_mat, maximum = TRUE)
    sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}

With your C version and set.seed(123) I got a few differences for the gnps scores. I am not sure whether its meaningful:

set.seed(123)
n <- 100
# Random spectra: n sorted integer m/z values (coerced to double) with
# random integer intensities.
query_spectrum <- x <-
    cbind(mz = as.double(sort(sample(1:1e3, size = n))),
          intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
    cbind(mz = as.double(sort(sample(1:1e3, size = n))),
          intensity = as.double(sample(1:1e3, n)))

# Precursor m/z values and mass tolerances for join_gnps().
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

# Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  # NOTE(review): the arm labelled `mscore_impl` assigns matches_mscore2 /
  # score_mscore2 while `mscore_impl2` assigns matches_mscore / score_mscore.
  # The "2" suffixes look swapped relative to the arm labels — confirm intent.
  mscore_impl = {
    matches_mscore2 <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
    score_mscore2 <- MsCoreUtils::gnps(x[matches_mscore2[[1]], ], y[matches_mscore2[[2]], ])
  },
  mscore_impl2 = {
    matches_mscore <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
    score_mscore <- gnps_r(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  times = 1000
)
Matches C vs MsCoreUtils identical: TRUE
Matches C vs Hybrid 1 identical: TRUE
Matches C vs Hybrid 2 identical: TRUE
Matches MsCoreUtils vs Hybrid 1 identical: TRUE
Matches MsCoreUtils vs Hybrid 2 identical: TRUE

Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 
       0.000781952        0.000781952        0.000781952 
mscore_vs_hybrid_1 mscore_vs_hybrid_2 
       0.000000000        0.000000000 
⚠️ Some scores differ significantly.

🔍 **Mismatched Matches:**
> print(benchmark_results)
Unit: microseconds
          expr     min      lq     mean   median       uq       max neval
        c_impl 105.250 128.920 166.0530 136.2985 149.6675  8222.349  1000
 hybrid_impl_1 161.378 190.347 264.2140 204.0105 223.8575 39663.760  1000
 hybrid_impl_2 692.804 732.405 871.0964 772.4015 828.0425 21037.939  1000
   mscore_impl 725.255 788.965 850.0478 831.8495 889.5750  1492.334  1000
  mscore_impl2 225.510 255.996 297.0334 272.1970 300.5680  7842.034  1000

@Adafede
Copy link
Author

Adafede commented Feb 17, 2025

Hi again!

Thank you for your feedback @sgibb!
I actually implemented the Hungarian in C to get rid of the clue dependency, which I found quite interesting but also happy to bring it back and remove the Hungarian implementation.

Your answer also made me realize/fix/improve a few things and now there is absolutely no score difference anymore with the MsCoreUtils implementation (and a negligible 1E-17 floating-point precision difference with the hybrid):

Here is the update:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## R-level wrapper for the registered C routine "join_gnps": GNPS-style
## peak matching between two m/z vectors, considering the precursor m/z
## of each spectrum and the given absolute/ppm mass tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## R-level wrapper for the registered C routine "gnps" (similarity score).
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices
  ## (columns: m/z, intensity), one row per peak match.
  ## FIX: removed a stray ")" from the error message.
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only matches.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

  l <- length(keep)
  if (!l)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## Fill the l x l assignment matrix via column-major linear indices,
  ## then solve the optimal one-to-one assignment (Hungarian method).
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}


set.seed(123)
n <- 100
## Random spectra: n sorted integer m/z values (as doubles) with random
## integer intensities.
query_spectrum <- x <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))

query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

## Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    ## FIX: score this arm's *own* join result; the original indexed with
    ## `matches_mscore` left over from the mscore_impl arm.
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)

# Pairwise absolute differences between the scores of all implementations;
# all entries should be ~0 when the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2),
  mscore_vs_mscore_2 = abs(score_mscore - score_mscore_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-16

## Sort match pairs (by x, then re-sorted by y) so that independently
## produced match lists can be compared with identical().
sort_matches <- function(matches) {
  tbl <- as.data.frame(matches)
  tbl <- tidytable::arrange(tbl, x)
  tidytable::arrange(tbl, y)
}

sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## FIX: sort the *second* MsCoreUtils run; the original sorted
## `matches_mscore` again, making the mscore-vs-mscore_2 comparison
## below trivially TRUE.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)

## Pairwise identity of the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## FIX: key was misspelled "mscore_vs_mascore_2", so the corresponding
  ## message below printed an empty value.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)

message("Matches C vs MsCoreUtils identical: ",
        matches_equal$c_vs_mscore)
message("Matches C vs Hybrid 1 identical: ", matches_equal$c_vs_hybrid_1)
message("Matches C vs Hybrid 2 identical: ", matches_equal$c_vs_hybrid_2)
message("Matches MsCoreUtils vs Hybrid 1 identical: ",
        matches_equal$mscore_vs_hybrid_1)
message("Matches MsCoreUtils vs Hybrid 2 identical: ",
        matches_equal$mscore_vs_hybrid_2)
message("Matches MsCoreUtils vs MsCoreUtils 2 identical: ",
        matches_equal$mscore_vs_mscore_2)

# Debug mismatches
# Print the raw match pairs of any two implementations that disagree.
if (!all(unlist(matches_equal))) {
  message("\n🔍 **Mismatched Matches:**")
  check_mismatch <- function(a, b, name_a, name_b) {
    if (!identical(a, b)) {
      message("\n🔹 **",
              name_a,
              " Matches:** ",
              paste(a[[1]], "->", a[[2]], collapse = ", "))
      message("🔹 **",
              name_b,
              " Matches:** ",
              paste(b[[1]], "->", b[[2]], collapse = ", "))
    }
  }
  
  check_mismatch(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
  
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  check_mismatch(sorted_matches_mscore, sorted_matches_mscore_2, "MsCoreUtils", "MsCoreUtils 2")
} else{
  message("✅ All joined indices are identical.")
}
# Report whether all score differences fall within the tolerance.
if (all(score_diffs <= score_tolerance)) {
  message("✅ All scores are nearly identical (within tolerance).")
} else {
  message("⚠️ Some scores differ significantly.")
}

message("\nScore Differences:")
print(score_diffs)

print(benchmark_results)
All joined indices are identical.All scores are nearly identical (within tolerance).


       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
      0.000000e+00       2.775558e-17       0.000000e+00       2.775558e-17       0.000000e+00  


Unit: microseconds
          expr     min      lq      mean   median       uq       max neval cld
        c_impl  28.700  31.652  33.81126  32.6155  33.9070   468.425  1000 a  
 hybrid_impl_1  45.018  48.708  61.70611  50.2660  52.3160 10291.779  1000 ab 
 hybrid_impl_2 153.586 158.137 167.51669 161.4785 168.3460  1027.419  1000   c
   mscore_impl 165.025 171.462 196.84645 175.5210 182.5935 16979.043  1000   c
 mscore_impl_2  57.933  62.607  82.67999  64.4930  67.2195 16346.331  1000  b 

@jorainer
Copy link
Member

Hey @Adafede , thanks for the issue and PR! Just some general observations - I didn't dive into the C code yet.

@sgibb , the gnps similarity score is one of the most frequently used similarity measures at the moment. So, having this implemented in a faster and more efficient way would be great.

The hungarian optimization is the key component - and yes, avoiding the dependency on the clue package would be nice.

From your unit tests it seems that your gnps_r() implementation is already significantly faster than our MsCoreUtils::gnps() function - which is great! then you get another factor 2 with the complete C implementation. This might not sound thrilling at first, but if we compare thousands of spectra against each other it will pay off...

So, I would be actually OK with having a full implementation in C - and maybe the backup R implementations for testing/validation purposes?

@Adafede
Copy link
Author

Adafede commented Feb 18, 2025

Just not to steal anything here... the gnps_r() improvement was made by @sgibb! 😅

I did not open a PR for now as I first wanted to get some preliminary feedback. Also, I saw there already is a join.c in MsCoreUtils which I did not use and which should eventually be used to make everything more consistent...

@Adafede
Copy link
Author

Adafede commented Feb 18, 2025

I was also wondering why the relative gain was so different...and actually all methods scale very differently depending on n peaks.

So I changed the hungarian to a simpler auction algorithm that scales much better.

Here with different n's:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## Wrapper for the C routine "join_gnps": GNPS-style peak matching of two
## m/z vectors with precursor m/z and absolute/ppm tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## Wrapper for the C routine "gnps" (GNPS similarity score).
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices
  ## (columns: m/z, intensity), one row per peak match.
  ## FIX: removed a stray ")" from the error message.
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only matches.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

  l <- length(keep)
  if (!l)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## Fill the l x l assignment matrix via column-major linear indices,
  ## then solve the optimal one-to-one assignment (Hungarian method).
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}

# Implementations scale very differently, choose one below
# n <- 8
n <- 64
# n <- 256

## NOTE(review): set.seed(123) is called before *both* sample() blocks, so
## the query and target spectra are drawn identically (x == y). Confirm
## this is intended; kept as-is to preserve the reported results.
set.seed(123)
query_spectrum <- x <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))
set.seed(123)
target_spectrum <- y <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))

query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

## Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    ## FIX: score this arm's *own* join result; the original indexed with
    ## `matches_mscore` left over from the mscore_impl arm.
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)

# Pairwise absolute differences between the scores of all implementations;
# all entries should be ~0 when the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2),
  mscore_vs_mscore_2 = abs(score_mscore - score_mscore_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-15

## Sort match pairs (by x, then re-sorted by y) so that independently
## produced match lists can be compared with identical().
sort_matches <- function(matches) {
  tbl <- as.data.frame(matches)
  tbl <- tidytable::arrange(tbl, x)
  tidytable::arrange(tbl, y)
}

sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## FIX: sort the *second* MsCoreUtils run; the original sorted
## `matches_mscore` again, making the mscore-vs-mscore_2 comparison
## below trivially TRUE.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)

## Pairwise identity of the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## FIX: key was misspelled "mscore_vs_mascore_2", so the corresponding
  ## message below printed an empty value.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)

message("Matches C vs MsCoreUtils identical: ",
        matches_equal$c_vs_mscore)
message("Matches C vs Hybrid 1 identical: ", matches_equal$c_vs_hybrid_1)
message("Matches C vs Hybrid 2 identical: ", matches_equal$c_vs_hybrid_2)
message("Matches MsCoreUtils vs Hybrid 1 identical: ",
        matches_equal$mscore_vs_hybrid_1)
message("Matches MsCoreUtils vs Hybrid 2 identical: ",
        matches_equal$mscore_vs_hybrid_2)
message("Matches MsCoreUtils vs MsCoreUtils 2 identical: ",
        matches_equal$mscore_vs_mscore_2)

# Debug mismatches
# Print the raw match pairs of any two implementations that disagree.
if (!all(unlist(matches_equal))) {
  message("\n🔍 **Mismatched Matches:**")
  check_mismatch <- function(a, b, name_a, name_b) {
    if (!identical(a, b)) {
      message("\n🔹 **",
              name_a,
              " Matches:** ",
              paste(a[[1]], "->", a[[2]], collapse = ", "))
      message("🔹 **",
              name_b,
              " Matches:** ",
              paste(b[[1]], "->", b[[2]], collapse = ", "))
    }
  }
  
  check_mismatch(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
  
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  check_mismatch(sorted_matches_mscore,
                 sorted_matches_mscore_2,
                 "MsCoreUtils",
                 "MsCoreUtils 2")
} else{
  message("✅ All joined indices are identical.")
}
# Report whether all score differences fall within the tolerance.
if (all(score_diffs <= score_tolerance)) {
  message("✅ All scores are nearly identical (within tolerance).")
} else {
  message("⚠️ Some scores differ significantly.")
}

message("\nScore Differences:")
print(score_diffs)

print(benchmark_results)

8 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr     min       lq       mean  median       uq       max neval cld
        c_impl   4.551   5.5350   7.129449   5.863   6.5600   508.400  1000 a  
 hybrid_impl_1  24.600  26.6910  40.539406  28.126  30.6680 10267.302  1000  b 
 hybrid_impl_2 119.802 125.4600 150.632975 134.070 145.9805  7641.785  1000   c
   mscore_impl 127.141 132.5325 149.552584 141.737 155.4925   450.426  1000   c
 mscore_impl_2  16.195  33.3740  37.700607  35.260  38.3965   295.241  1000  b 

64 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr     min       lq      mean   median      uq      max neval cld
        c_impl  34.645  37.5150  52.80726  39.0320  41.984 11617.43  1000 a  
 hybrid_impl_1  91.471 103.6480 168.21377 106.5590 112.094 12133.95  1000  b 
 hybrid_impl_2 265.680 275.6430 317.46222 280.7680 294.585 12108.74  1000   c
   mscore_impl 278.964 289.6855 334.53044 294.8925 311.354 12809.63  1000   c
 mscore_impl_2 107.707 116.6860 148.30569 119.8430 125.788 12697.00  1000  b 

256 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr      min        lq      mean   median       uq       max neval  cld
        c_impl  353.707  375.1295  735.1191  386.343  421.808 208371.14  1000 a   
 hybrid_impl_1 5633.441 5896.7225 6397.7029 5962.979 6021.957  16706.23  1000  bc 
 hybrid_impl_2 6068.902 6354.2005 6764.8488 6435.606 6525.765  15647.94  1000  b d
   mscore_impl 6072.469 6370.1085 6973.2135 6457.561 6557.684 218731.31  1000    d
 mscore_impl_2 5635.901 5888.2355 6192.6607 5965.234 6036.840  16246.17  1000   c  

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants