Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Newbie goes C #131

Open
Adafede opened this issue Feb 13, 2025 · 5 comments
Open

Newbie goes C #131

Adafede opened this issue Feb 13, 2025 · 5 comments

Comments

@Adafede
Copy link

Adafede commented Feb 13, 2025

Hi @sgibb and @jorainer!

I was somewhat disappointed by the speed of the current implementation of the GNPS similarity, so I explored ways to make it more efficient.

I first made different Rcpp attempts, which seemed okayish (to open a PR to MsCoreUtils), but then realized I would need a full C implementation, without all the Armadillo optimizations etc available. So, with an LLM friend, I tried my best and got the following:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## Thin R-level wrapper around the registered C routine "join_gnps".
## Matches peaks between two m/z vectors `x` and `y`, additionally
## considering the precursor m/z of each spectrum, within the given
## absolute (`tolerance`) and relative (`ppm`) mass tolerances.
## Returns whatever the C routine returns (a list of index vectors).
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## Thin R-level wrapper around the registered C routine "gnps":
## computes the GNPS similarity score from two aligned peak matrices.
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y, ...) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices.
  ## `x` and `y` are two-column matrices (m/z, intensity) with one row per
  ## peak match; unmatched peaks carry NA m/z in the other matrix.
  ## `...` is ignored (kept for call compatibility).
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' must have the same number of rows.")

  ## Scale by the summed intensity of *unique* peaks only: duplicated m/z
  ## rows come from a peak matching multiple counterparts.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)

  ## A zero sum means all-NA input: avoid division by zero.
  if (x_sum == 0 || y_sum == 0)
    return(0)

  ## Keep only rows where both spectra contributed a peak.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))
  l <- length(keep)
  if (l == 0L)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## FIX: size the assignment matrix by the number of kept matches (`l`).
  ## The original used `length(keep)` while `keep` was a *logical* vector
  ## from complete.cases(), allocating an nrow(x) x nrow(x) matrix even
  ## when only a few rows were kept. The LSAP optimum is unchanged
  ## (extra rows/columns were all zero), only the allocation shrinks.
  score_mat <- matrix(0, l, l)
  score_mat[cbind(x_idx, y_idx)] <- scores

  ## Optimal one-to-one peak assignment (Hungarian method via clue).
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)

  sum(score_mat[cbind(seq_along(best), as.integer(best))], na.rm = TRUE)
}

# Example input data (expanded for realistic benchmarking)
# Two toy spectra: columns are m/z and intensity, one row per peak.
query_spectrum <- x <- cbind(mz = c(10.0, 36.0, 63.0, 91.0, 93.0),
                             intensity = c(14.0, 15.0, 999.0, 650.0, 1.0))

target_spectrum <- y <- cbind(mz = c(10.0, 12.0, 50.0, 63.0, 105.0),
                              intensity = c(35.0, 5.0, 16.0, 999.0, 450.0))

# Precursor m/z values and mass tolerances passed to join_gnps().
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

# Pre-initialize result holders so the comparison code below can always
# reference them, even if a benchmark arm fails.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
# Each arm joins the peaks and then scores the join; the comparison code
# afterwards inspects the results left over from the *last* iteration.
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  times = 1000
)

# Compare matches across implementations
## Row indices where *both* match vectors are non-NA, i.e. where both
## spectra contributed a peak to the match.
valid_indices <- function(matches) {
  which(!(is.na(matches[[1]]) | is.na(matches[[2]])))
}

# Per-implementation indices of valid (non-NA) matches.
non_na_c <- valid_indices(matches_c)
non_na_hybrid_1 <- valid_indices(matches_hybrid_1)
non_na_hybrid_2 <- valid_indices(matches_hybrid_2)
non_na_mscore <- valid_indices(matches_mscore)

# Pairwise absolute differences between the similarity scores produced by
# each implementation; all should be ~0 if the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-10

## Compare implementations only if every one produced at least one valid
## (non-NA) match; otherwise there is nothing meaningful to align.
if (all(lengths(
  list(non_na_c, non_na_hybrid_1, non_na_hybrid_2, non_na_mscore)
) > 0)) {
  # Sort matches for easier, order-insensitive comparison.
  sorted_matches <- function(matches, indices)
    lapply(matches, function(v)
      sort(v[indices]))

  sorted_matches_c <- sorted_matches(matches_c, non_na_c)
  sorted_matches_hybrid_1 <- sorted_matches(matches_hybrid_1, non_na_hybrid_1)
  sorted_matches_hybrid_2 <- sorted_matches(matches_hybrid_2, non_na_hybrid_2)
  sorted_matches_mscore <- sorted_matches(matches_mscore, non_na_mscore)

  ## Pairwise identity of the sorted match-index lists.
  matches_equal <- list(
    c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
    c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
    c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
    mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
    mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2)
  )

  message("Matches C vs MsCoreUtils identical: ",
          matches_equal$c_vs_mscore)
  message("Matches C vs Hybrid 1 identical: ",
          matches_equal$c_vs_hybrid_1)
  message("Matches C vs Hybrid 2 identical: ",
          matches_equal$c_vs_hybrid_2)
  message("Matches MsCoreUtils vs Hybrid 1 identical: ",
          matches_equal$mscore_vs_hybrid_1)
  message("Matches MsCoreUtils vs Hybrid 2 identical: ",
          matches_equal$mscore_vs_hybrid_2)

  message("\nScore Differences:")
  print(score_diffs)

  if (all(score_diffs <= score_tolerance)) {
    message("✅ All scores are nearly identical (within tolerance).")
  } else {
    message("⚠️ Some scores differ significantly.")
  }

  # Debug mismatches
  ## FIX: the original condition was `all(unlist(matches_equal))`, which
  ## printed the "Mismatched Matches" debug section precisely when all
  ## matches were identical, and skipped it on a real mismatch. Negated.
  if (!all(unlist(matches_equal))) {
    message("\n🔍 **Mismatched Matches:**")
    check_mismatch <- function(a, b, name_a, name_b) {
      if (!identical(a, b)) {
        message("\n🔹 **",
                name_a,
                " Matches:** ",
                paste(a[[1]], "->", a[[2]], collapse = ", "))
        message("🔹 **",
                name_b,
                " Matches:** ",
                paste(b[[1]], "->", b[[2]], collapse = ", "))
      }
    }

    check_mismatch(sorted_matches_c,
                   sorted_matches_mscore,
                   "C",
                   "MsCoreUtils")
    check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")

    check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  }
}

print(benchmark_results)
Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
      1.110223e-16       1.110223e-16       1.110223e-16       0.000000e+00       0.000000e+00All scores are nearly identical (within tolerance).

🔍 **Mismatched Matches:**
> 

Unit: microseconds
          expr     min       lq       mean  median       uq      max neval  cld
        c_impl   3.977   4.6740   5.692358   4.920   5.2070  475.641  1000 a   
 hybrid_impl_1  41.164  43.6445  60.002598  44.854  46.9040 9798.303  1000  b  
 hybrid_impl_2 112.422 115.7430 125.711986 118.203 124.1480 1115.200  1000   c 
   mscore_impl 127.510 130.6875 162.307848 133.578 140.6095 8812.171  1000    d

EDIT:
I first had another R issue in my code which made me feel overly negative about the implementation but now I think I got it...would you like me to open a PR with what is in
https://github.com/taxonomicallyinformedannotation/tima/tree/main/src
?

EDIT 2:
Now even without the 1E-16 inaccuracy 🚀 :

Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
                 0                  0                  0                  0                  0All scores are nearly identical (within tolerance).
> 

Unit: microseconds
          expr     min       lq       mean  median       uq      max neval  cld
        c_impl   3.813   4.7970   6.468693   5.084   5.5350  606.472  1000 a   
 hybrid_impl_1  41.164  44.9565  58.563949  47.068  50.2045 9446.646  1000  b  
 hybrid_impl_2 113.734 120.4990 135.065152 129.273 138.1495  321.973  1000   c 
   mscore_impl 129.847 136.2225 158.079477 146.206 157.9320 6553.809  1000    d
@sgibb
Copy link
Member

sgibb commented Feb 15, 2025

@Adafede thank you for this great issue and suggestion.

I never used gnps and have no idea how often it is used and how many peaks/spectra are joined/compared with it.

I didn't look in detail at your C source, but I am reluctant to adopt such complex/extensive code because the maintenance becomes complicated. Especially the Hungarian method looks complex.
@jorainer What is your opinion?

I slightly modified the current gnps_r by replacing some complex calls (complete.cases) and loops with vectorized methods and got a speed gain of ~ 2/3 (it is still much slower than your C version)

gnps_r <- function(x, y) {
    ## GNPS (modified cosine) similarity between two aligned peak matrices
    ## (columns: m/z, intensity), one row per peak match.
    if (nrow(x) != nrow(y))
        stop("'x' and 'y' are expected to have the same number of rows.")
    ## Scale intensities; !duplicated because we can have duplicated matches.
    x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
    y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
    ## is 0 if only NAs in input - avoids division through 0
    if (x_sum == 0 || y_sum == 0)
        return(0)
    ## Keep only matches.
    keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

    l <- length(keep)
    if (!l)
        return(0)

    x <- x[keep, , drop = FALSE]
    y <- y[keep, , drop = FALSE]

    scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

    ## Map (possibly duplicated) m/z values to dense 1-based indices.
    x_idx <- match(x[, 1L], unique(x[, 1L]))
    y_idx <- match(y[, 1L], unique(y[, 1L]))

    ## Fill the l x l assignment matrix via column-major linear indices.
    score_mat <- matrix(0.0, nrow = l, ncol = l)
    score_mat[(y_idx - 1L) * l + x_idx] <- scores
    ## FIX: namespace-qualify solve_LSAP so the function also works when
    ## the clue package is not attached (original called it bare).
    best <- clue::solve_LSAP(score_mat, maximum = TRUE)
    sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}

With your C version and set.seed(123) I got a few differences for the gnps scores. I am not sure whether its meaningful:

set.seed(123)
n <- 100
# Random spectra: n sorted integer m/z values (coerced to double) with
# random integer intensities.
query_spectrum <- x <-
    cbind(mz = as.double(sort(sample(1:1e3, size = n))),
          intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
    cbind(mz = as.double(sort(sample(1:1e3, size = n))),
          intensity = as.double(sample(1:1e3, n)))

# Precursor m/z values and mass tolerances for join_gnps().
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

# Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  # NOTE(review): the arm labelled `mscore_impl` assigns matches_mscore2 /
  # score_mscore2 while `mscore_impl2` assigns matches_mscore / score_mscore.
  # The "2" suffixes look swapped relative to the arm labels — confirm intent.
  mscore_impl = {
    matches_mscore2 <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
    score_mscore2 <- MsCoreUtils::gnps(x[matches_mscore2[[1]], ], y[matches_mscore2[[2]], ])
  },
  mscore_impl2 = {
    matches_mscore <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
    score_mscore <- gnps_r(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  times = 1000
)
Matches C vs MsCoreUtils identical: TRUE
Matches C vs Hybrid 1 identical: TRUE
Matches C vs Hybrid 2 identical: TRUE
Matches MsCoreUtils vs Hybrid 1 identical: TRUE
Matches MsCoreUtils vs Hybrid 2 identical: TRUE

Score Differences:
       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 
       0.000781952        0.000781952        0.000781952 
mscore_vs_hybrid_1 mscore_vs_hybrid_2 
       0.000000000        0.000000000 
⚠️ Some scores differ significantly.

🔍 **Mismatched Matches:**
> print(benchmark_results)
Unit: microseconds
          expr     min      lq     mean   median       uq       max neval
        c_impl 105.250 128.920 166.0530 136.2985 149.6675  8222.349  1000
 hybrid_impl_1 161.378 190.347 264.2140 204.0105 223.8575 39663.760  1000
 hybrid_impl_2 692.804 732.405 871.0964 772.4015 828.0425 21037.939  1000
   mscore_impl 725.255 788.965 850.0478 831.8495 889.5750  1492.334  1000
  mscore_impl2 225.510 255.996 297.0334 272.1970 300.5680  7842.034  1000

@Adafede
Copy link
Author

Adafede commented Feb 17, 2025

Hi again!

Thank you for your feedback @sgibb!
I actually implemented the Hungarian in C to get rid of the clue dependency, which I found quite interesting but also happy to bring it back and remove the Hungarian implementation.

Your answer also made me realize/fix/improve a few things and now there is absolutely no score difference anymore with the MsCoreUtils implementation (and a negligible 1E-17 floating-point precision difference with the hybrid):

Here is the update:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## R-level wrapper for the registered C routine "join_gnps": GNPS-style
## peak matching between two m/z vectors, considering the precursor m/z
## of each spectrum and the given absolute/ppm mass tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## R-level wrapper for the registered C routine "gnps" (similarity score).
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices
  ## (columns: m/z, intensity), one row per peak match.
  ## FIX: removed a stray ")" from the error message.
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only matches.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

  l <- length(keep)
  if (!l)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## Fill the l x l assignment matrix via column-major linear indices,
  ## then solve the optimal one-to-one assignment (Hungarian method).
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}


set.seed(123)
n <- 100
## Random spectra: n sorted integer m/z values (as doubles) with random
## integer intensities.
query_spectrum <- x <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))

query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

## Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    ## FIX: score this arm's *own* join result; the original indexed with
    ## `matches_mscore` left over from the mscore_impl arm.
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)

# Pairwise absolute differences between the scores of all implementations;
# all entries should be ~0 when the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2),
  mscore_vs_mscore_2 = abs(score_mscore - score_mscore_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-16

## Sort match pairs (by x, then re-sorted by y) so that independently
## produced match lists can be compared with identical().
sort_matches <- function(matches) {
  tbl <- as.data.frame(matches)
  tbl <- tidytable::arrange(tbl, x)
  tidytable::arrange(tbl, y)
}

sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## FIX: sort the *second* MsCoreUtils run; the original sorted
## `matches_mscore` again, making the mscore-vs-mscore_2 comparison
## below trivially TRUE.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)

## Pairwise identity of the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## FIX: key was misspelled "mscore_vs_mascore_2", so the corresponding
  ## message below printed an empty value.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)

message("Matches C vs MsCoreUtils identical: ",
        matches_equal$c_vs_mscore)
message("Matches C vs Hybrid 1 identical: ", matches_equal$c_vs_hybrid_1)
message("Matches C vs Hybrid 2 identical: ", matches_equal$c_vs_hybrid_2)
message("Matches MsCoreUtils vs Hybrid 1 identical: ",
        matches_equal$mscore_vs_hybrid_1)
message("Matches MsCoreUtils vs Hybrid 2 identical: ",
        matches_equal$mscore_vs_hybrid_2)
message("Matches MsCoreUtils vs MsCoreUtils 2 identical: ",
        matches_equal$mscore_vs_mscore_2)

# Debug mismatches
# Print the raw match pairs of any two implementations that disagree.
if (!all(unlist(matches_equal))) {
  message("\n🔍 **Mismatched Matches:**")
  check_mismatch <- function(a, b, name_a, name_b) {
    if (!identical(a, b)) {
      message("\n🔹 **",
              name_a,
              " Matches:** ",
              paste(a[[1]], "->", a[[2]], collapse = ", "))
      message("🔹 **",
              name_b,
              " Matches:** ",
              paste(b[[1]], "->", b[[2]], collapse = ", "))
    }
  }
  
  check_mismatch(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
  
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  check_mismatch(sorted_matches_mscore, sorted_matches_mscore_2, "MsCoreUtils", "MsCoreUtils 2")
} else{
  message("✅ All joined indices are identical.")
}
# Report whether all score differences fall within the tolerance.
if (all(score_diffs <= score_tolerance)) {
  message("✅ All scores are nearly identical (within tolerance).")
} else {
  message("⚠️ Some scores differ significantly.")
}

message("\nScore Differences:")
print(score_diffs)

print(benchmark_results)
All joined indices are identical.All scores are nearly identical (within tolerance).


       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 
      0.000000e+00       2.775558e-17       0.000000e+00       2.775558e-17       0.000000e+00  


Unit: microseconds
          expr     min      lq      mean   median       uq       max neval cld
        c_impl  28.700  31.652  33.81126  32.6155  33.9070   468.425  1000 a  
 hybrid_impl_1  45.018  48.708  61.70611  50.2660  52.3160 10291.779  1000 ab 
 hybrid_impl_2 153.586 158.137 167.51669 161.4785 168.3460  1027.419  1000   c
   mscore_impl 165.025 171.462 196.84645 175.5210 182.5935 16979.043  1000   c
 mscore_impl_2  57.933  62.607  82.67999  64.4930  67.2195 16346.331  1000  b 

@jorainer
Copy link
Member

Hey @Adafede , thanks for the issue and PR! Just some general observations - I didn't dive into the C code yet.

@sgibb , the gnps similarity score is one of the most frequently used similarity measures at the moment. So, having this implemented in a faster and more efficient way would be great.

The hungarian optimization is the key component - and yes, avoiding the dependency on the clue package would be nice.

From your unit tests it seems that your gnps_r() implementation is already significantly faster than our MsCoreUtils::gnps() function - which is great! then you get another factor 2 with the complete C implementation. This might not sound thrilling at first, but if we compare thousands of spectra against each other it will pay off...

So, I would be actually OK with having a full implementation in C - and maybe the backup R implementations for testing/validation purposes?

@Adafede
Copy link
Author

Adafede commented Feb 18, 2025

Just not to steal anything here... the gnps_r() improvement was made by @sgibb! 😅

I did not open a PR for now as I first wanted to get some preliminary feedback. Also, I saw there already is a join.c in MsCoreUtils which I did not use and which should eventually be used to make everything more consistent...

@Adafede
Copy link
Author

Adafede commented Feb 18, 2025

I was also wondering why the relative gain was so different...and actually all methods scale very differently depending on n peaks.

So I changed the hungarian to a simpler auction algorithm that scales much better.

Here with different n's:

setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()

## Wrapper for the C routine "join_gnps": GNPS-style peak matching of two
## m/z vectors with precursor m/z and absolute/ppm tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}

## Wrapper for the C routine "gnps" (GNPS similarity score).
gnps <- function(x, y) .Call("gnps", x, y)

gnps_r <- function(x, y) {
  ## GNPS (modified cosine) similarity between two aligned peak matrices
  ## (columns: m/z, intensity), one row per peak match.
  ## FIX: removed a stray ")" from the error message.
  if (nrow(x) != nrow(y))
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only matches.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))

  l <- length(keep)
  if (!l)
    return(0)

  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]

  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)

  ## Map (possibly duplicated) m/z values to dense 1-based indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))

  ## Fill the l x l assignment matrix via column-major linear indices,
  ## then solve the optimal one-to-one assignment (Hungarian method).
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}

# Implementations scale very differently, choose one below
# n <- 8
n <- 64
# n <- 256

## NOTE(review): set.seed(123) is called before *both* sample() blocks, so
## the query and target spectra are drawn identically (x == y). Confirm
## this is intended; kept as-is to preserve the reported results.
set.seed(123)
query_spectrum <- x <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))
set.seed(123)
target_spectrum <- y <-
  cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))

query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5

## Pre-initialize result holders referenced by the comparison code.
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_

# Run benchmarks
benchmark_results <- microbenchmark(
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <-  MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    ## FIX: score this arm's *own* join result; the original indexed with
    ## `matches_mscore` left over from the mscore_impl arm.
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)

# Pairwise absolute differences between the scores of all implementations;
# all entries should be ~0 when the implementations agree.
score_diffs <- c(
  c_vs_mscore = abs(score_c - score_mscore),
  c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
  c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
  mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
  mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2),
  mscore_vs_mscore_2 = abs(score_mscore - score_mscore_2)
)

# Maximum tolerated floating-point difference between scores.
score_tolerance <- 1E-15

## Sort match pairs (by x, then re-sorted by y) so that independently
## produced match lists can be compared with identical().
sort_matches <- function(matches) {
  tbl <- as.data.frame(matches)
  tbl <- tidytable::arrange(tbl, x)
  tidytable::arrange(tbl, y)
}

sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## FIX: sort the *second* MsCoreUtils run; the original sorted
## `matches_mscore` again, making the mscore-vs-mscore_2 comparison
## below trivially TRUE.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)

## Pairwise identity of the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## FIX: key was misspelled "mscore_vs_mascore_2", so the corresponding
  ## message below printed an empty value.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)

message("Matches C vs MsCoreUtils identical: ",
        matches_equal$c_vs_mscore)
message("Matches C vs Hybrid 1 identical: ", matches_equal$c_vs_hybrid_1)
message("Matches C vs Hybrid 2 identical: ", matches_equal$c_vs_hybrid_2)
message("Matches MsCoreUtils vs Hybrid 1 identical: ",
        matches_equal$mscore_vs_hybrid_1)
message("Matches MsCoreUtils vs Hybrid 2 identical: ",
        matches_equal$mscore_vs_hybrid_2)
message("Matches MsCoreUtils vs MsCoreUtils 2 identical: ",
        matches_equal$mscore_vs_mscore_2)

# Debug mismatches
# Print the raw match pairs of any two implementations that disagree.
if (!all(unlist(matches_equal))) {
  message("\n🔍 **Mismatched Matches:**")
  check_mismatch <- function(a, b, name_a, name_b) {
    if (!identical(a, b)) {
      message("\n🔹 **",
              name_a,
              " Matches:** ",
              paste(a[[1]], "->", a[[2]], collapse = ", "))
      message("🔹 **",
              name_b,
              " Matches:** ",
              paste(b[[1]], "->", b[[2]], collapse = ", "))
    }
  }
  
  check_mismatch(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
  
  check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  check_mismatch(sorted_matches_mscore,
                 sorted_matches_mscore_2,
                 "MsCoreUtils",
                 "MsCoreUtils 2")
} else{
  message("✅ All joined indices are identical.")
}
# Report whether all score differences fall within the tolerance.
if (all(score_diffs <= score_tolerance)) {
  message("✅ All scores are nearly identical (within tolerance).")
} else {
  message("⚠️ Some scores differ significantly.")
}

message("\nScore Differences:")
print(score_diffs)

print(benchmark_results)

8 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr     min       lq       mean  median       uq       max neval cld
        c_impl   4.551   5.5350   7.129449   5.863   6.5600   508.400  1000 a  
 hybrid_impl_1  24.600  26.6910  40.539406  28.126  30.6680 10267.302  1000  b 
 hybrid_impl_2 119.802 125.4600 150.632975 134.070 145.9805  7641.785  1000   c
   mscore_impl 127.141 132.5325 149.552584 141.737 155.4925   450.426  1000   c
 mscore_impl_2  16.195  33.3740  37.700607  35.260  38.3965   295.241  1000  b 

64 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr     min       lq      mean   median      uq      max neval cld
        c_impl  34.645  37.5150  52.80726  39.0320  41.984 11617.43  1000 a  
 hybrid_impl_1  91.471 103.6480 168.21377 106.5590 112.094 12133.95  1000  b 
 hybrid_impl_2 265.680 275.6430 317.46222 280.7680 294.585 12108.74  1000   c
   mscore_impl 278.964 289.6855 334.53044 294.8925 311.354 12809.63  1000   c
 mscore_impl_2 107.707 116.6860 148.30569 119.8430 125.788 12697.00  1000  b 

256 peaks

All joined indices are identical.All scores are nearly identical (within tolerance).

       c_vs_mscore      c_vs_hybrid_1      c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2 
                 0                  0                  0                  0                  0                  0 

Unit: microseconds
          expr      min        lq      mean   median       uq       max neval  cld
        c_impl  353.707  375.1295  735.1191  386.343  421.808 208371.14  1000 a   
 hybrid_impl_1 5633.441 5896.7225 6397.7029 5962.979 6021.957  16706.23  1000  bc 
 hybrid_impl_2 6068.902 6354.2005 6764.8488 6435.606 6525.765  15647.94  1000  b d
   mscore_impl 6072.469 6370.1085 6973.2135 6457.561 6557.684 218731.31  1000    d
 mscore_impl_2 5635.901 5888.2355 6192.6607 5965.234 6036.840  16246.17  1000   c  

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants