-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Newbie goes C #131
Comments
@Adafede thank you for this great issue and suggestion. I never used it. I didn't look in detail in your C source, but I am reluctant to adopt such complex/extensive code because the maintenance becomes complicated. Especially the Hungarian method looks complex. I slightly modified the current gnps_r <- function(x, y) {
if (nrow(x) != nrow(y))
stop("'x' and 'y' are expected to have the same number of rows).")
## Scale intensities; !duplicated because we can have duplicated matches.
x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
## is 0 if only NAs in input - avoids division through 0
if (x_sum == 0 || y_sum == 0)
return(0)
## Keep only matches.
keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))
l <- length(keep)
if (!l)
return(0)
x <- x[keep, , drop = FALSE]
y <- y[keep, , drop = FALSE]
scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)
x_idx <- match(x[, 1L], unique(x[, 1L]))
y_idx <- match(y[, 1L], unique(y[, 1L]))
score_mat <- matrix(0.0, nrow = l, ncol = l)
score_mat[(y_idx - 1L) * l + x_idx] <- scores
best <- solve_LSAP(score_mat, maximum = TRUE)
sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
} With your C version and set.seed(123)
## Simulated test spectra: n sorted random m/z values with random intensities.
n <- 100
query_spectrum <- x <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))),
intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))),
intensity = as.double(sample(1:1e3, n)))
## Precursor m/z values and matching tolerances passed to join_gnps.
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5
## Initialize result holders (filled inside the benchmark expressions).
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_
# Run benchmarks
## Each arm pairs a join implementation with a scoring implementation.
## NOTE(review): `mscore_impl` writes `matches_mscore2`/`score_mscore2`
## while `mscore_impl2` writes `matches_mscore`/`score_mscore` — the "2"
## suffixes look swapped relative to the arm names; confirm intent.
benchmark_results <- microbenchmark(
c_impl = {
matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
},
hybrid_impl_1 = {
matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
},
hybrid_impl_2 = {
matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
score_hybrid_2 <- MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
},
mscore_impl = {
matches_mscore2 <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
score_mscore2 <- MsCoreUtils::gnps(x[matches_mscore2[[1]], ], y[matches_mscore2[[2]], ])
},
mscore_impl2 = {
matches_mscore <- unname(MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm))
score_mscore <- gnps_r(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
},
times = 1000
)
|
Hi again! Thank you for your feedback @sgibb! Your answer also made me realize/fix/improve a few things and now there is absolutely no score difference anymore with the reference implementation. Here is the update: setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()
## Thin wrapper around the compiled C routine "join_gnps": matches peaks
## between two m/z vectors `x` and `y`, also considering the difference of
## the precursor m/z values, with the given absolute and ppm tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}
## Dispatch to the compiled C implementation of the GNPS similarity score;
## `x` and `y` are two-column (m/z, intensity) matrices of matched peaks.
gnps <- function(x, y) .Call("gnps", x, y)
## Pure-R reference implementation of the GNPS similarity score.
##
## @param x,y two-column numeric matrices (m/z, intensity) of matched peaks;
##   rows correspond to matches and an `NA` m/z marks an unmatched peak.
## @return `numeric(1)`: the similarity score (0 when there are no matches
##   or the intensities sum to 0).
gnps_r <- function(x, y) {
  if (nrow(x) != nrow(y))
    ## BUG FIX: error message previously contained a stray ")".
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only rows where both spectra have a matching (non-NA) peak.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))
  l <- length(keep)
  if (l == 0L)
    return(0)
  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]
  ## Normalized geometric-mean intensity contribution of each match.
  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)
  ## Map (possibly duplicated) m/z values to dense matrix row/col indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  ## Hungarian method: one-to-one assignment maximizing the total score.
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}
## Simulated test spectra: n sorted random m/z values with random
## intensities; seed fixed for reproducibility. (Restyling avoided on
## purpose: the order of `sample()` calls determines the RNG draws.)
set.seed(123)
n <- 100
query_spectrum <- x <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))
target_spectrum <- y <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e3, n)))
## Precursor m/z values and matching tolerances passed to join_gnps.
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5
## Initialize result holders (filled inside the benchmark expressions).
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_
# Run benchmarks: each arm pairs a join implementation with a scoring
# implementation; the match/score results are reused for the checks below.
benchmark_results <- microbenchmark(
  ## Full C implementation (join + score).
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  ## C join + pure-R scoring.
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  ## C join + MsCoreUtils scoring.
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <- MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  ## Reference: MsCoreUtils join + score.
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  ## MsCoreUtils join + pure-R scoring.
  ## BUG FIX: the score was previously computed from `matches_mscore`
  ## (the other arm's result) instead of this arm's `matches_mscore_2`.
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)
## Pairwise absolute differences between the scores of the benchmark arms.
score_diffs <- abs(c(
  c_vs_mscore = score_c - score_mscore,
  c_vs_hybrid_1 = score_c - score_hybrid_1,
  c_vs_hybrid_2 = score_c - score_hybrid_2,
  mscore_vs_hybrid_1 = score_mscore - score_hybrid_1,
  mscore_vs_hybrid_2 = score_mscore - score_hybrid_2,
  mscore_vs_mscore_2 = score_mscore - score_mscore_2
))
## Maximum tolerated numeric deviation between implementations.
score_tolerance <- 1e-16
## Sort a match table for order-independent comparison: coerce to a
## data.frame, order by `x`, then by `y` (same call sequence as before,
## written without the pipe).
sort_matches <- function(matches) {
  tbl <- as.data.frame(matches)
  tbl <- tidytable::arrange(tbl, x)
  tidytable::arrange(tbl, y)
}
## Sorted copies of every arm's match table.
sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## BUG FIX: previously sorted `matches_mscore` a second time, so the
## "MsCoreUtils 2" comparison could never detect any difference.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)
## Pairwise identity checks between the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## BUG FIX: key was misspelled "mscore_vs_mascore_2", so the later
  ## `matches_equal$mscore_vs_mscore_2` lookup silently returned NULL.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)
## Report each pairwise identity check; the label-to-key table keeps the
## original message text and ordering, `local()` keeps the table out of
## the global environment.
local({
  report <- c(
    "Matches C vs MsCoreUtils identical: " = "c_vs_mscore",
    "Matches C vs Hybrid 1 identical: " = "c_vs_hybrid_1",
    "Matches C vs Hybrid 2 identical: " = "c_vs_hybrid_2",
    "Matches MsCoreUtils vs Hybrid 1 identical: " = "mscore_vs_hybrid_1",
    "Matches MsCoreUtils vs Hybrid 2 identical: " = "mscore_vs_hybrid_2",
    "Matches MsCoreUtils vs MsCoreUtils 2 identical: " = "mscore_vs_mscore_2"
  )
  for (label in names(report)) {
    message(label, matches_equal[[report[[label]]]])
  }
})
# Debug mismatches: when any pair of sorted match tables differs, print
# both sides of each differing pair; otherwise confirm agreement.
if (all(unlist(matches_equal))) {
  message("✅ All joined indices are identical.")
} else {
  message("\n🔍 **Mismatched Matches:**")
  ## Print the "x -> y" index pairs of both tables when they differ.
  show_pair_diff <- function(lhs, rhs, lhs_name, rhs_name) {
    if (identical(lhs, rhs)) return(invisible(NULL))
    message("\n🔹 **", lhs_name, " Matches:** ",
            paste(lhs[[1]], "->", lhs[[2]], collapse = ", "))
    message("🔹 **", rhs_name, " Matches:** ",
            paste(rhs[[1]], "->", rhs[[2]], collapse = ", "))
  }
  show_pair_diff(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
  show_pair_diff(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
  show_pair_diff(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
  show_pair_diff(sorted_matches_mscore, sorted_matches_mscore_2,
                 "MsCoreUtils", "MsCoreUtils 2")
}
## Final verdict on score agreement, then the raw differences.
if (any(score_diffs > score_tolerance)) {
  message("⚠️ Some scores differ significantly.")
} else {
  message("✅ All scores are nearly identical (within tolerance).")
}
message("\nScore Differences:")
print(score_diffs)
print(benchmark_results) ✅ All joined indices are identical.
✅ All scores are nearly identical (within tolerance).
c_vs_mscore c_vs_hybrid_1 c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2
0.000000e+00 2.775558e-17 0.000000e+00 2.775558e-17 0.000000e+00
Unit: microseconds
expr min lq mean median uq max neval cld
c_impl 28.700 31.652 33.81126 32.6155 33.9070 468.425 1000 a
hybrid_impl_1 45.018 48.708 61.70611 50.2660 52.3160 10291.779 1000 ab
hybrid_impl_2 153.586 158.137 167.51669 161.4785 168.3460 1027.419 1000 c
mscore_impl 165.025 171.462 196.84645 175.5210 182.5935 16979.043 1000 c
mscore_impl_2 57.933 62.607 82.67999 64.4930 67.2195 16346.331 1000 b |
Hey @Adafede , thanks for the issue and PR! Just some general observations - I didn't dive into the C code yet. @sgibb , the gnps similarity score is one of the most frequently used similarity measures at the moment. So, having this implemented in a faster and more efficient way would be great. The Hungarian optimization is the key component - and yes, avoiding the dependency on the clue package would be nice. From your unit tests it seems that your implementation gives equivalent results. So, I would be actually OK with having a full implementation in C - and maybe the backup R implementations for testing/validation purposes?
Just not to steal anything here... I did not open a PR for now as I first wanted to get some preliminary feedback. Also, I saw there already is a related one.
I was also wondering why the relative gain was so different... and actually all methods scale very differently depending on the number of peaks. So I changed the Hungarian method to a simpler auction algorithm that scales much better. Here with different n's: setwd("~/Git/tima")
library(microbenchmark)
devtools::load_all()
## Thin wrapper dispatching to the compiled C peak-matching routine
## "join_gnps"; matches peaks of the two m/z vectors, also considering
## the precursor m/z difference, with absolute and ppm tolerances.
join_gnps <- function(x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm) {
  .Call("join_gnps", x, y, xPrecursorMz, yPrecursorMz, tolerance, ppm)
}
## Wrapper for the compiled C GNPS similarity score; `x` and `y` are
## two-column (m/z, intensity) matrices of matched peaks.
gnps <- function(x, y) {
.Call("gnps", x, y)
}
## Pure-R reference implementation of the GNPS similarity score.
##
## @param x,y two-column numeric matrices (m/z, intensity) of matched peaks;
##   rows correspond to matches and an `NA` m/z marks an unmatched peak.
## @return `numeric(1)`: the similarity score (0 when there are no matches
##   or the intensities sum to 0).
gnps_r <- function(x, y) {
  if (nrow(x) != nrow(y))
    ## BUG FIX: error message previously contained a stray ")".
    stop("'x' and 'y' are expected to have the same number of rows.")
  ## Scale intensities; !duplicated because we can have duplicated matches.
  x_sum <- sum(x[!duplicated(x[, 1L]), 2L], na.rm = TRUE)
  y_sum <- sum(y[!duplicated(y[, 1L]), 2L], na.rm = TRUE)
  ## is 0 if only NAs in input - avoids division through 0
  if (x_sum == 0 || y_sum == 0)
    return(0)
  ## Keep only rows where both spectra have a matching (non-NA) peak.
  keep <- which(!is.na(x[, 1L]) & !is.na(y[, 1L]))
  l <- length(keep)
  if (l == 0L)
    return(0)
  x <- x[keep, , drop = FALSE]
  y <- y[keep, , drop = FALSE]
  ## Normalized geometric-mean intensity contribution of each match.
  scores <- sqrt(x[, 2L]) / sqrt(x_sum) * sqrt(y[, 2L]) / sqrt(y_sum)
  ## Map (possibly duplicated) m/z values to dense matrix row/col indices.
  x_idx <- match(x[, 1L], unique(x[, 1L]))
  y_idx <- match(y[, 1L], unique(y[, 1L]))
  score_mat <- matrix(0.0, nrow = l, ncol = l)
  score_mat[(y_idx - 1L) * l + x_idx] <- scores
  ## Hungarian method: one-to-one assignment maximizing the total score.
  best <- clue::solve_LSAP(score_mat, maximum = TRUE)
  sum(score_mat[(best - 1L) * l + seq_len(l)], na.rm = TRUE)
}
# Implementations scale very differently, choose one below
# n <- 8
n <- 64
# n <- 256
## Simulated test spectra: n sorted random m/z values, random intensities.
## NOTE(review): the seed is reset before sampling `y`, so `x` and `y`
## are identical spectra — presumably intentional (self-match); confirm.
set.seed(123)
query_spectrum <- x <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))
set.seed(123)
target_spectrum <- y <-
cbind(mz = as.double(sort(sample(1:1e3, size = n))), intensity = as.double(sample(1:1e6, n)))
## Precursor m/z values and matching tolerances passed to join_gnps.
query_precursor <- pmz_x <- 91.0
target_precursor <- pmz_y <- 105.0
tolerance <- 0.01
ppm <- 5
## Initialize result holders (filled inside the benchmark expressions).
matches_c <- matches_rcpp <- matches_hybrid_1 <- matches_hybrid_2 <- matches_mscore <- NULL
score_c <- score_rcpp <- score_hybrid_1 <- score_hybrid_2 <- score_mscore <- NA_real_
# Run benchmarks: each arm pairs a join implementation with a scoring
# implementation; the match/score results are reused for the checks below.
benchmark_results <- microbenchmark(
  ## Full C implementation (join + score).
  c_impl = {
    matches_c <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_c <- gnps(x[matches_c[[1]], ], y[matches_c[[2]], ])
  },
  ## C join + pure-R scoring.
  hybrid_impl_1 = {
    matches_hybrid_1 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_1 <- gnps_r(x[matches_hybrid_1[[1]], ], y[matches_hybrid_1[[2]], ])
  },
  ## C join + MsCoreUtils scoring.
  hybrid_impl_2 = {
    matches_hybrid_2 <- join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_hybrid_2 <- MsCoreUtils::gnps(x[matches_hybrid_2[[1]], ], y[matches_hybrid_2[[2]], ])
  },
  ## Reference: MsCoreUtils join + score.
  mscore_impl = {
    matches_mscore <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore <- MsCoreUtils::gnps(x[matches_mscore[[1]], ], y[matches_mscore[[2]], ])
  },
  ## MsCoreUtils join + pure-R scoring.
  ## BUG FIX: the score was previously computed from `matches_mscore`
  ## (the other arm's result) instead of this arm's `matches_mscore_2`.
  mscore_impl_2 = {
    matches_mscore_2 <- MsCoreUtils::join_gnps(x[, 1], y[, 1], pmz_x, pmz_y, tolerance, ppm)
    score_mscore_2 <- gnps_r(x[matches_mscore_2[[1]], ], y[matches_mscore_2[[2]], ])
  },
  times = 1000
)
## Pairwise absolute differences between the scores of the benchmark arms.
score_diffs <- c(
c_vs_mscore = abs(score_c - score_mscore),
c_vs_hybrid_1 = abs(score_c - score_hybrid_1),
c_vs_hybrid_2 = abs(score_c - score_hybrid_2),
mscore_vs_hybrid_1 = abs(score_mscore - score_hybrid_1),
mscore_vs_hybrid_2 = abs(score_mscore - score_hybrid_2),
mscore_vs_mscore_2 = abs(score_mscore - score_mscore_2)
)
## Maximum tolerated numeric deviation between implementations.
score_tolerance <- 1E-15
# Sort matches for easier comparison
## Coerces the match list to a data.frame and orders it by `x`, then by
## `y` (tidytable::arrange), so index ordering does not affect comparison.
sort_matches <- function(matches) {
matches |>
as.data.frame() |>
tidytable::arrange(x) |>
tidytable::arrange(y)
}
## Sorted copies of every arm's match table.
sorted_matches_c <- sort_matches(matches_c)
sorted_matches_hybrid_1 <- sort_matches(matches_hybrid_1)
sorted_matches_hybrid_2 <- sort_matches(matches_hybrid_2)
sorted_matches_mscore <- sort_matches(matches_mscore)
## BUG FIX: previously sorted `matches_mscore` a second time, so the
## "MsCoreUtils 2" comparison could never detect any difference.
sorted_matches_mscore_2 <- sort_matches(matches_mscore_2)
## Pairwise identity checks between the sorted match tables.
matches_equal <- list(
  c_vs_mscore = identical(sorted_matches_c, sorted_matches_mscore),
  c_vs_hybrid_1 = identical(sorted_matches_c, sorted_matches_hybrid_1),
  c_vs_hybrid_2 = identical(sorted_matches_c, sorted_matches_hybrid_2),
  mscore_vs_hybrid_1 = identical(sorted_matches_mscore, sorted_matches_hybrid_1),
  mscore_vs_hybrid_2 = identical(sorted_matches_mscore, sorted_matches_hybrid_2),
  ## BUG FIX: key was misspelled "mscore_vs_mascore_2", so the later
  ## `matches_equal$mscore_vs_mscore_2` lookup silently returned NULL.
  mscore_vs_mscore_2 = identical(sorted_matches_mscore, sorted_matches_mscore_2)
)
## Report the outcome of each pairwise identity check.
message("Matches C vs MsCoreUtils identical: ",
matches_equal$c_vs_mscore)
message("Matches C vs Hybrid 1 identical: ", matches_equal$c_vs_hybrid_1)
message("Matches C vs Hybrid 2 identical: ", matches_equal$c_vs_hybrid_2)
message("Matches MsCoreUtils vs Hybrid 1 identical: ",
matches_equal$mscore_vs_hybrid_1)
message("Matches MsCoreUtils vs Hybrid 2 identical: ",
matches_equal$mscore_vs_hybrid_2)
## NOTE(review): verify this key matches the name defined in
## `matches_equal` — a spelling mismatch there makes this lookup NULL,
## so the message would print without a TRUE/FALSE value.
message("Matches MsCoreUtils vs MsCoreUtils 2 identical: ",
matches_equal$mscore_vs_mscore_2)
# Debug mismatches
## When any pair of sorted match tables differs, print both sides of each
## differing pair as "x -> y" index strings; otherwise confirm agreement.
if (!all(unlist(matches_equal))) {
message("\n🔍 **Mismatched Matches:**")
## Print the match pairs of both tables when they are not identical.
check_mismatch <- function(a, b, name_a, name_b) {
if (!identical(a, b)) {
message("\n🔹 **",
name_a,
" Matches:** ",
paste(a[[1]], "->", a[[2]], collapse = ", "))
message("🔹 **",
name_b,
" Matches:** ",
paste(b[[1]], "->", b[[2]], collapse = ", "))
}
}
check_mismatch(sorted_matches_c, sorted_matches_mscore, "C", "MsCoreUtils")
check_mismatch(sorted_matches_c, sorted_matches_hybrid_1, "C", "Hybrid 1")
check_mismatch(sorted_matches_c, sorted_matches_hybrid_2, "C", "Hybrid 2")
check_mismatch(sorted_matches_mscore,
sorted_matches_mscore_2,
"MsCoreUtils",
"MsCoreUtils 2")
} else{
message("✅ All joined indices are identical.")
}
## Final verdict on score agreement, then the raw differences.
if (all(score_diffs <= score_tolerance)) {
message("✅ All scores are nearly identical (within tolerance).")
} else {
message("⚠️ Some scores differ significantly.")
}
message("\nScore Differences:")
print(score_diffs)
print(benchmark_results) 8 peaks✅ All joined indices are identical.
✅ All scores are nearly identical (within tolerance).
c_vs_mscore c_vs_hybrid_1 c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2
0 0 0 0 0 0
Unit: microseconds
expr min lq mean median uq max neval cld
c_impl 4.551 5.5350 7.129449 5.863 6.5600 508.400 1000 a
hybrid_impl_1 24.600 26.6910 40.539406 28.126 30.6680 10267.302 1000 b
hybrid_impl_2 119.802 125.4600 150.632975 134.070 145.9805 7641.785 1000 c
mscore_impl 127.141 132.5325 149.552584 141.737 155.4925 450.426 1000 c
mscore_impl_2 16.195 33.3740 37.700607 35.260 38.3965 295.241 1000 b 64 peaks✅ All joined indices are identical.
✅ All scores are nearly identical (within tolerance).
c_vs_mscore c_vs_hybrid_1 c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2
0 0 0 0 0 0
Unit: microseconds
expr min lq mean median uq max neval cld
c_impl 34.645 37.5150 52.80726 39.0320 41.984 11617.43 1000 a
hybrid_impl_1 91.471 103.6480 168.21377 106.5590 112.094 12133.95 1000 b
hybrid_impl_2 265.680 275.6430 317.46222 280.7680 294.585 12108.74 1000 c
mscore_impl 278.964 289.6855 334.53044 294.8925 311.354 12809.63 1000 c
mscore_impl_2 107.707 116.6860 148.30569 119.8430 125.788 12697.00 1000 b 256 peaks✅ All joined indices are identical.
✅ All scores are nearly identical (within tolerance).
c_vs_mscore c_vs_hybrid_1 c_vs_hybrid_2 mscore_vs_hybrid_1 mscore_vs_hybrid_2 mscore_vs_mscore_2
0 0 0 0 0 0
Unit: microseconds
expr min lq mean median uq max neval cld
c_impl 353.707 375.1295 735.1191 386.343 421.808 208371.14 1000 a
hybrid_impl_1 5633.441 5896.7225 6397.7029 5962.979 6021.957 16706.23 1000 bc
hybrid_impl_2 6068.902 6354.2005 6764.8488 6435.606 6525.765 15647.94 1000 b d
mscore_impl 6072.469 6370.1085 6973.2135 6457.561 6557.684 218731.31 1000 d
mscore_impl_2 5635.901 5888.2355 6192.6607 5965.234 6036.840 16246.17 1000 c |
Hi @sgibb and @jorainer!
I was kind of disappointed by the speed of the current implementation of
gnps
similarity, so I explored ways to make it more efficient. I first made different
Rcpp
attempts, which seemed okay-ish (to open a PR to MsCoreUtils
), but then realized I would need a full C implementation, without all the Armadillo optimizations etc. available. So, with an LLM friend, I tried my best and got the following: EDIT:
I first had another R issue in my code which made me feel overly negative about the implementation but now I think I got it...would you like me to open a PR with what is in
https://github.com/taxonomicallyinformedannotation/tima/tree/main/src
?
EDIT 2:
Now even without the 1E-16 inaccuracy 🚀 :
The text was updated successfully, but these errors were encountered: