diff --git a/.gitignore b/.gitignore index 02882d3..feee3ff 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,5 @@ pkg .RData .Rhistory + +*.dataset diff --git a/lib/ms/lipid/search.rb b/lib/ms/lipid/search.rb index d2e099d..d4e680e 100644 --- a/lib/ms/lipid/search.rb +++ b/lib/ms/lipid/search.rb @@ -1,48 +1,92 @@ require 'ms/spectrum' require 'rserve/simpler' # TODO: move to integrated interface with rserve when available require 'core_ext/array/in_groups' +require 'ms/lipid/search/hit' module MS class Lipid class Search + + # A Search::Bin is a range that contains the *entire* query spectrum + # (not just the portion covered by the range). the query spectrum, and + # an EVD that describes the probability that a peak's delta to nearest + # peak is that small by chance. + class Bin < Range + # the intensity value of the query spectrum should be a query + attr_accessor :query_spectrum + attr_accessor :evd + + def initialize(range_obj, query_spectrum) + super(range_obj.begin, range_obj.end, range_obj.exclude_end?) + @query_spectrum = query_spectrum + end + + # returns the nearest num_hits MS::Lipid::Search::Hits sorted by delta + # [with tie going to the lower m/z] + def nearest_hits(mz, num_hits=1) + mzs = @query_spectrum.mzs + queries = @query_spectrum.intensities + index = @query_spectrum.find_nearest_index(mz) + _min = index - (num_nearest-1) + (_min >= 0) || (_min = 0) + _max = index + (num_nearest-1) + (_max < @query_spectrum.size) || (_max = @query_spectrum - 1) + delta_index_pairs = (_min.._max).map {|i| [mz.-(mzs[i]).abs, i] }.sort[0, num_to_return] + # this could be improved by updating the evd if it happens to jump + # to the next major Search::Bin + # of course, this has the advantage that the next nearest peaks will + # have the same probability distribution as the nearest peak. + + delta_index_pairs.map do |delta, index| + hit = Hit.new( :query => queries[index], :observed_mz => mz) + dev = (evd.type == :ppm) ? hit.ppm : hit.delta.abs + hit.pvalue = evd.pvalue(dev) + hit + end + end + + def to_range + Range.new( self.begin, self.end, self.exclude_end? ) + end + end + class EVD + DEFAULT_TYPE = :ppm EVD_R = Rserve::Simpler.new - attr_accessor :location, :scale, :shape # takes location, scale and shape parameters - def initialize(location, scale, shape) + attr_accessor :location, :scale, :shape + # type is :ppm or :amu + attr_accessor :type + def initialize(location, scale, shape, type=DEFAULT_TYPE) @location, @scale, @shape = location, scale, shape + @type = type end - # takes a ppm and returns the pvalue - # note that it will take the - def pvalue(ppm) - EVD_R.converse "pgev(log(#{ppm}), #{@location}, #{@scale}, #{@shape})" + + # takes a deviation and returns the pvalue + def pvalue(dev) + EVD_R.converse "pgev(log(#{dev}), #{@location}, #{@scale}, #{@shape})" end - # returns an EVD object - def self.ppms_to_evd(ppms) - %w(ismev evd).each do |lib| - reply = EVD_R.converse "library(#{lib})" - puts "REAPLY: #{reply}" - unless reply.size > 4 # ~roughly - $stderr.puts "The libraries ismev and evd must be installed in your R env!" - $stderr.puts "From within R:" - $stderr.puts %Q{install.packages("ismev") ; install.packages("evd")} - raise "must have R (rserve) and ismev and evd installed!" - end + def self.require_r_library(lib) + reply = EVD_R.converse "library(#{lib})" + puts "REAPLY: #{reply}" + unless reply.size > 4 # ~roughly + $stderr.puts "The libraries ismev and evd must be installed in your R env!" + $stderr.puts "From within R (works best if R is started with sudo or root for installing):" + $stderr.puts %Q{install.packages("ismev") ; install.packages("evd")} + raise "must have R (rserve) and ismev and evd installed!" end - #ppmsl = ppms.map {|v| Math::log(v) } - #p EVD_R.converse("cor(c(1,2,3), c(4,4,6))", :ppmsl_r => ppmsl ) - #params = EVD_R.converse("hist(ppms_r, 50)", :ppms_r => ppms ) - params = EVD_R.converse("m <- gev.fit(log(ppms_r)); c(m$mle[1], m$mle[2], m$mle[3])", :ppms_r => ppms ) - #EVD_R.command("svggev.diag(m)") - #EVD_R.pause - EVD.new(*params) end - #ppmsl = log(ppms) - #gev.fit(ppmsl) - # pgev(log10(#{TEST_PVAL}), model$mle[1], model$mle[2], model$mle[3]) + # returns an EVD object + def self.deviations_to_evd(type, devs) + %w(ismev evd).each {|lib| require_r_library(lib) } + params = EVD_R.converse("m <- gev.fit(log(devs_r))\n c(m$mle[1], m$mle[2], m$mle[3])", :devs_r => devs ) + EVD.new(*params, type) + end end + + STANDARD_MODIFICATIONS = { :proton => [1,2], :ammonium => [1], @@ -101,14 +145,13 @@ def create_search_spectrum(queries) MS::Spectrum.new([mzs, query_groups]) end - - def create_probability_function(queries, opts={}) opts = STANDARD_SEARCH.merge( opts ) - spec = create_search_spectrum(queries) + query_spectrum = create_search_spectrum(queries) + # make sure we get the right bin size based on the input - ss = spec.mzs.size ; optimal_num_groups = 1 + ss = query_spectrum.mzs.size ; optimal_num_groups = 1 (1..ss).each do |divisions| if (ss.to_f / divisions) >= opts[:prob_min_bincnt] optimal_num_groups = divisions @@ -116,33 +159,28 @@ def create_probability_function(queries, opts={}) end end - rng = Random.new - # create a mock spectrum of intensity 1 mz_ranges = [] - evds = spec.peaks.in_groups(optimal_num_groups,false).map do |peaks| - mz_range = Range.new( peaks.first.first, peaks.last.first ) - mz_ranges << mz_range - random_mzs = opts[:num_rand_samples_per_bin].times.map { rng.rand(mz_range) } - puts "*****************************" - puts "MZRANGE: " - p mz_range + last_peaks_group = nil + search_bins = query_spectrum.peaks.in_groups(optimal_num_groups,false).each_cons(2).map do |peaks1, peaks2| + bin = MS::Lipid::Search::Bin.new( Range.new(peaks1.first.first, peaks2.first.first, true), query_spectrum ) + last_peaks_group = peaks2 + bin + end + _range = Range.new(last_peaks_group.first.first, last_peaks_group.last.first) + search_bins << MS::Lipid::Search::Bin.new(_range, query_spectrum) # inclusive + + search_bins.map do |search_bin| + rng = Random.new + random_mzs = opts[:num_rand_samples_per_bin].times.map { rng.rand(search_bin.to_range) } # find the deltas diffs = random_mzs.map do |random_mz| - nearest_random_mz = spec.find_all_nearest(random_mz).first + nearest_random_mz = query_spectrum.find_nearest(random_mz) delta = (random_mz - nearest_random_mz).abs opts[:ppm] ? delta./(nearest_random_mz).*(1e6) : delta end - #File.write("data_#{mz_range}.dataset", "deltas\n" + diffs.join("\n")) - EVD.ppms_to_evd(diffs) - end - File.open("plot_evd_params_ppm_#{!!opt[:ppm]}.data",'w') do |out| - out.puts %w(mzrange loc scale shape).join("\t") - evds.zip(mz_ranges) do |evd, range| - out.puts [range, evd.location, evd.scale, evd.shape].join("\t") - end + EVD.deviations_to_evd(diffs, (opts[:ppm] ? :ppm : :amu)) end end - end end end