-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmscore_kgpu_thrust.cu
800 lines (719 loc) · 26.6 KB
/
mscore_kgpu_thrust.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
// -*- mode: c++ -*-
/*
* Copyright (c) 2012 Insilicos LLC
*
* A CUDA-based reimplementation of work which is
* Copyright (c) 2003-2006 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
#include <sstream>
#include <cuda.h>
#include <device_functions.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/unique.h>
#include <thrust/scan.h>
#include <thrust/binary_search.h>
#include <thrust/adjacent_difference.h>
#include <map>
#ifdef HAVE_MULTINODE_TANDEM // support for Hadoop and/or MPI?
#undef HAVE_MULTINODE_TANDEM // need to omit boost etc for NVCC's benefit
#endif
using namespace std; // this is evil but tandem codebase assumes
typedef map<unsigned long,double> SMap;
#include "mscore_kgpu_thrust.h"
typedef thrust::device_vector<float> fvec;
typedef thrust::device_vector<int> ivec;
typedef ivec::iterator iveciter;
typedef fvec::iterator fveciter;
void vmiTypeGPU::init(int s) {
fI = new thrust::device_vector<float>(s);
iM = new thrust::device_vector<int>(s);
}
void vmiTypeGPU::kill() {
    // Free both device vectors, then null the pointers so a repeated
    // kill() is a harmless no-op (delete NULL is defined to do nothing).
    delete fI;
    delete iM;
    fI = NULL;
    iM = NULL;
}
// helpers for hiding device memory implementation from normal C++
thrust::device_vector<float> *mscore_kgpu_thrust_fvec_alloc(int size) {
    // Heap-allocate a device vector so plain C++ callers can hold it as an
    // opaque pointer; pair with mscore_kgpu_thrust_fvec_kill().
    fvec *v = new fvec(size);
    return v;
}
// Zero every element of a vector in place (capacity unchanged). TODO - cudaMemSet?
#define ZEROVEC(f) thrust::fill((f).begin(),(f).end(),0)
// Release a vector's storage entirely (resize to 0, then give memory back).
// Wrapped in do{}while(0) so the two statements stay together when the macro
// is used in an unbraced if/else - the original form would silently run
// shrink_to_fit() unconditionally in that case.
#define CLEARVEC(f) do { (f).resize(0); (f).shrink_to_fit(); } while (0)
static void clear_largebuffers(); // defined below
// Resize a host or device vector, returning false on allocation failure.
// On failure we optionally free this module's large scratch buffers (unless
// they are locked in use - see largeBufferLockEngaged) and retry once.
template<typename vectT> bool RESIZEVEC(vectT &f,size_t sz,bool recursionOK=true) {
    try {
        f.resize(sz);
    }
    catch(exception &) {
        // Allocation failed - try to hose out some larger buffers and retry.
        if (recursionOK && !largeBufferLockEngaged)
        {
            clear_largebuffers();
            // BUG FIX: the original fell through to "return false" even when
            // this retry succeeded; report the retry's actual outcome.
            return RESIZEVEC(f,sz,false);
        }
        return false;
    }
    return true;
}
void mscore_kgpu_thrust_fvec_clear(thrust::device_vector<float> *vec) {
    // Overwrite every element with zero; size and capacity are untouched.
    thrust::fill(vec->begin(), vec->end(), 0);
}
// Free a device float vector created by mscore_kgpu_thrust_fvec_alloc.
// Deleting the wrapper releases the underlying device allocation too.
void mscore_kgpu_thrust_fvec_kill(thrust::device_vector<float> *vec) {
    delete vec;
}
// Free a device int vector; NULL is a safe no-op for delete.
void mscore_kgpu_thrust_ivec_kill(thrust::device_vector<int> *vec) {
    delete vec;
}
// convert a linear index to a row index
// Used as a key generator for reduce_by_key: element i of a row-major
// C-column matrix belongs to row i/C, so summing values with equal keys
// produces per-row sums.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns
    __host__ __device__
    linear_index_to_row_index(T _C) : C(_C) {}
    __host__ __device__
    T operator()(T i)
    {
        return i / C;
    }
};
#define MY_CUDA_STREAM 0 // all async transfers are issued on stream 0
// Copy a pinned host int vector up to the GPU, allocating the destination
// on first use.  The copy is asynchronous on MY_CUDA_STREAM; the caller is
// responsible for synchronizing before reading the device data.
thrust::device_vector<int> *mscore_kgpu_thrust_host_to_device_copy(
    const pinned_host_vector_int_t &host_src,
    thrust::device_vector<int> *device_dest) {
    if (device_dest == NULL) {
        device_dest = new thrust::device_vector<int>();
    }
    RESIZEVEC(*device_dest, host_src.size());
    const size_t nbytes = host_src.size() * sizeof(int);
    cudaMemcpyAsync(thrust::raw_pointer_cast(device_dest->data()),
                    host_src.data(),
                    nbytes,
                    cudaMemcpyHostToDevice,
                    MY_CUDA_STREAM);
    return device_dest;
}
// Copy a pinned host float vector into an existing device vector, resizing
// it to match.  Asynchronous on MY_CUDA_STREAM - caller must synchronize.
void mscore_kgpu_thrust_host_to_device_copy_float(
    const pinned_host_vector_float_t &host_src,
    thrust::device_vector<float> &device_dest) {
    RESIZEVEC(device_dest, host_src.size());
    const size_t nbytes = host_src.size() * sizeof(float);
    cudaMemcpyAsync(thrust::raw_pointer_cast(device_dest.data()),
                    host_src.data(),
                    nbytes,
                    cudaMemcpyHostToDevice,
                    MY_CUDA_STREAM);
}
// Pull a device float vector down into pinned host memory, resizing the
// destination.  Asynchronous on MY_CUDA_STREAM - caller must synchronize
// before touching host_dest.
void mscore_kgpu_thrust_device_to_host_copy_float(
    const thrust::device_vector<float> &device_src,
    pinned_host_vector_float_t &host_dest) {
    RESIZEVEC(host_dest, device_src.size());
    const size_t nbytes = device_src.size() * sizeof(float);
    cudaMemcpyAsync(thrust::raw_pointer_cast(host_dest.data()),
                    thrust::raw_pointer_cast(device_src.data()),
                    nbytes,
                    cudaMemcpyDeviceToHost,
                    MY_CUDA_STREAM);
}
// Pull a device int vector down into pinned host memory, resizing the
// destination.  Asynchronous on MY_CUDA_STREAM - caller must synchronize
// before touching host_dest.
void mscore_kgpu_thrust_device_to_host_copy_int(
    const thrust::device_vector<int> &device_src,
    pinned_host_vector_int_t &host_dest) {
    RESIZEVEC(host_dest, device_src.size());
    const size_t nbytes = device_src.size() * sizeof(int);
    cudaMemcpyAsync(thrust::raw_pointer_cast(host_dest.data()),
                    thrust::raw_pointer_cast(device_src.data()),
                    nbytes,
                    cudaMemcpyDeviceToHost,
                    MY_CUDA_STREAM);
}
#define HOSTCODE // __host__
// define transformation f(x) -> x^2
// Unary device functor; used with transform_reduce below to accumulate the
// sum of squared intensities.
struct square
{
    HOSTCODE __device__
    float operator()(float x)
    {
        return x * x;
    }
};
// f(m) -> m/divisor.  The reciprocal is computed once on construction so the
// per-element device operation is a multiply rather than a divide.
struct divideby
{
    const float m_factor; // 1/divisor
    HOSTCODE __device__
    divideby(float divisor) : m_factor (1.0f/divisor) {}
    HOSTCODE __device__
    float operator()(float m)
    {
        return (m*m_factor);
    }
};
// f(m) -> m*factor.  Used below to normalize intensities within a window.
struct multiplyby
{
    const float m_factor;
    HOSTCODE __device__
    multiplyby(float factor) : m_factor(factor) {}
    HOSTCODE __device__
    float operator()(float m)
    {
        return (m*m_factor);
    }
};
// Integer f(m) -> m*factor, usable as a transform-iterator functor
// (hence the thrust::unary_function base).
struct xform_multiplyby : public thrust::unary_function<int,int>
{
    const int m_factor;
    HOSTCODE __device__
    xform_multiplyby(int factor) : m_factor(factor) {}
    HOSTCODE __device__
    int operator()(int m)
    {
        return (m*m_factor);
    }
};
// Integer f(m) -> m+incr.  Used as a transform-iterator functor to shift
// scatter targets one bin left (incr=-1) or right (incr=+1).
struct xform_incrementby : public thrust::unary_function<int,int>
{
    const int m_incr;
    HOSTCODE __device__
    xform_incrementby(int incr) : m_incr (incr) {}
    HOSTCODE __device__
    int operator()(int m)
    {
        return (m+m_incr);
    }
};
// Integer f(m) -> m%mod, usable as a transform-iterator functor.
// NOTE(review): not referenced in this chunk - presumably used elsewhere.
struct xform_modby : public thrust::unary_function<int,int>
{
    const int m_mod;
    HOSTCODE __device__
    xform_modby(int m) : m_mod (m) {}
    HOSTCODE __device__
    int operator()(int m)
    {
        return (m%m_mod);
    }
};
/*
 * m/z binning: round mz/IsotopeCorrection to the nearest integer bin, then
 * shift by binOffset so bins are in local (near-zero-based) coordinates.
 */
struct imass_functor
{
    const float m_IsotopeCorrection; // stored as reciprocal so binning is a multiply
    const int m_binOffset;
    HOSTCODE __device__
    imass_functor(float IsotopeCorrection, int binOffset) :
        m_IsotopeCorrection((float)(1.0/IsotopeCorrection)),m_binOffset(binOffset) {}
    template <typename Tuple>
    HOSTCODE __device__
    void operator()(Tuple m) // m[0]=raw mz m[1]=binned mz
    {
        // +0.5f then truncate == round-to-nearest for positive mz
        thrust::get<1>(m) =
            ((int)((thrust::get<0>(m)*m_IsotopeCorrection) + 0.5f)) - m_binOffset;
    }
};
/*
 * mixrange
 * fI[i] = fi_scattered[i] - (sums[i+101]-sums[i])/101
 * <0> = <1> - (<2>-<3>)/101
 * i.e. subtract the 101-bin moving average (computed from an exclusive
 * prefix-sum array) from each intensity.
 */
struct mixrange_functor
{
    template <typename Tuple>
    HOSTCODE __device__
    void operator()(Tuple m)
    {
        thrust::get<0>(m) =
            thrust::get<1>(m) - ((thrust::get<2>(m)-thrust::get<3>(m))/101.0f);
    }
};
/*
 * resolve mass binning collisions
 * Applied over adjacent (mass,intensity) pairs: when two neighboring peaks
 * landed in the same bin, both intensity slots are set to the larger value,
 * so the later scatter keeps the max regardless of which element wins.
 */
struct imass_collision_functor {
    template <typename Tuple>
    HOSTCODE __device__
    void operator()(Tuple m) // mass a, mass b, intensity a, intensity b
    {
        if (thrust::get<1>(m) == thrust::get<0>(m)) { // bin collision
            if (thrust::get<2>(m) < thrust::get<3>(m)) { // take max intensity
                thrust::get<2>(m) = thrust::get<3>(m);
            } else {
                thrust::get<3>(m) = thrust::get<2>(m);
            }
        }
    }
};
// Tuple functor: <0> = <1> - <2>.
// NOTE(review): not referenced in this chunk - presumably used elsewhere.
struct a_equals_b_minus_c
{
    template <typename Tuple>
    HOSTCODE __device__
    void operator()(Tuple m)
    {
        thrust::get<0>(m) = thrust::get<1>(m)-thrust::get<2>(m);
    }
};
// Predicate for remove_if over (mass,intensity) zip tuples: true when the
// second element (the intensity) is <= 0.  Note it tests <1>, not <0>.
struct nonpositive_functor
{
    template <typename Tuple>
    HOSTCODE __device__
    bool operator()(Tuple m)
    {
        return (thrust::get<1>(m) <= 0);
    }
};
// define transformation f(x) -> sqrt(x)
// NOTE(review): with a float argument this resolves to the single-precision
// sqrt overload in device code; sqrtf would make that explicit.
struct squareroot
{
    HOSTCODE __device__
    float operator()(float x)
    {
        return sqrt(x);
    }
};
CUDA_TIMER_DECL(tScore);
// Preprocess one spectrum entirely on the GPU: bin m/z values, resolve bin
// collisions, sqrt + window-normalize intensities, make a unit vector, apply
// the 101-bin moving-average ("mixrange") correction, and compact away
// non-positive results into binned.iM/binned.fI.
// Assumes the input m/z values are sorted ascending (the lowest mass is read
// from *itfM and the high end is trimmed from the back) - TODO confirm.
void mscore_kgpu_thrust_score(
    // m/z-intensity pairs (input)
    const thrust::device_vector<float>::iterator itfM,
    const thrust::device_vector<float>::iterator itfI,
    size_t length, // iterator end distance
    double dIsotopeCorrection, // for m/z binning (input)
    int iWindowCount, // (input)
    int endMassMax, // max acceptable binned m/z (input)
    int &m_maxEnd, // max encountered binned m/z (output)
    vmiTypeGPU &binned) // binned intensities and m/z's (output)
{
    CUDA_TIMER_START(tScore)
    if (!length) {
        // NOTE(review): early return skips CUDA_TIMER_STOP - confirm the
        // timer macros tolerate an unmatched start.
        return;
    }
    thrust::device_vector<int> &iM = *binned.iM;
    thrust::device_vector<float> &fI = *binned.fI;
    // assumes binned was init()'d with size >= length - TODO confirm
    thrust::copy_n(itfI,length,fI.begin());
    // figure the shift needed for lowest binned mass and half smoothing window
    // (*itfM is a single device->host read of the first, lowest m/z)
    int startMass = (int)((*itfM*dIsotopeCorrection) + 0.5f);
    const int binOffset = startMass-50;
    endMassMax -= binOffset; // shift to local coords
    startMass -= binOffset;
    // bin the m/z's
    thrust::for_each(thrust::make_zip_iterator(
                         thrust::make_tuple(itfM, iM.begin())),
                     thrust::make_zip_iterator(
                         thrust::make_tuple(itfM+length, iM.end())),
                     imass_functor(dIsotopeCorrection,binOffset));
    // Screen peeks on upper end.
    // TODO faster way? thrust::lowerbound is slower than this
    // Walk back from the end dropping bins >= endMassMax (each itEnd[-1]
    // is a device->host read).
    iveciter itM = iM.begin();
    iveciter itEnd = iM.end();
    int endMass = itEnd[-1];
    int trimEnd=0;
    while (itM != itEnd && endMass >= endMassMax) {
        itEnd--;
        trimEnd++;
        endMass = itEnd[-1];
    }
    if (itM == itEnd) // No peaks left.
    {
        // NOTE(review): also returns without CUDA_TIMER_STOP - see above.
        return;
    }
    // if any bin has more than one occupant, choose the one with
    // higher intensity (compares each element with its right neighbor)
    thrust::for_each(
        thrust::make_zip_iterator(thrust::make_tuple(itM,
                                                     itM+1,
                                                     fI.begin(),
                                                     fI.begin()+1)),
        thrust::make_zip_iterator(thrust::make_tuple(itEnd-1,
                                                     itEnd,
                                                     fI.end()-(trimEnd+1),
                                                     fI.end()-trimEnd)),
        imass_collision_functor());
    // take sqrt of intensities
    thrust::transform(fI.begin(),fI.end()-trimEnd,fI.begin(),squareroot());
    fveciter itI = fI.begin(); // NOTE(review): unused local
    // for every pair fI,iM set templookup[iM]=fI
    thrust::device_vector<float> fI_scattered(endMassMax+50); // +50
                                                              // for
                                                              // smoothing
                                                              // window
    thrust::scatter(fI.begin(),fI.end()-trimEnd,iM.begin(),fI_scattered.begin());
    // report the highest bin seen, back in caller (global) coordinates
    if ((endMass+binOffset)+50 > m_maxEnd)
        m_maxEnd = (endMass+binOffset)+50;
    float fMaxI = *thrust::max_element(fI.begin(),fI.end()-trimEnd);
    float fMinCutoff = (float) (0.05 * fMaxI);
    // choose a window count based on the occupied m/z range
    int range = thrust::min(endMassMax, iWindowCount + endMass)-startMass;
    if (range > 3000)
        iWindowCount=10;
    else if (range > 2500)
        iWindowCount=9;
    else if (range > 2000)
        iWindowCount=8;
    else if (range > 1500)
        iWindowCount=7;
    else if (range > 1000)
        iWindowCount=6;
    else
        iWindowCount=5;
    int iWindowSize = range / iWindowCount;
    // TODO make this loop into a single call
    /*
     * Process input spectrum for dot product - split windows
     */
    for (int i = 0; i < iWindowCount; i++) {
        int iStart = startMass + i*iWindowSize;
        /*
         * Get maximum intensity within window
         */
        float fMaxWindowI = *thrust::max_element(
            fI_scattered.begin()+iStart,
            fI_scattered.begin()+(iStart+iWindowSize));
        if (fMaxWindowI > 0.0 && fMaxWindowI > fMinCutoff) {
            /*
             * Normalize within window
             */
            thrust::transform(
                fI_scattered.begin()+iStart,
                fI_scattered.begin()+(iStart + iWindowSize),
                fI_scattered.begin()+iStart,multiplyby(fMaxI / fMaxWindowI));
        }
    }
    /*
     * Reduce intensity and make unit vector by dividing
     * every point by sqrt(sum(x^2))
     */
    double dSpectrumArea = sqrt(thrust::transform_reduce(
                                    fI_scattered.begin()+startMass,
                                    fI_scattered.begin()+(endMass+1),
                                    square(),
                                    0.0f,
                                    thrust::plus<float>()));
    thrust::transform(fI_scattered.begin()+startMass,
                      fI_scattered.begin()+(endMass+1),
                      fI_scattered.begin()+startMass,
                      divideby(dSpectrumArea));
    /*
     * Perform mix-range modification to input spectrum
     */
    thrust::device_vector<float> sums(fI_scattered.size()+101);
    thrust::copy(fI_scattered.begin(),fI_scattered.end(),sums.begin()+50);
    thrust::exclusive_scan(sums.begin()+50, sums.end(), sums.begin()+50);
    // now sums[i+101]-sums[i] = sum(fI_scattered[i-50:i+50])
    RESIZEVEC(fI,fI_scattered.size());
    // fI[i] = fI_scattered[i] - (sums[i+101]-sums[i])/101
    thrust::for_each(thrust::make_zip_iterator(
                         thrust::make_tuple(fI.begin(),
                                            fI_scattered.begin(),
                                            sums.begin()+101,
                                            sums.begin())),
                     thrust::make_zip_iterator(
                         thrust::make_tuple(
                             fI.end(),
                             fI_scattered.end(),
                             sums.end(),
                             sums.end()-101)),
                     mixrange_functor());
    // now remove any non-positive results
    RESIZEVEC(iM,fI.size());
    thrust::sequence(iM.begin(),iM.end(),binOffset,1); // shift back
                                                       // to host
                                                       // coords
    thrust::zip_iterator<thrust::tuple<iveciter,fveciter> > new_end =
        thrust::remove_if(thrust::make_zip_iterator(
                              thrust::make_tuple(iM.begin(),fI.begin())),
                          thrust::make_zip_iterator(
                              thrust::make_tuple(iM.end(),fI.end())),
                          nonpositive_functor());
    // trim to new length
    iM.erase(thrust::get<0>(new_end.get_iterator_tuple()),iM.end());
    fI.erase(thrust::get<1>(new_end.get_iterator_tuple()),fI.end());
    binned.iM_max = iM.back();
    CUDA_TIMER_STOP(tScore)
}
/*
 * dot is the fundamental logic for scoring a peptide with a mass spectrum.
 * the mass spectrum is determined by the value of m_lId, which is its index
 * number in the m_vsmapMI vector. the sequence is represented by the values
 * that are currently held in m_plSeq (integer masses).
 */
struct dot_functor
{
    HOSTCODE __device__
    dot_functor() {}
    template <typename Tuple>
    HOSTCODE __device__
    void operator()(Tuple m) // 0=intensity, 1=sequence (0, .5, or
                             // 1) 2=used, 3=lcount
    {
        if (thrust::get<1>(m)) { // ion match?
            // intensity * ion contrib (1.0 for match, .5 for neighbor)
            float contrib = thrust::get<1>(m)*thrust::get<0>(m);
            if (thrust::get<2>(m) < contrib) {
                // lcount++ if direct ion match
                // (exact float compare is safe: slot <1> only ever holds
                // the scattered constants 0, 0.5 or 1.0)
                thrust::get<3>(m) += (1.0==thrust::get<1>(m));
                thrust::get<2>(m) = contrib; // m_used
            }
        }
    }
};
#define MAX_SEQLEN 200 // upper bound on ions per sequence (sizes ones/halves)
// Module-scope device scratch buffers, reused across mscore_kgpu_thrust_dot
// calls to avoid per-call allocation churn.  The four "large" ones
// (seq_hits, lcountsTmp, miUsedTmp, scatteredCopies) can be freed under
// memory pressure by clear_largebuffers() below.
thrust::device_vector<float> ones;   // constant 1.0 weights for exact ion hits
thrust::device_vector<float> halves; // constant 0.5 weights for neighbor bins
thrust::device_vector<float> seq_hits; // per-sequence hit templates
// seq_hits is a concatenation of lists each seq_hits_len long
thrust::device_vector<float> dScoresDev; // per-sequence dot scores (output staging)
thrust::device_vector<int> lcounts;    // per-sequence summed ion-match counts
thrust::device_vector<int> lCountsDev; // per-sequence incremental counts (output staging)
thrust::device_vector<int> lcountsTmp; // per-element match flags (large)
thrust::device_vector<float> miUsed;    // per-sequence summed used intensity
thrust::device_vector<float> miUsedTmp; // per-element used intensity (large)
thrust::device_vector<float> scatteredCopies; // spectrum copies, one per sequence (large)
thrust::device_vector<int> row_indices; // reduce_by_key key output (discarded)
//
// helps with the problem of large allocs
// preventing smaller ones - we can break
// those big ones up if needed
//
static bool largeBufferLockEngaged = false;
// RAII guard marking the large scratch buffers as in-use so that
// clear_largebuffers() (invoked from RESIZEVEC's out-of-memory fallback)
// will not free them out from under an active computation.
// BUG FIX: the constructor/destructor were private (class-default access),
// which made the guard impossible to instantiate; they must be public.
class LargeBufferLock {
public:
    LargeBufferLock() {
        largeBufferLockEngaged = true;
    }
    ~LargeBufferLock() {
        largeBufferLockEngaged = false;
    }
};
// Free the module's four large scratch buffers to relieve memory pressure,
// unless a LargeBufferLock has flagged them as currently in use.
// Called from RESIZEVEC when an allocation fails.
static void clear_largebuffers() {
    // clear out our larger buffers if possible
    if (!largeBufferLockEngaged) {
        CLEARVEC(lcountsTmp);
        CLEARVEC(seq_hits);
        CLEARVEC(miUsedTmp);
        CLEARVEC(scatteredCopies);
    }
}
// One-time setup: pre-build the constant weight vectors used when scattering
// sequence hit templates - "ones" (1.0) for exact ion bins and "halves" (0.5)
// for their neighbors.  available_memsize is currently unused.
void mscore_kgpu_thrust_init(size_t available_memsize) {
    RESIZEVEC(ones, MAX_SEQLEN);
    RESIZEVEC(halves, MAX_SEQLEN);
    thrust::fill(ones.begin(), ones.end(), 1.0);
    thrust::fill(halves.begin(), halves.end(), 0.5);
}
// perform dot on current spectrum and all sequence variations at one go
//
// lCountsResult/dScoresResult - per-sequence ion-match counts and dot scores (output)
// spectra - preprocessed spectra (see mscore_kgpu_thrust_score)
// sequenceCountAfterEachScorePreload - running sequence totals, one entry per spectrum
// cached_sequences - concatenated integer ion masses for all sequences
// vsequence_index - offsets of each sequence within cached_sequences
void mscore_kgpu_thrust_dot(pinned_host_vector_int_t &lCountsResult,
    pinned_host_vector_float_t &dScoresResult,
    const std::vector<const vmiTypeGPU *> &spectra,
    const std::vector<int> &sequenceCountAfterEachScorePreload,
    const thrust::device_vector<int> &cached_sequences,
    const std::vector<int> &vsequence_index)
{
    int lastEnd = 0;
    size_t nSeqTotal = vsequence_index.size()-1;
    RESIZEVEC(dScoresDev,nSeqTotal);
    RESIZEVEC(lCountsDev,nSeqTotal);
    RESIZEVEC(lcounts,nSeqTotal);
    RESIZEVEC(miUsed,nSeqTotal);
    RESIZEVEC(row_indices,nSeqTotal);
    // each per-sequence hit list is padded to a common stride wide enough
    // for the largest spectrum's highest bin
    int seq_hits_len = 0;
    for (int sp=(int)spectra.size();sp--;) {
        seq_hits_len = max(seq_hits_len,spectra[sp]->iM_max+1);
    }
    seq_hits_len = 32*(1+(seq_hits_len/32)); // nice even boundaries
    int len = seq_hits_len*nSeqTotal; // NOTE(review): int could overflow for huge runs
    int nSpectraPerChunk = (int)spectra.size();
    // BUG FIX: the original wrote "LargeBufferLock lock();", which C++ parses
    // as a function declaration (most vexing parse), so the lock was never
    // engaged.  Engage/release the flag explicitly instead so that
    // clear_largebuffers() cannot free buffers while we are using them.
    largeBufferLockEngaged = true;
    if ((int)lcountsTmp.size() < len) {
        while (!(RESIZEVEC(lcountsTmp,len) &&
                 RESIZEVEC(seq_hits,len) &&
                 RESIZEVEC(miUsedTmp,len) &&
                 RESIZEVEC(scatteredCopies,len)))
        {
            // too big for memory, break task into smaller chunks
            nSpectraPerChunk /= 2;
            nSpectraPerChunk += (int)(spectra.size()%2); // 999->500+499 instead of 499+499+1
            int maxSeqPerChunk=0;
            for (int spec=0;spec<(int)spectra.size();) {
                // determine worst case memory use
                int firstSeqInChunk = spec?sequenceCountAfterEachScorePreload[spec-1]:0;
                int nextspec = min((int)spectra.size(),spec+nSpectraPerChunk);
                int lastSeqInChunk = sequenceCountAfterEachScorePreload[nextspec-1];
                maxSeqPerChunk = max(maxSeqPerChunk,lastSeqInChunk-firstSeqInChunk);
                spec = nextspec;
            }
            len = seq_hits_len*maxSeqPerChunk;
        }
    }
    // in case of looping, use this to incrementally copy back into overall result space
    int devOffset = 0;
    for (int firstSpectraInChunk=0;firstSpectraInChunk<(int)spectra.size();firstSpectraInChunk+=nSpectraPerChunk)
    {
        // normally this loop just hits once, but written this way to subdivide
        // larger than memory items
        int firstSpectraNotInChunk = min((int)spectra.size(),firstSpectraInChunk+nSpectraPerChunk);
        ZEROVEC(seq_hits);
        ZEROVEC(scatteredCopies);
        ZEROVEC(lcountsTmp);
        ZEROVEC(miUsedTmp);
        int row = 0;
        int rowB = 0;
        for (int spec=firstSpectraInChunk;
             spec<firstSpectraNotInChunk;
             lastEnd=sequenceCountAfterEachScorePreload[spec++]) {
            int rowA = row;
            const int *sequence_index = &vsequence_index[lastEnd];
            int nSeq = (sequenceCountAfterEachScorePreload[spec]-lastEnd);
            const vmiTypeGPU &spectrum = *spectra[spec];
            // set up the .5 and 1.0 sequence hit multipliers
            for (int s=0;s<nSeq;s++ ) {
                int dist = sequence_index[s+1]-sequence_index[s];
                int seq_hits_offset = row++*seq_hits_len;
                thrust::device_vector<int>::const_iterator seq_begin =
                    cached_sequences.begin()+sequence_index[s];
                // note the .5 contributions for ions to left and right of actual ions
                thrust::scatter(
                    halves.begin(),
                    halves.begin()+dist,
                    thrust::make_transform_iterator(seq_begin,
                                                    xform_incrementby(-1)),
                    seq_hits.begin()+seq_hits_offset);
                thrust::scatter(
                    halves.begin(),
                    halves.begin()+dist,
                    thrust::make_transform_iterator(seq_begin,
                                                    xform_incrementby(1)),
                    seq_hits.begin()+seq_hits_offset);
                for (int ss=s+1;ss<nSeq;ss++ ) { // and make it an
                                                 // underlayment for
                                                 // following sequences
                    int seq_hits_offset = (rowA+ss)*seq_hits_len;
                    thrust::scatter(halves.begin(),
                                    halves.begin()+dist,
                                    thrust::make_transform_iterator(
                                        seq_begin, xform_incrementby(-1)),
                                    seq_hits.begin()+seq_hits_offset);
                    thrust::scatter(halves.begin(),
                                    halves.begin()+dist,
                                    thrust::make_transform_iterator(
                                        seq_begin,xform_incrementby(1)),
                                    seq_hits.begin()+seq_hits_offset);
                }
            }
            // second pass: 1.0 weights on the exact ion bins (overwrites any
            // .5 neighbor weight that landed on the same bin)
            for (int s=0;s<nSeq;s++ ) {
                int dist = sequence_index[s+1]-sequence_index[s];
                int seq_hits_offset = (rowA+s)*seq_hits_len;
                thrust::device_vector<int>::const_iterator seq_begin =
                    cached_sequences.begin()+sequence_index[s];
                // note the 1.0 contribution of actual ions
                thrust::scatter(ones.begin(),
                                ones.begin()+dist,
                                seq_begin,
                                seq_hits.begin()+seq_hits_offset);
                for (int ss=s+1;ss<nSeq;ss++ ) { // and make it an
                                                 // underlayment for
                                                 // following sequences
                    int seq_hits_offset = (rowA+ss)*seq_hits_len;
                    thrust::scatter(ones.begin(),
                                    ones.begin()+dist,
                                    seq_begin,
                                    seq_hits.begin()+seq_hits_offset);
                }
            }
            // now lay out a string of spectrum copies so we can do all sequences in one shot
            for (int s=0;s<nSeq;s++ ) {
                thrust::scatter(spectrum.fI->begin(),
                                spectrum.fI->end(),
                                spectrum.iM->begin(),
                                scatteredCopies.begin()+(rowB++*seq_hits_len));
            }
        } // end for each spectra
        // now find the hits - one pass over the whole chunk
        thrust::for_each(
            thrust::make_zip_iterator(
                thrust::make_tuple(
                    scatteredCopies.begin(),
                    seq_hits.begin(),
                    miUsedTmp.begin()+1,
                    lcountsTmp.begin()+1)),
            thrust::make_zip_iterator(
                thrust::make_tuple(
                    scatteredCopies.end(),
                    seq_hits.end(),
                    miUsedTmp.end(),
                    lcountsTmp.end())),
            dot_functor());
#ifdef ___DEBUG
        STDVECT(int,lcountsTmpLocal,lcountsTmp);
        printf("lcountsTmpLocal ");for (int nnn=0;nnn<lcountsTmpLocal.size();nnn++)if(lcountsTmpLocal[nnn])printf("%d,%d ",nnn,lcountsTmpLocal[nnn]);printf("\n\n");fflush(stdout);
#endif
        // and get total score
        // compute row sums by summing values with equal row indices
        int firstSeqInChunk = firstSpectraInChunk?sequenceCountAfterEachScorePreload[firstSpectraInChunk-1]:0;
        int nSeqInChunk = sequenceCountAfterEachScorePreload[firstSpectraNotInChunk-1] - firstSeqInChunk;
        // NOTE(review): this reduce reads lcountsTmp.begin() while the
        // dot_functor pass wrote lcountsTmp.begin()+1 - confirm the one-slot
        // offset is intentional (miUsedTmp below does use begin()+1)
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(seq_hits_len)),
                              thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(seq_hits_len)) + (seq_hits_len*(int)nSeqInChunk),
                              lcountsTmp.begin(),
                              row_indices.begin(),
                              lcounts.begin()+firstSeqInChunk,
                              thrust::equal_to<int>(),
                              thrust::plus<int>());
#ifdef ___DEBUG
        STDVECT(int,lcountss,lcounts);
        printf("lcounts ");for (int nnn=0;nnn<lcountss.size();nnn++)printf("%d,%d ",nnn,lcountss[nnn]);printf("\n\n");fflush(stdout);
#endif
        // now find sum of miUsedTmp for each seq
        // compute row sums by summing values with equal row indices
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(seq_hits_len)),
                              thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(seq_hits_len)) + (seq_hits_len*(int)nSeqInChunk),
                              miUsedTmp.begin()+1,
                              row_indices.begin(),
                              miUsed.begin()+firstSeqInChunk,
                              thrust::equal_to<int>(),
                              thrust::plus<float>());
#ifdef ___DEBUG
        STDVECT(float,miUsedTmpLocal,miUsed);
        printf("miUsed ");for (int nnn=0;nnn<miUsedTmpLocal.size();nnn++)if(miUsedTmpLocal[nnn])printf("%d,%f ",nnn,miUsedTmpLocal[nnn]);printf("\n\n");fflush(stdout);
#endif
        // miUsed[0:n] now contains sum of hits for each seq, but we want just the increment
        // lcounts[0:n] now contains count of 1.0 hits for each seq, but we want just the increment
        // (sequences within a spectrum are cumulative "underlayments", so
        // adjacent_difference recovers each sequence's own contribution)
        int start=0;
        for (int specc=firstSpectraInChunk; specc<firstSpectraNotInChunk; specc++) {
            int end = sequenceCountAfterEachScorePreload[specc] - devOffset;
            thrust::adjacent_difference(
                miUsed.begin()+start,
                miUsed.begin()+end,
                dScoresDev.begin()+start+devOffset);
            thrust::adjacent_difference(
                lcounts.begin()+start,
                lcounts.begin()+end,
                lCountsDev.begin()+start+devOffset);
            start = end;
        }
        devOffset += start; // in case we're looping
#ifdef ___DEBUG
        STDVECT(int,scatteredCopiesLocal,lCountsDev);
        printf("lcounts ");for (int nnn=0;nnn<scatteredCopiesLocal.size();nnn++)printf("%d,%d ",nnn,scatteredCopiesLocal[nnn]);printf("\n");fflush(stdout);
        STDVECT(float,dscoreLocal,dScoresDev);
        printf("dscores ");for (int nnn=0;nnn<dscoreLocal.size();nnn++)printf("%d,%f ",nnn,dscoreLocal[nnn]);printf("\n\n");fflush(stdout);
        int blah=0;blah++;
#endif
    }
    // copy back to host memory (async), then block until both copies land
    mscore_kgpu_thrust_device_to_host_copy_int(lCountsDev,lCountsResult);
    mscore_kgpu_thrust_device_to_host_copy_float(dScoresDev,dScoresResult);
    cudaStreamSynchronize(MY_CUDA_STREAM); // wait for memcpy to complete
    largeBufferLockEngaged = false; // release the large-buffer lock
    return;
}