-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsampling.js
149 lines (118 loc) · 3.96 KB
/
sampling.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/**
 * Length of a full-size sampling bin, in the same coordinate units as a
 * region's `start`/`end`. Regions at least this long are sampled first.
 */
const TARGET_BIN_SIZE = 10000;

/** First fallback bin length: half of the target bin (5000). */
const SECONDARY_BIN_SIZE = TARGET_BIN_SIZE / 2;

/** Second fallback bin length: a quarter of the target bin (2500). */
const TERTIARY_BIN_SIZE = SECONDARY_BIN_SIZE / 2;

/** Number of target-sized bins that `sample` tries to collect. */
const NUM_SAMPLES = 20;
/**
 * Chooses a set of bins to sample from the given regions.
 *
 * Regions are bucketed by length (`end - start`): at least TARGET_BIN_SIZE
 * ("ideal"), at least SECONDARY_BIN_SIZE, or at least TERTIARY_BIN_SIZE.
 * Anything shorter is ignored. When there are fewer than NUM_SAMPLES ideal
 * regions they are first split with `expandRegions`, and the split is used
 * only if it yields more regions than before.
 *
 * Up to NUM_SAMPLES target-sized bins are drawn from the ideal regions. Any
 * shortfall is then made up from the secondary and finally the tertiary
 * bucket. The shortfall is tracked in units of target-sized bins, so each
 * smaller bin only counts for its fraction of a target bin (a secondary bin
 * covers 1/2, a tertiary bin 1/4). This keeps the total sampled span as close
 * as possible to NUM_SAMPLES * TARGET_BIN_SIZE even when a fallback bucket is
 * only partially able to cover the gap.
 *
 * @param {Iterable<{rname: *, start: number, end: number}>} inRegions
 *   Candidate regions; not modified.
 * @returns {Array} The bins produced by `sampleFromRegions`, sorted by
 *   `name` and then by `start`.
 */
function sample(inRegions) {
  let idealRegions = [];
  const secondaryRegions = [];
  const tertiaryRegions = [];

  for (const region of inRegions) {
    const length = region.end - region.start;
    if (length >= TARGET_BIN_SIZE) {
      idealRegions.push(region);
    } else if (length >= SECONDARY_BIN_SIZE) {
      secondaryRegions.push(region);
    } else if (length >= TERTIARY_BIN_SIZE) {
      tertiaryRegions.push(region);
    }
  }

  if (idealRegions.length < NUM_SAMPLES) {
    const expanded = expandRegions(idealRegions);
    // Only adopt the expansion when it actually provides more candidates.
    if (expanded.length > idealRegions.length) {
      idealRegions = expanded;
    }
  }

  let sampledRegions = sampleFromRegions(idealRegions, NUM_SAMPLES, TARGET_BIN_SIZE);

  // Shortfall measured in target-sized bins; may become fractional once
  // smaller fallback bins have been added.
  let deficit = NUM_SAMPLES - sampledRegions.length;
  const fallbackTiers = [
    [secondaryRegions, SECONDARY_BIN_SIZE],
    [tertiaryRegions, TERTIARY_BIN_SIZE],
  ];
  for (const [pool, binSize] of fallbackTiers) {
    if (deficit <= 0) {
      break;
    }
    // readRatio is how many bins of this size equal one target-sized bin, so
    // requesting deficit * readRatio bins keeps the sampled span the same.
    const readRatio = Math.floor(TARGET_BIN_SIZE / binSize);
    const batch = sampleFromRegions(pool, Math.ceil(deficit * readRatio), binSize);
    sampledRegions = [...sampledRegions, ...batch];
    deficit -= batch.length / readRatio;
  }

  return sampledRegions.sort((a, b) => {
    if (a.name === b.name) {
      return (a.start < b.start) ? -1 : ((a.start > b.start) ? 1 : 0);
    }
    return (a.name < b.name) ? -1 : ((a.name > b.name) ? 1 : 0);
  });
}
/**
 * Runs `expandRegion` on each region and returns all resulting pieces in one
 * flat array, preserving the order of the input regions.
 *
 * @param {Iterable} regions Regions to expand; not modified.
 * @returns {Array} Concatenation of every `expandRegion(region)` result.
 */
function expandRegions(regions) {
  const pieces = [];
  for (const current of regions) {
    for (const piece of expandRegion(current)) {
      pieces.push(piece);
    }
  }
  return pieces;
}
/**
 * Takes a single region and breaks it into up to NUM_SAMPLES smaller regions,
 * each exactly TARGET_BIN_SIZE long and lying entirely inside the original
 * region.
 *
 * A region no longer than TARGET_BIN_SIZE cannot be split and is returned
 * unchanged as a one-element array. Every longer region yields at least one
 * bin, so it is never dropped.
 *
 * @param {{rname: *, start: number, end: number}} region Region to split;
 *   not modified.
 * @returns {Array<{rname: *, start: number, end: number}>} The generated
 *   bins (in random order), or `[region]` when the region is too short.
 */
function expandRegion(region) {
  const length = region.end - region.start;
  if (length <= TARGET_BIN_SIZE) {
    return [region];
  }

  const makeBin = (start) => ({
    rname: region.rname,
    start,
    end: start + TARGET_BIN_SIZE,
  });

  const bins = [];
  // Number of whole, non-overlapping bins that fit; always >= 1 here.
  const numBins = Math.floor(length / TARGET_BIN_SIZE);

  if (numBins < 1000) {
    // Small enough to enumerate every aligned bin and draw without
    // replacement, which guarantees no duplicates.
    const candidateStarts = [];
    for (let i = 0; i < numBins; i++) {
      // Offsets are relative to the region, not to coordinate 0.
      candidateStarts.push(region.start + i * TARGET_BIN_SIZE);
    }
    for (let i = 0; i < NUM_SAMPLES && candidateStarts.length > 0; i++) {
      const pick = Math.floor(Math.random() * candidateStarts.length);
      const [start] = candidateStarts.splice(pick, 1);
      bins.push(makeBin(start));
    }
  } else {
    // Too many candidates to enumerate: draw random starts instead. The
    // region is so much larger than the bins that collisions are unlikely,
    // but they are not ruled out.
    const maxOffset = Math.floor(length - TARGET_BIN_SIZE);
    for (let i = 0; i < NUM_SAMPLES; i++) {
      const offset = Math.floor(Math.random() * (maxOffset + 1));
      bins.push(makeBin(region.start + offset));
    }
  }
  return bins;
}
/**
 * Randomly picks up to `numSamples` distinct regions and cuts one window of
 * `binSize` from each, at a uniformly random position inside the region.
 *
 * Regions are drawn without replacement from a copy of `inRegions`, so the
 * input is left untouched and at most `inRegions.length` windows are
 * returned.
 *
 * @param {Iterable<{rname: *, start: number, end: number}>} inRegions
 *   Candidate regions; each picked region must be at least `binSize` long.
 * @param {number} numSamples Maximum number of windows to return.
 * @param {number} binSize Length of every returned window.
 * @returns {Array<{name: *, start: number, end: number}>} The sampled
 *   windows; `name` is taken from the source region's `rname`.
 * @throws {Error} If a picked region is shorter than `binSize` (or its
 *   length is not a number), since no window can fit inside it.
 */
function sampleFromRegions(inRegions, numSamples, binSize) {
  const pool = [...inRegions];
  const sampledRegions = [];

  for (let i = 0; i < numSamples && pool.length > 0; i++) {
    const pickIndex = Math.floor(Math.random() * pool.length);
    const [picked] = pool.splice(pickIndex, 1);

    // Largest offset at which a window still ends inside the region.
    const maxOffset = picked.end - picked.start - binSize;
    // Written as a negated >= so that a NaN length is rejected as well.
    if (!(maxOffset >= 0)) {
      throw new Error("Sampling error. This shouldn't happen.");
    }

    // floor(random * (n + 1)) gives every integer offset 0..n equal weight;
    // rounding would give the two end offsets only half the probability.
    const offset = Math.floor(Math.random() * (Math.floor(maxOffset) + 1));
    const start = picked.start + offset;
    sampledRegions.push({
      name: picked.rname,
      start,
      end: start + binSize,
    });
  }
  return sampledRegions;
}
// Public API of this module: only `sample` is exported; the other functions
// and the bin-size constants defined above stay module-private.
export {
sample,
};