@@ -59,70 +59,7 @@ class CoreBPE {
     }
 }
 
-private extension CoreBPE {
-//    func _get_tl_regex() -> NSRegularExpression {
-//        regexTls[hash_current_thread() % MAX_NUM_THREADS]
-//    }
-//
-//    func _get_tl_special_regex() -> NSRegularExpression {
-//        specialRegexTls[hash_current_thread() % MAX_NUM_THREADS]
-//    }
-//    func encodeNative(text: String, allowedSpecial: Set<String>) -> ([Int], Int) {
-//        let specialRegex = specialRegexTls.first!
-//        let regex = regexTls.first!
-//        var ret = [Int]()
-//        var start = 0
-//        var lastPieceTokenLen = 0
-//
-//        var newEncoder = [[UInt8]: Int]()
-//        encoder.forEach({
-//            newEncoder[[UInt8]($0.key)] = $0.value
-//        })
-//
-//        while true {
-//            var nextSpecial: NSTextCheckingResult?
-//            var startFind = start
-//
-//            while true {
-//                // Find the next allowed special token, if any
-//                nextSpecial = specialRegex.firstMatch(in: text, range: NSRange(startFind..<text.utf16.count))
-//                if nextSpecial == nil { break }
-//
-//                let range = Range(nextSpecial!.range)
-//                if allowedSpecial.contains(text.substring(with: range!)) {
-//                    break
-//                }
-//                startFind = nextSpecial!.range.location + 1
-//            }
-//
-//            let end = nextSpecial?.range.location ?? text.utf16.count
-//
-//            let currentText = text.substring(with: start..<end)
-//            for mat in regex.matches(in: currentText, range: NSRange(text.startIndex..., in: currentText)) {
-////                let piece = mat.group(at: 0)!.asBytes()
-//                let piece = Range(mat.range, in: text).map({ String(text[$0]) })?.data(using: .utf8) ?? Data() // WARNING
-//                if let token = encoder[piece] {
-//                    lastPieceTokenLen = 1
-//                    ret.append(token)
-//                    continue
-//                }
-//                let tokens = bytePairEncode([UInt8](piece), encoder)
-//                lastPieceTokenLen = tokens.count
-//                ret.append(contentsOf: tokens)
-//            }
-//
-//            if let m = nextSpecial {
-//                let range = Range(m.range)
-//                let piece = text.substring(with: range!)
-//                let token = specialTokensEncoder[piece]!
-//                ret.append(token)
-//                start = m.range.location + m.range.length
-//                lastPieceTokenLen = 0
-//            } else { break }
-//        }
-//        return (ret, lastPieceTokenLen)
-//    }
-
+private extension CoreBPE {
     func increaseLastPieceTokenLen(tokens: [Int], lastPieceTokenLen: Int) -> ([Int], Int) {
         func tokenIsAllSpace(_ token: Int) -> Bool {
             guard let tokenBytes = decoder[token] else { return false }
@@ -139,103 +76,6 @@ private extension CoreBPE {
         assert(lastPieceTokenLen <= tokens.count)
         return (tokens, lastPieceTokenLen)
     }
-
-//    func encodeUnstableNative(_ text: String, _ allowed_special: Set<String>) -> ([Int], Set<[Int]>) {
-//        let (tokens1, lastPieceTokenLen1) = encodeNative(text: text, allowedSpecial: allowed_special)
-//        guard lastPieceTokenLen1 > 0 else {
-//            return (tokens1, Set())
-//        }
-//
-//        var (tokens, lastPieceTokenLen) = increaseLastPieceTokenLen(tokens: tokens1, lastPieceTokenLen: lastPieceTokenLen1)
-//
-//        let unstableBytes = decodeNative(tokens: Array(tokens.suffix(lastPieceTokenLen)))
-//        tokens.removeLast(lastPieceTokenLen)
-//        var completions = Set<[Int]>()
-//        guard !unstableBytes.isEmpty else {
-//            return (tokens, completions)
-//        }
-//
-//        var newEncoder = [[UInt8]: Int]()
-//        encoder.forEach({
-//            newEncoder[[UInt8]($0.key)] = $0.value
-//        })
-//
-//        var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(unstableBytes.prefix(unstableBytes.count)) })
-//        while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: unstableBytes) {
-//            completions.insert([encoder[sortedTokenBytes[point]]!])
-//            point += 1
-//        }
-//
-//        for i in 1..<unstableBytes.count {
-//            let prefix = Array(unstableBytes.prefix(i))
-//            let suffix = Array(unstableBytes.suffix(from: i))
-////            var point = sortedTokenBytes.partitionPoint { x in x < suffix }
-//
-//            var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(suffix.prefix(suffix.count)) })
-//            while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: suffix) {
-//                let possibility = prefix + sortedTokenBytes[point]
-//                let encoded: [Int]
-//                do {
-//                    let s = try String(decoding: possibility, as: UTF8.self)
-//                    encoded = encodeOrdinaryNative(text: s)
-//                } catch {
-//                    encoded = bytePairEncode(possibility, newEncoder)
-//                }
-//                var seq = [Int]()
-//                var seqLen = 0
-//                for token in encoded {
-//                    seq.append(token)
-//                    seqLen += decoder[token]!.count
-//                    if seqLen >= unstableBytes.count {
-//                        break
-//                    }
-//                }
-//                completions.insert(seq)
-//                point += 1
-//            }
-//        }
-//
-////        if unstableBytes.count > 1 {
-////            let lastDecoded = unstableBytes.decodeLastUTF8()
-////            if unstableBytes.count - lastDecoded.1 > 0 && lastDecoded.0?.isWhitespace == true {
-////                var reencoded = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecoded.1)), newEncoder)
-////                reencoded += bytePairEncode(Array(unstableBytes.suffix(lastDecoded.1)), newEncoder)
-////                completions.insert(reencoded)
-////            }
-////        }
-////
-//        if unstableBytes.count > 1 {
-//            if let char = unstableBytes.last, let lastDecoded = String(utf8String: [CChar(char)]) {
-//                let lastDecodedLength = unstableBytes.count - lastDecoded.count
-//                if lastDecodedLength > 0 && lastDecoded.last?.isWhitespace == true {
-//                    let encoded1 = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecodedLength)), newEncoder)
-//                    let encoded2 = bytePairEncode(Array(unstableBytes.suffix(lastDecodedLength)), newEncoder)
-//                    let reencoded = Array(encoded1) + Array(encoded2)
-//                    completions.insert(reencoded)
-//                }
-//            }
-//        }
-//
-//
-//        return (tokens, completions)
-//    }
-
-//    private func _increase_last_piece_token_len(_ tokens: [Int], _ lastPieceTokenLen: Int) -> ([Int], Int) {
-//        var lastPieceTokenLen = lastPieceTokenLen
-//        let tokenIsAllSpace: (Int) -> Bool = { token in
-//            decoder[token]?.reversed().allSatisfy { b in [b" ", b"\n", b"\t"].contains(b) } ?? false
-//        }
-//
-//        if lastPieceTokenLen > 0 && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen]) {
-//            while lastPieceTokenLen < tokens.count && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen - 1]) {
-//                lastPieceTokenLen += 1
-//            }
-//        }
-//
-//        assert(lastPieceTokenLen <= tokens.count)
-//        return (tokens, lastPieceTokenLen)
-//    }
-
 }
 
 // MARK: - Merges
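
For reference, the whitespace rule that this commit keeps in increaseLastPieceTokenLen can be read as the self-contained sketch below. Passing the decoder table as a parameter is purely an illustrative assumption; in the file it is a stored property of CoreBPE mapping token ids to their UTF-8 bytes.

// A minimal standalone sketch of the retained logic, assuming a
// tiktoken-style decoder from token ids to UTF-8 byte sequences.
func increaseLastPieceTokenLen(tokens: [Int],
                               lastPieceTokenLen: Int,
                               decoder: [Int: [UInt8]]) -> ([Int], Int) {
    // A token is "all space" when every byte is space, newline, or tab.
    func tokenIsAllSpace(_ token: Int) -> Bool {
        guard let bytes = decoder[token] else { return false }
        return bytes.allSatisfy { [0x20, 0x0A, 0x09].contains($0) }
    }
    var len = lastPieceTokenLen
    // If the unstable last piece begins with a whitespace-only token,
    // grow it backwards over any preceding whitespace-only tokens:
    // BPE merges across trailing whitespace are not stable.
    if len > 0 && tokenIsAllSpace(tokens[tokens.count - len]) {
        while len < tokens.count && tokenIsAllSpace(tokens[tokens.count - len - 1]) {
            len += 1
        }
    }
    assert(len <= tokens.count)
    return (tokens, len)
}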