Skip to content

Commit 3cc1cc2

Browse files
committed
Clear and prepare
1 parent 0e93a87 commit 3cc1cc2

File tree

4 files changed

+16
-177
lines changed

4 files changed

+16
-177
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,7 @@ print(decoded)
2929
- Encode unstable native
3030
- Multithread
3131
- Custom vocab
32+
- Implement cache for loaded encodings
33+
- Add/Improve documentation
3234
- Optimize performance
3335
- More testing

Sources/Tiktoken/CoreBPE.swift

+1-161
Original file line numberDiff line numberDiff line change
@@ -59,70 +59,7 @@ class CoreBPE {
5959
}
6060
}
6161

62-
private extension CoreBPE {
63-
// func _get_tl_regex() -> NSRegularExpression {
64-
// regexTls[hash_current_thread() % MAX_NUM_THREADS]
65-
// }
66-
//
67-
// func _get_tl_special_regex() -> NSRegularExpression {
68-
// specialRegexTls[hash_current_thread() % MAX_NUM_THREADS]
69-
// }
70-
// func encodeNative(text: String, allowedSpecial: Set<String>) -> ([Int], Int) {
71-
// let specialRegex = specialRegexTls.first!
72-
// let regex = regexTls.first!
73-
// var ret = [Int]()
74-
// var start = 0
75-
// var lastPieceTokenLen = 0
76-
//
77-
// var newEncoder = [[UInt8]: Int]()
78-
// encoder.forEach({
79-
// newEncoder[[UInt8]($0.key)] = $0.value
80-
// })
81-
//
82-
// while true {
83-
// var nextSpecial: NSTextCheckingResult?
84-
// var startFind = start
85-
//
86-
// while true {
87-
// // Find the next allowed special token, if any
88-
// nextSpecial = specialRegex.firstMatch(in: text, range: NSRange(startFind..<text.utf16.count))
89-
// if nextSpecial == nil { break }
90-
//
91-
// let range = Range(nextSpecial!.range)
92-
// if allowedSpecial.contains(text.substring(with: range!)) {
93-
// break
94-
// }
95-
// startFind = nextSpecial!.range.location + 1
96-
// }
97-
//
98-
// let end = nextSpecial?.range.location ?? text.utf16.count
99-
//
100-
// let currentText = text.substring(with: start..<end)
101-
// for mat in regex.matches(in: currentText, range: NSRange(text.startIndex..., in: currentText)) {
102-
//// let piece = mat.group(at: 0)!.asBytes()
103-
// let piece = Range(mat.range, in: text).map({ String(text[$0]) })?.data(using: .utf8) ?? Data() // WARNING
104-
// if let token = encoder[piece] {
105-
// lastPieceTokenLen = 1
106-
// ret.append(token)
107-
// continue
108-
// }
109-
// let tokens = bytePairEncode([UInt8](piece), encoder)
110-
// lastPieceTokenLen = tokens.count
111-
// ret.append(contentsOf: tokens)
112-
// }
113-
//
114-
// if let m = nextSpecial {
115-
// let range = Range(m.range)
116-
// let piece = text.substring(with: range!)
117-
// let token = specialTokensEncoder[piece]!
118-
// ret.append(token)
119-
// start = m.range.location + m.range.length
120-
// lastPieceTokenLen = 0
121-
// } else { break }
122-
// }
123-
// return (ret, lastPieceTokenLen)
124-
// }
125-
62+
private extension CoreBPE {
12663
func increaseLastPieceTokenLen(tokens: [Int], lastPieceTokenLen: Int) -> ([Int], Int) {
12764
func tokenIsAllSpace(_ token: Int) -> Bool {
12865
guard let tokenBytes = decoder[token] else { return false }
@@ -139,103 +76,6 @@ private extension CoreBPE {
13976
assert(lastPieceTokenLen <= tokens.count)
14077
return (tokens, lastPieceTokenLen)
14178
}
142-
143-
// func encodeUnstableNative(_ text: String, _ allowed_special: Set<String>) -> ([Int], Set<[Int]>) {
144-
// let (tokens1, lastPieceTokenLen1) = encodeNative(text: text, allowedSpecial: allowed_special)
145-
// guard lastPieceTokenLen1 > 0 else {
146-
// return (tokens1, Set())
147-
// }
148-
//
149-
// var (tokens, lastPieceTokenLen) = increaseLastPieceTokenLen(tokens: tokens1, lastPieceTokenLen: lastPieceTokenLen1)
150-
//
151-
// let unstableBytes = decodeNative(tokens: Array(tokens.suffix(lastPieceTokenLen)))
152-
// tokens.removeLast(lastPieceTokenLen)
153-
// var completions = Set<[Int]>()
154-
// guard !unstableBytes.isEmpty else {
155-
// return (tokens, completions)
156-
// }
157-
//
158-
// var newEncoder = [[UInt8]: Int]()
159-
// encoder.forEach({
160-
// newEncoder[[UInt8]($0.key)] = $0.value
161-
// })
162-
//
163-
// var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(unstableBytes.prefix(unstableBytes.count)) })
164-
// while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: unstableBytes) {
165-
// completions.insert([encoder[sortedTokenBytes[point]]!])
166-
// point += 1
167-
// }
168-
//
169-
// for i in 1..<unstableBytes.count {
170-
// let prefix = Array(unstableBytes.prefix(i))
171-
// let suffix = Array(unstableBytes.suffix(from: i))
172-
//// var point = sortedTokenBytes.partitionPoint { x in x < suffix }
173-
//
174-
// var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(suffix.prefix(suffix.count)) })
175-
// while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: suffix) {
176-
// let possibility = prefix + sortedTokenBytes[point]
177-
// let encoded: [Int]
178-
// do {
179-
// let s = try String(decoding: possibility, as: UTF8.self)
180-
// encoded = encodeOrdinaryNative(text: s)
181-
// } catch {
182-
// encoded = bytePairEncode(possibility, newEncoder)
183-
// }
184-
// var seq = [Int]()
185-
// var seqLen = 0
186-
// for token in encoded {
187-
// seq.append(token)
188-
// seqLen += decoder[token]!.count
189-
// if seqLen >= unstableBytes.count {
190-
// break
191-
// }
192-
// }
193-
// completions.insert(seq)
194-
// point += 1
195-
// }
196-
// }
197-
//
198-
//// if unstableBytes.count > 1 {
199-
//// let lastDecoded = unstableBytes.decodeLastUTF8()
200-
//// if unstableBytes.count - lastDecoded.1 > 0 && lastDecoded.0?.isWhitespace == true {
201-
//// var reencoded = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecoded.1)), newEncoder)
202-
//// reencoded += bytePairEncode(Array(unstableBytes.suffix(lastDecoded.1)), newEncoder)
203-
//// completions.insert(reencoded)
204-
//// }
205-
//// }
206-
////
207-
// if unstableBytes.count > 1 {
208-
// if let char = unstableBytes.last, let lastDecoded = String(utf8String: [CChar(char)]) {
209-
// let lastDecodedLength = unstableBytes.count - lastDecoded.count
210-
// if lastDecodedLength > 0 && lastDecoded.last?.isWhitespace == true {
211-
// let encoded1 = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecodedLength)), newEncoder)
212-
// let encoded2 = bytePairEncode(Array(unstableBytes.suffix(lastDecodedLength)), newEncoder)
213-
// let reencoded = Array(encoded1) + Array(encoded2)
214-
// completions.insert(reencoded)
215-
// }
216-
// }
217-
// }
218-
//
219-
//
220-
// return (tokens, completions)
221-
// }
222-
223-
// private func _increase_last_piece_token_len(_ tokens: [Int], _ lastPieceTokenLen: Int) -> ([Int], Int) {
224-
// var lastPieceTokenLen = lastPieceTokenLen
225-
// let tokenIsAllSpace: (Int) -> Bool = { token in
226-
// decoder[token]?.reversed().allSatisfy { b in [b" ", b"\n", b"\t"].contains(b) } ?? false
227-
// }
228-
//
229-
// if lastPieceTokenLen > 0 && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen]) {
230-
// while lastPieceTokenLen < tokens.count && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen - 1]) {
231-
// lastPieceTokenLen += 1
232-
// }
233-
// }
234-
//
235-
// assert(lastPieceTokenLen <= tokens.count)
236-
// return (tokens, lastPieceTokenLen)
237-
// }
238-
23979
}
24080

24181
// MARK: - Merges

Sources/Tiktoken/Tiktoken.swift

+11-11
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@ public struct Tiktoken {
1414
return encoding
1515
}
1616

17-
public func getEncoding(for vocab: Vocab) -> Encoding? {
18-
return nil
19-
}
20-
21-
public func register() {
22-
// TODO: Register model and Encoding
23-
}
24-
25-
public func clear() {
26-
// TODO: Clear all cached encoding
27-
}
17+
// public func getEncoding(for vocab: Vocab) -> Encoding? {
18+
// return nil
19+
// }
20+
//
21+
// public func register() {
22+
// // TODO: Register model and Encoding
23+
// }
24+
//
25+
// public func clear() {
26+
// // TODO: Clear all cached encoding
27+
// }
2828
}
2929

3030
private extension Tiktoken {

Tests/TiktokenTests/CoreBPETests.swift

+2-5
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,15 @@ final class CoreBPETests: XCTestCase {
5252
// let expected = [31373, 50169, 233, 995, 12520, 234, 235]
5353
//
5454
let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟"
55-
// let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
56-
57-
let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253]
55+
let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
5856

5957

6058
// let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿‍♀️ diferentes codificaciones 👨🏻‍💻♨︎"
6159
// let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236]
6260
//
6361
// let encoderGPT = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "")
64-
// let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
62+
let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
6563

66-
let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
6764

6865
// "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
6966
// let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu"))

0 commit comments

Comments
 (0)