Skip to content

Commit 2f33e68

Browse files
committed
Add decode
1 parent e068661 commit 2f33e68

10 files changed

+184
-164
lines changed

Package.swift

+1-2
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,13 @@ let package = Package(
1515
dependencies: [
1616
// Dependencies declare other packages that this package depends on.
1717
// .package(url: /* package url */, from: "1.0.0"),
18-
.package(url: "https://github.com/ChanTsune/SwiftyPyString.git", from: "2.2.0")
1918
],
2019
targets: [
2120
// Targets are the basic building blocks of a package. A target can define a module or a test suite.
2221
// Targets can depend on other targets in this package, and on products in packages this package depends on.
2322
.target(
2423
name: "Tiktoken",
25-
dependencies: ["SwiftyPyString"]),
24+
dependencies: []),
2625
.testTarget(
2726
name: "TiktokenTests",
2827
dependencies: ["Tiktoken"]),

Sources/Tiktoken/CoreBPE.swift

+16-58
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,15 @@ import Foundation
1010
class CoreBPE {
1111
private let encoder: [[UInt8]: Int]
1212
private let specialTokensEncoder: [String: Int]
13-
private let decoder: [Int: Data]
13+
private let decoder: [Int: [UInt8]]
1414
private let specialTokensDecoder: [Int: Data]
1515
private let regexTls: [NSRegularExpression]
1616
private let specialRegexTls: [NSRegularExpression]
1717
private let sortedTokenBytes: [Data]
1818

1919
init(encoder: [[UInt8] : Int] = .init(),
2020
specialTokensEncoder: [String : Int] = .init(),
21-
decoder: [Int : Data] = .init(),
21+
decoder: [Int : [UInt8]] = .init(),
2222
specialTokensDecoder: [Int : Data] = .init(),
2323
regexTls: [NSRegularExpression] = .init(),
2424
specialRegexTls: [NSRegularExpression] = .init(),
@@ -35,13 +35,8 @@ class CoreBPE {
3535
func encodeOrdinaryNative(text: String) -> [Int] {
3636
let regex = regexTls.first!
3737
var ret = [Int]()
38-
// var newEncoder = [[UInt8]: Int]()
39-
// encoder.forEach({
40-
// newEncoder[[UInt8]($0.key)] = $0.value
41-
// })
4238
for mat in regex.matches(in: text, range: NSRange(text.startIndex..., in: text)) {
4339
if let range = Range(mat.range, in: text) {
44-
// if let piece = Range(mat.range, in: text).map({ String(text[$0]) })?.data(using: .utf8) {
4540
let piece = Array(text[range].utf8)
4641
if let token = encoder[piece] {
4742
ret.append(token)
@@ -53,6 +48,15 @@ class CoreBPE {
5348
}
5449
return ret
5550
}
51+
52+
/// Decodes a sequence of token ids back into text.
///
/// Each id is looked up in `decoder` first and then in `specialTokensDecoder`;
/// ids found in neither table contribute no bytes. The concatenated bytes are
/// decoded as UTF-8 with invalid sequences replaced by U+FFFD, so a token list
/// that ends in the middle of a multi-byte character still yields the readable
/// part of the text instead of an empty string.
///
/// - Parameter tokens: Token ids to decode, in order.
/// - Returns: The decoded text.
func decodeNative(tokens: [Int]) -> String {
    var bytes: [UInt8] = []
    for token in tokens {
        if let tokenBytes = decoder[token] {
            bytes.append(contentsOf: tokenBytes)
        } else if let specialBytes = specialTokensDecoder[token] {
            // Special tokens live in their own table; the ordinary table never contains them.
            bytes.append(contentsOf: specialBytes)
        }
    }
    // Lossy decoding: repairs invalid UTF-8 rather than discarding the whole result.
    return String(decoding: bytes, as: UTF8.self)
}
5660
}
5761

5862
private extension CoreBPE {
@@ -63,37 +67,6 @@ private extension CoreBPE {
6367
// func _get_tl_special_regex() -> NSRegularExpression {
6468
// specialRegexTls[hash_current_thread() % MAX_NUM_THREADS]
6569
// }
66-
67-
func decodeNative(tokens: [Int]) -> Data {
68-
var data = Data()
69-
data.reserveCapacity(tokens.count * 2)
70-
71-
for token in tokens {
72-
guard let tokenBytes = decoder[token] ?? specialTokensDecoder[token] else { break }
73-
data.append(tokenBytes)
74-
}
75-
return data
76-
}
77-
78-
// func encodeOrdinaryNative(text: String) -> [Int] {
79-
// let regex = regexTls.first!
80-
// var ret = [Int]()
81-
// var newEncoder = [[UInt8]: Int]()
82-
// encoder.forEach({
83-
// newEncoder[[UInt8]($0.key)] = $0.value
84-
// })
85-
// for mat in regex.matches(in: text, range: NSRange(text.startIndex..., in: text)) {
86-
// let piece = Range(mat.range, in: text).map({ String(text[$0]) })?.data(using: .utf8) ?? Data() // WARNING
87-
// if let token = encoder[piece] {
88-
// ret.append(token)
89-
// continue
90-
// }
91-
//
92-
// ret.append(contentsOf: bytePairEncode([UInt8](piece), newEncoder))
93-
// }
94-
// return ret
95-
// }
96-
9770
// func encodeNative(text: String, allowedSpecial: Set<String>) -> ([Int], Int) {
9871
// let specialRegex = specialRegexTls.first!
9972
// let regex = regexTls.first!
@@ -331,22 +304,8 @@ private extension CoreBPE {
331304
}
332305
}
333306

334-
var out = [T]()
335-
out.reserveCapacity(parts.count - 1)
336-
// for i in 0..<(parts.count - 1) {
337-
// out.append(completion(parts[i].0..<parts[i + 1].0))
338-
// }
339-
340307
// TODO: Use ranks
341-
parts.prevCurrent({
342-
// if let result = completion($0.0..<$1.0) {
343-
// out.append(result)
344-
// }
345-
346-
let result = completion($0.0..<$1.0)
347-
out.append(result)
348-
})
349-
return out
308+
return parts.prevCurrent({ completion($0.0..<$1.0) })
350309
}
351310

352311
func bytePairEncode(_ piece: [UInt8], _ ranks: [[UInt8]: Int]) -> [Int] {
@@ -355,7 +314,6 @@ private extension CoreBPE {
355314
}
356315
return bytePairMerge(piece, ranks, completion: { p in
357316
let chunk = Array(piece[p])
358-
let characters = chunk.map({ Array(Character(Int($0)).utf8) }).flatMap({ $0 })
359317
return ranks[chunk] ?? 0
360318
})
361319
}
@@ -369,11 +327,11 @@ private extension CoreBPE {
369327
}
370328

371329
extension Array {
    /// Calls `body` with every pair of adjacent elements and collects the results.
    ///
    /// For `[a, b, c]` the closure is invoked as `body(a, b)` and then `body(b, c)`.
    ///
    /// - Parameter body: Receives `(previous, current)` for each adjacent pair, in order.
    /// - Returns: One result per adjacent pair; empty when the array has fewer than two elements.
    /// - Throws: Rethrows the first error thrown by `body`.
    func prevCurrent<T>(_ body: (Element, Element) throws -> T) rethrows -> [T] {
        // Pairing the array with itself shifted by one yields exactly the adjacent pairs.
        // Using `try` (not `try?`) keeps the `rethrows` contract: errors propagate instead
        // of silently dropping the pair from the output.
        try zip(self, dropFirst()).map { try body($0, $1) }
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//
2+
// Character+Int.swift
3+
//
4+
//
5+
// Created by Alberto Espinilla Garrido on 2/4/23.
6+
//
7+
8+
import Foundation
9+
10+
extension Character {
    /// Creates the character for the Unicode scalar value `i`.
    ///
    /// Values that are not valid Unicode scalars (negative numbers, surrogate
    /// code points, anything above U+10FFFF) produce U+FFFD REPLACEMENT CHARACTER
    /// instead of trapping.
    ///
    /// - Parameter i: The Unicode scalar value.
    init(_ i: Int) {
        let scalar: Unicode.Scalar = Unicode.Scalar(i) ?? "\u{FFFD}"
        self.init(scalar)
    }

    /// `true` when at least one of the character's Unicode scalars is printable.
    var isPrintable: Bool {
        unicodeScalars.contains(where: { $0.isPrintable })
    }
}

extension Unicode.Scalar {
    /// `false` for scalars whose general category is `control` or `format`;
    /// `true` for every other category.
    var isPrintable: Bool {
        switch properties.generalCategory {
        case .control, .format: return false
        default: return true
        }
    }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//
2+
// String+Base64.swift
3+
//
4+
//
5+
// Created by Alberto Espinilla Garrido on 2/4/23.
6+
//
7+
8+
import Foundation
9+
10+
extension String {
    /// Base64-encodes the UTF-8 bytes of this string.
    ///
    /// - Returns: The base64 text, or `nil` if the string cannot be converted to UTF-8 data.
    func base64Encoded() -> String? {
        guard let utf8Data = data(using: .utf8) else { return nil }
        return utf8Data.base64EncodedString()
    }

    /// Treats this string as base64 and decodes the resulting bytes as ASCII text.
    ///
    /// - Returns: The decoded text, or `nil` when the string is not valid base64 or
    ///   the bytes cannot be read with the ASCII encoding.
    ///
    /// NOTE(review): encoding uses UTF-8 while decoding uses ASCII, so non-ASCII text
    /// presumably does not round-trip through these two methods — confirm this is intended.
    func base64Decoded() -> String? {
        Data(base64Encoded: self).flatMap { decodedBytes in
            String(data: decodedBytes, encoding: .ascii)
        }
    }
}

Sources/Tiktoken/String+Substring.swift Sources/Tiktoken/Extensions/String+Substring.swift

+4
Original file line numberDiff line numberDiff line change
@@ -27,4 +27,8 @@ extension String {
2727
let endIndex = index(from: r.upperBound)
2828
return String(self[startIndex..<endIndex])
2929
}
30+
31+
/// The pieces of this string separated by the space character (`" "`).
///
/// Only U+0020 acts as a separator; runs of consecutive spaces produce no empty pieces.
var splitWhiteSpaces: [String] {
    var pieces: [String] = []
    for piece in split(separator: " ") {
        pieces.append(String(piece))
    }
    return pieces
}
3034
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
//
2+
// String+UInt8.swift
3+
//
4+
//
5+
// Created by Alberto Espinilla Garrido on 3/4/23.
6+
//
7+
8+
import Foundation
9+
10+
extension String {
    /// The string's contents as bytes.
    ///
    /// When every UTF-16 code unit fits in a single byte (ASCII / Latin-1 text), each
    /// code unit becomes one byte. If any code unit is larger than 0xFF the UTF-8
    /// encoding of the whole string is returned instead, because `UInt8(_:)` would
    /// trap on such a value.
    var uInt8: [UInt8] {
        var bytes: [UInt8] = []
        bytes.reserveCapacity(utf16.count)
        for unit in utf16 {
            // `UInt8(exactly:)` is nil for code units above 0xFF.
            guard let byte = UInt8(exactly: unit) else { return Array(utf8) }
            bytes.append(byte)
        }
        return bytes
    }
}

Sources/Tiktoken/FileDecoder.swift

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//
2+
// FileDecoder.swift
3+
//
4+
//
5+
// Created by Alberto Espinilla Garrido on 3/4/23.
6+
//
7+
8+
import Foundation
9+
10+
/// Parses text in which every line holds a base64-encoded key and an integer value
/// separated by spaces.
struct FileDecoder {
    /// Builds a table from decoded key bytes to integer value.
    ///
    /// The first space-separated field of each line is base64-decoded into the key
    /// bytes; the last field is parsed as the integer value. Lines whose key is not
    /// valid base64 or whose value is not an integer are skipped.
    ///
    /// - Parameter data: The file contents as UTF-8 text; LF and CRLF line endings are accepted.
    /// - Returns: The parsed table, or an empty dictionary when `data` is not valid UTF-8.
    func decode(_ data: Data) -> [[UInt8]: Int] {
        guard let text = String(data: data, encoding: .utf8) else { return [:] }
        var table: [[UInt8]: Int] = [:]
        // `"\r\n"` is a single Character, so splitting on "\n" alone would miss CRLF endings.
        for line in text.split(whereSeparator: { $0.isNewline }) {
            let fields = line.split(separator: " ")
            guard let encodedKey = fields.first,
                  let valueField = fields.last,
                  // Take the bytes straight from the base64 payload; a detour through
                  // String would depend on how bytes >= 0x80 are decoded.
                  let keyData = Data(base64Encoded: String(encodedKey)),
                  let value = Int(valueField)
            else {
                continue
            }
            table[[UInt8](keyData)] = value
        }
        return table
    }
}

0 commit comments

Comments
 (0)