Skip to content

Commit 3cc1cc2

Browse files
committed
Clear and prepare
1 parent 0e93a87 commit 3cc1cc2

File tree

4 files changed

+16
-177
lines changed

4 files changed

+16
-177
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,7 @@ print(decoded)
2929
- Encode unstable native
3030
- Multithread
3131
- Custom vocab
32+
- Implement cache for loaded encodings
33+
- Add/Improve documentation
3234
- Optimize performance
3335
- More testing

Sources/Tiktoken/CoreBPE.swift

+1-161
Original file line numberDiff line numberDiff line change
@@ -59,70 +59,7 @@ class CoreBPE {
5959
}
6060
}
6161

62-
private extension CoreBPE {
63-
// func _get_tl_regex() -> NSRegularExpression {
64-
// regexTls[hash_current_thread() % MAX_NUM_THREADS]
65-
// }
66-
//
67-
// func _get_tl_special_regex() -> NSRegularExpression {
68-
// specialRegexTls[hash_current_thread() % MAX_NUM_THREADS]
69-
// }
70-
// func encodeNative(text: String, allowedSpecial: Set<String>) -> ([Int], Int) {
71-
// let specialRegex = specialRegexTls.first!
72-
// let regex = regexTls.first!
73-
// var ret = [Int]()
74-
// var start = 0
75-
// var lastPieceTokenLen = 0
76-
//
77-
// var newEncoder = [[UInt8]: Int]()
78-
// encoder.forEach({
79-
// newEncoder[[UInt8]($0.key)] = $0.value
80-
// })
81-
//
82-
// while true {
83-
// var nextSpecial: NSTextCheckingResult?
84-
// var startFind = start
85-
//
86-
// while true {
87-
// // Find the next allowed special token, if any
88-
// nextSpecial = specialRegex.firstMatch(in: text, range: NSRange(startFind..<text.utf16.count))
89-
// if nextSpecial == nil { break }
90-
//
91-
// let range = Range(nextSpecial!.range)
92-
// if allowedSpecial.contains(text.substring(with: range!)) {
93-
// break
94-
// }
95-
// startFind = nextSpecial!.range.location + 1
96-
// }
97-
//
98-
// let end = nextSpecial?.range.location ?? text.utf16.count
99-
//
100-
// let currentText = text.substring(with: start..<end)
101-
// for mat in regex.matches(in: currentText, range: NSRange(text.startIndex..., in: currentText)) {
102-
//// let piece = mat.group(at: 0)!.asBytes()
103-
// let piece = Range(mat.range, in: text).map({ String(text[$0]) })?.data(using: .utf8) ?? Data() // WARNING
104-
// if let token = encoder[piece] {
105-
// lastPieceTokenLen = 1
106-
// ret.append(token)
107-
// continue
108-
// }
109-
// let tokens = bytePairEncode([UInt8](piece), encoder)
110-
// lastPieceTokenLen = tokens.count
111-
// ret.append(contentsOf: tokens)
112-
// }
113-
//
114-
// if let m = nextSpecial {
115-
// let range = Range(m.range)
116-
// let piece = text.substring(with: range!)
117-
// let token = specialTokensEncoder[piece]!
118-
// ret.append(token)
119-
// start = m.range.location + m.range.length
120-
// lastPieceTokenLen = 0
121-
// } else { break }
122-
// }
123-
// return (ret, lastPieceTokenLen)
124-
// }
125-
62+
private extension CoreBPE {
12663
func increaseLastPieceTokenLen(tokens: [Int], lastPieceTokenLen: Int) -> ([Int], Int) {
12764
func tokenIsAllSpace(_ token: Int) -> Bool {
12865
guard let tokenBytes = decoder[token] else { return false }
@@ -139,103 +76,6 @@ private extension CoreBPE {
13976
assert(lastPieceTokenLen <= tokens.count)
14077
return (tokens, lastPieceTokenLen)
14178
}
142-
143-
// func encodeUnstableNative(_ text: String, _ allowed_special: Set<String>) -> ([Int], Set<[Int]>) {
144-
// let (tokens1, lastPieceTokenLen1) = encodeNative(text: text, allowedSpecial: allowed_special)
145-
// guard lastPieceTokenLen1 > 0 else {
146-
// return (tokens1, Set())
147-
// }
148-
//
149-
// var (tokens, lastPieceTokenLen) = increaseLastPieceTokenLen(tokens: tokens1, lastPieceTokenLen: lastPieceTokenLen1)
150-
//
151-
// let unstableBytes = decodeNative(tokens: Array(tokens.suffix(lastPieceTokenLen)))
152-
// tokens.removeLast(lastPieceTokenLen)
153-
// var completions = Set<[Int]>()
154-
// guard !unstableBytes.isEmpty else {
155-
// return (tokens, completions)
156-
// }
157-
//
158-
// var newEncoder = [[UInt8]: Int]()
159-
// encoder.forEach({
160-
// newEncoder[[UInt8]($0.key)] = $0.value
161-
// })
162-
//
163-
// var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(unstableBytes.prefix(unstableBytes.count)) })
164-
// while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: unstableBytes) {
165-
// completions.insert([encoder[sortedTokenBytes[point]]!])
166-
// point += 1
167-
// }
168-
//
169-
// for i in 1..<unstableBytes.count {
170-
// let prefix = Array(unstableBytes.prefix(i))
171-
// let suffix = Array(unstableBytes.suffix(from: i))
172-
//// var point = sortedTokenBytes.partitionPoint { x in x < suffix }
173-
//
174-
// var point = sortedTokenBytes.partition(by: { $0.prefix($0.count).lexicographicallyPrecedes(suffix.prefix(suffix.count)) })
175-
// while point < sortedTokenBytes.count && sortedTokenBytes[point].starts(with: suffix) {
176-
// let possibility = prefix + sortedTokenBytes[point]
177-
// let encoded: [Int]
178-
// do {
179-
// let s = try String(decoding: possibility, as: UTF8.self)
180-
// encoded = encodeOrdinaryNative(text: s)
181-
// } catch {
182-
// encoded = bytePairEncode(possibility, newEncoder)
183-
// }
184-
// var seq = [Int]()
185-
// var seqLen = 0
186-
// for token in encoded {
187-
// seq.append(token)
188-
// seqLen += decoder[token]!.count
189-
// if seqLen >= unstableBytes.count {
190-
// break
191-
// }
192-
// }
193-
// completions.insert(seq)
194-
// point += 1
195-
// }
196-
// }
197-
//
198-
//// if unstableBytes.count > 1 {
199-
//// let lastDecoded = unstableBytes.decodeLastUTF8()
200-
//// if unstableBytes.count - lastDecoded.1 > 0 && lastDecoded.0?.isWhitespace == true {
201-
//// var reencoded = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecoded.1)), newEncoder)
202-
//// reencoded += bytePairEncode(Array(unstableBytes.suffix(lastDecoded.1)), newEncoder)
203-
//// completions.insert(reencoded)
204-
//// }
205-
//// }
206-
////
207-
// if unstableBytes.count > 1 {
208-
// if let char = unstableBytes.last, let lastDecoded = String(utf8String: [CChar(char)]) {
209-
// let lastDecodedLength = unstableBytes.count - lastDecoded.count
210-
// if lastDecodedLength > 0 && lastDecoded.last?.isWhitespace == true {
211-
// let encoded1 = bytePairEncode(Array(unstableBytes.prefix(unstableBytes.count - lastDecodedLength)), newEncoder)
212-
// let encoded2 = bytePairEncode(Array(unstableBytes.suffix(lastDecodedLength)), newEncoder)
213-
// let reencoded = Array(encoded1) + Array(encoded2)
214-
// completions.insert(reencoded)
215-
// }
216-
// }
217-
// }
218-
//
219-
//
220-
// return (tokens, completions)
221-
// }
222-
223-
// private func _increase_last_piece_token_len(_ tokens: [Int], _ lastPieceTokenLen: Int) -> ([Int], Int) {
224-
// var lastPieceTokenLen = lastPieceTokenLen
225-
// let tokenIsAllSpace: (Int) -> Bool = { token in
226-
// decoder[token]?.reversed().allSatisfy { b in [b" ", b"\n", b"\t"].contains(b) } ?? false
227-
// }
228-
//
229-
// if lastPieceTokenLen > 0 && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen]) {
230-
// while lastPieceTokenLen < tokens.count && tokenIsAllSpace(tokens[tokens.count - lastPieceTokenLen - 1]) {
231-
// lastPieceTokenLen += 1
232-
// }
233-
// }
234-
//
235-
// assert(lastPieceTokenLen <= tokens.count)
236-
// return (tokens, lastPieceTokenLen)
237-
// }
238-
23979
}
24080

24181
// MARK: - Merges

Sources/Tiktoken/Tiktoken.swift

+11-11
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@ public struct Tiktoken {
1414
return encoding
1515
}
1616

17-
public func getEncoding(for vocab: Vocab) -> Encoding? {
18-
return nil
19-
}
20-
21-
public func register() {
22-
// TODO: Register model and Encoding
23-
}
24-
25-
public func clear() {
26-
// TODO: Clear all cached encoding
27-
}
17+
// public func getEncoding(for vocab: Vocab) -> Encoding? {
18+
// return nil
19+
// }
20+
//
21+
// public func register() {
22+
// // TODO: Register model and Encoding
23+
// }
24+
//
25+
// public func clear() {
26+
// // TODO: Clear all cached encoding
27+
// }
2828
}
2929

3030
private extension Tiktoken {

Tests/TiktokenTests/CoreBPETests.swift

+2-5
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,15 @@ final class CoreBPETests: XCTestCase {
5252
// let expected = [31373, 50169, 233, 995, 12520, 234, 235]
5353
//
5454
let input = "Esto es un texto 👨🏻‍💻 con emojis diferentes 🍿💃🏼🧜‍♂️ y más texto que no tiene sentido 🛟"
55-
// let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
56-
57-
let expected = [14101, 78, 1560, 653, 33125, 62904, 101, 9468, 237, 119, 378, 235, 93273, 119, 390, 100166, 46418, 11410, 235, 123, 93273, 225, 9468, 237, 120, 9468, 100, 250, 378, 235, 379, 11158, 33125, 1744, 912, 24215, 65484, 11410, 249, 253]
55+
let expected = [22362, 78, 1658, 555, 2420, 78, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 369, 795, 13210, 271, 288, 361, 9100, 274, 12520, 235, 123, 8582, 240, 225, 8582, 237, 120, 8582, 100, 250, 447, 235, 17992, 224, 37929, 331, 285, 40138, 2420, 78, 8358, 645, 46668, 1734, 1908, 17305, 12520, 249, 253]
5856

5957

6058
// let input = "Vamos a probar🚒🚁🚀🚊 muchos emoticonos para probar⚽️🤸🏿‍♀️ diferentes codificaciones 👨🏻‍💻♨︎"
6159
// let expected = [53, 321, 418, 257, 1861, 283, 8582, 248, 240, 8582, 248, 223, 8582, 248, 222, 8582, 248, 232, 881, 418, 4085, 4749, 418, 31215, 1861, 283, 158, 248, 121, 37929, 8582, 97, 116, 8582, 237, 123, 447, 235, 17992, 222, 37929, 288, 361, 9100, 274, 14873, 811, 49443, 274, 50169, 101, 8582, 237, 119, 447, 235, 8582, 240, 119, 17992, 101, 35266, 236]
6260
//
6361
// let encoderGPT = await Load.dataGymToMergeableBpeRanks(vocabBpeFile: "https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe", encoderJsonFile: "")
64-
// let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
62+
let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken")
6563

66-
let encoder = await Load.loadTiktokenBpe(url: "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
6764

6865
// "pat_str": r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
6966
// let regex = try XCTUnwrap(try NSRegularExpression(pattern: "/'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+/gu"))

0 commit comments

Comments
 (0)