Commit cadab0d
Release/0.1.2 (#10)
* Improve performance on encoder and decoder
aespinilla authored Mar 9, 2023
1 parent ee9c863 commit cadab0d
Showing 29 changed files with 602 additions and 68 deletions.
15 changes: 7 additions & 8 deletions Sources/GPT3 Tokenizer/BpeRanks.swift
@@ -9,18 +9,17 @@ import Foundation
 
 class BpeRanks {
     private let reader: FileReader
+    private let decoder: BpeRanksDecoder
 
-    init(reader: FileReader = ModuleFileReader()) {
+    init(reader: FileReader = ModuleFileReader(), decoder: BpeRanksDecoder = BpeRanksDecoderImpl()) {
         self.reader = reader
+        self.decoder = decoder
     }
 
-    lazy var ranks: [[String]] = {
+    lazy var ranks: [Pairs: Int] = {
         guard let data = reader.read(name: "vocab", fileExtension: "bpe"),
-              let vocab = String(data: data, encoding: .utf8)
-        else { return [] }
-
-        return vocab
-            .split(separator: "\n", omittingEmptySubsequences: true)
-            .map({ $0.split(separator: " ", omittingEmptySubsequences: true).map({ String($0) }) })
+              let ranks = try? decoder.decode(from: data)
+        else { return [:] }
+        return ranks
    }()
}
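
Why this is the performance win: ranks used to be an array of string pairs that the encoder scanned front to back for every candidate bigram; keying it by the pair makes each lookup an average O(1) hash probe instead of an O(n) scan over the roughly 50,000 merge rules in vocab.bpe. A minimal standalone sketch of the idea (Pair is a hypothetical stand-in for the library's Pairs type):

    import Foundation

    // Hypothetical stand-in for the library's Hashable Pairs key.
    struct Pair: Hashable {
        let first: String
        let second: String
    }

    // Old shape: ordered array, rank == index.
    let rankArray: [[String]] = [["l", "o"], ["lo", "w"]]
    // New shape: rank keyed by pair.
    let rankTable: [Pair: Int] = [Pair(first: "l", second: "o"): 0,
                                  Pair(first: "lo", second: "w"): 1]

    let pair = Pair(first: "lo", second: "w")
    // O(n) scan, repeated for every candidate bigram in every merge step:
    let slow = rankArray.firstIndex(where: { $0[0] == pair.first && $0[1] == pair.second })
    // O(1) average hash lookup:
    let fast = rankTable[pair]   // Optional(1)
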
13 changes: 7 additions & 6 deletions Sources/GPT3 Tokenizer/BytesUnicode.swift
@@ -13,22 +13,19 @@ class BytesUnicode {
     }
 
     var decoder: [String: Int] {
-        let encoder = encoder
-        var result: [String: Int] = .init()
-        encoder.forEach({ result[$1] = $0 })
-        return result
+        encoder.inverted
     }
 
     private lazy var bytesToUnicode: [Int: String] = {
         var bs = range(start: Character("!"), end: Character("~")) + range(start: Character("¡"), end: Character("¬")) + range(start: Character("®"), end: Character("ÿ"))
         var cs = bs.map({ $0 })
 
         var n = 0
-        (0..<Int(pow(Double(2), Double(8))))
+        (0..<exponentialPow)
             .forEach({
                 if !bs.contains($0) {
                     bs.append($0)
-                    cs.append(Int(pow(Double(2), Double(8))) + n)
+                    cs.append(exponentialPow + n)
                     n += 1
                 }
             })
@@ -41,6 +38,10 @@
 }
 
 private extension BytesUnicode {
+    var exponentialPow: Int {
+        Int(pow(Double(2), Double(8)))
+    }
+
     func range(start: Character, end: Character) -> [Int] {
         guard let startValue = start.utf16.first,
               let endValue = end.utf16.first
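
For context, exponentialPow is 2^8 = 256, the number of distinct byte values. Bytes whose code points are already printable map to themselves; the rest are shifted to code points 256 + n so every byte gets a visible character. A rough standalone sketch of that mapping (ranges hard-coded here, where the library derives them from Characters):

    import Foundation

    // Printable ranges: "!"..."~" (33...126), "¡"..."¬" (161...172), "®"..."ÿ" (174...255).
    let printable = Set(Array(33...126) + Array(161...172) + Array(174...255))

    var byteToUnicode: [Int: String] = [:]
    var n = 0
    for byte in 0..<256 {
        if printable.contains(byte) {
            byteToUnicode[byte] = String(UnicodeScalar(UInt32(byte))!)    // maps to itself
        } else {
            byteToUnicode[byte] = String(UnicodeScalar(UInt32(256 + n))!) // shifted above 255
            n += 1
        }
    }
    // byteToUnicode[72] == "H"; byteToUnicode[32] == "Ġ" (space shifted to U+0120)
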
16 changes: 9 additions & 7 deletions Sources/GPT3 Tokenizer/Decoder.swift
@@ -22,12 +22,14 @@ public struct Decoder {
     /// - Returns: String value decoded
     ///
     public func decode(encode: [Int]) -> String? {
-        let bytesUnicodeDecoder = bytesUnicode.decoder
-        let tableCodeDecoder = tableCode.decoder
-        let text = encode.compactMap({ tableCodeDecoder?[$0] }).joined()
-        let decoded = text.characterArray.compactMap({ bytesUnicodeDecoder[$0] }).map({ UInt8($0) })
-        let data = Data(decoded)
-        let result = String(data: data, encoding: .utf8)
-        return result
+        encode
+            .decode(tableCode.decoder)
+            .decode(bytesUnicode.decoder)
     }
 }
+
+private extension Array where Element == Int {
+    func decode(_ decoder: [Int: String]) -> String {
+        compactMap({ decoder[$0] }).joined()
+    }
+}
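
The rewrite collapses decode into two table passes: token ids to vocabulary strings, then byte-unicode characters back to raw bytes and a UTF-8 string. A hedged standalone sketch with a tiny slice of hypothetical tables (15496 and 995 are the well-known GPT-2/3 ids for "Hello" and "Ġworld"):

    import Foundation

    // Pass 1 table: id -> token string.
    let tableDecoder: [Int: String] = [15496: "Hello", 995: "Ġworld"]
    // Pass 2 table: byte-unicode character -> original byte.
    let byteDecoder: [String: Int] = ["H": 72, "e": 101, "l": 108, "o": 111,
                                      "Ġ": 32, "w": 119, "r": 114, "d": 100]

    // Pass 1: ids to the intermediate string "HelloĠworld".
    let text = [15496, 995].compactMap({ tableDecoder[$0] }).joined()
    // Pass 2: characters to bytes, then UTF-8 decode.
    let bytes = text.map({ String($0) }).compactMap({ byteDecoder[$0] }).map({ UInt8($0) })
    let result = String(data: Data(bytes), encoding: .utf8)   // "Hello world"
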
63 changes: 26 additions & 37 deletions Sources/GPT3 Tokenizer/Encoder.swift
@@ -32,28 +32,23 @@ public class Encoder {
     public func enconde(text: String) -> [Int] {
         let matches = matches(in: text)
         let bytesToUnicode = bytesUnicode.encoder
+        let encoder = tableCode.encoder
 
-        let reencodeds = matches
-            .map({ match in
-                let items = Array(match.utf8).map({ Int($0) })
-                let reencoded = items.compactMap({ bytesToUnicode[$0] }).joined()
-                return reencoded
+        return matches
+            .map({
+                let unicode = $0.bytesToUnicode(encoder: bytesToUnicode)
+                return bpe(token: unicode).encode(encoder: encoder)
             })
-
-        let encoder = tableCode.encoder
-        let encode = reencodeds
-            .compactMap({ bpe(token: $0).split(separator: " ").compactMap({ encoder?[String($0)] }) })
             .flatMap({ $0 })
-        return encode
     }
 }
 
 private extension Encoder {
     func matches(in text: String) -> [String] {
         guard let results = regex?.matches(in: text,
-                                          range: NSRange(text.startIndex..., in: text))
+                                           range: NSRange(text.startIndex..., in: text))
         else { return [] }
-        return results.compactMap({ Range($0.range, in: text).map { String(text[$0]) } })
+        return results.compactMap({ Range($0.range, in: text).map({ String(text[$0]) }) })
     }
 
     func bpe(token: String) -> String {
@@ -63,66 +58,60 @@
 
         var word = token.characterArray
         let bpeRanks = bpeRanks.ranks
-        var pairs = getPairs(word: word)
+        var pairs = word.pairs
 
        while true {
            var minPairs: [Int: Pairs] = .init()
            pairs.forEach({ pair in
-                if let rank = bpeRanks.firstIndex(where: { $0[0] == pair.first && $0[1] == pair.second }) {
-                    minPairs[rank] = pair
-                }
+                guard let rank = bpeRanks[pair] else { return }
+                minPairs[rank] = pair
            })
 
            guard let min = minPairs.keys.min(),
                  let bigram = minPairs[min]
            else { break }
 
-            word = newWord(word: word, bigram: bigram)
+            word = word.newWord(bigram: bigram)
            if (word.count == 1) {
                break
            } else {
-                pairs = getPairs(word: word)
+                pairs = word.pairs
            }
        }
 
-        let result = word.joined(separator: " ")
+        let result = word.toString
        cache[token] = result
        return result
    }
+}
 
-    func newWord(word: CharacterArray, bigram: Pairs) -> CharacterArray {
+private extension CharacterArray {
+    var pairs: [Pairs] {
+        prevCurrent({ Pairs(first: $0, second: $1) }).unique
+    }
+
+    func newWord(bigram: Pairs) -> Self {
        var i = 0
        var newWord: CharacterArray = .init()
 
-        while i < word.count {
-            guard let j = word[i...].firstIndex(of: bigram.first)
+        while i < count {
+            guard let j = self[i...].firstIndex(of: bigram.first)
            else {
-                newWord.append(contentsOf: word[i...])
+                newWord.append(contentsOf: self[i...])
                break
            }
-            newWord.append(contentsOf: word[i..<j])
+            newWord.append(contentsOf: self[i..<j])
            i = j
 
-            if word[i] == bigram.first, i < word.count - 1, word[i+1] == bigram.second {
+            if self[i] == bigram.first, i < count - 1, self[i+1] == bigram.second {
                newWord.append(bigram.join)
                i += 2
            } else {
-                newWord.append(word[i])
+                newWord.append(self[i])
                i += 1
            }
        }
 
        return newWord
    }
-
-    func getPairs(word: CharacterArray) -> [Pairs] {
-        var mutatedWord = word
-        var pairs: Set<Pairs> = .init()
-        var prev = mutatedWord.removeFirst()
-        mutatedWord.forEach({
-            pairs.insert(.init(first: prev, second: $0))
-            prev = $0
-        })
-        return Array(pairs)
-    }
}
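
With the helpers moved onto CharacterArray, a merge step reads as a pipeline on the word itself. A usage sketch inside the module, assuming ranks where ("l", "o") merges first and that Pairs.join concatenates the two halves:

    var word: CharacterArray = ["l", "o", "w"]   // token "low" as characters
    let bigram = Pairs(first: "l", second: "o")

    let candidates = word.pairs                  // (l,o) and (o,w); .unique goes through
                                                 // a Set, so order is unspecified
    word = word.newWord(bigram: bigram)          // ["lo", "w"]
    let bpeForm = word.toString                  // "lo w", the cached BPE string
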
18 changes: 18 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/Array+PrevCurrent.swift
@@ -0,0 +1,18 @@
+//
+// Array+PrevCurrent.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 4/3/23.
+//
+
+import Foundation
+
+extension Array {
+    func prevCurrent<T>(_ body: (Element, Element) throws -> T) rethrows -> [T] {
+        enumerated().compactMap({ index, element in
+            guard index > 0 else { return nil }
+            let prev = self[index-1]
+            return try? body(prev, element)
+        })
+    }
+}
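
A quick usage sketch of this sliding window, pairing each element with its predecessor (index 0 is skipped because it has none):

    let adjacent = ["l", "o", "w"].prevCurrent({ ($0, $1) })
    // [("l", "o"), ("o", "w")]
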
14 changes: 14 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/Array+Unique.swift
@@ -0,0 +1,14 @@
+//
+// Array+Unique.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 4/3/23.
+//
+
+import Foundation
+
+extension Array where Element: Hashable {
+    var unique: [Element] {
+        Set(self).toArray
+    }
+}
14 changes: 14 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/Dictionary+Inverted.swift
@@ -0,0 +1,14 @@
+//
+// Dictionary+Inverted.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 3/3/23.
+//
+
+import Foundation
+
+extension Dictionary where Key: Hashable, Value: Hashable {
+    var inverted: Dictionary<Value, Key> {
+        Dictionary<Value, Key>(uniqueKeysWithValues: lazy.map { ($0.value, $0.key) })
+    }
+}
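
Usage sketch. Note that uniqueKeysWithValues traps on duplicate values, so this assumes the dictionary is one-to-one, which holds for the byte-to-unicode table it serves:

    let encoder = [72: "H", 101: "e"]
    let decoder = encoder.inverted   // ["H": 72, "e": 101]
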
14 changes: 14 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/Set+ToArray.swift
@@ -0,0 +1,14 @@
+//
+// Set+ToArray.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 4/3/23.
+//
+
+import Foundation
+
+extension Set {
+    var toArray: [Element] {
+        Array(self)
+    }
+}
17 changes: 17 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/String+CharacterArray.swift
@@ -13,4 +13,21 @@ extension String {
     var characterArray: CharacterArray {
         map({ String($0) })
     }
+
+    func decode(_ decoder: [String: Int]) -> String? {
+        let decoded: [UInt8] = compactMap({
+            let item = String($0)
+            guard let value = decoder[item] else { return nil }
+            return UInt8(value)
+        })
+        let data = Data(decoded)
+        let result = String(data: data, encoding: .utf8)
+        return result
+    }
 }
+
+extension CharacterArray {
+    var toString: String {
+        joined(separator: " ")
+    }
+}
22 changes: 22 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/String+Encoders.swift
@@ -0,0 +1,22 @@
+//
+// String+Encoders.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 4/3/23.
+//
+
+import Foundation
+
+extension String {
+    func bytesToUnicode(encoder: [Int: String]) -> String {
+        utf8.compactMap({
+            let value = Int($0)
+            return encoder[value]
+        })
+        .joined()
+    }
+
+    func encode(encoder: [String: Int]) -> [Int] {
+        splitWords.compactMap({ encoder[$0] })
+    }
+}
20 changes: 20 additions & 0 deletions Sources/GPT3 Tokenizer/Extensions/String+Split.swift
@@ -0,0 +1,20 @@
+//
+// String+Split.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 4/3/23.
+//
+
+import Foundation
+
+extension String {
+    var splitWords: [String] {
+        split(separator: " ", omittingEmptySubsequences: true).toString
+    }
+}
+
+extension Array where Element == Substring {
+    var toString: [String] {
+        map({ String($0) })
+    }
+}
28 changes: 28 additions & 0 deletions Sources/GPT3 Tokenizer/FileReader/Decoder/BpeRanksDecoder.swift
@@ -0,0 +1,28 @@
+//
+// BpeRanksDecoder.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 5/3/23.
+//
+
+import Foundation
+
+protocol BpeRanksDecoder {
+    func decode(from data: Data) throws -> [Pairs: Int]
+}
+
+struct BpeRanksDecoderImpl: BpeRanksDecoder {
+    func decode(from data: Data) throws -> [Pairs: Int] {
+        guard let vocab = String(data: data, encoding: .utf8)
+        else { return [:] }
+
+        return vocab.split(separator: "\n", omittingEmptySubsequences: true)
+            .compactMap({
+                let line = String($0).splitWords
+                guard let first = line.first, let last = line.last else { return nil }
+                return Pairs(first: first, second: last)
+            })
+            .enumerated()
+            .reduce(into: [:]) { $0[$1.element] = $1.offset }
+    }
+}
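
A usage sketch feeding a tiny two-rule merge file through the decoder; the real vocab.bpe lists one space-separated pair per line, ordered by merge priority, which enumerated() turns into the rank:

    let sample = "l o\nlo w".data(using: .utf8)!
    let ranks = try? BpeRanksDecoderImpl().decode(from: sample)
    // [Pairs(first: "l", second: "o"): 0, Pairs(first: "lo", second: "w"): 1]
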
24 changes: 24 additions & 0 deletions Sources/GPT3 Tokenizer/FileReader/Decoder/TableCodeDecoder.swift
@@ -0,0 +1,24 @@
+//
+// TableCodeDecoder.swift
+//
+//
+// Created by Alberto Espinilla Garrido on 5/3/23.
+//
+
+import Foundation
+
+protocol TableCodeDecoder {
+    func decode(from data: Data) throws -> [String: Int]
+}
+
+struct TableCodeDecoderImpl: TableCodeDecoder {
+    private let decoder: JSONDecoder
+
+    init(decoder: JSONDecoder = .init()) {
+        self.decoder = decoder
+    }
+
+    func decode(from data: Data) throws -> [String: Int] {
+        try decoder.decode([String: Int].self, from: data)
+    }
+}
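
Usage sketch: the vocabulary table is plain JSON mapping token to id, so JSONDecoder handles it directly (ids here are the standard GPT-2/3 values):

    let json = #"{"Hello": 15496, "Ġworld": 995}"#.data(using: .utf8)!
    let table = try? TableCodeDecoderImpl().decode(from: json)
    // ["Hello": 15496, "Ġworld": 995]
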
@@ -22,7 +22,7 @@ struct ModuleFileReader: FileReader {
 
     func read(name: String, fileExtension: String) -> Data? {
         guard let path = bundle.path(forResource: name, ofType: fileExtension),
-            let data = fileManager.contents(atPath: path)
+              let data = fileManager.contents(atPath: path)
         else { return nil }
         return data
     }