// mobi.js

// Decode HTML character references in `str` by routing it through a detached
// `<textarea>`: the markup is assigned as `innerHTML` and read back as `value`.
// Any falsy input (empty string, `null`, `undefined`) yields ''.
const unescapeHTML = str => str
    ? Object.assign(document.createElement('textarea'), { innerHTML: str }).value
    : ''

// MIME type strings, referenced by name (e.g. `MIME.HTML`, `MIME.XHTML`)
// when parsing and serializing documents elsewhere in this file
const MIME = {
    XML: 'application/xml',
    XHTML: 'application/xhtml+xml',
    HTML: 'text/html',
    CSS: 'text/css',
    SVG: 'image/svg+xml',
}

// Struct definitions in this file map a field name to
// `[byte offset, byte length, type]` and are read with `getStruct()`;
// `type` is 'string' or 'uint'.
// This one describes the PDB file header (read from the first 78 bytes
// of the file by `PDB.open()`).
const PDB_HEADER = {
    name: [0, 32, 'string'],
    type: [60, 4, 'string'],
    creator: [64, 4, 'string'],
    numRecords: [76, 2, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of the PalmDOC header,
// which sits at the start of the header record parsed by `#getHeaders()`;
// `compression` selects the text decompressor and `numTextRecords` is the
// number of text records to load
const PALMDOC_HEADER = {
    compression: [0, 2, 'uint'],
    numTextRecords: [8, 2, 'uint'],
    recordSize: [10, 2, 'uint'],
    encryption: [12, 2, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of the MOBI header.
// Offsets are relative to the start of the header record, i.e. the same
// buffer that `PALMDOC_HEADER` is read from; `magic` must be 'MOBI'.
const MOBI_HEADER = {
    magic: [16, 4, 'string'],
    length: [20, 4, 'uint'],
    type: [24, 4, 'uint'],
    encoding: [28, 4, 'uint'],
    uid: [32, 4, 'uint'],
    version: [36, 4, 'uint'],
    titleOffset: [84, 4, 'uint'],
    titleLength: [88, 4, 'uint'],
    localeRegion: [94, 1, 'uint'],
    localeLanguage: [95, 1, 'uint'],
    resourceStart: [108, 4, 'uint'],
    huffcdic: [112, 4, 'uint'],
    numHuffcdic: [116, 4, 'uint'],
    exthFlag: [128, 4, 'uint'],
    trailingFlags: [240, 4, 'uint'],
    indx: [244, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of the KF8-specific header
// fields; read from the same header record as `MOBI_HEADER`, but only when
// the MOBI header's `version` is 8 or higher
const KF8_HEADER = {
    resourceStart: [108, 4, 'uint'],
    fdst: [192, 4, 'uint'],
    numFdst: [196, 4, 'uint'],
    frag: [248, 4, 'uint'],
    skel: [252, 4, 'uint'],
    guide: [260, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of the EXTH header;
// `magic` must be 'EXTH' and `count` is the number of EXTH records,
// which `getEXTH()` reads starting at byte 12
const EXTH_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    count: [8, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of an INDX record header;
// `magic` must be 'INDX'. `getIndexData()` reads it both from the index's
// first record and from each of the index data records that follow.
const INDX_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    type: [8, 4, 'uint'],
    idxt: [20, 4, 'uint'],
    numRecords: [24, 4, 'uint'],
    encoding: [28, 4, 'uint'],
    language: [32, 4, 'uint'],
    total: [36, 4, 'uint'],
    ordt: [40, 4, 'uint'],
    ligt: [44, 4, 'uint'],
    numLigt: [48, 4, 'uint'],
    numCncx: [52, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of the TAGX section that
// follows the INDX header; `magic` must be 'TAGX', and the 4-byte tag table
// entries start at byte 12 and run up to `length`
const TAGX_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    numControlBytes: [8, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of a HUFF record;
// `magic` must be 'HUFF', and `offset1` / `offset2` are the byte offsets of
// the two lookup tables read by `huffcdic()`
const HUFF_HEADER = {
    magic: [0, 4, 'string'],
    offset1: [8, 4, 'uint'],
    offset2: [12, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of a CDIC record;
// `magic` must be 'CDIC', and the dictionary data starts `length` bytes
// into the record
const CDIC_HEADER = {
    magic: [0, 4, 'string'],
    length: [4, 4, 'uint'],
    numEntries: [8, 4, 'uint'],
    codeLength: [12, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of an FDST record;
// `magic` must be 'FDST', and `numEntries` pairs of 4-byte uints follow
// from byte 12
const FDST_HEADER = {
    magic: [0, 4, 'string'],
    numEntries: [8, 4, 'uint'],
}

// `getStruct()` layout (`[offset, length, type]`) of a FONT resource record,
// as used by `getFont()`: bit 0 of `flags` means the data is compressed,
// bit 1 means the data is XOR-obfuscated with the key at `keyStart`
const FONT_HEADER = {
    flags: [8, 4, 'uint'],
    dataStart: [12, 4, 'uint'],
    keyLength: [16, 4, 'uint'],
    keyStart: [20, 4, 'uint'],
}

// maps the numeric `encoding` header value to the `TextDecoder` label
// used by `getDecoder()`
const MOBI_ENCODING = {
    1252: 'windows-1252',
    65001: 'utf-8',
}

// EXTH record types understood by `getEXTH()`, keyed by numeric type.
// Each entry is `[name, type, many]`:
// - `name` is the key used in the result object
// - `type` is 'uint' for integers; anything else is decoded as text
// - `many`, if truthy, collects repeated records into an array
// Record types not listed here are skipped.
const EXTH_RECORD_TYPE = {
    100: ['creator', 'string', true],
    101: ['publisher'],
    103: ['description'],
    104: ['isbn'],
    105: ['subject', 'string', true],
    106: ['date'],
    108: ['contributor', 'string', true],
    109: ['rights'],
    110: ['subjectCode', 'string', true],
    112: ['source', 'string', true],
    113: ['asin'],
    121: ['boundary', 'uint'],
    122: ['fixedLayout'],
    125: ['numResources', 'uint'],
    126: ['originalResolution'],
    127: ['zeroGutter'],
    128: ['zeroMargin'],
    129: ['coverURI'],
    132: ['regionMagnification'],
    201: ['coverOffset', 'uint'],
    202: ['thumbnailOffset', 'uint'],
    503: ['title'],
    524: ['language', 'string', true],
    527: ['pageProgressionDirection'],
}

// Language tags keyed by the header's `localeLanguage` byte.
// Within each list the entry is picked with `localeRegion >> 2`;
// a missing or `null` entry falls back to the first item of the list
// (see `#getHeaders()`).
const MOBI_LANG = {
    1: ['ar', 'ar-SA', 'ar-IQ', 'ar-EG', 'ar-LY', 'ar-DZ', 'ar-MA', 'ar-TN', 'ar-OM',
        'ar-YE', 'ar-SY', 'ar-JO', 'ar-LB', 'ar-KW', 'ar-AE', 'ar-BH', 'ar-QA'],
    2: ['bg'], 3: ['ca'], 4: ['zh', 'zh-TW', 'zh-CN', 'zh-HK', 'zh-SG'], 5: ['cs'],
    6: ['da'], 7: ['de', 'de-DE', 'de-CH', 'de-AT', 'de-LU', 'de-LI'], 8: ['el'],
    9: ['en', 'en-US', 'en-GB', 'en-AU', 'en-CA', 'en-NZ', 'en-IE', 'en-ZA',
        'en-JM', null, 'en-BZ', 'en-TT', 'en-ZW', 'en-PH'],
    10: ['es', 'es-ES', 'es-MX', null, 'es-GT', 'es-CR', 'es-PA', 'es-DO',
        'es-VE', 'es-CO', 'es-PE', 'es-AR', 'es-EC', 'es-CL', 'es-UY', 'es-PY',
        'es-BO', 'es-SV', 'es-HN', 'es-NI', 'es-PR'],
    11: ['fi'], 12: ['fr', 'fr-FR', 'fr-BE', 'fr-CA', 'fr-CH', 'fr-LU', 'fr-MC'],
    13: ['he'], 14: ['hu'], 15: ['is'], 16: ['it', 'it-IT', 'it-CH'],
    17: ['ja'], 18: ['ko'], 19: ['nl', 'nl-NL', 'nl-BE'], 20: ['no', 'nb', 'nn'],
    21: ['pl'], 22: ['pt', 'pt-BR', 'pt-PT'], 23: ['rm'], 24: ['ro'], 25: ['ru'],
    26: ['hr', null, 'sr'], 27: ['sk'], 28: ['sq'], 29: ['sv', 'sv-SE', 'sv-FI'],
    30: ['th'], 31: ['tr'], 32: ['ur'], 33: ['id'], 34: ['uk'], 35: ['be'],
    36: ['sl'], 37: ['et'], 38: ['lv'], 39: ['lt'], 41: ['fa'], 42: ['vi'],
    43: ['hy'], 44: ['az'], 45: ['eu'], 46: ['hsb'], 47: ['mk'], 48: ['st'],
    49: ['ts'], 50: ['tn'], 52: ['xh'], 53: ['zu'], 54: ['af'], 55: ['ka'],
    56: ['fo'], 57: ['hi'], 58: ['mt'], 59: ['se'], 62: ['ms'], 63: ['kk'],
    65: ['sw'], 67: ['uz', null, 'uz-UZ'], 68: ['tt'], 69: ['bn'], 70: ['pa'],
    71: ['gu'], 72: ['or'], 73: ['ta'], 74: ['te'], 75: ['kn'], 76: ['ml'],
    77: ['as'], 78: ['mr'], 79: ['sa'], 82: ['cy', 'cy-GB'], 83: ['gl', 'gl-ES'],
    87: ['kok'], 97: ['ne'], 98: ['fy'],
}

// Return a new typed array, of the same type as `a`,
// holding the elements of `a` followed by those of `b`.
const concatTypedArray = (a, b) => {
    const TypedArray = a.constructor
    const joined = new TypedArray(a.length + b.length)
    let cursor = 0
    for (const part of [a, b]) {
        joined.set(part, cursor)
        cursor += part.length
    }
    return joined
}
// Three-way variant of `concatTypedArray`: returns a new typed array,
// of the same type as `a`, holding `a`, then `b`, then `c`.
const concatTypedArray3 = (a, b, c) => {
    const TypedArray = a.constructor
    const joined = new TypedArray(a.length + b.length + c.length)
    let cursor = 0
    for (const part of [a, b, c]) {
        joined.set(part, cursor)
        cursor += part.length
    }
    return joined
}

// shared decoder, created without a label (so it uses `TextDecoder`'s default
// encoding, UTF-8); used by `getString()`
const decoder = new TextDecoder()
// decode a buffer into a string with the shared decoder
const getString = buffer => decoder.decode(buffer)
// Read a big-endian unsigned integer from the start of an `ArrayBuffer`.
// A 4-byte buffer is read as uint32 and a 2-byte buffer as uint16;
// any other size is read as a single uint8 (so an empty buffer throws).
// Returns `undefined` when no buffer is given.
const getUint = buffer => {
    if (!buffer) return
    const view = new DataView(buffer)
    switch (buffer.byteLength) {
        case 4: return view.getUint32(0)
        case 2: return view.getUint16(0)
        default: return view.getUint8(0)
    }
}
// Read a struct out of `buffer` according to `def`, an object mapping each
// field name to `[start, len, type]`. Fields of type 'string' are decoded
// with `getString()`; every other type is read with `getUint()`.
// Returns an object with the same keys as `def`.
const getStruct = (def, buffer) => {
    const fields = []
    for (const [key, [start, len, type]] of Object.entries(def)) {
        const read = type === 'string' ? getString : getUint
        fields.push([key, read(buffer.slice(start, start + len))])
    }
    return Object.fromEntries(fields)
}

// Create a `TextDecoder` for a numeric MOBI encoding code.
// Codes missing from `MOBI_ENCODING` produce an `undefined` label,
// for which `TextDecoder` falls back to its default encoding.
const getDecoder = x => {
    const label = MOBI_ENCODING[x]
    return new TextDecoder(label)
}

// Read a forward variable-length integer starting at index `i`.
// Each byte contributes its low 7 bits (most significant group first);
// a byte with the high bit set terminates the value. At most 4 bytes are read.
// Returns the decoded `value` and the number of bytes consumed as `length`.
const getVarLen = (byteArray, i = 0) => {
    const chunk = byteArray.subarray(i, i + 4)
    let value = 0
    let length = 0
    while (length < chunk.length) {
        const byte = chunk[length++]
        value = (value << 7) | (byte & 0x7f)
        if (byte & 0x80) break
    }
    return { value, length }
}

// Variable-length quantity stored at the *end* of the data.
// Only the last 4 bytes are examined, in order; each byte appends its low
// 7 bits, and a byte with the high bit set marks the first byte of the
// value, discarding whatever was accumulated before it.
const getVarLenFromEnd = byteArray => byteArray.subarray(-4).reduce(
    (value, byte) => ((byte & 0x80 ? 0 : value) << 7) | (byte & 0x7f), 0)

// Count the bits set in `x`, interpreted as an unsigned 32-bit integer.
// The value is coerced with `>>> 0` and shifted with `>>>`: a signed `>>`
// would turn values of 2**31 and above negative after the first shift
// and end the loop early with the wrong count.
const countBitsSet = x => {
    let count = 0
    for (let bits = x >>> 0; bits !== 0; bits >>>= 1) count += bits & 1
    return count
}

// Count the consecutive unset bits at the low end of `x` (trailing zeros),
// with `x` interpreted as an unsigned 32-bit integer.
// Returns 32 when no bit is set at all; without that guard the scan
// below would never find a set bit and would loop forever.
const countUnsetEnd = x => {
    const bits = x >>> 0
    if (bits === 0) return 32
    let count = 0
    for (let rest = bits; (rest & 1) === 0; rest >>>= 1) count++
    return count
}

// Decompress PalmDOC (LZ77-style) data. Takes a `Uint8Array` and returns a
// new `Uint8Array`. Each input byte selects one of four cases:
//   0x00, 0x09-0x7f  literal, copied as is
//   0x01-0x08        copy that many following bytes verbatim
//   0x80-0xbf        with the next byte, forms a length-distance pair
//   0xc0-0xff        a space followed by the byte with its high bit cleared
const decompressPalmDOC = array => {
    const out = []
    let pos = 0
    while (pos < array.length) {
        const byte = array[pos++]
        if (byte === 0 || (byte > 8 && byte < 0x80)) {
            out.push(byte)
        } else if (byte <= 8) {
            // `subarray` clamps to the input, so a truncated run copies less
            out.push(...array.subarray(pos, pos + byte))
            pos += byte
        } else if (byte < 0xc0) {
            // 16 bits: `10`, then 11 bits of distance, then 3 bits of length
            const pair = (byte << 8) | array[pos++]
            const distance = (pair & 0x3fff) >>> 3
            const length = (pair & 0b111) + 3
            // copy one byte at a time, as the source may overlap the output
            for (let copied = 0; copied < length; copied++)
                out.push(out[out.length - distance])
        } else {
            out.push(0x20, byte ^ 0x80)
        }
    }
    return Uint8Array.from(out)
}

// Read 32 bits from `byteArray` starting at bit offset `from`
// (most significant bit first) and return them as a `BigInt`.
// Bytes past the end of the array are read as zero.
const read32Bits = (byteArray, from) => {
    const end = from + 32
    const lastByte = end >> 3
    // gather every byte touched by the 32-bit window (5 bytes, i.e. 40 bits)
    let acc = 0n
    let index = from >> 3
    while (index <= lastByte) {
        acc = (acc << 8n) | BigInt(byteArray[index] ?? 0)
        index++
    }
    // drop the bits after the window, then keep the low 32 bits
    const surplus = 8n - BigInt(end & 7)
    return (acc >> surplus) & 0xffffffffn
}

// Build a decompressor for HUFF/CDIC-compressed data.
// `mobi` provides `huffcdic`, the index of the HUFF record, and `numHuffcdic`,
// the number of records in the group (the HUFF record followed by the CDIC
// records). `loadRecord(index)` must resolve to the record's buffer.
// Resolves to a function that takes a compressed byte array
// and returns the decompressed `Uint8Array`.
// Throws if a record does not carry the expected 'HUFF' / 'CDIC' magic.
const huffcdic = async (mobi, loadRecord) => {
    const huffRecord = await loadRecord(mobi.huffcdic)
    const { magic, offset1, offset2 } = getStruct(HUFF_HEADER, huffRecord)
    if (magic !== 'HUFF') throw new Error('Invalid HUFF record')

    // table1 is indexed by byte value
    // each entry is unpacked into `[found, codeLength, value]`
    const table1 = Array.from({ length: 256 }, (_, i) => offset1 + i * 4)
        .map(offset => getUint(huffRecord.slice(offset, offset + 4)))
        .map(x => [x & 0b1000_0000, x & 0b1_1111, x >>> 8])

    // table2 is indexed by code length
    // (the leading `null` makes the index start at 1);
    // each entry is a pair of 32-bit values
    const table2 = [null].concat(Array.from({ length: 32 }, (_, i) => offset2 + i * 8)
        .map(offset => [
            getUint(huffRecord.slice(offset, offset + 4)),
            getUint(huffRecord.slice(offset + 4, offset + 8))]))

    // list of `[bytes, decompressed]` entries, gathered from all CDIC records
    const dictionary = []
    for (let i = 1; i < mobi.numHuffcdic; i++) {
        const record = await loadRecord(mobi.huffcdic + i)
        const cdic = getStruct(CDIC_HEADER, record)
        if (cdic.magic !== 'CDIC') throw new Error('Invalid CDIC record')
        // `numEntries` is the total number of dictionary data across CDIC records
        // so `n` here is the number of entries in *this* record
        const n = Math.min(1 << cdic.codeLength, cdic.numEntries - dictionary.length)
        const buffer = record.slice(cdic.length)
        for (let i = 0; i < n; i++) {
            // the data starts with a table of 2-byte offsets, one per entry
            const offset = getUint(buffer.slice(i * 2, i * 2 + 2))
            // each entry starts with 2 bytes: the top bit is a flag marking
            // the entry as already decompressed, the rest is its length
            const x = getUint(buffer.slice(offset, offset + 2))
            const length = x & 0x7fff
            const decompressed = x & 0x8000
            const value = new Uint8Array(
                buffer.slice(offset + 2, offset + 2 + length))
            dictionary.push([value, decompressed])
        }
    }

    const decompress = byteArray => {
        let output = new Uint8Array()
        const bitLength = byteArray.byteLength * 8
        // `i` is a bit offset into the input
        for (let i = 0; i < bitLength;) {
            const bits = Number(read32Bits(byteArray, i))
            // look up the top 8 bits in table1
            let [found, codeLength, value] = table1[bits >>> 24]
            if (!found) {
                // not resolved by table1: lengthen the code until its prefix
                // is no longer below the first value of the table2 entry
                while (bits >>> (32 - codeLength) < table2[codeLength][0])
                    codeLength += 1
                value = table2[codeLength][1]
            }
            // stop if the code would extend past the end of the input
            if ((i += codeLength) > bitLength) break

            const code = value - (bits >>> (32 - codeLength))
            let [result, decompressed] = dictionary[code]
            if (!decompressed) {
                // the result is itself compressed
                result = decompress(result)
                // cache the result for next time
                dictionary[code] = [result, true]
            }
            output = concatTypedArray(output, result)
        }
        return output
    }
    return decompress
}

// Read an index, which spans several consecutive records:
// the header record at `indxIndex` (INDX header followed by a TAGX section),
// then `numRecords` index data records, then `numCncx` CNCX (string) records.
// `loadRecord(index)` must resolve to the record's `ArrayBuffer`.
// Resolves to `{ table, cncx }`, where
// - `table` is a list of `{ name, tagMap }`, one per index entry, with
//   `tagMap` mapping a tag number to an array of values
// - `cncx` maps an offset (record number * 0x10000 + offset within the
//   record) to a decoded string
// Throws if the INDX or TAGX magic is missing.
const getIndexData = async (indxIndex, loadRecord) => {
    const indxRecord = await loadRecord(indxIndex)
    const indx = getStruct(INDX_HEADER, indxRecord)
    if (indx.magic !== 'INDX') throw new Error('Invalid INDX record')
    const decoder = getDecoder(indx.encoding)

    // the TAGX section starts right after the INDX header
    const tagxBuffer = indxRecord.slice(indx.length)
    const tagx = getStruct(TAGX_HEADER, tagxBuffer)
    if (tagx.magic !== 'TAGX') throw new Error('Invalid TAGX section')
    // after the 12-byte TAGX header, each tag table entry is 4 bytes,
    // destructured below as `[tag, numValues, mask, end]`
    const numTags = (tagx.length - 12) / 4
    const tagTable = Array.from({ length: numTags }, (_, i) =>
        new Uint8Array(tagxBuffer.slice(12 + i * 4, 12 + i * 4 + 4)))

    // CNCX records come after the index data records;
    // each one is a sequence of strings prefixed with a variable-length size
    const cncx = {}
    let cncxRecordOffset = 0
    for (let i = 0; i < indx.numCncx; i++) {
        const record = await loadRecord(indxIndex + indx.numRecords + i + 1)
        const array = new Uint8Array(record)
        for (let pos = 0; pos < array.byteLength;) {
            // strings are keyed by the position of their length prefix
            const index = pos
            const { value, length } = getVarLen(array, pos)
            pos += length
            const result = record.slice(pos, pos + value)
            pos += value
            cncx[cncxRecordOffset + index] = decoder.decode(result)
        }
        // each further record adds 0x10000 to the keys
        cncxRecordOffset += 0x10000
    }

    const table = []
    for (let i = 0; i < indx.numRecords; i++) {
        const record = await loadRecord(indxIndex + 1 + i)
        const array = new Uint8Array(record)
        // note: this shadows the outer `indx`; from here on, `indx` is the
        // header of the current data record
        const indx = getStruct(INDX_HEADER, record)
        if (indx.magic !== 'INDX') throw new Error('Invalid INDX record')
        for (let j = 0; j < indx.numRecords; j++) {
            // 2-byte entry offsets are stored starting 4 bytes after `idxt`
            const offsetOffset = indx.idxt + 4 + 2 * j
            const offset = getUint(record.slice(offsetOffset, offsetOffset + 2))

            // an entry begins with a length-prefixed name
            const length = getUint(record.slice(offset, offset + 1))
            const name = getString(record.slice(offset + 1, offset + 1 + length))

            // first pass: use the control bytes (right after the name) to work
            // out which tags are present; each item is
            // `[tag, valueCount, valueBytes, numValues]`, where exactly one of
            // `valueCount` and `valueBytes` is non-null
            const tags = []
            const startPos = offset + 1 + length
            let controlByteIndex = 0
            let pos = startPos + tagx.numControlBytes
            for (const [tag, numValues, mask, end] of tagTable) {
                // an entry with the `end` bit set moves on to the next control byte
                if (end & 1) {
                    controlByteIndex++
                    continue
                }
                const offset = startPos + controlByteIndex
                const value = getUint(record.slice(offset, offset + 1)) & mask
                if (value === mask) {
                    if (countBitsSet(mask) > 1) {
                        // all bits of a multi-bit mask set: a variable-length
                        // number follows, giving the size in bytes of the values
                        const { value, length } = getVarLen(array, pos)
                        tags.push([tag, null, value, numValues])
                        pos += length
                    } else tags.push([tag, 1, null, numValues])
                } else tags.push([tag, value >> countUnsetEnd(mask), null, numValues])
            }

            // second pass: read the values for each tag, continuing at `pos`
            const tagMap = {}
            for (const [tag, valueCount, valueBytes, numValues] of tags) {
                const values = []
                if (valueCount != null) {
                    // known number of values
                    for (let i = 0; i < valueCount * numValues; i++) {
                        const { value, length } = getVarLen(array, pos)
                        values.push(value)
                        pos += length
                    }
                } else {
                    // known number of bytes: read until they are used up
                    let count = 0
                    while (count < valueBytes) {
                        const { value, length } = getVarLen(array, pos)
                        values.push(value)
                        pos += length
                        count += length
                    }
                }
                tagMap[tag] = values
            }
            table.push({ name, tagMap })
        }
    }
    return { table, cncx }
}

// Load the NCX index at `indxIndex` and turn it into a tree.
// Every index entry becomes an item whose fields are taken from the entry's
// tags; items that have a `firstChild` get a `children` array made of the
// items whose `parent` is their own `index`.
// Resolves to the list of items with `headingLevel` 0.
const getNCX = async (indxIndex, loadRecord) => {
    const { table, cncx } = await getIndexData(indxIndex, loadRecord)
    const first = values => values?.[0]
    const items = table.map((entry, index) => {
        const tags = entry.tagMap
        return {
            index,
            offset: first(tags[1]),
            size: first(tags[2]),
            // the label is looked up in the CNCX strings; '' if not found
            label: cncx[tags[3]] ?? '',
            headingLevel: first(tags[4]),
            pos: tags[6],
            parent: first(tags[21]),
            firstChild: first(tags[22]),
            lastChild: first(tags[23]),
        }
    })
    const attachChildren = item => {
        if (item.firstChild != null) {
            item.children = items
                .filter(candidate => candidate.parent === item.index)
                .map(attachChildren)
        }
        return item
    }
    return items.filter(item => item.headingLevel === 0).map(attachChildren)
}

// Parse the EXTH metadata block in `buf` (which must start at the EXTH header).
// `encoding` is the numeric MOBI encoding code used to decode text values.
// Returns an object keyed by the names in `EXTH_RECORD_TYPE`; types flagged
// as `many` are collected into arrays, and unknown types are skipped.
// Throws if the 'EXTH' magic is missing.
const getEXTH = (buf, encoding) => {
    const header = getStruct(EXTH_HEADER, buf)
    if (header.magic !== 'EXTH') throw new Error('Invalid EXTH header')
    const decoder = getDecoder(encoding)
    const results = {}
    // records follow the 12-byte header, each one laid out as
    // 4 bytes of type, 4 bytes of length (header included), then the data
    let offset = 12
    for (let i = 0; i < header.count; i++) {
        const type = getUint(buf.slice(offset, offset + 4))
        const length = getUint(buf.slice(offset + 4, offset + 8))
        const end = offset + length
        if (type in EXTH_RECORD_TYPE) {
            const [name, typ, many] = EXTH_RECORD_TYPE[type]
            const data = buf.slice(offset + 8, end)
            const value = typ === 'uint' ? getUint(data) : decoder.decode(data)
            if (many) (results[name] ??= []).push(value)
            else results[name] = value
        }
        offset = end
    }
    return results
}

// Extract the font data from a FONT resource record.
// `buf` is the record's `ArrayBuffer`; `unzlib` is an (optionally async)
// function that decompresses a `Uint8Array`.
// Resolves to the font data; if decompression fails, a warning is logged
// and the data is returned still compressed.
const getFont = async (buf, unzlib) => {
    const header = getStruct(FONT_HEADER, buf)
    const data = new Uint8Array(buf.slice(header.dataStart))
    const isObfuscated = header.flags & 0b10
    const isCompressed = header.flags & 1

    // deobfuscate font: only the beginning of the data is XORed with the key
    if (isObfuscated) {
        const span = header.keyLength === 16 ? 1024 : 1040
        const key = new Uint8Array(
            buf.slice(header.keyStart, header.keyStart + header.keyLength))
        const limit = Math.min(span, data.length)
        for (let i = 0; i < limit; i++) data[i] ^= key[i % key.length]
    }

    // decompress font
    if (!isCompressed) return data
    try {
        return await unzlib(data)
    } catch (e) {
        console.warn(e)
        console.warn('Failed to decompress font')
        return data
    }
}

// Check whether `file` (a `Blob`-like object) is a MOBI file, by comparing
// the 8 bytes at offset 60 (the PDB type and creator fields) to 'BOOKMOBI'.
// Note that the 'TEXtREAd' signature is not accepted.
export const isMOBI = async file => {
    const signature = await file.slice(60, 68).arrayBuffer()
    return getString(signature) === 'BOOKMOBI'
}

// Reader for the PDB container: a header, a table of record offsets,
// and the records themselves, which are loaded on demand.
class PDB {
    #file
    // `[start, end]` byte offsets of each record; `end` is `undefined` for
    // the last record, so that `slice()` reads up to the end of the file
    #offsets
    // the parsed `PDB_HEADER`; set by `open()`
    pdb
    // Read the header and the record offsets of `file`,
    // a `Blob`-like object (it must support `slice().arrayBuffer()`).
    async open(file) {
        this.#file = file
        const pdb = getStruct(PDB_HEADER, await file.slice(0, 78).arrayBuffer())
        this.pdb = pdb
        const buffer = await file.slice(78, 78 + pdb.numRecords * 8).arrayBuffer()
        // get start and end offsets for each record
        this.#offsets = Array.from({ length: pdb.numRecords },
            (_, i) => getUint(buffer.slice(i * 8, i * 8 + 4)))
            .map((x, i, a) => [x, a[i + 1]])
    }
    // get the `[start, end]` offsets of a record;
    // shared by the loaders below so that both reject unknown indices
    // in the same way
    #getOffsets(index) {
        const offsets = this.#offsets[index]
        if (!offsets) throw new RangeError('Record index out of bounds')
        return offsets
    }
    // Returns a promise for the `ArrayBuffer` of the record at `index`.
    // Throws `RangeError` if there is no such record.
    loadRecord(index) {
        return this.#file.slice(...this.#getOffsets(index)).arrayBuffer()
    }
    // Resolves to the first 4 bytes of the record at `index`, as a string.
    // Rejects with `RangeError` if there is no such record.
    async loadMagic(index) {
        const [start] = this.#getOffsets(index)
        return getString(await this.#file.slice(start, start + 4).arrayBuffer())
    }
}

// Reader for MOBI files, including KF8 and "combo" MOBI/KF8 files.
// Create it with `{ unzlib }` (a function used to decompress fonts), then
// call `open(file)`, which resolves to a `KF8` or `MOBI6` book object.
export class MOBI extends PDB {
    // index of the record holding the headers in use; it is non-zero only
    // when the KF8 part of a combo file is opened, and is added to the
    // indices passed to `loadRecord()` and `loadMagic()`
    #start = 0
    #resourceStart
    #decoder
    #encoder
    #decompress
    #removeTrailingEntries
    constructor({ unzlib }) {
        super()
        this.unzlib = unzlib
    }
    // Open `file` and resolve to the initialized book object:
    // `KF8` if the file is (or contains) KF8, `MOBI6` otherwise.
    async open(file) {
        await super.open(file)
        // TODO: if (this.pdb.type === 'TEXt')
        this.headers = this.#getHeaders(await super.loadRecord(0))
        // note that this is taken from the first header,
        // even if the KF8 headers are used below
        this.#resourceStart = this.headers.mobi.resourceStart
        let isKF8 = this.headers.mobi.version >= 8
        if (!isKF8) {
            const boundary = this.headers.exth?.boundary
            if (boundary < 0xffffffff) try {
                // it's a "combo" MOBI/KF8 file; try to open the KF8 part
                this.headers = this.#getHeaders(await super.loadRecord(boundary))
                this.#start = boundary
                isKF8 = true
            } catch (e) {
                console.warn(e)
                console.warn('Failed to open KF8; falling back to MOBI')
            }
        }
        await this.#setup()
        return isKF8 ? new KF8(this).init() : new MOBI6(this).init()
    }
    // Parse the headers found in the header record `buf`.
    // Returns `{ palmdoc, mobi, exth, kf8 }`; `exth` and `kf8` are `null`
    // if absent. Throws if the 'MOBI' magic is missing.
    #getHeaders(buf) {
        const palmdoc = getStruct(PALMDOC_HEADER, buf)
        const mobi = getStruct(MOBI_HEADER, buf)
        if (mobi.magic !== 'MOBI') throw new Error('Missing MOBI header')

        const { titleOffset, titleLength, localeLanguage, localeRegion } = mobi
        // the title is kept as raw bytes; it's decoded in `getMetadata()`
        mobi.title = buf.slice(titleOffset, titleOffset + titleLength)
        const lang = MOBI_LANG[localeLanguage]
        mobi.language = lang?.[localeRegion >> 2] ?? lang?.[0]

        // the EXTH block, if flagged, comes right after the MOBI header,
        // which itself starts at byte 16
        const exth = mobi.exthFlag & 0b100_0000
            ? getEXTH(buf.slice(mobi.length + 16), mobi.encoding) : null
        const kf8 = mobi.version >= 8 ? getStruct(KF8_HEADER, buf) : null
        return { palmdoc, mobi, exth, kf8 }
    }
    // Set up the text decoder and encoder, the decompressor, and the function
    // for stripping trailing entries, based on the current headers.
    // Throws if the compression type is unknown.
    async #setup() {
        const { palmdoc, mobi } = this.headers
        this.#decoder = getDecoder(mobi.encoding)
        // `TextEncoder` only supports UTF-8
        // we are only encoding ASCII anyway, so I think it's fine
        this.#encoder = new TextEncoder()

        // set up decompressor
        const { compression } = palmdoc
        this.#decompress = compression === 1 ? f => f
            : compression === 2 ? decompressPalmDOC
            : compression === 17480 ? await huffcdic(mobi, this.loadRecord.bind(this))
            : null
        if (!this.#decompress) throw new Error('Unknown compression type')

        // set up function for removing trailing bytes
        const { trailingFlags } = mobi
        // the lowest bit flags the multibyte entry;
        // every other set bit stands for one trailing entry
        const multibyte = trailingFlags & 1
        const numTrailingEntries = countBitsSet(trailingFlags >>> 1)
        // remove `length` bytes from the end of `array`;
        // `subarray(0, -length)` can't be used here, as it would return
        // an empty array, rather than the whole array, when `length` is 0
        const dropEnd = (array, length) =>
            array.subarray(0, Math.max(0, array.length - length))
        this.#removeTrailingEntries = array => {
            // the size of each entry is stored at its end
            for (let i = 0; i < numTrailingEntries; i++)
                array = dropEnd(array, getVarLenFromEnd(array))
            // the low 2 bits of the last byte give the entry's size minus one
            if (multibyte)
                array = dropEnd(array, (array[array.length - 1] & 0b11) + 1)
            return array
        }
    }
    // decode bytes to a string using the book's encoding
    decode(...args) {
        return this.#decoder.decode(...args)
    }
    // encode a string to bytes (always UTF-8; see `#setup()`)
    encode(...args) {
        return this.#encoder.encode(...args)
    }
    // load a record by index, relative to the record holding the headers
    loadRecord(index) {
        return super.loadRecord(this.#start + index)
    }
    // load a record's magic by index, relative to the record holding the headers
    loadMagic(index) {
        return super.loadMagic(this.#start + index)
    }
    // Load the text record at `index` (0-based, as record 0 holds the headers),
    // with trailing entries removed, and decompressed.
    loadText(index) {
        return this.loadRecord(index + 1)
            .then(buf => new Uint8Array(buf))
            .then(this.#removeTrailingEntries)
            .then(this.#decompress)
    }
    // Load the resource at `index`, counted from `resourceStart`.
    // Fonts are decoded with `getFont()`; for video and audio records
    // the first 12 bytes are dropped; anything else is returned as is.
    async loadResource(index) {
        const buf = await super.loadRecord(this.#resourceStart + index)
        const magic = getString(buf.slice(0, 4))
        if (magic === 'FONT') return getFont(buf, this.unzlib)
        if (magic === 'VIDE' || magic === 'AUDI') return buf.slice(12)
        return buf
    }
    // Returns a promise for the NCX items (see `getNCX()`),
    // or `undefined` if the header has no index
    getNCX() {
        const index = this.headers.mobi.indx
        if (index < 0xffffffff) return getNCX(index, this.loadRecord.bind(this))
    }
    // Returns the book's metadata; values in EXTH are preferred over those
    // in the MOBI header, and HTML entities in text fields are unescaped
    getMetadata() {
        const { mobi, exth } = this.headers
        return {
            identifier: mobi.uid.toString(),
            title: unescapeHTML(exth?.title || this.decode(mobi.title)),
            author: exth?.creator?.map(unescapeHTML),
            publisher: unescapeHTML(exth?.publisher),
            language: exth?.language ?? mobi.language,
            published: exth?.date,
            description: unescapeHTML(exth?.description),
            subject: exth?.subject?.map(unescapeHTML),
            rights: unescapeHTML(exth?.rights),
        }
    }
    // Resolves to a `Blob` of the cover image, falling back to the thumbnail;
    // resolves to `undefined` if there is neither
    async getCover() {
        const { exth } = this.headers
        const offset = exth?.coverOffset < 0xffffffff ? exth?.coverOffset
            : exth?.thumbnailOffset < 0xffffffff ? exth?.thumbnailOffset : null
        if (offset != null) {
            const buf = await this.loadResource(offset)
            return new Blob([buf])
        }
    }
}

// matches `<mbp:pagebreak>` tags (with or without the `mbp:` prefix),
// at which the text is split into sections
const mbpPagebreakRegex = /<\s*(?:mbp:)?pagebreak[^>]*>/gi
// matches tags with a `filepos` attribute, capturing its numeric value
const fileposRegex = /<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>/gi

// Book object for the older MOBI format, where the whole text is a single
// HTML stream, split into sections at `<mbp:pagebreak>` tags, and where
// links and TOC items point to byte offsets (`filepos`) in that stream.
// Create it with an opened `MOBI` and call `init()`.
class MOBI6 {
    parser = new DOMParser()
    serializer = new XMLSerializer()
    // resource index → blob URL
    #resourceCache = new Map()
    // section → HTML string
    #textCache = new Map()
    // section → blob URL of the processed document
    #cache = new Map()
    // internal list of sections: `{ book, raw, start, end }`,
    // where `start` and `end` are byte offsets in the whole text
    #sections
    // all known `filepos` values, as `{ filepos, number }`, sorted by number
    #fileposList = []
    #type = MIME.HTML
    constructor(mobi) {
        this.mobi = mobi
    }
    // Load the whole text, and set up `sections`, `toc`, `landmarks`,
    // `metadata`, and `getCover`. Resolves to `this`.
    async init() {
        // load all text records in an array
        let array = new Uint8Array()
        for (let i = 0; i < this.mobi.headers.palmdoc.numTextRecords; i++)
            array = concatTypedArray(array, await this.mobi.loadText(i))

        // convert to string so we can use regex
        // note that `filepos` are byte offsets
        // so it needs to preserve each byte as a separate character
        // (see https://stackoverflow.com/q/50198017)
        const str = Array.from(new Uint8Array(array),
            c => String.fromCharCode(c)).join('')

        // split content into sections at each `<mbp:pagebreak>`
        this.#sections = [0]
            .concat(Array.from(str.matchAll(mbpPagebreakRegex), m => m.index))
            .map((x, i, a) => str.slice(x, a[i + 1]))
            // recover the original raw bytes
            .map(str => Uint8Array.from(str, x => x.charCodeAt(0)))
            .map(raw => ({ book: this, raw }))
            // get start and end filepos for each section
            .reduce((arr, x) => {
                const last = arr[arr.length - 1]
                x.start = last?.end ?? 0
                x.end = x.start + x.raw.byteLength
                return arr.concat(x)
            }, [])

        // the public list of sections
        this.sections = this.#sections.map((section, index) => ({
            id: index,
            load: () => this.loadSection(section),
            createDocument: () => this.createDocument(section),
            size: section.end - section.start,
        }))

        // failing to get the TOC or landmarks is not fatal;
        // errors are only logged
        const fileposInNCX = []
        try {
            const ncx = await this.mobi.getNCX()
            const map = ({ label, offset, children }) => {
                // pad the offset to 10 digits
                const filepos = offset.toString().padStart(10, '0')
                const href = `filepos:${filepos}`
                fileposInNCX.push(filepos)
                label = unescapeHTML(label)
                return { label, href, subitems: children?.map(map) }
            }
            this.toc = ncx?.map(map)
            this.landmarks = await this.getGuide()

            // try to build TOC if there's no NCX
            if (!this.toc) {
                // use the links in the section that the "toc" landmark points to
                const tocHref = this.landmarks
                    .find(({ type }) => type?.includes('toc'))?.href
                if (tocHref) {
                    const { index } = this.resolveHref(tocHref)
                    const doc = await this.sections[index].createDocument()
                    this.toc = Array.from(doc.querySelectorAll('a[filepos]'),
                        a => ({
                            label: a.innerText?.trim(),
                            href: `filepos:${a.getAttribute('filepos')}`,
                        }))
                }
            }
        } catch(e) {
            console.warn(e)
        }

        // get list of all `filepos` references in the book,
        // which will be used to insert anchor elements
        // because only then can they be referenced in the DOM
        this.#fileposList = [...new Set(fileposInNCX
            .concat(Array.from(str.matchAll(fileposRegex), m => m[1])))]
            .map(filepos => ({ filepos, number: Number(filepos) }))
            .sort((a, b) => a.number - b.number)

        this.metadata = this.mobi.getMetadata()
        this.getCover = this.mobi.getCover.bind(this.mobi)
        return this
    }
    // Get the landmarks from the `<reference>` elements of the first section.
    // Resolves to a list of `{ label, type, href }`,
    // where `type` is the `type` attribute split on whitespace.
    async getGuide() {
        const doc = await this.createDocument(this.#sections[0])
        return Array.from(doc.getElementsByTagName('reference'), ref => ({
            label: ref.getAttribute('title'),
            type: ref.getAttribute('type')?.split(/\s/),
            href: `filepos:${ref.getAttribute('filepos')}`,
        }))
    }
    // Load the resource at `index` and resolve to a blob URL for it;
    // URLs are cached by index
    async loadResource(index) {
        if (this.#resourceCache.has(index)) return this.#resourceCache.get(index)
        const raw = await this.mobi.loadResource(index)
        const url = URL.createObjectURL(new Blob([raw]))
        this.#resourceCache.set(index, url)
        return url
    }
    // like `loadResource()`, but takes a `recindex` attribute value,
    // which is 1-based
    async loadRecindex(recindex) {
        return this.loadResource(Number(recindex) - 1)
    }
    // Replace, in place, references in `doc` with usable URLs:
    // `recindex` and `mediarecindex` attributes are resolved to blob URLs,
    // and `filepos` attributes are turned into `filepos:` hrefs.
    // Resources that fail to load are skipped with a warning.
    async replaceResources(doc) {
        for (const img of doc.querySelectorAll('img[recindex]')) {
            const recindex = img.getAttribute('recindex')
            try {
                img.src = await this.loadRecindex(recindex)
            } catch (e) {
                console.warn(`Failed to load image ${recindex}`)
            }
        }
        for (const media of doc.querySelectorAll('[mediarecindex]')) {
            const mediarecindex = media.getAttribute('mediarecindex')
            // on media elements, `recindex` is used for the poster
            const recindex = media.getAttribute('recindex')
            try {
                media.src = await this.loadRecindex(mediarecindex)
                if (recindex) media.poster = await this.loadRecindex(recindex)
            } catch (e) {
                console.warn(`Failed to load media ${mediarecindex}`)
            }
        }
        for (const a of doc.querySelectorAll('[filepos]')) {
            const filepos = a.getAttribute('filepos')
            a.href = `filepos:${filepos}`
        }
    }
    // Get the HTML string for a section (one of `#sections`), with an
    // `<a id="filepos...">` anchor inserted at each known `filepos` in the
    // section, and with pagebreak tags removed. Results are cached.
    async loadText(section) {
        if (this.#textCache.has(section)) return this.#textCache.get(section)
        const { raw } = section

        // insert anchor elements for each `filepos`
        // `offset` is the byte offset relative to the start of the section
        const fileposList = this.#fileposList
            .filter(({ number }) => number >= section.start && number < section.end)
            .map(obj => ({ ...obj, offset: obj.number - section.start }))
        let arr = raw
        if (fileposList.length) {
            // start with the bytes before the first anchor, then append
            // each anchor followed by the bytes up to the next one
            arr = raw.subarray(0, fileposList[0].offset)
            fileposList.forEach(({ filepos, offset }, i) => {
                const next = fileposList[i + 1]
                const a = this.mobi.encode(`<a id="filepos${filepos}"></a>`)
                arr = concatTypedArray3(arr, a, raw.subarray(offset, next?.offset))
            })
        }
        const str = this.mobi.decode(arr).replaceAll(mbpPagebreakRegex, '')
        this.#textCache.set(section, str)
        return str
    }
    // parse a section's text into a new document
    async createDocument(section) {
        const str = await this.loadText(section)
        return this.parser.parseFromString(str, this.#type)
    }
    // Resolve to a blob URL of the section's document, with the default
    // stylesheet injected and resources replaced. URLs are cached.
    async loadSection(section) {
        if (this.#cache.has(section)) return this.#cache.get(section)
        const doc = await this.createDocument(section)

        // inject default stylesheet
        const style = doc.createElement('style')
        doc.head.append(style)
        // blockquotes in MOBI seem to have only a small left margin by default
        // many books seem to rely on this, as it's the only way to set margin
        // (since there's no CSS)
        style.append(doc.createTextNode(`blockquote {
            margin-block-start: 0;
            margin-block-end: 0;
            margin-inline-start: 1em;
            margin-inline-end: 0;
        }`))

        await this.replaceResources(doc)
        const result = this.serializer.serializeToString(doc)
        const url = URL.createObjectURL(new Blob([result], { type: this.#type }))
        this.#cache.set(section, url)
        return url
    }
    // Resolve a `filepos:` href to `{ index, anchor }`: the index of the first
    // section ending after that position (-1 if none), and a function that
    // finds the anchor element in a document.
    // Throws if `href` doesn't contain `filepos:`.
    resolveHref(href) {
        const filepos = href.match(/filepos:(.*)/)[1]
        const number = Number(filepos)
        const index = this.#sections.findIndex(section => section.end > number)
        const anchor = doc => doc.getElementById(`filepos${filepos}`)
        return { index, anchor }
    }
    // like `resolveHref()`, but returns `[index, id]`, with the ID of
    // the anchor element instead of a function
    splitTOCHref(href) {
        const filepos = href.match(/filepos:(.*)/)[1]
        const number = Number(filepos)
        const index = this.#sections.findIndex(section => section.end > number)
        return [index, `filepos${filepos}`]
    }
    // find the element for an ID returned by `splitTOCHref()`
    getTOCFragment(doc, id) {
        return doc.getElementById(id)
    }
    // a URI is external if it has a scheme other than `blob:` or `filepos:`
    isExternal(uri) {
        return /^(?!blob|filepos)\w+:/i.test(uri)
    }
}

// handlers for `kindle:` uris
// resource URIs have the form `kindle:(flow|embed):ID`, optionally followed
// by `?mime=TYPE`; position URIs have the form `kindle:pos:fid:FID:off:OFF`;
// the numbers in both are written in base 32
const kindleResourceRegex = /kindle:(flow|embed):(\w+)(?:\?mime=(\w+\/[-+.\w]+))?/
const kindlePosRegex = /kindle:pos:fid:(\w+):off:(\w+)/
// parse a resource URI into `{ resourceType, id, type }`, where `type` is
// the MIME type, if any; throws `TypeError` if `str` is not a resource URI
const parseResourceURI = str => {
    const groups = str.match(kindleResourceRegex).slice(1)
    return {
        resourceType: groups[0],
        id: parseInt(groups[1], 32),
        type: groups[2],
    }
}
// parse a position URI into `{ fid, off }`;
// throws `TypeError` if `str` is not a position URI
const parsePosURI = str => {
    const [fid, off] = str.match(kindlePosRegex).slice(1)
        .map(digits => parseInt(digits, 32))
    return { fid, off }
}
// build a `kindle:pos:` URI from two numbers, which are written in
// uppercase base 32, zero-padded to 4 (`fid`) and 10 (`off`) digits
const makePosURI = (fid = 0, off = 0) => {
    const base32 = (number, width) =>
        number.toString(32).toUpperCase().padStart(width, '0')
    return `kindle:pos:fid:${base32(fid, 4)}:off:${base32(off, 10)}`
}

// `kindle:pos:` links were originally links with fragment identifiers,
// so the target should be an element with an `id` or `name` attribute,
// failing which we look for an `aid` attribute.
// Takes a piece of markup and returns an attribute selector for the first
// such attribute found in it, or `undefined` if there is none.
const getFragmentSelector = str => {
    const found = str.match(/\s(id|name|aid)\s*=\s*['"]([^'"]*)['"]/i)
    return found ? `[${found[1]}="${CSS.escape(found[2])}"]` : undefined
}

// Like `str.replace(regex, f)` but with an async replacer: `f` receives the
// usual replacer arguments and is awaited for one match at a time, in order.
const replaceSeries = async (str, regex, f) => {
    // first pass: only record the arguments of every match; the string this
    // pass produces is thrown away
    const calls = []
    str.replace(regex, (...args) => {
        calls.push(args)
        return null
    })
    // resolve the replacements sequentially
    const replacements = []
    for (const args of calls) replacements.push(await f(...args))
    // second pass: hand the replacements out in the same match order
    let next = 0
    return str.replace(regex, () => replacements[next++])
}

// Reader for the KF8 part of a book.
//
// Takes the container object (`mobi`) that provides record loading, text
// decoding, and headers. After `init()` the instance exposes `sections`,
// `toc`, `landmarks`, `dir`, `rendition`, `metadata`, and `getCover`.
// Text is assembled from "skeletons" with "fragments" inserted into them.
class KF8 {
    parser = new DOMParser()
    // object URLs created so far, keyed by `kindle:` URI string or by section
    #cache = new Map()
    // fid -> offsets referenced by TOC entries (filled in `init()`)
    #fragmentOffsets = new Map()
    // fid -> Map(offset -> CSS selector or undefined), filled lazily
    #fragmentSelectors = new Map()
    #tables = {}
    #sections
    // total length of the raw text, taken from the last FDST entry;
    // stays undefined when the FDST record could not be read
    #fullRawLength
    // raw text loaded so far from the front and from the back of the book
    #rawHead = new Uint8Array()
    #rawTail = new Uint8Array()
    #lastLoadedHead = -1
    #lastLoadedTail = -1
    #checkType = true
    #type = MIME.XHTML
    constructor(mobi) {
        this.mobi = mobi
    }
    // Read the FDST, skeleton, and fragment tables, build the section list,
    // TOC, landmarks, and rendition info. Returns `this`.
    async init() {
        const loadRecord = this.mobi.loadRecord.bind(this.mobi)
        const { kf8 } = this.mobi.headers

        try {
            const fdstBuffer = await loadRecord(kf8.fdst)
            const fdst = getStruct(FDST_HEADER, fdstBuffer)
            if (fdst.magic !== 'FDST') throw new Error('Missing FDST record')
            // entries start at byte 12; each is a pair of 4-byte numbers
            const fdstTable = Array.from({ length: fdst.numEntries },
                (_, i) => 12 + i * 8)
                .map(offset => [
                    getUint(fdstBuffer.slice(offset, offset + 4)),
                    getUint(fdstBuffer.slice(offset + 4, offset + 8))])
            this.#tables.fdstTable = fdstTable
            this.#fullRawLength = fdstTable[fdstTable.length - 1][1]
        } catch (e) {
            // best effort: without FDST, `loadRaw()` only reads from the
            // front and `loadFlow()` cannot work, but the text still loads;
            // report the problem instead of hiding it
            console.warn(e)
        }

        const skelTable = (await getIndexData(kf8.skel, loadRecord)).table
            .map(({ name, tagMap }, index) => ({
                index, name,
                numFrag: tagMap[1][0],
                offset: tagMap[6][0],
                length: tagMap[6][1],
            }))
        const fragData = await getIndexData(kf8.frag, loadRecord)
        const fragTable = fragData.table.map(({ name, tagMap }) => ({
            insertOffset: parseInt(name, 10),
            selector: fragData.cncx[tagMap[2][0]],
            index: tagMap[4][0],
            offset: tagMap[6][0],
            length: tagMap[6][1],
        }))
        this.#tables.skelTable = skelTable
        this.#tables.fragTable = fragTable

        // each skeleton owns the next `numFrag` entries of the fragment table
        this.#sections = skelTable.reduce((arr, skel) => {
            const last = arr[arr.length - 1]
            const fragStart = last?.fragEnd ?? 0, fragEnd = fragStart + skel.numFrag
            const frags = fragTable.slice(fragStart, fragEnd)
            // the initial value keeps this from throwing when a skeleton
            // has no fragments at all
            const length = skel.length
                + frags.reduce((sum, frag) => sum + frag.length, 0)
            const totalLength = (last?.totalLength ?? 0) + length
            return arr.concat({ skel, frags, fragEnd, length, totalLength })
        }, [])

        /*
        const resources = await this.getResourcesByMagic(['RESC', 'PAGE'])
        if (resources.RESC) {
            const buf = await this.mobi.loadRecord(resources.RESC)
            const str = this.mobi.decode(buf.slice(16)).replace(/\0/g, '')
            // the RESC record lacks the root `<package>` element
            // but seem to be otherwise valid XML
            const index = str.search(/\?>/)
            const xmlStr = `<package>${str.slice(index)}</package>`
            const opf = this.parser.parseFromString(xmlStr, MIME.XML)
        }*/

        // insert cover page for CFI compatibility with KindleUnpack,
        // which will pretty much always insert a cover page;
        // it will not be accessible in any way, so just insert a dummy section
        this.#sections.unshift({ frags: [] })

        // only the dummy section lacks `skel`; a real skeleton stays
        // loadable even if it happens to have no fragments
        this.sections = this.#sections.map((section, index) =>
            section.skel ? ({
                id: index,
                load: () => this.loadSection(section),
                createDocument: () => this.createDocument(section),
                size: section.length,
            }) : ({ linear: 'no' }))

        try {
            const ncx = await this.mobi.getNCX()
            const map = ({ label, pos, children }) => {
                const [fid, off] = pos
                const href = makePosURI(fid, off)
                // remember the offset so `loadText()` can find its selector
                const arr = this.#fragmentOffsets.get(fid)
                if (arr) arr.push(off)
                else this.#fragmentOffsets.set(fid, [off])
                return { label: unescapeHTML(label), href, subitems: children?.map(map) }
            }
            this.toc = ncx?.map(map)
            this.landmarks = await this.getGuide()
        } catch(e) {
            console.warn(e)
        }

        const { exth } = this.mobi.headers
        this.dir = exth.pageProgressionDirection
        this.rendition = {
            layout: exth.fixedLayout === 'true' ? 'pre-paginated' : 'reflowable',
            // `originalResolution` is split on `x` into width and height
            viewport: Object.fromEntries(exth.originalResolution
                ?.split('x')?.slice(0, 2)
                ?.map((x, i) => [i ? 'height' : 'width', x]) ?? []),
        }

        this.metadata = this.mobi.getMetadata()
        this.getCover = this.mobi.getCover.bind(this.mobi)
        return this
    }
    // is this really the only way of getting to RESC, PAGE, etc.?
    //
    // Scan the records from `resourceStart` to the last one and return an
    // object mapping each requested magic string to the index of the last
    // record that has it.
    async getResourcesByMagic(keys) {
        const results = {}
        const start = this.mobi.headers.kf8.resourceStart
        const end = this.mobi.pdb.numRecords
        for (let i = start; i < end; i++) {
            try {
                const magic = await this.mobi.loadMagic(i)
                if (keys.includes(magic)) results[magic] = i
            } catch {
                // deliberate best effort: skip records that fail to load
            }
        }
        return results
    }
    // Read the guide index, if the header points to one, and return its
    // entries as `{ label, type, href }`; returns undefined otherwise.
    async getGuide() {
        const index = this.mobi.headers.kf8.guide
        if (index < 0xffffffff) {
            const loadRecord = this.mobi.loadRecord.bind(this.mobi)
            const { table, cncx } = await getIndexData(index, loadRecord)
            return table.map(({ name, tagMap }) => ({
                label: cncx[tagMap[1][0]] ?? '',
                type: name?.split(/\s/),
                href: makePosURI(tagMap[6]?.[0] ?? tagMap[3]?.[0]),
            }))
        }
    }
    // Load the resource a `kindle:flow:` or `kindle:embed:` URI refers to
    // and return it as a Blob typed with the MIME type from the URI.
    async loadResourceBlob(str) {
        const { resourceType, id, type } = parseResourceURI(str)
        const raw = resourceType === 'flow' ? await this.loadFlow(id)
            : await this.mobi.loadResource(id - 1)
        // textual resources may themselves reference other resources
        const result = [MIME.XHTML, MIME.HTML, MIME.CSS, MIME.SVG].includes(type)
            ? await this.replaceResources(this.mobi.decode(raw)) : raw
        return new Blob([result], { type })
    }
    // Return an object URL for a `kindle:` resource URI, creating and
    // caching it on first use.
    async loadResource(str) {
        if (this.#cache.has(str)) return this.#cache.get(str)
        const blob = await this.loadResourceBlob(str)
        const url = URL.createObjectURL(blob)
        this.#cache.set(str, url)
        return url
    }
    // Replace every `kindle:flow:`/`kindle:embed:` URI in `str` with an
    // object URL; returns a promise of the new string.
    replaceResources(str) {
        const regex = new RegExp(kindleResourceRegex, 'g')
        return replaceSeries(str, regex, this.loadResource.bind(this))
    }
    // NOTE: there doesn't seem to be a way to access text randomly?
    // how to know the decompressed size of the records without decompressing?
    // 4096 is just the maximum size
    //
    // Return the raw text bytes in the range [start, end).
    // NOTE(review): the buffers and counters are updated across `await`s, so
    // overlapping calls could interleave; confirm that callers serialize.
    async loadRaw(start, end) {
        // here we load either from the front or back until we have reached the
        // required offsets; at worst you'd have to load half the book at once
        const distanceHead = end - this.#rawHead.length
        const distanceEnd = this.#fullRawLength == null ? Infinity
            : (this.#fullRawLength - this.#rawTail.length) - start
        // load from the start
        if (distanceHead < 0 || distanceHead < distanceEnd) {
            while (this.#rawHead.length < end) {
                const index = ++this.#lastLoadedHead
                const data = await this.mobi.loadText(index)
                this.#rawHead = concatTypedArray(this.#rawHead, data)
            }
            return this.#rawHead.slice(start, end)
        }
        // load from the end
        while (this.#fullRawLength - this.#rawTail.length > start) {
            const index = this.mobi.headers.palmdoc.numTextRecords - 1
                - (++this.#lastLoadedTail)
            const data = await this.mobi.loadText(index)
            this.#rawTail = concatTypedArray(data, this.#rawTail)
        }
        const rawTailStart = this.#fullRawLength - this.#rawTail.length
        return this.#rawTail.slice(start - rawTailStart, end - rawTailStart)
    }
    // Load the raw bytes of the flow at `index` of the FDST table;
    // returns undefined when `index` is 0xffffffff.
    loadFlow(index) {
        if (index < 0xffffffff)
            return this.loadRaw(...this.#tables.fdstTable[index])
    }
    // Assemble and decode the text of a section by inserting each fragment
    // into the skeleton; also records selectors for TOC targets on the way.
    async loadText(section) {
        const { skel, frags, length } = section
        // the skeleton comes first in the raw text, followed by its fragments
        const raw = await this.loadRaw(skel.offset, skel.offset + length)
        let skeleton = raw.slice(0, skel.length)
        for (const frag of frags) {
            const insertOffset = frag.insertOffset - skel.offset
            const offset = skel.length + frag.offset
            const fragRaw = raw.slice(offset, offset + frag.length)
            skeleton = concatTypedArray3(
                skeleton.slice(0, insertOffset), fragRaw,
                skeleton.slice(insertOffset))

            const offsets = this.#fragmentOffsets.get(frag.index)
            if (offsets) {
                // decode the fragment once rather than once per offset
                // NOTE(review): `off` indexes the decoded string; if the
                // offsets are byte-based, multi-byte text before them would
                // shift the slice; confirm
                const fragStr = this.mobi.decode(fragRaw)
                for (const off of offsets) {
                    const selector = getFragmentSelector(fragStr.slice(off))
                    this.#setFragmentSelector(frag.index, off, selector)
                }
            }
        }
        return this.mobi.decode(skeleton)
    }
    // Parse the text of a section into a document, without replacing
    // resource URIs.
    async createDocument(section) {
        const str = await this.loadText(section)
        return this.parser.parseFromString(str, this.#type)
    }
    // Return an object URL for a section with its `kindle:` resources
    // replaced, creating and caching it on first use.
    async loadSection(section) {
        if (this.#cache.has(section)) return this.#cache.get(section)
        const str = await this.loadText(section)

        // by default, type is XHTML; change to HTML if it's not valid XHTML
        if (this.#checkType && this.parser
            .parseFromString(str, this.#type)
            .querySelector('parsererror')) this.#type = MIME.HTML
        // let's just check it once for now
        if (this.#checkType) this.#checkType = false

        const replaced = await this.replaceResources(str)
        const url = URL.createObjectURL(new Blob([replaced], { type: this.#type }))
        this.#cache.set(section, url)
        return url
    }
    // Index of the section that contains the fragment `fid`, or -1.
    getIndexByFID(fid) {
        return this.#sections.findIndex(section =>
            section.frags.some(frag => frag.index === fid))
    }
    // Remember the selector found for offset `offset` of fragment `id`.
    #setFragmentSelector(id, offset, selector) {
        let selectors = this.#fragmentSelectors.get(id)
        if (!selectors) {
            selectors = new Map()
            this.#fragmentSelectors.set(id, selectors)
        }
        selectors.set(offset, selector)
    }
    // Resolve a `kindle:pos:` link to `{ index, anchor }`, where `anchor`
    // finds the target element in a document; returns undefined when no
    // section contains the fragment.
    async resolveHref(href) {
        const { fid, off } = parsePosURI(href)
        const index = this.getIndexByFID(fid)
        if (index < 0) return

        const saved = this.#fragmentSelectors.get(fid)?.get(off)
        if (saved) return { index, anchor: doc => doc.querySelector(saved) }

        // not seen yet: load just this fragment and search it
        const { skel, frags } = this.#sections[index]
        const frag = frags.find(frag => frag.index === fid)
        const offset = skel.offset + skel.length + frag.offset
        const fragRaw = await this.loadRaw(offset, offset + frag.length)
        const str = this.mobi.decode(fragRaw).slice(off)
        const selector = getFragmentSelector(str)
        this.#setFragmentSelector(fid, off, selector)
        // no selector means no suitable attribute was found after `off`
        const anchor = doc => selector ? doc.querySelector(selector) : null
        return { index, anchor }
    }
    // Split a `kindle:pos:` TOC link into `[sectionIndex, { fid, off }]`.
    splitTOCHref(href) {
        const pos = parsePosURI(href)
        const index = this.getIndexByFID(pos.fid)
        return [index, pos]
    }
    // Find the element for a TOC entry using the selector recorded for it,
    // or return null when none has been recorded.
    getTOCFragment(doc, { fid, off }) {
        const selector = this.#fragmentSelectors.get(fid)?.get(off)
        return selector ? doc.querySelector(selector) : null
    }
    // A URI counts as external when it starts with a scheme (`word:`) that
    // does not begin with `blob` or `kindle` (case-insensitive).
    isExternal(uri) {
        return /^(?!blob|kindle)\w+:/i.test(uri)
    }
}