Skip to content

Commit

Permalink
add support for newer openai_whisper engine responses in whisper asr
Browse files Browse the repository at this point in the history
  • Loading branch information
bscholer committed Apr 10, 2024
1 parent 17b6352 commit f8350b4
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 11 deletions.
11 changes: 7 additions & 4 deletions src/transcribe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { PayloadData, payloadGenerator, preprocessWhisperASRResponse } from "src
import { StatusBar } from "./status";
import { SupabaseClient } from "@supabase/supabase-js";
import * as tus from "tus-js-client";
import { WhisperASRSegment } from "./types/whisper-asr";
import { WhisperASRResponse, WhisperASRSegment } from "./types/whisper-asr";

/**
 * Function signature for a transcription backend: accepts a `TFile` and
 * returns a promise that resolves to a string.
 * NOTE(review): the resolved string is presumably the transcript text — confirm against the backend implementations.
 */
type TranscriptionBackend = (file: TFile) => Promise<string>;

Expand Down Expand Up @@ -157,14 +157,17 @@ export class TranscriptionEngine {
const response = await requestUrl(options);
if (this.settings.debug) console.log("Raw response:", response);

const preprocessed = preprocessWhisperASRResponse(response.json);
// ASR_ENGINE=faster_whisper returns each segment as a positional array instead of an object. Preprocess those responses to match the standard object format.
const preprocessed = Array.isArray(response.json.segments[0])
? preprocessWhisperASRResponse(response.json) : response.json as WhisperASRResponse;

if (this.settings.debug) console.log("Preprocessed response:", preprocessed);

// Create segments for each word timestamp if word timestamps are available
const wordSegments = preprocessed.segments
.reduce((acc: components["schemas"]["TimestampedTextSegment"][], segment: WhisperASRSegment) => {
if (segment.wordTimestamps) {
acc.push(...segment.wordTimestamps.map(wordTimestamp => ({
if (segment.words) {
acc.push(...segment.words.map(wordTimestamp => ({
start: wordTimestamp.start,
end: wordTimestamp.end,
text: wordTimestamp.word
Expand Down
9 changes: 7 additions & 2 deletions src/types/whisper-asr.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,12 @@ export type WhisperASRSegment = {
/**
* An array of word-level timestamps, providing detailed timing for each word spoken in the segment.
*/
wordTimestamps: WhisperASRWordTimestamp[] | null;
words: WhisperASRWordTimestamp[] | null;

/**
* The unique identifier for the segment.
*/
id: number;
};

/**
Expand Down Expand Up @@ -116,7 +121,7 @@ export type WhisperASRWordTimestamp = {
* The model's confidence in the accuracy of this word transcription.
* @example 0.941351592540741
*/
confidence: number;
probability: number;
};


10 changes: 5 additions & 5 deletions src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* Utility functions for Obsidian Transcript */
import { App, FileSystemAdapter, getBlobArrayBuffer } from "obsidian";
import { WhisperASRResponse, WhisperASRSegment } from "./types/whisper-asr";
import { WhisperASRResponse, WhisperASRSegment, WhisperASRWordTimestamp } from "./types/whisper-asr";

/**
 * Generates a pseudo-random string made of lowercase base-36 characters
 * (`0-9`, `a-z`).
 *
 * Each character is drawn independently. The previous implementation built a
 * single 16-character chunk and repeated it, so any result longer than 16
 * characters was periodic, and short chunks were padded with `'0'`.
 *
 * Uses `Math.random`, so the result is NOT suitable for security-sensitive
 * purposes (tokens, secrets).
 *
 * @param length - Number of characters to generate. Zero, negative, or NaN
 *   values yield an empty string.
 * @returns A string of `length` characters from the base-36 alphabet.
 */
export const randomString = (length: number): string => {
  const alphabet = "0123456789abcdefghijklmnopqrstuvwxyz";
  let result = "";
  for (let i = 0; i < length; i++) {
    // Math.random() is in [0, 1), so the index is always within the alphabet.
    result += alphabet.charAt(Math.floor(Math.random() * alphabet.length));
  }
  return result;
};
/**
 * Splits a block of text into its individual lines.
 *
 * Both Unix (`\n`) and Windows (`\r\n`) line endings are treated as
 * separators; the separators themselves are not included in the result.
 *
 * @param cache - The text to split.
 * @returns The lines of `cache`, in order. An empty input yields `[""]`.
 */
export const getAllLinesFromFile = (cache: string) => {
  const lineBreak = /\r?\n/;
  return cache.split(lineBreak);
};
Expand Down Expand Up @@ -74,15 +74,15 @@ export function preprocessWhisperASRResponse(rawResponse: any): WhisperASRRespon
avg_logprob: segment[7],
compression_ratio: segment[8],
no_speech_prob: segment[9],
wordTimestamps: null,
words: null,
} as WhisperASRSegment;
if (segment[10] !== null) { // easier to read than a ternary-destructured assignment
baseSegment.wordTimestamps = segment[10].map((wordTimestamp: any) => ({
baseSegment.words = segment[10].map((wordTimestamp: unknown[]) => ({
start: wordTimestamp[0],
end: wordTimestamp[1],
word: wordTimestamp[2],
confidence: wordTimestamp[3],
}));
probability: wordTimestamp[3],
} as WhisperASRWordTimestamp));
}
return baseSegment;
})
Expand Down

0 comments on commit f8350b4

Please sign in to comment.