Skip to content

Commit

Permalink
Added splitting long audios (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
onuratakan authored May 28, 2024
1 parent 365d623 commit 8951b1b
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
*.mp3
*.wav
*.wav*
*.png
*.db
*.db-shm
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ At this time we have many infrastructure element. We just aim to provide whole t

#### Todo
- [x] Reset Option
- [ ] Splitting long audios. (Whisper api just support <20mb)
- [x] Splitting long audios. (The Whisper API only supports files under 20 MB)
- [ ] Text input area
- [ ] More Effect

- [ ] Windows .exe
Expand Down
14 changes: 4 additions & 10 deletions gpt_computer_assistant/agent/proccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .chat_history import *

from ..audio.tts import text_to_speech
from ..audio.stt import speech_to_text

from ..audio.record import audio_data

Expand All @@ -29,18 +30,11 @@ def process_audio(take_screenshot=True, take_system_audio=False):



audio_file = open(mic_record_location, "rb")
transcription = get_client().audio.transcriptions.create(
model="whisper-1",
file=audio_file
)
transcription = speech_to_text(mic_record_location)

if take_system_audio:
audio_file2 = open(system_sound_location, "rb")
transcription2 = get_client().audio.transcriptions.create(
model="whisper-1",
file=audio_file2
)

transcription2 = speech_to_text(system_sound_location)


llm_input = "USER: "+transcription.text
Expand Down
41 changes: 41 additions & 0 deletions gpt_computer_assistant/audio/stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
from pydub import AudioSegment
from ..llm import get_client

def split_audio(file_path, max_size=20*1024*1024):
    """Split a WAV file into chunks that each fit under ``max_size`` bytes.

    Args:
        file_path: Path to a WAV file readable by ``pydub``.
        max_size: Maximum allowed chunk size in bytes (default 20 MB).

    Returns:
        A list of ``(AudioSegment, path)`` tuples. If the file already fits
        within ``max_size``, the list holds a single entry with the ORIGINAL
        path; otherwise each chunk is exported next to the source file as
        ``<stem>_part_<i>.wav``. Callers must treat only the exported chunk
        paths as temporary files.
    """
    audio = AudioSegment.from_wav(file_path)
    file_size = os.path.getsize(file_path)
    if file_size <= max_size:
        return [(audio, file_path)]

    # One extra part guarantees every chunk stays under max_size, assuming
    # byte size is roughly proportional to duration (true for PCM WAV).
    num_parts = file_size // max_size + 1
    part_length = len(audio) // num_parts  # chunk duration in milliseconds
    # Robust to any extension length, unlike slicing off the last 4 chars.
    stem, _ext = os.path.splitext(file_path)
    parts = []

    for i in range(num_parts):
        start = i * part_length
        # The final chunk absorbs the remainder from the integer division.
        end = (i + 1) * part_length if (i + 1) < num_parts else len(audio)
        part = audio[start:end]
        part_path = f"{stem}_part_{i+1}.wav"
        part.export(part_path, format="wav")
        parts.append((part, part_path))

    return parts

def speech_to_text(location):
    """Transcribe an audio file with Whisper, splitting oversized files first.

    Args:
        location: Path to the WAV recording to transcribe.

    Returns:
        The concatenated transcription text as a plain ``str``.
        NOTE(review): at least one caller appears to access ``.text`` on the
        return value — confirm all call sites expect a string.
    """
    audio_parts = split_audio(location)
    transcriptions = []

    for _part, part_path in audio_parts:
        # Context manager closes the handle even if the API call raises.
        with open(part_path, "rb") as audio_file:
            transcription = get_client().audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
        transcriptions.append(transcription)
        # Bug fix: for small files split_audio returns the ORIGINAL path, and
        # unconditionally removing it deleted the caller's source recording.
        # Only exported chunk files are temporary — delete just those.
        if part_path != location:
            os.remove(part_path)

    # Each API response carries the text for its chunk; join them in order.
    full_transcription = " ".join(t.text for t in transcriptions)
    return full_transcription
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ langchain==0.2.1
langchain_community==0.2.1
pyautogui==0.9.54
sounddevice==0.4.6
soundfile==0.12.1
soundfile==0.12.1
pydub==0.25.1

0 comments on commit 8951b1b

Please sign in to comment.