Skip to content

Commit

Permalink
Added splitting long audios (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
onuratakan authored May 28, 2024
1 parent 365d623 commit 8951b1b
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
*.mp3
*.wav
*.wav*
*.png
*.db
*.db-shm
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ At this time we have many infrastructure element. We just aim to provide whole t

#### Todo
- [x] Reset Option
- [ ] Splitting long audios. (Whisper api just support <20mb)
- [x] Splitting long audios. (The Whisper API only supports files under 20 MB)
- [ ] Text input area
- [ ] More Effect

- [ ] Windows .exe
Expand Down
14 changes: 4 additions & 10 deletions gpt_computer_assistant/agent/proccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .chat_history import *

from ..audio.tts import text_to_speech
from ..audio.stt import speech_to_text

from ..audio.record import audio_data

Expand All @@ -29,18 +30,11 @@ def process_audio(take_screenshot=True, take_system_audio=False):



audio_file = open(mic_record_location, "rb")
transcription = get_client().audio.transcriptions.create(
model="whisper-1",
file=audio_file
)
transcription = speech_to_text(mic_record_location)

if take_system_audio:
audio_file2 = open(system_sound_location, "rb")
transcription2 = get_client().audio.transcriptions.create(
model="whisper-1",
file=audio_file2
)

transcription2 = speech_to_text(system_sound_location)


llm_input = "USER: "+transcription.text
Expand Down
41 changes: 41 additions & 0 deletions gpt_computer_assistant/audio/stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os
from pydub import AudioSegment
from ..llm import get_client

def split_audio(file_path, max_size=20*1024*1024):
    """Split a WAV file into chunks that each fit under ``max_size`` bytes.

    Args:
        file_path: Path to a WAV file readable by ``pydub``.
        max_size: Maximum allowed chunk size in bytes (default 20 MB).

    Returns:
        A list of ``(AudioSegment, path)`` tuples. If the file already fits
        within ``max_size``, the list holds a single entry with the ORIGINAL
        path; otherwise each chunk is exported next to the source file as
        ``<stem>_part_<i>.wav``. Callers must treat only the exported chunk
        paths as temporary files.
    """
    audio = AudioSegment.from_wav(file_path)
    file_size = os.path.getsize(file_path)
    if file_size <= max_size:
        return [(audio, file_path)]

    # One extra part guarantees every chunk stays under max_size, assuming
    # byte size is roughly proportional to duration (true for PCM WAV).
    num_parts = file_size // max_size + 1
    part_length = len(audio) // num_parts  # chunk duration in milliseconds
    # Robust to any extension length, unlike slicing off the last 4 chars.
    stem, _ext = os.path.splitext(file_path)
    parts = []

    for i in range(num_parts):
        start = i * part_length
        # The final chunk absorbs the remainder from the integer division.
        end = (i + 1) * part_length if (i + 1) < num_parts else len(audio)
        part = audio[start:end]
        part_path = f"{stem}_part_{i+1}.wav"
        part.export(part_path, format="wav")
        parts.append((part, part_path))

    return parts

def speech_to_text(location):
    """Transcribe an audio file with Whisper, splitting oversized files first.

    Args:
        location: Path to the WAV recording to transcribe.

    Returns:
        The concatenated transcription text as a plain ``str``.
        NOTE(review): at least one caller appears to access ``.text`` on the
        return value — confirm all call sites expect a string.
    """
    audio_parts = split_audio(location)
    transcriptions = []

    for _part, part_path in audio_parts:
        # Context manager closes the handle even if the API call raises.
        with open(part_path, "rb") as audio_file:
            transcription = get_client().audio.transcriptions.create(
                model="whisper-1",
                file=audio_file
            )
        transcriptions.append(transcription)
        # Bug fix: for small files split_audio returns the ORIGINAL path, and
        # unconditionally removing it deleted the caller's source recording.
        # Only exported chunk files are temporary — delete just those.
        if part_path != location:
            os.remove(part_path)

    # Each API response carries the text for its chunk; join them in order.
    full_transcription = " ".join(t.text for t in transcriptions)
    return full_transcription
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ langchain==0.2.1
langchain_community==0.2.1
pyautogui==0.9.54
sounddevice==0.4.6
soundfile==0.12.1
soundfile==0.12.1
pydub==0.25.1

0 comments on commit 8951b1b

Please sign in to comment.