-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathwsi
231 lines (219 loc) · 11.8 KB
/
wsi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/zsh
#If you replace the shell above with bash, it should work but tested less rigorously.
# Copyright (c) 2024 Quantius Benignus
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#--------------------------------------------------------------------------
# NAME: wsi
# PREREQUSITES:
# - whisper.cpp installation (see https://github.com/ggerganov/whisper.cpp) or a portable whisperfile
# - recent versions of 'sox', 'curl', 'xsel' or 'wl-copy' and CLI tools from your system's repositories.
# - optional xdotool (X11) or ydotool (Wayland) for automatic paste after successful transcription
#--------------------------------------------------------------------------
#X11 or Wayland (2nd line may catch edge cases):
wm="$XDG_SESSION_TYPE"
#wm="${XDG_SESSION_TYPE:-$(loginctl show-session $(loginctl | grep $(whoami) | awk '{print $1}') -p Type --value)}"
#---USER CONFIGURATION BLOCK----------------------------------------------------------------------
#Please, adjust the variables here to suit your environment:
# Store temp files in memory for speed and to reduce SSD/HDD "grinding":
TEMPD='/dev/shm'
# Hardcoded temp wav file to store the voice memo and get overwritten every time (in RAM):
ramf="$TEMPD/wfile"
#Set the number of processing threads for whisper.cpp inference (adjust for your case):
NTHR=8
#It seems that the optimum number of transcribe threads should equal half CPU processing cores:
#NTHR=$(( $(getconf _NPROCESSORS_ONLN) / 2 ))
# Use CLIPBOARD unless primary selection (middle mouse button to paste) selected (change here to true or via CLI flag -p):
PRIMESEL=false
#Model superdirectory where AI models (Whisper, LLMs, TTS etc.) are stored (mostly needed for wsiAI):
AI=$HOME/PATH_TO_YOUR/Models
# Default whisper.cpp model file for ASR inference (with local whisper.cpp, possibly whisperfile):
#WMODEL="$TEMPD/ggml-base.en.bin"
WMODEL=${WHISPER_DMODEL:-"$AI/whisper/ggml-base.en.bin"}
# Uncomment to use available whisperfile (downloaded to a folder in the PATH and set to EXECUTABLE):
# WHISPERFILE=whisper-tiny.en.llamafile
# (For the next one needs xdotool (X11) or ydotool (Wayland).
# Change to 'true' to paste automatically on completed transcription (using 'xdotool'):
AUTOPASTE=true
#Provide hardcoded whisper.cpp hostname and port. To be used when invoked with -n option from CLI:
#(Can be overwritten when command line argument IP:PORT is supplied instead of '-n')
WHOST="127.0.0.1"
WPORT="58080"
#---END USER CONFIGURATION BLOCK--------------------------------------------------------------------
#Notification code, prefers zenity, then notify-send, (which should be available across
#most distributions, in package libnotify or libnotify-bin), KDE fallback is kdialog:
desknote() {
local title="$1"
local message="$2"
if command -v zenity &> /dev/null; then
zenity --info --text="$message"
elif command -v notify-send &> /dev/null; then
notify-send "$title" "$message"
elif command -v kdialog &> /dev/null; then
kdialog --passivepopup "$message" 5
else
echo "Notification Message: $message" >&2
echo "Please install zenity or notify-send (or kdialog for KDE) to use desktop alerts." >&2
echo "You can install either using your package manager, e.g.:" >&2
echo " sudo apt-get install zenity or sudo apt-get install libnotify-bin" >&2
echo " sudo yum install zenity or sudo apt-get install libnotify" >&2
echo " sudo pacman -S zenity, etc." >&2
fi
}
#---CHECK DEPENDENCIES. This block can be commented out once dependencies confirmed-----------------------------------
#The install-wsi script should have taken care of this, left here for the manual install.
command -v curl &>/dev/null || { echo "curl is required. Please, install curl" >&2 ; exit 1 ; }
command -v sox &>/dev/null || { echo "sox is required. Please, install sox" >&2 ; exit 1 ; }
[[ -n $WHISPERFILE ]] || command -v transcribe &>/dev/null || { echo -e "Please, install whisper.cpp (see https://github.com/ggerganov/whisper.cpp)\
\nand create 'transcribe' in your PATH as a symbolic link to the main executable, e.g.\n \
'ln -s /full/path/to/whisper.cpp/main \$HOME/.local/bin/transcribe'" >&2 ; exit 1 ; }
# We will use zenity or notify-send (part of libnotify or libnotify-bin in some distros) for desktop notifications:
command -v zenity &>/dev/null || command -v notify-send &>/dev/null || { echo "zenity or notify-send needed for error reporting. Please, install zenity or libnotify-bin" >&2 ; exit 1 ; }
#Now let's check if we are in X11 or Wayland and use the right utility:
if [[ wm == "wayland" ]]; then
command -v wl-copy &>/dev/null || { echo "wl-copy is needed for the clipboard. Please, install wl-copy" >&2 ; exit 1 ; }
if [ "$AUTOPASTE" = true ]; then
command -v ydotool &>/dev/null || { echo "To have the transcribed text pasted automatically at the cursor position, please install ydotool" >&2 ; exit 1 ; }
fi
elif [[ wm == "X11" ]]; then
command -v xsel &>/dev/null || { echo "We rely on xsel for the clipboard. Please, install xsel." >&2 ; exit 1 ; }
if [ "$AUTOPASTE" = true ]; then
command -v xdotool &>/dev/null || { echo "To have the transcribed text pasted automatically at the cursor position, please install xdotool" >&2 ; exit 1 ; }
fi
fi
[[ -f $WMODEL ]] || { desknote "File not found: " "$WMODEL was not found. \nPlease, correct in USER CONFIGURATION BLOCK." ; exi1 1 ; }
[[ -n $WHISPERFILE ]] && { [[ -x $WHISPERFILE ]] || desknote "Not found: " "Executable $WHISPERFILE not found." ; exit 1 ; }
echo -e "\nLooks like you have all dependencies installed.\nTo remove this message, comment out the 'CHECK DEPENDENCIES' block in the wsi script.\n"
desknote "Dependencies installed" "\nLooks like you have all dependencies installed.\n \
To remove this message, comment out the 'CHECK DEPENDENCIES' block in the wsi script.\n \
\nYou can run wsi --help from the Terminal to learn about its options..."
#---END CHECK DEPENDENCIES. The above block can be commented out after successful 1st run----------------------------
# Process command line arguments first
while [ $# -gt 0 ]; do
case "$1" in
-p|--primary)
# To enable pasting with the middle mouse button
PRIMESEL=true
shift
;;
-c|--clipboard)
# This is transitional warning due to breaking change (will be removed in the future). Clipboard is now the default:
desknote "Breaking Change:" "The '-c' flag is deprecated, since clipboard is now default. \n \
Remove unnecessary '-c' from your desktop shortcut definitions. \n \
To use primary selection as the text source, use the '-p' flag in your desktop shortcuts."
shift
;;
-w|--whisperfile)
whf="$WHISPERFILE"
shift
;;
-n|--netapi)
#This uses the hostname or IP and port specified in the config block
#Can be overwritten with a supplied command-line argument IP:PORT instead of this option
IPnPORT="$WHOST:$WPORT"
if [[ "$(curl -s -f -o /dev/null -w '%{http_code}' $IPnPORT)" != "200" ]]; then
echo "Can't connect to whisper.cpp server at provided address!" >&2
desknote "No connection to Whisper.cpp server" "No whisper.cpp server at $IPnPORT."
exit 1
fi
shift
;;
--help)
echo "Usage: $0 [-p|--primary] [-n|--netapi] [-w|--whisperfile] [IP:PORT]"
echo " -p, --primary: Use PRIMARY selection instead of CLIPBOARD (default - the better choice when autopaste is on)"
echo " -n, --netapi: Use the preconfigured IP/hostanem and port to call a whisper.cpp server for inference."
echo " -w, --whisperfile: Use the preconfigured whisperfile***.llamafile for inference."
echo " any command-line argument: Expected to be of the form IP:PORT or HOSTNAME:PORT to call a server other than the preconfigured."
exit 0
;;
*)
#The network address and port should have already been sanitized in extension
IPnPORT=$1
if [[ "$(curl -s -f -o /dev/null -w '%{http_code}' $IPnPORT)" != "200" ]]; then
echo "Can't connect to a whisper.cpp server at provided address!" >&2
desknote "No connection to Whisper.cpp server" "No whisper.cpp server at $IPnPORT."
exit 1
fi
shift
;;
esac
done
#Hear the complaints of the above tools and do not continue with the sequence:
set -e
rec -q -t wav $ramf rate 16k silence 1 0.1 1% 1 2.0 5% 2>/dev/null
#The server inference has precedence (i.e. -w will be ignored if -n is present).
if [ -n "$IPnPORT" ]; then
str=$(curl -S -s $IPnPORT/inference \
-H "Content-Type: multipart/form-data" \
-F file="@$ramf" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F response_format="text")
# | jq -r '.text' )
elif [[ "$whf" == *.llamafile ]]; then
#echo "Using whisperfile: "
str="$($whf -t $NTHR -nt -f $ramf 2>/dev/null)"
else
str="$(transcribe -t $NTHR -nt -m $WMODEL -f $ramf 2>/dev/null)"
fi
# Whisper detected non-speech events such as (wind blowing):
str="${str/\(*\)}"
str="${str/\[*\]}"
str="${str#$'\n'}"
str="${str#$'\n'}"
#Prefer the power of zsh, but lose full POSIX compliance.
if [ -n "$ZSH_NAME" ]; then
str="${str#*([[:space:]])}"
str="${(C)str:0:1}${str#?}"
elif [ -n "$BASH" ]; then
#Running in bash because you changed the shebang on line 1
str="${str##+([[:space:]])}"
str="${str^}"
else
echo "Unknown shell, assuming bash compliance"
str="${str##+([[:space:]])}"
str="${str^}"
fi
#We have a result, now we make a few decisions:
#If this is somehow run in a text console:
if [[ -z "${DISPLAY}" ]] || [[ -z "${DESKTOP_SESSION}" ]] || [[ -z "${XDG_CURRENT_DESKTOP}" ]]; then
#"Not running in a known graphics environment. Using standard output:
echo $str ; exit 0
else
#xsel or wl-copy:
case "$wm" in
"x11")
if [ "$PRIMESEL" = true ]; then
echo $str | xsel -ip
# Simply clicking the middle mouse button. The user must take care of: 1. Window choice. 2. Position within window.
# Thus, autopaste from the primary selection is more unpredictable and requires extra care of window focus and mouse pointer repositioning.
if [ "$AUTOPASTE" = true ]; then xdotool click 2; fi
else
echo $str | xsel -ib
# The automatic paste option makes more sense when using the CLIPBOARD, not primary sellection.
if [ "$AUTOPASTE" = true ]; then xdotool key ctrl+v; fi
fi
;;
"wayland")
if [ "$PRIMESEL" = true ]; then
echo $str | wl-copy -p
# Simply clicking the middle mouse button. The user must take care of: 1. Window choice. 2. Position within window.
# Thus, autopaste from the primary selection is more unpredictable and requires extra care of window focus and mouse pointer repositioning.
if [ "$AUTOPASTE" = true ]; then ydotool click 0xC2; fi
else
echo $str | wl-copy
# The key sym sequence may differ from the one below on a variety of systems and keyboard layouts:
if [ "$AUTOPASTE" = true ]; then ydotool key 37:1 55:1 55:0 37:0 ; fi
fi
;;
*)
echo $str
;;
esac
fi