Using a speech engine to highlight words
Display a piece of text and produce spoken output via a speech engine.
As each word is being spoken, highlight the word on the display.
In languages where cursor control and highlighting are not possible, it is permissible to output each word as it is spoken.
- Related task
AutoHotkey
We use the simple SAPI.SPVoice COM Object and a parsing loop. The highlighting is done with EM_SETSEL and Notepad. Rather crude, but it works. Due to the simplistic nature of the parsing loop, the text ends with a space.
; Opens Notepad, fills it with a sentence, then walks the sentence one
; character at a time; each time a space is reached the word just collected
; is selected in the edit control and spoken through the SAPI voice object.
SetTitleMatchMode 2
EM_SETSEL := 0x00B1 ; message number sent below to set the selection range
Run notepad,,,pid
WinWaitActive ahk_pid %pid%
ControlSetText, Edit1, % text := "AutoHotkey was the first to implement this task! ", ahk_pid %pid%
pVoice := ComObjCreate("Sapi.spvoice"), i := 1 ; the spvoice COM Object ships with the OS
; parse the text; the loop stops once SubStr returns an empty string past the end
While lf := SubStr(text, i, 1)
{
If lf = %A_Space%
{
; a space ends the current word: select it (the range is computed from the
; 1-based index i and the word length), then speak it and reset the buffer
SendMessage, EM_SetSel, % i-StrLen(word)-1, % i-1, Edit1, ahk_pid %pid%
pVoice.speak(word), word := "", i++
}
; any other character is appended to the word being collected
Else word .= lf, i++
}
FreeBASIC
FreeBASIC has no native speech-synthesis command,
so we invoke VBScript directly (through mshta) to drive the Windows SAPI voice.
''This works on Windows. Does anyone know how it would be done in Linux?
'' Speaks Texto by shelling out to mshta, which runs an inline VBScript that
'' creates a SAPI.SpVoice object, calls Speak on it and closes the window.
'' Texto is concatenated into the script unescaped, so a double quote inside
'' it would break the generated command.
Sub Speak(Texto As String)
Shell "mshta vbscript:Execute(""CreateObject(""""SAPI.SpVoice"""").Speak("""""+Texto+""""")(window.close)"")"
End Sub
'' Splits Texto at the first occurrence of SplitL.
''   Direcc <= 0 : returns the part up to and including the first character
''                 of the separator (an empty string when it is not found).
''   Direcc >  0 : returns everything after that position (the whole string
''                 when the separator is not found).
Function Split(Texto As String, SplitL As String, Direcc As Byte = 0) As String
    Dim As Integer posSep = Instr(Texto, SplitL)
    If Direcc <= 0 Then
        Return Left(Texto, posSep)
    End If
    Return Right(Texto, Len(Texto) - posSep)
End Function
'' Break the sentence into words (each keeps its trailing space, except the
'' last one), then print and speak them one after another.
Dim As String texto = "Actions speak louder than words"
Dim As String palabra()
Dim As Integer ultimo = -1

Do
    ultimo += 1
    Redim Preserve palabra(ultimo)
    Dim As String trozo = Split(texto, " ")
    If Len(trozo) = 0 Then
        '' no separator left: whatever remains is the final word
        palabra(ultimo) = texto
        Exit Do
    End If
    palabra(ultimo) = trozo
    '' drop the piece just taken from the front of the remaining text
    texto = Right(texto, Len(texto) - Len(trozo))
Loop

For idx As Integer = 0 To Ubound(palabra)
    Print palabra(idx)
    Speak palabra(idx)
Next idx
FutureBasic
FB, a free macOS development language, has an advanced speech synthesis engine that not only speaks words in a wide variety of languages and dialects, but can identify individual words in text and be used to highlight them. To fully appreciate this code, compile it with FB and run on any Intel or Apple Silicon Mac.
include "Tlbx AVFoundation.incl"
output file "Speech Synthesis with Indicator"
// Text that is shown in the text view and later spoken.
CFStringRef string
string = @"FutureBasic is a free Macintosh development language.\n¬
Considering the contempt some programmers have for the BASIC programming ¬
language -- \"BASIC ruins programmers\" -- it's a shame FB has the word ¬
'Basic' in its official name. Not only can FB handle BASIC source code, ¬
but since it's a front end to clang, it can interpret C, Apple's Core Foundation, ¬
Objective-C (Cocoa), HTML, XML, SOAP, UNIX Shell, Open GL, etc. ¬
This makes it an excellent tool for prototyping -- especially for programmatic ¬
Objective-C when the overhead of Xcode is not needed.\n¬
Compiling and running this code as a stand-alone Mac application should ¬
calm any fears concerning FB's capabilities with the most advanced technologies."
// Window tag.
_window = 1
// Tags for the views and controls inside the window, numbered from 1.
begin enum 1
_scrlView
_textView
_speakBtn
_resetBtn
_pauseBtn
_resumeBtn
_voicePopup
end enum
// Builds the main window: a scrollable text view holding the sample text,
// the Speak / Reset / Pause / Resume buttons, and a popup listing the
// en-US, en-GB and en-AU voices sorted by name. Each popup menu item gets
// its voice attached as the "voice" property so it can be looked up later.
void local fn BuildWindow
  NSInteger wndStyle = NSWindowStyleMaskResizable¬
  + NSWindowStyleMaskTitled¬
  + NSWindowStyleMaskClosable¬
  + NSWindowStyleMaskMiniaturizable
  CGRect r = fn CGRectMake( 0, 0, 500, 500 )
  window _window, @"Using Speech Engine to Highlight Spoken Words", r, wndStyle
  WindowSetMinSize( _window, fn CGSizeMake( 400, 270 ) )

  // Text view inside a scroll view that resizes with the window.
  r = fn CGRectMake( 20, 60, 460, 425 )
  scrollview _scrlView, r, NSBezelBorder
  ViewSetAutoresizingMask( _scrlView, NSViewWidthSizable + NSViewHeightSizable )
  textview _textView,, _scrlView,,_window
  TextViewSetTextContainerInset( _textView, fn CGSizeMake( 10, 10 ) )
  TextSetString( _textView, string )

  // Style the text, then re-insert the string over its full length.
  CFMutableAttributedStringRef aString = fn TextViewMutableAttributedString( _textView )
  MutableAttributedStringSetFontWithName( aString, @"Times", 16.0 )
  MutableAttributedStringSetFirstLineHeadIndent( aString, 10.0 )
  MutableAttributedStringSetParagraphSpacing( aString, 20.0 )
  MutableAttributedStringBeginEditing( aString )
  // A CFRange is (location, length): the length must be the whole string,
  // otherwise the last original character is left in place and duplicated.
  MutableAttributedStringReplaceCharactersInRange( aString, string, fn CFRangeMake( 0, len(string) ) )
  MutableAttributedStringEndEditing( aString )

  // Buttons along the bottom edge; Pause and Resume are created with NO.
  r = fn CGRectMake( 380, 13, 110, 32 )
  button _speakBtn,,, @"Speak text", r
  ViewSetAutoresizingMask( _speakBtn, NSViewMinXMargin + NSViewMaxYMargin )
  r = fn CGRectMake( 300, 13, 80, 32 )
  button _resetBtn,,, @"Reset", r
  ViewSetAutoresizingMask( _resetBtn, NSViewMinXMargin + NSViewMaxYMargin )
  r = fn CGRectMake( 220, 13, 80, 32 )
  button _pauseBtn, NO,, @"Pause", r
  ViewSetAutoresizingMask( _pauseBtn, NSViewMinXMargin + NSViewMaxYMargin )
  r = fn CGRectMake( 140, 13, 80, 32 )
  button _resumeBtn, NO,, @"Resume", r
  ViewSetAutoresizingMask( _resumeBtn, NSViewMinXMargin + NSViewMaxYMargin )

  // Keep only the English (US, GB, AU) voices.
  AVSpeechSynthesisVoiceRef voice
  CFArrayRef voices = fn AVSpeechSynthesisVoiceSpeechVoices
  CFMutableArrayRef temp = fn MutableArrayNew
  for voice in voices
    if fn StringIsEqual( fn AVSpeechSynthesisVoiceLanguage( voice ), @"en-US" ) || ¬
    fn StringIsEqual( fn AVSpeechSynthesisVoiceLanguage( voice ), @"en-GB" ) || ¬
    fn StringIsEqual( fn AVSpeechSynthesisVoiceLanguage( voice ), @"en-AU" )
      MutableArrayAddObject( temp, voice )
    end if
  next

  // Sort by name and fill the popup with the voice names.
  SortDescriptorRef sd = fn SortDescriptorWithKey( @"name", YES )
  voices = fn ArraySortedArrayUsingDescriptors( temp, @[sd] )
  r = fn CGRectMake( 20, 13, 110, 32 )
  popupbutton _voicePopup,, 0, fn ArrayValueForKey( voices, @"name" ), r

  // Attach each voice object to the menu item at the same index.
  MenuRef popMenu = fn PopUpButtonMenu( _voicePopup )
  long index = 0
  for voice in voices
    MenuItemSetProperty( popMenu, index, @"voice", voice )
    index++
  next
end fn
// Synthesizer delegate callback.
// - "will speak range" event: fetches the range about to be spoken, shows
//   the find indicator on it in the text view and scrolls it into view.
// - "did finish utterance" event: scrolls back to the start of the text and
//   removes the "synth" application property.
// utterance and userData are not used here.
void local fn MySpeechSynthesizerCallback( ref as AVSpeechSynthesizerRef, ev as long, utterance as AVSpeechUtteranceRef, userData as ptr )
CFRange range
select ( ev )
case _speechSynthesizerWillSpeakRangeOfSpeechString
range = fn AVSpeechSynthesizerDelegateRange( ref )
TextViewShowFindIndicatorForRange( _textView, range )
TextScrollRangeToVisible( _textView, range )
case _speechSynthesizerDidFinishSpeechUtterance
range = fn CFRangeMake( 0, 0 )
TextScrollRangeToVisible( _textView, range )
AppRemoveProperty( @"synth" )
end select
end fn
// Starts speaking the text view's current contents.
// Sets the Speak/Pause/Resume buttons to NO/YES/NO (presumably their enabled
// state), builds an utterance with volume 0.5 and rate 0.3, applies the voice
// stored on the selected popup item if there is one, creates a synthesizer,
// stores it as the "synth" application property so the other handlers can
// find it, installs the delegate callback and begins speaking.
void local fn HighlightSpokenText
button _speakBtn, NO
button _pauseBtn, YES
button _resumeBtn, NO
AVSpeechUtteranceRef utterance = fn AVSpeechUtteranceWithString( fn TextString( _textView ) )
AVSpeechUtteranceSetVolume( utterance, 0.5 )
AVSpeechUtteranceSetRate( utterance, 0.3 )
AVSpeechSynthesisVoiceRef voice = fn MenuItemProperty( fn PopUpButtonMenu( _voicePopup ), fn PopUpButtonIndexOfSelectedItem( _voicePopup ), @"voice" )
if ( voice ) then AVSpeechUtteranceSetVoice( utterance, voice )
AVSpeechSynthesizerRef synth = fn AVSpeechSynthesizerInit
AppSetProperty( @"synth", synth )
// the text view's attributed string is handed to the callback as user data
CFMutableAttributedStringRef aString = fn TextViewMutableAttributedString( _textView )
AVSpeechSynthesizerSetDelgateCallback( synth, @fn MySpeechSynthesizerCallback, aString )
AVSpeechSynthesizerSpeakUtterance( synth, utterance )
end fn
// Pauses the synthesizer stored in the "synth" application property at the
// next word boundary, then sets Pause to NO and Resume to YES.
void local fn PauseSpokeText
AVSpeechSynthesizerRef synth = fn AppProperty( @"synth" )
fn AVSpeechSynthesizerPauseSpeakingAtBoundary( synth, AVSpeechBoundaryWord )
button _pauseBtn, NO
button _resumeBtn, YES
end fn
// Resumes the synthesizer stored in the "synth" application property, then
// sets Pause to YES and Resume to NO.
void local fn ResumeSpokeText
AVSpeechSynthesizerRef synth = fn AppProperty( @"synth" )
fn AVSpeechSynthesizerContinueSpeaking( synth )
button _pauseBtn, YES
button _resumeBtn, NO
end fn
// Stops speech immediately, scrolls the text view back to the start, drops
// the "synth" application property and returns the buttons to their initial
// state (Speak YES, Pause NO, Resume NO).
local fn Reset
AVSpeechSynthesizerRef synth = fn AppProperty( @"synth" )
fn AVSpeechSynthesizerStopSpeakingAtBoundary( synth, AVSpeechBoundaryImmediate )
CFRange range = fn CFRangeMake( 0, 0 )
TextScrollRangeToVisible( _textView, range )
AppRemoveProperty( @"synth" )
button _speakBtn, YES
button _pauseBtn, NO
button _resumeBtn, NO
end fn
// Dialog event handler: routes button clicks to the matching action by
// control tag, and ends the program when the window is about to close.
void local fn DoDialog( ev as long, tag as long, wnd as long )
select ( ev )
case _btnClick
select ( tag )
case _speakBtn : fn HighlightSpokenText
case _pauseBtn : fn PauseSpokeText
case _resumeBtn : fn ResumeSpokeText
case _resetBtn : fn Reset
end select
case _windowWillClose : end
end select
end fn
// Program start: register the dialog handler, build the window, then enter
// the event loop.
on dialog fn DoDialog
fn BuildWindow
HandleEvents
Go
This uses the eSpeak speech synthesizer which is invoked for each word in the text. As the word is spoken it is printed to the terminal in capitalized form (and the previous word is uncapitalized). After a second's delay the final word is uncapitalized.
Very robotic but it works.
package main
import (
"fmt"
"log"
"os/exec"
"strings"
"time"
)
// main speaks a fixed sentence one word at a time through the external
// espeak command. After each word is spoken it is printed in upper case;
// backspaces are used to step back over the previously printed word so it
// can be rewritten in its original case. One second after the last word,
// that word is rewritten in its original case too.
func main() {
	const sentence = "Actions speak louder than words."

	previous := "" // last word printed, with its trailing space
	erase := ""    // run of backspaces covering previous

	for _, w := range strings.Fields(sentence) {
		if err := exec.Command("espeak", w).Run(); err != nil {
			log.Fatal(err)
		}
		if n := len(previous); n > 0 {
			erase = strings.Repeat("\b", n)
		}
		fmt.Print(erase, previous, strings.ToUpper(w), " ")
		previous = w + " "
	}

	erase = strings.Repeat("\b", len(previous))
	time.Sleep(time.Second)
	fmt.Println(erase + previous)
}
Julia
"""
    speak(sentence, cmd = "/utl/espeak.bat")

Speak `sentence` word by word through the external command `cmd`.

For each whitespace-separated word, the lowercase letters a-z are extracted.
If any remain, they are printed in upper case, passed to `cmd`, and after a
one second pause erased again with backspaces. The original word followed by
a space is then printed in every case.
"""
function speak(sentence, cmd = "/utl/espeak.bat")
    for token in split(sentence)
        letters = filter(c -> 'a' <= c <= 'z', lowercase(token))
        if !isempty(letters)
            print(uppercase(letters))
            run(`$cmd $letters`)
            sleep(1)
            print(repeat("\b", length(letters)))
        end
        print(token, " ")
    end
end

speak("Are those 144 shy Eurasian footwear, cowboy chaps, or jolly earthmoving headgear?")
M2000 Interpreter
\\ Speaks a sentence asynchronously through SAPI.SpVoice and, on every word
\\ boundary event, reprints the sentence with the current word in another pen.
Module UsingEvents {
Form 60, 32
Cls 5, 0
Pen 14
Declare WithEvents sp "SAPI.SpVoice"
That$="Rosetta Code is a programming chrestomathy site"
\\ left margin used to centre the sentence on the line
margin=(width-Len(That$))/2
EndStream=False
\\ this function called as sub routine - same scope as Module
\\ we can call it from event function too
Function Localtxt {
\\ move the cursor to middle line
Cursor 0, height/2
\\ using OVER the line erased with background color and then print text over
\\ ordinary Print using transparent printing of text
\\ $(0) set mode to non proportional text, @() move the cursor to specific position
Print Over $(0),@(margin), That$
}
Call Local LocalTxt()
\\ event handler for word boundary events: redraw the plain sentence, then
\\ overprint the word at CharacterPosition (0-based) for Length characters
Function sp_Word {
Read New StreamNumber, StreamPosition, CharacterPosition, Length
Call Local LocalTxt()
Cursor 0, height/2
Pen 15 {Print Part $(0), @(CharacterPosition+margin); Mid$(That$, CharacterPosition+1, Length)}
Refresh
}
\\ event handler for end of stream: sets the flag that ends the wait loop below
Function sp_EndStream {
Refresh
EndStream=True
}
Const SVEEndInputStream = 4
Const SVEWordBoundary = 32
Const SVSFlagsAsync = 1&
\\ subscribe to the two events handled above, then speak asynchronously
With sp, "EventInterests", SVEWordBoundary+SVEEndInputStream
Method sp, "Speak", That$, SVSFlagsAsync
\\ wait until sp_EndStream sets the flag
While Not EndStream {Wait 10}
\\ final redraw of the sentence without a highlighted word
Call Local LocalTxt()
}
UsingEvents
Mathematica /Wolfram Language
(* Panel with the text (split into words, word number i shown in red when
   i is not 0), an input field bound to the text, and a "Speak" button.
   The button steps i through the words, speaking each one and pausing for
   at least 0.7 s (0.12 s per character for longer words), then resets i to 0
   so that no word is highlighted. *)
DynamicModule[{text = "This is some text.", words, i = 0},
Panel@Column@{Dynamic[
Row[Riffle[
If[i != 0, MapAt[Style[#, Red] &, #, i], #] &@(words =
StringSplit@text), " "]]], InputField[Dynamic@text, String],
Button["Speak",
While[i < Length@words, i++; FinishDynamic[]; Speak[words[[i]]];
Pause[Max[0.7, 0.12 StringLength[words[[i]]]]]]; i = 0]}]
Nim
Works on Linux but may also work on other platforms provided "espeak" is installed.
import os, osproc, strutils
# Speak a sentence word by word with the external "espeak" program.
# After each word is spoken it is written in upper case; backspaces step back
# over the previously written word so it can be rewritten in its original
# case. One second after the last word, that word is restored as well.
const S = "Actions speak louder than words."

var prev, bs = ""   # previous word (with trailing space) and backspace run
var prevlen = 0     # number of characters to step back over

for word in S.splitWhitespace():
  discard execProcess("espeak " & word)
  if prevlen > 0:
    bs = repeat('\b', prevlen)
  # toUpperAscii is the strutils routine for upper-casing ASCII text
  stdout.write bs, prev, word.toUpperAscii, ' '
  # flush so the word appears immediately rather than when the buffer fills
  stdout.flushFile()
  prev = word & ' '
  prevlen = word.len + 1

bs = repeat('\b', prevlen)
sleep(1000)
echo bs, prev
Phix
You can run this online here.
--
-- demo\rosetta\Speech.exw
-- =======================
--
with javascript_semantics
requires(6)         -- WINDOWS or JS, not LINUX
requires(32)        -- Windows 32 bit only, for now...
                    -- (^ runs fine on a 64-bit OS, but needs a 32-bit p.exe)
requires("1.0.2")
include builtins\speak.e    -- (new in 1.0.2)
include pGUI.e

Ihandle t3, left, red, right, btn, hbc, dlg

-- not sure why, but a leading space really helps...
constant text = ` Highlight words as they are spoken.`

-- Called by speak() for each word: splits the text into the part before the
-- word, the word itself (shown on the red label) and the part after it.
-- A negative pos is taken relative to the end of the text.
procedure speech_cb(integer pos, len)
    if pos<0 then pos += length(text)+1 end if
    IupSetStrAttribute(left,"TITLE",text[1..pos])
    IupSetStrAttribute(red,"TITLE",text[pos+1..pos+len])
    IupSetStrAttribute(right,"TITLE",text[pos+len+1..$])
    IupSetAttributes({left,red,right}, "RASTERSIZE=x0")
    IupRefresh(t3)
    IupRedraw(t3)
end procedure

-- Button handler: picks a platform-specific speech rate and starts speaking.
function button_cb(Ihandle /*ih*/)
    atom rate = iff(platform()=WINDOWS?-5:  -- -10..+10, voice dependent
                iff(platform()=JS?0.3:      -- 0.1..10, 0.5 = half speed
                                  9/0))     -- linux, anyone?...
    speak(text,rate,speech_cb)
    return IUP_CONTINUE
end function

-- Builds the dialog: three labels side by side (left, red, right) above a
-- centred "Speak" button.
procedure main()
    IupOpen()
    left = IupLabel()
    red = IupLabel("",`FGCOLOR="255 0 0"`)
    right = IupLabel(text)
    t3 = IupHbox({IupFill(),left,red,right,IupFill()},
                 `FONT="Verdana, 18", MARGIN=0x20`)
    btn = IupButton("Speak",Icallback("button_cb"))
    hbc = IupHbox({IupFill(),btn,IupFill()},"MARGIN=0x10")
    dlg = IupDialog(IupVbox({t3,hbc}),"TITLE=Speak")
    IupShow(dlg)
    if platform()!=JS then
        IupMainLoop()
        IupClose()
    end if
end procedure

main()
Raku
# 20200826 Raku programming solution
# Speaks a sentence word by word with /usr/bin/espeak. After each word is
# spoken it is printed in upper case; backspaces step back over the previous
# word so it is rewritten in its original case.
my \s = "Actions speak louder than words.";
# sigilless names bound to anonymous scalar containers so they can be reassigned
my \prev = $ = "";
my \prevLen = $ = 0;
my \bs = $ = "";
for s.split(' ', :skip-empty) {
# die if the espeak process does not succeed
run "/usr/bin/espeak", $_ or die;
bs = "\b" x prevLen if prevLen > 0;
printf "%s%s%s ", bs, prev, $_.uc;
prev = "$_ ";
prevLen = $_.chars + 1
}
# restore the last word to its original case
printf "%s%s\n", "\b" x prevLen, prev;
REXX
Programming note: This REXX program uses a freeware program NIRCMD to interface with the Microsoft Windows speech synthesizer program SAM, a text to speech using a male voice. SAM can possibly be configured to use other voices with later releases of Windows. More recent Microsoft Windows have another speech synthesizer program: ANNA.
Each word of the text is highlighted (by showing the word in uppercase). The terminal screen is cleared before showing the text that is being spoken; the repeated calls to the (Windows) speech engine make for a slower speech rate.
/*REXX program uses a command line interface to invoke Windows SAM for speech synthesis.*/
/* For every word of the text: clear the screen, show the whole text with that word in  */
/* uppercase, then pass the (uppercased) word in double quotes to NIRCMD to be spoken.  */
parse arg t /*get the (optional) text from the C.L.*/
#= words(t)
if #==0 then exit /*Nothing to say? Then exit program.*/
dq= '"' /*needed to enclose text in dbl quotes.*/
rate= 1 /*talk: -10 (slow) to 10 (fast). */
/* [↓] where the rubber meets the road*/
do j=1 for #
x= word(t, j); upper x /*extract 1 word, capitalize it for HL.*/
if j==1 then LHS= /*obtain text before the spoken word. */
else LHS= subword(t, 1, j-1)
if j==# then RHS= /*obtain text after the spoken word. */
else RHS= subword(t, j+1)
'CLS' /*use this command to clear the screen.*/
say 'speaking: ' space(LHS x RHS) /*show text, one word is capitalized. */
oneWord= dq x dq /*surround a word in double quotes (").*/
'NIRCMD' "speak text" oneWord rate /*NIRCMD invokes Microsoft's Sam voice*/
end /*j*/ /*stick a fork in it, we're all done. */
Note: The name of the above REXX program is SPEAKHI.REX
usage using the command:
speakhi This is an example of speech synthesis.
Ring
load "guilib.ring"
# Application setup: one window with a label, a line edit, a "Say Hello"
# button (runs pHello), a "Close" button (runs pClose) and a QTextToSpeech
# object used by pHello. Text holds the sentence split into a list of words.
MyApp = New qApp {
win1 = new qWidget() {
setwindowtitle("Hello World")
setGeometry(100,100,370,250)
# sentence to speak, then split on spaces into a list of words
Text = "Welcome to the Ring Programming Language"
Text = split(Text," ")
label1 = new qLabel(win1) {
settext("What is your name ?")
setGeometry(10,20,350,30)
setalignment(Qt_AlignHCenter)
}
btn1 = new qpushbutton(win1) {
setGeometry(10,200,100,30)
settext("Say Hello")
setclickevent("pHello()")
}
btn2 = new qpushbutton(win1) {
setGeometry(150,200,100,30)
settext("Close")
setclickevent("pClose()")
}
lineedit1 = new qlineedit(win1) {
setGeometry(10,100,350,30)
}
# speech engine object, used from pHello
voice = new QTextToSpeech(win1) {
}
show()
}
exec()}
# Click handler for "Say Hello": prefixes the line edit's content with
# "Hello ", then speaks every word of Text and prints it on its own line.
Func pHello
	lineedit1.settext( "Hello " + lineedit1.text())
	for item in Text
		voice.Say(item)
		see item + nl
	next
# Click handler for "Close": quits the application.
Func pClose
MyApp.quit()
- Output:
Welcome to the Ring Programming Language
Ruby
I'm having difficulty figuring out how to get Shoes to update the GUI (like Tk's update
command), so the user must click the button once for each word.
Uses the Ruby code from Speech synthesis
load 'speechsynthesis.rb'
# Pick the text to speak: a built-in default when ARGV holds exactly one
# entry, otherwise everything after the first ARGV entry joined by spaces.
# The text is then split into words on whitespace.
$text = if ARGV.length == 1
          "This is default text for the highlight and speak program"
        else
          ARGV[1..-1].join(" ")
        end
$words = $text.split
# Shoes GUI: shows the sentence with one word in bold and a button that
# speaks the word at the current index, then moves the bold marker on to the
# following word (wrapping around at the end). Ctrl-Q exits.
Shoes.app do
  @idx = 0

  stack do
    lead = strong($words[0] + " ")
    rest = $words[1..-1].map { |w| span(w + " ") }
    @sentence = para(lead, rest)
    button("Say word") { say_and_highlight }
  end

  keypress do |key|
    exit if key == :control_q || key == "\x11"
  end

  # Speaks the current word, advances the index and redraws the sentence
  # with the word at the new index in bold.
  def say_and_highlight
    speak $words[@idx]
    @idx = (@idx + 1) % $words.length
    styled = $words.each_with_index.map do |word, pos|
      pos == @idx ? strong(word + " ") : span(word + " ")
    end
    @sentence.replace(styled)
  end
end
Tcl
This code uses the external /usr/bin/say
program (known available on Mac OS X) as its interface to the speech engine; this produces rather stilted speech because it forces the text to be spoken one word at a time instead of as a whole sentence (in order to keep the highlighting synchronized).
package require Tcl 8.5
package require Tk 8.5
# say --
#   Speaks the contents of a text widget one word at a time through
#   /usr/bin/say, selecting (highlighting) each word while it is spoken.
#
# Arguments:
#   text    path of the text widget whose contents are spoken
#   button  path of the button that started the action; it is grabbed and
#           disabled while speaking and restored afterwards
#
# An empty widget is handled (nothing is spoken), and the button state and
# grab are restored even when speaking fails; the error is then re-raised.
proc say {text button} {
    grab $button
    $button configure -state disabled -cursor watch
    update

    # Start from empty lists so a widget without any words is not an error.
    set lengths {}
    set strings {}
    set ends {}

    set failed [catch {
        set starts [$text search -all -regexp -count lengths {\S+} 1.0]
        foreach start $starts length $lengths {
            lappend strings [$text get $start "$start + $length char"]
            lappend ends [$text index "$start + $length char"]
        }
        $text tag remove sel 1.0 end
        foreach from $starts str $strings to $ends {
            $text tag add sel $from $to
            # let the selection be drawn before the blocking call to say
            update idletasks
            exec /usr/bin/say << $str
            $text tag remove sel 1.0 end
        }
    } result options]

    # Always undo the UI changes, whether or not speaking succeeded.
    catch {$text tag remove sel 1.0 end}
    grab release $button
    $button configure -state normal -cursor {}

    if {$failed} {
        return -options $options $result
    }
}
# Build the GUI: a text widget above a full-width button that runs "say" on
# it, and put the sample sentence into the text widget.
text .t
pack .t
button .b -text "Speak, computer!" -command [list say .t .b]
pack .b -fill x
.t insert 1.0 "This is an example of speech synthesis with Tcl/Tk."
Wren
The ability to call external processes such as espeak is expected to be added to Wren-cli in the next release. In the meantime, we embed the following Wren script in a minimal C host (no error checking) to complete this task.
/* Using_a_speech_engine_to_highlight_words.wren */
import "./str" for Str
// Foreign methods that the host program must bind for this module.
class C {
// pause for usec microseconds
foreign static usleep(usec)
// speak the string s with espeak
foreign static espeak(s)
// flush the host's standard output
foreign static flushStdout()
}
// Speak the sentence word by word. Each word is written in upper case and
// flushed before it is spoken; backspaces step back over the previously
// written word so it is rewritten in its original case.
var s = "Actions speak louder than words."
var prev = ""     // previous word with its trailing space
var prevLen = 0   // number of characters to step back over
var bs = ""
for (word in s.split(" ")) {
    if (prevLen > 0) bs = "\b" * prevLen
    System.write("%(bs)%(prev)%(Str.upper(word)) ")
    C.flushStdout()
    C.espeak(word)
    prev = word + " "
    prevLen = word.count + 1
}
bs = "\b" * prevLen
// usleep takes microseconds: pause one full second before restoring the
// last word, so its highlight stays visible.
C.usleep(1000000)
System.print("%(bs)%(prev)")
We now embed this in the following C program, compile and run it.
/* gcc Using_a_speech_engine_to_highlight_words.c -o Using_a_speech_engine_to_highlight_words -lwren -lm */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "wren.h"
/* Foreign method C.usleep(usec): sleeps for the number of microseconds
   passed in slot 1. */
void C_usleep(WrenVM* vm) {
    double requested = wrenGetSlotDouble(vm, 1);
    usleep((useconds_t)requested);
}
/* Foreign method C.espeak(s): runs `espeak "<s>"` through system().
   The string from slot 1 is placed between double quotes unescaped. */
void C_espeak(WrenVM* vm) {
    const char *text = wrenGetSlotString(vm, 1);
    /* 8 bytes for `espeak "`, 1 for the closing quote, 1 for the NUL */
    size_t size = strlen(text) + 10;
    char command[size];
    snprintf(command, size, "espeak \"%s\"", text);
    system(command);
}
/* Foreign method C.flushStdout(): flushes the C stdout stream. vm is unused. */
void C_flushStdout(WrenVM* vm) {
fflush(stdout);
}
/* Resolves the foreign methods of class C in module "main".
   Returns the matching C function for the static signatures usleep(_),
   espeak(_) and flushStdout(), and NULL for anything else. */
WrenForeignMethodFn bindForeignMethod(
    WrenVM* vm,
    const char* module,
    const char* className,
    bool isStatic,
    const char* signature) {
    if (strcmp(module, "main") != 0) return NULL;
    if (strcmp(className, "C") != 0) return NULL;
    if (!isStatic) return NULL;

    if (strcmp(signature, "usleep(_)") == 0) return C_usleep;
    if (strcmp(signature, "espeak(_)") == 0) return C_espeak;
    if (strcmp(signature, "flushStdout()") == 0) return C_flushStdout;
    return NULL;
}
/* Write callback installed in the Wren configuration: prints text to stdout
   unchanged. vm is unused. */
static void writeFn(WrenVM* vm, const char* text) {
printf("%s", text);
}
/* Reads the whole file fileName into a newly allocated, NUL-terminated
   buffer.
   Returns the buffer (the caller must free() it), or NULL when the file
   cannot be opened, its size cannot be determined, or memory runs out. */
char *readFile(const char *fileName) {
    FILE *f = fopen(fileName, "r");
    if (f == NULL) return NULL;

    char *script = NULL;
    if (fseek(f, 0, SEEK_END) == 0) {
        long fsize = ftell(f);
        if (fsize >= 0) {
            rewind(f);
            script = malloc((size_t)fsize + 1);
            if (script != NULL) {
                /* Terminate after the bytes actually read: in text mode
                   fread may deliver fewer bytes than ftell reported. */
                size_t nread = fread(script, 1, (size_t)fsize, f);
                script[nread] = '\0';
            }
        }
    }
    fclose(f);
    return script;
}
/* Completion callback set by loadModule: frees the module source buffer if
   one was set. vm and module are unused. */
static void loadModuleComplete(WrenVM* vm, const char* module, WrenLoadModuleResult result) {
if( result.source) free((void*)result.source);
}
/* Module loader: for every module except "random" and "meta", reads
   "<name>.wren" and returns its source together with a completion callback
   that frees it. For those two names a zeroed result is returned. */
WrenLoadModuleResult loadModule(WrenVM* vm, const char* name) {
    WrenLoadModuleResult result = {0};
    if (strcmp(name, "random") == 0 || strcmp(name, "meta") == 0) {
        return result;
    }

    /* room for the name, ".wren" (5 bytes) and the terminating NUL */
    size_t size = strlen(name) + 6;
    char fullName[size];
    snprintf(fullName, size, "%s.wren", name);

    result.onComplete = loadModuleComplete;
    result.source = readFile(fullName);
    return result;
}
int main(int argc, char **argv) {
WrenConfiguration config;
wrenInitConfiguration(&config);
config.writeFn = &writeFn;
config.bindForeignMethodFn = &bindForeignMethod;
config.loadModuleFn = &loadModule;
WrenVM* vm = wrenNewVM(&config);
const char* module = "main";
const char* fileName = "Using_a_speech_engine_to_highlight_words.wren";
char *script = readFile(fileName);
wrenInterpret(vm, module, script);
wrenFreeVM(vm);
free(script);
return 0;
}