using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
#if (!__MonoCS__)
using System.Speech.Synthesis;
#endif
using System.Xml;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading;
using System.Runtime.InteropServices;
#if (!__MonoCS__)
namespace TtsRelay
{
public class MsSpeechRelay : ITtsRelay
{
/// <summary>
/// SAPI TTS server
/// </summary>
private SpeechSynthesizer ttsServer;
/// <summary>
/// Maps SAPI visemes to SmartBody phonemes in order to generate viseme timings
/// </summary>
private List<string> visemeIDMap;
/// <summary>
/// Variable to enable/disable debug prints
/// </summary>
private bool doDebugChecks = true;
/// <summary>
/// Keeps track of the total duration of the phonemes and visemes encountered so far in a single request.
/// Used instead of the provided AudioPosition starting points, since those are way off.
/// </summary>
private double totalVisemeDuration = 0;
private double totalPhonemeDuration = 0;
private GenerateAudioReply generateAudioReply = null;
private KeyValuePair<double, double> wordMarker;
private bool markerExists = false;
XmlDocument facefxMappingDoc = null;
string visemeMappingType = "facefx";
const string facefxMapping = @"
";
public bool Init(string visemeMapping)
{
/// Initialize phoneme to viseme mappings
SetVisemeMapping(visemeMapping);
ttsServer = new SpeechSynthesizer();
Console.WriteLine("\nAvailable SAPI compatible voices on this machine are:\n");
foreach (InstalledVoice v in ttsServer.GetInstalledVoices())
{
Console.WriteLine("\"" + v.VoiceInfo.Name + "\"");
}
Console.WriteLine("\n");
/// Add message callbacks to receive TTS events
ttsServer.BookmarkReached += new EventHandler<BookmarkReachedEventArgs>(ttsServer_BookmarkReached);
ttsServer.PhonemeReached += new EventHandler<PhonemeReachedEventArgs>(ttsServer_PhonemeReached);
ttsServer.VisemeReached += new EventHandler<VisemeReachedEventArgs>(ttsServer_VisemeReached);
return true;
}
public string [] GetVoices()
{
//"Microsoft Anna"
//"Microsoft Mary"
//"Microsoft Mike"
List<string> list = new List<string>();
foreach (InstalledVoice v in ttsServer.GetInstalledVoices())
{
list.Add(v.VoiceInfo.Name.Replace(" ", "|"));
}
return list.ToArray();
}
/// <summary>
/// Get culture of selected voice
/// </summary>
/// <param name="strVoiceName">name of the voice chosen</param>
/// <returns>culture name for the xml:lang attribute, or an empty string if the voice is not found</returns>
public string GetCulture(string strVoiceName)
{
foreach (InstalledVoice v in ttsServer.GetInstalledVoices())
{
// return the name of the culture used for the 'xml:lang' attribute
if (v.VoiceInfo.Name.Replace(" ", "|") == strVoiceName)
return v.VoiceInfo.Culture.Name;
}
//
return "";
}
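// Example (sketch): the culture name returned above is intended for the SSML root's xml:lang
// attribute, e.g. (assuming an en-US voice):
//   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">Hello</speak>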
public void SetVisemeMapping(string visemeMapping)
{
pvMapInit(visemeMapping);
}
/// <summary>
/// Generate TTS audio, takes SSML input
/// </summary>
/// <param name="message">SSML message</param>
/// <param name="outputFileName">File name to save output in</param>
/// <param name="voice">Voice to use</param>
/// <returns></returns>
public bool GenerateAudio(string message, string outputFileName, string messageOutputFileName, string voice, ref string xmlReplyReturn, ref GenerateAudioReply generateAudioReplyReturn)
{
xmlReplyReturn = "";
generateAudioReply = generateAudioReplyReturn;
generateAudioReply.used = true;
generateAudioReply.soundFile = messageOutputFileName;
generateAudioReply.WordBreakList = new List<KeyValuePairS<double, double>>();
generateAudioReply.MarkList = new List<KeyValuePairS<string, double>>();
generateAudioReply.VisemeList = new List<GenerateAudioReplyViseme>();
totalPhonemeDuration = 0;
totalVisemeDuration = 0;
if (doDebugChecks)
{
Console.WriteLine("Generating audio for message with voice: " + voice);
}
bool allOk = true;
// Added to make the application compatible with the NeoSpeech voice engine.
// NOTE: the markup originally replaced here was stripped from this copy of the file;
// String.Replace("", ".") throws ArgumentException, so the lost tag must be restored.
message = message.Replace("", ".");
ttsServer.SetOutputToWaveFile(outputFileName);
// In GetVoices(), we replace the space in the name with a pipe '|' because we can't select a voice with a space in the name (because it's a vhmsg sbm command).
// Here, we put the space back in, so that we can correctly select the voice via the SAPI function SelectVoice()
string voiceModified = voice.Replace("|", " ");
Console.WriteLine("Selecting SAPI voice: " + voiceModified);
try
{
ttsServer.SelectVoice(voiceModified);
}
catch (Exception e)
{
Console.WriteLine("Exception while choosing voice: " + e.ToString());
}
/// We have a pre-tweaked message, no need to tamper with it
if (doDebugChecks)
{
Console.WriteLine("Debug: Generating speech for SSML string: \"" + message + "\"...\n");
}
try
{
if (allOk)
{
ttsServer.SpeakSsml(message);
}
}
catch (Exception e)
{
Console.WriteLine(e.Message);
}
// set the output to null to release the reference on the audio file
// see http://msdn.microsoft.com/en-us/library/system.speech.synthesis.speechsynthesizer.setoutputtowavestream
ttsServer.SetOutputToNull();
return allOk;
}
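// Example (sketch): how a host application might drive this relay. ITtsRelay and
// GenerateAudioReply come from the surrounding project; the voice name and file paths are
// illustrative, not defaults.
//
//   ITtsRelay relay = new MsSpeechRelay();
//   relay.Init("facefx");
//   string xmlReply = "";
//   GenerateAudioReply reply = new GenerateAudioReply();
//   relay.GenerateAudio(
//       "<speak version=\"1.0\" xmlns=\"http://www.w3.org/2001/10/synthesis\" xml:lang=\"en-US\">Hello</speak>",
//       "utt.wav", "utt.wav", "Microsoft|Anna", ref xmlReply, ref reply);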
/// <summary>
/// Initialize all phoneme to viseme mappings
/// </summary>
private void pvMapInit(string visemeMapping)
{
visemeMappingType = visemeMapping;
visemeIDMap = new List<string>();
if (visemeMapping == "sbmold")
{
visemeIDMap.Insert(0, "_");
visemeIDMap.Insert(1, "Ih"); /// Viseme for aa, ae, ah
visemeIDMap.Insert(2, "Ao"); /// Viseme for aa
visemeIDMap.Insert(3, "Ao"); /// ao
visemeIDMap.Insert(4, "Ih"); /// ey, eh, uh
visemeIDMap.Insert(5, "Er"); /// er
visemeIDMap.Insert(6, "Ih"); /// y, iy, ih, ix
visemeIDMap.Insert(7, "Oh"); /// w, uw
visemeIDMap.Insert(8, "Oh"); /// ow
visemeIDMap.Insert(9, "Ih"); /// aw
visemeIDMap.Insert(10, "Oh"); /// oy
visemeIDMap.Insert(11, "Ih"); /// ay
visemeIDMap.Insert(12, "Oh"); /// h
visemeIDMap.Insert(13, "R"); /// r
visemeIDMap.Insert(14, "D"); /// l
visemeIDMap.Insert(15, "Z"); /// s, z
visemeIDMap.Insert(16, "j"); /// sh, ch, jh, zh
visemeIDMap.Insert(17, "Th"); /// th, dh
visemeIDMap.Insert(18, "F"); /// f, v
visemeIDMap.Insert(19, "D"); /// d, t, n - also try NG: 2 to 1 against
visemeIDMap.Insert(20, "KG"); /// k, g, ,ng - also try NG: 2 to 1 against
visemeIDMap.Insert(21, "BMP"); /// p, b, m
}
else if (visemeMapping == "sbm")
{
/// Map constructed from viseme reference:
/// http://msdn.microsoft.com/en-us/library/ms720881(VS.85).aspx
///
visemeIDMap.Insert(0, "_"); /// silence
visemeIDMap.Insert(1, "Ah"); /// Viseme for aa, ae, ah
visemeIDMap.Insert(2, "Aa"); /// Viseme for aa
visemeIDMap.Insert(3, "Ao"); /// ao
visemeIDMap.Insert(4, "Eh"); /// ey, eh, uh
visemeIDMap.Insert(5, "Er"); /// er
visemeIDMap.Insert(6, "Ih"); /// y, iy, ih, ix
visemeIDMap.Insert(7, "W"); /// w, uw
visemeIDMap.Insert(8, "Ow"); /// ow
visemeIDMap.Insert(9, "Aw"); /// aw
visemeIDMap.Insert(10, "Oy"); /// oy
visemeIDMap.Insert(11, "Ay"); /// ay
visemeIDMap.Insert(12, "H"); /// h
visemeIDMap.Insert(13, "R"); /// r
visemeIDMap.Insert(14, "L"); /// l
visemeIDMap.Insert(15, "Z"); /// s, z
visemeIDMap.Insert(16, "Sh"); /// sh, ch, jh, zh
visemeIDMap.Insert(17, "Th"); /// th, dh
visemeIDMap.Insert(18, "F"); /// f, v
visemeIDMap.Insert(19, "D"); /// d, t, n - also try NG: 2 to 1 against
visemeIDMap.Insert(20, "KG"); /// k, g, ,ng - also try NG: 2 to 1 against
visemeIDMap.Insert(21, "BMP"); /// p, b, m
}
else if (visemeMapping == "facefx")
{
//visemeIDMap.Insert(0, "_");
try
{
facefxMappingDoc = new XmlDocument();
facefxMappingDoc.LoadXml(facefxMapping);
}
catch (Exception e)
{
Console.WriteLine("error loading the xml string for facefx visemes:" + e.ToString() + "\n");
}
}
}
/// <summary>
/// Viseme callback - maps SAPI visemes to SmartBody visemes/phonemes
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
private void ttsServer_VisemeReached(object sender, VisemeReachedEventArgs e)
{
int sapiVisemeCount;
if (visemeMappingType.Equals("facefx"))
sapiVisemeCount = 21;
else
sapiVisemeCount = visemeIDMap.Count;
if (doDebugChecks)
{
int vindex = e.Viseme;
/// Safety check: if either of these branches triggers, the SAPI viseme set has changed; update the map accordingly.
if (vindex < 0)
{
Console.WriteLine("Viseme index bound by 0: " + vindex.ToString() + "\n");
vindex = 0;
}
else if (vindex >= sapiVisemeCount)
{
Console.WriteLine("Viseme index bound by " + sapiVisemeCount + ": " + vindex.ToString() + "\n");
vindex = sapiVisemeCount - 1;
}
if (vindex != e.Viseme)
Console.WriteLine("Viseme index truncated from " + e.Viseme.ToString() + " to " + vindex + "\n");
//Console.WriteLine("Reached viseme: " + e.Viseme.ToString() + " aka: " + visemeIDMap[vindex] + " at time: " + e.AudioPosition.TotalSeconds.ToString() + " for duration: " + e.Duration.ToString() + "\n");
Console.WriteLine("Total viseme duration: " + this.totalVisemeDuration);
}
// We should be able to just take the AudioPosition time, which denotes the point in the request where the viseme starts,
// but this is way off for some reason; different visemes start at the exact same time, and the schedule extends beyond
// the length of the audio file.
//generateAudioReply.VisemeList.Add(new GenerateAudioReplyViseme(visemeIDMap[e.Viseme], e.AudioPosition.TotalSeconds, 1.0));
// Instead, we manually keep track of the total duration so far, and use that as the starting point for each viseme.
if (visemeMappingType.Equals("facefx"))
{
XmlNodeList entries = facefxMappingDoc.GetElementsByTagName("entry");
for (int i = 0; i < entries.Count; ++i)
{
string target = entries[i].Attributes["target"].InnerText;
string articulation = entries[i].Attributes["amount"].InnerText;
string viseme = entries[i].Attributes["viseme"].InnerText;
if (e.Viseme.Equals(Convert.ToInt32(viseme)))
{
//xmlReply += "";
try
{
double articulationValue = Convert.ToDouble(articulation);
generateAudioReply.VisemeList.Add(new GenerateAudioReplyViseme(target, totalVisemeDuration, articulationValue ));
}
catch(Exception e1)
{
Console.WriteLine(e1.ToString());
}
}
}
}
else
{
//xmlReply += "";
generateAudioReply.VisemeList.Add(new GenerateAudioReplyViseme(visemeIDMap[e.Viseme], totalVisemeDuration, 1.0));
}
this.totalVisemeDuration += e.Duration.TotalSeconds;
}
/// <summary>
/// Phoneme reached callback - only accumulates the total phoneme duration for debug output
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
private void ttsServer_PhonemeReached(object sender, PhonemeReachedEventArgs e)
{
totalPhonemeDuration += e.Duration.TotalSeconds;
if (doDebugChecks)
{
Console.WriteLine("Reached phoneme: " + e.Phoneme + " at time: " + e.AudioPosition.TotalSeconds.ToString() + " for duration: " + e.Duration.ToString() + "\n");
Console.WriteLine("Total phoneme duration: " + this.totalPhonemeDuration);
//byte[] b = System.Text.Encoding.Unicode.GetBytes(e.Phoneme.ToCharArray());
//Console.WriteLine("Chars for bytes: ");
//Console.WriteLine(System.Text.Encoding.ASCII.GetChars(b));
//string asciiPhoneme = System.Text.Encoding.ASCII.GetString(System.Text.Encoding.Convert(Encoding.Unicode,Encoding.ASCII,b));
//Console.WriteLine("Test ascii phoneme: " + asciiPhoneme + "\n");
//Console.WriteLine("Viseme attached= " + phonemeToViseme[phoneme] + "\n");
}
}
/// <summary>
/// Bookmark reached callback - used to mark word beginnings and ends
/// </summary>
/// <param name="sender"></param>
/// <param name="e"></param>
private void ttsServer_BookmarkReached(object sender, BookmarkReachedEventArgs e)
{
String bookmark;
bookmark = e.Bookmark.Substring(e.Bookmark.IndexOf(':') + 1);
if (doDebugChecks)
{
Console.WriteLine("Reached bookmark: " + bookmark + " at time: " + e.AudioPosition.TotalSeconds.ToString() + "\n");
}
// The provided AudioPosition is erroneous, so we manually keep track of where we are using the aggregate viseme duration.
//generateAudioReply.MarkList.Add(new KeyValuePair(bookmark, e.AudioPosition.TotalSeconds));
generateAudioReply.MarkList.Add(new KeyValuePairS<string, double>(bookmark, totalVisemeDuration));
/// Since we don't have a word beginning/ending callback, we resort
/// to relying on the fact that each word in the SSML message is
/// enclosed in (book)marks, so the number of marks is
/// always even. The logic should not rely on the mark's name, only on
/// whether it is odd or even in order of occurrence.
if (markerExists)
{
/// word tag exists, write end time
//generateAudioReply.WordBreakList.Add(new KeyValuePair(wordMarker.Key, e.AudioPosition.TotalSeconds));
generateAudioReply.WordBreakList.Add(new KeyValuePairS<double, double>(wordMarker.Key, totalVisemeDuration));
markerExists = false;
}
else
{
/// no word tag, add new word tag
//wordMarker = new KeyValuePair(e.AudioPosition.TotalSeconds, 0);
wordMarker = new KeyValuePair<double, double>(totalVisemeDuration, 0);
markerExists = true;
}
}
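// Example (sketch): the odd/even pairing above assumes the incoming SSML wraps every word in a
// pair of bookmarks, e.g. (mark names illustrative):
//   <mark name="T0"/>hello<mark name="T1"/> <mark name="T2"/>world<mark name="T3"/>
// The first mark of each pair opens a word entry; the second closes it with the word's end time.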
}
}
#endif