Azure Text-to-Speech using C#

The Azure Text-to-Speech service generates speech from text. This guide shows how to stream that speech audio in real time over a VoIP call using the Ozeki VoIP SIP SDK.

text to speech conversion
Figure 1 - Text to Speech conversion

Create a Speech resource in the Azure portal.

Create Speech resource
Figure 2 - Create Speech resource

After your Speech resource is deployed, select Go to resource to view and manage keys. For more information about Azure AI services resources, see Get the keys for your resource.

Get keys for resource
Figure 3 - Get keys for resource

Install the Speech SDK for C#

In Solution Explorer, right-click the Microsoft_Azure_Text_To_Speech project, and then select Manage NuGet Packages to show NuGet Package Manager.

Manage NuGet packages
Figure 4 - Manage NuGet packages

In the upper-right corner, find the Package Source dropdown box, and make sure that nuget.org is selected.

Select nuget.org source
Figure 5 - Select nuget.org source

In the upper-left corner, select Browse.

Select Browse
Figure 6 - Select Browse

In the search box, enter Microsoft.CognitiveServices.Speech and select Enter.

Search for Microsoft.CognitiveServices.Speech
Figure 7 - Search for Microsoft.CognitiveServices.Speech

From the search results, select the Microsoft.CognitiveServices.Speech package, and then select Install to install the latest stable version.

Install Microsoft.CognitiveServices.Speech package
Figure 8 - Install Microsoft.CognitiveServices.Speech package

Accept all agreements and licenses to start the installation.

Accept all agreements and licenses
Figure 9 - Accept all agreements and licenses

After the package is installed, a confirmation appears in the Package Manager Console window.

Ring a VoIP phone and, once the call is answered,
play a text-to-speech message (C# example)

using System;
using System.IO;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
using System.Xml;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Ozeki.Media;
using Ozeki.VoIP;

namespace Microsoft_Azure_Text_To_Speech
{
    /// <summary>
    /// Console sample: registers a SIP account, dials a number and, once the call
    /// is answered, synthesizes a sentence with Azure Text-to-Speech and streams
    /// the resulting audio into the call.
    /// </summary>
    class Program
    {
        // Placeholders: replace with the key and region of your Azure Speech resource.
        static string _speechKey = "SPEECH_KEY";
        static string _speechRegion = "SPEECH_REGION";

        static ISoftPhone _softphone;
        static IPhoneLine _phoneLine;
        static IPhoneCall _call;

        static MediaConnector _connector;
        static PhoneCallAudioSender _mediaSender;

        /// <summary>
        /// Creates the softphone and the media objects, starts SIP registration and
        /// then blocks on Console.ReadLine so the event-driven flow can run.
        /// </summary>
        static void Main(string[] args)
        {
            _softphone = SoftPhoneFactory.CreateSoftPhone(5000, 10000);

            var registrationRequired = true;
            var userName = "112";
            var displayName = "112";
            var authenticationId = "112";
            var registerPassword = "112";
            var domainHost = "192.168.115.60";
            var domainPort = 5060;
            var account = new SIPAccount(registrationRequired, displayName, userName, authenticationId, registerPassword, domainHost, domainPort);

            _mediaSender = new PhoneCallAudioSender();
            _connector = new MediaConnector();

            RegisterAccount(account);

            Console.ReadLine();
        }

        /// <summary>
        /// Creates a phone line for <paramref name="account"/>, subscribes to its
        /// registration state changes and registers it with the softphone.
        /// Any exception is logged to the console instead of being rethrown.
        /// </summary>
        static void RegisterAccount(SIPAccount account)
        {
            try
            {
                _phoneLine = _softphone.CreatePhoneLine(account);
                _phoneLine.RegistrationStateChanged += PhoneLine_RegistrationStateChanged;
                _softphone.RegisterPhoneLine(_phoneLine);
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error during SIP registration: " + ex);
            }
        }

        /// <summary>
        /// Reports the registration outcome and places the outgoing call once the
        /// line reports a successful registration.
        /// </summary>
        static void PhoneLine_RegistrationStateChanged(object sender, RegistrationStateChangedArgs e)
        {
            if (e.State == RegState.NotRegistered || e.State == RegState.Error)
            {
                Console.WriteLine("Registration failed!");
                return;
            }

            if (e.State == RegState.RegistrationSucceeded)
            {
                Console.WriteLine("Registration succeeded - Online!");
                CreateCall();
                return;
            }
        }

        /// <summary>
        /// Creates the outgoing call to the hard-coded number, subscribes to its
        /// state changes and starts it.
        /// </summary>
        static void CreateCall()
        {
            var numberToDial = "110";
            _call = _softphone.CreateCallObject(_phoneLine, numberToDial);
            _call.CallStateChanged += Call_CallStateChanged;
            _call.Start();
        }

        /// <summary>
        /// Logs every call state and, when the call is answered, starts the
        /// text-to-speech work on a background task.
        /// </summary>
        static void Call_CallStateChanged(object sender, CallStateChangedArgs e)
        {
            Console.WriteLine("Call state: {0}.", e.State);

            if (e.State == CallState.Answered)
            {
                // A lambda (rather than a method group) selects the Func<Task> overload
                // unambiguously. SetupTextToSpeech handles its own exceptions, so the
                // returned task never faults unobserved.
                Task.Run(() => SetupTextToSpeech());
            }
        }

        /// <summary>
        /// Synthesizes the test sentence as raw 16 kHz / 16-bit / mono PCM and plays
        /// it into the active call.
        /// </summary>
        /// <returns>
        /// A task that completes once playback has been started, or once a failure
        /// has been logged. Exceptions are caught and written to the console.
        /// </returns>
        static async Task SetupTextToSpeech()
        {
            try
            {
                var speechConfig = SpeechConfig.FromSubscription(_speechKey, _speechRegion);
                speechConfig.SpeechSynthesisVoiceName = "en-US-JennyNeural";
                speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm);

                var myBuffer = new MyBuffer();
                byte[] audioData;

                using (var pushStream = AudioOutputStream.CreatePushStream(myBuffer))
                using (var audioConfig = AudioConfig.FromStreamOutput(pushStream))
                using (var speechSynthesizer = new SpeechSynthesizer(speechConfig, audioConfig))
                using (var speechSynthesisResult = await speechSynthesizer.SpeakTextAsync("Test message from microsoft azure text to speech api through voip call."))
                {
                    if (speechSynthesisResult.Reason != ResultReason.SynthesizingAudioCompleted)
                    {
                        Console.WriteLine("Speech synthesis failed: " + speechSynthesisResult.Reason);

                        if (speechSynthesisResult.Reason == ResultReason.Canceled)
                        {
                            var cancellation = SpeechSynthesisCancellationDetails.FromResult(speechSynthesisResult);
                            Console.WriteLine("Cancellation reason: {0}, error code: {1}, details: {2}",
                                cancellation.Reason, cancellation.ErrorCode, cancellation.ErrorDetails);
                        }

                        return;
                    }

                    // Snapshot the audio instead of reusing InnerStream: the SDK may invoke
                    // MyBuffer.Close() (which closes InnerStream) when these objects are
                    // disposed, and MemoryStream.ToArray() still works on a closed stream.
                    audioData = myBuffer.InnerStream.ToArray();
                }

                if (audioData.Length == 0)
                {
                    Console.WriteLine("Speech synthesis produced no audio data.");
                    return;
                }

                // A MemoryStream created over a byte array starts at position 0, which is
                // what RawStreamPlayback needs. The format matches Raw16Khz16BitMonoPcm.
                var playbackStream = new MemoryStream(audioData);
                var playback = new RawStreamPlayback(playbackStream, new WaveFormat(16000, 16, 1));

                _mediaSender.AttachToCall(_call);
                _connector.Connect(playback, _mediaSender);
                playback.Start();
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error during text to speech playback: " + ex);
            }
        }

        /// <summary>
        /// Push-stream callback that collects the synthesized audio bytes in memory.
        /// </summary>
        class MyBuffer : PushAudioOutputStreamCallback
        {
            public readonly MemoryStream InnerStream = new MemoryStream();

            /// <summary>
            /// Appends the received chunk to <see cref="InnerStream"/>.
            /// </summary>
            /// <returns>The number of bytes consumed, which is always the whole chunk.</returns>
            public override uint Write(byte[] dataBuffer)
            {
                InnerStream.Write(dataBuffer, 0, dataBuffer.Length);
                return (uint)dataBuffer.Length;
            }

            /// <summary>
            /// Closes <see cref="InnerStream"/>.
            /// </summary>
            public override void Close()
            {
                InnerStream.Close();
            }
        }
    }
}

More information