2006-06-29

Complete changes to CAIVIAR for use with Scansoft RealSpeak

Changes in RealSpeak.h and RealSpeak.cpp allow an extra keyword, "realspeak.engine", in ivr.properties to point to the directory of the Scansoft RealSpeak engine.

RealSpeak.h (added member "engine"):

/* RealSpeak.h

Header file for RealSpeak.cpp

Part of the caiviar package.

Copyright (c) 2002 MobileX AG, http://www.mobilexag.de
Copyright (c) 2002 Peter Dikant <peter.dikant@mobilexag.de>
Copyright (c) 2002 Matthias Kramm <kramm@quiss.org>

This file is distributed under the GPL, see file COPYING for details. */

#ifdef WIN32
#include <windows.h>
#endif
#include "/gttsso_types.h"
#include "/glh_ttsso.h"
#include "/g../src/Log.h"
#include "/g../src/os.h"

#ifndef __realspeak_h__
#define __realspeak_h__

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

/**
 * Text-to-speech driver for the RealSpeak engine, derived from TextToSpeech.
 *
 * Configuration is class-wide: setParams() stores "realspeak.*" settings in the
 * static members at the bottom, and each instance reads them in its constructor.
 */
class RealSpeak: public TextToSpeech
{
public:
// Class-level hooks: initialize() logs that RealSpeak is in use and returns 1;
// finalize() releases the configured dictionary path.
static int initialize(Logger*log);
static void finalize(Logger*log);

// Stores one "realspeak.<name>" setting in the matching static member.
// Returns 1 when the key was recognised, 0 otherwise.
static int setParams (const char *key, const char* value);

// Synthesises text into a malloc'ed copy of the WAV image (*outbuffer, *size).
virtual void text2Stream (const char *text, unsigned char **outbuffer, unsigned long *size);
// Synthesises text and writes the WAV image to filename.
virtual void text2File (const char *text, const char *filename);
// Only logs that direct audio output is not implemented.
virtual void text2Audio (const char *text);
RealSpeak(Logger *log);
virtual ~RealSpeak();

private:
// Engine callbacks registered through ttsParm.cbFuncs; pAppData is the
// RealSpeak instance that was handed to TtsInitialize().
static TTSRETVAL sourceCallback(void *pAppData, void *databuffer, U32 buffersize, U32 *datasize);
static VOID *destCallback(void *pAppData, U16 datatype, VOID *data, U32 datasize, U32 *buffersize);
static TTSRETVAL CbTtsEventNotify(void *pAppData, void *buffer, U16 datasize, U16 event);
// Loads and enables a user dictionary for this instance.
void loadDictionary(const char* dictionary);
HTTSINSTANCE hInst; // engine instance handle filled in by TtsInitialize()
TTSPARM ttsParm; // engine parameters assembled in the constructor
HTTSDICT userdict; // user dictionary handle, set by loadDictionary()
char* text; // read cursor into the strdup'ed input text; 0 once it has been consumed
char* textstart; // start of that strdup'ed copy (the pointer that gets free()d)
unsigned char *output; // 44-byte WAV header followed by the generated PCM data
long outputPos; // number of PCM bytes accounted for so far (header excluded)
long outputSize; // capacity of output in bytes, not counting the 44 header bytes
Mutex mutex; // held for the duration of text2File() / text2Stream()
// NOTE(review): not referenced in RealSpeak.cpp; looks like a leftover
// duplicate of outputSize (different capitalisation) - confirm and remove.
long int outputsize;

// Class-wide settings written by setParams() and read by the constructor.
static char* dictionary; //set by setParam, used by constructor
static char* remote_server;
static char* remote_service;
static int remote_port;
static char* language;
static char* voice;
static char* engine;
};

#endif

RealSpeak.cpp (introduced constants for the languages currently supported by RealSpeak version 3.5, and macro-fied some repetitive code to make future changes easier):

/*
* Filename: RealSpeak.cpp
* Project Caiviar "ISDN-CAPI made easy"
* Package: RealSpeak(TM) driver.

Part of the caiviar package.

Copyright (c) 2002,2003 Matthias Kramm <kramm@quiss.org>
Copyright (c) 2002 Peter Dikant <peter.dikant@mobilexag.de>
Copyright (c) 2006 Dieter Demerre <ddemerre AT googles e-mail gmail>

This file is distributed under the GPL, see file COPYING for details.
*/

/*===========================================================================**
** INCLUDE FILES **
**===========================================================================*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "RealSpeak.h"

/*===========================================================================**
** LOCAL MACROS **
**===========================================================================*/
/* Size in bytes (1 MByte) of the initial output buffer and of every growth step. */
#define TTS_OUTPUT_BUFFER (1024 * 1024)

/* Settings used when ivr.properties does not override them. */
#define TTS_DEFAULT_LANGUAGE "German"
#define TTS_DEFAULT_VOICE "female1"

/* Default location of the engine files, relative to the working directory. */
#ifndef LINUX
# define TTS_ENGINEPATH ".\\Engine"
#else /* LINUX */
# define TTS_ENGINEPATH "./engine"
#endif /* !LINUX */

/* Macros to build and search the TTS constant conversion tables. */

/* Expands to the three initialisers of one T_TTS_CaiviarRecord: the name
 * accepted in the configuration, the SDK constant as text, and its value. */
#define TTSCAIVIARRECORD(s,c) (char*)(s),(char*)(#c),(U16)(c)

/* Number of elements of a true array (must not be used on a pointer). */
#define NROFELEMENTS(array) (sizeof(array)/sizeof((array)[0]))
#define NROFLANGS NROFELEMENTS(langConv)
#define NROFVOICES NROFELEMENTS(voiceConv)

/**
 * Linear search: leaves in var the index of the first record of array whose
 * IKnow member equals ttsstr, or max when there is no such record.
 *
 * Every argument is parenthesised and the loop is wrapped in do/while(0), so
 * the macro behaves as one statement (safe inside if/else) and must be
 * followed by a semicolon.
 */
#define SEARCHTTSDATA(ttsstr,var,array,max) \
    do { \
        for ((var) = 0; \
             ((var) < (max)) && (strcmp((ttsstr), (array)[(var)].IKnow) != 0); \
             (var)++) { \
        } \
    } while (0)
#define SEARCHLANGUAGE(ttsstr,var) SEARCHTTSDATA(ttsstr,var,langConv,NROFLANGS)
#define SEARCHVOICE(ttsstr,var) SEARCHTTSDATA(ttsstr,var,voiceConv,NROFVOICES)

/**
 * Compare var with str; when they are equal, store val in the static
 * RealSpeak member mmbr and return 1 from the *calling* function.
 * val is only evaluated when the strings match.
 * Use as a statement, followed by a semicolon.
 */
#define CRS_CHECKVAR(var,str,mmbr,val) \
    do { \
        if (strcmp((var), (str)) == 0) { \
            RealSpeak::mmbr = (val); \
            return 1; \
        } \
    } while (0)

/**
 * Log, as one error line per entry, the "IKnow" member of the first max
 * elements of arr. ct is used as the loop counter and equals max afterwards.
 * Relies on a variable named log (pointer to the logger) being in scope at
 * the point of use. Use as a statement, followed by a semicolon.
 */
#define LOGKNOWNLIST(ct,arr,max,name) \
    do { \
        (ct) = 0; \
        while ((ct) < (max)) { \
            log->logf("<error> allowed %s %d.: \"%s\".",(name),(ct)+1,(arr)[ct].IKnow); \
            (ct)++; \
        } \
    } while (0)


/*===========================================================================**
** LOCAL TYPES **
**===========================================================================*/
/**
 * One row of a conversion table (langConv / voiceConv): maps a name accepted
 * in the configuration to the matching SDK constant.
 */
struct T_TTS_CaiviarRecord {
    char* IKnow;   /* name the configuration may use; key of the table search */
    char* str;     /* the SDK constant spelled out as text, used in log output */
    U16 tts_var;   /* value of the SDK constant, copied into ttsParm */
};


// Template of the 44-byte WAV header that precedes the PCM data produced by
// the tts system, so that the output buffer forms a complete wav memory
// structure. The two length fields are zero here and are filled in once the
// amount of generated data is known.
static unsigned char wavHeader[] = {
    'R', 'I', 'F', 'F',
    0x00, 0x00, 0x00, 0x00,   // offset 4: length of file - 8
    'W', 'A', 'V', 'E',
    'f', 'm', 't', ' ',
    0x10, 0x00, 0x00, 0x00,   // length of the format header: 16
    0x01, 0x00,               // encoding: 01 = pcm
    0x01, 0x00,               // number of channels: 1
    0x40, 0x1f, 0x00, 0x00,   // sample rate: 8000
    0x80, 0x3e, 0x00, 0x00,   // sample rate * bytes per sample: 16000
    0x02, 0x00,               // bytes per sample: 2
    0x10, 0x00,               // bits per sample: 16
    'd', 'a', 't', 'a',
    0x00, 0x00, 0x00, 0x00    // offset 40: length of the data
};

/**
 * Conversion table from the language names accepted for "realspeak.language"
 * to the SDK language constants. The table is searched front to back with a
 * case-sensitive comparison, and it is listed in this order in the log when
 * an unknown language is configured.
 */
const T_TTS_CaiviarRecord langConv[] = {
    /* records for compatibility with possible earlier caiviar
       ivr.properties configurations */
    TTSCAIVIARRECORD("english",TTS_LANG_US_ENGLISH),
    TTSCAIVIARRECORD("french",TTS_LANG_FRENCH),
    TTSCAIVIARRECORD("german",TTS_LANG_GERMAN),
    TTSCAIVIARRECORD("dutch",TTS_LANG_BELGIAN_DUTCH),
    /* records with new, more readable names for language variables
       containing ALL constants known to Scansoft RealSpeak 3.5 */
    TTSCAIVIARRECORD("American English",TTS_LANG_US_ENGLISH),
    TTSCAIVIARRECORD("Spanish",TTS_LANG_SPANISH),
    TTSCAIVIARRECORD("French",TTS_LANG_FRENCH),
    TTSCAIVIARRECORD("Dutch Dutch",TTS_LANG_NETHERLANDS_DUTCH),
    TTSCAIVIARRECORD("Dutch",TTS_LANG_DUTCH),
    TTSCAIVIARRECORD("British English",TTS_LANG_BRITISH_ENGLISH),
    TTSCAIVIARRECORD("German",TTS_LANG_GERMAN),
    TTSCAIVIARRECORD("Italian",TTS_LANG_ITALIAN),
    TTSCAIVIARRECORD("Japanese",TTS_LANG_JAPANESE),
    TTSCAIVIARRECORD("Korean",TTS_LANG_KOREAN),
    TTSCAIVIARRECORD("Egyptian Arabic",TTS_LANG_EGYPTIAN_ARABIC),
    TTSCAIVIARRECORD("Mandarin B5",TTS_LANG_MANDARIN_B5),
    TTSCAIVIARRECORD("Brazilian Portuguese",TTS_LANG_BRAZILIAN_PORTUGUESE),
    TTSCAIVIARRECORD("Russian",TTS_LANG_RUSSIAN),
    TTSCAIVIARRECORD("Mexican Spanish",TTS_LANG_MEXICAN_SPANISH),
    TTSCAIVIARRECORD("Belgian Dutch",TTS_LANG_BELGIAN_DUTCH),
    TTSCAIVIARRECORD("Swedish",TTS_LANG_SWEDISH),
    TTSCAIVIARRECORD("Norwegian",TTS_LANG_NORWEGIAN),
    TTSCAIVIARRECORD("Mandarin GB",TTS_LANG_MANDARIN_GB),
    TTSCAIVIARRECORD("Australian English",TTS_LANG_AUSTRALIAN_ENGLISH),
    TTSCAIVIARRECORD("Canadian French",TTS_LANG_CANADIAN_FRENCH),
    TTSCAIVIARRECORD("Cantonese B5",TTS_LANG_CANTONESE_B5),
    TTSCAIVIARRECORD("Cantonese GB",TTS_LANG_CANTONESE_GB),
    TTSCAIVIARRECORD("Danish",TTS_LANG_DANISH),
    TTSCAIVIARRECORD("Portugal Portuguese",TTS_LANG_PORTUGAL_PORTUGUESE),
    TTSCAIVIARRECORD("Poland Polish",TTS_LANG_POLAND_POLISH),
    TTSCAIVIARRECORD("Armenia Armenian",TTS_LANG_ARMENIA_ARMENIAN),
    TTSCAIVIARRECORD("Ukrainian",TTS_LANG_UKRAINIAN),
    TTSCAIVIARRECORD("Greek",TTS_LANG_GREEK),
    TTSCAIVIARRECORD("Vietnamese",TTS_LANG_VIETNAMESE),
    TTSCAIVIARRECORD("malay",TTS_LANG_MALAY),
    TTSCAIVIARRECORD("Pakistan Urdu",TTS_LANG_PAKISTAN_URDU),
    TTSCAIVIARRECORD("Indonesia Bahasa",TTS_LANG_INDONESIA_BAHASA),
    TTSCAIVIARRECORD("Iran Farsi",TTS_LANG_IRAN_FARSI),
    TTSCAIVIARRECORD("Belarusian",TTS_LANG_BELARUSIAN),
    TTSCAIVIARRECORD("Czech",TTS_LANG_CZECH),
    TTSCAIVIARRECORD("Hungarian",TTS_LANG_HUNGARIAN),
    TTSCAIVIARRECORD("India Tamil",TTS_LANG_INDIA_TAMIL),
    TTSCAIVIARRECORD("Thailand Thai",TTS_LANG_THAILAND_THAI),
    TTSCAIVIARRECORD("Turkish",TTS_LANG_TURKISH),
    TTSCAIVIARRECORD("Taiwanese",TTS_LANG_TAIWANESE),
    TTSCAIVIARRECORD("India Hindi",TTS_LANG_INDIA_HINDI),
    TTSCAIVIARRECORD("Taiwan Mandarin B5",TTS_LANG_TAIWAN_MANDARIN_B5),
    TTSCAIVIARRECORD("Taiwan Mandarain GB",TTS_LANG_TAIWAN_MANDARIN_GB),
    /* Aliases with the spelling used by the rest of the list. The keys
       "malay" and "Taiwan Mandarain GB" above stay in place so that existing
       ivr.properties files which use them keep working. */
    TTSCAIVIARRECORD("Malay",TTS_LANG_MALAY),
    TTSCAIVIARRECORD("Taiwan Mandarin GB",TTS_LANG_TAIWAN_MANDARIN_GB)
};

/* Conversion table from the voice names accepted for "realspeak.voice" to the
   SDK voice constants. It is searched front to back, and listed in this order
   in the log when an unknown voice is configured. "female4" and "male4" map to
   the TTS_3000_* constants, all other names to TTS_RS_* constants. */
const T_TTS_CaiviarRecord voiceConv[] = {
TTSCAIVIARRECORD("female1",TTS_RS_VOICE_FEMALE),
TTSCAIVIARRECORD("female2",TTS_RS_VOICE_FEMALE2),
TTSCAIVIARRECORD("female3",TTS_RS_VOICE_FEMALE3),
TTSCAIVIARRECORD("female4",TTS_3000_VOICE_FEMALE),
TTSCAIVIARRECORD("male1",TTS_RS_VOICE_MALE),
TTSCAIVIARRECORD("male2",TTS_RS_VOICE_MALE2),
TTSCAIVIARRECORD("male3",TTS_RS_VOICE_MALE3),
TTSCAIVIARRECORD("male4",TTS_3000_VOICE_MALE)
};

/*===========================================================================**
** STATIC MEMBER INITIALIZATION **
**===========================================================================*/
/* Class-wide settings; setParams() overwrites them with strdup'ed copies of
   the configured values. */
char* RealSpeak::dictionary = 0;
char* RealSpeak::remote_server = 0;
char* RealSpeak::remote_service = 0;
int RealSpeak::remote_port = 0;
/* The three defaults below point at string literals. The conversion from a
   literal to char* is spelled out because the implicit one is no longer valid
   from C++11 on. These defaults must never be written through or free()d:
   only values installed later by setParams() live on the heap. */
char* RealSpeak::language = const_cast<char*>(TTS_DEFAULT_LANGUAGE);
char* RealSpeak::voice = const_cast<char*>(TTS_DEFAULT_VOICE);
char* RealSpeak::engine = const_cast<char*>(TTS_ENGINEPATH);

/*===========================================================================**
** MEMBER FUNCTION IMPLEMENTATION **
**===========================================================================*/
/**
 * Class-level start-up hook: announces in the log which text to speech
 * engine is in use.
 * @param log a pointer to the logging object
 * @return always 1
 */
int RealSpeak::initialize(Logger* log)
{
    const int done = 1;

    log->logf("<verbose> Using Realspeak as Text to Speech engine\n");

    return done;
}

/**
 * Class-level shutdown hook: releases the heap copies of the settings that
 * were installed by setParams().
 *
 * Only dictionary, remote_server and remote_service are released: they start
 * out as 0 and are only ever assigned strdup'ed values. language, voice and
 * engine may still point at their string-literal defaults, so they cannot be
 * free()d safely and are left alone.
 *
 * Each pointer is reset to 0 after it is freed, so that a RealSpeak object
 * created afterwards does not read released memory.
 *
 * @param log a pointer to the logging object (unused)
 */
void RealSpeak::finalize(Logger*log)
{
    (void)log;

    free(RealSpeak::dictionary);        /* free(0) is a no-op */
    RealSpeak::dictionary = 0;

    free(RealSpeak::remote_server);
    RealSpeak::remote_server = 0;

    free(RealSpeak::remote_service);
    RealSpeak::remote_service = 0;
}

/**
 * The constructor stores a pointer to the logging subsystem (through the
 * TextToSpeech base class), translates the configured language and voice into
 * SDK constants, initializes the engine - either a remote server or a local
 * engine directory - loads the optional user dictionary and allocates the
 * output buffer.
 *
 * Failures are reported in the log only; the object is constructed either way.
 *
 * @param log a pointer to the logging object
 */
RealSpeak::RealSpeak(Logger *log)
:TextToSpeech(log)
{
    int lct;

    /* Start from a defined state: no text queued, nothing generated yet and
       no dictionary loaded. */
    text = 0;
    textstart = 0;
    outputPos = 0;
    userdict = 0;

    log->logf("<debug> Initializing RealSpeak Engine");
    log->logf("<debug> starting with data buffer of %d bytes", TTS_OUTPUT_BUFFER);

    /* --- language: configured value, else the default --- */
    SEARCHLANGUAGE(language,lct);
    if (lct >= NROFLANGS)
    {
        log->logf("<error> Language not known: \"%s\"", language);
        log->logf("<error> I know following languages: ");
        LOGKNOWNLIST(lct,langConv,NROFLANGS,"language");
        log->logf("<error> Switching to default language \"%s\".",TTS_DEFAULT_LANGUAGE);
        SEARCHLANGUAGE(TTS_DEFAULT_LANGUAGE,lct);
    }
    if (lct < NROFLANGS)
    {
        ttsParm.nLanguage = langConv[lct].tts_var;
        log->logf("<notice> Setting language to %s.",langConv[lct].str);
    } else {
        /* report the name that was actually searched for */
        log->logf("<error> COULD NOT FIND DEFAULT LANGUAGE (%s)",TTS_DEFAULT_LANGUAGE);
    }

    /* --- voice: configured value, else the default --- */
    SEARCHVOICE(voice,lct);
    if (lct >= NROFVOICES)
    {
        log->logf("<error> Voice not known: \"%s\"",voice);
        log->logf("<error> I know following voices: ");
        LOGKNOWNLIST(lct,voiceConv,NROFVOICES,"voice");
        log->logf("<error> Switching to default voice \"%s\".",TTS_DEFAULT_VOICE);
        /* look up the default; searching for the unknown voice again can
           never succeed */
        SEARCHVOICE(TTS_DEFAULT_VOICE,lct);
    }
    if (lct < NROFVOICES)
    {
        ttsParm.nVoice = voiceConv[lct].tts_var;
        log->logf("<notice> Setting voice to %s",voiceConv[lct].str);
    } else {
        log->logf("<error> COULD NOT FIND DEFAULT VOICE (%s) !!!",TTS_DEFAULT_VOICE);
    }

    if(ttsParm.nLanguage != TTS_LANG_US_ENGLISH)
        /* only us-english supports different voices */
        ttsParm.nVoice = TTS_RS_VOICE_FEMALE;

    ttsParm.nOutputType = TTS_LINEAR_16BIT;
    ttsParm.nFrequency = TTS_FREQ_8KHZ;
    ttsParm.nInputDataType = TTS_DATA_TYPE_TEXT;
    ttsParm.nOutputDataType = TTS_DATA_TYPE_PCM;
    ttsParm.cbFuncs.TtsSourceCb = sourceCallback;
    ttsParm.cbFuncs.TtsDestCb = destCallback;
    ttsParm.cbFuncs.TtsEventCb = CbTtsEventNotify;
    ttsParm.cbFuncs.numCallbacks = 3;

    int ret;

    if(RealSpeak::remote_server && RealSpeak::remote_service && RealSpeak::remote_port) {
        /* remote engine: no local engine directory is used, so no copy of
           the engine path is made */
        ttsParm.szLibLocation = NULL;

        /* NOTE(review): server is never deleted. Whether the SDK keeps using
           it after TtsInitialize() is not visible here - confirm before
           releasing it. */
        LH_SDK_SERVER* server = new LH_SDK_SERVER;
        server->server_handle = 0;
        /* NOTE(review): unbounded copies of configured values; the size of
           IP_Address / service is declared in the SDK headers - confirm
           and bound these. */
        strcpy(server->server.IP_Address, RealSpeak::remote_server);
        strcpy(server->server.service, RealSpeak::remote_service);
        server->server.port_number = RealSpeak::remote_port;
        log->logf("<notice> Connecting to remote TTS-server <%s> %s:%d", RealSpeak::remote_service,
                  RealSpeak::remote_server, RealSpeak::remote_port);
        ret = TtsCreateEngine(server);
        if (ret != 0) {
            log->logf("<error> Error connecting to server: %d", ret);
        } else {
            /* only claim a connection when the call succeeded */
            log->logf("<notice> Connected. Handle=%d Server=%s Service=%s Port=%d", server->server_handle,
                      server->server.IP_Address, server->server.service, server->server.port_number);
        }
        ret = TtsInitialize(&hInst, server, &ttsParm, (void *)this);
    } else {
        /* local engine: hand the engine a private copy of the directory name */
        ttsParm.szLibLocation = strdup(engine);
        log->logf("<debug> initializing engine directory %s.", ttsParm.szLibLocation);
        ret = TtsInitialize(&hInst, NULL, &ttsParm, (void *)this);
    }
    if (ret != 0) {
        log->logf("<error> Error initializing RealSpeak Engine: %d", ret);
    } else {
        log->logf("<debug> RealSpeak Engine initialized.");
    }

    /* enabling a dictionary needs the instance handle, which is only set
       when the initialization succeeded */
    if(ret == 0 && RealSpeak::dictionary)
        loadDictionary(RealSpeak::dictionary);

    /* room for the 44-byte wav header plus the initial data buffer */
    output = (unsigned char*)malloc(TTS_OUTPUT_BUFFER + 44);
    outputSize = TTS_OUTPUT_BUFFER;
    memcpy(output, wavHeader, 44);
    initMutex(&mutex);
}

/**
 * Event callback registered with the engine. All events are ignored.
 * @param pAppData the RealSpeak instance that was passed to TtsInitialize
 * @param ppBuffer unused
 * @param nDataSize unused
 * @param event unused
 * @return always 0
 */
TTSRETVAL RealSpeak::CbTtsEventNotify (void *pAppData, void *ppBuffer, U16 nDataSize, U16 event) {
    /* Nothing is done with the arguments. To trace events, cast pAppData
       back to RealSpeak* and log event through its logger. */
    (void)pAppData;
    (void)ppBuffer;
    (void)nDataSize;
    (void)event;
    return 0;
}

/**
 * Source callback registered with the engine: hands over the next piece of
 * the queued input text.
 *
 * @param pAppData the RealSpeak instance that was passed to TtsInitialize
 * @param data     buffer that receives the text (no terminating 0 is written)
 * @param len      capacity of data in bytes
 * @param datasize receives the number of bytes stored in data
 * @return TTS_SUCCESS while text was delivered; TTS_ENDOFDATA once nothing is
 *         left, at which point the text copy is released
 */
TTSRETVAL RealSpeak::sourceCallback(void *pAppData, void *data, U32 len, U32 *datasize) {
    RealSpeak *self = (RealSpeak*)pAppData;
    char *cursor = self->text;

    if (cursor && *cursor) {
        /* deliver as much of the remaining text as fits into the buffer */
        size_t chunk = strlen(cursor);
        if (len < chunk)
            chunk = len;
        memcpy(data, cursor, chunk);
        *datasize = chunk;
        self->text = cursor + chunk;
        return TTS_SUCCESS;
    }

    /* nothing queued, or everything has been delivered */
    *datasize = 0;
    if (cursor) {
        free(self->textstart);
        self->text = 0;
    }
    return TTS_ENDOFDATA;
}

void* RealSpeak::destCallback(void *pAppData, U16 nDatatype, void *data, U32 datasize, U32 *buffersize) {
RealSpeak *me = (RealSpeak*)pAppData;
void*ret = 0;
//me->log->logf("<debug> TTS: DestCallback: datatype:%d data:%08x size:%d", nDatatype, data, datasize);

while((int)(datasize+TTS_OUTPUT_BUFFER/2) > me->outputSize - me->outputPos) {
me->outputSize += TTS_OUTPUT_BUFFER;
void*newdata = realloc(me->output, me->outputSize + 44);
if(!newdata) {
// failed, cut off
me->log->logf("<error> TTS: realloc(%d) failed", me->outputSize);
datasize = me->outputSize - me->outputPos;
me->outputSize -= TTS_OUTPUT_BUFFER; //revert
break;
} else {
me->log->logf("<notice> TTS: expanded output buffer to %d", me->outputSize);
me->output = (unsigned char*)newdata;
}
}
*buffersize = me->outputSize - me->outputPos;

ret = (void *)&me->output[me->outputPos + 44];
me->outputPos += (long)datasize;
//me->log->logf("<debug> TTS: DestCallback: returning memory for %d bytes", *buffersize);
return ret;
}

/**
 * Free used resources: shuts the engine instance down, then releases the
 * engine path copy, the output buffer and the mutex.
 */
RealSpeak::~RealSpeak()
{
    log->logf("<debug> Deleting RealSpeak Engine");
    int ret = TtsUninitialize(hInst);
    if (ret != 0) {
        log->logf("<error> Error closing RealSpeak Engine: %d", ret);
    }

    /* The constructor stores either NULL or a strdup'ed copy of the engine
       path here. It is released only now, after the engine was shut down. */
    free((void*)ttsParm.szLibLocation);
    ttsParm.szLibLocation = NULL;

    free(output);
    destroyMutex(&mutex);
}

/**
 * Load a user dictionary and enable it for this engine instance.
 * Every step is reported in the log; a failing step ends the attempt.
 * @param dictionary name of the dictionary to load; nothing happens when it is 0
 */
void RealSpeak::loadDictionary(const char* dictionary)
{
    if (!dictionary)
        return;

    userdict = 0;
    log->logf("<notice> Loading Realspeak dictionary \"%s\"...", dictionary);

    int status = TtsLoadUsrDict(0, &userdict, (char*)dictionary);
    if (status != 0) {
        log->logf("<error> Couldn't load dictionary \"%s\", error %d/%08x", dictionary, status, userdict);
        return;
    }

    status = TtsEnableUsrDict(hInst, userdict);
    if (status != 0) {
        log->logf("<error> Couldn't enable dictionary \"%s\", error %d", dictionary, status);
        return;
    }

    log->logf("<notice> Dictionary \"%s\" loaded", dictionary);
}

/**
 * Set implementation specific parameters.
 *
 * Keys that start with "realspeak." are handled here: dictionary, server,
 * service, port, language, voice and engine. String values are stored as
 * strdup'ed copies, port is converted with atoi.
 *
 * @param key   full name of the setting; 0 is treated as "not for us"
 * @param value value of the setting; 0 is treated as "not for us"
 * @return 1 when the setting was recognised and stored, 0 otherwise
 */
int RealSpeak::setParams(const char *key, const char *value)
{
    /* strncmp, strdup and atoi must not be handed a null pointer */
    if(!key || !value)
        return 0;

    /* test which parameters are for us */
    if(strncmp(key, "realspeak.", 10) != 0)
        return 0;

    const char*key2 = &key[10];

    /* Each line returns 1 from this function when key2 matches.
       NOTE(review): a value that was stored before is overwritten without
       being released. language, voice and engine may still point at their
       string-literal defaults, so they cannot simply be free()d here. */
    CRS_CHECKVAR(key2,"dictionary",dictionary,strdup(value));
    CRS_CHECKVAR(key2,"server",remote_server,strdup(value));
    CRS_CHECKVAR(key2,"service",remote_service,strdup(value));
    CRS_CHECKVAR(key2,"port",remote_port,atoi(value));
    CRS_CHECKVAR(key2,"language",language,strdup(value));
    CRS_CHECKVAR(key2,"voice",voice,strdup(value));
    CRS_CHECKVAR(key2,"engine",engine,strdup(value));

    return 0;
}

/**
 * Direct audio output is not available in this driver: the call produces no
 * sound and only leaves a note in the log.
 * @param text the text that would be spoken (ignored)
 */
void RealSpeak::text2Audio(const char *text)
{
    (void)text;

    log->logf("<verbose> text2Audio is not implemented in RealSpeak");
}

/**
 * Create speech directly to a wave file.
 *
 * The instance mutex is held for the whole conversion and is released on
 * every path out of the function. Failures are reported in the log only.
 *
 * @param text the text to be spoken (0 is treated as an empty text)
 * @param filename name and path of the file to be created
 */
void RealSpeak::text2File(const char *text, const char *filename)
{
    lockMutex(&mutex);
    this->text = this->textstart = strdup(text ? text : "");

    // start conversion; the output of an earlier conversion must not be
    // carried over into this file
    outputPos = 0;
    int ret = TtsProcess(hInst);

    // The source callback releases the text copy once it was read completely.
    // this->text is still set when that did not happen (e.g. on an error).
    if (this->text) {
        free(this->textstart);
        this->text = this->textstart = 0;
    }

    if (ret != 0) {
        log->logf("<error> TTS: could not process input, got error: %d", ret);
        unlockMutex(&mutex);
        return;
    }

    log->logf("<debug> TTS: speech complete, generated %d bytes of data.", (int)outputPos);

    // fill in the two length fields of the wav header
    unsigned int wavelen = outputPos + 36;
    memcpy(output + 4, &wavelen, 4);
    wavelen = outputPos;
    memcpy(output + 40, &wavelen, 4);

    FILE *fp = filename ? fopen(filename, "wb") : 0;
    if (!fp) {
        log->logf("<error> TTS: could not open file \"%s\" for writing", filename ? filename : "(null)");
    } else {
        const size_t total = (size_t)outputPos + 44;
        if (fwrite(output, sizeof(unsigned char), total, fp) != total) {
            log->logf("<error> TTS: could not write all data to file \"%s\"", filename);
        }
        fclose(fp);
    }
    unlockMutex(&mutex);
}


/**
 * This method will create speech that is stored in a memory buffer. The
 * created speech has the format pcm mono with 8kHz sampling rate and 16 bit,
 * preceded by a 44-byte wav header.
 *
 * The instance mutex is held for the whole conversion and is released on
 * every path out of the function.
 *
 * @param text this text will be spoken (0 is treated as an empty text)
 * @param outbuffer receives a pointer to the output buffer; the memory is
 *        allocated here with malloc. Set to 0 when the conversion fails.
 * @param size receives the size of the allocated memory, 0 on failure
 */
void RealSpeak::text2Stream(const char *text, unsigned char **outbuffer, unsigned long *size)
{
    lockMutex(&mutex);
    this->text = this->textstart = strdup(text ? text : "");

    // start conversion
    outputPos = 0;
    int ret = TtsProcess(hInst);

    // The source callback releases the text copy once it was read completely.
    // this->text is still set when that did not happen (e.g. on an error).
    if (this->text) {
        free(this->textstart);
        this->text = this->textstart = 0;
    }

    if (ret != 0) {
        log->logf("<error> TTS: could not process input, got error: %d", ret);
        *outbuffer = 0;
        *size = 0;
        unlockMutex(&mutex);
        return;
    }

    log->logf("<debug> TTS: speech complete, generated %d bytes of data.", (int)outputPos);

    const size_t total = (size_t)outputPos + 44;
    unsigned char *copy = (unsigned char*)malloc(total);
    if (!copy) {
        log->logf("<error> TTS: could not allocate %d bytes for the output", (int)total);
        *outbuffer = 0;
        *size = 0;
        unlockMutex(&mutex);
        return;
    }

    // fill in the two length fields of the wav header, then hand out a copy
    unsigned int wavelen = outputPos + 36;
    memcpy(&output[4], &wavelen, 4);
    wavelen = outputPos;
    memcpy(&output[40], &wavelen, 4);
    memcpy(copy, output, total);
    *outbuffer = copy;
    *size = total;

    unlockMutex(&mutex);
}

Labels: , , , ,

0 Comments:

Post a Comment

<< Home