
Table of Contents

Title Page

Approval page …………………………………………………………............…………………. i

Certification …………………………………………………………..……………….……...…. ii

Dedication ………………………………………………………………………………………. iii

Acknowledgement ………………………………………………………………….…….…….. iv

Table of Contents………………………………………………………………………..……..… 1

Abstract ……………………………………………………………………………..…….…….. 5

CHAPTER ONE............................................................................................................................. 6

1.0 INTRODUCTION ……………………………………………………………………….….. 6

1.1 BACKGROUND OF THE STUDY ........................................................................................ 6

1.2 AIMS AND OBJECTIVE OF THE STUDY........................................................................... 7

1.3 MOTIVATION OF THE STUDY............................................................................................ 7

1.4 SCOPE OF THE STUDY......................................................................................................... 8

1.5 LIMITATION OF THE STUDY.............................................................................................. 8

1.6 DEFINITION OF TERMS....................................................................................................... 8

CHAPTER TWO ........................................................................................................................... 9

LITERATURE REVIEW............................................................................................................... 9

2.0 INTRODUCTION.................................................................................................................... 9

2.1 OVERVIEW OF SPEECH SYNTHESIS................................................................................. 9

2.2 TEXT TO SPEECH .........................................................................................................…...10

2.3 AUTOMATIC READING, WHAT FOR?..............................................................................11

2.3.1 Telecommunications services…………………………………………..………………….11

2.3.2 Language education…………………………………….………………………………….12

2.3.3 Aid to handicapped persons……………………………………………………….……….12

2.3.4 Talking books and toys……………………………………………………………….…..12

2.3.5 Vocal Monitoring……………………………………………………………………….….12

2.3.6 Multimedia, man-machine communication………………..………………………………13

2.3.7 Fundamental and applied research…………………………………………………………13

2.4 HOW DOES A MACHINE READ?.......................................................................................13

2.5 THE NLP COMPONENT……………………………………………………..…………….14

2.5.1 Text analysis..............….......................................................................................................15

2.5.2 Automatic phonetization………………………………...…………………………………16

2.5.3 Prosody generation………………………………………………………………...……….19

2.6 THE DSP COMPONENT…………………………………………………………….……..21

2.6.1 Rule-based synthesizers................................................................................................…....22

2.6.2 Concatenative synthesizers………………………………….……………………………….23

2.7 DATABASE PREPARATION………………………………………………………………23

2.7.1 Speech synthesis……………………………………………………………………..…….26

2.7.2 Segmental quality…………………………………………………………….…………….26

CHAPTER THREE.......................................................................................................................29

PROPOSED SOLUTION..............................................................................................................29

3.0 INTRODUCTION...................................................................................................................29

3.1 GENERAL DESCRIPTION OF THE EXISTING SYSTEM.................................................29

3.2 EXPECTATION OF THE NEW SYSTEM............................................................................31

3.3 FACT FINDING METHODS USED......................................................................................31

3.3.1 Primary Source......................................................................................................................31

3.3.2 Secondary Source..................................................................................................................32

3.4 ORGANIZATIONAL STRUCTURE.........................................................................................32

3.5 INPUT ANALYSIS.................................................................................................................32

3.6 PROCESS ANALYSIS...........................................................................................................32

3.7 SPEECH SYNTHESIS MODULE..........................................................................................33

3.7.1 To compute the output, the system consults.........................................................................33

3.7.2 Natural Language Processing (NLP) module.......................................................................34

3.7.3 Digital Signal Processing (DSP) module..............................................................................34

3.8 OUTPUT ANALYSIS.............................................................................................................35

CHAPTER FOUR..........................................................................................................................38

DESIGN AND IMPLEMENTATION..........................................................................................38

4.0 INTRODUCTION...................................................................................................................38

4.1 OUTPUT SPECIFICATION AND DESIGN..........................................................................38

4.2 INPUT DESIGN AND SPECIFICATION..............................................................................38

4.3 SYSTEM REQUIREMENTS.................................................................................................39

4.3.1 Hardware Requirements........................................................................................................39

4.3.2 Software Requirements.........................................................................................................39

4.4 CHOICE OF PROGRAMMING LANGUAGE............................................................................39


4.5 DESIGN OF THE NEW SYSTEM.........................................................................................40

4.6 DESIGN OF INDIVIDUAL OBJECTS OF THE PROGRAM..............................................41

4.7 FUNCTIONS OF THE ABSTRACT CLASSES....................................................................42

4.8 PRESENTATION OF SOME CODING OF THE PROGRAMME.......................................43

CHAPTER FIVE...........................................................................................................................45

EVALUATION/TESTING............................................................................................................45

5.0 INTRODUCTION...................................................................................................................45

5.1 WORKING..............................................................................................................................45

5.2 TESTING.................................................................................................................................49

CHAPTER SIX..............................................................................................................................50

CONCLUSION AND FUTURE WORK......................................................................................50

6.0 INTRODUCTION...................................................................................................................50

6.1 SUMMARY OF THE FINDING............................................................................................50

6.2 CONCLUSION………………………………………………………………………………51

6.3 RECOMMENDATION...........................................................................................................51

References......................................................................................................................................53

ABSTRACT

A text-to-speech synthesizer is an application that converts text into spoken words by analyzing and processing the text using Natural Language Processing (NLP) and then using Digital Signal Processing (DSP) technology to convert this processed text into a synthesized speech representation of the text. Here, we developed a useful text-to-speech synthesizer in the form of a simple application that converts inputted text into synthesized speech, reads it out to the user, and allows the result to be saved as an mp3 file. The development of a text-to-speech synthesizer will be of great help to people with visual impairment and will make working through large volumes of text easier.

CHAPTER ONE
1.0 INTRODUCTION
Text-to-speech synthesis (TTS) is the automatic conversion of a text into speech that resembles, as closely as possible, a native speaker of the language reading that text. A text-to-speech synthesizer is the technology that lets a computer speak to you. The TTS system takes text as its input; a computer algorithm called the TTS engine then analyses the text, pre-processes it, and synthesizes the speech with mathematical models. The TTS engine usually generates sound data in an audio format as the output.

The text-to-speech (TTS) synthesis procedure consists of two main phases. The first is text analysis, where the input text is transcribed into a phonetic or some other linguistic representation, and the second is the generation of speech waveforms, where the output is produced from this phonetic and prosodic information. These two phases are usually called high-level and low-level synthesis. A simplified version of this procedure is presented in the figure below. The input text might be, for example, data from a word processor, standard ASCII from e-mail, a mobile text message, or scanned text from a newspaper. The character string is then pre-processed and analyzed into a phonetic representation, which is usually a string of phonemes with some additional information for correct intonation, duration, and stress. Speech sound is finally generated with the low-level synthesizer from the information supplied by the high-level one. The artificial production of speech-like sounds has a long history, with documented mechanical attempts dating to the eighteenth century.
1.1 BACKGROUND OF THE STUDY
Voice/speech synthesis is a field of computer science that deals with designing computer systems that synthesize speech from written text. It is a technology that allows a computer to convert written text into speech delivered via a loudspeaker or telephone. As an emerging technology, not all developers are familiar with speech technology. While the basic functions of both speech synthesis and speech recognition take only minutes to understand, there are subtle and powerful capabilities provided by computerized speech that developers will want to understand and utilize. Automatic speech synthesis is one of the fastest developing fields in the framework of speech science and engineering. As a new generation of computing technology, it comes as the next major innovation in man-machine interaction after speech recognition, supporting Interactive Voice Response (IVR) systems. The basic idea of text-to-speech (TTS) technology is to convert written input to spoken output by generating synthetic speech. There are several ways of performing speech synthesis:

1. simple voice recording and playback on demand;
2. splitting of speech into 30-50 phonemes (basic linguistic units) and their re-assembly into a fluent speech pattern;
3. the use of approximately 400 diphones (obtained by splitting phrases at the centres of phonemes rather than at the transitions between them).
The most important qualities of modern speech synthesis systems are naturalness and intelligibility. By naturalness we mean how closely the synthesized speech resembles real human speech. Intelligibility, on the other hand, describes the ease with which the speech is understood. The maximization of these two criteria is the main development goal in the TTS field.
1.2 AIMS AND OBJECTIVE OF THE STUDY
This research aims at the design and implementation of a text-to-speech (text-to-audio) system to aid accessibility and easy assimilation of documents in text format.
The following are the objectives of the study:
1. To develop a system that will convert text to audio for easy assimilation of documents.
2. To build a system that can easily detect a text file and convert it to audio.
3. To design a system that will assist people with reading disabilities to easily convert text to audio files.
4. To design and implement a system that will support students' reading comprehension skills.
1.3 MOTIVATION OF THE STUDY
Presenting reading material orally, in addition to a traditional paper presentation format, increases users' ability to decode reading material and therefore has the potential to help students with reading disabilities better comprehend written texts. There are several different technologies for presenting material orally (e.g., text-to-speech, reading pens, audiobooks). Texts were already accessible orally through books-on-tape and through human readers. There is a need to develop and implement a text-to-speech system that can be used widely in educational settings, from elementary school through university. With the implementation of the text-to-audio system, there will be improved effects of text-to-speech and related tools for oral presentation of material on reading comprehension for students with reading disabilities.

1.4 SCOPE OF THE STUDY
The scope of the research is focused on implementing a text-to-speech (text-to-audio) system to improve the usability of text documents and to achieve a more flexible audio speech system.

1.5 LIMITATION OF THE STUDY

During the development of the research study, the following limitations were encountered:

1. Limited research material available at the school library and on the internet.
2. High cost of setting up the system, as it requires a high-level programming language.
3. The difficulty of combining school work with carrying out the research.
1.6 DEFINITION OF TERMS
Audio - most commonly refers to sound, as it is transmitted in signal form.

Speech - the expression of, or the ability to express, thoughts and feelings by articulate sounds.

Oral - relating to the transmission of information or literature by word of mouth rather than in writing.

Reading Disabilities - a condition in which a sufferer displays difficulty reading.

File - a collection of data treated as a unit by a computer.

Document - a written, drawn, presented, or memorialized representation of thought.

CHAPTER TWO

LITERATURE REVIEW
2.0 INTRODUCTION
Text-to-speech synthesis (TTS) is the automatic conversion of a text into speech that resembles, as closely as possible, a native speaker of the language reading that text. A text-to-speech synthesizer is the technology that lets a computer speak to you. The TTS system takes text as its input; a computer algorithm called the TTS engine then analyses the text, pre-processes it, and synthesizes the speech with mathematical models. The TTS engine usually generates sound data in an audio format as the output.

The text-to-speech (TTS) synthesis procedure consists of two main phases. The first is text analysis, where the input text is transcribed into a phonetic or some other linguistic representation, and the second is the generation of speech waveforms, where the output is produced from this phonetic and prosodic information. These two phases are usually called high-level and low-level synthesis. A simplified version of this procedure is presented in figure 1 below. The input text might be, for example, data from a word processor, standard ASCII from e-mail, a mobile text message, or scanned text from a newspaper. The character string is then pre-processed and analyzed into a phonetic representation, which is usually a string of phonemes with some additional information for correct intonation, duration, and stress. Speech sound is finally generated with the low-level synthesizer from the information supplied by the high-level one. The artificial production of speech-like sounds has a long history, with documented mechanical attempts dating to the eighteenth century.
2.1 OVERVIEW OF SPEECH SYNTHESIS
Speech synthesis can be described as the artificial production of human speech. A computer system used for this purpose is called a speech synthesizer, and it can be implemented in software or hardware. A text-to-speech (TTS) system converts normal language text into speech. Synthesized speech can be created by concatenating pieces of recorded speech that are stored in a database. Systems differ in the size of the stored speech units; a system that stores phones or diphones provides the largest output range, but may lack clarity. For specific usage domains, the storage of entire words or sentences allows for high-quality output. Alternatively, a synthesizer can incorporate a model of the vocal tract and other human voice characteristics to create a completely "synthetic" voice output. The quality of a speech synthesizer is judged by its
similarity to the human voice and by its ability to be understood. An intelligible text-to-speech
program allows people with visual impairments or reading disabilities to listen to written works
on a home computer.

A text-to-speech system (or "engine") is composed of two parts: a front-end and a back-end. The front-end has two major tasks. First, it converts raw text containing symbols like numbers and abbreviations into the equivalent of written-out words. This process is often called text normalization, pre-processing, or tokenization. The front-end then assigns phonetic transcriptions to each word, and divides and marks the text into prosodic units such as phrases, clauses, and sentences. The process of assigning phonetic transcriptions to words is called text-to-phoneme or grapheme-to-phoneme conversion. Phonetic transcriptions and prosody information together make up the symbolic linguistic representation that is output by the front-end. The back-end, often referred to as the synthesizer, then converts the symbolic linguistic representation into sound. In certain systems, this part includes the computation of the target prosody (pitch contour).
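
To make the front-end/back-end split concrete, the following sketch expresses the two stages as plain Python functions. It is only an illustrative skeleton under assumed names: the tiny abbreviation table, the toy pronunciation lexicon, and the sine-tone "back-end" are invented for the example and are not the system described later in this work.

import numpy as np

ABBREVIATIONS = {"Mr.": "mister", "Dr.": "doctor"}            # assumed examples
LEXICON = {"mister": ["M", "IH", "S", "T", "ER"],              # toy pronunciations
           "hello": ["HH", "AH", "L", "OW"]}

def front_end(raw_text):
    """Normalize the text and return a symbolic (phonemic) representation."""
    words = [ABBREVIATIONS.get(tok, tok.lower().strip(".,!?")) for tok in raw_text.split()]
    phonemes = []
    for w in words:
        # Dictionary lookup with a crude letter-by-letter fallback for unknown words.
        phonemes.extend(LEXICON.get(w, list(w.upper())))
    return phonemes

def back_end(phonemes, sample_rate=16000):
    """Placeholder synthesizer: one short fixed-pitch tone burst per phoneme."""
    t = np.arange(int(0.08 * sample_rate)) / sample_rate
    burst = 0.1 * np.sin(2 * np.pi * 180 * t)
    return np.concatenate([burst for _ in phonemes])

waveform = back_end(front_end("Mr. hello"))
print(len(waveform))   # number of samples produced

A real back-end would of course replace the tone generator with concatenated diphones or a formant model, as discussed in section 2.6.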

2.2 TEXT TO SPEECH

A Text-To-Speech (TTS) synthesizer is a computer-based system that should be able to read any text aloud, whether it was directly introduced in the computer by an operator or scanned and submitted to an Optical Character Recognition (OCR) system. Let us try to be clear. There is a fundamental difference between the system we are about to discuss here and any other talking machine (such as a cassette player), in the sense that we are interested in the automatic production of new sentences. This definition still needs some refinement. Systems that simply concatenate isolated words or parts of sentences, denoted as Voice Response Systems, are only applicable when a limited vocabulary is required (typically a few hundred words), and when the sentences to be pronounced respect a very restricted structure, as is the case for the announcement of arrivals in train stations for instance. In the context of TTS synthesis, it is impossible (and luckily useless) to record and store all the words of the language. It is thus more suitable to define Text-To-Speech as the automatic production of speech, through a grapheme-to-phoneme transcription of the sentences to utter.

At first sight, this task does not look too hard to perform. After all, is not the human being
potentially able to correctly pronounce an unknown sentence, even from his childhood? We all have, mainly unconsciously, a deep knowledge of the reading rules of our mother tongue. They
were transmitted to us, in a simplified form, at primary school, and we improved them year after
year. However, it would be a bold claim indeed to say that it is only a short step before the
computer is likely to equal the human being in that respect. Despite the present state of our
knowledge and techniques and the progress recently accomplished in the fields of Signal
Processing and Artificial Intelligence, we would have to express some reservations. As a matter
of fact, the reading process draws from the furthest depths, often unthought of, of the human
intelligence.

2.3 AUTOMATIC READING, WHAT FOR?

Each and every synthesizer is the result of a particular and original imitation of the human
reading capability, submitted to technological and imaginative constraints that are characteristic
of the time of its creation. The concept of high quality TTS synthesis appeared in the mid-
eighties, as a result of important developments in speech synthesis and natural language
processing techniques, mostly due to the emergence of new technologies (Digital Signal and
Logical Inference Processors). It is now a must for the speech products family expansion.

Potential applications of High Quality TTS Systems are indeed numerous. Here are some
examples:

2.3.1 Telecommunications services: TTS systems make it possible to access textual information over the telephone. Knowing that about 70% of telephone calls actually require very little interactivity, such a prospect is worth considering. Texts might range from simple messages, such as local cultural events not to be missed (cinemas, theatres, ...), to huge databases which can hardly be read and stored as digitized speech. Queries to such information retrieval systems could be put through the user's voice (with the help of a speech recognizer), or through the telephone keyboard (with DTMF systems). One could even imagine that our (artificially) intelligent machines could speed up the query when needed, by providing lists of keywords, or even summaries. In this connection, AT&T has recently organized a series of consumer tests for some promising telephone services (Levinson et al. 93). They include: Who's Calling (get the spoken name of your caller before being connected and hang up to avoid the call), Integrated Messaging (have your electronic mail or facsimiles automatically read over the telephone), Telephone Relay Service (have a telephone conversation with speech- or hearing-impaired persons thanks to ad hoc text-to-voice and voice-to-text conversion), and Automated Caller Name and Address (a computerized version of the "reverse directory"). These applications have proved acceptable, and even popular, provided the intelligibility of the synthetic utterances was high enough. Naturalness was not a major issue in most cases.
2.3.2 Language education: High Quality TTS synthesis can be coupled with a Computer
Aided Learning system, and provide a helpful tool to learn a new language. To our
knowledge, this has not been done yet, given the relatively poor quality available with
commercial systems, as opposed to the critical requirements of such tasks.
2.3.3 Aid to handicapped persons: Voice handicaps originate in mental or motor/sensation disorders. Machines can be an invaluable support in the latter case: with the help of an especially designed keyboard and a fast sentence-assembling program, synthetic speech can be produced in a few seconds to remedy these impediments. Astrophysicist Stephen Hawking gives all his lectures in this way. The aforementioned
Telephone Relay Service is another example. Blind people also widely benefit from
TTS systems, when coupled with Optical Recognition Systems (OCR), which give
them access to written information. The market for speech synthesis for blind users of
personal computers will soon be invaded by mass-market synthesizers bundled with
sound cards. DECtalk (TM) is already available with the latest SoundBlaster (TM)
cards now, although not yet in a form useful for blind people.
2.3.4 Talking books and toys: The toy market has already been touched by speech
synthesis. Many speaking toys have appeared, under the impulse of the innovative
'Magic Spell' from Texas Instruments. The poor quality available inevitably restrains
the educational ambition of such products. High Quality synthesis at affordable prices
might well change this.
2.3.5 Vocal Monitoring: In some cases, oral information is more efficient than written
messages. The appeal is stronger, while the attention may still focus on other visual sources of information. Hence the idea of incorporating speech synthesizers in
measurement or control systems.
2.3.6 Multimedia, man-machine communication: In the long run, the development of
high quality TTS systems is a necessary step (as is the enhancement of speech
recognizers) towards more complete means of communication between men and
computers. Multimedia is a first but promising move in this direction.
2.3.7 Fundamental and applied research: TTS synthesizers possess a very peculiar
feature which makes them wonderful laboratory tools for linguists: they are
completely under control, so that repeated experiences provide identical results (as is
hardly the case with human beings). Consequently, they allow investigating the
efficiency of intonative and rhythmic models. A particular type of TTS systems, which
are based on a description of the vocal tract through its resonant frequencies (its
formants) and denoted as formant synthesizers, has also been extensively used by
phoneticians to study speech in terms of acoustical rules. In this manner, for instance,
articulatory constraints have been enlightened and formally described.
2.4 HOW DOES A MACHINE READ?

From now on, it should be clear that a reading machine would hardly adopt a processing scheme such as the one naturally taken up by humans, whether for language analysis or for speech production itself. Vocal sounds are inherently governed by the partial differential
equations of fluid mechanics, applied in a dynamic case since our lung pressure, glottis tension,
and vocal and nasal tracts configuration evolve with time. These are controlled by our cortex,
which takes advantage of the power of its parallel structure to extract the essence of the text read:
its meaning. Even though, in the current state of the engineering art, building a Text-To-Speech
synthesizer on such intricate models is almost scientifically conceivable (intensive research on
articulatory synthesis, neural networks, and semantic analysis give evidence of it), it would result
anyway in a machine with a very high degree of (possibly avoidable) complexity, which is not
always compatible with economic criteria. After all, flies do not flap their wings!

Figure 2.0 introduces the functional diagram of a very general TTS synthesizer. As for human
reading, it comprises a Natural Language Processing module (NLP), capable of producing a
phonetic transcription of the text read, together with the desired intonation and rhythm (often termed as prosody), and a Digital Signal Processing module (DSP), which transforms the
symbolic information it receives into speech. But the formalisms and algorithms applied often
manage, thanks to a judicious use of mathematical and linguistic knowledge of developers, to
short-circuit certain processing steps. This is occasionally achieved at the expense of some
restrictions on the text to pronounce, or results in some reduction of the "emotional dynamics" of
the synthetic voice (at least in comparison with human performance), but it generally allows the problem to be solved in real time with limited memory requirements.

Figure 2.0. A simple but general functional diagram of a TTS system.

2.5 THE NLP COMPONENT


Figure 2.1 introduces the skeleton of a general NLP module for TTS purposes. One immediately notices that, in addition to the expected letter-to-sound and prosody generation blocks, it comprises a morpho-syntactic analyzer, underlining the need for some syntactic processing in a high quality Text-To-Speech system. Indeed, being able to reduce a given sentence into something like the sequence of its parts of speech, and to further describe it in the form of a syntax tree, which unveils its internal structure, is required for at least two reasons:

1. Accurate phonetic transcription can only be achieved provided the part-of-speech category of some words is available, and the dependency relationships between successive words are known.
2. Natural prosody heavily relies on syntax. It also obviously has a lot to do with semantics and pragmatics, but since very little data is currently available on the generative aspects of this dependence, TTS systems merely concentrate on syntax.

Yet few of them are actually provided with full disambiguation and structuration
capabilities.

Figure 2.1 The NLP module of a general Text-To-Speech conversion system.


2.5.1 Text analysis

The text analysis block is itself composed of:

A pre-processing module, which organizes the input sentences into manageable lists of words. It identifies numbers, abbreviations, acronyms and idiomatic expressions and transforms them into full text when needed. An important problem is already encountered at the character level: that of punctuation ambiguity (including the critical case of sentence end detection). It can be solved, to some extent, with elementary regular grammars.
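
As an illustration of what such a pre-processing module might look like, the sketch below expands a few abbreviations and single-digit numbers and then detects sentence ends with a simple regular-expression heuristic. The abbreviation table and the very limited number handling are assumptions made purely for the example.

import re

ABBREV = {"Mr.": "Mister", "Dr.": "Doctor", "St.": "Street"}   # assumed examples
ONES = ["zero", "one", "two", "three", "four", "five",
        "six", "seven", "eight", "nine"]

def expand_token(tok):
    if tok in ABBREV:
        return ABBREV[tok]
    if tok.isdigit() and len(tok) == 1:          # only single digits handled here
        return ONES[int(tok)]
    return tok

def preprocess(text):
    # Expand abbreviations first so their periods no longer look like sentence ends.
    flat = " ".join(expand_token(t) for t in text.split())
    # Naive sentence-end detection: a terminal mark followed by a capitalized word.
    return re.split(r"(?<=[.!?])\s+(?=[A-Z])", flat)

print(preprocess("Mr. Smith has 2 dogs. They bark."))
# ['Mister Smith has two dogs.', 'They bark.']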

A morphological analysis module, the task of which is to propose all possible part-of-speech categories for each word taken individually, on the basis of its spelling. Inflected, derived, and compound words are decomposed into their elementary grapheme units (their morphs) by simple regular grammars exploiting lexicons of stems and affixes (see the CNET TTS conversion program for French (Larreur et al. 89), or the MITTALK system (Allen et al. 87)).

The contextual analysis module considers words in their context, which allows it to reduce the list of their possible part-of-speech categories to a very restricted number of highly probable hypotheses, given the corresponding possible parts of speech of neighbouring words. This can be achieved either with n-grams (see Kupiec 92, Willemse & Gulikers 92, for instance), which describe local syntactic dependences in the form of probabilistic finite state automata (i.e. as a Markov model), to a lesser extent with multi-layer perceptrons (i.e., neural networks) trained to uncover contextual rewrite rules, as in (Benello et al. 89), or with local, non-stochastic grammars provided by expert linguists or automatically inferred from a training data set with classification and regression tree (CART) techniques (Sproat et al. 92, Yarowsky 94).
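
A minimal sketch of the n-gram idea follows: each word's candidate part-of-speech tags are rescored with bigram transition probabilities so that the local context selects one hypothesis. The toy lexicon and probabilities are invented for illustration, and the greedy left-to-right pass is a simplification of the Markov-model (e.g. Viterbi) search a real tagger would use.

# Toy contextual disambiguation with bigram tag-transition probabilities.
CANDIDATES = {"the": ["DET"], "record": ["NOUN", "VERB"], "plays": ["VERB", "NOUN"]}
BIGRAM_P = {("<S>", "DET"): 0.8, ("<S>", "NOUN"): 0.2,
            ("DET", "NOUN"): 0.9, ("DET", "VERB"): 0.1,
            ("NOUN", "VERB"): 0.7, ("NOUN", "NOUN"): 0.3,
            ("VERB", "NOUN"): 0.8, ("VERB", "VERB"): 0.2}

def tag(words):
    """Greedy left-to-right tagging: keep the candidate tag with the highest
    transition probability from the previously chosen tag."""
    prev, tags = "<S>", []
    for w in words:
        best = max(CANDIDATES[w], key=lambda t: BIGRAM_P.get((prev, t), 0.0))
        tags.append(best)
        prev = best
    return tags

print(tag(["the", "record", "plays"]))   # ['DET', 'NOUN', 'VERB']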

Finally, a syntactic-prosodic parser, which examines the remaining search space and finds the
text structure (i.e. its organization into clause and phrase-like constituents) which more closely
relates to its expected prosodic realization (see below).

2.5.2 Automatic phonetization

A poem by the Dutch high school teacher and linguist G.N. Trenité surveys this problem in an amusing way. It desperately ends with:

Finally, which rhymes with "enough",
Though, through, plough, cough, hough, or tough?
Hiccough has the sound of "cup",
My advice is ... give it up!

The Letter-To-Sound (LTS) module is responsible for the automatic determination of the
phonetic transcription of the incoming text. It thus seems, at first sight, that its task is as simple as performing the equivalent of a dictionary look-up! From a deeper examination, however, one
quickly realizes that most words appear in genuine speech with several phonetic transcriptions,
many of which are not even mentioned in pronunciation dictionaries. Namely:

1. Pronunciation dictionaries refer to word roots only. They do not explicitly account for morphological variations (i.e. plural, feminine, conjugations, especially for highly inflected languages, such as French), which therefore have to be dealt with by a specific component of phonology, called morphophonology.
2. Some words actually correspond to several entries in the dictionary, or more generally to several morphological analyses, generally with different pronunciations. This is typically the case with heterophonic homographs, i.e. words that are pronounced differently even though they have the same spelling, as for 'record' (/ˈrekɔːd/ or /rɪˈkɔːd/); these constitute by far the most tedious class of pronunciation ambiguities. Their correct pronunciation generally depends on their part of speech and most frequently contrasts verbs and non-verbs, as for 'contrast' (verb/noun) or 'intimate' (verb/adjective), although it may also be based on syntactic features, as for 'read' (present/past).
3. Pronunciation dictionaries merely provide something that is closer to a phonemic transcription than to a phonetic one (i.e. they refer to phonemes rather than to phones). As noted by Withgott and Chen (1993): "while it is relatively straightforward to build computational models for morphophonological phenomena, such as producing the dictionary pronunciation of 'electricity' given a base form 'electric', it is another matter to model how that pronunciation actually sounds". Consonants, for example, may reduce or delete in clusters, a phenomenon termed consonant cluster simplification, as in 'softness' [ˈsɒfnɪs], in which [t] fuses in a single gesture with the following [n].
4. Words embedded in sentences are not pronounced as if they were isolated. Surprisingly enough, the difference does not only originate in variations at word boundaries (as with phonetic liaisons), but also in alternations based on the organization of the sentence into non-lexical units, that is, whether into groups of words (as for phonetic lengthening) or into non-lexical parts thereof (many phonological processes, for instance, are sensitive to syllable structure).

5. Finally, not all words can be found in a phonetic dictionary: the pronunciation of new words and of many proper names has to be deduced from that of already known words.

Clearly, points 1 and 2 heavily rely on a preliminary morpho-syntactic (and possibly semantic) analysis of the sentences to read. To a lesser extent, this also happens to be the case for point 3, since reduction processes are not only a matter of context-sensitive phonation, but also rely on morphological structure and on word grouping, that is, on morpho-syntax. Point 4 puts a strong demand on sentence analysis, whether syntactic or metrical, and point 5 can be partially solved by addressing morphology and/or by finding graphemic analogies between words.

It is then possible to organize the task of the LTS module in many ways (Fig. 2.3), often
roughly classified into dictionary-based and rule-based strategies, although many intermediate
solutions exist.

Dictionary-based solutions consist of storing a maximum of phonological knowledge into a


lexicon. In order to keep its size reasonably small, entries are generally restricted to morphemes,
and the pronunciation of surface forms is accounted for by inflectional, derivational, and
compounding morphophonemic rules which describe how the phonetic transcriptions of their
morphemic constituents are modified when they are combined into words. Morphemes that
cannot be found in the lexicon are transcribed by rule. After a first phonemic transcription of
each word has been obtained, some phonetic post-processing is generally applied, so as to
account for coarticulatory smoothing phenomena. This approach has been followed by the
MITTALK system (Allen et al. 87) from its very first day. A dictionary of up to 12,000
morphemes covered about 95% of the input words. The AT&T Bell Laboratories TTS system
follows the same guideline (Levinson et al. 93), with an augmented morpheme lexicon of 43,000
morphemes (Coker 85).

A rather different strategy is adopted in rule-based transcription systems, which transfer most
of the phonological competence of dictionaries into a set of letter-to-sound (or grapheme-to-
phoneme) rules. This time, only those words that are pronounced in such a particular way that
they constitute a rule on their own are stored in an exceptions dictionary. Notice that, since many exceptions are found in the most frequent words, a reasonably small exceptions dictionary can account for a large fraction of the words in a running text. In English, for instance, 2000 words typically suffice to cover 70% of the words in text (Hunnicutt 80).

It has been argued in the early days of powerful dictionary-based methods that they were inherently capable of achieving higher accuracy than letter-to-sound rules (Coker et al. 90), given the availability of very large phonetic dictionaries on computers. On the other hand, considerable efforts have recently been made towards designing sets of rules with a very wide coverage (starting from computerized dictionaries and adding rules and exceptions until all words are covered, as in the work of Daelemans & van den Bosch (1993) or that of Belrhali et al. (1992)). Clearly, some trade-off is inescapable. Besides, the compromise is language-dependent, given the obvious differences in the reliability of letter-to-sound correspondences for different languages.
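
The following sketch contrasts the two strategies in miniature: a small exceptions lexicon is consulted first, and a handful of letter-to-sound rules act as the fallback for unknown words. Both the lexicon entries and the rules are toy assumptions chosen for readability, not an actual English rule set.

# Toy letter-to-sound module: dictionary lookup with a rule-based fallback.
LEXICON = {"enough": ["IH", "N", "AH", "F"],     # exceptions stored explicitly
           "one":    ["W", "AH", "N"]}

# Extremely simplified grapheme-to-phoneme rules, longest graphemes listed first.
RULES = [("ough", ["AW"]), ("sh", ["SH"]), ("th", ["TH"]),
         ("a", ["AE"]), ("e", ["EH"]), ("i", ["IH"]), ("o", ["OW"]),
         ("u", ["AH"]), ("b", ["B"]), ("c", ["K"]), ("d", ["D"]),
         ("g", ["G"]), ("h", ["HH"]), ("l", ["L"]), ("n", ["N"]),
         ("p", ["P"]), ("r", ["R"]), ("s", ["S"]), ("t", ["T"])]

def letter_to_sound(word):
    word = word.lower()
    if word in LEXICON:                    # dictionary-based branch
        return LEXICON[word]
    phones, i = [], 0                      # rule-based branch
    while i < len(word):
        for graph, ph in RULES:
            if word.startswith(graph, i):
                phones.extend(ph)
                i += len(graph)
                break
        else:
            i += 1                         # silently skip letters no rule covers
    return phones

print(letter_to_sound("enough"))   # exception: ['IH', 'N', 'AH', 'F']
print(letter_to_sound("though"))   # by rule:   ['TH', 'AW']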

Figure 2.3. Dictionary-based (left) versus rule-based (right) phonetization.

2.5.3 Prosody generation

The term prosody refers to certain properties of the speech signal which are related to audible
changes in pitch, loudness, and syllable length. Prosodic features have specific functions in
speech communication (see Fig.2.4). The most apparent effect of prosody is that of focus. For
instance, there are certain pitch events which make a syllable stand out within the utterance, and
indirectly the word or syntactic group it belongs to will be highlighted as an important or new
component in the meaning of that utterance. The presence of a focus marking may have various effects, such as contrast, depending on the place where it occurs, or the semantic context of the
utterance.

Fig.2.4. Different kinds of information provided by intonation (lines indicate pitch movements;
solid lines indicate stress).
a) Focus or given/new information;
b) Relationships between words (saw-yesterday; I-yesterday; I-him)
c) Finality (top) or continuation (bottom), as it appears on the last syllable;
d) Segmentation of the sentence into groups of syllables.

Although maybe less obvious, there are other, more systematic or general functions.

Prosodic features create a segmentation of the speech chain into groups of syllables, or,
put the other way round, they give rise to the grouping of syllables and words into larger chunks.
Moreover, there are prosodic features which indicate relationships between such groups,
indicating that two or more groups of syllables are linked in some way. This grouping effect is
hierarchical, although not necessarily identical to the syntactic structuring of the utterance.

So what? Does this mean that TTS systems are doomed to a mere robot-like intonation
until a brilliant computational linguist announces a working semantic-pragmatic analyzer for
unrestricted text (i.e. not before long)? There are various reasons to think not, provided one accepts an important restriction on the naturalness of the synthetic voice, i.e. that its intonation is kept 'acceptably neutral':

"Acceptable intonation must be plausible, but need not be the most appropriate
intonation for a particular utterance: no assumption of understanding or generation by
the machine need be made. Neutral intonation does not express unusual emphasis,
contrastive stress or stylistic effects: it is the default intonation which might be used for
an utterance out of context. (...) This approach removes the necessity for reference to
context or world knowledge while retaining ambitious linguistic goals." (Monaghan 89)
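
As a very rough illustration of such a neutral prosody, the sketch below assigns each syllable a duration and a fundamental frequency taken from a linearly declining contour over the phrase, with a small rise on a focused syllable. The numeric values are arbitrary assumptions for the example; real prosody models are considerably richer.

def neutral_prosody(syllables, focus_index=None,
                    f0_start=180.0, f0_end=120.0, base_duration=0.18):
    """Return (syllable, duration_s, f0_hz) triples along a declining pitch line."""
    n = len(syllables)
    contour = []
    for i, syl in enumerate(syllables):
        f0 = f0_start + (f0_end - f0_start) * (i / max(n - 1, 1))
        dur = base_duration
        if focus_index is not None and i == focus_index:
            f0 *= 1.15          # modest pitch excursion on the focused syllable
            dur *= 1.2          # and a little lengthening
        contour.append((syl, round(dur, 3), round(f0, 1)))
    return contour

print(neutral_prosody(["he", "saw", "him", "yes", "ter", "day"], focus_index=1))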

2.6 THE DSP COMPONENT

Intuitively, the operations involved in the DSP module are the computer analogue of
dynamically controlling the articulatory muscles and the vibratory frequency of the vocal folds
so that the output signal matches the input requirements. In order to do it properly, the DSP
module should obviously, in some way, take articulatory constraints into account, since it has
been known for a long time that phonetic transitions are more important than stable states for the
understanding of speech (Lieberman 59). This, in turn, can be basically achieved in two ways:

• Explicitly, in the form of a series of rules which formally describe the influence of
phonemes on one another;
• Implicitly, by storing examples of phonetic transitions and co-articulations into a speech
segment database, and using them just as they are, as ultimate acoustic units (i.e. in place
of phonemes).

Two main classes of TTS systems have emerged from this alternative, which quickly turned into
synthesis philosophies given the divergences they present in their means and objectives:
synthesis-by-rule and synthesis-by-concatenation.

2.6.1 Rule-based synthesizers

Rule-based synthesizers are mostly in favor with phoneticians and phonologists, as they constitute a cognitive, generative approach to the phonation mechanism. The broad spreading of the Klatt synthesizer (Klatt 80), for instance, is principally due to its invaluable assistance in the study of the characteristics of natural speech, by analytic listening of rule-synthesized speech. What is more, the existence of relationships between articulatory parameters and the inputs of the Klatt model makes it a practical tool for investigating physiological constraints (Stevens 90).

For historical and practical reasons (mainly the need for a physical interpretability of the
model), rule synthesizers always appear in the form of formant synthesizers. These describe
speech as the dynamic evolution of up to 60 parameters (Stevens 90), mostly related to formant
and anti-formant frequencies and bandwidths together with glottal waveforms. Clearly, the large
number of (coupled) parameters complicates the analysis stage and tends to produce analysis
errors. What is more, formant frequencies and bandwidths are inherently difficult to estimate
from speech data. The need for intensive trials and errors in order to cope with analysis errors
makes them time-consuming systems to develop (several years are commonplace). Yet, the
synthesis quality achieved up to now reveals typical buzziness problems, which originate from
the rules themselves: introducing a high degree of naturalness is theoretically possible, but the
rules to do so are still to be discovered.

Rule-based synthesizers remain, however, a potentially powerful approach to speech


synthesis. They make it possible, for instance, to study speaker-dependent voice features, so that switching from one synthetic voice to another can be achieved with the help of specialized rules in the rule database. Following the same idea, synthesis-by-rule seems to be a natural way of handling the articulatory aspects of changes in speaking styles (as opposed to their prosodic counterpart, which can be accounted for by concatenation-based synthesizers as well). No wonder then that it has been widely integrated into TTS systems: MITTALK (Allen et al. 87) and the JSRU synthesizer (Holmes et al. 64) for English, the multilingual INFOVOX system (Carlson et al. 82), and the I.N.R.S. system (O'Shaughnessy 84) for French.

2.6.2 Concatenative synthesizers

As opposed to rule-based ones, concatenative synthesizers possess a very limited knowledge of the data they handle: most of it is embedded in the segments to be chained up. This clearly appears in Figure 2.5, where all the operations that could indifferently be used in the context of a music synthesizer (i.e. without any explicit reference to the inner nature of the sounds to be processed) have been grouped into a sound processing block, as opposed to the upper speech processing block whose design requires at least some understanding of phonetics.

2.7 DATABASE PREPARATION

A series of preliminary stages have to be fulfilled before the synthesizer can produce its first utterance. At first, segments are chosen so as to minimize future concatenation problems. A combination of diphones (i.e. units that begin in the middle of the stable state of a phone and end in the middle of the following one), half-syllables, and triphones (which differ from diphones in that they include a complete central phone) are often chosen as speech units, since they involve most of the transitions and co-articulations while requiring an affordable amount of memory. When a complete list of segments has emerged, a corresponding list of words is carefully completed, in such a way that each segment appears at least once (twice is better, for security). Unfavorable positions, like inside stressed syllables or in strongly reduced (i.e. over-co-articulated) contexts, are excluded. A corpus is then digitally recorded and stored, and the elected segments are spotted, either manually with the help of signal visualization tools, or automatically thanks to segmentation algorithms, the decisions of which are checked and corrected interactively. A segment database finally centralizes the results, in the form of the segment names, waveforms, durations, and internal sub-splitting. In the case of diphones, for example, the position of the border between phones should be stored, so as to be able to modify the duration of one half-phone without affecting the length of the other one.
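
A segment database of this kind can be represented very simply in code. The sketch below defines a hypothetical diphone entry carrying exactly the fields listed above (name, waveform, duration, and the internal border between the two half-phones); the field names and sample values are illustrative assumptions.

from dataclasses import dataclass
import numpy as np

SAMPLE_RATE = 16000  # assumed sampling rate for the recorded corpus

@dataclass
class DiphoneSegment:
    name: str             # e.g. "a-b": from the middle of /a/ to the middle of /b/
    waveform: np.ndarray  # raw samples cut out of the recorded corpus
    border: int           # sample index of the phone boundary inside the segment

    @property
    def duration(self) -> float:
        return len(self.waveform) / SAMPLE_RATE

# A toy database keyed by diphone name (waveforms here are just silence).
database = {
    "a-b": DiphoneSegment("a-b", np.zeros(1600), border=800),
    "b-a": DiphoneSegment("b-a", np.zeros(1440), border=700),
}
print(database["a-b"].duration)   # 0.1 seconds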

Figure 2.5. A general concatenation-based synthesizer.

The upper left hatched block corresponds to the development of the synthesizer (i.e.
it is processed once for all). Other blocks correspond to run-time operations. Language-
dependent operations and data are indicated by a flag.

Segments are then often given a parametric form, in the form of a temporal sequence of
vectors of parameters collected at the output of a speech analyzer and stored in a parametric
segment database. The advantage of using a speech model originates in the fact that:

• Well-chosen speech models allow data size reduction, an advantage which is hardly
negligible in the context of concatenation-based synthesis given the amount of data to be
stored. Consequently, the analyzer is often followed by a parametric speech coder.
• A number of models explicitly separate the contributions of respectively the source and
the vocal tract, an operation which remains helpful for the pre-synthesis operations:
prosody matching and segments concatenation.

Indeed, the actual task of the synthesizer is to produce, in real-time, an adequate sequence of
concatenated segments, extracted from its parametric segment database and the prosody of which
has been adjusted from their stored value, i.e. the intonation and the duration they appeared with
in the original speech corpus, to the one imposed by the language processing module.
Consequently, the respective parts played by the prosody matching and segments concatenation
modules are considerably alleviated when input segments are presented in a form that allows
easy modification of their pitch, duration, and spectral envelope, as is hardly the case with crude
waveform samples.

Since segments to be chained up have generally been extracted from different words, that is, from different phonetic contexts, they often present amplitude and timbre mismatches. Even in the case of stationary vocalic sounds, for instance, a rough sequencing of parameters typically leads to audible discontinuities. These can be coped with during the constitution of the synthesis segment database, thanks to an equalization in which the related endings of segments are forced to have similar amplitude spectra, the difference being distributed over their neighborhood. In practice, however, this operation is restricted to amplitude parameters: the equalization stage smoothly modifies the energy levels at the beginning and at the end of segments, in such a way as to eliminate amplitude mismatches (by setting the energy of all the phones of a given phoneme to their average value). In contrast, timbre conflicts are better tackled at run-time, by smoothing individual couples of segments when necessary rather than equalizing them once and for all, so that some of the phonetic variability naturally introduced by co-articulation is still maintained. In practice, amplitude equalization can be performed either before or after speech analysis (i.e. on crude samples or on speech parameters).

Once the parametric segment database has been completed, synthesis itself can begin.

2.7.1 Speech synthesis

A sequence of segments is first deduced from the phonemic input of the synthesizer, in a
block termed as segment list generation in Fig.2.5, which interfaces the NLP and DSP modules.
Once prosodic events have been correctly assigned to individual segments, the prosody matching
module queries the synthesis segment database for the actual parameters, adequately encoded, of
the elementary sounds to be used, and adapts them one by one to the required prosody. The
segment concatenation block is then in charge of dynamically matching segments to one another,
by smoothing discontinuities. Here again, an adequate modelization of speech is highly
profitable, provided simple interpolation schemes performed on its parameters approximately
correspond to smooth acoustical transitions between sounds. The resulting stream of parameters
is finally presented at the input of a synthesis block, the exact counterpart of the analysis one. Its
task is to produce speech.
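
To give a feel for the segment concatenation block, the sketch below chains two waveform segments with a short linear cross-fade at the joint, which is one simple way of smoothing discontinuities. It operates directly on raw samples purely for illustration; as the text explains, practical systems usually work on a parametric representation instead.

import numpy as np

def concatenate(seg_a, seg_b, sample_rate=16000, fade_ms=5):
    """Join two waveform segments with a linear cross-fade over fade_ms milliseconds."""
    n = int(sample_rate * fade_ms / 1000)
    n = min(n, len(seg_a), len(seg_b))
    fade_out = np.linspace(1.0, 0.0, n)
    fade_in = 1.0 - fade_out
    joint = seg_a[-n:] * fade_out + seg_b[:n] * fade_in
    return np.concatenate([seg_a[:-n], joint, seg_b[n:]])

# Two toy "segments": short sinusoids at slightly different amplitudes.
t = np.arange(1600) / 16000
a = 0.30 * np.sin(2 * np.pi * 200 * t)
b = 0.25 * np.sin(2 * np.pi * 200 * t)
out = concatenate(a, b)
print(len(out))   # slightly shorter than len(a) + len(b) because of the overlap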

2.7.2 Segmental quality

The efficiency of concatenative synthesizers to produce high quality speech is mainly


subordinated to:

1. The type of segments chosen.

Segments should obviously exhibit some basic properties:

o They should allow accounting for as many co-articulatory effects as possible.


o Given the restricted smoothing capabilities of the concatenation block, they
should be easily connectable.
o Their number and length should be kept as small as possible.
o On the other hand, longer units decrease the density of concatenation points,
therefore providing better speech quality. Similarly, an obvious way of accounting for articulatory phenomena is to provide many variants for each phoneme. This is clearly in contradiction with the limited memory constraint. Some trade-off is necessary. Diphones are often chosen. They are not too numerous (about 1200 for French, including lots of phoneme sequences that are only encountered at word boundaries, for 3 minutes of speech, i.e. approximately 5 Mbytes of 16-bit samples at 16 kHz) and they do incorporate most phonetic transitions. No wonder then that they have been extensively used. They imply, however, a high density of concatenation points (one per phoneme), which reinforces the importance of an efficient concatenation algorithm. Besides, they can only partially account for the many co-articulatory effects of a spoken language, since these often affect a whole phone rather than just its right or left halves independently. Such effects are especially patent when somewhat transient phones, such as liquids and (worst of all) semi-vowels, are to be connected to each other. Hence the use of some larger units as well, such as triphones.
2. The model of speech signal, to which the analysis and synthesis algorithms refer.

The models used in the context of concatenative synthesis can be roughly classified into two groups, depending on their relationship with the actual phonation process. Production models provide mathematical substitutes for the parts respectively played by the vocal folds, the nasal and vocal tracts, and the lip radiation. Their most representative members are Linear Prediction Coding (LPC) synthesizers (Markel & Gray 76), and the formant synthesizers mentioned in section 2.6.1. On the contrary, phenomenological models intentionally discard any reference to the human production mechanism. Among these pure digital signal processing tools, spectral and time-domain approaches are increasingly encountered in TTS systems. A leading time-domain approach, TD-PSOLA, uses no explicit speech model. It exhibits very interesting practical features: a very high speech quality (the best currently available) combined with a very low computational cost (7 operations per sample on average). The hybrid harmonic/stochastic (H/S) model is intrinsically more powerful than TD-PSOLA, but it is also about ten times more computationally intensive. PSOLA synthesizers are now widely used in the speech synthesis community. The recently developed MBROLA algorithm (Dutoit 93, 96) even provides a time-domain algorithm which exhibits the very efficient smoothing capabilities of the H/S model (for the spectral envelope mismatches that cannot be avoided at concatenation points) as well as its very high data compression ratios (up to 10 with almost no additional computational cost) while keeping the computational complexity of PSOLA.

CHAPTER THREE

PROPOSED SOLUTION

3.0 INTRODUCTION

Text-to-speech synthesis takes place in several steps. The TTS system gets a text as input, which it first must analyze and then transform into a phonetic description. Then, in a further step, it generates the prosody. From the information now available, it can produce a speech signal.

3.1 GENERAL DESCRIPTION OF THE EXISTING SYSTEM

In today's world, where most information is shared digitally, visually impaired persons always require their reading glasses to have access to this information; in a situation where they have somehow forgotten their reading glasses, they will not be able to access this information. But with a text-to-speech system, digital information can be read out to a visually impaired person.

1. Analysis and Problems of Existing Systems

The algorithm of existing systems is shown below in Figure 3.1. It shows that the system does not provide an avenue to annotate text to the specification of the user; rather, it speaks plain text.

Figure 3.1: Algorithm of already existing systems

Studies revealed the following inadequacies with already existing systems:

1. Structure analysis: punctuation and formatting do not indicate where paragraphs and other structures start and end. For example, the final period in "P.D.P." might be misinterpreted as the end of a sentence.

2. Text pre-processing: the system only reproduces the text that is fed into it, without any pre-processing operation occurring.

3. Text-to-phoneme conversion: an existing synthesizer system can pronounce tens of thousands or even hundreds of thousands of words correctly, but only if the word(s) are found in its data dictionary.

3.2 EXPECTATION OF THE NEW SYSTEM

It is expected that the new system will reduce and improve on the problems encountered in the old system. The system is expected, among other things, to do the following:

1. The new system has a reasoning process.

2. The new system can do text structuring and annotation.

3. The new system's speech rate can be adjusted.

4. The pitch of the voice can be adjusted.

5. The user can select between different voices and can even combine or juxtapose them to create a dialogue between them (see the sketch after this list).

6. It has a user-friendly interface so that people with little computer knowledge can easily use it.

7. It must be compatible with all the vocal engines.

8. It complies with the SSML specification.
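
As a sketch of how requirements 3 to 5, and the mp3 saving mentioned in the abstract, might be met, the snippet below uses the pyttsx3 library as one possible wrapper around a platform vocal engine. This is an assumption for illustration, not necessarily the engine adopted in this project; pitch control and the exact audio format written by save_to_file depend on the underlying driver.

import pyttsx3

engine = pyttsx3.init()                  # picks an available platform TTS driver

# Requirement 3: adjust the speech rate (roughly words per minute).
engine.setProperty('rate', 150)

# Requirement 5: select between the voices the engine exposes.
voices = engine.getProperty('voices')
if voices:
    engine.setProperty('voice', voices[0].id)

text = "Welcome to the text to speech system."
engine.say(text)                         # speak the text aloud
engine.save_to_file(text, 'output.mp3')  # queue a save; format depends on the driver
engine.runAndWait()                      # process the queued commands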

3.3 FACT FINDING METHODS USED

There are two main sources of data collection in carrying out this study; information was basically obtained from the two sources, which are:

a) Primary source and

b) Secondary source

3.3.1 Primary Source

A primary source refers to a source of original data in which the researcher
makes use of an empirical approach such as personal interviews, questionnaires or observation.
In this research, the method of observation was used, paying attention to how the existing
manual process is carried out.

3.3.2 Secondary Source

The need for secondary sources of data in this kind of project cannot be
overemphasized. The secondary data were obtained from library sources, and most of the
information from the library research has been covered in the literature review in the previous
chapter of this project.

3.4 ORGANIZATIONAL STRUCTURE

There is no organizational framework diagram, since an observation method was used; the
organizational framework can only be described.

3.5 INPUT ANALYSIS

The system has only one input structure, which is the text input form.

Enter Text Here: ………………………………………………………………………………………………..

3.6 PROCESS ANALYSIS

Text-to-speech synthesis takes place in several steps. The TTS system takes a text as input,
which it must first analyze and then transform into a phonetic description. In a further step
it generates the prosody. From the information now available, it can then produce a speech signal.
32
The structure of the text-to-speech synthesizer can be broken down into major modules:

3.7 SPEECH SYNTHESIS MODULE

The TTS system converts an arbitrary ASCII text to speech. The first step involves

extracting the phonetic components of the message, and we obtain a string of symbols

representing sound-units (phonemes or allophones), boundaries between words, phrases and

sentences along with a set of prosody markers (indicating the speed, the intonation etc.). The

second step consists of finding the match between the sequence of symbols and appropriate items

stored in the phonetic inventory and binding them together to form the acoustic signal for the

voice output device.

Figure 3.2: Phases of TTS synthesis process
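As an illustration of the second step (matching the sound-unit symbols against the phonetic inventory and binding them together), a minimal Java sketch is given below. The inventory contents and unit names are invented placeholders; a real system stores recorded diphones or other units in its database.

import java.util.HashMap;
import java.util.Map;

// Minimal sketch: bind stored unit waveforms together to form the output signal.
public final class InventoryConcatenator {

    // Placeholder inventory; a real system loads recorded units from its database.
    private final Map<String, short[]> phoneticInventory = new HashMap<>();

    public InventoryConcatenator() {
        phoneticInventory.put("h-e", new short[160]);  // dummy 10 ms units at 16 kHz
        phoneticInventory.put("e-l", new short[160]);
        phoneticInventory.put("l-o", new short[160]);
    }

    // Concatenates the waveforms of the given unit sequence into one signal.
    public short[] bind(String[] units) {
        int total = 0;
        for (String u : units) {
            total += phoneticInventory.get(u).length;
        }
        short[] signal = new short[total];
        int pos = 0;
        for (String u : units) {
            short[] wave = phoneticInventory.get(u);
            System.arraycopy(wave, 0, signal, pos, wave.length);
            pos += wave.length;
        }
        return signal;
    }
}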

3.7.1 To compute the output, the system consults

1. A database containing the parameter values for the sounds within the word,

2. A knowledge base enumerating the options for synthesizing the sounds.

Incorporating an expert system into the internal programs will enable the new TTS system to
exhibit these features:

1. The system performs at a level generally recognized as equivalent to that of a human expert.

2. The system is highly domain specific.

3. The system can explain its reasoning process.

4. If the information with which it is working is probabilistic or fuzzy, the system can correctly
propagate uncertainties and provide a range of alternative solutions with associated likelihoods.

3.7.2 Natural Language Processing (NLP) module: It produces a phonetic transcription of

the text read, together with prosody.

3.7.3 Digital Signal Processing (DSP) module: It transforms the symbolic information it

receives from NLP into audible and intelligible speech.

The major operations of the NLP module are as follows:

1. Text Analysis: First the text is segmented into tokens. The token-to-word conversion then creates
the orthographic form of each token. For the token "Mr." the orthographic form "Mister" is
formed by expansion, the token "12" gets the orthographic form "twelve", and "1997" is
transformed to "nineteen ninety seven" (a minimal sketch of this conversion follows this list).

2. Application of Pronunciation Rules: After the text analysis has been completed,
pronunciation rules can be applied. Letters cannot be transformed into phonemes on a 1:1 basis,
because the correspondence is not always one-to-one. In certain environments, a single letter can
correspond to either no phoneme (for example, "h" in "caught") or several phonemes ("x" in
"maximum"). In addition, several letters can correspond to a single phoneme ("ch" in "rich").
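A minimal sketch of the token-to-word conversion described in item 1 is given below. The lookup tables cover only the examples mentioned above and are placeholders; a full normalization component needs complete number expansion and a much larger abbreviation list.

import java.util.HashMap;
import java.util.Map;

// Minimal sketch of token-to-word conversion (text normalization).
public final class TokenNormalizer {

    private static final Map<String, String> ABBREVIATIONS = new HashMap<>();
    private static final Map<String, String> NUMBERS = new HashMap<>();
    static {
        ABBREVIATIONS.put("Mr.", "Mister");
        ABBREVIATIONS.put("Dr.", "Doctor");
        NUMBERS.put("12", "twelve");
        NUMBERS.put("1997", "nineteen ninety seven");
    }

    // Returns the orthographic (spoken) form of a single token.
    public static String expand(String token) {
        if (ABBREVIATIONS.containsKey(token)) {
            return ABBREVIATIONS.get(token);
        }
        if (NUMBERS.containsKey(token)) {
            return NUMBERS.get(token);
        }
        return token;  // already an ordinary word
    }

    public static void main(String[] args) {
        for (String t : "Mr. Smith was born in 1997".split(" ")) {
            // Prints: Mister Smith was born in nineteen ninety seven
            System.out.print(expand(t) + " ");
        }
    }
}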

3.8 OUTPUT ANALYSIS

The output from the designed system is generated from the system inputs. The output format
for this system is audio (speech).


Figure 3.3: Data flow diagram of the Speech synthesis system Using Gane and Sarson Symbol

User Interface (Source): This can be a Graphical User Interface (GUI) or the Command Line
Interface (CLI).

Knowledge Base (Rule set): the Free TTS module/system/engine. This source of knowledge
includes domain-specific facts and heuristics useful for solving problems in the domain. Free
TTS is an open-source speech synthesis system written entirely in the Java programming
language. It is based upon Flite. Free TTS is an implementation of Sun's Java Speech API and
supports end-of-speech markers.

Control Structures: The rule interpreter (inference engine) applies the knowledge base
information to solve the problem.

Short-term memory: The working memory registers the current problem status and the history
of the solution to date.
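The interaction between the knowledge base (rule set), the control structure (rule interpreter or inference engine) and the short-term working memory can be sketched as a simple forward-chaining loop. The rules below are invented examples used only to show the control flow; they are not the actual Free TTS internals.

import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

// Minimal forward-chaining sketch: the inference engine repeatedly applies
// rules from the knowledge base to the facts held in working memory.
public final class TinyInferenceEngine {

    public static void main(String[] args) {
        // Knowledge base: condition fact -> derived fact (invented examples).
        Map<String, String> rules = new LinkedHashMap<>();
        rules.put("token is abbreviation", "expand token before phonetization");
        rules.put("sentence ends with question mark", "use rising intonation");

        // Short-term (working) memory: the current problem status.
        Set<String> workingMemory = new HashSet<>();
        workingMemory.add("token is abbreviation");

        boolean changed = true;
        while (changed) {  // control structure / rule interpreter
            changed = false;
            for (Map.Entry<String, String> rule : rules.entrySet()) {
                if (workingMemory.contains(rule.getKey())
                        && workingMemory.add(rule.getValue())) {
                    System.out.println("Fired: " + rule.getKey() + " -> " + rule.getValue());
                    changed = true;
                }
            }
        }
    }
}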

Figure 3.4: High Level Model of the Proposed System

Figure 3.5: Flowchart representation of the program

CHAPTER FOUR
DESIGN AND IMPLEMENTATION
4.0 INTRODUCTION
Our software is called the Text To Speech Robot (TTSR), a simple application with text-to-speech
functionality. The system was developed using the Java programming language. Java was used
because it is robust and platform independent. The application is divided into two main modules:
the main application module, which includes the basic GUI components and handles the basic
operations of the application, such as the input of parameters for conversion either via file,
direct keyboard input or the browser; this module makes use of the open-source APIs SWT and
DJ Native Swing. The second module, the main conversion engine, which is integrated into the
main module, accepts the data and performs the conversion; it implements the Free TTS API.

TTSR converts text to speech either when the user types text into the text field provided or
when text is copied from an external document on the local machine and pasted into the text
field provided in the application. It also provides functionality that allows the user to browse
the World Wide Web (WWW) within the application. The Text To Speech Robot is capable of
reading any portion of the web page the user browses; this is achieved by the user highlighting
the portion he wants read out loud by the TTSR and then clicking on the "Play" button.

TTSR also contains an exceptional function that gives the user the choice of saving the already
converted text to any part of the local machine in an audio format; this allows the user to copy
the audio file to any of his or her audio devices.

4.1 OUTPUT SPECIFICATION AND DESIGN

The output design was based on the inputs. The report generated is meaningful, and these
outputs can be produced as a softcopy or printed as a hard copy.

4.2 INPUT DESIGN AND SPECIFICATION


Computers are sometimes described by the principle of GIGO (garbage in, garbage out), denoting
that what goes in determines what comes out. The input forms are designed based on the data that
needs to be entered into the system. The data are captured through the keyboard and stored on a
magnetic disk in an Access database.

4.3 SYSTEM REQUIREMENTS


The requirements needed to implement this system are as follows:

4.3.1 Hardware Requirements


The software designed needed the following hardware for an effective operation of the newly
designed system.

• A system running on an AMD, Pentium 2 or higher processor.
• The random access memory (RAM) should be at least 512 MB.
• An enhanced keyboard.
• At least a 20 GB hard disk.
• A VGA or color monitor.

4.3.2 Software Requirements


The software requirements include:

• A Windows XP operating system or a higher version for faster processing


• MySQL database
• Apache webserver
• PHP 5.6+ runtime environment
4.4 CHOICE OF PROGRAMMING LANGUAGE

The speech engine used in this new system was the Free TTS speech engine. Free TTS was
used because it is programmed in Java (the backbone programming language of the designed
TTS system). It also supports SAPI (Speech Application Programming Interface), which is in
synchronism with JSAPI (the Java Speech Application Programming Interface). JSAPI was also
the standardized interface used in the new system. Free TTS includes an engine for vocal
synthesis that supports a certain number of voices (male and female) at different frequencies. It
is recommended to use JSAPI to interface with Free TTS because the JSAPI interface provides
the best methods of controlling and using Free TTS. The Free TTS engine enables full control
over the speech signal. The newly designed system provides the possibility of choosing between
three voices: an 8 kHz diphone male English voice named kevin, a 16 kHz diphone male English
voice named kevin16, and a 16 kHz limited-domain male US English voice named alan. The user
can also set the properties of a chosen voice: the speaking rate, the volume and the pitch.
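A minimal sketch of selecting one of these voices and setting its properties with Free TTS is shown below. It assumes the standard Free TTS 1.2 distribution (which bundles the Kevin voices) is on the classpath; the property values chosen are illustrative only.

import com.sun.speech.freetts.Voice;
import com.sun.speech.freetts.VoiceManager;

public class SpeakDemo {
    public static void main(String[] args) {
        // Tell Free TTS where to find the bundled Kevin voices.
        System.setProperty("freetts.voices",
                "com.sun.speech.freetts.en.us.cmu_us_kal.KevinVoiceDirectory");

        Voice voice = VoiceManager.getInstance().getVoice("kevin16");
        if (voice == null) {
            System.err.println("Voice kevin16 not found on the classpath.");
            return;
        }
        voice.allocate();            // load synthesizer resources
        voice.setRate(150.0f);       // speaking rate in words per minute
        voice.setPitch(100.0f);      // baseline pitch in Hz
        voice.setVolume(0.9f);       // volume between 0.0 and 1.0
        voice.speak("Welcome to the Text To Speech Robot.");
        voice.deallocate();          // release resources
    }
}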

A determining factor in the choice of programming language was the special annotation
language (JSML) used by the program. JSML is the Java Speech Markup Language, used to
annotate spoken output to the preferred construct of the user. In addition to this, there was the
need for a language that supports third-party development of program libraries for use in
situations not covered by the original platform. Considering these factors, the best choice of
programming language was Java. Other factors that made Java suitable were its dual nature
(i.e. implementing two methodologies with one language), its ability to implement proper data
hiding (encapsulation), its support for inner and abstract class development, and its ability to
provide polymorphism, which is a key property of the program in question.

4.5 DESIGN OF THE NEW SYSTEM

Some of the technologies involved in the design of this system include the following.
Speech Application Programming Interface (SAPI): SAPI is an interface between applications
and speech technology engines, both text-to-speech and speech recognition (Amundsen 1996).
The interface allows multiple applications to share the available speech resources on a computer
without having to program the speech engine itself. SAPI consists of three interfaces: the
voice-text interface, which provides methods to start, pause, resume, fast forward, rewind and
stop the TTS engine during speech; the attribute interface, which allows access to control the
basic behavior of the TTS engine; and the dialog interface, which can be used to set and retrieve
information regarding the TTS engine. Java Speech API (JSAPI): The Java Speech API defines a
standard, easy-to-use, cross-platform software interface to state-of-the-art speech technology.
The two core speech technologies supported through the Java Speech API are speech recognition
and speech synthesis. Speech recognition provides computers with the ability to listen to spoken
language and to determine what has been said. Speech synthesis provides the reverse process of
producing synthetic speech from text generated by an application, an applet or a user; it is often
referred to as text-to-speech technology.
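The sketch below shows how a synthesizer can be obtained through JSAPI and used to speak plain text. It assumes the Free TTS JSAPI implementation (freetts.jar together with jsapi.jar from the Free TTS distribution) is on the classpath; registering the engine through a speech.properties file is an alternative to the registerEngineCentral call.

import java.util.Locale;
import javax.speech.Central;
import javax.speech.synthesis.Synthesizer;
import javax.speech.synthesis.SynthesizerModeDesc;

public class JsapiDemo {
    public static void main(String[] args) throws Exception {
        // Register Free TTS as the JSAPI engine provider.
        Central.registerEngineCentral(
                "com.sun.speech.freetts.jsapi.FreeTTSEngineCentral");

        Synthesizer synth = Central.createSynthesizer(
                new SynthesizerModeDesc(Locale.US));
        if (synth == null) {
            System.err.println("No JSAPI synthesizer available.");
            return;
        }
        synth.allocate();                                // acquire engine resources
        synth.resume();                                  // leave the paused state
        synth.speakPlainText("Text to speech through the Java Speech API.", null);
        synth.waitEngineState(Synthesizer.QUEUE_EMPTY);  // block until speech is done
        synth.deallocate();                              // release engine resources
    }
}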

4.6 DESIGN OF INDIVIDUAL OBJECTS OF THE PROGRAM

Figure 4.0: Net beans Interface and program object manipulation

Figure 4.1: Panel containing the monitoring process of the system

Figure 4.2: Overall view of the new system

4.7 FUNCTIONS OF THE ABSTRACT CLASSES

1. Menu Bar: This will have the function of selecting through many variables and File
chooser system.
2. Monitor: Monitors the reasoning process by specifying the allocation process and de-
allocation state
3. Voice System: This shows the different voice options provided by the system
4. Playable session: This maintains the timing of the speech being given out as output, and
produces a speech in synchronism with the rate specified.
5. Playable type: This specifies the type of text to be spoken, whether it is a text file or an
annotated JSML file
6. Text-to-Speech activator: This plays the given text and produces an output
7. Player Model: This is a model of all the functioning parts and knowledge base
representation in the program
8. Player Panel: This shows the panel and content pane of the basic objects in the program,
and specifies where each object is placed in the system
9. Synthesizer Loader: This loads the Synthesizer engine, allocating and de-allocating
resources appropriately

4.8 PRESENTATION OF SOME CODING OF THE PROGRAMME

Visual Studio Code was used as the programming platform for the application. In this section,
some of the code used in the programme is presented.

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible"
content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width">
<title>Speech synthesiser</title>
<link rel="stylesheet" href="style.css">
<!--[if lt IE 9]>

<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
</head>
<body>
<h1>Design and Implementation of Text to Speech/Audio
System </h1>
<p>Enter some text in the input below and press ENTER
to hear it. change voices using the dropdown menu.</p>
<form>
<input type="text" class="txt">
<div>
<label for="rate">Rate</label><input type="range"
min="0.5" max="2" value="1" step="0.1" id="rate">
<div class="rate-value">1</div>
<div class="clearfix"></div>
</div>
<div>
<label for="pitch">Pitch</label><input type="range"
min="0" max="2" value="1" step="0.1" id="pitch">
<div class="pitch-value">1</div>
<div class="clearfix"></div>
</div>
<select>
</select>
<h1>xxxxxx</h1>
<input type="file" name="input" id="inputfile">
</form>
<script src="script.js"></script>
</body>
</html>

CHAPTER FIVE
EVALUATION/TESTING
5.0 INTRODUCTION

In this chapter, the evaluation and testing of the system are presented, describing the behavior
of the system and what it does.


Figure 5.1: TTS Synthesis System Architecture

5.1 WORKING

Text To Speech Robot (TTSR): converts text to speech either when the user types text into the
text field provided or copies text from an external document on the local machine and pastes it
into the text field provided in the application. It also provides functionality that allows the user
to browse the World Wide Web (WWW) within the application. The Text To Speech Robot is
capable of reading any portion of the web page the user browses. This is achieved by the user
highlighting the portion he wants read out loud by the TTSR and then clicking on the "Play"
button.

TTSR contains an exceptional function that gives the user the choice of saving the already
converted text to any part of the local machine in an audio format; this allows the user to copy
the audio file to any of his or her audio devices.

Figure 5.2: The Loading phase of the application

Figure 5.3: Screenshot of the Text To Speech Robot Interface.

The default view for this application is the web browser view; this is what shows after the Text
To Speech Robot has loaded. When there is no internet connection on the local machine, the web
browser indicates this by displaying "The page cannot be displayed". Any part of the web page
shown in the application that is highlighted can be read out loud by the TTSR; the application
allows the user to highlight any part of the web page for conversion.

Figure 5.4: A part of the web page in the application being highlighted waiting for conversion
The Standard Tool Bar
The standard tool bar contains the File, Web Browser, Player and Help menus. The File menu
gives the user the choice of opening either a new browser or a new text field into which a text
document can be imported. The Player menu gives the user the choice to play, stop or pause the
speech. It also has a functional button called "Record", which lets the user export the audio
speech to any part of the local machine.
The text field:

The text field is where all text is typed or loaded into. The text that will be read by the
engine is contained in this field.

Figure 5.5: The TTSR Interface when a text document is loaded into it.

Figure 5.6: Work in progress of the creation of the application in the NetBeans Environment.
5.2 TESTING

Software testing refers to the examination of the program itself to see if it works or has any
errors or bugs. Testing was carried out on the system thoroughly from start to finish of the
program. Individual units and components were tested before bringing them together into the
whole system which was also tested thoroughly. The different tests carried out on the prototype
include the following:
i. Unit testing: Each unit of the program was tested to ensure that it performs its functions as
defined in the program specification (a sketch of such a test follows this list).

ii. Integration testing: modules were integrated and then tested together.

iii. System testing: the system was tested as a whole.

iv. Usability testing: carried out by the users to check that the system meets its stated requirements.
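As an illustration of the unit-testing step, the sketch below exercises the hypothetical TokenNormalizer.expand helper sketched in chapter three using JUnit 4; the class under test and the assertions are examples only, not the project's actual test suite.

import static org.junit.Assert.assertEquals;
import org.junit.Test;

// Example unit test (JUnit 4) for the hypothetical TokenNormalizer sketched earlier.
public class TokenNormalizerTest {

    @Test
    public void expandsKnownAbbreviation() {
        assertEquals("Mister", TokenNormalizer.expand("Mr."));
    }

    @Test
    public void expandsYearIntoWords() {
        assertEquals("nineteen ninety seven", TokenNormalizer.expand("1997"));
    }

    @Test
    public void leavesOrdinaryWordsUnchanged() {
        assertEquals("speech", TokenNormalizer.expand("speech"));
    }
}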

CHAPTER SIX

CONCLUSION AND FUTURE WORK

6.0 INTRODUCTION

This project was carried out for the purpose of converting text to speech in order to enhance
learning on educational platforms. The objectives of the research work were to propose, plan,
design and implement a system that is able to read a text out loud. Subsequent chapters addressed
the problems of the current system by reviewing the literature, gathering requirements, designing
and developing the prototype, and finally testing and evaluating the proposed solution.

6.1 SUMMARY OF THE FINDING

Chapter one of this research work focused on the general introduction of the research work.
The statement of the problem was identified and presented; the aims and objectives were also
drawn up in chapter one, as well as the significance of the study. In chapter two, different
literature related to the project was reviewed. Natural language processing (NLP), which is
capable of producing a phonetic transcription of the text read together with the desired
intonation and rhythm, and digital signal processing (DSP), which transforms the symbolic
information it receives into speech, were also reviewed, and different kinds of similar software
were examined, identifying and contrasting their features. Chapter three started with an
introduction to text-to-speech synthesis, followed by a general description of the existing
system and an analysis of its problems. The expectations of the new system were also discussed,
along with the objectives of the new system and its algorithms, with a flowchart representation
of the program. Findings from the interviews conducted showed that there was little interaction
with the platform, largely because of unawareness of the system, the web-based nature of the
platform and its user experience. The new system's design was established in this chapter,
showing its Unified Modeling Language (UML) diagrams.

In chapter four, the implementation and testing of the system were carried out to determine
whether the requirements of the system were met. Some of the system interfaces were also
presented in chapter four.

6.2 CONCLUSION

Text to speech synthesis is a rapidly growing aspect of computer technology and is increasingly

playing a more important role in the way we interact with the system and interfaces across a

variety of platforms. We have identified the various operations and processes involved in text to

speech synthesis. We have also developed a very simple and attractive graphical user interface

which allows the user to type in his/her text provided in the text field in the application. Our

system interfaces with a text to speech engine developed for American English. In future, we

plan to make efforts to create engines for localized Nigerian languages so as to make
text-to-speech technology more accessible to a wider range of Nigerians. Such engines already
exist for some other languages, e.g. Swahili, Konkani, Vietnamese and Telugu. Another area of
further work is the implementation of a text-to-speech system on other

platforms, such as telephony systems, ATM machines, video games and any other platforms

where text to speech technology would be an added advantage and increase functionality.

6.3 RECOMMENDATION

I hereby recommend this academic work to be used by the staff and management of any
organization, and indeed any other institution with a similar structure and organizational
framework, for the following reasons:


1. The academic work has been able to solve the problem related to easy access to information.

2. It has aided the visually impaired to read without stress.

3. It has aided Children to learn how to read and pronounce English words.

4. It has aided in the educational sector for easy assimilation

References

[Abrantes et al. 91] A.J. ABRANTES, J.S. MARQUES, I.M. TRANSCOSO, "Hybrid Sinusoïdal

Modeling of Speech without Voicing Decision", EUROSPEECH 91, pp. 231-234.

[Allen 85] J. ALLEN, "A Perspective on Man-Machine Communication by Speech",

Proceedings of the IEEE, vol. 73, n°11, November 1985, pp. 1541-1550.

[Allen et al. 87] J. ALLEN, S. HUNNICUT, D. KLATT, From Text To Speech, The MITTALK

System, Cambridge University Press, 1987, 213 pp.

[Bachenko & Fitzpatrick 90] J. BACHENKO, E. Fitzpatrick, "Acomputational grammar of

discourse-neutral prosodic phrasing in English", Computational Linguistics, n°16, September

1990, pp. 155-167.

[Belrhali et al. 94] R. BELRHALI, V. AUBERGE, L.J. BOE, "From lexicon to rules : towards a

descriptive method of French text-to-phonetics transcription", Proc. ICSLP 92, Alberta, pp. 1183-

1186.

[Benello et al. 88] J. BENELLO, A.W. MACKIE, J.A. ANDERSON, "Syntactic category

disambiguation with neural networks", Computer Speech and Language, 1989, n°3, pp. 203-217.

[Carlson et al. 82] R. CARLSON, B. GRANSTRÖM, S. HUNNICUT, "A multi-language
Text-To-Speech module", ICASSP 82, Paris, vol. 3, pp. 1604-1607.

[Coker 85] C.H. COKER, "A Dictionary-Intensive Letter-to-Sound Program", J. Ac. Soc. Am.,

suppl. 1, n°78, 1985, S7.

[Coker et al. 90] C.H. COKER, K.W. CHURCH, M.Y. LIBERMAN, "Morphology and rhyming

: Two powerful alternatives to letter-to-sound rules for speech synthesis", Proc. of the ESCA

Workshop on Speech Synthesis, Autrans (France), 1990, pp. 83-86.

[Daelemans & van den Bosch 93] W. DAELEMANS, A. VAN DEN BOSCH, "TabTalk :

Reusability in data-oriented grapheme-to-phoneme conversion", Proc. Eurospeech 93, Berlin, pp.

1459-1462.

[Dutoit 93] T. DUTOIT, H. LEICH, "MBR-PSOLA : Text-To-Speech Synthesis based on an

MBE Re-Synthesis of the Segments Database", Speech Communication, Elsevier Publisher,

November 1993, vol. 13, n°3-4.

[Dutoit 96] T. DUTOIT, An Introduction to Text-To-Speech Synthesis, Kluwer Academic

Publishers, 1996, 326 pp.

[Flanagan 72] J.L. FLANAGAN, Speech Analysis, Synthesis, and Perception, Springer Verlag,

1972, pp. 204-210.

[Hirschberg 91] J. HIRSCHBERG, "Using text analysis to predict intonational boundaries",

Proc. Eurospeech 91, Genova, pp. 1275-1278.

[Holmes et al. 64] J. HOLMES, I. MATTINGLY, J. SHEARME, 'Speech synthesis by rule',

Language and Speech, Vol 7, 1964, pp.127-143

[Hunnicut 80] S. HUNNICUT, "Grapheme-to-Phoneme rules : a Review", Speech Transmission

Laboratory, Royal Institute of Technology, Stockholm, Sweden, QPSR 2-3, pp. 38-60.

[Klatt 80] D.H. KLATT, 'Software for a cascade /parallel formant synthesizer', J. Acoust. Soc.

AM., Vol 67, 1980, pp. 971-995.

[Klatt 86] D.H. KLATT, "Text-To-Speech : present and future", Proc. Speech Tech '86, pp. 221-226.

[Kupiec 92] J. KUPIEC, "Robust part-of-speech tagging using a Hidden Markov Model",

Computer Speech and Language, 1992, n°6, pp. 225-242.

[Larreur et al. 89] D. LARREUR, F. EMERARD, F. MARTY, "Linguistic and prosodic

processing for a text-to-speech synthesis system", Proc. Eurospeech 89, Paris, pp. 510-513.

[Levinson et al. 93] S.E. LEVINSON, J.P. OLIVE, J.S. TSCHIRGI, "Speech Synthesis in

Telecommunications", IEEE Communications Magazine, November 1993, pp. 46-53.

[Liberman & Church 92] M.J. LIBERMAN, K.W. CHURCH, "Text analysis and word

pronunciation in text-to-speech synthesis", in Advances in Speech Signal Processing, S. Furuy,

M.M. Sondhi eds., Dekker, New York, 1992, pp.791-831.

[Lingaard 85] R. LINGAARD, Electronic synthesis of speech, Cambridge University Press,

1985, pp 1-17.

[Markel & Gray 76] J.D. MARKEL, A.H. GRAY Jr, Linear Prediction of Speech, Springer

Verlag, New York, pp. 10-42, 1976.

[Monaghan 90a] A.I.C. MONAGHAN, "A multi-phrase parsing strategy for unrestricted text",

Proc. ESCA Workshop on speech synthesis, Autrans, 1990, pp. 109-112.

[Moulines & Charpentier 90] E.MOULINES, F. CHARPENTIER, "Pitch Synchronous

waveform Processing techniques for Text-To-Speech Synthesis using diphones", Speech
Communication, Vol. 9, n°5-6.

[O' Shaughnessy 84] D. O' SHAUGHNESSY, 'Design of a real-time French text-to-speech

system', Speech Communication, Vol 3, pp. 233-243.

[O'Shaughnessy 90] D. O'SHAUGHNESSY, "Relationships between syntax and prosody for

speech synthesis", Proceedings of the ESCA tutorial day on speech synthesis, Autrans, 1990, pp.

39-42.

[Sproat et al. 92] R. SPROAT, J. HIRSHBERG, D. YAROWSKY, "A Corpus-based

Synthesizer", Proc. ICSLP 92 Alberta, pp. 563-566.

[Stevens 90] K.N. STEVENS, 'Control parameters for synthesis by rule', Proceedings of the

ESCA tutorial day on speech synthesis, Autrans, 25 sept 90, pp. 27-37.

[Traber 93] C. TRABER, "Syntactic Processing and Prosody Control in the SVOX TTS System

for German", Proc. Eurospeech 93, Berlin, vol. 3, pp. 2099-2102.

[Willemse & Gulikers 92] R. WILLEMSE, L. GULIKERS, "Word class assignment in a Text-

To-Speech system", Proc. Int. Conf. on Spoken Language Processing, Alberta, 1992, pp. 105-108.

[Withgott & Chen 93] M. M. WITHGOTT, F.R. CHEN, Computational models of American

English, CSLI Lecture Notes, n°32, 143 pp.

[Witten 82] I.H. WITTEN, Principles of Computer Speech, Academic Press, 1982, 286 pp.

[Yarowsky 94] D. YAROWSKY, "Homograph Disambiguation in Speech Synthesis'',

Proceedings, 2nd ESCA/IEEE Workshop on Speech Synthesis, New Paltz, NY, 1994.

APPENDIX

Source Codes

Home Page

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width">
<title>Speech synthesiser</title>
<link rel="stylesheet" href="style.css">
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
</head>
<body>
<h1>Design and Implementation of Text to Speech/Audio System</h1>
<p>Enter some text in the input below and press ENTER to hear it. Change voices using the dropdown menu.</p>
<form>
<input type="text" class="txt">
<div>
<label for="rate">Rate</label><input type="range" min="0.5" max="2" value="1" step="0.1" id="rate">
<div class="rate-value">1</div>
<div class="clearfix"></div>
</div>
<div>
<label for="pitch">Pitch</label><input type="range" min="0" max="2" value="1" step="0.1" id="pitch">
<div class="pitch-value">1</div>
<div class="clearfix"></div>
</div>
<select>
</select>
</form>
<script src="script.js"></script>
</body>
</html>

Text-to-Speech Engine

var synth = window.speechSynthesis;

// Grab the form controls defined in the home page.
var inputForm = document.querySelector('form');
var inputTxt = document.querySelector('.txt');
var voiceSelect = document.querySelector('select');

var pitch = document.querySelector('#pitch');
var pitchValue = document.querySelector('.pitch-value');
var rate = document.querySelector('#rate');
var rateValue = document.querySelector('.rate-value');

var voices = [];

// Fill the dropdown with the voices offered by the browser's speech engine.
function populateVoiceList() {
  voices = synth.getVoices();
  var selectedIndex = voiceSelect.selectedIndex < 0 ? 0 : voiceSelect.selectedIndex;
  voiceSelect.innerHTML = '';
  for (var i = 0; i < voices.length; i++) {
    var option = document.createElement('option');
    option.textContent = voices[i].name + ' (' + voices[i].lang + ')';
    if (voices[i].default) {
      option.textContent += ' -- DEFAULT';
    }
    option.setAttribute('data-lang', voices[i].lang);
    option.setAttribute('data-name', voices[i].name);
    voiceSelect.appendChild(option);
  }
  voiceSelect.selectedIndex = selectedIndex;
}

populateVoiceList();
if (speechSynthesis.onvoiceschanged !== undefined) {
  speechSynthesis.onvoiceschanged = populateVoiceList;
}

// Speak the typed text with the selected voice, rate and pitch.
inputForm.onsubmit = function(event) {
  event.preventDefault();

  var utterThis = new SpeechSynthesisUtterance(inputTxt.value);
  var selectedOption = voiceSelect.selectedOptions[0].getAttribute('data-name');
  for (var i = 0; i < voices.length; i++) {
    if (voices[i].name === selectedOption) {
      utterThis.voice = voices[i];
    }
  }
  utterThis.pitch = pitch.value;
  utterThis.rate = rate.value;
  synth.speak(utterThis);

  // Report where the speech was paused, if it is paused.
  utterThis.onpause = function(event) {
    var char = event.utterance.text.charAt(event.charIndex);
    console.log('Speech paused at character ' + event.charIndex + ' of "' +
        event.utterance.text + '", which is "' + char + '".');
  };

  inputTxt.blur();
};

// Keep the displayed values in step with the sliders.
pitch.onchange = function() {
  pitchValue.textContent = pitch.value;
};

rate.onchange = function() {
  rateValue.textContent = rate.value;
};
