Speech Recognition System Project Using Neural Networks (Final Project Report)

Matlab Files

[C1] MAIN File

function finalproject = pj
%PJ Classify spoken test words by vowel sound using neural networks.
%   pj reads three recordings each of the vowel sounds "a", "e" and "o",
%   averages their normalized power spectral densities (PSD) and trains a
%   feed-forward back-propagation network that maps Mel frequency to the
%   three averaged PSDs. For each test word ("pay", then "so") it averages
%   the PSD of two recordings, trains a separate single-output network on
%   it, and compares that network's simulated output with each of the three
%   vowel outputs using the Euclidean norm of the difference. The vowel
%   with the smallest norm is reported as the word's sound group.
%
%   finalproject = pj additionally returns a struct with the fields 'pay'
%   and 'so', each a 1x3 vector of error norms ordered ["a" "e" "o"].
%   Without an output argument nothing is returned.
%
%   The function opens figures 1 to 8 and expects the WAV files vc_a,
%   vc_a2, vc_a3, vc_e, vc_e2, vc_e3, vc_o, vc_o2, vc_o3, pl_pay, pl_pay2,
%   fc_so and fc_so2, as well as frq2mel, to be on the MATLAB path.

%Frequency axis for the 2206 retained PSD samples, converted to Mel.
%NOTE(review): the factor 100000/22050 is kept exactly as in the original
%code; it is not obviously the FFT bin spacing -- confirm against the
%sampling rate of the recordings.
f = 100000*(0:2205)/22050; %Frequency range
mel = frq2mel(f);

%-----------------------------------------------------------------------
%Training data: averaged PSD of each vowel sound
Pf_AVG1 = averagePsd({'vc_a', 'vc_a2', 'vc_a3'});
plotCurve(1, mel, Pf_AVG1, 'The PSD of the Letter "a" Sound');

Pf_AVG2 = averagePsd({'vc_e', 'vc_e2', 'vc_e3'});
plotCurve(2, mel, Pf_AVG2, 'The PSD of the Letter "e" Sound');

Pf_AVG3 = averagePsd({'vc_o', 'vc_o2', 'vc_o3'});
plotCurve(3, mel, Pf_AVG3, 'The PSD of the Letter "o" Sound');

%Combine the averaged PSDs of "a", "e" and "o" into one matrix,
%one row per sound
Pf_Total = [Pf_AVG1; Pf_AVG2; Pf_AVG3];

%Train by creating a feed-forward back-propagation network
net1 = newff(minmax(mel),[80,3],{'tansig' 'purelin'},'trainlm','learnp');
net1.trainParam.epochs = 30;
net1 = train(net1,mel,Pf_Total);

%Simulate to obtain outputs (row 1 = "a", row 2 = "e", row 3 = "o")
y1 = sim(net1,mel);
plotCurve(4, mel, y1, 'Plot of the Outputs of the Trained Network');

%------------------------------------------------------------------
%Test word "pay": same processing as the training data
Ps_AVG = averagePsd({'pl_pay', 'pl_pay2'});
plotCurve(5, mel, Ps_AVG, 'The PSD of the word "Pay"');

%The original code named the output transfer function 'losgig', which is
%not a transfer function; 'logsig' is the closest valid name.
%NOTE(review): logsig limits the output to (0,1) while the "so" test uses
%'purelin' -- confirm which one is intended.
y2 = trainAndSimulate(mel, Ps_AVG, 'logsig');
plotCurve(6, mel, y2, ...
    'Plot of the Output of the Trained Network with the Word "Pay"');

%Norm of the error between each vowel output of the first network and the
%output of the word network (left unsuppressed so the values are printed)
disp('Result of the Test with the Word "Pay"');
A1 = norm(y1(1,:) - y2)
E1 = norm(y1(2,:) - y2)
O1 = norm(y1(3,:) - y2)

%Check in which class the word "pay" belongs
reportNearestVowel('pay', [A1 E1 O1]);

%---------------------------------------------------------------
%Test again with the test word "so"
disp('Another Test with the Word "So"');
Ps_AVG = averagePsd({'fc_so', 'fc_so2'});
plotCurve(7, mel, Ps_AVG, 'The PSD of the word "So"');

%A fresh network is created and trained on the Mel axis; the original code
%re-trained the "pay" network with the Hz axis and then simulated on Mel.
y2 = trainAndSimulate(mel, Ps_AVG, 'purelin');
plotCurve(8, mel, y2, ...
    'Plot of the Output of the Trained Network with the Word "So"');

disp('Result of the Test with the Word "So"');
A2 = norm(y1(1,:) - y2)
E2 = norm(y1(2,:) - y2)
O2 = norm(y1(3,:) - y2)

reportNearestVowel('so', [A2 E2 O2]);

%Only assign the output when the caller asked for it, so that calling pj
%as a plain command prints nothing extra.
if nargout > 0
    finalproject = struct('pay', [A1 E1 O1], 'so', [A2 E2 O2]);
end

%=======================================================================
function psd = averagePsd(files)
%AVERAGEPSD Mean of the truncated normalized PSDs of several recordings.
%   files is a cell array of WAV file names; psd is a 1x2206 row vector.
psd = zeros(1, 2206);
for k = 1:numel(files)
    psd = psd + recordingPsd(files{k});
end
psd = psd / numel(files);

%=======================================================================
function psd = recordingPsd(file)
%RECORDINGPSD Normalized PSD of one recording as a 1x2206 row vector.
x = wavread(file, 22050);   %Read the first 22050 samples
x = x(:,1);                 %Use the first channel only, so that the linear
                            %index below never mixes channels
X = fft(x, 11025);          %11025-point Fast Fourier Transform
psd = X .* conj(X) / 11025; %Normalized power spectrum density
psd = transpose(psd);       %Row vector
psd = psd(1:2206);          %Only the first 2206 samples are needed

%=======================================================================
function y = trainAndSimulate(mel, target, outputTransfer)
%TRAINANDSIMULATE Fit a single-output network to target and simulate it.
%   Builds an 80-1 feed-forward network ('tansig' hidden layer, the given
%   output transfer function), trains it for at most 30 epochs with
%   mel as input and target as desired output, and returns its response
%   to mel.
net = newff(minmax(mel),[80,1],{'tansig' outputTransfer},'trainlm','learnp');
net.trainParam.epochs = 30;
net = train(net, mel, target);
y = sim(net, mel);

%=======================================================================
function plotCurve(figNo, mel, values, figTitle)
%PLOTCURVE Plot values against the Mel axis in figure figNo.
figure(figNo);
plot(mel, values);
title(figTitle);
xlabel('Frequency (Mel)');
ylabel('Power');

%=======================================================================
function reportNearestVowel(word, norms)
%REPORTNEARESTVOWEL Print the sound group with the smallest error norm.
%   norms is ordered ["a" "e" "o"]. On a tie the first minimum wins.
vowels = 'aeo';
[smallest, idx] = min(norms); %#ok<ASGLU> only the index is needed
disp(['The word "' word '" belongs to the "' vowels(idx) '" sound group']);

[C2] Frequency to Mel File

function mel = frq2mel(frq)
%FRQ2MEL  Convert Hertz to Mel frequency scale MEL=(FRQ)
%   mel = frq2mel(frq) converts a vector of frequencies (in Hz)
%   to the corresponding values on the Mel scale, which corresponds
%   to the perceived pitch of a tone. The conversion is applied
%   element by element.
%
%   The relationship between mel and frq is given by:
%
%       m = ln(1 + f/700) * 1000 / ln(1+1000/700)
%
%   This means that m(1000) = 1000
%
%   References:
%
%     [1] S. S. Stevens & J. Volkman "The relation of pitch to
%         frequency", American J of Psychology, V 53, p329 1940
%     [2] C. G. M. Fant, "Acoustic description & classification
%         of phonetic units", Ericsson Technics, No 1 1959
%         (reprinted in "Speech Sounds & Features", MIT Press 1973)
%     [3] S. B. Davis & P. Mermelstein, "Comparison of parametric
%         representations for monosyllabic word recognition in
%         continuously spoken sentences", IEEE ASSP, V 28,
%         pp 357-366 Aug 1980
%     [4] J. R. Deller Jr, J. G. Proakis, J. H. L. Hansen,
%         "Discrete-Time Processing of Speech Signals", p380,
%         Macmillan 1993
%     [5] HTK Reference Manual p73

%   Last modified Fri Apr 3 14:57:14 1998
%
%   VOICEBOX home page: http://www.ee.ic.ac.uk/hp/staff/dmb/voicebox/voicebox.html
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%   This program is free software; you can redistribute it and/or modify
%   it under the terms of the GNU General Public License as published by
%   the Free Software Foundation; either version 2 of the License, or
%   (at your option) any later version.
%
%   This program is distributed in the hope that it will be useful,
%   but WITHOUT ANY WARRANTY; without even the implied warranty of
%   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
%   GNU General Public License for more details.
%
%   You can obtain a copy of the GNU General Public License from
%   ftp://prep.ai.mit.edu/pub/gnu/COPYING-2.0 or by writing to
%   Free Software Foundation, Inc.,675 Mass Ave, Cambridge, MA 02139, USA.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Scale factor, approximately 1000/ln(1+1000/700), so that 1000 Hz maps to
% about 1000 mel.
melPerLogUnit = 1127.01048;

% Frequency relative to the 700 Hz corner, plus one, on a natural-log scale.
logRatio = log(1 + frq/700);

mel = logRatio*melPerLogUnit;

Speech Recognition System Project Using Neural Networks (Final Project Report)

Sunday, December 24, 2006

Bibliography

Appendix C

Appendix B

Appendix A

Comments

Conclusion/Problems

Improvements/Extensions